diff --git a/.github/backend-matrix.yml b/.github/backend-matrix.yml index 1087b9030..ea489dead 100644 --- a/.github/backend-matrix.yml +++ b/.github/backend-matrix.yml @@ -4981,6 +4981,9 @@ includeDarwin: - backend: "vllm" tag-suffix: "-metal-darwin-arm64-vllm" build-type: "mps" + - backend: "liquid-audio" + tag-suffix: "-metal-darwin-arm64-liquid-audio" + build-type: "mps" - backend: "piper" tag-suffix: "-metal-darwin-arm64-piper" build-type: "metal" @@ -4997,6 +5000,10 @@ includeDarwin: tag-suffix: "-metal-darwin-arm64-sherpa-onnx" build-type: "metal" lang: "go" + - backend: "supertonic" + tag-suffix: "-metal-darwin-arm64-supertonic" + build-type: "metal" + lang: "go" - backend: "local-store" tag-suffix: "-metal-darwin-arm64-local-store" build-type: "metal" diff --git a/backend/go/supertonic/helper.go b/backend/go/supertonic/helper.go index 9f927d5d3..884077e75 100644 --- a/backend/go/supertonic/helper.go +++ b/backend/go/supertonic/helper.go @@ -16,6 +16,7 @@ import ( "os" "path/filepath" "regexp" + "runtime" "strings" "time" "unicode" @@ -943,7 +944,13 @@ func InitializeONNXRuntime() error { } } if libPath == "" { - libPath = "/usr/local/lib/libonnxruntime.so" + // LocalAI: default to the platform-native shared library + // extension when nothing else is found (dyld vs ld.so). + if runtime.GOOS == "darwin" { + libPath = "/usr/local/lib/libonnxruntime.dylib" + } else { + libPath = "/usr/local/lib/libonnxruntime.so" + } } } ort.SetSharedLibraryPath(libPath) diff --git a/backend/go/supertonic/package.sh b/backend/go/supertonic/package.sh index 9e2a01625..678ca5ead 100755 --- a/backend/go/supertonic/package.sh +++ b/backend/go/supertonic/package.sh @@ -32,6 +32,10 @@ elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then cp -arfLv /lib/aarch64-linux-gnu/libdl.so.2 $CURDIR/package/lib/libdl.so.2 cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 $CURDIR/package/lib/librt.so.1 cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 $CURDIR/package/lib/libpthread.so.0 +elif [ $(uname -s) = "Darwin" ]; then + # macOS: dyld resolves the bundled .dylib via DYLD_LIBRARY_PATH (set in + # run.sh); there is no ld.so loader nor glibc to bundle. + echo "Detected Darwin" else echo "Error: Could not detect architecture" exit 1 diff --git a/backend/go/supertonic/run.sh b/backend/go/supertonic/run.sh index 2dabf7eb3..683c52ab2 100755 --- a/backend/go/supertonic/run.sh +++ b/backend/go/supertonic/run.sh @@ -3,12 +3,19 @@ set -ex CURDIR=$(dirname "$(realpath $0)") -export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH -export ONNXRUNTIME_LIB_PATH=$CURDIR/lib/libonnxruntime.so +if [ "$(uname)" = "Darwin" ]; then + # macOS uses dyld: there is no ld.so loader, and the search path env + # var is DYLD_LIBRARY_PATH. ONNX Runtime ships as a .dylib here. + export DYLD_LIBRARY_PATH=$CURDIR/lib:$DYLD_LIBRARY_PATH + export ONNXRUNTIME_LIB_PATH=$CURDIR/lib/libonnxruntime.dylib +else + export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH + export ONNXRUNTIME_LIB_PATH=$CURDIR/lib/libonnxruntime.so -if [ -f $CURDIR/lib/ld.so ]; then - echo "Using lib/ld.so" - exec $CURDIR/lib/ld.so $CURDIR/supertonic "$@" + if [ -f $CURDIR/lib/ld.so ]; then + echo "Using lib/ld.so" + exec $CURDIR/lib/ld.so $CURDIR/supertonic "$@" + fi fi exec $CURDIR/supertonic "$@" diff --git a/backend/index.yaml b/backend/index.yaml index 38d443e16..729de2abc 100644 --- a/backend/index.yaml +++ b/backend/index.yaml @@ -1285,6 +1285,7 @@ nvidia-cuda-13: "cuda13-liquid-audio" nvidia-cuda-12: "cuda12-liquid-audio" nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-liquid-audio" + metal: "metal-liquid-audio" icon: https://cdn-avatars.huggingface.co/v1/production/uploads/61b8e2ba285851687028d395/7_6D7rWrLxp2hb6OHSV1p.png - &qwen-tts urls: @@ -1570,6 +1571,7 @@ - TTS capabilities: default: "cpu-supertonic" + metal: "metal-supertonic" - !!merge <<: *neutts name: "neutts-development" capabilities: @@ -4624,6 +4626,7 @@ nvidia-cuda-13: "cuda13-liquid-audio-development" nvidia-cuda-12: "cuda12-liquid-audio-development" nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-liquid-audio-development" + metal: "metal-liquid-audio-development" - !!merge <<: *liquid-audio name: "cpu-liquid-audio" uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-liquid-audio" @@ -4634,6 +4637,16 @@ uri: "quay.io/go-skynet/local-ai-backends:master-cpu-liquid-audio" mirrors: - localai/localai-backends:master-cpu-liquid-audio +- !!merge <<: *liquid-audio + name: "metal-liquid-audio" + uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-liquid-audio" + mirrors: + - localai/localai-backends:latest-metal-darwin-arm64-liquid-audio +- !!merge <<: *liquid-audio + name: "metal-liquid-audio-development" + uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-liquid-audio" + mirrors: + - localai/localai-backends:master-metal-darwin-arm64-liquid-audio - !!merge <<: *liquid-audio name: "cuda12-liquid-audio" uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-liquid-audio" @@ -5496,6 +5509,7 @@ name: "supertonic-development" capabilities: default: "cpu-supertonic-development" + metal: "metal-supertonic-development" - !!merge <<: *supertonic name: "cpu-supertonic" uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-supertonic" @@ -5506,3 +5520,13 @@ uri: "quay.io/go-skynet/local-ai-backends:master-cpu-supertonic" mirrors: - localai/localai-backends:master-cpu-supertonic +- !!merge <<: *supertonic + name: "metal-supertonic" + uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-supertonic" + mirrors: + - localai/localai-backends:latest-metal-darwin-arm64-supertonic +- !!merge <<: *supertonic + name: "metal-supertonic-development" + uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-supertonic" + mirrors: + - localai/localai-backends:master-metal-darwin-arm64-supertonic diff --git a/backend/python/liquid-audio/install.sh b/backend/python/liquid-audio/install.sh index c7ed8eaa8..fe0f9caad 100755 --- a/backend/python/liquid-audio/install.sh +++ b/backend/python/liquid-audio/install.sh @@ -14,5 +14,11 @@ else fi # liquid-audio's torch wheels are large; allow upgrades to satisfy transitive pins -EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match" +EXTRA_PIP_INSTALL_FLAGS+=" --upgrade" +# --index-strategy is a uv-only flag. The darwin/MPS build installs with pip +# (USE_PIP=true in scripts/build/python-darwin.sh), which rejects it. Only add +# it on the uv path; Linux/CUDA resolution is unchanged. +if [ "x${USE_PIP:-}" != "xtrue" ]; then + EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-first-match" +fi installRequirements diff --git a/backend/python/liquid-audio/requirements-mps.txt b/backend/python/liquid-audio/requirements-mps.txt index f57687f29..3c9c36cca 100644 --- a/backend/python/liquid-audio/requirements-mps.txt +++ b/backend/python/liquid-audio/requirements-mps.txt @@ -1,3 +1,4 @@ +# MPS (Apple Silicon / Metal) build profile - installed by the darwin CI job. torch>=2.8.0 torchaudio>=2.8.0 torchcodec>=0.9.1 diff --git a/core/config/hardware_defaults.go b/core/config/hardware_defaults.go index 18c321639..b4e0e74c6 100644 --- a/core/config/hardware_defaults.go +++ b/core/config/hardware_defaults.go @@ -54,8 +54,35 @@ func (g GPU) IsNVIDIABlackwell() bool { return maj >= 12 } +// Compute-buffer headroom guard for the raised physical batch. +// +// Raising n_ubatch grows the CUDA *compute buffer* (the scratch for the forward +// graph), which is allocated PER DEVICE — it does not benefit from a second GPU +// the way weights or KV (which are split across devices) do. The buffer scales +// ~linearly with n_ubatch * n_ctx, so a large context turns the GB10-tuned +// ub2048 into multi-GiB of extra scratch that must fit on a SINGLE card. On a +// 16 GiB consumer Blackwell with a 200k context that overflows (issue #10485), +// even though the GB10 it was measured on (128 GiB unified memory) had room. +// +// These constants size a conservative guard: only raise the batch when the +// extra scratch fits the per-device VRAM ceiling. +const ( + // computeBufferBytesPerCell approximates the CUDA compute-buffer cost of one + // (n_ubatch * n_ctx) cell. Derived from an observed allocation (ub2048 * + // ctx204800 ~= 4.5 GiB => ~11 B/cell) and rounded up to 16 for margin, since + // the real cost also grows with model width (heads / embedding dim) which we + // don't know at config time. + computeBufferBytesPerCell = 16 + // blackwellBatchHeadroomDivisor caps the extra compute buffer from raising the + // physical batch at VRAM/divisor. /4 keeps the bulk of a device for weights + + // KV, which already dominate VRAM use. + blackwellBatchHeadroomDivisor = 4 +) + // PhysicalBatch returns the canonical physical batch (n_batch/n_ubatch) for the -// given hardware, used when the model config leaves batch unset. +// given hardware class, ignoring context/VRAM headroom. Use +// PhysicalBatchForContext when a model context and per-device VRAM are known +// (the load paths) so the raised batch can't overflow a single device. func PhysicalBatch(g GPU) int { if g.IsNVIDIABlackwell() { return BlackwellPhysicalBatch @@ -63,6 +90,32 @@ func PhysicalBatch(g GPU) int { return DefaultPhysicalBatch } +// PhysicalBatchForContext is PhysicalBatch gated on per-device VRAM headroom for +// the given context: it only raises the batch above the conservative default +// when the extra compute buffer (which is allocated on a single device and grows +// with n_ubatch * n_ctx) fits within blackwellBatchHeadroomDivisor of the GPU's +// VRAM. g.VRAM must be the PER-DEVICE ceiling (the smallest device on a +// multi-GPU host), not the summed total — the compute buffer can't be split. +// +// VRAM 0 (unknown) stays conservative rather than risk a per-device OOM; the +// GB10 / unified-memory path reports system RAM, so it still clears the guard. +func PhysicalBatchForContext(g GPU, ctx int) int { + if !g.IsNVIDIABlackwell() { + return DefaultPhysicalBatch + } + if ctx <= 0 { + ctx = DefaultContextSize + } + if g.VRAM == 0 { + return DefaultPhysicalBatch + } + extra := uint64(ctx) * uint64(BlackwellPhysicalBatch-DefaultPhysicalBatch) * computeBufferBytesPerCell + if extra <= g.VRAM/blackwellBatchHeadroomDivisor { + return BlackwellPhysicalBatch + } + return DefaultPhysicalBatch +} + // IsManagedPhysicalBatch reports whether n is a value PhysicalBatch assigns. // Callers that re-tune a value chosen by an upstream host (the distributed // router correcting the frontend's guess) use this to avoid clobbering an @@ -122,7 +175,12 @@ func hasParallelOption(opts []string) bool { // deterministic device — detection does a live nvidia-smi call. var localGPU = func() GPU { vendor, _ := xsysinfo.DetectGPUVendor() - vram, _ := xsysinfo.TotalAvailableVRAM() + // Use the SMALLEST device's VRAM, not the summed total: the parallel-slot + // tier and the batch headroom guard both reason about what fits on a single + // card, and per-device compute buffers can't be split across GPUs. Summing + // two 16 GiB cards into "32 GiB" is what over-provisioned multi-GPU hosts + // into OOM (issue #10485). + vram, _ := xsysinfo.MinPerGPUVRAM() return GPU{ Vendor: vendor, ComputeCapability: xsysinfo.NVIDIAComputeCapability(), @@ -137,10 +195,20 @@ func ApplyHardwareDefaults(cfg *ModelConfig, gpu GPU) { if cfg == nil { return } - if cfg.Batch == 0 && gpu.IsNVIDIABlackwell() { - cfg.Batch = BlackwellPhysicalBatch - xlog.Debug("[hardware_defaults] Blackwell GPU: defaulting physical batch", - "batch", cfg.Batch, "compute_cap", gpu.ComputeCapability) + // Raise the physical batch on Blackwell only when the resulting compute + // buffer fits the per-device VRAM at THIS model's context. Leaving Batch at 0 + // (rather than writing the default 512) preserves the downstream single-pass + // sizing in core/backend.EffectiveBatchSize for embedding/score/rerank. + if cfg.Batch == 0 { + ctx := DefaultContextSize + if cfg.ContextSize != nil { + ctx = *cfg.ContextSize + } + if PhysicalBatchForContext(gpu, ctx) == BlackwellPhysicalBatch { + cfg.Batch = BlackwellPhysicalBatch + xlog.Debug("[hardware_defaults] Blackwell GPU: defaulting physical batch", + "batch", cfg.Batch, "compute_cap", gpu.ComputeCapability, "context", ctx, "vram_gib", gpu.VRAM>>30) + } } // Enable concurrent serving by default on a capable GPU: without this the diff --git a/core/config/hardware_defaults_internal_test.go b/core/config/hardware_defaults_internal_test.go index 52c674c2d..d6878c86e 100644 --- a/core/config/hardware_defaults_internal_test.go +++ b/core/config/hardware_defaults_internal_test.go @@ -9,26 +9,37 @@ import ( // GPU. The detection seam (localGPU) is injected so the path is deterministic // without a real GPU. var _ = Describe("SetDefaults hardware defaults (single-instance)", func() { + const gib = uint64(1) << 30 + var orig func() GPU BeforeEach(func() { orig = localGPU }) AfterEach(func() { localGPU = orig }) - It("sets the physical batch on a local Blackwell GPU", func() { - localGPU = func() GPU { return GPU{ComputeCapability: "12.1"} } + It("sets the physical batch on a local Blackwell GPU with headroom", func() { + localGPU = func() GPU { return GPU{ComputeCapability: "12.1", VRAM: 119 * gib} } cfg := &ModelConfig{} cfg.SetDefaults() Expect(cfg.Batch).To(Equal(BlackwellPhysicalBatch)) }) + It("leaves batch unset when a large context would overflow the device", func() { + // Regression guard for issue #10485: 16 GiB consumer Blackwell + ~200k ctx. + localGPU = func() GPU { return GPU{ComputeCapability: "12.0", VRAM: 16 * gib} } + ctx := 204800 + cfg := &ModelConfig{LLMConfig: LLMConfig{ContextSize: &ctx}} + cfg.SetDefaults() + Expect(cfg.Batch).To(Equal(0)) + }) + It("leaves batch unset on a non-Blackwell local GPU", func() { - localGPU = func() GPU { return GPU{ComputeCapability: "8.9"} } + localGPU = func() GPU { return GPU{ComputeCapability: "8.9", VRAM: 119 * gib} } cfg := &ModelConfig{} cfg.SetDefaults() Expect(cfg.Batch).To(Equal(0)) }) It("never overrides an explicit batch", func() { - localGPU = func() GPU { return GPU{ComputeCapability: "12.1"} } + localGPU = func() GPU { return GPU{ComputeCapability: "12.1", VRAM: 119 * gib} } cfg := &ModelConfig{} cfg.Batch = 1024 cfg.SetDefaults() diff --git a/core/config/hardware_defaults_test.go b/core/config/hardware_defaults_test.go index ae7bf3964..3bc1bf297 100644 --- a/core/config/hardware_defaults_test.go +++ b/core/config/hardware_defaults_test.go @@ -7,6 +7,8 @@ import ( ) var _ = Describe("Hardware-driven config defaults", func() { + const gib = uint64(1) << 30 + DescribeTable("GPU.IsNVIDIABlackwell (sm_12x consumer family)", func(cc string, want bool) { Expect(GPU{ComputeCapability: cc}.IsNVIDIABlackwell()).To(Equal(want)) @@ -35,21 +37,54 @@ var _ = Describe("Hardware-driven config defaults", func() { }) }) + Describe("PhysicalBatchForContext (per-device VRAM headroom)", func() { + It("raises the batch when the compute buffer fits the device", func() { + // 16 GiB Blackwell with a small context: the extra scratch is tiny. + Expect(PhysicalBatchForContext(GPU{ComputeCapability: "12.0", VRAM: 16 * gib}, 8192)). + To(Equal(BlackwellPhysicalBatch)) + }) + It("keeps the default batch when a large context would overflow one device", func() { + // The issue #10485 case: 16 GiB consumer Blackwell, ~200k context. + Expect(PhysicalBatchForContext(GPU{ComputeCapability: "12.0", VRAM: 16 * gib}, 204800)). + To(Equal(DefaultPhysicalBatch)) + }) + It("still raises the batch on a large unified-memory device (GB10)", func() { + // GB10 reports system RAM (~119 GiB) as its single device's VRAM. + Expect(PhysicalBatchForContext(GPU{ComputeCapability: "12.1", VRAM: 119 * gib}, 204800)). + To(Equal(BlackwellPhysicalBatch)) + }) + It("stays conservative when VRAM is unknown", func() { + Expect(PhysicalBatchForContext(GPU{ComputeCapability: "12.1"}, 8192)). + To(Equal(DefaultPhysicalBatch)) + }) + It("never raises the batch on non-Blackwell", func() { + Expect(PhysicalBatchForContext(GPU{ComputeCapability: "9.0", VRAM: 80 * gib}, 8192)). + To(Equal(DefaultPhysicalBatch)) + }) + }) + Describe("ApplyHardwareDefaults", func() { - It("raises an unset batch to 2048 on Blackwell", func() { + It("raises an unset batch to 2048 on Blackwell with headroom", func() { cfg := &ModelConfig{} - ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1"}) + ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1", VRAM: 119 * gib}) Expect(cfg.Batch).To(Equal(BlackwellPhysicalBatch)) }) + It("leaves batch unset when a large context would overflow one device", func() { + // Regression guard for issue #10485: 16 GiB card + ~200k context. + ctx := 204800 + cfg := &ModelConfig{LLMConfig: LLMConfig{ContextSize: &ctx}} + ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.0", VRAM: 16 * gib}) + Expect(cfg.Batch).To(Equal(0)) + }) It("leaves batch unset on non-Blackwell", func() { cfg := &ModelConfig{} - ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "9.0"}) + ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "9.0", VRAM: 119 * gib}) Expect(cfg.Batch).To(Equal(0)) }) It("never overrides an explicit batch", func() { cfg := &ModelConfig{} cfg.Batch = 1024 - ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1"}) + ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1", VRAM: 119 * gib}) Expect(cfg.Batch).To(Equal(1024)) }) It("no-ops on nil", func() { @@ -57,8 +92,6 @@ var _ = Describe("Hardware-driven config defaults", func() { }) }) - const gib = uint64(1) << 30 - DescribeTable("DefaultParallelSlots (by VRAM)", func(vramGiB uint64, want int) { Expect(DefaultParallelSlots(GPU{VRAM: vramGiB * gib})).To(Equal(want)) diff --git a/core/config/model_config.go b/core/config/model_config.go index 8886ddfd5..2d1e18cc7 100644 --- a/core/config/model_config.go +++ b/core/config/model_config.go @@ -1204,11 +1204,6 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) { // This ensures gallery-installed and runtime-loaded models get optimal parameters. ApplyInferenceDefaults(cfg, cfg.Name, cfg.Model) - // Apply hardware-driven defaults (e.g. a larger physical batch on Blackwell). - // Uses the local GPU here; in distributed mode the router re-applies the same - // heuristics for the selected node's GPU before loading. Explicit config wins. - ApplyHardwareDefaults(cfg, localGPU()) - // Apply serving-policy defaults (device-independent): cross-request prefix // caching. Propagates to distributed nodes via the model options. ApplyServingDefaults(cfg) @@ -1247,6 +1242,16 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) { cfg.ContextSize = &ctx } runBackendHooks(cfg, lo.modelPath) + + // Apply hardware-driven defaults (e.g. a larger physical batch on Blackwell) + // LAST, after the context size is fully resolved (explicit config, LoadOptions, + // then the GGUF guess inside runBackendHooks): the Blackwell batch guard sizes + // the per-device compute buffer against this model's context, so it must see + // the final value, not a pre-guess nil. Uses the local GPU here; in distributed + // mode the router re-applies the same heuristics for the selected node's GPU + // before loading. Explicit config always wins. + ApplyHardwareDefaults(cfg, localGPU()) + cfg.syncKnownUsecasesFromString() } diff --git a/core/http/endpoints/openai/realtime_model.go b/core/http/endpoints/openai/realtime_model.go index 6843a521d..0dafa0a35 100644 --- a/core/http/endpoints/openai/realtime_model.go +++ b/core/http/endpoints/openai/realtime_model.go @@ -432,7 +432,7 @@ func loadSoundDetectionConfig(pipeline *config.Pipeline, cl *config.ModelConfigL if pipeline.SoundDetection == "" { return nil, nil } - cfg, err := cl.LoadModelConfigFileByName(pipeline.SoundDetection, ml.ModelPath) + cfg, err := loadPipelineSubModel(cl, pipeline.SoundDetection, ml.ModelPath) if err != nil { return nil, fmt.Errorf("failed to load sound detection config: %w", err) } @@ -443,7 +443,7 @@ func loadSoundDetectionConfig(pipeline *config.Pipeline, cl *config.ModelConfigL } func newTranscriptionOnlyModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) (Model, *config.ModelConfig, error) { - cfgVAD, err := cl.LoadModelConfigFileByName(pipeline.VAD, ml.ModelPath) + cfgVAD, err := loadPipelineSubModel(cl, pipeline.VAD, ml.ModelPath) if err != nil { return nil, nil, fmt.Errorf("failed to load backend config: %w", err) @@ -453,7 +453,7 @@ func newTranscriptionOnlyModel(pipeline *config.Pipeline, cl *config.ModelConfig return nil, nil, fmt.Errorf("failed to validate config: %w", err) } - cfgSST, err := cl.LoadModelConfigFileByName(pipeline.Transcription, ml.ModelPath) + cfgSST, err := loadPipelineSubModel(cl, pipeline.Transcription, ml.ModelPath) if err != nil { return nil, nil, fmt.Errorf("failed to load backend config: %w", err) @@ -542,11 +542,30 @@ func buildRealtimeRoutingContext(a *application.Application, sessionID string) * } } +// loadPipelineSubModel loads a pipeline sub-model config by name and follows a +// single alias hop, so a pipeline that references an alias (e.g. `llm: default`) +// gets the alias target's full config (Backend, Model, ...) rather than the +// alias stub with an empty Backend. Without this the alias survives unresolved +// into model loading and fails downstream — notably in distributed mode with +// "backend name is empty". Mirrors the top-level alias resolution in +// core/http/middleware/request.go. +func loadPipelineSubModel(cl *config.ModelConfigLoader, name, modelPath string) (*config.ModelConfig, error) { + cfg, err := cl.LoadModelConfigFileByName(name, modelPath) + if err != nil { + return nil, err + } + resolved, _, err := cl.ResolveAlias(cfg) + if err != nil { + return nil, err + } + return resolved, nil +} + // returns and loads either a wrapped model or a model that support audio-to-audio func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig, evaluator *templates.Evaluator, routing *RealtimeRoutingContext) (Model, error) { xlog.Debug("Creating new model pipeline model", "pipeline", pipeline) - cfgVAD, err := cl.LoadModelConfigFileByName(pipeline.VAD, ml.ModelPath) + cfgVAD, err := loadPipelineSubModel(cl, pipeline.VAD, ml.ModelPath) if err != nil { return nil, fmt.Errorf("failed to load backend config: %w", err) @@ -557,7 +576,7 @@ func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model } // TODO: Do we always need a transcription model? It can be disabled. Note that any-to-any instruction following models don't transcribe as such, so if transcription is required it is a separate process - cfgSST, err := cl.LoadModelConfigFileByName(pipeline.Transcription, ml.ModelPath) + cfgSST, err := loadPipelineSubModel(cl, pipeline.Transcription, ml.ModelPath) if err != nil { return nil, fmt.Errorf("failed to load backend config: %w", err) @@ -589,7 +608,7 @@ func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model xlog.Debug("Loading a wrapped model") // Otherwise we want to return a wrapped model, which is a "virtual" model that re-uses other models to perform operations - cfgLLM, err := cl.LoadModelConfigFileByName(pipeline.LLM, ml.ModelPath) + cfgLLM, err := loadPipelineSubModel(cl, pipeline.LLM, ml.ModelPath) if err != nil { return nil, fmt.Errorf("failed to load backend config: %w", err) @@ -604,7 +623,7 @@ func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model applyPipelineReasoning(cfgLLM, *pipeline) applyPipelineThinking(cfgLLM, *pipeline) - cfgTTS, err := cl.LoadModelConfigFileByName(pipeline.TTS, ml.ModelPath) + cfgTTS, err := loadPipelineSubModel(cl, pipeline.TTS, ml.ModelPath) if err != nil { return nil, fmt.Errorf("failed to load backend config: %w", err) diff --git a/core/http/endpoints/openai/realtime_model_alias_test.go b/core/http/endpoints/openai/realtime_model_alias_test.go new file mode 100644 index 000000000..77179d963 --- /dev/null +++ b/core/http/endpoints/openai/realtime_model_alias_test.go @@ -0,0 +1,52 @@ +package openai + +import ( + "os" + "path/filepath" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "github.com/mudler/LocalAI/core/config" +) + +// loadPipelineSubModel must resolve a pipeline sub-model that references an +// alias (e.g. `llm: default`) one hop to the alias target's full config — so +// the effective backend is the target's backend, not the empty backend of the +// alias stub. This mirrors the top-level alias resolution done in +// core/http/middleware/request.go, which the realtime pipeline previously +// skipped (failing in distributed mode with "backend name is empty"). +var _ = Describe("loadPipelineSubModel", func() { + It("resolves a sub-model alias one hop to the target's config", func() { + tmpDir := GinkgoT().TempDir() + + // A real model config with a concrete backend. + realLLM := `name: real-llm +backend: llama-cpp +parameters: + model: real-llm.gguf +` + Expect(os.WriteFile(filepath.Join(tmpDir, "real-llm.yaml"), []byte(realLLM), 0644)).To(Succeed()) + + // An alias pointing at the real model. + aliasCfg := `name: default +alias: real-llm +` + Expect(os.WriteFile(filepath.Join(tmpDir, "default.yaml"), []byte(aliasCfg), 0644)).To(Succeed()) + + cl := config.NewModelConfigLoader(tmpDir) + Expect(cl.LoadModelConfigsFromPath(tmpDir)).To(Succeed()) + + // Resolving the alias must follow the hop to the target's full config. + resolved, err := loadPipelineSubModel(cl, "default", tmpDir) + Expect(err).NotTo(HaveOccurred()) + Expect(resolved.IsAlias()).To(BeFalse()) + Expect(resolved.Backend).To(Equal("llama-cpp")) + + // A non-alias name must load unchanged. + direct, err := loadPipelineSubModel(cl, "real-llm", tmpDir) + Expect(err).NotTo(HaveOccurred()) + Expect(direct.Backend).To(Equal("llama-cpp")) + Expect(direct.Name).To(Equal("real-llm")) + }) +}) diff --git a/core/http/react-ui/public/locales/en/chat.json b/core/http/react-ui/public/locales/en/chat.json index de9d0507d..ffda226db 100644 --- a/core/http/react-ui/public/locales/en/chat.json +++ b/core/http/react-ui/public/locales/en/chat.json @@ -86,6 +86,7 @@ "input": { "placeholder": "Message...", "attachFile": "Attach file", + "send": "Send message", "stopGenerating": "Stop generating", "canvasTitle": "Canvas — extract code blocks and media into a side panel for preview, copy, and download", "canvasLabel": "Canvas", diff --git a/core/http/react-ui/public/locales/en/home.json b/core/http/react-ui/public/locales/en/home.json index fabd9e9dd..142767999 100644 --- a/core/http/react-ui/public/locales/en/home.json +++ b/core/http/react-ui/public/locales/en/home.json @@ -77,6 +77,20 @@ "noModelsTitle": "No Models Available", "noModelsBody": "There are no models installed yet. Ask your administrator to set up models so you can start chatting." }, + "starters": { + "title": "Recommended for your hardware", + "tier": { + "cpu": "CPU-only", + "gpu-small": "GPU", + "gpu-large": "GPU" + }, + "cpuNote": "No GPU detected — these small models stay responsive on CPU.", + "gpuNote": "Picked to fit your available VRAM with room for context.", + "install": "Install", + "installing": "Installing", + "installStarted": "Installing {{model}}…", + "installFailed": "Install failed: {{message}}" + }, "connect": { "title": "One endpoint, every API", "subtitle": "LocalAI serves its own full API — image & video generation, depth, object detection, reranking, audio, face & voice recognition, and realtime voice over WebRTC and WebSocket. On top of that, a drop-in compatibility layer lets any app built for OpenAI, Anthropic, Ollama or OpenAI Responses talk to it unchanged.", diff --git a/core/http/react-ui/src/App.css b/core/http/react-ui/src/App.css index cf1a46bd3..40eddc2e9 100644 --- a/core/http/react-ui/src/App.css +++ b/core/http/react-ui/src/App.css @@ -6363,6 +6363,59 @@ select.input { justify-content: center; } +/* ──────────────────── Home: hardware-aware starter models ──────────────────── */ + +.home-starters { + margin: var(--spacing-lg) 0; + padding: var(--spacing-lg); +} +.home-starters-head { + display: flex; + align-items: center; + justify-content: space-between; + gap: var(--spacing-md); +} +.home-starters-head strong { + font-size: 0.9375rem; +} +.home-starters-tier { + display: inline-flex; + align-items: center; + gap: var(--spacing-xs); + font-size: 0.75rem; + color: var(--color-text-muted); +} +.home-starters-sub { + margin: var(--spacing-xs) 0 var(--spacing-md); + font-size: 0.8125rem; + color: var(--color-text-secondary); +} +.home-starters-list { + list-style: none; + margin: 0; + padding: 0; + display: flex; + flex-direction: column; + gap: var(--spacing-xs); +} +.home-starters-item { + display: flex; + align-items: center; + gap: var(--spacing-md); + padding: var(--spacing-xs) 0; +} +.home-starters-name { + font-weight: 500; + font-size: 0.875rem; + word-break: break-all; +} +.home-starters-size { + margin-left: auto; + font-size: 0.75rem; + color: var(--color-text-muted); + white-space: nowrap; +} + /* ──────────────────── Home: drop-in endpoint / API compatibility ──────────────────── */ .home-connect { diff --git a/core/http/react-ui/src/components/ModelSelector.jsx b/core/http/react-ui/src/components/ModelSelector.jsx index 9009524ee..76a118ec9 100644 --- a/core/http/react-ui/src/components/ModelSelector.jsx +++ b/core/http/react-ui/src/components/ModelSelector.jsx @@ -1,8 +1,25 @@ -import { useEffect, useMemo } from 'react' +import { useEffect, useMemo, useCallback } from 'react' import { useModels } from '../hooks/useModels' import SearchableSelect from './SearchableSelect' import { useTranslation } from 'react-i18next' +// Remember the last model the user picked, keyed by capability, so returning to +// a page (Home chat box, Image, TTS, Talk...) defaults to that model instead of +// whatever happens to sort first. Only persisted when a capability key exists — +// `externalOptions` callers pass no capability and get the old first-item +// behaviour. localStorage access is wrapped because private-browsing modes throw. +const LAST_MODEL_PREFIX = 'localai_last_model:' + +function readLastModel(capability) { + if (!capability) return null + try { return localStorage.getItem(LAST_MODEL_PREFIX + capability) } catch { return null } +} + +function writeLastModel(capability, model) { + if (!capability || !model) return + try { localStorage.setItem(LAST_MODEL_PREFIX + capability, model) } catch { /* ignore */ } +} + export default function ModelSelector({ value, onChange, capability, className = '', options: externalOptions, loading: externalLoading, @@ -19,16 +36,27 @@ export default function ModelSelector({ const isLoading = externalOptions ? (externalLoading || false) : hookLoading const isDisabled = isLoading || (externalDisabled || false) + // Persist genuine selections so the next visit can restore them. + const handleChange = useCallback((next) => { + writeLastModel(capability, next) + onChange(next) + }, [capability, onChange]) + useEffect(() => { if (modelNames.length > 0 && (!value || !modelNames.includes(value))) { - onChange(modelNames[0]) + // Prefer the remembered model when it's still available; otherwise fall + // back to the first option. Don't re-persist here — auto-select is not a + // user choice, and writing back the stored value would be a harmless but + // pointless round-trip. + const remembered = readLastModel(capability) + onChange(remembered && modelNames.includes(remembered) ? remembered : modelNames[0]) } - }, [modelNames, value, onChange]) + }, [modelNames, value, onChange, capability]) return ( new Set()) + + const tier = useMemo(() => pickTier(resources), [resources]) + const candidates = tier.list + + // Verify candidates exist in the live gallery. One search per name (the tier + // has at most a handful) keeps this resilient to gallery customization. + useEffect(() => { + let cancelled = false + const names = [...new Set(candidates.map(c => c.name))] + Promise.all(names.map(name => + modelsApi.list({ search: name, page: 1 }) + .then(data => (data?.models || []).some(m => (m.name || m.id) === name) ? name : null) + .catch(() => null) + )).then(found => { + if (cancelled) return + const hits = found.filter(Boolean) + // If verification yielded nothing (e.g. gallery unreachable), fall back to + // showing the curated list rather than an empty widget. + setAvailable(hits.length > 0 ? new Set(hits) : null) + }) + return () => { cancelled = true } + }, [candidates]) + + const visible = available === null + ? candidates + : candidates.filter(c => available.has(c.name)) + + if (visible.length === 0) return null + + const install = async (name) => { + setInstalling(prev => new Set(prev).add(name)) + try { + await modelsApi.install(name) + addToast?.(t('starters.installStarted', { model: name }), 'success') + onInstallStarted?.(name) + } catch (err) { + addToast?.(t('starters.installFailed', { message: err.message }), 'error') + setInstalling(prev => { + const next = new Set(prev) + next.delete(name) + return next + }) + } + } + + return ( +
+
+ {t('starters.title')} + +