fix(config): backend-gate the top_k=40 sampler default (#6632) (#10285)

fix(config): gate top_k=40 default on backend family (#6632) SetDefaults injected top_k=40 (llama.cpp's sampling default) for every model config regardless of backend. That value is wrong for backends whose native default differs: mlx_lm's intended default is top_k=0 (disabled) and mlx does not remap 0->40, so a client that omits top_k silently got 40 shipped to mlx, changing sampling. The mlx backend's own getattr(request,'TopK',0) fallback is dead because proto3 int32 is always present. Gate the injection on backend family via UsesLlamaSamplerDefaults: keep top_k=40 for the llama.cpp family and for the empty/auto backend (the GGUF auto-detect path resolves to llama.cpp, so existing behavior is preserved), but leave TopK nil for the known non-llama backends (mlx, mlx-vlm, mlx-distributed). gRPCPredictOpts now sends 0 when TopK is nil, which is the value mlx actually wants. Only TopK is gated - the confirmed bug. The sibling sampler defaults (top_p, temperature, min_p) are left global to avoid widening scope and introducing nil-deref risk; revisit per-backend if needed. Assisted-by: claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
chore: ⬆️ Update mudler/locate-anything.cpp to 92c1682da792c1e8a5dec91acc2be4b02c742ded (#10282)
2026-06-13 03:09:03 -04:00 · 2026-06-13 09:04:25 +02:00 · 2026-06-13 09:01:17 +02:00 · 2026-06-13 01:01:36 +02:00
8 changed files with 123 additions and 7 deletions
--- a/backend/go/locate-anything-cpp/Makefile
+++ b/backend/go/locate-anything-cpp/Makefile
@@ -10,7 +10,7 @@ JOBS?=$(shell nproc --ignore=1)
 # this on `master` always picks up the latest C-API surface (incl. the
 # per-detection accessor functions used by golocateanythingcpp.go).
 LOCATEANYTHING_REPO?=https://github.com/mudler/locate-anything.cpp.git
-LOCATEANYTHING_VERSION?=60e450945476d5e97e0754a8c0e71a9ea81690e0
+LOCATEANYTHING_VERSION?=92c1682da792c1e8a5dec91acc2be4b02c742ded

 ifeq ($(NATIVE),false)
 	CMAKE_ARGS+=-DGGML_NATIVE=OFF
--- a/backend/python/neutts/requirements-cpu.txt
+++ b/backend/python/neutts/requirements-cpu.txt
@@ -1,7 +1,6 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
 accelerate
 torch==2.8.0
-torchaudio==2.8.0
 transformers==4.56.1
 librosa==0.11.0
 neucodec>=0.0.4
--- a/backend/python/neutts/requirements-cublas12.txt
+++ b/backend/python/neutts/requirements-cublas12.txt
@@ -3,7 +3,6 @@ neucodec>=0.0.4
 phonemizer==3.3.0
 soundfile==0.13.1
 torch==2.8.0
-torchaudio==2.8.0
 transformers==4.56.1
 resemble-perth==1.0.1
 accelerate
--- a/core/backend/options.go
+++ b/core/backend/options.go
@@ -307,11 +307,19 @@ func gRPCPredictOpts(c config.ModelConfig, modelPath string) *pb.PredictOptions
 		}
 	}

+	// TopK may be nil after SetDefaults for backends that don't use llama.cpp's
+	// top_k=40 default (issue #6632, e.g. mlx). proto3 int32 can't be unset, so
+	// send 0 — the value mlx actually wants (top-k disabled).
+	var topK int32
+	if c.TopK != nil {
+		topK = int32(*c.TopK)
+	}
+
 	pbOpts := &pb.PredictOptions{
 		Temperature:         float32(*c.Temperature),
 		TopP:                float32(*c.TopP),
 		NDraft:              c.NDraft,
-		TopK:                int32(*c.TopK),
+		TopK:                topK,
 		MinP:                float32(*c.MinP),
 		Tokens:              int32(*c.Maxtokens),
 		Threads:             int32(*c.Threads),
--- a/core/config/backend_capabilities.go
+++ b/core/config/backend_capabilities.go
@@ -517,6 +517,33 @@ func NormalizeBackendName(backend string) string {
 	return strings.ReplaceAll(backend, ".", "-")
 }

+// nonLlamaSamplerBackends lists backends whose native sampler defaults differ
+// from llama.cpp's, so LocalAI must NOT inject llama.cpp's top_k=40 default for
+// them (issue #6632). mlx_lm's intended default is top_k=0 (disabled) and mlx
+// does not remap 0->40, so shipping 40 silently changes sampling for clients
+// that omit top_k. Leaving TopK nil lets the wire value default to 0.
+//
+// This is intentionally a small exclusion list of KNOWN non-llama backends:
+// empty and unknown backends fall through to the llama.cpp default to preserve
+// the GGUF auto-detect path's behavior. Lookups use normalized backend names.
+var nonLlamaSamplerBackends = map[string]struct{}{
+	"mlx":             {},
+	"mlx-vlm":         {},
+	"mlx-distributed": {},
+}
+
+// UsesLlamaSamplerDefaults reports whether a backend should receive llama.cpp's
+// sampler defaults (e.g. top_k=40). Only names that, once normalized, appear in
+// nonLlamaSamplerBackends return false; empty and unknown backends return true
+// so the GGUF auto-detect path (which resolves to llama.cpp) keeps its behavior.
+func UsesLlamaSamplerDefaults(backend string) bool {
+	if backend == "" {
+		return true
+	}
+	_, isNonLlama := nonLlamaSamplerBackends[NormalizeBackendName(backend)]
+	return !isNonLlama
+}
+
 // GetBackendCapability returns the capability info for a backend, or nil if unknown.
 // Handles backend name normalization.
 func GetBackendCapability(backend string) *BackendCapability {
--- a/core/config/model_config.go
+++ b/core/config/model_config.go
@@ -867,7 +867,12 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
 		cfg.Seed = &defaultSeed
 	}

-	if cfg.TopK == nil {
+	// top_k=40 is llama.cpp's sampling default and is wrong for backends whose
+	// native default differs (issue #6632). Inject it for every backend except
+	// the known non-llama ones reported by UsesLlamaSamplerDefaults (e.g. mlx,
+	// whose intended default is top_k=0); for those, leave TopK nil so the wire
+	// value is 0 rather than a silently-changed 40.
+	if cfg.TopK == nil && UsesLlamaSamplerDefaults(cfg.Backend) {
 		cfg.TopK = &defaultTopK
 	}

--- a/core/config/model_config_test.go
+++ b/core/config/model_config_test.go
@@ -529,4 +529,72 @@ concurrency_groups:
 				"models that template in Go still rely on the Go-generated grammar")
 		})
 	})
+
+	// The default top_k=40 is llama.cpp's sampling default and is WRONG for
+	// backends whose native default differs. mlx_lm's intended default is
+	// top_k=0 (disabled) and mlx does not remap 0->40, so injecting 40 silently
+	// changes sampling for mlx clients that omit top_k (issue #6632). Gate the
+	// injection on backend family: keep 40 for the llama.cpp family and for the
+	// empty/auto backend (the GGUF auto-detect path resolves to llama.cpp), but
+	// leave TopK nil for the mlx family so the wire value is 0.
+	Context("TopK default is backend-gated (issue #6632)", func() {
+		It("injects top_k=40 for the llama.cpp backend", func() {
+			cfg := &ModelConfig{}
+			cfg.Backend = "llama-cpp"
+
+			cfg.SetDefaults()
+
+			Expect(cfg.TopK).NotTo(BeNil(), "llama.cpp must keep its top_k=40 default")
+			Expect(*cfg.TopK).To(Equal(40))
+		})
+
+		It("injects top_k=40 for the empty/auto backend (GGUF auto-detect)", func() {
+			cfg := &ModelConfig{}
+
+			cfg.SetDefaults()
+
+			Expect(cfg.TopK).NotTo(BeNil(), "empty backend resolves to llama.cpp; default unchanged")
+			Expect(*cfg.TopK).To(Equal(40))
+		})
+
+		It("leaves TopK nil for the mlx backend", func() {
+			cfg := &ModelConfig{}
+			cfg.Backend = "mlx"
+
+			cfg.SetDefaults()
+
+			Expect(cfg.TopK).To(BeNil(),
+				"mlx_lm's intended default is top_k=0 (disabled); LocalAI must not inject 40")
+		})
+
+		It("leaves TopK nil for the mlx-vlm backend", func() {
+			cfg := &ModelConfig{}
+			cfg.Backend = "mlx-vlm"
+
+			cfg.SetDefaults()
+
+			Expect(cfg.TopK).To(BeNil())
+		})
+
+		It("leaves TopK nil for the mlx-distributed backend", func() {
+			cfg := &ModelConfig{}
+			cfg.Backend = "mlx-distributed"
+
+			cfg.SetDefaults()
+
+			Expect(cfg.TopK).To(BeNil())
+		})
+
+		It("respects an explicit top_k even for the mlx backend", func() {
+			explicit := 7
+			cfg := &ModelConfig{}
+			cfg.Backend = "mlx"
+			cfg.TopK = &explicit
+
+			cfg.SetDefaults()
+
+			Expect(cfg.TopK).NotTo(BeNil())
+			Expect(*cfg.TopK).To(Equal(7))
+		})
+	})
 })
--- a/core/http/endpoints/openai/realtime.go
+++ b/core/http/endpoints/openai/realtime.go
@@ -990,8 +990,18 @@ func updateSession(session *Session, update *types.SessionUnion, cl *config.Mode
 	}

 	if rt.Audio != nil && rt.Audio.Input != nil && rt.Audio.Input.Transcription != nil {
-		session.InputAudioTranscription = rt.Audio.Input.Transcription
-		session.ModelConfig.Pipeline.Transcription = rt.Audio.Input.Transcription.Model
+		trUpd := rt.Audio.Input.Transcription
+		// A language-only update (e.g. a client forcing the STT language) carries
+		// an empty Model. Preserve the pipeline's configured transcription backend
+		// instead of blanking it — otherwise the next utterance transcribes against
+		// an empty model and the backend RPC fails with "unimplemented".
+		if trUpd.Model == "" && session.InputAudioTranscription != nil {
+			trUpd.Model = session.InputAudioTranscription.Model
+		}
+		session.InputAudioTranscription = trUpd
+		if trUpd.Model != "" {
+			session.ModelConfig.Pipeline.Transcription = trUpd.Model
+		}
 	}

 	if rt.Model != "" || (rt.Audio != nil && rt.Audio.Output != nil && rt.Audio.Output.Voice != "") || (rt.Audio != nil && rt.Audio.Input != nil && rt.Audio.Input.Transcription != nil) {