Compare commits

..

1 Commits

Author SHA1 Message Date
Ettore Di Giacinto
0c9666c295 fix(config): gate top_k=40 default on backend family (#6632)
SetDefaults injected top_k=40 (llama.cpp's sampling default) for every
model config regardless of backend. That value is wrong for backends
whose native default differs: mlx_lm's intended default is top_k=0
(disabled) and mlx does not remap 0->40, so a client that omits top_k
silently got 40 shipped to mlx, changing sampling. The mlx backend's own
getattr(request,'TopK',0) fallback is dead because proto3 int32 is always
present.

Gate the injection on backend family via UsesLlamaSamplerDefaults: keep
top_k=40 for the llama.cpp family and for the empty/auto backend (the GGUF
auto-detect path resolves to llama.cpp, so existing behavior is preserved),
but leave TopK nil for the known non-llama backends (mlx, mlx-vlm,
mlx-distributed). gRPCPredictOpts now sends 0 when TopK is nil, which is
the value mlx actually wants.

Only TopK is gated - the confirmed bug. The sibling sampler defaults
(top_p, temperature, min_p) are left global to avoid widening scope and
introducing nil-deref risk; revisit per-backend if needed.

Assisted-by: claude:claude-opus-4-8 [Claude Code]

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-06-12 21:45:22 +00:00
4 changed files with 110 additions and 2 deletions

View File

@@ -307,11 +307,19 @@ func gRPCPredictOpts(c config.ModelConfig, modelPath string) *pb.PredictOptions
}
}
// TopK may be nil after SetDefaults for backends that don't use llama.cpp's
// top_k=40 default (issue #6632, e.g. mlx). proto3 int32 can't be unset, so
// send 0 — the value mlx actually wants (top-k disabled).
var topK int32
if c.TopK != nil {
topK = int32(*c.TopK)
}
pbOpts := &pb.PredictOptions{
Temperature: float32(*c.Temperature),
TopP: float32(*c.TopP),
NDraft: c.NDraft,
TopK: int32(*c.TopK),
TopK: topK,
MinP: float32(*c.MinP),
Tokens: int32(*c.Maxtokens),
Threads: int32(*c.Threads),

View File

@@ -517,6 +517,33 @@ func NormalizeBackendName(backend string) string {
return strings.ReplaceAll(backend, ".", "-")
}
// nonLlamaSamplerBackends is the set of backend names that must NOT be given
// llama.cpp's top_k=40 sampler default (issue #6632). Per that issue, mlx_lm
// intends top_k=0 (disabled) and mlx does not remap 0->40, so injecting 40
// would silently alter sampling for clients that omit top_k. When TopK is left
// nil for these backends the value sent on the wire ends up as 0.
//
// Only KNOWN non-llama backends are listed here on purpose. Anything absent
// from the set, including empty and unrecognized names, keeps the llama.cpp
// default so the GGUF auto-detect path behaves as before.
var nonLlamaSamplerBackends = map[string]struct{}{
	"mlx-distributed": {},
	"mlx-vlm":         {},
	"mlx":             {},
}
// UsesLlamaSamplerDefaults tells the caller whether llama.cpp's sampler
// defaults (e.g. top_k=40) should be applied for the given backend.
//
// It returns true for the empty backend and for any name that, once passed
// through NormalizeBackendName, is not a key of nonLlamaSamplerBackends. This
// keeps the current behavior for the GGUF auto-detect path, which resolves to
// llama.cpp. It returns false only for names found in nonLlamaSamplerBackends.
func UsesLlamaSamplerDefaults(backend string) bool {
	// No backend configured: treat it as the llama.cpp family.
	if len(backend) == 0 {
		return true
	}
	if _, listed := nonLlamaSamplerBackends[NormalizeBackendName(backend)]; listed {
		return false
	}
	return true
}
// GetBackendCapability returns the capability info for a backend, or nil if unknown.
// Handles backend name normalization.
func GetBackendCapability(backend string) *BackendCapability {

View File

@@ -867,7 +867,12 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
cfg.Seed = &defaultSeed
}
if cfg.TopK == nil {
// top_k=40 is llama.cpp's sampling default and is wrong for backends whose
// native default differs (issue #6632). Only inject it for the llama.cpp
// family and the empty/auto backend; leave TopK nil for known non-llama
// backends (e.g. mlx, whose intended default is top_k=0) so the wire value
// is 0 rather than a silently-changed 40.
if cfg.TopK == nil && UsesLlamaSamplerDefaults(cfg.Backend) {
cfg.TopK = &defaultTopK
}

View File

@@ -529,4 +529,72 @@ concurrency_groups:
"models that template in Go still rely on the Go-generated grammar")
})
})
// top_k=40 is llama.cpp's sampling default, and it is the WRONG value for
// backends with a different native default. Per issue #6632, mlx_lm intends
// top_k=0 (disabled) and mlx does not remap 0->40, so an injected 40 silently
// changes sampling for mlx clients that omit top_k. These specs pin the
// backend-family gate: 40 is kept for the llama.cpp family and for the
// empty/auto backend (the GGUF auto-detect path resolves to llama.cpp), while
// the mlx family keeps TopK nil so the wire value is 0.
Context("TopK default is backend-gated (issue #6632)", func() {
	// withDefaults builds a ModelConfig with the given backend and optional
	// explicit top_k, runs SetDefaults on it, and returns the result.
	withDefaults := func(backend string, topK *int) *ModelConfig {
		mc := &ModelConfig{}
		mc.Backend = backend
		mc.TopK = topK
		mc.SetDefaults()
		return mc
	}

	It("injects top_k=40 for the llama.cpp backend", func() {
		got := withDefaults("llama-cpp", nil)
		Expect(got.TopK).NotTo(BeNil(), "llama.cpp must keep its top_k=40 default")
		Expect(*got.TopK).To(Equal(40))
	})

	It("injects top_k=40 for the empty/auto backend (GGUF auto-detect)", func() {
		got := withDefaults("", nil)
		Expect(got.TopK).NotTo(BeNil(), "empty backend resolves to llama.cpp; default unchanged")
		Expect(*got.TopK).To(Equal(40))
	})

	It("leaves TopK nil for the mlx backend", func() {
		got := withDefaults("mlx", nil)
		Expect(got.TopK).To(BeNil(),
			"mlx_lm's intended default is top_k=0 (disabled); LocalAI must not inject 40")
	})

	It("leaves TopK nil for the mlx-vlm backend", func() {
		got := withDefaults("mlx-vlm", nil)
		Expect(got.TopK).To(BeNil())
	})

	It("leaves TopK nil for the mlx-distributed backend", func() {
		got := withDefaults("mlx-distributed", nil)
		Expect(got.TopK).To(BeNil())
	})

	It("respects an explicit top_k even for the mlx backend", func() {
		// A caller-supplied value must survive SetDefaults untouched.
		explicit := 7
		got := withDefaults("mlx", &explicit)
		Expect(got.TopK).NotTo(BeNil())
		Expect(*got.TopK).To(Equal(7))
	})
})
})