Compare commits

..

3 Commits

Author SHA1 Message Date
LocalAI [bot]
a906438a69 fix(config): backend-gate the top_k=40 sampler default (#6632) (#10285)
fix(config): gate top_k=40 default on backend family (#6632)

SetDefaults injected top_k=40 (llama.cpp's sampling default) for every
model config regardless of backend. That value is wrong for backends
whose native default differs: mlx_lm's intended default is top_k=0
(disabled) and mlx does not remap 0->40, so a client that omits top_k
silently got 40 shipped to mlx, changing sampling. The mlx backend's own
getattr(request,'TopK',0) fallback is dead because proto3 int32 is always
present.

Gate the injection on backend family via UsesLlamaSamplerDefaults: keep
top_k=40 for the llama.cpp family and for the empty/auto backend (the GGUF
auto-detect path resolves to llama.cpp, so existing behavior is preserved),
but leave TopK nil for the known non-llama backends (mlx, mlx-vlm,
mlx-distributed). gRPCPredictOpts now sends 0 when TopK is nil, which is
the value mlx actually wants.

Only TopK is gated - the confirmed bug. The sibling sampler defaults
(top_p, temperature, min_p) are left global to avoid widening scope and
introducing nil-deref risk; revisit per-backend if needed.

Assisted-by: claude:claude-opus-4-8 [Claude Code]

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
2026-06-13 09:04:25 +02:00
LocalAI [bot]
d28a5b6da1 chore: ⬆️ Update mudler/locate-anything.cpp to 92c1682da792c1e8a5dec91acc2be4b02c742ded (#10282)
⬆️ Update mudler/locate-anything.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2026-06-13 09:01:17 +02:00
LocalAI [bot]
edeacf22c4 fix(realtime): keep transcription model on a language-only session.update (#10295)
A transcription session.update that carries only a language (no model) —
e.g. a client forcing the STT input language — has an empty
Transcription.Model. updateSession unconditionally copied that into
session.ModelConfig.Pipeline.Transcription, blanking the pipeline's
configured transcription backend. The next utterance then transcribed
against an empty model and the backend RPC failed with "unimplemented"
(surfaced to the client as transcription_failed), so transcription
silently stopped whenever a language was selected.

Only adopt the incoming transcription model when it is non-empty, and
preserve the existing model otherwise (mirroring updateTransSession).

Signed-off-by: mudler <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-13 01:01:36 +02:00
8 changed files with 123 additions and 7 deletions

View File

@@ -10,7 +10,7 @@ JOBS?=$(shell nproc --ignore=1)
# this on `master` always picks up the latest C-API surface (incl. the
# per-detection accessor functions used by golocateanythingcpp.go).
LOCATEANYTHING_REPO?=https://github.com/mudler/locate-anything.cpp.git
LOCATEANYTHING_VERSION?=60e450945476d5e97e0754a8c0e71a9ea81690e0
LOCATEANYTHING_VERSION?=92c1682da792c1e8a5dec91acc2be4b02c742ded
ifeq ($(NATIVE),false)
CMAKE_ARGS+=-DGGML_NATIVE=OFF

View File

@@ -1,7 +1,6 @@
--extra-index-url https://download.pytorch.org/whl/cpu
accelerate
torch==2.8.0
torchaudio==2.8.0
transformers==4.56.1
librosa==0.11.0
neucodec>=0.0.4

View File

@@ -3,7 +3,6 @@ neucodec>=0.0.4
phonemizer==3.3.0
soundfile==0.13.1
torch==2.8.0
torchaudio==2.8.0
transformers==4.56.1
resemble-perth==1.0.1
accelerate

View File

@@ -307,11 +307,19 @@ func gRPCPredictOpts(c config.ModelConfig, modelPath string) *pb.PredictOptions
}
}
// TopK may be nil after SetDefaults for backends that don't use llama.cpp's
// top_k=40 default (issue #6632, e.g. mlx). proto3 int32 can't be unset, so
// send 0 — the value mlx actually wants (top-k disabled).
var topK int32
if c.TopK != nil {
topK = int32(*c.TopK)
}
pbOpts := &pb.PredictOptions{
Temperature: float32(*c.Temperature),
TopP: float32(*c.TopP),
NDraft: c.NDraft,
TopK: int32(*c.TopK),
TopK: topK,
MinP: float32(*c.MinP),
Tokens: int32(*c.Maxtokens),
Threads: int32(*c.Threads),

View File

@@ -517,6 +517,33 @@ func NormalizeBackendName(backend string) string {
return strings.ReplaceAll(backend, ".", "-")
}
// nonLlamaSamplerBackends lists backends whose native sampler defaults differ
// from llama.cpp's, so LocalAI must NOT inject llama.cpp's top_k=40 default for
// them (issue #6632). mlx_lm's intended default is top_k=0 (disabled) and mlx
// does not remap 0->40, so shipping 40 silently changes sampling for clients
// that omit top_k. Leaving TopK nil lets the wire value default to 0.
//
// This is intentionally a small allow-list of KNOWN non-llama backends: empty
// and unknown backends fall through to the llama.cpp default to preserve the
// GGUF auto-detect path's behavior.
var nonLlamaSamplerBackends = map[string]struct{}{
"mlx": {},
"mlx-vlm": {},
"mlx-distributed": {},
}
// UsesLlamaSamplerDefaults reports whether a backend should receive llama.cpp's
// sampler defaults (e.g. top_k=40). Empty/unknown backends return true so the
// GGUF auto-detect path (which resolves to llama.cpp) keeps today's behavior;
// only the known non-llama backends in nonLlamaSamplerBackends return false.
func UsesLlamaSamplerDefaults(backend string) bool {
if backend == "" {
return true
}
_, isNonLlama := nonLlamaSamplerBackends[NormalizeBackendName(backend)]
return !isNonLlama
}
// GetBackendCapability returns the capability info for a backend, or nil if unknown.
// Handles backend name normalization.
func GetBackendCapability(backend string) *BackendCapability {

View File

@@ -867,7 +867,12 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
cfg.Seed = &defaultSeed
}
if cfg.TopK == nil {
// top_k=40 is llama.cpp's sampling default and is wrong for backends whose
// native default differs (issue #6632). Only inject it for the llama.cpp
// family and the empty/auto backend; leave TopK nil for known non-llama
// backends (e.g. mlx, whose intended default is top_k=0) so the wire value
// is 0 rather than a silently-changed 40.
if cfg.TopK == nil && UsesLlamaSamplerDefaults(cfg.Backend) {
cfg.TopK = &defaultTopK
}

View File

@@ -529,4 +529,72 @@ concurrency_groups:
"models that template in Go still rely on the Go-generated grammar")
})
})
// The default top_k=40 is llama.cpp's sampling default and is WRONG for
// backends whose native default differs. mlx_lm's intended default is
// top_k=0 (disabled) and mlx does not remap 0->40, so injecting 40 silently
// changes sampling for mlx clients that omit top_k (issue #6632). Gate the
// injection on backend family: keep 40 for the llama.cpp family and for the
// empty/auto backend (the GGUF auto-detect path resolves to llama.cpp), but
// leave TopK nil for the mlx family so the wire value is 0.
Context("TopK default is backend-gated (issue #6632)", func() {
It("injects top_k=40 for the llama.cpp backend", func() {
cfg := &ModelConfig{}
cfg.Backend = "llama-cpp"
cfg.SetDefaults()
Expect(cfg.TopK).NotTo(BeNil(), "llama.cpp must keep its top_k=40 default")
Expect(*cfg.TopK).To(Equal(40))
})
It("injects top_k=40 for the empty/auto backend (GGUF auto-detect)", func() {
cfg := &ModelConfig{}
cfg.SetDefaults()
Expect(cfg.TopK).NotTo(BeNil(), "empty backend resolves to llama.cpp; default unchanged")
Expect(*cfg.TopK).To(Equal(40))
})
It("leaves TopK nil for the mlx backend", func() {
cfg := &ModelConfig{}
cfg.Backend = "mlx"
cfg.SetDefaults()
Expect(cfg.TopK).To(BeNil(),
"mlx_lm's intended default is top_k=0 (disabled); LocalAI must not inject 40")
})
It("leaves TopK nil for the mlx-vlm backend", func() {
cfg := &ModelConfig{}
cfg.Backend = "mlx-vlm"
cfg.SetDefaults()
Expect(cfg.TopK).To(BeNil())
})
It("leaves TopK nil for the mlx-distributed backend", func() {
cfg := &ModelConfig{}
cfg.Backend = "mlx-distributed"
cfg.SetDefaults()
Expect(cfg.TopK).To(BeNil())
})
It("respects an explicit top_k even for the mlx backend", func() {
explicit := 7
cfg := &ModelConfig{}
cfg.Backend = "mlx"
cfg.TopK = &explicit
cfg.SetDefaults()
Expect(cfg.TopK).NotTo(BeNil())
Expect(*cfg.TopK).To(Equal(7))
})
})
})

View File

@@ -990,8 +990,18 @@ func updateSession(session *Session, update *types.SessionUnion, cl *config.Mode
}
if rt.Audio != nil && rt.Audio.Input != nil && rt.Audio.Input.Transcription != nil {
session.InputAudioTranscription = rt.Audio.Input.Transcription
session.ModelConfig.Pipeline.Transcription = rt.Audio.Input.Transcription.Model
trUpd := rt.Audio.Input.Transcription
// A language-only update (e.g. a client forcing the STT language) carries
// an empty Model. Preserve the pipeline's configured transcription backend
// instead of blanking it — otherwise the next utterance transcribes against
// an empty model and the backend RPC fails with "unimplemented".
if trUpd.Model == "" && session.InputAudioTranscription != nil {
trUpd.Model = session.InputAudioTranscription.Model
}
session.InputAudioTranscription = trUpd
if trUpd.Model != "" {
session.ModelConfig.Pipeline.Transcription = trUpd.Model
}
}
if rt.Model != "" || (rt.Audio != nil && rt.Audio.Output != nil && rt.Audio.Output.Voice != "") || (rt.Audio != nil && rt.Audio.Input != nil && rt.Audio.Input.Transcription != nil) {