Compare commits

..

1 Commits

Author SHA1 Message Date
Ettore Di Giacinto
69e482b0a8 fix(model): deterministic, file-type-filtered backend auto-detect (#9287)
When a model config declares no explicit `backend:`, Load() fell into a
trial loop built by ranging the external-backends Go map (random order)
with no filtering, returning the first backend whose gRPC LoadModel
succeeded. An unrelated installed backend - e.g. the "opus" audio codec -
could therefore win a GGUF/LLM model load, so a model that should run on
llama.cpp wrongly tried to use opus.

Extract the candidate selection into a pure, testable function
SelectAutoLoadBackends that:

  - sorts the candidate list deterministically (no more map-order
    nondeterminism), and
  - for a `.gguf` model, filters to LLM-capable backends (via
    core/config.BackendCapabilities) and puts llama-cpp first, so an
    incompatible audio/codec/image backend can never win the trial loop.

If filtering would leave zero candidates, the full sorted set is returned
unchanged, so a previously-loadable model is never made unloadable.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-06-12 21:46:25 +00:00
20 changed files with 167 additions and 1759 deletions

View File

@@ -10,7 +10,7 @@ JOBS?=$(shell nproc --ignore=1)
# this on `master` always picks up the latest C-API surface (incl. the
# per-detection accessor functions used by golocateanythingcpp.go).
LOCATEANYTHING_REPO?=https://github.com/mudler/locate-anything.cpp.git
LOCATEANYTHING_VERSION?=92c1682da792c1e8a5dec91acc2be4b02c742ded
LOCATEANYTHING_VERSION?=60e450945476d5e97e0754a8c0e71a9ea81690e0
ifeq ($(NATIVE),false)
CMAKE_ARGS+=-DGGML_NATIVE=OFF

View File

@@ -1,5 +1,5 @@
--extra-index-url https://download.pytorch.org/whl/rocm7.0
torch==2.12.0+cpu
torch==2.10.0+rocm7.0
torchaudio
torchvision

View File

@@ -1,5 +1,5 @@
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.12.0+cpu
torch==2.10.0
transformers>=4.56.2
huggingface-hub>=1.3.0
sentencepiece

View File

@@ -1,4 +1,4 @@
torch==2.12.0+cpu
torch==2.10.0
transformers>=4.56.2
huggingface-hub>=1.3.0
sentencepiece

View File

@@ -1,7 +1,6 @@
--extra-index-url https://download.pytorch.org/whl/cpu
accelerate
torch==2.8.0
torchaudio==2.8.0
transformers==4.56.1
librosa==0.11.0
neucodec>=0.0.4

View File

@@ -3,7 +3,6 @@ neucodec>=0.0.4
phonemizer==3.3.0
soundfile==0.13.1
torch==2.8.0
torchaudio==2.8.0
transformers==4.56.1
resemble-perth==1.0.1
accelerate

View File

@@ -1,6 +1,6 @@
--extra-index-url https://download.pytorch.org/whl/cpu
accelerate
torch==2.12.0+cpu
torch==2.9.0
torchvision
torchaudio
transformers

View File

@@ -6,7 +6,7 @@
# for cublas12 so uv consults this index alongside PyPI.
--extra-index-url https://download.pytorch.org/whl/cu128
accelerate
torch==2.12.0+cpu
torch==2.9.1
torchvision
torchaudio
transformers

View File

@@ -1,4 +1,4 @@
accelerate
torch==2.12.0+cu130
torch==2.7.0
transformers
bitsandbytes

View File

@@ -307,19 +307,11 @@ func gRPCPredictOpts(c config.ModelConfig, modelPath string) *pb.PredictOptions
}
}
// TopK may be nil after SetDefaults for backends that don't use llama.cpp's
// top_k=40 default (issue #6632, e.g. mlx). proto3 int32 can't be unset, so
// send 0 — the value mlx actually wants (top-k disabled).
var topK int32
if c.TopK != nil {
topK = int32(*c.TopK)
}
pbOpts := &pb.PredictOptions{
Temperature: float32(*c.Temperature),
TopP: float32(*c.TopP),
NDraft: c.NDraft,
TopK: topK,
TopK: int32(*c.TopK),
MinP: float32(*c.MinP),
Tokens: int32(*c.Maxtokens),
Threads: int32(*c.Threads),

View File

@@ -517,33 +517,6 @@ func NormalizeBackendName(backend string) string {
return strings.ReplaceAll(backend, ".", "-")
}
// nonLlamaSamplerBackends lists backends whose native sampler defaults differ
// from llama.cpp's, so LocalAI must NOT inject llama.cpp's top_k=40 default for
// them (issue #6632). mlx_lm's intended default is top_k=0 (disabled) and mlx
// does not remap 0->40, so shipping 40 silently changes sampling for clients
// that omit top_k. Leaving TopK nil lets the wire value default to 0.
//
// This is intentionally a small, explicit list of KNOWN non-llama backends:
// empty and unknown backends fall through to the llama.cpp default to preserve
// the GGUF auto-detect path's behavior.
//
// Lookups are performed on the result of NormalizeBackendName, so every key
// here must already be in normalized form. The map is used purely as a set;
// the empty-struct values carry no information.
var nonLlamaSamplerBackends = map[string]struct{}{
"mlx": {},
"mlx-vlm": {},
"mlx-distributed": {},
}
// UsesLlamaSamplerDefaults tells the caller whether llama.cpp's sampler
// defaults (such as top_k=40) should be applied for the given backend.
//
// An empty backend name yields true, which keeps the GGUF auto-detect path
// (which resolves to llama.cpp) behaving as before. Any other name is
// normalized and looked up in nonLlamaSamplerBackends: a listed backend yields
// false, everything else - including backends this code has never heard of -
// yields true.
func UsesLlamaSamplerDefaults(backend string) bool {
	// No backend declared: treat it like llama.cpp.
	if len(backend) == 0 {
		return true
	}

	// Known non-llama backends opt out of the llama.cpp defaults.
	if _, listed := nonLlamaSamplerBackends[NormalizeBackendName(backend)]; listed {
		return false
	}
	return true
}
// GetBackendCapability returns the capability info for a backend, or nil if unknown.
// Handles backend name normalization.
func GetBackendCapability(backend string) *BackendCapability {

View File

@@ -867,12 +867,7 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
cfg.Seed = &defaultSeed
}
// top_k=40 is llama.cpp's sampling default and is wrong for backends whose
// native default differs (issue #6632). Only inject it for the llama.cpp
// family and the empty/auto backend; leave TopK nil for known non-llama
// backends (e.g. mlx, whose intended default is top_k=0) so the wire value
// is 0 rather than a silently-changed 40.
if cfg.TopK == nil && UsesLlamaSamplerDefaults(cfg.Backend) {
if cfg.TopK == nil {
cfg.TopK = &defaultTopK
}

View File

@@ -529,72 +529,4 @@ concurrency_groups:
"models that template in Go still rely on the Go-generated grammar")
})
})
// The default top_k=40 is llama.cpp's sampling default and is WRONG for
// backends whose native default differs. mlx_lm's intended default is
// top_k=0 (disabled) and mlx does not remap 0->40, so injecting 40 silently
// changes sampling for mlx clients that omit top_k (issue #6632). Gate the
// injection on backend family: keep 40 for the llama.cpp family and for the
// empty/auto backend (the GGUF auto-detect path resolves to llama.cpp), but
// leave TopK nil for the mlx family so the wire value is 0.
Context("TopK default is backend-gated (issue #6632)", func() {
It("injects top_k=40 for the llama.cpp backend", func() {
cfg := &ModelConfig{}
cfg.Backend = "llama-cpp"
cfg.SetDefaults()
Expect(cfg.TopK).NotTo(BeNil(), "llama.cpp must keep its top_k=40 default")
Expect(*cfg.TopK).To(Equal(40))
})
It("injects top_k=40 for the empty/auto backend (GGUF auto-detect)", func() {
cfg := &ModelConfig{}
cfg.SetDefaults()
Expect(cfg.TopK).NotTo(BeNil(), "empty backend resolves to llama.cpp; default unchanged")
Expect(*cfg.TopK).To(Equal(40))
})
It("leaves TopK nil for the mlx backend", func() {
cfg := &ModelConfig{}
cfg.Backend = "mlx"
cfg.SetDefaults()
Expect(cfg.TopK).To(BeNil(),
"mlx_lm's intended default is top_k=0 (disabled); LocalAI must not inject 40")
})
It("leaves TopK nil for the mlx-vlm backend", func() {
cfg := &ModelConfig{}
cfg.Backend = "mlx-vlm"
cfg.SetDefaults()
Expect(cfg.TopK).To(BeNil())
})
It("leaves TopK nil for the mlx-distributed backend", func() {
cfg := &ModelConfig{}
cfg.Backend = "mlx-distributed"
cfg.SetDefaults()
Expect(cfg.TopK).To(BeNil())
})
It("respects an explicit top_k even for the mlx backend", func() {
explicit := 7
cfg := &ModelConfig{}
cfg.Backend = "mlx"
cfg.TopK = &explicit
cfg.SetDefaults()
Expect(cfg.TopK).NotTo(BeNil())
Expect(*cfg.TopK).To(Equal(7))
})
})
})

View File

@@ -990,18 +990,8 @@ func updateSession(session *Session, update *types.SessionUnion, cl *config.Mode
}
if rt.Audio != nil && rt.Audio.Input != nil && rt.Audio.Input.Transcription != nil {
trUpd := rt.Audio.Input.Transcription
// A language-only update (e.g. a client forcing the STT language) carries
// an empty Model. Preserve the pipeline's configured transcription backend
// instead of blanking it — otherwise the next utterance transcribes against
// an empty model and the backend RPC fails with "unimplemented".
if trUpd.Model == "" && session.InputAudioTranscription != nil {
trUpd.Model = session.InputAudioTranscription.Model
}
session.InputAudioTranscription = trUpd
if trUpd.Model != "" {
session.ModelConfig.Pipeline.Transcription = trUpd.Model
}
session.InputAudioTranscription = rt.Audio.Input.Transcription
session.ModelConfig.Pipeline.Transcription = rt.Audio.Input.Transcription.Model
}
if rt.Model != "" || (rt.Audio != nil && rt.Audio.Output != nil && rt.Audio.Output.Voice != "") || (rt.Audio != nil && rt.Audio.Input != nil && rt.Audio.Input.Transcription != nil) {

View File

File diff suppressed because it is too large Load Diff

2
go.mod
View File

@@ -36,7 +36,7 @@ require (
github.com/mholt/archiver/v3 v3.5.1
github.com/microcosm-cc/bluemonday v1.0.27
github.com/modelcontextprotocol/go-sdk v1.5.0
github.com/mudler/cogito v0.10.1-0.20260609212329-bf4010d31047
github.com/mudler/cogito v0.9.5-0.20260315222927-63abdec7189b
github.com/mudler/edgevpn v0.34.0
github.com/mudler/go-processmanager v0.1.1
github.com/mudler/memory v0.0.0-20260406210934-424c1ecf2cf8

4
go.sum
View File

@@ -968,8 +968,8 @@ github.com/mschoch/smat v0.2.0 h1:8imxQsjDm8yFEAVBe7azKmKSgzSkZXDuKkSq9374khM=
github.com/mschoch/smat v0.2.0/go.mod h1:kc9mz7DoBKqDyiRL7VZN8KvXQMWeTaVnttLRXOlotKw=
github.com/mudler/LocalAGI v0.0.0-20260606071251-14aed1ae4336 h1:iKBkSnpisOvMVxFoYsAObvAuOqXBakRPMD0PWxWG5EE=
github.com/mudler/LocalAGI v0.0.0-20260606071251-14aed1ae4336/go.mod h1:U+g6u8mF2wQxhkdBl3dr8G4db1cv3n7KTKmraoJ7D0c=
github.com/mudler/cogito v0.10.1-0.20260609212329-bf4010d31047 h1:wJ8WbDah1YcpBNRDmovQro8JiR228YFk7TUqPCS4m04=
github.com/mudler/cogito v0.10.1-0.20260609212329-bf4010d31047/go.mod h1:6sfja3lcu2nWRzEc0wwqGNu/eCG3EWgij+8s7xyUeQ4=
github.com/mudler/cogito v0.9.5-0.20260315222927-63abdec7189b h1:A74T2Lauvg61KodYqsjTYDY05kPLcW+efVZjd23dghU=
github.com/mudler/cogito v0.9.5-0.20260315222927-63abdec7189b/go.mod h1:6sfja3lcu2nWRzEc0wwqGNu/eCG3EWgij+8s7xyUeQ4=
github.com/mudler/edgevpn v0.34.0 h1:qDrD/rCPFY/FdURbXudIZWihVKY4VOX3nMn3CcbeQEU=
github.com/mudler/edgevpn v0.34.0/go.mod h1:yki7uMi5LR9gSMrw8PdPieuxsrk8BLV2Ui7VBEmbbIA=
github.com/mudler/go-piper v0.0.0-20241023091659-2494246fd9fc h1:RxwneJl1VgvikiX28EkpdAyL4yQVnJMrbquKospjHyA=

99
pkg/model/autoload.go Normal file
View File

@@ -0,0 +1,99 @@
package model
import (
"slices"
"sort"
"strings"
"github.com/mudler/LocalAI/core/config"
)
// preferredGGUFBackend is tried first when auto-detecting the backend for a
// GGUF model, since GGUF is overwhelmingly llama.cpp's native format.
//
// SelectAutoLoadBackends compares installed backend names against this value
// with plain string equality and, when it is present, moves it to the front of
// the candidate list.
const preferredGGUFBackend = "llama-cpp"
// llmCapableUsecases are the BackendCapabilities usecases that signal a backend
// can serve a text/LLM GGUF model. A GGUF model that declares no explicit
// backend must only be auto-tried against backends carrying one of these
// usecases - never against audio/codec/image backends (e.g. opus) that happen
// to be installed alongside it (see issue #9287).
//
// isLLMCapableBackend treats a backend as LLM-capable as soon as any one of its
// PossibleUsecases appears in this list.
var llmCapableUsecases = []string{
config.UsecaseChat,
config.UsecaseCompletion,
config.UsecaseEdit,
config.UsecaseEmbeddings,
}
// SelectAutoLoadBackends computes which backends should be tried, and in what
// order, for a model whose configuration names no backend.
//
// Parameters:
//   - available: names of the installed backends. The caller collects them by
//     ranging over a Go map, so their order is arbitrary. The slice is copied
//     and never modified.
//   - modelFile: the model file name or path; it may be empty.
//
// Why it matters: (*ModelLoader).Load walks the returned list and keeps the
// first backend whose gRPC LoadModel call succeeds, so both the membership and
// the order of the list decide the winner. Before this function existed the
// list was built in map-iteration order with no filtering, which let an
// unrelated backend such as the "opus" audio codec win a GGUF/LLM load (#9287).
//
// Result:
//   - The names are always sorted, so the outcome no longer depends on map
//     iteration order.
//   - When modelFile ends in ".gguf" (case-insensitive), only LLM-capable
//     backends are kept, and llama-cpp - if installed - is moved to the front.
//   - When that filtering would leave nothing, the complete sorted list is
//     returned instead, so a model that used to load is never made unloadable.
func SelectAutoLoadBackends(available []string, modelFile string) []string {
	// Work on a private, sorted copy of the caller's slice.
	candidates := append([]string(nil), available...)
	sort.Strings(candidates)

	// Only GGUF files get the LLM-specific treatment below.
	if !isGGUFModelFile(modelFile) {
		return candidates
	}

	ordered := make([]string, 0, len(candidates))

	// The preferred backend, when installed, always leads the list.
	if slices.Contains(candidates, preferredGGUFBackend) {
		ordered = append(ordered, preferredGGUFBackend)
	}

	// Remaining LLM-capable backends follow in sorted order.
	for _, name := range candidates {
		if name != preferredGGUFBackend && isLLMCapableBackend(name) {
			ordered = append(ordered, name)
		}
	}

	// Nothing recognised as LLM-capable: rather than refusing to load, keep the
	// old "try everything" behaviour, now at least in a deterministic order.
	if len(ordered) == 0 {
		return candidates
	}
	return ordered
}
// isGGUFModelFile reports whether modelFile ends with the ".gguf" extension,
// ignoring letter case. An empty name, or one shorter than the extension, is
// not a GGUF file.
func isGGUFModelFile(modelFile string) bool {
	const ggufExt = ".gguf"

	if len(modelFile) < len(ggufExt) {
		return false
	}
	// Compare only the trailing bytes; the extension is pure ASCII, so a byte
	// slice of the same length is the right window to fold-compare.
	tail := modelFile[len(modelFile)-len(ggufExt):]
	return strings.EqualFold(tail, ggufExt)
}
// isLLMCapableBackend answers whether the named backend is positively known to
// serve text/LLM models, i.e. whether at least one of its possible usecases is
// listed in llmCapableUsecases.
//
// A backend with no entry in the capability map is reported as not capable.
// GGUF auto-detection only wants backends it can confirm handle LLMs; setups
// made up solely of unknown backends keep working through the zero-candidate
// fallback in SelectAutoLoadBackends.
func isLLMCapableBackend(name string) bool {
	info := config.GetBackendCapability(name)
	if info == nil {
		// Unknown backend: cannot be confirmed as LLM-capable.
		return false
	}

	return slices.ContainsFunc(info.PossibleUsecases, func(usecase string) bool {
		return slices.Contains(llmCapableUsecases, usecase)
	})
}

View File

@@ -0,0 +1,46 @@
package model_test
import (
"github.com/mudler/LocalAI/pkg/model"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)
// Specs for model.SelectAutoLoadBackends, the candidate selection used when a
// model declares no explicit backend. Every case passes a plain slice of
// backend names and a model file name and inspects the returned slice.
var _ = Describe("SelectAutoLoadBackends (#9287)", func() {
// Cases where the model file name ends in ".gguf".
Describe("GGUF model auto-detection", func() {
It("excludes incompatible audio/codec backends (e.g. opus) for a .gguf model", func() {
// Regression for #9287: installing an unrelated audio backend like
// "opus" must never win the GGUF auto-detect trial loop.
got := model.SelectAutoLoadBackends([]string{"opus", "llama-cpp"}, "Qwen3.5-9b.gguf")
Expect(got).NotTo(ContainElement("opus"))
Expect(got).To(ContainElement("llama-cpp"))
})
It("places llama-cpp first for a .gguf model", func() {
// "llama-cpp" sorts before "vllm" anyway, so this case only pins that
// it is the head of the list; it does not depend on whether "vllm" is
// kept by the filter.
got := model.SelectAutoLoadBackends([]string{"vllm", "opus", "llama-cpp"}, "model.gguf")
Expect(got).NotTo(BeEmpty())
Expect(got[0]).To(Equal("llama-cpp"))
})
It("is deterministic regardless of input ordering", func() {
// Same set of names in two different input orders must produce the
// exact same output slice.
a := model.SelectAutoLoadBackends([]string{"opus", "vllm", "llama-cpp", "whisper"}, "m.gguf")
b := model.SelectAutoLoadBackends([]string{"whisper", "llama-cpp", "vllm", "opus"}, "m.gguf")
Expect(a).To(Equal(b))
})
It("falls back to the full sorted set when filtering leaves no candidate", func() {
// No LLM-capable backend installed: never make a previously-loadable
// model unloadable, return the original set (sorted).
// NOTE(review): relies on "opus" not being reported as LLM-capable by
// the capability map - confirm if that map changes.
got := model.SelectAutoLoadBackends([]string{"opus"}, "model.gguf")
Expect(got).To(Equal([]string{"opus"}))
})
})
// Cases where the model name does not end in ".gguf": no filtering applies.
Describe("non-GGUF model auto-detection", func() {
It("returns a deterministic (sorted) set without filtering", func() {
// All three names come back, in lexical order.
got := model.SelectAutoLoadBackends([]string{"opus", "llama-cpp", "diffusers"}, "model-dir")
Expect(got).To(Equal([]string{"diffusers", "llama-cpp", "opus"}))
})
})
})

View File

@@ -350,14 +350,17 @@ func (ml *ModelLoader) Load(opts ...Option) (grpc.Backend, error) {
// Otherwise scan for backends in the asset directory
var err error
// get backends embedded in the binary
autoLoadBackends := []string{}
// append externalBackends supplied by the user via the CLI
// Collect the installed/external backends (the map is unordered).
available := []string{}
for b := range ml.GetAllExternalBackends(o) {
autoLoadBackends = append(autoLoadBackends, b)
available = append(available, b)
}
// Build a deterministic, file-type-filtered candidate list so an
// incompatible backend (e.g. an audio codec like opus) can never win the
// trial loop for a GGUF/LLM model. See SelectAutoLoadBackends / #9287.
autoLoadBackends := SelectAutoLoadBackends(available, o.model)
if len(autoLoadBackends) == 0 {
xlog.Error("No backends found")
return nil, fmt.Errorf("no backends found")