fix(model): deterministic, file-type-filtered backend auto-detect (#9287)

When a model config declares no explicit `backend:`, Load() fell into a trial loop built by ranging the external-backends Go map (random order) with no filtering, returning the first backend whose gRPC LoadModel succeeded. An unrelated installed backend - e.g. the "opus" audio codec - could therefore win a GGUF/LLM model load, so a model that should run on llama.cpp wrongly tried to use opus.

Extract the candidate selection into a pure, testable function, SelectAutoLoadBackends, that:

- sorts the candidate list deterministically (no more map-order nondeterminism), and
- for a `.gguf` model, filters to LLM-capable backends (via core/config.BackendCapabilities) and puts llama-cpp first, so an incompatible audio/codec/image backend can never win the trial loop.

If filtering would leave zero candidates, the full sorted set is returned unchanged, so a previously-loadable model is never made unloadable.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-06-16 12:49:08 -04:00 · 2026-06-12 21:46:25 +00:00
20 changed files with 167 additions and 1759 deletions
--- a/backend/go/locate-anything-cpp/Makefile
+++ b/backend/go/locate-anything-cpp/Makefile
@@ -10,7 +10,7 @@ JOBS?=$(shell nproc --ignore=1)
 # this on `master` always picks up the latest C-API surface (incl. the
 # per-detection accessor functions used by golocateanythingcpp.go).
 LOCATEANYTHING_REPO?=https://github.com/mudler/locate-anything.cpp.git
-LOCATEANYTHING_VERSION?=92c1682da792c1e8a5dec91acc2be4b02c742ded
+LOCATEANYTHING_VERSION?=60e450945476d5e97e0754a8c0e71a9ea81690e0

 ifeq ($(NATIVE),false)
 	CMAKE_ARGS+=-DGGML_NATIVE=OFF
--- a/backend/python/ace-step/requirements-hipblas.txt
+++ b/backend/python/ace-step/requirements-hipblas.txt
@@ -1,5 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/rocm7.0
-torch==2.12.0+cpu
+torch==2.10.0+rocm7.0
 torchaudio
 torchvision

--- a/backend/python/llama-cpp-quantization/requirements-cpu.txt
+++ b/backend/python/llama-cpp-quantization/requirements-cpu.txt
@@ -1,5 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
-torch==2.12.0+cpu
+torch==2.10.0
 transformers>=4.56.2
 huggingface-hub>=1.3.0
 sentencepiece
--- a/backend/python/llama-cpp-quantization/requirements-mps.txt
+++ b/backend/python/llama-cpp-quantization/requirements-mps.txt
@@ -1,4 +1,4 @@
-torch==2.12.0+cpu
+torch==2.10.0
 transformers>=4.56.2
 huggingface-hub>=1.3.0
 sentencepiece
--- a/backend/python/neutts/requirements-cpu.txt
+++ b/backend/python/neutts/requirements-cpu.txt
@@ -1,7 +1,6 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
 accelerate
 torch==2.8.0
-torchaudio==2.8.0
 transformers==4.56.1
 librosa==0.11.0
 neucodec>=0.0.4
--- a/backend/python/neutts/requirements-cublas12.txt
+++ b/backend/python/neutts/requirements-cublas12.txt
@@ -3,7 +3,6 @@ neucodec>=0.0.4
 phonemizer==3.3.0
 soundfile==0.13.1
 torch==2.8.0
-torchaudio==2.8.0
 transformers==4.56.1
 resemble-perth==1.0.1
 accelerate
--- a/backend/python/sglang/requirements-cpu.txt
+++ b/backend/python/sglang/requirements-cpu.txt
@@ -1,6 +1,6 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
 accelerate
-torch==2.12.0+cpu
+torch==2.9.0
 torchvision
 torchaudio
 transformers
--- a/backend/python/sglang/requirements-cublas12.txt
+++ b/backend/python/sglang/requirements-cublas12.txt
@@ -6,7 +6,7 @@
 # for cublas12 so uv consults this index alongside PyPI.
 --extra-index-url https://download.pytorch.org/whl/cu128
 accelerate
-torch==2.12.0+cpu
+torch==2.9.1
 torchvision
 torchaudio
 transformers
--- a/backend/python/vllm-omni/requirements-cublas12.txt
+++ b/backend/python/vllm-omni/requirements-cublas12.txt
@@ -1,4 +1,4 @@
 accelerate
-torch==2.12.0+cu130
+torch==2.7.0
 transformers
 bitsandbytes
--- a/core/backend/options.go
+++ b/core/backend/options.go
@@ -307,19 +307,11 @@ func gRPCPredictOpts(c config.ModelConfig, modelPath string) *pb.PredictOptions
 		}
 	}

-	// TopK may be nil after SetDefaults for backends that don't use llama.cpp's
-	// top_k=40 default (issue #6632, e.g. mlx). proto3 int32 can't be unset, so
-	// send 0 — the value mlx actually wants (top-k disabled).
-	var topK int32
-	if c.TopK != nil {
-		topK = int32(*c.TopK)
-	}
-
 	pbOpts := &pb.PredictOptions{
 		Temperature:         float32(*c.Temperature),
 		TopP:                float32(*c.TopP),
 		NDraft:              c.NDraft,
-		TopK:                topK,
+		TopK:                int32(*c.TopK),
 		MinP:                float32(*c.MinP),
 		Tokens:              int32(*c.Maxtokens),
 		Threads:             int32(*c.Threads),
--- a/core/config/backend_capabilities.go
+++ b/core/config/backend_capabilities.go
@@ -517,33 +517,6 @@ func NormalizeBackendName(backend string) string {
 	return strings.ReplaceAll(backend, ".", "-")
 }

-// nonLlamaSamplerBackends lists backends whose native sampler defaults differ
-// from llama.cpp's, so LocalAI must NOT inject llama.cpp's top_k=40 default for
-// them (issue #6632). mlx_lm's intended default is top_k=0 (disabled) and mlx
-// does not remap 0->40, so shipping 40 silently changes sampling for clients
-// that omit top_k. Leaving TopK nil lets the wire value default to 0.
-//
-// This is intentionally a small allow-list of KNOWN non-llama backends: empty
-// and unknown backends fall through to the llama.cpp default to preserve the
-// GGUF auto-detect path's behavior.
-var nonLlamaSamplerBackends = map[string]struct{}{
-	"mlx":             {},
-	"mlx-vlm":         {},
-	"mlx-distributed": {},
-}
-
-// UsesLlamaSamplerDefaults reports whether a backend should receive llama.cpp's
-// sampler defaults (e.g. top_k=40). Empty/unknown backends return true so the
-// GGUF auto-detect path (which resolves to llama.cpp) keeps today's behavior;
-// only the known non-llama backends in nonLlamaSamplerBackends return false.
-func UsesLlamaSamplerDefaults(backend string) bool {
-	if backend == "" {
-		return true
-	}
-	_, isNonLlama := nonLlamaSamplerBackends[NormalizeBackendName(backend)]
-	return !isNonLlama
-}
-
 // GetBackendCapability returns the capability info for a backend, or nil if unknown.
 // Handles backend name normalization.
 func GetBackendCapability(backend string) *BackendCapability {
--- a/core/config/model_config.go
+++ b/core/config/model_config.go
@@ -867,12 +867,7 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
 		cfg.Seed = &defaultSeed
 	}

-	// top_k=40 is llama.cpp's sampling default and is wrong for backends whose
-	// native default differs (issue #6632). Only inject it for the llama.cpp
-	// family and the empty/auto backend; leave TopK nil for known non-llama
-	// backends (e.g. mlx, whose intended default is top_k=0) so the wire value
-	// is 0 rather than a silently-changed 40.
-	if cfg.TopK == nil && UsesLlamaSamplerDefaults(cfg.Backend) {
+	if cfg.TopK == nil {
 		cfg.TopK = &defaultTopK
 	}

--- a/core/config/model_config_test.go
+++ b/core/config/model_config_test.go
@@ -529,72 +529,4 @@ concurrency_groups:
 				"models that template in Go still rely on the Go-generated grammar")
 		})
 	})
-
-	// The default top_k=40 is llama.cpp's sampling default and is WRONG for
-	// backends whose native default differs. mlx_lm's intended default is
-	// top_k=0 (disabled) and mlx does not remap 0->40, so injecting 40 silently
-	// changes sampling for mlx clients that omit top_k (issue #6632). Gate the
-	// injection on backend family: keep 40 for the llama.cpp family and for the
-	// empty/auto backend (the GGUF auto-detect path resolves to llama.cpp), but
-	// leave TopK nil for the mlx family so the wire value is 0.
-	Context("TopK default is backend-gated (issue #6632)", func() {
-		It("injects top_k=40 for the llama.cpp backend", func() {
-			cfg := &ModelConfig{}
-			cfg.Backend = "llama-cpp"
-
-			cfg.SetDefaults()
-
-			Expect(cfg.TopK).NotTo(BeNil(), "llama.cpp must keep its top_k=40 default")
-			Expect(*cfg.TopK).To(Equal(40))
-		})
-
-		It("injects top_k=40 for the empty/auto backend (GGUF auto-detect)", func() {
-			cfg := &ModelConfig{}
-
-			cfg.SetDefaults()
-
-			Expect(cfg.TopK).NotTo(BeNil(), "empty backend resolves to llama.cpp; default unchanged")
-			Expect(*cfg.TopK).To(Equal(40))
-		})
-
-		It("leaves TopK nil for the mlx backend", func() {
-			cfg := &ModelConfig{}
-			cfg.Backend = "mlx"
-
-			cfg.SetDefaults()
-
-			Expect(cfg.TopK).To(BeNil(),
-				"mlx_lm's intended default is top_k=0 (disabled); LocalAI must not inject 40")
-		})
-
-		It("leaves TopK nil for the mlx-vlm backend", func() {
-			cfg := &ModelConfig{}
-			cfg.Backend = "mlx-vlm"
-
-			cfg.SetDefaults()
-
-			Expect(cfg.TopK).To(BeNil())
-		})
-
-		It("leaves TopK nil for the mlx-distributed backend", func() {
-			cfg := &ModelConfig{}
-			cfg.Backend = "mlx-distributed"
-
-			cfg.SetDefaults()
-
-			Expect(cfg.TopK).To(BeNil())
-		})
-
-		It("respects an explicit top_k even for the mlx backend", func() {
-			explicit := 7
-			cfg := &ModelConfig{}
-			cfg.Backend = "mlx"
-			cfg.TopK = &explicit
-
-			cfg.SetDefaults()
-
-			Expect(cfg.TopK).NotTo(BeNil())
-			Expect(*cfg.TopK).To(Equal(7))
-		})
-	})
 })
--- a/core/http/endpoints/openai/realtime.go
+++ b/core/http/endpoints/openai/realtime.go
@@ -990,18 +990,8 @@ func updateSession(session *Session, update *types.SessionUnion, cl *config.Mode
 	}

 	if rt.Audio != nil && rt.Audio.Input != nil && rt.Audio.Input.Transcription != nil {
-		trUpd := rt.Audio.Input.Transcription
-		// A language-only update (e.g. a client forcing the STT language) carries
-		// an empty Model. Preserve the pipeline's configured transcription backend
-		// instead of blanking it — otherwise the next utterance transcribes against
-		// an empty model and the backend RPC fails with "unimplemented".
-		if trUpd.Model == "" && session.InputAudioTranscription != nil {
-			trUpd.Model = session.InputAudioTranscription.Model
-		}
-		session.InputAudioTranscription = trUpd
-		if trUpd.Model != "" {
-			session.ModelConfig.Pipeline.Transcription = trUpd.Model
-		}
+		session.InputAudioTranscription = rt.Audio.Input.Transcription
+		session.ModelConfig.Pipeline.Transcription = rt.Audio.Input.Transcription.Model
 	}

 	if rt.Model != "" || (rt.Audio != nil && rt.Audio.Output != nil && rt.Audio.Output.Voice != "") || (rt.Audio != nil && rt.Audio.Input != nil && rt.Audio.Input.Transcription != nil) {
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
--- a/go.mod
+++ b/go.mod
@@ -36,7 +36,7 @@ require (
 	github.com/mholt/archiver/v3 v3.5.1
 	github.com/microcosm-cc/bluemonday v1.0.27
 	github.com/modelcontextprotocol/go-sdk v1.5.0
-	github.com/mudler/cogito v0.10.1-0.20260609212329-bf4010d31047
+	github.com/mudler/cogito v0.9.5-0.20260315222927-63abdec7189b
 	github.com/mudler/edgevpn v0.34.0
 	github.com/mudler/go-processmanager v0.1.1
 	github.com/mudler/memory v0.0.0-20260406210934-424c1ecf2cf8
--- a/go.sum
+++ b/go.sum
@@ -968,8 +968,8 @@ github.com/mschoch/smat v0.2.0 h1:8imxQsjDm8yFEAVBe7azKmKSgzSkZXDuKkSq9374khM=
 github.com/mschoch/smat v0.2.0/go.mod h1:kc9mz7DoBKqDyiRL7VZN8KvXQMWeTaVnttLRXOlotKw=
 github.com/mudler/LocalAGI v0.0.0-20260606071251-14aed1ae4336 h1:iKBkSnpisOvMVxFoYsAObvAuOqXBakRPMD0PWxWG5EE=
 github.com/mudler/LocalAGI v0.0.0-20260606071251-14aed1ae4336/go.mod h1:U+g6u8mF2wQxhkdBl3dr8G4db1cv3n7KTKmraoJ7D0c=
-github.com/mudler/cogito v0.10.1-0.20260609212329-bf4010d31047 h1:wJ8WbDah1YcpBNRDmovQro8JiR228YFk7TUqPCS4m04=
-github.com/mudler/cogito v0.10.1-0.20260609212329-bf4010d31047/go.mod h1:6sfja3lcu2nWRzEc0wwqGNu/eCG3EWgij+8s7xyUeQ4=
+github.com/mudler/cogito v0.9.5-0.20260315222927-63abdec7189b h1:A74T2Lauvg61KodYqsjTYDY05kPLcW+efVZjd23dghU=
+github.com/mudler/cogito v0.9.5-0.20260315222927-63abdec7189b/go.mod h1:6sfja3lcu2nWRzEc0wwqGNu/eCG3EWgij+8s7xyUeQ4=
 github.com/mudler/edgevpn v0.34.0 h1:qDrD/rCPFY/FdURbXudIZWihVKY4VOX3nMn3CcbeQEU=
 github.com/mudler/edgevpn v0.34.0/go.mod h1:yki7uMi5LR9gSMrw8PdPieuxsrk8BLV2Ui7VBEmbbIA=
 github.com/mudler/go-piper v0.0.0-20241023091659-2494246fd9fc h1:RxwneJl1VgvikiX28EkpdAyL4yQVnJMrbquKospjHyA=
--- a/pkg/model/autoload.go
+++ b/pkg/model/autoload.go
@@ -0,0 +1,99 @@
+package model
+
+import (
+	"slices"
+	"sort"
+	"strings"
+
+	"github.com/mudler/LocalAI/core/config"
+)
+
+// preferredGGUFBackend is tried first (when installed) while auto-detecting
+// the backend of a GGUF model, since GGUF is llama.cpp's native format.
+const preferredGGUFBackend = "llama-cpp"
+
+// llmCapableUsecases are the BackendCapabilities usecases that signal a backend
+// can serve a text/LLM GGUF model; carrying any one of them is enough. A GGUF
+// model that declares no explicit backend must only be auto-tried against
+// backends carrying one of these usecases - never against audio/codec/image
+// backends (e.g. opus) that happen to be installed alongside it (issue #9287).
+var llmCapableUsecases = []string{
+	config.UsecaseChat,
+	config.UsecaseCompletion,
+	config.UsecaseEdit,
+	config.UsecaseEmbeddings,
+}
+
+// SelectAutoLoadBackends returns the ordered, deterministic list of backend
+// names to try when loading a model that declares no explicit backend.
+//
+// available holds the installed backend names in arbitrary order (the caller
+// collects them by ranging a Go map); it is never modified. modelFile is the
+// model file name or path and may be empty.
+//
+// (*ModelLoader).Load tries the returned names in order and keeps the first
+// backend whose gRPC LoadModel succeeds, so both the order and the membership
+// of this list decide which backend wins. Ranging the map directly let an
+// unrelated backend such as the "opus" audio codec win a GGUF load (#9287).
+//
+// Behaviour:
+//   - Names are sorted and de-duplicated, so the result depends neither on
+//     input order nor on repeated entries.
+//   - For a model file ending in ".gguf" (case-insensitive) only LLM-capable
+//     backends are kept, with llama-cpp first when it is installed.
+//   - If that filter would leave nothing, the full sorted set is returned
+//     instead, so a model that previously loaded never becomes unloadable.
+func SelectAutoLoadBackends(available []string, modelFile string) []string {
+	// Work on a private copy so the caller's slice is left untouched: sorting
+	// and compacting both operate in place.
+	candidates := append([]string(nil), available...)
+	sort.Strings(candidates)
+	// A repeated name would make the trial loop retry a backend that has
+	// already failed; equal names are adjacent after sorting.
+	candidates = slices.Compact(candidates)
+
+	if !isGGUFModelFile(modelFile) {
+		return candidates
+	}
+
+	// Seed the list with the preferred backend when installed, then append the
+	// remaining LLM-capable backends in sorted order.
+	ordered := make([]string, 0, len(candidates))
+	if slices.Contains(candidates, preferredGGUFBackend) {
+		ordered = append(ordered, preferredGGUFBackend)
+	}
+	for _, name := range candidates {
+		if name != preferredGGUFBackend && isLLMCapableBackend(name) {
+			ordered = append(ordered, name)
+		}
+	}
+
+	if len(ordered) == 0 {
+		// No known LLM-capable backend is installed: rather than refuse to
+		// load, try every installed backend, in deterministic order.
+		return candidates
+	}
+	return ordered
+}
+
+// isGGUFModelFile reports whether modelFile ends in ".gguf". The comparison is
+// case-insensitive, so "MODEL.GGUF" matches; an empty name does not.
+func isGGUFModelFile(modelFile string) bool {
+	lowered := strings.ToLower(modelFile)
+	return strings.HasSuffix(lowered, ".gguf")
+}
+
+// isLLMCapableBackend reports whether the capability map lists name with at
+// least one usecase from llmCapableUsecases. A backend missing from the map
+// counts as not capable: GGUF auto-detection only keeps backends that are
+// positively known to serve LLMs, and the caller's zero-candidate fallback
+// keeps setups made of unknown backends working.
+func isLLMCapableBackend(name string) bool {
+	known := config.GetBackendCapability(name)
+	if known == nil {
+		return false
+	}
+	return slices.ContainsFunc(known.PossibleUsecases, func(usecase string) bool {
+		return slices.Contains(llmCapableUsecases, usecase)
+	})
+}
--- a/pkg/model/autoload_test.go
+++ b/pkg/model/autoload_test.go
@@ -0,0 +1,46 @@
+package model_test
+
+import (
+	"github.com/mudler/LocalAI/pkg/model"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("SelectAutoLoadBackends (#9287)", func() {
+	Describe("GGUF model auto-detection", func() {
+		It("excludes incompatible audio/codec backends (e.g. opus) for a .gguf model", func() {
+			// Regression for #9287: an unrelated installed audio backend such as
+			// "opus" must be filtered out of the GGUF candidate list entirely.
+			got := model.SelectAutoLoadBackends([]string{"opus", "llama-cpp"}, "Qwen3.5-9b.gguf")
+			Expect(got).NotTo(ContainElement("opus"))
+			Expect(got).To(ContainElement("llama-cpp"))
+		})
+
+		It("places llama-cpp first for a .gguf model", func() {
+			got := model.SelectAutoLoadBackends([]string{"vllm", "opus", "llama-cpp"}, "model.gguf") // NOTE(review): plain sorting also puts llama-cpp first here
+			Expect(got).NotTo(BeEmpty())
+			Expect(got[0]).To(Equal("llama-cpp"))
+		})
+
+		It("is deterministic regardless of input ordering", func() {
+			a := model.SelectAutoLoadBackends([]string{"opus", "vllm", "llama-cpp", "whisper"}, "m.gguf")
+			b := model.SelectAutoLoadBackends([]string{"whisper", "llama-cpp", "vllm", "opus"}, "m.gguf")
+			Expect(a).To(Equal(b))
+		})
+
+		It("falls back to the full sorted set when filtering leaves no candidate", func() {
+			// Only "opus" is installed, so the LLM filter would leave nothing;
+			// the unfiltered sorted set comes back and the model stays loadable.
+			got := model.SelectAutoLoadBackends([]string{"opus"}, "model.gguf")
+			Expect(got).To(Equal([]string{"opus"}))
+		})
+	})
+
+	Describe("non-GGUF model auto-detection", func() {
+		It("returns a deterministic (sorted) set without filtering", func() {
+			got := model.SelectAutoLoadBackends([]string{"opus", "llama-cpp", "diffusers"}, "model-dir")
+			Expect(got).To(Equal([]string{"diffusers", "llama-cpp", "opus"}))
+		})
+	})
+})
--- a/pkg/model/initializers.go
+++ b/pkg/model/initializers.go
@@ -350,14 +350,17 @@ func (ml *ModelLoader) Load(opts ...Option) (grpc.Backend, error) {
 	// Otherwise scan for backends in the asset directory
 	var err error

-	// get backends embedded in the binary
-	autoLoadBackends := []string{}
-
-	// append externalBackends supplied by the user via the CLI
+	// Collect the installed/external backends (the map is unordered).
+	available := []string{}
 	for b := range ml.GetAllExternalBackends(o) {
-		autoLoadBackends = append(autoLoadBackends, b)
+		available = append(available, b)
 	}

+	// Build a deterministic, file-type-filtered candidate list so an
+	// incompatible backend (e.g. an audio codec like opus) can never win the
+	// trial loop for a GGUF/LLM model. See SelectAutoLoadBackends / #9287.
+	autoLoadBackends := SelectAutoLoadBackends(available, o.model)
+
 	if len(autoLoadBackends) == 0 {
 		xlog.Error("No backends found")
 		return nil, fmt.Errorf("no backends found")