fix(model): deterministic, file-type-filtered backend auto-detect (#9287)

When a model config declares no explicit `backend:`, Load() fell into a trial loop built by ranging the external-backends Go map (random order) with no filtering, returning the first backend whose gRPC LoadModel succeeded. An unrelated installed backend - e.g. the "opus" audio codec - could therefore win a GGUF/LLM model load, so a model that should run on llama.cpp wrongly tried to use opus.

Extract the candidate selection into a pure, testable function, SelectAutoLoadBackends, that:

- sorts the candidate list deterministically (no more map-order nondeterminism), and
- for a `.gguf` model, filters to LLM-capable backends (via core/config.BackendCapabilities) and puts llama-cpp first, so an incompatible audio/codec/image backend can never win the trial loop.

If filtering would leave zero candidates, the full sorted set is returned unchanged, so a previously-loadable model is never made unloadable.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-06-12 18:58:49 -04:00 · 2026-06-12 21:46:25 +00:00
6 changed files with 158 additions and 247 deletions
--- a/pkg/model/autoload.go
+++ b/pkg/model/autoload.go
@@ -0,0 +1,99 @@
+package model
+
+import (
+	"slices"
+	"sort"
+	"strings"
+
+	"github.com/mudler/LocalAI/core/config"
+)
+
+// preferredGGUFBackend is the backend tried first (when it is installed) for a
+// .gguf model with no explicit backend: GGUF is llama.cpp's native format.
+const preferredGGUFBackend = "llama-cpp"
+
+// llmCapableUsecases lists the BackendCapabilities usecases that mark a backend
+// as able to serve a text/LLM GGUF model. When a .gguf model declares no
+// explicit backend, only backends advertising at least one of these usecases
+// are kept as candidates, which drops audio/codec/image backends (e.g. opus)
+// unless that filter would leave nothing to try (see issue #9287).
+var llmCapableUsecases = []string{
+	config.UsecaseChat,
+	config.UsecaseCompletion,
+	config.UsecaseEdit,
+	config.UsecaseEmbeddings,
+}
+
+// SelectAutoLoadBackends returns the ordered, deterministic list of backend
+// names to try when loading a model that declares no explicit backend.
+//
+// available holds the installed backend names in any order (it comes from a Go
+// map) and is not modified. modelFile is the model file name/path (may be empty).
+//
+// The trial loop in (*ModelLoader).Load picks the first backend whose gRPC
+// LoadModel succeeds, so the order and membership of this list directly decide
+// which backend wins. The previous implementation ranged a Go map (random
+// order) with no filtering, so an unrelated installed backend such as the
+// "opus" audio codec could win a GGUF/LLM model load (#9287).
+//
+// Behaviour:
+//   - Candidates are sorted by name up front, so the result is deterministic
+//     and auto-detect no longer depends on map iteration order.
+//   - For a model file ending in ".gguf" (any case) only llama-cpp, placed
+//     first whenever it is installed, and the other LLM-capable backends are
+//     kept, so audio/codec/image backends are dropped from the trial loop.
+//   - If that filter keeps nothing, the full sorted set is returned instead, so
+//     a previously loadable model stays loadable (any backend may then win).
+func SelectAutoLoadBackends(available []string, modelFile string) []string {
+	sorted := append([]string(nil), available...) // copy: the caller's slice stays untouched
+	sort.Strings(sorted)
+
+	if !isGGUFModelFile(modelFile) {
+		return sorted // non-GGUF: deterministic order only, no filtering
+	}
+
+	filtered := make([]string, 0, len(sorted))
+	hasLlama := false
+	for _, b := range sorted {
+		if b == preferredGGUFBackend {
+			hasLlama = true
+			continue // prepended below so it always leads the list
+		}
+		if isLLMCapableBackend(b) {
+			filtered = append(filtered, b)
+		}
+	}
+	if hasLlama {
+		filtered = append([]string{preferredGGUFBackend}, filtered...)
+	}
+
+	if len(filtered) == 0 {
+		// Conservative fallback: neither llama-cpp nor any backend known to be
+		// LLM-capable is installed. Rather than refuse to load, keep the old
+		// behaviour of trying every installed backend, now in sorted order.
+		return sorted
+	}
+	return filtered
+}
+
+func isGGUFModelFile(modelFile string) bool {
+	return strings.HasSuffix(strings.ToLower(modelFile), ".gguf") // case-insensitive suffix match
+}
+
+// isLLMCapableBackend reports whether a backend advertises at least one of the
+// llmCapableUsecases among its PossibleUsecases. A backend for which
+// config.GetBackendCapability returns nil (unknown) counts as not LLM-capable:
+// GGUF auto-detection only keeps backends it can positively confirm, and the
+// zero-candidate fallback in SelectAutoLoadBackends keeps unknown setups working.
+func isLLMCapableBackend(name string) bool {
+	capability := config.GetBackendCapability(name)
+	if capability == nil {
+		return false // no capability entry: LLM support cannot be confirmed
+	}
+	for _, u := range capability.PossibleUsecases {
+		if slices.Contains(llmCapableUsecases, u) {
+			return true
+		}
+	}
+	return false
+}
--- a/pkg/model/autoload_test.go
+++ b/pkg/model/autoload_test.go
@@ -0,0 +1,46 @@
+package model_test
+
+import (
+	"github.com/mudler/LocalAI/pkg/model"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("SelectAutoLoadBackends (#9287)", func() {
+	Describe("GGUF model auto-detection", func() {
+		It("excludes incompatible audio/codec backends (e.g. opus) for a .gguf model", func() {
+			// Regression for #9287: an installed but unrelated audio backend such
+			// as "opus" must be filtered out of the GGUF candidate list entirely.
+			got := model.SelectAutoLoadBackends([]string{"opus", "llama-cpp"}, "Qwen3.5-9b.gguf")
+			Expect(got).NotTo(ContainElement("opus"))
+			Expect(got).To(ContainElement("llama-cpp"))
+		})
+
+		It("places llama-cpp first for a .gguf model", func() {
+			got := model.SelectAutoLoadBackends([]string{"vllm", "opus", "llama-cpp"}, "model.gguf")
+			Expect(got).NotTo(BeEmpty())
+			Expect(got[0]).To(Equal("llama-cpp"))
+		})
+
+		It("is deterministic regardless of input ordering", func() {
+			a := model.SelectAutoLoadBackends([]string{"opus", "vllm", "llama-cpp", "whisper"}, "m.gguf")
+			b := model.SelectAutoLoadBackends([]string{"whisper", "llama-cpp", "vllm", "opus"}, "m.gguf")
+			Expect(a).To(Equal(b)) // same set, different input order: identical output
+		})
+
+		It("falls back to the full sorted set when filtering leaves no candidate", func() {
+			// Only a non-LLM backend is installed, so the filter keeps nothing and
+			// the original set must come back (sorted) instead of an empty list.
+			got := model.SelectAutoLoadBackends([]string{"opus"}, "model.gguf")
+			Expect(got).To(Equal([]string{"opus"}))
+		})
+	})
+
+	Describe("non-GGUF model auto-detection", func() {
+		It("returns a deterministic (sorted) set without filtering", func() {
+			got := model.SelectAutoLoadBackends([]string{"opus", "llama-cpp", "diffusers"}, "model-dir")
+			Expect(got).To(Equal([]string{"diffusers", "llama-cpp", "opus"})) // sorted by name, nothing dropped
+		})
+	})
+})
--- a/pkg/model/initializers.go
+++ b/pkg/model/initializers.go
@@ -350,14 +350,17 @@ func (ml *ModelLoader) Load(opts ...Option) (grpc.Backend, error) {
 	// Otherwise scan for backends in the asset directory
 	var err error

-	// get backends embedded in the binary
-	autoLoadBackends := []string{}
-
-	// append externalBackends supplied by the user via the CLI
+	// Collect the installed/external backends (the map is unordered).
+	available := []string{}
 	for b := range ml.GetAllExternalBackends(o) {
-		autoLoadBackends = append(autoLoadBackends, b)
+		available = append(available, b)
 	}

+	// Build a deterministic, file-type-filtered candidate list so an
+	// incompatible backend (e.g. an audio codec like opus) can never win the
+	// trial loop for a GGUF/LLM model. See SelectAutoLoadBackends / #9287.
+	autoLoadBackends := SelectAutoLoadBackends(available, o.model)
+
 	if len(autoLoadBackends) == 0 {
 		xlog.Error("No backends found")
 		return nil, fmt.Errorf("no backends found")
--- a/pkg/xsysinfo/memory.go
+++ b/pkg/xsysinfo/memory.go
@@ -1,19 +1,9 @@
 package xsysinfo

 import (
-	"os"
-
 	"github.com/mudler/memory"
 )

-// cgroup/proc paths used to make the reported RAM total container-aware.
-// They are variables (not consts) so tests could override them if needed.
-var (
-	cgroupV2MaxPath   = "/sys/fs/cgroup/memory.max"
-	cgroupV1LimitPath = "/sys/fs/cgroup/memory/memory.limit_in_bytes"
-	procMemInfoPath   = "/proc/meminfo"
-)
-
 // SystemRAMInfo contains system RAM usage information
 type SystemRAMInfo struct {
 	Total        uint64  `json:"total"`
@@ -23,45 +13,12 @@ type SystemRAMInfo struct {
 	UsagePercent float64 `json:"usage_percent"`
 }

-// readFileBestEffort reads a file and returns its contents, or "" on any error.
-// Missing cgroup/proc files (e.g. on non-Linux hosts) are expected and benign.
-func readFileBestEffort(path string) string {
-	b, err := os.ReadFile(path)
-	if err != nil {
-		return ""
-	}
-	return string(b)
-}
-
-// systemTotalMemory returns the container-aware total system RAM in bytes.
-//
-// memory.TotalMemory() reports the HOST kernel total (syscall.Sysinfo on
-// Linux), which lxcfs/LXD does NOT virtualize. Inside a container that
-// over-reports physical RAM and, combined with the virtualized MemAvailable,
-// inflates the reported usage (see issue #8059). We instead derive the total
-// from the minimum of all available container-aware candidates.
-func systemTotalMemory() uint64 {
-	return chooseTotalMemory(
-		readFileBestEffort(cgroupV2MaxPath),
-		readFileBestEffort(cgroupV1LimitPath),
-		readFileBestEffort(procMemInfoPath),
-		memory.TotalMemory(),
-	)
-}
-
 // GetSystemRAMInfo returns real-time system RAM usage
 func GetSystemRAMInfo() (*SystemRAMInfo, error) {
-	total := systemTotalMemory()
-	available := memory.AvailableMemory()
+	total := memory.TotalMemory()
+	free := memory.AvailableMemory()

-	// AvailableMemory (MemAvailable) is virtualized by lxcfs, so in edge
-	// cases it can exceed our corrected total; clamp to avoid an unsigned
-	// underflow when computing Used.
-	if available > total {
-		available = total
-	}
-
-	used := total - available
+	used := total - free

 	usagePercent := 0.0
 	if total > 0 {
@@ -70,8 +27,8 @@ func GetSystemRAMInfo() (*SystemRAMInfo, error) {
 	return &SystemRAMInfo{
 		Total:        total,
 		Used:         used,
-		Free:         available,
-		Available:    available,
+		Free:         free,
+		Available:    total - used,
 		UsagePercent: usagePercent,
 	}, nil
 }
--- a/pkg/xsysinfo/memory_total.go
+++ b/pkg/xsysinfo/memory_total.go
@@ -1,120 +0,0 @@
-package xsysinfo
-
-import (
-	"strconv"
-	"strings"
-)
-
-// cgroupV1UnlimitedSentinel is the value the kernel writes to
-// memory.limit_in_bytes when no limit is set. It is PAGE_COUNTER_MAX
-// (LONG_MAX rounded down to a page boundary), i.e. 0x7FFFFFFFFFFFF000 on
-// 4 KiB-page systems. Any value at or above this is treated as "no limit".
-const cgroupV1UnlimitedSentinel = uint64(0x7FFFFFFFFFFFF000)
-
-// parseUintField parses a trimmed unsigned integer from raw file contents.
-// It returns (0, false) when the content is empty or not a number.
-func parseUintField(raw string) (uint64, bool) {
-	s := strings.TrimSpace(raw)
-	if s == "" {
-		return 0, false
-	}
-	v, err := strconv.ParseUint(s, 10, 64)
-	if err != nil {
-		return 0, false
-	}
-	return v, true
-}
-
-// parseCgroupV2Max interprets the contents of cgroup v2 memory.max.
-// The literal "max" means unlimited, returning 0.
-func parseCgroupV2Max(raw string) uint64 {
-	if strings.TrimSpace(raw) == "max" {
-		return 0
-	}
-	v, ok := parseUintField(raw)
-	if !ok {
-		return 0
-	}
-	return v
-}
-
-// parseCgroupV1Limit interprets the contents of cgroup v1
-// memory.limit_in_bytes. The kernel's "unlimited" sentinel (a value at or
-// above PAGE_COUNTER_MAX) is treated as no limit, returning 0.
-func parseCgroupV1Limit(raw string) uint64 {
-	v, ok := parseUintField(raw)
-	if !ok {
-		return 0
-	}
-	if v >= cgroupV1UnlimitedSentinel {
-		return 0
-	}
-	return v
-}
-
-// parseMemTotal extracts the MemTotal value (in bytes) from raw
-// /proc/meminfo contents. MemTotal is reported in kibibytes, so the parsed
-// value is multiplied by 1024. Returns 0 when the field is missing.
-func parseMemTotal(raw string) uint64 {
-	for _, line := range strings.Split(raw, "\n") {
-		if !strings.HasPrefix(line, "MemTotal:") {
-			continue
-		}
-		fields := strings.Fields(line)
-		// Expected: ["MemTotal:", "<value>", "kB"]
-		if len(fields) < 2 {
-			return 0
-		}
-		v, err := strconv.ParseUint(fields[1], 10, 64)
-		if err != nil {
-			return 0
-		}
-		if len(fields) >= 3 {
-			switch strings.ToLower(fields[2]) {
-			case "kb":
-				return v * 1024
-			case "mb":
-				return v * 1024 * 1024
-			case "gb":
-				return v * 1024 * 1024 * 1024
-			}
-		}
-		return v
-	}
-	return 0
-}
-
-// chooseTotalMemory selects the most accurate system RAM total in bytes.
-//
-// On Linux the host kernel total (sysinfoTotal, from syscall.Sysinfo) is NOT
-// virtualized by lxcfs/LXD, so inside a container it over-reports physical
-// RAM. The cgroup limits and /proc/meminfo MemTotal, by contrast, do reflect
-// the container's view. We therefore take the MINIMUM of all non-zero,
-// non-unlimited candidates:
-//
-//   - cgroup v2 memory.max ("max" => unlimited, skipped)
-//   - cgroup v1 memory.limit_in_bytes (kernel sentinel => unlimited, skipped)
-//   - /proc/meminfo MemTotal (lxcfs/LXD virtualizes this)
-//   - sysinfoTotal (bare-metal fallback)
-//
-// On bare metal the cgroup limits are unlimited and MemTotal == sysinfoTotal,
-// so the result equals the host total exactly as before.
-func chooseTotalMemory(cgroupV2Max, cgroupV1Limit, procMemInfo string, sysinfoTotal uint64) uint64 {
-	candidates := []uint64{
-		parseCgroupV2Max(cgroupV2Max),
-		parseCgroupV1Limit(cgroupV1Limit),
-		parseMemTotal(procMemInfo),
-		sysinfoTotal,
-	}
-
-	var best uint64
-	for _, c := range candidates {
-		if c == 0 {
-			continue
-		}
-		if best == 0 || c < best {
-			best = c
-		}
-	}
-	return best
-}
--- a/pkg/xsysinfo/memory_total_test.go
+++ b/pkg/xsysinfo/memory_total_test.go
@@ -1,74 +0,0 @@
-package xsysinfo
-
-import (
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-var _ = Describe("chooseTotalMemory", func() {
-	const (
-		gi128 = uint64(128) * 1024 * 1024 * 1024
-		gi20  = uint64(20) * 1024 * 1024 * 1024
-		gi10  = uint64(10) * 1024 * 1024 * 1024
-	)
-
-	// /proc/meminfo MemTotal is in kB; build a snippet for a given byte total.
-	memInfo := func(bytes uint64) string {
-		kb := bytes / 1024
-		return "MemTotal:       " + itoa(kb) + " kB\nMemFree:        123 kB\n"
-	}
-
-	Context("bare metal (no cgroup cap, memory.max == max)", func() {
-		It("uses the host sysinfo total", func() {
-			// MemTotal mirrors sysinfo on bare metal.
-			got := chooseTotalMemory("max\n", string(rune(0)), memInfo(gi128), gi128)
-			Expect(got).To(Equal(gi128))
-		})
-	})
-
-	Context("LXD/lxcfs container (MemTotal virtualized below host, no cap)", func() {
-		It("uses the virtualized MemTotal, not the host sysinfo total", func() {
-			// This is issue #8059: host sysinfo says 128Gi, but lxcfs
-			// virtualizes /proc/meminfo MemTotal to 20Gi and there is no
-			// cgroup cap. The corrected total must be 20Gi.
-			got := chooseTotalMemory("max\n", "", memInfo(gi20), gi128)
-			Expect(got).To(Equal(gi20))
-		})
-	})
-
-	Context("cgroup v2 cap set below MemTotal", func() {
-		It("uses the cgroup cap", func() {
-			got := chooseTotalMemory(itoa(gi10)+"\n", "", memInfo(gi20), gi128)
-			Expect(got).To(Equal(gi10))
-		})
-	})
-
-	Context("cgroup v1 with the kernel unlimited sentinel", func() {
-		It("ignores the sentinel and falls back to MemTotal", func() {
-			got := chooseTotalMemory("", "9223372036854771712\n", memInfo(gi20), gi128)
-			Expect(got).To(Equal(gi20))
-		})
-	})
-
-	Context("all candidates empty/unlimited", func() {
-		It("falls back to sysinfo total", func() {
-			got := chooseTotalMemory("max\n", "", "", gi128)
-			Expect(got).To(Equal(gi128))
-		})
-	})
-})
-
-// itoa is a tiny base-10 formatter to avoid importing strconv into the test.
-func itoa(v uint64) string {
-	if v == 0 {
-		return "0"
-	}
-	var buf [20]byte
-	i := len(buf)
-	for v > 0 {
-		i--
-		buf[i] = byte('0' + v%10)
-		v /= 10
-	}
-	return string(buf[i:])
-}