fix(mlx): strip file:// LocalPrefix before loading filesystem-imported models

MLX backends passed request.Model verbatim to mlx_lm/mlx_vlm load(). For a model imported from the filesystem, LocalAI hands the backend a file:// URI (its LocalPrefix), which load() rejects: the scheme is neither a valid HF repo id nor an existing path (Path(model).exists() fails on the scheme), producing "Repo id must be in the form 'repo_name' or 'namespace/repo_name' ... Use repo_type argument if needed".

Add a pure, unit-testable resolve_model_path(model, model_file) helper in the shared python_utils: it prefers the resolved ModelFile, strips a file:// scheme and percent-decodes the path, and leaves plain repo ids and local paths untouched.

Wire it into the mlx, mlx-vlm and mlx-distributed backends (load, model_key, and the distributed broadcast all use the normalized path).

Fixes #7461.

Assisted-by: claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-06-12 18:58:49 -04:00 · 2026-06-12 22:07:06 +00:00
8 changed files with 94 additions and 258 deletions
--- a/backend/python/common/python_utils.py
+++ b/backend/python/common/python_utils.py
@@ -5,6 +5,31 @@ imported by any backend that needs to parse LocalAI gRPC options or build a
 chat-template-compatible message list from proto Message objects.
 """
 import json
+from urllib.parse import unquote
+
+
+def resolve_model_path(model, model_file=""):
+    """Resolve a LocalAI model reference to something an HF/MLX loader accepts.
+
+    LocalAI hands backends either a plain HuggingFace repo id
+    (``namespace/name``), an already-local filesystem path, or a
+    ``file://`` URI (its ``LocalPrefix``) for models imported from disk.
+    Loaders such as ``mlx_lm.load`` reject the ``file://`` form because the
+    scheme is neither a valid repo id nor an existing path, so we normalize
+    it here before loading.
+
+    Resolution order:
+      1. Prefer ``model_file`` when set and non-empty - that is the resolved
+         local path LocalAI computed for the model.
+      2. Strip a leading ``file://`` and percent-decode the rest into a path.
+      3. Return anything else (repo ids, local paths, ``None``) unchanged.
+    """
+    candidate = model_file if model_file else model
+    if candidate is None:
+        return candidate
+    if candidate.startswith("file://"):
+        return unquote(candidate[len("file://"):])
+    return candidate


 def parse_options(options_list):
--- a/backend/python/mlx-distributed/backend.py
+++ b/backend/python/mlx-distributed/backend.py
@@ -28,7 +28,7 @@ import grpc
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'common'))
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'common'))
 from grpc_auth import get_auth_interceptors
-from python_utils import messages_to_dicts, parse_options as _shared_parse_options
+from python_utils import messages_to_dicts, parse_options as _shared_parse_options, resolve_model_path
 from mlx_utils import parse_tool_calls, split_reasoning


@@ -99,7 +99,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
            from mlx_lm import load
            from mlx_lm.models.cache import make_prompt_cache, can_trim_prompt_cache, trim_prompt_cache

-            print(f"[Rank 0] Loading model: {request.Model}", file=sys.stderr)
+            # Normalize the model reference: strip LocalAI's file:// LocalPrefix
+            # and prefer the resolved ModelFile so mlx_lm.load() gets a plain
+            # repo id or filesystem path (it rejects file:// URIs).
+            model_path = resolve_model_path(request.Model, request.ModelFile)
+            print(f"[Rank 0] Loading model: {model_path}", file=sys.stderr)

            self.options = parse_options(request.Options)
            print(f"Options: {self.options}", file=sys.stderr)
@@ -128,7 +132,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                )
                self.coordinator = DistributedCoordinator(self.group)
                self.coordinator.broadcast_command(CMD_LOAD_MODEL)
-                self.coordinator.broadcast_model_name(request.Model)
+                self.coordinator.broadcast_model_name(model_path)
            else:
                print("[Rank 0] No hostfile configured, running single-node", file=sys.stderr)

@@ -144,9 +148,9 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):

            if tokenizer_config:
                print(f"Loading with tokenizer_config: {tokenizer_config}", file=sys.stderr)
-                self.model, self.tokenizer = load(request.Model, tokenizer_config=tokenizer_config)
+                self.model, self.tokenizer = load(model_path, tokenizer_config=tokenizer_config)
            else:
-                self.model, self.tokenizer = load(request.Model)
+                self.model, self.tokenizer = load(model_path)

            if self.group is not None:
                from sharding import pipeline_auto_parallel
@@ -157,7 +161,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                from mlx_cache import ThreadSafeLRUPromptCache
                max_cache_entries = self.options.get("max_cache_entries", 10)
                self.max_kv_size = self.options.get("max_kv_size", None)
-                self.model_key = request.Model
+                self.model_key = model_path
                self.lru_cache = ThreadSafeLRUPromptCache(
                    max_size=max_cache_entries,
                    can_trim_fn=can_trim_prompt_cache,
--- a/backend/python/mlx-vlm/backend.py
+++ b/backend/python/mlx-vlm/backend.py
@@ -18,7 +18,7 @@ import grpc
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'common'))
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'common'))
 from grpc_auth import get_auth_interceptors
-from python_utils import messages_to_dicts, parse_options
+from python_utils import messages_to_dicts, parse_options, resolve_model_path
 from mlx_utils import parse_tool_calls, split_reasoning

 from mlx_vlm import load, stream_generate
@@ -67,7 +67,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
            backend_pb2.Result: The load model result.
        """
        try:
-            print(f"Loading MLX-VLM model: {request.Model}", file=sys.stderr)
+            # Normalize the model reference: strip LocalAI's file:// LocalPrefix
+            # and prefer the resolved ModelFile so mlx_vlm.load() gets a plain
+            # repo id or filesystem path (it rejects file:// URIs).
+            model_path = resolve_model_path(request.Model, request.ModelFile)
+            print(f"Loading MLX-VLM model: {model_path}", file=sys.stderr)
            print(f"Request: {request}", file=sys.stderr)

            # Parse Options[] key:value strings into a typed dict
@@ -76,10 +80,10 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):

            # Load model and processor using MLX-VLM
            # mlx-vlm load function returns (model, processor) instead of (model, tokenizer)
-            self.model, self.processor = load(request.Model)
+            self.model, self.processor = load(model_path)

            # Load model config for chat template support
-            self.config = load_config(request.Model)
+            self.config = load_config(model_path)

            # Auto-infer the tool parser from the chat template. mlx-vlm has
            # its own _infer_tool_parser that falls back to mlx-lm parsers.
--- a/backend/python/mlx/backend.py
+++ b/backend/python/mlx/backend.py
@@ -17,7 +17,7 @@ import grpc
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'common'))
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'common'))
 from grpc_auth import get_auth_interceptors
-from python_utils import messages_to_dicts, parse_options
+from python_utils import messages_to_dicts, parse_options, resolve_model_path
 from mlx_utils import parse_tool_calls, split_reasoning

 from mlx_lm import load, stream_generate
@@ -63,7 +63,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
            backend_pb2.Result: The load model result.
        """
        try:
-            print(f"Loading MLX model: {request.Model}", file=sys.stderr)
+            # Normalize the model reference: strip LocalAI's file:// LocalPrefix
+            # and prefer the resolved ModelFile so mlx_lm.load() gets a plain
+            # repo id or filesystem path (it rejects file:// URIs).
+            model_path = resolve_model_path(request.Model, request.ModelFile)
+            print(f"Loading MLX model: {model_path}", file=sys.stderr)
            print(f"Request: {request}", file=sys.stderr)

            # Parse Options[] key:value strings into a typed dict (shared helper)
@@ -89,9 +93,9 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
            # Load model and tokenizer using MLX
            if tokenizer_config:
                print(f"Loading with tokenizer_config: {tokenizer_config}", file=sys.stderr)
-                self.model, self.tokenizer = load(request.Model, tokenizer_config=tokenizer_config)
+                self.model, self.tokenizer = load(model_path, tokenizer_config=tokenizer_config)
            else:
-                self.model, self.tokenizer = load(request.Model)
+                self.model, self.tokenizer = load(model_path)

            # mlx_lm.load() returns a TokenizerWrapper that detects tool
            # calling and thinking markers from the chat template / vocab.
@@ -111,7 +115,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
            # Initialize thread-safe LRU prompt cache for efficient generation
            max_cache_entries = self.options.get("max_cache_entries", 10)
            self.max_kv_size = self.options.get("max_kv_size", None)
-            self.model_key = request.Model
+            self.model_key = model_path
            self.lru_cache = ThreadSafeLRUPromptCache(
                max_size=max_cache_entries,
                can_trim_fn=can_trim_prompt_cache,
--- a/backend/python/mlx/test.py
+++ b/backend/python/mlx/test.py
@@ -12,7 +12,7 @@ import backend_pb2_grpc
 # Make the shared helpers importable so we can unit-test them without a
 # running gRPC server.
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'common'))
-from python_utils import messages_to_dicts, parse_options
+from python_utils import messages_to_dicts, parse_options, resolve_model_path
 from mlx_utils import parse_tool_calls, split_reasoning

 class TestBackendServicer(unittest.TestCase):
@@ -322,6 +322,42 @@ class TestSharedHelpers(unittest.TestCase):
        self.assertEqual(r, "")
        self.assertEqual(c, "just text")

+    def test_resolve_model_path_file_uri(self):
+        # file:// LocalPrefix (LocalAI import) is stripped to a plain path.
+        self.assertEqual(resolve_model_path("file:///a/b"), "/a/b")
+
+    def test_resolve_model_path_file_uri_percent_decoded(self):
+        # Percent-encoded characters (e.g. spaces) are decoded.
+        self.assertEqual(
+            resolve_model_path("file:///Users/me/My%20Models/Qwen3"),
+            "/Users/me/My Models/Qwen3",
+        )
+
+    def test_resolve_model_path_hf_repo_id_unchanged(self):
+        # Plain HuggingFace repo ids must pass through untouched.
+        self.assertEqual(
+            resolve_model_path("mlx-community/Qwen3-Coder-30B"),
+            "mlx-community/Qwen3-Coder-30B",
+        )
+
+    def test_resolve_model_path_local_path_unchanged(self):
+        # An already-local absolute path is left as-is.
+        self.assertEqual(resolve_model_path("/models/Qwen3"), "/models/Qwen3")
+
+    def test_resolve_model_path_prefers_model_file(self):
+        # The resolved ModelFile wins over Model when both are set.
+        self.assertEqual(
+            resolve_model_path("file:///ignored", "/resolved/local/path"),
+            "/resolved/local/path",
+        )
+
+    def test_resolve_model_path_model_file_file_uri(self):
+        # A ModelFile that is itself a file:// URI is also normalized.
+        self.assertEqual(
+            resolve_model_path("ignored", "file:///a/b"),
+            "/a/b",
+        )
+
    def test_parse_tool_calls_with_shim(self):
        tm = types.SimpleNamespace(
            tool_call_start="<tool_call>",
--- a/pkg/xsysinfo/memory.go
+++ b/pkg/xsysinfo/memory.go
@@ -1,19 +1,9 @@
 package xsysinfo

 import (
-	"os"
-
 	"github.com/mudler/memory"
 )

-// cgroup/proc paths used to make the reported RAM total container-aware.
-// They are variables (not consts) so tests could override them if needed.
-var (
-	cgroupV2MaxPath   = "/sys/fs/cgroup/memory.max"
-	cgroupV1LimitPath = "/sys/fs/cgroup/memory/memory.limit_in_bytes"
-	procMemInfoPath   = "/proc/meminfo"
-)
-
 // SystemRAMInfo contains system RAM usage information
 type SystemRAMInfo struct {
 	Total        uint64  `json:"total"`
@@ -23,45 +13,12 @@ type SystemRAMInfo struct {
 	UsagePercent float64 `json:"usage_percent"`
 }

-// readFileBestEffort reads a file and returns its contents, or "" on any error.
-// Missing cgroup/proc files (e.g. on non-Linux hosts) are expected and benign.
-func readFileBestEffort(path string) string {
-	b, err := os.ReadFile(path)
-	if err != nil {
-		return ""
-	}
-	return string(b)
-}
-
-// systemTotalMemory returns the container-aware total system RAM in bytes.
-//
-// memory.TotalMemory() reports the HOST kernel total (syscall.Sysinfo on
-// Linux), which lxcfs/LXD does NOT virtualize. Inside a container that
-// over-reports physical RAM and, combined with the virtualized MemAvailable,
-// inflates the reported usage (see issue #8059). We instead derive the total
-// from the minimum of all available container-aware candidates.
-func systemTotalMemory() uint64 {
-	return chooseTotalMemory(
-		readFileBestEffort(cgroupV2MaxPath),
-		readFileBestEffort(cgroupV1LimitPath),
-		readFileBestEffort(procMemInfoPath),
-		memory.TotalMemory(),
-	)
-}
-
 // GetSystemRAMInfo returns real-time system RAM usage
 func GetSystemRAMInfo() (*SystemRAMInfo, error) {
-	total := systemTotalMemory()
-	available := memory.AvailableMemory()
+	total := memory.TotalMemory()
+	free := memory.AvailableMemory()

-	// AvailableMemory (MemAvailable) is virtualized by lxcfs, so in edge
-	// cases it can exceed our corrected total; clamp to avoid an unsigned
-	// underflow when computing Used.
-	if available > total {
-		available = total
-	}
-
-	used := total - available
+	used := total - free

 	usagePercent := 0.0
 	if total > 0 {
@@ -70,8 +27,8 @@ func GetSystemRAMInfo() (*SystemRAMInfo, error) {
 	return &SystemRAMInfo{
 		Total:        total,
 		Used:         used,
-		Free:         available,
-		Available:    available,
+		Free:         free,
+		Available:    total - used,
 		UsagePercent: usagePercent,
 	}, nil
 }
--- a/pkg/xsysinfo/memory_total.go
+++ b/pkg/xsysinfo/memory_total.go
@@ -1,120 +0,0 @@
-package xsysinfo
-
-import (
-	"strconv"
-	"strings"
-)
-
-// cgroupV1UnlimitedSentinel is the value the kernel writes to
-// memory.limit_in_bytes when no limit is set. It is PAGE_COUNTER_MAX
-// (LONG_MAX rounded down to a page boundary), i.e. 0x7FFFFFFFFFFFF000 on
-// 4 KiB-page systems. Any value at or above this is treated as "no limit".
-const cgroupV1UnlimitedSentinel = uint64(0x7FFFFFFFFFFFF000)
-
-// parseUintField parses a trimmed unsigned integer from raw file contents.
-// It returns (0, false) when the content is empty or not a number.
-func parseUintField(raw string) (uint64, bool) {
-	s := strings.TrimSpace(raw)
-	if s == "" {
-		return 0, false
-	}
-	v, err := strconv.ParseUint(s, 10, 64)
-	if err != nil {
-		return 0, false
-	}
-	return v, true
-}
-
-// parseCgroupV2Max interprets the contents of cgroup v2 memory.max.
-// The literal "max" means unlimited, returning 0.
-func parseCgroupV2Max(raw string) uint64 {
-	if strings.TrimSpace(raw) == "max" {
-		return 0
-	}
-	v, ok := parseUintField(raw)
-	if !ok {
-		return 0
-	}
-	return v
-}
-
-// parseCgroupV1Limit interprets the contents of cgroup v1
-// memory.limit_in_bytes. The kernel's "unlimited" sentinel (a value at or
-// above PAGE_COUNTER_MAX) is treated as no limit, returning 0.
-func parseCgroupV1Limit(raw string) uint64 {
-	v, ok := parseUintField(raw)
-	if !ok {
-		return 0
-	}
-	if v >= cgroupV1UnlimitedSentinel {
-		return 0
-	}
-	return v
-}
-
-// parseMemTotal extracts the MemTotal value (in bytes) from raw
-// /proc/meminfo contents. MemTotal is reported in kibibytes, so the parsed
-// value is multiplied by 1024. Returns 0 when the field is missing.
-func parseMemTotal(raw string) uint64 {
-	for _, line := range strings.Split(raw, "\n") {
-		if !strings.HasPrefix(line, "MemTotal:") {
-			continue
-		}
-		fields := strings.Fields(line)
-		// Expected: ["MemTotal:", "<value>", "kB"]
-		if len(fields) < 2 {
-			return 0
-		}
-		v, err := strconv.ParseUint(fields[1], 10, 64)
-		if err != nil {
-			return 0
-		}
-		if len(fields) >= 3 {
-			switch strings.ToLower(fields[2]) {
-			case "kb":
-				return v * 1024
-			case "mb":
-				return v * 1024 * 1024
-			case "gb":
-				return v * 1024 * 1024 * 1024
-			}
-		}
-		return v
-	}
-	return 0
-}
-
-// chooseTotalMemory selects the most accurate system RAM total in bytes.
-//
-// On Linux the host kernel total (sysinfoTotal, from syscall.Sysinfo) is NOT
-// virtualized by lxcfs/LXD, so inside a container it over-reports physical
-// RAM. The cgroup limits and /proc/meminfo MemTotal, by contrast, do reflect
-// the container's view. We therefore take the MINIMUM of all non-zero,
-// non-unlimited candidates:
-//
-//   - cgroup v2 memory.max ("max" => unlimited, skipped)
-//   - cgroup v1 memory.limit_in_bytes (kernel sentinel => unlimited, skipped)
-//   - /proc/meminfo MemTotal (lxcfs/LXD virtualizes this)
-//   - sysinfoTotal (bare-metal fallback)
-//
-// On bare metal the cgroup limits are unlimited and MemTotal == sysinfoTotal,
-// so the result equals the host total exactly as before.
-func chooseTotalMemory(cgroupV2Max, cgroupV1Limit, procMemInfo string, sysinfoTotal uint64) uint64 {
-	candidates := []uint64{
-		parseCgroupV2Max(cgroupV2Max),
-		parseCgroupV1Limit(cgroupV1Limit),
-		parseMemTotal(procMemInfo),
-		sysinfoTotal,
-	}
-
-	var best uint64
-	for _, c := range candidates {
-		if c == 0 {
-			continue
-		}
-		if best == 0 || c < best {
-			best = c
-		}
-	}
-	return best
-}
--- a/pkg/xsysinfo/memory_total_test.go
+++ b/pkg/xsysinfo/memory_total_test.go
@@ -1,74 +0,0 @@
-package xsysinfo
-
-import (
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-var _ = Describe("chooseTotalMemory", func() {
-	const (
-		gi128 = uint64(128) * 1024 * 1024 * 1024
-		gi20  = uint64(20) * 1024 * 1024 * 1024
-		gi10  = uint64(10) * 1024 * 1024 * 1024
-	)
-
-	// /proc/meminfo MemTotal is in kB; build a snippet for a given byte total.
-	memInfo := func(bytes uint64) string {
-		kb := bytes / 1024
-		return "MemTotal:       " + itoa(kb) + " kB\nMemFree:        123 kB\n"
-	}
-
-	Context("bare metal (no cgroup cap, memory.max == max)", func() {
-		It("uses the host sysinfo total", func() {
-			// MemTotal mirrors sysinfo on bare metal.
-			got := chooseTotalMemory("max\n", string(rune(0)), memInfo(gi128), gi128)
-			Expect(got).To(Equal(gi128))
-		})
-	})
-
-	Context("LXD/lxcfs container (MemTotal virtualized below host, no cap)", func() {
-		It("uses the virtualized MemTotal, not the host sysinfo total", func() {
-			// This is issue #8059: host sysinfo says 128Gi, but lxcfs
-			// virtualizes /proc/meminfo MemTotal to 20Gi and there is no
-			// cgroup cap. The corrected total must be 20Gi.
-			got := chooseTotalMemory("max\n", "", memInfo(gi20), gi128)
-			Expect(got).To(Equal(gi20))
-		})
-	})
-
-	Context("cgroup v2 cap set below MemTotal", func() {
-		It("uses the cgroup cap", func() {
-			got := chooseTotalMemory(itoa(gi10)+"\n", "", memInfo(gi20), gi128)
-			Expect(got).To(Equal(gi10))
-		})
-	})
-
-	Context("cgroup v1 with the kernel unlimited sentinel", func() {
-		It("ignores the sentinel and falls back to MemTotal", func() {
-			got := chooseTotalMemory("", "9223372036854771712\n", memInfo(gi20), gi128)
-			Expect(got).To(Equal(gi20))
-		})
-	})
-
-	Context("all candidates empty/unlimited", func() {
-		It("falls back to sysinfo total", func() {
-			got := chooseTotalMemory("max\n", "", "", gi128)
-			Expect(got).To(Equal(gi128))
-		})
-	})
-})
-
-// itoa is a tiny base-10 formatter to avoid importing strconv into the test.
-func itoa(v uint64) string {
-	if v == 0 {
-		return "0"
-	}
-	var buf [20]byte
-	i := len(buf)
-	for v > 0 {
-		i--
-		buf[i] = byte('0' + v%10)
-		v /= 10
-	}
-	return string(buf[i:])
-}