From 69c7a8e71df8e261842452d76982535e726dfac2 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 12 Jun 2026 22:07:06 +0000 Subject: [PATCH] fix(mlx): strip file:// LocalPrefix before loading filesystem-imported models MLX backends passed request.Model verbatim to mlx_lm/mlx_vlm load(). For a model imported from the filesystem, LocalAI hands the backend a file:// URI (its LocalPrefix), which load() rejects: the scheme is neither a valid HF repo id nor an existing path (Path(model).exists() fails on the scheme), producing "Repo id must be in the form 'repo_name' or 'namespace/repo_name' ... Use repo_type argument if needed". Add a pure, unit-testable resolve_model_path(model, model_file) helper in the shared python_utils: it prefers the resolved ModelFile, strips a file:// scheme and percent-decodes the path, and leaves plain repo ids and local paths untouched. Wire it into the mlx, mlx-vlm and mlx-distributed backends (load, model_key, and the distributed broadcast all use the normalized path). Fixes #7461. Assisted-by: claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto --- backend/python/common/python_utils.py | 25 +++++++++++++++ backend/python/mlx-distributed/backend.py | 16 ++++++---- backend/python/mlx-vlm/backend.py | 12 ++++--- backend/python/mlx/backend.py | 14 ++++++--- backend/python/mlx/test.py | 38 ++++++++++++++++++++++- 5 files changed, 89 insertions(+), 16 deletions(-) diff --git a/backend/python/common/python_utils.py b/backend/python/common/python_utils.py index aa61ab578..093d2491d 100644 --- a/backend/python/common/python_utils.py +++ b/backend/python/common/python_utils.py @@ -5,6 +5,31 @@ imported by any backend that needs to parse LocalAI gRPC options or build a chat-template-compatible message list from proto Message objects. """ import json +from urllib.parse import unquote + + +def resolve_model_path(model, model_file=""): + """Resolve a LocalAI model reference to something an HF/MLX loader accepts. + + LocalAI hands backends either a plain HuggingFace repo id + (``namespace/name``), an already-local filesystem path, or a + ``file://`` URI (its ``LocalPrefix``) for models imported from disk. + Loaders such as ``mlx_lm.load`` reject the ``file://`` form because the + scheme is neither a valid repo id nor an existing path, so we normalize + it here before loading. + + Resolution order: + 1. Prefer ``model_file`` when set and non-empty - that is the resolved + local path LocalAI computed for the model. + 2. Strip a ``file://`` scheme and percent-decode it to a plain path. + 3. Leave plain repo ids and already-local paths unchanged. + """ + candidate = model_file if model_file else model + if candidate is None: + return candidate + if candidate.startswith("file://"): + return unquote(candidate[len("file://"):]) + return candidate def parse_options(options_list): diff --git a/backend/python/mlx-distributed/backend.py b/backend/python/mlx-distributed/backend.py index b03775f58..bbb632fc0 100644 --- a/backend/python/mlx-distributed/backend.py +++ b/backend/python/mlx-distributed/backend.py @@ -28,7 +28,7 @@ import grpc sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'common')) sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'common')) from grpc_auth import get_auth_interceptors -from python_utils import messages_to_dicts, parse_options as _shared_parse_options +from python_utils import messages_to_dicts, parse_options as _shared_parse_options, resolve_model_path from mlx_utils import parse_tool_calls, split_reasoning @@ -99,7 +99,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): from mlx_lm import load from mlx_lm.models.cache import make_prompt_cache, can_trim_prompt_cache, trim_prompt_cache - print(f"[Rank 0] Loading model: {request.Model}", file=sys.stderr) + # Normalize the model reference: strip LocalAI's file:// LocalPrefix + # and prefer the resolved ModelFile so mlx_lm.load() gets a plain + # repo id or filesystem path (it rejects file:// URIs). + model_path = resolve_model_path(request.Model, request.ModelFile) + print(f"[Rank 0] Loading model: {model_path}", file=sys.stderr) self.options = parse_options(request.Options) print(f"Options: {self.options}", file=sys.stderr) @@ -128,7 +132,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): ) self.coordinator = DistributedCoordinator(self.group) self.coordinator.broadcast_command(CMD_LOAD_MODEL) - self.coordinator.broadcast_model_name(request.Model) + self.coordinator.broadcast_model_name(model_path) else: print("[Rank 0] No hostfile configured, running single-node", file=sys.stderr) @@ -144,9 +148,9 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): if tokenizer_config: print(f"Loading with tokenizer_config: {tokenizer_config}", file=sys.stderr) - self.model, self.tokenizer = load(request.Model, tokenizer_config=tokenizer_config) + self.model, self.tokenizer = load(model_path, tokenizer_config=tokenizer_config) else: - self.model, self.tokenizer = load(request.Model) + self.model, self.tokenizer = load(model_path) if self.group is not None: from sharding import pipeline_auto_parallel @@ -157,7 +161,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): from mlx_cache import ThreadSafeLRUPromptCache max_cache_entries = self.options.get("max_cache_entries", 10) self.max_kv_size = self.options.get("max_kv_size", None) - self.model_key = request.Model + self.model_key = model_path self.lru_cache = ThreadSafeLRUPromptCache( max_size=max_cache_entries, can_trim_fn=can_trim_prompt_cache, diff --git a/backend/python/mlx-vlm/backend.py b/backend/python/mlx-vlm/backend.py index 074214ece..8e1b23900 100644 --- a/backend/python/mlx-vlm/backend.py +++ b/backend/python/mlx-vlm/backend.py @@ -18,7 +18,7 @@ import grpc sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'common')) sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'common')) from grpc_auth import get_auth_interceptors -from python_utils import messages_to_dicts, parse_options +from python_utils import messages_to_dicts, parse_options, resolve_model_path from mlx_utils import parse_tool_calls, split_reasoning from mlx_vlm import load, stream_generate @@ -67,7 +67,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): backend_pb2.Result: The load model result. """ try: - print(f"Loading MLX-VLM model: {request.Model}", file=sys.stderr) + # Normalize the model reference: strip LocalAI's file:// LocalPrefix + # and prefer the resolved ModelFile so mlx_vlm.load() gets a plain + # repo id or filesystem path (it rejects file:// URIs). + model_path = resolve_model_path(request.Model, request.ModelFile) + print(f"Loading MLX-VLM model: {model_path}", file=sys.stderr) print(f"Request: {request}", file=sys.stderr) # Parse Options[] key:value strings into a typed dict @@ -76,10 +80,10 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): # Load model and processor using MLX-VLM # mlx-vlm load function returns (model, processor) instead of (model, tokenizer) - self.model, self.processor = load(request.Model) + self.model, self.processor = load(model_path) # Load model config for chat template support - self.config = load_config(request.Model) + self.config = load_config(model_path) # Auto-infer the tool parser from the chat template. mlx-vlm has # its own _infer_tool_parser that falls back to mlx-lm parsers. diff --git a/backend/python/mlx/backend.py b/backend/python/mlx/backend.py index 6a4602c01..a21191a2d 100644 --- a/backend/python/mlx/backend.py +++ b/backend/python/mlx/backend.py @@ -17,7 +17,7 @@ import grpc sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'common')) sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'common')) from grpc_auth import get_auth_interceptors -from python_utils import messages_to_dicts, parse_options +from python_utils import messages_to_dicts, parse_options, resolve_model_path from mlx_utils import parse_tool_calls, split_reasoning from mlx_lm import load, stream_generate @@ -63,7 +63,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): backend_pb2.Result: The load model result. """ try: - print(f"Loading MLX model: {request.Model}", file=sys.stderr) + # Normalize the model reference: strip LocalAI's file:// LocalPrefix + # and prefer the resolved ModelFile so mlx_lm.load() gets a plain + # repo id or filesystem path (it rejects file:// URIs). + model_path = resolve_model_path(request.Model, request.ModelFile) + print(f"Loading MLX model: {model_path}", file=sys.stderr) print(f"Request: {request}", file=sys.stderr) # Parse Options[] key:value strings into a typed dict (shared helper) @@ -89,9 +93,9 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): # Load model and tokenizer using MLX if tokenizer_config: print(f"Loading with tokenizer_config: {tokenizer_config}", file=sys.stderr) - self.model, self.tokenizer = load(request.Model, tokenizer_config=tokenizer_config) + self.model, self.tokenizer = load(model_path, tokenizer_config=tokenizer_config) else: - self.model, self.tokenizer = load(request.Model) + self.model, self.tokenizer = load(model_path) # mlx_lm.load() returns a TokenizerWrapper that detects tool # calling and thinking markers from the chat template / vocab. @@ -111,7 +115,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): # Initialize thread-safe LRU prompt cache for efficient generation max_cache_entries = self.options.get("max_cache_entries", 10) self.max_kv_size = self.options.get("max_kv_size", None) - self.model_key = request.Model + self.model_key = model_path self.lru_cache = ThreadSafeLRUPromptCache( max_size=max_cache_entries, can_trim_fn=can_trim_prompt_cache, diff --git a/backend/python/mlx/test.py b/backend/python/mlx/test.py index ac5ff4c06..7774cbb4f 100644 --- a/backend/python/mlx/test.py +++ b/backend/python/mlx/test.py @@ -12,7 +12,7 @@ import backend_pb2_grpc # Make the shared helpers importable so we can unit-test them without a # running gRPC server. sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'common')) -from python_utils import messages_to_dicts, parse_options +from python_utils import messages_to_dicts, parse_options, resolve_model_path from mlx_utils import parse_tool_calls, split_reasoning class TestBackendServicer(unittest.TestCase): @@ -322,6 +322,42 @@ class TestSharedHelpers(unittest.TestCase): self.assertEqual(r, "") self.assertEqual(c, "just text") + def test_resolve_model_path_file_uri(self): + # file:// LocalPrefix (LocalAI import) is stripped to a plain path. + self.assertEqual(resolve_model_path("file:///a/b"), "/a/b") + + def test_resolve_model_path_file_uri_percent_decoded(self): + # Percent-encoded characters (e.g. spaces) are decoded. + self.assertEqual( + resolve_model_path("file:///Users/me/My%20Models/Qwen3"), + "/Users/me/My Models/Qwen3", + ) + + def test_resolve_model_path_hf_repo_id_unchanged(self): + # Plain HuggingFace repo ids must pass through untouched. + self.assertEqual( + resolve_model_path("mlx-community/Qwen3-Coder-30B"), + "mlx-community/Qwen3-Coder-30B", + ) + + def test_resolve_model_path_local_path_unchanged(self): + # An already-local absolute path is left as-is. + self.assertEqual(resolve_model_path("/models/Qwen3"), "/models/Qwen3") + + def test_resolve_model_path_prefers_model_file(self): + # The resolved ModelFile wins over Model when both are set. + self.assertEqual( + resolve_model_path("file:///ignored", "/resolved/local/path"), + "/resolved/local/path", + ) + + def test_resolve_model_path_model_file_file_uri(self): + # A ModelFile that is itself a file:// URI is also normalized. + self.assertEqual( + resolve_model_path("ignored", "file:///a/b"), + "/a/b", + ) + def test_parse_tool_calls_with_shim(self): tm = types.SimpleNamespace( tool_call_start="",