Compare commits

..

1 Commits

Author SHA1 Message Date
Ettore Di Giacinto
69c7a8e71d fix(mlx): strip file:// LocalPrefix before loading filesystem-imported models
MLX backends passed request.Model verbatim to mlx_lm/mlx_vlm load(). For a
model imported from the filesystem, LocalAI hands the backend a file:// URI
(its LocalPrefix), which load() rejects: the scheme is neither a valid HF
repo id nor an existing path (Path(model).exists() fails on the scheme),
producing "Repo id must be in the form 'repo_name' or 'namespace/repo_name'
... Use repo_type argument if needed".

Add a pure, unit-testable resolve_model_path(model, model_file) helper in the
shared python_utils: it prefers the resolved ModelFile, strips a file://
scheme and percent-decodes the path, and leaves plain repo ids and local
paths untouched. Wire it into the mlx, mlx-vlm and mlx-distributed backends
(load, model_key, and the distributed broadcast all use the normalized path).

Fixes #7461.

Assisted-by: claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-06-12 22:07:06 +00:00
18 changed files with 103 additions and 1779 deletions

View File

@@ -10,7 +10,7 @@ JOBS?=$(shell nproc --ignore=1)
# this on `master` always picks up the latest C-API surface (incl. the
# per-detection accessor functions used by golocateanythingcpp.go).
LOCATEANYTHING_REPO?=https://github.com/mudler/locate-anything.cpp.git
LOCATEANYTHING_VERSION?=92c1682da792c1e8a5dec91acc2be4b02c742ded
LOCATEANYTHING_VERSION?=60e450945476d5e97e0754a8c0e71a9ea81690e0
ifeq ($(NATIVE),false)
CMAKE_ARGS+=-DGGML_NATIVE=OFF

View File

@@ -1163,11 +1163,11 @@
- &opus
name: "opus"
alias: "opus"
capabilities:
default: "cpu-opus"
metal: "metal-opus"
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-opus"
urls:
- https://opus-codec.org/
mirrors:
- localai/localai-backends:latest-cpu-opus
license: BSD-3-Clause
description: |
Opus audio codec backend for encoding and decoding audio.
@@ -1177,11 +1177,7 @@
- opus
- WebRTC
- realtime
- !!merge <<: *opus
name: "opus-development"
capabilities:
default: "cpu-opus-development"
metal: "metal-opus-development"
- CPU
- &silero-vad
name: "silero-vad"
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-silero-vad"
@@ -1607,12 +1603,7 @@
mirrors:
- localai/localai-backends:master-metal-darwin-arm64-local-store
- !!merge <<: *opus
name: "cpu-opus"
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-opus"
mirrors:
- localai/localai-backends:latest-cpu-opus
- !!merge <<: *opus
name: "cpu-opus-development"
name: "opus-development"
uri: "quay.io/go-skynet/local-ai-backends:master-cpu-opus"
mirrors:
- localai/localai-backends:master-cpu-opus

View File

@@ -5,6 +5,31 @@ imported by any backend that needs to parse LocalAI gRPC options or build a
chat-template-compatible message list from proto Message objects.
"""
import json
from urllib.parse import unquote
def resolve_model_path(model, model_file=""):
    """Turn a LocalAI model reference into a value an HF/MLX loader accepts.

    LocalAI may hand a backend a plain HuggingFace repo id
    (``namespace/name``), a filesystem path that is already local, or a
    ``file://`` URI (its ``LocalPrefix``) for models imported from disk.
    Loaders such as ``mlx_lm.load`` reject the ``file://`` form, so it is
    normalized here before loading.

    Args:
        model: The model reference as sent by LocalAI; may be ``None``.
        model_file: The resolved local path LocalAI computed for the model.
            When truthy it takes precedence over ``model``.

    Returns:
        The selected reference with a leading ``file://`` removed and the
        remainder percent-decoded; any other value (including ``None``) is
        returned unchanged.
    """
    file_scheme = "file://"
    # A truthy model_file always wins; otherwise fall back to the raw model.
    reference = model_file or model
    if reference is None:
        return None
    if not reference.startswith(file_scheme):
        # Plain repo ids and already-local paths pass through untouched.
        return reference
    return unquote(reference[len(file_scheme):])
def parse_options(options_list):

View File

@@ -28,7 +28,7 @@ import grpc
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'common'))
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'common'))
from grpc_auth import get_auth_interceptors
from python_utils import messages_to_dicts, parse_options as _shared_parse_options
from python_utils import messages_to_dicts, parse_options as _shared_parse_options, resolve_model_path
from mlx_utils import parse_tool_calls, split_reasoning
@@ -99,7 +99,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
from mlx_lm import load
from mlx_lm.models.cache import make_prompt_cache, can_trim_prompt_cache, trim_prompt_cache
print(f"[Rank 0] Loading model: {request.Model}", file=sys.stderr)
# Normalize the model reference: strip LocalAI's file:// LocalPrefix
# and prefer the resolved ModelFile so mlx_lm.load() gets a plain
# repo id or filesystem path (it rejects file:// URIs).
model_path = resolve_model_path(request.Model, request.ModelFile)
print(f"[Rank 0] Loading model: {model_path}", file=sys.stderr)
self.options = parse_options(request.Options)
print(f"Options: {self.options}", file=sys.stderr)
@@ -128,7 +132,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
)
self.coordinator = DistributedCoordinator(self.group)
self.coordinator.broadcast_command(CMD_LOAD_MODEL)
self.coordinator.broadcast_model_name(request.Model)
self.coordinator.broadcast_model_name(model_path)
else:
print("[Rank 0] No hostfile configured, running single-node", file=sys.stderr)
@@ -144,9 +148,9 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
if tokenizer_config:
print(f"Loading with tokenizer_config: {tokenizer_config}", file=sys.stderr)
self.model, self.tokenizer = load(request.Model, tokenizer_config=tokenizer_config)
self.model, self.tokenizer = load(model_path, tokenizer_config=tokenizer_config)
else:
self.model, self.tokenizer = load(request.Model)
self.model, self.tokenizer = load(model_path)
if self.group is not None:
from sharding import pipeline_auto_parallel
@@ -157,7 +161,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
from mlx_cache import ThreadSafeLRUPromptCache
max_cache_entries = self.options.get("max_cache_entries", 10)
self.max_kv_size = self.options.get("max_kv_size", None)
self.model_key = request.Model
self.model_key = model_path
self.lru_cache = ThreadSafeLRUPromptCache(
max_size=max_cache_entries,
can_trim_fn=can_trim_prompt_cache,

View File

@@ -18,7 +18,7 @@ import grpc
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'common'))
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'common'))
from grpc_auth import get_auth_interceptors
from python_utils import messages_to_dicts, parse_options
from python_utils import messages_to_dicts, parse_options, resolve_model_path
from mlx_utils import parse_tool_calls, split_reasoning
from mlx_vlm import load, stream_generate
@@ -67,7 +67,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
backend_pb2.Result: The load model result.
"""
try:
print(f"Loading MLX-VLM model: {request.Model}", file=sys.stderr)
# Normalize the model reference: strip LocalAI's file:// LocalPrefix
# and prefer the resolved ModelFile so mlx_vlm.load() gets a plain
# repo id or filesystem path (it rejects file:// URIs).
model_path = resolve_model_path(request.Model, request.ModelFile)
print(f"Loading MLX-VLM model: {model_path}", file=sys.stderr)
print(f"Request: {request}", file=sys.stderr)
# Parse Options[] key:value strings into a typed dict
@@ -76,10 +80,10 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
# Load model and processor using MLX-VLM
# mlx-vlm load function returns (model, processor) instead of (model, tokenizer)
self.model, self.processor = load(request.Model)
self.model, self.processor = load(model_path)
# Load model config for chat template support
self.config = load_config(request.Model)
self.config = load_config(model_path)
# Auto-infer the tool parser from the chat template. mlx-vlm has
# its own _infer_tool_parser that falls back to mlx-lm parsers.

View File

@@ -17,7 +17,7 @@ import grpc
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'common'))
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'common'))
from grpc_auth import get_auth_interceptors
from python_utils import messages_to_dicts, parse_options
from python_utils import messages_to_dicts, parse_options, resolve_model_path
from mlx_utils import parse_tool_calls, split_reasoning
from mlx_lm import load, stream_generate
@@ -63,7 +63,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
backend_pb2.Result: The load model result.
"""
try:
print(f"Loading MLX model: {request.Model}", file=sys.stderr)
# Normalize the model reference: strip LocalAI's file:// LocalPrefix
# and prefer the resolved ModelFile so mlx_lm.load() gets a plain
# repo id or filesystem path (it rejects file:// URIs).
model_path = resolve_model_path(request.Model, request.ModelFile)
print(f"Loading MLX model: {model_path}", file=sys.stderr)
print(f"Request: {request}", file=sys.stderr)
# Parse Options[] key:value strings into a typed dict (shared helper)
@@ -89,9 +93,9 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
# Load model and tokenizer using MLX
if tokenizer_config:
print(f"Loading with tokenizer_config: {tokenizer_config}", file=sys.stderr)
self.model, self.tokenizer = load(request.Model, tokenizer_config=tokenizer_config)
self.model, self.tokenizer = load(model_path, tokenizer_config=tokenizer_config)
else:
self.model, self.tokenizer = load(request.Model)
self.model, self.tokenizer = load(model_path)
# mlx_lm.load() returns a TokenizerWrapper that detects tool
# calling and thinking markers from the chat template / vocab.
@@ -111,7 +115,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
# Initialize thread-safe LRU prompt cache for efficient generation
max_cache_entries = self.options.get("max_cache_entries", 10)
self.max_kv_size = self.options.get("max_kv_size", None)
self.model_key = request.Model
self.model_key = model_path
self.lru_cache = ThreadSafeLRUPromptCache(
max_size=max_cache_entries,
can_trim_fn=can_trim_prompt_cache,

View File

@@ -12,7 +12,7 @@ import backend_pb2_grpc
# Make the shared helpers importable so we can unit-test them without a
# running gRPC server.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'common'))
from python_utils import messages_to_dicts, parse_options
from python_utils import messages_to_dicts, parse_options, resolve_model_path
from mlx_utils import parse_tool_calls, split_reasoning
class TestBackendServicer(unittest.TestCase):
@@ -322,6 +322,42 @@ class TestSharedHelpers(unittest.TestCase):
self.assertEqual(r, "")
self.assertEqual(c, "just text")
def test_resolve_model_path_file_uri(self):
    # A file:// reference (LocalAI's LocalPrefix for imported models) must
    # come back as the bare filesystem path.
    resolved = resolve_model_path("file:///a/b")
    self.assertEqual(resolved, "/a/b")
def test_resolve_model_path_file_uri_percent_decoded(self):
    # Percent-escapes inside the URI (here %20 for a space) are decoded.
    resolved = resolve_model_path("file:///Users/me/My%20Models/Qwen3")
    self.assertEqual(resolved, "/Users/me/My Models/Qwen3")
def test_resolve_model_path_hf_repo_id_unchanged(self):
    # A plain HuggingFace repo id is returned exactly as given.
    repo_id = "mlx-community/Qwen3-Coder-30B"
    resolved = resolve_model_path("mlx-community/Qwen3-Coder-30B")
    self.assertEqual(resolved, repo_id)
def test_resolve_model_path_local_path_unchanged(self):
    # An absolute path without a scheme needs no normalization.
    resolved = resolve_model_path("/models/Qwen3")
    self.assertEqual(resolved, "/models/Qwen3")
def test_resolve_model_path_prefers_model_file(self):
    # With both arguments supplied, the resolved ModelFile takes priority
    # over the Model reference.
    resolved = resolve_model_path("file:///ignored", "/resolved/local/path")
    self.assertEqual(resolved, "/resolved/local/path")
def test_resolve_model_path_model_file_file_uri(self):
    # The file:// stripping also applies when the ModelFile argument itself
    # carries the scheme.
    resolved = resolve_model_path("ignored", "file:///a/b")
    self.assertEqual(resolved, "/a/b")
def test_parse_tool_calls_with_shim(self):
tm = types.SimpleNamespace(
tool_call_start="<tool_call>",

View File

@@ -1,7 +1,6 @@
--extra-index-url https://download.pytorch.org/whl/cpu
accelerate
torch==2.8.0
torchaudio==2.8.0
transformers==4.56.1
librosa==0.11.0
neucodec>=0.0.4

View File

@@ -3,7 +3,6 @@ neucodec>=0.0.4
phonemizer==3.3.0
soundfile==0.13.1
torch==2.8.0
torchaudio==2.8.0
transformers==4.56.1
resemble-perth==1.0.1
accelerate

View File

@@ -1,6 +1,6 @@
--extra-index-url https://download.pytorch.org/whl/cpu
accelerate
torch==2.12.0+cpu
torch==2.9.1+cpu
torchvision
torchaudio
transformers

View File

@@ -307,19 +307,11 @@ func gRPCPredictOpts(c config.ModelConfig, modelPath string) *pb.PredictOptions
}
}
// TopK may be nil after SetDefaults for backends that don't use llama.cpp's
// top_k=40 default (issue #6632, e.g. mlx). proto3 int32 can't be unset, so
// send 0 — the value mlx actually wants (top-k disabled).
var topK int32
if c.TopK != nil {
topK = int32(*c.TopK)
}
pbOpts := &pb.PredictOptions{
Temperature: float32(*c.Temperature),
TopP: float32(*c.TopP),
NDraft: c.NDraft,
TopK: topK,
TopK: int32(*c.TopK),
MinP: float32(*c.MinP),
Tokens: int32(*c.Maxtokens),
Threads: int32(*c.Threads),

View File

@@ -517,33 +517,6 @@ func NormalizeBackendName(backend string) string {
return strings.ReplaceAll(backend, ".", "-")
}
// nonLlamaSamplerBackends lists backends whose native sampler defaults differ
// from llama.cpp's, so LocalAI must NOT inject llama.cpp's top_k=40 default for
// them (issue #6632). mlx_lm's intended default is top_k=0 (disabled) and mlx
// does not remap 0->40, so shipping 40 silently changes sampling for clients
// that omit top_k. Leaving TopK nil lets the wire value default to 0.
//
// This is intentionally a small allow-list of KNOWN non-llama backends: empty
// and unknown backends fall through to the llama.cpp default to preserve the
// GGUF auto-detect path's behavior.
//
// The map is used as a set: values are empty structs and only key membership
// is consulted (see UsesLlamaSamplerDefaults), which looks names up after
// passing them through NormalizeBackendName.
var nonLlamaSamplerBackends = map[string]struct{}{
"mlx": {},
"mlx-vlm": {},
"mlx-distributed": {},
}
// UsesLlamaSamplerDefaults reports whether a backend should receive llama.cpp's
// sampler defaults (e.g. top_k=40). Empty/unknown backends return true so the
// GGUF auto-detect path (which resolves to llama.cpp) keeps today's behavior;
// only the known non-llama backends in nonLlamaSamplerBackends return false.
func UsesLlamaSamplerDefaults(backend string) bool {
if backend == "" {
return true
}
_, isNonLlama := nonLlamaSamplerBackends[NormalizeBackendName(backend)]
return !isNonLlama
}
// GetBackendCapability returns the capability info for a backend, or nil if unknown.
// Handles backend name normalization.
func GetBackendCapability(backend string) *BackendCapability {

View File

@@ -867,12 +867,7 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
cfg.Seed = &defaultSeed
}
// top_k=40 is llama.cpp's sampling default and is wrong for backends whose
// native default differs (issue #6632). Only inject it for the llama.cpp
// family and the empty/auto backend; leave TopK nil for known non-llama
// backends (e.g. mlx, whose intended default is top_k=0) so the wire value
// is 0 rather than a silently-changed 40.
if cfg.TopK == nil && UsesLlamaSamplerDefaults(cfg.Backend) {
if cfg.TopK == nil {
cfg.TopK = &defaultTopK
}

View File

@@ -529,72 +529,4 @@ concurrency_groups:
"models that template in Go still rely on the Go-generated grammar")
})
})
// The default top_k=40 is llama.cpp's sampling default and is WRONG for
// backends whose native default differs. mlx_lm's intended default is
// top_k=0 (disabled) and mlx does not remap 0->40, so injecting 40 silently
// changes sampling for mlx clients that omit top_k (issue #6632). Gate the
// injection on backend family: keep 40 for the llama.cpp family and for the
// empty/auto backend (the GGUF auto-detect path resolves to llama.cpp), but
// leave TopK nil for the mlx family so the wire value is 0.
Context("TopK default is backend-gated (issue #6632)", func() {
It("injects top_k=40 for the llama.cpp backend", func() {
cfg := &ModelConfig{}
cfg.Backend = "llama-cpp"
cfg.SetDefaults()
Expect(cfg.TopK).NotTo(BeNil(), "llama.cpp must keep its top_k=40 default")
Expect(*cfg.TopK).To(Equal(40))
})
It("injects top_k=40 for the empty/auto backend (GGUF auto-detect)", func() {
cfg := &ModelConfig{}
cfg.SetDefaults()
Expect(cfg.TopK).NotTo(BeNil(), "empty backend resolves to llama.cpp; default unchanged")
Expect(*cfg.TopK).To(Equal(40))
})
It("leaves TopK nil for the mlx backend", func() {
cfg := &ModelConfig{}
cfg.Backend = "mlx"
cfg.SetDefaults()
Expect(cfg.TopK).To(BeNil(),
"mlx_lm's intended default is top_k=0 (disabled); LocalAI must not inject 40")
})
It("leaves TopK nil for the mlx-vlm backend", func() {
cfg := &ModelConfig{}
cfg.Backend = "mlx-vlm"
cfg.SetDefaults()
Expect(cfg.TopK).To(BeNil())
})
It("leaves TopK nil for the mlx-distributed backend", func() {
cfg := &ModelConfig{}
cfg.Backend = "mlx-distributed"
cfg.SetDefaults()
Expect(cfg.TopK).To(BeNil())
})
It("respects an explicit top_k even for the mlx backend", func() {
explicit := 7
cfg := &ModelConfig{}
cfg.Backend = "mlx"
cfg.TopK = &explicit
cfg.SetDefaults()
Expect(cfg.TopK).NotTo(BeNil())
Expect(*cfg.TopK).To(Equal(7))
})
})
})

View File

@@ -990,18 +990,8 @@ func updateSession(session *Session, update *types.SessionUnion, cl *config.Mode
}
if rt.Audio != nil && rt.Audio.Input != nil && rt.Audio.Input.Transcription != nil {
trUpd := rt.Audio.Input.Transcription
// A language-only update (e.g. a client forcing the STT language) carries
// an empty Model. Preserve the pipeline's configured transcription backend
// instead of blanking it — otherwise the next utterance transcribes against
// an empty model and the backend RPC fails with "unimplemented".
if trUpd.Model == "" && session.InputAudioTranscription != nil {
trUpd.Model = session.InputAudioTranscription.Model
}
session.InputAudioTranscription = trUpd
if trUpd.Model != "" {
session.ModelConfig.Pipeline.Transcription = trUpd.Model
}
session.InputAudioTranscription = rt.Audio.Input.Transcription
session.ModelConfig.Pipeline.Transcription = rt.Audio.Input.Transcription.Model
}
if rt.Model != "" || (rt.Audio != nil && rt.Audio.Output != nil && rt.Audio.Output.Voice != "") || (rt.Audio != nil && rt.Audio.Input != nil && rt.Audio.Input.Transcription != nil) {

View File

File diff suppressed because it is too large Load Diff

2
go.mod
View File

@@ -36,7 +36,7 @@ require (
github.com/mholt/archiver/v3 v3.5.1
github.com/microcosm-cc/bluemonday v1.0.27
github.com/modelcontextprotocol/go-sdk v1.5.0
github.com/mudler/cogito v0.10.1-0.20260609212329-bf4010d31047
github.com/mudler/cogito v0.9.5-0.20260315222927-63abdec7189b
github.com/mudler/edgevpn v0.34.0
github.com/mudler/go-processmanager v0.1.1
github.com/mudler/memory v0.0.0-20260406210934-424c1ecf2cf8

4
go.sum
View File

@@ -968,8 +968,8 @@ github.com/mschoch/smat v0.2.0 h1:8imxQsjDm8yFEAVBe7azKmKSgzSkZXDuKkSq9374khM=
github.com/mschoch/smat v0.2.0/go.mod h1:kc9mz7DoBKqDyiRL7VZN8KvXQMWeTaVnttLRXOlotKw=
github.com/mudler/LocalAGI v0.0.0-20260606071251-14aed1ae4336 h1:iKBkSnpisOvMVxFoYsAObvAuOqXBakRPMD0PWxWG5EE=
github.com/mudler/LocalAGI v0.0.0-20260606071251-14aed1ae4336/go.mod h1:U+g6u8mF2wQxhkdBl3dr8G4db1cv3n7KTKmraoJ7D0c=
github.com/mudler/cogito v0.10.1-0.20260609212329-bf4010d31047 h1:wJ8WbDah1YcpBNRDmovQro8JiR228YFk7TUqPCS4m04=
github.com/mudler/cogito v0.10.1-0.20260609212329-bf4010d31047/go.mod h1:6sfja3lcu2nWRzEc0wwqGNu/eCG3EWgij+8s7xyUeQ4=
github.com/mudler/cogito v0.9.5-0.20260315222927-63abdec7189b h1:A74T2Lauvg61KodYqsjTYDY05kPLcW+efVZjd23dghU=
github.com/mudler/cogito v0.9.5-0.20260315222927-63abdec7189b/go.mod h1:6sfja3lcu2nWRzEc0wwqGNu/eCG3EWgij+8s7xyUeQ4=
github.com/mudler/edgevpn v0.34.0 h1:qDrD/rCPFY/FdURbXudIZWihVKY4VOX3nMn3CcbeQEU=
github.com/mudler/edgevpn v0.34.0/go.mod h1:yki7uMi5LR9gSMrw8PdPieuxsrk8BLV2Ui7VBEmbbIA=
github.com/mudler/go-piper v0.0.0-20241023091659-2494246fd9fc h1:RxwneJl1VgvikiX28EkpdAyL4yQVnJMrbquKospjHyA=