mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-12 18:58:49 -04:00
Compare commits
1 Commits
fix/9813-o
...
fix/7461-m
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
69c7a8e71d |
@@ -5,6 +5,31 @@ imported by any backend that needs to parse LocalAI gRPC options or build a
|
||||
chat-template-compatible message list from proto Message objects.
|
||||
"""
|
||||
import json
|
||||
from urllib.parse import unquote
|
||||
|
||||
|
||||
def resolve_model_path(model, model_file=""):
    """Resolve a LocalAI model reference to something an HF/MLX loader accepts.

    LocalAI hands backends either a plain HuggingFace repo id
    (``namespace/name``), an already-local filesystem path, or a
    ``file://`` URI (its ``LocalPrefix``) for models imported from disk.
    Loaders such as ``mlx_lm.load`` reject the ``file://`` form because the
    scheme is neither a valid repo id nor an existing path, so we normalize
    it here before loading.

    Resolution order:
      1. Prefer ``model_file`` when set and non-empty - that is the resolved
         local path LocalAI computed for the model.
      2. Strip a ``file://`` scheme (matched case-insensitively) and
         percent-decode the remainder to a plain path.
      3. Leave plain repo ids and already-local paths unchanged.

    Args:
        model: The model reference from the request (repo id, local path or
            ``file://`` URI). May be ``None`` or empty.
        model_file: Optional resolved model file reference; when truthy it
            takes precedence over ``model``.

    Returns:
        The normalized reference as a string, or the chosen input unchanged
        (including ``None`` / ``""``) when there is nothing to normalize.
    """
    candidate = model_file if model_file else model
    if candidate is None:
        return candidate

    scheme = "file://"
    # URI schemes are case-insensitive (RFC 3986, section 3.1), so "FILE://"
    # and "File://" must be normalized exactly like the lowercase spelling;
    # otherwise the loader still receives a URI it cannot handle.
    if candidate[:len(scheme)].lower() == scheme:
        return unquote(candidate[len(scheme):])
    return candidate
|
||||
|
||||
|
||||
def parse_options(options_list):
|
||||
|
||||
@@ -28,7 +28,7 @@ import grpc
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'common'))
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'common'))
|
||||
from grpc_auth import get_auth_interceptors
|
||||
from python_utils import messages_to_dicts, parse_options as _shared_parse_options
|
||||
from python_utils import messages_to_dicts, parse_options as _shared_parse_options, resolve_model_path
|
||||
from mlx_utils import parse_tool_calls, split_reasoning
|
||||
|
||||
|
||||
@@ -99,7 +99,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
from mlx_lm import load
|
||||
from mlx_lm.models.cache import make_prompt_cache, can_trim_prompt_cache, trim_prompt_cache
|
||||
|
||||
print(f"[Rank 0] Loading model: {request.Model}", file=sys.stderr)
|
||||
# Normalize the model reference: strip LocalAI's file:// LocalPrefix
|
||||
# and prefer the resolved ModelFile so mlx_lm.load() gets a plain
|
||||
# repo id or filesystem path (it rejects file:// URIs).
|
||||
model_path = resolve_model_path(request.Model, request.ModelFile)
|
||||
print(f"[Rank 0] Loading model: {model_path}", file=sys.stderr)
|
||||
|
||||
self.options = parse_options(request.Options)
|
||||
print(f"Options: {self.options}", file=sys.stderr)
|
||||
@@ -128,7 +132,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
)
|
||||
self.coordinator = DistributedCoordinator(self.group)
|
||||
self.coordinator.broadcast_command(CMD_LOAD_MODEL)
|
||||
self.coordinator.broadcast_model_name(request.Model)
|
||||
self.coordinator.broadcast_model_name(model_path)
|
||||
else:
|
||||
print("[Rank 0] No hostfile configured, running single-node", file=sys.stderr)
|
||||
|
||||
@@ -144,9 +148,9 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
|
||||
if tokenizer_config:
|
||||
print(f"Loading with tokenizer_config: {tokenizer_config}", file=sys.stderr)
|
||||
self.model, self.tokenizer = load(request.Model, tokenizer_config=tokenizer_config)
|
||||
self.model, self.tokenizer = load(model_path, tokenizer_config=tokenizer_config)
|
||||
else:
|
||||
self.model, self.tokenizer = load(request.Model)
|
||||
self.model, self.tokenizer = load(model_path)
|
||||
|
||||
if self.group is not None:
|
||||
from sharding import pipeline_auto_parallel
|
||||
@@ -157,7 +161,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
from mlx_cache import ThreadSafeLRUPromptCache
|
||||
max_cache_entries = self.options.get("max_cache_entries", 10)
|
||||
self.max_kv_size = self.options.get("max_kv_size", None)
|
||||
self.model_key = request.Model
|
||||
self.model_key = model_path
|
||||
self.lru_cache = ThreadSafeLRUPromptCache(
|
||||
max_size=max_cache_entries,
|
||||
can_trim_fn=can_trim_prompt_cache,
|
||||
|
||||
@@ -18,7 +18,7 @@ import grpc
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'common'))
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'common'))
|
||||
from grpc_auth import get_auth_interceptors
|
||||
from python_utils import messages_to_dicts, parse_options
|
||||
from python_utils import messages_to_dicts, parse_options, resolve_model_path
|
||||
from mlx_utils import parse_tool_calls, split_reasoning
|
||||
|
||||
from mlx_vlm import load, stream_generate
|
||||
@@ -67,7 +67,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
backend_pb2.Result: The load model result.
|
||||
"""
|
||||
try:
|
||||
print(f"Loading MLX-VLM model: {request.Model}", file=sys.stderr)
|
||||
# Normalize the model reference: strip LocalAI's file:// LocalPrefix
|
||||
# and prefer the resolved ModelFile so mlx_vlm.load() gets a plain
|
||||
# repo id or filesystem path (it rejects file:// URIs).
|
||||
model_path = resolve_model_path(request.Model, request.ModelFile)
|
||||
print(f"Loading MLX-VLM model: {model_path}", file=sys.stderr)
|
||||
print(f"Request: {request}", file=sys.stderr)
|
||||
|
||||
# Parse Options[] key:value strings into a typed dict
|
||||
@@ -76,10 +80,10 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
|
||||
# Load model and processor using MLX-VLM
|
||||
# mlx-vlm load function returns (model, processor) instead of (model, tokenizer)
|
||||
self.model, self.processor = load(request.Model)
|
||||
self.model, self.processor = load(model_path)
|
||||
|
||||
# Load model config for chat template support
|
||||
self.config = load_config(request.Model)
|
||||
self.config = load_config(model_path)
|
||||
|
||||
# Auto-infer the tool parser from the chat template. mlx-vlm has
|
||||
# its own _infer_tool_parser that falls back to mlx-lm parsers.
|
||||
|
||||
@@ -17,7 +17,7 @@ import grpc
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'common'))
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'common'))
|
||||
from grpc_auth import get_auth_interceptors
|
||||
from python_utils import messages_to_dicts, parse_options
|
||||
from python_utils import messages_to_dicts, parse_options, resolve_model_path
|
||||
from mlx_utils import parse_tool_calls, split_reasoning
|
||||
|
||||
from mlx_lm import load, stream_generate
|
||||
@@ -63,7 +63,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
backend_pb2.Result: The load model result.
|
||||
"""
|
||||
try:
|
||||
print(f"Loading MLX model: {request.Model}", file=sys.stderr)
|
||||
# Normalize the model reference: strip LocalAI's file:// LocalPrefix
|
||||
# and prefer the resolved ModelFile so mlx_lm.load() gets a plain
|
||||
# repo id or filesystem path (it rejects file:// URIs).
|
||||
model_path = resolve_model_path(request.Model, request.ModelFile)
|
||||
print(f"Loading MLX model: {model_path}", file=sys.stderr)
|
||||
print(f"Request: {request}", file=sys.stderr)
|
||||
|
||||
# Parse Options[] key:value strings into a typed dict (shared helper)
|
||||
@@ -89,9 +93,9 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
# Load model and tokenizer using MLX
|
||||
if tokenizer_config:
|
||||
print(f"Loading with tokenizer_config: {tokenizer_config}", file=sys.stderr)
|
||||
self.model, self.tokenizer = load(request.Model, tokenizer_config=tokenizer_config)
|
||||
self.model, self.tokenizer = load(model_path, tokenizer_config=tokenizer_config)
|
||||
else:
|
||||
self.model, self.tokenizer = load(request.Model)
|
||||
self.model, self.tokenizer = load(model_path)
|
||||
|
||||
# mlx_lm.load() returns a TokenizerWrapper that detects tool
|
||||
# calling and thinking markers from the chat template / vocab.
|
||||
@@ -111,7 +115,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
# Initialize thread-safe LRU prompt cache for efficient generation
|
||||
max_cache_entries = self.options.get("max_cache_entries", 10)
|
||||
self.max_kv_size = self.options.get("max_kv_size", None)
|
||||
self.model_key = request.Model
|
||||
self.model_key = model_path
|
||||
self.lru_cache = ThreadSafeLRUPromptCache(
|
||||
max_size=max_cache_entries,
|
||||
can_trim_fn=can_trim_prompt_cache,
|
||||
|
||||
@@ -12,7 +12,7 @@ import backend_pb2_grpc
|
||||
# Make the shared helpers importable so we can unit-test them without a
|
||||
# running gRPC server.
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'common'))
|
||||
from python_utils import messages_to_dicts, parse_options
|
||||
from python_utils import messages_to_dicts, parse_options, resolve_model_path
|
||||
from mlx_utils import parse_tool_calls, split_reasoning
|
||||
|
||||
class TestBackendServicer(unittest.TestCase):
|
||||
@@ -322,6 +322,42 @@ class TestSharedHelpers(unittest.TestCase):
|
||||
self.assertEqual(r, "")
|
||||
self.assertEqual(c, "just text")
|
||||
|
||||
def test_resolve_model_path_file_uri(self):
    # A model imported from disk arrives as a file:// URI (LocalAI's
    # LocalPrefix); only the plain filesystem path should remain.
    resolved = resolve_model_path("file:///a/b")
    self.assertEqual(resolved, "/a/b")
|
||||
|
||||
def test_resolve_model_path_file_uri_percent_decoded(self):
    # Escapes such as %20 in the URI must come back as the literal
    # characters (here, a space) in the resulting path.
    encoded_uri = "file:///Users/me/My%20Models/Qwen3"
    resolved = resolve_model_path(encoded_uri)
    self.assertEqual(resolved, "/Users/me/My Models/Qwen3")
|
||||
|
||||
def test_resolve_model_path_hf_repo_id_unchanged(self):
    # A namespace/name HuggingFace repo id has no scheme to strip, so it
    # has to come back exactly as given.
    resolved = resolve_model_path("mlx-community/Qwen3-Coder-30B")
    self.assertEqual(resolved, "mlx-community/Qwen3-Coder-30B")
|
||||
|
||||
def test_resolve_model_path_local_path_unchanged(self):
    # Absolute filesystem paths are already loadable and must not be
    # rewritten.
    resolved = resolve_model_path("/models/Qwen3")
    self.assertEqual(resolved, "/models/Qwen3")
|
||||
|
||||
def test_resolve_model_path_prefers_model_file(self):
    # With both arguments supplied, the resolved ModelFile takes
    # precedence and the Model value is disregarded.
    resolved = resolve_model_path("file:///ignored", "/resolved/local/path")
    self.assertEqual(resolved, "/resolved/local/path")
|
||||
|
||||
def test_resolve_model_path_model_file_file_uri(self):
    # The file:// normalization also applies when the URI is passed as
    # the ModelFile argument rather than as Model.
    resolved = resolve_model_path("ignored", "file:///a/b")
    self.assertEqual(resolved, "/a/b")
|
||||
|
||||
def test_parse_tool_calls_with_shim(self):
|
||||
tm = types.SimpleNamespace(
|
||||
tool_call_start="<tool_call>",
|
||||
|
||||
@@ -2,8 +2,6 @@ package openai
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
"runtime"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/labstack/echo/v4"
|
||||
@@ -13,52 +11,6 @@ import (
|
||||
"github.com/pion/webrtc/v4"
|
||||
)
|
||||
|
||||
// opusBackendName is the canonical gallery name/alias of the opus audio codec
|
||||
// backend that the realtime WebRTC transport needs.
|
||||
const opusBackendName = "opus"
|
||||
|
||||
// resolveOpusBackend picks which installed opus-codec backend the realtime
|
||||
// WebRTC transport should load. The transport historically hardcoded the
|
||||
// literal "opus" backend name, but on darwin/arm64 the only installable opus
|
||||
// codec is "metal-opus" (it shares the gallery alias "opus"). When that
|
||||
// platform-specific variant is registered under its concrete directory name
|
||||
// rather than the "opus" alias key, loading the literal "opus" fails with
|
||||
// "opus backend not available" (issue #9813). Given the set of currently
|
||||
// loadable backend names, this returns the best opus codec to load for the
|
||||
// running platform, falling back to the literal name so the caller surfaces
|
||||
// the same error as before when no opus codec is installed at all.
|
||||
func resolveOpusBackend(installed []string, goos, goarch string) string {
|
||||
// An exact match wins: this covers the plain "opus" backend as well as the
|
||||
// "opus" alias key registered by gallery alias resolution for a
|
||||
// user-installed platform variant.
|
||||
for _, b := range installed {
|
||||
if b == opusBackendName {
|
||||
return opusBackendName
|
||||
}
|
||||
}
|
||||
|
||||
// No "opus" key is registered (e.g. a system-path metal-opus whose alias
|
||||
// was never collected). Fall back to a platform-appropriate "*opus*" codec
|
||||
// backend; on darwin/arm64 prefer the metal build.
|
||||
var fallback string
|
||||
for _, b := range installed {
|
||||
if !strings.Contains(strings.ToLower(b), opusBackendName) {
|
||||
continue
|
||||
}
|
||||
if goos == "darwin" && goarch == "arm64" && strings.Contains(strings.ToLower(b), "metal") {
|
||||
return b
|
||||
}
|
||||
if fallback == "" {
|
||||
fallback = b
|
||||
}
|
||||
}
|
||||
if fallback != "" {
|
||||
return fallback
|
||||
}
|
||||
|
||||
return opusBackendName
|
||||
}
|
||||
|
||||
// RealtimeCallRequest is the JSON body for POST /v1/realtime/calls.
|
||||
type RealtimeCallRequest struct {
|
||||
SDP string `json:"sdp"`
|
||||
@@ -142,25 +94,15 @@ func RealtimeCalls(application *application.Application) echo.HandlerFunc {
|
||||
}
|
||||
}()
|
||||
|
||||
// Load the Opus backend. The opus codec ships under different backend
|
||||
// names per platform (e.g. "metal-opus" on darwin/arm64), so resolve the
|
||||
// platform-appropriate variant from the installed backends instead of
|
||||
// hardcoding the literal "opus" name (issue #9813).
|
||||
ml := application.ModelLoader()
|
||||
installed := make([]string, 0)
|
||||
for name := range ml.GetAllExternalBackends(nil) {
|
||||
installed = append(installed, name)
|
||||
}
|
||||
opusName := resolveOpusBackend(installed, runtime.GOOS, runtime.GOARCH)
|
||||
|
||||
opusBackend, err := ml.Load(
|
||||
model.WithBackendString(opusName),
|
||||
// Load the Opus backend
|
||||
opusBackend, err := application.ModelLoader().Load(
|
||||
model.WithBackendString("opus"),
|
||||
model.WithModelID("__opus_codec__"),
|
||||
model.WithModel(opusName),
|
||||
model.WithModel("opus"),
|
||||
)
|
||||
if err != nil {
|
||||
pc.Close()
|
||||
xlog.Error("failed to load opus backend", "error", err, "backend", opusName)
|
||||
xlog.Error("failed to load opus backend", "error", err)
|
||||
return c.JSON(http.StatusInternalServerError, map[string]string{"error": "opus backend not available"})
|
||||
}
|
||||
|
||||
|
||||
@@ -1,32 +0,0 @@
|
||||
package openai
|
||||
|
||||
import (
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
var _ = Describe("resolveOpusBackend", func() {
|
||||
It("prefers the exact opus backend when it is installed", func() {
|
||||
Expect(resolveOpusBackend([]string{"opus", "metal-opus"}, "linux", "amd64")).To(Equal("opus"))
|
||||
})
|
||||
|
||||
It("resolves to the opus alias key on linux", func() {
|
||||
Expect(resolveOpusBackend([]string{"opus"}, "linux", "amd64")).To(Equal("opus"))
|
||||
})
|
||||
|
||||
It("selects metal-opus on darwin/arm64 when no plain opus is installed", func() {
|
||||
Expect(resolveOpusBackend([]string{"metal-opus"}, "darwin", "arm64")).To(Equal("metal-opus"))
|
||||
})
|
||||
|
||||
It("selects metal-opus on darwin/arm64 even when other backends are present", func() {
|
||||
Expect(resolveOpusBackend([]string{"silero-vad", "metal-opus", "whisper"}, "darwin", "arm64")).To(Equal("metal-opus"))
|
||||
})
|
||||
|
||||
It("falls back to any opus codec backend when there is no exact match (non-darwin)", func() {
|
||||
Expect(resolveOpusBackend([]string{"metal-opus"}, "linux", "amd64")).To(Equal("metal-opus"))
|
||||
})
|
||||
|
||||
It("returns the literal opus name when no opus codec is installed", func() {
|
||||
Expect(resolveOpusBackend([]string{"silero-vad", "whisper"}, "darwin", "arm64")).To(Equal("opus"))
|
||||
})
|
||||
})
|
||||
Reference in New Issue
Block a user