mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-12 18:58:49 -04:00
Compare commits
1 Commits
fix/8059-c
...
fix/7461-m
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
69c7a8e71d |
@@ -5,6 +5,31 @@ imported by any backend that needs to parse LocalAI gRPC options or build a
|
||||
chat-template-compatible message list from proto Message objects.
|
||||
"""
|
||||
import json
|
||||
from urllib.parse import unquote
|
||||
|
||||
|
||||
def resolve_model_path(model, model_file=""):
    """Normalize a LocalAI model reference for HF/MLX style loaders.

    A reference can be a HuggingFace repo id (``namespace/name``), a path
    that is already local, or a ``file://`` URI (LocalAI's ``LocalPrefix``
    for models imported from disk). Loaders such as ``mlx_lm.load`` reject
    the ``file://`` form, so it is reduced to a plain path here.

    Args:
        model: The model reference from the request; may be ``None``.
        model_file: The resolved local path computed by LocalAI. When it is
            non-empty it takes precedence over ``model``.

    Returns:
        The chosen reference with any leading ``file://`` removed and the
        remainder percent-decoded; repo ids and plain paths are returned
        unchanged. ``None`` is returned when the chosen reference is ``None``.
    """
    # An empty/falsy model_file falls through to the model reference.
    reference = model_file or model
    if reference is None:
        return None

    scheme = "file://"
    if not reference.startswith(scheme):
        # Repo id or already-local path: nothing to normalize.
        return reference

    # Drop the scheme, then decode escapes such as %20.
    return unquote(reference[len(scheme):])
|
||||
|
||||
|
||||
def parse_options(options_list):
|
||||
|
||||
@@ -28,7 +28,7 @@ import grpc
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'common'))
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'common'))
|
||||
from grpc_auth import get_auth_interceptors
|
||||
from python_utils import messages_to_dicts, parse_options as _shared_parse_options
|
||||
from python_utils import messages_to_dicts, parse_options as _shared_parse_options, resolve_model_path
|
||||
from mlx_utils import parse_tool_calls, split_reasoning
|
||||
|
||||
|
||||
@@ -99,7 +99,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
from mlx_lm import load
|
||||
from mlx_lm.models.cache import make_prompt_cache, can_trim_prompt_cache, trim_prompt_cache
|
||||
|
||||
print(f"[Rank 0] Loading model: {request.Model}", file=sys.stderr)
|
||||
# Normalize the model reference: strip LocalAI's file:// LocalPrefix
|
||||
# and prefer the resolved ModelFile so mlx_lm.load() gets a plain
|
||||
# repo id or filesystem path (it rejects file:// URIs).
|
||||
model_path = resolve_model_path(request.Model, request.ModelFile)
|
||||
print(f"[Rank 0] Loading model: {model_path}", file=sys.stderr)
|
||||
|
||||
self.options = parse_options(request.Options)
|
||||
print(f"Options: {self.options}", file=sys.stderr)
|
||||
@@ -128,7 +132,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
)
|
||||
self.coordinator = DistributedCoordinator(self.group)
|
||||
self.coordinator.broadcast_command(CMD_LOAD_MODEL)
|
||||
self.coordinator.broadcast_model_name(request.Model)
|
||||
self.coordinator.broadcast_model_name(model_path)
|
||||
else:
|
||||
print("[Rank 0] No hostfile configured, running single-node", file=sys.stderr)
|
||||
|
||||
@@ -144,9 +148,9 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
|
||||
if tokenizer_config:
|
||||
print(f"Loading with tokenizer_config: {tokenizer_config}", file=sys.stderr)
|
||||
self.model, self.tokenizer = load(request.Model, tokenizer_config=tokenizer_config)
|
||||
self.model, self.tokenizer = load(model_path, tokenizer_config=tokenizer_config)
|
||||
else:
|
||||
self.model, self.tokenizer = load(request.Model)
|
||||
self.model, self.tokenizer = load(model_path)
|
||||
|
||||
if self.group is not None:
|
||||
from sharding import pipeline_auto_parallel
|
||||
@@ -157,7 +161,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
from mlx_cache import ThreadSafeLRUPromptCache
|
||||
max_cache_entries = self.options.get("max_cache_entries", 10)
|
||||
self.max_kv_size = self.options.get("max_kv_size", None)
|
||||
self.model_key = request.Model
|
||||
self.model_key = model_path
|
||||
self.lru_cache = ThreadSafeLRUPromptCache(
|
||||
max_size=max_cache_entries,
|
||||
can_trim_fn=can_trim_prompt_cache,
|
||||
|
||||
@@ -18,7 +18,7 @@ import grpc
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'common'))
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'common'))
|
||||
from grpc_auth import get_auth_interceptors
|
||||
from python_utils import messages_to_dicts, parse_options
|
||||
from python_utils import messages_to_dicts, parse_options, resolve_model_path
|
||||
from mlx_utils import parse_tool_calls, split_reasoning
|
||||
|
||||
from mlx_vlm import load, stream_generate
|
||||
@@ -67,7 +67,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
backend_pb2.Result: The load model result.
|
||||
"""
|
||||
try:
|
||||
print(f"Loading MLX-VLM model: {request.Model}", file=sys.stderr)
|
||||
# Normalize the model reference: strip LocalAI's file:// LocalPrefix
|
||||
# and prefer the resolved ModelFile so mlx_vlm.load() gets a plain
|
||||
# repo id or filesystem path (it rejects file:// URIs).
|
||||
model_path = resolve_model_path(request.Model, request.ModelFile)
|
||||
print(f"Loading MLX-VLM model: {model_path}", file=sys.stderr)
|
||||
print(f"Request: {request}", file=sys.stderr)
|
||||
|
||||
# Parse Options[] key:value strings into a typed dict
|
||||
@@ -76,10 +80,10 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
|
||||
# Load model and processor using MLX-VLM
|
||||
# mlx-vlm load function returns (model, processor) instead of (model, tokenizer)
|
||||
self.model, self.processor = load(request.Model)
|
||||
self.model, self.processor = load(model_path)
|
||||
|
||||
# Load model config for chat template support
|
||||
self.config = load_config(request.Model)
|
||||
self.config = load_config(model_path)
|
||||
|
||||
# Auto-infer the tool parser from the chat template. mlx-vlm has
|
||||
# its own _infer_tool_parser that falls back to mlx-lm parsers.
|
||||
|
||||
@@ -17,7 +17,7 @@ import grpc
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'common'))
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'common'))
|
||||
from grpc_auth import get_auth_interceptors
|
||||
from python_utils import messages_to_dicts, parse_options
|
||||
from python_utils import messages_to_dicts, parse_options, resolve_model_path
|
||||
from mlx_utils import parse_tool_calls, split_reasoning
|
||||
|
||||
from mlx_lm import load, stream_generate
|
||||
@@ -63,7 +63,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
backend_pb2.Result: The load model result.
|
||||
"""
|
||||
try:
|
||||
print(f"Loading MLX model: {request.Model}", file=sys.stderr)
|
||||
# Normalize the model reference: strip LocalAI's file:// LocalPrefix
|
||||
# and prefer the resolved ModelFile so mlx_lm.load() gets a plain
|
||||
# repo id or filesystem path (it rejects file:// URIs).
|
||||
model_path = resolve_model_path(request.Model, request.ModelFile)
|
||||
print(f"Loading MLX model: {model_path}", file=sys.stderr)
|
||||
print(f"Request: {request}", file=sys.stderr)
|
||||
|
||||
# Parse Options[] key:value strings into a typed dict (shared helper)
|
||||
@@ -89,9 +93,9 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
# Load model and tokenizer using MLX
|
||||
if tokenizer_config:
|
||||
print(f"Loading with tokenizer_config: {tokenizer_config}", file=sys.stderr)
|
||||
self.model, self.tokenizer = load(request.Model, tokenizer_config=tokenizer_config)
|
||||
self.model, self.tokenizer = load(model_path, tokenizer_config=tokenizer_config)
|
||||
else:
|
||||
self.model, self.tokenizer = load(request.Model)
|
||||
self.model, self.tokenizer = load(model_path)
|
||||
|
||||
# mlx_lm.load() returns a TokenizerWrapper that detects tool
|
||||
# calling and thinking markers from the chat template / vocab.
|
||||
@@ -111,7 +115,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
# Initialize thread-safe LRU prompt cache for efficient generation
|
||||
max_cache_entries = self.options.get("max_cache_entries", 10)
|
||||
self.max_kv_size = self.options.get("max_kv_size", None)
|
||||
self.model_key = request.Model
|
||||
self.model_key = model_path
|
||||
self.lru_cache = ThreadSafeLRUPromptCache(
|
||||
max_size=max_cache_entries,
|
||||
can_trim_fn=can_trim_prompt_cache,
|
||||
|
||||
@@ -12,7 +12,7 @@ import backend_pb2_grpc
|
||||
# Make the shared helpers importable so we can unit-test them without a
|
||||
# running gRPC server.
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'common'))
|
||||
from python_utils import messages_to_dicts, parse_options
|
||||
from python_utils import messages_to_dicts, parse_options, resolve_model_path
|
||||
from mlx_utils import parse_tool_calls, split_reasoning
|
||||
|
||||
class TestBackendServicer(unittest.TestCase):
|
||||
@@ -322,6 +322,42 @@ class TestSharedHelpers(unittest.TestCase):
|
||||
self.assertEqual(r, "")
|
||||
self.assertEqual(c, "just text")
|
||||
|
||||
def test_resolve_model_path_file_uri(self):
    # A file:// reference (LocalAI's LocalPrefix) comes back as a bare path.
    resolved = resolve_model_path("file:///a/b")
    self.assertEqual(resolved, "/a/b")
|
||||
|
||||
def test_resolve_model_path_file_uri_percent_decoded(self):
    # Escapes such as %20 inside a file:// reference are decoded.
    resolved = resolve_model_path("file:///Users/me/My%20Models/Qwen3")
    self.assertEqual(resolved, "/Users/me/My Models/Qwen3")
|
||||
|
||||
def test_resolve_model_path_hf_repo_id_unchanged(self):
    # A plain HuggingFace repo id is returned exactly as given.
    repo_id = "mlx-community/Qwen3-Coder-30B"
    self.assertEqual(resolve_model_path(repo_id), "mlx-community/Qwen3-Coder-30B")
|
||||
|
||||
def test_resolve_model_path_local_path_unchanged(self):
    # An absolute path without a scheme passes through untouched.
    resolved = resolve_model_path("/models/Qwen3")
    self.assertEqual(resolved, "/models/Qwen3")
|
||||
|
||||
def test_resolve_model_path_prefers_model_file(self):
    # When both arguments are set, the resolved ModelFile is the one used.
    resolved = resolve_model_path("file:///ignored", "/resolved/local/path")
    self.assertEqual(resolved, "/resolved/local/path")
|
||||
|
||||
def test_resolve_model_path_model_file_file_uri(self):
    # A ModelFile given as a file:// URI gets the same normalization.
    resolved = resolve_model_path("ignored", "file:///a/b")
    self.assertEqual(resolved, "/a/b")
|
||||
|
||||
def test_parse_tool_calls_with_shim(self):
|
||||
tm = types.SimpleNamespace(
|
||||
tool_call_start="<tool_call>",
|
||||
|
||||
@@ -1,19 +1,9 @@
|
||||
package xsysinfo
|
||||
|
||||
import (
|
||||
"os"
|
||||
|
||||
"github.com/mudler/memory"
|
||||
)
|
||||
|
||||
// Locations of the cgroup and procfs files consulted when computing a
// container-aware RAM total. Each is declared with var rather than const so
// that a test could point it somewhere else if needed.

// cgroupV2MaxPath is the cgroup v2 memory limit file.
var cgroupV2MaxPath = "/sys/fs/cgroup/memory.max"

// cgroupV1LimitPath is the cgroup v1 memory limit file.
var cgroupV1LimitPath = "/sys/fs/cgroup/memory/memory.limit_in_bytes"

// procMemInfoPath is the procfs memory summary file.
var procMemInfoPath = "/proc/meminfo"
|
||||
|
||||
// SystemRAMInfo contains system RAM usage information
|
||||
type SystemRAMInfo struct {
|
||||
Total uint64 `json:"total"`
|
||||
@@ -23,45 +13,12 @@ type SystemRAMInfo struct {
|
||||
UsagePercent float64 `json:"usage_percent"`
|
||||
}
|
||||
|
||||
// readFileBestEffort reads a file and returns its contents, or "" on any error.
|
||||
// Missing cgroup/proc files (e.g. on non-Linux hosts) are expected and benign.
|
||||
func readFileBestEffort(path string) string {
|
||||
b, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
return string(b)
|
||||
}
|
||||
|
||||
// systemTotalMemory returns the container-aware total system RAM in bytes.
|
||||
//
|
||||
// memory.TotalMemory() reports the HOST kernel total (syscall.Sysinfo on
|
||||
// Linux), which lxcfs/LXD does NOT virtualize. Inside a container that
|
||||
// over-reports physical RAM and, combined with the virtualized MemAvailable,
|
||||
// inflates the reported usage (see issue #8059). We instead derive the total
|
||||
// from the minimum of all available container-aware candidates.
|
||||
func systemTotalMemory() uint64 {
|
||||
return chooseTotalMemory(
|
||||
readFileBestEffort(cgroupV2MaxPath),
|
||||
readFileBestEffort(cgroupV1LimitPath),
|
||||
readFileBestEffort(procMemInfoPath),
|
||||
memory.TotalMemory(),
|
||||
)
|
||||
}
|
||||
|
||||
// GetSystemRAMInfo returns real-time system RAM usage
|
||||
func GetSystemRAMInfo() (*SystemRAMInfo, error) {
|
||||
total := systemTotalMemory()
|
||||
available := memory.AvailableMemory()
|
||||
total := memory.TotalMemory()
|
||||
free := memory.AvailableMemory()
|
||||
|
||||
// AvailableMemory (MemAvailable) is virtualized by lxcfs, so in edge
|
||||
// cases it can exceed our corrected total; clamp to avoid an unsigned
|
||||
// underflow when computing Used.
|
||||
if available > total {
|
||||
available = total
|
||||
}
|
||||
|
||||
used := total - available
|
||||
used := total - free
|
||||
|
||||
usagePercent := 0.0
|
||||
if total > 0 {
|
||||
@@ -70,8 +27,8 @@ func GetSystemRAMInfo() (*SystemRAMInfo, error) {
|
||||
return &SystemRAMInfo{
|
||||
Total: total,
|
||||
Used: used,
|
||||
Free: available,
|
||||
Available: available,
|
||||
Free: free,
|
||||
Available: total - used,
|
||||
UsagePercent: usagePercent,
|
||||
}, nil
|
||||
}
|
||||
|
||||
@@ -1,120 +0,0 @@
|
||||
package xsysinfo
|
||||
|
||||
import (
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// cgroupV1UnlimitedSentinel is what the kernel writes to
// memory.limit_in_bytes when no limit is configured: PAGE_COUNTER_MAX,
// i.e. LONG_MAX rounded down to a page boundary, which is
// 0x7FFFFFFFFFFFF000 with 4 KiB pages. Values at or above it mean
// "no limit".
const cgroupV1UnlimitedSentinel uint64 = 0x7FFFFFFFFFFFF000
|
||||
|
||||
// parseUintField reads a base-10 unsigned integer out of raw file contents,
// ignoring surrounding whitespace. The boolean is false (and the value 0)
// when the trimmed content is empty or is not a valid uint64.
func parseUintField(raw string) (uint64, bool) {
	trimmed := strings.TrimSpace(raw)
	if len(trimmed) == 0 {
		return 0, false
	}
	if parsed, err := strconv.ParseUint(trimmed, 10, 64); err == nil {
		return parsed, true
	}
	return 0, false
}
|
||||
|
||||
// parseCgroupV2Max interprets the contents of cgroup v2 memory.max.
|
||||
// The literal "max" means unlimited, returning 0.
|
||||
func parseCgroupV2Max(raw string) uint64 {
|
||||
if strings.TrimSpace(raw) == "max" {
|
||||
return 0
|
||||
}
|
||||
v, ok := parseUintField(raw)
|
||||
if !ok {
|
||||
return 0
|
||||
}
|
||||
return v
|
||||
}
|
||||
|
||||
// parseCgroupV1Limit interprets the contents of cgroup v1
|
||||
// memory.limit_in_bytes. The kernel's "unlimited" sentinel (a value at or
|
||||
// above PAGE_COUNTER_MAX) is treated as no limit, returning 0.
|
||||
func parseCgroupV1Limit(raw string) uint64 {
|
||||
v, ok := parseUintField(raw)
|
||||
if !ok {
|
||||
return 0
|
||||
}
|
||||
if v >= cgroupV1UnlimitedSentinel {
|
||||
return 0
|
||||
}
|
||||
return v
|
||||
}
|
||||
|
||||
// parseMemTotal returns the MemTotal value, in bytes, found in raw
// /proc/meminfo contents.
//
// Only the first line starting with "MemTotal:" is considered. Its numeric
// field is scaled by the unit that follows it, matched case-insensitively:
// kB => x1024, MB => x1024^2, GB => x1024^3. With no unit, or an
// unrecognized one, the number is returned unscaled. The result is 0 when
// the line is missing, has no value field, or the value is not an unsigned
// integer.
func parseMemTotal(raw string) uint64 {
	const key = "MemTotal:"

	for _, entry := range strings.Split(raw, "\n") {
		if !strings.HasPrefix(entry, key) {
			continue
		}

		// Expected layout: ["MemTotal:", "<value>", "kB"]
		parts := strings.Fields(entry)
		if len(parts) < 2 {
			return 0
		}
		amount, err := strconv.ParseUint(parts[1], 10, 64)
		if err != nil {
			return 0
		}

		scale := uint64(1)
		if len(parts) >= 3 {
			switch strings.ToLower(parts[2]) {
			case "kb":
				scale = 1024
			case "mb":
				scale = 1024 * 1024
			case "gb":
				scale = 1024 * 1024 * 1024
			}
		}
		return amount * scale
	}

	return 0
}
|
||||
|
||||
// chooseTotalMemory selects the most accurate system RAM total in bytes.
|
||||
//
|
||||
// On Linux the host kernel total (sysinfoTotal, from syscall.Sysinfo) is NOT
|
||||
// virtualized by lxcfs/LXD, so inside a container it over-reports physical
|
||||
// RAM. The cgroup limits and /proc/meminfo MemTotal, by contrast, do reflect
|
||||
// the container's view. We therefore take the MINIMUM of all non-zero,
|
||||
// non-unlimited candidates:
|
||||
//
|
||||
// - cgroup v2 memory.max ("max" => unlimited, skipped)
|
||||
// - cgroup v1 memory.limit_in_bytes (kernel sentinel => unlimited, skipped)
|
||||
// - /proc/meminfo MemTotal (lxcfs/LXD virtualizes this)
|
||||
// - sysinfoTotal (bare-metal fallback)
|
||||
//
|
||||
// On bare metal the cgroup limits are unlimited and MemTotal == sysinfoTotal,
|
||||
// so the result equals the host total exactly as before.
|
||||
func chooseTotalMemory(cgroupV2Max, cgroupV1Limit, procMemInfo string, sysinfoTotal uint64) uint64 {
|
||||
candidates := []uint64{
|
||||
parseCgroupV2Max(cgroupV2Max),
|
||||
parseCgroupV1Limit(cgroupV1Limit),
|
||||
parseMemTotal(procMemInfo),
|
||||
sysinfoTotal,
|
||||
}
|
||||
|
||||
var best uint64
|
||||
for _, c := range candidates {
|
||||
if c == 0 {
|
||||
continue
|
||||
}
|
||||
if best == 0 || c < best {
|
||||
best = c
|
||||
}
|
||||
}
|
||||
return best
|
||||
}
|
||||
@@ -1,74 +0,0 @@
|
||||
package xsysinfo
|
||||
|
||||
import (
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
var _ = Describe("chooseTotalMemory", func() {
|
||||
const (
|
||||
gi128 = uint64(128) * 1024 * 1024 * 1024
|
||||
gi20 = uint64(20) * 1024 * 1024 * 1024
|
||||
gi10 = uint64(10) * 1024 * 1024 * 1024
|
||||
)
|
||||
|
||||
// /proc/meminfo MemTotal is in kB; build a snippet for a given byte total.
|
||||
memInfo := func(bytes uint64) string {
|
||||
kb := bytes / 1024
|
||||
return "MemTotal: " + itoa(kb) + " kB\nMemFree: 123 kB\n"
|
||||
}
|
||||
|
||||
Context("bare metal (no cgroup cap, memory.max == max)", func() {
|
||||
It("uses the host sysinfo total", func() {
|
||||
// MemTotal mirrors sysinfo on bare metal.
|
||||
got := chooseTotalMemory("max\n", string(rune(0)), memInfo(gi128), gi128)
|
||||
Expect(got).To(Equal(gi128))
|
||||
})
|
||||
})
|
||||
|
||||
Context("LXD/lxcfs container (MemTotal virtualized below host, no cap)", func() {
|
||||
It("uses the virtualized MemTotal, not the host sysinfo total", func() {
|
||||
// This is issue #8059: host sysinfo says 128Gi, but lxcfs
|
||||
// virtualizes /proc/meminfo MemTotal to 20Gi and there is no
|
||||
// cgroup cap. The corrected total must be 20Gi.
|
||||
got := chooseTotalMemory("max\n", "", memInfo(gi20), gi128)
|
||||
Expect(got).To(Equal(gi20))
|
||||
})
|
||||
})
|
||||
|
||||
Context("cgroup v2 cap set below MemTotal", func() {
|
||||
It("uses the cgroup cap", func() {
|
||||
got := chooseTotalMemory(itoa(gi10)+"\n", "", memInfo(gi20), gi128)
|
||||
Expect(got).To(Equal(gi10))
|
||||
})
|
||||
})
|
||||
|
||||
Context("cgroup v1 with the kernel unlimited sentinel", func() {
|
||||
It("ignores the sentinel and falls back to MemTotal", func() {
|
||||
got := chooseTotalMemory("", "9223372036854771712\n", memInfo(gi20), gi128)
|
||||
Expect(got).To(Equal(gi20))
|
||||
})
|
||||
})
|
||||
|
||||
Context("all candidates empty/unlimited", func() {
|
||||
It("falls back to sysinfo total", func() {
|
||||
got := chooseTotalMemory("max\n", "", "", gi128)
|
||||
Expect(got).To(Equal(gi128))
|
||||
})
|
||||
})
|
||||
})
|
||||
|
||||
// itoa renders v in base 10. It exists only so the test does not need to
// import strconv.
func itoa(v uint64) string {
	// 20 bytes is enough for the largest uint64 (20 decimal digits).
	var digits [20]byte
	pos := len(digits)

	// Emit at least one digit so that 0 becomes "0"; fill from the right.
	for {
		pos--
		digits[pos] = '0' + byte(v%10)
		v /= 10
		if v == 0 {
			break
		}
	}

	return string(digits[pos:])
}
|
||||
Reference in New Issue
Block a user