mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-25 00:59:28 -04:00
Compare commits
12 Commits
fix/parake
...
feat/darwi
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
e4ea2dcfa8 | ||
|
|
f88981cdce | ||
|
|
0d6de15ae9 | ||
|
|
5c3d48ab50 | ||
|
|
764b0352b9 | ||
|
|
75ba2daba1 | ||
|
|
62b14fd635 | ||
|
|
193d0e6aef | ||
|
|
40daa857c7 | ||
|
|
482314c623 | ||
|
|
c0efc28968 | ||
|
|
e8ae88a2a0 |
10
.github/backend-matrix.yml
vendored
10
.github/backend-matrix.yml
vendored
@@ -4974,6 +4974,12 @@ includeDarwin:
|
||||
- backend: "kitten-tts"
|
||||
tag-suffix: "-metal-darwin-arm64-kitten-tts"
|
||||
build-type: "mps"
|
||||
- backend: "trl"
|
||||
tag-suffix: "-metal-darwin-arm64-trl"
|
||||
build-type: "mps"
|
||||
- backend: "liquid-audio"
|
||||
tag-suffix: "-metal-darwin-arm64-liquid-audio"
|
||||
build-type: "mps"
|
||||
- backend: "piper"
|
||||
tag-suffix: "-metal-darwin-arm64-piper"
|
||||
build-type: "metal"
|
||||
@@ -4990,6 +4996,10 @@ includeDarwin:
|
||||
tag-suffix: "-metal-darwin-arm64-sherpa-onnx"
|
||||
build-type: "metal"
|
||||
lang: "go"
|
||||
- backend: "supertonic"
|
||||
tag-suffix: "-metal-darwin-arm64-supertonic"
|
||||
build-type: "metal"
|
||||
lang: "go"
|
||||
- backend: "local-store"
|
||||
tag-suffix: "-metal-darwin-arm64-local-store"
|
||||
build-type: "metal"
|
||||
|
||||
@@ -16,6 +16,7 @@ import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"runtime"
|
||||
"strings"
|
||||
"time"
|
||||
"unicode"
|
||||
@@ -943,7 +944,13 @@ func InitializeONNXRuntime() error {
|
||||
}
|
||||
}
|
||||
if libPath == "" {
|
||||
libPath = "/usr/local/lib/libonnxruntime.so"
|
||||
// LocalAI: default to the platform-native shared library
|
||||
// extension when nothing else is found (dyld vs ld.so).
|
||||
if runtime.GOOS == "darwin" {
|
||||
libPath = "/usr/local/lib/libonnxruntime.dylib"
|
||||
} else {
|
||||
libPath = "/usr/local/lib/libonnxruntime.so"
|
||||
}
|
||||
}
|
||||
}
|
||||
ort.SetSharedLibraryPath(libPath)
|
||||
|
||||
@@ -32,6 +32,10 @@ elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
|
||||
cp -arfLv /lib/aarch64-linux-gnu/libdl.so.2 $CURDIR/package/lib/libdl.so.2
|
||||
cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 $CURDIR/package/lib/librt.so.1
|
||||
cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 $CURDIR/package/lib/libpthread.so.0
|
||||
elif [ $(uname -s) = "Darwin" ]; then
|
||||
# macOS: dyld resolves the bundled .dylib via DYLD_LIBRARY_PATH (set in
|
||||
# run.sh); there is no ld.so loader nor glibc to bundle.
|
||||
echo "Detected Darwin"
|
||||
else
|
||||
echo "Error: Could not detect architecture"
|
||||
exit 1
|
||||
|
||||
@@ -3,12 +3,19 @@ set -ex
|
||||
|
||||
CURDIR=$(dirname "$(realpath $0)")
|
||||
|
||||
export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
|
||||
export ONNXRUNTIME_LIB_PATH=$CURDIR/lib/libonnxruntime.so
|
||||
if [ "$(uname)" = "Darwin" ]; then
|
||||
# macOS uses dyld: there is no ld.so loader, and the search path env
|
||||
# var is DYLD_LIBRARY_PATH. ONNX Runtime ships as a .dylib here.
|
||||
export DYLD_LIBRARY_PATH=$CURDIR/lib:$DYLD_LIBRARY_PATH
|
||||
export ONNXRUNTIME_LIB_PATH=$CURDIR/lib/libonnxruntime.dylib
|
||||
else
|
||||
export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
|
||||
export ONNXRUNTIME_LIB_PATH=$CURDIR/lib/libonnxruntime.so
|
||||
|
||||
if [ -f $CURDIR/lib/ld.so ]; then
|
||||
echo "Using lib/ld.so"
|
||||
exec $CURDIR/lib/ld.so $CURDIR/supertonic "$@"
|
||||
if [ -f $CURDIR/lib/ld.so ]; then
|
||||
echo "Using lib/ld.so"
|
||||
exec $CURDIR/lib/ld.so $CURDIR/supertonic "$@"
|
||||
fi
|
||||
fi
|
||||
|
||||
exec $CURDIR/supertonic "$@"
|
||||
|
||||
@@ -1284,6 +1284,7 @@
|
||||
nvidia-cuda-13: "cuda13-liquid-audio"
|
||||
nvidia-cuda-12: "cuda12-liquid-audio"
|
||||
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-liquid-audio"
|
||||
metal: "metal-liquid-audio"
|
||||
icon: https://cdn-avatars.huggingface.co/v1/production/uploads/61b8e2ba285851687028d395/7_6D7rWrLxp2hb6OHSV1p.png
|
||||
- &qwen-tts
|
||||
urls:
|
||||
@@ -1569,6 +1570,7 @@
|
||||
- TTS
|
||||
capabilities:
|
||||
default: "cpu-supertonic"
|
||||
metal: "metal-supertonic"
|
||||
- !!merge <<: *neutts
|
||||
name: "neutts-development"
|
||||
capabilities:
|
||||
@@ -4612,6 +4614,7 @@
|
||||
nvidia-cuda-13: "cuda13-liquid-audio-development"
|
||||
nvidia-cuda-12: "cuda12-liquid-audio-development"
|
||||
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-liquid-audio-development"
|
||||
metal: "metal-liquid-audio-development"
|
||||
- !!merge <<: *liquid-audio
|
||||
name: "cpu-liquid-audio"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-liquid-audio"
|
||||
@@ -4622,6 +4625,16 @@
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-cpu-liquid-audio"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-cpu-liquid-audio
|
||||
- !!merge <<: *liquid-audio
|
||||
name: "metal-liquid-audio"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-liquid-audio"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-metal-darwin-arm64-liquid-audio
|
||||
- !!merge <<: *liquid-audio
|
||||
name: "metal-liquid-audio-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-liquid-audio"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-metal-darwin-arm64-liquid-audio
|
||||
- !!merge <<: *liquid-audio
|
||||
name: "cuda12-liquid-audio"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-liquid-audio"
|
||||
@@ -5282,6 +5295,7 @@
|
||||
nvidia: "cuda12-trl"
|
||||
nvidia-cuda-12: "cuda12-trl"
|
||||
nvidia-cuda-13: "cuda13-trl"
|
||||
metal: "metal-trl"
|
||||
## TRL backend images
|
||||
- !!merge <<: *trl
|
||||
name: "cpu-trl"
|
||||
@@ -5313,6 +5327,16 @@
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-trl"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-gpu-nvidia-cuda-13-trl
|
||||
- !!merge <<: *trl
|
||||
name: "metal-trl"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-trl"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-metal-darwin-arm64-trl
|
||||
- !!merge <<: *trl
|
||||
name: "metal-trl-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-trl"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-metal-darwin-arm64-trl
|
||||
## llama.cpp quantization backend
|
||||
- &llama-cpp-quantization
|
||||
name: "llama-cpp-quantization"
|
||||
@@ -5484,6 +5508,7 @@
|
||||
name: "supertonic-development"
|
||||
capabilities:
|
||||
default: "cpu-supertonic-development"
|
||||
metal: "metal-supertonic-development"
|
||||
- !!merge <<: *supertonic
|
||||
name: "cpu-supertonic"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-supertonic"
|
||||
@@ -5494,3 +5519,13 @@
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-cpu-supertonic"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-cpu-supertonic
|
||||
- !!merge <<: *supertonic
|
||||
name: "metal-supertonic"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-supertonic"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-metal-darwin-arm64-supertonic
|
||||
- !!merge <<: *supertonic
|
||||
name: "metal-supertonic-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-supertonic"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-metal-darwin-arm64-supertonic
|
||||
|
||||
@@ -14,5 +14,11 @@ else
|
||||
fi
|
||||
|
||||
# liquid-audio's torch wheels are large; allow upgrades to satisfy transitive pins
|
||||
EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
|
||||
EXTRA_PIP_INSTALL_FLAGS+=" --upgrade"
|
||||
# --index-strategy is a uv-only flag. The darwin/MPS build installs with pip
|
||||
# (USE_PIP=true in scripts/build/python-darwin.sh), which rejects it. Only add
|
||||
# it on the uv path; Linux/CUDA resolution is unchanged.
|
||||
if [ "x${USE_PIP:-}" != "xtrue" ]; then
|
||||
EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-first-match"
|
||||
fi
|
||||
installRequirements
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
# MPS (Apple Silicon / Metal) build profile - installed by the darwin CI job.
|
||||
torch>=2.8.0
|
||||
torchaudio>=2.8.0
|
||||
torchcodec>=0.9.1
|
||||
|
||||
@@ -8,7 +8,13 @@ else
|
||||
source $backend_dir/../common/libbackend.sh
|
||||
fi
|
||||
|
||||
EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
|
||||
EXTRA_PIP_INSTALL_FLAGS+=" --upgrade"
|
||||
# --index-strategy is a uv-only flag. The darwin/MPS build installs with pip
|
||||
# (USE_PIP=true in scripts/build/python-darwin.sh), which rejects it. Only add
|
||||
# it when uv is the installer, keeping the Linux/CUDA resolution unchanged.
|
||||
if [ "x${USE_PIP:-}" != "xtrue" ]; then
|
||||
EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-first-match"
|
||||
fi
|
||||
installRequirements
|
||||
|
||||
# Fetch convert_hf_to_gguf.py and gguf package from the same llama.cpp version
|
||||
|
||||
12
backend/python/trl/requirements-mps.txt
Normal file
12
backend/python/trl/requirements-mps.txt
Normal file
@@ -0,0 +1,12 @@
|
||||
torch==2.10.0
|
||||
trl
|
||||
peft
|
||||
datasets>=3.0.0
|
||||
transformers>=4.56.2
|
||||
accelerate>=1.4.0
|
||||
huggingface-hub>=1.3.0
|
||||
sentencepiece
|
||||
# Note: bitsandbytes is intentionally omitted on MPS. It is only used by the
|
||||
# CUDA (cublas) variants for 8-bit/4-bit quantization and has poor support on
|
||||
# Apple Silicon. torch here uses the plain PyPI wheels, which ship MPS support
|
||||
# on macOS arm64.
|
||||
@@ -54,8 +54,35 @@ func (g GPU) IsNVIDIABlackwell() bool {
|
||||
return maj >= 12
|
||||
}
|
||||
|
||||
// Compute-buffer headroom guard for the raised physical batch.
|
||||
//
|
||||
// Raising n_ubatch grows the CUDA *compute buffer* (the scratch for the forward
|
||||
// graph), which is allocated PER DEVICE — it does not benefit from a second GPU
|
||||
// the way weights or KV (which are split across devices) do. The buffer scales
|
||||
// ~linearly with n_ubatch * n_ctx, so a large context turns the GB10-tuned
|
||||
// ub2048 into multi-GiB of extra scratch that must fit on a SINGLE card. On a
|
||||
// 16 GiB consumer Blackwell with a 200k context that overflows (issue #10485),
|
||||
// even though the GB10 it was measured on (128 GiB unified memory) had room.
|
||||
//
|
||||
// These constants size a conservative guard: only raise the batch when the
|
||||
// extra scratch fits the per-device VRAM ceiling.
|
||||
const (
|
||||
// computeBufferBytesPerCell approximates the CUDA compute-buffer cost of one
|
||||
// (n_ubatch * n_ctx) cell. Derived from an observed allocation (ub2048 *
|
||||
// ctx204800 ~= 4.5 GiB => ~11 B/cell) and rounded up to 16 for margin, since
|
||||
// the real cost also grows with model width (heads / embedding dim) which we
|
||||
// don't know at config time.
|
||||
computeBufferBytesPerCell = 16
|
||||
// blackwellBatchHeadroomDivisor caps the extra compute buffer from raising the
|
||||
// physical batch at VRAM/divisor. /4 keeps the bulk of a device for weights +
|
||||
// KV, which already dominate VRAM use.
|
||||
blackwellBatchHeadroomDivisor = 4
|
||||
)
|
||||
|
||||
// PhysicalBatch returns the canonical physical batch (n_batch/n_ubatch) for the
|
||||
// given hardware, used when the model config leaves batch unset.
|
||||
// given hardware class, ignoring context/VRAM headroom. Use
|
||||
// PhysicalBatchForContext when a model context and per-device VRAM are known
|
||||
// (the load paths) so the raised batch can't overflow a single device.
|
||||
func PhysicalBatch(g GPU) int {
|
||||
if g.IsNVIDIABlackwell() {
|
||||
return BlackwellPhysicalBatch
|
||||
@@ -63,6 +90,32 @@ func PhysicalBatch(g GPU) int {
|
||||
return DefaultPhysicalBatch
|
||||
}
|
||||
|
||||
// PhysicalBatchForContext is PhysicalBatch gated on per-device VRAM headroom for
|
||||
// the given context: it only raises the batch above the conservative default
|
||||
// when the extra compute buffer (which is allocated on a single device and grows
|
||||
// with n_ubatch * n_ctx) fits within blackwellBatchHeadroomDivisor of the GPU's
|
||||
// VRAM. g.VRAM must be the PER-DEVICE ceiling (the smallest device on a
|
||||
// multi-GPU host), not the summed total — the compute buffer can't be split.
|
||||
//
|
||||
// VRAM 0 (unknown) stays conservative rather than risk a per-device OOM; the
|
||||
// GB10 / unified-memory path reports system RAM, so it still clears the guard.
|
||||
func PhysicalBatchForContext(g GPU, ctx int) int {
|
||||
if !g.IsNVIDIABlackwell() {
|
||||
return DefaultPhysicalBatch
|
||||
}
|
||||
if ctx <= 0 {
|
||||
ctx = DefaultContextSize
|
||||
}
|
||||
if g.VRAM == 0 {
|
||||
return DefaultPhysicalBatch
|
||||
}
|
||||
extra := uint64(ctx) * uint64(BlackwellPhysicalBatch-DefaultPhysicalBatch) * computeBufferBytesPerCell
|
||||
if extra <= g.VRAM/blackwellBatchHeadroomDivisor {
|
||||
return BlackwellPhysicalBatch
|
||||
}
|
||||
return DefaultPhysicalBatch
|
||||
}
|
||||
|
||||
// IsManagedPhysicalBatch reports whether n is a value PhysicalBatch assigns.
|
||||
// Callers that re-tune a value chosen by an upstream host (the distributed
|
||||
// router correcting the frontend's guess) use this to avoid clobbering an
|
||||
@@ -122,7 +175,12 @@ func hasParallelOption(opts []string) bool {
|
||||
// deterministic device — detection does a live nvidia-smi call.
|
||||
var localGPU = func() GPU {
|
||||
vendor, _ := xsysinfo.DetectGPUVendor()
|
||||
vram, _ := xsysinfo.TotalAvailableVRAM()
|
||||
// Use the SMALLEST device's VRAM, not the summed total: the parallel-slot
|
||||
// tier and the batch headroom guard both reason about what fits on a single
|
||||
// card, and per-device compute buffers can't be split across GPUs. Summing
|
||||
// two 16 GiB cards into "32 GiB" is what over-provisioned multi-GPU hosts
|
||||
// into OOM (issue #10485).
|
||||
vram, _ := xsysinfo.MinPerGPUVRAM()
|
||||
return GPU{
|
||||
Vendor: vendor,
|
||||
ComputeCapability: xsysinfo.NVIDIAComputeCapability(),
|
||||
@@ -137,10 +195,20 @@ func ApplyHardwareDefaults(cfg *ModelConfig, gpu GPU) {
|
||||
if cfg == nil {
|
||||
return
|
||||
}
|
||||
if cfg.Batch == 0 && gpu.IsNVIDIABlackwell() {
|
||||
cfg.Batch = BlackwellPhysicalBatch
|
||||
xlog.Debug("[hardware_defaults] Blackwell GPU: defaulting physical batch",
|
||||
"batch", cfg.Batch, "compute_cap", gpu.ComputeCapability)
|
||||
// Raise the physical batch on Blackwell only when the resulting compute
|
||||
// buffer fits the per-device VRAM at THIS model's context. Leaving Batch at 0
|
||||
// (rather than writing the default 512) preserves the downstream single-pass
|
||||
// sizing in core/backend.EffectiveBatchSize for embedding/score/rerank.
|
||||
if cfg.Batch == 0 {
|
||||
ctx := DefaultContextSize
|
||||
if cfg.ContextSize != nil {
|
||||
ctx = *cfg.ContextSize
|
||||
}
|
||||
if PhysicalBatchForContext(gpu, ctx) == BlackwellPhysicalBatch {
|
||||
cfg.Batch = BlackwellPhysicalBatch
|
||||
xlog.Debug("[hardware_defaults] Blackwell GPU: defaulting physical batch",
|
||||
"batch", cfg.Batch, "compute_cap", gpu.ComputeCapability, "context", ctx, "vram_gib", gpu.VRAM>>30)
|
||||
}
|
||||
}
|
||||
|
||||
// Enable concurrent serving by default on a capable GPU: without this the
|
||||
|
||||
@@ -9,26 +9,37 @@ import (
|
||||
// GPU. The detection seam (localGPU) is injected so the path is deterministic
|
||||
// without a real GPU.
|
||||
var _ = Describe("SetDefaults hardware defaults (single-instance)", func() {
|
||||
const gib = uint64(1) << 30
|
||||
|
||||
var orig func() GPU
|
||||
BeforeEach(func() { orig = localGPU })
|
||||
AfterEach(func() { localGPU = orig })
|
||||
|
||||
It("sets the physical batch on a local Blackwell GPU", func() {
|
||||
localGPU = func() GPU { return GPU{ComputeCapability: "12.1"} }
|
||||
It("sets the physical batch on a local Blackwell GPU with headroom", func() {
|
||||
localGPU = func() GPU { return GPU{ComputeCapability: "12.1", VRAM: 119 * gib} }
|
||||
cfg := &ModelConfig{}
|
||||
cfg.SetDefaults()
|
||||
Expect(cfg.Batch).To(Equal(BlackwellPhysicalBatch))
|
||||
})
|
||||
|
||||
It("leaves batch unset when a large context would overflow the device", func() {
|
||||
// Regression guard for issue #10485: 16 GiB consumer Blackwell + ~200k ctx.
|
||||
localGPU = func() GPU { return GPU{ComputeCapability: "12.0", VRAM: 16 * gib} }
|
||||
ctx := 204800
|
||||
cfg := &ModelConfig{LLMConfig: LLMConfig{ContextSize: &ctx}}
|
||||
cfg.SetDefaults()
|
||||
Expect(cfg.Batch).To(Equal(0))
|
||||
})
|
||||
|
||||
It("leaves batch unset on a non-Blackwell local GPU", func() {
|
||||
localGPU = func() GPU { return GPU{ComputeCapability: "8.9"} }
|
||||
localGPU = func() GPU { return GPU{ComputeCapability: "8.9", VRAM: 119 * gib} }
|
||||
cfg := &ModelConfig{}
|
||||
cfg.SetDefaults()
|
||||
Expect(cfg.Batch).To(Equal(0))
|
||||
})
|
||||
|
||||
It("never overrides an explicit batch", func() {
|
||||
localGPU = func() GPU { return GPU{ComputeCapability: "12.1"} }
|
||||
localGPU = func() GPU { return GPU{ComputeCapability: "12.1", VRAM: 119 * gib} }
|
||||
cfg := &ModelConfig{}
|
||||
cfg.Batch = 1024
|
||||
cfg.SetDefaults()
|
||||
|
||||
@@ -7,6 +7,8 @@ import (
|
||||
)
|
||||
|
||||
var _ = Describe("Hardware-driven config defaults", func() {
|
||||
const gib = uint64(1) << 30
|
||||
|
||||
DescribeTable("GPU.IsNVIDIABlackwell (sm_12x consumer family)",
|
||||
func(cc string, want bool) {
|
||||
Expect(GPU{ComputeCapability: cc}.IsNVIDIABlackwell()).To(Equal(want))
|
||||
@@ -35,21 +37,54 @@ var _ = Describe("Hardware-driven config defaults", func() {
|
||||
})
|
||||
})
|
||||
|
||||
Describe("PhysicalBatchForContext (per-device VRAM headroom)", func() {
|
||||
It("raises the batch when the compute buffer fits the device", func() {
|
||||
// 16 GiB Blackwell with a small context: the extra scratch is tiny.
|
||||
Expect(PhysicalBatchForContext(GPU{ComputeCapability: "12.0", VRAM: 16 * gib}, 8192)).
|
||||
To(Equal(BlackwellPhysicalBatch))
|
||||
})
|
||||
It("keeps the default batch when a large context would overflow one device", func() {
|
||||
// The issue #10485 case: 16 GiB consumer Blackwell, ~200k context.
|
||||
Expect(PhysicalBatchForContext(GPU{ComputeCapability: "12.0", VRAM: 16 * gib}, 204800)).
|
||||
To(Equal(DefaultPhysicalBatch))
|
||||
})
|
||||
It("still raises the batch on a large unified-memory device (GB10)", func() {
|
||||
// GB10 reports system RAM (~119 GiB) as its single device's VRAM.
|
||||
Expect(PhysicalBatchForContext(GPU{ComputeCapability: "12.1", VRAM: 119 * gib}, 204800)).
|
||||
To(Equal(BlackwellPhysicalBatch))
|
||||
})
|
||||
It("stays conservative when VRAM is unknown", func() {
|
||||
Expect(PhysicalBatchForContext(GPU{ComputeCapability: "12.1"}, 8192)).
|
||||
To(Equal(DefaultPhysicalBatch))
|
||||
})
|
||||
It("never raises the batch on non-Blackwell", func() {
|
||||
Expect(PhysicalBatchForContext(GPU{ComputeCapability: "9.0", VRAM: 80 * gib}, 8192)).
|
||||
To(Equal(DefaultPhysicalBatch))
|
||||
})
|
||||
})
|
||||
|
||||
Describe("ApplyHardwareDefaults", func() {
|
||||
It("raises an unset batch to 2048 on Blackwell", func() {
|
||||
It("raises an unset batch to 2048 on Blackwell with headroom", func() {
|
||||
cfg := &ModelConfig{}
|
||||
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1"})
|
||||
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1", VRAM: 119 * gib})
|
||||
Expect(cfg.Batch).To(Equal(BlackwellPhysicalBatch))
|
||||
})
|
||||
It("leaves batch unset when a large context would overflow one device", func() {
|
||||
// Regression guard for issue #10485: 16 GiB card + ~200k context.
|
||||
ctx := 204800
|
||||
cfg := &ModelConfig{LLMConfig: LLMConfig{ContextSize: &ctx}}
|
||||
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.0", VRAM: 16 * gib})
|
||||
Expect(cfg.Batch).To(Equal(0))
|
||||
})
|
||||
It("leaves batch unset on non-Blackwell", func() {
|
||||
cfg := &ModelConfig{}
|
||||
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "9.0"})
|
||||
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "9.0", VRAM: 119 * gib})
|
||||
Expect(cfg.Batch).To(Equal(0))
|
||||
})
|
||||
It("never overrides an explicit batch", func() {
|
||||
cfg := &ModelConfig{}
|
||||
cfg.Batch = 1024
|
||||
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1"})
|
||||
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1", VRAM: 119 * gib})
|
||||
Expect(cfg.Batch).To(Equal(1024))
|
||||
})
|
||||
It("no-ops on nil", func() {
|
||||
@@ -57,8 +92,6 @@ var _ = Describe("Hardware-driven config defaults", func() {
|
||||
})
|
||||
})
|
||||
|
||||
const gib = uint64(1) << 30
|
||||
|
||||
DescribeTable("DefaultParallelSlots (by VRAM)",
|
||||
func(vramGiB uint64, want int) {
|
||||
Expect(DefaultParallelSlots(GPU{VRAM: vramGiB * gib})).To(Equal(want))
|
||||
|
||||
@@ -1204,11 +1204,6 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
|
||||
// This ensures gallery-installed and runtime-loaded models get optimal parameters.
|
||||
ApplyInferenceDefaults(cfg, cfg.Name, cfg.Model)
|
||||
|
||||
// Apply hardware-driven defaults (e.g. a larger physical batch on Blackwell).
|
||||
// Uses the local GPU here; in distributed mode the router re-applies the same
|
||||
// heuristics for the selected node's GPU before loading. Explicit config wins.
|
||||
ApplyHardwareDefaults(cfg, localGPU())
|
||||
|
||||
// Apply serving-policy defaults (device-independent): cross-request prefix
|
||||
// caching. Propagates to distributed nodes via the model options.
|
||||
ApplyServingDefaults(cfg)
|
||||
@@ -1247,6 +1242,16 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
|
||||
cfg.ContextSize = &ctx
|
||||
}
|
||||
runBackendHooks(cfg, lo.modelPath)
|
||||
|
||||
// Apply hardware-driven defaults (e.g. a larger physical batch on Blackwell)
|
||||
// LAST, after the context size is fully resolved (explicit config, LoadOptions,
|
||||
// then the GGUF guess inside runBackendHooks): the Blackwell batch guard sizes
|
||||
// the per-device compute buffer against this model's context, so it must see
|
||||
// the final value, not a pre-guess nil. Uses the local GPU here; in distributed
|
||||
// mode the router re-applies the same heuristics for the selected node's GPU
|
||||
// before loading. Explicit config always wins.
|
||||
ApplyHardwareDefaults(cfg, localGPU())
|
||||
|
||||
cfg.syncKnownUsecasesFromString()
|
||||
}
|
||||
|
||||
|
||||
@@ -432,7 +432,7 @@ func loadSoundDetectionConfig(pipeline *config.Pipeline, cl *config.ModelConfigL
|
||||
if pipeline.SoundDetection == "" {
|
||||
return nil, nil
|
||||
}
|
||||
cfg, err := cl.LoadModelConfigFileByName(pipeline.SoundDetection, ml.ModelPath)
|
||||
cfg, err := loadPipelineSubModel(cl, pipeline.SoundDetection, ml.ModelPath)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to load sound detection config: %w", err)
|
||||
}
|
||||
@@ -443,7 +443,7 @@ func loadSoundDetectionConfig(pipeline *config.Pipeline, cl *config.ModelConfigL
|
||||
}
|
||||
|
||||
func newTranscriptionOnlyModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) (Model, *config.ModelConfig, error) {
|
||||
cfgVAD, err := cl.LoadModelConfigFileByName(pipeline.VAD, ml.ModelPath)
|
||||
cfgVAD, err := loadPipelineSubModel(cl, pipeline.VAD, ml.ModelPath)
|
||||
if err != nil {
|
||||
|
||||
return nil, nil, fmt.Errorf("failed to load backend config: %w", err)
|
||||
@@ -453,7 +453,7 @@ func newTranscriptionOnlyModel(pipeline *config.Pipeline, cl *config.ModelConfig
|
||||
return nil, nil, fmt.Errorf("failed to validate config: %w", err)
|
||||
}
|
||||
|
||||
cfgSST, err := cl.LoadModelConfigFileByName(pipeline.Transcription, ml.ModelPath)
|
||||
cfgSST, err := loadPipelineSubModel(cl, pipeline.Transcription, ml.ModelPath)
|
||||
if err != nil {
|
||||
|
||||
return nil, nil, fmt.Errorf("failed to load backend config: %w", err)
|
||||
@@ -542,11 +542,30 @@ func buildRealtimeRoutingContext(a *application.Application, sessionID string) *
|
||||
}
|
||||
}
|
||||
|
||||
// loadPipelineSubModel loads a pipeline sub-model config by name and follows a
|
||||
// single alias hop, so a pipeline that references an alias (e.g. `llm: default`)
|
||||
// gets the alias target's full config (Backend, Model, ...) rather than the
|
||||
// alias stub with an empty Backend. Without this the alias survives unresolved
|
||||
// into model loading and fails downstream — notably in distributed mode with
|
||||
// "backend name is empty". Mirrors the top-level alias resolution in
|
||||
// core/http/middleware/request.go.
|
||||
func loadPipelineSubModel(cl *config.ModelConfigLoader, name, modelPath string) (*config.ModelConfig, error) {
|
||||
cfg, err := cl.LoadModelConfigFileByName(name, modelPath)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
resolved, _, err := cl.ResolveAlias(cfg)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return resolved, nil
|
||||
}
|
||||
|
||||
// returns and loads either a wrapped model or a model that support audio-to-audio
|
||||
func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig, evaluator *templates.Evaluator, routing *RealtimeRoutingContext) (Model, error) {
|
||||
xlog.Debug("Creating new model pipeline model", "pipeline", pipeline)
|
||||
|
||||
cfgVAD, err := cl.LoadModelConfigFileByName(pipeline.VAD, ml.ModelPath)
|
||||
cfgVAD, err := loadPipelineSubModel(cl, pipeline.VAD, ml.ModelPath)
|
||||
if err != nil {
|
||||
|
||||
return nil, fmt.Errorf("failed to load backend config: %w", err)
|
||||
@@ -557,7 +576,7 @@ func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model
|
||||
}
|
||||
|
||||
// TODO: Do we always need a transcription model? It can be disabled. Note that any-to-any instruction following models don't transcribe as such, so if transcription is required it is a separate process
|
||||
cfgSST, err := cl.LoadModelConfigFileByName(pipeline.Transcription, ml.ModelPath)
|
||||
cfgSST, err := loadPipelineSubModel(cl, pipeline.Transcription, ml.ModelPath)
|
||||
if err != nil {
|
||||
|
||||
return nil, fmt.Errorf("failed to load backend config: %w", err)
|
||||
@@ -589,7 +608,7 @@ func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model
|
||||
xlog.Debug("Loading a wrapped model")
|
||||
|
||||
// Otherwise we want to return a wrapped model, which is a "virtual" model that re-uses other models to perform operations
|
||||
cfgLLM, err := cl.LoadModelConfigFileByName(pipeline.LLM, ml.ModelPath)
|
||||
cfgLLM, err := loadPipelineSubModel(cl, pipeline.LLM, ml.ModelPath)
|
||||
if err != nil {
|
||||
|
||||
return nil, fmt.Errorf("failed to load backend config: %w", err)
|
||||
@@ -604,7 +623,7 @@ func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model
|
||||
applyPipelineReasoning(cfgLLM, *pipeline)
|
||||
applyPipelineThinking(cfgLLM, *pipeline)
|
||||
|
||||
cfgTTS, err := cl.LoadModelConfigFileByName(pipeline.TTS, ml.ModelPath)
|
||||
cfgTTS, err := loadPipelineSubModel(cl, pipeline.TTS, ml.ModelPath)
|
||||
if err != nil {
|
||||
|
||||
return nil, fmt.Errorf("failed to load backend config: %w", err)
|
||||
|
||||
52
core/http/endpoints/openai/realtime_model_alias_test.go
Normal file
52
core/http/endpoints/openai/realtime_model_alias_test.go
Normal file
@@ -0,0 +1,52 @@
|
||||
package openai
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
)
|
||||
|
||||
// loadPipelineSubModel must resolve a pipeline sub-model that references an
|
||||
// alias (e.g. `llm: default`) one hop to the alias target's full config — so
|
||||
// the effective backend is the target's backend, not the empty backend of the
|
||||
// alias stub. This mirrors the top-level alias resolution done in
|
||||
// core/http/middleware/request.go, which the realtime pipeline previously
|
||||
// skipped (failing in distributed mode with "backend name is empty").
|
||||
var _ = Describe("loadPipelineSubModel", func() {
|
||||
It("resolves a sub-model alias one hop to the target's config", func() {
|
||||
tmpDir := GinkgoT().TempDir()
|
||||
|
||||
// A real model config with a concrete backend.
|
||||
realLLM := `name: real-llm
|
||||
backend: llama-cpp
|
||||
parameters:
|
||||
model: real-llm.gguf
|
||||
`
|
||||
Expect(os.WriteFile(filepath.Join(tmpDir, "real-llm.yaml"), []byte(realLLM), 0644)).To(Succeed())
|
||||
|
||||
// An alias pointing at the real model.
|
||||
aliasCfg := `name: default
|
||||
alias: real-llm
|
||||
`
|
||||
Expect(os.WriteFile(filepath.Join(tmpDir, "default.yaml"), []byte(aliasCfg), 0644)).To(Succeed())
|
||||
|
||||
cl := config.NewModelConfigLoader(tmpDir)
|
||||
Expect(cl.LoadModelConfigsFromPath(tmpDir)).To(Succeed())
|
||||
|
||||
// Resolving the alias must follow the hop to the target's full config.
|
||||
resolved, err := loadPipelineSubModel(cl, "default", tmpDir)
|
||||
Expect(err).NotTo(HaveOccurred())
|
||||
Expect(resolved.IsAlias()).To(BeFalse())
|
||||
Expect(resolved.Backend).To(Equal("llama-cpp"))
|
||||
|
||||
// A non-alias name must load unchanged.
|
||||
direct, err := loadPipelineSubModel(cl, "real-llm", tmpDir)
|
||||
Expect(err).NotTo(HaveOccurred())
|
||||
Expect(direct.Backend).To(Equal("llama-cpp"))
|
||||
Expect(direct.Name).To(Equal("real-llm"))
|
||||
})
|
||||
})
|
||||
@@ -86,6 +86,7 @@
|
||||
"input": {
|
||||
"placeholder": "Message...",
|
||||
"attachFile": "Attach file",
|
||||
"send": "Send message",
|
||||
"stopGenerating": "Stop generating",
|
||||
"canvasTitle": "Canvas — extract code blocks and media into a side panel for preview, copy, and download",
|
||||
"canvasLabel": "Canvas",
|
||||
|
||||
@@ -77,6 +77,21 @@
|
||||
"noModelsTitle": "No Models Available",
|
||||
"noModelsBody": "There are no models installed yet. Ask your administrator to set up models so you can start chatting."
|
||||
},
|
||||
"starters": {
|
||||
"title": "Recommended for your hardware",
|
||||
"tier": {
|
||||
"cpu": "CPU-only",
|
||||
"gpu-small": "GPU",
|
||||
"gpu-mid": "GPU",
|
||||
"gpu-large": "GPU"
|
||||
},
|
||||
"cpuNote": "No GPU detected — these small models stay responsive on CPU.",
|
||||
"gpuNote": "Picked to fit your available VRAM with room for context.",
|
||||
"install": "Install",
|
||||
"installing": "Installing",
|
||||
"installStarted": "Installing {{model}}…",
|
||||
"installFailed": "Install failed: {{message}}"
|
||||
},
|
||||
"connect": {
|
||||
"title": "One endpoint, every API",
|
||||
"subtitle": "LocalAI serves its own full API — image & video generation, depth, object detection, reranking, audio, face & voice recognition, and realtime voice over WebRTC and WebSocket. On top of that, a drop-in compatibility layer lets any app built for OpenAI, Anthropic, Ollama or OpenAI Responses talk to it unchanged.",
|
||||
|
||||
@@ -2,6 +2,16 @@
|
||||
"title": "Install Models",
|
||||
"subtitle": "Browse and install AI models from the gallery",
|
||||
"models": "Models",
|
||||
"recommended": {
|
||||
"title": "Recommended for your hardware",
|
||||
"cpuNote": "No GPU detected - small models that stay responsive on CPU.",
|
||||
"gpuNote": "Sized to fit your available VRAM with room for context.",
|
||||
"install": "Install",
|
||||
"installing": "Installing",
|
||||
"installStarted": "Installing {{model}}…",
|
||||
"installFailed": "Install failed: {{message}}",
|
||||
"dismiss": "Dismiss recommendations"
|
||||
},
|
||||
"stats": {
|
||||
"available": "Available",
|
||||
"installed": "Installed"
|
||||
|
||||
@@ -45,7 +45,7 @@
|
||||
},
|
||||
"scheduling": {
|
||||
"title": "Penjadwalan",
|
||||
"subtitle": "Aturan penempatan model dan replika di seluruh klaster"
|
||||
"subtitle": "Aturan penempatan model dan replika di seluruh kluster"
|
||||
},
|
||||
"p2p": {
|
||||
"title": "Komputasi AI Terdistribusi",
|
||||
@@ -86,4 +86,4 @@
|
||||
"title": "Penjelajah",
|
||||
"subtitle": "Jelajahi file dan konfigurasi"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -72,7 +72,7 @@
|
||||
"actions": {
|
||||
"copy": "Salin",
|
||||
"regenerate": "Hasilkan ulang",
|
||||
"jumpToLatest": "Jump to latest"
|
||||
"jumpToLatest": "Lompat ke terbaru"
|
||||
},
|
||||
"streaming": {
|
||||
"transferring": "Mentransfer model...",
|
||||
@@ -115,4 +115,4 @@
|
||||
"clearAll": "Hapus semua",
|
||||
"deleteAllTitle": "Hapus semua percakapan"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
{
|
||||
"unsaved": {
|
||||
"title": "Discard unsaved changes?",
|
||||
"message": "You have unsaved changes that will be lost if you leave this page.",
|
||||
"leave": "Leave"
|
||||
"title": "Buang perubahan yang belum disimpan?",
|
||||
"message": "Anda memiliki perubahan yang belum disimpan. Perubahan tersebut akan hilang jika Anda meninggalkan halaman ini.",
|
||||
"leave": "Tinggalkan Halaman"
|
||||
},
|
||||
"actions": {
|
||||
"save": "Simpan",
|
||||
|
||||
@@ -7,15 +7,15 @@
|
||||
"resourceGpu": "GPU",
|
||||
"resourceRam": "RAM",
|
||||
"greeting": {
|
||||
"morning": "Good morning",
|
||||
"afternoon": "Good afternoon",
|
||||
"evening": "Good evening",
|
||||
"night": "Working late"
|
||||
"morning": "Selamat pagi",
|
||||
"afternoon": "Selamat siang",
|
||||
"evening": "Selamat malam",
|
||||
"night": "Selamat lembur"
|
||||
},
|
||||
"statusLine": {
|
||||
"modelsLoaded_one": "{{count}} model loaded",
|
||||
"modelsLoaded_other": "{{count}} models loaded",
|
||||
"noModelsLoaded": "No models loaded",
|
||||
"modelsLoaded_one": "{{count}} model dimuat",
|
||||
"modelsLoaded_other": "{{count}} model dimuat",
|
||||
"noModelsLoaded": "Tidak ada model yang dimuat",
|
||||
"nodes_one": "{{count}} node",
|
||||
"nodes_other": "{{count}} nodes"
|
||||
},
|
||||
@@ -79,14 +79,14 @@
|
||||
},
|
||||
"connect": {
|
||||
"title": "Satu endpoint, semua API",
|
||||
"subtitle": "LocalAI menyediakan API miliknya sendiri yang lengkap — pembuatan gambar & video, depth, deteksi objek, reranking, audio, pengenalan wajah & suara, serta suara realtime melalui WebRTC dan WebSocket. Di atas itu, lapisan kompatibilitas drop-in membuat aplikasi apa pun yang dibuat untuk OpenAI, Anthropic, Ollama, atau OpenAI Responses bekerja tanpa perubahan.",
|
||||
"subtitle": "LocalAI menyediakan API miliknya sendiri yang lengkap — pembuatan gambar & video, depth, deteksi objek, reranking, audio, pengenalan wajah & suara, serta suara realtime melalui WebRTC dan WebSocket. Selain itu, lapisan kompatibilitas drop-in membuat aplikasi apa pun yang dibuat untuk OpenAI, Anthropic, Ollama, atau OpenAI Responses bekerja tanpa perubahan.",
|
||||
"nativeTitle": "API native",
|
||||
"compatTitle": "Kompatibilitas drop-in",
|
||||
"apiReference": "Referensi API lengkap",
|
||||
"copy": "Salin",
|
||||
"copied": "Disalin",
|
||||
"browse": "Browse the API",
|
||||
"hide": "Hide endpoints",
|
||||
"dismiss": "Dismiss"
|
||||
"browse": "Jelajahi API",
|
||||
"hide": "Sembunyikan endpoint",
|
||||
"dismiss": "Abaikan"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
"video": "Video",
|
||||
"tts": "TTS",
|
||||
"sound": "Suara",
|
||||
"transform": "Transform"
|
||||
"transform": "Transformasi"
|
||||
}
|
||||
},
|
||||
"image": {
|
||||
@@ -30,7 +30,7 @@
|
||||
"refImagesAdded_other": "{{count}} gambar ditambahkan"
|
||||
},
|
||||
"actions": {
|
||||
"view": "View",
|
||||
"view": "Lihat",
|
||||
"generate": "Hasilkan",
|
||||
"generating": "Menghasilkan..."
|
||||
},
|
||||
@@ -153,4 +153,4 @@
|
||||
"clearConfirm": "Hapus",
|
||||
"cleared": "Riwayat dihapus"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19,11 +19,11 @@
|
||||
"operate": "Operasikan"
|
||||
},
|
||||
"operate": {
|
||||
"inference": "Inference",
|
||||
"cluster": "Cluster",
|
||||
"observability": "Observability",
|
||||
"access": "Access",
|
||||
"system": "System"
|
||||
"inference": "Inferensi",
|
||||
"cluster": "Kluster",
|
||||
"observability": "Observabilitas",
|
||||
"access": "Akses",
|
||||
"system": "Sistem"
|
||||
},
|
||||
"items": {
|
||||
"home": "Beranda",
|
||||
@@ -64,7 +64,7 @@
|
||||
"copyright": "© 2023-{{year}} {{author}}"
|
||||
},
|
||||
"console": {
|
||||
"automation": "Otomasi",
|
||||
"automation": "Automasi",
|
||||
"training": "Pelatihan"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6363,6 +6363,130 @@ select.input {
|
||||
justify-content: center;
|
||||
}
|
||||
|
||||
/* ──────────────────── Home: hardware-aware starter models ──────────────────── */
|
||||
|
||||
.home-starters {
|
||||
margin: var(--spacing-lg) 0;
|
||||
padding: var(--spacing-lg);
|
||||
}
|
||||
.home-starters-head {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: space-between;
|
||||
gap: var(--spacing-md);
|
||||
}
|
||||
.home-starters-head strong {
|
||||
font-size: 0.9375rem;
|
||||
}
|
||||
.home-starters-tier {
|
||||
display: inline-flex;
|
||||
align-items: center;
|
||||
gap: var(--spacing-xs);
|
||||
font-size: 0.75rem;
|
||||
color: var(--color-text-muted);
|
||||
}
|
||||
.home-starters-sub {
|
||||
margin: var(--spacing-xs) 0 var(--spacing-md);
|
||||
font-size: 0.8125rem;
|
||||
color: var(--color-text-secondary);
|
||||
}
|
||||
.home-starters-list {
|
||||
list-style: none;
|
||||
margin: 0;
|
||||
padding: 0;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: var(--spacing-xs);
|
||||
}
|
||||
.home-starters-item {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: var(--spacing-md);
|
||||
padding: var(--spacing-xs) 0;
|
||||
}
|
||||
.home-starters-name {
|
||||
font-weight: 500;
|
||||
font-size: 0.875rem;
|
||||
word-break: break-all;
|
||||
}
|
||||
.home-starters-badge {
|
||||
font-size: 0.625rem;
|
||||
}
|
||||
.home-starters-size {
|
||||
margin-left: auto;
|
||||
font-size: 0.75rem;
|
||||
color: var(--color-text-muted);
|
||||
white-space: nowrap;
|
||||
}
|
||||
|
||||
/* ──────────────────── Models gallery: recommended-for-your-hardware strip ──────────────────── */
|
||||
|
||||
.rec-models {
|
||||
margin-bottom: var(--spacing-md);
|
||||
padding: var(--spacing-md) var(--spacing-lg);
|
||||
}
|
||||
.rec-models-head {
|
||||
display: flex;
|
||||
align-items: flex-start;
|
||||
justify-content: space-between;
|
||||
gap: var(--spacing-md);
|
||||
}
|
||||
.rec-models-title {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: var(--spacing-sm);
|
||||
flex-wrap: wrap;
|
||||
}
|
||||
.rec-models-title i {
|
||||
color: var(--color-primary);
|
||||
}
|
||||
.rec-models-note {
|
||||
font-size: 0.8125rem;
|
||||
color: var(--color-text-secondary);
|
||||
}
|
||||
.rec-models-dismiss {
|
||||
background: none;
|
||||
border: none;
|
||||
color: var(--color-text-muted);
|
||||
cursor: pointer;
|
||||
padding: 4px;
|
||||
flex-shrink: 0;
|
||||
}
|
||||
.rec-models-dismiss:hover {
|
||||
color: var(--color-text-primary);
|
||||
}
|
||||
.rec-models-grid {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(auto-fill, minmax(220px, 1fr));
|
||||
gap: var(--spacing-sm);
|
||||
margin-top: var(--spacing-md);
|
||||
}
|
||||
.rec-models-item {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: var(--spacing-xs);
|
||||
padding: var(--spacing-sm) var(--spacing-md);
|
||||
border: 1px solid var(--color-border-subtle);
|
||||
border-radius: var(--radius-md);
|
||||
background: var(--color-bg-primary);
|
||||
}
|
||||
.rec-models-item-name {
|
||||
font-weight: 500;
|
||||
font-size: 0.8125rem;
|
||||
word-break: break-all;
|
||||
}
|
||||
.rec-models-item-meta {
|
||||
display: flex;
|
||||
gap: var(--spacing-sm);
|
||||
font-size: 0.75rem;
|
||||
color: var(--color-text-muted);
|
||||
}
|
||||
.rec-models-item-fit {
|
||||
display: inline-flex;
|
||||
align-items: center;
|
||||
gap: 4px;
|
||||
}
|
||||
|
||||
/* ──────────────────── Home: drop-in endpoint / API compatibility ──────────────────── */
|
||||
|
||||
.home-connect {
|
||||
|
||||
@@ -1,8 +1,25 @@
|
||||
import { useEffect, useMemo } from 'react'
|
||||
import { useEffect, useMemo, useCallback } from 'react'
|
||||
import { useModels } from '../hooks/useModels'
|
||||
import SearchableSelect from './SearchableSelect'
|
||||
import { useTranslation } from 'react-i18next'
|
||||
|
||||
// Remember the last model the user picked, keyed by capability, so returning to
|
||||
// a page (Home chat box, Image, TTS, Talk...) defaults to that model instead of
|
||||
// whatever happens to sort first. Only persisted when a capability key exists —
|
||||
// `externalOptions` callers pass no capability and get the old first-item
|
||||
// behaviour. localStorage access is wrapped because private-browsing modes throw.
|
||||
const LAST_MODEL_PREFIX = 'localai_last_model:'
|
||||
|
||||
// Returns the model name remembered for `capability`, or null when no
// capability key was given or storage cannot be read.
function readLastModel(capability) {
  if (capability) {
    try {
      return localStorage.getItem(LAST_MODEL_PREFIX + capability)
    } catch {
      // Storage access threw; fall through to the "nothing remembered" result.
    }
  }
  return null
}
|
||||
|
||||
// Remembers `model` as the last pick for `capability`. A missing capability or
// model is a no-op, and storage failures are deliberately swallowed.
function writeLastModel(capability, model) {
  const canPersist = Boolean(capability) && Boolean(model)
  if (canPersist) {
    try {
      localStorage.setItem(LAST_MODEL_PREFIX + capability, model)
    } catch {
      /* ignore */
    }
  }
}
|
||||
|
||||
export default function ModelSelector({
|
||||
value, onChange, capability, className = '',
|
||||
options: externalOptions, loading: externalLoading,
|
||||
@@ -19,16 +36,27 @@ export default function ModelSelector({
|
||||
const isLoading = externalOptions ? (externalLoading || false) : hookLoading
|
||||
const isDisabled = isLoading || (externalDisabled || false)
|
||||
|
||||
// Persist genuine selections so the next visit can restore them.
|
||||
const handleChange = useCallback((next) => {
|
||||
writeLastModel(capability, next)
|
||||
onChange(next)
|
||||
}, [capability, onChange])
|
||||
|
||||
useEffect(() => {
|
||||
if (modelNames.length > 0 && (!value || !modelNames.includes(value))) {
|
||||
onChange(modelNames[0])
|
||||
// Prefer the remembered model when it's still available; otherwise fall
|
||||
// back to the first option. Don't re-persist here — auto-select is not a
|
||||
// user choice, and writing back the stored value would be a harmless but
|
||||
// pointless round-trip.
|
||||
const remembered = readLastModel(capability)
|
||||
onChange(remembered && modelNames.includes(remembered) ? remembered : modelNames[0])
|
||||
}
|
||||
}, [modelNames, value, onChange])
|
||||
}, [modelNames, value, onChange, capability])
|
||||
|
||||
return (
|
||||
<SearchableSelect
|
||||
value={value || ''}
|
||||
onChange={onChange}
|
||||
onChange={handleChange}
|
||||
options={modelNames}
|
||||
placeholder={isLoading ? t('selector.loading') : (modelNames.length === 0 ? t('selector.noModels') : t('selector.selectModel'))}
|
||||
searchPlaceholder={searchPlaceholder || t('selector.searchPlaceholder')}
|
||||
|
||||
86
core/http/react-ui/src/components/RecommendedModels.jsx
Normal file
86
core/http/react-ui/src/components/RecommendedModels.jsx
Normal file
@@ -0,0 +1,86 @@
|
||||
import { useState } from 'react'
|
||||
import { useTranslation } from 'react-i18next'
|
||||
import { modelsApi } from '../utils/api'
|
||||
import { useRecommendedModels, isNvfp4Name } from '../hooks/useRecommendedModels'
|
||||
|
||||
const DISMISS_KEY = 'localai_rec_models_dismissed'
|
||||
|
||||
// "Recommended for your hardware" strip at the top of the Models gallery. Shares
|
||||
// the hardware-fit ranking with the empty-state starter widget via
|
||||
// useRecommendedModels, but styled for the gallery page and dismissible (the
|
||||
// gallery is a repeat-visit surface, so it shouldn't nag).
|
||||
export default function RecommendedModels({ addToast }) {
|
||||
const { t } = useTranslation('models')
|
||||
const { recommended, tier, loading } = useRecommendedModels({ count: 4 })
|
||||
const [installing, setInstalling] = useState(() => new Set())
|
||||
const [dismissed, setDismissed] = useState(() => {
|
||||
try { return localStorage.getItem(DISMISS_KEY) === '1' } catch { return false }
|
||||
})
|
||||
|
||||
if (loading || dismissed) return null
|
||||
if (!recommended || recommended.length === 0) return null
|
||||
|
||||
const dismiss = () => {
|
||||
try { localStorage.setItem(DISMISS_KEY, '1') } catch { /* ignore */ }
|
||||
setDismissed(true)
|
||||
}
|
||||
|
||||
const install = async (name) => {
|
||||
setInstalling(prev => new Set(prev).add(name))
|
||||
try {
|
||||
await modelsApi.install(name)
|
||||
addToast?.(t('recommended.installStarted', { model: name }), 'success')
|
||||
} catch (err) {
|
||||
addToast?.(t('recommended.installFailed', { message: err.message }), 'error')
|
||||
setInstalling(prev => {
|
||||
const next = new Set(prev)
|
||||
next.delete(name)
|
||||
return next
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
const isGpu = tier.id !== 'cpu'
|
||||
|
||||
return (
|
||||
<div className="rec-models card">
|
||||
<div className="rec-models-head">
|
||||
<div className="rec-models-title">
|
||||
<i className={`fas ${isGpu ? 'fa-microchip' : 'fa-memory'}`} aria-hidden="true" />
|
||||
<strong>{t('recommended.title')}</strong>
|
||||
<span className="rec-models-note">{isGpu ? t('recommended.gpuNote') : t('recommended.cpuNote')}</span>
|
||||
</div>
|
||||
<button type="button" className="rec-models-dismiss" onClick={dismiss} aria-label={t('recommended.dismiss')} title={t('recommended.dismiss')}>
|
||||
<i className="fas fa-times" aria-hidden="true" />
|
||||
</button>
|
||||
</div>
|
||||
<div className="rec-models-grid">
|
||||
{recommended.map(m => {
|
||||
const busy = installing.has(m.name)
|
||||
return (
|
||||
<div key={m.name} className="rec-models-item">
|
||||
<div className="rec-models-item-name">{m.name}</div>
|
||||
<div className="rec-models-item-meta">
|
||||
{isNvfp4Name(m.name) && <span className="badge badge-info">NVFP4</span>}
|
||||
{m.sizeDisplay && <span>{m.sizeDisplay}</span>}
|
||||
{isGpu && m.vramDisplay && (
|
||||
<span className="rec-models-item-fit"><i className="fas fa-microchip" aria-hidden="true" /> {m.vramDisplay}</span>
|
||||
)}
|
||||
</div>
|
||||
<button
|
||||
type="button"
|
||||
className="btn btn-primary btn-sm"
|
||||
disabled={busy}
|
||||
onClick={() => install(m.name)}
|
||||
>
|
||||
{busy
|
||||
? (<><i className="fas fa-spinner fa-spin" aria-hidden="true" /> {t('recommended.installing')}</>)
|
||||
: (<><i className="fas fa-download" aria-hidden="true" /> {t('recommended.install')}</>)}
|
||||
</button>
|
||||
</div>
|
||||
)
|
||||
})}
|
||||
</div>
|
||||
</div>
|
||||
)
|
||||
}
|
||||
129
core/http/react-ui/src/components/StarterModels.jsx
Normal file
129
core/http/react-ui/src/components/StarterModels.jsx
Normal file
@@ -0,0 +1,129 @@
|
||||
import { useState } from 'react'
|
||||
import { useTranslation } from 'react-i18next'
|
||||
import { modelsApi } from '../utils/api'
|
||||
import { useRecommendedModels, isNvfp4Name } from '../hooks/useRecommendedModels'
|
||||
|
||||
// Static fallback used only when the live gallery / estimates can't be reached
|
||||
// (offline, trimmed gallery). The hook is the primary, data-driven path; these
|
||||
// are real gallery names kept as a safety net so onboarding never shows nothing.
|
||||
// Gemma picks use the QAT (quantization-aware-trained) Q4 builds. NVIDIA boxes
|
||||
// get NVFP4 + MTP variants at the mid/large tiers (see NVIDIA below).
|
||||
const BASE = {
|
||||
cpu: [
|
||||
{ name: 'gemma-4-e2b-it-qat-q4_0', size: '~1.5 GB' },
|
||||
{ name: 'qwen3.5-4b-claude-4.6-opus-reasoning-distilled', size: '~2.5 GB' },
|
||||
{ name: 'gemma-4-e4b-it-qat-q4_0', size: '~3 GB' },
|
||||
{ name: 'lfm2.5-1.2b-instruct', size: '~0.8 GB' },
|
||||
],
|
||||
'gpu-small': [
|
||||
{ name: 'gemma-4-e4b-it-qat-q4_0', size: '~3 GB' },
|
||||
{ name: 'lfm2.5-8b-a1b', size: '~5 GB' },
|
||||
{ name: 'qwen3.5-9b', size: '~5.5 GB' },
|
||||
{ name: 'gemma-4-12b-it-qat-q4_0', size: '~7 GB' },
|
||||
],
|
||||
'gpu-mid': [
|
||||
{ name: 'qwen3.6-27b', size: '~16 GB' },
|
||||
{ name: 'qwen3.6-27b-mtp-pi-tune', size: '~16 GB' },
|
||||
{ name: 'gemma-4-26b-a4b-it-qat-q4_0', size: '~16 GB' },
|
||||
{ name: 'qwen3.5-27b', size: '~16 GB' },
|
||||
],
|
||||
'gpu-large': [
|
||||
{ name: 'qwen3.6-35b-a3b-apex', size: '~20 GB' },
|
||||
{ name: 'qwen3.6-35b-a3b-claude-4.6-opus-reasoning-distilled', size: '~20 GB' },
|
||||
{ name: 'gemma-4-31b-it-qat-q4_0', size: '~18 GB' },
|
||||
{ name: 'qwen3.5-35b-a3b-apex', size: '~20 GB' },
|
||||
],
|
||||
}
|
||||
|
||||
// NVIDIA-only overrides: NVFP4 is a Blackwell-optimised 4-bit format paired with
|
||||
// MTP (multi-token prediction) for speed. Only the mid/large tiers have these.
|
||||
const NVIDIA = {
|
||||
'gpu-mid': [
|
||||
{ name: 'qwen3.6-27b-nvfp4-mtp', size: '~14 GB' },
|
||||
{ name: 'qwen3.6-27b-mtp-pi-tune', size: '~16 GB' },
|
||||
{ name: 'gemma-4-26b-a4b-it-qat-q4_0', size: '~16 GB' },
|
||||
{ name: 'qwen3.6-27b', size: '~16 GB' },
|
||||
],
|
||||
'gpu-large': [
|
||||
{ name: 'qwen3.6-35b-a3b-nvfp4-mtp', size: '~18 GB' },
|
||||
{ name: 'qwen3.6-27b-nvfp4-mtp', size: '~14 GB' },
|
||||
{ name: 'qwen3.6-35b-a3b-apex', size: '~20 GB' },
|
||||
{ name: 'gemma-4-31b-it-qat-q4_0', size: '~18 GB' },
|
||||
],
|
||||
}
|
||||
|
||||
// Picks the static starter list for a tier: the NVIDIA override when the box
// is NVIDIA and an override exists for that tier, otherwise the base list for
// the tier, otherwise the CPU list.
function fallbackFor(tierId, isNvidia) {
  const nvidiaPicks = isNvidia ? NVIDIA[tierId] : undefined
  if (nvidiaPicks) return nvidiaPicks

  const basePicks = BASE[tierId]
  return basePicks ? basePicks : BASE.cpu
}
|
||||
|
||||
export default function StarterModels({ addToast, onInstallStarted }) {
|
||||
const { t } = useTranslation('home')
|
||||
const { recommended, tier, isNvidia, loading } = useRecommendedModels({ count: 4 })
|
||||
const [installing, setInstalling] = useState(() => new Set())
|
||||
|
||||
// While the hardware probe + gallery query are in flight, render nothing
|
||||
// rather than flashing fallback content that may be replaced a moment later.
|
||||
if (loading) return null
|
||||
|
||||
// Prefer live recommendations; fall back to the static list only when the
|
||||
// gallery yielded nothing.
|
||||
const items = (recommended && recommended.length > 0)
|
||||
? recommended.map(r => ({ name: r.name, size: r.sizeDisplay }))
|
||||
: fallbackFor(tier.id, isNvidia)
|
||||
|
||||
if (items.length === 0) return null
|
||||
|
||||
const install = async (name) => {
|
||||
setInstalling(prev => new Set(prev).add(name))
|
||||
try {
|
||||
await modelsApi.install(name)
|
||||
addToast?.(t('starters.installStarted', { model: name }), 'success')
|
||||
onInstallStarted?.(name)
|
||||
} catch (err) {
|
||||
addToast?.(t('starters.installFailed', { message: err.message }), 'error')
|
||||
setInstalling(prev => {
|
||||
const next = new Set(prev)
|
||||
next.delete(name)
|
||||
return next
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
return (
|
||||
<section className="home-starters card">
|
||||
<div className="home-starters-head">
|
||||
<strong>{t('starters.title')}</strong>
|
||||
<span className="home-starters-tier">
|
||||
<i className={`fas ${tier.id === 'cpu' ? 'fa-memory' : 'fa-microchip'}`} aria-hidden="true" />
|
||||
{t(`starters.tier.${tier.id}`)}
|
||||
</span>
|
||||
</div>
|
||||
<p className="home-starters-sub">
|
||||
{tier.id === 'cpu' ? t('starters.cpuNote') : t('starters.gpuNote')}
|
||||
</p>
|
||||
<ul className="home-starters-list">
|
||||
{items.map(c => {
|
||||
const busy = installing.has(c.name)
|
||||
return (
|
||||
<li key={c.name} className="home-starters-item">
|
||||
<span className="home-starters-name">{c.name}</span>
|
||||
{isNvfp4Name(c.name) && <span className="badge badge-info home-starters-badge">NVFP4</span>}
|
||||
{c.size && <span className="home-starters-size">{c.size}</span>}
|
||||
<button
|
||||
type="button"
|
||||
className="btn btn-primary btn-sm"
|
||||
disabled={busy}
|
||||
onClick={() => install(c.name)}
|
||||
>
|
||||
{busy
|
||||
? (<><i className="fas fa-spinner fa-spin" aria-hidden="true" /> {t('starters.installing')}</>)
|
||||
: (<><i className="fas fa-download" aria-hidden="true" /> {t('starters.install')}</>)}
|
||||
</button>
|
||||
</li>
|
||||
)
|
||||
})}
|
||||
</ul>
|
||||
</section>
|
||||
)
|
||||
}
|
||||
66
core/http/react-ui/src/hooks/usePolling.js
vendored
Normal file
66
core/http/react-ui/src/hooks/usePolling.js
vendored
Normal file
@@ -0,0 +1,66 @@
|
||||
import { useEffect, useRef, useCallback } from 'react'
|
||||
|
||||
// usePolling runs `fn` immediately and then on a fixed interval, with two
// behaviours every hand-rolled setInterval in this app was missing:
//
//   1. Visibility-aware: the timer pauses while the tab is hidden
//      (document.hidden) and fires an immediate catch-up poll when the tab
//      becomes visible again. A backgrounded dashboard no longer hammers the
//      server every few seconds for data nobody is looking at.
//   2. Non-overlapping: if `fn` returns a promise that is still pending when
//      the next tick fires, that tick is skipped (not queued), so slow
//      requests never stack up.
//
// Parameters:
//   fn          - the poll callback; may be sync or return a promise. The
//                 latest `fn` is always used without restarting the timer.
//   intervalMs  - delay between ticks in milliseconds (default 5000).
//   enabled     - `false` stops polling entirely (one-shot or gated polls).
//   immediate   - `false` suppresses the poll that otherwise runs on start.
//
// Returns `{ refetch }`. `refetch` runs `fn` on demand, is stable across
// renders, resolves to `fn`'s result, and rejects if `fn` fails. While a run
// is already in flight it resolves to `undefined` without calling `fn` again.
export function usePolling(fn, intervalMs = 5000, { enabled = true, immediate = true } = {}) {
  // Keep the newest callback in a ref so the timer effect below does not have
  // to depend on (and restart for) every new `fn` identity.
  const fnRef = useRef(fn)
  fnRef.current = fn

  const runningRef = useRef(false)
  const refetch = useCallback(async () => {
    // Guard against overlap: a slow poll shouldn't pile up behind a fast timer.
    if (runningRef.current) return
    runningRef.current = true
    try {
      return await fnRef.current()
    } finally {
      runningRef.current = false
    }
  }, [])

  useEffect(() => {
    if (!enabled) return
    let timer = null

    // Timer-driven polls have no caller to hand a failure to. Without this
    // catch a rejecting `fn` would surface as an unhandled promise rejection
    // on every tick; log it instead and let the next tick try again.
    const tick = () => {
      refetch().catch((err) => {
        console.error('usePolling: poll failed', err)
      })
    }

    const start = () => {
      if (timer != null) return
      timer = setInterval(tick, intervalMs)
    }
    const stop = () => {
      if (timer != null) { clearInterval(timer); timer = null }
    }

    const onVisibility = () => {
      if (document.hidden) {
        stop()
      } else {
        // Catch up immediately on return, then resume the cadence.
        tick()
        start()
      }
    }

    if (immediate) tick()
    if (!document.hidden) start()
    document.addEventListener('visibilitychange', onVisibility)

    return () => {
      stop()
      document.removeEventListener('visibilitychange', onVisibility)
    }
  }, [enabled, intervalMs, immediate, refetch])

  return { refetch }
}
|
||||
108
core/http/react-ui/src/hooks/useRecommendedModels.js
vendored
Normal file
108
core/http/react-ui/src/hooks/useRecommendedModels.js
vendored
Normal file
@@ -0,0 +1,108 @@
|
||||
import { useState, useEffect } from 'react'
|
||||
import { modelsApi } from '../utils/api'
|
||||
import { useResources } from './useResources'
|
||||
|
||||
// Data-driven "recommended for your hardware" model picks. The gallery exposes
|
||||
// no popularity/download signal and the list response carries no size, so we:
|
||||
// 1. ask the server for chat-capable models in their natural (curated) order,
|
||||
// 2. estimate size/VRAM for the top candidates (same endpoint the Models page
|
||||
// uses), and
|
||||
// 3. rank by hardware fit — smallest on CPU-only boxes, largest-that-fits on
|
||||
// GPUs (bigger == better quality while still fitting VRAM).
|
||||
//
|
||||
// Returns `recommended === null` while loading, `[]` when nothing could be
|
||||
// resolved (gallery/estimates unavailable) so callers can fall back.
|
||||
|
||||
const GB = 1024 * 1024 * 1024
|
||||
const DEFAULT_CTX = 4096
|
||||
|
||||
// True when a model name contains "nvfp4" in any letter case. A missing or
// empty name is treated as the empty string and therefore never matches.
// The ranking code uses this to keep NVFP4 builds off non-NVIDIA hardware.
export const isNvfp4Name = (name) => {
  const text = name || ''
  return /nvfp4/i.test(text)
}
|
||||
|
||||
// Reports whether any entry of `resources.gpus` has a vendor equal to
// "nvidia", compared case-insensitively. Returns false when `resources` is
// missing or `gpus` is not an array.
export function hasNvidiaGpu(resources) {
  const gpus = resources?.gpus
  if (!Array.isArray(gpus)) return false

  const isNvidia = (gpu) => {
    const vendor = gpu?.vendor || ''
    return vendor.toLowerCase() === 'nvidia'
  }
  return gpus.some(isNvidia)
}
|
||||
|
||||
// Maps a resources snapshot to a hardware tier `{ id, vram }`.
//   - 'cpu' (vram 0) when the snapshot is not of type 'gpu' or reports no
//     aggregate memory;
//   - 'gpu-small' below 8 GB, 'gpu-mid' below 24 GB, 'gpu-large' otherwise,
//     with `vram` set to the reported aggregate total memory.
export function recommendTier(resources) {
  const vram = resources?.aggregate?.total_memory || 0
  const cpuOnly = resources?.type !== 'gpu' || vram <= 0
  if (cpuOnly) return { id: 'cpu', vram: 0 }

  // Ordered upper bounds; the first bound the VRAM falls under wins.
  const bounds = [
    { id: 'gpu-small', below: 8 * GB },
    { id: 'gpu-mid', below: 24 * GB },
  ]
  const match = bounds.find(bound => vram < bound.below)
  return { id: match ? match.id : 'gpu-large', vram }
}
|
||||
|
||||
// Orders estimated candidates by hardware fit and returns at most `count`.
//
//   candidates - objects with `name`, `sizeBytes` and (optionally) `vramBytes`;
//                entries without a size estimate are ignored.
//   tier       - `{ id, vram }` as produced by recommendTier.
//   isNvidia   - whether NVFP4 builds may be suggested at all.
//
// Ordering rules:
//   - CPU tier: smallest first.
//   - GPU tier, some models fit in 95% of VRAM: among the fitting models,
//     NVFP4 first on NVIDIA, then largest first.
//   - GPU tier, nothing fits: smallest first, with no NVFP4 boost. The boost
//     only makes sense among models that fit; applying it here would put a
//     large NVFP4 build at the top of the list on a GPU that cannot hold even
//     the smallest candidate.
function rank(candidates, tier, count, isNvidia) {
  // NVFP4 is only offered on NVIDIA hardware; drop it everywhere else.
  const pool = candidates.filter(c => c.sizeBytes != null && (isNvidia || !isNvfp4Name(c.name)))
  const smallestFirst = (a, b) => a.sizeBytes - b.sizeBytes

  if (tier.id === 'cpu') {
    // No GPU: smallest models stay responsive on CPU.
    return [...pool].sort(smallestFirst).slice(0, count)
  }

  // Leave 5% of VRAM as headroom when deciding what fits.
  const limit = tier.vram * 0.95
  const fits = pool.filter(c => c.vramBytes != null && c.vramBytes <= limit)
  if (fits.length === 0) {
    // Tiny GPU where nothing fits: fall back to the smallest models.
    return [...pool].sort(smallestFirst).slice(0, count)
  }

  const byPreference = (a, b) => {
    // On NVIDIA, surface NVFP4 first; then largest-that-fits.
    if (isNvidia) {
      const an = isNvfp4Name(a.name), bn = isNvfp4Name(b.name)
      if (an !== bn) return an ? -1 : 1
    }
    return b.sizeBytes - a.sizeBytes
  }
  return [...fits].sort(byPreference).slice(0, count)
}
|
||||
|
||||
// Hook that resolves hardware-aware model recommendations.
//
// Options:
//   count         - maximum number of recommendations to return (default 4).
//   candidatePool - how many gallery entries to request and estimate (default 10).
//
// Returns `{ recommended, tier, isNvidia, error, loading }`. `recommended` is
// null until a result is available, then an array (empty when the gallery
// request failed, in which case `error` holds the failure message).
export function useRecommendedModels({ count = 4, candidatePool = 10 } = {}) {
  const { resources } = useResources()
  const [recommended, setRecommended] = useState(null)
  const [error, setError] = useState(null)

  // Derived from the latest resources snapshot on every render.
  const resReady = resources !== null
  const tier = recommendTier(resources)
  const isNvidia = hasNvidiaGpu(resources)

  useEffect(() => {
    // Wait for the first resources snapshot before touching the gallery.
    if (!resReady) return
    let cancelled = false
    setRecommended(null)
    setError(null)

    // Estimate one gallery entry; a failed estimate yields a size-less record.
    const estimateOne = async (model) => {
      const name = model.name || model.id
      try {
        const est = await modelsApi.estimate(name, [DEFAULT_CTX])
        const atCtx = est?.estimates?.[String(DEFAULT_CTX)]
        return {
          name,
          description: model.description,
          sizeBytes: est?.sizeBytes ?? null,
          sizeDisplay: est?.sizeDisplay ?? null,
          vramBytes: atCtx?.vramBytes ?? null,
          vramDisplay: atCtx?.vramDisplay ?? null,
        }
      } catch {
        return { name, sizeBytes: null }
      }
    }

    const load = async () => {
      try {
        const data = await modelsApi.list({ tag: 'chat', items: candidatePool, page: 1 })
        // Only models that are not installed yet are worth recommending.
        const notInstalled = (data?.models || []).filter(m => !m.installed)
        const estimated = await Promise.all(notInstalled.map(m => estimateOne(m)))
        if (cancelled) return
        setRecommended(rank(estimated, tier, count, isNvidia))
      } catch (e) {
        if (cancelled) return
        setError(e.message)
        setRecommended([])
      }
    }
    load()

    // Results that arrive after cleanup are discarded.
    return () => { cancelled = true }
    // The dependency list uses tier.id / tier.vram / isNvidia (primitives), so a
    // new resources object with the same values does not re-run this effect.
  }, [resReady, tier.id, tier.vram, isNvidia, count, candidatePool])

  return { recommended, tier, isNvidia, error, loading: recommended === null }
}
|
||||
17
core/http/react-ui/src/hooks/useResources.js
vendored
17
core/http/react-ui/src/hooks/useResources.js
vendored
@@ -1,11 +1,11 @@
|
||||
import { useState, useEffect, useCallback, useRef } from 'react'
|
||||
import { useState, useCallback } from 'react'
|
||||
import { resourcesApi } from '../utils/api'
|
||||
import { usePolling } from './usePolling'
|
||||
|
||||
export function useResources(pollInterval = 5000) {
|
||||
const [resources, setResources] = useState(null)
|
||||
const [loading, setLoading] = useState(true)
|
||||
const [error, setError] = useState(null)
|
||||
const intervalRef = useRef(null)
|
||||
|
||||
const fetchResources = useCallback(async () => {
|
||||
try {
|
||||
@@ -19,13 +19,10 @@ export function useResources(pollInterval = 5000) {
|
||||
}
|
||||
}, [])
|
||||
|
||||
useEffect(() => {
|
||||
fetchResources()
|
||||
intervalRef.current = setInterval(fetchResources, pollInterval)
|
||||
return () => {
|
||||
if (intervalRef.current) clearInterval(intervalRef.current)
|
||||
}
|
||||
}, [fetchResources, pollInterval])
|
||||
// Visibility-aware polling: pauses while the tab is hidden and catches up on
|
||||
// return (see usePolling). Resource stats are pure dashboard data, so there's
|
||||
// no reason to keep fetching them for a backgrounded tab.
|
||||
const { refetch } = usePolling(fetchResources, pollInterval)
|
||||
|
||||
return { resources, loading, error, refetch: fetchResources }
|
||||
return { resources, loading, error, refetch }
|
||||
}
|
||||
|
||||
@@ -765,8 +765,10 @@ export default function AgentChat() {
|
||||
className="chat-send-btn"
|
||||
onClick={handleSend}
|
||||
disabled={processing || !input.trim()}
|
||||
aria-label="Send message"
|
||||
title="Send message"
|
||||
>
|
||||
<i className="fas fa-paper-plane" />
|
||||
<i className="fas fa-paper-plane" aria-hidden="true" />
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
@@ -1427,8 +1427,10 @@ export default function Chat() {
|
||||
className="chat-send-btn"
|
||||
onClick={handleSend}
|
||||
disabled={!input.trim() && files.length === 0}
|
||||
aria-label={t('input.send')}
|
||||
title={t('input.send')}
|
||||
>
|
||||
<i className="fas fa-paper-plane" />
|
||||
<i className="fas fa-paper-plane" aria-hidden="true" />
|
||||
</button>
|
||||
)}
|
||||
</div>
|
||||
|
||||
@@ -10,6 +10,7 @@ import UnifiedMCPDropdown from '../components/UnifiedMCPDropdown'
|
||||
import ConfirmDialog from '../components/ConfirmDialog'
|
||||
import HomeConnect from '../components/HomeConnect'
|
||||
import { useResources } from '../hooks/useResources'
|
||||
import { usePolling } from '../hooks/usePolling'
|
||||
import { fileToBase64, backendControlApi, systemApi, modelsApi, mcpApi, nodesApi } from '../utils/api'
|
||||
import { API_CONFIG } from '../utils/config'
|
||||
import { greetingKey } from '../utils/greeting'
|
||||
@@ -17,6 +18,7 @@ import StatusPill from '../components/StatusPill'
|
||||
import Skeleton from '../components/Skeleton'
|
||||
import SectionHeading from '../components/SectionHeading'
|
||||
import EmptyState from '../components/EmptyState'
|
||||
import StarterModels from '../components/StarterModels'
|
||||
import { staggerStyle } from '../hooks/useStagger'
|
||||
|
||||
export default function Home() {
|
||||
@@ -68,40 +70,36 @@ export default function Home() {
|
||||
.catch(() => {})
|
||||
}, [])
|
||||
|
||||
// Poll cluster node data in distributed mode
|
||||
useEffect(() => {
|
||||
if (!distributedMode) return
|
||||
const fetchCluster = async () => {
|
||||
try {
|
||||
const data = await nodesApi.list()
|
||||
const nodes = Array.isArray(data) ? data : []
|
||||
const backendNodes = nodes.filter(n => !n.node_type || n.node_type === 'backend')
|
||||
const totalVRAM = backendNodes.reduce((sum, n) => sum + (n.total_vram || 0), 0)
|
||||
const usedVRAM = backendNodes.reduce((sum, n) => {
|
||||
if (n.total_vram && n.available_vram != null) return sum + (n.total_vram - n.available_vram)
|
||||
return sum
|
||||
}, 0)
|
||||
const totalRAM = backendNodes.reduce((sum, n) => sum + (n.total_ram || 0), 0)
|
||||
const usedRAM = backendNodes.reduce((sum, n) => {
|
||||
if (n.total_ram && n.available_ram != null) return sum + (n.total_ram - n.available_ram)
|
||||
return sum
|
||||
}, 0)
|
||||
const isGPU = totalVRAM > 0
|
||||
const healthyCount = backendNodes.filter(n => n.status === 'healthy').length
|
||||
const totalCount = backendNodes.length
|
||||
setClusterData({
|
||||
totalMem: isGPU ? totalVRAM : totalRAM,
|
||||
usedMem: isGPU ? usedVRAM : usedRAM,
|
||||
isGPU,
|
||||
healthyCount,
|
||||
totalCount,
|
||||
})
|
||||
} catch { setClusterData(null) }
|
||||
}
|
||||
fetchCluster()
|
||||
const interval = setInterval(fetchCluster, 5000)
|
||||
return () => clearInterval(interval)
|
||||
}, [distributedMode])
|
||||
// Poll cluster node data in distributed mode. Visibility-aware + gated on
// distributedMode so a non-distributed or backgrounded tab makes no calls.
//
// fetchCluster lists the nodes, keeps the backend ones, and publishes an
// aggregate memory/health snapshot through setClusterData. Any failure
// clears the snapshot (setClusterData(null)).
const fetchCluster = useCallback(async () => {
  try {
    const response = await nodesApi.list()
    // Anything that is not an array is treated as "no nodes".
    const allNodes = Array.isArray(response) ? response : []
    // Nodes without a node_type are counted as backend nodes.
    const backends = allNodes.filter(node => !node.node_type || node.node_type === 'backend')

    // Sum one capacity field across the backend nodes; missing values count as 0.
    const capacity = (field) =>
      backends.reduce((acc, node) => acc + (node[field] || 0), 0)
    // Sum (total - available) across the backend nodes, skipping any node
    // whose total is falsy or whose available figure is null/undefined.
    const consumed = (totalField, availableField) =>
      backends.reduce((acc, node) => {
        const total = node[totalField]
        const available = node[availableField]
        return total && available != null ? acc + (total - available) : acc
      }, 0)

    const totalVRAM = capacity('total_vram')
    const usedVRAM = consumed('total_vram', 'available_vram')
    const totalRAM = capacity('total_ram')
    const usedRAM = consumed('total_ram', 'available_ram')

    // Report VRAM figures when any backend node reports VRAM, RAM otherwise.
    const isGPU = totalVRAM > 0
    setClusterData({
      totalMem: isGPU ? totalVRAM : totalRAM,
      usedMem: isGPU ? usedVRAM : usedRAM,
      isGPU,
      healthyCount: backends.filter(node => node.status === 'healthy').length,
      totalCount: backends.length,
    })
  } catch { setClusterData(null) }
}, [])
usePolling(fetchCluster, 5000, { enabled: distributedMode })
|
||||
|
||||
// Fetch configured models (to know if any exist) and loaded models (currently running)
|
||||
const fetchSystemInfo = useCallback(async () => {
|
||||
@@ -123,11 +121,7 @@ export default function Home() {
|
||||
}
|
||||
}, [])
|
||||
|
||||
useEffect(() => {
|
||||
fetchSystemInfo()
|
||||
const interval = setInterval(fetchSystemInfo, 5000)
|
||||
return () => clearInterval(interval)
|
||||
}, [fetchSystemInfo])
|
||||
usePolling(fetchSystemInfo, 5000)
|
||||
|
||||
// Check MCP availability when selected model changes
|
||||
useEffect(() => {
|
||||
@@ -523,6 +517,8 @@ export default function Home() {
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<StarterModels addToast={addToast} onInstallStarted={fetchSystemInfo} />
|
||||
|
||||
<div className="home-wizard-actions">
|
||||
<button className="btn btn-primary" onClick={() => navigate('/app/models')}>
|
||||
<i className="fas fa-store" /> {t('wizard.browseGallery')}
|
||||
|
||||
@@ -13,6 +13,7 @@ import ConfirmDialog from '../components/ConfirmDialog'
|
||||
import GalleryLoader from '../components/GalleryLoader'
|
||||
import Toggle from '../components/Toggle'
|
||||
import ResponsiveTable from '../components/ResponsiveTable'
|
||||
import RecommendedModels from '../components/RecommendedModels'
|
||||
import React from 'react'
|
||||
|
||||
|
||||
@@ -301,6 +302,8 @@ export default function Models() {
|
||||
}
|
||||
/>
|
||||
|
||||
<RecommendedModels addToast={addToast} />
|
||||
|
||||
{/* Search */}
|
||||
<div className="search-bar" style={{ marginBottom: 'var(--spacing-md)' }}>
|
||||
<i className="fas fa-search search-icon" />
|
||||
|
||||
@@ -24,7 +24,37 @@ function formatNumber(n) {
|
||||
return String(n)
|
||||
}
|
||||
|
||||
function StatCard({ icon, label, value, muted }) {
|
||||
// Opt-in token pricing. LocalAI is self-hosted and has no inherent monetary
|
||||
// cost, but multi-user deployments use estimated cost for chargeback/budgeting.
|
||||
// Prices are admin-supplied $ per 1M tokens, stored locally (per-browser), and
|
||||
// the whole cost surface stays hidden until a non-zero price is set.
|
||||
const TOKEN_PRICING_KEY = 'localai_token_pricing'
|
||||
|
||||
// Read the stored token pricing from localStorage. Always returns an object
// with numeric `prompt` and `completion` fields; a missing entry, unparsable
// JSON, non-numeric values or unavailable storage all yield zeros.
function loadPricing() {
  const toPrice = (raw) => Number(raw) || 0
  try {
    const serialized = localStorage.getItem(TOKEN_PRICING_KEY) || '{}'
    const stored = JSON.parse(serialized)
    // Property access stays inside the try: a stored `null` parses fine but
    // throws here, and must fall back to zeros as well.
    return { prompt: toPrice(stored.prompt), completion: toPrice(stored.completion) }
  } catch {
    return { prompt: 0, completion: 0 }
  }
}
|
||||
|
||||
// Persist the pricing object to localStorage as JSON. Best-effort: any
// failure while serializing or storing is swallowed.
function savePricing(p) {
  try {
    const serialized = JSON.stringify(p)
    localStorage.setItem(TOKEN_PRICING_KEY, serialized)
  } catch {
    /* ignore */
  }
}
|
||||
|
||||
// True when at least one of the prompt/completion prices is greater than
// zero. A null or undefined pricing object counts as disabled.
function pricingEnabled(p) {
  const rates = [p?.prompt, p?.completion]
  return rates.some(rate => (rate || 0) > 0)
}
|
||||
|
||||
// Estimated cost of a usage row: each token count is divided by one million
// and multiplied by the matching price from `p` (prices are $ per 1M tokens).
//
// row: object with `prompt_tokens` / `completion_tokens` counts.
// p:   pricing object with `prompt` / `completion` prices.
//
// Each operand is coerced to 0 on its own when missing, null or non-numeric,
// so one absent field (or a null row / pricing object) cannot turn the whole
// sum into NaN and hide the cost of the field that is present.
function costOf(row, p) {
  const promptTokens = Number(row?.prompt_tokens) || 0
  const completionTokens = Number(row?.completion_tokens) || 0
  return (promptTokens / 1_000_000) * (p?.prompt || 0)
    + (completionTokens / 1_000_000) * (p?.completion || 0)
}
|
||||
|
||||
// Render a dollar amount for display: falsy amounts (0, NaN, undefined)
// show as '$0.00', amounts below one cent as '<$0.01', and everything else
// with two decimals.
function formatCost(n) {
  return !n
    ? '$0.00'
    : n < 0.01
      ? '<$0.01'
      : `$${n.toFixed(2)}`
}
|
||||
|
||||
function StatCard({ icon, label, value, muted, text }) {
|
||||
return (
|
||||
<div className="card" style={{ padding: 'var(--spacing-sm) var(--spacing-md)', flex: '1 1 0', minWidth: 120, opacity: muted ? 0.7 : 1 }}>
|
||||
<div style={{ display: 'flex', alignItems: 'center', gap: 6, marginBottom: 2 }}>
|
||||
@@ -32,7 +62,7 @@ function StatCard({ icon, label, value, muted }) {
|
||||
<span style={{ fontSize: '0.6875rem', color: 'var(--color-text-muted)', fontWeight: 500, textTransform: 'uppercase', letterSpacing: '0.03em' }}>{label}</span>
|
||||
</div>
|
||||
<div style={{ fontSize: '1.375rem', fontWeight: 700, fontFamily: 'var(--font-mono)', color: muted ? 'var(--color-text-secondary)' : 'var(--color-text-primary)' }}>
|
||||
{muted ? '~' : ''}{formatNumber(value)}
|
||||
{text != null ? text : `${muted ? '~' : ''}${formatNumber(value)}`}
|
||||
</div>
|
||||
</div>
|
||||
)
|
||||
@@ -642,6 +672,10 @@ export default function Usage() {
|
||||
const [activeTab, setActiveTab] = useState('models')
|
||||
const [quotas, setQuotas] = useState([])
|
||||
const [selectedUserId, setSelectedUserId] = useState(null)
|
||||
const [pricing, setPricingState] = useState(loadPricing)
|
||||
const [showPricing, setShowPricing] = useState(false)
|
||||
const setPricing = (p) => { setPricingState(p); savePricing(p) }
|
||||
const costEnabled = pricingEnabled(pricing)
|
||||
|
||||
const fetchUsage = useCallback(async () => {
|
||||
setLoading(true)
|
||||
@@ -743,11 +777,50 @@ export default function Usage() {
|
||||
<i className="fas fa-key" style={{ fontSize: '0.7rem' }} /> {t('usage.sources.tab')}
|
||||
</button>
|
||||
<div style={{ flex: 1 }} />
|
||||
<button
|
||||
className={`btn btn-sm ${costEnabled ? 'btn-primary' : 'btn-secondary'}`}
|
||||
onClick={() => setShowPricing(v => !v)}
|
||||
style={{ gap: 4 }}
|
||||
title="Set token pricing to estimate cost"
|
||||
>
|
||||
<i className="fas fa-dollar-sign" /> {costEnabled ? 'Pricing' : 'Set pricing'}
|
||||
</button>
|
||||
<button className="btn btn-secondary btn-sm" onClick={fetchUsage} disabled={loading} style={{ gap: 4 }}>
|
||||
<i className={`fas fa-rotate${loading ? ' fa-spin' : ''}`} /> Refresh
|
||||
</button>
|
||||
</div>
|
||||
|
||||
{showPricing && (
|
||||
<div className="card" style={{ display: 'flex', alignItems: 'flex-end', gap: 'var(--spacing-md)', flexWrap: 'wrap', padding: 'var(--spacing-md)', marginBottom: 'var(--spacing-md)' }}>
|
||||
<div style={{ display: 'flex', flexDirection: 'column', gap: 2 }}>
|
||||
<label style={{ fontSize: '0.6875rem', color: 'var(--color-text-muted)', textTransform: 'uppercase', letterSpacing: '0.03em' }}>Prompt $/1M tokens</label>
|
||||
<input
|
||||
className="input" type="number" min="0" step="0.01" style={{ width: 140 }}
|
||||
value={pricing.prompt || ''}
|
||||
placeholder="0.00"
|
||||
onChange={e => setPricing({ ...pricing, prompt: Number(e.target.value) || 0 })}
|
||||
/>
|
||||
</div>
|
||||
<div style={{ display: 'flex', flexDirection: 'column', gap: 2 }}>
|
||||
<label style={{ fontSize: '0.6875rem', color: 'var(--color-text-muted)', textTransform: 'uppercase', letterSpacing: '0.03em' }}>Completion $/1M tokens</label>
|
||||
<input
|
||||
className="input" type="number" min="0" step="0.01" style={{ width: 140 }}
|
||||
value={pricing.completion || ''}
|
||||
placeholder="0.00"
|
||||
onChange={e => setPricing({ ...pricing, completion: Number(e.target.value) || 0 })}
|
||||
/>
|
||||
</div>
|
||||
{costEnabled && (
|
||||
<button className="btn btn-secondary btn-sm" onClick={() => setPricing({ prompt: 0, completion: 0 })} style={{ gap: 4 }}>
|
||||
<i className="fas fa-times" /> Clear
|
||||
</button>
|
||||
)}
|
||||
<span style={{ fontSize: '0.75rem', color: 'var(--color-text-muted)', flex: '1 1 200px' }}>
|
||||
Estimated cost only. Prices are stored in this browser and applied to recorded token counts.
|
||||
</span>
|
||||
</div>
|
||||
)}
|
||||
|
||||
{loading ? (
|
||||
<div style={{ display: 'flex', justifyContent: 'center', padding: 'var(--spacing-xl)' }}>
|
||||
<LoadingSpinner size="lg" />
|
||||
@@ -760,6 +833,9 @@ export default function Usage() {
|
||||
<StatCard icon="fas fa-arrow-up" label="Prompt" value={displayTotals.prompt_tokens} />
|
||||
<StatCard icon="fas fa-arrow-down" label="Completion" value={displayTotals.completion_tokens} />
|
||||
<StatCard icon="fas fa-coins" label="Total" value={displayTotals.total_tokens} />
|
||||
{costEnabled && (
|
||||
<StatCard icon="fas fa-dollar-sign" label="Est. Cost" text={formatCost(costOf(displayTotals, pricing))} />
|
||||
)}
|
||||
</div>
|
||||
|
||||
{/* Predictions */}
|
||||
@@ -789,6 +865,7 @@ export default function Usage() {
|
||||
<th style={{ width: 110 }}>Prompt</th>
|
||||
<th style={{ width: 110 }}>Completion</th>
|
||||
<th style={{ width: 110 }}>Total</th>
|
||||
{costEnabled && <th style={{ width: 100 }}>Est. Cost</th>}
|
||||
<th style={{ width: 140 }}></th>
|
||||
</tr>
|
||||
</thead>
|
||||
@@ -800,6 +877,7 @@ export default function Usage() {
|
||||
<td style={monoCell}>{formatNumber(row.prompt_tokens)}</td>
|
||||
<td style={monoCell}>{formatNumber(row.completion_tokens)}</td>
|
||||
<td style={{ ...monoCell, fontWeight: 600 }}>{formatNumber(row.total_tokens)}</td>
|
||||
{costEnabled && <td style={monoCell}>{formatCost(costOf(row, pricing))}</td>}
|
||||
<td><UsageBar value={row.total_tokens} max={maxTokens} /></td>
|
||||
</tr>
|
||||
))}
|
||||
@@ -827,6 +905,7 @@ export default function Usage() {
|
||||
<th style={{ width: 110 }}>Prompt</th>
|
||||
<th style={{ width: 110 }}>Completion</th>
|
||||
<th style={{ width: 110 }}>Total</th>
|
||||
{costEnabled && <th style={{ width: 100 }}>Est. Cost</th>}
|
||||
<th style={{ width: 110 }}>Proj. Total</th>
|
||||
<th style={{ width: 140 }}></th>
|
||||
</tr>
|
||||
@@ -849,6 +928,7 @@ export default function Usage() {
|
||||
<td style={monoCell}>{formatNumber(row.prompt_tokens)}</td>
|
||||
<td style={monoCell}>{formatNumber(row.completion_tokens)}</td>
|
||||
<td style={{ ...monoCell, fontWeight: 600 }}>{formatNumber(row.total_tokens)}</td>
|
||||
{costEnabled && <td style={monoCell}>{formatCost(costOf(row, pricing))}</td>}
|
||||
<td style={{ ...monoCell, color: 'var(--color-text-muted)', fontStyle: 'italic' }}>
|
||||
{up?.predictions ? `~${formatNumber(up.predictions.projectedTotals.total_tokens)}` : '-'}
|
||||
</td>
|
||||
@@ -856,7 +936,7 @@ export default function Usage() {
|
||||
</tr>
|
||||
{isExpanded && up && (
|
||||
<tr>
|
||||
<td colSpan={8} style={{ padding: 0, background: 'var(--color-bg-secondary)' }}>
|
||||
<td colSpan={costEnabled ? 9 : 8} style={{ padding: 0, background: 'var(--color-bg-secondary)' }}>
|
||||
<div style={{ padding: 'var(--spacing-md)' }}>
|
||||
{up.predictions && (
|
||||
<div style={{ display: 'grid', gridTemplateColumns: 'repeat(auto-fit, minmax(100px, 1fr))', gap: 'var(--spacing-xs)', marginBottom: 'var(--spacing-sm)' }}>
|
||||
|
||||
@@ -156,7 +156,10 @@ func applyNodeHardwareDefaults(opts *pb.ModelOptions, node *BackendNode) {
|
||||
VRAM: node.TotalVRAM,
|
||||
}
|
||||
if config.IsManagedPhysicalBatch(int(opts.NBatch)) {
|
||||
opts.NBatch = int32(config.PhysicalBatch(gpu))
|
||||
// Gate the raised batch on the selected node's per-device VRAM at this
|
||||
// model's context, so a large context can't overflow the node's compute
|
||||
// buffer (issue #10485). node.TotalVRAM is the node's reported ceiling.
|
||||
opts.NBatch = int32(config.PhysicalBatchForContext(gpu, int(opts.ContextSize)))
|
||||
}
|
||||
// Default concurrent serving for the selected node (the frontend that built
|
||||
// the options may have no GPU). Only adds when no parallel option is set.
|
||||
|
||||
@@ -8,12 +8,19 @@ import (
|
||||
)
|
||||
|
||||
var _ = Describe("applyNodeHardwareDefaults", func() {
|
||||
It("raises a managed default batch on a Blackwell node", func() {
|
||||
opts := &pb.ModelOptions{NBatch: config.DefaultPhysicalBatch}
|
||||
applyNodeHardwareDefaults(opts, &BackendNode{GPUComputeCapability: "12.1"})
|
||||
It("raises a managed default batch on a Blackwell node with headroom", func() {
|
||||
opts := &pb.ModelOptions{NBatch: config.DefaultPhysicalBatch, ContextSize: 8192}
|
||||
applyNodeHardwareDefaults(opts, &BackendNode{GPUComputeCapability: "12.1", TotalVRAM: 119 << 30})
|
||||
Expect(opts.NBatch).To(BeEquivalentTo(config.BlackwellPhysicalBatch))
|
||||
})
|
||||
|
||||
It("keeps the default batch when a large context would overflow the node", func() {
|
||||
// Regression guard for issue #10485 on the distributed path.
|
||||
opts := &pb.ModelOptions{NBatch: config.DefaultPhysicalBatch, ContextSize: 204800}
|
||||
applyNodeHardwareDefaults(opts, &BackendNode{GPUComputeCapability: "12.0", TotalVRAM: 16 << 30})
|
||||
Expect(opts.NBatch).To(BeEquivalentTo(config.DefaultPhysicalBatch))
|
||||
})
|
||||
|
||||
It("resets a Blackwell guess on a non-Blackwell node", func() {
|
||||
// frontend (Blackwell) guessed high, but the selected node is not Blackwell
|
||||
opts := &pb.ModelOptions{NBatch: config.BlackwellPhysicalBatch}
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
{
|
||||
"version": "v4.4.3"
|
||||
"version": "v4.5.0"
|
||||
}
|
||||
|
||||
@@ -3,24 +3,7 @@
|
||||
url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
|
||||
urls:
|
||||
- https://huggingface.co/LiquidAI/LFM2.5-1.2B-Instruct-GGUF
|
||||
description: |
|
||||
Try LFM • Docs • LEAP • Discord
|
||||
|
||||
# LFM2.5-1.2B-Instruct
|
||||
|
||||
LFM2.5 is a new family of hybrid models designed for **on-device deployment**. It builds on the LFM2 architecture with extended pre-training and reinforcement learning.
|
||||
|
||||
- **Best-in-class performance**: A 1.2B model rivaling much larger models, bringing high-quality AI to your pocket.
|
||||
- **Fast edge inference**: 239 tok/s decode on AMD CPU, 82 tok/s on mobile NPU. Runs under 1GB of memory with day-one support for llama.cpp, MLX, and vLLM.
|
||||
- **Scaled training**: Extended pre-training from 10T to 28T tokens and large-scale multi-stage reinforcement learning.
|
||||
|
||||
Find more information about LFM2.5 in our blog post.
|
||||
|
||||
## 🗒️ Model Details
|
||||
|
||||
LFM2.5-1.2B-Instruct is a general-purpose text-only model with the following features:
|
||||
|
||||
...
|
||||
description: "Try LFM • Docs • LEAP • Discord\n\n# LFM2.5-1.2B-Instruct\n\nLFM2.5 is a new family of hybrid models designed for **on-device deployment**. It builds on the LFM2 architecture with extended pre-training and reinforcement learning.\n\n - **Best-in-class performance**: A 1.2B model rivaling much larger models, bringing high-quality AI to your pocket.\n - **Fast edge inference**: 239 tok/s decode on AMD CPU, 82 tok/s on mobile NPU. Runs under 1GB of memory with day-one support for llama.cpp, MLX, and vLLM.\n - **Scaled training**: Extended pre-training from 10T to 28T tokens and large-scale multi-stage reinforcement learning.\n\nFind more information about LFM2.5 in our blog post.\n\n## \U0001F5D2️ Model Details\n\nLFM2.5-1.2B-Instruct is a general-purpose text-only model with the following features:\n\n...\n"
|
||||
license: "other"
|
||||
tags:
|
||||
- llm
|
||||
@@ -842,8 +825,8 @@
|
||||
use_tokenizer_template: true
|
||||
files:
|
||||
- filename: llama-cpp/models/Qwopus3.6-27B-Coder-MTP-GGUF/Qwopus3.6-27B-Coder-MTP-Q4_K_M.gguf
|
||||
sha256: b2898667ed7b2388f0ab7691393833ae777f247492bbe62fdb4b2bd3e3cf3f79
|
||||
uri: https://huggingface.co/Jackrong/Qwopus3.6-27B-Coder-MTP-GGUF/resolve/main/Qwopus3.6-27B-Coder-MTP-Q4_K_M.gguf
|
||||
sha256: b2b9180093496da2e00439e3fa23227c591355901bfa579bc6897bbc01b755ef
|
||||
- filename: llama-cpp/mmproj/Qwopus3.6-27B-Coder-MTP-GGUF/mmproj-F32.gguf
|
||||
sha256: 32f7ea0600c07272547da401d460f8abbd980f3a57b69d6df87be0e2505e0b9c
|
||||
uri: https://huggingface.co/Jackrong/Qwopus3.6-27B-Coder-MTP-GGUF/resolve/main/mmproj-F32.gguf
|
||||
|
||||
@@ -129,6 +129,61 @@ func TotalAvailableVRAM() (uint64, error) {
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
// MinPerGPUVRAM returns the total VRAM of the SMALLEST GPU on the host (in
|
||||
// bytes), or 0 when no per-device VRAM is known. Unlike TotalAvailableVRAM
|
||||
// (which sums across devices) this reports a single device's ceiling, which is
|
||||
// the right figure for decisions about what must fit on one card: the compute
|
||||
// buffer (sized by n_ubatch) and the parallel-slot tier. Summing a multi-GPU
|
||||
// host's VRAM over-provisions those into a per-device OOM (issue #10485).
|
||||
//
|
||||
// Unified-memory devices (GB10, Apple) report system RAM as their single
|
||||
// device's VRAM, so they are unaffected.
|
||||
func MinPerGPUVRAM() (uint64, error) {
|
||||
// Prefer per-device binary detection (nvidia-smi/rocm-smi report true
|
||||
// per-card VRAM); ghw's per-card memory can reflect NUMA node RAM on some
|
||||
// hosts, which is why TotalAvailableVRAM treats it as a sum.
|
||||
if infos := GetGPUMemoryUsage(); len(infos) > 0 {
|
||||
if v := minNonZeroVRAM(infos); v > 0 {
|
||||
return v, nil
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback: ghw per-card memory, taking the minimum non-zero card.
|
||||
if gpus, err := GPUs(); err == nil {
|
||||
var min uint64
|
||||
for _, gpu := range gpus {
|
||||
if gpu == nil || gpu.Node == nil || gpu.Node.Memory == nil {
|
||||
continue
|
||||
}
|
||||
if b := gpu.Node.Memory.TotalUsableBytes; b > 0 {
|
||||
if u := uint64(b); min == 0 || u < min {
|
||||
min = u
|
||||
}
|
||||
}
|
||||
}
|
||||
if min > 0 {
|
||||
return min, nil
|
||||
}
|
||||
}
|
||||
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
// minNonZeroVRAM returns the smallest non-zero TotalVRAM across the given GPUs,
|
||||
// or 0 when none report VRAM.
|
||||
func minNonZeroVRAM(infos []GPUMemoryInfo) uint64 {
|
||||
var min uint64
|
||||
for _, g := range infos {
|
||||
if g.TotalVRAM == 0 {
|
||||
continue
|
||||
}
|
||||
if min == 0 || g.TotalVRAM < min {
|
||||
min = g.TotalVRAM
|
||||
}
|
||||
}
|
||||
return min
|
||||
}
|
||||
|
||||
func HasGPU(vendor string) bool {
|
||||
gpus, err := GPUs()
|
||||
if err != nil {
|
||||
|
||||
37
pkg/xsysinfo/minvram_internal_test.go
Normal file
37
pkg/xsysinfo/minvram_internal_test.go
Normal file
@@ -0,0 +1,37 @@
|
||||
package xsysinfo
|
||||
|
||||
import (
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
var _ = Describe("minNonZeroVRAM", func() {
|
||||
const gib = uint64(1) << 30
|
||||
|
||||
It("returns the smallest device on a multi-GPU host", func() {
|
||||
// Two unequal cards (e.g. RTX 5070 Ti + 5060 Ti, both 16 GiB, or a
|
||||
// mixed pair): the smallest device is the per-card allocation ceiling.
|
||||
infos := []GPUMemoryInfo{
|
||||
{TotalVRAM: 16 * gib},
|
||||
{TotalVRAM: 12 * gib},
|
||||
}
|
||||
Expect(minNonZeroVRAM(infos)).To(Equal(12 * gib))
|
||||
})
|
||||
|
||||
It("ignores devices that report zero VRAM", func() {
|
||||
infos := []GPUMemoryInfo{
|
||||
{TotalVRAM: 0},
|
||||
{TotalVRAM: 24 * gib},
|
||||
}
|
||||
Expect(minNonZeroVRAM(infos)).To(Equal(24 * gib))
|
||||
})
|
||||
|
||||
It("returns the single device's VRAM on a one-GPU host", func() {
|
||||
Expect(minNonZeroVRAM([]GPUMemoryInfo{{TotalVRAM: 16 * gib}})).To(Equal(16 * gib))
|
||||
})
|
||||
|
||||
It("returns 0 when no device reports VRAM", func() {
|
||||
Expect(minNonZeroVRAM([]GPUMemoryInfo{{TotalVRAM: 0}})).To(BeZero())
|
||||
Expect(minNonZeroVRAM(nil)).To(BeZero())
|
||||
})
|
||||
})
|
||||
Reference in New Issue
Block a user