mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-25 00:59:28 -04:00
Compare commits
35 Commits
dependabot
...
feat/darwi
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4f7bf33b2d | ||
|
|
0d6de15ae9 | ||
|
|
5e3774dfe3 | ||
|
|
5c3d48ab50 | ||
|
|
764b0352b9 | ||
|
|
75ba2daba1 | ||
|
|
62b14fd635 | ||
|
|
bfb9a40d58 | ||
|
|
af7d0e8b40 | ||
|
|
193d0e6aef | ||
|
|
7743a0abc0 | ||
|
|
482314c623 | ||
|
|
3447b28bbd | ||
|
|
e8ae88a2a0 | ||
|
|
e1994579f8 | ||
|
|
e5620989dd | ||
|
|
fc618dcee6 | ||
|
|
e6042080c0 | ||
|
|
0f3b24436d | ||
|
|
4b6f911835 | ||
|
|
a5e28942a6 | ||
|
|
dba9cd7ca4 | ||
|
|
c93190de50 | ||
|
|
4dbf69f889 | ||
|
|
deb430f3ec | ||
|
|
dd8c8778e2 | ||
|
|
06a7b6cadb | ||
|
|
67c8889866 | ||
|
|
1d49041c85 | ||
|
|
2edc4e25b3 | ||
|
|
7888067914 | ||
|
|
9eedbf537a | ||
|
|
69c16481c8 | ||
|
|
56f8a6623f | ||
|
|
4755d676a3 |
14
.github/backend-matrix.yml
vendored
14
.github/backend-matrix.yml
vendored
@@ -4974,6 +4974,16 @@ includeDarwin:
|
||||
- backend: "kitten-tts"
|
||||
tag-suffix: "-metal-darwin-arm64-kitten-tts"
|
||||
build-type: "mps"
|
||||
# vLLM on Apple Silicon via vllm-metal (MLX). The install is custom
|
||||
# (backend/python/vllm/install.sh has a darwin branch); lang stays python so
|
||||
# backend_build_darwin.yml drives it through build-darwin-python-backend ->
|
||||
# scripts/build/python-darwin.sh, which runs the backend's install.sh.
|
||||
- backend: "vllm"
|
||||
tag-suffix: "-metal-darwin-arm64-vllm"
|
||||
build-type: "mps"
|
||||
- backend: "liquid-audio"
|
||||
tag-suffix: "-metal-darwin-arm64-liquid-audio"
|
||||
build-type: "mps"
|
||||
- backend: "piper"
|
||||
tag-suffix: "-metal-darwin-arm64-piper"
|
||||
build-type: "metal"
|
||||
@@ -4990,6 +5000,10 @@ includeDarwin:
|
||||
tag-suffix: "-metal-darwin-arm64-sherpa-onnx"
|
||||
build-type: "metal"
|
||||
lang: "go"
|
||||
- backend: "supertonic"
|
||||
tag-suffix: "-metal-darwin-arm64-supertonic"
|
||||
build-type: "metal"
|
||||
lang: "go"
|
||||
- backend: "local-store"
|
||||
tag-suffix: "-metal-darwin-arm64-local-store"
|
||||
build-type: "metal"
|
||||
|
||||
55
.github/bump_vllm_metal.sh
vendored
Executable file
55
.github/bump_vllm_metal.sh
vendored
Executable file
@@ -0,0 +1,55 @@
|
||||
#!/bin/bash
|
||||
# Bump the single vllm-metal pin (VLLM_METAL_VERSION) in the vLLM backend's
|
||||
# darwin (Apple Silicon) install path. The macOS/Metal build
|
||||
# (backend/python/vllm/install.sh, Darwin branch) installs vllm-metal, which is
|
||||
# version-locked to a specific vLLM source release. install.sh derives that vLLM
|
||||
# version at build time from vllm-metal's own installer (`vllm_v=`) at the pinned
|
||||
# tag, so there is only ONE value to bump here -- mirroring bump_vllm_wheel.sh,
|
||||
# which bumps the Linux cu130 wheel pin.
|
||||
#
|
||||
# This deliberately tracks vllm-project/vllm-metal, NOT vllm-project/vllm: the
|
||||
# darwin build can only use the exact vLLM version vllm-metal supports, so it may
|
||||
# lag the Linux pin (requirements-cublas13-after.txt) until vllm-metal catches up.
|
||||
set -xe
|
||||
REPO=$1 # vllm-project/vllm-metal
|
||||
FILE=$2 # backend/python/vllm/install.sh
|
||||
VAR=$3 # VLLM_METAL_VERSION (used for the workflow's output file names)
|
||||
|
||||
if [ -z "$FILE" ] || [ -z "$REPO" ] || [ -z "$VAR" ]; then
|
||||
echo "usage: $0 <repo> <install-file> <var-name>" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# vllm-metal ships frequent dev releases, all flagged as non-prerelease, so
|
||||
# /releases/latest returns the newest one (with its cp312 wheel asset).
|
||||
LATEST_TAG=$(curl -sS -H "Accept: application/vnd.github+json" \
|
||||
"https://api.github.com/repos/$REPO/releases/latest" \
|
||||
| python3 -c "import json,sys; print(json.load(sys.stdin)['tag_name'])")
|
||||
|
||||
# The coupled vLLM source version lives in vllm-metal's installer at that tag.
|
||||
NEW_VLLM_VERSION=$(curl -fsSL \
|
||||
"https://raw.githubusercontent.com/$REPO/$LATEST_TAG/install.sh" \
|
||||
| grep -oE 'vllm_v="[0-9]+\.[0-9]+\.[0-9]+"' | head -1 | cut -d'"' -f2)
|
||||
|
||||
if [ -z "$LATEST_TAG" ] || [ -z "$NEW_VLLM_VERSION" ]; then
|
||||
echo "Could not resolve vllm-metal tag ($LATEST_TAG) or its vllm_v ($NEW_VLLM_VERSION)." >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
set +e
|
||||
CURRENT_TAG=$(grep -oE 'VLLM_METAL_VERSION="[^"]*"' "$FILE" | head -1 | cut -d'"' -f2)
|
||||
set -e
|
||||
|
||||
# Rewrite the single pin. install.sh derives VLLM_VERSION from this tag at build
|
||||
# time, so there is nothing else to touch. peter-evans/create-pull-request opens
|
||||
# no PR on a clean tree, so a no-op rewrite (already current) is safe.
|
||||
sed -i "$FILE" \
|
||||
-e "s|VLLM_METAL_VERSION=\"[^\"]*\"|VLLM_METAL_VERSION=\"$LATEST_TAG\"|"
|
||||
|
||||
if [ -z "$CURRENT_TAG" ]; then
|
||||
echo "Could not find VLLM_METAL_VERSION=\"...\" in $FILE." >&2
|
||||
exit 0
|
||||
fi
|
||||
|
||||
echo "vllm-metal ${CURRENT_TAG} -> ${LATEST_TAG} (builds vLLM ${NEW_VLLM_VERSION}): https://github.com/$REPO/releases/tag/${LATEST_TAG}" >> "${VAR}_message.txt"
|
||||
echo "${LATEST_TAG}" >> "${VAR}_commit.txt"
|
||||
36
.github/workflows/bump_deps.yaml
vendored
36
.github/workflows/bump_deps.yaml
vendored
@@ -154,3 +154,39 @@ jobs:
|
||||
branch: "update/VLLM_VERSION"
|
||||
body: ${{ steps.bump.outputs.message }}
|
||||
signoff: true
|
||||
|
||||
bump-vllm-metal:
|
||||
# The darwin (Apple Silicon) vLLM build installs vllm-metal, which is locked
|
||||
# to a specific vLLM source release. install.sh pins only VLLM_METAL_VERSION
|
||||
# (the wheel release) and derives VLLM_VERSION (the vLLM it builds against) from
|
||||
# that tag at build time; this job tracks vllm-metal and rewrites that one pin. Separate from
|
||||
# bump-vllm-wheel because darwin follows vllm-metal, not vllm/vllm latest.
|
||||
if: github.repository == 'mudler/LocalAI'
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v7
|
||||
- name: Bump vllm-metal pin 🔧
|
||||
id: bump
|
||||
run: |
|
||||
bash .github/bump_vllm_metal.sh vllm-project/vllm-metal backend/python/vllm/install.sh VLLM_METAL_VERSION
|
||||
{
|
||||
echo 'message<<EOF'
|
||||
cat "VLLM_METAL_VERSION_message.txt"
|
||||
echo EOF
|
||||
} >> "$GITHUB_OUTPUT"
|
||||
{
|
||||
echo 'commit<<EOF'
|
||||
cat "VLLM_METAL_VERSION_commit.txt"
|
||||
echo EOF
|
||||
} >> "$GITHUB_OUTPUT"
|
||||
rm -rfv VLLM_METAL_VERSION_message.txt VLLM_METAL_VERSION_commit.txt
|
||||
- name: Create Pull Request
|
||||
uses: peter-evans/create-pull-request@v8
|
||||
with:
|
||||
token: ${{ secrets.UPDATE_BOT_TOKEN }}
|
||||
push-to-fork: ci-forks/LocalAI
|
||||
commit-message: ':arrow_up: Update vllm-project/vllm-metal (darwin)'
|
||||
title: 'chore: :arrow_up: Update vllm-metal (darwin) to `${{ steps.bump.outputs.commit }}`'
|
||||
branch: "update/VLLM_METAL_VERSION"
|
||||
body: ${{ steps.bump.outputs.message }}
|
||||
signoff: true
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
|
||||
IK_LLAMA_VERSION?=6c00e87ac84404af588ad2e65935bd6f079c696f
|
||||
IK_LLAMA_VERSION?=7ccf1d209588962b96eacca325b37e9b3e8faf5e
|
||||
LLAMA_REPO?=https://github.com/ikawrakow/ik_llama.cpp
|
||||
|
||||
CMAKE_ARGS?=
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
|
||||
LLAMA_VERSION?=7c082bc417bbe53210a83df4ba5b49e18ce6193c
|
||||
LLAMA_VERSION?=be4a6a63eb2b848e19c277bdcf2bd399e8af76d9
|
||||
LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
|
||||
|
||||
CMAKE_ARGS?=
|
||||
|
||||
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
|
||||
|
||||
# CrispASR version (release tag)
|
||||
CRISPASR_REPO?=https://github.com/CrispStrobe/CrispASR
|
||||
CRISPASR_VERSION?=7a8cb80907341c0204bd0488c1244764f4163883
|
||||
CRISPASR_VERSION?=96b2a6ee31d30389fed8a7ef1a54239b75231ddc
|
||||
SO_TARGET?=libgocrispasr.so
|
||||
|
||||
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# parakeet-cpp backend Makefile.
|
||||
#
|
||||
# Upstream pin lives below as PARAKEET_VERSION?=db755a78d39f789bb7d4e3935158a9e8105dbe36
|
||||
# Upstream pin lives below as PARAKEET_VERSION?=89f5e2977b4d8bccd45e7bcc6f2ef7c4ed49e89a
|
||||
# (.github/bump_deps.sh) can find and update it - matches the
|
||||
# whisper.cpp / ds4 / vibevoice-cpp convention.
|
||||
#
|
||||
@@ -15,7 +15,7 @@
|
||||
# That's what the L0 smoke test uses. The default target below does the
|
||||
# proper clone-at-pin + cmake build so CI doesn't need a side-checkout.
|
||||
|
||||
PARAKEET_VERSION?=db755a78d39f789bb7d4e3935158a9e8105dbe36
|
||||
PARAKEET_VERSION?=89f5e2977b4d8bccd45e7bcc6f2ef7c4ed49e89a
|
||||
PARAKEET_REPO?=https://github.com/mudler/parakeet.cpp
|
||||
|
||||
GOCMD?=go
|
||||
|
||||
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
|
||||
|
||||
# stablediffusion.cpp (ggml)
|
||||
STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
|
||||
STABLEDIFFUSION_GGML_VERSION?=b12098f5d09fc83da36e65c784f7bdb16a5a5ebf
|
||||
STABLEDIFFUSION_GGML_VERSION?=f440ad9c29dd8bc34e5d1f4b863832b96d6ea05f
|
||||
|
||||
CMAKE_ARGS+=-DGGML_MAX_NAME=128
|
||||
|
||||
|
||||
@@ -16,6 +16,7 @@ import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"runtime"
|
||||
"strings"
|
||||
"time"
|
||||
"unicode"
|
||||
@@ -943,7 +944,13 @@ func InitializeONNXRuntime() error {
|
||||
}
|
||||
}
|
||||
if libPath == "" {
|
||||
libPath = "/usr/local/lib/libonnxruntime.so"
|
||||
// LocalAI: default to the platform-native shared library
|
||||
// extension when nothing else is found (dyld vs ld.so).
|
||||
if runtime.GOOS == "darwin" {
|
||||
libPath = "/usr/local/lib/libonnxruntime.dylib"
|
||||
} else {
|
||||
libPath = "/usr/local/lib/libonnxruntime.so"
|
||||
}
|
||||
}
|
||||
}
|
||||
ort.SetSharedLibraryPath(libPath)
|
||||
|
||||
@@ -32,6 +32,10 @@ elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
|
||||
cp -arfLv /lib/aarch64-linux-gnu/libdl.so.2 $CURDIR/package/lib/libdl.so.2
|
||||
cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 $CURDIR/package/lib/librt.so.1
|
||||
cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 $CURDIR/package/lib/libpthread.so.0
|
||||
elif [ $(uname -s) = "Darwin" ]; then
|
||||
# macOS: dyld resolves the bundled .dylib via DYLD_LIBRARY_PATH (set in
|
||||
# run.sh); there is no ld.so loader nor glibc to bundle.
|
||||
echo "Detected Darwin"
|
||||
else
|
||||
echo "Error: Could not detect architecture"
|
||||
exit 1
|
||||
|
||||
@@ -3,12 +3,19 @@ set -ex
|
||||
|
||||
CURDIR=$(dirname "$(realpath $0)")
|
||||
|
||||
export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
|
||||
export ONNXRUNTIME_LIB_PATH=$CURDIR/lib/libonnxruntime.so
|
||||
if [ "$(uname)" = "Darwin" ]; then
|
||||
# macOS uses dyld: there is no ld.so loader, and the search path env
|
||||
# var is DYLD_LIBRARY_PATH. ONNX Runtime ships as a .dylib here.
|
||||
export DYLD_LIBRARY_PATH=$CURDIR/lib:$DYLD_LIBRARY_PATH
|
||||
export ONNXRUNTIME_LIB_PATH=$CURDIR/lib/libonnxruntime.dylib
|
||||
else
|
||||
export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
|
||||
export ONNXRUNTIME_LIB_PATH=$CURDIR/lib/libonnxruntime.so
|
||||
|
||||
if [ -f $CURDIR/lib/ld.so ]; then
|
||||
echo "Using lib/ld.so"
|
||||
exec $CURDIR/lib/ld.so $CURDIR/supertonic "$@"
|
||||
if [ -f $CURDIR/lib/ld.so ]; then
|
||||
echo "Using lib/ld.so"
|
||||
exec $CURDIR/lib/ld.so $CURDIR/supertonic "$@"
|
||||
fi
|
||||
fi
|
||||
|
||||
exec $CURDIR/supertonic "$@"
|
||||
|
||||
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
|
||||
|
||||
# whisper.cpp version
|
||||
WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp
|
||||
WHISPER_CPP_VERSION?=5ed76e9a079962f1c85cfce44edd325c27ef1f97
|
||||
WHISPER_CPP_VERSION?=43d78af5be58f41d6ffbc227d608f104577741ea
|
||||
SO_TARGET?=libgowhisper.so
|
||||
|
||||
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
|
||||
|
||||
@@ -645,6 +645,7 @@
|
||||
nvidia-cuda-13: "cuda13-vllm"
|
||||
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-vllm"
|
||||
cpu: "cpu-vllm"
|
||||
metal: "metal-vllm"
|
||||
- &sglang
|
||||
name: "sglang"
|
||||
license: apache-2.0
|
||||
@@ -1284,6 +1285,7 @@
|
||||
nvidia-cuda-13: "cuda13-liquid-audio"
|
||||
nvidia-cuda-12: "cuda12-liquid-audio"
|
||||
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-liquid-audio"
|
||||
metal: "metal-liquid-audio"
|
||||
icon: https://cdn-avatars.huggingface.co/v1/production/uploads/61b8e2ba285851687028d395/7_6D7rWrLxp2hb6OHSV1p.png
|
||||
- &qwen-tts
|
||||
urls:
|
||||
@@ -1569,6 +1571,7 @@
|
||||
- TTS
|
||||
capabilities:
|
||||
default: "cpu-supertonic"
|
||||
metal: "metal-supertonic"
|
||||
- !!merge <<: *neutts
|
||||
name: "neutts-development"
|
||||
capabilities:
|
||||
@@ -2927,6 +2930,17 @@
|
||||
nvidia-cuda-13: "cuda13-vllm-development"
|
||||
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-vllm-development"
|
||||
cpu: "cpu-vllm-development"
|
||||
metal: "metal-vllm-development"
|
||||
- !!merge <<: *vllm
|
||||
name: "metal-vllm"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-vllm"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-metal-darwin-arm64-vllm
|
||||
- !!merge <<: *vllm
|
||||
name: "metal-vllm-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-vllm"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-metal-darwin-arm64-vllm
|
||||
- !!merge <<: *vllm
|
||||
name: "cuda12-vllm"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-vllm"
|
||||
@@ -4612,6 +4626,7 @@
|
||||
nvidia-cuda-13: "cuda13-liquid-audio-development"
|
||||
nvidia-cuda-12: "cuda12-liquid-audio-development"
|
||||
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-liquid-audio-development"
|
||||
metal: "metal-liquid-audio-development"
|
||||
- !!merge <<: *liquid-audio
|
||||
name: "cpu-liquid-audio"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-liquid-audio"
|
||||
@@ -4622,6 +4637,16 @@
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-cpu-liquid-audio"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-cpu-liquid-audio
|
||||
- !!merge <<: *liquid-audio
|
||||
name: "metal-liquid-audio"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-liquid-audio"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-metal-darwin-arm64-liquid-audio
|
||||
- !!merge <<: *liquid-audio
|
||||
name: "metal-liquid-audio-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-liquid-audio"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-metal-darwin-arm64-liquid-audio
|
||||
- !!merge <<: *liquid-audio
|
||||
name: "cuda12-liquid-audio"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-liquid-audio"
|
||||
@@ -5484,6 +5509,7 @@
|
||||
name: "supertonic-development"
|
||||
capabilities:
|
||||
default: "cpu-supertonic-development"
|
||||
metal: "metal-supertonic-development"
|
||||
- !!merge <<: *supertonic
|
||||
name: "cpu-supertonic"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-supertonic"
|
||||
@@ -5494,3 +5520,13 @@
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-cpu-supertonic"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-cpu-supertonic
|
||||
- !!merge <<: *supertonic
|
||||
name: "metal-supertonic"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-supertonic"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-metal-darwin-arm64-supertonic
|
||||
- !!merge <<: *supertonic
|
||||
name: "metal-supertonic-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-supertonic"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-metal-darwin-arm64-supertonic
|
||||
|
||||
@@ -14,5 +14,11 @@ else
|
||||
fi
|
||||
|
||||
# liquid-audio's torch wheels are large; allow upgrades to satisfy transitive pins
|
||||
EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
|
||||
EXTRA_PIP_INSTALL_FLAGS+=" --upgrade"
|
||||
# --index-strategy is a uv-only flag. The darwin/MPS build installs with pip
|
||||
# (USE_PIP=true in scripts/build/python-darwin.sh), which rejects it. Only add
|
||||
# it on the uv path; Linux/CUDA resolution is unchanged.
|
||||
if [ "x${USE_PIP:-}" != "xtrue" ]; then
|
||||
EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-first-match"
|
||||
fi
|
||||
installRequirements
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
# MPS (Apple Silicon / Metal) build profile - installed by the darwin CI job.
|
||||
torch>=2.8.0
|
||||
torchaudio>=2.8.0
|
||||
torchcodec>=0.9.1
|
||||
|
||||
@@ -457,9 +457,14 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if last_output is None or not getattr(last_output, "prompt_logprobs", None):
|
||||
context.set_code(grpc.StatusCode.INTERNAL)
|
||||
context.set_details("vLLM did not return prompt_logprobs")
|
||||
_pl = getattr(last_output, "prompt_logprobs", None) if last_output is not None else None
|
||||
# Some engines accept the prompt_logprobs request but return a
|
||||
# list of all-None entries instead of computing them (observed
|
||||
# with vllm-metal's MLX backend on macOS). Treat that as
|
||||
# unsupported rather than silently scoring every candidate as 0.
|
||||
if not _pl or all(e is None for e in _pl):
|
||||
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
||||
context.set_details("This backend did not return prompt_logprobs; scoring is unsupported on this engine (e.g. vllm-metal / MLX on macOS).")
|
||||
return backend_pb2.ScoreResponse()
|
||||
|
||||
prompt_logprobs = last_output.prompt_logprobs
|
||||
|
||||
@@ -43,6 +43,24 @@ if [ "x${BUILD_PROFILE}" == "xcublas13" ]; then
|
||||
EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-best-match"
|
||||
fi
|
||||
|
||||
# Apple Silicon (Metal/MLX) via vllm-metal.
|
||||
# vllm-metal (github.com/vllm-project/vllm-metal) brings vLLM to macOS on Apple
|
||||
# Silicon: it registers through vLLM's platform-plugin entry point
|
||||
# (metal -> vllm_metal:register), MetalPlatform activates, and the vLLM v1
|
||||
# AsyncLLM engine runs on the GPU through MLX. LocalAI's backend.py is UNCHANGED
|
||||
# on darwin — AsyncEngineArgs(...) -> AsyncLLMEngine.from_engine_args transparently
|
||||
# resolves to the MLX engine (proven on a real M4 / macOS 26.5 against Qwen3-0.6B).
|
||||
#
|
||||
# vllm-metal REQUIRES Python 3.12, so force the portable CPython before the venv
|
||||
# is created (ensureVenv reads PYTHON_VERSION/PYTHON_PATCH/PY_STANDALONE_TAG).
|
||||
# The patch + standalone tag mirror the l4t13 cp312 pin — a known-good
|
||||
# python-build-standalone release that also ships an aarch64-apple-darwin asset.
|
||||
if [ "$(uname -s)" = "Darwin" ]; then
|
||||
PYTHON_VERSION="3.12"
|
||||
PYTHON_PATCH="12"
|
||||
PY_STANDALONE_TAG="20251120"
|
||||
fi
|
||||
|
||||
# JetPack 7 / L4T arm64 vllm + torch wheels come straight from PyPI now
|
||||
# (torch 2.11+ ships aarch64 + cu130 manylinux wheels and vllm 0.20+ ships
|
||||
# an aarch64 wheel pinned to that torch). They're cp312-only, so bump the
|
||||
@@ -57,11 +75,87 @@ if [ "x${BUILD_PROFILE}" == "xl4t13" ]; then
|
||||
PY_STANDALONE_TAG="20251120"
|
||||
fi
|
||||
|
||||
# ===================== Apple Silicon (Metal/MLX) =====================
|
||||
# Reproduce vllm-metal's upstream installer
|
||||
# (curl -fsSL https://raw.githubusercontent.com/vllm-project/vllm-metal/main/install.sh)
|
||||
# but INTO LocalAI's managed venv (ensureVenv) instead of a throwaway
|
||||
# ~/.venv-vllm-metal, so the backend integrates with LocalAI's venv lifecycle
|
||||
# (portable CPython, _makeVenvPortable relocation, runtime activation). The
|
||||
# normal CUDA/CPU installRequirements is skipped on darwin — there is no
|
||||
# macOS/arm64 vLLM wheel on PyPI; vLLM is built from source and the MLX engine
|
||||
# is layered on by the vllm-metal wheel.
|
||||
if [ "$(uname -s)" = "Darwin" ]; then
|
||||
# Create/activate the portable 3.12 venv. On darwin USE_PIP=true and
|
||||
# PORTABLE_PYTHON=true (set by scripts/build/python-darwin.sh), so this is a
|
||||
# `python -m venv` based, relocatable venv.
|
||||
ensureVenv
|
||||
|
||||
# vllm-metal's installer drives everything through `uv`: building vLLM from
|
||||
# the CPU requirements needs `--index-strategy unsafe-best-match` (mixes the
|
||||
# pytorch CPU channel with PyPI), a flag plain pip does not have. The darwin
|
||||
# venv is pip-based, so bootstrap uv into it. uv honours $VIRTUAL_ENV (set by
|
||||
# libbackend's _activateVenv) and installs into THIS venv — same pattern the
|
||||
# intel branch below relies on.
|
||||
pip install uv
|
||||
|
||||
# The ONLY darwin version pin -- AUTO-BUMPED by .github/bump_vllm_metal.sh,
|
||||
# which tracks vllm-project/vllm-metal releases (NOT vllm/vllm latest). Keep
|
||||
# it as a plain double-quoted assignment on its own line so the bumper's sed
|
||||
# can rewrite it. Darwin therefore follows vllm-metal and can lag the Linux
|
||||
# vllm pin (requirements-cublas13-after.txt, bumped independently against
|
||||
# vllm/vllm) until vllm-metal supports a newer vLLM.
|
||||
VLLM_METAL_VERSION="v0.3.0.dev20260622062346"
|
||||
|
||||
# The coupled vLLM source version is whatever this vllm-metal release builds
|
||||
# against -- it declares it in its own installer as `vllm_v=`. Derive it from
|
||||
# the PINNED tag rather than hardcoding a second value that could drift. The
|
||||
# tag is immutable, so this stays reproducible across rebuilds.
|
||||
VLLM_VERSION=$(curl -fsSL "https://raw.githubusercontent.com/vllm-project/vllm-metal/${VLLM_METAL_VERSION}/install.sh" \
|
||||
| grep -oE 'vllm_v="[0-9]+\.[0-9]+\.[0-9]+"' | head -n1 | cut -d'"' -f2)
|
||||
if [ -z "${VLLM_VERSION}" ]; then
|
||||
echo "ERROR: could not derive the vLLM version from vllm-metal ${VLLM_METAL_VERSION}" >&2
|
||||
exit 1
|
||||
fi
|
||||
echo "vllm-metal ${VLLM_METAL_VERSION} builds against vLLM ${VLLM_VERSION}"
|
||||
|
||||
_vllm_src=$(mktemp -d)
|
||||
trap 'rm -rf "${_vllm_src}"' EXIT
|
||||
pushd "${_vllm_src}"
|
||||
# 1) Build vLLM ${VLLM_VERSION} from the release source tarball against
|
||||
# the CPU requirements. vllm-metal layers its MLX platform plugin on
|
||||
# top of this exact build.
|
||||
curl -fsSL -o "vllm-${VLLM_VERSION}.tar.gz" \
|
||||
"https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}.tar.gz"
|
||||
tar -xzf "vllm-${VLLM_VERSION}.tar.gz"
|
||||
pushd "vllm-${VLLM_VERSION}"
|
||||
uv pip install -r requirements/cpu.txt --index-strategy unsafe-best-match
|
||||
# -Wno-parentheses: clang on macOS treats one of vLLM's C++ warnings
|
||||
# as an error without it (matches the upstream installer's CXXFLAGS).
|
||||
CXXFLAGS="-Wno-parentheses" uv pip install .
|
||||
popd
|
||||
popd
|
||||
|
||||
# 2) Install the prebuilt vllm-metal wheel for the PINNED release. It pulls
|
||||
# mlx / mlx-metal as deps and registers the `metal` platform plugin that
|
||||
# backend.py resolves to at engine-init time. Build the release-asset URL
|
||||
# deterministically (tag + the cp312/arm64 wheel name) rather than querying
|
||||
# api.github.com, whose unauthenticated rate limit (60/hr per IP) 403s on
|
||||
# shared CI runners. The wheel version is the tag without its leading 'v'.
|
||||
_metal_wheel="vllm_metal-${VLLM_METAL_VERSION#v}-cp312-cp312-macosx_11_0_arm64.whl"
|
||||
_metal_wheel_url="https://github.com/vllm-project/vllm-metal/releases/download/${VLLM_METAL_VERSION}/${_metal_wheel}"
|
||||
echo "Installing vllm-metal wheel: ${_metal_wheel_url}"
|
||||
uv pip install "${_metal_wheel_url}"
|
||||
|
||||
# Generate the gRPC stubs (backend_pb2*). installRequirements normally does
|
||||
# this via runProtogen at the end; we skipped installRequirements on darwin,
|
||||
# so call it explicitly here.
|
||||
runProtogen
|
||||
|
||||
# Intel XPU has no upstream-published vllm wheels, so we always build vllm
|
||||
# from source against torch-xpu and replace the default triton with
|
||||
# triton-xpu (matching torch 2.11). Mirrors the upstream procedure:
|
||||
# https://github.com/vllm-project/vllm/blob/main/docs/getting_started/installation/gpu.xpu.inc.md
|
||||
if [ "x${BUILD_TYPE}" == "xintel" ]; then
|
||||
elif [ "x${BUILD_TYPE}" == "xintel" ]; then
|
||||
# Hide requirements-intel-after.txt so installRequirements doesn't
|
||||
# try `pip install vllm` (would either fail or grab a non-XPU wheel).
|
||||
_intel_after="${backend_dir}/requirements-intel-after.txt"
|
||||
|
||||
@@ -4,4 +4,7 @@
|
||||
# instead — the cublas13 case in install.sh adds --index-strategy=unsafe-best-match
|
||||
# so uv consults this index alongside PyPI.
|
||||
--extra-index-url https://wheels.vllm.ai/0.23.0/cu130
|
||||
# VERSION COUPLING: darwin/Apple-Silicon builds use vllm-metal (see install.sh),
|
||||
# which builds the vLLM version that the pinned vllm-metal release declares, not
|
||||
# this pin. Bumping vllm here does not affect darwin; darwin may lag until vllm-metal catches up.
|
||||
vllm==0.23.0
|
||||
|
||||
@@ -215,6 +215,7 @@ func readRuntimeSettingsJson(startupAppConfig config.ApplicationConfig) fileHand
|
||||
envBackendGalleries := slices.Equal(appConfig.BackendGalleries, startupAppConfig.BackendGalleries)
|
||||
envAutoloadGalleries := appConfig.AutoloadGalleries == startupAppConfig.AutoloadGalleries
|
||||
envAutoloadBackendGalleries := appConfig.AutoloadBackendGalleries == startupAppConfig.AutoloadBackendGalleries
|
||||
envPIIDefaultDetectors := slices.Equal(appConfig.PIIDefaultDetectors, startupAppConfig.PIIDefaultDetectors)
|
||||
envAgentJobRetentionDays := appConfig.AgentJobRetentionDays == startupAppConfig.AgentJobRetentionDays
|
||||
envForceEvictionWhenBusy := appConfig.ForceEvictionWhenBusy == startupAppConfig.ForceEvictionWhenBusy
|
||||
envLRUEvictionMaxRetries := appConfig.LRUEvictionMaxRetries == startupAppConfig.LRUEvictionMaxRetries
|
||||
@@ -335,6 +336,15 @@ func readRuntimeSettingsJson(startupAppConfig config.ApplicationConfig) fileHand
|
||||
if settings.AutoloadBackendGalleries != nil && !envAutoloadBackendGalleries {
|
||||
appConfig.AutoloadBackendGalleries = *settings.AutoloadBackendGalleries
|
||||
}
|
||||
if settings.PIIDefaultDetectors != nil && !envPIIDefaultDetectors {
|
||||
// Request-side default redaction reads this live via
|
||||
// ResolvePIIPolicy, so a file edit takes effect on the next chat
|
||||
// request. The MITM listener resolves its per-host detector map
|
||||
// once at start, so a raw file edit reaches cloud-proxy traffic
|
||||
// only after a restart or a POST /api/settings (which rebuilds
|
||||
// the listener) — the admin UI uses the latter.
|
||||
appConfig.PIIDefaultDetectors = append([]string(nil), (*settings.PIIDefaultDetectors)...)
|
||||
}
|
||||
if settings.AutoUpgradeBackends != nil {
|
||||
appConfig.AutoUpgradeBackends = *settings.AutoUpgradeBackends
|
||||
}
|
||||
|
||||
@@ -109,6 +109,52 @@ var _ = Describe("loadRuntimeSettingsFromFile", func() {
|
||||
})
|
||||
})
|
||||
|
||||
// Instance-wide default PII detectors. The file is the source whenever
|
||||
// LOCALAI_PII_DEFAULT_DETECTORS is unset, and the loader runs immediately before startMITMIfConfigured,
|
||||
// so a regression here means the cloud-proxy MITM listener resolves an
|
||||
// empty detector set at boot and forwards intercepted traffic unredacted —
|
||||
// even though pii_default_detectors is on disk and the MITM model has PII
|
||||
// enabled. It also breaks request-side default redaction the same way.
|
||||
Describe("PII default detectors", func() {
|
||||
It("loads pii_default_detectors from the file", func() {
|
||||
cfg := &config.ApplicationConfig{DynamicConfigsDir: seedSettings(`{"pii_default_detectors": ["privacy-filter-nemotron", "secret-filter"]}`)}
|
||||
loadRuntimeSettingsFromFile(cfg)
|
||||
Expect(cfg.PIIDefaultDetectors).To(Equal([]string{"privacy-filter-nemotron", "secret-filter"}))
|
||||
})
|
||||
|
||||
It("does not override an env/CLI-set value (LOCALAI_PII_DEFAULT_DETECTORS)", func() {
|
||||
cfg := &config.ApplicationConfig{
|
||||
DynamicConfigsDir: seedSettings(`{"pii_default_detectors": ["from-file"]}`),
|
||||
PIIDefaultDetectors: []string{"from-env"}, // simulate WithPIIDefaultDetectors(env)
|
||||
}
|
||||
loadRuntimeSettingsFromFile(cfg)
|
||||
Expect(cfg.PIIDefaultDetectors).To(Equal([]string{"from-env"}), "env var must win over the persisted file value")
|
||||
})
|
||||
})
|
||||
|
||||
// The live file watcher applies pii_default_detectors on a runtime change
|
||||
// the same way it handles galleries/threads/etc.: env-set values (current
|
||||
// == startup snapshot) are left alone, otherwise the file value is applied
|
||||
// to the live config so request-side default redaction picks it up without
|
||||
// a restart.
|
||||
Describe("file watcher: pii_default_detectors", func() {
|
||||
It("applies a changed file value to the live config", func() {
|
||||
startup := config.ApplicationConfig{} // no env baseline
|
||||
live := &config.ApplicationConfig{PIIDefaultDetectors: []string{"old"}}
|
||||
handler := readRuntimeSettingsJson(startup)
|
||||
Expect(handler([]byte(`{"pii_default_detectors":["new-a","new-b"]}`), live)).To(Succeed())
|
||||
Expect(live.PIIDefaultDetectors).To(Equal([]string{"new-a", "new-b"}))
|
||||
})
|
||||
|
||||
It("leaves an env-controlled value untouched", func() {
|
||||
startup := config.ApplicationConfig{PIIDefaultDetectors: []string{"from-env"}}
|
||||
live := &config.ApplicationConfig{PIIDefaultDetectors: []string{"from-env"}}
|
||||
handler := readRuntimeSettingsJson(startup)
|
||||
Expect(handler([]byte(`{"pii_default_detectors":["from-file"]}`), live)).To(Succeed())
|
||||
Expect(live.PIIDefaultDetectors).To(Equal([]string{"from-env"}), "env-controlled detectors must not be overwritten by the file")
|
||||
})
|
||||
})
|
||||
|
||||
// The Agent Pool block has a mix of zero and non-zero defaults
|
||||
// (Enabled=true, EmbeddingModel="granite-...", MaxChunkingSize=400,
|
||||
// VectorEngine="chromem", AgentHubURL="https://agenthub.localai.io").
|
||||
|
||||
@@ -750,6 +750,20 @@ func loadRuntimeSettingsFromFile(options *config.ApplicationConfig) {
|
||||
options.MITMListen = *settings.MITMListen
|
||||
}
|
||||
|
||||
// Instance-wide default PII detectors. LOCALAI_PII_DEFAULT_DETECTORS (via
|
||||
// WithPIIDefaultDetectors) wins when set; otherwise the file is the source
|
||||
// — apply it only when the env/CLI left the value empty, mirroring the
|
||||
// "env > file" precedence used for the other fields. This must land before
|
||||
// startMITMIfConfigured (called right after this loader): the cloud-proxy
|
||||
// listener resolves each intercept host's detectors once at start via
|
||||
// ResolvePIIPolicy, and a MITM model that names no detectors of its own
|
||||
// falls back to these defaults. Without it the listener (and request-side
|
||||
// default redaction) starts with an empty detector set and forwards
|
||||
// traffic unredacted even though pii_default_detectors is on disk.
|
||||
if settings.PIIDefaultDetectors != nil && len(options.PIIDefaultDetectors) == 0 {
|
||||
options.PIIDefaultDetectors = append([]string(nil), (*settings.PIIDefaultDetectors)...)
|
||||
}
|
||||
|
||||
// Backend upgrade flags
|
||||
if settings.AutoUpgradeBackends != nil {
|
||||
if !options.AutoUpgradeBackends {
|
||||
|
||||
@@ -181,6 +181,8 @@ type RunCMD struct {
|
||||
// Cloud-proxy MITM listener (off by default).
|
||||
MITMListen string `env:"LOCALAI_MITM_LISTEN" help:"Address (host:port) for the cloudproxy MITM listener. Empty = disabled. Clients set HTTPS_PROXY=http://<this>:<port>. Intercept hosts are declared per-model via the model YAML mitm.hosts: block; create one from the Add Model UI." group:"middleware"`
|
||||
MITMCADir string `env:"LOCALAI_MITM_CA_DIR" type:"path" help:"Directory holding the MITM proxy CA cert + key. Defaults to <data-path>/mitm-ca." group:"middleware"`
|
||||
|
||||
PIIDefaultDetectors []string `env:"LOCALAI_PII_DEFAULT_DETECTORS" help:"Instance-wide default PII/secret detector model names applied to any PII-enabled model (chiefly cloud-proxy / MITM models) that names no pii.detectors of its own. Comma-separated, e.g. privacy-filter-nemotron,secret-filter. Takes precedence over the value persisted via the Middleware UI." group:"middleware"`
|
||||
}
|
||||
|
||||
func (r *RunCMD) Run(ctx *cliContext.Context) error {
|
||||
@@ -243,6 +245,7 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
|
||||
config.WithAPIAddress(r.Address),
|
||||
config.WithMITMListen(r.MITMListen),
|
||||
config.WithMITMCADir(r.MITMCADir),
|
||||
config.WithPIIDefaultDetectors(r.PIIDefaultDetectors),
|
||||
config.WithAgentJobRetentionDays(r.AgentJobRetentionDays),
|
||||
config.WithLlamaCPPTunnelCallback(func(tunnels []string) {
|
||||
tunnelEnvVar := strings.Join(tunnels, ",")
|
||||
|
||||
@@ -712,6 +712,18 @@ func WithMITMCADir(dir string) AppOption {
|
||||
}
|
||||
}
|
||||
|
||||
// WithPIIDefaultDetectors sets the instance-wide default PII/secret detector
|
||||
// model names applied to any PII-enabled model (chiefly cloud-proxy / MITM
|
||||
// models) that names no pii.detectors of its own. CLI/env:
|
||||
// LOCALAI_PII_DEFAULT_DETECTORS. Empty leaves the value to
|
||||
// runtime_settings.json / the Middleware UI; a non-empty value takes
|
||||
// precedence over the file (env > file).
|
||||
func WithPIIDefaultDetectors(detectors []string) AppOption {
|
||||
return func(o *ApplicationConfig) {
|
||||
o.PIIDefaultDetectors = detectors
|
||||
}
|
||||
}
|
||||
|
||||
func WithDynamicConfigDir(dynamicConfigsDir string) AppOption {
|
||||
return func(o *ApplicationConfig) {
|
||||
o.DynamicConfigsDir = dynamicConfigsDir
|
||||
|
||||
@@ -54,8 +54,35 @@ func (g GPU) IsNVIDIABlackwell() bool {
|
||||
return maj >= 12
|
||||
}
|
||||
|
||||
// Compute-buffer headroom guard for the raised physical batch.
|
||||
//
|
||||
// Raising n_ubatch grows the CUDA *compute buffer* (the scratch for the forward
|
||||
// graph), which is allocated PER DEVICE — it does not benefit from a second GPU
|
||||
// the way weights or KV (which are split across devices) do. The buffer scales
|
||||
// ~linearly with n_ubatch * n_ctx, so a large context turns the GB10-tuned
|
||||
// ub2048 into multi-GiB of extra scratch that must fit on a SINGLE card. On a
|
||||
// 16 GiB consumer Blackwell with a 200k context that overflows (issue #10485),
|
||||
// even though the GB10 it was measured on (128 GiB unified memory) had room.
|
||||
//
|
||||
// These constants size a conservative guard: only raise the batch when the
|
||||
// extra scratch fits the per-device VRAM ceiling.
|
||||
const (
|
||||
// computeBufferBytesPerCell approximates the CUDA compute-buffer cost of one
|
||||
// (n_ubatch * n_ctx) cell. Derived from an observed allocation (ub2048 *
|
||||
// ctx204800 ~= 4.5 GiB => ~11 B/cell) and rounded up to 16 for margin, since
|
||||
// the real cost also grows with model width (heads / embedding dim) which we
|
||||
// don't know at config time.
|
||||
computeBufferBytesPerCell = 16
|
||||
// blackwellBatchHeadroomDivisor caps the extra compute buffer from raising the
|
||||
// physical batch at VRAM/divisor. /4 keeps the bulk of a device for weights +
|
||||
// KV, which already dominate VRAM use.
|
||||
blackwellBatchHeadroomDivisor = 4
|
||||
)
|
||||
|
||||
// PhysicalBatch returns the canonical physical batch (n_batch/n_ubatch) for the
|
||||
// given hardware, used when the model config leaves batch unset.
|
||||
// given hardware class, ignoring context/VRAM headroom. Use
|
||||
// PhysicalBatchForContext when a model context and per-device VRAM are known
|
||||
// (the load paths) so the raised batch can't overflow a single device.
|
||||
func PhysicalBatch(g GPU) int {
|
||||
if g.IsNVIDIABlackwell() {
|
||||
return BlackwellPhysicalBatch
|
||||
@@ -63,6 +90,32 @@ func PhysicalBatch(g GPU) int {
|
||||
return DefaultPhysicalBatch
|
||||
}
|
||||
|
||||
// PhysicalBatchForContext is PhysicalBatch gated on per-device VRAM headroom for
|
||||
// the given context: it only raises the batch above the conservative default
|
||||
// when the extra compute buffer (which is allocated on a single device and grows
|
||||
// with n_ubatch * n_ctx) fits within blackwellBatchHeadroomDivisor of the GPU's
|
||||
// VRAM. g.VRAM must be the PER-DEVICE ceiling (the smallest device on a
|
||||
// multi-GPU host), not the summed total — the compute buffer can't be split.
|
||||
//
|
||||
// VRAM 0 (unknown) stays conservative rather than risk a per-device OOM; the
|
||||
// GB10 / unified-memory path reports system RAM, so it still clears the guard.
|
||||
func PhysicalBatchForContext(g GPU, ctx int) int {
|
||||
if !g.IsNVIDIABlackwell() {
|
||||
return DefaultPhysicalBatch
|
||||
}
|
||||
if ctx <= 0 {
|
||||
ctx = DefaultContextSize
|
||||
}
|
||||
if g.VRAM == 0 {
|
||||
return DefaultPhysicalBatch
|
||||
}
|
||||
extra := uint64(ctx) * uint64(BlackwellPhysicalBatch-DefaultPhysicalBatch) * computeBufferBytesPerCell
|
||||
if extra <= g.VRAM/blackwellBatchHeadroomDivisor {
|
||||
return BlackwellPhysicalBatch
|
||||
}
|
||||
return DefaultPhysicalBatch
|
||||
}
|
||||
|
||||
// IsManagedPhysicalBatch reports whether n is a value PhysicalBatch assigns.
|
||||
// Callers that re-tune a value chosen by an upstream host (the distributed
|
||||
// router correcting the frontend's guess) use this to avoid clobbering an
|
||||
@@ -122,7 +175,12 @@ func hasParallelOption(opts []string) bool {
|
||||
// deterministic device — detection does a live nvidia-smi call.
|
||||
var localGPU = func() GPU {
|
||||
vendor, _ := xsysinfo.DetectGPUVendor()
|
||||
vram, _ := xsysinfo.TotalAvailableVRAM()
|
||||
// Use the SMALLEST device's VRAM, not the summed total: the parallel-slot
|
||||
// tier and the batch headroom guard both reason about what fits on a single
|
||||
// card, and per-device compute buffers can't be split across GPUs. Summing
|
||||
// two 16 GiB cards into "32 GiB" is what over-provisioned multi-GPU hosts
|
||||
// into OOM (issue #10485).
|
||||
vram, _ := xsysinfo.MinPerGPUVRAM()
|
||||
return GPU{
|
||||
Vendor: vendor,
|
||||
ComputeCapability: xsysinfo.NVIDIAComputeCapability(),
|
||||
@@ -137,10 +195,20 @@ func ApplyHardwareDefaults(cfg *ModelConfig, gpu GPU) {
|
||||
if cfg == nil {
|
||||
return
|
||||
}
|
||||
if cfg.Batch == 0 && gpu.IsNVIDIABlackwell() {
|
||||
cfg.Batch = BlackwellPhysicalBatch
|
||||
xlog.Debug("[hardware_defaults] Blackwell GPU: defaulting physical batch",
|
||||
"batch", cfg.Batch, "compute_cap", gpu.ComputeCapability)
|
||||
// Raise the physical batch on Blackwell only when the resulting compute
|
||||
// buffer fits the per-device VRAM at THIS model's context. Leaving Batch at 0
|
||||
// (rather than writing the default 512) preserves the downstream single-pass
|
||||
// sizing in core/backend.EffectiveBatchSize for embedding/score/rerank.
|
||||
if cfg.Batch == 0 {
|
||||
ctx := DefaultContextSize
|
||||
if cfg.ContextSize != nil {
|
||||
ctx = *cfg.ContextSize
|
||||
}
|
||||
if PhysicalBatchForContext(gpu, ctx) == BlackwellPhysicalBatch {
|
||||
cfg.Batch = BlackwellPhysicalBatch
|
||||
xlog.Debug("[hardware_defaults] Blackwell GPU: defaulting physical batch",
|
||||
"batch", cfg.Batch, "compute_cap", gpu.ComputeCapability, "context", ctx, "vram_gib", gpu.VRAM>>30)
|
||||
}
|
||||
}
|
||||
|
||||
// Enable concurrent serving by default on a capable GPU: without this the
|
||||
|
||||
@@ -9,26 +9,37 @@ import (
|
||||
// GPU. The detection seam (localGPU) is injected so the path is deterministic
|
||||
// without a real GPU.
|
||||
var _ = Describe("SetDefaults hardware defaults (single-instance)", func() {
|
||||
const gib = uint64(1) << 30
|
||||
|
||||
var orig func() GPU
|
||||
BeforeEach(func() { orig = localGPU })
|
||||
AfterEach(func() { localGPU = orig })
|
||||
|
||||
It("sets the physical batch on a local Blackwell GPU", func() {
|
||||
localGPU = func() GPU { return GPU{ComputeCapability: "12.1"} }
|
||||
It("sets the physical batch on a local Blackwell GPU with headroom", func() {
|
||||
localGPU = func() GPU { return GPU{ComputeCapability: "12.1", VRAM: 119 * gib} }
|
||||
cfg := &ModelConfig{}
|
||||
cfg.SetDefaults()
|
||||
Expect(cfg.Batch).To(Equal(BlackwellPhysicalBatch))
|
||||
})
|
||||
|
||||
It("leaves batch unset when a large context would overflow the device", func() {
|
||||
// Regression guard for issue #10485: 16 GiB consumer Blackwell + ~200k ctx.
|
||||
localGPU = func() GPU { return GPU{ComputeCapability: "12.0", VRAM: 16 * gib} }
|
||||
ctx := 204800
|
||||
cfg := &ModelConfig{LLMConfig: LLMConfig{ContextSize: &ctx}}
|
||||
cfg.SetDefaults()
|
||||
Expect(cfg.Batch).To(Equal(0))
|
||||
})
|
||||
|
||||
It("leaves batch unset on a non-Blackwell local GPU", func() {
|
||||
localGPU = func() GPU { return GPU{ComputeCapability: "8.9"} }
|
||||
localGPU = func() GPU { return GPU{ComputeCapability: "8.9", VRAM: 119 * gib} }
|
||||
cfg := &ModelConfig{}
|
||||
cfg.SetDefaults()
|
||||
Expect(cfg.Batch).To(Equal(0))
|
||||
})
|
||||
|
||||
It("never overrides an explicit batch", func() {
|
||||
localGPU = func() GPU { return GPU{ComputeCapability: "12.1"} }
|
||||
localGPU = func() GPU { return GPU{ComputeCapability: "12.1", VRAM: 119 * gib} }
|
||||
cfg := &ModelConfig{}
|
||||
cfg.Batch = 1024
|
||||
cfg.SetDefaults()
|
||||
|
||||
@@ -7,6 +7,8 @@ import (
|
||||
)
|
||||
|
||||
var _ = Describe("Hardware-driven config defaults", func() {
|
||||
const gib = uint64(1) << 30
|
||||
|
||||
DescribeTable("GPU.IsNVIDIABlackwell (sm_12x consumer family)",
|
||||
func(cc string, want bool) {
|
||||
Expect(GPU{ComputeCapability: cc}.IsNVIDIABlackwell()).To(Equal(want))
|
||||
@@ -35,21 +37,54 @@ var _ = Describe("Hardware-driven config defaults", func() {
|
||||
})
|
||||
})
|
||||
|
||||
Describe("PhysicalBatchForContext (per-device VRAM headroom)", func() {
|
||||
It("raises the batch when the compute buffer fits the device", func() {
|
||||
// 16 GiB Blackwell with a small context: the extra scratch is tiny.
|
||||
Expect(PhysicalBatchForContext(GPU{ComputeCapability: "12.0", VRAM: 16 * gib}, 8192)).
|
||||
To(Equal(BlackwellPhysicalBatch))
|
||||
})
|
||||
It("keeps the default batch when a large context would overflow one device", func() {
|
||||
// The issue #10485 case: 16 GiB consumer Blackwell, ~200k context.
|
||||
Expect(PhysicalBatchForContext(GPU{ComputeCapability: "12.0", VRAM: 16 * gib}, 204800)).
|
||||
To(Equal(DefaultPhysicalBatch))
|
||||
})
|
||||
It("still raises the batch on a large unified-memory device (GB10)", func() {
|
||||
// GB10 reports system RAM (~119 GiB) as its single device's VRAM.
|
||||
Expect(PhysicalBatchForContext(GPU{ComputeCapability: "12.1", VRAM: 119 * gib}, 204800)).
|
||||
To(Equal(BlackwellPhysicalBatch))
|
||||
})
|
||||
It("stays conservative when VRAM is unknown", func() {
|
||||
Expect(PhysicalBatchForContext(GPU{ComputeCapability: "12.1"}, 8192)).
|
||||
To(Equal(DefaultPhysicalBatch))
|
||||
})
|
||||
It("never raises the batch on non-Blackwell", func() {
|
||||
Expect(PhysicalBatchForContext(GPU{ComputeCapability: "9.0", VRAM: 80 * gib}, 8192)).
|
||||
To(Equal(DefaultPhysicalBatch))
|
||||
})
|
||||
})
|
||||
|
||||
Describe("ApplyHardwareDefaults", func() {
|
||||
It("raises an unset batch to 2048 on Blackwell", func() {
|
||||
It("raises an unset batch to 2048 on Blackwell with headroom", func() {
|
||||
cfg := &ModelConfig{}
|
||||
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1"})
|
||||
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1", VRAM: 119 * gib})
|
||||
Expect(cfg.Batch).To(Equal(BlackwellPhysicalBatch))
|
||||
})
|
||||
It("leaves batch unset when a large context would overflow one device", func() {
|
||||
// Regression guard for issue #10485: 16 GiB card + ~200k context.
|
||||
ctx := 204800
|
||||
cfg := &ModelConfig{LLMConfig: LLMConfig{ContextSize: &ctx}}
|
||||
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.0", VRAM: 16 * gib})
|
||||
Expect(cfg.Batch).To(Equal(0))
|
||||
})
|
||||
It("leaves batch unset on non-Blackwell", func() {
|
||||
cfg := &ModelConfig{}
|
||||
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "9.0"})
|
||||
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "9.0", VRAM: 119 * gib})
|
||||
Expect(cfg.Batch).To(Equal(0))
|
||||
})
|
||||
It("never overrides an explicit batch", func() {
|
||||
cfg := &ModelConfig{}
|
||||
cfg.Batch = 1024
|
||||
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1"})
|
||||
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1", VRAM: 119 * gib})
|
||||
Expect(cfg.Batch).To(Equal(1024))
|
||||
})
|
||||
It("no-ops on nil", func() {
|
||||
@@ -57,8 +92,6 @@ var _ = Describe("Hardware-driven config defaults", func() {
|
||||
})
|
||||
})
|
||||
|
||||
const gib = uint64(1) << 30
|
||||
|
||||
DescribeTable("DefaultParallelSlots (by VRAM)",
|
||||
func(vramGiB uint64, want int) {
|
||||
Expect(DefaultParallelSlots(GPU{VRAM: vramGiB * gib})).To(Equal(want))
|
||||
|
||||
@@ -1204,11 +1204,6 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
|
||||
// This ensures gallery-installed and runtime-loaded models get optimal parameters.
|
||||
ApplyInferenceDefaults(cfg, cfg.Name, cfg.Model)
|
||||
|
||||
// Apply hardware-driven defaults (e.g. a larger physical batch on Blackwell).
|
||||
// Uses the local GPU here; in distributed mode the router re-applies the same
|
||||
// heuristics for the selected node's GPU before loading. Explicit config wins.
|
||||
ApplyHardwareDefaults(cfg, localGPU())
|
||||
|
||||
// Apply serving-policy defaults (device-independent): cross-request prefix
|
||||
// caching. Propagates to distributed nodes via the model options.
|
||||
ApplyServingDefaults(cfg)
|
||||
@@ -1247,6 +1242,16 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
|
||||
cfg.ContextSize = &ctx
|
||||
}
|
||||
runBackendHooks(cfg, lo.modelPath)
|
||||
|
||||
// Apply hardware-driven defaults (e.g. a larger physical batch on Blackwell)
|
||||
// LAST, after the context size is fully resolved (explicit config, LoadOptions,
|
||||
// then the GGUF guess inside runBackendHooks): the Blackwell batch guard sizes
|
||||
// the per-device compute buffer against this model's context, so it must see
|
||||
// the final value, not a pre-guess nil. Uses the local GPU here; in distributed
|
||||
// mode the router re-applies the same heuristics for the selected node's GPU
|
||||
// before loading. Explicit config always wins.
|
||||
ApplyHardwareDefaults(cfg, localGPU())
|
||||
|
||||
cfg.syncKnownUsecasesFromString()
|
||||
}
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@ import (
|
||||
"errors"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"reflect"
|
||||
)
|
||||
|
||||
// runtimeSettingsFile is the on-disk filename inside DynamicConfigsDir.
|
||||
@@ -33,6 +34,35 @@ func (o *ApplicationConfig) ReadPersistedSettings() (RuntimeSettings, error) {
|
||||
return settings, nil
|
||||
}
|
||||
|
||||
// MergeNonNil overlays every set (non-nil) field of overlay onto the
|
||||
// receiver, leaving the receiver's value untouched wherever overlay left a
|
||||
// field unset. Every RuntimeSettings field is a pointer precisely so "set"
|
||||
// can be told apart from "absent" (see the type doc), which makes this a
|
||||
// faithful partial update: a caller that submits only the field it owns
|
||||
// changes exactly that field and never clobbers unrelated settings.
|
||||
//
|
||||
// This is the read-modify-write contract the persistence helpers exist for.
|
||||
// UpdateSettingsEndpoint reads the on-disk settings, merges the request body
|
||||
// on top, and writes the result — so a focused admin page that POSTs only its
|
||||
// own field (the Middleware page sends only mitm_listen; the detector table
|
||||
// only pii_default_detectors) no longer nulls every other setting.
|
||||
//
|
||||
// Reflection keeps the merge total over the struct: a field added to
|
||||
// RuntimeSettings later is merged automatically, so the persistence path can
|
||||
// never silently drop a new setting the way a hand-maintained field list
|
||||
// would. Non-pointer fields (none today) are skipped — they cannot express
|
||||
// "absent", so the receiver wins.
|
||||
func (s *RuntimeSettings) MergeNonNil(overlay RuntimeSettings) {
|
||||
dst := reflect.ValueOf(s).Elem()
|
||||
src := reflect.ValueOf(overlay)
|
||||
for i := 0; i < src.NumField(); i++ {
|
||||
f := src.Field(i)
|
||||
if f.Kind() == reflect.Pointer && !f.IsNil() {
|
||||
dst.Field(i).Set(f)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// WritePersistedSettings serialises the given RuntimeSettings to
|
||||
// runtime_settings.json with restricted permissions (it may carry API
|
||||
// keys and P2P tokens).
|
||||
|
||||
@@ -12,6 +12,7 @@ import (
|
||||
)
|
||||
|
||||
// strPtr returns a pointer to a copy of s, for populating optional *string
// fields in test fixtures.
func strPtr(s string) *string {
	value := s
	return &value
}
|
||||
// boolPtr returns a pointer to a copy of b, for populating optional *bool
// fields in test fixtures.
func boolPtr(b bool) *bool {
	value := b
	return &value
}
|
||||
|
||||
var _ = Describe("RuntimeSettings persistence helpers", func() {
|
||||
var (
|
||||
@@ -51,6 +52,47 @@ var _ = Describe("RuntimeSettings persistence helpers", func() {
|
||||
})
|
||||
})
|
||||
|
||||
// MergeNonNil is the partial-update primitive UpdateSettingsEndpoint
|
||||
// relies on: a focused admin page POSTs only the field it owns, and the
|
||||
// handler reads the on-disk settings and overlays the request on top.
|
||||
// Without it, the body would be written verbatim and every field the
|
||||
// caller omitted would be nulled (the reported regression: changing
|
||||
// mitm_listen wiped the galleries, api keys, watchdog config, etc.).
|
||||
Describe("MergeNonNil partial update", func() {
|
||||
It("overlays set fields and preserves unset ones", func() {
|
||||
base := config.RuntimeSettings{
|
||||
MITMListen: strPtr(":9000"),
|
||||
Galleries: &[]config.Gallery{{Name: "g1", URL: "http://example/g1"}},
|
||||
WatchdogIdleEnabled: boolPtr(true),
|
||||
ApiKeys: &[]string{"persisted-key"},
|
||||
PIIDefaultDetectors: &[]string{"det-a"},
|
||||
}
|
||||
|
||||
// Simulate the Middleware proxy tab: only mitm_listen is sent.
|
||||
overlay := config.RuntimeSettings{MITMListen: strPtr(":8443")}
|
||||
base.MergeNonNil(overlay)
|
||||
|
||||
Expect(base.MITMListen).ToNot(BeNil())
|
||||
Expect(*base.MITMListen).To(Equal(":8443"), "set field should be overlaid")
|
||||
// Everything the overlay left unset must survive untouched.
|
||||
Expect(base.Galleries).ToNot(BeNil(), "galleries were clobbered")
|
||||
Expect(*base.Galleries).To(HaveLen(1))
|
||||
Expect(base.WatchdogIdleEnabled).ToNot(BeNil())
|
||||
Expect(*base.WatchdogIdleEnabled).To(BeTrue())
|
||||
Expect(base.ApiKeys).ToNot(BeNil(), "api_keys were clobbered")
|
||||
Expect(*base.ApiKeys).To(Equal([]string{"persisted-key"}))
|
||||
Expect(base.PIIDefaultDetectors).ToNot(BeNil(), "pii_default_detectors were clobbered")
|
||||
Expect(*base.PIIDefaultDetectors).To(Equal([]string{"det-a"}))
|
||||
})
|
||||
|
||||
It("lets an explicit empty slice clear a field", func() {
|
||||
base := config.RuntimeSettings{PIIDefaultDetectors: &[]string{"det-a"}}
|
||||
base.MergeNonNil(config.RuntimeSettings{PIIDefaultDetectors: &[]string{}})
|
||||
Expect(base.PIIDefaultDetectors).ToNot(BeNil())
|
||||
Expect(*base.PIIDefaultDetectors).To(BeEmpty(), "an explicit empty slice should clear, not preserve")
|
||||
})
|
||||
})
|
||||
|
||||
// MITM round trip pins the contract that loadRuntimeSettingsFromFile
|
||||
// MITM listener address must survive a write/read round trip so the
|
||||
// next process restart can bring the listener back up. (Intercept
|
||||
|
||||
@@ -70,7 +70,7 @@ func UploadToCollectionEndpoint(app *application.Application) echo.HandlerFunc {
|
||||
return func(c echo.Context) error {
|
||||
svc := app.AgentPoolService()
|
||||
userID := effectiveUserID(c)
|
||||
name := c.Param("name")
|
||||
name := decodedParam(c, "name")
|
||||
file, err := c.FormFile("file")
|
||||
if err != nil {
|
||||
return c.JSON(http.StatusBadRequest, map[string]string{"error": "file required"})
|
||||
@@ -116,7 +116,7 @@ func ListCollectionEntriesEndpoint(app *application.Application) echo.HandlerFun
|
||||
return func(c echo.Context) error {
|
||||
svc := app.AgentPoolService()
|
||||
userID := effectiveUserID(c)
|
||||
entries, err := svc.ListCollectionEntriesForUser(userID, c.Param("name"))
|
||||
entries, err := svc.ListCollectionEntriesForUser(userID, decodedParam(c, "name"))
|
||||
if err != nil {
|
||||
if strings.Contains(err.Error(), "not found") {
|
||||
return c.JSON(http.StatusNotFound, map[string]string{"error": err.Error()})
|
||||
@@ -139,7 +139,7 @@ func GetCollectionEntryContentEndpoint(app *application.Application) echo.Handle
|
||||
if err != nil {
|
||||
entry = entryParam
|
||||
}
|
||||
content, chunkCount, err := svc.GetCollectionEntryContentForUser(userID, c.Param("name"), entry)
|
||||
content, chunkCount, err := svc.GetCollectionEntryContentForUser(userID, decodedParam(c, "name"), entry)
|
||||
if err != nil {
|
||||
if strings.Contains(err.Error(), "not found") {
|
||||
return c.JSON(http.StatusNotFound, map[string]string{"error": err.Error()})
|
||||
@@ -164,7 +164,7 @@ func SearchCollectionEndpoint(app *application.Application) echo.HandlerFunc {
|
||||
if err := c.Bind(&payload); err != nil {
|
||||
return c.JSON(http.StatusBadRequest, map[string]string{"error": err.Error()})
|
||||
}
|
||||
results, err := svc.SearchCollectionForUser(userID, c.Param("name"), payload.Query, payload.MaxResults)
|
||||
results, err := svc.SearchCollectionForUser(userID, decodedParam(c, "name"), payload.Query, payload.MaxResults)
|
||||
if err != nil {
|
||||
if strings.Contains(err.Error(), "not found") {
|
||||
return c.JSON(http.StatusNotFound, map[string]string{"error": err.Error()})
|
||||
@@ -182,7 +182,7 @@ func ResetCollectionEndpoint(app *application.Application) echo.HandlerFunc {
|
||||
return func(c echo.Context) error {
|
||||
svc := app.AgentPoolService()
|
||||
userID := effectiveUserID(c)
|
||||
if err := svc.ResetCollectionForUser(userID, c.Param("name")); err != nil {
|
||||
if err := svc.ResetCollectionForUser(userID, decodedParam(c, "name")); err != nil {
|
||||
if strings.Contains(err.Error(), "not found") {
|
||||
return c.JSON(http.StatusNotFound, map[string]string{"error": err.Error()})
|
||||
}
|
||||
@@ -202,7 +202,7 @@ func DeleteCollectionEntryEndpoint(app *application.Application) echo.HandlerFun
|
||||
if err := c.Bind(&payload); err != nil {
|
||||
return c.JSON(http.StatusBadRequest, map[string]string{"error": err.Error()})
|
||||
}
|
||||
remaining, err := svc.DeleteCollectionEntryForUser(userID, c.Param("name"), payload.Entry)
|
||||
remaining, err := svc.DeleteCollectionEntryForUser(userID, decodedParam(c, "name"), payload.Entry)
|
||||
if err != nil {
|
||||
if strings.Contains(err.Error(), "not found") {
|
||||
return c.JSON(http.StatusNotFound, map[string]string{"error": err.Error()})
|
||||
@@ -230,7 +230,7 @@ func AddCollectionSourceEndpoint(app *application.Application) echo.HandlerFunc
|
||||
if payload.UpdateInterval < 1 {
|
||||
payload.UpdateInterval = 60
|
||||
}
|
||||
if err := svc.AddCollectionSourceForUser(userID, c.Param("name"), payload.URL, payload.UpdateInterval); err != nil {
|
||||
if err := svc.AddCollectionSourceForUser(userID, decodedParam(c, "name"), payload.URL, payload.UpdateInterval); err != nil {
|
||||
if strings.Contains(err.Error(), "not found") {
|
||||
return c.JSON(http.StatusNotFound, map[string]string{"error": err.Error()})
|
||||
}
|
||||
@@ -250,7 +250,7 @@ func RemoveCollectionSourceEndpoint(app *application.Application) echo.HandlerFu
|
||||
if err := c.Bind(&payload); err != nil {
|
||||
return c.JSON(http.StatusBadRequest, map[string]string{"error": err.Error()})
|
||||
}
|
||||
if err := svc.RemoveCollectionSourceForUser(userID, c.Param("name"), payload.URL); err != nil {
|
||||
if err := svc.RemoveCollectionSourceForUser(userID, decodedParam(c, "name"), payload.URL); err != nil {
|
||||
return c.JSON(http.StatusInternalServerError, map[string]string{"error": err.Error()})
|
||||
}
|
||||
return c.JSON(http.StatusOK, map[string]string{"status": "ok"})
|
||||
@@ -267,7 +267,7 @@ func GetCollectionEntryRawFileEndpoint(app *application.Application) echo.Handle
|
||||
if err != nil {
|
||||
entry = entryParam
|
||||
}
|
||||
fpath, err := svc.GetCollectionEntryFilePathForUser(userID, c.Param("name"), entry)
|
||||
fpath, err := svc.GetCollectionEntryFilePathForUser(userID, decodedParam(c, "name"), entry)
|
||||
if err != nil {
|
||||
if strings.Contains(err.Error(), "not found") {
|
||||
return c.JSON(http.StatusNotFound, map[string]string{"error": err.Error()})
|
||||
@@ -282,7 +282,7 @@ func ListCollectionSourcesEndpoint(app *application.Application) echo.HandlerFun
|
||||
return func(c echo.Context) error {
|
||||
svc := app.AgentPoolService()
|
||||
userID := effectiveUserID(c)
|
||||
sources, err := svc.ListCollectionSourcesForUser(userID, c.Param("name"))
|
||||
sources, err := svc.ListCollectionSourcesForUser(userID, decodedParam(c, "name"))
|
||||
if err != nil {
|
||||
if strings.Contains(err.Error(), "not found") {
|
||||
return c.JSON(http.StatusNotFound, map[string]string{"error": err.Error()})
|
||||
|
||||
49
core/http/endpoints/localai/agent_collections_param_test.go
Normal file
49
core/http/endpoints/localai/agent_collections_param_test.go
Normal file
@@ -0,0 +1,49 @@
|
||||
package localai
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
|
||||
"github.com/labstack/echo/v4"
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
// Regression for #10443: agent/collection names carry a "legacy-api-key:"
|
||||
// prefix, so the ':' is percent-encoded as %3A in the request path. Echo routes
|
||||
// such paths via URL.RawPath and stores the path-param value still escaped, so
|
||||
// handlers must URL-decode it before looking the collection up in the store -
|
||||
// otherwise the lookup sees "legacy-api-key%3ALiteraryResearch" and 404s.
|
||||
var _ = Describe("decodedParam", func() {
|
||||
var e *echo.Echo
|
||||
|
||||
BeforeEach(func() {
|
||||
e = echo.New()
|
||||
})
|
||||
|
||||
// route runs a request through Echo's real router so the path param is
|
||||
// populated exactly as it would be in production, then returns the decoded
|
||||
// value the handler would observe.
|
||||
route := func(rawPath string) string {
|
||||
var got string
|
||||
e.GET("/api/agents/collections/:name/upload", func(c echo.Context) error {
|
||||
got = decodedParam(c, "name")
|
||||
return c.NoContent(http.StatusOK)
|
||||
})
|
||||
req := httptest.NewRequest(http.MethodGet, rawPath, nil)
|
||||
rec := httptest.NewRecorder()
|
||||
e.ServeHTTP(rec, req)
|
||||
Expect(rec.Code).To(Equal(http.StatusOK))
|
||||
return got
|
||||
}
|
||||
|
||||
It("decodes a percent-encoded colon in the collection name", func() {
|
||||
got := route("/api/agents/collections/legacy-api-key%3ALiteraryResearch/upload")
|
||||
Expect(got).To(Equal("legacy-api-key:LiteraryResearch"))
|
||||
})
|
||||
|
||||
It("leaves an unencoded name untouched", func() {
|
||||
got := route("/api/agents/collections/PlainCollection/upload")
|
||||
Expect(got).To(Equal("PlainCollection"))
|
||||
})
|
||||
})
|
||||
@@ -6,6 +6,7 @@ import (
|
||||
"io"
|
||||
"maps"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"slices"
|
||||
@@ -33,6 +34,22 @@ func getUserID(c echo.Context) string {
|
||||
return user.ID
|
||||
}
|
||||
|
||||
// decodedParam returns the named path parameter, URL-decoding it.
|
||||
//
|
||||
// Echo routes a request via URL.RawPath whenever the path contains
|
||||
// percent-encoded characters (e.g. %3A for ':'), and in that case stores the
|
||||
// matched path-param value raw/escaped. Agent and collection names carry a
|
||||
// "legacy-api-key:" prefix, so the ':' arrives as %3A and the raw param no
|
||||
// longer matches the stored name. Callers must unescape before lookups.
|
||||
// Falls back to the raw value if it isn't valid percent-encoding.
|
||||
func decodedParam(c echo.Context, name string) string {
|
||||
raw := c.Param(name)
|
||||
if decoded, err := url.PathUnescape(raw); err == nil {
|
||||
return decoded
|
||||
}
|
||||
return raw
|
||||
}
|
||||
|
||||
// isAdminUser returns true if the authenticated user has admin role.
|
||||
func isAdminUser(c echo.Context) bool {
|
||||
user := auth.GetUser(c)
|
||||
@@ -127,7 +144,7 @@ func GetAgentEndpoint(app *application.Application) echo.HandlerFunc {
|
||||
return func(c echo.Context) error {
|
||||
svc := app.AgentPoolService()
|
||||
userID := effectiveUserID(c)
|
||||
name := c.Param("name")
|
||||
name := decodedParam(c, "name")
|
||||
|
||||
statuses := svc.ListAgentsForUser(userID)
|
||||
active, exists := statuses[name]
|
||||
@@ -142,7 +159,7 @@ func UpdateAgentEndpoint(app *application.Application) echo.HandlerFunc {
|
||||
return func(c echo.Context) error {
|
||||
svc := app.AgentPoolService()
|
||||
userID := effectiveUserID(c)
|
||||
name := c.Param("name")
|
||||
name := decodedParam(c, "name")
|
||||
var cfg state.AgentConfig
|
||||
if err := c.Bind(&cfg); err != nil {
|
||||
return c.JSON(http.StatusBadRequest, map[string]string{"error": err.Error()})
|
||||
@@ -161,7 +178,7 @@ func DeleteAgentEndpoint(app *application.Application) echo.HandlerFunc {
|
||||
return func(c echo.Context) error {
|
||||
svc := app.AgentPoolService()
|
||||
userID := effectiveUserID(c)
|
||||
name := c.Param("name")
|
||||
name := decodedParam(c, "name")
|
||||
if err := svc.DeleteAgentForUser(userID, name); err != nil {
|
||||
return c.JSON(http.StatusInternalServerError, map[string]string{"error": err.Error()})
|
||||
}
|
||||
@@ -173,7 +190,7 @@ func GetAgentConfigEndpoint(app *application.Application) echo.HandlerFunc {
|
||||
return func(c echo.Context) error {
|
||||
svc := app.AgentPoolService()
|
||||
userID := effectiveUserID(c)
|
||||
name := c.Param("name")
|
||||
name := decodedParam(c, "name")
|
||||
cfg := svc.GetAgentConfigForUser(userID, name)
|
||||
if cfg == nil {
|
||||
return c.JSON(http.StatusNotFound, map[string]string{"error": "Agent not found"})
|
||||
@@ -186,7 +203,7 @@ func PauseAgentEndpoint(app *application.Application) echo.HandlerFunc {
|
||||
return func(c echo.Context) error {
|
||||
svc := app.AgentPoolService()
|
||||
userID := effectiveUserID(c)
|
||||
if err := svc.PauseAgentForUser(userID, c.Param("name")); err != nil {
|
||||
if err := svc.PauseAgentForUser(userID, decodedParam(c, "name")); err != nil {
|
||||
return c.JSON(http.StatusNotFound, map[string]string{"error": err.Error()})
|
||||
}
|
||||
return c.JSON(http.StatusOK, map[string]string{"status": "ok"})
|
||||
@@ -197,7 +214,7 @@ func ResumeAgentEndpoint(app *application.Application) echo.HandlerFunc {
|
||||
return func(c echo.Context) error {
|
||||
svc := app.AgentPoolService()
|
||||
userID := effectiveUserID(c)
|
||||
if err := svc.ResumeAgentForUser(userID, c.Param("name")); err != nil {
|
||||
if err := svc.ResumeAgentForUser(userID, decodedParam(c, "name")); err != nil {
|
||||
return c.JSON(http.StatusNotFound, map[string]string{"error": err.Error()})
|
||||
}
|
||||
return c.JSON(http.StatusOK, map[string]string{"status": "ok"})
|
||||
@@ -208,7 +225,7 @@ func GetAgentStatusEndpoint(app *application.Application) echo.HandlerFunc {
|
||||
return func(c echo.Context) error {
|
||||
svc := app.AgentPoolService()
|
||||
userID := effectiveUserID(c)
|
||||
name := c.Param("name")
|
||||
name := decodedParam(c, "name")
|
||||
|
||||
history := svc.GetAgentStatusForUser(userID, name)
|
||||
if history == nil {
|
||||
@@ -241,7 +258,7 @@ func GetAgentObservablesEndpoint(app *application.Application) echo.HandlerFunc
|
||||
return func(c echo.Context) error {
|
||||
svc := app.AgentPoolService()
|
||||
userID := effectiveUserID(c)
|
||||
name := c.Param("name")
|
||||
name := decodedParam(c, "name")
|
||||
|
||||
history, err := svc.GetAgentObservablesForUser(userID, name)
|
||||
if err != nil {
|
||||
@@ -261,7 +278,7 @@ func ClearAgentObservablesEndpoint(app *application.Application) echo.HandlerFun
|
||||
return func(c echo.Context) error {
|
||||
svc := app.AgentPoolService()
|
||||
userID := effectiveUserID(c)
|
||||
name := c.Param("name")
|
||||
name := decodedParam(c, "name")
|
||||
if err := svc.ClearAgentObservablesForUser(userID, name); err != nil {
|
||||
return c.JSON(http.StatusNotFound, map[string]string{"error": err.Error()})
|
||||
}
|
||||
@@ -273,7 +290,7 @@ func ChatWithAgentEndpoint(app *application.Application) echo.HandlerFunc {
|
||||
return func(c echo.Context) error {
|
||||
svc := app.AgentPoolService()
|
||||
userID := effectiveUserID(c)
|
||||
name := c.Param("name")
|
||||
name := decodedParam(c, "name")
|
||||
var payload struct {
|
||||
Message string `json:"message"`
|
||||
}
|
||||
@@ -302,7 +319,7 @@ func AgentSSEEndpoint(app *application.Application) echo.HandlerFunc {
|
||||
return func(c echo.Context) error {
|
||||
svc := app.AgentPoolService()
|
||||
userID := effectiveUserID(c)
|
||||
name := c.Param("name")
|
||||
name := decodedParam(c, "name")
|
||||
|
||||
// Try local SSE manager first
|
||||
manager := svc.GetSSEManagerForUser(userID, name)
|
||||
@@ -334,7 +351,7 @@ func ExportAgentEndpoint(app *application.Application) echo.HandlerFunc {
|
||||
return func(c echo.Context) error {
|
||||
svc := app.AgentPoolService()
|
||||
userID := effectiveUserID(c)
|
||||
name := c.Param("name")
|
||||
name := decodedParam(c, "name")
|
||||
data, err := svc.ExportAgentForUser(userID, name)
|
||||
if err != nil {
|
||||
return c.JSON(http.StatusNotFound, map[string]string{"error": err.Error()})
|
||||
|
||||
@@ -4,8 +4,6 @@ import (
|
||||
"encoding/json"
|
||||
"io"
|
||||
"net/http"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"time"
|
||||
|
||||
"github.com/labstack/echo/v4"
|
||||
@@ -110,6 +108,18 @@ func UpdateSettingsEndpoint(app *application.Application) echo.HandlerFunc {
|
||||
})
|
||||
}
|
||||
|
||||
// Read whatever is already persisted: it is both the source of truth
|
||||
// for branding asset filenames (below) and the base we merge this
|
||||
// request onto before writing. A read failure must not let a Save
|
||||
// silently discard the existing settings — surface it instead.
|
||||
persisted, err := appConfig.ReadPersistedSettings()
|
||||
if err != nil {
|
||||
return c.JSON(http.StatusInternalServerError, schema.SettingsResponse{
|
||||
Success: false,
|
||||
Error: "Failed to read existing settings: " + err.Error(),
|
||||
})
|
||||
}
|
||||
|
||||
// Branding asset filenames are owned exclusively by
|
||||
// /api/branding/asset/{kind} (upload/delete). The Settings page also
|
||||
// round-trips them via GET /api/settings, but its local state is stale
|
||||
@@ -118,11 +128,9 @@ func UpdateSettingsEndpoint(app *application.Application) echo.HandlerFunc {
|
||||
// at page open. Replace whatever the body sent for these three fields
|
||||
// with the values currently on disk so /api/settings can never
|
||||
// regress them.
|
||||
if existing, err := appConfig.ReadPersistedSettings(); err == nil {
|
||||
settings.LogoFile = existing.LogoFile
|
||||
settings.LogoHorizontalFile = existing.LogoHorizontalFile
|
||||
settings.FaviconFile = existing.FaviconFile
|
||||
}
|
||||
settings.LogoFile = persisted.LogoFile
|
||||
settings.LogoHorizontalFile = persisted.LogoHorizontalFile
|
||||
settings.FaviconFile = persisted.FaviconFile
|
||||
|
||||
// The UI reads ApiKeys from GET /api/settings, which already returns the
|
||||
// merged env+runtime list. When the user clicks Save, the same merged
|
||||
@@ -145,16 +153,17 @@ func UpdateSettingsEndpoint(app *application.Application) echo.HandlerFunc {
|
||||
settings.ApiKeys = &runtimeOnly
|
||||
}
|
||||
|
||||
settingsFile := filepath.Join(appConfig.DynamicConfigsDir, "runtime_settings.json")
|
||||
settingsJSON, err := json.MarshalIndent(settings, "", " ")
|
||||
if err != nil {
|
||||
return c.JSON(http.StatusInternalServerError, schema.SettingsResponse{
|
||||
Success: false,
|
||||
Error: "Failed to marshal settings: " + err.Error(),
|
||||
})
|
||||
}
|
||||
|
||||
if err := os.WriteFile(settingsFile, settingsJSON, 0600); err != nil {
|
||||
// Persist as a partial update: overlay only the fields this request set
|
||||
// onto the settings already on disk. Focused admin pages POST just the
|
||||
// keys they own (the Middleware proxy tab sends only mitm_listen; the
|
||||
// detector table only pii_default_detectors), so writing the request
|
||||
// body verbatim would null every unrelated setting (the no-omitempty
|
||||
// api_keys / pii_default_detectors fields even round-trip as JSON
|
||||
// null). The full Settings page still round-trips every field, so its
|
||||
// Save is unchanged.
|
||||
toPersist := persisted
|
||||
toPersist.MergeNonNil(settings)
|
||||
if err := appConfig.WritePersistedSettings(toPersist); err != nil {
|
||||
return c.JSON(http.StatusInternalServerError, schema.SettingsResponse{
|
||||
Success: false,
|
||||
Error: "Failed to write settings file: " + err.Error(),
|
||||
@@ -262,7 +271,14 @@ func UpdateSettingsEndpoint(app *application.Application) echo.HandlerFunc {
|
||||
}
|
||||
}
|
||||
|
||||
if settings.MITMListen != nil {
|
||||
// Rebuild the MITM listener when its address OR the instance-wide
|
||||
// default detectors change. The per-host detector map is resolved once
|
||||
// at listener start (startMITMLocked → ResolvePIIPolicy), so a
|
||||
// default-detector change is otherwise invisible to cloud-proxy traffic
|
||||
// until the next restart — an admin toggling a default detector would
|
||||
// see no redaction. RestartMITM is a no-op when the listener is
|
||||
// disabled (empty address).
|
||||
if settings.MITMListen != nil || settings.PIIDefaultDetectors != nil {
|
||||
if err := app.RestartMITM(); err != nil {
|
||||
xlog.Error("Failed to restart MITM proxy", "error", err)
|
||||
return c.JSON(http.StatusInternalServerError, schema.SettingsResponse{
|
||||
|
||||
@@ -52,6 +52,10 @@ var _ = Describe("Settings endpoints", func() {
|
||||
// Settings are persisted here; set after construction since there's no
|
||||
// dedicated AppOption for it.
|
||||
app.ApplicationConfig().DynamicConfigsDir = tmp
|
||||
// Contain the MITM CA inside tmp too. The partial-save spec flips
|
||||
// mitm_listen, which starts the listener and writes a CA; without this
|
||||
// it defaults to ./mitm-ca and litters the package source tree.
|
||||
app.ApplicationConfig().MITMCADir = filepath.Join(tmp, "mitm-ca")
|
||||
|
||||
e = echo.New()
|
||||
e.GET("/api/settings", GetSettingsEndpoint(app))
|
||||
@@ -109,6 +113,57 @@ var _ = Describe("Settings endpoints", func() {
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
})
|
||||
|
||||
// Regression: a focused admin page (the Middleware proxy tab) POSTs only
|
||||
// the one field it owns — mitm_listen. The old handler wrote the request
|
||||
// body verbatim, so every other persisted setting was dropped (and
|
||||
// api_keys / pii_default_detectors, which lack omitempty, were written as
|
||||
// null). A partial POST must now merge onto what is already on disk.
|
||||
It("preserves unrelated persisted settings when a partial POST sets only mitm_listen", func() {
	// Two request bodies: a full save that seeds the settings file, then the
	// single-field save that a focused admin page would send. The listen
	// address uses port 0 so the listener restart binds an ephemeral port.
	const (
		fullBody    = `{"mitm_listen":"127.0.0.1:0","galleries":[{"name":"g1","url":"http://example/g1"}],"api_keys":["k1"],"pii_default_detectors":["det-a"]}`
		partialBody = `{"mitm_listen":"127.0.0.1:0"}`
	)

	seeded := post(fullBody)
	Expect(seeded.Code).To(Equal(http.StatusOK), seeded.Body.String())

	partial := post(partialBody)
	Expect(partial.Code).To(Equal(http.StatusOK), partial.Body.String())

	// Inspect what actually landed on disk rather than the HTTP response.
	settingsPath := filepath.Join(tmp, "runtime_settings.json")
	data, readErr := os.ReadFile(settingsPath)
	Expect(readErr).ToNot(HaveOccurred())
	saved := config.RuntimeSettings{}
	Expect(json.Unmarshal(data, &saved)).To(Succeed())

	// The field the partial POST carried is saved...
	Expect(saved.MITMListen).ToNot(BeNil())
	Expect(*saved.MITMListen).To(Equal("127.0.0.1:0"), "the changed field should be saved")

	// ...and every field it did not carry keeps its earlier value.
	Expect(saved.Galleries).ToNot(BeNil(), "galleries were clobbered by the partial save")
	Expect(*saved.Galleries).To(HaveLen(1))
	Expect(saved.ApiKeys).ToNot(BeNil(), "api_keys were nulled by the partial save")
	Expect(*saved.ApiKeys).To(Equal([]string{"k1"}))
	Expect(saved.PIIDefaultDetectors).ToNot(BeNil(), "pii_default_detectors were nulled by the partial save")
	Expect(*saved.PIIDefaultDetectors).To(Equal([]string{"det-a"}))
})
|
||||
|
||||
// The MITM listener resolves its per-host PII detectors once at start
|
||||
// (startMITMLocked → ResolvePIIPolicy), and the handler used to restart it
|
||||
// only when mitm_listen changed. So an admin toggling a default detector
|
||||
// (the Middleware detector table POSTs only pii_default_detectors) left
|
||||
// cloud-proxy traffic unredacted until the next reboot. A
|
||||
// pii_default_detectors change must now rebuild the listener.
|
||||
It("rebuilds the MITM listener when only pii_default_detectors changes", func() {
	// Start the listener by setting its address (port 0 = ephemeral port).
	started := post(`{"mitm_listen":"127.0.0.1:0"}`)
	Expect(started.Code).To(Equal(http.StatusOK), started.Body.String())
	before := app.MITMServer()
	Expect(before).ToNot(BeNil(), "listener should be running after mitm_listen is set")

	// Change only the default detectors; the server instance must be replaced.
	toggled := post(`{"pii_default_detectors":["det-a"]}`)
	Expect(toggled.Code).To(Equal(http.StatusOK), toggled.Body.String())
	after := app.MITMServer()
	Expect(after).ToNot(BeIdenticalTo(before),
		"a default-detector change must restart the listener so it picks up the new detectors")
})
|
||||
|
||||
// Residual #9125: enabling the watchdog from a cold (off) state via the
|
||||
// React master toggle must start the live watchdog immediately, without a
|
||||
// restart. The toggle posts watchdog_idle_enabled/busy_enabled=true while
|
||||
|
||||
@@ -432,7 +432,7 @@ func loadSoundDetectionConfig(pipeline *config.Pipeline, cl *config.ModelConfigL
|
||||
if pipeline.SoundDetection == "" {
|
||||
return nil, nil
|
||||
}
|
||||
cfg, err := cl.LoadModelConfigFileByName(pipeline.SoundDetection, ml.ModelPath)
|
||||
cfg, err := loadPipelineSubModel(cl, pipeline.SoundDetection, ml.ModelPath)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to load sound detection config: %w", err)
|
||||
}
|
||||
@@ -443,7 +443,7 @@ func loadSoundDetectionConfig(pipeline *config.Pipeline, cl *config.ModelConfigL
|
||||
}
|
||||
|
||||
func newTranscriptionOnlyModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) (Model, *config.ModelConfig, error) {
|
||||
cfgVAD, err := cl.LoadModelConfigFileByName(pipeline.VAD, ml.ModelPath)
|
||||
cfgVAD, err := loadPipelineSubModel(cl, pipeline.VAD, ml.ModelPath)
|
||||
if err != nil {
|
||||
|
||||
return nil, nil, fmt.Errorf("failed to load backend config: %w", err)
|
||||
@@ -453,7 +453,7 @@ func newTranscriptionOnlyModel(pipeline *config.Pipeline, cl *config.ModelConfig
|
||||
return nil, nil, fmt.Errorf("failed to validate config: %w", err)
|
||||
}
|
||||
|
||||
cfgSST, err := cl.LoadModelConfigFileByName(pipeline.Transcription, ml.ModelPath)
|
||||
cfgSST, err := loadPipelineSubModel(cl, pipeline.Transcription, ml.ModelPath)
|
||||
if err != nil {
|
||||
|
||||
return nil, nil, fmt.Errorf("failed to load backend config: %w", err)
|
||||
@@ -542,11 +542,30 @@ func buildRealtimeRoutingContext(a *application.Application, sessionID string) *
|
||||
}
|
||||
}
|
||||
|
||||
// loadPipelineSubModel loads a pipeline sub-model config by name and follows a
|
||||
// single alias hop, so a pipeline that references an alias (e.g. `llm: default`)
|
||||
// gets the alias target's full config (Backend, Model, ...) rather than the
|
||||
// alias stub with an empty Backend. Without this the alias survives unresolved
|
||||
// into model loading and fails downstream — notably in distributed mode with
|
||||
// "backend name is empty". Mirrors the top-level alias resolution in
|
||||
// core/http/middleware/request.go.
|
||||
func loadPipelineSubModel(cl *config.ModelConfigLoader, name, modelPath string) (*config.ModelConfig, error) {
|
||||
cfg, err := cl.LoadModelConfigFileByName(name, modelPath)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
resolved, _, err := cl.ResolveAlias(cfg)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return resolved, nil
|
||||
}
|
||||
|
||||
// returns and loads either a wrapped model or a model that supports audio-to-audio
|
||||
func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig, evaluator *templates.Evaluator, routing *RealtimeRoutingContext) (Model, error) {
|
||||
xlog.Debug("Creating new model pipeline model", "pipeline", pipeline)
|
||||
|
||||
cfgVAD, err := cl.LoadModelConfigFileByName(pipeline.VAD, ml.ModelPath)
|
||||
cfgVAD, err := loadPipelineSubModel(cl, pipeline.VAD, ml.ModelPath)
|
||||
if err != nil {
|
||||
|
||||
return nil, fmt.Errorf("failed to load backend config: %w", err)
|
||||
@@ -557,7 +576,7 @@ func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model
|
||||
}
|
||||
|
||||
// TODO: Do we always need a transcription model? It can be disabled. Note that any-to-any instruction following models don't transcribe as such, so if transcription is required it is a separate process
|
||||
cfgSST, err := cl.LoadModelConfigFileByName(pipeline.Transcription, ml.ModelPath)
|
||||
cfgSST, err := loadPipelineSubModel(cl, pipeline.Transcription, ml.ModelPath)
|
||||
if err != nil {
|
||||
|
||||
return nil, fmt.Errorf("failed to load backend config: %w", err)
|
||||
@@ -589,7 +608,7 @@ func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model
|
||||
xlog.Debug("Loading a wrapped model")
|
||||
|
||||
// Otherwise we want to return a wrapped model, which is a "virtual" model that re-uses other models to perform operations
|
||||
cfgLLM, err := cl.LoadModelConfigFileByName(pipeline.LLM, ml.ModelPath)
|
||||
cfgLLM, err := loadPipelineSubModel(cl, pipeline.LLM, ml.ModelPath)
|
||||
if err != nil {
|
||||
|
||||
return nil, fmt.Errorf("failed to load backend config: %w", err)
|
||||
@@ -604,7 +623,7 @@ func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model
|
||||
applyPipelineReasoning(cfgLLM, *pipeline)
|
||||
applyPipelineThinking(cfgLLM, *pipeline)
|
||||
|
||||
cfgTTS, err := cl.LoadModelConfigFileByName(pipeline.TTS, ml.ModelPath)
|
||||
cfgTTS, err := loadPipelineSubModel(cl, pipeline.TTS, ml.ModelPath)
|
||||
if err != nil {
|
||||
|
||||
return nil, fmt.Errorf("failed to load backend config: %w", err)
|
||||
|
||||
52
core/http/endpoints/openai/realtime_model_alias_test.go
Normal file
52
core/http/endpoints/openai/realtime_model_alias_test.go
Normal file
@@ -0,0 +1,52 @@
|
||||
package openai
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
)
|
||||
|
||||
// loadPipelineSubModel must resolve a pipeline sub-model that references an
|
||||
// alias (e.g. `llm: default`) one hop to the alias target's full config — so
|
||||
// the effective backend is the target's backend, not the empty backend of the
|
||||
// alias stub. This mirrors the top-level alias resolution done in
|
||||
// core/http/middleware/request.go, which the realtime pipeline previously
|
||||
// skipped (failing in distributed mode with "backend name is empty").
|
||||
var _ = Describe("loadPipelineSubModel", func() {
|
||||
It("resolves a sub-model alias one hop to the target's config", func() {
|
||||
tmpDir := GinkgoT().TempDir()
|
||||
|
||||
// A real model config with a concrete backend.
|
||||
realLLM := `name: real-llm
|
||||
backend: llama-cpp
|
||||
parameters:
|
||||
model: real-llm.gguf
|
||||
`
|
||||
Expect(os.WriteFile(filepath.Join(tmpDir, "real-llm.yaml"), []byte(realLLM), 0644)).To(Succeed())
|
||||
|
||||
// An alias pointing at the real model.
|
||||
aliasCfg := `name: default
|
||||
alias: real-llm
|
||||
`
|
||||
Expect(os.WriteFile(filepath.Join(tmpDir, "default.yaml"), []byte(aliasCfg), 0644)).To(Succeed())
|
||||
|
||||
cl := config.NewModelConfigLoader(tmpDir)
|
||||
Expect(cl.LoadModelConfigsFromPath(tmpDir)).To(Succeed())
|
||||
|
||||
// Resolving the alias must follow the hop to the target's full config.
|
||||
resolved, err := loadPipelineSubModel(cl, "default", tmpDir)
|
||||
Expect(err).NotTo(HaveOccurred())
|
||||
Expect(resolved.IsAlias()).To(BeFalse())
|
||||
Expect(resolved.Backend).To(Equal("llama-cpp"))
|
||||
|
||||
// A non-alias name must load unchanged.
|
||||
direct, err := loadPipelineSubModel(cl, "real-llm", tmpDir)
|
||||
Expect(err).NotTo(HaveOccurred())
|
||||
Expect(direct.Backend).To(Equal("llama-cpp"))
|
||||
Expect(direct.Name).To(Equal("real-llm"))
|
||||
})
|
||||
})
|
||||
@@ -1,100 +0,0 @@
|
||||
import { test, expect } from './coverage-fixtures.js'
|
||||
|
||||
// These specs stub /api/features and /api/auth/status per cell. The test server
|
||||
// disables auth (isAdmin=true) and reports its own features, so we intercept
|
||||
// before navigation to simulate each role x mode cell.
|
||||
|
||||
/**
 * Intercepts requests matching `**​/api/features` on the given page and answers
 * each one with `features` serialized as JSON.
 *
 * @param page     object exposing `route(pattern, handler)`
 * @param features value to serialize into the response body
 * @returns whatever `page.route` returns
 */
function stubFeatures(page, features) {
  const respond = (route) => {
    // Serialize on each intercepted request, as the handler runs per request.
    const body = JSON.stringify(features)
    return route.fulfill({ contentType: 'application/json', body })
  }
  return page.route('**/api/features', respond)
}
|
||||
|
||||
/**
 * Intercepts requests matching `**​/api/p2p/token` on the given page and
 * answers each one with an empty plain-text body.
 *
 * @param page object exposing `route(pattern, handler)`
 * @returns whatever `page.route` returns
 */
function stubNoP2P(page) {
  // P2P token endpoint returns empty -> p2pEnabled=false.
  const emptyToken = { contentType: 'text/plain', body: '' }
  const respond = (route) => route.fulfill(emptyToken)
  return page.route('**/api/p2p/token', respond)
}
|
||||
|
||||
// Landing behaviour of /app: each case stubs the features response, loads
// /app, then checks the final URL and a landmark element on the landing page.
test.describe('Adaptive landing (HomeRoute)', () => {
  const landingCases = [
    {
      title: 'admin + distributed redirects /app to Nodes',
      features: { distributed: true },
      expectedURL: /\/app\/nodes$/,
      landmark: (page) => page.locator('.page-title').first(),
    },
    {
      title: 'admin + single-node stays on Home',
      features: { distributed: false },
      expectedURL: /\/app$/,
      landmark: (page) => page.locator('.home-greeting'),
    },
  ]

  for (const { title, features, expectedURL, landmark } of landingCases) {
    test(title, async ({ page }) => {
      await stubFeatures(page, features)
      await stubNoP2P(page)
      await page.goto('/app')
      await expect(page).toHaveURL(expectedURL)
      await expect(landmark(page)).toBeVisible({ timeout: 15_000 })
    })
  }
})
|
||||
|
||||
// Sidebar layout: checks whether a "Nodes" entry appears in the first sidebar
// section depending on the stubbed `distributed` feature flag.
test.describe('Adaptive sidebar', () => {
  // Stubs the feature/P2P endpoints, then opens an in-app page so the sidebar
  // is mounted.
  const openChat = async (page, distributed) => {
    await stubFeatures(page, { distributed })
    await stubNoP2P(page)
    await page.goto('/app/chat')
  }

  // Locator for "Nodes" text inside the first sidebar section.
  const nodesInFirstSection = (page) =>
    page
      .locator('.sidebar-nav .sidebar-section-items')
      .first()
      .getByText('Nodes', { exact: false })

  test('distributed pins the Cluster group with Nodes at the top', async ({ page }) => {
    await openChat(page, true)
    await expect(nodesInFirstSection(page)).toBeVisible({ timeout: 15_000 })
  })

  test('single-node does not pin a Cluster group', async ({ page }) => {
    await openChat(page, false)
    // Nodes is reachable only via the Operate rail, not pinned at the top.
    const sidebar = page.locator('.sidebar-nav')
    await expect(sidebar).toBeVisible({ timeout: 15_000 })
    await expect(nodesInFirstSection(page)).toHaveCount(0)
  })
})
|
||||
|
||||
// Top navbar: mode pill, settings icon, and the assistant jump button, under
// different stubbed feature responses.
test.describe('Top navbar', () => {
  // Stubs the feature/P2P endpoints with the given features, then opens chat.
  const openChatWith = async (page, features) => {
    await stubFeatures(page, features)
    await stubNoP2P(page)
    await page.goto('/app/chat')
  }

  test('admin sees the mode pill and settings cog', async ({ page }) => {
    await openChatWith(page, { distributed: true })
    const modePill = page.locator('.top-navbar__mode')
    const labelledIcons = page.locator('.top-navbar__icon[aria-label]')
    await expect(modePill).toBeVisible({ timeout: 15_000 })
    await expect(labelledIcons).not.toHaveCount(0)
  })

  test('admin-via-chat jump shows when localai_assistant is enabled', async ({ page }) => {
    await openChatWith(page, { distributed: false, localai_assistant: true })
    const assistantJump = page.locator('.top-navbar__assistant')
    await expect(assistantJump).toBeVisible({ timeout: 15_000 })
  })

  test('admin-via-chat jump hidden when localai_assistant is off', async ({ page }) => {
    await openChatWith(page, { distributed: false, localai_assistant: false })
    const assistantJump = page.locator('.top-navbar__assistant')
    await expect(assistantJump).toHaveCount(0)
  })
})
|
||||
|
||||
// Token usage meter: shown when the stubbed admin usage response has buckets,
// absent when the bucket list is empty.
test.describe('Token usage meter', () => {
  // Stubs features, P2P, and the admin usage endpoint (in that order) with the
  // given buckets, then opens chat.
  const openChatWithUsage = async (page, buckets) => {
    await stubFeatures(page, { distributed: false })
    await stubNoP2P(page)
    await page.route('**/api/auth/admin/usage**', (route) => {
      const body = JSON.stringify({ buckets })
      return route.fulfill({ contentType: 'application/json', body })
    })
    await page.goto('/app/chat')
  }

  test('renders when admin usage has data', async ({ page }) => {
    await openChatWithUsage(page, [{ total_tokens: 1234 }])
    const meter = page.locator('.top-navbar__meter')
    await expect(meter).toBeVisible({ timeout: 15_000 })
  })

  test('hidden when admin usage is empty (graceful degrade)', async ({ page }) => {
    await openChatWithUsage(page, [])
    // Wait for the navbar itself before asserting the meter is absent.
    const navbar = page.locator('.top-navbar')
    const meter = page.locator('.top-navbar__meter')
    await expect(navbar).toBeVisible({ timeout: 15_000 })
    await expect(meter).toHaveCount(0)
  })
})
|
||||
@@ -86,6 +86,7 @@
|
||||
"input": {
|
||||
"placeholder": "Message...",
|
||||
"attachFile": "Attach file",
|
||||
"send": "Send message",
|
||||
"stopGenerating": "Stop generating",
|
||||
"canvasTitle": "Canvas — extract code blocks and media into a side panel for preview, copy, and download",
|
||||
"canvasLabel": "Canvas",
|
||||
|
||||
@@ -77,6 +77,20 @@
|
||||
"noModelsTitle": "No Models Available",
|
||||
"noModelsBody": "There are no models installed yet. Ask your administrator to set up models so you can start chatting."
|
||||
},
|
||||
"starters": {
|
||||
"title": "Recommended for your hardware",
|
||||
"tier": {
|
||||
"cpu": "CPU-only",
|
||||
"gpu-small": "GPU",
|
||||
"gpu-large": "GPU"
|
||||
},
|
||||
"cpuNote": "No GPU detected — these small models stay responsive on CPU.",
|
||||
"gpuNote": "Picked to fit your available VRAM with room for context.",
|
||||
"install": "Install",
|
||||
"installing": "Installing",
|
||||
"installStarted": "Installing {{model}}…",
|
||||
"installFailed": "Install failed: {{message}}"
|
||||
},
|
||||
"connect": {
|
||||
"title": "One endpoint, every API",
|
||||
"subtitle": "LocalAI serves its own full API — image & video generation, depth, object detection, reranking, audio, face & voice recognition, and realtime voice over WebRTC and WebSocket. On top of that, a drop-in compatibility layer lets any app built for OpenAI, Anthropic, Ollama or OpenAI Responses talk to it unchanged.",
|
||||
|
||||
@@ -12,16 +12,6 @@
|
||||
"accountSettings": "Account settings",
|
||||
"account": "Account",
|
||||
"accountFor": "Account: {{name}}",
|
||||
"topbar": {
|
||||
"label": "Top bar",
|
||||
"modeDistributed": "Distributed",
|
||||
"modeSwarm": "Swarm",
|
||||
"modeSingle": "Single-node",
|
||||
"pickModel": "Models",
|
||||
"adminViaChat": "Admin via chat",
|
||||
"tokensToday": "Tokens today",
|
||||
"usageDetail": "View usage detail"
|
||||
},
|
||||
"sections": {
|
||||
"create": "Create",
|
||||
"recognition": "Recognition",
|
||||
|
||||
@@ -45,7 +45,7 @@
|
||||
},
|
||||
"scheduling": {
|
||||
"title": "Penjadwalan",
|
||||
"subtitle": "Aturan penempatan model dan replika di seluruh klaster"
|
||||
"subtitle": "Aturan penempatan model dan replika di seluruh kluster"
|
||||
},
|
||||
"p2p": {
|
||||
"title": "Komputasi AI Terdistribusi",
|
||||
@@ -86,4 +86,4 @@
|
||||
"title": "Penjelajah",
|
||||
"subtitle": "Jelajahi file dan konfigurasi"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -72,7 +72,7 @@
|
||||
"actions": {
|
||||
"copy": "Salin",
|
||||
"regenerate": "Hasilkan ulang",
|
||||
"jumpToLatest": "Jump to latest"
|
||||
"jumpToLatest": "Lompat ke terbaru"
|
||||
},
|
||||
"streaming": {
|
||||
"transferring": "Mentransfer model...",
|
||||
@@ -115,4 +115,4 @@
|
||||
"clearAll": "Hapus semua",
|
||||
"deleteAllTitle": "Hapus semua percakapan"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
{
|
||||
"unsaved": {
|
||||
"title": "Discard unsaved changes?",
|
||||
"message": "You have unsaved changes that will be lost if you leave this page.",
|
||||
"leave": "Leave"
|
||||
"title": "Buang perubahan yang belum disimpan?",
|
||||
"message": "Anda memiliki perubahan yang belum disimpan. Perubahan tersebut akan hilang jika Anda meninggalkan halaman ini.",
|
||||
"leave": "Tinggalkan Halaman"
|
||||
},
|
||||
"actions": {
|
||||
"save": "Simpan",
|
||||
|
||||
@@ -7,15 +7,15 @@
|
||||
"resourceGpu": "GPU",
|
||||
"resourceRam": "RAM",
|
||||
"greeting": {
|
||||
"morning": "Good morning",
|
||||
"afternoon": "Good afternoon",
|
||||
"evening": "Good evening",
|
||||
"night": "Working late"
|
||||
"morning": "Selamat pagi",
|
||||
"afternoon": "Selamat siang",
|
||||
"evening": "Selamat malam",
|
||||
"night": "Selamat lembur"
|
||||
},
|
||||
"statusLine": {
|
||||
"modelsLoaded_one": "{{count}} model loaded",
|
||||
"modelsLoaded_other": "{{count}} models loaded",
|
||||
"noModelsLoaded": "No models loaded",
|
||||
"modelsLoaded_one": "{{count}} model dimuat",
|
||||
"modelsLoaded_other": "{{count}} model dimuat",
|
||||
"noModelsLoaded": "Tidak ada model yang dimuat",
|
||||
"nodes_one": "{{count}} node",
|
||||
"nodes_other": "{{count}} nodes"
|
||||
},
|
||||
@@ -79,14 +79,14 @@
|
||||
},
|
||||
"connect": {
|
||||
"title": "Satu endpoint, semua API",
|
||||
"subtitle": "LocalAI menyediakan API miliknya sendiri yang lengkap — pembuatan gambar & video, depth, deteksi objek, reranking, audio, pengenalan wajah & suara, serta suara realtime melalui WebRTC dan WebSocket. Di atas itu, lapisan kompatibilitas drop-in membuat aplikasi apa pun yang dibuat untuk OpenAI, Anthropic, Ollama, atau OpenAI Responses bekerja tanpa perubahan.",
|
||||
"subtitle": "LocalAI menyediakan API miliknya sendiri yang lengkap — pembuatan gambar & video, depth, deteksi objek, reranking, audio, pengenalan wajah & suara, serta suara realtime melalui WebRTC dan WebSocket. Selain itu, lapisan kompatibilitas drop-in membuat aplikasi apa pun yang dibuat untuk OpenAI, Anthropic, Ollama, atau OpenAI Responses bekerja tanpa perubahan.",
|
||||
"nativeTitle": "API native",
|
||||
"compatTitle": "Kompatibilitas drop-in",
|
||||
"apiReference": "Referensi API lengkap",
|
||||
"copy": "Salin",
|
||||
"copied": "Disalin",
|
||||
"browse": "Browse the API",
|
||||
"hide": "Hide endpoints",
|
||||
"dismiss": "Dismiss"
|
||||
"browse": "Jelajahi API",
|
||||
"hide": "Sembunyikan endpoint",
|
||||
"dismiss": "Abaikan"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
"video": "Video",
|
||||
"tts": "TTS",
|
||||
"sound": "Suara",
|
||||
"transform": "Transform"
|
||||
"transform": "Transformasi"
|
||||
}
|
||||
},
|
||||
"image": {
|
||||
@@ -30,7 +30,7 @@
|
||||
"refImagesAdded_other": "{{count}} gambar ditambahkan"
|
||||
},
|
||||
"actions": {
|
||||
"view": "View",
|
||||
"view": "Lihat",
|
||||
"generate": "Hasilkan",
|
||||
"generating": "Menghasilkan..."
|
||||
},
|
||||
@@ -153,4 +153,4 @@
|
||||
"clearConfirm": "Hapus",
|
||||
"cleared": "Riwayat dihapus"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19,11 +19,11 @@
|
||||
"operate": "Operasikan"
|
||||
},
|
||||
"operate": {
|
||||
"inference": "Inference",
|
||||
"cluster": "Cluster",
|
||||
"observability": "Observability",
|
||||
"access": "Access",
|
||||
"system": "System"
|
||||
"inference": "Inferensi",
|
||||
"cluster": "Kluster",
|
||||
"observability": "Observabilitas",
|
||||
"access": "Akses",
|
||||
"system": "Sistem"
|
||||
},
|
||||
"items": {
|
||||
"home": "Beranda",
|
||||
@@ -64,7 +64,7 @@
|
||||
"copyright": "© 2023-{{year}} {{author}}"
|
||||
},
|
||||
"console": {
|
||||
"automation": "Otomasi",
|
||||
"automation": "Automasi",
|
||||
"training": "Pelatihan"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -184,50 +184,6 @@
|
||||
font-size: 1.5rem;
|
||||
}
|
||||
|
||||
/* Desktop top bar: deployment + admin affordances on wide screens. Hidden on
|
||||
mobile, where .mobile-header carries the equivalent actions. */
|
||||
.top-navbar {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: space-between;
|
||||
gap: var(--spacing-md);
|
||||
padding: var(--spacing-sm) var(--spacing-lg);
|
||||
border-bottom: 1px solid var(--color-border-default);
|
||||
background: var(--color-bg-secondary);
|
||||
}
|
||||
.top-navbar__right { display: flex; align-items: center; gap: var(--spacing-sm); }
|
||||
.top-navbar__mode {
|
||||
font-size: 0.75rem;
|
||||
padding: 2px 10px;
|
||||
border-radius: 999px;
|
||||
border: 1px solid var(--color-border-default);
|
||||
color: var(--color-text-secondary);
|
||||
}
|
||||
.top-navbar__mode.is-active { color: var(--color-success); border-color: var(--color-success); }
|
||||
.top-navbar__btn {
|
||||
display: inline-flex; align-items: center; gap: 6px;
|
||||
font-size: 0.8125rem; padding: 5px 10px; border-radius: 8px;
|
||||
border: 1px solid var(--color-border-default); background: var(--color-bg-tertiary);
|
||||
color: var(--color-text-primary); cursor: pointer;
|
||||
}
|
||||
.top-navbar__icon {
|
||||
width: 32px; height: 32px; display: inline-flex; align-items: center;
|
||||
justify-content: center; border-radius: 8px; border: 1px solid var(--color-border-default);
|
||||
background: var(--color-bg-tertiary); color: var(--color-text-secondary); cursor: pointer;
|
||||
}
|
||||
.top-navbar__avatar img { width: 100%; height: 100%; border-radius: 50%; object-fit: cover; }
|
||||
.top-navbar__meter {
|
||||
display: inline-flex; flex-direction: column; gap: 3px; align-items: flex-start;
|
||||
padding: 4px 10px; border-radius: 8px; border: 1px solid var(--color-border-default);
|
||||
background: var(--color-bg-tertiary); cursor: pointer; min-width: 150px;
|
||||
}
|
||||
.top-navbar__meter-label { font-size: 0.6875rem; color: var(--color-text-secondary); }
|
||||
.top-navbar__meter-bar { width: 100%; height: 5px; border-radius: 3px; background: var(--color-bg-secondary); overflow: hidden; }
|
||||
.top-navbar__meter-bar i { display: block; height: 100%; background: var(--color-primary); }
|
||||
@media (max-width: 639px) {
|
||||
.top-navbar { display: none; }
|
||||
}
|
||||
|
||||
/* Sidebar */
|
||||
.sidebar {
|
||||
position: fixed;
|
||||
@@ -6407,6 +6363,59 @@ select.input {
|
||||
justify-content: center;
|
||||
}
|
||||
|
||||
/* ──────────────────── Home: hardware-aware starter models ──────────────────── */
|
||||
|
||||
.home-starters {
|
||||
margin: var(--spacing-lg) 0;
|
||||
padding: var(--spacing-lg);
|
||||
}
|
||||
.home-starters-head {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: space-between;
|
||||
gap: var(--spacing-md);
|
||||
}
|
||||
.home-starters-head strong {
|
||||
font-size: 0.9375rem;
|
||||
}
|
||||
.home-starters-tier {
|
||||
display: inline-flex;
|
||||
align-items: center;
|
||||
gap: var(--spacing-xs);
|
||||
font-size: 0.75rem;
|
||||
color: var(--color-text-muted);
|
||||
}
|
||||
.home-starters-sub {
|
||||
margin: var(--spacing-xs) 0 var(--spacing-md);
|
||||
font-size: 0.8125rem;
|
||||
color: var(--color-text-secondary);
|
||||
}
|
||||
.home-starters-list {
|
||||
list-style: none;
|
||||
margin: 0;
|
||||
padding: 0;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: var(--spacing-xs);
|
||||
}
|
||||
.home-starters-item {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: var(--spacing-md);
|
||||
padding: var(--spacing-xs) 0;
|
||||
}
|
||||
.home-starters-name {
|
||||
font-weight: 500;
|
||||
font-size: 0.875rem;
|
||||
word-break: break-all;
|
||||
}
|
||||
.home-starters-size {
|
||||
margin-left: auto;
|
||||
font-size: 0.75rem;
|
||||
color: var(--color-text-muted);
|
||||
white-space: nowrap;
|
||||
}
|
||||
|
||||
/* ──────────────────── Home: drop-in endpoint / API compatibility ──────────────────── */
|
||||
|
||||
.home-connect {
|
||||
|
||||
@@ -3,7 +3,6 @@ import { Outlet, useLocation, useNavigate } from 'react-router-dom'
|
||||
import { useTranslation } from 'react-i18next'
|
||||
import Sidebar from './components/Sidebar'
|
||||
import OperationsBar from './components/OperationsBar'
|
||||
import TopNavbar from './components/TopNavbar'
|
||||
import { ToastContainer, useToast } from './components/Toast'
|
||||
import { systemApi } from './utils/api'
|
||||
import { useTheme } from './contexts/ThemeContext'
|
||||
@@ -99,7 +98,6 @@ export default function App() {
|
||||
<Sidebar isOpen={sidebarOpen} onClose={() => setSidebarOpen(false)} />
|
||||
<main className="main-content" {...(sidebarOpen ? { 'aria-hidden': 'true', inert: '' } : {})}>
|
||||
<OperationsBar />
|
||||
<TopNavbar />
|
||||
{/* Mobile header — primary actions reachable without opening the
|
||||
drawer. Hamburger is the only way to expand the nav on phones;
|
||||
theme toggle and account avatar are mirrored from the sidebar
|
||||
|
||||
@@ -1,28 +0,0 @@
|
||||
import { lazy, Suspense } from 'react'
|
||||
import { Navigate } from 'react-router-dom'
|
||||
import { useAuth } from '../context/AuthContext'
|
||||
import { useDeployment } from '../contexts/DeploymentContext'
|
||||
import { resolveHome } from '../utils/resolveHome'
|
||||
import RouteFallback from './RouteFallback'
|
||||
|
||||
const Home = lazy(() => import('../pages/Home'))
|
||||
|
||||
// Index-route element. Waits for auth + deployment signals to load (so we never
|
||||
// flash the wrong landing), then either renders Home or redirects to the cell's
|
||||
// landing page. Redirecting (rather than rendering Nodes/Chat inline at /app)
|
||||
// keeps each target's own route guard, active-nav state, and deep-linkability.
|
||||
export default function HomeRoute() {
|
||||
const { isAdmin, loading: authLoading } = useAuth()
|
||||
const { distributed, p2pEnabled, loading: deployLoading } = useDeployment()
|
||||
|
||||
if (authLoading || deployLoading) return <RouteFallback />
|
||||
|
||||
const target = resolveHome({ isAdmin, distributed, p2pEnabled })
|
||||
if (target) return <Navigate to={target} replace />
|
||||
|
||||
return (
|
||||
<Suspense fallback={<RouteFallback />}>
|
||||
<Home />
|
||||
</Suspense>
|
||||
)
|
||||
}
|
||||
@@ -1,8 +1,25 @@
|
||||
import { useEffect, useMemo } from 'react'
|
||||
import { useEffect, useMemo, useCallback } from 'react'
|
||||
import { useModels } from '../hooks/useModels'
|
||||
import SearchableSelect from './SearchableSelect'
|
||||
import { useTranslation } from 'react-i18next'
|
||||
|
||||
// Remember the last model the user picked, keyed by capability, so returning to
|
||||
// a page (Home chat box, Image, TTS, Talk...) defaults to that model instead of
|
||||
// whatever happens to sort first. Only persisted when a capability key exists —
|
||||
// `externalOptions` callers pass no capability and get the old first-item
|
||||
// behaviour. localStorage access is wrapped because private-browsing modes throw.
|
||||
const LAST_MODEL_PREFIX = 'localai_last_model:'
|
||||
|
||||
// Returns the model name last stored for `capability`, or null when there is
// no capability key, nothing is stored, or localStorage access throws.
function readLastModel(capability) {
  if (capability) {
    try {
      return localStorage.getItem(LAST_MODEL_PREFIX + capability)
    } catch {
      // Storage unavailable: fall through to the "nothing remembered" result.
    }
  }
  return null
}
|
||||
|
||||
// Stores `model` as the last selection for `capability`. A missing capability
// or an empty model is a no-op, and storage failures are deliberately ignored
// (remembering the choice is best-effort).
function writeLastModel(capability, model) {
  const persistable = capability && model
  if (!persistable) return
  try {
    localStorage.setItem(LAST_MODEL_PREFIX + capability, model)
  } catch {
    /* ignore */
  }
}
|
||||
|
||||
export default function ModelSelector({
|
||||
value, onChange, capability, className = '',
|
||||
options: externalOptions, loading: externalLoading,
|
||||
@@ -19,16 +36,27 @@ export default function ModelSelector({
|
||||
const isLoading = externalOptions ? (externalLoading || false) : hookLoading
|
||||
const isDisabled = isLoading || (externalDisabled || false)
|
||||
|
||||
// Persist genuine selections so the next visit can restore them.
|
||||
const handleChange = useCallback((next) => {
|
||||
writeLastModel(capability, next)
|
||||
onChange(next)
|
||||
}, [capability, onChange])
|
||||
|
||||
useEffect(() => {
|
||||
if (modelNames.length > 0 && (!value || !modelNames.includes(value))) {
|
||||
onChange(modelNames[0])
|
||||
// Prefer the remembered model when it's still available; otherwise fall
|
||||
// back to the first option. Don't re-persist here — auto-select is not a
|
||||
// user choice, and writing back the stored value would be a harmless but
|
||||
// pointless round-trip.
|
||||
const remembered = readLastModel(capability)
|
||||
onChange(remembered && modelNames.includes(remembered) ? remembered : modelNames[0])
|
||||
}
|
||||
}, [modelNames, value, onChange])
|
||||
}, [modelNames, value, onChange, capability])
|
||||
|
||||
return (
|
||||
<SearchableSelect
|
||||
value={value || ''}
|
||||
onChange={onChange}
|
||||
onChange={handleChange}
|
||||
options={modelNames}
|
||||
placeholder={isLoading ? t('selector.loading') : (modelNames.length === 0 ? t('selector.noModels') : t('selector.selectModel'))}
|
||||
searchPlaceholder={searchPlaceholder || t('selector.searchPlaceholder')}
|
||||
|
||||
@@ -5,11 +5,9 @@ import ThemeToggle from './ThemeToggle'
|
||||
import LanguageSwitcher from './LanguageSwitcher'
|
||||
import { useAuth } from '../context/AuthContext'
|
||||
import { useBranding } from '../contexts/BrandingContext'
|
||||
import { useDeployment } from '../contexts/DeploymentContext'
|
||||
import { apiUrl } from '../utils/basePath'
|
||||
import { preloadRoute } from '../router'
|
||||
import { consoles, firstVisiblePath, consolePaths } from './console/consoleConfig'
|
||||
import { clusterPinItems, shouldCollapseCreate } from '../utils/sidebarPolicy'
|
||||
|
||||
const COLLAPSED_KEY = 'localai_sidebar_collapsed'
|
||||
const SECTIONS_KEY = 'localai_sidebar_sections'
|
||||
@@ -60,13 +58,11 @@ function NavItem({ item, onClose, collapsed }) {
|
||||
)
|
||||
}
|
||||
|
||||
function loadSectionState(collapseCreate = false) {
|
||||
// Tiers render expanded by default; users can collapse any tier and the
|
||||
// choice persists (stored values override defaults). In cluster cells we
|
||||
// start Create collapsed so the pinned cluster group leads - but only when
|
||||
// the user has not already expressed a preference.
|
||||
function loadSectionState() {
|
||||
// Tiers render expanded by default (the redesign favours showing the few
|
||||
// intent groups up front); users can still collapse any tier and the choice
|
||||
// is persisted. Stored values override the defaults so a saved collapse wins.
|
||||
const defaults = Object.fromEntries(sections.map(s => [s.id, true]))
|
||||
if (collapseCreate) defaults.create = false
|
||||
try {
|
||||
const stored = localStorage.getItem(SECTIONS_KEY)
|
||||
return stored ? { ...defaults, ...JSON.parse(stored) } : defaults
|
||||
@@ -81,34 +77,20 @@ function saveSectionState(state) {
|
||||
|
||||
export default function Sidebar({ isOpen, onClose }) {
|
||||
const { t } = useTranslation('nav')
|
||||
const { isAdmin, authEnabled, user, logout, hasFeature } = useAuth()
|
||||
// Deployment shape (server features + p2p) drives the adaptive sidebar; the
|
||||
// shared context replaces the sidebar's own /api/features fetch so the
|
||||
// landing resolver, navbar, and this policy agree on one snapshot.
|
||||
const deployment = useDeployment()
|
||||
const features = deployment.features
|
||||
// Shared shape for the console gating helpers (consoleConfig.js); in scope for
|
||||
// both the pinned cluster group and the console-tier rendering below.
|
||||
const auth = { isAdmin, authEnabled, hasFeature, features }
|
||||
const collapseCreate = shouldCollapseCreate(auth, deployment)
|
||||
const [features, setFeatures] = useState({})
|
||||
const [collapsed, setCollapsed] = useState(() => {
|
||||
try { return localStorage.getItem(COLLAPSED_KEY) === 'true' } catch (_) { return false }
|
||||
})
|
||||
const [openSections, setOpenSections] = useState(loadSectionState)
|
||||
const { isAdmin, authEnabled, user, logout, hasFeature } = useAuth()
|
||||
const branding = useBranding()
|
||||
const navigate = useNavigate()
|
||||
const location = useLocation()
|
||||
const closeBtnRef = useRef(null)
|
||||
|
||||
// Apply the cluster-cell Create-collapse default once, only when the user has
|
||||
// no stored section preference (so we never override an explicit choice).
|
||||
useEffect(() => {
|
||||
if (deployment.loading) return
|
||||
let hasStored = false
|
||||
try { hasStored = !!localStorage.getItem(SECTIONS_KEY) } catch { hasStored = false }
|
||||
if (hasStored || !collapseCreate) return
|
||||
setOpenSections(prev => (prev.create === false ? prev : { ...prev, create: false }))
|
||||
}, [deployment.loading, collapseCreate])
|
||||
fetch(apiUrl('/api/features')).then(r => r.json()).then(setFeatures).catch(() => {})
|
||||
}, [])
|
||||
|
||||
// Stay in sync with external collapse dispatches (e.g. the chat
|
||||
// page's focus mode). The collapse-toggle button still owns the
|
||||
@@ -175,6 +157,8 @@ export default function Sidebar({ isOpen, onClose }) {
|
||||
}
|
||||
|
||||
const visibleTopItems = topItems.filter(filterItem)
|
||||
// Shared shape for the console gating helpers (consoleConfig.js).
|
||||
const auth = { isAdmin, authEnabled, hasFeature, features }
|
||||
|
||||
// Inline sections (Create) carry no gating; a plain filterItem pass suffices.
|
||||
const getVisibleSectionItems = (section) => section.items.filter(filterItem)
|
||||
@@ -215,28 +199,6 @@ export default function Sidebar({ isOpen, onClose }) {
|
||||
))}
|
||||
</div>
|
||||
|
||||
{/* Pinned Cluster quick-access (admin + distributed/p2p). Same gate
|
||||
as the Operate rail; surfaced at the top for cluster operators. */}
|
||||
{(() => {
|
||||
const pinned = clusterPinItems(auth, deployment)
|
||||
if (pinned.length === 0) return null
|
||||
return (
|
||||
<div className="sidebar-section">
|
||||
<div className="sidebar-section-title">{t('operate.cluster')}</div>
|
||||
<div className="sidebar-section-items">
|
||||
{pinned.map(item => (
|
||||
<NavItem
|
||||
key={item.path}
|
||||
item={{ path: item.path, icon: item.icon, labelKey: item.labelKey }}
|
||||
onClose={onClose}
|
||||
collapsed={collapsed}
|
||||
/>
|
||||
))}
|
||||
</div>
|
||||
</div>
|
||||
)
|
||||
})()}
|
||||
|
||||
{/* Collapsible sections */}
|
||||
{sections.map(section => {
|
||||
const visibleItems = getVisibleSectionItems(section)
|
||||
|
||||
129
core/http/react-ui/src/components/StarterModels.jsx
Normal file
129
core/http/react-ui/src/components/StarterModels.jsx
Normal file
@@ -0,0 +1,129 @@
|
||||
import { useState, useEffect, useMemo } from 'react'
|
||||
import { useTranslation } from 'react-i18next'
|
||||
import { modelsApi } from '../utils/api'
|
||||
import { useResources } from '../hooks/useResources'
|
||||
|
||||
// Curated, hardware-tiered starter models for the empty-state onboarding. Names
|
||||
// are real gallery entries (gallery/index.yaml); we intersect them against the
|
||||
// live gallery at render time so a custom/trimmed gallery degrades gracefully
|
||||
// (unmatched entries simply don't render).
|
||||
//
|
||||
// The guiding rule the maintainer asked for: CPU-only machines should be
|
||||
// steered to genuinely small models (1-4B, Q4) that stay responsive without a
|
||||
// GPU. GPU tiers scale the suggestion up with available VRAM.
|
||||
const SMALL = [
|
||||
{ name: 'llama-3.2-1b-instruct:q4_k_m', size: '~0.8 GB' },
|
||||
{ name: 'llama-3.2-3b-instruct:q4_k_m', size: '~2 GB' },
|
||||
{ name: 'qwen3-1.7b', size: '~1.4 GB' },
|
||||
{ name: 'gemma-3-1b-it', size: '~0.8 GB' },
|
||||
]
|
||||
const MID = [
|
||||
{ name: 'qwen3-4b', size: '~2.5 GB' },
|
||||
{ name: 'gemma-3-4b-it', size: '~3 GB' },
|
||||
{ name: 'llama-3.2-3b-instruct:q4_k_m', size: '~2 GB' },
|
||||
]
|
||||
const LARGE = [
|
||||
{ name: 'meta-llama-3.1-8b-instruct', size: '~5 GB' },
|
||||
{ name: 'qwen3-4b', size: '~2.5 GB' },
|
||||
{ name: 'mistral-7b-instruct-v0.3', size: '~4 GB' },
|
||||
]
|
||||
|
||||
const GB = 1024 * 1024 * 1024
|
||||
|
||||
// Maps the detected hardware to a starter-model tier.
//
// `resources.aggregate.total_memory` is read as GPU VRAM in bytes; a missing
// or zero value, or a non-'gpu' resource type, selects the CPU tier. GPUs are
// split at 8 GB of VRAM. The cut-offs are intentionally conservative so that a
// suggested model really does fit.
//
// Returns `{ id, list }` where id is 'cpu' | 'gpu-small' | 'gpu-large' and
// list is the matching curated array (SMALL / MID / LARGE).
function pickTier(resources) {
  const vram = resources?.aggregate?.total_memory || 0
  const cpuOnly = resources?.type !== 'gpu' || vram <= 0

  if (cpuOnly) {
    return { id: 'cpu', list: SMALL }
  }
  return vram < 8 * GB
    ? { id: 'gpu-small', list: MID }
    : { id: 'gpu-large', list: LARGE }
}
|
||||
|
||||
export default function StarterModels({ addToast, onInstallStarted }) {
|
||||
const { t } = useTranslation('home')
|
||||
const { resources } = useResources()
|
||||
const [available, setAvailable] = useState(null) // Set of gallery names, or null while loading
|
||||
const [installing, setInstalling] = useState(() => new Set())
|
||||
|
||||
const tier = useMemo(() => pickTier(resources), [resources])
|
||||
const candidates = tier.list
|
||||
|
||||
// Verify candidates exist in the live gallery. One search per name (the tier
|
||||
// has at most a handful) keeps this resilient to gallery customization.
|
||||
useEffect(() => {
|
||||
let cancelled = false
|
||||
const names = [...new Set(candidates.map(c => c.name))]
|
||||
Promise.all(names.map(name =>
|
||||
modelsApi.list({ search: name, page: 1 })
|
||||
.then(data => (data?.models || []).some(m => (m.name || m.id) === name) ? name : null)
|
||||
.catch(() => null)
|
||||
)).then(found => {
|
||||
if (cancelled) return
|
||||
const hits = found.filter(Boolean)
|
||||
// If verification yielded nothing (e.g. gallery unreachable), fall back to
|
||||
// showing the curated list rather than an empty widget.
|
||||
setAvailable(hits.length > 0 ? new Set(hits) : null)
|
||||
})
|
||||
return () => { cancelled = true }
|
||||
}, [candidates])
|
||||
|
||||
const visible = available === null
|
||||
? candidates
|
||||
: candidates.filter(c => available.has(c.name))
|
||||
|
||||
if (visible.length === 0) return null
|
||||
|
||||
const install = async (name) => {
|
||||
setInstalling(prev => new Set(prev).add(name))
|
||||
try {
|
||||
await modelsApi.install(name)
|
||||
addToast?.(t('starters.installStarted', { model: name }), 'success')
|
||||
onInstallStarted?.(name)
|
||||
} catch (err) {
|
||||
addToast?.(t('starters.installFailed', { message: err.message }), 'error')
|
||||
setInstalling(prev => {
|
||||
const next = new Set(prev)
|
||||
next.delete(name)
|
||||
return next
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
return (
|
||||
<section className="home-starters card">
|
||||
<div className="home-starters-head">
|
||||
<strong>{t('starters.title')}</strong>
|
||||
<span className="home-starters-tier">
|
||||
<i className={`fas ${tier.id === 'cpu' ? 'fa-memory' : 'fa-microchip'}`} aria-hidden="true" />
|
||||
{t(`starters.tier.${tier.id}`)}
|
||||
</span>
|
||||
</div>
|
||||
<p className="home-starters-sub">
|
||||
{tier.id === 'cpu' ? t('starters.cpuNote') : t('starters.gpuNote')}
|
||||
</p>
|
||||
<ul className="home-starters-list">
|
||||
{visible.map(c => {
|
||||
const busy = installing.has(c.name)
|
||||
return (
|
||||
<li key={c.name} className="home-starters-item">
|
||||
<span className="home-starters-name">{c.name}</span>
|
||||
<span className="home-starters-size">{c.size}</span>
|
||||
<button
|
||||
type="button"
|
||||
className="btn btn-primary btn-sm"
|
||||
disabled={busy}
|
||||
onClick={() => install(c.name)}
|
||||
>
|
||||
{busy
|
||||
? (<><i className="fas fa-spinner fa-spin" aria-hidden="true" /> {t('starters.installing')}</>)
|
||||
: (<><i className="fas fa-download" aria-hidden="true" /> {t('starters.install')}</>)}
|
||||
</button>
|
||||
</li>
|
||||
)
|
||||
})}
|
||||
</ul>
|
||||
</section>
|
||||
)
|
||||
}
|
||||
@@ -1,96 +0,0 @@
|
||||
import { useNavigate } from 'react-router-dom'
|
||||
import { useTranslation } from 'react-i18next'
|
||||
import { useAuth } from '../context/AuthContext'
|
||||
import { useDeployment } from '../contexts/DeploymentContext'
|
||||
import { useTheme } from '../contexts/ThemeContext'
|
||||
import { launchAssistantChat } from '../utils/launchAssistantChat'
|
||||
import TokenUsageMeter from './navbar/TokenUsageMeter'
|
||||
|
||||
// Desktop top bar. Complementary to the mobile-only header in App.jsx: this is
|
||||
// hidden on small screens (see .top-navbar CSS) and shows deployment/admin
|
||||
// affordances on wide screens where the sidebar footer is far from the content.
|
||||
export default function TopNavbar() {
|
||||
const { t } = useTranslation('nav')
|
||||
const navigate = useNavigate()
|
||||
const { isAdmin, authEnabled, user } = useAuth()
|
||||
const { features, distributed, p2pEnabled } = useDeployment()
|
||||
const { theme, toggleTheme } = useTheme()
|
||||
|
||||
const modeLabel = distributed
|
||||
? t('topbar.modeDistributed')
|
||||
: p2pEnabled
|
||||
? t('topbar.modeSwarm')
|
||||
: t('topbar.modeSingle')
|
||||
|
||||
const showAssistantJump = isAdmin && !!features.localai_assistant
|
||||
const showAvatar = authEnabled && user
|
||||
const themeLabel = theme === 'dark' ? t('switchToLightMode') : t('switchToDarkMode')
|
||||
|
||||
return (
|
||||
<div className="top-navbar" role="navigation" aria-label={t('topbar.label')}>
|
||||
<div className="top-navbar__left">
|
||||
{isAdmin && (
|
||||
<span className={`top-navbar__mode ${distributed || p2pEnabled ? 'is-active' : ''}`}>
|
||||
<i className="fas fa-circle-nodes" aria-hidden="true" /> {modeLabel}
|
||||
</span>
|
||||
)}
|
||||
</div>
|
||||
<div className="top-navbar__right">
|
||||
{!isAdmin && (
|
||||
<button
|
||||
type="button"
|
||||
className="top-navbar__btn"
|
||||
onClick={() => navigate('/app/chat')}
|
||||
title={t('topbar.pickModel')}
|
||||
>
|
||||
<i className="fas fa-cube" aria-hidden="true" /> {t('topbar.pickModel')}
|
||||
</button>
|
||||
)}
|
||||
{showAssistantJump && (
|
||||
<button
|
||||
type="button"
|
||||
className="top-navbar__btn top-navbar__assistant"
|
||||
onClick={() => launchAssistantChat(navigate)}
|
||||
title={t('topbar.adminViaChat')}
|
||||
>
|
||||
<i className="fas fa-user-shield" aria-hidden="true" /> {t('topbar.adminViaChat')}
|
||||
</button>
|
||||
)}
|
||||
{isAdmin && <TokenUsageMeter />}
|
||||
{isAdmin && (
|
||||
<button
|
||||
type="button"
|
||||
className="top-navbar__icon"
|
||||
onClick={() => navigate('/app/settings')}
|
||||
aria-label={t('items.settings')}
|
||||
title={t('items.settings')}
|
||||
>
|
||||
<i className="fas fa-cog" aria-hidden="true" />
|
||||
</button>
|
||||
)}
|
||||
<button
|
||||
type="button"
|
||||
className="top-navbar__icon"
|
||||
onClick={toggleTheme}
|
||||
aria-label={themeLabel}
|
||||
title={themeLabel}
|
||||
>
|
||||
<i className={`fas ${theme === 'dark' ? 'fa-sun' : 'fa-moon'}`} aria-hidden="true" />
|
||||
</button>
|
||||
{showAvatar && (
|
||||
<button
|
||||
type="button"
|
||||
className="top-navbar__icon top-navbar__avatar"
|
||||
onClick={() => navigate('/app/account')}
|
||||
aria-label={user.name || user.email}
|
||||
title={user.name || user.email}
|
||||
>
|
||||
{user.avatarUrl
|
||||
? <img src={user.avatarUrl} alt="" />
|
||||
: <i className="fas fa-user-circle" aria-hidden="true" />}
|
||||
</button>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
)
|
||||
}
|
||||
@@ -1,52 +0,0 @@
|
||||
import { useState, useEffect } from 'react'
|
||||
import { useNavigate } from 'react-router-dom'
|
||||
import { useTranslation } from 'react-i18next'
|
||||
import { usageApi } from '../../utils/api'
|
||||
|
||||
// Compact admin-only usage glance: today's total tokens, optionally against a
|
||||
// quota cap, linking to the full /app/usage page. Self-contained data fetch so
|
||||
// a usage-API failure cannot break the navbar - it just renders nothing.
|
||||
// Adds up `total_tokens` across the usage buckets of a response.
//
// The bucket list is taken from `res.buckets`, else `res.usage`, else `res`
// itself when it is an array. Returns null when no non-empty bucket array is
// found; buckets without a `total_tokens` value count as 0.
function sumTotalTokens(res) {
  let buckets = res?.buckets || res?.usage
  if (!buckets) {
    buckets = Array.isArray(res) ? res : []
  }
  if (!Array.isArray(buckets) || buckets.length === 0) return null

  let total = 0
  for (const bucket of buckets) {
    total += bucket.total_tokens || 0
  }
  return total
}
|
||||
|
||||
export default function TokenUsageMeter() {
|
||||
const { t } = useTranslation('nav')
|
||||
const navigate = useNavigate()
|
||||
const [tokens, setTokens] = useState(null)
|
||||
const [cap, setCap] = useState(null)
|
||||
|
||||
useEffect(() => {
|
||||
let cancelled = false
|
||||
usageApi.getAdminUsage('day')
|
||||
.then(res => { if (!cancelled) setTokens(sumTotalTokens(res)) })
|
||||
.catch(() => { if (!cancelled) setTokens(null) })
|
||||
usageApi.getMyQuotas()
|
||||
.then(q => { if (!cancelled) setCap(q?.token_limit || q?.tokens?.limit || null) })
|
||||
.catch(() => { if (!cancelled) setCap(null) })
|
||||
return () => { cancelled = true }
|
||||
}, [])
|
||||
|
||||
if (tokens === null) return null
|
||||
|
||||
const pct = cap ? Math.min(100, Math.round((tokens / cap) * 100)) : null
|
||||
|
||||
return (
|
||||
<button
|
||||
type="button"
|
||||
className="top-navbar__meter"
|
||||
onClick={() => navigate('/app/usage')}
|
||||
title={t('topbar.usageDetail')}
|
||||
>
|
||||
<span className="top-navbar__meter-label">
|
||||
{t('topbar.tokensToday')}: {Intl.NumberFormat().format(tokens)}
|
||||
{cap ? ` / ${Intl.NumberFormat().format(cap)}` : ''}
|
||||
</span>
|
||||
{pct !== null && (
|
||||
<span className="top-navbar__meter-bar"><i style={{ width: `${pct}%` }} /></span>
|
||||
)}
|
||||
</button>
|
||||
)
|
||||
}
|
||||
@@ -1,55 +0,0 @@
|
||||
import { createContext, useContext, useState, useEffect } from 'react'
|
||||
import { apiUrl } from '../utils/basePath'
|
||||
import { p2pApi } from '../utils/api'
|
||||
|
||||
const DeploymentContext = createContext(null)
|
||||
|
||||
// One shared fetch of the deployment-shape signals the adaptive UI keys off:
|
||||
// server features (/api/features) and whether a P2P network token exists.
|
||||
// Components used to fetch /api/features independently (Sidebar, Home); this
|
||||
// centralises it so the landing resolver, sidebar policy, and navbar agree on
|
||||
// one snapshot and we issue a single request.
|
||||
export function DeploymentProvider({ children }) {
|
||||
const [features, setFeatures] = useState({})
|
||||
const [p2pEnabled, setP2pEnabled] = useState(false)
|
||||
const [loading, setLoading] = useState(true)
|
||||
|
||||
useEffect(() => {
|
||||
let cancelled = false
|
||||
const featuresP = fetch(apiUrl('/api/features'))
|
||||
.then(r => r.json())
|
||||
.catch(() => ({}))
|
||||
// P2P has no /api/features flag: it is "enabled" when a network token
|
||||
// exists (mirrors pages/P2P.jsx). A 404/disabled endpoint throws and we
|
||||
// treat that as not-enabled.
|
||||
const p2pP = p2pApi.getToken()
|
||||
.then(tok => (typeof tok === 'string' ? tok : (tok?.token || '')).trim())
|
||||
.catch(() => '')
|
||||
Promise.all([featuresP, p2pP]).then(([f, tok]) => {
|
||||
if (cancelled) return
|
||||
setFeatures(f || {})
|
||||
setP2pEnabled(!!tok)
|
||||
setLoading(false)
|
||||
})
|
||||
return () => { cancelled = true }
|
||||
}, [])
|
||||
|
||||
const value = {
|
||||
features,
|
||||
distributed: !!features.distributed,
|
||||
p2pEnabled,
|
||||
loading,
|
||||
}
|
||||
|
||||
return (
|
||||
<DeploymentContext.Provider value={value}>
|
||||
{children}
|
||||
</DeploymentContext.Provider>
|
||||
)
|
||||
}
|
||||
|
||||
// Returns the deployment snapshot from the nearest DeploymentProvider.
// Throws when called outside of one, so a missing provider fails loudly.
export function useDeployment() {
  const deployment = useContext(DeploymentContext)
  if (deployment) return deployment
  throw new Error('useDeployment must be used within DeploymentProvider')
}
|
||||
66
core/http/react-ui/src/hooks/usePolling.js
vendored
Normal file
66
core/http/react-ui/src/hooks/usePolling.js
vendored
Normal file
@@ -0,0 +1,66 @@
|
||||
import { useEffect, useRef, useCallback } from 'react'
|
||||
|
||||
// usePolling runs `fn` immediately (unless `immediate` is false) and then on a
// fixed interval, with two behaviours every hand-rolled setInterval in this app
// was missing:
//
//   1. Visibility-aware: the timer pauses while the tab is hidden
//      (document.hidden) and fires an immediate catch-up poll when the tab
//      becomes visible again. A backgrounded dashboard no longer hammers the
//      server every few seconds for data nobody is looking at.
//   2. Non-overlapping: while a previous call to `fn` is still pending, further
//      ticks are skipped (not queued), so slow requests never stack up.
//
// Parameters:
//   fn          function to poll; may return a promise. The latest `fn` passed
//               is always the one invoked, without restarting the timer.
//   intervalMs  delay between ticks in milliseconds (default 5000).
//   options     { enabled = true, immediate = true }. `enabled: false` stops
//               polling entirely (one-shot or gated polls).
//
// Returns `{ refetch }`. `refetch` runs `fn` on demand and is stable across
// renders; it resolves with fn's result, rejects with fn's error, and resolves
// with undefined without calling `fn` when a poll is already in flight.
export function usePolling(fn, intervalMs = 5000, { enabled = true, immediate = true } = {}) {
  // Keep the newest callback in a ref so the effect below does not need `fn`
  // as a dependency.
  const fnRef = useRef(fn)
  fnRef.current = fn

  const runningRef = useRef(false)
  const refetch = useCallback(async () => {
    // Guard against overlap: a slow poll shouldn't pile up behind a fast timer.
    if (runningRef.current) return
    runningRef.current = true
    try {
      return await fnRef.current()
    } finally {
      runningRef.current = false
    }
  }, [])

  useEffect(() => {
    if (!enabled) return
    let timer = null

    // Background ticks have no caller to hand a failure to. Without this catch
    // a rejecting `fn` would raise an unhandled promise rejection on every
    // interval. Report it and let the next tick try again; on-demand `refetch`
    // callers still receive the rejection themselves.
    const tick = () => {
      refetch().catch(err => {
        console.error('usePolling: poll failed', err)
      })
    }

    const start = () => {
      if (timer != null) return
      timer = setInterval(tick, intervalMs)
    }
    const stop = () => {
      if (timer != null) { clearInterval(timer); timer = null }
    }

    const onVisibility = () => {
      if (document.hidden) {
        stop()
      } else {
        // Catch up immediately on return, then resume the cadence.
        tick()
        start()
      }
    }

    if (immediate) tick()
    if (!document.hidden) start()
    document.addEventListener('visibilitychange', onVisibility)

    return () => {
      stop()
      document.removeEventListener('visibilitychange', onVisibility)
    }
  }, [enabled, intervalMs, immediate, refetch])

  return { refetch }
}
|
||||
17
core/http/react-ui/src/hooks/useResources.js
vendored
17
core/http/react-ui/src/hooks/useResources.js
vendored
@@ -1,11 +1,11 @@
|
||||
import { useState, useEffect, useCallback, useRef } from 'react'
|
||||
import { useState, useCallback } from 'react'
|
||||
import { resourcesApi } from '../utils/api'
|
||||
import { usePolling } from './usePolling'
|
||||
|
||||
export function useResources(pollInterval = 5000) {
|
||||
const [resources, setResources] = useState(null)
|
||||
const [loading, setLoading] = useState(true)
|
||||
const [error, setError] = useState(null)
|
||||
const intervalRef = useRef(null)
|
||||
|
||||
const fetchResources = useCallback(async () => {
|
||||
try {
|
||||
@@ -19,13 +19,10 @@ export function useResources(pollInterval = 5000) {
|
||||
}
|
||||
}, [])
|
||||
|
||||
useEffect(() => {
|
||||
fetchResources()
|
||||
intervalRef.current = setInterval(fetchResources, pollInterval)
|
||||
return () => {
|
||||
if (intervalRef.current) clearInterval(intervalRef.current)
|
||||
}
|
||||
}, [fetchResources, pollInterval])
|
||||
// Visibility-aware polling: pauses while the tab is hidden and catches up on
|
||||
// return (see usePolling). Resource stats are pure dashboard data, so there's
|
||||
// no reason to keep fetching them for a backgrounded tab.
|
||||
const { refetch } = usePolling(fetchResources, pollInterval)
|
||||
|
||||
return { resources, loading, error, refetch: fetchResources }
|
||||
return { resources, loading, error, refetch }
|
||||
}
|
||||
|
||||
@@ -4,7 +4,6 @@ import { RouterProvider } from 'react-router-dom'
|
||||
import { ThemeProvider } from './contexts/ThemeContext'
|
||||
import { BrandingProvider } from './contexts/BrandingContext'
|
||||
import { AuthProvider } from './context/AuthContext'
|
||||
import { DeploymentProvider } from './contexts/DeploymentContext'
|
||||
import { OperationsProvider } from './contexts/OperationsContext'
|
||||
import { router } from './router'
|
||||
import './i18n'
|
||||
@@ -33,11 +32,9 @@ createRoot(document.getElementById('root')).render(
|
||||
<ThemeProvider>
|
||||
<BrandingProvider>
|
||||
<AuthProvider>
|
||||
<DeploymentProvider>
|
||||
<OperationsProvider>
|
||||
<RouterProvider router={router} />
|
||||
</OperationsProvider>
|
||||
</DeploymentProvider>
|
||||
<OperationsProvider>
|
||||
<RouterProvider router={router} />
|
||||
</OperationsProvider>
|
||||
</AuthProvider>
|
||||
</BrandingProvider>
|
||||
</ThemeProvider>
|
||||
|
||||
@@ -765,8 +765,10 @@ export default function AgentChat() {
|
||||
className="chat-send-btn"
|
||||
onClick={handleSend}
|
||||
disabled={processing || !input.trim()}
|
||||
aria-label="Send message"
|
||||
title="Send message"
|
||||
>
|
||||
<i className="fas fa-paper-plane" />
|
||||
<i className="fas fa-paper-plane" aria-hidden="true" />
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
@@ -541,73 +541,58 @@ export default function Chat() {
|
||||
updateChatSettings(activeChat.id, { clientMCPServers: next })
|
||||
}, [activeChat, updateChatSettings])
|
||||
|
||||
// Load initial message / assistant launch from the Home page or the navbar
|
||||
// quick-jump. Factored into a callback so both the mount-time reader and the
|
||||
// navbar re-trigger event below consume the same payload through one path.
|
||||
// Load initial message from home page
|
||||
const homeDataProcessed = useRef(false)
|
||||
const consumeHomeChatData = useCallback(() => {
|
||||
const stored = localStorage.getItem('localai_index_chat_data')
|
||||
if (!stored) return
|
||||
try {
|
||||
const data = JSON.parse(stored)
|
||||
localStorage.removeItem('localai_index_chat_data')
|
||||
|
||||
// Two entry shapes from Home:
|
||||
// - "compose-and-send": data.message present → open new chat,
|
||||
// prefill the composer, click submit.
|
||||
// - "open-assistant": no message, just data.localaiAssistant → open
|
||||
// a fresh chat already in admin mode so the wizard can fire.
|
||||
const hasMessage = !!data.message
|
||||
const wantsAssistant = !!data.localaiAssistant
|
||||
|
||||
if (hasMessage || wantsAssistant) {
|
||||
let targetChat = activeChat
|
||||
if (data.newChat) {
|
||||
targetChat = addChat(data.model || '', '', data.mcpMode || false)
|
||||
} else {
|
||||
if (data.model && activeChat) {
|
||||
updateChatSettings(activeChat.id, { model: data.model })
|
||||
}
|
||||
if (data.mcpMode && activeChat) {
|
||||
updateChatSettings(activeChat.id, { mcpMode: true })
|
||||
}
|
||||
}
|
||||
if (data.mcpServers?.length > 0 && targetChat) {
|
||||
updateChatSettings(targetChat.id, { mcpServers: data.mcpServers })
|
||||
}
|
||||
if (data.clientMCPServers?.length > 0 && targetChat) {
|
||||
updateChatSettings(targetChat.id, { clientMCPServers: data.clientMCPServers })
|
||||
}
|
||||
if (wantsAssistant && targetChat) {
|
||||
updateChatSettings(targetChat.id, { localaiAssistant: true })
|
||||
}
|
||||
if (hasMessage) {
|
||||
setInput(data.message)
|
||||
if (data.files) setFiles(data.files)
|
||||
setTimeout(() => {
|
||||
const submitBtn = document.getElementById('chat-submit-btn')
|
||||
submitBtn?.click()
|
||||
}, 100)
|
||||
}
|
||||
}
|
||||
} catch (_e) { /* ignore */ }
|
||||
}, [activeChat, addChat, updateChatSettings])
|
||||
|
||||
useEffect(() => {
|
||||
if (homeDataProcessed.current) return
|
||||
homeDataProcessed.current = true
|
||||
consumeHomeChatData()
|
||||
}, [consumeHomeChatData])
|
||||
const stored = localStorage.getItem('localai_index_chat_data')
|
||||
if (stored) {
|
||||
homeDataProcessed.current = true
|
||||
try {
|
||||
const data = JSON.parse(stored)
|
||||
localStorage.removeItem('localai_index_chat_data')
|
||||
|
||||
// Admins can re-trigger the assistant jump from the navbar while already on
|
||||
// the chat page; navigate('/app/chat') does not remount Chat, so the
|
||||
// mount-time reader above never fires. The launcher dispatches this event
|
||||
// after writing the payload so we re-consume it and open a fresh assistant.
|
||||
useEffect(() => {
|
||||
const onOpenAssistant = () => consumeHomeChatData()
|
||||
window.addEventListener('localai-open-assistant', onOpenAssistant)
|
||||
return () => window.removeEventListener('localai-open-assistant', onOpenAssistant)
|
||||
}, [consumeHomeChatData])
|
||||
// Two entry shapes from Home:
|
||||
// - "compose-and-send": data.message present → open new chat,
|
||||
// prefill the composer, click submit.
|
||||
// - "open-assistant": no message, just data.localaiAssistant → open
|
||||
// a fresh chat already in admin mode so the wizard can fire.
|
||||
const hasMessage = !!data.message
|
||||
const wantsAssistant = !!data.localaiAssistant
|
||||
|
||||
if (hasMessage || wantsAssistant) {
|
||||
let targetChat = activeChat
|
||||
if (data.newChat) {
|
||||
targetChat = addChat(data.model || '', '', data.mcpMode || false)
|
||||
} else {
|
||||
if (data.model && activeChat) {
|
||||
updateChatSettings(activeChat.id, { model: data.model })
|
||||
}
|
||||
if (data.mcpMode && activeChat) {
|
||||
updateChatSettings(activeChat.id, { mcpMode: true })
|
||||
}
|
||||
}
|
||||
if (data.mcpServers?.length > 0 && targetChat) {
|
||||
updateChatSettings(targetChat.id, { mcpServers: data.mcpServers })
|
||||
}
|
||||
if (data.clientMCPServers?.length > 0 && targetChat) {
|
||||
updateChatSettings(targetChat.id, { clientMCPServers: data.clientMCPServers })
|
||||
}
|
||||
if (wantsAssistant && targetChat) {
|
||||
updateChatSettings(targetChat.id, { localaiAssistant: true })
|
||||
}
|
||||
if (hasMessage) {
|
||||
setInput(data.message)
|
||||
if (data.files) setFiles(data.files)
|
||||
setTimeout(() => {
|
||||
const submitBtn = document.getElementById('chat-submit-btn')
|
||||
submitBtn?.click()
|
||||
}, 100)
|
||||
}
|
||||
}
|
||||
} catch (_e) { /* ignore */ }
|
||||
}
|
||||
}, [])
|
||||
|
||||
// Track whether the user is pinned to the bottom. If they scroll up
|
||||
// while a response is streaming, stop forcing them back down.
|
||||
@@ -1442,8 +1427,10 @@ export default function Chat() {
|
||||
className="chat-send-btn"
|
||||
onClick={handleSend}
|
||||
disabled={!input.trim() && files.length === 0}
|
||||
aria-label={t('input.send')}
|
||||
title={t('input.send')}
|
||||
>
|
||||
<i className="fas fa-paper-plane" />
|
||||
<i className="fas fa-paper-plane" aria-hidden="true" />
|
||||
</button>
|
||||
)}
|
||||
</div>
|
||||
|
||||
@@ -10,14 +10,15 @@ import UnifiedMCPDropdown from '../components/UnifiedMCPDropdown'
|
||||
import ConfirmDialog from '../components/ConfirmDialog'
|
||||
import HomeConnect from '../components/HomeConnect'
|
||||
import { useResources } from '../hooks/useResources'
|
||||
import { usePolling } from '../hooks/usePolling'
|
||||
import { fileToBase64, backendControlApi, systemApi, modelsApi, mcpApi, nodesApi } from '../utils/api'
|
||||
import { API_CONFIG } from '../utils/config'
|
||||
import { greetingKey } from '../utils/greeting'
|
||||
import { launchAssistantChat } from '../utils/launchAssistantChat'
|
||||
import StatusPill from '../components/StatusPill'
|
||||
import Skeleton from '../components/Skeleton'
|
||||
import SectionHeading from '../components/SectionHeading'
|
||||
import EmptyState from '../components/EmptyState'
|
||||
import StarterModels from '../components/StarterModels'
|
||||
import { staggerStyle } from '../hooks/useStagger'
|
||||
|
||||
export default function Home() {
|
||||
@@ -69,40 +70,36 @@ export default function Home() {
|
||||
.catch(() => {})
|
||||
}, [])
|
||||
|
||||
// Poll cluster node data in distributed mode
|
||||
useEffect(() => {
|
||||
if (!distributedMode) return
|
||||
const fetchCluster = async () => {
|
||||
try {
|
||||
const data = await nodesApi.list()
|
||||
const nodes = Array.isArray(data) ? data : []
|
||||
const backendNodes = nodes.filter(n => !n.node_type || n.node_type === 'backend')
|
||||
const totalVRAM = backendNodes.reduce((sum, n) => sum + (n.total_vram || 0), 0)
|
||||
const usedVRAM = backendNodes.reduce((sum, n) => {
|
||||
if (n.total_vram && n.available_vram != null) return sum + (n.total_vram - n.available_vram)
|
||||
return sum
|
||||
}, 0)
|
||||
const totalRAM = backendNodes.reduce((sum, n) => sum + (n.total_ram || 0), 0)
|
||||
const usedRAM = backendNodes.reduce((sum, n) => {
|
||||
if (n.total_ram && n.available_ram != null) return sum + (n.total_ram - n.available_ram)
|
||||
return sum
|
||||
}, 0)
|
||||
const isGPU = totalVRAM > 0
|
||||
const healthyCount = backendNodes.filter(n => n.status === 'healthy').length
|
||||
const totalCount = backendNodes.length
|
||||
setClusterData({
|
||||
totalMem: isGPU ? totalVRAM : totalRAM,
|
||||
usedMem: isGPU ? usedVRAM : usedRAM,
|
||||
isGPU,
|
||||
healthyCount,
|
||||
totalCount,
|
||||
})
|
||||
} catch { setClusterData(null) }
|
||||
}
|
||||
fetchCluster()
|
||||
const interval = setInterval(fetchCluster, 5000)
|
||||
return () => clearInterval(interval)
|
||||
}, [distributedMode])
|
||||
// Poll cluster node data in distributed mode. Visibility-aware + gated on
|
||||
// distributedMode so a non-distributed or backgrounded tab makes no calls.
|
||||
const fetchCluster = useCallback(async () => {
|
||||
try {
|
||||
const data = await nodesApi.list()
|
||||
const nodes = Array.isArray(data) ? data : []
|
||||
const backendNodes = nodes.filter(n => !n.node_type || n.node_type === 'backend')
|
||||
const totalVRAM = backendNodes.reduce((sum, n) => sum + (n.total_vram || 0), 0)
|
||||
const usedVRAM = backendNodes.reduce((sum, n) => {
|
||||
if (n.total_vram && n.available_vram != null) return sum + (n.total_vram - n.available_vram)
|
||||
return sum
|
||||
}, 0)
|
||||
const totalRAM = backendNodes.reduce((sum, n) => sum + (n.total_ram || 0), 0)
|
||||
const usedRAM = backendNodes.reduce((sum, n) => {
|
||||
if (n.total_ram && n.available_ram != null) return sum + (n.total_ram - n.available_ram)
|
||||
return sum
|
||||
}, 0)
|
||||
const isGPU = totalVRAM > 0
|
||||
const healthyCount = backendNodes.filter(n => n.status === 'healthy').length
|
||||
const totalCount = backendNodes.length
|
||||
setClusterData({
|
||||
totalMem: isGPU ? totalVRAM : totalRAM,
|
||||
usedMem: isGPU ? usedVRAM : usedRAM,
|
||||
isGPU,
|
||||
healthyCount,
|
||||
totalCount,
|
||||
})
|
||||
} catch { setClusterData(null) }
|
||||
}, [])
|
||||
usePolling(fetchCluster, 5000, { enabled: distributedMode })
|
||||
|
||||
// Fetch configured models (to know if any exist) and loaded models (currently running)
|
||||
const fetchSystemInfo = useCallback(async () => {
|
||||
@@ -124,11 +121,7 @@ export default function Home() {
|
||||
}
|
||||
}, [])
|
||||
|
||||
useEffect(() => {
|
||||
fetchSystemInfo()
|
||||
const interval = setInterval(fetchSystemInfo, 5000)
|
||||
return () => clearInterval(interval)
|
||||
}, [fetchSystemInfo])
|
||||
usePolling(fetchSystemInfo, 5000)
|
||||
|
||||
// Check MCP availability when selected model changes
|
||||
useEffect(() => {
|
||||
@@ -229,8 +222,16 @@ export default function Home() {
|
||||
// requiring an initial message or model selection. Useful when an admin
|
||||
// wants to start the assistant from a cold home page.
|
||||
const openAssistantChat = useCallback(() => {
|
||||
launchAssistantChat(navigate, selectedModel)
|
||||
const chatData = {
|
||||
model: selectedModel || '',
|
||||
mcpMode: false,
|
||||
localaiAssistant: true,
|
||||
newChat: true,
|
||||
}
|
||||
localStorage.setItem('localai_index_chat_data', JSON.stringify(chatData))
|
||||
try { localStorage.setItem('localai_assistant_used', '1') } catch { /* ignore */ }
|
||||
setAssistantUsed(true)
|
||||
navigate('/app/chat')
|
||||
}, [navigate, selectedModel])
|
||||
|
||||
const handleSubmit = (e) => {
|
||||
@@ -516,6 +517,8 @@ export default function Home() {
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<StarterModels addToast={addToast} onInstallStarted={fetchSystemInfo} />
|
||||
|
||||
<div className="home-wizard-actions">
|
||||
<button className="btn btn-primary" onClick={() => navigate('/app/models')}>
|
||||
<i className="fas fa-store" /> {t('wizard.browseGallery')}
|
||||
|
||||
@@ -24,7 +24,37 @@ function formatNumber(n) {
|
||||
return String(n)
|
||||
}
|
||||
|
||||
function StatCard({ icon, label, value, muted }) {
|
||||
// Opt-in token pricing. LocalAI is self-hosted and has no inherent monetary
|
||||
// cost, but multi-user deployments use estimated cost for chargeback/budgeting.
|
||||
// Prices are admin-supplied $ per 1M tokens, stored locally (per-browser), and
|
||||
// the whole cost surface stays hidden until a non-zero price is set.
|
||||
const TOKEN_PRICING_KEY = 'localai_token_pricing'
|
||||
|
||||
function loadPricing() {
|
||||
try {
|
||||
const p = JSON.parse(localStorage.getItem(TOKEN_PRICING_KEY) || '{}')
|
||||
return { prompt: Number(p.prompt) || 0, completion: Number(p.completion) || 0 }
|
||||
} catch { return { prompt: 0, completion: 0 } }
|
||||
}
|
||||
|
||||
function savePricing(p) {
|
||||
try { localStorage.setItem(TOKEN_PRICING_KEY, JSON.stringify(p)) } catch { /* ignore */ }
|
||||
}
|
||||
|
||||
function pricingEnabled(p) { return (p?.prompt || 0) > 0 || (p?.completion || 0) > 0 }
|
||||
|
||||
function costOf(row, p) {
|
||||
return (row.prompt_tokens / 1_000_000) * (p.prompt || 0)
|
||||
+ (row.completion_tokens / 1_000_000) * (p.completion || 0)
|
||||
}
|
||||
|
||||
function formatCost(n) {
|
||||
if (!n) return '$0.00'
|
||||
if (n < 0.01) return '<$0.01'
|
||||
return '$' + n.toFixed(2)
|
||||
}
|
||||
|
||||
function StatCard({ icon, label, value, muted, text }) {
|
||||
return (
|
||||
<div className="card" style={{ padding: 'var(--spacing-sm) var(--spacing-md)', flex: '1 1 0', minWidth: 120, opacity: muted ? 0.7 : 1 }}>
|
||||
<div style={{ display: 'flex', alignItems: 'center', gap: 6, marginBottom: 2 }}>
|
||||
@@ -32,7 +62,7 @@ function StatCard({ icon, label, value, muted }) {
|
||||
<span style={{ fontSize: '0.6875rem', color: 'var(--color-text-muted)', fontWeight: 500, textTransform: 'uppercase', letterSpacing: '0.03em' }}>{label}</span>
|
||||
</div>
|
||||
<div style={{ fontSize: '1.375rem', fontWeight: 700, fontFamily: 'var(--font-mono)', color: muted ? 'var(--color-text-secondary)' : 'var(--color-text-primary)' }}>
|
||||
{muted ? '~' : ''}{formatNumber(value)}
|
||||
{text != null ? text : `${muted ? '~' : ''}${formatNumber(value)}`}
|
||||
</div>
|
||||
</div>
|
||||
)
|
||||
@@ -642,6 +672,10 @@ export default function Usage() {
|
||||
const [activeTab, setActiveTab] = useState('models')
|
||||
const [quotas, setQuotas] = useState([])
|
||||
const [selectedUserId, setSelectedUserId] = useState(null)
|
||||
const [pricing, setPricingState] = useState(loadPricing)
|
||||
const [showPricing, setShowPricing] = useState(false)
|
||||
const setPricing = (p) => { setPricingState(p); savePricing(p) }
|
||||
const costEnabled = pricingEnabled(pricing)
|
||||
|
||||
const fetchUsage = useCallback(async () => {
|
||||
setLoading(true)
|
||||
@@ -743,11 +777,50 @@ export default function Usage() {
|
||||
<i className="fas fa-key" style={{ fontSize: '0.7rem' }} /> {t('usage.sources.tab')}
|
||||
</button>
|
||||
<div style={{ flex: 1 }} />
|
||||
<button
|
||||
className={`btn btn-sm ${costEnabled ? 'btn-primary' : 'btn-secondary'}`}
|
||||
onClick={() => setShowPricing(v => !v)}
|
||||
style={{ gap: 4 }}
|
||||
title="Set token pricing to estimate cost"
|
||||
>
|
||||
<i className="fas fa-dollar-sign" /> {costEnabled ? 'Pricing' : 'Set pricing'}
|
||||
</button>
|
||||
<button className="btn btn-secondary btn-sm" onClick={fetchUsage} disabled={loading} style={{ gap: 4 }}>
|
||||
<i className={`fas fa-rotate${loading ? ' fa-spin' : ''}`} /> Refresh
|
||||
</button>
|
||||
</div>
|
||||
|
||||
{showPricing && (
|
||||
<div className="card" style={{ display: 'flex', alignItems: 'flex-end', gap: 'var(--spacing-md)', flexWrap: 'wrap', padding: 'var(--spacing-md)', marginBottom: 'var(--spacing-md)' }}>
|
||||
<div style={{ display: 'flex', flexDirection: 'column', gap: 2 }}>
|
||||
<label style={{ fontSize: '0.6875rem', color: 'var(--color-text-muted)', textTransform: 'uppercase', letterSpacing: '0.03em' }}>Prompt $/1M tokens</label>
|
||||
<input
|
||||
className="input" type="number" min="0" step="0.01" style={{ width: 140 }}
|
||||
value={pricing.prompt || ''}
|
||||
placeholder="0.00"
|
||||
onChange={e => setPricing({ ...pricing, prompt: Number(e.target.value) || 0 })}
|
||||
/>
|
||||
</div>
|
||||
<div style={{ display: 'flex', flexDirection: 'column', gap: 2 }}>
|
||||
<label style={{ fontSize: '0.6875rem', color: 'var(--color-text-muted)', textTransform: 'uppercase', letterSpacing: '0.03em' }}>Completion $/1M tokens</label>
|
||||
<input
|
||||
className="input" type="number" min="0" step="0.01" style={{ width: 140 }}
|
||||
value={pricing.completion || ''}
|
||||
placeholder="0.00"
|
||||
onChange={e => setPricing({ ...pricing, completion: Number(e.target.value) || 0 })}
|
||||
/>
|
||||
</div>
|
||||
{costEnabled && (
|
||||
<button className="btn btn-secondary btn-sm" onClick={() => setPricing({ prompt: 0, completion: 0 })} style={{ gap: 4 }}>
|
||||
<i className="fas fa-times" /> Clear
|
||||
</button>
|
||||
)}
|
||||
<span style={{ fontSize: '0.75rem', color: 'var(--color-text-muted)', flex: '1 1 200px' }}>
|
||||
Estimated cost only. Prices are stored in this browser and applied to recorded token counts.
|
||||
</span>
|
||||
</div>
|
||||
)}
|
||||
|
||||
{loading ? (
|
||||
<div style={{ display: 'flex', justifyContent: 'center', padding: 'var(--spacing-xl)' }}>
|
||||
<LoadingSpinner size="lg" />
|
||||
@@ -760,6 +833,9 @@ export default function Usage() {
|
||||
<StatCard icon="fas fa-arrow-up" label="Prompt" value={displayTotals.prompt_tokens} />
|
||||
<StatCard icon="fas fa-arrow-down" label="Completion" value={displayTotals.completion_tokens} />
|
||||
<StatCard icon="fas fa-coins" label="Total" value={displayTotals.total_tokens} />
|
||||
{costEnabled && (
|
||||
<StatCard icon="fas fa-dollar-sign" label="Est. Cost" text={formatCost(costOf(displayTotals, pricing))} />
|
||||
)}
|
||||
</div>
|
||||
|
||||
{/* Predictions */}
|
||||
@@ -789,6 +865,7 @@ export default function Usage() {
|
||||
<th style={{ width: 110 }}>Prompt</th>
|
||||
<th style={{ width: 110 }}>Completion</th>
|
||||
<th style={{ width: 110 }}>Total</th>
|
||||
{costEnabled && <th style={{ width: 100 }}>Est. Cost</th>}
|
||||
<th style={{ width: 140 }}></th>
|
||||
</tr>
|
||||
</thead>
|
||||
@@ -800,6 +877,7 @@ export default function Usage() {
|
||||
<td style={monoCell}>{formatNumber(row.prompt_tokens)}</td>
|
||||
<td style={monoCell}>{formatNumber(row.completion_tokens)}</td>
|
||||
<td style={{ ...monoCell, fontWeight: 600 }}>{formatNumber(row.total_tokens)}</td>
|
||||
{costEnabled && <td style={monoCell}>{formatCost(costOf(row, pricing))}</td>}
|
||||
<td><UsageBar value={row.total_tokens} max={maxTokens} /></td>
|
||||
</tr>
|
||||
))}
|
||||
@@ -827,6 +905,7 @@ export default function Usage() {
|
||||
<th style={{ width: 110 }}>Prompt</th>
|
||||
<th style={{ width: 110 }}>Completion</th>
|
||||
<th style={{ width: 110 }}>Total</th>
|
||||
{costEnabled && <th style={{ width: 100 }}>Est. Cost</th>}
|
||||
<th style={{ width: 110 }}>Proj. Total</th>
|
||||
<th style={{ width: 140 }}></th>
|
||||
</tr>
|
||||
@@ -849,6 +928,7 @@ export default function Usage() {
|
||||
<td style={monoCell}>{formatNumber(row.prompt_tokens)}</td>
|
||||
<td style={monoCell}>{formatNumber(row.completion_tokens)}</td>
|
||||
<td style={{ ...monoCell, fontWeight: 600 }}>{formatNumber(row.total_tokens)}</td>
|
||||
{costEnabled && <td style={monoCell}>{formatCost(costOf(row, pricing))}</td>}
|
||||
<td style={{ ...monoCell, color: 'var(--color-text-muted)', fontStyle: 'italic' }}>
|
||||
{up?.predictions ? `~${formatNumber(up.predictions.projectedTotals.total_tokens)}` : '-'}
|
||||
</td>
|
||||
@@ -856,7 +936,7 @@ export default function Usage() {
|
||||
</tr>
|
||||
{isExpanded && up && (
|
||||
<tr>
|
||||
<td colSpan={8} style={{ padding: 0, background: 'var(--color-bg-secondary)' }}>
|
||||
<td colSpan={costEnabled ? 9 : 8} style={{ padding: 0, background: 'var(--color-bg-secondary)' }}>
|
||||
<div style={{ padding: 'var(--spacing-md)' }}>
|
||||
{up.predictions && (
|
||||
<div style={{ display: 'grid', gridTemplateColumns: 'repeat(auto-fit, minmax(100px, 1fr))', gap: 'var(--spacing-xs)', marginBottom: 'var(--spacing-sm)' }}>
|
||||
|
||||
@@ -6,7 +6,6 @@ import RequireAdmin from './components/RequireAdmin'
|
||||
import RequireAuth from './components/RequireAuth'
|
||||
import RequireAuthEnabled from './components/RequireAuthEnabled'
|
||||
import RequireFeature from './components/RequireFeature'
|
||||
import HomeRoute from './components/HomeRoute'
|
||||
|
||||
// Pages are code-split: each becomes its own chunk loaded on demand, so a route
|
||||
// no longer drags every other page (and its heavy deps — CodeMirror, the MCP
|
||||
@@ -33,7 +32,7 @@ export function preloadRoute(path) {
|
||||
preloaders[m[1] ?? '']?.().catch(() => { /* network blip — real click will retry */ })
|
||||
}
|
||||
|
||||
page('', () => import('./pages/Home'))
|
||||
const Home = page('', () => import('./pages/Home'))
|
||||
const Chat = page('chat', () => import('./pages/Chat'))
|
||||
const Models = page('models', () => import('./pages/Models'))
|
||||
const Manage = page('manage', () => import('./pages/Manage'))
|
||||
@@ -97,7 +96,7 @@ function Feature({ feature, children }) {
|
||||
}
|
||||
|
||||
const appChildren = [
|
||||
{ index: true, element: <HomeRoute /> },
|
||||
{ index: true, element: <Home /> },
|
||||
{ path: 'chat', element: <Chat /> },
|
||||
{ path: 'chat/:model', element: <Chat /> },
|
||||
{ path: 'image', element: <ImageGen /> },
|
||||
|
||||
@@ -1,19 +0,0 @@
|
||||
// Opens a fresh chat already in LocalAI Assistant ("manage") mode. Chat.jsx
|
||||
// reads localai_index_chat_data on mount and enables localaiAssistant for the
|
||||
// new chat. Shared by the Home CTA and the top navbar quick-jump so there is
|
||||
// one definition of how the assistant is launched.
|
||||
export function launchAssistantChat(navigate, model = '') {
|
||||
const chatData = {
|
||||
model: model || '',
|
||||
mcpMode: false,
|
||||
localaiAssistant: true,
|
||||
newChat: true,
|
||||
}
|
||||
try { localStorage.setItem('localai_index_chat_data', JSON.stringify(chatData)) } catch { /* ignore */ }
|
||||
try { localStorage.setItem('localai_assistant_used', '1') } catch { /* ignore */ }
|
||||
navigate('/app/chat')
|
||||
// When already on /app/chat, navigate() does not remount Chat, so its
|
||||
// mount-time reader would never see the payload above. Signal the mounted
|
||||
// Chat to re-consume it; harmless elsewhere since Chat reads on mount anyway.
|
||||
try { window.dispatchEvent(new CustomEvent('localai-open-assistant')) } catch { /* ignore */ }
|
||||
}
|
||||
11
core/http/react-ui/src/utils/resolveHome.js
vendored
11
core/http/react-ui/src/utils/resolveHome.js
vendored
@@ -1,11 +0,0 @@
|
||||
// Pure landing-page resolver for the index route. Returns a target path, or ''
|
||||
// meaning "render the default Home". Admin precedence is distributed > p2p >
|
||||
// plain; non-admins always go to Chat (distributed/p2p are admin-only and
|
||||
// invisible to them). Visibility gates are enforced elsewhere - this only
|
||||
// chooses where /app lands.
|
||||
export function resolveHome({ isAdmin, distributed, p2pEnabled }) {
|
||||
if (!isAdmin) return '/app/chat'
|
||||
if (distributed) return '/app/nodes'
|
||||
if (p2pEnabled) return '/app/p2p'
|
||||
return ''
|
||||
}
|
||||
20
core/http/react-ui/src/utils/sidebarPolicy.js
vendored
20
core/http/react-ui/src/utils/sidebarPolicy.js
vendored
@@ -1,20 +0,0 @@
|
||||
import { operateConsole, isConsoleItemVisible } from '../components/console/consoleConfig'
|
||||
|
||||
// The Operate > Cluster group, surfaced as a pinned top-of-sidebar quick-access
|
||||
// group when the admin is running a cluster (NATS-distributed) or a P2P swarm.
|
||||
// Items are filtered through the SAME gate as everywhere else, so e.g. in a
|
||||
// p2p-only deployment Nodes/Scheduling (feature: 'distributed') drop out and
|
||||
// only Swarm remains. Returns [] when the pin does not apply.
|
||||
export function clusterPinItems(auth, deployment) {
|
||||
if (!auth.isAdmin) return []
|
||||
if (!deployment.distributed && !deployment.p2pEnabled) return []
|
||||
const group = operateConsole.groups.find(g => g.titleKey === 'operate.cluster')
|
||||
if (!group) return []
|
||||
return group.items.filter(item => isConsoleItemVisible(item, auth))
|
||||
}
|
||||
|
||||
// In the cluster cells the Create group defaults collapsed so the pinned
|
||||
// cluster group leads. Users can still expand it; their stored choice wins.
|
||||
export function shouldCollapseCreate(auth, deployment) {
|
||||
return !!auth.isAdmin && (!!deployment.distributed || !!deployment.p2pEnabled)
|
||||
}
|
||||
@@ -79,21 +79,29 @@ func (s *GalleryStore) Create(op *GalleryOperationRecord) error {
|
||||
}).Create(op).Error
|
||||
}
|
||||
|
||||
// UpdateProgress updates progress for an operation.
|
||||
func (s *GalleryStore) UpdateProgress(id string, progress float64, message, downloadedSize string) error {
|
||||
// UpdateProgress updates progress for an operation. The cancellable flag is
|
||||
// persisted on every tick so a replica that restarts mid-install rehydrates the
|
||||
// op as still cancellable — otherwise the column keeps its Create-time zero
|
||||
// value (false), the UI hides the cancel button, and the orphaned op can only
|
||||
// be dismissed by waiting for the 30-minute stale reaper.
|
||||
func (s *GalleryStore) UpdateProgress(id string, progress float64, message, downloadedSize string, cancellable bool) error {
|
||||
return s.db.Model(&GalleryOperationRecord{}).Where("id = ?", id).Updates(map[string]any{
|
||||
"progress": progress,
|
||||
"message": message,
|
||||
"downloaded_file_size": downloadedSize,
|
||||
"cancellable": cancellable,
|
||||
"updated_at": time.Now(),
|
||||
}).Error
|
||||
}
|
||||
|
||||
// UpdateStatus updates the status of an operation.
|
||||
// UpdateStatus updates the status of an operation. A terminal status is never
|
||||
// cancellable, so the flag is cleared here to keep the persisted row consistent
|
||||
// with what the UI should offer.
|
||||
func (s *GalleryStore) UpdateStatus(id, status, errMsg string) error {
|
||||
updates := map[string]any{
|
||||
"status": status,
|
||||
"updated_at": time.Now(),
|
||||
"status": status,
|
||||
"cancellable": false,
|
||||
"updated_at": time.Now(),
|
||||
}
|
||||
if errMsg != "" {
|
||||
updates["error"] = errMsg
|
||||
|
||||
56
core/services/galleryop/cancellable_persist_test.go
Normal file
56
core/services/galleryop/cancellable_persist_test.go
Normal file
@@ -0,0 +1,56 @@
|
||||
package galleryop_test
|
||||
|
||||
import (
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
"github.com/mudler/LocalAI/core/services/distributed"
|
||||
"github.com/mudler/LocalAI/core/services/galleryop"
|
||||
"github.com/mudler/LocalAI/core/services/testutil"
|
||||
)
|
||||
|
||||
// Reproduces "an in-flight install can't be cancelled after a restart". The
|
||||
// live install path marks OpStatus.Cancellable=true on every progress tick, but
|
||||
// UpdateStatus persisted progress/status to the gallery store WITHOUT the
|
||||
// cancellable flag, and Create defaulted it to false. So after a replica
|
||||
// restart Hydrate rebuilt the op with Cancellable=false, /api/operations
|
||||
// reported cancellable:false, and the UI hid the cancel button — the orphaned
|
||||
// op lingered until the 30-minute stale reaper expired it. The cancellable
|
||||
// state must be persisted so a rehydrated in-flight op stays cancellable.
|
||||
var _ = Describe("GalleryService cancellable persistence across restart", func() {
|
||||
It("rehydrates an in-flight op as still cancellable", func() {
|
||||
db := testutil.SetupTestDB()
|
||||
store, err := distributed.NewGalleryStore(db)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
|
||||
svc := galleryop.NewGalleryService(&config.ApplicationConfig{}, nil)
|
||||
svc.SetGalleryStore(store)
|
||||
|
||||
// Seed the in-flight op row as the worker goroutine does on admission.
|
||||
Expect(store.Create(&distributed.GalleryOperationRecord{
|
||||
ID: "op-inflight",
|
||||
GalleryElementName: "llama-cpp-development",
|
||||
OpType: "backend_install",
|
||||
Status: "pending",
|
||||
})).To(Succeed())
|
||||
|
||||
// Simulate a progress tick: the live path always marks installs
|
||||
// cancellable while they are downloading/processing.
|
||||
svc.UpdateStatus("op-inflight", &galleryop.OpStatus{
|
||||
Message: "downloading",
|
||||
Progress: 25,
|
||||
Cancellable: true,
|
||||
})
|
||||
|
||||
// A fresh replica boots and hydrates from the store.
|
||||
fresh := galleryop.NewGalleryService(&config.ApplicationConfig{}, nil)
|
||||
fresh.SetGalleryStore(store)
|
||||
Expect(fresh.Hydrate()).To(Succeed())
|
||||
|
||||
st := fresh.GetStatus("op-inflight")
|
||||
Expect(st).ToNot(BeNil(), "the in-flight op must hydrate after a restart")
|
||||
Expect(st.Cancellable).To(BeTrue(),
|
||||
"a still-active install must rehydrate as cancellable so the admin can dismiss it")
|
||||
})
|
||||
})
|
||||
@@ -167,7 +167,7 @@ func (g *GalleryService) UpdateStatus(s string, op *OpStatus) {
|
||||
xlog.Warn("Failed to persist gallery operation status", "op_id", s, "error", err)
|
||||
}
|
||||
} else {
|
||||
if err := store.UpdateProgress(s, op.Progress, op.Message, op.DownloadedFileSize); err != nil {
|
||||
if err := store.UpdateProgress(s, op.Progress, op.Message, op.DownloadedFileSize, op.Cancellable); err != nil {
|
||||
xlog.Warn("Failed to persist gallery operation progress", "op_id", s, "error", err)
|
||||
}
|
||||
}
|
||||
@@ -467,6 +467,7 @@ func (g *GalleryService) Start(c context.Context, cl *config.ModelConfigLoader,
|
||||
GalleryElementName: op.GalleryElementName,
|
||||
OpType: "backend_install",
|
||||
Status: "pending",
|
||||
Cancellable: true,
|
||||
})
|
||||
}
|
||||
err := g.backendHandler(&op, systemState)
|
||||
@@ -499,6 +500,8 @@ func (g *GalleryService) Start(c context.Context, cl *config.ModelConfigLoader,
|
||||
GalleryElementName: op.GalleryElementName,
|
||||
OpType: opType,
|
||||
Status: "pending",
|
||||
// A delete is not cancellable; an install is.
|
||||
Cancellable: !op.Delete,
|
||||
})
|
||||
}
|
||||
err := g.modelHandler(&op, cl, systemState)
|
||||
|
||||
@@ -19,25 +19,40 @@ import (
|
||||
// Per-replica: a single tracker instance is bound to (nodeID, modelName, replicaIndex).
|
||||
// The router constructs one tracker per Route() result, so each in-flight tick lands
|
||||
// on the correct row even when multiple replicas of the same model live on the same node.
|
||||
//
|
||||
// Embedding only grpc.ControlBackend (not the whole grpc.Backend) is what makes
|
||||
// the in-flight accounting safe by construction: the control-plane methods pass
|
||||
// through untracked, while every grpc.InferenceBackend method must be declared
|
||||
// explicitly below to satisfy grpc.Backend. Adding an inference method to the
|
||||
// interface therefore breaks this file's build (see the var assertion below)
|
||||
// until it is wrapped with track() - so a new inference path can't be added
|
||||
// without an in-flight accounting decision.
|
||||
type InFlightTrackingClient struct {
|
||||
grpc.Backend // embed for passthrough of untracked methods
|
||||
registry InFlightTracker
|
||||
nodeID string
|
||||
modelName string
|
||||
replicaIndex int
|
||||
grpc.ControlBackend // passthrough for control-plane / streaming-constructor methods
|
||||
inner grpc.InferenceBackend // tracked inference methods delegate here
|
||||
registry InFlightTracker
|
||||
nodeID string
|
||||
modelName string
|
||||
replicaIndex int
|
||||
|
||||
firstOnce sync.Once // guards onFirstComplete
|
||||
onFirstComplete func() // called once after the first tracked inference call completes
|
||||
}
|
||||
|
||||
// Compile-time contract: *InFlightTrackingClient must implement the FULL backend
|
||||
// surface. Because it embeds only ControlBackend, this fails to compile if any
|
||||
// InferenceBackend method is left unwrapped.
|
||||
var _ grpc.Backend = (*InFlightTrackingClient)(nil)
|
||||
|
||||
// NewInFlightTrackingClient wraps a gRPC backend client with in-flight tracking.
|
||||
func NewInFlightTrackingClient(inner grpc.Backend, registry InFlightTracker, nodeID, modelName string, replicaIndex int) *InFlightTrackingClient {
|
||||
return &InFlightTrackingClient{
|
||||
Backend: inner,
|
||||
registry: registry,
|
||||
nodeID: nodeID,
|
||||
modelName: modelName,
|
||||
replicaIndex: replicaIndex,
|
||||
ControlBackend: inner,
|
||||
inner: inner,
|
||||
registry: registry,
|
||||
nodeID: nodeID,
|
||||
modelName: modelName,
|
||||
replicaIndex: replicaIndex,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -91,154 +106,162 @@ func (c *InFlightTrackingClient) reconcile(err error) error {
|
||||
|
||||
func (c *InFlightTrackingClient) Predict(ctx context.Context, in *pb.PredictOptions, opts ...ggrpc.CallOption) (*pb.Reply, error) {
|
||||
defer c.track(ctx)()
|
||||
reply, err := c.Backend.Predict(ctx, in, opts...)
|
||||
reply, err := c.inner.Predict(ctx, in, opts...)
|
||||
return reply, c.reconcile(err)
|
||||
}
|
||||
|
||||
func (c *InFlightTrackingClient) PredictStream(ctx context.Context, in *pb.PredictOptions, f func(reply *pb.Reply), opts ...ggrpc.CallOption) error {
|
||||
defer c.track(ctx)()
|
||||
return c.reconcile(c.Backend.PredictStream(ctx, in, f, opts...))
|
||||
return c.reconcile(c.inner.PredictStream(ctx, in, f, opts...))
|
||||
}
|
||||
|
||||
func (c *InFlightTrackingClient) Embeddings(ctx context.Context, in *pb.PredictOptions, opts ...ggrpc.CallOption) (*pb.EmbeddingResult, error) {
|
||||
defer c.track(ctx)()
|
||||
res, err := c.Backend.Embeddings(ctx, in, opts...)
|
||||
res, err := c.inner.Embeddings(ctx, in, opts...)
|
||||
return res, c.reconcile(err)
|
||||
}
|
||||
|
||||
func (c *InFlightTrackingClient) GenerateImage(ctx context.Context, in *pb.GenerateImageRequest, opts ...ggrpc.CallOption) (*pb.Result, error) {
|
||||
defer c.track(ctx)()
|
||||
res, err := c.Backend.GenerateImage(ctx, in, opts...)
|
||||
res, err := c.inner.GenerateImage(ctx, in, opts...)
|
||||
return res, c.reconcile(err)
|
||||
}
|
||||
|
||||
func (c *InFlightTrackingClient) GenerateVideo(ctx context.Context, in *pb.GenerateVideoRequest, opts ...ggrpc.CallOption) (*pb.Result, error) {
|
||||
defer c.track(ctx)()
|
||||
res, err := c.Backend.GenerateVideo(ctx, in, opts...)
|
||||
res, err := c.inner.GenerateVideo(ctx, in, opts...)
|
||||
return res, c.reconcile(err)
|
||||
}
|
||||
|
||||
func (c *InFlightTrackingClient) TTS(ctx context.Context, in *pb.TTSRequest, opts ...ggrpc.CallOption) (*pb.Result, error) {
|
||||
defer c.track(ctx)()
|
||||
res, err := c.Backend.TTS(ctx, in, opts...)
|
||||
res, err := c.inner.TTS(ctx, in, opts...)
|
||||
return res, c.reconcile(err)
|
||||
}
|
||||
|
||||
func (c *InFlightTrackingClient) TTSStream(ctx context.Context, in *pb.TTSRequest, f func(reply *pb.Reply), opts ...ggrpc.CallOption) error {
|
||||
defer c.track(ctx)()
|
||||
return c.reconcile(c.Backend.TTSStream(ctx, in, f, opts...))
|
||||
return c.reconcile(c.inner.TTSStream(ctx, in, f, opts...))
|
||||
}
|
||||
|
||||
func (c *InFlightTrackingClient) SoundGeneration(ctx context.Context, in *pb.SoundGenerationRequest, opts ...ggrpc.CallOption) (*pb.Result, error) {
|
||||
defer c.track(ctx)()
|
||||
res, err := c.Backend.SoundGeneration(ctx, in, opts...)
|
||||
res, err := c.inner.SoundGeneration(ctx, in, opts...)
|
||||
return res, c.reconcile(err)
|
||||
}
|
||||
|
||||
func (c *InFlightTrackingClient) AudioTranscription(ctx context.Context, in *pb.TranscriptRequest, opts ...ggrpc.CallOption) (*pb.TranscriptResult, error) {
|
||||
defer c.track(ctx)()
|
||||
res, err := c.Backend.AudioTranscription(ctx, in, opts...)
|
||||
res, err := c.inner.AudioTranscription(ctx, in, opts...)
|
||||
return res, c.reconcile(err)
|
||||
}
|
||||
|
||||
func (c *InFlightTrackingClient) AudioTranscriptionStream(ctx context.Context, in *pb.TranscriptRequest, f func(chunk *pb.TranscriptStreamResponse), opts ...ggrpc.CallOption) error {
|
||||
defer c.track(ctx)()
|
||||
return c.reconcile(c.Backend.AudioTranscriptionStream(ctx, in, f, opts...))
|
||||
return c.reconcile(c.inner.AudioTranscriptionStream(ctx, in, f, opts...))
|
||||
}
|
||||
|
||||
func (c *InFlightTrackingClient) Detect(ctx context.Context, in *pb.DetectOptions, opts ...ggrpc.CallOption) (*pb.DetectResponse, error) {
|
||||
defer c.track(ctx)()
|
||||
res, err := c.Backend.Detect(ctx, in, opts...)
|
||||
res, err := c.inner.Detect(ctx, in, opts...)
|
||||
return res, c.reconcile(err)
|
||||
}
|
||||
|
||||
func (c *InFlightTrackingClient) Depth(ctx context.Context, in *pb.DepthRequest, opts ...ggrpc.CallOption) (*pb.DepthResponse, error) {
|
||||
defer c.track(ctx)()
|
||||
res, err := c.Backend.Depth(ctx, in, opts...)
|
||||
res, err := c.inner.Depth(ctx, in, opts...)
|
||||
return res, c.reconcile(err)
|
||||
}
|
||||
|
||||
func (c *InFlightTrackingClient) Rerank(ctx context.Context, in *pb.RerankRequest, opts ...ggrpc.CallOption) (*pb.RerankResult, error) {
|
||||
defer c.track(ctx)()
|
||||
res, err := c.Backend.Rerank(ctx, in, opts...)
|
||||
res, err := c.inner.Rerank(ctx, in, opts...)
|
||||
return res, c.reconcile(err)
|
||||
}
|
||||
|
||||
func (c *InFlightTrackingClient) VAD(ctx context.Context, in *pb.VADRequest, opts ...ggrpc.CallOption) (*pb.VADResponse, error) {
|
||||
defer c.track(ctx)()
|
||||
res, err := c.Backend.VAD(ctx, in, opts...)
|
||||
res, err := c.inner.VAD(ctx, in, opts...)
|
||||
return res, c.reconcile(err)
|
||||
}
|
||||
|
||||
func (c *InFlightTrackingClient) Diarize(ctx context.Context, in *pb.DiarizeRequest, opts ...ggrpc.CallOption) (*pb.DiarizeResponse, error) {
|
||||
defer c.track(ctx)()
|
||||
res, err := c.Backend.Diarize(ctx, in, opts...)
|
||||
res, err := c.inner.Diarize(ctx, in, opts...)
|
||||
return res, c.reconcile(err)
|
||||
}
|
||||
|
||||
func (c *InFlightTrackingClient) FaceVerify(ctx context.Context, in *pb.FaceVerifyRequest, opts ...ggrpc.CallOption) (*pb.FaceVerifyResponse, error) {
|
||||
defer c.track(ctx)()
|
||||
res, err := c.Backend.FaceVerify(ctx, in, opts...)
|
||||
res, err := c.inner.FaceVerify(ctx, in, opts...)
|
||||
return res, c.reconcile(err)
|
||||
}
|
||||
|
||||
func (c *InFlightTrackingClient) FaceAnalyze(ctx context.Context, in *pb.FaceAnalyzeRequest, opts ...ggrpc.CallOption) (*pb.FaceAnalyzeResponse, error) {
|
||||
defer c.track(ctx)()
|
||||
res, err := c.Backend.FaceAnalyze(ctx, in, opts...)
|
||||
res, err := c.inner.FaceAnalyze(ctx, in, opts...)
|
||||
return res, c.reconcile(err)
|
||||
}
|
||||
|
||||
func (c *InFlightTrackingClient) VoiceVerify(ctx context.Context, in *pb.VoiceVerifyRequest, opts ...ggrpc.CallOption) (*pb.VoiceVerifyResponse, error) {
|
||||
defer c.track(ctx)()
|
||||
res, err := c.Backend.VoiceVerify(ctx, in, opts...)
|
||||
res, err := c.inner.VoiceVerify(ctx, in, opts...)
|
||||
return res, c.reconcile(err)
|
||||
}
|
||||
|
||||
func (c *InFlightTrackingClient) VoiceAnalyze(ctx context.Context, in *pb.VoiceAnalyzeRequest, opts ...ggrpc.CallOption) (*pb.VoiceAnalyzeResponse, error) {
|
||||
defer c.track(ctx)()
|
||||
res, err := c.Backend.VoiceAnalyze(ctx, in, opts...)
|
||||
res, err := c.inner.VoiceAnalyze(ctx, in, opts...)
|
||||
return res, c.reconcile(err)
|
||||
}
|
||||
|
||||
func (c *InFlightTrackingClient) VoiceEmbed(ctx context.Context, in *pb.VoiceEmbedRequest, opts ...ggrpc.CallOption) (*pb.VoiceEmbedResponse, error) {
|
||||
defer c.track(ctx)()
|
||||
res, err := c.Backend.VoiceEmbed(ctx, in, opts...)
|
||||
res, err := c.inner.VoiceEmbed(ctx, in, opts...)
|
||||
return res, c.reconcile(err)
|
||||
}
|
||||
|
||||
func (c *InFlightTrackingClient) TokenClassify(ctx context.Context, in *pb.TokenClassifyRequest, opts ...ggrpc.CallOption) (*pb.TokenClassifyResponse, error) {
|
||||
defer c.track(ctx)()
|
||||
res, err := c.Backend.TokenClassify(ctx, in, opts...)
|
||||
res, err := c.inner.TokenClassify(ctx, in, opts...)
|
||||
return res, c.reconcile(err)
|
||||
}
|
||||
|
||||
func (c *InFlightTrackingClient) Score(ctx context.Context, in *pb.ScoreRequest, opts ...ggrpc.CallOption) (*pb.ScoreResponse, error) {
|
||||
defer c.track(ctx)()
|
||||
res, err := c.Backend.Score(ctx, in, opts...)
|
||||
res, err := c.inner.Score(ctx, in, opts...)
|
||||
return res, c.reconcile(err)
|
||||
}
|
||||
|
||||
func (c *InFlightTrackingClient) SoundDetection(ctx context.Context, in *pb.SoundDetectionRequest, opts ...ggrpc.CallOption) (*pb.SoundDetectionResponse, error) {
|
||||
defer c.track(ctx)()
|
||||
res, err := c.inner.SoundDetection(ctx, in, opts...)
|
||||
return res, c.reconcile(err)
|
||||
}
|
||||
|
||||
func (c *InFlightTrackingClient) AudioEncode(ctx context.Context, in *pb.AudioEncodeRequest, opts ...ggrpc.CallOption) (*pb.AudioEncodeResult, error) {
|
||||
defer c.track(ctx)()
|
||||
res, err := c.Backend.AudioEncode(ctx, in, opts...)
|
||||
res, err := c.inner.AudioEncode(ctx, in, opts...)
|
||||
return res, c.reconcile(err)
|
||||
}
|
||||
|
||||
func (c *InFlightTrackingClient) AudioDecode(ctx context.Context, in *pb.AudioDecodeRequest, opts ...ggrpc.CallOption) (*pb.AudioDecodeResult, error) {
|
||||
defer c.track(ctx)()
|
||||
res, err := c.Backend.AudioDecode(ctx, in, opts...)
|
||||
res, err := c.inner.AudioDecode(ctx, in, opts...)
|
||||
return res, c.reconcile(err)
|
||||
}
|
||||
|
||||
func (c *InFlightTrackingClient) AudioTransform(ctx context.Context, in *pb.AudioTransformRequest, opts ...ggrpc.CallOption) (*pb.AudioTransformResult, error) {
|
||||
defer c.track(ctx)()
|
||||
res, err := c.Backend.AudioTransform(ctx, in, opts...)
|
||||
res, err := c.inner.AudioTransform(ctx, in, opts...)
|
||||
return res, c.reconcile(err)
|
||||
}
|
||||
|
||||
// AudioTransformStream, AudioToAudioStream and Forward are deliberately left as
|
||||
// embedded passthrough: they return a stream client and the inference spans the
|
||||
// stream's lifetime, not the constructor call. Wrapping the constructor with
|
||||
// track() would increment and immediately decrement (and fire onFirstComplete)
|
||||
// before any audio flows. Tracking those correctly needs the done() func tied to
|
||||
// stream close, which the current Backend interface doesn't surface here.
|
||||
// AudioTransformStream, AudioToAudioStream and Forward live in grpc.ControlBackend
|
||||
// and are passed through via the embedded field, NOT tracked: they return a stream
|
||||
// client and the inference spans the stream's lifetime, not the constructor call.
|
||||
// Wrapping the constructor with track() would increment and immediately decrement
|
||||
// (and fire onFirstComplete) before any audio flows. Tracking those correctly needs
|
||||
// the done() func tied to stream close, which the Backend interface doesn't surface
|
||||
// here. If they ever need tracking, move them to grpc.InferenceBackend - the build
|
||||
// will then force an explicit wrapper here.
|
||||
|
||||
@@ -408,6 +408,13 @@ var _ = Describe("InFlightTrackingClient", func() {
|
||||
return err
|
||||
})
|
||||
})
|
||||
|
||||
It("SoundDetection", func() {
|
||||
assertTracked(func() error {
|
||||
_, err := client.SoundDetection(context.Background(), &pb.SoundDetectionRequest{})
|
||||
return err
|
||||
})
|
||||
})
|
||||
})
|
||||
|
||||
Describe("stale model reload (self-heal)", func() {
|
||||
|
||||
@@ -156,7 +156,10 @@ func applyNodeHardwareDefaults(opts *pb.ModelOptions, node *BackendNode) {
|
||||
VRAM: node.TotalVRAM,
|
||||
}
|
||||
if config.IsManagedPhysicalBatch(int(opts.NBatch)) {
|
||||
opts.NBatch = int32(config.PhysicalBatch(gpu))
|
||||
// Gate the raised batch on the selected node's per-device VRAM at this
|
||||
// model's context, so a large context can't overflow the node's compute
|
||||
// buffer (issue #10485). node.TotalVRAM is the node's reported ceiling.
|
||||
opts.NBatch = int32(config.PhysicalBatchForContext(gpu, int(opts.ContextSize)))
|
||||
}
|
||||
// Default concurrent serving for the selected node (the frontend that built
|
||||
// the options may have no GPU). Only adds when no parallel option is set.
|
||||
|
||||
@@ -8,12 +8,19 @@ import (
|
||||
)
|
||||
|
||||
var _ = Describe("applyNodeHardwareDefaults", func() {
|
||||
It("raises a managed default batch on a Blackwell node", func() {
|
||||
opts := &pb.ModelOptions{NBatch: config.DefaultPhysicalBatch}
|
||||
applyNodeHardwareDefaults(opts, &BackendNode{GPUComputeCapability: "12.1"})
|
||||
It("raises a managed default batch on a Blackwell node with headroom", func() {
|
||||
opts := &pb.ModelOptions{NBatch: config.DefaultPhysicalBatch, ContextSize: 8192}
|
||||
applyNodeHardwareDefaults(opts, &BackendNode{GPUComputeCapability: "12.1", TotalVRAM: 119 << 30})
|
||||
Expect(opts.NBatch).To(BeEquivalentTo(config.BlackwellPhysicalBatch))
|
||||
})
|
||||
|
||||
It("keeps the default batch when a large context would overflow the node", func() {
|
||||
// Regression guard for issue #10485 on the distributed path.
|
||||
opts := &pb.ModelOptions{NBatch: config.DefaultPhysicalBatch, ContextSize: 204800}
|
||||
applyNodeHardwareDefaults(opts, &BackendNode{GPUComputeCapability: "12.0", TotalVRAM: 16 << 30})
|
||||
Expect(opts.NBatch).To(BeEquivalentTo(config.DefaultPhysicalBatch))
|
||||
})
|
||||
|
||||
It("resets a Blackwell guess on a non-Blackwell node", func() {
|
||||
// frontend (Blackwell) guessed high, but the selected node is not Blackwell
|
||||
opts := &pb.ModelOptions{NBatch: config.BlackwellPhysicalBatch}
|
||||
|
||||
@@ -185,6 +185,13 @@ It is persisted through `POST /api/settings` and read live, so a change takes
|
||||
effect on the next request without a restart. A default that names a model no
|
||||
longer loaded still appears (marked *not loaded*) so it can be toggled off.
|
||||
|
||||
The default set can also be supplied out-of-band with the
|
||||
`LOCALAI_PII_DEFAULT_DETECTORS` environment variable (comma-separated model
|
||||
names, e.g. `privacy-filter-nemotron,secret-filter`). When set it takes
|
||||
precedence over the value persisted via the UI (env > file), which is the
|
||||
right behaviour for immutable container deployments that pin filtering policy
|
||||
at boot rather than via the admin UI.
|
||||
|
||||
This is what makes `cloud-proxy` / MITM redaction work out of the box: those
|
||||
backends default to PII-enabled but ship no detector list, so without a
|
||||
default detector the filter runs with nothing to scan. Set one here and
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
{
|
||||
"version": "v4.4.3"
|
||||
"version": "v4.5.0"
|
||||
}
|
||||
|
||||
@@ -1,4 +1,208 @@
|
||||
---
|
||||
- name: "lfm2.5-1.2b-instruct"
|
||||
url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
|
||||
urls:
|
||||
- https://huggingface.co/LiquidAI/LFM2.5-1.2B-Instruct-GGUF
|
||||
description: "Try LFM • Docs • LEAP • Discord\n\n# LFM2.5-1.2B-Instruct\n\nLFM2.5 is a new family of hybrid models designed for **on-device deployment**. It builds on the LFM2 architecture with extended pre-training and reinforcement learning.\n\n - **Best-in-class performance**: A 1.2B model rivaling much larger models, bringing high-quality AI to your pocket.\n - **Fast edge inference**: 239 tok/s decode on AMD CPU, 82 tok/s on mobile NPU. Runs under 1GB of memory with day-one support for llama.cpp, MLX, and vLLM.\n - **Scaled training**: Extended pre-training from 10T to 28T tokens and large-scale multi-stage reinforcement learning.\n\nFind more information about LFM2.5 in our blog post.\n\n## \U0001F5D2️ Model Details\n\nLFM2.5-1.2B-Instruct is a general-purpose text-only model with the following features:\n\n...\n"
|
||||
license: "other"
|
||||
tags:
|
||||
- llm
|
||||
- gguf
|
||||
icon: https://cdn-uploads.huggingface.co/production/uploads/61b8e2ba285851687028d395/dxnYF2fuLpulismtFSGFi.png
|
||||
overrides:
|
||||
backend: llama-cpp
|
||||
function:
|
||||
automatic_tool_parsing_fallback: true
|
||||
grammar:
|
||||
disable: true
|
||||
known_usecases:
|
||||
- chat
|
||||
options:
|
||||
- use_jinja:true
|
||||
parameters:
|
||||
min_p: 0.15
|
||||
model: llama-cpp/models/LFM2.5-1.2B-Instruct-GGUF/LFM2.5-1.2B-Instruct-Q4_K_M.gguf
|
||||
repeat_penalty: 1.05
|
||||
temperature: 0.1
|
||||
top_k: 50
|
||||
top_p: 0.1
|
||||
template:
|
||||
use_tokenizer_template: true
|
||||
files:
|
||||
- filename: llama-cpp/models/LFM2.5-1.2B-Instruct-GGUF/LFM2.5-1.2B-Instruct-Q4_K_M.gguf
|
||||
sha256: b1b3de114215d9507409a662a501a631095a479a419584e8a2ded6304b19b4f5
|
||||
uri: https://huggingface.co/LiquidAI/LFM2.5-1.2B-Instruct-GGUF/resolve/main/LFM2.5-1.2B-Instruct-Q4_K_M.gguf
|
||||
- name: "qwopus3.6-27b-coder-compat-mtp"
|
||||
url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
|
||||
urls:
|
||||
- https://huggingface.co/Jackrong/Qwopus3.6-27B-Coder-Compat-MTP-GGUF
|
||||
description: "\U0001FA90 Qwopus-3.6-27B-Coder\nCoder SFT Release\n\nAgentic Coding & Tool-Use Reasoning Model Fine-Tuned on Qwopus3.6-27B-v2\n\n\U0001F9EC Trace Inversion & Negentropy\n\U0001F9E0 27B Dense Model\n⚡ Agentic Coding\n\U0001F6E0️ Tool Calling & Agent\n\U0001F3C6 SWE-bench Verified: 67.0% (off-thinking)\n\n\U0001F4A1 What is Qwopus-3.6-27B-Coder?\n\U0001FA90 Qwopus-3.6-27B-Coder is a reasoning-enhanced agentic coding model built on top of Qwopus3.6-27B-v2. It inherits the powerful reasoning foundation of the v2 base — which achieved 87.43% MMLU-Pro and 75.25% SWE-bench Verified — and further specializes it for agentic code generation, structured tool calling, debugging, and instruction-following in developer workflows. The model is designed to excel at repository-level coding tasks, multi-turn tool orchestration, and complex logical reasoning under realistic agent environments.\n\n\U0001F9E9 Agentic Coding\nOptimized for repository-level coding, debugging, patch generation, and structured multi-step development workflows.\n\n\U0001F6E0️ Tool Calling\nLearns from real agent trajectories with tool definitions, tool calls, and environment feedback for robust multi-turn execution.\n\n...\n"
|
||||
license: "apache-2.0"
|
||||
tags:
|
||||
- llm
|
||||
- gguf
|
||||
- vision
|
||||
- multimodal
|
||||
- reasoning
|
||||
icon: https://cdn-uploads.huggingface.co/production/uploads/66309bd090589b7c65950665/sGQKmrMc6L6guMoaB5_Y2.png
|
||||
overrides:
|
||||
backend: llama-cpp
|
||||
function:
|
||||
automatic_tool_parsing_fallback: true
|
||||
grammar:
|
||||
disable: true
|
||||
known_usecases:
|
||||
- chat
|
||||
mmproj: llama-cpp/mmproj/Qwopus3.6-27B-Coder-Compat-MTP-GGUF/mmproj-F32.gguf
|
||||
options:
|
||||
- use_jinja:true
|
||||
- spec_type:draft-mtp
|
||||
- spec_n_max:6
|
||||
- spec_p_min:0.75
|
||||
parameters:
|
||||
model: llama-cpp/models/Qwopus3.6-27B-Coder-Compat-MTP-GGUF/Qwopus3.6-27B-Coder-Compat-MTP-Q4_K_M.gguf
|
||||
template:
|
||||
use_tokenizer_template: true
|
||||
files:
|
||||
- filename: llama-cpp/models/Qwopus3.6-27B-Coder-Compat-MTP-GGUF/Qwopus3.6-27B-Coder-Compat-MTP-Q4_K_M.gguf
|
||||
sha256: f893632170124da60e159b7bcc9d91e1cda3014b2c6b8ad9c6cde38a1fcd2f6f
|
||||
uri: https://huggingface.co/Jackrong/Qwopus3.6-27B-Coder-Compat-MTP-GGUF/resolve/main/Qwopus3.6-27B-Coder-Compat-MTP-Q4_K_M.gguf
|
||||
- filename: llama-cpp/mmproj/Qwopus3.6-27B-Coder-Compat-MTP-GGUF/mmproj-F32.gguf
|
||||
sha256: 32f7ea0600c07272547da401d460f8abbd980f3a57b69d6df87be0e2505e0b9c
|
||||
uri: https://huggingface.co/Jackrong/Qwopus3.6-27B-Coder-Compat-MTP-GGUF/resolve/main/mmproj-F32.gguf
|
||||
- name: "kimi-k2.7-code"
|
||||
url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
|
||||
urls:
|
||||
- https://huggingface.co/unsloth/Kimi-K2.7-Code-GGUF
|
||||
description: |
|
||||
## 1. Model Introduction
|
||||
|
||||
Kimi K2.7 Code is a coding-focused agentic model built upon Kimi K2.6. With substantial improvements on real-world long-horizon coding tasks, it strengthens end-to-end task completion across complex software engineering workflows while improving token efficiency, reducing thinking-token usage by approximately 30% compared with Kimi K2.6.
|
||||
|
||||
## 2. Model Summary
|
||||
|
||||
## 3. Evaluation Results
|
||||
|
||||
Benchmark
|
||||
Kimi K2.6
|
||||
Kimi K2.7 Code
|
||||
GPT-5.5
|
||||
Claude Opus 4.8
|
||||
|
||||
Coding
|
||||
|
||||
Kimi Code Bench v2
|
||||
50.9
|
||||
62.0
|
||||
69.0
|
||||
67.4
|
||||
|
||||
Program Bench
|
||||
48.3
|
||||
53.6
|
||||
69.1
|
||||
63.8
|
||||
|
||||
MLS Bench Lite
|
||||
26.7
|
||||
35.1
|
||||
35.5
|
||||
42.8
|
||||
|
||||
Agentic
|
||||
|
||||
Kimi Claw 24/7 Bench
|
||||
42.9
|
||||
46.9
|
||||
52.8
|
||||
50.4
|
||||
|
||||
MCP Atlas
|
||||
69.4
|
||||
76.0
|
||||
79.4
|
||||
81.3
|
||||
|
||||
MCP Mark Verified
|
||||
72.8
|
||||
81.1
|
||||
92.9
|
||||
76.4
|
||||
|
||||
Footnotes
|
||||
|
||||
...
|
||||
license: "other"
|
||||
tags:
|
||||
- llm
|
||||
- gguf
|
||||
icon: https://huggingface.co/moonshotai/Kimi-K2.7-Code/resolve/main/figures/kimi-logo.png
|
||||
overrides:
|
||||
backend: llama-cpp
|
||||
function:
|
||||
automatic_tool_parsing_fallback: true
|
||||
grammar:
|
||||
disable: true
|
||||
known_usecases:
|
||||
- chat
|
||||
mmproj: llama-cpp/mmproj/Kimi-K2.7-Code-GGUF/mmproj-F32.gguf
|
||||
options:
|
||||
- use_jinja:true
|
||||
parameters:
|
||||
min_p: 0.01
|
||||
model: llama-cpp/models/Kimi-K2.7-Code-GGUF/Kimi-K2.7-Code-UD-Q8_K_XL-00001-of-00014.gguf
|
||||
repeat_penalty: 1
|
||||
temperature: 0.6
|
||||
top_k: -1
|
||||
top_p: 0.95
|
||||
template:
|
||||
use_tokenizer_template: true
|
||||
files:
|
||||
- filename: llama-cpp/models/Kimi-K2.7-Code-GGUF/Kimi-K2.7-Code-UD-Q8_K_XL-00001-of-00014.gguf
|
||||
sha256: 65f0aca336f876902323a90e2aff32cac76d071b2cdd818c6a8d78be8fc2c680
|
||||
uri: https://huggingface.co/unsloth/Kimi-K2.7-Code-GGUF/resolve/main/UD-Q8_K_XL/Kimi-K2.7-Code-UD-Q8_K_XL-00001-of-00014.gguf
|
||||
- filename: llama-cpp/models/Kimi-K2.7-Code-GGUF/Kimi-K2.7-Code-UD-Q8_K_XL-00002-of-00014.gguf
|
||||
sha256: 40f4416c130827a11502778891f4ef95b2144db90f51d63aa3548d0952a39683
|
||||
uri: https://huggingface.co/unsloth/Kimi-K2.7-Code-GGUF/resolve/main/UD-Q8_K_XL/Kimi-K2.7-Code-UD-Q8_K_XL-00002-of-00014.gguf
|
||||
- filename: llama-cpp/models/Kimi-K2.7-Code-GGUF/Kimi-K2.7-Code-UD-Q8_K_XL-00003-of-00014.gguf
|
||||
sha256: ba2ba0b5168784ace7c752ecadfc3631279b2bb023824cb0fe9e2dab3dd28f22
|
||||
uri: https://huggingface.co/unsloth/Kimi-K2.7-Code-GGUF/resolve/main/UD-Q8_K_XL/Kimi-K2.7-Code-UD-Q8_K_XL-00003-of-00014.gguf
|
||||
- filename: llama-cpp/models/Kimi-K2.7-Code-GGUF/Kimi-K2.7-Code-UD-Q8_K_XL-00004-of-00014.gguf
|
||||
sha256: 10298a6c98b13ef49be286fefbea8663e16473fb69bbeabe153bc80c60ae116e
|
||||
uri: https://huggingface.co/unsloth/Kimi-K2.7-Code-GGUF/resolve/main/UD-Q8_K_XL/Kimi-K2.7-Code-UD-Q8_K_XL-00004-of-00014.gguf
|
||||
- filename: llama-cpp/models/Kimi-K2.7-Code-GGUF/Kimi-K2.7-Code-UD-Q8_K_XL-00005-of-00014.gguf
|
||||
sha256: 8e9e4c8e35d34fc4fef6bfb65a715ad7defbd196970d833c1df6924d701c88b3
|
||||
uri: https://huggingface.co/unsloth/Kimi-K2.7-Code-GGUF/resolve/main/UD-Q8_K_XL/Kimi-K2.7-Code-UD-Q8_K_XL-00005-of-00014.gguf
|
||||
- filename: llama-cpp/models/Kimi-K2.7-Code-GGUF/Kimi-K2.7-Code-UD-Q8_K_XL-00006-of-00014.gguf
|
||||
sha256: ccff6e7f299742f82cf6f51a871e3eb3167511efaee967477cc8387f54d16442
|
||||
uri: https://huggingface.co/unsloth/Kimi-K2.7-Code-GGUF/resolve/main/UD-Q8_K_XL/Kimi-K2.7-Code-UD-Q8_K_XL-00006-of-00014.gguf
|
||||
- filename: llama-cpp/models/Kimi-K2.7-Code-GGUF/Kimi-K2.7-Code-UD-Q8_K_XL-00007-of-00014.gguf
|
||||
sha256: 1a3b639633a2d22f71156a9f643ded2329cdd969cc21177b644b5741bac1af8e
|
||||
uri: https://huggingface.co/unsloth/Kimi-K2.7-Code-GGUF/resolve/main/UD-Q8_K_XL/Kimi-K2.7-Code-UD-Q8_K_XL-00007-of-00014.gguf
|
||||
- filename: llama-cpp/models/Kimi-K2.7-Code-GGUF/Kimi-K2.7-Code-UD-Q8_K_XL-00008-of-00014.gguf
|
||||
sha256: bde28f682a1eab973538b2102007d952f37a13c1f7d55e2ed99177445ddc4282
|
||||
uri: https://huggingface.co/unsloth/Kimi-K2.7-Code-GGUF/resolve/main/UD-Q8_K_XL/Kimi-K2.7-Code-UD-Q8_K_XL-00008-of-00014.gguf
|
||||
- filename: llama-cpp/models/Kimi-K2.7-Code-GGUF/Kimi-K2.7-Code-UD-Q8_K_XL-00009-of-00014.gguf
|
||||
sha256: b6a23a95b61e100f7593fa75e2363966323fa767b7e4fdf45d963b59e8fdc69f
|
||||
uri: https://huggingface.co/unsloth/Kimi-K2.7-Code-GGUF/resolve/main/UD-Q8_K_XL/Kimi-K2.7-Code-UD-Q8_K_XL-00009-of-00014.gguf
|
||||
- filename: llama-cpp/models/Kimi-K2.7-Code-GGUF/Kimi-K2.7-Code-UD-Q8_K_XL-00010-of-00014.gguf
|
||||
sha256: fb10231c2e6d76921d40f22690f4aa08a8090c708edeaf7e581abafc24d3b25c
|
||||
uri: https://huggingface.co/unsloth/Kimi-K2.7-Code-GGUF/resolve/main/UD-Q8_K_XL/Kimi-K2.7-Code-UD-Q8_K_XL-00010-of-00014.gguf
|
||||
- filename: llama-cpp/models/Kimi-K2.7-Code-GGUF/Kimi-K2.7-Code-UD-Q8_K_XL-00011-of-00014.gguf
|
||||
sha256: d2290be7ed1a22ac1f9f8a4813389689e075ce2ab8abc3aaaa1157a3cb1462d8
|
||||
uri: https://huggingface.co/unsloth/Kimi-K2.7-Code-GGUF/resolve/main/UD-Q8_K_XL/Kimi-K2.7-Code-UD-Q8_K_XL-00011-of-00014.gguf
|
||||
- filename: llama-cpp/models/Kimi-K2.7-Code-GGUF/Kimi-K2.7-Code-UD-Q8_K_XL-00012-of-00014.gguf
|
||||
sha256: ce0d028314aa3fc783082dbca097e1055d69686a17ab8306574e2949568f26a5
|
||||
uri: https://huggingface.co/unsloth/Kimi-K2.7-Code-GGUF/resolve/main/UD-Q8_K_XL/Kimi-K2.7-Code-UD-Q8_K_XL-00012-of-00014.gguf
|
||||
- filename: llama-cpp/models/Kimi-K2.7-Code-GGUF/Kimi-K2.7-Code-UD-Q8_K_XL-00013-of-00014.gguf
|
||||
sha256: 217864ce63a1d130ab39dcb0996b6097e1aa78eb896e38efaefdbbac3a00b7ec
|
||||
uri: https://huggingface.co/unsloth/Kimi-K2.7-Code-GGUF/resolve/main/UD-Q8_K_XL/Kimi-K2.7-Code-UD-Q8_K_XL-00013-of-00014.gguf
|
||||
- filename: llama-cpp/models/Kimi-K2.7-Code-GGUF/Kimi-K2.7-Code-UD-Q8_K_XL-00014-of-00014.gguf
|
||||
sha256: eb7582ad7066c5eaa01bde95acb00b4ad9cd7b07cd50a6cf5c9ee427258bc9dd
|
||||
uri: https://huggingface.co/unsloth/Kimi-K2.7-Code-GGUF/resolve/main/UD-Q8_K_XL/Kimi-K2.7-Code-UD-Q8_K_XL-00014-of-00014.gguf
|
||||
- filename: llama-cpp/mmproj/Kimi-K2.7-Code-GGUF/mmproj-F32.gguf
|
||||
sha256: b2cc50c8c13fe70fc4968a83332f31e9007ea09ebb9ae91d46a4e4cd2a3053cd
|
||||
uri: https://huggingface.co/unsloth/Kimi-K2.7-Code-GGUF/resolve/main/mmproj-F32.gguf
|
||||
- name: "qwythos-9b-claude-mythos-5-1m"
|
||||
url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
|
||||
urls:
|
||||
@@ -49,33 +253,7 @@
|
||||
url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
|
||||
urls:
|
||||
- https://huggingface.co/unsloth/GLM-5.2-GGUF
|
||||
description: |
|
||||
# GLM-5.2
|
||||
|
||||
👋 Join our WeChat or Discord community.
|
||||
|
||||
📖 Check out the GLM-5.2 blog and GLM-5 Technical report.
|
||||
|
||||
📍 Use GLM-5.2 API services on Z.ai API Platform.
|
||||
|
||||
🔜 Try GLM-5.2 here.
|
||||
|
||||
[Paper]
|
||||
[GitHub]
|
||||
|
||||
## Introduction
|
||||
|
||||
We're introducing GLM-5.2, our latest flagship model for long-horizon tasks. It marks a substantial leap in long-horizon task capability over its predecessor GLM-5.1 and, for the first time, delivers that capability on a **solid 1M-token context**. GLM-5.2's new capabilities include:
|
||||
- **Solid 1M Context:** A solid 1M-token context that stably sustains long-horizon work
|
||||
- **Advanced Coding with Flexible Effort**: Stronger coding capabilities with multiple thinking effort levels to balance performance and latency
|
||||
- **Improved Architecture**: We propose IndexShare, which reuses the same indexer across every four sparse attention layers, reducing per-token FLOPs by 2.9× at a 1M context length. We also improve GLM-5.2’s MTP layer for speculative decoding, increasing the acceptance length by up to 20%
|
||||
- **Pure Open**: An MIT open-source license — no regional limits, technical access without borders
|
||||
|
||||
## Benchmark
|
||||
|
||||
## Serve GLM-5.2 Locally
|
||||
|
||||
...
|
||||
description: "# GLM-5.2\n\n\U0001F44B Join our WeChat or Discord community.\n\n\U0001F4D6 Check out the GLM-5.2 blog and GLM-5 Technical report.\n\n\U0001F4CD Use GLM-5.2 API services on Z.ai API Platform.\n\n\U0001F51C Try GLM-5.2 here.\n\n[Paper]\n[GitHub]\n\n## Introduction\n\nWe're introducing GLM-5.2, our latest flagship model for long-horizon tasks. It marks a substantial leap in long-horizon task capability over its predecessor GLM-5.1 and, for the first time, delivers that capability on a **solid 1M-token context**. GLM-5.2's new capabilities include:\n - **Solid 1M Context:** A solid 1M-token context that stably sustains long-horizon work\n - **Advanced Coding with Flexible Effort**: Stronger coding capabilities with multiple thinking effort levels to balance performance and latency\n - **Improved Architecture**: We propose IndexShare, which reuses the same indexer across every four sparse attention layers, reducing per-token FLOPs by 2.9× at a 1M context length. We also improve GLM-5.2’s MTP layer for speculative decoding, increasing the acceptance length by up to 20%\n - **Pure Open**: An MIT open-source license — no regional limits, technical access without borders\n\n## Benchmark\n\n## Serve GLM-5.2 Locally\n\n...\n"
|
||||
license: "mit"
|
||||
tags:
|
||||
- llm
|
||||
@@ -198,26 +376,7 @@
|
||||
url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
|
||||
urls:
|
||||
- https://huggingface.co/michaelw9999/Qwopus3.6-27B-v2-MTP-NVFP4-GGUF
|
||||
description: |
|
||||
🪐 Qwopus3.6-27B-v2-MTP
|
||||
MTP Release
|
||||
|
||||
Multi-Token Prediction reasoning model fine-tuned from Qwen3.6-27B
|
||||
|
||||
🧬 Trace Inversion & Negentropy
|
||||
🧠 27B Parameters
|
||||
⚡ Speculative Decoding
|
||||
🛠️ Coding / DevOps / Math
|
||||
|
||||
💡 What is Qwopus3.6-27B-v2-MTP?
|
||||
🪐 Qwopus3.6-27B-v2-MTP is a speed-oriented reasoning release built on top of Qwen3.6-27B. It keeps the Qwopus line's focus on reconstructed reasoning traces, coding discipline, DevOps procedures, and mathematical derivations, while adding Multi-Token Prediction for faster generation. The goal is simple: preserve the depth and structure of a 27B reasoning model while making real interactive use noticeably faster.
|
||||
|
||||
⚡ MTP DecodingAuxiliary future-token prediction improves throughput on long reasoning, code, math, and strict-format prompts.
|
||||
🧩 Structured ReasoningInherits the Qwopus training recipe built around reconstructed step-by-step reasoning trajectories.
|
||||
🧪 GB10 TestedValidated on a 30-question local benchmark across Logic, Coding, DevOps, Math, and Edge tasks.
|
||||
🚀 Practical SpeedDesigned for workflows where strong answers matter, but waiting several extra minutes per task does not.
|
||||
|
||||
...
|
||||
description: "\U0001FA90 Qwopus3.6-27B-v2-MTP\nMTP Release\n\nMulti-Token Prediction reasoning model fine-tuned from Qwen3.6-27B\n\n\U0001F9EC Trace Inversion & Negentropy\n\U0001F9E0 27B Parameters\n⚡ Speculative Decoding\n\U0001F6E0️ Coding / DevOps / Math\n\n\U0001F4A1 What is Qwopus3.6-27B-v2-MTP?\n\U0001FA90 Qwopus3.6-27B-v2-MTP is a speed-oriented reasoning release built on top of Qwen3.6-27B. It keeps the Qwopus line's focus on reconstructed reasoning traces, coding discipline, DevOps procedures, and mathematical derivations, while adding Multi-Token Prediction for faster generation. The goal is simple: preserve the depth and structure of a 27B reasoning model while making real interactive use noticeably faster.\n\n⚡ MTP DecodingAuxiliary future-token prediction improves throughput on long reasoning, code, math, and strict-format prompts.\n\U0001F9E9 Structured ReasoningInherits the Qwopus training recipe built around reconstructed step-by-step reasoning trajectories.\n\U0001F9EA GB10 TestedValidated on a 30-question local benchmark across Logic, Coding, DevOps, Math, and Edge tasks.\n\U0001F680 Practical SpeedDesigned for workflows where strong answers matter, but waiting several extra minutes per task does not.\n\n...\n"
|
||||
tags:
|
||||
- llm
|
||||
- gguf
|
||||
@@ -243,28 +402,7 @@
|
||||
url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
|
||||
urls:
|
||||
- https://huggingface.co/michaelw9999/Qwopus3.6-27B-Coder-MTP-NVFP4-GGUF
|
||||
description: |
|
||||
🪐 Qwopus-3.6-27B-Coder
|
||||
Coder SFT Release
|
||||
|
||||
Agentic Coding & Tool-Use Reasoning Model Fine-Tuned on Qwopus3.6-27B-v2
|
||||
|
||||
🧬 Trace Inversion & Negentropy
|
||||
🧠 27B Dense Model
|
||||
⚡ Agentic Coding
|
||||
🛠️ Tool Calling & Agent
|
||||
🏆 SWE-bench Verified: 67.0% (off-thinking)
|
||||
|
||||
💡 What is Qwopus-3.6-27B-Coder?
|
||||
🪐 Qwopus-3.6-27B-Coder is a reasoning-enhanced agentic coding model built on top of Qwopus3.6-27B-v2. It inherits the powerful reasoning foundation of the v2 base — which achieved 87.43% MMLU-Pro (300ex) and 75.25% SWE-bench Verified — and further specializes it for agentic code generation, structured tool calling, debugging, and instruction-following in developer workflows. The model is designed to excel at repository-level coding tasks, multi-turn tool orchestration, and complex logical reasoning under realistic agent environments.
|
||||
|
||||
🧩 Agentic Coding
|
||||
Optimized for repository-level coding, debugging, patch generation, and structured multi-step development workflows.
|
||||
|
||||
🛠️ Tool Calling
|
||||
Learns from real agent trajectories with tool definitions, tool calls, and environment feedback for robust multi-turn execution.
|
||||
|
||||
...
|
||||
description: "\U0001FA90 Qwopus-3.6-27B-Coder\nCoder SFT Release\n\nAgentic Coding & Tool-Use Reasoning Model Fine-Tuned on Qwopus3.6-27B-v2\n\n\U0001F9EC Trace Inversion & Negentropy\n\U0001F9E0 27B Dense Model\n⚡ Agentic Coding\n\U0001F6E0️ Tool Calling & Agent\n\U0001F3C6 SWE-bench Verified: 67.0% (off-thinking)\n\n\U0001F4A1 What is Qwopus-3.6-27B-Coder?\n\U0001FA90 Qwopus-3.6-27B-Coder is a reasoning-enhanced agentic coding model built on top of Qwopus3.6-27B-v2. It inherits the powerful reasoning foundation of the v2 base — which achieved 87.43% MMLU-Pro (300ex) and 75.25% SWE-bench Verified — and further specializes it for agentic code generation, structured tool calling, debugging, and instruction-following in developer workflows. The model is designed to excel at repository-level coding tasks, multi-turn tool orchestration, and complex logical reasoning under realistic agent environments.\n\n\U0001F9E9 Agentic Coding\nOptimized for repository-level coding, debugging, patch generation, and structured multi-step development workflows.\n\n\U0001F6E0️ Tool Calling\nLearns from real agent trajectories with tool definitions, tool calls, and environment feedback for robust multi-turn execution.\n\n...\n"
|
||||
tags:
|
||||
- llm
|
||||
- gguf
|
||||
@@ -687,8 +825,8 @@
|
||||
use_tokenizer_template: true
|
||||
files:
|
||||
- filename: llama-cpp/models/Qwopus3.6-27B-Coder-MTP-GGUF/Qwopus3.6-27B-Coder-MTP-Q4_K_M.gguf
|
||||
sha256: b2898667ed7b2388f0ab7691393833ae777f247492bbe62fdb4b2bd3e3cf3f79
|
||||
uri: https://huggingface.co/Jackrong/Qwopus3.6-27B-Coder-MTP-GGUF/resolve/main/Qwopus3.6-27B-Coder-MTP-Q4_K_M.gguf
|
||||
sha256: b2b9180093496da2e00439e3fa23227c591355901bfa579bc6897bbc01b755ef
|
||||
- filename: llama-cpp/mmproj/Qwopus3.6-27B-Coder-MTP-GGUF/mmproj-F32.gguf
|
||||
sha256: 32f7ea0600c07272547da401d460f8abbd980f3a57b69d6df87be0e2505e0b9c
|
||||
uri: https://huggingface.co/Jackrong/Qwopus3.6-27B-Coder-MTP-GGUF/resolve/main/mmproj-F32.gguf
|
||||
@@ -1484,8 +1622,8 @@
|
||||
use_tokenizer_template: true
|
||||
files:
|
||||
- filename: llama-cpp/models/Qwopus3.6-27B-v2-MTP-GGUF/Qwopus3.6-27B-v2-MTP-Q4_K_M.gguf
|
||||
sha256: 818d68223be4d8518dac0b3b5604dde633cbbcbae1f491d842a3e26711c6606d
|
||||
uri: https://huggingface.co/Jackrong/Qwopus3.6-27B-v2-MTP-GGUF/resolve/main/Qwopus3.6-27B-v2-MTP-Q4_K_M.gguf
|
||||
sha256: 31cf5fc2406a0c7aaebcc26d440bf0df94e215d0589d5205bf319649c052b50a
|
||||
- name: "qwen3.6-40b-claude-4.6-opus-deckard-heretic-uncensored-thinking-neo-code-di-imatrix-max"
|
||||
url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
|
||||
urls:
|
||||
|
||||
@@ -41,11 +41,34 @@ func buildClient(address string, parallel bool, wd WatchDog, enableWatchDog bool
|
||||
}
|
||||
}
|
||||
|
||||
// Backend is the full client surface of a model backend. It is deliberately
|
||||
// composed of two sub-interfaces so that wrappers can get a COMPILE-TIME
|
||||
// guarantee about which methods they must account for:
|
||||
//
|
||||
// - InferenceBackend - methods that each perform one discrete inference call
|
||||
// (the call begins on entry and ends on return). A wrapper that does
|
||||
// per-call accounting - e.g. the distributed router's in-flight tracker,
|
||||
// core/services/nodes.InFlightTrackingClient - embeds only ControlBackend
|
||||
// and implements every InferenceBackend method explicitly. Adding a method
|
||||
// to InferenceBackend therefore breaks that wrapper's build until it is
|
||||
// implemented: inference can't be added without an accounting decision.
|
||||
// - ControlBackend - everything that is NOT a discrete inference call:
|
||||
// lifecycle/control-plane operations and the streaming constructors whose
|
||||
// work spans the returned stream rather than the constructor call. These
|
||||
// are safe to pass through untracked.
|
||||
//
|
||||
// Keep the two sets disjoint; every backend method belongs to exactly one.
|
||||
type Backend interface {
|
||||
IsBusy() bool
|
||||
HealthCheck(ctx context.Context) (bool, error)
|
||||
InferenceBackend
|
||||
ControlBackend
|
||||
}
|
||||
|
||||
// InferenceBackend is the subset of Backend whose methods each map to a single
|
||||
// inference call. Wrappers that account for in-flight work must implement these
|
||||
// explicitly (see Backend). Do NOT add methods that return a stream client or
|
||||
// that are control-plane only - those belong in ControlBackend.
|
||||
type InferenceBackend interface {
|
||||
Embeddings(ctx context.Context, in *pb.PredictOptions, opts ...grpc.CallOption) (*pb.EmbeddingResult, error)
|
||||
LoadModel(ctx context.Context, in *pb.ModelOptions, opts ...grpc.CallOption) (*pb.Result, error)
|
||||
PredictStream(ctx context.Context, in *pb.PredictOptions, f func(reply *pb.Reply), opts ...grpc.CallOption) error
|
||||
Predict(ctx context.Context, in *pb.PredictOptions, opts ...grpc.CallOption) (*pb.Reply, error)
|
||||
GenerateImage(ctx context.Context, in *pb.GenerateImageRequest, opts ...grpc.CallOption) (*pb.Result, error)
|
||||
@@ -53,6 +76,8 @@ type Backend interface {
|
||||
TTS(ctx context.Context, in *pb.TTSRequest, opts ...grpc.CallOption) (*pb.Result, error)
|
||||
TTSStream(ctx context.Context, in *pb.TTSRequest, f func(reply *pb.Reply), opts ...grpc.CallOption) error
|
||||
SoundGeneration(ctx context.Context, in *pb.SoundGenerationRequest, opts ...grpc.CallOption) (*pb.Result, error)
|
||||
AudioTranscription(ctx context.Context, in *pb.TranscriptRequest, opts ...grpc.CallOption) (*pb.TranscriptResult, error)
|
||||
AudioTranscriptionStream(ctx context.Context, in *pb.TranscriptRequest, f func(chunk *pb.TranscriptStreamResponse), opts ...grpc.CallOption) error
|
||||
Detect(ctx context.Context, in *pb.DetectOptions, opts ...grpc.CallOption) (*pb.DetectResponse, error)
|
||||
Depth(ctx context.Context, in *pb.DepthRequest, opts ...grpc.CallOption) (*pb.DepthResponse, error)
|
||||
FaceVerify(ctx context.Context, in *pb.FaceVerifyRequest, opts ...grpc.CallOption) (*pb.FaceVerifyResponse, error)
|
||||
@@ -60,8 +85,25 @@ type Backend interface {
|
||||
VoiceVerify(ctx context.Context, in *pb.VoiceVerifyRequest, opts ...grpc.CallOption) (*pb.VoiceVerifyResponse, error)
|
||||
VoiceAnalyze(ctx context.Context, in *pb.VoiceAnalyzeRequest, opts ...grpc.CallOption) (*pb.VoiceAnalyzeResponse, error)
|
||||
VoiceEmbed(ctx context.Context, in *pb.VoiceEmbedRequest, opts ...grpc.CallOption) (*pb.VoiceEmbedResponse, error)
|
||||
AudioTranscription(ctx context.Context, in *pb.TranscriptRequest, opts ...grpc.CallOption) (*pb.TranscriptResult, error)
|
||||
AudioTranscriptionStream(ctx context.Context, in *pb.TranscriptRequest, f func(chunk *pb.TranscriptStreamResponse), opts ...grpc.CallOption) error
|
||||
Rerank(ctx context.Context, in *pb.RerankRequest, opts ...grpc.CallOption) (*pb.RerankResult, error)
|
||||
TokenClassify(ctx context.Context, in *pb.TokenClassifyRequest, opts ...grpc.CallOption) (*pb.TokenClassifyResponse, error)
|
||||
Score(ctx context.Context, in *pb.ScoreRequest, opts ...grpc.CallOption) (*pb.ScoreResponse, error)
|
||||
VAD(ctx context.Context, in *pb.VADRequest, opts ...grpc.CallOption) (*pb.VADResponse, error)
|
||||
Diarize(ctx context.Context, in *pb.DiarizeRequest, opts ...grpc.CallOption) (*pb.DiarizeResponse, error)
|
||||
SoundDetection(ctx context.Context, in *pb.SoundDetectionRequest, opts ...grpc.CallOption) (*pb.SoundDetectionResponse, error)
|
||||
AudioEncode(ctx context.Context, in *pb.AudioEncodeRequest, opts ...grpc.CallOption) (*pb.AudioEncodeResult, error)
|
||||
AudioDecode(ctx context.Context, in *pb.AudioDecodeRequest, opts ...grpc.CallOption) (*pb.AudioDecodeResult, error)
|
||||
AudioTransform(ctx context.Context, in *pb.AudioTransformRequest, opts ...grpc.CallOption) (*pb.AudioTransformResult, error)
|
||||
}
|
||||
|
||||
// ControlBackend is the subset of Backend that is NOT per-call inference:
|
||||
// lifecycle/control-plane operations and the streaming constructors whose work
|
||||
// spans the returned stream rather than the constructor call. In-flight-tracking
|
||||
// wrappers embed this directly and pass it through untracked (see Backend).
|
||||
type ControlBackend interface {
|
||||
IsBusy() bool
|
||||
HealthCheck(ctx context.Context) (bool, error)
|
||||
LoadModel(ctx context.Context, in *pb.ModelOptions, opts ...grpc.CallOption) (*pb.Result, error)
|
||||
TokenizeString(ctx context.Context, in *pb.PredictOptions, opts ...grpc.CallOption) (*pb.TokenizationResponse, error)
|
||||
Status(ctx context.Context) (*pb.StatusResponse, error)
|
||||
|
||||
@@ -70,24 +112,11 @@ type Backend interface {
|
||||
StoresGet(ctx context.Context, in *pb.StoresGetOptions, opts ...grpc.CallOption) (*pb.StoresGetResult, error)
|
||||
StoresFind(ctx context.Context, in *pb.StoresFindOptions, opts ...grpc.CallOption) (*pb.StoresFindResult, error)
|
||||
|
||||
Rerank(ctx context.Context, in *pb.RerankRequest, opts ...grpc.CallOption) (*pb.RerankResult, error)
|
||||
|
||||
TokenClassify(ctx context.Context, in *pb.TokenClassifyRequest, opts ...grpc.CallOption) (*pb.TokenClassifyResponse, error)
|
||||
|
||||
Score(ctx context.Context, in *pb.ScoreRequest, opts ...grpc.CallOption) (*pb.ScoreResponse, error)
|
||||
|
||||
GetTokenMetrics(ctx context.Context, in *pb.MetricsRequest, opts ...grpc.CallOption) (*pb.MetricsResponse, error)
|
||||
|
||||
VAD(ctx context.Context, in *pb.VADRequest, opts ...grpc.CallOption) (*pb.VADResponse, error)
|
||||
|
||||
Diarize(ctx context.Context, in *pb.DiarizeRequest, opts ...grpc.CallOption) (*pb.DiarizeResponse, error)
|
||||
|
||||
SoundDetection(ctx context.Context, in *pb.SoundDetectionRequest, opts ...grpc.CallOption) (*pb.SoundDetectionResponse, error)
|
||||
|
||||
AudioEncode(ctx context.Context, in *pb.AudioEncodeRequest, opts ...grpc.CallOption) (*pb.AudioEncodeResult, error)
|
||||
AudioDecode(ctx context.Context, in *pb.AudioDecodeRequest, opts ...grpc.CallOption) (*pb.AudioDecodeResult, error)
|
||||
|
||||
AudioTransform(ctx context.Context, in *pb.AudioTransformRequest, opts ...grpc.CallOption) (*pb.AudioTransformResult, error)
|
||||
// Streaming constructors: these return a stream client immediately; the
|
||||
// actual inference spans the stream's lifetime, not this call, so they are
|
||||
// NOT tracked as a single in-flight unit.
|
||||
AudioTransformStream(ctx context.Context, opts ...grpc.CallOption) (AudioTransformStreamClient, error)
|
||||
AudioToAudioStream(ctx context.Context, opts ...grpc.CallOption) (AudioToAudioStreamClient, error)
|
||||
|
||||
|
||||
@@ -129,6 +129,61 @@ func TotalAvailableVRAM() (uint64, error) {
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
// MinPerGPUVRAM returns the total VRAM of the SMALLEST GPU on the host (in
|
||||
// bytes), or 0 when no per-device VRAM is known. Unlike TotalAvailableVRAM
|
||||
// (which sums across devices) this reports a single device's ceiling, which is
|
||||
// the right figure for decisions about what must fit on one card: the compute
|
||||
// buffer (sized by n_ubatch) and the parallel-slot tier. Summing a multi-GPU
|
||||
// host's VRAM over-provisions those into a per-device OOM (issue #10485).
|
||||
//
|
||||
// Unified-memory devices (GB10, Apple) report system RAM as their single
|
||||
// device's VRAM, so they are unaffected.
|
||||
func MinPerGPUVRAM() (uint64, error) {
|
||||
// Prefer per-device binary detection (nvidia-smi/rocm-smi report true
|
||||
// per-card VRAM); ghw's per-card memory can reflect NUMA node RAM on some
|
||||
// hosts, which is why TotalAvailableVRAM treats it as a sum.
|
||||
if infos := GetGPUMemoryUsage(); len(infos) > 0 {
|
||||
if v := minNonZeroVRAM(infos); v > 0 {
|
||||
return v, nil
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback: ghw per-card memory, taking the minimum non-zero card.
|
||||
if gpus, err := GPUs(); err == nil {
|
||||
var min uint64
|
||||
for _, gpu := range gpus {
|
||||
if gpu == nil || gpu.Node == nil || gpu.Node.Memory == nil {
|
||||
continue
|
||||
}
|
||||
if b := gpu.Node.Memory.TotalUsableBytes; b > 0 {
|
||||
if u := uint64(b); min == 0 || u < min {
|
||||
min = u
|
||||
}
|
||||
}
|
||||
}
|
||||
if min > 0 {
|
||||
return min, nil
|
||||
}
|
||||
}
|
||||
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
// minNonZeroVRAM returns the smallest non-zero TotalVRAM across the given GPUs,
|
||||
// or 0 when none report VRAM.
|
||||
func minNonZeroVRAM(infos []GPUMemoryInfo) uint64 {
|
||||
var min uint64
|
||||
for _, g := range infos {
|
||||
if g.TotalVRAM == 0 {
|
||||
continue
|
||||
}
|
||||
if min == 0 || g.TotalVRAM < min {
|
||||
min = g.TotalVRAM
|
||||
}
|
||||
}
|
||||
return min
|
||||
}
|
||||
|
||||
func HasGPU(vendor string) bool {
|
||||
gpus, err := GPUs()
|
||||
if err != nil {
|
||||
|
||||
37
pkg/xsysinfo/minvram_internal_test.go
Normal file
37
pkg/xsysinfo/minvram_internal_test.go
Normal file
@@ -0,0 +1,37 @@
|
||||
package xsysinfo
|
||||
|
||||
import (
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
var _ = Describe("minNonZeroVRAM", func() {
|
||||
const gib = uint64(1) << 30
|
||||
|
||||
It("returns the smallest device on a multi-GPU host", func() {
|
||||
// Two unequal cards (e.g. a 16 GiB card paired with a 12 GiB card): the
|
||||
// smallest device, not the sum, is the per-card allocation ceiling.
|
||||
infos := []GPUMemoryInfo{
|
||||
{TotalVRAM: 16 * gib},
|
||||
{TotalVRAM: 12 * gib},
|
||||
}
|
||||
Expect(minNonZeroVRAM(infos)).To(Equal(12 * gib))
|
||||
})
|
||||
|
||||
It("ignores devices that report zero VRAM", func() {
|
||||
infos := []GPUMemoryInfo{
|
||||
{TotalVRAM: 0},
|
||||
{TotalVRAM: 24 * gib},
|
||||
}
|
||||
Expect(minNonZeroVRAM(infos)).To(Equal(24 * gib))
|
||||
})
|
||||
|
||||
It("returns the single device's VRAM on a one-GPU host", func() {
|
||||
Expect(minNonZeroVRAM([]GPUMemoryInfo{{TotalVRAM: 16 * gib}})).To(Equal(16 * gib))
|
||||
})
|
||||
|
||||
It("returns 0 when no device reports VRAM", func() {
|
||||
Expect(minNonZeroVRAM([]GPUMemoryInfo{{TotalVRAM: 0}})).To(BeZero())
|
||||
Expect(minNonZeroVRAM(nil)).To(BeZero())
|
||||
})
|
||||
})
|
||||
@@ -53,12 +53,13 @@ var _ = Describe("Gallery Distributed", Label("Distributed"), func() {
|
||||
Expect(retrieved.Status).To(Equal("downloading"))
|
||||
Expect(retrieved.FrontendID).To(Equal("f1"))
|
||||
|
||||
// Update progress
|
||||
Expect(galleryStore.UpdateProgress(op.ID, 0.75, "75% complete", "6GB")).To(Succeed())
|
||||
// Update progress (cancellable: a downloading install can be cancelled)
|
||||
Expect(galleryStore.UpdateProgress(op.ID, 0.75, "75% complete", "6GB", true)).To(Succeed())
|
||||
|
||||
updated, _ := galleryStore.Get(op.ID)
|
||||
Expect(updated.Progress).To(BeNumerically("~", 0.75, 0.01))
|
||||
Expect(updated.Message).To(Equal("75% complete"))
|
||||
Expect(updated.Cancellable).To(BeTrue())
|
||||
|
||||
// Complete
|
||||
Expect(galleryStore.UpdateStatus(op.ID, "completed", "")).To(Succeed())
|
||||
|
||||
@@ -104,11 +104,12 @@ var _ = Describe("Phase 4: MCP, Skills, Gallery, Fine-Tuning", Label("Distribute
|
||||
}
|
||||
stores.Gallery.Create(op)
|
||||
|
||||
Expect(stores.Gallery.UpdateProgress(op.ID, 0.5, "50% complete", "2GB")).To(Succeed())
|
||||
Expect(stores.Gallery.UpdateProgress(op.ID, 0.5, "50% complete", "2GB", true)).To(Succeed())
|
||||
|
||||
updated, _ := stores.Gallery.Get(op.ID)
|
||||
Expect(updated.Progress).To(BeNumerically("~", 0.5, 0.01))
|
||||
Expect(updated.Message).To(Equal("50% complete"))
|
||||
Expect(updated.Cancellable).To(BeTrue())
|
||||
})
|
||||
|
||||
It("should deduplicate concurrent downloads", func() {
|
||||
|
||||
Reference in New Issue
Block a user