Merge origin/master into feat/darwin-vllm-metal

Resolve includeDarwin conflict in backend-matrix.yml: keep both the vllm and the newly-merged liquid-audio darwin entries (additive). Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:opus-4.8 [Claude Code]
fix(config): per-device VRAM headroom for Blackwell defaults (#10485) (#10494)
2026-06-25 00:59:28 -04:00 · 2026-06-24 22:15:55 +00:00 · 2026-06-25 00:07:48 +02:00 · 2026-06-24 21:31:41 +00:00 · 2026-06-24 23:30:08 +02:00 · 2026-06-24 23:18:24 +02:00
81 changed files with 1883 additions and 877 deletions
--- a/.github/backend-matrix.yml
+++ b/.github/backend-matrix.yml
@@ -4974,6 +4974,16 @@ includeDarwin:
  - backend: "kitten-tts"
    tag-suffix: "-metal-darwin-arm64-kitten-tts"
    build-type: "mps"
+  # vLLM on Apple Silicon via vllm-metal (MLX). The install is custom
+  # (backend/python/vllm/install.sh has a darwin branch); lang stays python so
+  # backend_build_darwin.yml drives it through build-darwin-python-backend ->
+  # scripts/build/python-darwin.sh, which runs the backend's install.sh.
+  - backend: "vllm"
+    tag-suffix: "-metal-darwin-arm64-vllm"
+    build-type: "mps"
+  - backend: "liquid-audio"
+    tag-suffix: "-metal-darwin-arm64-liquid-audio"
+    build-type: "mps"
  - backend: "piper"
    tag-suffix: "-metal-darwin-arm64-piper"
    build-type: "metal"
@@ -4990,6 +5000,10 @@ includeDarwin:
    tag-suffix: "-metal-darwin-arm64-sherpa-onnx"
    build-type: "metal"
    lang: "go"
+  - backend: "supertonic"
+    tag-suffix: "-metal-darwin-arm64-supertonic"
+    build-type: "metal"
+    lang: "go"
  - backend: "local-store"
    tag-suffix: "-metal-darwin-arm64-local-store"
    build-type: "metal"
--- a/.github/bump_vllm_metal.sh
+++ b/.github/bump_vllm_metal.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+# Bump the single vllm-metal pin (VLLM_METAL_VERSION) in the vLLM backend's
+# darwin (Apple Silicon) install path. The macOS/Metal build
+# (backend/python/vllm/install.sh, Darwin branch) installs vllm-metal, which is
+# version-locked to a specific vLLM source release. install.sh derives that vLLM
+# version at build time from vllm-metal's own installer (`vllm_v=`) at the pinned
+# tag, so there is only ONE value to bump here -- mirroring bump_vllm_wheel.sh,
+# which bumps the Linux cu130 wheel pin.
+#
+# This deliberately tracks vllm-project/vllm-metal, NOT vllm-project/vllm: the
+# darwin build can only use the exact vLLM version vllm-metal supports, so it may
+# lag the Linux pin (requirements-cublas13-after.txt) until vllm-metal catches up.
+set -xe
+REPO=$1   # vllm-project/vllm-metal
+FILE=$2   # backend/python/vllm/install.sh
+VAR=$3    # VLLM_METAL_VERSION (used for the workflow's output file names)
+
+if [ -z "$FILE" ] || [ -z "$REPO" ] || [ -z "$VAR" ]; then
+    echo "usage: $0 <repo> <install-file> <var-name>" >&2
+    exit 1
+fi
+
+# vllm-metal ships frequent dev releases, all flagged as non-prerelease, so
+# /releases/latest returns the newest one (with its cp312 wheel asset).
+LATEST_TAG=$(curl -sS -H "Accept: application/vnd.github+json" \
+    "https://api.github.com/repos/$REPO/releases/latest" \
+    | python3 -c "import json,sys; print(json.load(sys.stdin)['tag_name'])")
+
+# The coupled vLLM source version lives in vllm-metal's installer at that tag.
+NEW_VLLM_VERSION=$(curl -fsSL \
+    "https://raw.githubusercontent.com/$REPO/$LATEST_TAG/install.sh" \
+    | grep -oE 'vllm_v="[0-9]+\.[0-9]+\.[0-9]+"' | head -1 | cut -d'"' -f2)
+
+if [ -z "$LATEST_TAG" ] || [ -z "$NEW_VLLM_VERSION" ]; then
+    echo "Could not resolve vllm-metal tag ($LATEST_TAG) or its vllm_v ($NEW_VLLM_VERSION)." >&2
+    exit 1
+fi
+
+set +e
+CURRENT_TAG=$(grep -oE 'VLLM_METAL_VERSION="[^"]*"' "$FILE" | head -1 | cut -d'"' -f2)
+set -e
+
+# Rewrite the single pin. install.sh derives VLLM_VERSION from this tag at build
+# time, so there is nothing else to touch. peter-evans/create-pull-request opens
+# no PR on a clean tree, so a no-op rewrite (already current) is safe.
+sed -i "$FILE" \
+    -e "s|VLLM_METAL_VERSION=\"[^\"]*\"|VLLM_METAL_VERSION=\"$LATEST_TAG\"|"
+
+if [ -z "$CURRENT_TAG" ]; then
+    echo "Could not find VLLM_METAL_VERSION=\"...\" in $FILE." >&2
+    exit 0
+fi
+
+echo "vllm-metal ${CURRENT_TAG} -> ${LATEST_TAG} (builds vLLM ${NEW_VLLM_VERSION}): https://github.com/$REPO/releases/tag/${LATEST_TAG}" >> "${VAR}_message.txt"
+echo "${LATEST_TAG}" >> "${VAR}_commit.txt"
--- a/.github/workflows/bump_deps.yaml
+++ b/.github/workflows/bump_deps.yaml
@@ -154,3 +154,39 @@ jobs:
          branch: "update/VLLM_VERSION"
          body: ${{ steps.bump.outputs.message }}
          signoff: true
+
+  bump-vllm-metal:
+    # The darwin (Apple Silicon) vLLM build installs vllm-metal, which is locked
+    # to a specific vLLM source release. install.sh pins only VLLM_METAL_VERSION
+    # (the wheel release) and derives VLLM_VERSION from that tag at build time;
+    # this job tracks vllm-project/vllm-metal and rewrites that one pin. Separate
+    # from bump-vllm-wheel because darwin follows vllm-metal, not vllm/vllm latest.
+    if: github.repository == 'mudler/LocalAI'
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v7
+      - name: Bump vllm-metal pin 🔧
+        id: bump
+        run: |
+          bash .github/bump_vllm_metal.sh vllm-project/vllm-metal backend/python/vllm/install.sh VLLM_METAL_VERSION
+          {
+            echo 'message<<EOF'
+            cat "VLLM_METAL_VERSION_message.txt"
+            echo EOF
+          } >> "$GITHUB_OUTPUT"
+          {
+            echo 'commit<<EOF'
+            cat "VLLM_METAL_VERSION_commit.txt"
+            echo EOF
+          } >> "$GITHUB_OUTPUT"
+          rm -rfv VLLM_METAL_VERSION_message.txt VLLM_METAL_VERSION_commit.txt
+      - name: Create Pull Request
+        uses: peter-evans/create-pull-request@v8
+        with:
+          token: ${{ secrets.UPDATE_BOT_TOKEN }}
+          push-to-fork: ci-forks/LocalAI
+          commit-message: ':arrow_up: Update vllm-project/vllm-metal (darwin)'
+          title: 'chore: :arrow_up: Update vllm-metal (darwin) to `${{ steps.bump.outputs.commit }}`'
+          branch: "update/VLLM_METAL_VERSION"
+          body: ${{ steps.bump.outputs.message }}
+          signoff: true
--- a/backend/cpp/ik-llama-cpp/Makefile
+++ b/backend/cpp/ik-llama-cpp/Makefile
@@ -1,5 +1,5 @@

-IK_LLAMA_VERSION?=6c00e87ac84404af588ad2e65935bd6f079c696f
+IK_LLAMA_VERSION?=7ccf1d209588962b96eacca325b37e9b3e8faf5e
 LLAMA_REPO?=https://github.com/ikawrakow/ik_llama.cpp

 CMAKE_ARGS?=
--- a/backend/cpp/llama-cpp/Makefile
+++ b/backend/cpp/llama-cpp/Makefile
@@ -1,5 +1,5 @@

-LLAMA_VERSION?=7c082bc417bbe53210a83df4ba5b49e18ce6193c
+LLAMA_VERSION?=be4a6a63eb2b848e19c277bdcf2bd399e8af76d9
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp

 CMAKE_ARGS?=
--- a/backend/go/crispasr/Makefile
+++ b/backend/go/crispasr/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # CrispASR version (release tag)
 CRISPASR_REPO?=https://github.com/CrispStrobe/CrispASR
-CRISPASR_VERSION?=7a8cb80907341c0204bd0488c1244764f4163883
+CRISPASR_VERSION?=96b2a6ee31d30389fed8a7ef1a54239b75231ddc
 SO_TARGET?=libgocrispasr.so

 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
--- a/backend/go/parakeet-cpp/Makefile
+++ b/backend/go/parakeet-cpp/Makefile
@@ -1,6 +1,6 @@
 # parakeet-cpp backend Makefile.
 #
-# Upstream pin lives below as PARAKEET_VERSION?=db755a78d39f789bb7d4e3935158a9e8105dbe36
+# Upstream pin lives below as PARAKEET_VERSION?=89f5e2977b4d8bccd45e7bcc6f2ef7c4ed49e89a
 # (.github/bump_deps.sh) can find and update it - matches the
 # whisper.cpp / ds4 / vibevoice-cpp convention.
 #
@@ -15,7 +15,7 @@
 # That's what the L0 smoke test uses. The default target below does the
 # proper clone-at-pin + cmake build so CI doesn't need a side-checkout.

-PARAKEET_VERSION?=db755a78d39f789bb7d4e3935158a9e8105dbe36
+PARAKEET_VERSION?=89f5e2977b4d8bccd45e7bcc6f2ef7c4ed49e89a
 PARAKEET_REPO?=https://github.com/mudler/parakeet.cpp

 GOCMD?=go
--- a/backend/go/stablediffusion-ggml/Makefile
+++ b/backend/go/stablediffusion-ggml/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # stablediffusion.cpp (ggml)
 STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
-STABLEDIFFUSION_GGML_VERSION?=b12098f5d09fc83da36e65c784f7bdb16a5a5ebf
+STABLEDIFFUSION_GGML_VERSION?=f440ad9c29dd8bc34e5d1f4b863832b96d6ea05f

 CMAKE_ARGS+=-DGGML_MAX_NAME=128

--- a/backend/go/supertonic/helper.go
+++ b/backend/go/supertonic/helper.go
@@ -16,6 +16,7 @@ import (
 	"os"
 	"path/filepath"
 	"regexp"
+	"runtime"
 	"strings"
 	"time"
 	"unicode"
@@ -943,7 +944,13 @@ func InitializeONNXRuntime() error {
 			}
 		}
 		if libPath == "" {
-			libPath = "/usr/local/lib/libonnxruntime.so"
+			// LocalAI: default to the platform-native shared library
+			// extension when nothing else is found (dyld vs ld.so).
+			if runtime.GOOS == "darwin" {
+				libPath = "/usr/local/lib/libonnxruntime.dylib"
+			} else {
+				libPath = "/usr/local/lib/libonnxruntime.so"
+			}
 		}
 	}
 	ort.SetSharedLibraryPath(libPath)
--- a/backend/go/supertonic/package.sh
+++ b/backend/go/supertonic/package.sh
@@ -32,6 +32,10 @@ elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
    cp -arfLv /lib/aarch64-linux-gnu/libdl.so.2 $CURDIR/package/lib/libdl.so.2
    cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 $CURDIR/package/lib/librt.so.1
    cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 $CURDIR/package/lib/libpthread.so.0
+elif [ $(uname -s) = "Darwin" ]; then
+    # macOS: dyld resolves the bundled .dylib via DYLD_LIBRARY_PATH (set in
+    # run.sh); there is no ld.so loader nor glibc to bundle.
+    echo "Detected Darwin"
 else
    echo "Error: Could not detect architecture"
    exit 1
--- a/backend/go/supertonic/run.sh
+++ b/backend/go/supertonic/run.sh
@@ -3,12 +3,19 @@ set -ex

 CURDIR=$(dirname "$(realpath $0)")

-export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
-export ONNXRUNTIME_LIB_PATH=$CURDIR/lib/libonnxruntime.so
+if [ "$(uname)" = "Darwin" ]; then
+	# macOS uses dyld: there is no ld.so loader, and the search path env
+	# var is DYLD_LIBRARY_PATH. ONNX Runtime ships as a .dylib here.
+	export DYLD_LIBRARY_PATH=$CURDIR/lib:$DYLD_LIBRARY_PATH
+	export ONNXRUNTIME_LIB_PATH=$CURDIR/lib/libonnxruntime.dylib
+else
+	export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
+	export ONNXRUNTIME_LIB_PATH=$CURDIR/lib/libonnxruntime.so

-if [ -f $CURDIR/lib/ld.so ]; then
-	echo "Using lib/ld.so"
-	exec $CURDIR/lib/ld.so $CURDIR/supertonic "$@"
+	if [ -f $CURDIR/lib/ld.so ]; then
+		echo "Using lib/ld.so"
+		exec $CURDIR/lib/ld.so $CURDIR/supertonic "$@"
+	fi
 fi

 exec $CURDIR/supertonic "$@"
--- a/backend/go/whisper/Makefile
+++ b/backend/go/whisper/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp
-WHISPER_CPP_VERSION?=5ed76e9a079962f1c85cfce44edd325c27ef1f97
+WHISPER_CPP_VERSION?=43d78af5be58f41d6ffbc227d608f104577741ea
 SO_TARGET?=libgowhisper.so

 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
--- a/backend/index.yaml
+++ b/backend/index.yaml
@@ -645,6 +645,7 @@
    nvidia-cuda-13: "cuda13-vllm"
    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-vllm"
    cpu: "cpu-vllm"
+    metal: "metal-vllm"
 - &sglang
  name: "sglang"
  license: apache-2.0
@@ -1284,6 +1285,7 @@
    nvidia-cuda-13: "cuda13-liquid-audio"
    nvidia-cuda-12: "cuda12-liquid-audio"
    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-liquid-audio"
+    metal: "metal-liquid-audio"
  icon: https://cdn-avatars.huggingface.co/v1/production/uploads/61b8e2ba285851687028d395/7_6D7rWrLxp2hb6OHSV1p.png
 - &qwen-tts
  urls:
@@ -1569,6 +1571,7 @@
    - TTS
  capabilities:
    default: "cpu-supertonic"
+    metal: "metal-supertonic"
 - !!merge <<: *neutts
  name: "neutts-development"
  capabilities:
@@ -2927,6 +2930,17 @@
    nvidia-cuda-13: "cuda13-vllm-development"
    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-vllm-development"
    cpu: "cpu-vllm-development"
+    metal: "metal-vllm-development"
+- !!merge <<: *vllm
+  name: "metal-vllm"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-vllm"
+  mirrors:
+    - localai/localai-backends:latest-metal-darwin-arm64-vllm
+- !!merge <<: *vllm
+  name: "metal-vllm-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-vllm"
+  mirrors:
+    - localai/localai-backends:master-metal-darwin-arm64-vllm
 - !!merge <<: *vllm
  name: "cuda12-vllm"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-vllm"
@@ -4612,6 +4626,7 @@
    nvidia-cuda-13: "cuda13-liquid-audio-development"
    nvidia-cuda-12: "cuda12-liquid-audio-development"
    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-liquid-audio-development"
+    metal: "metal-liquid-audio-development"
 - !!merge <<: *liquid-audio
  name: "cpu-liquid-audio"
  uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-liquid-audio"
@@ -4622,6 +4637,16 @@
  uri: "quay.io/go-skynet/local-ai-backends:master-cpu-liquid-audio"
  mirrors:
    - localai/localai-backends:master-cpu-liquid-audio
+- !!merge <<: *liquid-audio
+  name: "metal-liquid-audio"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-liquid-audio"
+  mirrors:
+    - localai/localai-backends:latest-metal-darwin-arm64-liquid-audio
+- !!merge <<: *liquid-audio
+  name: "metal-liquid-audio-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-liquid-audio"
+  mirrors:
+    - localai/localai-backends:master-metal-darwin-arm64-liquid-audio
 - !!merge <<: *liquid-audio
  name: "cuda12-liquid-audio"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-liquid-audio"
@@ -5484,6 +5509,7 @@
  name: "supertonic-development"
  capabilities:
    default: "cpu-supertonic-development"
+    metal: "metal-supertonic-development"
 - !!merge <<: *supertonic
  name: "cpu-supertonic"
  uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-supertonic"
@@ -5494,3 +5520,13 @@
  uri: "quay.io/go-skynet/local-ai-backends:master-cpu-supertonic"
  mirrors:
    - localai/localai-backends:master-cpu-supertonic
+- !!merge <<: *supertonic
+  name: "metal-supertonic"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-supertonic"
+  mirrors:
+    - localai/localai-backends:latest-metal-darwin-arm64-supertonic
+- !!merge <<: *supertonic
+  name: "metal-supertonic-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-supertonic"
+  mirrors:
+    - localai/localai-backends:master-metal-darwin-arm64-supertonic
--- a/backend/python/liquid-audio/install.sh
+++ b/backend/python/liquid-audio/install.sh
@@ -14,5 +14,11 @@ else
 fi

 # liquid-audio's torch wheels are large; allow upgrades to satisfy transitive pins
-EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
+EXTRA_PIP_INSTALL_FLAGS+=" --upgrade"
+# --index-strategy is a uv-only flag. The darwin/MPS build installs with pip
+# (USE_PIP=true in scripts/build/python-darwin.sh), which rejects it. Only add
+# it on the uv path; Linux/CUDA resolution is unchanged.
+if [ "x${USE_PIP:-}" != "xtrue" ]; then
+    EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-first-match"
+fi
 installRequirements
--- a/backend/python/liquid-audio/requirements-mps.txt
+++ b/backend/python/liquid-audio/requirements-mps.txt
@@ -1,3 +1,4 @@
+# MPS (Apple Silicon / Metal) build profile - installed by the darwin CI job.
 torch>=2.8.0
 torchaudio>=2.8.0
 torchcodec>=0.9.1
--- a/backend/python/vllm/backend.py
+++ b/backend/python/vllm/backend.py
@@ -457,9 +457,14 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                    except Exception:
                        pass

-                if last_output is None or not getattr(last_output, "prompt_logprobs", None):
-                    context.set_code(grpc.StatusCode.INTERNAL)
-                    context.set_details("vLLM did not return prompt_logprobs")
+                _pl = getattr(last_output, "prompt_logprobs", None) if last_output is not None else None
+                # Some engines accept the prompt_logprobs request but return a
+                # list of all-None entries instead of computing them (observed
+                # with vllm-metal's MLX backend on macOS). Treat that as
+                # unsupported rather than silently scoring every candidate as 0.
+                if not _pl or all(e is None for e in _pl):
+                    context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+                    context.set_details("This backend did not return prompt_logprobs; scoring is unsupported on this engine (e.g. vllm-metal / MLX on macOS).")
                    return backend_pb2.ScoreResponse()

                prompt_logprobs = last_output.prompt_logprobs
--- a/backend/python/vllm/install.sh
+++ b/backend/python/vllm/install.sh
@@ -43,6 +43,24 @@ if [ "x${BUILD_PROFILE}" == "xcublas13" ]; then
    EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-best-match"
 fi

+# Apple Silicon (Metal/MLX) via vllm-metal.
+# vllm-metal (github.com/vllm-project/vllm-metal) brings vLLM to macOS on Apple
+# Silicon: it registers through vLLM's platform-plugin entry point
+# (metal -> vllm_metal:register), MetalPlatform activates, and the vLLM v1
+# AsyncLLM engine runs on the GPU through MLX. LocalAI's backend.py is UNCHANGED
+# on darwin — AsyncEngineArgs(...) -> AsyncLLMEngine.from_engine_args transparently
+# resolves to the MLX engine (proven on a real M4 / macOS 26.5 against Qwen3-0.6B).
+#
+# vllm-metal REQUIRES Python 3.12, so force the portable CPython before the venv
+# is created (ensureVenv reads PYTHON_VERSION/PYTHON_PATCH/PY_STANDALONE_TAG).
+# The patch + standalone tag mirror the l4t13 cp312 pin — a known-good
+# python-build-standalone release that also ships an aarch64-apple-darwin asset.
+if [ "$(uname -s)" = "Darwin" ]; then
+    PYTHON_VERSION="3.12"
+    PYTHON_PATCH="12"
+    PY_STANDALONE_TAG="20251120"
+fi
+
 # JetPack 7 / L4T arm64 vllm + torch wheels come straight from PyPI now
 # (torch 2.11+ ships aarch64 + cu130 manylinux wheels and vllm 0.20+ ships
 # an aarch64 wheel pinned to that torch). They're cp312-only, so bump the
@@ -57,11 +75,87 @@ if [ "x${BUILD_PROFILE}" == "xl4t13" ]; then
    PY_STANDALONE_TAG="20251120"
 fi

+# ===================== Apple Silicon (Metal/MLX) =====================
+# Reproduce vllm-metal's upstream installer
+# (curl -fsSL https://raw.githubusercontent.com/vllm-project/vllm-metal/main/install.sh)
+# but INTO LocalAI's managed venv (ensureVenv) instead of a throwaway
+# ~/.venv-vllm-metal, so the backend integrates with LocalAI's venv lifecycle
+# (portable CPython, _makeVenvPortable relocation, runtime activation). The
+# normal CUDA/CPU installRequirements is skipped on darwin — there is no
+# macOS/arm64 vLLM wheel on PyPI; vLLM is built from source and the MLX engine
+# is layered on by the vllm-metal wheel.
+if [ "$(uname -s)" = "Darwin" ]; then
+    # Create/activate the portable 3.12 venv. On darwin USE_PIP=true and
+    # PORTABLE_PYTHON=true (set by scripts/build/python-darwin.sh), so this is a
+    # `python -m venv` based, relocatable venv.
+    ensureVenv
+
+    # vllm-metal's installer drives everything through `uv`: building vLLM from
+    # the CPU requirements needs `--index-strategy unsafe-best-match` (mixes the
+    # pytorch CPU channel with PyPI), a flag plain pip does not have. The darwin
+    # venv is pip-based, so bootstrap uv into it. uv honours $VIRTUAL_ENV (set by
+    # libbackend's _activateVenv) and installs into THIS venv — same pattern the
+    # intel branch below relies on.
+    pip install uv
+
+    # The ONLY darwin version pin -- AUTO-BUMPED by .github/bump_vllm_metal.sh,
+    # which tracks vllm-project/vllm-metal releases (NOT vllm/vllm latest). Keep
+    # it as a plain double-quoted assignment on its own line so the bumper's sed
+    # can rewrite it. Darwin therefore follows vllm-metal and can lag the Linux
+    # vllm pin (requirements-cublas13-after.txt, bumped independently against
+    # vllm/vllm) until vllm-metal supports a newer vLLM.
+    VLLM_METAL_VERSION="v0.3.0.dev20260622062346"
+
+    # The coupled vLLM source version is whatever this vllm-metal release builds
+    # against -- it declares it in its own installer as `vllm_v=`. Derive it from
+    # the PINNED tag rather than hardcoding a second value that could drift. The
+    # tag is immutable, so this stays reproducible across rebuilds.
+    VLLM_VERSION=$(curl -fsSL "https://raw.githubusercontent.com/vllm-project/vllm-metal/${VLLM_METAL_VERSION}/install.sh" \
+        | grep -oE 'vllm_v="[0-9]+\.[0-9]+\.[0-9]+"' | head -n1 | cut -d'"' -f2)
+    if [ -z "${VLLM_VERSION}" ]; then
+        echo "ERROR: could not derive the vLLM version from vllm-metal ${VLLM_METAL_VERSION}" >&2
+        exit 1
+    fi
+    echo "vllm-metal ${VLLM_METAL_VERSION} builds against vLLM ${VLLM_VERSION}"
+
+    _vllm_src=$(mktemp -d)
+    trap 'rm -rf "${_vllm_src}"' EXIT
+    pushd "${_vllm_src}"
+        # 1) Build vLLM ${VLLM_VERSION} from the release source tarball against
+        #    the CPU requirements. vllm-metal layers its MLX platform plugin on
+        #    top of this exact build.
+        curl -fsSL -o "vllm-${VLLM_VERSION}.tar.gz" \
+            "https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}.tar.gz"
+        tar -xzf "vllm-${VLLM_VERSION}.tar.gz"
+        pushd "vllm-${VLLM_VERSION}"
+            uv pip install -r requirements/cpu.txt --index-strategy unsafe-best-match
+            # -Wno-parentheses: clang on macOS treats one of vLLM's C++ warnings
+            # as an error without it (matches the upstream installer's CXXFLAGS).
+            CXXFLAGS="-Wno-parentheses" uv pip install .
+        popd
+    popd
+
+    # 2) Install the prebuilt vllm-metal wheel for the PINNED release. It pulls
+    #    mlx / mlx-metal as deps and registers the `metal` platform plugin that
+    #    backend.py resolves to at engine-init time. Build the release-asset URL
+    #    deterministically (tag + the cp312/arm64 wheel name) rather than querying
+    #    api.github.com, whose unauthenticated rate limit (60/hr per IP) 403s on
+    #    shared CI runners. The wheel version is the tag without its leading 'v'.
+    _metal_wheel="vllm_metal-${VLLM_METAL_VERSION#v}-cp312-cp312-macosx_11_0_arm64.whl"
+    _metal_wheel_url="https://github.com/vllm-project/vllm-metal/releases/download/${VLLM_METAL_VERSION}/${_metal_wheel}"
+    echo "Installing vllm-metal wheel: ${_metal_wheel_url}"
+    uv pip install "${_metal_wheel_url}"
+
+    # Generate the gRPC stubs (backend_pb2*). installRequirements normally does
+    # this via runProtogen at the end; we skipped installRequirements on darwin,
+    # so call it explicitly here.
+    runProtogen
+
 # Intel XPU has no upstream-published vllm wheels, so we always build vllm
 # from source against torch-xpu and replace the default triton with
 # triton-xpu (matching torch 2.11). Mirrors the upstream procedure:
 # https://github.com/vllm-project/vllm/blob/main/docs/getting_started/installation/gpu.xpu.inc.md
-if [ "x${BUILD_TYPE}" == "xintel" ]; then
+elif [ "x${BUILD_TYPE}" == "xintel" ]; then
    # Hide requirements-intel-after.txt so installRequirements doesn't
    # try `pip install vllm` (would either fail or grab a non-XPU wheel).
    _intel_after="${backend_dir}/requirements-intel-after.txt"
--- a/backend/python/vllm/requirements-cublas13-after.txt
+++ b/backend/python/vllm/requirements-cublas13-after.txt
@@ -4,4 +4,7 @@
 # instead — the cublas13 case in install.sh adds --index-strategy=unsafe-best-match
 # so uv consults this index alongside PyPI.
 --extra-index-url https://wheels.vllm.ai/0.23.0/cu130
+# VERSION NOTE: darwin/Apple-Silicon builds use vllm-metal (see install.sh), which
+# derives its own vLLM version from the pinned vllm-metal release. That pin is
+# bumped independently, so the macOS/Metal build may lag the version pinned here.
 vllm==0.23.0
--- a/core/application/config_file_watcher.go
+++ b/core/application/config_file_watcher.go
@@ -215,6 +215,7 @@ func readRuntimeSettingsJson(startupAppConfig config.ApplicationConfig) fileHand
 		envBackendGalleries := slices.Equal(appConfig.BackendGalleries, startupAppConfig.BackendGalleries)
 		envAutoloadGalleries := appConfig.AutoloadGalleries == startupAppConfig.AutoloadGalleries
 		envAutoloadBackendGalleries := appConfig.AutoloadBackendGalleries == startupAppConfig.AutoloadBackendGalleries
+		envPIIDefaultDetectors := slices.Equal(appConfig.PIIDefaultDetectors, startupAppConfig.PIIDefaultDetectors)
 		envAgentJobRetentionDays := appConfig.AgentJobRetentionDays == startupAppConfig.AgentJobRetentionDays
 		envForceEvictionWhenBusy := appConfig.ForceEvictionWhenBusy == startupAppConfig.ForceEvictionWhenBusy
 		envLRUEvictionMaxRetries := appConfig.LRUEvictionMaxRetries == startupAppConfig.LRUEvictionMaxRetries
@@ -335,6 +336,15 @@ func readRuntimeSettingsJson(startupAppConfig config.ApplicationConfig) fileHand
 			if settings.AutoloadBackendGalleries != nil && !envAutoloadBackendGalleries {
 				appConfig.AutoloadBackendGalleries = *settings.AutoloadBackendGalleries
 			}
+			if settings.PIIDefaultDetectors != nil && !envPIIDefaultDetectors {
+				// Request-side default redaction reads this live via
+				// ResolvePIIPolicy, so a file edit takes effect on the next chat
+				// request. The MITM listener resolves its per-host detector map
+				// once at start, so a raw file edit reaches cloud-proxy traffic
+				// only after a restart or a POST /api/settings (which rebuilds
+				// the listener) — the admin UI uses the latter.
+				appConfig.PIIDefaultDetectors = append([]string(nil), (*settings.PIIDefaultDetectors)...)
+			}
 			if settings.AutoUpgradeBackends != nil {
 				appConfig.AutoUpgradeBackends = *settings.AutoUpgradeBackends
 			}
--- a/core/application/runtime_settings_branding_test.go
+++ b/core/application/runtime_settings_branding_test.go
@@ -109,6 +109,52 @@ var _ = Describe("loadRuntimeSettingsFromFile", func() {
 		})
 	})

+	// Instance-wide default PII detectors. Unless the env var is set, the file
+	// is the source, and the loader runs just before startMITMIfConfigured,
+	// so a regression here means the cloud-proxy MITM listener resolves an
+	// empty detector set at boot and forwards intercepted traffic unredacted —
+	// even though pii_default_detectors is on disk and the MITM model has PII
+	// enabled. It also breaks request-side default redaction the same way.
+	Describe("PII default detectors", func() {
+		It("loads pii_default_detectors from the file", func() {
+			cfg := &config.ApplicationConfig{DynamicConfigsDir: seedSettings(`{"pii_default_detectors": ["privacy-filter-nemotron", "secret-filter"]}`)}
+			loadRuntimeSettingsFromFile(cfg)
+			Expect(cfg.PIIDefaultDetectors).To(Equal([]string{"privacy-filter-nemotron", "secret-filter"}))
+		})
+
+		It("does not override an env/CLI-set value (LOCALAI_PII_DEFAULT_DETECTORS)", func() {
+			cfg := &config.ApplicationConfig{
+				DynamicConfigsDir:   seedSettings(`{"pii_default_detectors": ["from-file"]}`),
+				PIIDefaultDetectors: []string{"from-env"}, // simulate WithPIIDefaultDetectors(env)
+			}
+			loadRuntimeSettingsFromFile(cfg)
+			Expect(cfg.PIIDefaultDetectors).To(Equal([]string{"from-env"}), "env var must win over the persisted file value")
+		})
+	})
+
+	// The live file watcher applies pii_default_detectors on a runtime change
+	// the same way it handles galleries/threads/etc.: env-set values (current
+	// == startup snapshot) are left alone, otherwise the file value is applied
+	// to the live config so request-side default redaction picks it up without
+	// a restart.
+	Describe("file watcher: pii_default_detectors", func() {
+		It("applies a changed file value to the live config", func() {
+			startup := config.ApplicationConfig{} // no env baseline
+			live := &config.ApplicationConfig{PIIDefaultDetectors: []string{"old"}}
+			handler := readRuntimeSettingsJson(startup)
+			Expect(handler([]byte(`{"pii_default_detectors":["new-a","new-b"]}`), live)).To(Succeed())
+			Expect(live.PIIDefaultDetectors).To(Equal([]string{"new-a", "new-b"}))
+		})
+
+		It("leaves an env-controlled value untouched", func() {
+			startup := config.ApplicationConfig{PIIDefaultDetectors: []string{"from-env"}}
+			live := &config.ApplicationConfig{PIIDefaultDetectors: []string{"from-env"}}
+			handler := readRuntimeSettingsJson(startup)
+			Expect(handler([]byte(`{"pii_default_detectors":["from-file"]}`), live)).To(Succeed())
+			Expect(live.PIIDefaultDetectors).To(Equal([]string{"from-env"}), "env-controlled detectors must not be overwritten by the file")
+		})
+	})
+
 	// The Agent Pool block has a mix of zero and non-zero defaults
 	// (Enabled=true, EmbeddingModel="granite-...", MaxChunkingSize=400,
 	// VectorEngine="chromem", AgentHubURL="https://agenthub.localai.io").
--- a/core/application/startup.go
+++ b/core/application/startup.go
@@ -750,6 +750,20 @@ func loadRuntimeSettingsFromFile(options *config.ApplicationConfig) {
 		options.MITMListen = *settings.MITMListen
 	}

+	// Instance-wide default PII detectors. LOCALAI_PII_DEFAULT_DETECTORS (via
+	// WithPIIDefaultDetectors) wins when set; otherwise the file is the source
+	// — apply it only when the env/CLI left the value empty, mirroring the
+	// "env > file" precedence used for the other fields. This must land before
+	// startMITMIfConfigured (called right after this loader): the cloud-proxy
+	// listener resolves each intercept host's detectors once at start via
+	// ResolvePIIPolicy, and a MITM model that names no detectors of its own
+	// falls back to these defaults. Without it the listener (and request-side
+	// default redaction) starts with an empty detector set and forwards
+	// traffic unredacted even though pii_default_detectors is on disk.
+	if settings.PIIDefaultDetectors != nil && len(options.PIIDefaultDetectors) == 0 {
+		options.PIIDefaultDetectors = append([]string(nil), (*settings.PIIDefaultDetectors)...)
+	}
+
 	// Backend upgrade flags
 	if settings.AutoUpgradeBackends != nil {
 		if !options.AutoUpgradeBackends {
--- a/core/cli/run.go
+++ b/core/cli/run.go
@@ -181,6 +181,8 @@ type RunCMD struct {
 	// Cloud-proxy MITM listener (off by default).
 	MITMListen string `env:"LOCALAI_MITM_LISTEN" help:"Address (host:port) for the cloudproxy MITM listener. Empty = disabled. Clients set HTTPS_PROXY=http://<this>:<port>. Intercept hosts are declared per-model via the model YAML mitm.hosts: block; create one from the Add Model UI." group:"middleware"`
 	MITMCADir  string `env:"LOCALAI_MITM_CA_DIR" type:"path" help:"Directory holding the MITM proxy CA cert + key. Defaults to <data-path>/mitm-ca." group:"middleware"`
+
+	PIIDefaultDetectors []string `env:"LOCALAI_PII_DEFAULT_DETECTORS" help:"Instance-wide default PII/secret detector model names applied to any PII-enabled model (chiefly cloud-proxy / MITM models) that names no pii.detectors of its own. Comma-separated, e.g. privacy-filter-nemotron,secret-filter. Takes precedence over the value persisted via the Middleware UI." group:"middleware"`
 }

 func (r *RunCMD) Run(ctx *cliContext.Context) error {
@@ -243,6 +245,7 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
 		config.WithAPIAddress(r.Address),
 		config.WithMITMListen(r.MITMListen),
 		config.WithMITMCADir(r.MITMCADir),
+		config.WithPIIDefaultDetectors(r.PIIDefaultDetectors),
 		config.WithAgentJobRetentionDays(r.AgentJobRetentionDays),
 		config.WithLlamaCPPTunnelCallback(func(tunnels []string) {
 			tunnelEnvVar := strings.Join(tunnels, ",")
--- a/core/config/application_config.go
+++ b/core/config/application_config.go
@@ -712,6 +712,18 @@ func WithMITMCADir(dir string) AppOption {
 	}
 }

+// WithPIIDefaultDetectors sets the instance-wide default PII/secret detector
+// model names applied to any PII-enabled model (chiefly cloud-proxy / MITM
+// models) that names no pii.detectors of its own. CLI/env:
+// LOCALAI_PII_DEFAULT_DETECTORS. An empty list leaves the value to
+// runtime_settings.json / the Middleware UI; a non-empty value takes
+// precedence over the file (env > file).
+func WithPIIDefaultDetectors(detectors []string) AppOption {
+	return func(o *ApplicationConfig) {
+		o.PIIDefaultDetectors = detectors // stored as-is, not copied: shares the caller's backing array
+	}
+}
+
 func WithDynamicConfigDir(dynamicConfigsDir string) AppOption {
 	return func(o *ApplicationConfig) {
 		o.DynamicConfigsDir = dynamicConfigsDir
--- a/core/config/hardware_defaults.go
+++ b/core/config/hardware_defaults.go
@@ -54,8 +54,35 @@ func (g GPU) IsNVIDIABlackwell() bool {
 	return maj >= 12
 }

+// Compute-buffer headroom guard for the raised physical batch.
+//
+// Raising n_ubatch grows the CUDA *compute buffer* (the scratch for the forward
+// graph), which is allocated PER DEVICE — it does not benefit from a second GPU
+// the way weights or KV (which are split across devices) do. The buffer scales
+// ~linearly with n_ubatch * n_ctx, so a large context turns the GB10-tuned
+// ub2048 into multi-GiB of extra scratch that must fit on a SINGLE card. On a
+// 16 GiB consumer Blackwell with a 200k context that overflows (issue #10485),
+// even though the GB10 it was measured on (128 GiB unified memory) had room.
+//
+// These constants size the conservative guard in PhysicalBatchForContext: the
+// batch is only raised when the extra scratch fits the per-device VRAM budget.
+const (
+	// computeBufferBytesPerCell approximates the CUDA compute-buffer cost, in
+	// bytes, of one (n_ubatch * n_ctx) cell. Derived from an observed allocation
+	// (ub2048 * ctx204800 ~= 4.5 GiB => ~11 B/cell) and rounded up to 16 for
+	// margin, since the real cost also grows with model width (heads / embedding
+	// dim), which we don't know at config time.
+	computeBufferBytesPerCell = 16
+	// blackwellBatchHeadroomDivisor caps the extra compute buffer from raising the
+	// physical batch at VRAM/divisor. 4 (a quarter of the device) leaves the bulk
+	// of a device for weights + KV, which already dominate VRAM use.
+	blackwellBatchHeadroomDivisor = 4
+)
+
 // PhysicalBatch returns the canonical physical batch (n_batch/n_ubatch) for the
-// given hardware, used when the model config leaves batch unset.
+// given hardware class, ignoring context/VRAM headroom. Use
+// PhysicalBatchForContext when a model context and per-device VRAM are known
+// (the load paths) so the raised batch can't overflow a single device.
 func PhysicalBatch(g GPU) int {
 	if g.IsNVIDIABlackwell() {
 		return BlackwellPhysicalBatch
@@ -63,6 +90,32 @@ func PhysicalBatch(g GPU) int {
 	return DefaultPhysicalBatch
 }

+// PhysicalBatchForContext is PhysicalBatch gated on per-device VRAM headroom for
+// the given context: it only raises the batch above the conservative default
+// when the extra compute buffer (which is allocated on a single device and grows
+// with n_ubatch * n_ctx) fits within 1/blackwellBatchHeadroomDivisor of the GPU's
+// VRAM. g.VRAM must be the PER-DEVICE ceiling (the smallest device on a
+// multi-GPU host), not the summed total — the compute buffer can't be split.
+//
+// VRAM 0 (unknown) stays conservative rather than risk a per-device OOM; the
+// GB10 / unified-memory path reports system RAM, so it still clears the guard.
+func PhysicalBatchForContext(g GPU, ctx int) int {
+	if !g.IsNVIDIABlackwell() {
+		return DefaultPhysicalBatch
+	}
+	if ctx <= 0 {
+		ctx = DefaultContextSize // unset/invalid context: size the guard against the default
+	}
+	if g.VRAM == 0 {
+		return DefaultPhysicalBatch
+	}
+	extra := uint64(ctx) * uint64(BlackwellPhysicalBatch-DefaultPhysicalBatch) * computeBufferBytesPerCell
+	if extra <= g.VRAM/blackwellBatchHeadroomDivisor { // added scratch (bytes) fits the per-device budget
+		return BlackwellPhysicalBatch
+	}
+	return DefaultPhysicalBatch
+}
+
 // IsManagedPhysicalBatch reports whether n is a value PhysicalBatch assigns.
 // Callers that re-tune a value chosen by an upstream host (the distributed
 // router correcting the frontend's guess) use this to avoid clobbering an
@@ -122,7 +175,12 @@ func hasParallelOption(opts []string) bool {
 // deterministic device — detection does a live nvidia-smi call.
 var localGPU = func() GPU {
 	vendor, _ := xsysinfo.DetectGPUVendor()
-	vram, _ := xsysinfo.TotalAvailableVRAM()
+	// Use the SMALLEST device's VRAM, not the summed total: the parallel-slot
+	// tier and the batch headroom guard both reason about what fits on a single
+	// card, and per-device compute buffers can't be split across GPUs. Summing
+	// two 16 GiB cards into "32 GiB" is what over-provisioned multi-GPU hosts
+	// into OOM (issue #10485).
+	vram, _ := xsysinfo.MinPerGPUVRAM()
 	return GPU{
 		Vendor:            vendor,
 		ComputeCapability: xsysinfo.NVIDIAComputeCapability(),
@@ -137,10 +195,20 @@ func ApplyHardwareDefaults(cfg *ModelConfig, gpu GPU) {
 	if cfg == nil {
 		return
 	}
-	if cfg.Batch == 0 && gpu.IsNVIDIABlackwell() {
-		cfg.Batch = BlackwellPhysicalBatch
-		xlog.Debug("[hardware_defaults] Blackwell GPU: defaulting physical batch",
-			"batch", cfg.Batch, "compute_cap", gpu.ComputeCapability)
+	// Raise the physical batch on Blackwell only when the resulting compute
+	// buffer fits the per-device VRAM at THIS model's context. Leaving Batch at 0
+	// (rather than writing the default 512) preserves the downstream single-pass
+	// sizing in core/backend.EffectiveBatchSize for embedding/score/rerank.
+	if cfg.Batch == 0 {
+		ctx := DefaultContextSize
+		if cfg.ContextSize != nil {
+			ctx = *cfg.ContextSize
+		}
+		if PhysicalBatchForContext(gpu, ctx) == BlackwellPhysicalBatch {
+			cfg.Batch = BlackwellPhysicalBatch
+			xlog.Debug("[hardware_defaults] Blackwell GPU: defaulting physical batch",
+				"batch", cfg.Batch, "compute_cap", gpu.ComputeCapability, "context", ctx, "vram_gib", gpu.VRAM>>30)
+		}
 	}

 	// Enable concurrent serving by default on a capable GPU: without this the
--- a/core/config/hardware_defaults_internal_test.go
+++ b/core/config/hardware_defaults_internal_test.go
@@ -9,26 +9,37 @@ import (
 // GPU. The detection seam (localGPU) is injected so the path is deterministic
 // without a real GPU.
 var _ = Describe("SetDefaults hardware defaults (single-instance)", func() {
+	const gib = uint64(1) << 30
+
 	var orig func() GPU
 	BeforeEach(func() { orig = localGPU })
 	AfterEach(func() { localGPU = orig })

-	It("sets the physical batch on a local Blackwell GPU", func() {
-		localGPU = func() GPU { return GPU{ComputeCapability: "12.1"} }
+	It("sets the physical batch on a local Blackwell GPU with headroom", func() {
+		localGPU = func() GPU { return GPU{ComputeCapability: "12.1", VRAM: 119 * gib} }
 		cfg := &ModelConfig{}
 		cfg.SetDefaults()
 		Expect(cfg.Batch).To(Equal(BlackwellPhysicalBatch))
 	})

+	It("leaves batch unset when a large context would overflow the device", func() {
+		// Regression guard for issue #10485: 16 GiB consumer Blackwell + ~200k ctx.
+		localGPU = func() GPU { return GPU{ComputeCapability: "12.0", VRAM: 16 * gib} }
+		ctx := 204800
+		cfg := &ModelConfig{LLMConfig: LLMConfig{ContextSize: &ctx}}
+		cfg.SetDefaults()
+		Expect(cfg.Batch).To(Equal(0))
+	})
+
 	It("leaves batch unset on a non-Blackwell local GPU", func() {
-		localGPU = func() GPU { return GPU{ComputeCapability: "8.9"} }
+		localGPU = func() GPU { return GPU{ComputeCapability: "8.9", VRAM: 119 * gib} }
 		cfg := &ModelConfig{}
 		cfg.SetDefaults()
 		Expect(cfg.Batch).To(Equal(0))
 	})

 	It("never overrides an explicit batch", func() {
-		localGPU = func() GPU { return GPU{ComputeCapability: "12.1"} }
+		localGPU = func() GPU { return GPU{ComputeCapability: "12.1", VRAM: 119 * gib} }
 		cfg := &ModelConfig{}
 		cfg.Batch = 1024
 		cfg.SetDefaults()
--- a/core/config/hardware_defaults_test.go
+++ b/core/config/hardware_defaults_test.go
@@ -7,6 +7,8 @@ import (
 )

 var _ = Describe("Hardware-driven config defaults", func() {
+	const gib = uint64(1) << 30
+
 	DescribeTable("GPU.IsNVIDIABlackwell (sm_12x consumer family)",
 		func(cc string, want bool) {
 			Expect(GPU{ComputeCapability: cc}.IsNVIDIABlackwell()).To(Equal(want))
@@ -35,21 +37,54 @@ var _ = Describe("Hardware-driven config defaults", func() {
 		})
 	})

+	Describe("PhysicalBatchForContext (per-device VRAM headroom)", func() {
+		It("raises the batch when the compute buffer fits the device", func() {
+			// 16 GiB Blackwell with a small context: the extra scratch is tiny.
+			Expect(PhysicalBatchForContext(GPU{ComputeCapability: "12.0", VRAM: 16 * gib}, 8192)).
+				To(Equal(BlackwellPhysicalBatch))
+		})
+		It("keeps the default batch when a large context would overflow one device", func() {
+			// The issue #10485 case: 16 GiB consumer Blackwell, ~200k context.
+			Expect(PhysicalBatchForContext(GPU{ComputeCapability: "12.0", VRAM: 16 * gib}, 204800)).
+				To(Equal(DefaultPhysicalBatch))
+		})
+		It("still raises the batch on a large unified-memory device (GB10)", func() {
+			// GB10 reports system RAM (~119 GiB) as its single device's VRAM.
+			Expect(PhysicalBatchForContext(GPU{ComputeCapability: "12.1", VRAM: 119 * gib}, 204800)).
+				To(Equal(BlackwellPhysicalBatch))
+		})
+		It("stays conservative when VRAM is unknown", func() {
+			Expect(PhysicalBatchForContext(GPU{ComputeCapability: "12.1"}, 8192)).
+				To(Equal(DefaultPhysicalBatch))
+		})
+		It("never raises the batch on non-Blackwell", func() {
+			Expect(PhysicalBatchForContext(GPU{ComputeCapability: "9.0", VRAM: 80 * gib}, 8192)).
+				To(Equal(DefaultPhysicalBatch))
+		})
+	})
+
 	Describe("ApplyHardwareDefaults", func() {
-		It("raises an unset batch to 2048 on Blackwell", func() {
+		It("raises an unset batch to 2048 on Blackwell with headroom", func() {
 			cfg := &ModelConfig{}
-			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1"})
+			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1", VRAM: 119 * gib})
 			Expect(cfg.Batch).To(Equal(BlackwellPhysicalBatch))
 		})
+		It("leaves batch unset when a large context would overflow one device", func() {
+			// Regression guard for issue #10485: 16 GiB card + ~200k context.
+			ctx := 204800
+			cfg := &ModelConfig{LLMConfig: LLMConfig{ContextSize: &ctx}}
+			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.0", VRAM: 16 * gib})
+			Expect(cfg.Batch).To(Equal(0))
+		})
 		It("leaves batch unset on non-Blackwell", func() {
 			cfg := &ModelConfig{}
-			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "9.0"})
+			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "9.0", VRAM: 119 * gib})
 			Expect(cfg.Batch).To(Equal(0))
 		})
 		It("never overrides an explicit batch", func() {
 			cfg := &ModelConfig{}
 			cfg.Batch = 1024
-			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1"})
+			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1", VRAM: 119 * gib})
 			Expect(cfg.Batch).To(Equal(1024))
 		})
 		It("no-ops on nil", func() {
@@ -57,8 +92,6 @@ var _ = Describe("Hardware-driven config defaults", func() {
 		})
 	})

-	const gib = uint64(1) << 30
-
 	DescribeTable("DefaultParallelSlots (by VRAM)",
 		func(vramGiB uint64, want int) {
 			Expect(DefaultParallelSlots(GPU{VRAM: vramGiB * gib})).To(Equal(want))
--- a/core/config/model_config.go
+++ b/core/config/model_config.go
@@ -1204,11 +1204,6 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
 	// This ensures gallery-installed and runtime-loaded models get optimal parameters.
 	ApplyInferenceDefaults(cfg, cfg.Name, cfg.Model)

-	// Apply hardware-driven defaults (e.g. a larger physical batch on Blackwell).
-	// Uses the local GPU here; in distributed mode the router re-applies the same
-	// heuristics for the selected node's GPU before loading. Explicit config wins.
-	ApplyHardwareDefaults(cfg, localGPU())
-
 	// Apply serving-policy defaults (device-independent): cross-request prefix
 	// caching. Propagates to distributed nodes via the model options.
 	ApplyServingDefaults(cfg)
@@ -1247,6 +1242,16 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
 		cfg.ContextSize = &ctx
 	}
 	runBackendHooks(cfg, lo.modelPath)
+
+	// Apply hardware-driven defaults (e.g. a larger physical batch on Blackwell)
+	// LAST, after the context size is fully resolved (explicit config, LoadOptions,
+	// then the GGUF guess inside runBackendHooks): the Blackwell batch guard sizes
+	// the per-device compute buffer against this model's context, so it must see
+	// the final value, not a pre-guess nil. Uses the local GPU here; in distributed
+	// mode the router re-applies the same heuristics for the selected node's GPU
+	// before loading. Explicit config always wins.
+	ApplyHardwareDefaults(cfg, localGPU())
+
 	cfg.syncKnownUsecasesFromString()
 }

--- a/core/config/runtime_settings_persist.go
+++ b/core/config/runtime_settings_persist.go
@@ -5,6 +5,7 @@ import (
 	"errors"
 	"os"
 	"path/filepath"
+	"reflect"
 )

 // runtimeSettingsFile is the on-disk filename inside DynamicConfigsDir.
@@ -33,6 +34,35 @@ func (o *ApplicationConfig) ReadPersistedSettings() (RuntimeSettings, error) {
 	return settings, nil
 }

+// MergeNonNil overlays every set (non-nil) field of overlay onto the
+// receiver, leaving the receiver's value untouched wherever overlay left a
+// field unset. Every RuntimeSettings field is a pointer precisely so "set"
+// can be told apart from "absent" (see the type doc), which makes this a
+// faithful partial update: a caller that submits only the field it owns
+// changes exactly that field and never clobbers unrelated settings.
+//
+// This is the read-modify-write contract the persistence helpers exist for.
+// UpdateSettingsEndpoint reads the on-disk settings, merges the request body
+// on top, and writes the result — so a focused admin page that POSTs only its
+// own field (the Middleware page sends only mitm_listen; the detector table
+// only pii_default_detectors) no longer nulls every other setting.
+//
+// Reflection keeps the merge total over the struct: a field added to
+// RuntimeSettings later is merged automatically, so the persistence path can
+// never silently drop a new setting the way a hand-maintained field list
+// would. Non-pointer fields (none today) are skipped — they cannot express
+// "absent", so the receiver wins.
+func (s *RuntimeSettings) MergeNonNil(overlay RuntimeSettings) {
+	dst := reflect.ValueOf(s).Elem() // addressable view of *s, so its fields can be assigned
+	src := reflect.ValueOf(overlay)
+	for i := 0; i < src.NumField(); i++ {
+		f := src.Field(i) // NOTE(review): Set below panics on an unexported field — assumes there are none
+		if f.Kind() == reflect.Pointer && !f.IsNil() {
+			dst.Field(i).Set(f) // shallow: copies the pointer, so receiver and overlay share the pointee
+		}
+	}
+}
+
 // WritePersistedSettings serialises the given RuntimeSettings to
 // runtime_settings.json with restricted permissions (it may carry API
 // keys and P2P tokens).
--- a/core/config/runtime_settings_persist_test.go
+++ b/core/config/runtime_settings_persist_test.go
@@ -12,6 +12,7 @@ import (
 )

 func strPtr(s string) *string { return &s }
+func boolPtr(b bool) *bool     { return &b }

 var _ = Describe("RuntimeSettings persistence helpers", func() {
 	var (
@@ -51,6 +52,47 @@ var _ = Describe("RuntimeSettings persistence helpers", func() {
 		})
 	})

+	// MergeNonNil is the partial-update primitive UpdateSettingsEndpoint
+	// relies on: a focused admin page POSTs only the field it owns, and the
+	// handler reads the on-disk settings and overlays the request on top.
+	// Without it, the body would be written verbatim and every field the
+	// caller omitted would be nulled (the reported regression: changing
+	// mitm_listen wiped the galleries, api keys, watchdog config, etc.).
+	Describe("MergeNonNil partial update", func() {
+		It("overlays set fields and preserves unset ones", func() {
+			base := config.RuntimeSettings{
+				MITMListen:          strPtr(":9000"),
+				Galleries:           &[]config.Gallery{{Name: "g1", URL: "http://example/g1"}},
+				WatchdogIdleEnabled: boolPtr(true),
+				ApiKeys:             &[]string{"persisted-key"},
+				PIIDefaultDetectors: &[]string{"det-a"},
+			}
+
+			// Simulate the Middleware proxy tab: only mitm_listen is sent.
+			overlay := config.RuntimeSettings{MITMListen: strPtr(":8443")}
+			base.MergeNonNil(overlay)
+
+			Expect(base.MITMListen).ToNot(BeNil())
+			Expect(*base.MITMListen).To(Equal(":8443"), "set field should be overlaid")
+			// Everything the overlay left unset must survive untouched.
+			Expect(base.Galleries).ToNot(BeNil(), "galleries were clobbered")
+			Expect(*base.Galleries).To(HaveLen(1))
+			Expect(base.WatchdogIdleEnabled).ToNot(BeNil())
+			Expect(*base.WatchdogIdleEnabled).To(BeTrue())
+			Expect(base.ApiKeys).ToNot(BeNil(), "api_keys were clobbered")
+			Expect(*base.ApiKeys).To(Equal([]string{"persisted-key"}))
+			Expect(base.PIIDefaultDetectors).ToNot(BeNil(), "pii_default_detectors were clobbered")
+			Expect(*base.PIIDefaultDetectors).To(Equal([]string{"det-a"}))
+		})
+
+		It("lets an explicit empty slice clear a field", func() {
+			base := config.RuntimeSettings{PIIDefaultDetectors: &[]string{"det-a"}}
+			base.MergeNonNil(config.RuntimeSettings{PIIDefaultDetectors: &[]string{}})
+			Expect(base.PIIDefaultDetectors).ToNot(BeNil())
+			Expect(*base.PIIDefaultDetectors).To(BeEmpty(), "an explicit empty slice should clear, not preserve")
+		})
+	})
+
 	// MITM round trip pins the contract that loadRuntimeSettingsFromFile
 	// MITM listener address must survive a write/read round trip so the
 	// next process restart can bring the listener back up. (Intercept
--- a/core/http/endpoints/localai/agent_collections.go
+++ b/core/http/endpoints/localai/agent_collections.go
@@ -70,7 +70,7 @@ func UploadToCollectionEndpoint(app *application.Application) echo.HandlerFunc {
 	return func(c echo.Context) error {
 		svc := app.AgentPoolService()
 		userID := effectiveUserID(c)
-		name := c.Param("name")
+		name := decodedParam(c, "name")
 		file, err := c.FormFile("file")
 		if err != nil {
 			return c.JSON(http.StatusBadRequest, map[string]string{"error": "file required"})
@@ -116,7 +116,7 @@ func ListCollectionEntriesEndpoint(app *application.Application) echo.HandlerFun
 	return func(c echo.Context) error {
 		svc := app.AgentPoolService()
 		userID := effectiveUserID(c)
-		entries, err := svc.ListCollectionEntriesForUser(userID, c.Param("name"))
+		entries, err := svc.ListCollectionEntriesForUser(userID, decodedParam(c, "name"))
 		if err != nil {
 			if strings.Contains(err.Error(), "not found") {
 				return c.JSON(http.StatusNotFound, map[string]string{"error": err.Error()})
@@ -139,7 +139,7 @@ func GetCollectionEntryContentEndpoint(app *application.Application) echo.Handle
 		if err != nil {
 			entry = entryParam
 		}
-		content, chunkCount, err := svc.GetCollectionEntryContentForUser(userID, c.Param("name"), entry)
+		content, chunkCount, err := svc.GetCollectionEntryContentForUser(userID, decodedParam(c, "name"), entry)
 		if err != nil {
 			if strings.Contains(err.Error(), "not found") {
 				return c.JSON(http.StatusNotFound, map[string]string{"error": err.Error()})
@@ -164,7 +164,7 @@ func SearchCollectionEndpoint(app *application.Application) echo.HandlerFunc {
 		if err := c.Bind(&payload); err != nil {
 			return c.JSON(http.StatusBadRequest, map[string]string{"error": err.Error()})
 		}
-		results, err := svc.SearchCollectionForUser(userID, c.Param("name"), payload.Query, payload.MaxResults)
+		results, err := svc.SearchCollectionForUser(userID, decodedParam(c, "name"), payload.Query, payload.MaxResults)
 		if err != nil {
 			if strings.Contains(err.Error(), "not found") {
 				return c.JSON(http.StatusNotFound, map[string]string{"error": err.Error()})
@@ -182,7 +182,7 @@ func ResetCollectionEndpoint(app *application.Application) echo.HandlerFunc {
 	return func(c echo.Context) error {
 		svc := app.AgentPoolService()
 		userID := effectiveUserID(c)
-		if err := svc.ResetCollectionForUser(userID, c.Param("name")); err != nil {
+		if err := svc.ResetCollectionForUser(userID, decodedParam(c, "name")); err != nil {
 			if strings.Contains(err.Error(), "not found") {
 				return c.JSON(http.StatusNotFound, map[string]string{"error": err.Error()})
 			}
@@ -202,7 +202,7 @@ func DeleteCollectionEntryEndpoint(app *application.Application) echo.HandlerFun
 		if err := c.Bind(&payload); err != nil {
 			return c.JSON(http.StatusBadRequest, map[string]string{"error": err.Error()})
 		}
-		remaining, err := svc.DeleteCollectionEntryForUser(userID, c.Param("name"), payload.Entry)
+		remaining, err := svc.DeleteCollectionEntryForUser(userID, decodedParam(c, "name"), payload.Entry)
 		if err != nil {
 			if strings.Contains(err.Error(), "not found") {
 				return c.JSON(http.StatusNotFound, map[string]string{"error": err.Error()})
@@ -230,7 +230,7 @@ func AddCollectionSourceEndpoint(app *application.Application) echo.HandlerFunc
 		if payload.UpdateInterval < 1 {
 			payload.UpdateInterval = 60
 		}
-		if err := svc.AddCollectionSourceForUser(userID, c.Param("name"), payload.URL, payload.UpdateInterval); err != nil {
+		if err := svc.AddCollectionSourceForUser(userID, decodedParam(c, "name"), payload.URL, payload.UpdateInterval); err != nil {
 			if strings.Contains(err.Error(), "not found") {
 				return c.JSON(http.StatusNotFound, map[string]string{"error": err.Error()})
 			}
@@ -250,7 +250,7 @@ func RemoveCollectionSourceEndpoint(app *application.Application) echo.HandlerFu
 		if err := c.Bind(&payload); err != nil {
 			return c.JSON(http.StatusBadRequest, map[string]string{"error": err.Error()})
 		}
-		if err := svc.RemoveCollectionSourceForUser(userID, c.Param("name"), payload.URL); err != nil {
+		if err := svc.RemoveCollectionSourceForUser(userID, decodedParam(c, "name"), payload.URL); err != nil {
 			return c.JSON(http.StatusInternalServerError, map[string]string{"error": err.Error()})
 		}
 		return c.JSON(http.StatusOK, map[string]string{"status": "ok"})
@@ -267,7 +267,7 @@ func GetCollectionEntryRawFileEndpoint(app *application.Application) echo.Handle
 		if err != nil {
 			entry = entryParam
 		}
-		fpath, err := svc.GetCollectionEntryFilePathForUser(userID, c.Param("name"), entry)
+		fpath, err := svc.GetCollectionEntryFilePathForUser(userID, decodedParam(c, "name"), entry)
 		if err != nil {
 			if strings.Contains(err.Error(), "not found") {
 				return c.JSON(http.StatusNotFound, map[string]string{"error": err.Error()})
@@ -282,7 +282,7 @@ func ListCollectionSourcesEndpoint(app *application.Application) echo.HandlerFun
 	return func(c echo.Context) error {
 		svc := app.AgentPoolService()
 		userID := effectiveUserID(c)
-		sources, err := svc.ListCollectionSourcesForUser(userID, c.Param("name"))
+		sources, err := svc.ListCollectionSourcesForUser(userID, decodedParam(c, "name"))
 		if err != nil {
 			if strings.Contains(err.Error(), "not found") {
 				return c.JSON(http.StatusNotFound, map[string]string{"error": err.Error()})
--- a/core/http/endpoints/localai/agent_collections_param_test.go
+++ b/core/http/endpoints/localai/agent_collections_param_test.go
@@ -0,0 +1,49 @@
+package localai
+
+import (
+	"net/http"
+	"net/http/httptest"
+
+	"github.com/labstack/echo/v4"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// Regression for #10443: agent/collection names carry a "legacy-api-key:"
+// prefix, so the ':' is percent-encoded as %3A in the request path. Echo routes
+// such paths via URL.RawPath and stores the path-param value still escaped, so
+// handlers must URL-decode it before looking the collection up in the store -
+// otherwise the lookup sees "legacy-api-key%3ALiteraryResearch" and 404s.
+var _ = Describe("decodedParam", func() {
+	var e *echo.Echo
+
+	BeforeEach(func() {
+		e = echo.New()
+	})
+
+	// route runs a request through Echo's real router so the path param is
+	// populated exactly as it would be in production, then returns the decoded
+	// value the handler would observe.
+	route := func(rawPath string) string {
+		var got string // written by the handler closure registered below
+		e.GET("/api/agents/collections/:name/upload", func(c echo.Context) error {
+			got = decodedParam(c, "name")
+			return c.NoContent(http.StatusOK)
+		})
+		req := httptest.NewRequest(http.MethodGet, rawPath, nil)
+		rec := httptest.NewRecorder()
+		e.ServeHTTP(rec, req)
+		Expect(rec.Code).To(Equal(http.StatusOK)) // 200 is what the handler returns, so it ran
+		return got
+	}
+
+	It("decodes a percent-encoded colon in the collection name", func() {
+		got := route("/api/agents/collections/legacy-api-key%3ALiteraryResearch/upload")
+		Expect(got).To(Equal("legacy-api-key:LiteraryResearch"))
+	})
+
+	It("leaves an unencoded name untouched", func() {
+		got := route("/api/agents/collections/PlainCollection/upload")
+		Expect(got).To(Equal("PlainCollection"))
+	})
+})
--- a/core/http/endpoints/localai/agents.go
+++ b/core/http/endpoints/localai/agents.go
@@ -6,6 +6,7 @@ import (
 	"io"
 	"maps"
 	"net/http"
+	"net/url"
 	"os"
 	"path/filepath"
 	"slices"
@@ -33,6 +34,22 @@ func getUserID(c echo.Context) string {
 	return user.ID
 }

+// decodedParam returns the named path parameter, URL-decoding it.
+//
+// Echo routes a request via URL.RawPath whenever the path contains
+// percent-encoded characters (e.g. %3A for ':'), and in that case stores the
+// matched path-param value raw/escaped. Agent and collection names carry a
+// "legacy-api-key:" prefix, so the ':' arrives as %3A and the raw param no
+// longer matches the stored name. Callers must unescape before lookups.
+// Falls back to the raw value if it isn't valid percent-encoding.
+func decodedParam(c echo.Context, name string) string {
+	raw := c.Param(name) // NOTE(review): if the router already decoded this, a literal "%XX" in a name is decoded twice — confirm
+	if decoded, err := url.PathUnescape(raw); err == nil {
+		return decoded
+	}
+	return raw // malformed escape sequence: hand back the value exactly as Echo matched it
+}
+
 // isAdminUser returns true if the authenticated user has admin role.
 func isAdminUser(c echo.Context) bool {
 	user := auth.GetUser(c)
@@ -127,7 +144,7 @@ func GetAgentEndpoint(app *application.Application) echo.HandlerFunc {
 	return func(c echo.Context) error {
 		svc := app.AgentPoolService()
 		userID := effectiveUserID(c)
-		name := c.Param("name")
+		name := decodedParam(c, "name")

 		statuses := svc.ListAgentsForUser(userID)
 		active, exists := statuses[name]
@@ -142,7 +159,7 @@ func UpdateAgentEndpoint(app *application.Application) echo.HandlerFunc {
 	return func(c echo.Context) error {
 		svc := app.AgentPoolService()
 		userID := effectiveUserID(c)
-		name := c.Param("name")
+		name := decodedParam(c, "name")
 		var cfg state.AgentConfig
 		if err := c.Bind(&cfg); err != nil {
 			return c.JSON(http.StatusBadRequest, map[string]string{"error": err.Error()})
@@ -161,7 +178,7 @@ func DeleteAgentEndpoint(app *application.Application) echo.HandlerFunc {
 	return func(c echo.Context) error {
 		svc := app.AgentPoolService()
 		userID := effectiveUserID(c)
-		name := c.Param("name")
+		name := decodedParam(c, "name")
 		if err := svc.DeleteAgentForUser(userID, name); err != nil {
 			return c.JSON(http.StatusInternalServerError, map[string]string{"error": err.Error()})
 		}
@@ -173,7 +190,7 @@ func GetAgentConfigEndpoint(app *application.Application) echo.HandlerFunc {
 	return func(c echo.Context) error {
 		svc := app.AgentPoolService()
 		userID := effectiveUserID(c)
-		name := c.Param("name")
+		name := decodedParam(c, "name")
 		cfg := svc.GetAgentConfigForUser(userID, name)
 		if cfg == nil {
 			return c.JSON(http.StatusNotFound, map[string]string{"error": "Agent not found"})
@@ -186,7 +203,7 @@ func PauseAgentEndpoint(app *application.Application) echo.HandlerFunc {
 	return func(c echo.Context) error {
 		svc := app.AgentPoolService()
 		userID := effectiveUserID(c)
-		if err := svc.PauseAgentForUser(userID, c.Param("name")); err != nil {
+		if err := svc.PauseAgentForUser(userID, decodedParam(c, "name")); err != nil {
 			return c.JSON(http.StatusNotFound, map[string]string{"error": err.Error()})
 		}
 		return c.JSON(http.StatusOK, map[string]string{"status": "ok"})
@@ -197,7 +214,7 @@ func ResumeAgentEndpoint(app *application.Application) echo.HandlerFunc {
 	return func(c echo.Context) error {
 		svc := app.AgentPoolService()
 		userID := effectiveUserID(c)
-		if err := svc.ResumeAgentForUser(userID, c.Param("name")); err != nil {
+		if err := svc.ResumeAgentForUser(userID, decodedParam(c, "name")); err != nil {
 			return c.JSON(http.StatusNotFound, map[string]string{"error": err.Error()})
 		}
 		return c.JSON(http.StatusOK, map[string]string{"status": "ok"})
@@ -208,7 +225,7 @@ func GetAgentStatusEndpoint(app *application.Application) echo.HandlerFunc {
 	return func(c echo.Context) error {
 		svc := app.AgentPoolService()
 		userID := effectiveUserID(c)
-		name := c.Param("name")
+		name := decodedParam(c, "name")

 		history := svc.GetAgentStatusForUser(userID, name)
 		if history == nil {
@@ -241,7 +258,7 @@ func GetAgentObservablesEndpoint(app *application.Application) echo.HandlerFunc
 	return func(c echo.Context) error {
 		svc := app.AgentPoolService()
 		userID := effectiveUserID(c)
-		name := c.Param("name")
+		name := decodedParam(c, "name")

 		history, err := svc.GetAgentObservablesForUser(userID, name)
 		if err != nil {
@@ -261,7 +278,7 @@ func ClearAgentObservablesEndpoint(app *application.Application) echo.HandlerFun
 	return func(c echo.Context) error {
 		svc := app.AgentPoolService()
 		userID := effectiveUserID(c)
-		name := c.Param("name")
+		name := decodedParam(c, "name")
 		if err := svc.ClearAgentObservablesForUser(userID, name); err != nil {
 			return c.JSON(http.StatusNotFound, map[string]string{"error": err.Error()})
 		}
@@ -273,7 +290,7 @@ func ChatWithAgentEndpoint(app *application.Application) echo.HandlerFunc {
 	return func(c echo.Context) error {
 		svc := app.AgentPoolService()
 		userID := effectiveUserID(c)
-		name := c.Param("name")
+		name := decodedParam(c, "name")
 		var payload struct {
 			Message string `json:"message"`
 		}
@@ -302,7 +319,7 @@ func AgentSSEEndpoint(app *application.Application) echo.HandlerFunc {
 	return func(c echo.Context) error {
 		svc := app.AgentPoolService()
 		userID := effectiveUserID(c)
-		name := c.Param("name")
+		name := decodedParam(c, "name")

 		// Try local SSE manager first
 		manager := svc.GetSSEManagerForUser(userID, name)
@@ -334,7 +351,7 @@ func ExportAgentEndpoint(app *application.Application) echo.HandlerFunc {
 	return func(c echo.Context) error {
 		svc := app.AgentPoolService()
 		userID := effectiveUserID(c)
-		name := c.Param("name")
+		name := decodedParam(c, "name")
 		data, err := svc.ExportAgentForUser(userID, name)
 		if err != nil {
 			return c.JSON(http.StatusNotFound, map[string]string{"error": err.Error()})
--- a/core/http/endpoints/localai/settings.go
+++ b/core/http/endpoints/localai/settings.go
@@ -4,8 +4,6 @@ import (
 	"encoding/json"
 	"io"
 	"net/http"
-	"os"
-	"path/filepath"
 	"time"

 	"github.com/labstack/echo/v4"
@@ -110,6 +108,18 @@ func UpdateSettingsEndpoint(app *application.Application) echo.HandlerFunc {
 			})
 		}

+		// Read whatever is already persisted: it is both the source of truth
+		// for branding asset filenames (below) and the base we merge this
+		// request onto before writing. A read failure must not let a Save
+		// silently discard the existing settings — surface it instead.
+		persisted, err := appConfig.ReadPersistedSettings()
+		if err != nil {
+			return c.JSON(http.StatusInternalServerError, schema.SettingsResponse{
+				Success: false,
+				Error:   "Failed to read existing settings: " + err.Error(),
+			})
+		}
+
 		// Branding asset filenames are owned exclusively by
 		// /api/branding/asset/{kind} (upload/delete). The Settings page also
 		// round-trips them via GET /api/settings, but its local state is stale
@@ -118,11 +128,9 @@ func UpdateSettingsEndpoint(app *application.Application) echo.HandlerFunc {
 		// at page open. Replace whatever the body sent for these three fields
 		// with the values currently on disk so /api/settings can never
 		// regress them.
-		if existing, err := appConfig.ReadPersistedSettings(); err == nil {
-			settings.LogoFile = existing.LogoFile
-			settings.LogoHorizontalFile = existing.LogoHorizontalFile
-			settings.FaviconFile = existing.FaviconFile
-		}
+		settings.LogoFile = persisted.LogoFile
+		settings.LogoHorizontalFile = persisted.LogoHorizontalFile
+		settings.FaviconFile = persisted.FaviconFile

 		// The UI reads ApiKeys from GET /api/settings, which already returns the
 		// merged env+runtime list. When the user clicks Save, the same merged
@@ -145,16 +153,17 @@ func UpdateSettingsEndpoint(app *application.Application) echo.HandlerFunc {
 			settings.ApiKeys = &runtimeOnly
 		}

-		settingsFile := filepath.Join(appConfig.DynamicConfigsDir, "runtime_settings.json")
-		settingsJSON, err := json.MarshalIndent(settings, "", "  ")
-		if err != nil {
-			return c.JSON(http.StatusInternalServerError, schema.SettingsResponse{
-				Success: false,
-				Error:   "Failed to marshal settings: " + err.Error(),
-			})
-		}
-
-		if err := os.WriteFile(settingsFile, settingsJSON, 0600); err != nil {
+		// Persist as a partial update: overlay only the fields this request set
+		// onto the settings already on disk. Focused admin pages POST just the
+		// keys they own (the Middleware proxy tab sends only mitm_listen; the
+		// detector table only pii_default_detectors), so writing the request
+		// body verbatim would null every unrelated setting (the no-omitempty
+		// api_keys / pii_default_detectors fields even round-trip as JSON
+		// null). The full Settings page still round-trips every field, so its
+		// Save is unchanged.
+		toPersist := persisted
+		toPersist.MergeNonNil(settings)
+		if err := appConfig.WritePersistedSettings(toPersist); err != nil {
 			return c.JSON(http.StatusInternalServerError, schema.SettingsResponse{
 				Success: false,
 				Error:   "Failed to write settings file: " + err.Error(),
@@ -262,7 +271,14 @@ func UpdateSettingsEndpoint(app *application.Application) echo.HandlerFunc {
 			}
 		}

-		if settings.MITMListen != nil {
+		// Rebuild the MITM listener when its address OR the instance-wide
+		// default detectors change. The per-host detector map is resolved once
+		// at listener start (startMITMLocked → ResolvePIIPolicy), so a
+		// default-detector change is otherwise invisible to cloud-proxy traffic
+		// until the next restart — an admin toggling a default detector would
+		// see no redaction. RestartMITM is a no-op when the listener is
+		// disabled (empty address).
+		if settings.MITMListen != nil || settings.PIIDefaultDetectors != nil {
 			if err := app.RestartMITM(); err != nil {
 				xlog.Error("Failed to restart MITM proxy", "error", err)
 				return c.JSON(http.StatusInternalServerError, schema.SettingsResponse{
--- a/core/http/endpoints/localai/settings_test.go
+++ b/core/http/endpoints/localai/settings_test.go
@@ -52,6 +52,10 @@ var _ = Describe("Settings endpoints", func() {
 		// Settings are persisted here; set after construction since there's no
 		// dedicated AppOption for it.
 		app.ApplicationConfig().DynamicConfigsDir = tmp
+		// Contain the MITM CA inside tmp too. The partial-save spec flips
+		// mitm_listen, which starts the listener and writes a CA; without this
+		// it defaults to ./mitm-ca and litters the package source tree.
+		app.ApplicationConfig().MITMCADir = filepath.Join(tmp, "mitm-ca")

 		e = echo.New()
 		e.GET("/api/settings", GetSettingsEndpoint(app))
@@ -109,6 +113,57 @@ var _ = Describe("Settings endpoints", func() {
 		Expect(err).ToNot(HaveOccurred())
 	})

+	// Regression: a focused admin page (the Middleware proxy tab) POSTs only
+	// the one field it owns — mitm_listen. The old handler wrote the request
+	// body verbatim, so every other persisted setting was dropped (and
+	// api_keys / pii_default_detectors, which lack omitempty, were written as
+	// null). A partial POST must now merge onto what is already on disk.
+	It("preserves unrelated persisted settings when a partial POST sets only mitm_listen", func() {
+		// First save establishes a fuller settings file (as the full Settings
+		// page would): galleries, an API key, and the MITM listener. The
+		// listener restart binds a real socket, so use 127.0.0.1:0 for an
+		// ephemeral free port rather than a fixed one that may be in use.
+		rec := post(`{"mitm_listen":"127.0.0.1:0","galleries":[{"name":"g1","url":"http://example/g1"}],"api_keys":["k1"],"pii_default_detectors":["det-a"]}`)
+		Expect(rec.Code).To(Equal(http.StatusOK), rec.Body.String())
+
+		// The Middleware proxy tab then changes only the listen address — the
+		// exact partial body that nulled everything else before the fix.
+		rec = post(`{"mitm_listen":"127.0.0.1:0"}`)
+		Expect(rec.Code).To(Equal(http.StatusOK), rec.Body.String())
+
+		raw, err := os.ReadFile(filepath.Join(tmp, "runtime_settings.json"))
+		Expect(err).ToNot(HaveOccurred())
+		var ondisk config.RuntimeSettings
+		Expect(json.Unmarshal(raw, &ondisk)).To(Succeed())
+
+		Expect(ondisk.MITMListen).ToNot(BeNil())
+		Expect(*ondisk.MITMListen).To(Equal("127.0.0.1:0"), "the changed field should be saved")
+		Expect(ondisk.Galleries).ToNot(BeNil(), "galleries were clobbered by the partial save")
+		Expect(*ondisk.Galleries).To(HaveLen(1))
+		Expect(ondisk.ApiKeys).ToNot(BeNil(), "api_keys were nulled by the partial save")
+		Expect(*ondisk.ApiKeys).To(Equal([]string{"k1"}))
+		Expect(ondisk.PIIDefaultDetectors).ToNot(BeNil(), "pii_default_detectors were nulled by the partial save")
+		Expect(*ondisk.PIIDefaultDetectors).To(Equal([]string{"det-a"}))
+	})
+
+	// The MITM listener resolves its per-host PII detectors once at start
+	// (startMITMLocked → ResolvePIIPolicy), and the handler used to restart it
+	// only when mitm_listen changed. So an admin toggling a default detector
+	// (the Middleware detector table POSTs only pii_default_detectors) left
+	// cloud-proxy traffic unredacted until the next restart. A
+	// pii_default_detectors change must now rebuild the listener.
+	It("rebuilds the MITM listener when only pii_default_detectors changes", func() {
+		started := post(`{"mitm_listen":"127.0.0.1:0"}`)
+		Expect(started.Code).To(Equal(http.StatusOK), started.Body.String())
+		before := app.MITMServer()
+		Expect(before).ToNot(BeNil(), "listener should be running after mitm_listen is set")
+
+		toggled := post(`{"pii_default_detectors":["det-a"]}`)
+		Expect(toggled.Code).To(Equal(http.StatusOK), toggled.Body.String())
+		after := app.MITMServer()
+		Expect(after).ToNot(BeIdenticalTo(before), "a default-detector change must restart the listener so it picks up the new detectors")
+	})
+
 	// Residual #9125: enabling the watchdog from a cold (off) state via the
 	// React master toggle must start the live watchdog immediately, without a
 	// restart. The toggle posts watchdog_idle_enabled/busy_enabled=true while
--- a/core/http/endpoints/openai/realtime_model.go
+++ b/core/http/endpoints/openai/realtime_model.go
@@ -432,7 +432,7 @@ func loadSoundDetectionConfig(pipeline *config.Pipeline, cl *config.ModelConfigL
 	if pipeline.SoundDetection == "" {
 		return nil, nil
 	}
-	cfg, err := cl.LoadModelConfigFileByName(pipeline.SoundDetection, ml.ModelPath)
+	cfg, err := loadPipelineSubModel(cl, pipeline.SoundDetection, ml.ModelPath)
 	if err != nil {
 		return nil, fmt.Errorf("failed to load sound detection config: %w", err)
 	}
@@ -443,7 +443,7 @@ func loadSoundDetectionConfig(pipeline *config.Pipeline, cl *config.ModelConfigL
 }

 func newTranscriptionOnlyModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) (Model, *config.ModelConfig, error) {
-	cfgVAD, err := cl.LoadModelConfigFileByName(pipeline.VAD, ml.ModelPath)
+	cfgVAD, err := loadPipelineSubModel(cl, pipeline.VAD, ml.ModelPath)
 	if err != nil {

 		return nil, nil, fmt.Errorf("failed to load backend config: %w", err)
@@ -453,7 +453,7 @@ func newTranscriptionOnlyModel(pipeline *config.Pipeline, cl *config.ModelConfig
 		return nil, nil, fmt.Errorf("failed to validate config: %w", err)
 	}

-	cfgSST, err := cl.LoadModelConfigFileByName(pipeline.Transcription, ml.ModelPath)
+	cfgSST, err := loadPipelineSubModel(cl, pipeline.Transcription, ml.ModelPath)
 	if err != nil {

 		return nil, nil, fmt.Errorf("failed to load backend config: %w", err)
@@ -542,11 +542,30 @@ func buildRealtimeRoutingContext(a *application.Application, sessionID string) *
 	}
 }

+// loadPipelineSubModel fetches the named pipeline sub-model config and then
+// resolves it through the loader's alias lookup. A pipeline entry that names
+// an alias (e.g. `llm: default`) therefore yields the alias target's config
+// (Backend, Model, ...) instead of the alias stub, whose Backend is empty and
+// would otherwise fail later in model loading — in distributed mode with
+// "backend name is empty". The top-level request path does the same alias
+// resolution in core/http/middleware/request.go.
+func loadPipelineSubModel(cl *config.ModelConfigLoader, name, modelPath string) (*config.ModelConfig, error) {
+	stub, loadErr := cl.LoadModelConfigFileByName(name, modelPath)
+	if loadErr != nil {
+		return nil, loadErr
+	}
+	target, _, aliasErr := cl.ResolveAlias(stub)
+	if aliasErr != nil {
+		return nil, aliasErr
+	}
+	return target, nil
+}
+
 // returns and loads either a wrapped model or a model that support audio-to-audio
 func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig, evaluator *templates.Evaluator, routing *RealtimeRoutingContext) (Model, error) {
 	xlog.Debug("Creating new model pipeline model", "pipeline", pipeline)

-	cfgVAD, err := cl.LoadModelConfigFileByName(pipeline.VAD, ml.ModelPath)
+	cfgVAD, err := loadPipelineSubModel(cl, pipeline.VAD, ml.ModelPath)
 	if err != nil {

 		return nil, fmt.Errorf("failed to load backend config: %w", err)
@@ -557,7 +576,7 @@ func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model
 	}

 	// TODO: Do we always need a transcription model? It can be disabled. Note that any-to-any instruction following models don't transcribe as such, so if transcription is required it is a separate process
-	cfgSST, err := cl.LoadModelConfigFileByName(pipeline.Transcription, ml.ModelPath)
+	cfgSST, err := loadPipelineSubModel(cl, pipeline.Transcription, ml.ModelPath)
 	if err != nil {

 		return nil, fmt.Errorf("failed to load backend config: %w", err)
@@ -589,7 +608,7 @@ func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model
 	xlog.Debug("Loading a wrapped model")

 	// Otherwise we want to return a wrapped model, which is a "virtual" model that re-uses other models to perform operations
-	cfgLLM, err := cl.LoadModelConfigFileByName(pipeline.LLM, ml.ModelPath)
+	cfgLLM, err := loadPipelineSubModel(cl, pipeline.LLM, ml.ModelPath)
 	if err != nil {

 		return nil, fmt.Errorf("failed to load backend config: %w", err)
@@ -604,7 +623,7 @@ func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model
 	applyPipelineReasoning(cfgLLM, *pipeline)
 	applyPipelineThinking(cfgLLM, *pipeline)

-	cfgTTS, err := cl.LoadModelConfigFileByName(pipeline.TTS, ml.ModelPath)
+	cfgTTS, err := loadPipelineSubModel(cl, pipeline.TTS, ml.ModelPath)
 	if err != nil {

 		return nil, fmt.Errorf("failed to load backend config: %w", err)
--- a/core/http/endpoints/openai/realtime_model_alias_test.go
+++ b/core/http/endpoints/openai/realtime_model_alias_test.go
@@ -0,0 +1,52 @@
+package openai
+
+import (
+	"os"
+	"path/filepath"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	"github.com/mudler/LocalAI/core/config"
+)
+
+// loadPipelineSubModel must resolve a pipeline sub-model that references an
+// alias (e.g. `llm: default`) one hop to the alias target's full config — so
+// the effective backend is the target's backend, not the empty backend of the
+// alias stub. This mirrors the top-level alias resolution done in
+// core/http/middleware/request.go, which the realtime pipeline previously
+// skipped (failing in distributed mode with "backend name is empty").
+var _ = Describe("loadPipelineSubModel", func() {
+	It("resolves a sub-model alias one hop to the target's config", func() {
+		modelsDir := GinkgoT().TempDir()
+
+		// The alias target: an ordinary model config that names its backend.
+		targetYAML := `name: real-llm
+backend: llama-cpp
+parameters:
+  model: real-llm.gguf
+`
+		Expect(os.WriteFile(filepath.Join(modelsDir, "real-llm.yaml"), []byte(targetYAML), 0644)).To(Succeed())
+
+		// The alias stub: only a name and the model it points at.
+		aliasYAML := `name: default
+alias: real-llm
+`
+		Expect(os.WriteFile(filepath.Join(modelsDir, "default.yaml"), []byte(aliasYAML), 0644)).To(Succeed())
+
+		loader := config.NewModelConfigLoader(modelsDir)
+		Expect(loader.LoadModelConfigsFromPath(modelsDir)).To(Succeed())
+
+		// Loading by the alias name has to hand back the target's config.
+		viaAlias, aliasErr := loadPipelineSubModel(loader, "default", modelsDir)
+		Expect(aliasErr).NotTo(HaveOccurred())
+		Expect(viaAlias.IsAlias()).To(BeFalse())
+		Expect(viaAlias.Backend).To(Equal("llama-cpp"))
+
+		// Loading by the real name has to return that same config as-is.
+		byName, nameErr := loadPipelineSubModel(loader, "real-llm", modelsDir)
+		Expect(nameErr).NotTo(HaveOccurred())
+		Expect(byName.Backend).To(Equal("llama-cpp"))
+		Expect(byName.Name).To(Equal("real-llm"))
+	})
+})
--- a/core/http/react-ui/e2e/role-mode-adaptive.spec.js
+++ b/core/http/react-ui/e2e/role-mode-adaptive.spec.js
@@ -1,100 +0,0 @@
-import { test, expect } from './coverage-fixtures.js'
-
-// These specs stub /api/features and /api/auth/status per cell. The test server
-// disables auth (isAdmin=true) and reports its own features, so we intercept
-// before navigation to simulate each role x mode cell.
-
-function stubFeatures(page, features) {
-  return page.route('**/api/features', route =>
-    route.fulfill({ contentType: 'application/json', body: JSON.stringify(features) }))
-}
-
-function stubNoP2P(page) {
-  // P2P token endpoint returns empty -> p2pEnabled=false.
-  return page.route('**/api/p2p/token', route =>
-    route.fulfill({ contentType: 'text/plain', body: '' }))
-}
-
-test.describe('Adaptive landing (HomeRoute)', () => {
-  test('admin + distributed redirects /app to Nodes', async ({ page }) => {
-    await stubFeatures(page, { distributed: true })
-    await stubNoP2P(page)
-    await page.goto('/app')
-    await expect(page).toHaveURL(/\/app\/nodes$/)
-    await expect(page.locator('.page-title').first()).toBeVisible({ timeout: 15_000 })
-  })
-
-  test('admin + single-node stays on Home', async ({ page }) => {
-    await stubFeatures(page, { distributed: false })
-    await stubNoP2P(page)
-    await page.goto('/app')
-    await expect(page).toHaveURL(/\/app$/)
-    await expect(page.locator('.home-greeting')).toBeVisible({ timeout: 15_000 })
-  })
-})
-
-test.describe('Adaptive sidebar', () => {
-  test('distributed pins the Cluster group with Nodes at the top', async ({ page }) => {
-    await stubFeatures(page, { distributed: true })
-    await stubNoP2P(page)
-    await page.goto('/app/chat') // any in-app page so the sidebar is mounted
-    const pinned = page.locator('.sidebar-nav .sidebar-section-items').first()
-    await expect(pinned.getByText('Nodes', { exact: false })).toBeVisible({ timeout: 15_000 })
-  })
-
-  test('single-node does not pin a Cluster group', async ({ page }) => {
-    await stubFeatures(page, { distributed: false })
-    await stubNoP2P(page)
-    await page.goto('/app/chat')
-    // Nodes is reachable only via the Operate rail, not pinned at the top.
-    await expect(page.locator('.sidebar-nav')).toBeVisible({ timeout: 15_000 })
-    await expect(page.locator('.sidebar-nav .sidebar-section-items').first()
-      .getByText('Nodes', { exact: false })).toHaveCount(0)
-  })
-})
-
-test.describe('Top navbar', () => {
-  test('admin sees the mode pill and settings cog', async ({ page }) => {
-    await stubFeatures(page, { distributed: true })
-    await stubNoP2P(page)
-    await page.goto('/app/chat')
-    await expect(page.locator('.top-navbar__mode')).toBeVisible({ timeout: 15_000 })
-    await expect(page.locator('.top-navbar__icon[aria-label]')).not.toHaveCount(0)
-  })
-
-  test('admin-via-chat jump shows when localai_assistant is enabled', async ({ page }) => {
-    await stubFeatures(page, { distributed: false, localai_assistant: true })
-    await stubNoP2P(page)
-    await page.goto('/app/chat')
-    await expect(page.locator('.top-navbar__assistant')).toBeVisible({ timeout: 15_000 })
-  })
-
-  test('admin-via-chat jump hidden when localai_assistant is off', async ({ page }) => {
-    await stubFeatures(page, { distributed: false, localai_assistant: false })
-    await stubNoP2P(page)
-    await page.goto('/app/chat')
-    await expect(page.locator('.top-navbar__assistant')).toHaveCount(0)
-  })
-})
-
-test.describe('Token usage meter', () => {
-  test('renders when admin usage has data', async ({ page }) => {
-    await stubFeatures(page, { distributed: false })
-    await stubNoP2P(page)
-    await page.route('**/api/auth/admin/usage**', route =>
-      route.fulfill({ contentType: 'application/json',
-        body: JSON.stringify({ buckets: [{ total_tokens: 1234 }] }) }))
-    await page.goto('/app/chat')
-    await expect(page.locator('.top-navbar__meter')).toBeVisible({ timeout: 15_000 })
-  })
-
-  test('hidden when admin usage is empty (graceful degrade)', async ({ page }) => {
-    await stubFeatures(page, { distributed: false })
-    await stubNoP2P(page)
-    await page.route('**/api/auth/admin/usage**', route =>
-      route.fulfill({ contentType: 'application/json', body: JSON.stringify({ buckets: [] }) }))
-    await page.goto('/app/chat')
-    await expect(page.locator('.top-navbar')).toBeVisible({ timeout: 15_000 })
-    await expect(page.locator('.top-navbar__meter')).toHaveCount(0)
-  })
-})
--- a/core/http/react-ui/public/locales/en/chat.json
+++ b/core/http/react-ui/public/locales/en/chat.json
@@ -86,6 +86,7 @@
  "input": {
    "placeholder": "Message...",
    "attachFile": "Attach file",
+    "send": "Send message",
    "stopGenerating": "Stop generating",
    "canvasTitle": "Canvas — extract code blocks and media into a side panel for preview, copy, and download",
    "canvasLabel": "Canvas",
--- a/core/http/react-ui/public/locales/en/home.json
+++ b/core/http/react-ui/public/locales/en/home.json
@@ -77,6 +77,20 @@
    "noModelsTitle": "No Models Available",
    "noModelsBody": "There are no models installed yet. Ask your administrator to set up models so you can start chatting."
  },
+  "starters": {
+    "title": "Recommended for your hardware",
+    "tier": {
+      "cpu": "CPU-only",
+      "gpu-small": "GPU",
+      "gpu-large": "GPU"
+    },
+    "cpuNote": "No GPU detected — these small models stay responsive on CPU.",
+    "gpuNote": "Picked to fit your available VRAM with room for context.",
+    "install": "Install",
+    "installing": "Installing",
+    "installStarted": "Installing {{model}}…",
+    "installFailed": "Install failed: {{message}}"
+  },
  "connect": {
    "title": "One endpoint, every API",
    "subtitle": "LocalAI serves its own full API — image & video generation, depth, object detection, reranking, audio, face & voice recognition, and realtime voice over WebRTC and WebSocket. On top of that, a drop-in compatibility layer lets any app built for OpenAI, Anthropic, Ollama or OpenAI Responses talk to it unchanged.",
--- a/core/http/react-ui/public/locales/en/nav.json
+++ b/core/http/react-ui/public/locales/en/nav.json
@@ -12,16 +12,6 @@
  "accountSettings": "Account settings",
  "account": "Account",
  "accountFor": "Account: {{name}}",
-  "topbar": {
-    "label": "Top bar",
-    "modeDistributed": "Distributed",
-    "modeSwarm": "Swarm",
-    "modeSingle": "Single-node",
-    "pickModel": "Models",
-    "adminViaChat": "Admin via chat",
-    "tokensToday": "Tokens today",
-    "usageDetail": "View usage detail"
-  },
  "sections": {
    "create": "Create",
    "recognition": "Recognition",
--- a/core/http/react-ui/public/locales/id/admin.json
+++ b/core/http/react-ui/public/locales/id/admin.json
@@ -45,7 +45,7 @@
  },
  "scheduling": {
    "title": "Penjadwalan",
-    "subtitle": "Aturan penempatan model dan replika di seluruh klaster"
+    "subtitle": "Aturan penempatan model dan replika di seluruh kluster"
  },
  "p2p": {
    "title": "Komputasi AI Terdistribusi",
@@ -86,4 +86,4 @@
    "title": "Penjelajah",
    "subtitle": "Jelajahi file dan konfigurasi"
  }
-}
+}
--- a/core/http/react-ui/public/locales/id/chat.json
+++ b/core/http/react-ui/public/locales/id/chat.json
@@ -72,7 +72,7 @@
  "actions": {
    "copy": "Salin",
    "regenerate": "Hasilkan ulang",
-    "jumpToLatest": "Jump to latest"
+    "jumpToLatest": "Lompat ke terbaru"
  },
  "streaming": {
    "transferring": "Mentransfer model...",
@@ -115,4 +115,4 @@
    "clearAll": "Hapus semua",
    "deleteAllTitle": "Hapus semua percakapan"
  }
-}
+}
--- a/core/http/react-ui/public/locales/id/common.json
+++ b/core/http/react-ui/public/locales/id/common.json
@@ -1,8 +1,8 @@
 {
  "unsaved": {
-    "title": "Discard unsaved changes?",
-    "message": "You have unsaved changes that will be lost if you leave this page.",
-    "leave": "Leave"
+    "title": "Buang perubahan yang belum disimpan?",
+    "message": "Anda memiliki perubahan yang belum disimpan. Perubahan tersebut akan hilang jika Anda meninggalkan halaman ini.",
+    "leave": "Tinggalkan Halaman"
  },
  "actions": {
    "save": "Simpan",
--- a/core/http/react-ui/public/locales/id/home.json
+++ b/core/http/react-ui/public/locales/id/home.json
@@ -7,15 +7,15 @@
  "resourceGpu": "GPU",
  "resourceRam": "RAM",
  "greeting": {
-    "morning": "Good morning",
-    "afternoon": "Good afternoon",
-    "evening": "Good evening",
-    "night": "Working late"
+    "morning": "Selamat pagi",
+    "afternoon": "Selamat siang",
+    "evening": "Selamat malam",
+    "night": "Selamat lembur"
  },
  "statusLine": {
-    "modelsLoaded_one": "{{count}} model loaded",
-    "modelsLoaded_other": "{{count}} models loaded",
-    "noModelsLoaded": "No models loaded",
+    "modelsLoaded_one": "{{count}} model dimuat",
+    "modelsLoaded_other": "{{count}} model dimuat",
+    "noModelsLoaded": "Tidak ada model yang dimuat",
    "nodes_one": "{{count}} node",
    "nodes_other": "{{count}} nodes"
  },
@@ -79,14 +79,14 @@
  },
  "connect": {
    "title": "Satu endpoint, semua API",
-    "subtitle": "LocalAI menyediakan API miliknya sendiri yang lengkap — pembuatan gambar & video, depth, deteksi objek, reranking, audio, pengenalan wajah & suara, serta suara realtime melalui WebRTC dan WebSocket. Di atas itu, lapisan kompatibilitas drop-in membuat aplikasi apa pun yang dibuat untuk OpenAI, Anthropic, Ollama, atau OpenAI Responses bekerja tanpa perubahan.",
+    "subtitle": "LocalAI menyediakan API miliknya sendiri yang lengkap — pembuatan gambar & video, depth, deteksi objek, reranking, audio, pengenalan wajah & suara, serta suara realtime melalui WebRTC dan WebSocket. Selain itu, lapisan kompatibilitas drop-in membuat aplikasi apa pun yang dibuat untuk OpenAI, Anthropic, Ollama, atau OpenAI Responses bekerja tanpa perubahan.",
    "nativeTitle": "API native",
    "compatTitle": "Kompatibilitas drop-in",
    "apiReference": "Referensi API lengkap",
    "copy": "Salin",
    "copied": "Disalin",
-    "browse": "Browse the API",
-    "hide": "Hide endpoints",
-    "dismiss": "Dismiss"
+    "browse": "Jelajahi API",
+    "hide": "Sembunyikan endpoint",
+    "dismiss": "Abaikan"
  }
 }
--- a/core/http/react-ui/public/locales/id/media.json
+++ b/core/http/react-ui/public/locales/id/media.json
@@ -5,7 +5,7 @@
      "video": "Video",
      "tts": "TTS",
      "sound": "Suara",
-      "transform": "Transform"
+      "transform": "Transformasi"
    }
  },
  "image": {
@@ -30,7 +30,7 @@
      "refImagesAdded_other": "{{count}} gambar ditambahkan"
    },
    "actions": {
-      "view": "View",
+      "view": "Lihat",
      "generate": "Hasilkan",
      "generating": "Menghasilkan..."
    },
@@ -153,4 +153,4 @@
    "clearConfirm": "Hapus",
    "cleared": "Riwayat dihapus"
  }
-}
+}
--- a/core/http/react-ui/public/locales/id/nav.json
+++ b/core/http/react-ui/public/locales/id/nav.json
@@ -19,11 +19,11 @@
    "operate": "Operasikan"
  },
  "operate": {
-    "inference": "Inference",
-    "cluster": "Cluster",
-    "observability": "Observability",
-    "access": "Access",
-    "system": "System"
+    "inference": "Inferensi",
+    "cluster": "Kluster",
+    "observability": "Observabilitas",
+    "access": "Akses",
+    "system": "Sistem"
  },
  "items": {
    "home": "Beranda",
@@ -64,7 +64,7 @@
    "copyright": "© 2023-{{year}} {{author}}"
  },
  "console": {
-    "automation": "Otomasi",
+    "automation": "Automasi",
    "training": "Pelatihan"
  }
 }
--- a/core/http/react-ui/src/App.css
+++ b/core/http/react-ui/src/App.css
@@ -184,50 +184,6 @@
  font-size: 1.5rem;
 }

-/* Desktop top bar: deployment + admin affordances on wide screens. Hidden on
-   mobile, where .mobile-header carries the equivalent actions. */
-.top-navbar {
-  display: flex;
-  align-items: center;
-  justify-content: space-between;
-  gap: var(--spacing-md);
-  padding: var(--spacing-sm) var(--spacing-lg);
-  border-bottom: 1px solid var(--color-border-default);
-  background: var(--color-bg-secondary);
-}
-.top-navbar__right { display: flex; align-items: center; gap: var(--spacing-sm); }
-.top-navbar__mode {
-  font-size: 0.75rem;
-  padding: 2px 10px;
-  border-radius: 999px;
-  border: 1px solid var(--color-border-default);
-  color: var(--color-text-secondary);
-}
-.top-navbar__mode.is-active { color: var(--color-success); border-color: var(--color-success); }
-.top-navbar__btn {
-  display: inline-flex; align-items: center; gap: 6px;
-  font-size: 0.8125rem; padding: 5px 10px; border-radius: 8px;
-  border: 1px solid var(--color-border-default); background: var(--color-bg-tertiary);
-  color: var(--color-text-primary); cursor: pointer;
-}
-.top-navbar__icon {
-  width: 32px; height: 32px; display: inline-flex; align-items: center;
-  justify-content: center; border-radius: 8px; border: 1px solid var(--color-border-default);
-  background: var(--color-bg-tertiary); color: var(--color-text-secondary); cursor: pointer;
-}
-.top-navbar__avatar img { width: 100%; height: 100%; border-radius: 50%; object-fit: cover; }
-.top-navbar__meter {
-  display: inline-flex; flex-direction: column; gap: 3px; align-items: flex-start;
-  padding: 4px 10px; border-radius: 8px; border: 1px solid var(--color-border-default);
-  background: var(--color-bg-tertiary); cursor: pointer; min-width: 150px;
-}
-.top-navbar__meter-label { font-size: 0.6875rem; color: var(--color-text-secondary); }
-.top-navbar__meter-bar { width: 100%; height: 5px; border-radius: 3px; background: var(--color-bg-secondary); overflow: hidden; }
-.top-navbar__meter-bar i { display: block; height: 100%; background: var(--color-primary); }
-@media (max-width: 639px) {
-  .top-navbar { display: none; }
-}
-
 /* Sidebar */
 .sidebar {
  position: fixed;
@@ -6407,6 +6363,59 @@ select.input {
  justify-content: center;
 }

+/* ──────────────────── Home: hardware-aware starter models ──────────────────── */
+
+.home-starters {
+  margin: var(--spacing-lg) 0;
+  padding: var(--spacing-lg);
+}
+.home-starters-head {
+  display: flex;
+  align-items: center;
+  justify-content: space-between;
+  gap: var(--spacing-md);
+}
+.home-starters-head strong {
+  font-size: 0.9375rem;
+}
+.home-starters-tier {
+  display: inline-flex;
+  align-items: center;
+  gap: var(--spacing-xs);
+  font-size: 0.75rem;
+  color: var(--color-text-muted);
+}
+.home-starters-sub {
+  margin: var(--spacing-xs) 0 var(--spacing-md);
+  font-size: 0.8125rem;
+  color: var(--color-text-secondary);
+}
+.home-starters-list {
+  list-style: none;
+  margin: 0;
+  padding: 0;
+  display: flex;
+  flex-direction: column;
+  gap: var(--spacing-xs);
+}
+.home-starters-item {
+  display: flex;
+  align-items: center;
+  gap: var(--spacing-md);
+  padding: var(--spacing-xs) 0;
+}
+.home-starters-name {
+  font-weight: 500;
+  font-size: 0.875rem;
+  word-break: break-all;
+}
+.home-starters-size {
+  margin-left: auto;
+  font-size: 0.75rem;
+  color: var(--color-text-muted);
+  white-space: nowrap;
+}
+
 /* ──────────────────── Home: drop-in endpoint / API compatibility ──────────────────── */

 .home-connect {
--- a/core/http/react-ui/src/App.jsx
+++ b/core/http/react-ui/src/App.jsx
@@ -3,7 +3,6 @@ import { Outlet, useLocation, useNavigate } from 'react-router-dom'
 import { useTranslation } from 'react-i18next'
 import Sidebar from './components/Sidebar'
 import OperationsBar from './components/OperationsBar'
-import TopNavbar from './components/TopNavbar'
 import { ToastContainer, useToast } from './components/Toast'
 import { systemApi } from './utils/api'
 import { useTheme } from './contexts/ThemeContext'
@@ -99,7 +98,6 @@ export default function App() {
      <Sidebar isOpen={sidebarOpen} onClose={() => setSidebarOpen(false)} />
      <main className="main-content" {...(sidebarOpen ? { 'aria-hidden': 'true', inert: '' } : {})}>
        <OperationsBar />
-        <TopNavbar />
        {/* Mobile header — primary actions reachable without opening the
            drawer. Hamburger is the only way to expand the nav on phones;
            theme toggle and account avatar are mirrored from the sidebar
--- a/core/http/react-ui/src/components/HomeRoute.jsx
+++ b/core/http/react-ui/src/components/HomeRoute.jsx
@@ -1,28 +0,0 @@
-import { lazy, Suspense } from 'react'
-import { Navigate } from 'react-router-dom'
-import { useAuth } from '../context/AuthContext'
-import { useDeployment } from '../contexts/DeploymentContext'
-import { resolveHome } from '../utils/resolveHome'
-import RouteFallback from './RouteFallback'
-
-const Home = lazy(() => import('../pages/Home'))
-
-// Index-route element. Waits for auth + deployment signals to load (so we never
-// flash the wrong landing), then either renders Home or redirects to the cell's
-// landing page. Redirecting (rather than rendering Nodes/Chat inline at /app)
-// keeps each target's own route guard, active-nav state, and deep-linkability.
-export default function HomeRoute() {
-  const { isAdmin, loading: authLoading } = useAuth()
-  const { distributed, p2pEnabled, loading: deployLoading } = useDeployment()
-
-  if (authLoading || deployLoading) return <RouteFallback />
-
-  const target = resolveHome({ isAdmin, distributed, p2pEnabled })
-  if (target) return <Navigate to={target} replace />
-
-  return (
-    <Suspense fallback={<RouteFallback />}>
-      <Home />
-    </Suspense>
-  )
-}
--- a/core/http/react-ui/src/components/ModelSelector.jsx
+++ b/core/http/react-ui/src/components/ModelSelector.jsx
@@ -1,8 +1,25 @@
-import { useEffect, useMemo } from 'react'
+import { useEffect, useMemo, useCallback } from 'react'
 import { useModels } from '../hooks/useModels'
 import SearchableSelect from './SearchableSelect'
 import { useTranslation } from 'react-i18next'

+// Remember the last model the user picked, keyed by capability, so returning to
+// a page (Home chat box, Image, TTS, Talk...) defaults to that model instead of
+// whatever happens to sort first. Only persisted when a capability key exists —
+// `externalOptions` callers pass no capability and get the old first-item
+// behaviour. localStorage access is wrapped because private-browsing modes throw.
+const LAST_MODEL_PREFIX = 'localai_last_model:'
+
+function readLastModel(capability) {
+  if (!capability) return null
+  try { return localStorage.getItem(LAST_MODEL_PREFIX + capability) } catch { return null }
+}
+
+function writeLastModel(capability, model) {
+  if (!capability || !model) return
+  try { localStorage.setItem(LAST_MODEL_PREFIX + capability, model) } catch { /* ignore */ }
+}
+
 export default function ModelSelector({
  value, onChange, capability, className = '',
  options: externalOptions, loading: externalLoading,
@@ -19,16 +36,27 @@ export default function ModelSelector({
  const isLoading = externalOptions ? (externalLoading || false) : hookLoading
  const isDisabled = isLoading || (externalDisabled || false)

+  // Persist genuine selections so the next visit can restore them.
+  const handleChange = useCallback((next) => {
+    writeLastModel(capability, next)
+    onChange(next)
+  }, [capability, onChange])
+
  useEffect(() => {
    if (modelNames.length > 0 && (!value || !modelNames.includes(value))) {
-      onChange(modelNames[0])
+      // Prefer the remembered model when it's still available; otherwise fall
+      // back to the first option. Don't re-persist here — auto-select is not a
+      // user choice, and writing back the stored value would be a harmless but
+      // pointless round-trip.
+      const remembered = readLastModel(capability)
+      onChange(remembered && modelNames.includes(remembered) ? remembered : modelNames[0])
    }
-  }, [modelNames, value, onChange])
+  }, [modelNames, value, onChange, capability])

  return (
    <SearchableSelect
      value={value || ''}
-      onChange={onChange}
+      onChange={handleChange}
      options={modelNames}
      placeholder={isLoading ? t('selector.loading') : (modelNames.length === 0 ? t('selector.noModels') : t('selector.selectModel'))}
      searchPlaceholder={searchPlaceholder || t('selector.searchPlaceholder')}
--- a/core/http/react-ui/src/components/Sidebar.jsx
+++ b/core/http/react-ui/src/components/Sidebar.jsx
@@ -5,11 +5,9 @@ import ThemeToggle from './ThemeToggle'
 import LanguageSwitcher from './LanguageSwitcher'
 import { useAuth } from '../context/AuthContext'
 import { useBranding } from '../contexts/BrandingContext'
-import { useDeployment } from '../contexts/DeploymentContext'
 import { apiUrl } from '../utils/basePath'
 import { preloadRoute } from '../router'
 import { consoles, firstVisiblePath, consolePaths } from './console/consoleConfig'
-import { clusterPinItems, shouldCollapseCreate } from '../utils/sidebarPolicy'

 const COLLAPSED_KEY = 'localai_sidebar_collapsed'
 const SECTIONS_KEY = 'localai_sidebar_sections'
@@ -60,13 +58,11 @@ function NavItem({ item, onClose, collapsed }) {
  )
 }

-function loadSectionState(collapseCreate = false) {
-  // Tiers render expanded by default; users can collapse any tier and the
-  // choice persists (stored values override defaults). In cluster cells we
-  // start Create collapsed so the pinned cluster group leads - but only when
-  // the user has not already expressed a preference.
+function loadSectionState() {
+  // Tiers render expanded by default (the redesign favours showing the few
+  // intent groups up front); users can still collapse any tier and the choice
+  // is persisted. Stored values override the defaults so a saved collapse wins.
  const defaults = Object.fromEntries(sections.map(s => [s.id, true]))
-  if (collapseCreate) defaults.create = false
  try {
    const stored = localStorage.getItem(SECTIONS_KEY)
    return stored ? { ...defaults, ...JSON.parse(stored) } : defaults
@@ -81,34 +77,20 @@ function saveSectionState(state) {

 export default function Sidebar({ isOpen, onClose }) {
  const { t } = useTranslation('nav')
-  const { isAdmin, authEnabled, user, logout, hasFeature } = useAuth()
-  // Deployment shape (server features + p2p) drives the adaptive sidebar; the
-  // shared context replaces the sidebar's own /api/features fetch so the
-  // landing resolver, navbar, and this policy agree on one snapshot.
-  const deployment = useDeployment()
-  const features = deployment.features
-  // Shared shape for the console gating helpers (consoleConfig.js); in scope for
-  // both the pinned cluster group and the console-tier rendering below.
-  const auth = { isAdmin, authEnabled, hasFeature, features }
-  const collapseCreate = shouldCollapseCreate(auth, deployment)
+  const [features, setFeatures] = useState({})
  const [collapsed, setCollapsed] = useState(() => {
    try { return localStorage.getItem(COLLAPSED_KEY) === 'true' } catch (_) { return false }
  })
  const [openSections, setOpenSections] = useState(loadSectionState)
+  const { isAdmin, authEnabled, user, logout, hasFeature } = useAuth()
  const branding = useBranding()
  const navigate = useNavigate()
  const location = useLocation()
  const closeBtnRef = useRef(null)

-  // Apply the cluster-cell Create-collapse default once, only when the user has
-  // no stored section preference (so we never override an explicit choice).
  useEffect(() => {
-    if (deployment.loading) return
-    let hasStored = false
-    try { hasStored = !!localStorage.getItem(SECTIONS_KEY) } catch { hasStored = false }
-    if (hasStored || !collapseCreate) return
-    setOpenSections(prev => (prev.create === false ? prev : { ...prev, create: false }))
-  }, [deployment.loading, collapseCreate])
+    fetch(apiUrl('/api/features')).then(r => r.json()).then(setFeatures).catch(() => {})
+  }, [])

  // Stay in sync with external collapse dispatches (e.g. the chat
  // page's focus mode). The collapse-toggle button still owns the
@@ -175,6 +157,8 @@ export default function Sidebar({ isOpen, onClose }) {
  }

  const visibleTopItems = topItems.filter(filterItem)
+  // Shared shape for the console gating helpers (consoleConfig.js).
+  const auth = { isAdmin, authEnabled, hasFeature, features }

  // Inline sections (Create) carry no gating; a plain filterItem pass suffices.
  const getVisibleSectionItems = (section) => section.items.filter(filterItem)
@@ -215,28 +199,6 @@ export default function Sidebar({ isOpen, onClose }) {
            ))}
          </div>

-          {/* Pinned Cluster quick-access (admin + distributed/p2p). Same gate
-              as the Operate rail; surfaced at the top for cluster operators. */}
-          {(() => {
-            const pinned = clusterPinItems(auth, deployment)
-            if (pinned.length === 0) return null
-            return (
-              <div className="sidebar-section">
-                <div className="sidebar-section-title">{t('operate.cluster')}</div>
-                <div className="sidebar-section-items">
-                  {pinned.map(item => (
-                    <NavItem
-                      key={item.path}
-                      item={{ path: item.path, icon: item.icon, labelKey: item.labelKey }}
-                      onClose={onClose}
-                      collapsed={collapsed}
-                    />
-                  ))}
-                </div>
-              </div>
-            )
-          })()}
-
          {/* Collapsible sections */}
          {sections.map(section => {
            const visibleItems = getVisibleSectionItems(section)
--- a/core/http/react-ui/src/components/StarterModels.jsx
+++ b/core/http/react-ui/src/components/StarterModels.jsx
@@ -0,0 +1,129 @@
+import { useState, useEffect, useMemo } from 'react'
+import { useTranslation } from 'react-i18next'
+import { modelsApi } from '../utils/api'
+import { useResources } from '../hooks/useResources'
+
+// Curated, hardware-tiered starter models for the empty-state onboarding. Names
+// are real gallery entries (gallery/index.yaml); we intersect them against the
+// live gallery after mount so a custom/trimmed gallery degrades gracefully
+// (unmatched entries are hidden; if nothing matches, the full list is shown).
+//
+// The guiding rule the maintainer asked for: CPU-only machines should be
+// steered to genuinely small models (1-4B, Q4) that stay responsive without a
+// GPU. GPU tiers scale the suggestion up with available VRAM.
+const SMALL = [
+  { name: 'llama-3.2-1b-instruct:q4_k_m', size: '~0.8 GB' },
+  { name: 'llama-3.2-3b-instruct:q4_k_m', size: '~2 GB' },
+  { name: 'qwen3-1.7b', size: '~1.4 GB' },
+  { name: 'gemma-3-1b-it', size: '~0.8 GB' },
+]
+const MID = [
+  { name: 'qwen3-4b', size: '~2.5 GB' },
+  { name: 'gemma-3-4b-it', size: '~3 GB' },
+  { name: 'llama-3.2-3b-instruct:q4_k_m', size: '~2 GB' },
+]
+const LARGE = [
+  { name: 'meta-llama-3.1-8b-instruct', size: '~5 GB' },
+  { name: 'qwen3-4b', size: '~2.5 GB' },
+  { name: 'mistral-7b-instruct-v0.3', size: '~4 GB' },
+]
+
+const GB = 1024 * 1024 * 1024
+
+// Pick a tier from detected hardware. total_memory is GPU VRAM in bytes (0 when
+// CPU-only). Thresholds are deliberately conservative so a suggestion that
+// "fits" really does.
+function pickTier(resources) {
+  const isGpu = resources?.type === 'gpu'
+  const vram = resources?.aggregate?.total_memory || 0
+  if (!isGpu || vram <= 0) return { id: 'cpu', list: SMALL }
+  if (vram < 8 * GB) return { id: 'gpu-small', list: MID }
+  return { id: 'gpu-large', list: LARGE }
+}
+
+export default function StarterModels({ addToast, onInstallStarted }) {
+  const { t } = useTranslation('home')
+  const { resources } = useResources()
+  const [available, setAvailable] = useState(null) // Set of gallery names, or null while loading
+  const [installing, setInstalling] = useState(() => new Set())
+
+  const tier = useMemo(() => pickTier(resources), [resources])
+  const candidates = tier.list
+
+  // Verify candidates exist in the live gallery. One search per name (the tier
+  // has at most a handful) keeps this resilient to gallery customization.
+  useEffect(() => {
+    let cancelled = false
+    const names = [...new Set(candidates.map(c => c.name))]
+    Promise.all(names.map(name =>
+      modelsApi.list({ search: name, page: 1 })
+        .then(data => (data?.models || []).some(m => (m.name || m.id) === name) ? name : null)
+        .catch(() => null)
+    )).then(found => {
+      if (cancelled) return
+      const hits = found.filter(Boolean)
+      // If verification yielded nothing (e.g. gallery unreachable), fall back to
+      // showing the curated list rather than an empty widget.
+      setAvailable(hits.length > 0 ? new Set(hits) : null)
+    })
+    return () => { cancelled = true }
+  }, [candidates])
+
+  const visible = available === null
+    ? candidates
+    : candidates.filter(c => available.has(c.name))
+
+  if (visible.length === 0) return null
+
+  const install = async (name) => {
+    setInstalling(prev => new Set(prev).add(name))
+    try {
+      await modelsApi.install(name)
+      addToast?.(t('starters.installStarted', { model: name }), 'success')
+      onInstallStarted?.(name)
+    } catch (err) {
+      addToast?.(t('starters.installFailed', { message: err.message }), 'error')
+      setInstalling(prev => {
+        const next = new Set(prev)
+        next.delete(name)
+        return next
+      })
+    }
+  }
+
+  return (
+    <section className="home-starters card">
+      <div className="home-starters-head">
+        <strong>{t('starters.title')}</strong>
+        <span className="home-starters-tier">
+          <i className={`fas ${tier.id === 'cpu' ? 'fa-memory' : 'fa-microchip'}`} aria-hidden="true" />
+          {t(`starters.tier.${tier.id}`)}
+        </span>
+      </div>
+      <p className="home-starters-sub">
+        {tier.id === 'cpu' ? t('starters.cpuNote') : t('starters.gpuNote')}
+      </p>
+      <ul className="home-starters-list">
+        {visible.map(c => {
+          const busy = installing.has(c.name)
+          return (
+            <li key={c.name} className="home-starters-item">
+              <span className="home-starters-name">{c.name}</span>
+              <span className="home-starters-size">{c.size}</span>
+              <button
+                type="button"
+                className="btn btn-primary btn-sm"
+                disabled={busy}
+                onClick={() => install(c.name)}
+              >
+                {busy
+                  ? (<><i className="fas fa-spinner fa-spin" aria-hidden="true" /> {t('starters.installing')}</>)
+                  : (<><i className="fas fa-download" aria-hidden="true" /> {t('starters.install')}</>)}
+              </button>
+            </li>
+          )
+        })}
+      </ul>
+    </section>
+  )
+}
--- a/core/http/react-ui/src/components/TopNavbar.jsx
+++ b/core/http/react-ui/src/components/TopNavbar.jsx
@@ -1,96 +0,0 @@
-import { useNavigate } from 'react-router-dom'
-import { useTranslation } from 'react-i18next'
-import { useAuth } from '../context/AuthContext'
-import { useDeployment } from '../contexts/DeploymentContext'
-import { useTheme } from '../contexts/ThemeContext'
-import { launchAssistantChat } from '../utils/launchAssistantChat'
-import TokenUsageMeter from './navbar/TokenUsageMeter'
-
-// Desktop top bar. Complementary to the mobile-only header in App.jsx: this is
-// hidden on small screens (see .top-navbar CSS) and shows deployment/admin
-// affordances on wide screens where the sidebar footer is far from the content.
-export default function TopNavbar() {
-  const { t } = useTranslation('nav')
-  const navigate = useNavigate()
-  const { isAdmin, authEnabled, user } = useAuth()
-  const { features, distributed, p2pEnabled } = useDeployment()
-  const { theme, toggleTheme } = useTheme()
-
-  const modeLabel = distributed
-    ? t('topbar.modeDistributed')
-    : p2pEnabled
-      ? t('topbar.modeSwarm')
-      : t('topbar.modeSingle')
-
-  const showAssistantJump = isAdmin && !!features.localai_assistant
-  const showAvatar = authEnabled && user
-  const themeLabel = theme === 'dark' ? t('switchToLightMode') : t('switchToDarkMode')
-
-  return (
-    <div className="top-navbar" role="navigation" aria-label={t('topbar.label')}>
-      <div className="top-navbar__left">
-        {isAdmin && (
-          <span className={`top-navbar__mode ${distributed || p2pEnabled ? 'is-active' : ''}`}>
-            <i className="fas fa-circle-nodes" aria-hidden="true" /> {modeLabel}
-          </span>
-        )}
-      </div>
-      <div className="top-navbar__right">
-        {!isAdmin && (
-          <button
-            type="button"
-            className="top-navbar__btn"
-            onClick={() => navigate('/app/chat')}
-            title={t('topbar.pickModel')}
-          >
-            <i className="fas fa-cube" aria-hidden="true" /> {t('topbar.pickModel')}
-          </button>
-        )}
-        {showAssistantJump && (
-          <button
-            type="button"
-            className="top-navbar__btn top-navbar__assistant"
-            onClick={() => launchAssistantChat(navigate)}
-            title={t('topbar.adminViaChat')}
-          >
-            <i className="fas fa-user-shield" aria-hidden="true" /> {t('topbar.adminViaChat')}
-          </button>
-        )}
-        {isAdmin && <TokenUsageMeter />}
-        {isAdmin && (
-          <button
-            type="button"
-            className="top-navbar__icon"
-            onClick={() => navigate('/app/settings')}
-            aria-label={t('items.settings')}
-            title={t('items.settings')}
-          >
-            <i className="fas fa-cog" aria-hidden="true" />
-          </button>
-        )}
-        <button
-          type="button"
-          className="top-navbar__icon"
-          onClick={toggleTheme}
-          aria-label={themeLabel}
-          title={themeLabel}
-        >
-          <i className={`fas ${theme === 'dark' ? 'fa-sun' : 'fa-moon'}`} aria-hidden="true" />
-        </button>
-        {showAvatar && (
-          <button
-            type="button"
-            className="top-navbar__icon top-navbar__avatar"
-            onClick={() => navigate('/app/account')}
-            aria-label={user.name || user.email}
-            title={user.name || user.email}
-          >
-            {user.avatarUrl
-              ? <img src={user.avatarUrl} alt="" />
-              : <i className="fas fa-user-circle" aria-hidden="true" />}
-          </button>
-        )}
-      </div>
-    </div>
-  )
-}
--- a/core/http/react-ui/src/components/navbar/TokenUsageMeter.jsx
+++ b/core/http/react-ui/src/components/navbar/TokenUsageMeter.jsx
@@ -1,52 +0,0 @@
-import { useState, useEffect } from 'react'
-import { useNavigate } from 'react-router-dom'
-import { useTranslation } from 'react-i18next'
-import { usageApi } from '../../utils/api'
-
-// Compact admin-only usage glance: today's total tokens, optionally against a
-// quota cap, linking to the full /app/usage page. Self-contained data fetch so
-// a usage-API failure cannot break the navbar - it just renders nothing.
-function sumTotalTokens(res) {
-  const buckets = res?.buckets || res?.usage || (Array.isArray(res) ? res : [])
-  if (!Array.isArray(buckets) || buckets.length === 0) return null
-  return buckets.reduce((s, b) => s + (b.total_tokens || 0), 0)
-}
-
-export default function TokenUsageMeter() {
-  const { t } = useTranslation('nav')
-  const navigate = useNavigate()
-  const [tokens, setTokens] = useState(null)
-  const [cap, setCap] = useState(null)
-
-  useEffect(() => {
-    let cancelled = false
-    usageApi.getAdminUsage('day')
-      .then(res => { if (!cancelled) setTokens(sumTotalTokens(res)) })
-      .catch(() => { if (!cancelled) setTokens(null) })
-    usageApi.getMyQuotas()
-      .then(q => { if (!cancelled) setCap(q?.token_limit || q?.tokens?.limit || null) })
-      .catch(() => { if (!cancelled) setCap(null) })
-    return () => { cancelled = true }
-  }, [])
-
-  if (tokens === null) return null
-
-  const pct = cap ? Math.min(100, Math.round((tokens / cap) * 100)) : null
-
-  return (
-    <button
-      type="button"
-      className="top-navbar__meter"
-      onClick={() => navigate('/app/usage')}
-      title={t('topbar.usageDetail')}
-    >
-      <span className="top-navbar__meter-label">
-        {t('topbar.tokensToday')}: {Intl.NumberFormat().format(tokens)}
-        {cap ? ` / ${Intl.NumberFormat().format(cap)}` : ''}
-      </span>
-      {pct !== null && (
-        <span className="top-navbar__meter-bar"><i style={{ width: `${pct}%` }} /></span>
-      )}
-    </button>
-  )
-}
--- a/core/http/react-ui/src/contexts/DeploymentContext.jsx
+++ b/core/http/react-ui/src/contexts/DeploymentContext.jsx
@@ -1,55 +0,0 @@
-import { createContext, useContext, useState, useEffect } from 'react'
-import { apiUrl } from '../utils/basePath'
-import { p2pApi } from '../utils/api'
-
-const DeploymentContext = createContext(null)
-
-// One shared fetch of the deployment-shape signals the adaptive UI keys off:
-// server features (/api/features) and whether a P2P network token exists.
-// Components used to fetch /api/features independently (Sidebar, Home); this
-// centralises it so the landing resolver, sidebar policy, and navbar agree on
-// one snapshot and we issue a single request.
-export function DeploymentProvider({ children }) {
-  const [features, setFeatures] = useState({})
-  const [p2pEnabled, setP2pEnabled] = useState(false)
-  const [loading, setLoading] = useState(true)
-
-  useEffect(() => {
-    let cancelled = false
-    const featuresP = fetch(apiUrl('/api/features'))
-      .then(r => r.json())
-      .catch(() => ({}))
-    // P2P has no /api/features flag: it is "enabled" when a network token
-    // exists (mirrors pages/P2P.jsx). A 404/disabled endpoint throws and we
-    // treat that as not-enabled.
-    const p2pP = p2pApi.getToken()
-      .then(tok => (typeof tok === 'string' ? tok : (tok?.token || '')).trim())
-      .catch(() => '')
-    Promise.all([featuresP, p2pP]).then(([f, tok]) => {
-      if (cancelled) return
-      setFeatures(f || {})
-      setP2pEnabled(!!tok)
-      setLoading(false)
-    })
-    return () => { cancelled = true }
-  }, [])
-
-  const value = {
-    features,
-    distributed: !!features.distributed,
-    p2pEnabled,
-    loading,
-  }
-
-  return (
-    <DeploymentContext.Provider value={value}>
-      {children}
-    </DeploymentContext.Provider>
-  )
-}
-
-export function useDeployment() {
-  const ctx = useContext(DeploymentContext)
-  if (!ctx) throw new Error('useDeployment must be used within DeploymentProvider')
-  return ctx
-}
--- a/core/http/react-ui/src/hooks/usePolling.js
+++ b/core/http/react-ui/src/hooks/usePolling.js
@@ -0,0 +1,66 @@
+import { useEffect, useRef, useCallback } from 'react'
+
+// usePolling runs `fn` immediately and then on a fixed interval, with two
+// behaviours every hand-rolled setInterval in this app was missing:
+//
+//   1. Visibility-aware: the timer pauses while the tab is hidden
+//      (document.hidden) and fires an immediate catch-up poll when the tab
+//      becomes visible again. A backgrounded dashboard no longer hammers the
+//      server every few seconds for data nobody is looking at.
+//   2. Non-overlapping: if `fn` returns a promise that takes longer than the
+//      interval, ticks that fire while it is in flight are skipped, not stacked.
+//
+// `enabled: false` stops polling entirely, initial run included (one-shot or gated polls).
+// The returned `refetch` is stable across renders and runs `fn` on demand unless a run is in flight.
+export function usePolling(fn, intervalMs = 5000, { enabled = true, immediate = true } = {}) {
+  const fnRef = useRef(fn)
+  fnRef.current = fn
+
+  const runningRef = useRef(false)
+  const refetch = useCallback(async () => {
+    // Guard against overlap: a slow poll shouldn't pile up behind a fast timer.
+    if (runningRef.current) return
+    runningRef.current = true
+    try {
+      return await fnRef.current()
+    } finally {
+      runningRef.current = false
+    }
+  }, [])
+
+  useEffect(() => {
+    if (!enabled) return
+    let timer = null
+
+    const tick = () => { refetch() }
+
+    const start = () => {
+      if (timer != null) return
+      timer = setInterval(tick, intervalMs)
+    }
+    const stop = () => {
+      if (timer != null) { clearInterval(timer); timer = null }
+    }
+
+    const onVisibility = () => {
+      if (document.hidden) {
+        stop()
+      } else {
+        // Catch up immediately on return, then resume the cadence.
+        tick()
+        start()
+      }
+    }
+
+    if (immediate) tick()
+    if (!document.hidden) start()
+    document.addEventListener('visibilitychange', onVisibility)
+
+    return () => {
+      stop()
+      document.removeEventListener('visibilitychange', onVisibility)
+    }
+  }, [enabled, intervalMs, immediate, refetch])
+
+  return { refetch }
+}
--- a/core/http/react-ui/src/hooks/useResources.js
+++ b/core/http/react-ui/src/hooks/useResources.js
@@ -1,11 +1,11 @@
-import { useState, useEffect, useCallback, useRef } from 'react'
+import { useState, useCallback } from 'react'
 import { resourcesApi } from '../utils/api'
+import { usePolling } from './usePolling'

 export function useResources(pollInterval = 5000) {
  const [resources, setResources] = useState(null)
  const [loading, setLoading] = useState(true)
  const [error, setError] = useState(null)
-  const intervalRef = useRef(null)

  const fetchResources = useCallback(async () => {
    try {
@@ -19,13 +19,10 @@ export function useResources(pollInterval = 5000) {
    }
  }, [])

-  useEffect(() => {
-    fetchResources()
-    intervalRef.current = setInterval(fetchResources, pollInterval)
-    return () => {
-      if (intervalRef.current) clearInterval(intervalRef.current)
-    }
-  }, [fetchResources, pollInterval])
+  // Visibility-aware polling: pauses while the tab is hidden and catches up on
+  // return (see usePolling). Resource stats are pure dashboard data, so there's
+  // no reason to keep fetching them for a backgrounded tab.
+  const { refetch } = usePolling(fetchResources, pollInterval)

-  return { resources, loading, error, refetch: fetchResources }
+  return { resources, loading, error, refetch }
 }
--- a/core/http/react-ui/src/main.jsx
+++ b/core/http/react-ui/src/main.jsx
@@ -4,7 +4,6 @@ import { RouterProvider } from 'react-router-dom'
 import { ThemeProvider } from './contexts/ThemeContext'
 import { BrandingProvider } from './contexts/BrandingContext'
 import { AuthProvider } from './context/AuthContext'
-import { DeploymentProvider } from './contexts/DeploymentContext'
 import { OperationsProvider } from './contexts/OperationsContext'
 import { router } from './router'
 import './i18n'
@@ -33,11 +32,9 @@ createRoot(document.getElementById('root')).render(
      <ThemeProvider>
        <BrandingProvider>
          <AuthProvider>
-            <DeploymentProvider>
-              <OperationsProvider>
-                <RouterProvider router={router} />
-              </OperationsProvider>
-            </DeploymentProvider>
+            <OperationsProvider>
+              <RouterProvider router={router} />
+            </OperationsProvider>
          </AuthProvider>
        </BrandingProvider>
      </ThemeProvider>
--- a/core/http/react-ui/src/pages/AgentChat.jsx
+++ b/core/http/react-ui/src/pages/AgentChat.jsx
@@ -765,8 +765,10 @@ export default function AgentChat() {
            className="chat-send-btn"
            onClick={handleSend}
            disabled={processing || !input.trim()}
+            aria-label="Send message"
+            title="Send message"
          >
-            <i className="fas fa-paper-plane" />
+            <i className="fas fa-paper-plane" aria-hidden="true" />
          </button>
        </div>
      </div>
--- a/core/http/react-ui/src/pages/Chat.jsx
+++ b/core/http/react-ui/src/pages/Chat.jsx
@@ -541,73 +541,58 @@ export default function Chat() {
    updateChatSettings(activeChat.id, { clientMCPServers: next })
  }, [activeChat, updateChatSettings])

-  // Load initial message / assistant launch from the Home page or the navbar
-  // quick-jump. Factored into a callback so both the mount-time reader and the
-  // navbar re-trigger event below consume the same payload through one path.
+  // Load the initial message or assistant launch handed over from the home page
  const homeDataProcessed = useRef(false)
-  const consumeHomeChatData = useCallback(() => {
-    const stored = localStorage.getItem('localai_index_chat_data')
-    if (!stored) return
-    try {
-      const data = JSON.parse(stored)
-      localStorage.removeItem('localai_index_chat_data')
-
-      // Two entry shapes from Home:
-      //   - "compose-and-send": data.message present → open new chat,
-      //     prefill the composer, click submit.
-      //   - "open-assistant": no message, just data.localaiAssistant → open
-      //     a fresh chat already in admin mode so the wizard can fire.
-      const hasMessage = !!data.message
-      const wantsAssistant = !!data.localaiAssistant
-
-      if (hasMessage || wantsAssistant) {
-        let targetChat = activeChat
-        if (data.newChat) {
-          targetChat = addChat(data.model || '', '', data.mcpMode || false)
-        } else {
-          if (data.model && activeChat) {
-            updateChatSettings(activeChat.id, { model: data.model })
-          }
-          if (data.mcpMode && activeChat) {
-            updateChatSettings(activeChat.id, { mcpMode: true })
-          }
-        }
-        if (data.mcpServers?.length > 0 && targetChat) {
-          updateChatSettings(targetChat.id, { mcpServers: data.mcpServers })
-        }
-        if (data.clientMCPServers?.length > 0 && targetChat) {
-          updateChatSettings(targetChat.id, { clientMCPServers: data.clientMCPServers })
-        }
-        if (wantsAssistant && targetChat) {
-          updateChatSettings(targetChat.id, { localaiAssistant: true })
-        }
-        if (hasMessage) {
-          setInput(data.message)
-          if (data.files) setFiles(data.files)
-          setTimeout(() => {
-            const submitBtn = document.getElementById('chat-submit-btn')
-            submitBtn?.click()
-          }, 100)
-        }
-      }
-    } catch (_e) { /* ignore */ }
-  }, [activeChat, addChat, updateChatSettings])
-
  useEffect(() => {
    if (homeDataProcessed.current) return
-    homeDataProcessed.current = true
-    consumeHomeChatData()
-  }, [consumeHomeChatData])
+    const stored = localStorage.getItem('localai_index_chat_data')
+    if (stored) {
+      homeDataProcessed.current = true
+      try {
+        const data = JSON.parse(stored)
+        localStorage.removeItem('localai_index_chat_data')

-  // Admins can re-trigger the assistant jump from the navbar while already on
-  // the chat page; navigate('/app/chat') does not remount Chat, so the
-  // mount-time reader above never fires. The launcher dispatches this event
-  // after writing the payload so we re-consume it and open a fresh assistant.
-  useEffect(() => {
-    const onOpenAssistant = () => consumeHomeChatData()
-    window.addEventListener('localai-open-assistant', onOpenAssistant)
-    return () => window.removeEventListener('localai-open-assistant', onOpenAssistant)
-  }, [consumeHomeChatData])
+        // Two entry shapes from Home:
+        //   - "compose-and-send": data.message present → open new chat,
+        //     prefill the composer, click submit.
+        //   - "open-assistant": no message, just data.localaiAssistant → open
+        //     a fresh chat already in admin mode so the wizard can fire.
+        const hasMessage = !!data.message
+        const wantsAssistant = !!data.localaiAssistant
+
+        if (hasMessage || wantsAssistant) {
+          let targetChat = activeChat
+          if (data.newChat) {
+            targetChat = addChat(data.model || '', '', data.mcpMode || false)
+          } else {
+            if (data.model && activeChat) {
+              updateChatSettings(activeChat.id, { model: data.model })
+            }
+            if (data.mcpMode && activeChat) {
+              updateChatSettings(activeChat.id, { mcpMode: true })
+            }
+          }
+          if (data.mcpServers?.length > 0 && targetChat) {
+            updateChatSettings(targetChat.id, { mcpServers: data.mcpServers })
+          }
+          if (data.clientMCPServers?.length > 0 && targetChat) {
+            updateChatSettings(targetChat.id, { clientMCPServers: data.clientMCPServers })
+          }
+          if (wantsAssistant && targetChat) {
+            updateChatSettings(targetChat.id, { localaiAssistant: true })
+          }
+          if (hasMessage) {
+            setInput(data.message)
+            if (data.files) setFiles(data.files)
+            setTimeout(() => {
+              const submitBtn = document.getElementById('chat-submit-btn')
+              submitBtn?.click()
+            }, 100)
+          }
+        }
+      } catch (_e) { /* ignore */ }
+    }
+  }, [])

  // Track whether the user is pinned to the bottom. If they scroll up
  // while a response is streaming, stop forcing them back down.
@@ -1442,8 +1427,10 @@ export default function Chat() {
                className="chat-send-btn"
                onClick={handleSend}
                disabled={!input.trim() && files.length === 0}
+                aria-label={t('input.send')}
+                title={t('input.send')}
              >
-                <i className="fas fa-paper-plane" />
+                <i className="fas fa-paper-plane" aria-hidden="true" />
              </button>
            )}
          </div>
--- a/core/http/react-ui/src/pages/Home.jsx
+++ b/core/http/react-ui/src/pages/Home.jsx
@@ -10,14 +10,15 @@ import UnifiedMCPDropdown from '../components/UnifiedMCPDropdown'
 import ConfirmDialog from '../components/ConfirmDialog'
 import HomeConnect from '../components/HomeConnect'
 import { useResources } from '../hooks/useResources'
+import { usePolling } from '../hooks/usePolling'
 import { fileToBase64, backendControlApi, systemApi, modelsApi, mcpApi, nodesApi } from '../utils/api'
 import { API_CONFIG } from '../utils/config'
 import { greetingKey } from '../utils/greeting'
-import { launchAssistantChat } from '../utils/launchAssistantChat'
 import StatusPill from '../components/StatusPill'
 import Skeleton from '../components/Skeleton'
 import SectionHeading from '../components/SectionHeading'
 import EmptyState from '../components/EmptyState'
+import StarterModels from '../components/StarterModels'
 import { staggerStyle } from '../hooks/useStagger'

 export default function Home() {
@@ -69,40 +70,36 @@ export default function Home() {
      .catch(() => {})
  }, [])

-  // Poll cluster node data in distributed mode
-  useEffect(() => {
-    if (!distributedMode) return
-    const fetchCluster = async () => {
-      try {
-        const data = await nodesApi.list()
-        const nodes = Array.isArray(data) ? data : []
-        const backendNodes = nodes.filter(n => !n.node_type || n.node_type === 'backend')
-        const totalVRAM = backendNodes.reduce((sum, n) => sum + (n.total_vram || 0), 0)
-        const usedVRAM = backendNodes.reduce((sum, n) => {
-          if (n.total_vram && n.available_vram != null) return sum + (n.total_vram - n.available_vram)
-          return sum
-        }, 0)
-        const totalRAM = backendNodes.reduce((sum, n) => sum + (n.total_ram || 0), 0)
-        const usedRAM = backendNodes.reduce((sum, n) => {
-          if (n.total_ram && n.available_ram != null) return sum + (n.total_ram - n.available_ram)
-          return sum
-        }, 0)
-        const isGPU = totalVRAM > 0
-        const healthyCount = backendNodes.filter(n => n.status === 'healthy').length
-        const totalCount = backendNodes.length
-        setClusterData({
-          totalMem: isGPU ? totalVRAM : totalRAM,
-          usedMem: isGPU ? usedVRAM : usedRAM,
-          isGPU,
-          healthyCount,
-          totalCount,
-        })
-      } catch { setClusterData(null) }
-    }
-    fetchCluster()
-    const interval = setInterval(fetchCluster, 5000)
-    return () => clearInterval(interval)
-  }, [distributedMode])
+  // Poll cluster node data in distributed mode. Visibility-aware + gated on
+  // distributedMode so a non-distributed or backgrounded tab makes no calls.
+  const fetchCluster = useCallback(async () => {
+    try {
+      const data = await nodesApi.list()
+      const nodes = Array.isArray(data) ? data : []
+      const backendNodes = nodes.filter(n => !n.node_type || n.node_type === 'backend')
+      const totalVRAM = backendNodes.reduce((sum, n) => sum + (n.total_vram || 0), 0)
+      const usedVRAM = backendNodes.reduce((sum, n) => {
+        if (n.total_vram && n.available_vram != null) return sum + (n.total_vram - n.available_vram)
+        return sum
+      }, 0)
+      const totalRAM = backendNodes.reduce((sum, n) => sum + (n.total_ram || 0), 0)
+      const usedRAM = backendNodes.reduce((sum, n) => {
+        if (n.total_ram && n.available_ram != null) return sum + (n.total_ram - n.available_ram)
+        return sum
+      }, 0)
+      const isGPU = totalVRAM > 0
+      const healthyCount = backendNodes.filter(n => n.status === 'healthy').length
+      const totalCount = backendNodes.length
+      setClusterData({
+        totalMem: isGPU ? totalVRAM : totalRAM,
+        usedMem: isGPU ? usedVRAM : usedRAM,
+        isGPU,
+        healthyCount,
+        totalCount,
+      })
+    } catch { setClusterData(null) }
+  }, [])
+  usePolling(fetchCluster, 5000, { enabled: distributedMode })

  // Fetch configured models (to know if any exist) and loaded models (currently running)
  const fetchSystemInfo = useCallback(async () => {
@@ -124,11 +121,7 @@ export default function Home() {
    }
  }, [])

-  useEffect(() => {
-    fetchSystemInfo()
-    const interval = setInterval(fetchSystemInfo, 5000)
-    return () => clearInterval(interval)
-  }, [fetchSystemInfo])
+  usePolling(fetchSystemInfo, 5000)

  // Check MCP availability when selected model changes
  useEffect(() => {
@@ -229,8 +222,16 @@ export default function Home() {
  // requiring an initial message or model selection. Useful when an admin
  // wants to start the assistant from a cold home page.
  const openAssistantChat = useCallback(() => {
-    launchAssistantChat(navigate, selectedModel)
+    const chatData = {
+      model: selectedModel || '',
+      mcpMode: false,
+      localaiAssistant: true,
+      newChat: true,
+    }
+    try { localStorage.setItem('localai_index_chat_data', JSON.stringify(chatData)) } catch { /* storage blocked or full: still open the chat */ }
+    try { localStorage.setItem('localai_assistant_used', '1') } catch { /* ignore */ }
    setAssistantUsed(true)
+    navigate('/app/chat')
  }, [navigate, selectedModel])

  const handleSubmit = (e) => {
@@ -516,6 +517,8 @@ export default function Home() {
            </div>
          </div>

+          <StarterModels addToast={addToast} onInstallStarted={fetchSystemInfo} />
+
          <div className="home-wizard-actions">
            <button className="btn btn-primary" onClick={() => navigate('/app/models')}>
              <i className="fas fa-store" /> {t('wizard.browseGallery')}
--- a/core/http/react-ui/src/pages/Usage.jsx
+++ b/core/http/react-ui/src/pages/Usage.jsx
@@ -24,7 +24,37 @@ function formatNumber(n) {
  return String(n)
 }

-function StatCard({ icon, label, value, muted }) {
+// Opt-in token pricing. LocalAI is self-hosted and has no inherent monetary
+// cost, but multi-user deployments use estimated cost for chargeback/budgeting.
+// Prices are admin-supplied $ per 1M tokens, stored locally (per-browser), and
+// the whole cost surface stays hidden until a non-zero price is set.
+const TOKEN_PRICING_KEY = 'localai_token_pricing'
+
+function loadPricing() {
+  try {
+    const p = JSON.parse(localStorage.getItem(TOKEN_PRICING_KEY) || '{}')
+    return { prompt: Number(p.prompt) || 0, completion: Number(p.completion) || 0 }
+  } catch { return { prompt: 0, completion: 0 } }
+}
+
+function savePricing(p) {
+  try { localStorage.setItem(TOKEN_PRICING_KEY, JSON.stringify(p)) } catch { /* ignore */ }
+}
+
+function pricingEnabled(p) { return (p?.prompt || 0) > 0 || (p?.completion || 0) > 0 }
+
+function costOf(row, p) {
+  return (row.prompt_tokens / 1_000_000) * (p.prompt || 0)
+       + (row.completion_tokens / 1_000_000) * (p.completion || 0)
+}
+
+function formatCost(n) {
+  if (!n) return '$0.00'
+  if (n < 0.01) return '<$0.01'
+  return '$' + n.toFixed(2)
+}
+
+function StatCard({ icon, label, value, muted, text }) {
  return (
    <div className="card" style={{ padding: 'var(--spacing-sm) var(--spacing-md)', flex: '1 1 0', minWidth: 120, opacity: muted ? 0.7 : 1 }}>
      <div style={{ display: 'flex', alignItems: 'center', gap: 6, marginBottom: 2 }}>
@@ -32,7 +62,7 @@ function StatCard({ icon, label, value, muted }) {
        <span style={{ fontSize: '0.6875rem', color: 'var(--color-text-muted)', fontWeight: 500, textTransform: 'uppercase', letterSpacing: '0.03em' }}>{label}</span>
      </div>
      <div style={{ fontSize: '1.375rem', fontWeight: 700, fontFamily: 'var(--font-mono)', color: muted ? 'var(--color-text-secondary)' : 'var(--color-text-primary)' }}>
-        {muted ? '~' : ''}{formatNumber(value)}
+        {text != null ? text : `${muted ? '~' : ''}${formatNumber(value)}`}
      </div>
    </div>
  )
@@ -642,6 +672,10 @@ export default function Usage() {
  const [activeTab, setActiveTab] = useState('models')
  const [quotas, setQuotas] = useState([])
  const [selectedUserId, setSelectedUserId] = useState(null)
+  const [pricing, setPricingState] = useState(loadPricing)
+  const [showPricing, setShowPricing] = useState(false)
+  const setPricing = (p) => { setPricingState(p); savePricing(p) }
+  const costEnabled = pricingEnabled(pricing)

  const fetchUsage = useCallback(async () => {
    setLoading(true)
@@ -743,11 +777,50 @@ export default function Usage() {
          <i className="fas fa-key" style={{ fontSize: '0.7rem' }} /> {t('usage.sources.tab')}
        </button>
        <div style={{ flex: 1 }} />
+        <button
+          className={`btn btn-sm ${costEnabled ? 'btn-primary' : 'btn-secondary'}`}
+          onClick={() => setShowPricing(v => !v)}
+          style={{ gap: 4 }}
+          title="Set token pricing to estimate cost"
+        >
+          <i className="fas fa-dollar-sign" /> {costEnabled ? 'Pricing' : 'Set pricing'}
+        </button>
        <button className="btn btn-secondary btn-sm" onClick={fetchUsage} disabled={loading} style={{ gap: 4 }}>
          <i className={`fas fa-rotate${loading ? ' fa-spin' : ''}`} /> Refresh
        </button>
      </div>

+      {showPricing && (
+        <div className="card" style={{ display: 'flex', alignItems: 'flex-end', gap: 'var(--spacing-md)', flexWrap: 'wrap', padding: 'var(--spacing-md)', marginBottom: 'var(--spacing-md)' }}>
+          <div style={{ display: 'flex', flexDirection: 'column', gap: 2 }}>
+            <label style={{ fontSize: '0.6875rem', color: 'var(--color-text-muted)', textTransform: 'uppercase', letterSpacing: '0.03em' }}>Prompt $/1M tokens</label>
+            <input
+              className="input" type="number" min="0" step="0.01" style={{ width: 140 }}
+              value={pricing.prompt || ''}
+              placeholder="0.00"
+              onChange={e => setPricing({ ...pricing, prompt: Number(e.target.value) || 0 })}
+            />
+          </div>
+          <div style={{ display: 'flex', flexDirection: 'column', gap: 2 }}>
+            <label style={{ fontSize: '0.6875rem', color: 'var(--color-text-muted)', textTransform: 'uppercase', letterSpacing: '0.03em' }}>Completion $/1M tokens</label>
+            <input
+              className="input" type="number" min="0" step="0.01" style={{ width: 140 }}
+              value={pricing.completion || ''}
+              placeholder="0.00"
+              onChange={e => setPricing({ ...pricing, completion: Number(e.target.value) || 0 })}
+            />
+          </div>
+          {costEnabled && (
+            <button className="btn btn-secondary btn-sm" onClick={() => setPricing({ prompt: 0, completion: 0 })} style={{ gap: 4 }}>
+              <i className="fas fa-times" /> Clear
+            </button>
+          )}
+          <span style={{ fontSize: '0.75rem', color: 'var(--color-text-muted)', flex: '1 1 200px' }}>
+            Estimated cost only. Prices are stored in this browser and applied to recorded token counts.
+          </span>
+        </div>
+      )}
+
      {loading ? (
        <div style={{ display: 'flex', justifyContent: 'center', padding: 'var(--spacing-xl)' }}>
          <LoadingSpinner size="lg" />
@@ -760,6 +833,9 @@ export default function Usage() {
            <StatCard icon="fas fa-arrow-up" label="Prompt" value={displayTotals.prompt_tokens} />
            <StatCard icon="fas fa-arrow-down" label="Completion" value={displayTotals.completion_tokens} />
            <StatCard icon="fas fa-coins" label="Total" value={displayTotals.total_tokens} />
+            {costEnabled && (
+              <StatCard icon="fas fa-dollar-sign" label="Est. Cost" text={formatCost(costOf(displayTotals, pricing))} />
+            )}
          </div>

          {/* Predictions */}
@@ -789,6 +865,7 @@ export default function Usage() {
                      <th style={{ width: 110 }}>Prompt</th>
                      <th style={{ width: 110 }}>Completion</th>
                      <th style={{ width: 110 }}>Total</th>
+                      {costEnabled && <th style={{ width: 100 }}>Est. Cost</th>}
                      <th style={{ width: 140 }}></th>
                    </tr>
                  </thead>
@@ -800,6 +877,7 @@ export default function Usage() {
                        <td style={monoCell}>{formatNumber(row.prompt_tokens)}</td>
                        <td style={monoCell}>{formatNumber(row.completion_tokens)}</td>
                        <td style={{ ...monoCell, fontWeight: 600 }}>{formatNumber(row.total_tokens)}</td>
+                        {costEnabled && <td style={monoCell}>{formatCost(costOf(row, pricing))}</td>}
                        <td><UsageBar value={row.total_tokens} max={maxTokens} /></td>
                      </tr>
                    ))}
@@ -827,6 +905,7 @@ export default function Usage() {
                      <th style={{ width: 110 }}>Prompt</th>
                      <th style={{ width: 110 }}>Completion</th>
                      <th style={{ width: 110 }}>Total</th>
+                      {costEnabled && <th style={{ width: 100 }}>Est. Cost</th>}
                      <th style={{ width: 110 }}>Proj. Total</th>
                      <th style={{ width: 140 }}></th>
                    </tr>
@@ -849,6 +928,7 @@ export default function Usage() {
                            <td style={monoCell}>{formatNumber(row.prompt_tokens)}</td>
                            <td style={monoCell}>{formatNumber(row.completion_tokens)}</td>
                            <td style={{ ...monoCell, fontWeight: 600 }}>{formatNumber(row.total_tokens)}</td>
+                            {costEnabled && <td style={monoCell}>{formatCost(costOf(row, pricing))}</td>}
                            <td style={{ ...monoCell, color: 'var(--color-text-muted)', fontStyle: 'italic' }}>
                              {up?.predictions ? `~${formatNumber(up.predictions.projectedTotals.total_tokens)}` : '-'}
                            </td>
@@ -856,7 +936,7 @@ export default function Usage() {
                          </tr>
                          {isExpanded && up && (
                            <tr>
-                              <td colSpan={8} style={{ padding: 0, background: 'var(--color-bg-secondary)' }}>
+                              <td colSpan={costEnabled ? 9 : 8} style={{ padding: 0, background: 'var(--color-bg-secondary)' }}>
                                <div style={{ padding: 'var(--spacing-md)' }}>
                                  {up.predictions && (
                                    <div style={{ display: 'grid', gridTemplateColumns: 'repeat(auto-fit, minmax(100px, 1fr))', gap: 'var(--spacing-xs)', marginBottom: 'var(--spacing-sm)' }}>
--- a/core/http/react-ui/src/router.jsx
+++ b/core/http/react-ui/src/router.jsx
@@ -6,7 +6,6 @@ import RequireAdmin from './components/RequireAdmin'
 import RequireAuth from './components/RequireAuth'
 import RequireAuthEnabled from './components/RequireAuthEnabled'
 import RequireFeature from './components/RequireFeature'
-import HomeRoute from './components/HomeRoute'

 // Pages are code-split: each becomes its own chunk loaded on demand, so a route
 // no longer drags every other page (and its heavy deps — CodeMirror, the MCP
@@ -33,7 +32,7 @@ export function preloadRoute(path) {
  preloaders[m[1] ?? '']?.().catch(() => { /* network blip — real click will retry */ })
 }

-page('', () => import('./pages/Home'))
+const Home = page('', () => import('./pages/Home'))
 const Chat = page('chat', () => import('./pages/Chat'))
 const Models = page('models', () => import('./pages/Models'))
 const Manage = page('manage', () => import('./pages/Manage'))
@@ -97,7 +96,7 @@ function Feature({ feature, children }) {
 }

 const appChildren = [
-  { index: true, element: <HomeRoute /> },
+  { index: true, element: <Home /> },
  { path: 'chat', element: <Chat /> },
  { path: 'chat/:model', element: <Chat /> },
  { path: 'image', element: <ImageGen /> },
--- a/core/http/react-ui/src/utils/launchAssistantChat.js
+++ b/core/http/react-ui/src/utils/launchAssistantChat.js
@@ -1,19 +0,0 @@
-// Opens a fresh chat already in LocalAI Assistant ("manage") mode. Chat.jsx
-// reads localai_index_chat_data on mount and enables localaiAssistant for the
-// new chat. Shared by the Home CTA and the top navbar quick-jump so there is
-// one definition of how the assistant is launched.
-export function launchAssistantChat(navigate, model = '') {
-  const chatData = {
-    model: model || '',
-    mcpMode: false,
-    localaiAssistant: true,
-    newChat: true,
-  }
-  try { localStorage.setItem('localai_index_chat_data', JSON.stringify(chatData)) } catch { /* ignore */ }
-  try { localStorage.setItem('localai_assistant_used', '1') } catch { /* ignore */ }
-  navigate('/app/chat')
-  // When already on /app/chat, navigate() does not remount Chat, so its
-  // mount-time reader would never see the payload above. Signal the mounted
-  // Chat to re-consume it; harmless elsewhere since Chat reads on mount anyway.
-  try { window.dispatchEvent(new CustomEvent('localai-open-assistant')) } catch { /* ignore */ }
-}
--- a/core/http/react-ui/src/utils/resolveHome.js
+++ b/core/http/react-ui/src/utils/resolveHome.js
@@ -1,11 +0,0 @@
-// Pure landing-page resolver for the index route. Returns a target path, or ''
-// meaning "render the default Home". Admin precedence is distributed > p2p >
-// plain; non-admins always go to Chat (distributed/p2p are admin-only and
-// invisible to them). Visibility gates are enforced elsewhere - this only
-// chooses where /app lands.
-export function resolveHome({ isAdmin, distributed, p2pEnabled }) {
-  if (!isAdmin) return '/app/chat'
-  if (distributed) return '/app/nodes'
-  if (p2pEnabled) return '/app/p2p'
-  return ''
-}
--- a/core/http/react-ui/src/utils/sidebarPolicy.js
+++ b/core/http/react-ui/src/utils/sidebarPolicy.js
@@ -1,20 +0,0 @@
-import { operateConsole, isConsoleItemVisible } from '../components/console/consoleConfig'
-
-// The Operate > Cluster group, surfaced as a pinned top-of-sidebar quick-access
-// group when the admin is running a cluster (NATS-distributed) or a P2P swarm.
-// Items are filtered through the SAME gate as everywhere else, so e.g. in a
-// p2p-only deployment Nodes/Scheduling (feature: 'distributed') drop out and
-// only Swarm remains. Returns [] when the pin does not apply.
-export function clusterPinItems(auth, deployment) {
-  if (!auth.isAdmin) return []
-  if (!deployment.distributed && !deployment.p2pEnabled) return []
-  const group = operateConsole.groups.find(g => g.titleKey === 'operate.cluster')
-  if (!group) return []
-  return group.items.filter(item => isConsoleItemVisible(item, auth))
-}
-
-// In the cluster cells the Create group defaults collapsed so the pinned
-// cluster group leads. Users can still expand it; their stored choice wins.
-export function shouldCollapseCreate(auth, deployment) {
-  return !!auth.isAdmin && (!!deployment.distributed || !!deployment.p2pEnabled)
-}
--- a/core/services/distributed/gallery.go
+++ b/core/services/distributed/gallery.go
@@ -79,21 +79,29 @@ func (s *GalleryStore) Create(op *GalleryOperationRecord) error {
 	}).Create(op).Error
 }

-// UpdateProgress updates progress for an operation.
-func (s *GalleryStore) UpdateProgress(id string, progress float64, message, downloadedSize string) error {
+// UpdateProgress updates progress for an operation. The cancellable flag is
+// persisted on every tick so a replica that restarts mid-install rehydrates the
+// op as still cancellable — otherwise the column keeps its Create-time zero
+// value (false), the UI hides the cancel button, and the orphaned op can only
+// be dismissed by waiting for the 30-minute stale reaper.
+func (s *GalleryStore) UpdateProgress(id string, progress float64, message, downloadedSize string, cancellable bool) error {
 	return s.db.Model(&GalleryOperationRecord{}).Where("id = ?", id).Updates(map[string]any{
 		"progress":             progress,
 		"message":              message,
 		"downloaded_file_size": downloadedSize,
+		"cancellable":          cancellable,
 		"updated_at":           time.Now(),
 	}).Error
 }

-// UpdateStatus updates the status of an operation.
+// UpdateStatus updates the status of an operation. A terminal status is never
+// cancellable, so the flag is cleared here to keep the persisted row consistent
+// with what the UI should offer.
 func (s *GalleryStore) UpdateStatus(id, status, errMsg string) error {
 	updates := map[string]any{
-		"status":     status,
-		"updated_at": time.Now(),
+		"status":      status,
+		"cancellable": false,
+		"updated_at":  time.Now(),
 	}
 	if errMsg != "" {
 		updates["error"] = errMsg
--- a/core/services/galleryop/cancellable_persist_test.go
+++ b/core/services/galleryop/cancellable_persist_test.go
@@ -0,0 +1,56 @@
+package galleryop_test
+
+import (
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/services/distributed"
+	"github.com/mudler/LocalAI/core/services/galleryop"
+	"github.com/mudler/LocalAI/core/services/testutil"
+)
+
+// Reproduces "an in-flight install can't be cancelled after a restart". The
+// live install path marks OpStatus.Cancellable=true on every progress tick, but
+// UpdateStatus persisted progress/status to the gallery store WITHOUT the
+// cancellable flag, and Create defaulted it to false. So after a replica
+// restart Hydrate rebuilt the op with Cancellable=false, /api/operations
+// reported cancellable:false, and the UI hid the cancel button — the orphaned
+// op lingered until the 30-minute stale reaper expired it. The cancellable
+// state must be persisted so a rehydrated in-flight op stays cancellable.
+var _ = Describe("GalleryService cancellable persistence across restart", func() {
+	It("rehydrates an in-flight op as still cancellable", func() {
+		db := testutil.SetupTestDB()
+		store, err := distributed.NewGalleryStore(db)
+		Expect(err).ToNot(HaveOccurred())
+
+		svc := galleryop.NewGalleryService(&config.ApplicationConfig{}, nil)
+		svc.SetGalleryStore(store)
+
+		// Seed the in-flight op row with Cancellable left at its zero value, so only the progress tick below can make it cancellable.
+		Expect(store.Create(&distributed.GalleryOperationRecord{
+			ID:                 "op-inflight",
+			GalleryElementName: "llama-cpp-development",
+			OpType:             "backend_install",
+			Status:             "pending",
+		})).To(Succeed())
+
+		// Simulate a progress tick: the live path always marks installs
+		// cancellable while they are downloading/processing.
+		svc.UpdateStatus("op-inflight", &galleryop.OpStatus{
+			Message:     "downloading",
+			Progress:    25,
+			Cancellable: true,
+		})
+
+		// A fresh replica boots and hydrates from the store.
+		fresh := galleryop.NewGalleryService(&config.ApplicationConfig{}, nil)
+		fresh.SetGalleryStore(store)
+		Expect(fresh.Hydrate()).To(Succeed())
+
+		st := fresh.GetStatus("op-inflight")
+		Expect(st).ToNot(BeNil(), "the in-flight op must hydrate after a restart")
+		Expect(st.Cancellable).To(BeTrue(),
+			"a still-active install must rehydrate as cancellable so the admin can dismiss it")
+	})
+})
--- a/core/services/galleryop/service.go
+++ b/core/services/galleryop/service.go
@@ -167,7 +167,7 @@ func (g *GalleryService) UpdateStatus(s string, op *OpStatus) {
 				xlog.Warn("Failed to persist gallery operation status", "op_id", s, "error", err)
 			}
 		} else {
-			if err := store.UpdateProgress(s, op.Progress, op.Message, op.DownloadedFileSize); err != nil {
+			if err := store.UpdateProgress(s, op.Progress, op.Message, op.DownloadedFileSize, op.Cancellable); err != nil {
 				xlog.Warn("Failed to persist gallery operation progress", "op_id", s, "error", err)
 			}
 		}
@@ -467,6 +467,7 @@ func (g *GalleryService) Start(c context.Context, cl *config.ModelConfigLoader,
 						GalleryElementName: op.GalleryElementName,
 						OpType:             "backend_install",
 						Status:             "pending",
+						Cancellable:        true,
 					})
 				}
 				err := g.backendHandler(&op, systemState)
@@ -499,6 +500,8 @@ func (g *GalleryService) Start(c context.Context, cl *config.ModelConfigLoader,
 						GalleryElementName: op.GalleryElementName,
 						OpType:             opType,
 						Status:             "pending",
+						// A delete is not cancellable; an install is.
+						Cancellable: !op.Delete,
 					})
 				}
 				err := g.modelHandler(&op, cl, systemState)
--- a/core/services/nodes/inflight.go
+++ b/core/services/nodes/inflight.go
@@ -19,25 +19,40 @@ import (
 // Per-replica: a single tracker instance is bound to (nodeID, modelName, replicaIndex).
 // The router constructs one tracker per Route() result, so each in-flight tick lands
 // on the correct row even when multiple replicas of the same model live on the same node.
+//
+// Embedding only grpc.ControlBackend (not the whole grpc.Backend) is what makes
+// the in-flight accounting safe by construction: the control-plane methods pass
+// through untracked, while every grpc.InferenceBackend method must be declared
+// explicitly below to satisfy grpc.Backend. Adding an inference method to the
+// interface therefore breaks this file's build (see the var assertion below)
+// until it is wrapped with track() - so a new inference path can't be added
+// without an in-flight accounting decision.
 type InFlightTrackingClient struct {
-	grpc.Backend // embed for passthrough of untracked methods
-	registry     InFlightTracker
-	nodeID       string
-	modelName    string
-	replicaIndex int
+	grpc.ControlBackend                       // passthrough for control-plane / streaming-constructor methods
+	inner               grpc.InferenceBackend // tracked inference methods delegate here
+	registry            InFlightTracker
+	nodeID              string
+	modelName           string
+	replicaIndex        int

 	firstOnce       sync.Once // guards onFirstComplete
 	onFirstComplete func()    // called once after the first tracked inference call completes
 }

+// Compile-time contract: *InFlightTrackingClient must implement the FULL backend
+// surface. Because it embeds only ControlBackend, this fails to compile if any
+// InferenceBackend method is left unwrapped.
+var _ grpc.Backend = (*InFlightTrackingClient)(nil)
+
 // NewInFlightTrackingClient wraps a gRPC backend client with in-flight tracking.
 func NewInFlightTrackingClient(inner grpc.Backend, registry InFlightTracker, nodeID, modelName string, replicaIndex int) *InFlightTrackingClient {
 	return &InFlightTrackingClient{
-		Backend:      inner,
-		registry:     registry,
-		nodeID:       nodeID,
-		modelName:    modelName,
-		replicaIndex: replicaIndex,
+		ControlBackend: inner,
+		inner:          inner,
+		registry:       registry,
+		nodeID:         nodeID,
+		modelName:      modelName,
+		replicaIndex:   replicaIndex,
 	}
 }

@@ -91,154 +106,162 @@ func (c *InFlightTrackingClient) reconcile(err error) error {

 func (c *InFlightTrackingClient) Predict(ctx context.Context, in *pb.PredictOptions, opts ...ggrpc.CallOption) (*pb.Reply, error) {
 	defer c.track(ctx)()
-	reply, err := c.Backend.Predict(ctx, in, opts...)
+	reply, err := c.inner.Predict(ctx, in, opts...)
 	return reply, c.reconcile(err)
 }

 func (c *InFlightTrackingClient) PredictStream(ctx context.Context, in *pb.PredictOptions, f func(reply *pb.Reply), opts ...ggrpc.CallOption) error {
 	defer c.track(ctx)()
-	return c.reconcile(c.Backend.PredictStream(ctx, in, f, opts...))
+	return c.reconcile(c.inner.PredictStream(ctx, in, f, opts...))
 }

 func (c *InFlightTrackingClient) Embeddings(ctx context.Context, in *pb.PredictOptions, opts ...ggrpc.CallOption) (*pb.EmbeddingResult, error) {
 	defer c.track(ctx)()
-	res, err := c.Backend.Embeddings(ctx, in, opts...)
+	res, err := c.inner.Embeddings(ctx, in, opts...)
 	return res, c.reconcile(err)
 }

 func (c *InFlightTrackingClient) GenerateImage(ctx context.Context, in *pb.GenerateImageRequest, opts ...ggrpc.CallOption) (*pb.Result, error) {
 	defer c.track(ctx)()
-	res, err := c.Backend.GenerateImage(ctx, in, opts...)
+	res, err := c.inner.GenerateImage(ctx, in, opts...)
 	return res, c.reconcile(err)
 }

 func (c *InFlightTrackingClient) GenerateVideo(ctx context.Context, in *pb.GenerateVideoRequest, opts ...ggrpc.CallOption) (*pb.Result, error) {
 	defer c.track(ctx)()
-	res, err := c.Backend.GenerateVideo(ctx, in, opts...)
+	res, err := c.inner.GenerateVideo(ctx, in, opts...)
 	return res, c.reconcile(err)
 }

 func (c *InFlightTrackingClient) TTS(ctx context.Context, in *pb.TTSRequest, opts ...ggrpc.CallOption) (*pb.Result, error) {
 	defer c.track(ctx)()
-	res, err := c.Backend.TTS(ctx, in, opts...)
+	res, err := c.inner.TTS(ctx, in, opts...)
 	return res, c.reconcile(err)
 }

 func (c *InFlightTrackingClient) TTSStream(ctx context.Context, in *pb.TTSRequest, f func(reply *pb.Reply), opts ...ggrpc.CallOption) error {
 	defer c.track(ctx)()
-	return c.reconcile(c.Backend.TTSStream(ctx, in, f, opts...))
+	return c.reconcile(c.inner.TTSStream(ctx, in, f, opts...))
 }

 func (c *InFlightTrackingClient) SoundGeneration(ctx context.Context, in *pb.SoundGenerationRequest, opts ...ggrpc.CallOption) (*pb.Result, error) {
 	defer c.track(ctx)()
-	res, err := c.Backend.SoundGeneration(ctx, in, opts...)
+	res, err := c.inner.SoundGeneration(ctx, in, opts...)
 	return res, c.reconcile(err)
 }

 func (c *InFlightTrackingClient) AudioTranscription(ctx context.Context, in *pb.TranscriptRequest, opts ...ggrpc.CallOption) (*pb.TranscriptResult, error) {
 	defer c.track(ctx)()
-	res, err := c.Backend.AudioTranscription(ctx, in, opts...)
+	res, err := c.inner.AudioTranscription(ctx, in, opts...)
 	return res, c.reconcile(err)
 }

 func (c *InFlightTrackingClient) AudioTranscriptionStream(ctx context.Context, in *pb.TranscriptRequest, f func(chunk *pb.TranscriptStreamResponse), opts ...ggrpc.CallOption) error {
 	defer c.track(ctx)()
-	return c.reconcile(c.Backend.AudioTranscriptionStream(ctx, in, f, opts...))
+	return c.reconcile(c.inner.AudioTranscriptionStream(ctx, in, f, opts...))
 }

 func (c *InFlightTrackingClient) Detect(ctx context.Context, in *pb.DetectOptions, opts ...ggrpc.CallOption) (*pb.DetectResponse, error) {
 	defer c.track(ctx)()
-	res, err := c.Backend.Detect(ctx, in, opts...)
+	res, err := c.inner.Detect(ctx, in, opts...)
 	return res, c.reconcile(err)
 }

 func (c *InFlightTrackingClient) Depth(ctx context.Context, in *pb.DepthRequest, opts ...ggrpc.CallOption) (*pb.DepthResponse, error) {
 	defer c.track(ctx)()
-	res, err := c.Backend.Depth(ctx, in, opts...)
+	res, err := c.inner.Depth(ctx, in, opts...)
 	return res, c.reconcile(err)
 }

 func (c *InFlightTrackingClient) Rerank(ctx context.Context, in *pb.RerankRequest, opts ...ggrpc.CallOption) (*pb.RerankResult, error) {
 	defer c.track(ctx)()
-	res, err := c.Backend.Rerank(ctx, in, opts...)
+	res, err := c.inner.Rerank(ctx, in, opts...)
 	return res, c.reconcile(err)
 }

 func (c *InFlightTrackingClient) VAD(ctx context.Context, in *pb.VADRequest, opts ...ggrpc.CallOption) (*pb.VADResponse, error) {
 	defer c.track(ctx)()
-	res, err := c.Backend.VAD(ctx, in, opts...)
+	res, err := c.inner.VAD(ctx, in, opts...)
 	return res, c.reconcile(err)
 }

 func (c *InFlightTrackingClient) Diarize(ctx context.Context, in *pb.DiarizeRequest, opts ...ggrpc.CallOption) (*pb.DiarizeResponse, error) {
 	defer c.track(ctx)()
-	res, err := c.Backend.Diarize(ctx, in, opts...)
+	res, err := c.inner.Diarize(ctx, in, opts...)
 	return res, c.reconcile(err)
 }

 func (c *InFlightTrackingClient) FaceVerify(ctx context.Context, in *pb.FaceVerifyRequest, opts ...ggrpc.CallOption) (*pb.FaceVerifyResponse, error) {
 	defer c.track(ctx)()
-	res, err := c.Backend.FaceVerify(ctx, in, opts...)
+	res, err := c.inner.FaceVerify(ctx, in, opts...)
 	return res, c.reconcile(err)
 }

 func (c *InFlightTrackingClient) FaceAnalyze(ctx context.Context, in *pb.FaceAnalyzeRequest, opts ...ggrpc.CallOption) (*pb.FaceAnalyzeResponse, error) {
 	defer c.track(ctx)()
-	res, err := c.Backend.FaceAnalyze(ctx, in, opts...)
+	res, err := c.inner.FaceAnalyze(ctx, in, opts...)
 	return res, c.reconcile(err)
 }

 func (c *InFlightTrackingClient) VoiceVerify(ctx context.Context, in *pb.VoiceVerifyRequest, opts ...ggrpc.CallOption) (*pb.VoiceVerifyResponse, error) {
 	defer c.track(ctx)()
-	res, err := c.Backend.VoiceVerify(ctx, in, opts...)
+	res, err := c.inner.VoiceVerify(ctx, in, opts...)
 	return res, c.reconcile(err)
 }

 func (c *InFlightTrackingClient) VoiceAnalyze(ctx context.Context, in *pb.VoiceAnalyzeRequest, opts ...ggrpc.CallOption) (*pb.VoiceAnalyzeResponse, error) {
 	defer c.track(ctx)()
-	res, err := c.Backend.VoiceAnalyze(ctx, in, opts...)
+	res, err := c.inner.VoiceAnalyze(ctx, in, opts...)
 	return res, c.reconcile(err)
 }

 func (c *InFlightTrackingClient) VoiceEmbed(ctx context.Context, in *pb.VoiceEmbedRequest, opts ...ggrpc.CallOption) (*pb.VoiceEmbedResponse, error) {
 	defer c.track(ctx)()
-	res, err := c.Backend.VoiceEmbed(ctx, in, opts...)
+	res, err := c.inner.VoiceEmbed(ctx, in, opts...)
 	return res, c.reconcile(err)
 }

 func (c *InFlightTrackingClient) TokenClassify(ctx context.Context, in *pb.TokenClassifyRequest, opts ...ggrpc.CallOption) (*pb.TokenClassifyResponse, error) {
 	defer c.track(ctx)()
-	res, err := c.Backend.TokenClassify(ctx, in, opts...)
+	res, err := c.inner.TokenClassify(ctx, in, opts...)
 	return res, c.reconcile(err)
 }

 func (c *InFlightTrackingClient) Score(ctx context.Context, in *pb.ScoreRequest, opts ...ggrpc.CallOption) (*pb.ScoreResponse, error) {
 	defer c.track(ctx)()
-	res, err := c.Backend.Score(ctx, in, opts...)
+	res, err := c.inner.Score(ctx, in, opts...)
+	return res, c.reconcile(err)
+}
+
+func (c *InFlightTrackingClient) SoundDetection(ctx context.Context, in *pb.SoundDetectionRequest, opts ...ggrpc.CallOption) (*pb.SoundDetectionResponse, error) {
+	defer c.track(ctx)()
+	res, err := c.inner.SoundDetection(ctx, in, opts...)
 	return res, c.reconcile(err)
 }

 func (c *InFlightTrackingClient) AudioEncode(ctx context.Context, in *pb.AudioEncodeRequest, opts ...ggrpc.CallOption) (*pb.AudioEncodeResult, error) {
 	defer c.track(ctx)()
-	res, err := c.Backend.AudioEncode(ctx, in, opts...)
+	res, err := c.inner.AudioEncode(ctx, in, opts...)
 	return res, c.reconcile(err)
 }

 func (c *InFlightTrackingClient) AudioDecode(ctx context.Context, in *pb.AudioDecodeRequest, opts ...ggrpc.CallOption) (*pb.AudioDecodeResult, error) {
 	defer c.track(ctx)()
-	res, err := c.Backend.AudioDecode(ctx, in, opts...)
+	res, err := c.inner.AudioDecode(ctx, in, opts...)
 	return res, c.reconcile(err)
 }

 func (c *InFlightTrackingClient) AudioTransform(ctx context.Context, in *pb.AudioTransformRequest, opts ...ggrpc.CallOption) (*pb.AudioTransformResult, error) {
 	defer c.track(ctx)()
-	res, err := c.Backend.AudioTransform(ctx, in, opts...)
+	res, err := c.inner.AudioTransform(ctx, in, opts...)
 	return res, c.reconcile(err)
 }

-// AudioTransformStream, AudioToAudioStream and Forward are deliberately left as
-// embedded passthrough: they return a stream client and the inference spans the
-// stream's lifetime, not the constructor call. Wrapping the constructor with
-// track() would increment and immediately decrement (and fire onFirstComplete)
-// before any audio flows. Tracking those correctly needs the done() func tied to
-// stream close, which the current Backend interface doesn't surface here.
+// AudioTransformStream, AudioToAudioStream and Forward live in grpc.ControlBackend
+// and are passed through via the embedded field, NOT tracked: they return a stream
+// client and the inference spans the stream's lifetime, not the constructor call.
+// Wrapping the constructor with track() would increment and immediately decrement
+// (and fire onFirstComplete) before any audio flows. Tracking those correctly needs
+// the done() func tied to stream close, which the Backend interface doesn't surface
+// here. If they ever need tracking, move them to grpc.InferenceBackend - the build
+// will then force an explicit wrapper here.
--- a/core/services/nodes/inflight_test.go
+++ b/core/services/nodes/inflight_test.go
@@ -408,6 +408,13 @@ var _ = Describe("InFlightTrackingClient", func() {
 				return err
 			})
 		})
+
+		It("SoundDetection", func() {
+			assertTracked(func() error {
+				_, err := client.SoundDetection(context.Background(), &pb.SoundDetectionRequest{})
+				return err
+			})
+		})
 	})

 	Describe("stale model reload (self-heal)", func() {
--- a/core/services/nodes/router.go
+++ b/core/services/nodes/router.go
@@ -156,7 +156,10 @@ func applyNodeHardwareDefaults(opts *pb.ModelOptions, node *BackendNode) {
 		VRAM:              node.TotalVRAM,
 	}
 	if config.IsManagedPhysicalBatch(int(opts.NBatch)) {
-		opts.NBatch = int32(config.PhysicalBatch(gpu))
+		// Gate the raised batch on the selected node's per-device VRAM at this
+		// model's context, so a large context can't overflow the node's compute
+		// buffer (issue #10485). node.TotalVRAM is the node's reported ceiling.
+		opts.NBatch = int32(config.PhysicalBatchForContext(gpu, int(opts.ContextSize)))
 	}
 	// Default concurrent serving for the selected node (the frontend that built
 	// the options may have no GPU). Only adds when no parallel option is set.
--- a/core/services/nodes/router_hardware_internal_test.go
+++ b/core/services/nodes/router_hardware_internal_test.go
@@ -8,12 +8,19 @@ import (
 )

 var _ = Describe("applyNodeHardwareDefaults", func() {
-	It("raises a managed default batch on a Blackwell node", func() {
-		opts := &pb.ModelOptions{NBatch: config.DefaultPhysicalBatch}
-		applyNodeHardwareDefaults(opts, &BackendNode{GPUComputeCapability: "12.1"})
+	It("raises a managed default batch on a Blackwell node with headroom", func() {
+		opts := &pb.ModelOptions{NBatch: config.DefaultPhysicalBatch, ContextSize: 8192}
+		applyNodeHardwareDefaults(opts, &BackendNode{GPUComputeCapability: "12.1", TotalVRAM: 119 << 30})
 		Expect(opts.NBatch).To(BeEquivalentTo(config.BlackwellPhysicalBatch))
 	})

+	It("keeps the default batch when a large context would overflow the node", func() {
+		// Regression guard for issue #10485 on the distributed path.
+		opts := &pb.ModelOptions{NBatch: config.DefaultPhysicalBatch, ContextSize: 204800}
+		applyNodeHardwareDefaults(opts, &BackendNode{GPUComputeCapability: "12.0", TotalVRAM: 16 << 30})
+		Expect(opts.NBatch).To(BeEquivalentTo(config.DefaultPhysicalBatch))
+	})
+
 	It("resets a Blackwell guess on a non-Blackwell node", func() {
 		// frontend (Blackwell) guessed high, but the selected node is not Blackwell
 		opts := &pb.ModelOptions{NBatch: config.BlackwellPhysicalBatch}
--- a/docs/content/features/middleware.md
+++ b/docs/content/features/middleware.md
@@ -185,6 +185,13 @@ It is persisted through `POST /api/settings` and read live, so a change takes
 effect on the next request without a restart. A default that names a model no
 longer loaded still appears (marked *not loaded*) so it can be toggled off.

+The default set can also be supplied out-of-band with the
+`LOCALAI_PII_DEFAULT_DETECTORS` environment variable (comma-separated model
+names, e.g. `privacy-filter-nemotron,secret-filter`). When set it takes
+precedence over the value persisted via the UI (env > file), which is the
+right behaviour for immutable container deployments that pin filtering policy
+at boot rather than via the admin UI.
+
 This is what makes `cloud-proxy` / MITM redaction work out of the box: those
 backends default to PII-enabled but ship no detector list, so without a
 default detector the filter runs with nothing to scan. Set one here and
--- a/docs/data/version.json
+++ b/docs/data/version.json
@@ -1,3 +1,3 @@
 {
-  "version": "v4.4.3"
+  "version": "v4.5.0"
 }
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -1,4 +1,208 @@
 ---
+- name: "lfm2.5-1.2b-instruct"
+  url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
+  urls:
+    - https://huggingface.co/LiquidAI/LFM2.5-1.2B-Instruct-GGUF
+  description: "Try LFM • Docs • LEAP • Discord\n\n# LFM2.5-1.2B-Instruct\n\nLFM2.5 is a new family of hybrid models designed for **on-device deployment**. It builds on the LFM2 architecture with extended pre-training and reinforcement learning.\n\n  - **Best-in-class performance**: A 1.2B model rivaling much larger models, bringing high-quality AI to your pocket.\n  - **Fast edge inference**: 239 tok/s decode on AMD CPU, 82 tok/s on mobile NPU. Runs under 1GB of memory with day-one support for llama.cpp, MLX, and vLLM.\n  - **Scaled training**: Extended pre-training from 10T to 28T tokens and large-scale multi-stage reinforcement learning.\n\nFind more information about LFM2.5 in our blog post.\n\n## \U0001F5D2️ Model Details\n\nLFM2.5-1.2B-Instruct is a general-purpose text-only model with the following features:\n\n...\n"
+  license: "other"
+  tags:
+    - llm
+    - gguf
+  icon: https://cdn-uploads.huggingface.co/production/uploads/61b8e2ba285851687028d395/dxnYF2fuLpulismtFSGFi.png
+  overrides:
+    backend: llama-cpp
+    function:
+      automatic_tool_parsing_fallback: true
+      grammar:
+        disable: true
+    known_usecases:
+      - chat
+    options:
+      - use_jinja:true
+    parameters:
+      min_p: 0.15
+      model: llama-cpp/models/LFM2.5-1.2B-Instruct-GGUF/LFM2.5-1.2B-Instruct-Q4_K_M.gguf
+      repeat_penalty: 1.05
+      temperature: 0.1
+      top_k: 50
+      top_p: 0.1
+    template:
+      use_tokenizer_template: true
+  files:
+    - filename: llama-cpp/models/LFM2.5-1.2B-Instruct-GGUF/LFM2.5-1.2B-Instruct-Q4_K_M.gguf
+      sha256: b1b3de114215d9507409a662a501a631095a479a419584e8a2ded6304b19b4f5
+      uri: https://huggingface.co/LiquidAI/LFM2.5-1.2B-Instruct-GGUF/resolve/main/LFM2.5-1.2B-Instruct-Q4_K_M.gguf
+- name: "qwopus3.6-27b-coder-compat-mtp"
+  url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
+  urls:
+    - https://huggingface.co/Jackrong/Qwopus3.6-27B-Coder-Compat-MTP-GGUF
+  description: "\U0001FA90 Qwopus-3.6-27B-Coder\nCoder SFT Release\n\nAgentic Coding & Tool-Use Reasoning Model Fine-Tuned on Qwopus3.6-27B-v2\n\n\U0001F9EC Trace Inversion & Negentropy\n\U0001F9E0 27B Dense Model\n⚡ Agentic Coding\n\U0001F6E0️ Tool Calling & Agent\n\U0001F3C6 SWE-bench Verified: 67.0% (off-thinking)\n\n\U0001F4A1 What is Qwopus-3.6-27B-Coder?\n\U0001FA90 Qwopus-3.6-27B-Coder is a reasoning-enhanced agentic coding model built on top of Qwopus3.6-27B-v2. It inherits the powerful reasoning foundation of the v2 base — which achieved 87.43% MMLU-Pro and 75.25% SWE-bench Verified — and further specializes it for agentic code generation, structured tool calling, debugging, and instruction-following in developer workflows. The model is designed to excel at repository-level coding tasks, multi-turn tool orchestration, and complex logical reasoning under realistic agent environments.\n\n\U0001F9E9 Agentic Coding\nOptimized for repository-level coding, debugging, patch generation, and structured multi-step development workflows.\n\n\U0001F6E0️ Tool Calling\nLearns from real agent trajectories with tool definitions, tool calls, and environment feedback for robust multi-turn execution.\n\n...\n"
+  license: "apache-2.0"
+  tags:
+    - llm
+    - gguf
+    - vision
+    - multimodal
+    - reasoning
+  icon: https://cdn-uploads.huggingface.co/production/uploads/66309bd090589b7c65950665/sGQKmrMc6L6guMoaB5_Y2.png
+  overrides:
+    backend: llama-cpp
+    function:
+      automatic_tool_parsing_fallback: true
+      grammar:
+        disable: true
+    known_usecases:
+      - chat
+    mmproj: llama-cpp/mmproj/Qwopus3.6-27B-Coder-Compat-MTP-GGUF/mmproj-F32.gguf
+    options:
+      - use_jinja:true
+      - spec_type:draft-mtp
+      - spec_n_max:6
+      - spec_p_min:0.75
+    parameters:
+      model: llama-cpp/models/Qwopus3.6-27B-Coder-Compat-MTP-GGUF/Qwopus3.6-27B-Coder-Compat-MTP-Q4_K_M.gguf
+    template:
+      use_tokenizer_template: true
+  files:
+    - filename: llama-cpp/models/Qwopus3.6-27B-Coder-Compat-MTP-GGUF/Qwopus3.6-27B-Coder-Compat-MTP-Q4_K_M.gguf
+      sha256: f893632170124da60e159b7bcc9d91e1cda3014b2c6b8ad9c6cde38a1fcd2f6f
+      uri: https://huggingface.co/Jackrong/Qwopus3.6-27B-Coder-Compat-MTP-GGUF/resolve/main/Qwopus3.6-27B-Coder-Compat-MTP-Q4_K_M.gguf
+    - filename: llama-cpp/mmproj/Qwopus3.6-27B-Coder-Compat-MTP-GGUF/mmproj-F32.gguf
+      sha256: 32f7ea0600c07272547da401d460f8abbd980f3a57b69d6df87be0e2505e0b9c
+      uri: https://huggingface.co/Jackrong/Qwopus3.6-27B-Coder-Compat-MTP-GGUF/resolve/main/mmproj-F32.gguf
+- name: "kimi-k2.7-code"
+  url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
+  urls:
+    - https://huggingface.co/unsloth/Kimi-K2.7-Code-GGUF
+  description: |
+    ## 1. Model Introduction
+
+    Kimi K2.7 Code is a coding-focused agentic model built upon Kimi K2.6. With substantial improvements on real-world long-horizon coding tasks, it strengthens end-to-end task completion across complex software engineering workflows while improving token efficiency, reducing thinking-token usage by approximately 30% compared with Kimi K2.6.
+
+    ## 2. Model Summary
+
+    ## 3. Evaluation Results
+
+    Benchmark
+    Kimi K2.6
+    Kimi K2.7 Code
+    GPT-5.5
+    Claude Opus 4.8
+
+    Coding
+
+    Kimi Code Bench v2
+    50.9
+    62.0
+    69.0
+    67.4
+
+    Program Bench
+    48.3
+    53.6
+    69.1
+    63.8
+
+    MLS Bench Lite
+    26.7
+    35.1
+    35.5
+    42.8
+
+    Agentic
+
+    Kimi Claw 24/7 Bench
+    42.9
+    46.9
+    52.8
+    50.4
+
+    MCP Atlas
+    69.4
+    76.0
+    79.4
+    81.3
+
+    MCP Mark Verified
+    72.8
+    81.1
+    92.9
+    76.4
+
+    Footnotes
+
+    ...
+  license: "other"
+  tags:
+    - llm
+    - gguf
+  icon: https://huggingface.co/moonshotai/Kimi-K2.7-Code/resolve/main/figures/kimi-logo.png
+  overrides:
+    backend: llama-cpp
+    function:
+      automatic_tool_parsing_fallback: true
+      grammar:
+        disable: true
+    known_usecases:
+      - chat
+    mmproj: llama-cpp/mmproj/Kimi-K2.7-Code-GGUF/mmproj-F32.gguf
+    options:
+      - use_jinja:true
+    parameters:
+      min_p: 0.01
+      model: llama-cpp/models/Kimi-K2.7-Code-GGUF/Kimi-K2.7-Code-UD-Q8_K_XL-00001-of-00014.gguf
+      repeat_penalty: 1
+      temperature: 0.6
+      top_k: -1
+      top_p: 0.95
+    template:
+      use_tokenizer_template: true
+  files:
+    - filename: llama-cpp/models/Kimi-K2.7-Code-GGUF/Kimi-K2.7-Code-UD-Q8_K_XL-00001-of-00014.gguf
+      sha256: 65f0aca336f876902323a90e2aff32cac76d071b2cdd818c6a8d78be8fc2c680
+      uri: https://huggingface.co/unsloth/Kimi-K2.7-Code-GGUF/resolve/main/UD-Q8_K_XL/Kimi-K2.7-Code-UD-Q8_K_XL-00001-of-00014.gguf
+    - filename: llama-cpp/models/Kimi-K2.7-Code-GGUF/Kimi-K2.7-Code-UD-Q8_K_XL-00002-of-00014.gguf
+      sha256: 40f4416c130827a11502778891f4ef95b2144db90f51d63aa3548d0952a39683
+      uri: https://huggingface.co/unsloth/Kimi-K2.7-Code-GGUF/resolve/main/UD-Q8_K_XL/Kimi-K2.7-Code-UD-Q8_K_XL-00002-of-00014.gguf
+    - filename: llama-cpp/models/Kimi-K2.7-Code-GGUF/Kimi-K2.7-Code-UD-Q8_K_XL-00003-of-00014.gguf
+      sha256: ba2ba0b5168784ace7c752ecadfc3631279b2bb023824cb0fe9e2dab3dd28f22
+      uri: https://huggingface.co/unsloth/Kimi-K2.7-Code-GGUF/resolve/main/UD-Q8_K_XL/Kimi-K2.7-Code-UD-Q8_K_XL-00003-of-00014.gguf
+    - filename: llama-cpp/models/Kimi-K2.7-Code-GGUF/Kimi-K2.7-Code-UD-Q8_K_XL-00004-of-00014.gguf
+      sha256: 10298a6c98b13ef49be286fefbea8663e16473fb69bbeabe153bc80c60ae116e
+      uri: https://huggingface.co/unsloth/Kimi-K2.7-Code-GGUF/resolve/main/UD-Q8_K_XL/Kimi-K2.7-Code-UD-Q8_K_XL-00004-of-00014.gguf
+    - filename: llama-cpp/models/Kimi-K2.7-Code-GGUF/Kimi-K2.7-Code-UD-Q8_K_XL-00005-of-00014.gguf
+      sha256: 8e9e4c8e35d34fc4fef6bfb65a715ad7defbd196970d833c1df6924d701c88b3
+      uri: https://huggingface.co/unsloth/Kimi-K2.7-Code-GGUF/resolve/main/UD-Q8_K_XL/Kimi-K2.7-Code-UD-Q8_K_XL-00005-of-00014.gguf
+    - filename: llama-cpp/models/Kimi-K2.7-Code-GGUF/Kimi-K2.7-Code-UD-Q8_K_XL-00006-of-00014.gguf
+      sha256: ccff6e7f299742f82cf6f51a871e3eb3167511efaee967477cc8387f54d16442
+      uri: https://huggingface.co/unsloth/Kimi-K2.7-Code-GGUF/resolve/main/UD-Q8_K_XL/Kimi-K2.7-Code-UD-Q8_K_XL-00006-of-00014.gguf
+    - filename: llama-cpp/models/Kimi-K2.7-Code-GGUF/Kimi-K2.7-Code-UD-Q8_K_XL-00007-of-00014.gguf
+      sha256: 1a3b639633a2d22f71156a9f643ded2329cdd969cc21177b644b5741bac1af8e
+      uri: https://huggingface.co/unsloth/Kimi-K2.7-Code-GGUF/resolve/main/UD-Q8_K_XL/Kimi-K2.7-Code-UD-Q8_K_XL-00007-of-00014.gguf
+    - filename: llama-cpp/models/Kimi-K2.7-Code-GGUF/Kimi-K2.7-Code-UD-Q8_K_XL-00008-of-00014.gguf
+      sha256: bde28f682a1eab973538b2102007d952f37a13c1f7d55e2ed99177445ddc4282
+      uri: https://huggingface.co/unsloth/Kimi-K2.7-Code-GGUF/resolve/main/UD-Q8_K_XL/Kimi-K2.7-Code-UD-Q8_K_XL-00008-of-00014.gguf
+    - filename: llama-cpp/models/Kimi-K2.7-Code-GGUF/Kimi-K2.7-Code-UD-Q8_K_XL-00009-of-00014.gguf
+      sha256: b6a23a95b61e100f7593fa75e2363966323fa767b7e4fdf45d963b59e8fdc69f
+      uri: https://huggingface.co/unsloth/Kimi-K2.7-Code-GGUF/resolve/main/UD-Q8_K_XL/Kimi-K2.7-Code-UD-Q8_K_XL-00009-of-00014.gguf
+    - filename: llama-cpp/models/Kimi-K2.7-Code-GGUF/Kimi-K2.7-Code-UD-Q8_K_XL-00010-of-00014.gguf
+      sha256: fb10231c2e6d76921d40f22690f4aa08a8090c708edeaf7e581abafc24d3b25c
+      uri: https://huggingface.co/unsloth/Kimi-K2.7-Code-GGUF/resolve/main/UD-Q8_K_XL/Kimi-K2.7-Code-UD-Q8_K_XL-00010-of-00014.gguf
+    - filename: llama-cpp/models/Kimi-K2.7-Code-GGUF/Kimi-K2.7-Code-UD-Q8_K_XL-00011-of-00014.gguf
+      sha256: d2290be7ed1a22ac1f9f8a4813389689e075ce2ab8abc3aaaa1157a3cb1462d8
+      uri: https://huggingface.co/unsloth/Kimi-K2.7-Code-GGUF/resolve/main/UD-Q8_K_XL/Kimi-K2.7-Code-UD-Q8_K_XL-00011-of-00014.gguf
+    - filename: llama-cpp/models/Kimi-K2.7-Code-GGUF/Kimi-K2.7-Code-UD-Q8_K_XL-00012-of-00014.gguf
+      sha256: ce0d028314aa3fc783082dbca097e1055d69686a17ab8306574e2949568f26a5
+      uri: https://huggingface.co/unsloth/Kimi-K2.7-Code-GGUF/resolve/main/UD-Q8_K_XL/Kimi-K2.7-Code-UD-Q8_K_XL-00012-of-00014.gguf
+    - filename: llama-cpp/models/Kimi-K2.7-Code-GGUF/Kimi-K2.7-Code-UD-Q8_K_XL-00013-of-00014.gguf
+      sha256: 217864ce63a1d130ab39dcb0996b6097e1aa78eb896e38efaefdbbac3a00b7ec
+      uri: https://huggingface.co/unsloth/Kimi-K2.7-Code-GGUF/resolve/main/UD-Q8_K_XL/Kimi-K2.7-Code-UD-Q8_K_XL-00013-of-00014.gguf
+    - filename: llama-cpp/models/Kimi-K2.7-Code-GGUF/Kimi-K2.7-Code-UD-Q8_K_XL-00014-of-00014.gguf
+      sha256: eb7582ad7066c5eaa01bde95acb00b4ad9cd7b07cd50a6cf5c9ee427258bc9dd
+      uri: https://huggingface.co/unsloth/Kimi-K2.7-Code-GGUF/resolve/main/UD-Q8_K_XL/Kimi-K2.7-Code-UD-Q8_K_XL-00014-of-00014.gguf
+    - filename: llama-cpp/mmproj/Kimi-K2.7-Code-GGUF/mmproj-F32.gguf
+      sha256: b2cc50c8c13fe70fc4968a83332f31e9007ea09ebb9ae91d46a4e4cd2a3053cd
+      uri: https://huggingface.co/unsloth/Kimi-K2.7-Code-GGUF/resolve/main/mmproj-F32.gguf
 - name: "qwythos-9b-claude-mythos-5-1m"
  url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
  urls:
@@ -49,33 +253,7 @@
  url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
  urls:
    - https://huggingface.co/unsloth/GLM-5.2-GGUF
-  description: |
-    # GLM-5.2
-
-    👋 Join our WeChat or Discord community.
-
-    📖 Check out the GLM-5.2 blog and GLM-5 Technical report.
-
-    📍 Use GLM-5.2 API services on Z.ai API Platform.
-
-    🔜 Try GLM-5.2 here.
-
-    [Paper]
-    [GitHub]
-
-    ## Introduction
-
-    We're introducing GLM-5.2, our latest flagship model for long-horizon tasks. It marks a substantial leap in long-horizon task capability over its predecessor GLM-5.1 and, for the first time, delivers that capability on a **solid 1M-token context**. GLM-5.2's new capabilities include:
-      - **Solid 1M Context:** A solid 1M-token context that stably sustains long-horizon work
-      - **Advanced Coding with Flexible Effort**: Stronger coding capabilities with multiple thinking effort levels to balance performance and latency
-      - **Improved Architecture**: We propose IndexShare, which reuses the same indexer across every four sparse attention layers, reducing per-token FLOPs by 2.9× at a 1M context length. We also improve GLM-5.2’s MTP layer for speculative decoding, increasing the acceptance length by up to 20%
-      - **Pure Open**: An MIT open-source license — no regional limits, technical access without borders
-
-    ## Benchmark
-
-    ## Serve GLM-5.2 Locally
-
-    ...
+  description: "# GLM-5.2\n\n\U0001F44B Join our WeChat or Discord community.\n\n\U0001F4D6 Check out the GLM-5.2 blog and GLM-5 Technical report.\n\n\U0001F4CD Use GLM-5.2 API services on Z.ai API Platform.\n\n\U0001F51C Try GLM-5.2 here.\n\n[Paper]\n[GitHub]\n\n## Introduction\n\nWe're introducing GLM-5.2, our latest flagship model for long-horizon tasks. It marks a substantial leap in long-horizon task capability over its predecessor GLM-5.1 and, for the first time, delivers that capability on a **solid 1M-token context**. GLM-5.2's new capabilities include:\n  - **Solid 1M Context:** A solid 1M-token context that stably sustains long-horizon work\n  - **Advanced Coding with Flexible Effort**: Stronger coding capabilities with multiple thinking effort levels to balance performance and latency\n  - **Improved Architecture**: We propose IndexShare, which reuses the same indexer across every four sparse attention layers, reducing per-token FLOPs by 2.9× at a 1M context length. We also improve GLM-5.2’s MTP layer for speculative decoding, increasing the acceptance length by up to 20%\n  - **Pure Open**: An MIT open-source license — no regional limits, technical access without borders\n\n## Benchmark\n\n## Serve GLM-5.2 Locally\n\n...\n"
  license: "mit"
  tags:
    - llm
@@ -198,26 +376,7 @@
  url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
  urls:
    - https://huggingface.co/michaelw9999/Qwopus3.6-27B-v2-MTP-NVFP4-GGUF
-  description: |
-    🪐 Qwopus3.6-27B-v2-MTP
-    MTP Release
-
-    Multi-Token Prediction reasoning model fine-tuned from Qwen3.6-27B
-
-    🧬 Trace Inversion & Negentropy
-    🧠 27B Parameters
-    ⚡ Speculative Decoding
-    🛠️ Coding / DevOps / Math
-
-    💡 What is Qwopus3.6-27B-v2-MTP?
-    🪐 Qwopus3.6-27B-v2-MTP is a speed-oriented reasoning release built on top of Qwen3.6-27B. It keeps the Qwopus line's focus on reconstructed reasoning traces, coding discipline, DevOps procedures, and mathematical derivations, while adding Multi-Token Prediction for faster generation. The goal is simple: preserve the depth and structure of a 27B reasoning model while making real interactive use noticeably faster.
-
-    ⚡ MTP DecodingAuxiliary future-token prediction improves throughput on long reasoning, code, math, and strict-format prompts.
-    🧩 Structured ReasoningInherits the Qwopus training recipe built around reconstructed step-by-step reasoning trajectories.
-    🧪 GB10 TestedValidated on a 30-question local benchmark across Logic, Coding, DevOps, Math, and Edge tasks.
-    🚀 Practical SpeedDesigned for workflows where strong answers matter, but waiting several extra minutes per task does not.
-
-    ...
+  description: "\U0001FA90 Qwopus3.6-27B-v2-MTP\nMTP Release\n\nMulti-Token Prediction reasoning model fine-tuned from Qwen3.6-27B\n\n\U0001F9EC Trace Inversion & Negentropy\n\U0001F9E0 27B Parameters\n⚡ Speculative Decoding\n\U0001F6E0️ Coding / DevOps / Math\n\n\U0001F4A1 What is Qwopus3.6-27B-v2-MTP?\n\U0001FA90 Qwopus3.6-27B-v2-MTP is a speed-oriented reasoning release built on top of Qwen3.6-27B. It keeps the Qwopus line's focus on reconstructed reasoning traces, coding discipline, DevOps procedures, and mathematical derivations, while adding Multi-Token Prediction for faster generation. The goal is simple: preserve the depth and structure of a 27B reasoning model while making real interactive use noticeably faster.\n\n⚡ MTP DecodingAuxiliary future-token prediction improves throughput on long reasoning, code, math, and strict-format prompts.\n\U0001F9E9 Structured ReasoningInherits the Qwopus training recipe built around reconstructed step-by-step reasoning trajectories.\n\U0001F9EA GB10 TestedValidated on a 30-question local benchmark across Logic, Coding, DevOps, Math, and Edge tasks.\n\U0001F680 Practical SpeedDesigned for workflows where strong answers matter, but waiting several extra minutes per task does not.\n\n...\n"
  tags:
    - llm
    - gguf
@@ -243,28 +402,7 @@
  url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
  urls:
    - https://huggingface.co/michaelw9999/Qwopus3.6-27B-Coder-MTP-NVFP4-GGUF
-  description: |
-    🪐 Qwopus-3.6-27B-Coder
-    Coder SFT Release
-
-    Agentic Coding &amp; Tool-Use Reasoning Model Fine-Tuned on Qwopus3.6-27B-v2
-
-    🧬 Trace Inversion & Negentropy
-    🧠 27B Dense Model
-    ⚡ Agentic Coding
-    🛠️ Tool Calling & Agent
-    🏆 SWE-bench Verified: 67.0% (off-thinking)
-
-    💡 What is Qwopus-3.6-27B-Coder?
-    🪐 Qwopus-3.6-27B-Coder is a reasoning-enhanced agentic coding model built on top of Qwopus3.6-27B-v2. It inherits the powerful reasoning foundation of the v2 base — which achieved 87.43% MMLU-Pro (300ex) and 75.25% SWE-bench Verified — and further specializes it for agentic code generation, structured tool calling, debugging, and instruction-following in developer workflows. The model is designed to excel at repository-level coding tasks, multi-turn tool orchestration, and complex logical reasoning under realistic agent environments.
-
-    🧩 Agentic Coding
-    Optimized for repository-level coding, debugging, patch generation, and structured multi-step development workflows.
-
-    🛠️ Tool Calling
-    Learns from real agent trajectories with tool definitions, tool calls, and environment feedback for robust multi-turn execution.
-
-    ...
+  description: "\U0001FA90 Qwopus-3.6-27B-Coder\nCoder SFT Release\n\nAgentic Coding & Tool-Use Reasoning Model Fine-Tuned on Qwopus3.6-27B-v2\n\n\U0001F9EC Trace Inversion & Negentropy\n\U0001F9E0 27B Dense Model\n⚡ Agentic Coding\n\U0001F6E0️ Tool Calling & Agent\n\U0001F3C6 SWE-bench Verified: 67.0% (off-thinking)\n\n\U0001F4A1 What is Qwopus-3.6-27B-Coder?\n\U0001FA90 Qwopus-3.6-27B-Coder is a reasoning-enhanced agentic coding model built on top of Qwopus3.6-27B-v2. It inherits the powerful reasoning foundation of the v2 base — which achieved 87.43% MMLU-Pro (300ex) and 75.25% SWE-bench Verified — and further specializes it for agentic code generation, structured tool calling, debugging, and instruction-following in developer workflows. The model is designed to excel at repository-level coding tasks, multi-turn tool orchestration, and complex logical reasoning under realistic agent environments.\n\n\U0001F9E9 Agentic Coding\nOptimized for repository-level coding, debugging, patch generation, and structured multi-step development workflows.\n\n\U0001F6E0️ Tool Calling\nLearns from real agent trajectories with tool definitions, tool calls, and environment feedback for robust multi-turn execution.\n\n...\n"
  tags:
    - llm
    - gguf
@@ -687,8 +825,8 @@
      use_tokenizer_template: true
  files:
    - filename: llama-cpp/models/Qwopus3.6-27B-Coder-MTP-GGUF/Qwopus3.6-27B-Coder-MTP-Q4_K_M.gguf
-      sha256: b2898667ed7b2388f0ab7691393833ae777f247492bbe62fdb4b2bd3e3cf3f79
      uri: https://huggingface.co/Jackrong/Qwopus3.6-27B-Coder-MTP-GGUF/resolve/main/Qwopus3.6-27B-Coder-MTP-Q4_K_M.gguf
+      sha256: b2b9180093496da2e00439e3fa23227c591355901bfa579bc6897bbc01b755ef
    - filename: llama-cpp/mmproj/Qwopus3.6-27B-Coder-MTP-GGUF/mmproj-F32.gguf
      sha256: 32f7ea0600c07272547da401d460f8abbd980f3a57b69d6df87be0e2505e0b9c
      uri: https://huggingface.co/Jackrong/Qwopus3.6-27B-Coder-MTP-GGUF/resolve/main/mmproj-F32.gguf
@@ -1484,8 +1622,8 @@
      use_tokenizer_template: true
  files:
    - filename: llama-cpp/models/Qwopus3.6-27B-v2-MTP-GGUF/Qwopus3.6-27B-v2-MTP-Q4_K_M.gguf
-      sha256: 818d68223be4d8518dac0b3b5604dde633cbbcbae1f491d842a3e26711c6606d
      uri: https://huggingface.co/Jackrong/Qwopus3.6-27B-v2-MTP-GGUF/resolve/main/Qwopus3.6-27B-v2-MTP-Q4_K_M.gguf
+      sha256: 31cf5fc2406a0c7aaebcc26d440bf0df94e215d0589d5205bf319649c052b50a
 - name: "qwen3.6-40b-claude-4.6-opus-deckard-heretic-uncensored-thinking-neo-code-di-imatrix-max"
  url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
  urls:
--- a/pkg/grpc/backend.go
+++ b/pkg/grpc/backend.go
@@ -41,11 +41,34 @@ func buildClient(address string, parallel bool, wd WatchDog, enableWatchDog bool
 	}
 }

+// Backend is the full client surface of a model backend. It is deliberately
+// composed of two sub-interfaces so that wrappers can get a COMPILE-TIME
+// guarantee about which methods they must account for:
+//
+//   - InferenceBackend - methods that each perform one discrete inference call
+//     (the call begins on entry and ends on return). A wrapper that does
+//     per-call accounting - e.g. the distributed router's in-flight tracker,
+//     core/services/nodes.InFlightTrackingClient - embeds only ControlBackend
+//     and implements every InferenceBackend method explicitly. Adding a method
+//     to InferenceBackend therefore breaks that wrapper's build until it is
+//     implemented: inference can't be added without an accounting decision.
+//   - ControlBackend - everything that is NOT a discrete inference call:
+//     lifecycle/control-plane operations and the streaming constructors whose
+//     work spans the returned stream rather than the constructor call. These
+//     are safe to pass through untracked.
+//
+// Keep the two sets disjoint; every backend method belongs to exactly one.
 type Backend interface {
-	IsBusy() bool
-	HealthCheck(ctx context.Context) (bool, error)
+	InferenceBackend
+	ControlBackend
+}
+
+// InferenceBackend is the subset of Backend whose methods each map to a single
+// inference call. Wrappers that account for in-flight work must implement these
+// explicitly (see Backend). Do NOT add methods that return a stream client or
+// that are control-plane only - those belong in ControlBackend.
+type InferenceBackend interface {
 	Embeddings(ctx context.Context, in *pb.PredictOptions, opts ...grpc.CallOption) (*pb.EmbeddingResult, error)
-	LoadModel(ctx context.Context, in *pb.ModelOptions, opts ...grpc.CallOption) (*pb.Result, error)
 	PredictStream(ctx context.Context, in *pb.PredictOptions, f func(reply *pb.Reply), opts ...grpc.CallOption) error
 	Predict(ctx context.Context, in *pb.PredictOptions, opts ...grpc.CallOption) (*pb.Reply, error)
 	GenerateImage(ctx context.Context, in *pb.GenerateImageRequest, opts ...grpc.CallOption) (*pb.Result, error)
@@ -53,6 +76,8 @@ type Backend interface {
 	TTS(ctx context.Context, in *pb.TTSRequest, opts ...grpc.CallOption) (*pb.Result, error)
 	TTSStream(ctx context.Context, in *pb.TTSRequest, f func(reply *pb.Reply), opts ...grpc.CallOption) error
 	SoundGeneration(ctx context.Context, in *pb.SoundGenerationRequest, opts ...grpc.CallOption) (*pb.Result, error)
+	AudioTranscription(ctx context.Context, in *pb.TranscriptRequest, opts ...grpc.CallOption) (*pb.TranscriptResult, error)
+	AudioTranscriptionStream(ctx context.Context, in *pb.TranscriptRequest, f func(chunk *pb.TranscriptStreamResponse), opts ...grpc.CallOption) error
 	Detect(ctx context.Context, in *pb.DetectOptions, opts ...grpc.CallOption) (*pb.DetectResponse, error)
 	Depth(ctx context.Context, in *pb.DepthRequest, opts ...grpc.CallOption) (*pb.DepthResponse, error)
 	FaceVerify(ctx context.Context, in *pb.FaceVerifyRequest, opts ...grpc.CallOption) (*pb.FaceVerifyResponse, error)
@@ -60,8 +85,25 @@ type Backend interface {
 	VoiceVerify(ctx context.Context, in *pb.VoiceVerifyRequest, opts ...grpc.CallOption) (*pb.VoiceVerifyResponse, error)
 	VoiceAnalyze(ctx context.Context, in *pb.VoiceAnalyzeRequest, opts ...grpc.CallOption) (*pb.VoiceAnalyzeResponse, error)
 	VoiceEmbed(ctx context.Context, in *pb.VoiceEmbedRequest, opts ...grpc.CallOption) (*pb.VoiceEmbedResponse, error)
-	AudioTranscription(ctx context.Context, in *pb.TranscriptRequest, opts ...grpc.CallOption) (*pb.TranscriptResult, error)
-	AudioTranscriptionStream(ctx context.Context, in *pb.TranscriptRequest, f func(chunk *pb.TranscriptStreamResponse), opts ...grpc.CallOption) error
+	Rerank(ctx context.Context, in *pb.RerankRequest, opts ...grpc.CallOption) (*pb.RerankResult, error)
+	TokenClassify(ctx context.Context, in *pb.TokenClassifyRequest, opts ...grpc.CallOption) (*pb.TokenClassifyResponse, error)
+	Score(ctx context.Context, in *pb.ScoreRequest, opts ...grpc.CallOption) (*pb.ScoreResponse, error)
+	VAD(ctx context.Context, in *pb.VADRequest, opts ...grpc.CallOption) (*pb.VADResponse, error)
+	Diarize(ctx context.Context, in *pb.DiarizeRequest, opts ...grpc.CallOption) (*pb.DiarizeResponse, error)
+	SoundDetection(ctx context.Context, in *pb.SoundDetectionRequest, opts ...grpc.CallOption) (*pb.SoundDetectionResponse, error)
+	AudioEncode(ctx context.Context, in *pb.AudioEncodeRequest, opts ...grpc.CallOption) (*pb.AudioEncodeResult, error)
+	AudioDecode(ctx context.Context, in *pb.AudioDecodeRequest, opts ...grpc.CallOption) (*pb.AudioDecodeResult, error)
+	AudioTransform(ctx context.Context, in *pb.AudioTransformRequest, opts ...grpc.CallOption) (*pb.AudioTransformResult, error)
+}
+
+// ControlBackend is the subset of Backend that is NOT per-call inference:
+// lifecycle/control-plane operations and the streaming constructors whose work
+// spans the returned stream rather than the constructor call. In-flight-tracking
+// wrappers embed this directly and pass it through untracked (see Backend).
+type ControlBackend interface {
+	IsBusy() bool
+	HealthCheck(ctx context.Context) (bool, error)
+	LoadModel(ctx context.Context, in *pb.ModelOptions, opts ...grpc.CallOption) (*pb.Result, error)
 	TokenizeString(ctx context.Context, in *pb.PredictOptions, opts ...grpc.CallOption) (*pb.TokenizationResponse, error)
 	Status(ctx context.Context) (*pb.StatusResponse, error)

@@ -70,24 +112,11 @@ type Backend interface {
 	StoresGet(ctx context.Context, in *pb.StoresGetOptions, opts ...grpc.CallOption) (*pb.StoresGetResult, error)
 	StoresFind(ctx context.Context, in *pb.StoresFindOptions, opts ...grpc.CallOption) (*pb.StoresFindResult, error)

-	Rerank(ctx context.Context, in *pb.RerankRequest, opts ...grpc.CallOption) (*pb.RerankResult, error)
-
-	TokenClassify(ctx context.Context, in *pb.TokenClassifyRequest, opts ...grpc.CallOption) (*pb.TokenClassifyResponse, error)
-
-	Score(ctx context.Context, in *pb.ScoreRequest, opts ...grpc.CallOption) (*pb.ScoreResponse, error)
-
 	GetTokenMetrics(ctx context.Context, in *pb.MetricsRequest, opts ...grpc.CallOption) (*pb.MetricsResponse, error)

-	VAD(ctx context.Context, in *pb.VADRequest, opts ...grpc.CallOption) (*pb.VADResponse, error)
-
-	Diarize(ctx context.Context, in *pb.DiarizeRequest, opts ...grpc.CallOption) (*pb.DiarizeResponse, error)
-
-	SoundDetection(ctx context.Context, in *pb.SoundDetectionRequest, opts ...grpc.CallOption) (*pb.SoundDetectionResponse, error)
-
-	AudioEncode(ctx context.Context, in *pb.AudioEncodeRequest, opts ...grpc.CallOption) (*pb.AudioEncodeResult, error)
-	AudioDecode(ctx context.Context, in *pb.AudioDecodeRequest, opts ...grpc.CallOption) (*pb.AudioDecodeResult, error)
-
-	AudioTransform(ctx context.Context, in *pb.AudioTransformRequest, opts ...grpc.CallOption) (*pb.AudioTransformResult, error)
+	// Streaming constructors: these return a stream client immediately; the
+	// actual inference spans the stream's lifetime, not this call, so they are
+	// NOT tracked as a single in-flight unit.
 	AudioTransformStream(ctx context.Context, opts ...grpc.CallOption) (AudioTransformStreamClient, error)
 	AudioToAudioStream(ctx context.Context, opts ...grpc.CallOption) (AudioToAudioStreamClient, error)

--- a/pkg/xsysinfo/gpu.go
+++ b/pkg/xsysinfo/gpu.go
@@ -129,6 +129,61 @@ func TotalAvailableVRAM() (uint64, error) {
 	return 0, nil
 }

+// MinPerGPUVRAM returns the total VRAM of the SMALLEST GPU on the host (in
+// bytes), or 0 when no per-device VRAM is known. Unlike TotalAvailableVRAM
+// (which sums across devices) this reports a single device's ceiling, which is
+// the right figure for decisions about what must fit on one card: the compute
+// buffer (sized by n_ubatch) and the parallel-slot tier. Summing a multi-GPU
+// host's VRAM over-provisions those into a per-device OOM (issue #10485).
+//
+// Unified-memory devices (GB10, Apple) report system RAM as their single
+// device's VRAM, so they are unaffected.
+func MinPerGPUVRAM() (uint64, error) {
+	// Prefer detection via the vendor CLI binaries (nvidia-smi/rocm-smi report
+	// true per-card VRAM); ghw's per-card memory can reflect NUMA node RAM on
+	// some hosts, which is why TotalAvailableVRAM treats it as a sum.
+	if infos := GetGPUMemoryUsage(); len(infos) > 0 {
+		if v := minNonZeroVRAM(infos); v > 0 {
+			return v, nil
+		}
+	}
+
+	// Fallback: ghw per-card memory, taking the minimum non-zero card.
+	if gpus, err := GPUs(); err == nil {
+		var min uint64
+		for _, gpu := range gpus {
+			if gpu == nil || gpu.Node == nil || gpu.Node.Memory == nil {
+				continue
+			}
+			if b := gpu.Node.Memory.TotalUsableBytes; b > 0 {
+				if u := uint64(b); min == 0 || u < min {
+					min = u
+				}
+			}
+		}
+		if min > 0 {
+			return min, nil
+		}
+	}
+
+	return 0, nil
+}
+
+// minNonZeroVRAM returns the smallest non-zero TotalVRAM across the given GPUs,
+// or 0 when none report VRAM.
+func minNonZeroVRAM(infos []GPUMemoryInfo) uint64 {
+	var min uint64
+	for _, g := range infos {
+		if g.TotalVRAM == 0 {
+			continue
+		}
+		if min == 0 || g.TotalVRAM < min {
+			min = g.TotalVRAM
+		}
+	}
+	return min
+}
+
 func HasGPU(vendor string) bool {
 	gpus, err := GPUs()
 	if err != nil {
--- a/pkg/xsysinfo/minvram_internal_test.go
+++ b/pkg/xsysinfo/minvram_internal_test.go
@@ -0,0 +1,37 @@
+package xsysinfo
+
+import (
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("minNonZeroVRAM", func() {
+	const gib = uint64(1) << 30
+
+	It("returns the smallest device on a multi-GPU host", func() {
+		// Two cards with unequal VRAM (e.g. a mixed 16 GiB + 12 GiB pair): the
+		// smallest device is the per-card allocation ceiling.
+		infos := []GPUMemoryInfo{
+			{TotalVRAM: 16 * gib},
+			{TotalVRAM: 12 * gib},
+		}
+		Expect(minNonZeroVRAM(infos)).To(Equal(12 * gib))
+	})
+
+	It("ignores devices that report zero VRAM", func() {
+		infos := []GPUMemoryInfo{
+			{TotalVRAM: 0},
+			{TotalVRAM: 24 * gib},
+		}
+		Expect(minNonZeroVRAM(infos)).To(Equal(24 * gib))
+	})
+
+	It("returns the single device's VRAM on a one-GPU host", func() {
+		Expect(minNonZeroVRAM([]GPUMemoryInfo{{TotalVRAM: 16 * gib}})).To(Equal(16 * gib))
+	})
+
+	It("returns 0 when no device reports VRAM", func() {
+		Expect(minNonZeroVRAM([]GPUMemoryInfo{{TotalVRAM: 0}})).To(BeZero())
+		Expect(minNonZeroVRAM(nil)).To(BeZero())
+	})
+})
--- a/tests/e2e/distributed/gallery_distributed_test.go
+++ b/tests/e2e/distributed/gallery_distributed_test.go
@@ -53,12 +53,13 @@ var _ = Describe("Gallery Distributed", Label("Distributed"), func() {
 			Expect(retrieved.Status).To(Equal("downloading"))
 			Expect(retrieved.FrontendID).To(Equal("f1"))

-			// Update progress
-			Expect(galleryStore.UpdateProgress(op.ID, 0.75, "75% complete", "6GB")).To(Succeed())
+			// Update progress (cancellable: a downloading install can be cancelled)
+			Expect(galleryStore.UpdateProgress(op.ID, 0.75, "75% complete", "6GB", true)).To(Succeed())

 			updated, _ := galleryStore.Get(op.ID)
 			Expect(updated.Progress).To(BeNumerically("~", 0.75, 0.01))
 			Expect(updated.Message).To(Equal("75% complete"))
+			Expect(updated.Cancellable).To(BeTrue())

 			// Complete
 			Expect(galleryStore.UpdateStatus(op.ID, "completed", "")).To(Succeed())
--- a/tests/e2e/distributed/phase4_test.go
+++ b/tests/e2e/distributed/phase4_test.go
@@ -104,11 +104,12 @@ var _ = Describe("Phase 4: MCP, Skills, Gallery, Fine-Tuning", Label("Distribute
 			}
 			stores.Gallery.Create(op)

-			Expect(stores.Gallery.UpdateProgress(op.ID, 0.5, "50% complete", "2GB")).To(Succeed())
+			Expect(stores.Gallery.UpdateProgress(op.ID, 0.5, "50% complete", "2GB", true)).To(Succeed())

 			updated, _ := stores.Gallery.Get(op.ID)
 			Expect(updated.Progress).To(BeNumerically("~", 0.5, 0.01))
 			Expect(updated.Message).To(Equal("50% complete"))
+			Expect(updated.Cancellable).To(BeTrue())
 		})

 		It("should deduplicate concurrent downloads", func() {