From 78d682224a1b74874d4ae1bee0cbdff82cc47b30 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=95=AA=E8=8C=84=E6=91=94=E6=88=90=E7=95=AA=E8=8C=84?=
 =?UTF-8?q?=E9=85=B1?= <68098251+fqscfqj@users.noreply.github.com>
Date: Fri, 19 Jun 2026 20:59:50 +0800
Subject: [PATCH 01/99] fix(grpc): forward word-level timestamps in
 AudioTranscription wrapper (#10402)

The gRPC server wrapper in pkg/grpc/server.go reconstructs
TranscriptSegment messages when relaying AudioTranscription results
from backends. The Words field was not being copied, causing all
word-level timestamps to be silently dropped regardless of backend
support.

This was introduced when PR #9621 added the TranscriptWord proto
message and transcriptResultFromProto (server-side), but did not
update the server-side gRPC relay to forward the new field.

Fixes #9306

Signed-off-by: fqscfqj <fqscfqj@outlook.com>
---
 pkg/grpc/server.go | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/pkg/grpc/server.go b/pkg/grpc/server.go
index 6ddb521ba..35afb502c 100644
--- a/pkg/grpc/server.go
+++ b/pkg/grpc/server.go
@@ -243,6 +243,14 @@ func (s *server) AudioTranscription(ctx context.Context, in *pb.TranscriptReques
 		for _, t := range s.Tokens {
 			tks = append(tks, int32(t))
 		}
+		words := make([]*pb.TranscriptWord, 0, len(s.Words))
+		for _, w := range s.Words {
+			words = append(words, &pb.TranscriptWord{
+				Start: int64(w.Start),
+				End:   int64(w.End),
+				Text:  w.Text,
+			})
+		}
 		tresult.Segments = append(tresult.Segments,
 			&pb.TranscriptSegment{
 				Text:    s.Text,
@@ -251,6 +259,7 @@ func (s *server) AudioTranscription(ctx context.Context, in *pb.TranscriptReques
 				End:     int64(s.End),
 				Tokens:  tks,
 				Speaker: s.Speaker,
+				Words:   words,
 			})
 	}
 

From 59c7ad51537f37a46ed1bc55a5cc50d55846d5c5 Mon Sep 17 00:00:00 2001
From: Souheab <85948717+Souheab@users.noreply.github.com>
Date: Fri, 19 Jun 2026 11:15:18 -0400
Subject: [PATCH 02/99] fix(nix flake): ensure nix flake builds successfully
 (#10399)

* Use inference defaults in repo src rather than fetching

there are inference_defaults.json already in the repo so we can use
those, they are regularly updated with github actions, and we avoid hash
mismatch errors in the flake this way

Signed-off-by: Souheab <souheab@protonmail.com>

* Update vendor hash

Signed-off-by: Souheab <souheab@protonmail.com>

* Create react-ui derivation as it is required for go build

Signed-off-by: Souheab <souheab@protonmail.com>

* Add FHS env wrapper to make #!/bin/bash scripts work

Signed-off-by: Souheab <souheab@protonmail.com>

* use pkgs.importNpmLock to deal with npm dependencies instead of using npmDepsHash

Signed-off-by: Souheab <souheab@protonmail.com>

---------

Signed-off-by: Souheab <souheab@protonmail.com>
---
 flake.lock | 13 -------------
 flake.nix  | 48 ++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 38 insertions(+), 23 deletions(-)

diff --git a/flake.lock b/flake.lock
index d67f05416..25b0fc536 100644
--- a/flake.lock
+++ b/flake.lock
@@ -1,17 +1,5 @@
 {
   "nodes": {
-    "inference-defaults": {
-      "flake": false,
-      "locked": {
-        "narHash": "sha256-ygWIkY2xiUEWqAZQM4/0vBz8vWd/RKX5VBj7EHovU14=",
-        "type": "file",
-        "url": "https://raw.githubusercontent.com/unslothai/unsloth/main/studio/backend/assets/configs/inference_defaults.json"
-      },
-      "original": {
-        "type": "file",
-        "url": "https://raw.githubusercontent.com/unslothai/unsloth/main/studio/backend/assets/configs/inference_defaults.json"
-      }
-    },
     "nixpkgs": {
       "locked": {
         "lastModified": 1777578337,
@@ -30,7 +18,6 @@
     },
     "root": {
       "inputs": {
-        "inference-defaults": "inference-defaults",
         "nixpkgs": "nixpkgs"
       }
     }
diff --git a/flake.nix b/flake.nix
index 2bbfd5c83..89691c716 100644
--- a/flake.nix
+++ b/flake.nix
@@ -4,24 +4,36 @@
 
   inputs = {
     nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
-    inference-defaults = {
-      url = "https://raw.githubusercontent.com/unslothai/unsloth/main/studio/backend/assets/configs/inference_defaults.json";
-      flake = false;
-    };
   };
 
-  outputs = { self, nixpkgs, inference-defaults }:
+  outputs = { self, nixpkgs }:
     let
       system = "x86_64-linux";
       pkgs = nixpkgs.legacyPackages.${system};
-    in {
-      packages.${system}.default = pkgs.buildGoModule {
+      reactUi = pkgs.buildNpmPackage {
+        pname = "localai-react-ui";
+        version = "custom";
+        src = ./core/http/react-ui;
+        npmDeps = pkgs.importNpmLock {
+          npmRoot = ./core/http/react-ui;
+        };
+        npmConfigHook = pkgs.importNpmLock.npmConfigHook;
+        npmBuildScript = "build";
+
+        installPhase = ''
+          runHook preInstall
+          mkdir -p $out
+          cp -r dist $out/
+          runHook postInstall
+        '';
+      };
+      localai-unwrapped = pkgs.buildGoModule {
         pname = "localai";
         version = "custom";
 
  	src = ./.;
         proxyVendor = true;
-        vendorHash = "sha256-6f3adjGsoFXlUtXjBDHP4Mv9jKCOK3aeUXprm0EAVO8=";
+        vendorHash = "sha256-z3lxQS8mXFuJzvYamejwapwVEmLpeAoiO3ksUKb4I3Q=";
 
         nativeBuildInputs = with pkgs; [
           pkg-config cmake gcc protobuf go-protobuf protoc-gen-go protoc-gen-go-grpc
@@ -44,8 +56,9 @@
 
           go mod edit -replace github.com/mudler/LocalAI/pkg/grpc/proto=./pkg/grpc/proto
 
-          mkdir -p core/config/gen_inference_defaults
-          cp ${inference-defaults} core/config/gen_inference_defaults/inference_defaults.json
+          mkdir -p core/http/react-ui
+          cp -r ${reactUi}/dist core/http/react-ui/dist
+
           sed -i '/go:generate/d' core/config/inference_defaults.go || true
 
 	'';
@@ -57,6 +70,21 @@
           [ -f $out/bin/local-ai ] && mv $out/bin/local-ai $out/bin/localai
         '';
       };
+    in {
+      packages.${system} = {
+        localai-unwrapped = localai-unwrapped;
+
+        default = pkgs.buildFHSEnv {
+          name = "localai";
+          targetPkgs = pkgs: with pkgs; [
+            localai-unwrapped
+            bash
+            coreutils
+            gnugrep
+          ];
+          runScript = "${localai-unwrapped}/bin/localai";
+        };
+      };
 
       devShells.${system}.default = pkgs.mkShell {
         packages = with pkgs; [

From 606128e4e9aeabacb4a029799d4ee6e0d970250a Mon Sep 17 00:00:00 2001
From: Richard Palethorpe <io@richiejp.com>
Date: Fri, 19 Jun 2026 16:16:33 +0100
Subject: [PATCH 03/99] feat(vulkan): make Vulkan backends self-contained on
 the GPU (#10404)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Vulkan backends bundled their own loader and ICD manifests but neither the
Mesa driver the manifests point at nor a way to make the loader find them,
so on a runtime base image without Mesa the loader enumerated zero devices
and the GPU silently fell back to CPU (only NVIDIA worked, since its ICD is
injected by the container toolkit).

- scripts/build/package-gpu-libs.sh: for each installed ICD manifest, bundle
  the driver .so its library_path names — no hard-coded, platform-dependent
  soname list — plus that driver's ldd dependencies, skipping manifests whose
  driver isn't installed. Rewrite each library_path to a bare soname so the
  bundled driver resolves via the LD_LIBRARY_PATH run.sh already sets.
- .docker/install-base-deps.sh, backend/Dockerfile.golang,
  backend/Dockerfile.python: install mesa-vulkan-drivers in every Vulkan
  builder so the driver + manifests exist to be packaged (the LunarG SDK
  ships only the loader and shader tooling).
- pkg/model/process.go: when a backend ships vulkan/icd.d/, point the loader
  at it via VK_DRIVER_FILES/VK_ICD_FILENAMES at launch (no-op otherwise).
  Covered by pkg/model/process_vulkan_test.go.
- backend/go/parakeet-cpp/package.sh: complete the L0 stub (was missing the
  libc-family ldd walk + GPU-lib packaging) by mirroring whisper, so the
  vulkan-parakeet image actually bundles its GPU runtime.

Assisted-by: Claude Code:claude-opus-4-8

Signed-off-by: Richard Palethorpe <io@richiejp.com>
---
 .docker/install-base-deps.sh       |  6 ++
 backend/Dockerfile.golang          |  7 ++-
 backend/Dockerfile.python          |  7 ++-
 backend/go/parakeet-cpp/package.sh | 55 +++++++++++++++--
 pkg/model/process.go               | 46 ++++++++++++++-
 pkg/model/process_vulkan_test.go   | 58 ++++++++++++++++++
 scripts/build/package-gpu-libs.sh  | 95 ++++++++++++++++++++++++++++--
 7 files changed, 262 insertions(+), 12 deletions(-)
 create mode 100644 pkg/model/process_vulkan_test.go

diff --git a/.docker/install-base-deps.sh b/.docker/install-base-deps.sh
index 5b0908fa8..2b0e7e0c6 100755
--- a/.docker/install-base-deps.sh
+++ b/.docker/install-base-deps.sh
@@ -70,6 +70,12 @@ if [ "${BUILD_TYPE:-}" = "vulkan" ] && [ "${SKIP_DRIVERS:-false}" = "false" ]; t
         git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \
         ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \
         clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils
+    # Mesa Vulkan ICD drivers (ANV/RADV/lavapipe + Arm SoC) and their ICD
+    # manifests. The LunarG SDK below only provides the loader and shader
+    # tooling, not hardware drivers — without Mesa the packaged Vulkan backend
+    # would ship a loader that finds no GPU. package-gpu-libs.sh bundles these
+    # .so files plus their deps into the backend so it stays self-contained.
+    apt-get install -y mesa-vulkan-drivers libdrm2
     if [ "amd64" = "${TARGETARCH:-}" ]; then
         wget "https://sdk.lunarg.com/sdk/download/1.4.335.0/linux/vulkansdk-linux-x86_64-1.4.335.0.tar.xz"
         tar -xf vulkansdk-linux-x86_64-1.4.335.0.tar.xz
diff --git a/backend/Dockerfile.golang b/backend/Dockerfile.golang
index 75fc3a0d9..d188cdf70 100644
--- a/backend/Dockerfile.golang
+++ b/backend/Dockerfile.golang
@@ -65,7 +65,12 @@ RUN <<EOT bash
             libwayland-dev libxrandr-dev libxcb-randr0-dev libxcb-ewmh-dev \
             git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \
             ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \
-            clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils
+            clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils && \
+        apt-get install -y mesa-vulkan-drivers libdrm2
+        # Mesa Vulkan ICD drivers (ANV/RADV/lavapipe) + their manifests. The
+        # LunarG SDK below only provides the loader and shader tooling, not
+        # hardware drivers — without Mesa, package-gpu-libs.sh has no ICD to
+        # bundle and the packaged backend finds no GPU at runtime.
         if [ "amd64" = "$TARGETARCH" ]; then
             wget "https://sdk.lunarg.com/sdk/download/1.4.335.0/linux/vulkansdk-linux-x86_64-1.4.335.0.tar.xz" && \
             tar -xf vulkansdk-linux-x86_64-1.4.335.0.tar.xz && \
diff --git a/backend/Dockerfile.python b/backend/Dockerfile.python
index 03f2cf5e0..a4133d301 100644
--- a/backend/Dockerfile.python
+++ b/backend/Dockerfile.python
@@ -66,7 +66,12 @@ RUN <<EOT bash
             libwayland-dev libxrandr-dev libxcb-randr0-dev libxcb-ewmh-dev \
             git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \
             ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \
-            clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils
+            clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils && \
+        apt-get install -y mesa-vulkan-drivers libdrm2
+        # Mesa Vulkan ICD drivers (ANV/RADV/lavapipe) + their manifests. The
+        # LunarG SDK below only provides the loader and shader tooling, not
+        # hardware drivers — without Mesa, package-gpu-libs.sh has no ICD to
+        # bundle and the packaged backend finds no GPU at runtime.
         if [ "amd64" = "$TARGETARCH" ]; then
             wget "https://sdk.lunarg.com/sdk/download/1.4.335.0/linux/vulkansdk-linux-x86_64-1.4.335.0.tar.xz" && \
             tar -xf vulkansdk-linux-x86_64-1.4.335.0.tar.xz && \
diff --git a/backend/go/parakeet-cpp/package.sh b/backend/go/parakeet-cpp/package.sh
index 7af2d7b59..0b580324c 100755
--- a/backend/go/parakeet-cpp/package.sh
+++ b/backend/go/parakeet-cpp/package.sh
@@ -1,23 +1,68 @@
 #!/bin/bash
 #
-# L0 packaging stub: copy the binary, run.sh and libparakeet.so* into
-# package/. The full ldd walk (libc, libstdc++, libgomp, GPU runtimes,
-# arch detection) lands in L3, mirroring backend/go/whisper/package.sh.
+# Bundle the parakeet-cpp-grpc binary, libparakeet.so, the core runtime
+# libs (libc/libstdc++/libgomp + ld.so) and the GPU runtime for the active
+# BUILD_TYPE so the package is self-contained. Mirrors
+# backend/go/whisper/package.sh; run.sh routes the (CGO_ENABLED=0) binary
+# through lib/ld.so so the packaged libc is used instead of the host's.
 
 set -e
 
 CURDIR=$(dirname "$(realpath "$0")")
+REPO_ROOT="${CURDIR}/../../.."
 
 mkdir -p "$CURDIR/package/lib"
 
 cp -avf "$CURDIR/parakeet-cpp-grpc" "$CURDIR/package/"
 cp -avf "$CURDIR/run.sh" "$CURDIR/package/"
 
-# libparakeet.so + any soname symlinks (libparakeet.so.X, libparakeet.so.X.Y).
+# libparakeet.so + any soname symlinks (libparakeet.so.X[.Y]). purego.Dlopen
+# resolves it via LD_LIBRARY_PATH, which run.sh points at lib/.
 cp -avf "$CURDIR"/libparakeet.so* "$CURDIR/package/lib/" 2>/dev/null || {
 	echo "ERROR: libparakeet.so not found in $CURDIR, run 'make' first" >&2
 	exit 1
 }
 
-echo "L0 package layout (full ldd walk lands in L3):"
+# Detect architecture and copy the core runtime libs libparakeet.so links
+# against, plus the matching dynamic loader as lib/ld.so.
+if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
+    echo "Detected x86_64 architecture, copying x86_64 libraries..."
+    cp -arfLv /lib64/ld-linux-x86-64.so.2 "$CURDIR/package/lib/ld.so"
+    cp -arfLv /lib/x86_64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6"
+    cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1"
+    cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6"
+    cp -arfLv /lib/x86_64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6"
+    cp -arfLv /lib/x86_64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1"
+    cp -arfLv /lib/x86_64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2"
+    cp -arfLv /lib/x86_64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1"
+    cp -arfLv /lib/x86_64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0"
+elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
+    echo "Detected ARM64 architecture, copying ARM64 libraries..."
+    cp -arfLv /lib/ld-linux-aarch64.so.1 "$CURDIR/package/lib/ld.so"
+    cp -arfLv /lib/aarch64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6"
+    cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1"
+    cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6"
+    cp -arfLv /lib/aarch64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6"
+    cp -arfLv /lib/aarch64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1"
+    cp -arfLv /lib/aarch64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2"
+    cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1"
+    cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0"
+elif [ "$(uname -s)" = "Darwin" ]; then
+    echo "Detected Darwin"
+else
+    echo "Error: Could not detect architecture"
+    exit 1
+fi
+
+# Package GPU libraries (CUDA/ROCm/Intel/Vulkan loader + ICDs + drivers)
+# based on BUILD_TYPE so the backend can reach the GPU without the runtime
+# base image shipping those drivers.
+GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
+if [ -f "$GPU_LIB_SCRIPT" ]; then
+    echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
+    source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib"
+    package_gpu_libs
+fi
+
+echo "Packaging completed successfully"
 ls -liah "$CURDIR/package/" "$CURDIR/package/lib/"
diff --git a/pkg/model/process.go b/pkg/model/process.go
index 60f8d318e..95e3e0758 100644
--- a/pkg/model/process.go
+++ b/pkg/model/process.go
@@ -154,11 +154,20 @@ func (ml *ModelLoader) startProcess(grpcProcess, id string, serverAddress string
 		return nil, err
 	}
 
+	env := os.Environ()
+	// Vulkan backends are self-contained: they bundle their own loader and
+	// Mesa driver .so files in lib/ plus the matching ICD manifests in
+	// vulkan/icd.d/. Point the loader at those manifests so it doesn't rely on
+	// the runtime base image shipping a Vulkan driver (it carries the
+	// SYCL/Level-Zero stack instead, so the default ICD search path is empty
+	// and the GPU would silently fall back to CPU). No-op for other backends.
+	env = append(env, vulkanICDEnv(workDir)...)
+
 	grpcControlProcess := process.New(
 		process.WithTemporaryStateDir(),
 		process.WithName(filepath.Base(grpcProcess)),
 		process.WithArgs(append(args, []string{"--addr", serverAddress}...)...),
-		process.WithEnvironment(os.Environ()...),
+		process.WithEnvironment(env...),
 		process.WithWorkDir(workDir),
 	)
 
@@ -249,3 +258,38 @@ func (ml *ModelLoader) startProcess(grpcProcess, id string, serverAddress string
 
 	return grpcControlProcess, nil
 }
+
+// vulkanICDEnv returns environment overrides that point the Vulkan loader at
+// the ICD manifests a backend bundles in <workDir>/vulkan/icd.d. Vulkan
+// backends ship a self-contained stack — their own loader and Mesa driver .so
+// files in lib/ (resolved via the LD_LIBRARY_PATH that run.sh sets) plus the
+// matching ICD manifests — so the loader must be told where those manifests
+// live; its default search path (/usr/share/vulkan/icd.d, /etc/vulkan/icd.d)
+// is empty on the runtime base image. Returns nil when the directory holds no
+// manifests (CPU/CUDA/SYCL builds), leaving the host's Vulkan setup untouched.
+func vulkanICDEnv(workDir string) []string {
+	icdDir := filepath.Join(workDir, "vulkan", "icd.d")
+	entries, err := os.ReadDir(icdDir)
+	if err != nil {
+		return nil
+	}
+
+	manifests := make([]string, 0, len(entries))
+	for _, e := range entries {
+		if e.IsDir() || !strings.HasSuffix(e.Name(), ".json") {
+			continue
+		}
+		manifests = append(manifests, filepath.Join(icdDir, e.Name()))
+	}
+	if len(manifests) == 0 {
+		return nil
+	}
+
+	list := strings.Join(manifests, string(os.PathListSeparator))
+	// VK_DRIVER_FILES is the current loader variable; VK_ICD_FILENAMES is its
+	// deprecated alias, set too so older bundled loaders still pick it up.
+	return []string{
+		"VK_DRIVER_FILES=" + list,
+		"VK_ICD_FILENAMES=" + list,
+	}
+}
diff --git a/pkg/model/process_vulkan_test.go b/pkg/model/process_vulkan_test.go
new file mode 100644
index 000000000..c3bb108a7
--- /dev/null
+++ b/pkg/model/process_vulkan_test.go
@@ -0,0 +1,58 @@
+package model
+
+import (
+	"os"
+	"path/filepath"
+	"strings"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("vulkanICDEnv", func() {
+	It("returns nil when the backend ships no vulkan/icd.d (CPU/CUDA/SYCL builds)", func() {
+		Expect(vulkanICDEnv(GinkgoT().TempDir())).To(BeNil())
+	})
+
+	It("returns nil when icd.d exists but holds no .json manifests", func() {
+		work := GinkgoT().TempDir()
+		icdDir := filepath.Join(work, "vulkan", "icd.d")
+		Expect(os.MkdirAll(icdDir, 0o755)).To(Succeed())
+		Expect(os.WriteFile(filepath.Join(icdDir, "README.txt"), []byte("not a manifest"), 0o644)).To(Succeed())
+		// A directory whose name ends in .json must be ignored.
+		Expect(os.MkdirAll(filepath.Join(icdDir, "nested.json"), 0o755)).To(Succeed())
+
+		Expect(vulkanICDEnv(work)).To(BeNil())
+	})
+
+	It("points VK_DRIVER_FILES/VK_ICD_FILENAMES at the bundled manifests", func() {
+		work := GinkgoT().TempDir()
+		icdDir := filepath.Join(work, "vulkan", "icd.d")
+		Expect(os.MkdirAll(icdDir, 0o755)).To(Succeed())
+		for _, name := range []string{"intel_icd.json", "lvp_icd.json"} {
+			Expect(os.WriteFile(filepath.Join(icdDir, name), []byte("{}"), 0o644)).To(Succeed())
+		}
+
+		env := vulkanICDEnv(work)
+		Expect(env).To(HaveLen(2))
+
+		got := map[string]string{}
+		for _, kv := range env {
+			k, v, ok := strings.Cut(kv, "=")
+			Expect(ok).To(BeTrue(), "malformed env entry %q", kv)
+			got[k] = v
+		}
+
+		for _, key := range []string{"VK_DRIVER_FILES", "VK_ICD_FILENAMES"} {
+			Expect(got).To(HaveKey(key))
+			// Both manifests must be listed as absolute paths, joined by the
+			// OS path-list separator the Vulkan loader expects.
+			parts := strings.Split(got[key], string(os.PathListSeparator))
+			Expect(parts).To(HaveLen(2))
+			for _, p := range parts {
+				Expect(filepath.IsAbs(p)).To(BeTrue(), "%s entry %q must be absolute", key, p)
+				Expect(p).To(HaveSuffix(".json"))
+			}
+		}
+	})
+})
diff --git a/scripts/build/package-gpu-libs.sh b/scripts/build/package-gpu-libs.sh
index 2b5b02aab..40f410173 100755
--- a/scripts/build/package-gpu-libs.sh
+++ b/scripts/build/package-gpu-libs.sh
@@ -109,6 +109,38 @@ copy_libs_glob() {
     done
 }
 
+# Returns success for the core runtime libs the base image and package.sh
+# already provide. We must NOT bundle our own copies of these — a second libc
+# or libstdc++ on LD_LIBRARY_PATH clashes with the loader and the rest of the
+# process — so they're skipped when pulling in a driver's transitive deps.
+is_core_lib() {
+    case "$1" in
+        ld-linux*|ld.so|libc.so.*|libm.so.*|libdl.so.*|libpthread.so.*|librt.so.*|\
+        libgcc_s.so.*|libstdc++.so.*|libresolv.so.*|libutil.so.*|linux-vdso.so.*)
+            return 0 ;;
+    esac
+    return 1
+}
+
+# Copy the shared-library dependencies of an ELF file into TARGET_LIB_DIR.
+# Used to make a bundled GPU driver self-contained: e.g. the Mesa Vulkan ICDs
+# pull in libdrm, libexpat and (for RADV/lavapipe) libLLVM, none of which the
+# runtime base image is guaranteed to have. Core libc-family deps are skipped.
+copy_elf_deps() {
+    local elf="$1"
+    [ -e "$elf" ] || return 0
+    command -v ldd >/dev/null 2>&1 || return 0
+
+    # ldd lines look like: "<TAB>libfoo.so.1 => /path/to/libfoo.so.1 (0x..)".
+    # Take the resolved absolute path (field 3) and skip vdso/static entries.
+    while read -r dep; do
+        if is_core_lib "$(basename "$dep")"; then
+            continue
+        fi
+        copy_lib "$dep"
+    done < <(ldd "$elf" 2>/dev/null | awk '/=>/ && $3 ~ /^\// {print $3}')
+}
+
 # Package NVIDIA CUDA libraries
 package_cuda_libs() {
     echo "Packaging CUDA libraries for BUILD_TYPE=${BUILD_TYPE}..."
@@ -284,7 +316,7 @@ package_vulkan_libs() {
         "/usr/local/lib"
     )
 
-    # Core Vulkan runtime libraries
+    # Core Vulkan runtime: the loader plus the shader tooling shipped by the SDK.
     local vulkan_libs=(
         "libvulkan.so*"
         "libshaderc_shared.so*"
@@ -301,10 +333,63 @@ package_vulkan_libs() {
         fi
     done
 
-    # Copy Vulkan ICD files
+    # Bundle the ICD drivers. Rather than hard-code Mesa's (platform- and
+    # version-dependent) driver sonames, treat each installed ICD manifest as
+    # the source of truth: every /usr/share/vulkan/icd.d/*.json names the exact
+    # driver .so it needs in its "library_path". So we copy whatever drivers
+    # the manifests reference (libvulkan_intel/radeon/lvp/... on amd64, the SoC
+    # drivers on arm64, ...) plus each driver's transitive deps, and skip any
+    # manifest whose driver isn't actually installed. The loader picks the
+    # right driver for the GPU at runtime.
     if [ -d "/usr/share/vulkan/icd.d" ]; then
-        mkdir -p "$TARGET_LIB_DIR/../vulkan/icd.d"
-        cp -arfL /usr/share/vulkan/icd.d/* "$TARGET_LIB_DIR/../vulkan/icd.d/" 2>/dev/null || true
+        local icd_dest="$TARGET_LIB_DIR/../vulkan/icd.d"
+        mkdir -p "$icd_dest"
+
+        local manifest driver driver_base resolved lib_path
+        for manifest in /usr/share/vulkan/icd.d/*.json; do
+            [ -e "$manifest" ] || continue
+
+            # Pull the driver path out of "library_path": "<path-or-soname>".
+            driver=$(sed -nE 's/.*"library_path"[[:space:]]*:[[:space:]]*"([^"]+)".*/\1/p' "$manifest" | head -n1)
+            [ -n "$driver" ] || continue
+            driver_base=$(basename "$driver")
+
+            # Resolve to an absolute path: honour an absolute library_path,
+            # else look in the standard lib dirs, else fall back to ldconfig.
+            resolved=""
+            case "$driver" in
+                /*) [ -e "$driver" ] && resolved="$driver" ;;
+            esac
+            if [ -z "$resolved" ]; then
+                for lib_path in "${vulkan_lib_paths[@]}"; do
+                    if [ -e "${lib_path}/${driver_base}" ]; then
+                        resolved="${lib_path}/${driver_base}"
+                        break
+                    fi
+                done
+            fi
+            if [ -z "$resolved" ] && command -v ldconfig >/dev/null 2>&1; then
+                resolved=$(ldconfig -p | awk -v n="$driver_base" '$1 == n { print $NF; exit }')
+            fi
+
+            if [ -z "$resolved" ] || [ ! -e "$resolved" ]; then
+                echo "Vulkan ICD: driver '$driver_base' for $(basename "$manifest") not installed; skipping its manifest" >&2
+                continue
+            fi
+
+            # Bundle the driver + its transitive deps (libdrm, libexpat, and
+            # libLLVM for RADV/lavapipe, ...) so the backend is self-contained
+            # on a runtime base image without Mesa.
+            copy_lib "$resolved"
+            copy_elf_deps "$resolved"
+
+            # Copy the manifest and rewrite its library_path to a bare soname
+            # so the loader resolves our bundled driver via LD_LIBRARY_PATH
+            # (run.sh adds lib/ to it) instead of a host path that won't exist
+            # on the runtime image.
+            cp -arfL "$manifest" "$icd_dest/" 2>/dev/null || true
+            sed -i -E 's#("library_path"[[:space:]]*:[[:space:]]*")[^"]*/#\1#' "$icd_dest/$(basename "$manifest")"
+        done
     fi
 
     echo "Vulkan libraries packaged successfully"
@@ -345,6 +430,8 @@ package_gpu_libs() {
 export -f package_gpu_libs
 export -f copy_lib
 export -f copy_libs_glob
+export -f is_core_lib
+export -f copy_elf_deps
 export -f package_cuda_libs
 export -f package_rocm_libs
 export -f package_intel_libs

From 72d46c1115546a65c3240b832eaace55aae3ce73 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=95=AA=E8=8C=84=E6=91=94=E6=88=90=E7=95=AA=E8=8C=84?=
 =?UTF-8?q?=E9=85=B1?= <68098251+fqscfqj@users.noreply.github.com>
Date: Sat, 20 Jun 2026 03:34:30 +0800
Subject: [PATCH 04/99] feat(crispasr): add word-level timestamp support
 (#10403)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* feat(crispasr): add word-level timestamp support

Add word-level timestamp extraction to the crispasr backend by calling
the CrispASR C library's word accessor functions that are already
exported by libgocraspasr but were not previously bound by the Go
wrapper.

Two families of word functions are supported:

1. Session-based (get_word_count/text/t0/t1) — works per-segment for
   whisper-like backends.
2. Parakeet-specific (get_parakeet_word_count/text/t0/t1) — returns a
   global word list for TDT/CTC/RNNT parakeet models where the session
   API does not expose per-segment word data.

The Go code tries session-based first and falls back to parakeet-specific
when the session word count is zero.

Depends on #10402 (grpc server Words forwarding) for the words to reach
the HTTP response.

Signed-off-by: fqscfqj <fqscfqj@outlook.com>

* fix(crispasr): use portable sed -i.bak for macOS compatibility

BSD sed requires -i '' for in-place editing while GNU sed uses -i.
Replace with -i.bak which works on both platforms, then remove the
backup file.

Signed-off-by: fqscfqj <fqscfqj@outlook.com>

---------

Signed-off-by: fqscfqj <fqscfqj@outlook.com>
---
 backend/go/crispasr/Makefile              |  2 +-
 backend/go/crispasr/cpp/crispasr_shim.cpp | 68 +++++++++++++++++++++++
 backend/go/crispasr/cpp/crispasr_shim.h   | 14 +++++
 backend/go/crispasr/gocrispasr.go         | 38 +++++++++++++
 backend/go/crispasr/main.go               |  8 +++
 5 files changed, 129 insertions(+), 1 deletion(-)

diff --git a/backend/go/crispasr/Makefile b/backend/go/crispasr/Makefile
index 42a7a7555..bbc84f1de 100644
--- a/backend/go/crispasr/Makefile
+++ b/backend/go/crispasr/Makefile
@@ -67,7 +67,7 @@ sources/CrispASR:
 	# it, so ${CMAKE_SOURCE_DIR} is THIS backend dir and the talk-llama sources
 	# aren't found. Rewrite to ${PROJECT_SOURCE_DIR} (the crispasr project root),
 	# which is correct both standalone and as a subproject. Idempotent.
-	sed -i 's#\$${CMAKE_SOURCE_DIR}/examples/talk-llama#\$${PROJECT_SOURCE_DIR}/examples/talk-llama#' sources/CrispASR/src/CMakeLists.txt
+	sed -i.bak 's#\$${CMAKE_SOURCE_DIR}/examples/talk-llama#\$${PROJECT_SOURCE_DIR}/examples/talk-llama#' sources/CrispASR/src/CMakeLists.txt && rm -f sources/CrispASR/src/CMakeLists.txt.bak
 
 # Detect OS
 UNAME_S := $(shell uname -s)
diff --git a/backend/go/crispasr/cpp/crispasr_shim.cpp b/backend/go/crispasr/cpp/crispasr_shim.cpp
index bf6151ae1..60dbfd86b 100644
--- a/backend/go/crispasr/cpp/crispasr_shim.cpp
+++ b/backend/go/crispasr/cpp/crispasr_shim.cpp
@@ -47,6 +47,74 @@ extern "C" void set_abort(int v) {
   g_abort.store(v, std::memory_order_relaxed);
 }
 
+// --- word-level timestamp accessors ---
+extern "C" {
+int crispasr_session_result_n_words(crispasr_session_result *r, int seg_i);
+const char *crispasr_session_result_word_text(crispasr_session_result *r,
+                                               int seg_i, int word_i);
+int64_t crispasr_session_result_word_t0(crispasr_session_result *r, int seg_i,
+                                         int word_i);
+int64_t crispasr_session_result_word_t1(crispasr_session_result *r, int seg_i,
+                                         int word_i);
+
+// Parakeet-specific word accessors
+int crispasr_parakeet_result_n_words(void *r);
+const char *crispasr_parakeet_result_word_text(void *r, int word_i);
+int64_t crispasr_parakeet_result_word_t0(void *r, int word_i);
+int64_t crispasr_parakeet_result_word_t1(void *r, int word_i);
+}
+
+void *get_result(void) { return g_result; }
+
+int get_word_count(int seg_i) {
+  if (!g_result)
+    return 0;
+  return crispasr_session_result_n_words(g_result, seg_i);
+}
+
+const char *get_word_text(int seg_i, int word_i) {
+  if (!g_result)
+    return "";
+  return crispasr_session_result_word_text(g_result, seg_i, word_i);
+}
+
+int64_t get_word_t0(int seg_i, int word_i) {
+  if (!g_result)
+    return 0;
+  return crispasr_session_result_word_t0(g_result, seg_i, word_i);
+}
+
+int64_t get_word_t1(int seg_i, int word_i) {
+  if (!g_result)
+    return 0;
+  return crispasr_session_result_word_t1(g_result, seg_i, word_i);
+}
+
+// Parakeet-specific word accessors
+int get_parakeet_word_count(void) {
+  if (!g_result)
+    return 0;
+  return crispasr_parakeet_result_n_words(g_result);
+}
+
+const char *get_parakeet_word_text(int word_i) {
+  if (!g_result)
+    return "";
+  return crispasr_parakeet_result_word_text(g_result, word_i);
+}
+
+int64_t get_parakeet_word_t0(int word_i) {
+  if (!g_result)
+    return 0;
+  return crispasr_parakeet_result_word_t0(g_result, word_i);
+}
+
+int64_t get_parakeet_word_t1(int word_i) {
+  if (!g_result)
+    return 0;
+  return crispasr_parakeet_result_word_t1(g_result, word_i);
+}
+
 static void ggml_log_cb(enum ggml_log_level level, const char *log,
                         void *data) {
   const char *level_str;
diff --git a/backend/go/crispasr/cpp/crispasr_shim.h b/backend/go/crispasr/cpp/crispasr_shim.h
index 7c593951a..c7baa41f4 100644
--- a/backend/go/crispasr/cpp/crispasr_shim.h
+++ b/backend/go/crispasr/cpp/crispasr_shim.h
@@ -20,4 +20,18 @@ float *tts_synthesize(const char *text, int *out_n_samples); // 24kHz mono float
 void tts_free(float *pcm);
 int tts_set_voice(const char *name); // best-effort speaker selection; 0 ok
 int tts_set_voice_file(const char *path, const char *ref_text); // load voice pack (.gguf) or zero-shot clone (.wav + ref_text)
+
+// --- word-level timestamp accessors ---
+// Session-based (works for whisper-like backends)
+void *get_result(void);
+int get_word_count(int seg_i);
+const char *get_word_text(int seg_i, int word_i);
+int64_t get_word_t0(int seg_i, int word_i);
+int64_t get_word_t1(int seg_i, int word_i);
+
+// Parakeet-specific (global word list, no segment index)
+int get_parakeet_word_count(void);
+const char *get_parakeet_word_text(int word_i);
+int64_t get_parakeet_word_t0(int word_i);
+int64_t get_parakeet_word_t1(int word_i);
 }
diff --git a/backend/go/crispasr/gocrispasr.go b/backend/go/crispasr/gocrispasr.go
index 5c3528d38..af1f1a95c 100644
--- a/backend/go/crispasr/gocrispasr.go
+++ b/backend/go/crispasr/gocrispasr.go
@@ -34,6 +34,18 @@ var (
 	CppTTSFree         func(ptr uintptr)
 	CppTTSSetVoice     func(name string) int
 	CppTTSSetVoiceFile func(path string, refText string) int
+
+	// Word-level timestamp accessors (session-based, per-segment)
+	CppGetWordCount func(segI int) int
+	CppGetWordText  func(segI int, wordI int) string
+	CppGetWordT0    func(segI int, wordI int) int64
+	CppGetWordT1    func(segI int, wordI int) int64
+
+	// Parakeet-specific word accessors (global, no segment index)
+	CppGetParakeetWordCount func() int
+	CppGetParakeetWordText  func(wordI int) string
+	CppGetParakeetWordT0    func(wordI int) int64
+	CppGetParakeetWordT1    func(wordI int) int64
 )
 
 type CrispASR struct {
@@ -290,10 +302,36 @@ func (w *CrispASR) AudioTranscription(ctx context.Context, opts *pb.TranscriptRe
 		// IDs, so Tokens is left empty.
 		txt := strings.ToValidUTF8(strings.Clone(CppGetSegmentText(i)), "�")
 
+		// Populate word-level timestamps. Try session-based functions first
+		// (per-segment); fall back to parakeet-specific functions (global word
+		// list with no segment index — only populated on the first segment to
+		// avoid duplication).
+		words := []*pb.TranscriptWord{}
+		wordCount := CppGetWordCount(i)
+		if wordCount == 0 && i == 0 {
+			wordCount = CppGetParakeetWordCount()
+			for j := 0; j < wordCount; j++ {
+				words = append(words, &pb.TranscriptWord{
+					Start: CppGetParakeetWordT0(j) * (10000000),
+					End:   CppGetParakeetWordT1(j) * (10000000),
+					Text:  strings.ToValidUTF8(strings.Clone(CppGetParakeetWordText(j)), "�"),
+				})
+			}
+		} else {
+			for j := 0; j < wordCount; j++ {
+				words = append(words, &pb.TranscriptWord{
+					Start: CppGetWordT0(i, j) * (10000000),
+					End:   CppGetWordT1(i, j) * (10000000),
+					Text:  strings.ToValidUTF8(strings.Clone(CppGetWordText(i, j)), "�"),
+				})
+			}
+		}
+
 		segment := &pb.TranscriptSegment{
 			Id:    int32(i),
 			Text:  txt,
 			Start: s, End: t,
+			Words: words,
 		}
 
 		segments = append(segments, segment)
diff --git a/backend/go/crispasr/main.go b/backend/go/crispasr/main.go
index c2069bd85..9f3ef14d0 100644
--- a/backend/go/crispasr/main.go
+++ b/backend/go/crispasr/main.go
@@ -44,6 +44,14 @@ func main() {
 		{&CppTTSFree, "tts_free"},
 		{&CppTTSSetVoice, "tts_set_voice"},
 		{&CppTTSSetVoiceFile, "tts_set_voice_file"},
+		{&CppGetWordCount, "get_word_count"},
+		{&CppGetWordText, "get_word_text"},
+		{&CppGetWordT0, "get_word_t0"},
+		{&CppGetWordT1, "get_word_t1"},
+		{&CppGetParakeetWordCount, "get_parakeet_word_count"},
+		{&CppGetParakeetWordText, "get_parakeet_word_text"},
+		{&CppGetParakeetWordT0, "get_parakeet_word_t0"},
+		{&CppGetParakeetWordT1, "get_parakeet_word_t1"},
 	}
 
 	for _, lf := range libFuncs {

From 2e734bf56039a2f412ba9216b00ba683096c5726 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Fri, 19 Jun 2026 21:35:21 +0200
Subject: [PATCH 05/99] fix(downloader): stall timeout, resume-safe cancel, and
 stale-partial reaping (#10406)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix(downloader): stall timeout, resume-safe cancel, and stale-partial reaping

Large model installs would hang forever or never finish. Three defects in
the HTTP download path, all hit by big GGUF pulls over a slow or flaky link:

1. No stall timeout. The shared download client sets no body deadline
   (correct for streaming) but also no read-idle timeout, and the
   transport's IdleConnTimeout does not cover an in-flight body read. A
   silently-dropped TCP connection (no FIN/RST) blocked the body Read
   forever, freezing an install at N bytes until an external reaper killed
   it. Add an idle-timeout reader that closes the body after a window of
   zero progress (DownloadStallTimeout, default 60s), turning an indefinite
   hang into a fast, retryable error. A read that returns data resets the
   clock, so a slow-but-steady transfer is unaffected.

2. Cancellation deleted the partial. On context.Canceled the code removed
   the .partial file, so any frontend restart (deploy, OOM) mid-download
   wiped all progress and the retry restarted from zero. At slow egress,
   files larger than the restart interval never completed. Keep the
   .partial on cancel so the next attempt resumes via Range.

3. Partials leaked. Cleanup only ran on the context-cancel path, never on a
   stall or a SIGKILL/OOM, so abandoned .partial files accumulated and could
   fill the models volume. Add CleanupStalePartialFiles and reap partials
   older than 24h on startup.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* fix(downloader): discard the .partial on a deliberate user cancel

Review follow-up. The previous commit kept the .partial on every cancellation
so restarts could resume, but that also left a dangling partial when a user
*intentionally* cancelled an install — the file lingered until the 24h reaper.

Distinguish the two: cancel the gallery operation's context with a cause
(downloader.ErrUserCancelled) so the download layer can tell a deliberate
abort (discard the partial) from an incidental one such as a shutdown/restart
(keep it for resume). Detect cancellation via the context rather than the
returned error, because an HTTP request cancelled with a cause surfaces the
cause error, not context.Canceled.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* fix(downloader): resolve gosec G122 in CleanupStalePartialFiles

CI's code-scanning (gosec) flagged G122 (symlink TOCTOU) for the os.Remove
call inside the filepath.WalkDir callback. Collect the stale paths during the
walk and delete them afterwards instead of mutating the tree from inside the
callback. Behavior is unchanged; the existing specs still pass.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
---
 core/application/startup.go        |  11 +++
 core/services/galleryop/service.go |  15 ++-
 pkg/downloader/cancel_test.go      | 148 +++++++++++++++++++++++++++++
 pkg/downloader/partial.go          |  69 ++++++++++++++
 pkg/downloader/partial_test.go     |  53 +++++++++++
 pkg/downloader/stall.go            |  77 +++++++++++++++
 pkg/downloader/stall_test.go       | 131 +++++++++++++++++++++++++
 pkg/downloader/uri.go              |  57 ++++++++---
 8 files changed, 547 insertions(+), 14 deletions(-)
 create mode 100644 pkg/downloader/cancel_test.go
 create mode 100644 pkg/downloader/partial.go
 create mode 100644 pkg/downloader/partial_test.go
 create mode 100644 pkg/downloader/stall.go
 create mode 100644 pkg/downloader/stall_test.go

diff --git a/core/application/startup.go b/core/application/startup.go
index 6438c7df3..fa5de5ede 100644
--- a/core/application/startup.go
+++ b/core/application/startup.go
@@ -25,6 +25,7 @@ import (
 	"github.com/mudler/LocalAI/core/services/storage"
 	coreStartup "github.com/mudler/LocalAI/core/startup"
 	"github.com/mudler/LocalAI/internal"
+	"github.com/mudler/LocalAI/pkg/downloader"
 	"github.com/mudler/LocalAI/pkg/signals"
 	"github.com/mudler/LocalAI/pkg/vram"
 
@@ -71,6 +72,16 @@ func New(opts ...config.AppOption) (*Application, error) {
 	if err != nil {
 		return nil, fmt.Errorf("unable to create ModelPath: %q", err)
 	}
+
+	// Reap *.partial downloads abandoned by a previous run (killed mid-transfer
+	// by an OOM/restart, or stalled before cleanup could run). The 24h window
+	// is well beyond any legitimate in-flight download, so this never trims an
+	// active transfer; it just stops dead partials accumulating on the volume.
+	if removed, cErr := downloader.CleanupStalePartialFiles(options.SystemState.Model.ModelsPath, 24*time.Hour); cErr != nil {
+		xlog.Warn("Failed to reap stale partial downloads", "error", cErr)
+	} else if removed > 0 {
+		xlog.Info("Reaped stale partial downloads", "count", removed)
+	}
 	if options.GeneratedContentDir != "" {
 		err := os.MkdirAll(options.GeneratedContentDir, 0o750)
 		if err != nil {
diff --git a/core/services/galleryop/service.go b/core/services/galleryop/service.go
index df0352e99..5b611d41e 100644
--- a/core/services/galleryop/service.go
+++ b/core/services/galleryop/service.go
@@ -11,6 +11,7 @@ import (
 	"github.com/mudler/LocalAI/core/gallery"
 	"github.com/mudler/LocalAI/core/services/distributed"
 	"github.com/mudler/LocalAI/core/services/messaging"
+	"github.com/mudler/LocalAI/pkg/downloader"
 	"github.com/mudler/LocalAI/pkg/model"
 	"github.com/mudler/LocalAI/pkg/system"
 	"github.com/mudler/xlog"
@@ -402,6 +403,16 @@ func (g *GalleryService) applyCancel(id string) {
 	}
 }
 
+// newUserCancellableContext returns a child context whose CancelFunc cancels
+// with the downloader.ErrUserCancelled cause. This lets the download layer
+// distinguish a deliberate user cancel (discard the half-downloaded .partial)
+// from an incidental cancellation such as process shutdown (keep the .partial
+// so the next run resumes via Range instead of restarting from zero).
+func newUserCancellableContext(parent context.Context) (context.Context, context.CancelFunc) {
+	ctx, cancelCause := context.WithCancelCause(parent)
+	return ctx, func() { cancelCause(downloader.ErrUserCancelled) }
+}
+
 // storeCancellation stores a cancellation function for an operation
 func (g *GalleryService) storeCancellation(id string, cancelFunc context.CancelFunc) {
 	g.Lock()
@@ -444,7 +455,7 @@ func (g *GalleryService) Start(c context.Context, cl *config.ModelConfigLoader,
 			case op := <-g.BackendGalleryChannel:
 				// Create context if not provided
 				if op.Context == nil {
-					op.Context, op.CancelFunc = context.WithCancel(c)
+					op.Context, op.CancelFunc = newUserCancellableContext(c)
 					g.storeCancellation(op.ID, op.CancelFunc)
 				} else if op.CancelFunc != nil {
 					g.storeCancellation(op.ID, op.CancelFunc)
@@ -472,7 +483,7 @@ func (g *GalleryService) Start(c context.Context, cl *config.ModelConfigLoader,
 			case op := <-g.ModelGalleryChannel:
 				// Create context if not provided
 				if op.Context == nil {
-					op.Context, op.CancelFunc = context.WithCancel(c)
+					op.Context, op.CancelFunc = newUserCancellableContext(c)
 					g.storeCancellation(op.ID, op.CancelFunc)
 				} else if op.CancelFunc != nil {
 					g.storeCancellation(op.ID, op.CancelFunc)
diff --git a/pkg/downloader/cancel_test.go b/pkg/downloader/cancel_test.go
new file mode 100644
index 000000000..76f8a2df5
--- /dev/null
+++ b/pkg/downloader/cancel_test.go
@@ -0,0 +1,148 @@
+package downloader_test
+
+import (
+	"context"
+	"crypto/rand"
+	"crypto/sha256"
+	"errors"
+	"fmt"
+	"net/http"
+	"net/http/httptest"
+	"os"
+	"strconv"
+	"strings"
+	"time"
+
+	. "github.com/mudler/LocalAI/pkg/downloader"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("Download cancellation", func() {
+	var filePath string
+
+	// streamingRangeServer serves data one small chunk at a time with a short
+	// pause between chunks, so a context cancellation can land mid-transfer.
+	// It honors a `bytes=N-` Range request so a second attempt can resume.
+	streamingRangeServer := func(data []byte) *httptest.Server {
+		return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			if r.Method == "HEAD" {
+				w.Header().Set("Accept-Ranges", "bytes")
+				w.WriteHeader(http.StatusOK)
+				return
+			}
+			start := 0
+			if rh := r.Header.Get("Range"); rh != "" {
+				_, _ = fmt.Sscanf(strings.TrimPrefix(rh, "bytes="), "%d-", &start)
+			}
+			w.Header().Set("Content-Length", strconv.Itoa(len(data)-start))
+			if start > 0 {
+				w.WriteHeader(http.StatusPartialContent)
+			} else {
+				w.WriteHeader(http.StatusOK)
+			}
+			f, _ := w.(http.Flusher)
+			for i := start; i < len(data); i += 256 {
+				end := i + 256
+				if end > len(data) {
+					end = len(data)
+				}
+				if _, err := w.Write(data[i:end]); err != nil {
+					return
+				}
+				if f != nil {
+					f.Flush()
+				}
+				time.Sleep(20 * time.Millisecond)
+			}
+		}))
+	}
+
+	BeforeEach(func() {
+		dir, err := os.Getwd()
+		Expect(err).ToNot(HaveOccurred())
+		filePath = dir + "/cancel_model"
+	})
+
+	AfterEach(func() {
+		_ = os.Remove(filePath)
+		_ = os.Remove(filePath + ".partial")
+	})
+
+	It("keeps the .partial file when the context is cancelled so the download can resume", func() {
+		data := make([]byte, 8192)
+		_, err := rand.Read(data)
+		Expect(err).ToNot(HaveOccurred())
+		server := streamingRangeServer(data)
+		defer server.Close()
+
+		ctx, cancel := context.WithCancel(context.Background())
+		go func() {
+			time.Sleep(150 * time.Millisecond)
+			cancel()
+		}()
+
+		err = URI(server.URL).DownloadFileWithContext(ctx, filePath, "", 1, 1, func(s1, s2, s3 string, f float64) {})
+		Expect(err).To(HaveOccurred())
+		Expect(errors.Is(err, context.Canceled)).To(BeTrue())
+
+		info, statErr := os.Stat(filePath + ".partial")
+		Expect(statErr).ToNot(HaveOccurred(),
+			"a cancelled download must leave its .partial behind so the retry resumes instead of restarting from zero")
+		Expect(info.Size()).To(BeNumerically(">", 0))
+		Expect(info.Size()).To(BeNumerically("<", int64(len(data))))
+	})
+
+	It("discards the .partial when the cancellation cause is ErrUserCancelled", func() {
+		data := make([]byte, 8192)
+		_, err := rand.Read(data)
+		Expect(err).ToNot(HaveOccurred())
+		server := streamingRangeServer(data)
+		defer server.Close()
+
+		// A deliberate user abort: cancel WITH the ErrUserCancelled cause. The
+		// half-finished download should not linger on disk.
+		ctx, cancel := context.WithCancelCause(context.Background())
+		go func() {
+			time.Sleep(150 * time.Millisecond)
+			cancel(ErrUserCancelled)
+		}()
+
+		err = URI(server.URL).DownloadFileWithContext(ctx, filePath, "", 1, 1, func(s1, s2, s3 string, f float64) {})
+		Expect(err).To(HaveOccurred())
+		Expect(errors.Is(err, context.Canceled)).To(BeTrue())
+
+		Expect(filePath + ".partial").ToNot(BeAnExistingFile(),
+			"a deliberate user cancel must not leave a dangling .partial behind")
+	})
+
+	It("resumes from the preserved .partial after a cancellation and completes", func() {
+		data := make([]byte, 8192)
+		_, err := rand.Read(data)
+		Expect(err).ToNot(HaveOccurred())
+		sum := sha256.Sum256(data)
+		sha := fmt.Sprintf("%x", sum)
+		server := streamingRangeServer(data)
+		defer server.Close()
+
+		// First attempt: cancel mid-stream.
+		ctx, cancel := context.WithCancel(context.Background())
+		go func() {
+			time.Sleep(150 * time.Millisecond)
+			cancel()
+		}()
+		err = URI(server.URL).DownloadFileWithContext(ctx, filePath, sha, 1, 1, func(s1, s2, s3 string, f float64) {})
+		Expect(err).To(HaveOccurred())
+		partialInfo, statErr := os.Stat(filePath + ".partial")
+		Expect(statErr).ToNot(HaveOccurred())
+		resumedFrom := partialInfo.Size()
+		Expect(resumedFrom).To(BeNumerically(">", 0))
+
+		// Second attempt: fresh context, must resume and finish with a valid SHA.
+		err = URI(server.URL).DownloadFileWithContext(context.Background(), filePath, sha, 1, 1, func(s1, s2, s3 string, f float64) {})
+		Expect(err).ToNot(HaveOccurred())
+		final, rerr := os.ReadFile(filePath)
+		Expect(rerr).ToNot(HaveOccurred())
+		Expect(final).To(Equal(data))
+	})
+})
diff --git a/pkg/downloader/partial.go b/pkg/downloader/partial.go
new file mode 100644
index 000000000..f816bb09f
--- /dev/null
+++ b/pkg/downloader/partial.go
@@ -0,0 +1,69 @@
+package downloader
+
+import (
+	"io/fs"
+	"os"
+	"path/filepath"
+	"strings"
+	"time"
+
+	"github.com/mudler/xlog"
+)
+
+// PartialFileSuffix marks an in-progress download. The success path renames the
+// partial to its final name, so any leftover with this suffix is an unfinished
+// transfer.
+const PartialFileSuffix = ".partial"
+
+// CleanupStalePartialFiles removes *.partial files under root whose last
+// modification is older than olderThan, returning the number removed. These are
+// abandoned downloads left by a process killed mid-transfer (OOM, restart) or
+// by a stall whose cleanup never ran; without reaping they accumulate and can
+// fill the models volume. A still-in-progress download touches its .partial on
+// every write, so a generous olderThan never trims an active transfer.
+//
+// A missing root is not an error (nothing to clean). Unreadable entries are
+// skipped so one bad file does not abort the whole sweep.
+func CleanupStalePartialFiles(root string, olderThan time.Duration) (int, error) {
+	if _, err := os.Stat(root); err != nil {
+		if os.IsNotExist(err) {
+			return 0, nil
+		}
+		return 0, err
+	}
+
+	cutoff := time.Now().Add(-olderThan)
+
+	// Collect candidates during the walk and delete them afterwards rather than
+	// mutating the tree from inside the WalkDir callback (avoids the symlink
+	// TOCTOU class flagged by gosec G122, and never removes an entry mid-walk).
+	var stale []string
+	err := filepath.WalkDir(root, func(path string, d fs.DirEntry, walkErr error) error {
+		if walkErr != nil {
+			return nil // skip unreadable subtree, keep going
+		}
+		if d.IsDir() || !strings.HasSuffix(d.Name(), PartialFileSuffix) {
+			return nil
+		}
+		info, err := d.Info()
+		if err != nil || info.ModTime().After(cutoff) {
+			return nil
+		}
+		stale = append(stale, path)
+		return nil
+	})
+	if err != nil {
+		return 0, err
+	}
+
+	removed := 0
+	for _, path := range stale {
+		if err := os.Remove(path); err != nil {
+			xlog.Warn("failed to remove stale partial download", "file", path, "error", err)
+			continue
+		}
+		removed++
+		xlog.Info("removed stale partial download", "file", path)
+	}
+	return removed, nil
+}
diff --git a/pkg/downloader/partial_test.go b/pkg/downloader/partial_test.go
new file mode 100644
index 000000000..ceec8417f
--- /dev/null
+++ b/pkg/downloader/partial_test.go
@@ -0,0 +1,53 @@
+package downloader_test
+
+import (
+	"os"
+	"path/filepath"
+	"time"
+
+	. "github.com/mudler/LocalAI/pkg/downloader"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("CleanupStalePartialFiles", func() {
+	var root string
+
+	BeforeEach(func() {
+		var err error
+		root, err = os.MkdirTemp("", "partials")
+		Expect(err).ToNot(HaveOccurred())
+	})
+
+	AfterEach(func() {
+		_ = os.RemoveAll(root)
+	})
+
+	It("removes stale .partial files (recursively) while keeping fresh ones and completed files", func() {
+		nested := filepath.Join(root, "llama-cpp", "models", "foo")
+		Expect(os.MkdirAll(nested, 0755)).To(Succeed())
+
+		stale := filepath.Join(nested, "model.gguf.partial")
+		fresh := filepath.Join(root, "fresh.gguf.partial")
+		completed := filepath.Join(root, "done.gguf")
+		for _, f := range []string{stale, fresh, completed} {
+			Expect(os.WriteFile(f, []byte("data"), 0644)).To(Succeed())
+		}
+		old := time.Now().Add(-2 * time.Hour)
+		Expect(os.Chtimes(stale, old, old)).To(Succeed())
+
+		removed, err := CleanupStalePartialFiles(root, time.Hour)
+		Expect(err).ToNot(HaveOccurred())
+		Expect(removed).To(Equal(1))
+
+		Expect(stale).ToNot(BeAnExistingFile())
+		Expect(fresh).To(BeAnExistingFile())
+		Expect(completed).To(BeAnExistingFile())
+	})
+
+	It("returns no error when the root directory does not exist", func() {
+		removed, err := CleanupStalePartialFiles(filepath.Join(root, "does-not-exist"), time.Hour)
+		Expect(err).ToNot(HaveOccurred())
+		Expect(removed).To(Equal(0))
+	})
+})
diff --git a/pkg/downloader/stall.go b/pkg/downloader/stall.go
new file mode 100644
index 000000000..697ad25d9
--- /dev/null
+++ b/pkg/downloader/stall.go
@@ -0,0 +1,77 @@
+package downloader
+
+import (
+	"fmt"
+	"io"
+	"sync"
+	"time"
+)
+
+// DownloadStallTimeout bounds how long an in-flight download may receive no
+// data before it is aborted. A silently-dropped TCP connection (no FIN/RST)
+// would otherwise block the body read forever, freezing an install at N bytes
+// until an external reaper kills it. Overridable (tests set it small); a value
+// <= 0 disables the guard.
+var DownloadStallTimeout = 60 * time.Second
+
+// idleTimeoutReader wraps a streaming ReadCloser and aborts reads that make no
+// progress within timeout. A standard io.Copy blocks indefinitely on a Read
+// against a dead-but-unclosed socket; nothing in the copy loop can interrupt a
+// blocked syscall. The watchdog timer closes the underlying reader on expiry,
+// which unblocks the in-flight Read with an error. Each read that returns data
+// resets the idle clock, so a slow-but-steady transfer never trips the guard.
+type idleTimeoutReader struct {
+	rc      io.ReadCloser
+	timeout time.Duration
+
+	mu    sync.Mutex
+	timer *time.Timer
+	fired bool
+	done  bool
+}
+
+func newIdleTimeoutReader(rc io.ReadCloser, timeout time.Duration) *idleTimeoutReader {
+	r := &idleTimeoutReader{rc: rc, timeout: timeout}
+	r.timer = time.AfterFunc(timeout, r.onStall)
+	return r
+}
+
+// onStall fires when no data has arrived within the timeout. Closing the
+// underlying reader is what unblocks a Read parked in the kernel.
+func (r *idleTimeoutReader) onStall() {
+	r.mu.Lock()
+	if r.done {
+		r.mu.Unlock()
+		return
+	}
+	r.fired = true
+	r.mu.Unlock()
+	_ = r.rc.Close()
+}
+
+func (r *idleTimeoutReader) Read(p []byte) (int, error) {
+	n, err := r.rc.Read(p)
+	if n > 0 {
+		r.timer.Reset(r.timeout)
+	}
+	if err != nil {
+		r.mu.Lock()
+		fired := r.fired
+		r.mu.Unlock()
+		if fired {
+			// Translate the "use of closed connection" the watchdog induced
+			// into an actionable stall error. This is not context.Canceled,
+			// so the caller keeps the .partial file for a later resume.
+			return n, fmt.Errorf("download stalled: no data received for %s", r.timeout)
+		}
+	}
+	return n, err
+}
+
+func (r *idleTimeoutReader) Close() error {
+	r.mu.Lock()
+	r.done = true
+	r.mu.Unlock()
+	r.timer.Stop()
+	return r.rc.Close()
+}
diff --git a/pkg/downloader/stall_test.go b/pkg/downloader/stall_test.go
new file mode 100644
index 000000000..8e6a003c6
--- /dev/null
+++ b/pkg/downloader/stall_test.go
@@ -0,0 +1,131 @@
+package downloader_test
+
+import (
+	"context"
+	"net/http"
+	"net/http/httptest"
+	"os"
+	"time"
+
+	. "github.com/mudler/LocalAI/pkg/downloader"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("Download stall timeout", func() {
+	var filePath string
+	var savedTimeout time.Duration
+
+	BeforeEach(func() {
+		dir, err := os.Getwd()
+		Expect(err).ToNot(HaveOccurred())
+		filePath = dir + "/stall_model"
+		savedTimeout = DownloadStallTimeout
+	})
+
+	AfterEach(func() {
+		DownloadStallTimeout = savedTimeout
+		_ = os.Remove(filePath)
+		_ = os.Remove(filePath + ".partial")
+	})
+
+	It("aborts a download that stalls mid-stream instead of hanging forever", func() {
+		// Server sends a chunk, flushes, then blocks forever without closing
+		// the connection — a silently-dropped TCP stream. Without a stall
+		// guard the body Read blocks indefinitely and DownloadFile never
+		// returns.
+		release := make(chan struct{})
+		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			if r.Method == "HEAD" {
+				w.Header().Set("Accept-Ranges", "bytes")
+				w.WriteHeader(http.StatusOK)
+				return
+			}
+			w.WriteHeader(http.StatusOK)
+			_, _ = w.Write(make([]byte, 4096))
+			if f, ok := w.(http.Flusher); ok {
+				f.Flush()
+			}
+			<-release // hang: no more data, never close
+		}))
+		defer server.Close()
+		defer close(release)
+
+		DownloadStallTimeout = 300 * time.Millisecond
+
+		done := make(chan error, 1)
+		go func() {
+			done <- URI(server.URL).DownloadFileWithContext(
+				context.Background(), filePath, "", 1, 1,
+				func(s1, s2, s3 string, f float64) {})
+		}()
+
+		var err error
+		Eventually(done, "5s").Should(Receive(&err))
+		Expect(err).To(HaveOccurred())
+		Expect(err.Error()).To(ContainSubstring("stall"))
+	})
+
+	It("preserves the .partial file when a download stalls so it can resume", func() {
+		release := make(chan struct{})
+		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			if r.Method == "HEAD" {
+				w.Header().Set("Accept-Ranges", "bytes")
+				w.WriteHeader(http.StatusOK)
+				return
+			}
+			w.WriteHeader(http.StatusOK)
+			_, _ = w.Write(make([]byte, 4096))
+			if f, ok := w.(http.Flusher); ok {
+				f.Flush()
+			}
+			<-release
+		}))
+		defer server.Close()
+		defer close(release)
+
+		DownloadStallTimeout = 300 * time.Millisecond
+
+		done := make(chan error, 1)
+		go func() {
+			done <- URI(server.URL).DownloadFileWithContext(
+				context.Background(), filePath, "", 1, 1,
+				func(s1, s2, s3 string, f float64) {})
+		}()
+		Eventually(done, "5s").Should(Receive(HaveOccurred()))
+
+		info, statErr := os.Stat(filePath + ".partial")
+		Expect(statErr).ToNot(HaveOccurred(), "the .partial must survive a stall so the next attempt can resume")
+		Expect(info.Size()).To(BeNumerically(">", 0))
+	})
+
+	It("does not abort a slow-but-steady download", func() {
+		// One byte every 100ms keeps the idle clock from ever expiring even
+		// though the total transfer outlasts the stall timeout.
+		payload := make([]byte, 12)
+		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			if r.Method == "HEAD" {
+				w.Header().Set("Accept-Ranges", "bytes")
+				w.WriteHeader(http.StatusOK)
+				return
+			}
+			w.WriteHeader(http.StatusOK)
+			f, _ := w.(http.Flusher)
+			for i := range payload {
+				_, _ = w.Write(payload[i : i+1])
+				if f != nil {
+					f.Flush()
+				}
+				time.Sleep(100 * time.Millisecond)
+			}
+		}))
+		defer server.Close()
+
+		DownloadStallTimeout = 300 * time.Millisecond
+
+		err := URI(server.URL).DownloadFileWithContext(
+			context.Background(), filePath, "", 1, 1,
+			func(s1, s2, s3 string, f float64) {})
+		Expect(err).ToNot(HaveOccurred())
+	})
+})
diff --git a/pkg/downloader/uri.go b/pkg/downloader/uri.go
index 4be1b9081..41bdbe672 100644
--- a/pkg/downloader/uri.go
+++ b/pkg/downloader/uri.go
@@ -330,6 +330,18 @@ func (s URI) ResolveURL() string {
 	return string(s)
 }
 
+// ErrUserCancelled distinguishes a deliberate user abort from an incidental
+// context cancellation (process shutdown, pod restart). Pass it as the cause
+// when cancelling the download context:
+//
+//	ctx, cancel := context.WithCancelCause(parent)
+//	cancel(downloader.ErrUserCancelled) // discards the .partial
+//
+// On a deliberate cancel the downloader removes the .partial (the user does not
+// want a half-download lingering). On a plain cancellation it keeps the .partial
+// so the next run resumes via Range instead of restarting from zero.
+var ErrUserCancelled = errors.New("download cancelled by user")
+
 func removePartialFile(tmpFilePath string) error {
 	xlog.Debug("Removing temporary file", "file", tmpFilePath)
 	if err := os.Remove(tmpFilePath); err != nil && !errors.Is(err, os.ErrNotExist) {
@@ -594,11 +606,17 @@ func (uri URI) DownloadFileWithContext(ctx context.Context, filePath, sha string
 		// Start the request
 		resp, err := downloadClient.Do(req)
 		if err != nil {
-			// Check if error is due to context cancellation
-			if errors.Is(err, context.Canceled) {
-				// Clean up partial file on cancellation
-				removePartialFile(tmpFilePath)
-				return err
+			// Detect cancellation via the context, not the returned error: a
+			// request cancelled *with a cause* surfaces the cause error (not
+			// context.Canceled) from the HTTP client. Keep the .partial for
+			// resume on an incidental cancel (shutdown, restart) — large GGUFs
+			// take long enough that deleting progress means they never finish —
+			// but discard it on a deliberate user abort (ErrUserCancelled).
+			if ctx.Err() != nil {
+				if errors.Is(context.Cause(ctx), ErrUserCancelled) {
+					_ = removePartialFile(tmpFilePath)
+				}
+				return ctx.Err()
 			}
 			return fmt.Errorf("failed to download file %q: %v", filePath, err)
 		}
@@ -608,6 +626,13 @@ func (uri URI) DownloadFileWithContext(ctx context.Context, filePath, sha string
 			return fmt.Errorf("failed to download url %q, invalid status code %d", url, resp.StatusCode)
 		}
 		source = resp.Body
+		// Guard against a silently-stalled stream: a dropped TCP connection
+		// that never sends FIN/RST would otherwise block the body Read (and
+		// thus the whole install) forever. The watchdog aborts after a window
+		// of zero progress; the .partial is kept for a later resume.
+		if DownloadStallTimeout > 0 {
+			source = newIdleTimeoutReader(resp.Body, DownloadStallTimeout)
+		}
 		contentLength = resp.ContentLength
 	}
 	defer source.Close()
@@ -640,19 +665,27 @@ func (uri URI) DownloadFileWithContext(ctx context.Context, filePath, sha string
 
 	_, err = xio.Copy(ctx, io.MultiWriter(outFile, progress), source)
 	if err != nil {
-		// Check if error is due to context cancellation
-		if errors.Is(err, context.Canceled) {
-			// Clean up partial file on cancellation
-			removePartialFile(tmpFilePath)
-			return err
+		// Detect cancellation via the context (a cause-cancelled read surfaces
+		// the cause, not context.Canceled). Keep the .partial for resume,
+		// except on a deliberate user abort (ErrUserCancelled), which discards
+		// it. A stall-guard abort leaves ctx uncancelled, so it falls through
+		// to the error path below and likewise preserves the partial.
+		if ctx.Err() != nil {
+			if errors.Is(context.Cause(ctx), ErrUserCancelled) {
+				_ = removePartialFile(tmpFilePath)
+			}
+			return ctx.Err()
 		}
 		return fmt.Errorf("failed to write file %q: %v", filePath, err)
 	}
 
-	// Check for cancellation before finalizing
+	// Check for cancellation before finalizing. Keep the .partial for resume
+	// unless the user deliberately aborted.
 	select {
 	case <-ctx.Done():
-		removePartialFile(tmpFilePath)
+		if errors.Is(context.Cause(ctx), ErrUserCancelled) {
+			_ = removePartialFile(tmpFilePath)
+		}
 		return ctx.Err()
 	default:
 	}

From 079ac0e15abb51c8330946dec5f506c7a1fdb3f7 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Fri, 19 Jun 2026 21:36:25 +0200
Subject: [PATCH 06/99] fix(realtime): raise WebRTC data-channel
 max-message-size + keep sendLoop alive (#10407)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix(realtime): raise WebRTC data-channel max-message-size for large events

Browsers advertise a conservative SCTP max-message-size in their SDP offer
(Chrome uses 256 KiB). pion enforces the remote's advertised value on send, so
a single realtime event larger than it cannot be sent over the "oai-events"
data channel: SendText fails, the event is dropped, and the turn silently
yields no response. Some turns legitimately produce a >256 KiB JSON event —
notably tool calls with sizeable schemas or results.

Browsers advertise the value conservatively but their SCTP stacks reassemble
much larger messages, so raise the max-message-size honored for our own
server-generated events by rewriting the attribute in the offer before
SetRemoteDescription.

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix(realtime): keep the WebRTC sendLoop alive when one event send fails

A failed SendText on the oai-events data channel exited the sender goroutine,
so a single dropped event (e.g. one over the negotiated SCTP max-message-size)
tore down the session and silently dropped every subsequent event. Log and skip
the offending event instead and keep draining; a genuinely dead transport is
still handled by the closed / connection-state path.

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
---
 .../openai/realtime_transport_webrtc.go       | 12 +++++--
 core/http/endpoints/openai/realtime_webrtc.go |  7 ++--
 .../endpoints/openai/realtime_webrtc_sctp.go  | 29 ++++++++++++++++
 .../openai/realtime_webrtc_sctp_test.go       | 33 +++++++++++++++++++
 4 files changed, 76 insertions(+), 5 deletions(-)
 create mode 100644 core/http/endpoints/openai/realtime_webrtc_sctp.go
 create mode 100644 core/http/endpoints/openai/realtime_webrtc_sctp_test.go

diff --git a/core/http/endpoints/openai/realtime_transport_webrtc.go b/core/http/endpoints/openai/realtime_transport_webrtc.go
index b687654bd..9ddec5edb 100644
--- a/core/http/endpoints/openai/realtime_transport_webrtc.go
+++ b/core/http/endpoints/openai/realtime_transport_webrtc.go
@@ -113,8 +113,13 @@ func (t *WebRTCTransport) sendLoop() {
 				return
 			}
 			if err := t.dc.SendText(string(data)); err != nil {
-				xlog.Error("data channel send failed", "error", err)
-				return
+				// Drop just this event and keep the loop alive: a single
+				// failed send (e.g. an event over the negotiated SCTP
+				// max-message-size) must not tear down the session and
+				// silently drop every subsequent event. A genuinely dead
+				// transport is handled by the <-t.closed case.
+				xlog.Error("data channel send failed, dropping event", "error", err)
+				continue
 			}
 		case <-t.closed:
 			// Drain any remaining queued events before exiting
@@ -122,7 +127,8 @@ func (t *WebRTCTransport) sendLoop() {
 				select {
 				case data := <-t.outEvents:
 					if err := t.dc.SendText(string(data)); err != nil {
-						return
+						xlog.Error("data channel send failed while draining, dropping event", "error", err)
+						continue
 					}
 				default:
 					return
diff --git a/core/http/endpoints/openai/realtime_webrtc.go b/core/http/endpoints/openai/realtime_webrtc.go
index 0ac982c19..26edf94ea 100644
--- a/core/http/endpoints/openai/realtime_webrtc.go
+++ b/core/http/endpoints/openai/realtime_webrtc.go
@@ -128,10 +128,13 @@ func RealtimeCalls(application *application.Application) echo.HandlerFunc {
 			handleIncomingAudioTrack(track, transport)
 		})
 
-		// Set the remote SDP (client's offer)
+		// Set the remote SDP (client's offer). Raise the data-channel
+		// max-message-size the browser advertised so pion permits the larger
+		// realtime events some turns produce (e.g. tool calls), which would
+		// otherwise be dropped on send. See realtime_webrtc_sctp.go.
 		if err := pc.SetRemoteDescription(webrtc.SessionDescription{
 			Type: webrtc.SDPTypeOffer,
-			SDP:  req.SDP,
+			SDP:  raiseDataChannelMaxMessageSize(req.SDP),
 		}); err != nil {
 			transport.Close()
 			xlog.Error("failed to set remote description", "error", err)
diff --git a/core/http/endpoints/openai/realtime_webrtc_sctp.go b/core/http/endpoints/openai/realtime_webrtc_sctp.go
new file mode 100644
index 000000000..b0355ba70
--- /dev/null
+++ b/core/http/endpoints/openai/realtime_webrtc_sctp.go
@@ -0,0 +1,29 @@
+package openai
+
+import (
+	"fmt"
+	"regexp"
+)
+
+// realtimeDataChannelMaxMessageSize is the SCTP max-message-size LocalAI honors
+// for the "oai-events" data channel, in bytes.
+//
+// Browsers advertise a conservative max-message-size in their SDP offer (Chrome
+// uses 262144 = 256 KiB). pion enforces the remote's advertised value on send,
+// so a single realtime event larger than it cannot be sent: the SendText fails,
+// the event is dropped, and the turn silently yields no response. Some turns
+// legitimately produce a single JSON event above 256 KiB (notably tool calls
+// with sizeable schemas or results). Browsers advertise this value
+// conservatively but their SCTP stacks reassemble much larger messages, so we
+// raise the value honored for our own server-generated events.
+const realtimeDataChannelMaxMessageSize = 16 * 1024 * 1024 // 16 MiB
+
+var maxMessageSizeAttrRe = regexp.MustCompile(`a=max-message-size:\d+`)
+
+// raiseDataChannelMaxMessageSize rewrites the SCTP max-message-size attribute in
+// an SDP offer to realtimeDataChannelMaxMessageSize so pion permits larger
+// outbound realtime events. Offers that don't carry the attribute are returned
+// unchanged.
+func raiseDataChannelMaxMessageSize(sdp string) string {
+	return maxMessageSizeAttrRe.ReplaceAllString(sdp, fmt.Sprintf("a=max-message-size:%d", realtimeDataChannelMaxMessageSize))
+}
diff --git a/core/http/endpoints/openai/realtime_webrtc_sctp_test.go b/core/http/endpoints/openai/realtime_webrtc_sctp_test.go
new file mode 100644
index 000000000..92da4e706
--- /dev/null
+++ b/core/http/endpoints/openai/realtime_webrtc_sctp_test.go
@@ -0,0 +1,33 @@
+package openai
+
+import (
+	"fmt"
+	"strings"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("raiseDataChannelMaxMessageSize", func() {
+	It("raises a max-message-size the browser advertised", func() {
+		offer := "v=0\r\nm=application 9 UDP/DTLS/SCTP webrtc-datachannel\r\na=max-message-size:262144\r\n"
+		out := raiseDataChannelMaxMessageSize(offer)
+		Expect(out).To(ContainSubstring(fmt.Sprintf("a=max-message-size:%d", realtimeDataChannelMaxMessageSize)))
+		Expect(out).NotTo(ContainSubstring("a=max-message-size:262144"))
+	})
+
+	It("leaves an offer without the attribute unchanged", func() {
+		offer := "v=0\r\nm=application 9 UDP/DTLS/SCTP webrtc-datachannel\r\n"
+		Expect(raiseDataChannelMaxMessageSize(offer)).To(Equal(offer))
+	})
+
+	It("rewrites every occurrence", func() {
+		offer := "a=max-message-size:1024\r\na=max-message-size:262144\r\n"
+		out := raiseDataChannelMaxMessageSize(offer)
+		Expect(strings.Count(out, fmt.Sprintf("a=max-message-size:%d", realtimeDataChannelMaxMessageSize))).To(Equal(2))
+	})
+
+	It("raises above the 256 KiB browsers advertise", func() {
+		Expect(realtimeDataChannelMaxMessageSize).To(BeNumerically(">", 262144))
+	})
+})

From c43a752afc288bb8691a6843e9cd8f1d51fc5e31 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Sat, 20 Jun 2026 01:36:22 +0200
Subject: [PATCH 07/99] chore: :arrow_up: Update ServeurpersoCom/omnivoice.cpp
 to `96d30169afd5e6bb3fd6a0e9be0eb505bfe81fcd` (#10408)

:arrow_up: Update ServeurpersoCom/omnivoice.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
---
 backend/go/omnivoice-cpp/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/go/omnivoice-cpp/Makefile b/backend/go/omnivoice-cpp/Makefile
index 7806ce11f..b42610aac 100644
--- a/backend/go/omnivoice-cpp/Makefile
+++ b/backend/go/omnivoice-cpp/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
 
 # omnivoice.cpp version
 OMNIVOICE_REPO?=https://github.com/ServeurpersoCom/omnivoice.cpp
-OMNIVOICE_VERSION?=2603355a5dfacae5cfc33531d5d0933221843509
+OMNIVOICE_VERSION?=96d30169afd5e6bb3fd6a0e9be0eb505bfe81fcd
 SO_TARGET?=libgomnivoicecpp.so
 
 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF

From dd928f0bddd699aaf051574a23ce9049200e212e Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Sat, 20 Jun 2026 01:36:36 +0200
Subject: [PATCH 08/99] chore: :arrow_up: Update ServeurpersoCom/qwentts.cpp to
 `26fcea5468e4069bc72d1f2fcc812c985e7361bb` (#10409)

:arrow_up: Update ServeurpersoCom/qwentts.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
---
 backend/go/qwen3-tts-cpp/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/go/qwen3-tts-cpp/Makefile b/backend/go/qwen3-tts-cpp/Makefile
index e5f6a838f..84c543af6 100644
--- a/backend/go/qwen3-tts-cpp/Makefile
+++ b/backend/go/qwen3-tts-cpp/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
 
 # qwentts.cpp version
 QWEN3TTS_REPO?=https://github.com/ServeurpersoCom/qwentts.cpp
-QWEN3TTS_CPP_VERSION?=0bf4a18b22e8bb8718d95294e9f7f45c0d4270a4
+QWEN3TTS_CPP_VERSION?=26fcea5468e4069bc72d1f2fcc812c985e7361bb
 SO_TARGET?=libgoqwen3ttscpp.so
 
 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF

From f143d7f6885596ae228f99338604b86fe0c53f4c Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Sat, 20 Jun 2026 01:36:51 +0200
Subject: [PATCH 09/99] chore: :arrow_up: Update ikawrakow/ik_llama.cpp to
 `d47f484d299cafad2e606afc0d31677a91b242d0` (#10410)

:arrow_up: Update ikawrakow/ik_llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
---
 backend/cpp/ik-llama-cpp/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/cpp/ik-llama-cpp/Makefile b/backend/cpp/ik-llama-cpp/Makefile
index 85b7ee4a8..39fa7fa4e 100644
--- a/backend/cpp/ik-llama-cpp/Makefile
+++ b/backend/cpp/ik-llama-cpp/Makefile
@@ -1,5 +1,5 @@
 
-IK_LLAMA_VERSION?=b3dfb7858cfcb9166e92f366e5af87f19ebc94be
+IK_LLAMA_VERSION?=d47f484d299cafad2e606afc0d31677a91b242d0
 LLAMA_REPO?=https://github.com/ikawrakow/ik_llama.cpp
 
 CMAKE_ARGS?=

From 8915f2ab917f90c3bb6a734b15bf95f72189ba06 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Sat, 20 Jun 2026 01:37:06 +0200
Subject: [PATCH 10/99] chore: :arrow_up: Update ggml-org/whisper.cpp to
 `5ed76e9a079962f1c85cfce44edd325c27ef1f97` (#10396)

:arrow_up: Update ggml-org/whisper.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
---
 backend/go/whisper/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/go/whisper/Makefile b/backend/go/whisper/Makefile
index e291e4d62..9858b1d07 100644
--- a/backend/go/whisper/Makefile
+++ b/backend/go/whisper/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
 
 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp
-WHISPER_CPP_VERSION?=86c40c3bd6fc86f1187fb751d111b49e0fc18e84
+WHISPER_CPP_VERSION?=5ed76e9a079962f1c85cfce44edd325c27ef1f97
 SO_TARGET?=libgowhisper.so
 
 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF

From 11aee03a805faa5ab72034afbef06a4037f737ef Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Sat, 20 Jun 2026 01:37:21 +0200
Subject: [PATCH 11/99] chore: :arrow_up: Update localai-org/privacy-filter.cpp
 to `98f52c5ef2250f207cc6b9a6aef05393a120cb7c` (#10394)

:arrow_up: Update localai-org/privacy-filter.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
---
 backend/cpp/privacy-filter/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/cpp/privacy-filter/Makefile b/backend/cpp/privacy-filter/Makefile
index 173d4176b..774f2c433 100644
--- a/backend/cpp/privacy-filter/Makefile
+++ b/backend/cpp/privacy-filter/Makefile
@@ -8,7 +8,7 @@
 # Local development: point at a working checkout instead of cloning, e.g.
 #   make PRIVACY_FILTER_SRC=$HOME/c/privacy-filter.cpp grpc-server
 
-PRIVACY_FILTER_VERSION?=646342f7a59c6b7d195185eac60bad762e572f1d
+PRIVACY_FILTER_VERSION?=98f52c5ef2250f207cc6b9a6aef05393a120cb7c
 PRIVACY_FILTER_REPO?=https://github.com/localai-org/privacy-filter.cpp
 PRIVACY_FILTER_SRC?=
 

From 93706fec57c98d689a10737cdc042c1a29ce7969 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Sat, 20 Jun 2026 01:37:33 +0200
Subject: [PATCH 12/99] chore: :arrow_up: Update mudler/parakeet.cpp to
 `db755a78d39f789bb7d4e3935158a9e8105dbe36` (#10393)

:arrow_up: Update mudler/parakeet.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
---
 backend/go/parakeet-cpp/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/backend/go/parakeet-cpp/Makefile b/backend/go/parakeet-cpp/Makefile
index 2ea86a0c6..9a781d634 100644
--- a/backend/go/parakeet-cpp/Makefile
+++ b/backend/go/parakeet-cpp/Makefile
@@ -1,6 +1,6 @@
 # parakeet-cpp backend Makefile.
 #
-# Upstream pin lives below as PARAKEET_VERSION?=92a5f0306be354c109150fe58ae4cc4f8a21ca45
+# Upstream pin lives below as PARAKEET_VERSION?=db755a78d39f789bb7d4e3935158a9e8105dbe36
 # (.github/bump_deps.sh) can find and update it - matches the
 # whisper.cpp / ds4 / vibevoice-cpp convention.
 #
@@ -15,7 +15,7 @@
 # That's what the L0 smoke test uses. The default target below does the
 # proper clone-at-pin + cmake build so CI doesn't need a side-checkout.
 
-PARAKEET_VERSION?=92a5f0306be354c109150fe58ae4cc4f8a21ca45
+PARAKEET_VERSION?=db755a78d39f789bb7d4e3935158a9e8105dbe36
 PARAKEET_REPO?=https://github.com/mudler/parakeet.cpp
 
 GOCMD?=go

From 518381278e362423ab70aaa8ad23c7b44ee13a03 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Sat, 20 Jun 2026 08:22:22 +0200
Subject: [PATCH 13/99] chore: :arrow_up: Update ggml-org/llama.cpp to
 `e475fa2b5f9fb50c3d6fc3e7c6fdf1e004465b62` (#10392)

* :arrow_up: Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* fix(llama-cpp): adapt grpc-server to upstream server-schema split

Upstream llama.cpp (e475fa2) extracted the JSON request-schema evaluation
out of the static server_task::params_from_json_cmpl into the new
server_schema::eval_llama_cmpl_schema (tools/server/server-schema.cpp).
The grpc-server unity build still called the old static member, breaking
every llama-cpp backend build with "no member named 'params_from_json_cmpl'
in 'server_task'".

Pull server-schema.cpp into the translation unit and call the new function,
keeping both guarded by __has_include so forks that predate the split (e.g.
llama-cpp-turboquant, which still exposes params_from_json_cmpl) keep
compiling against the old static member.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

---------

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
---
 backend/cpp/llama-cpp/Makefile        |  2 +-
 backend/cpp/llama-cpp/grpc-server.cpp | 24 ++++++++++++++++++++++--
 2 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/backend/cpp/llama-cpp/Makefile b/backend/cpp/llama-cpp/Makefile
index 64414ec30..bf9f4f608 100644
--- a/backend/cpp/llama-cpp/Makefile
+++ b/backend/cpp/llama-cpp/Makefile
@@ -1,5 +1,5 @@
 
-LLAMA_VERSION?=f3e182816421c648188b5eab269853bf1531d950
+LLAMA_VERSION?=e475fa2b5f9fb50c3d6fc3e7c6fdf1e004465b62
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
 
 CMAKE_ARGS?=
diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp
index 8502e9530..c2e7f22e4 100644
--- a/backend/cpp/llama-cpp/grpc-server.cpp
+++ b/backend/cpp/llama-cpp/grpc-server.cpp
@@ -18,6 +18,18 @@
 #if __has_include("server-chat.cpp")
 #include "server-chat.cpp"
 #endif
+// server-schema.cpp exists only in llama.cpp after the upstream refactor that
+// extracted the JSON request-schema evaluation (previously the static
+// server_task::params_from_json_cmpl) into server_schema::eval_llama_cmpl_schema.
+// server-context.cpp and grpc-server.cpp both call into it, so its definitions
+// must be part of this translation unit or the link fails. __has_include keeps
+// the source compatible with older pins/forks (e.g. llama-cpp-turboquant) that
+// predate the split and still expose params_from_json_cmpl (see the guarded
+// call sites below).
+#if __has_include("server-schema.cpp")
+#define LOCALAI_HAS_SERVER_SCHEMA 1
+#include "server-schema.cpp"
+#endif
 #include "server-context.cpp"
 
 // LocalAI
@@ -2102,7 +2114,11 @@ public:
                 task.index = i;
 
                 task.tokens    = std::move(inputs[i]);
+#ifdef LOCALAI_HAS_SERVER_SCHEMA
+                task.params           = server_schema::eval_llama_cmpl_schema(
+#else
                 task.params           = server_task::params_from_json_cmpl(
+#endif
                         ctx_server.impl->vocab,
                         params_base,
                         ctx_server.get_meta().slot_n_ctx,
@@ -2116,7 +2132,7 @@ public:
                 // cannot detect tool calls or separate reasoning from content.
                 task.params.res_type                 = TASK_RESPONSE_TYPE_OAI_CHAT;
                 task.params.oaicompat_cmpl_id         = completion_id;
-                // oaicompat_model is already populated by params_from_json_cmpl
+                // oaicompat_model is already populated by eval_llama_cmpl_schema
 
                 tasks.push_back(std::move(task));
             }
@@ -2940,7 +2956,11 @@ public:
                 task.index = i;
 
                 task.tokens    = std::move(inputs[i]);
+#ifdef LOCALAI_HAS_SERVER_SCHEMA
+                task.params           = server_schema::eval_llama_cmpl_schema(
+#else
                 task.params           = server_task::params_from_json_cmpl(
+#endif
                         ctx_server.impl->vocab,
                         params_base,
                         ctx_server.get_meta().slot_n_ctx,
@@ -2952,7 +2972,7 @@ public:
                 // reasoning, tool calls, and content are classified into ChatDeltas.
                 task.params.res_type                 = TASK_RESPONSE_TYPE_OAI_CHAT;
                 task.params.oaicompat_cmpl_id         = completion_id;
-                // oaicompat_model is already populated by params_from_json_cmpl
+                // oaicompat_model is already populated by eval_llama_cmpl_schema
 
                 tasks.push_back(std::move(task));
             }

From 1be959ce30e68ed686a630932dba2754a6d5fed9 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Sat, 20 Jun 2026 11:04:56 +0200
Subject: [PATCH 14/99] docs: mention apex-quant in the README (#10412)

Add apex-quant (MoE per-tensor/per-layer quantization recipe) to the
"Backends built by us" section as a note after the engines table, since
it is a quantization recipe rather than a native inference engine.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index b05af2dfb..5fff7db69 100644
--- a/README.md
+++ b/README.md
@@ -240,6 +240,8 @@ Most backends wrap a best-in-class upstream engine. A handful of them are native
 | [LocalVQE](https://github.com/localai-org/LocalVQE) | Joint acoustic echo cancellation, noise suppression, and dereverberation |
 | [local-store](https://github.com/mudler/LocalAI) | Local-first vector database for embeddings (shipped in-tree) |
 
+We also maintain [apex-quant](https://github.com/localai-org/apex-quant), a per-tensor, per-layer quantization recipe for Mixture-of-Experts models that exploits their structural sparsity to produce GGUFs matching or beating Q8_0 quality - and they run out of the box on stock llama.cpp.
+
 ## Resources
 
 - [Documentation](https://localai.io/)

From b081247d95bff5bde5e5147e1add185e1fbc1a31 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Sat, 20 Jun 2026 14:45:59 +0200
Subject: [PATCH 15/99] =?UTF-8?q?feat(config):=20hardware-tuned=20defaults?=
 =?UTF-8?q?=20=E2=80=94=20Blackwell=20batch=20+=20VRAM-scaled=20concurrenc?=
 =?UTF-8?q?y=20(#10411)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* feat(config): node-aware hardware defaults — larger physical batch on Blackwell

A larger physical batch (n_batch/n_ubatch) materially lifts MoE prefill on
NVIDIA Blackwell consumer GPUs (sm_120/121, incl. GB10 / DGX Spark) — measured
on a GB10 with Qwen3-Coder-30B-A3B, the prefill ceiling rises (ub512 ~2994 ->
ub2048 ~3316 t/s) and saturates around 2048.

The heuristic lives in core/config alongside the other config overriders
(ApplyInferenceDefaults, guessDefaultsFromFile/NGPULayers) — they all fill the
ModelConfig from heuristics, so hardware tuning is the same domain and stays in
one place. It is parameterized on a GPU descriptor (not direct detection) so it
works in both deployment shapes:

- Single host: SetDefaults applies it with the LocalGPU.
- Distributed: only the worker sees the GPU, so the worker reports its compute
  capability on registration (gpu_compute_capability -> BackendNode), and the
  router re-applies the SAME core/config heuristic for the SELECTED node before
  loading — fixing the case where the frontend has no GPU at all.

Explicit `batch:` always wins (only managed default values are touched).
xsysinfo gains NVIDIAComputeCapability() (detection only); all interpretation
lives in core/config. Tests: core/config, pkg/xsysinfo, core/services/nodes.

Assisted-by: Claude:opus-4.8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* test(config): injectable local-GPU seam + single-instance coverage

Make local GPU detection an injectable package var (localGPU) so the
single-instance path (SetDefaults -> ApplyHardwareDefaults) is deterministically
testable without a real GPU, mirroring the distributed override's coverage.
Adds specs asserting SetDefaults sets the Blackwell physical batch, leaves it
unset on non-Blackwell, and never overrides an explicit batch.

Assisted-by: Claude:opus-4.8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(config): default concurrent serving (n_parallel) by GPU VRAM

The llama.cpp backend defaults n_parallel=1, which serializes multi-user requests
and leaves continuous batching off (it auto-enables only at n_parallel>1). Fold a
VRAM-scaled parallel-slot default into the hardware-config path so multi-user
serving works out of the box: >=32GiB->8, >=8GiB->4, >=4GiB->2, else unchanged.
With the backend's unified KV the slots SHARE the context budget, so this adds
concurrency without multiplying KV memory. Explicit parallel/n_parallel always
wins. EnsureParallelOption is shared by the single-host path (ApplyHardwareDefaults
with the local GPU) and the distributed router (per selected node's reported VRAM,
since the frontend may have no GPU). LocalGPU now also reports VRAM.

Assisted-by: Claude:opus-4.8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
---
 core/config/hardware_defaults.go              | 190 ++++++++++++++++++
 .../config/hardware_defaults_internal_test.go |  37 ++++
 core/config/hardware_defaults_test.go         |  97 +++++++++
 core/config/model_config.go                   |   5 +
 core/http/endpoints/localai/nodes.go          |  48 +++--
 core/services/nodes/registry.go               |   6 +
 core/services/nodes/router.go                 |  30 +++
 .../nodes/router_hardware_internal_test.go    |  46 +++++
 core/services/worker/registration.go          |   5 +
 pkg/xsysinfo/computecap_internal_test.go      |  23 +++
 pkg/xsysinfo/gpu.go                           |  98 ++++++++-
 11 files changed, 553 insertions(+), 32 deletions(-)
 create mode 100644 core/config/hardware_defaults.go
 create mode 100644 core/config/hardware_defaults_internal_test.go
 create mode 100644 core/config/hardware_defaults_test.go
 create mode 100644 core/services/nodes/router_hardware_internal_test.go
 create mode 100644 pkg/xsysinfo/computecap_internal_test.go

diff --git a/core/config/hardware_defaults.go b/core/config/hardware_defaults.go
new file mode 100644
index 000000000..2ed54265f
--- /dev/null
+++ b/core/config/hardware_defaults.go
@@ -0,0 +1,190 @@
+package config
+
+import (
+	"fmt"
+	"strconv"
+	"strings"
+
+	"github.com/mudler/LocalAI/pkg/xsysinfo"
+	"github.com/mudler/xlog"
+)
+
+// Hardware-driven model-config defaults.
+//
+// This sits alongside the other config overriders (ApplyInferenceDefaults for
+// model families, guessDefaultsFromFile for GGUF/NGPULayers): they all
+// heuristically fill ModelConfig values the user left unset. Hardware tuning is
+// the same domain — "adjust the config from the device that will run it" — so
+// it lives here rather than scattered into the backend or a separate package.
+//
+// The heuristics are parameterized on a GPU descriptor (not on direct
+// detection) so they apply in both deployment shapes: SetDefaults passes the
+// LocalGPU on a single host, and the distributed router passes the *selected
+// node's* reported GPU before loading there (the frontend that loaded the
+// config may have no GPU at all).
+
+// GPU describes the device that will run a model.
+type GPU struct {
+	// Vendor is "nvidia", "amd", … (matches xsysinfo vendor constants).
+	Vendor string
+	// ComputeCapability is the NVIDIA compute capability as "major.minor"
+	// (e.g. "12.1" for GB10 / DGX Spark). Empty for non-NVIDIA / unknown.
+	ComputeCapability string
+	// VRAM is total device memory in bytes (0 = unknown).
+	VRAM uint64
+}
+
+// Physical batch (n_batch / n_ubatch) defaults.
+const (
+	// DefaultPhysicalBatch is the conservative default when no hardware-specific
+	// tuning applies. Matches backend.DefaultBatchSize.
+	DefaultPhysicalBatch = 512
+	// BlackwellPhysicalBatch is the default on NVIDIA Blackwell consumer GPUs
+	// (sm_12x: sm_120 RTX 50-series, sm_121 GB10 / DGX Spark). A larger physical
+	// batch materially lifts MoE prefill there (per-expert GEMM tiles fill
+	// better); measured on a GB10 with Qwen3-30B-A3B to saturate around 2048.
+	BlackwellPhysicalBatch = 2048
+)
+
+// IsNVIDIABlackwell reports whether the GPU is in the NVIDIA Blackwell consumer
+// family (sm_12x). Datacenter Blackwell (B100/B200/GB200, sm_100 / cc 10.0)
+// reports a different compute capability and is intentionally not matched.
+func (g GPU) IsNVIDIABlackwell() bool {
+	maj, _ := parseComputeCapability(g.ComputeCapability)
+	return maj >= 12
+}
+
+// PhysicalBatch returns the canonical physical batch (n_batch/n_ubatch) for the
+// given hardware, used when the model config leaves batch unset.
+func PhysicalBatch(g GPU) int {
+	if g.IsNVIDIABlackwell() {
+		return BlackwellPhysicalBatch
+	}
+	return DefaultPhysicalBatch
+}
+
+// IsManagedPhysicalBatch reports whether n is a value PhysicalBatch assigns.
+// Callers that re-tune a value chosen by an upstream host (the distributed
+// router correcting the frontend's guess) use this to avoid clobbering an
+// explicit user batch such as 1024.
+func IsManagedPhysicalBatch(n int) bool {
+	return n == DefaultPhysicalBatch || n == BlackwellPhysicalBatch
+}
+
+// Parallel-slot (n_parallel) VRAM tiers. llama.cpp serializes requests at
+// n_parallel=1 (the backend default) and only auto-enables continuous batching
+// when n_parallel > 1 — so a single-slot default makes concurrent requests
+// queue. We default a slot count by GPU size so multi-user serving works out of
+// the box. With the backend's unified KV cache the slots SHARE the context
+// budget, so more slots add concurrency without multiplying KV memory.
+const (
+	parallelSlotsVRAMHigh = uint64(32) << 30 // >=32 GiB -> 8 slots
+	parallelSlotsVRAMMid  = uint64(8) << 30  // >=8 GiB  -> 4 slots
+	parallelSlotsVRAMLow  = uint64(4) << 30  // >=4 GiB  -> 2 slots
+)
+
+// DefaultParallelSlots returns the n_parallel default for the given GPU. Returns
+// 1 (no concurrency) when VRAM is unknown or too small, so we never change
+// behavior on CPU-only / tiny devices.
+func DefaultParallelSlots(g GPU) int {
+	switch {
+	case g.VRAM >= parallelSlotsVRAMHigh:
+		return 8
+	case g.VRAM >= parallelSlotsVRAMMid:
+		return 4
+	case g.VRAM >= parallelSlotsVRAMLow:
+		return 2
+	default:
+		return 1
+	}
+}
+
+// EnsureParallelOption appends a VRAM-scaled "parallel:N" backend option when the
+// model doesn't already set one (and the GPU warrants concurrency). Returns the
+// possibly-extended options. Shared by the single-host config path
+// (ApplyHardwareDefaults) and the distributed router (per selected node).
+func EnsureParallelOption(opts []string, gpu GPU) []string {
+	if slots := DefaultParallelSlots(gpu); slots > 1 && !hasParallelOption(opts) {
+		return append(opts, fmt.Sprintf("parallel:%d", slots))
+	}
+	return opts
+}
+
+// hasParallelOption reports whether the model already sets parallel/n_parallel
+// (backend options are "name:value" strings) so we never override an explicit value.
+func hasParallelOption(opts []string) bool {
+	for _, o := range opts {
+		name := o
+		if i := strings.IndexByte(o, ':'); i >= 0 {
+			name = o[:i]
+		}
+		switch strings.TrimSpace(strings.ToLower(name)) {
+		case "parallel", "n_parallel":
+			return true
+		}
+	}
+	return false
+}
+
+// localGPU builds a GPU descriptor from local detection, used by SetDefaults on
+// a single host (the distributed router builds it from the selected node's
+// reported info instead). It is a package var so tests can inject a
+// deterministic device — detection does a live nvidia-smi call.
+var localGPU = func() GPU {
+	vendor, _ := xsysinfo.DetectGPUVendor()
+	vram, _ := xsysinfo.TotalAvailableVRAM()
+	return GPU{
+		Vendor:            vendor,
+		ComputeCapability: xsysinfo.NVIDIAComputeCapability(),
+		VRAM:              vram,
+	}
+}
+
+// ApplyHardwareDefaults fills ModelConfig values that depend on the target GPU
+// and were left unset by the user. Currently: a larger physical batch on
+// Blackwell. Explicit config always wins (we only touch zero values).
+func ApplyHardwareDefaults(cfg *ModelConfig, gpu GPU) {
+	if cfg == nil {
+		return
+	}
+	if cfg.Batch == 0 && gpu.IsNVIDIABlackwell() {
+		cfg.Batch = BlackwellPhysicalBatch
+		xlog.Debug("[hardware_defaults] Blackwell GPU: defaulting physical batch",
+			"batch", cfg.Batch, "compute_cap", gpu.ComputeCapability)
+	}
+
+	// Enable concurrent serving by default on a capable GPU: without this the
+	// llama.cpp backend runs n_parallel=1 and serializes multi-user requests
+	// (continuous batching stays off). Unified KV means the slots share the
+	// context budget, so this is concurrency without extra KV memory. Explicit
+	// parallel/n_parallel in the model options always wins.
+	if before := len(cfg.Options); true {
+		cfg.Options = EnsureParallelOption(cfg.Options, gpu)
+		if len(cfg.Options) > before {
+			xlog.Debug("[hardware_defaults] defaulting parallel slots for concurrent serving",
+				"option", cfg.Options[len(cfg.Options)-1], "vram_gib", gpu.VRAM>>30)
+		}
+	}
+}
+
+// parseComputeCapability splits a "major.minor" string into integer parts.
+// Returns (-1, -1) when it can't be parsed.
+func parseComputeCapability(cc string) (int, int) {
+	cc = strings.TrimSpace(cc)
+	if cc == "" {
+		return -1, -1
+	}
+	majStr, minStr := cc, "0"
+	if dot := strings.IndexByte(cc, '.'); dot >= 0 {
+		majStr, minStr = cc[:dot], cc[dot+1:]
+	}
+	maj, err := strconv.Atoi(strings.TrimSpace(majStr))
+	if err != nil {
+		return -1, -1
+	}
+	min, err := strconv.Atoi(strings.TrimSpace(minStr))
+	if err != nil {
+		min = 0
+	}
+	return maj, min
+}
diff --git a/core/config/hardware_defaults_internal_test.go b/core/config/hardware_defaults_internal_test.go
new file mode 100644
index 000000000..52c674c2d
--- /dev/null
+++ b/core/config/hardware_defaults_internal_test.go
@@ -0,0 +1,37 @@
+package config
+
+import (
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// Single-instance path: SetDefaults applies hardware defaults from the local
+// GPU. The detection seam (localGPU) is injected so the path is deterministic
+// without a real GPU.
+var _ = Describe("SetDefaults hardware defaults (single-instance)", func() {
+	var orig func() GPU
+	BeforeEach(func() { orig = localGPU })
+	AfterEach(func() { localGPU = orig })
+
+	It("sets the physical batch on a local Blackwell GPU", func() {
+		localGPU = func() GPU { return GPU{ComputeCapability: "12.1"} }
+		cfg := &ModelConfig{}
+		cfg.SetDefaults()
+		Expect(cfg.Batch).To(Equal(BlackwellPhysicalBatch))
+	})
+
+	It("leaves batch unset on a non-Blackwell local GPU", func() {
+		localGPU = func() GPU { return GPU{ComputeCapability: "8.9"} }
+		cfg := &ModelConfig{}
+		cfg.SetDefaults()
+		Expect(cfg.Batch).To(Equal(0))
+	})
+
+	It("never overrides an explicit batch", func() {
+		localGPU = func() GPU { return GPU{ComputeCapability: "12.1"} }
+		cfg := &ModelConfig{}
+		cfg.Batch = 1024
+		cfg.SetDefaults()
+		Expect(cfg.Batch).To(Equal(1024))
+	})
+})
diff --git a/core/config/hardware_defaults_test.go b/core/config/hardware_defaults_test.go
new file mode 100644
index 000000000..ae7bf3964
--- /dev/null
+++ b/core/config/hardware_defaults_test.go
@@ -0,0 +1,97 @@
+package config_test
+
+import (
+	. "github.com/mudler/LocalAI/core/config"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("Hardware-driven config defaults", func() {
+	DescribeTable("GPU.IsNVIDIABlackwell (sm_12x consumer family)",
+		func(cc string, want bool) {
+			Expect(GPU{ComputeCapability: cc}.IsNVIDIABlackwell()).To(Equal(want))
+		},
+		Entry("GB10 12.1", "12.1", true),
+		Entry("RTX 50 12.0", "12.0", true),
+		Entry("future 13.0", "13.0", true),
+		Entry("Hopper 9.0", "9.0", false),
+		Entry("Ada 8.9", "8.9", false),
+		Entry("datacenter Blackwell sm_100 10.0", "10.0", false),
+		Entry("unknown", "", false),
+	)
+
+	Describe("PhysicalBatch / IsManagedPhysicalBatch", func() {
+		It("returns the Blackwell batch on Blackwell", func() {
+			Expect(PhysicalBatch(GPU{ComputeCapability: "12.1"})).To(Equal(BlackwellPhysicalBatch))
+		})
+		It("returns the default batch otherwise", func() {
+			Expect(PhysicalBatch(GPU{ComputeCapability: "9.0"})).To(Equal(DefaultPhysicalBatch))
+			Expect(PhysicalBatch(GPU{})).To(Equal(DefaultPhysicalBatch))
+		})
+		It("recognizes managed defaults but not explicit values", func() {
+			Expect(IsManagedPhysicalBatch(DefaultPhysicalBatch)).To(BeTrue())
+			Expect(IsManagedPhysicalBatch(BlackwellPhysicalBatch)).To(BeTrue())
+			Expect(IsManagedPhysicalBatch(1024)).To(BeFalse())
+		})
+	})
+
+	Describe("ApplyHardwareDefaults", func() {
+		It("raises an unset batch to 2048 on Blackwell", func() {
+			cfg := &ModelConfig{}
+			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1"})
+			Expect(cfg.Batch).To(Equal(BlackwellPhysicalBatch))
+		})
+		It("leaves batch unset on non-Blackwell", func() {
+			cfg := &ModelConfig{}
+			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "9.0"})
+			Expect(cfg.Batch).To(Equal(0))
+		})
+		It("never overrides an explicit batch", func() {
+			cfg := &ModelConfig{}
+			cfg.Batch = 1024
+			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1"})
+			Expect(cfg.Batch).To(Equal(1024))
+		})
+		It("no-ops on nil", func() {
+			Expect(func() { ApplyHardwareDefaults(nil, GPU{ComputeCapability: "12.1"}) }).ToNot(Panic())
+		})
+	})
+
+	const gib = uint64(1) << 30
+
+	DescribeTable("DefaultParallelSlots (by VRAM)",
+		func(vramGiB uint64, want int) {
+			Expect(DefaultParallelSlots(GPU{VRAM: vramGiB * gib})).To(Equal(want))
+		},
+		Entry("GB10 119 GiB", uint64(119), 8),
+		Entry("48 GiB", uint64(48), 8),
+		Entry("24 GiB", uint64(24), 4),
+		Entry("8 GiB", uint64(8), 4),
+		Entry("6 GiB", uint64(6), 2),
+		Entry("2 GiB", uint64(2), 1),
+		Entry("unknown 0", uint64(0), 1),
+	)
+
+	Describe("ApplyHardwareDefaults parallel slots", func() {
+		It("adds a VRAM-scaled parallel option on a capable GPU", func() {
+			cfg := &ModelConfig{}
+			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1", VRAM: 119 * gib})
+			Expect(cfg.Options).To(ContainElement("parallel:8"))
+		})
+		It("scales the slot count down with VRAM", func() {
+			cfg := &ModelConfig{}
+			ApplyHardwareDefaults(cfg, GPU{VRAM: 24 * gib})
+			Expect(cfg.Options).To(ContainElement("parallel:4"))
+		})
+		It("adds no parallel option on small/unknown VRAM", func() {
+			cfg := &ModelConfig{}
+			ApplyHardwareDefaults(cfg, GPU{VRAM: 2 * gib})
+			Expect(cfg.Options).ToNot(ContainElement(ContainSubstring("parallel")))
+		})
+		It("never overrides an explicit parallel option", func() {
+			cfg := &ModelConfig{Options: []string{"parallel:2"}}
+			ApplyHardwareDefaults(cfg, GPU{VRAM: 119 * gib})
+			Expect(cfg.Options).To(Equal([]string{"parallel:2"}))
+		})
+	})
+})
diff --git a/core/config/model_config.go b/core/config/model_config.go
index dfe151a64..75136ec6c 100644
--- a/core/config/model_config.go
+++ b/core/config/model_config.go
@@ -1111,6 +1111,11 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
 	// This ensures gallery-installed and runtime-loaded models get optimal parameters.
 	ApplyInferenceDefaults(cfg, cfg.Name, cfg.Model)
 
+	// Apply hardware-driven defaults (e.g. a larger physical batch on Blackwell).
+	// Uses the local GPU here; in distributed mode the router re-applies the same
+	// heuristics for the selected node's GPU before loading. Explicit config wins.
+	ApplyHardwareDefaults(cfg, localGPU())
+
 	// https://github.com/ggerganov/llama.cpp/blob/75cd4c77292034ecec587ecb401366f57338f7c0/common/sampling.h#L22
 	defaultTopP := 0.95
 	defaultTopK := 40
diff --git a/core/http/endpoints/localai/nodes.go b/core/http/endpoints/localai/nodes.go
index 5a6edab22..820cb137f 100644
--- a/core/http/endpoints/localai/nodes.go
+++ b/core/http/endpoints/localai/nodes.go
@@ -70,17 +70,20 @@ func GetNodeEndpoint(registry *nodes.NodeRegistry) echo.HandlerFunc {
 
 // RegisterNodeRequest is the request body for registering a new worker node.
 type RegisterNodeRequest struct {
-	Name          string            `json:"name"`
-	NodeType      string            `json:"node_type,omitempty"` // "backend" (default) or "agent"
-	Address       string            `json:"address"`
-	HTTPAddress   string            `json:"http_address,omitempty"`
-	Token         string            `json:"token,omitempty"`
-	TotalVRAM     uint64            `json:"total_vram,omitempty"`
-	AvailableVRAM uint64            `json:"available_vram,omitempty"`
-	TotalRAM      uint64            `json:"total_ram,omitempty"`
-	AvailableRAM  uint64            `json:"available_ram,omitempty"`
-	GPUVendor     string            `json:"gpu_vendor,omitempty"`
-	Labels        map[string]string `json:"labels,omitempty"`
+	Name          string `json:"name"`
+	NodeType      string `json:"node_type,omitempty"` // "backend" (default) or "agent"
+	Address       string `json:"address"`
+	HTTPAddress   string `json:"http_address,omitempty"`
+	Token         string `json:"token,omitempty"`
+	TotalVRAM     uint64 `json:"total_vram,omitempty"`
+	AvailableVRAM uint64 `json:"available_vram,omitempty"`
+	TotalRAM      uint64 `json:"total_ram,omitempty"`
+	AvailableRAM  uint64 `json:"available_ram,omitempty"`
+	GPUVendor     string `json:"gpu_vendor,omitempty"`
+	// GPUComputeCapability is the worker GPU's compute capability ("major.minor",
+	// e.g. "12.1" for GB10). Used by the router for per-arch option tuning.
+	GPUComputeCapability string            `json:"gpu_compute_capability,omitempty"`
+	Labels               map[string]string `json:"labels,omitempty"`
 	// MaxReplicasPerModel is the per-node cap on replicas of any single model.
 	// Workers older than this field omit it; we coerce 0 → 1 below to preserve
 	// historical single-replica behavior.
@@ -152,17 +155,18 @@ func RegisterNodeEndpoint(registry *nodes.NodeRegistry, expectedToken string, au
 		}
 
 		node := &nodes.BackendNode{
-			Name:                req.Name,
-			NodeType:            nodeType,
-			Address:             req.Address,
-			HTTPAddress:         req.HTTPAddress,
-			TokenHash:           tokenHash,
-			TotalVRAM:           req.TotalVRAM,
-			AvailableVRAM:       req.AvailableVRAM,
-			TotalRAM:            req.TotalRAM,
-			AvailableRAM:        req.AvailableRAM,
-			GPUVendor:           req.GPUVendor,
-			MaxReplicasPerModel: maxReplicasPerModel,
+			Name:                 req.Name,
+			NodeType:             nodeType,
+			Address:              req.Address,
+			HTTPAddress:          req.HTTPAddress,
+			TokenHash:            tokenHash,
+			TotalVRAM:            req.TotalVRAM,
+			AvailableVRAM:        req.AvailableVRAM,
+			TotalRAM:             req.TotalRAM,
+			AvailableRAM:         req.AvailableRAM,
+			GPUVendor:            req.GPUVendor,
+			GPUComputeCapability: req.GPUComputeCapability,
+			MaxReplicasPerModel:  maxReplicasPerModel,
 		}
 
 		ctx := c.Request().Context()
diff --git a/core/services/nodes/registry.go b/core/services/nodes/registry.go
index 3d34d086c..aafee13cb 100644
--- a/core/services/nodes/registry.go
+++ b/core/services/nodes/registry.go
@@ -36,6 +36,11 @@ type BackendNode struct {
 	TotalRAM     uint64 `gorm:"column:total_ram" json:"total_ram"`           // Total system RAM in bytes (fallback when no GPU)
 	AvailableRAM uint64 `gorm:"column:available_ram" json:"available_ram"`   // Available system RAM in bytes
 	GPUVendor    string `gorm:"column:gpu_vendor;size:32" json:"gpu_vendor"` // nvidia, amd, intel, vulkan, unknown
+	// GPUComputeCapability is the worker GPU's compute capability as
+	// "major.minor" (e.g. "12.1" for GB10 / DGX Spark). Reported by the worker
+	// on registration; used by the router to pick per-arch options (e.g. a
+	// larger physical batch on Blackwell). Empty when unknown / non-NVIDIA.
+	GPUComputeCapability string `gorm:"column:gpu_compute_capability;size:16" json:"gpu_compute_capability"`
 	// MaxReplicasPerModel caps how many replicas of any one model can run on
 	// this node concurrently. Default 1 preserves the historical "one
 	// (node, model)" assumption; set higher (via worker --max-replicas-per-model)
@@ -69,6 +74,7 @@ const (
 	ColReservedVRAM        = "reserved_vram"
 	ColAvailableRAM        = "available_ram"
 	ColGPUVendor           = "gpu_vendor"
+	ColGPUComputeCap       = "gpu_compute_capability"
 	ColLastHeartbeat       = "last_heartbeat"
 	ColMaxReplicasPerModel = "max_replicas_per_model"
 )
diff --git a/core/services/nodes/router.go b/core/services/nodes/router.go
index e5ce52306..ccbf48f43 100644
--- a/core/services/nodes/router.go
+++ b/core/services/nodes/router.go
@@ -12,6 +12,7 @@ import (
 	"strings"
 	"time"
 
+	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/services/advisorylock"
 	"github.com/mudler/LocalAI/core/services/nodes/prefixcache"
 	"github.com/mudler/LocalAI/pkg/distributedhdr"
@@ -138,6 +139,30 @@ type scheduleLoadResult struct {
 	ReplicaIndex int
 }
 
+// applyNodeHardwareDefaults tunes node-agnostic ModelOptions to the GPU of the
+// node that was actually selected to run the model, reusing the same hardware
+// heuristics as single-host config loading (core/config). On Blackwell it
+// raises the physical batch; on non-Blackwell it resets a hardware-default that
+// an upstream host (the GPU-less frontend in distributed mode) guessed higher.
+// Only values the heuristics themselves manage are touched, so an explicit user
+// batch (e.g. 1024) is never overridden.
+func applyNodeHardwareDefaults(opts *pb.ModelOptions, node *BackendNode) {
+	if opts == nil || node == nil {
+		return
+	}
+	gpu := config.GPU{
+		Vendor:            node.GPUVendor,
+		ComputeCapability: node.GPUComputeCapability,
+		VRAM:              node.TotalVRAM,
+	}
+	if config.IsManagedPhysicalBatch(int(opts.NBatch)) {
+		opts.NBatch = int32(config.PhysicalBatch(gpu))
+	}
+	// Default concurrent serving for the selected node (the frontend that built
+	// the options may have no GPU). Only adds when no parallel option is set.
+	opts.Options = config.EnsureParallelOption(opts.Options, gpu)
+}
+
 // scheduleAndLoad is the shared core for loading a model on a new node.
 // Used by both Route() (for first-time loads) and ScheduleAndLoadModel() (for reconciler scale-ups).
 //
@@ -153,6 +178,11 @@ func (r *SmartRouter) scheduleAndLoad(ctx context.Context, backendType, tracking
 		return nil, fmt.Errorf("no available nodes: %w", err)
 	}
 
+	// Tune node-agnostic options to the SELECTED node's GPU. Only now do we know
+	// which node (and its compute capability) will run the model — the frontend
+	// that built modelOpts may have no GPU at all in distributed mode.
+	applyNodeHardwareDefaults(modelOpts, node)
+
 	// Pre-stage model files via FileStager before loading
 	loadOpts := modelOpts
 	if r.fileStager != nil && modelOpts != nil {
diff --git a/core/services/nodes/router_hardware_internal_test.go b/core/services/nodes/router_hardware_internal_test.go
new file mode 100644
index 000000000..2418bf444
--- /dev/null
+++ b/core/services/nodes/router_hardware_internal_test.go
@@ -0,0 +1,46 @@
+package nodes
+
+import (
+	"github.com/mudler/LocalAI/core/config"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("applyNodeHardwareDefaults", func() {
+	It("raises a managed default batch on a Blackwell node", func() {
+		opts := &pb.ModelOptions{NBatch: config.DefaultPhysicalBatch}
+		applyNodeHardwareDefaults(opts, &BackendNode{GPUComputeCapability: "12.1"})
+		Expect(opts.NBatch).To(BeEquivalentTo(config.BlackwellPhysicalBatch))
+	})
+
+	It("resets a Blackwell guess on a non-Blackwell node", func() {
+		// frontend (Blackwell) guessed high, but the selected node is not Blackwell
+		opts := &pb.ModelOptions{NBatch: config.BlackwellPhysicalBatch}
+		applyNodeHardwareDefaults(opts, &BackendNode{GPUComputeCapability: "9.0"})
+		Expect(opts.NBatch).To(BeEquivalentTo(config.DefaultPhysicalBatch))
+	})
+
+	It("never overrides an explicit (non-managed) batch", func() {
+		opts := &pb.ModelOptions{NBatch: 1024}
+		applyNodeHardwareDefaults(opts, &BackendNode{GPUComputeCapability: "12.1"})
+		Expect(opts.NBatch).To(BeEquivalentTo(int32(1024)))
+	})
+
+	It("adds a VRAM-scaled parallel option for the selected node", func() {
+		// frontend may have had no GPU (no parallel option); the node has a big GPU
+		opts := &pb.ModelOptions{NBatch: config.DefaultPhysicalBatch}
+		applyNodeHardwareDefaults(opts, &BackendNode{GPUComputeCapability: "12.1", TotalVRAM: 119 << 30})
+		Expect(opts.Options).To(ContainElement("parallel:8"))
+	})
+
+	It("never overrides an explicit parallel option on the node path", func() {
+		opts := &pb.ModelOptions{NBatch: config.DefaultPhysicalBatch, Options: []string{"parallel:2"}}
+		applyNodeHardwareDefaults(opts, &BackendNode{GPUComputeCapability: "12.1", TotalVRAM: 119 << 30})
+		Expect(opts.Options).To(Equal([]string{"parallel:2"}))
+	})
+
+	It("no-ops on nil inputs", func() {
+		Expect(func() { applyNodeHardwareDefaults(nil, nil) }).ToNot(Panic())
+	})
+})
diff --git a/core/services/worker/registration.go b/core/services/worker/registration.go
index 87a8a7966..432cc845b 100644
--- a/core/services/worker/registration.go
+++ b/core/services/worker/registration.go
@@ -73,6 +73,10 @@ func (cfg *Config) registrationBody() map[string]any {
 	// Detect GPU info for VRAM-aware scheduling
 	totalVRAM, _ := xsysinfo.TotalAvailableVRAM()
 	gpuVendor, _ := xsysinfo.DetectGPUVendor()
+	// Compute capability (e.g. "12.1" for GB10) lets the router pick per-arch
+	// options (e.g. larger physical batch on Blackwell). Detected on the worker
+	// because only the worker sees the GPU in distributed mode.
+	gpuComputeCap := xsysinfo.NVIDIAComputeCapability()
 
 	maxReplicas := cfg.MaxReplicasPerModel
 	if maxReplicas < 1 {
@@ -85,6 +89,7 @@ func (cfg *Config) registrationBody() map[string]any {
 		"total_vram":             totalVRAM,
 		"available_vram":         totalVRAM, // initially all VRAM is available
 		"gpu_vendor":             gpuVendor,
+		"gpu_compute_capability": gpuComputeCap,
 		"max_replicas_per_model": maxReplicas,
 	}
 
diff --git a/pkg/xsysinfo/computecap_internal_test.go b/pkg/xsysinfo/computecap_internal_test.go
new file mode 100644
index 000000000..3bf2602d0
--- /dev/null
+++ b/pkg/xsysinfo/computecap_internal_test.go
@@ -0,0 +1,23 @@
+package xsysinfo
+
+import (
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("parseComputeCap", func() {
+	DescribeTable("splits major.minor",
+		func(in string, maj, min int) {
+			m, n := parseComputeCap(in)
+			Expect(m).To(Equal(maj))
+			Expect(n).To(Equal(min))
+		},
+		Entry("GB10 / DGX Spark", "12.1", 12, 1),
+		Entry("RTX 50-series", "12.0", 12, 0),
+		Entry("Hopper", "9.0", 9, 0),
+		Entry("major only", "12", 12, 0),
+		Entry("whitespace", " 12.1 ", 12, 1),
+		Entry("empty", "", -1, -1),
+		Entry("garbage", "abc", -1, -1),
+	)
+})
diff --git a/pkg/xsysinfo/gpu.go b/pkg/xsysinfo/gpu.go
index a5575edb8..f0185ddeb 100644
--- a/pkg/xsysinfo/gpu.go
+++ b/pkg/xsysinfo/gpu.go
@@ -38,9 +38,9 @@ var UnifiedMemoryDevices = []string{
 
 // GPUMemoryInfo contains real-time GPU memory usage information
 type GPUMemoryInfo struct {
-	Index        int     `json:"index"`
-	Name         string  `json:"name"`
-	Vendor       string  `json:"vendor"`
+	Index  int    `json:"index"`
+	Name   string `json:"name"`
+	Vendor string `json:"vendor"`
 	// BDF is the canonical PCI bus address (dddd:bb:dd.f) when known.
 	// Populated by detection paths that can attribute the device to a
 	// PCI location (clinfo, future amdgpu/nvidia paths); empty for
@@ -307,6 +307,84 @@ func GetGPUAggregateInfo() GPUAggregateInfo {
 	return aggregate
 }
 
+var (
+	computeCapOnce   sync.Once
+	computeCapResult string
+)
+
+// NVIDIAComputeCapability returns the highest NVIDIA GPU compute capability on
+// this host as a "major.minor" string (e.g. "12.1" for GB10 / DGX Spark), or ""
+// when nvidia-smi is unavailable or reports none. Detected once and cached.
+//
+// This runs where the GPU actually is. In distributed mode it is reported by
+// each worker on registration so the router can make per-node decisions rather
+// than guessing from the (possibly GPU-less) frontend host.
+func NVIDIAComputeCapability() string {
+	computeCapOnce.Do(func() {
+		computeCapResult = detectNVIDIAComputeCapability()
+	})
+	return computeCapResult
+}
+
+func detectNVIDIAComputeCapability() string {
+	if _, err := exec.LookPath("nvidia-smi"); err != nil {
+		return ""
+	}
+
+	cmd := exec.Command("nvidia-smi", "--query-gpu=compute_cap", "--format=csv,noheader")
+
+	var stdout, stderr bytes.Buffer
+	cmd.Stdout = &stdout
+	cmd.Stderr = &stderr
+
+	if err := cmd.Run(); err != nil {
+		xlog.Debug("nvidia-smi compute_cap query failed", "error", err, "stderr", stderr.String())
+		return ""
+	}
+
+	best := ""
+	bestMajor, bestMinor := -1, -1
+	for _, line := range strings.Split(strings.TrimSpace(stdout.String()), "\n") {
+		line = strings.TrimSpace(line)
+		if line == "" {
+			continue
+		}
+		maj, min := parseComputeCap(line)
+		if maj < 0 {
+			continue
+		}
+		if maj > bestMajor || (maj == bestMajor && min > bestMinor) {
+			bestMajor, bestMinor, best = maj, min, line
+		}
+	}
+	if best != "" {
+		xlog.Debug("NVIDIA compute capability detected", "compute_cap", best)
+	}
+	return best
+}
+
+// parseComputeCap splits a "major.minor" compute-capability string into its
+// integer parts. Returns (-1, -1) if it can't be parsed.
+func parseComputeCap(cc string) (int, int) {
+	cc = strings.TrimSpace(cc)
+	if cc == "" {
+		return -1, -1
+	}
+	majStr, minStr := cc, "0"
+	if dot := strings.IndexByte(cc, '.'); dot >= 0 {
+		majStr, minStr = cc[:dot], cc[dot+1:]
+	}
+	maj, err := strconv.Atoi(strings.TrimSpace(majStr))
+	if err != nil {
+		return -1, -1
+	}
+	min, err := strconv.Atoi(strings.TrimSpace(minStr))
+	if err != nil {
+		min = 0
+	}
+	return maj, min
+}
+
 // getNVIDIAGPUMemory queries NVIDIA GPUs using nvidia-smi
 func getNVIDIAGPUMemory() []GPUMemoryInfo {
 	// Check if nvidia-smi is available
@@ -866,12 +944,12 @@ func getVulkanGPUMemory() []GPUMemoryInfo {
 }
 
 type vulkanGPUTextInfo struct {
-	index        int
-	name         string
-	deviceType   string
-	totalVRAM    uint64
-	budgetVRAM   uint64
-	usageVRAM    uint64
+	index      int
+	name       string
+	deviceType string
+	totalVRAM  uint64
+	budgetVRAM uint64
+	usageVRAM  uint64
 }
 
 func parseVulkanGPUMemoryText(r io.Reader) []GPUMemoryInfo {
@@ -909,7 +987,7 @@ func parseVulkanGPUMemoryText(r io.Reader) []GPUMemoryInfo {
 		} else if current.usageVRAM != 0 && current.budgetVRAM == 0 {
 			current.budgetVRAM = current.totalVRAM - current.usageVRAM
 		} else if current.usageVRAM == 0 && current.budgetVRAM == 0 {
-			current.usageVRAM  = 0
+			current.usageVRAM = 0
 			current.budgetVRAM = current.totalVRAM
 		}
 

From e19c43cf043032c47da2a370b2f4d967b89b2035 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Sat, 20 Jun 2026 14:56:16 +0200
Subject: [PATCH 16/99] feat(gallery): add Depth Anything V2 models + bump
 native version (#10413)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* feat(gallery): add Depth Anything V2 models + bump native version

Add Depth Anything V2 (DA2) support to the depth-anything backend. DA2 is
depth-only (no camera pose, no confidence) and ships both relative
(relative inverse depth) and metric (depth in metres) variants. The Go
backend is model-agnostic, so no backend code changes are required — only
a native version bump and new gallery entries.

- backend/go/depth-anything-cpp/Makefile: pin DEPTHANYTHING_VERSION to the
  depth-anything.cpp commit that adds the DA2 engine + C-API routing
  (e3dec57f13a52366bbc4f279ef44804915960a6b, kept alive by the upstream tag
  da2-support so it survives a squash-merge).
- gallery/index.yaml: add 12 DA2 entries (4 base quants, small, large, plus
  Hypersim indoor and VKITTI outdoor metric models in S/B/L). Metric models
  carry the metric-depth tag; none carry camera-pose.

Assisted-by: Claude:claude-opus-4-8

* chore(depth-anything-cpp): pin to merged DA2 master commit

PR #1 (mudler/depth-anything.cpp) merged to master as f4e17de (squash); repoint
the pin from the pre-merge commit to the canonical master commit.

Assisted-by: Claude:claude-opus-4-8

---------

Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
---
 backend/go/depth-anything-cpp/Makefile |  10 +-
 gallery/index.yaml                     | 227 +++++++++++++++++++++++++
 2 files changed, 233 insertions(+), 4 deletions(-)

diff --git a/backend/go/depth-anything-cpp/Makefile b/backend/go/depth-anything-cpp/Makefile
index 815d2b0db..f1a0b9f97 100644
--- a/backend/go/depth-anything-cpp/Makefile
+++ b/backend/go/depth-anything-cpp/Makefile
@@ -8,11 +8,13 @@ JOBS?=$(shell nproc --ignore=1)
 
 # depth-anything.cpp. Pin to a specific commit for a stable build; a squash
 # merge upstream can orphan a branch, so the native version is pinned by SHA.
-# This SHA adds the nested two-file metric C-API (abi_version 4,
-# da_capi_load_nested) required by the depth-anything-3-nested gallery model;
-# tag it (e.g. v0.1.3) upstream to keep the SHA alive.
+# This SHA adds the Depth Anything V2 engine + C-API routing (depth-only,
+# relative + metric) on top of the nested two-file metric C-API (abi_version 4,
+# da_capi_load_nested) required by the depth-anything-3-nested gallery model.
+# It is kept alive by the upstream tag da2-support (survives a squash-merge);
+# repoint to the master merge commit once mudler/depth-anything.cpp PR #1 lands.
 DEPTHANYTHING_REPO?=https://github.com/mudler/depth-anything.cpp.git
-DEPTHANYTHING_VERSION?=cce5edc395fd1843806093d7ccc0c8b0d0b97b72
+DEPTHANYTHING_VERSION?=f4e17dea695dd12ae76bea98ba58030996b98118
 
 ifeq ($(NATIVE),false)
 	CMAKE_ARGS+=-DGGML_NATIVE=OFF
diff --git a/gallery/index.yaml b/gallery/index.yaml
index beede9e79..18d6b1839 100644
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -8343,6 +8343,233 @@
     - filename: depth-anything-nested-metric.gguf
       uri: huggingface://mudler/depth-anything.cpp-gguf/depth-anything-nested-metric.gguf
       sha256: "b54ed50cbc0b0c14fae1f8edd0fea8bd1cac0850485fd6e7eb2422c7a19e570e"
+- &depth-anything-2-base
+  name: depth-anything-2-base
+  url: github:mudler/LocalAI/gallery/virtual.yaml@master
+  urls:
+    - https://github.com/mudler/depth-anything.cpp
+    - https://huggingface.co/depth-anything/Depth-Anything-V2
+    - https://huggingface.co/mudler/depth-anything.cpp-gguf
+  description: |
+    Depth Anything V2 (base / ViT-B) monocular depth, served via the native
+    depth-anything.cpp backend (C++/ggml + purego, no Python at inference).
+    Given an image it returns a dense monocular depth map only — no camera pose,
+    no confidence. This is the relative variant (relative inverse depth). Use
+    GenerateImage (src -> normalized depth PNG at dst) or the Depth endpoint.
+    q4_k is the recommended CPU default.
+  license: apache-2.0
+  icon: https://avatars.githubusercontent.com/u/53104118?s=200&v=4
+  tags:
+    - depth-estimation
+    - depth-anything
+    - native
+    - cpp
+    - cpu
+  overrides:
+    backend: depth-anything
+    parameters:
+      model: depth-anything2-base-q4_k.gguf
+  files:
+    - filename: depth-anything2-base-q4_k.gguf
+      uri: huggingface://mudler/depth-anything.cpp-gguf/depth-anything2-base-q4_k.gguf
+      sha256: "49e77ec7e593080111242fa76017cae3e26498d550841cf8a70dfcd36bb175f2"
+- !!merge <<: *depth-anything-2-base
+  name: depth-anything-2-base-q8_0
+  description: |
+    Depth Anything V2 (base / ViT-B), q8_0 — near-lossless 8-bit quant. Same
+    relative monocular depth output as the q4_k default at higher fidelity. Use
+    GenerateImage (src -> depth PNG) or the Depth endpoint.
+  overrides:
+    backend: depth-anything
+    parameters:
+      model: depth-anything2-base-q8_0.gguf
+  files:
+    - filename: depth-anything2-base-q8_0.gguf
+      uri: huggingface://mudler/depth-anything.cpp-gguf/depth-anything2-base-q8_0.gguf
+      sha256: "11920ec7a8dfc2fa7fe8ed44811a46fafe415c641d13144bb733d1437832291b"
+- !!merge <<: *depth-anything-2-base
+  name: depth-anything-2-base-f16
+  description: |
+    Depth Anything V2 (base / ViT-B), f16 — half precision, no measurable
+    accuracy loss vs f32. Relative monocular depth only (no pose). Use
+    GenerateImage (src -> depth PNG) or the Depth endpoint.
+  overrides:
+    backend: depth-anything
+    parameters:
+      model: depth-anything2-base-f16.gguf
+  files:
+    - filename: depth-anything2-base-f16.gguf
+      uri: huggingface://mudler/depth-anything.cpp-gguf/depth-anything2-base-f16.gguf
+      sha256: "e91c011fbbf90a44639fea55b8d61ca9e80dfb5541220946c8b6e6261fe67ab1"
+- !!merge <<: *depth-anything-2-base
+  name: depth-anything-2-base-f32
+  description: |
+    Depth Anything V2 (base / ViT-B), f32 — maximum reference fidelity. Relative
+    monocular depth only (no pose). Use GenerateImage (src -> depth PNG) or the
+    Depth endpoint.
+  overrides:
+    backend: depth-anything
+    parameters:
+      model: depth-anything2-base-f32.gguf
+  files:
+    - filename: depth-anything2-base-f32.gguf
+      uri: huggingface://mudler/depth-anything.cpp-gguf/depth-anything2-base-f32.gguf
+      sha256: "2d3d2e4d8fae9646c17577b84c870c7d77a34ded8cf8d5da0a60b2bee1530ccc"
+- !!merge <<: *depth-anything-2-base
+  name: depth-anything-2-small
+  description: |
+    Depth Anything V2 (small / ViT-S), f32 — the smallest, fastest backbone for
+    relative monocular depth on CPU. Depth only (no pose). Use GenerateImage
+    (src -> depth PNG) or the Depth endpoint.
+  overrides:
+    backend: depth-anything
+    parameters:
+      model: depth-anything2-small-f32.gguf
+  files:
+    - filename: depth-anything2-small-f32.gguf
+      uri: huggingface://mudler/depth-anything.cpp-gguf/depth-anything2-small-f32.gguf
+      sha256: "1f6622aa70cbd0eba34d34e2f635a156ed8c3f8e158fb149eb355366e2deb899"
+- !!merge <<: *depth-anything-2-base
+  name: depth-anything-2-large
+  description: |
+    Depth Anything V2 (large / ViT-L), f32 — higher-quality relative monocular
+    depth than base. Depth only (no pose). Use GenerateImage (src -> depth PNG)
+    or the Depth endpoint.
+  overrides:
+    backend: depth-anything
+    parameters:
+      model: depth-anything2-large-f32.gguf
+  files:
+    - filename: depth-anything2-large-f32.gguf
+      uri: huggingface://mudler/depth-anything.cpp-gguf/depth-anything2-large-f32.gguf
+      sha256: "e187658b01b6df62e1553b03df9973606a7287551d4771b09802fc10b26f19f3"
+- !!merge <<: *depth-anything-2-base
+  name: depth-anything-2-metric-hypersim-small
+  description: |
+    Depth Anything V2 Metric (Hypersim, indoor / ViT-S), q4_k — metric monocular
+    depth in METRES (indoor, max_depth 20). Depth only (no pose). Use
+    GenerateImage (src -> depth PNG) or the Depth endpoint.
+  tags:
+    - depth-estimation
+    - depth-anything
+    - metric-depth
+    - native
+    - cpp
+    - cpu
+  overrides:
+    backend: depth-anything
+    parameters:
+      model: depth-anything2-metric-hypersim-small-q4_k.gguf
+  files:
+    - filename: depth-anything2-metric-hypersim-small-q4_k.gguf
+      uri: huggingface://mudler/depth-anything.cpp-gguf/depth-anything2-metric-hypersim-small-q4_k.gguf
+      sha256: "decd99f5c756b564aae5ca1a1612f896e7f76889060e1d25ba610549bbc39b52"
+- !!merge <<: *depth-anything-2-base
+  name: depth-anything-2-metric-hypersim-base
+  description: |
+    Depth Anything V2 Metric (Hypersim, indoor / ViT-B), q4_k — metric monocular
+    depth in METRES (indoor, max_depth 20). Depth only (no pose). Use
+    GenerateImage (src -> depth PNG) or the Depth endpoint.
+  tags:
+    - depth-estimation
+    - depth-anything
+    - metric-depth
+    - native
+    - cpp
+    - cpu
+  overrides:
+    backend: depth-anything
+    parameters:
+      model: depth-anything2-metric-hypersim-base-q4_k.gguf
+  files:
+    - filename: depth-anything2-metric-hypersim-base-q4_k.gguf
+      uri: huggingface://mudler/depth-anything.cpp-gguf/depth-anything2-metric-hypersim-base-q4_k.gguf
+      sha256: "c7c6a8628ac154f2ad43db09b8146518e863375be2cb03b8b46caec49a4fcf3a"
+- !!merge <<: *depth-anything-2-base
+  name: depth-anything-2-metric-hypersim-large
+  description: |
+    Depth Anything V2 Metric (Hypersim, indoor / ViT-L), q4_k — highest-quality
+    metric monocular depth in METRES (indoor, max_depth 20). Depth only (no
+    pose). Use GenerateImage (src -> depth PNG) or the Depth endpoint.
+  tags:
+    - depth-estimation
+    - depth-anything
+    - metric-depth
+    - native
+    - cpp
+    - cpu
+  overrides:
+    backend: depth-anything
+    parameters:
+      model: depth-anything2-metric-hypersim-large-q4_k.gguf
+  files:
+    - filename: depth-anything2-metric-hypersim-large-q4_k.gguf
+      uri: huggingface://mudler/depth-anything.cpp-gguf/depth-anything2-metric-hypersim-large-q4_k.gguf
+      sha256: "3664506bea55e64926fff2cb112ea5a9ad923d13647b9c69617184a89dd1e473"
+- !!merge <<: *depth-anything-2-base
+  name: depth-anything-2-metric-vkitti-small
+  description: |
+    Depth Anything V2 Metric (Virtual KITTI, outdoor / ViT-S), q4_k — metric
+    monocular depth in METRES (outdoor, max_depth 80). Depth only (no pose). Use
+    GenerateImage (src -> depth PNG) or the Depth endpoint.
+  tags:
+    - depth-estimation
+    - depth-anything
+    - metric-depth
+    - native
+    - cpp
+    - cpu
+  overrides:
+    backend: depth-anything
+    parameters:
+      model: depth-anything2-metric-vkitti-small-q4_k.gguf
+  files:
+    - filename: depth-anything2-metric-vkitti-small-q4_k.gguf
+      uri: huggingface://mudler/depth-anything.cpp-gguf/depth-anything2-metric-vkitti-small-q4_k.gguf
+      sha256: "8dcaa5d0f8475c3dc5de59e28faacd6d46e5ef73c73ecc58e365d7751bc2279f"
+- !!merge <<: *depth-anything-2-base
+  name: depth-anything-2-metric-vkitti-base
+  description: |
+    Depth Anything V2 Metric (Virtual KITTI, outdoor / ViT-B), q4_k — metric
+    monocular depth in METRES (outdoor, max_depth 80). Depth only (no pose). Use
+    GenerateImage (src -> depth PNG) or the Depth endpoint.
+  tags:
+    - depth-estimation
+    - depth-anything
+    - metric-depth
+    - native
+    - cpp
+    - cpu
+  overrides:
+    backend: depth-anything
+    parameters:
+      model: depth-anything2-metric-vkitti-base-q4_k.gguf
+  files:
+    - filename: depth-anything2-metric-vkitti-base-q4_k.gguf
+      uri: huggingface://mudler/depth-anything.cpp-gguf/depth-anything2-metric-vkitti-base-q4_k.gguf
+      sha256: "1de5a7aae674df6afb8fa5e06d67843dccfbab92cd64b7c816c1218229446d6d"
+- !!merge <<: *depth-anything-2-base
+  name: depth-anything-2-metric-vkitti-large
+  description: |
+    Depth Anything V2 Metric (Virtual KITTI, outdoor / ViT-L), q4_k —
+    highest-quality metric monocular depth in METRES (outdoor, max_depth 80).
+    Depth only (no pose). Use GenerateImage (src -> depth PNG) or the Depth
+    endpoint.
+  tags:
+    - depth-estimation
+    - depth-anything
+    - metric-depth
+    - native
+    - cpp
+    - cpu
+  overrides:
+    backend: depth-anything
+    parameters:
+      model: depth-anything2-metric-vkitti-large-q4_k.gguf
+  files:
+    - filename: depth-anything2-metric-vkitti-large-q4_k.gguf
+      uri: huggingface://mudler/depth-anything.cpp-gguf/depth-anything2-metric-vkitti-large-q4_k.gguf
+      sha256: "3b72e9a34262a7025ffba2fc4b760553398ac0622c26f164bff3d2c93991c757"
 - name: rfdetr-cpp-base
   url: github:mudler/LocalAI/gallery/virtual.yaml@master
   urls:

From 9565db5f949cfc814d8bc8c4dfc60c166f899528 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Sat, 20 Jun 2026 22:38:42 +0200
Subject: [PATCH 17/99] feat(models): model aliases - redirect a model name to
 another configured model (#10414)

* feat(config): add model alias field and self-validation

Add ModelConfig.Alias (yaml: alias), IsAlias(), and an alias
short-circuit at the top of Validate() that rejects self-reference and
forbids setting backend/parameters.model on a pure-redirect alias.

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(config): resolve and validate model alias targets in the loader

Assisted-by: Claude:opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(middleware): resolve model aliases and stamp requested/served identity

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(modeladmin): reject alias configs with invalid targets on create/edit

Validate alias targets at create/swap entry points (ImportModelEndpoint,
EditYAML, PatchConfig) so a dangling, chained, or disabled alias target is
rejected at save time rather than surfacing as a runtime error.

Assisted-by: Claude:opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(api): add GET /api/aliases to list model aliases

Adds an admin-gated read-only endpoint that lists every model alias
config as {name, target} pairs, backed by the loader's existing
GetAllModelsConfigs().

Assisted-by: Claude:opus-4.8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(mcp): add set_alias and list_aliases tools

Expose model-alias management over the LocalAI Assistant MCP surface:
list_aliases (read-only, GET /api/aliases) and set_alias (mutating).
SetAlias is swap-first: PATCH /api/models/config-json/:name swaps an
existing alias's target (validated, non-destructive) and a 404 falls
back to POST /models/import to create a fresh {name, alias} config. The
inproc client mirrors this via ConfigService.PatchConfig + a create path
modeled on ImportModelEndpoint. Deletion reuses delete_model.

Assisted-by: Claude:claude-opus-4 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* style(mcp): replace em dashes in alias tool comments

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(config-meta): expose alias as a model-select field

Add an 'alias' section to DefaultSections() and an 'alias' field override
in DefaultRegistry() so the schema-driven React editor renders the new
top-level ModelConfig.Alias field as a model picker in its own section.

Assisted-by: Claude:opus-4.8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(ui): add alias template card and Manage alias badge

Add an 'Alias / Routing' template to the create-flow gallery that seeds a
minimal name + alias config, and a read-only 'alias -> target' badge on the
Manage Models tab. The capabilities row payload does not carry the alias
field, so the badge resolves targets from GET /api/aliases looked up by name.

Assisted-by: Claude:claude-opus-4 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* docs: document model aliases

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* docs(swagger): regenerate for GET /api/aliases

Adds the /api/aliases path and AliasInfo schema generated from the
ListAliasesEndpoint annotation.

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* test(localai): check os.RemoveAll error in aliases_test

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix: correct alias conversion docs and advertise /api/aliases in instructions

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix(mcp): write alias config 0600 to satisfy gosec G306

The inproc createAlias path wrote the alias YAML with 0644, which gosec
flags as a new G306 finding on the PR. The LocalAI process is the sole
reader/writer of model configs, so 0600 is correct and keeps the scan clean.

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
---
 core/config/meta/registry.go                  |   9 ++
 core/config/meta/registry_test.go             |  28 +++++
 core/config/meta/types.go                     |   1 +
 core/config/model_config.go                   |  26 +++++
 core/config/model_config_loader.go            |  54 ++++++++++
 core/config/model_config_loader_test.go       |  48 +++++++++
 core/config/model_config_test.go              |  29 +++++
 core/http/endpoints/localai/aliases.go        |  33 ++++++
 core/http/endpoints/localai/aliases_test.go   |  57 ++++++++++
 core/http/endpoints/localai/import_model.go   |   6 ++
 .../endpoints/mcp/localai_assistant_test.go   |   6 ++
 core/http/middleware/request.go               |  21 ++++
 core/http/middleware/request_test.go          | 101 ++++++++++++++++++
 core/http/middleware/route_model.go           |   7 +-
 core/http/react-ui/e2e/alias-template.spec.js |  77 +++++++++++++
 core/http/react-ui/src/pages/Manage.jsx       |  23 +++-
 core/http/react-ui/src/utils/api.js           |   1 +
 core/http/react-ui/src/utils/config.js        |   1 +
 .../http/react-ui/src/utils/modelTemplates.js |  10 ++
 core/http/routes/localai.go                   |   4 +
 core/services/modeladmin/config.go            |   6 ++
 core/services/modeladmin/config_test.go       |  18 ++++
 docs/content/features/model-aliases.md        |  81 ++++++++++++++
 pkg/mcp/localaitools/client.go                |   8 ++
 pkg/mcp/localaitools/coverage_test.go         |   2 +
 pkg/mcp/localaitools/dto.go                   |   8 ++
 pkg/mcp/localaitools/fakes_test.go            |  18 ++++
 pkg/mcp/localaitools/httpapi/client.go        |  36 +++++++
 pkg/mcp/localaitools/httpapi/client_test.go   |  86 +++++++++++++++
 pkg/mcp/localaitools/httpapi/routes.go        |   2 +
 pkg/mcp/localaitools/inproc/client.go         |  78 ++++++++++++++
 pkg/mcp/localaitools/inproc/client_test.go    |  77 +++++++++++++
 pkg/mcp/localaitools/server.go                |   1 +
 pkg/mcp/localaitools/server_test.go           |   5 +
 pkg/mcp/localaitools/tools.go                 |   5 +
 pkg/mcp/localaitools/tools_aliases.go         |  48 +++++++++
 swagger/docs.go                               |  30 ++++++
 swagger/swagger.json                          |  30 ++++++
 swagger/swagger.yaml                          |  19 ++++
 39 files changed, 1098 insertions(+), 2 deletions(-)
 create mode 100644 core/config/meta/registry_test.go
 create mode 100644 core/http/endpoints/localai/aliases.go
 create mode 100644 core/http/endpoints/localai/aliases_test.go
 create mode 100644 core/http/react-ui/e2e/alias-template.spec.js
 create mode 100644 docs/content/features/model-aliases.md
 create mode 100644 pkg/mcp/localaitools/tools_aliases.go

diff --git a/core/config/meta/registry.go b/core/config/meta/registry.go
index ca10f604c..84fc9afda 100644
--- a/core/config/meta/registry.go
+++ b/core/config/meta/registry.go
@@ -286,6 +286,15 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Order:       45,
 		},
 
+		// --- Alias ---
+		"alias": {
+			Section:     "alias",
+			Label:       "Alias target",
+			Description: "Redirect all traffic for this model to another configured model. When set, every other field on this config is ignored and requests are served by the target model.",
+			Component:   "model-select",
+			Order:       0,
+		},
+
 		// --- Pipeline ---
 		"pipeline.llm": {
 			Section:              "pipeline",
diff --git a/core/config/meta/registry_test.go b/core/config/meta/registry_test.go
new file mode 100644
index 000000000..e9d998609
--- /dev/null
+++ b/core/config/meta/registry_test.go
@@ -0,0 +1,28 @@
+package meta_test
+
+import (
+	"github.com/mudler/LocalAI/core/config/meta"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("alias field metadata", func() {
+	It("registers the alias field as a model-select in the alias section", func() {
+		reg := meta.DefaultRegistry()
+		f, ok := reg["alias"]
+		Expect(ok).To(BeTrue(), "alias field should have a registry override")
+		Expect(f.Section).To(Equal("alias"))
+		Expect(f.Component).To(Equal("model-select"))
+	})
+
+	It("defines an alias section", func() {
+		var found bool
+		for _, s := range meta.DefaultSections() {
+			if s.ID == "alias" {
+				found = true
+			}
+		}
+		Expect(found).To(BeTrue(), "DefaultSections should include an alias section")
+	})
+})
diff --git a/core/config/meta/types.go b/core/config/meta/types.go
index a86b8bb69..a29e66967 100644
--- a/core/config/meta/types.go
+++ b/core/config/meta/types.go
@@ -69,6 +69,7 @@ type FieldMetaOverride struct {
 func DefaultSections() []Section {
 	return []Section{
 		{ID: "general", Label: "General", Icon: "settings", Order: 0},
+		{ID: "alias", Label: "Alias", Icon: "git-merge", Order: 5},
 		{ID: "llm", Label: "LLM", Icon: "cpu", Order: 10},
 		{ID: "parameters", Label: "Parameters", Icon: "sliders", Order: 20},
 		{ID: "templates", Label: "Templates", Icon: "file-text", Order: 30},
diff --git a/core/config/model_config.go b/core/config/model_config.go
index 75136ec6c..50836b99e 100644
--- a/core/config/model_config.go
+++ b/core/config/model_config.go
@@ -37,6 +37,12 @@ type ModelConfig struct {
 	schema.PredictionOptions `yaml:"parameters,omitempty" json:"parameters,omitempty"`
 	Name                     string `yaml:"name,omitempty" json:"name,omitempty"`
 
+	// Alias, when set, makes this config a pure redirect: every request for
+	// Name is served by the model named here. All other fields are ignored.
+	// The target must be an existing, non-alias model (enforced at load and
+	// at create/swap time). See docs/content for Model Aliases.
+	Alias string `yaml:"alias,omitempty" json:"alias,omitempty"`
+
 	F16                 *bool               `yaml:"f16,omitempty" json:"f16,omitempty"`
 	Threads             *int                `yaml:"threads,omitempty" json:"threads,omitempty"`
 	Debug               *bool               `yaml:"debug,omitempty" json:"debug,omitempty"`
@@ -391,6 +397,10 @@ func (c *ModelConfig) HasRouter() bool {
 	return len(c.Router.Candidates) > 0
 }
 
+// IsAlias reports whether this config is a pure redirect to another model.
+// Value receiver so it is callable on non-addressable config values too.
+func (c ModelConfig) IsAlias() bool { return c.Alias != "" }
+
 // @Description PII filtering configuration. PII redaction is per-model so
 // that local models don't pay the latency or behaviour change of regex
 // scanning, while cloud-bound traffic (cloud-proxy backend) can default to
@@ -1248,6 +1258,22 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
 }
 
 func (c *ModelConfig) Validate() (bool, error) {
+	// An alias is a pure redirect: validate only its own shape here. Target
+	// existence and the no-chain rule need the full config set, so the loader
+	// (load-time) and the create/swap endpoints enforce those.
+	if c.IsAlias() {
+		if c.Name == "" {
+			return false, fmt.Errorf("alias config requires a name")
+		}
+		if c.Alias == c.Name {
+			return false, fmt.Errorf("alias %q cannot point to itself", c.Name)
+		}
+		if c.Backend != "" || c.Model != "" {
+			return false, fmt.Errorf("alias config %q must not set backend or parameters.model: an alias is a pure redirect", c.Name)
+		}
+		return true, nil
+	}
+
 	downloadedFileNames := []string{}
 	for _, f := range c.DownloadFiles {
 		downloadedFileNames = append(downloadedFileNames, f.Filename)
diff --git a/core/config/model_config_loader.go b/core/config/model_config_loader.go
index 89f4bc5cb..e2f43e83f 100644
--- a/core/config/model_config_loader.go
+++ b/core/config/model_config_loader.go
@@ -294,6 +294,44 @@ func (bcl *ModelConfigLoader) UpdateModelConfig(m string, updater func(*ModelCon
 	}
 }
 
+// ResolveAlias follows a one-hop alias to its target config. Returns
+// (resolved, wasAlias, err). Non-alias configs return (cfg, false, nil)
+// unchanged. Strict: the target must exist and must not itself be an alias
+// (chains are rejected). The returned config is a copy of the target.
+func (bcl *ModelConfigLoader) ResolveAlias(cfg *ModelConfig) (*ModelConfig, bool, error) {
+	if cfg == nil || !cfg.IsAlias() {
+		return cfg, false, nil
+	}
+	target, exists := bcl.GetModelConfig(cfg.Alias)
+	if !exists {
+		return nil, true, fmt.Errorf("alias %q points to unknown model %q", cfg.Name, cfg.Alias)
+	}
+	if target.IsAlias() {
+		return nil, true, fmt.Errorf("alias %q points to another alias %q (chains are not allowed)", cfg.Name, cfg.Alias)
+	}
+	return &target, true, nil
+}
+
+// ValidateAliasTarget checks an alias config's target at create/swap time:
+// the target must exist, must not be an alias, and must not be disabled.
+// Returns nil for non-alias configs.
+func (bcl *ModelConfigLoader) ValidateAliasTarget(cfg *ModelConfig) error {
+	if cfg == nil || !cfg.IsAlias() {
+		return nil
+	}
+	target, exists := bcl.GetModelConfig(cfg.Alias)
+	if !exists {
+		return fmt.Errorf("alias target %q does not exist", cfg.Alias)
+	}
+	if target.IsAlias() {
+		return fmt.Errorf("alias target %q is itself an alias (chains are not allowed)", cfg.Alias)
+	}
+	if target.IsDisabled() {
+		return fmt.Errorf("alias target %q is disabled", cfg.Alias)
+	}
+	return nil
+}
+
 // Preload prepare models if they are not local but url or huggingface repositories
 func (bcl *ModelConfigLoader) Preload(modelPath string) error {
 	bcl.Lock()
@@ -475,5 +513,21 @@ func (bcl *ModelConfigLoader) LoadModelConfigsFromPath(path string, opts ...Conf
 		}
 	}
 
+	// Surface aliases whose targets are missing or themselves aliases. These
+	// resolve to a clear request-time error; warning here gives operators
+	// visibility without failing startup.
+	for name, c := range bcl.configs {
+		if !c.IsAlias() {
+			continue
+		}
+		target, ok := bcl.configs[c.Alias]
+		switch {
+		case !ok:
+			xlog.Warn("alias points to unknown model", "alias", name, "target", c.Alias)
+		case target.IsAlias():
+			xlog.Warn("alias points to another alias (chains are not allowed)", "alias", name, "target", c.Alias)
+		}
+	}
+
 	return nil
 }
diff --git a/core/config/model_config_loader_test.go b/core/config/model_config_loader_test.go
index 924a4d1e4..06ab65a20 100644
--- a/core/config/model_config_loader_test.go
+++ b/core/config/model_config_loader_test.go
@@ -61,3 +61,51 @@ var _ = Describe("ModelConfigLoader.GetModelsConflictingWith", func() {
 		Expect(bcl.GetModelsConflictingWith("a")).To(ConsistOf("b"))
 	})
 })
+
+var _ = Describe("ModelConfigLoader alias resolution", func() {
+	var loader *ModelConfigLoader
+
+	BeforeEach(func() {
+		loader = NewModelConfigLoader("")
+		loader.configs["real"] = ModelConfig{Name: "real", Backend: "llama-cpp"}
+		loader.configs["gpt-4"] = ModelConfig{Name: "gpt-4", Alias: "real"}
+		loader.configs["chain"] = ModelConfig{Name: "chain", Alias: "gpt-4"}
+		loader.configs["dangling"] = ModelConfig{Name: "dangling", Alias: "nope"}
+	})
+
+	It("returns non-alias configs unchanged", func() {
+		cfg := loader.configs["real"]
+		got, was, err := loader.ResolveAlias(&cfg)
+		Expect(err).ToNot(HaveOccurred())
+		Expect(was).To(BeFalse())
+		Expect(got.Name).To(Equal("real"))
+	})
+
+	It("resolves an alias to its target", func() {
+		cfg := loader.configs["gpt-4"]
+		got, was, err := loader.ResolveAlias(&cfg)
+		Expect(err).ToNot(HaveOccurred())
+		Expect(was).To(BeTrue())
+		Expect(got.Name).To(Equal("real"))
+	})
+
+	It("rejects an alias chain", func() {
+		cfg := loader.configs["chain"]
+		_, was, err := loader.ResolveAlias(&cfg)
+		Expect(was).To(BeTrue())
+		Expect(err).To(MatchError(ContainSubstring("chains are not allowed")))
+	})
+
+	It("rejects a dangling alias", func() {
+		cfg := loader.configs["dangling"]
+		_, _, err := loader.ResolveAlias(&cfg)
+		Expect(err).To(MatchError(ContainSubstring("unknown model")))
+	})
+
+	It("ValidateAliasTarget passes for a real target and fails for a chain", func() {
+		good := loader.configs["gpt-4"]
+		Expect(loader.ValidateAliasTarget(&good)).ToNot(HaveOccurred())
+		bad := loader.configs["chain"]
+		Expect(loader.ValidateAliasTarget(&bad)).To(MatchError(ContainSubstring("itself an alias")))
+	})
+})
diff --git a/core/config/model_config_test.go b/core/config/model_config_test.go
index 7f256354d..2f2f3fd82 100644
--- a/core/config/model_config_test.go
+++ b/core/config/model_config_test.go
@@ -787,3 +787,32 @@ var _ = Describe("pattern detector config", func() {
 		Expect(err).To(MatchError(ContainSubstring("pattern \"EMAILish\"")))
 	})
 })
+
+var _ = Describe("ModelConfig alias", func() {
+	It("reports IsAlias when alias is set", func() {
+		c := ModelConfig{Name: "gpt-4", Alias: "my-llama-3"}
+		Expect(c.IsAlias()).To(BeTrue())
+		Expect(ModelConfig{Name: "real"}.IsAlias()).To(BeFalse())
+	})
+
+	It("validates a minimal alias config", func() {
+		c := ModelConfig{Name: "gpt-4", Alias: "my-llama-3"}
+		ok, err := c.Validate()
+		Expect(err).ToNot(HaveOccurred())
+		Expect(ok).To(BeTrue())
+	})
+
+	It("rejects an alias pointing to itself", func() {
+		c := ModelConfig{Name: "loop", Alias: "loop"}
+		ok, err := c.Validate()
+		Expect(ok).To(BeFalse())
+		Expect(err).To(MatchError(ContainSubstring("itself")))
+	})
+
+	It("rejects an alias that also sets a backend", func() {
+		c := ModelConfig{Name: "gpt-4", Alias: "my-llama-3", Backend: "llama-cpp"}
+		ok, err := c.Validate()
+		Expect(ok).To(BeFalse())
+		Expect(err).To(MatchError(ContainSubstring("pure redirect")))
+	})
+})
diff --git a/core/http/endpoints/localai/aliases.go b/core/http/endpoints/localai/aliases.go
new file mode 100644
index 000000000..923e22c63
--- /dev/null
+++ b/core/http/endpoints/localai/aliases.go
@@ -0,0 +1,33 @@
+package localai
+
+import (
+	"net/http"
+
+	"github.com/labstack/echo/v4"
+	"github.com/mudler/LocalAI/core/config"
+)
+
+// AliasInfo is one alias -> target pair.
+type AliasInfo struct {
+	Name   string `json:"name"`
+	Target string `json:"target"`
+}
+
+// ListAliasesEndpoint returns every configured model alias and its target.
+//
+//	@Summary	List model aliases
+//	@Tags		models
+//	@Success	200	{array}	AliasInfo
+//	@Router		/api/aliases [get]
+func ListAliasesEndpoint(cl *config.ModelConfigLoader) echo.HandlerFunc {
+	return func(c echo.Context) error {
+		// Non-nil so an empty result marshals as [] rather than null.
+		out := []AliasInfo{}
+		for _, cfg := range cl.GetAllModelsConfigs() {
+			if cfg.IsAlias() {
+				out = append(out, AliasInfo{Name: cfg.Name, Target: cfg.Alias})
+			}
+		}
+		return c.JSON(http.StatusOK, out)
+	}
+}
diff --git a/core/http/endpoints/localai/aliases_test.go b/core/http/endpoints/localai/aliases_test.go
new file mode 100644
index 000000000..e1c44898a
--- /dev/null
+++ b/core/http/endpoints/localai/aliases_test.go
@@ -0,0 +1,57 @@
+package localai_test
+
+import (
+	"net/http"
+	"net/http/httptest"
+	"os"
+	"path/filepath"
+
+	"github.com/labstack/echo/v4"
+	"github.com/mudler/LocalAI/core/config"
+	. "github.com/mudler/LocalAI/core/http/endpoints/localai"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("ListAliasesEndpoint", func() {
+	var tempDir string
+
+	BeforeEach(func() {
+		var err error
+		tempDir, err = os.MkdirTemp("", "localai-aliases-test")
+		Expect(err).ToNot(HaveOccurred())
+	})
+	AfterEach(func() {
+		_ = os.RemoveAll(tempDir)
+	})
+
+	It("returns only alias configs as name/target pairs", func() {
+		// Seed one real model and one alias pointing at it.
+		Expect(os.WriteFile(
+			filepath.Join(tempDir, "real.yaml"),
+			[]byte("name: real\nbackend: llama-cpp\nmodel: foo\n"),
+			0644,
+		)).To(Succeed())
+		Expect(os.WriteFile(
+			filepath.Join(tempDir, "gpt-4.yaml"),
+			[]byte("name: gpt-4\nalias: real\n"),
+			0644,
+		)).To(Succeed())
+
+		loader := config.NewModelConfigLoader(tempDir)
+		Expect(loader.LoadModelConfigsFromPath(tempDir)).To(Succeed())
+
+		app := echo.New()
+		app.GET("/api/aliases", ListAliasesEndpoint(loader))
+
+		req := httptest.NewRequest("GET", "/api/aliases", nil)
+		rec := httptest.NewRecorder()
+		app.ServeHTTP(rec, req)
+
+		Expect(rec.Code).To(Equal(http.StatusOK))
+		Expect(rec.Body.String()).To(ContainSubstring(`"name":"gpt-4"`))
+		Expect(rec.Body.String()).To(ContainSubstring(`"target":"real"`))
+		// The real model must not appear as an alias entry.
+		Expect(rec.Body.String()).ToNot(ContainSubstring(`"name":"real"`))
+	})
+})
diff --git a/core/http/endpoints/localai/import_model.go b/core/http/endpoints/localai/import_model.go
index dc225abdd..54a80a9cc 100644
--- a/core/http/endpoints/localai/import_model.go
+++ b/core/http/endpoints/localai/import_model.go
@@ -181,6 +181,12 @@ func ImportModelEndpoint(cl *config.ModelConfigLoader, appConfig *config.Applica
 			return c.JSON(http.StatusBadRequest, ModelResponse{Success: false, Error: msg})
 		}
 
+		// Reject aliases whose target is missing, chained, or disabled so a
+		// dangling alias can't be persisted and surface as a runtime error later.
+		if err := cl.ValidateAliasTarget(&modelConfig); err != nil {
+			return c.JSON(http.StatusBadRequest, ModelResponse{Success: false, Error: err.Error()})
+		}
+
 		// Create the configuration file
 		configPath := filepath.Join(appConfig.SystemState.Model.ModelsPath, modelConfig.Name+".yaml")
 		if err := utils.VerifyPath(modelConfig.Name+".yaml", appConfig.SystemState.Model.ModelsPath); err != nil {
diff --git a/core/http/endpoints/mcp/localai_assistant_test.go b/core/http/endpoints/mcp/localai_assistant_test.go
index 26cd2878f..8de7355c6 100644
--- a/core/http/endpoints/mcp/localai_assistant_test.go
+++ b/core/http/endpoints/mcp/localai_assistant_test.go
@@ -51,6 +51,12 @@ func (stubClient) EditModelConfig(_ context.Context, _ string, _ map[string]any)
 	return nil
 }
 func (stubClient) ReloadModels(_ context.Context) error { return nil }
+func (stubClient) SetAlias(_ context.Context, _, _ string) error {
+	return nil
+}
+func (stubClient) ListAliases(_ context.Context) ([]localaitools.AliasInfo, error) {
+	return nil, nil
+}
 func (stubClient) ListBackends(_ context.Context) ([]localaitools.Backend, error) {
 	return []localaitools.Backend{{Name: "stub-backend", Installed: true}}, nil
 }
diff --git a/core/http/middleware/request.go b/core/http/middleware/request.go
index ff0d929ac..74f7e8565 100644
--- a/core/http/middleware/request.go
+++ b/core/http/middleware/request.go
@@ -167,6 +167,27 @@ func (re *RequestExtractor) SetModelAndConfig(initializer func() schema.LocalAIR
 				}
 			}
 
+			// Resolve a model alias to its target before the disabled check and
+			// before storing MODEL_CONFIG, so every modality (chat, embeddings,
+			// tts, image, ...) inherits redirection. The response keeps echoing
+			// the alias name (input.ModelName is left unchanged); usage accounting
+			// records requested=alias / served=target.
+			if cfg != nil && cfg.IsAlias() {
+				resolved, _, aliasErr := re.modelConfigLoader.ResolveAlias(cfg)
+				if aliasErr != nil {
+					return c.JSON(http.StatusBadRequest, schema.ErrorResponse{
+						Error: &schema.APIError{
+							Message: aliasErr.Error(),
+							Code:    http.StatusBadRequest,
+							Type:    "invalid_request_error",
+						},
+					})
+				}
+				c.Set(ContextKeyRequestedModel, modelName)
+				c.Set(ContextKeyServedModel, resolved.Name)
+				cfg = resolved
+			}
+
 			// Check if the model is disabled
 			if cfg != nil && cfg.IsDisabled() {
 				return c.JSON(http.StatusForbidden, schema.ErrorResponse{
diff --git a/core/http/middleware/request_test.go b/core/http/middleware/request_test.go
index fe9fc926c..010379714 100644
--- a/core/http/middleware/request_test.go
+++ b/core/http/middleware/request_test.go
@@ -151,6 +151,107 @@ var _ = Describe("SetModelAndConfig middleware", func() {
 	})
 })
 
+// ---------------------------------------------------------------------------
+// SetModelAndConfig - model alias resolution
+// ---------------------------------------------------------------------------
+//
+// An alias config (`alias: <target>`) is a pure redirect: the middleware must
+// swap MODEL_CONFIG to the target config before the disabled check and before
+// storing it, while leaving the response-facing model name as the alias. It
+// also stamps routing.requested_model = alias and routing.served_model =
+// target so usage accounting records both identities.
+var _ = Describe("SetModelAndConfig alias resolution", func() {
+	var (
+		modelDir       string
+		capturedConfig *config.ModelConfig
+		capturedReq    any
+		capturedServed any
+		app            *echo.Echo
+	)
+
+	BeforeEach(func() {
+		var err error
+		modelDir, err = os.MkdirTemp("", "localai-alias-*")
+		Expect(err).ToNot(HaveOccurred())
+	})
+
+	AfterEach(func() {
+		_ = os.RemoveAll(modelDir)
+	})
+
+	// buildApp seeds the loader from every YAML in modelDir (so an alias's
+	// target is present in the loader map) and wires a handler that captures
+	// the resolved config plus the stamped identity keys.
+	buildApp := func() *echo.Echo {
+		ss := &system.SystemState{Model: system.Model{ModelsPath: modelDir}}
+		appConfig := config.NewApplicationConfig()
+		appConfig.SystemState = ss
+
+		mcl := config.NewModelConfigLoader(modelDir)
+		Expect(mcl.LoadModelConfigsFromPath(modelDir)).To(Succeed())
+		ml := model.NewModelLoader(ss)
+		re := NewRequestExtractor(mcl, ml, appConfig)
+
+		capturedConfig = nil
+		capturedReq = nil
+		capturedServed = nil
+		e := echo.New()
+		e.POST("/v1/chat/completions",
+			func(c echo.Context) error {
+				if cfg, ok := c.Get(CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.ModelConfig); ok {
+					capturedConfig = cfg
+				}
+				capturedReq = c.Get(ContextKeyRequestedModel)
+				capturedServed = c.Get(ContextKeyServedModel)
+				return c.String(http.StatusOK, "ok")
+			},
+			re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.OpenAIRequest) }),
+		)
+		return e
+	}
+
+	It("serves the target config but keeps the alias name and stamps identity", func() {
+		Expect(os.WriteFile(filepath.Join(modelDir, "real.yaml"),
+			[]byte("name: real\nbackend: llama-cpp\n"), 0644)).To(Succeed())
+		Expect(os.WriteFile(filepath.Join(modelDir, "gpt-4.yaml"),
+			[]byte("name: gpt-4\nalias: real\n"), 0644)).To(Succeed())
+		app = buildApp()
+
+		req := httptest.NewRequest(http.MethodPost, "/v1/chat/completions",
+			strings.NewReader(`{"model":"gpt-4","messages":[{"role":"user","content":"hi"}]}`))
+		req.Header.Set("Content-Type", "application/json")
+		rec := httptest.NewRecorder()
+		app.ServeHTTP(rec, req)
+
+		Expect(rec.Code).To(Equal(http.StatusOK))
+		Expect(capturedConfig).ToNot(BeNil())
+		// MODEL_CONFIG must be the target, not the alias stub.
+		Expect(capturedConfig.Name).To(Equal("real"))
+		Expect(capturedConfig.IsAlias()).To(BeFalse())
+		// Identity stamps: requested = alias, served = target.
+		Expect(capturedReq).To(Equal("gpt-4"))
+		Expect(capturedServed).To(Equal("real"))
+	})
+
+	It("returns 400 when the alias target is missing", func() {
+		Expect(os.WriteFile(filepath.Join(modelDir, "gpt-4.yaml"),
+			[]byte("name: gpt-4\nalias: nope\n"), 0644)).To(Succeed())
+		app = buildApp()
+
+		req := httptest.NewRequest(http.MethodPost, "/v1/chat/completions",
+			strings.NewReader(`{"model":"gpt-4","messages":[{"role":"user","content":"hi"}]}`))
+		req.Header.Set("Content-Type", "application/json")
+		rec := httptest.NewRecorder()
+		app.ServeHTTP(rec, req)
+
+		Expect(rec.Code).To(Equal(http.StatusBadRequest))
+		var resp schema.ErrorResponse
+		Expect(json.Unmarshal(rec.Body.Bytes(), &resp)).To(Succeed())
+		Expect(resp.Error).ToNot(BeNil())
+		Expect(resp.Error.Type).To(Equal("invalid_request_error"))
+	})
+})
+
 // ---------------------------------------------------------------------------
 // MergeOpenResponsesConfig — tool_choice parsing
 // ---------------------------------------------------------------------------
diff --git a/core/http/middleware/route_model.go b/core/http/middleware/route_model.go
index 7ff286af4..470bd05f5 100644
--- a/core/http/middleware/route_model.go
+++ b/core/http/middleware/route_model.go
@@ -189,7 +189,12 @@ func RouteModel(loader *config.ModelConfigLoader, appConfig *config.ApplicationC
 			}
 
 			c.Set(CONTEXT_LOCALS_KEY_MODEL_CONFIG, result.ChosenConfig)
-			c.Set(ContextKeyRequestedModel, result.RouterModel)
+			// Preserve an upstream requested model (e.g. an alias that points
+			// at this router model) so accounting keeps the name the client
+			// actually sent. Served always reflects the final candidate.
+			if c.Get(ContextKeyRequestedModel) == nil {
+				c.Set(ContextKeyRequestedModel, result.RouterModel)
+			}
 			c.Set(ContextKeyServedModel, result.ChosenModel)
 
 			if store != nil {
diff --git a/core/http/react-ui/e2e/alias-template.spec.js b/core/http/react-ui/e2e/alias-template.spec.js
new file mode 100644
index 000000000..f3b1a0ca0
--- /dev/null
+++ b/core/http/react-ui/e2e/alias-template.spec.js
@@ -0,0 +1,77 @@
+import { test, expect } from './coverage-fixtures.js'
+
+// Alias / Routing template + Manage alias badge regression tests.
+//
+// An alias is a model config with `alias: <target>` that redirects traffic to
+// the target model. This covers the two discoverability surfaces:
+//   - the create-flow template gallery exposes an "Alias / Routing" card that
+//     seeds a minimal name + alias config
+//   - the Manage Models tab renders a read-only "alias -> target" badge on
+//     rows that resolve to an alias (looked up via GET /api/aliases, since the
+//     capabilities row payload doesn't carry the alias field)
+
+// Minimal metadata so the editor renders the alias field once the template
+// loads. Mirrors the Task 7 config-meta registry, which surfaces `alias` as a
+// model-select component.
+const ALIAS_METADATA = {
+  sections: [
+    { id: 'general', label: 'General', icon: 'settings', order: 0 },
+    { id: 'other', label: 'Other', icon: 'more-horizontal', order: 100 },
+  ],
+  fields: [
+    { path: 'name', yaml_key: 'name', go_type: 'string', ui_type: 'string',
+      section: 'general', label: 'Model Name', component: 'input', order: 0 },
+    { path: 'alias', yaml_key: 'alias', go_type: 'string', ui_type: 'string',
+      section: 'general', label: 'Alias', component: 'model-select', autocomplete_provider: 'models',
+      description: 'Redirect this model name to another configured model.', order: 1 },
+  ],
+}
+
+test.describe('Alias template - create flow', () => {
+  test.beforeEach(async ({ page }) => {
+    await page.route('**/api/auth/status', (route) =>
+      route.fulfill({ contentType: 'application/json', body: JSON.stringify({ authEnabled: false, staticApiKeyRequired: false, providers: [] }) }))
+    await page.route('**/api/models/config-metadata*', (route) =>
+      route.fulfill({ contentType: 'application/json', body: JSON.stringify(ALIAS_METADATA) }))
+    await page.route('**/api/models/config-metadata/autocomplete/**', (route) =>
+      route.fulfill({ contentType: 'application/json', body: JSON.stringify({ values: [] }) }))
+
+    page.on('pageerror', (err) => {
+      throw new Error(`uncaught page error: ${err.message}`)
+    })
+  })
+
+  test('template gallery exposes the Alias / Routing card', async ({ page }) => {
+    await page.goto('/app/model-editor')
+    await expect(page.getByRole('button', { name: /Alias \/ Routing/i })).toBeVisible({ timeout: 10_000 })
+  })
+
+  test('alias template loads the editor with the alias field', async ({ page }) => {
+    await page.goto('/app/model-editor?template=alias')
+    await expect(page.getByText(/Unexpected Application Error/i)).toHaveCount(0)
+    await expect(page.locator('h1.page-title')).toBeVisible({ timeout: 10_000 })
+    await expect(page.getByText('Alias').first()).toBeVisible()
+  })
+})
+
+test.describe('Manage - alias badge', () => {
+  test.beforeEach(async ({ page }) => {
+    await page.route('**/api/auth/status', (route) =>
+      route.fulfill({ contentType: 'application/json', body: JSON.stringify({ authEnabled: false, staticApiKeyRequired: false, providers: [] }) }))
+    await page.route('**/api/models/capabilities', (route) =>
+      route.fulfill({ contentType: 'application/json', body: JSON.stringify({ data: [
+        { id: 'fast-llm', capabilities: ['chat'], backend: 'llama-cpp' },
+        { id: 'gpt-4', capabilities: ['chat'], backend: 'llama-cpp' },
+      ] }) }))
+    await page.route('**/api/aliases', (route) =>
+      route.fulfill({ contentType: 'application/json', body: JSON.stringify([{ name: 'gpt-4', target: 'fast-llm' }]) }))
+  })
+
+  test('renders a read-only alias -> target badge on aliased rows', async ({ page }) => {
+    await page.goto('/app/manage')
+    await expect(page.locator('.table')).toBeVisible({ timeout: 10_000 })
+
+    // The aliased row shows the target; the plain model row does not.
+    await expect(page.getByText('alias -> fast-llm')).toBeVisible({ timeout: 10_000 })
+  })
+})
diff --git a/core/http/react-ui/src/pages/Manage.jsx b/core/http/react-ui/src/pages/Manage.jsx
index 48d18c33c..16d04f709 100644
--- a/core/http/react-ui/src/pages/Manage.jsx
+++ b/core/http/react-ui/src/pages/Manage.jsx
@@ -133,6 +133,10 @@ export default function Manage() {
   const { enrichModel, enrichBackend } = useGalleryEnrichment()
   const { operations } = useOperations()
   const [loadedModelIds, setLoadedModelIds] = useState(new Set())
+  // Map of alias name -> target. The capabilities endpoint that feeds the row
+  // list doesn't carry the alias field, so we fetch it once and look rows up by
+  // name to render the read-only "alias -> target" badge.
+  const [aliasTargets, setAliasTargets] = useState({})
   const [backends, setBackends] = useState([])
   const [backendsLoading, setBackendsLoading] = useState(true)
   const [reloading, setReloading] = useState(false)
@@ -228,12 +232,24 @@ export default function Manage() {
     }
   }, [])
 
+  const fetchAliases = useCallback(async () => {
+    try {
+      const data = await modelsApi.listAliases()
+      const map = {}
+      for (const a of Array.isArray(data) ? data : []) map[a.name] = a.target
+      setAliasTargets(map)
+    } catch {
+      setAliasTargets({})
+    }
+  }, [])
+
   useEffect(() => {
     fetchLoadedModels()
     fetchBackends()
+    fetchAliases()
     // Detect distributed mode (nodes API returns 503 when not enabled)
     nodesApi.list().then(() => setDistributedMode(true)).catch(() => {})
-  }, [fetchLoadedModels, fetchBackends])
+  }, [fetchLoadedModels, fetchBackends, fetchAliases])
 
   // Auto-refresh the Models tab every 10s in distributed mode so ghost models
   // (loaded on a worker but absent from this frontend's in-memory cache)
@@ -636,6 +652,11 @@ export default function Manage() {
                               <i className="fas fa-thumbtack" /> Pinned
                             </span>
                           )}
+                          {aliasTargets[model.id] && (
+                            <span className="badge badge-info" title={`Alias -> ${aliasTargets[model.id]}`}>
+                              <i className="fas fa-arrow-right-arrow-left" /> alias -&gt; {aliasTargets[model.id]}
+                            </span>
+                          )}
                         </div>
                       </td>
                       <td>
diff --git a/core/http/react-ui/src/utils/api.js b/core/http/react-ui/src/utils/api.js
index a8ffa2f04..20bb90363 100644
--- a/core/http/react-ui/src/utils/api.js
+++ b/core/http/react-ui/src/utils/api.js
@@ -84,6 +84,7 @@ export const modelsApi = {
   list: (params) => fetchJSON(buildUrl(API_CONFIG.endpoints.models, params)),
   listV1: () => fetchJSON(API_CONFIG.endpoints.modelsList),
   listCapabilities: () => fetchJSON(API_CONFIG.endpoints.modelsCapabilities),
+  listAliases: () => fetchJSON(API_CONFIG.endpoints.modelsAliases),
   install: (id) => postJSON(API_CONFIG.endpoints.installModel(id), {}),
   delete: (id) => postJSON(API_CONFIG.endpoints.deleteModel(id), {}),
   estimate: (id, contexts) => fetchJSON(
diff --git a/core/http/react-ui/src/utils/config.js b/core/http/react-ui/src/utils/config.js
index cf83d590f..65797fe41 100644
--- a/core/http/react-ui/src/utils/config.js
+++ b/core/http/react-ui/src/utils/config.js
@@ -95,6 +95,7 @@ export const API_CONFIG = {
 
     modelsList: '/v1/models',
     modelsCapabilities: '/api/models/capabilities',
+    modelsAliases: '/api/aliases',
 
     // Realtime / WebRTC
     realtimeCalls: '/v1/realtime/calls',
diff --git a/core/http/react-ui/src/utils/modelTemplates.js b/core/http/react-ui/src/utils/modelTemplates.js
index 54d34aecc..c3675f9db 100644
--- a/core/http/react-ui/src/utils/modelTemplates.js
+++ b/core/http/react-ui/src/utils/modelTemplates.js
@@ -142,6 +142,16 @@ const MODEL_TEMPLATES = [
       ],
     },
   },
+  {
+    id: 'alias',
+    label: 'Alias / Routing',
+    icon: 'fa-arrow-right-arrow-left',
+    description: 'Point a model name at another configured model. Clients keep calling the alias; you swap the target anytime.',
+    fields: {
+      'name': '',
+      'alias': '',
+    },
+  },
   {
     id: 'mitm',
     label: 'MITM Intercept',
diff --git a/core/http/routes/localai.go b/core/http/routes/localai.go
index a66801556..1df1d5d8c 100644
--- a/core/http/routes/localai.go
+++ b/core/http/routes/localai.go
@@ -80,6 +80,9 @@ func RegisterLocalAIRoutes(router *echo.Echo,
 		// Custom model edit endpoint
 		router.POST("/models/edit/:name", localai.EditModelEndpoint(cl, ml, appConfig), adminMiddleware)
 
+		// List model aliases endpoint
+		router.GET("/api/aliases", localai.ListAliasesEndpoint(cl), adminMiddleware)
+
 		// Toggle model enable/disable endpoint
 		router.PUT("/models/toggle-state/:name/:action", localai.ToggleStateModelEndpoint(cl, ml, appConfig), adminMiddleware)
 
@@ -303,6 +306,7 @@ func RegisterLocalAIRoutes(router *echo.Echo,
 					"edit":         "/models/edit/:name",
 					"import":       "/models/import",
 					"reload":       "/models/reload",
+					"list_aliases": "/api/aliases",
 				},
 				"ai_functions": map[string]string{
 					"tts":       "/tts",
diff --git a/core/services/modeladmin/config.go b/core/services/modeladmin/config.go
index c01e2fb4c..f4fc53d97 100644
--- a/core/services/modeladmin/config.go
+++ b/core/services/modeladmin/config.go
@@ -130,6 +130,9 @@ func (s *ConfigService) PatchConfig(_ context.Context, name string, patch map[st
 		}
 		return nil, ErrInvalidConfig
 	}
+	if err := s.Loader.ValidateAliasTarget(&updated); err != nil {
+		return nil, fmt.Errorf("%w: %v", ErrInvalidConfig, err)
+	}
 	if err := writeFileAtomic(configPath, yamlData, 0644); err != nil {
 		return nil, fmt.Errorf("write config file: %w", err)
 	}
@@ -215,6 +218,9 @@ func (s *ConfigService) EditYAML(_ context.Context, name string, body []byte, ml
 	if valid, _ := req.Validate(); !valid {
 		return nil, ErrInvalidConfig
 	}
+	if err := s.Loader.ValidateAliasTarget(&req); err != nil {
+		return nil, fmt.Errorf("%w: %v", ErrInvalidConfig, err)
+	}
 
 	configPath := existing.GetModelConfigFile()
 	modelsPath := s.modelsPath()
diff --git a/core/services/modeladmin/config_test.go b/core/services/modeladmin/config_test.go
index d4157047d..36569c19b 100644
--- a/core/services/modeladmin/config_test.go
+++ b/core/services/modeladmin/config_test.go
@@ -211,5 +211,23 @@ var _ = Describe("ConfigService", func() {
 			_, err := svc.EditYAML(ctx, "alpha", nil, nil)
 			Expect(err).To(MatchError(ErrEmptyBody))
 		})
+
+		It("rejects editing a config into an alias with a missing target", func() {
+			writeModelYAML(svc, dir, "base", map[string]any{"backend": "llama-cpp"})
+
+			body := []byte("name: base\nalias: ghost\n")
+			_, err := svc.EditYAML(ctx, "base", body, nil)
+			Expect(err).To(MatchError(ErrInvalidConfig))
+			Expect(err.Error()).To(ContainSubstring("ghost"))
+		})
+
+		It("accepts editing a config into an alias with a real target", func() {
+			writeModelYAML(svc, dir, "base", map[string]any{"backend": "llama-cpp"})
+			writeModelYAML(svc, dir, "target", map[string]any{"backend": "llama-cpp"})
+
+			body := []byte("name: base\nalias: target\n")
+			_, err := svc.EditYAML(ctx, "base", body, nil)
+			Expect(err).ToNot(HaveOccurred())
+		})
 	})
 })
diff --git a/docs/content/features/model-aliases.md b/docs/content/features/model-aliases.md
new file mode 100644
index 000000000..8c4bd977d
--- /dev/null
+++ b/docs/content/features/model-aliases.md
@@ -0,0 +1,81 @@
+
++++
+disableToc = false
+title = "Model Aliases"
+weight = 24
+url = "/features/model-aliases/"
++++
+
+A **model alias** is a model name that redirects all traffic to another
+configured model. Declare `gpt-4` as an alias of `my-llama-3` and every client
+calling `gpt-4` is served by `my-llama-3` with no client reconfiguration: the
+clients keep their existing model name while you control what answers them on
+the server side.
+
+## Declaring an alias
+
+Create a minimal config file in your models directory:
+
+```yaml
+name: gpt-4
+alias: my-llama-3
+```
+
+That is the whole config: a `name` (the alias clients call) and an `alias` key
+(the target that actually serves the request).
+
+## Rules and behavior
+
+- The target (`my-llama-3`) must be an existing, non-alias, enabled model. You
+  cannot point an alias at a missing model, a disabled model, or another alias
+  (no chains).
+- Aliases are 1:1. One alias maps to exactly one target.
+- The target can be swapped live by editing the config file, calling the API,
+  using the UI, or asking the assistant. No restart is required.
+- Both `gpt-4` and `my-llama-3` appear in `GET /v1/models`.
+- Responses echo the requested alias: a call to `gpt-4` returns `gpt-4` in the
+  response `model` field, not the target name.
+- Usage accounting records both sides: requested `gpt-4`, served `my-llama-3`.
+- Aliases work for every modality (chat, embeddings, audio, images, and so on).
+
+## Managing aliases
+
+You can create, swap, and remove aliases from any of the management surfaces.
+
+### Web UI
+
+Open **Add Model** and pick the **Alias / Routing** template, then set a name
+and a target. To re-point an existing alias, edit it and change the target.
+
+### REST API
+
+- Create: `POST /models/import`
+- Swap the target: `PATCH /api/models/config-json/:name`
+- List all aliases: `GET /api/aliases`
+- Delete: `POST /models/delete/:name`
+
+### Assistant and MCP
+
+The LocalAI Assistant (and the MCP server) expose the same operations as tools:
+`set_alias`, `list_aliases`, and `delete_model`.
+
+{{% notice note %}}
+**You cannot turn an existing real model into an alias.** If you run `set_alias`
+(or `PATCH /api/models/config-json/:name`) against a name that is already a real,
+non-alias model, the request is **rejected**. An alias is a pure redirect, so it
+must not carry a `backend` or `parameters.model`; a real model does, and merging
+an `alias` onto it produces an invalid config that validation refuses with
+`alias config ... must not set backend or parameters.model`. This is intentional:
+it stops a stray `set_alias` call from clobbering a model that is serving.
+
+To add an alias, point a **new** name at the target instead of reusing an
+existing model's name. Re-pointing an **existing alias** at a different target
+is fully supported and is the live-swap path: the alias config has no backend of
+its own, so swapping its target stays a valid pure redirect.
+{{% /notice %}}
+
+## Limits
+
+Aliases are a static 1:1 redirect. For classifier-based or load-balanced
+selection across several downstream models, use the intelligent router in the
+[Middleware]({{%relref "features/middleware" %}}) feature instead.
diff --git a/pkg/mcp/localaitools/client.go b/pkg/mcp/localaitools/client.go
index 5ac519aca..f6f6114be 100644
--- a/pkg/mcp/localaitools/client.go
+++ b/pkg/mcp/localaitools/client.go
@@ -38,6 +38,14 @@ type LocalAIClient interface {
 	ReloadModels(ctx context.Context) error
 	ImportModelURI(ctx context.Context, req ImportModelURIRequest) (*ImportModelURIResponse, error)
 
+	// ---- Model aliases ----
+	// SetAlias creates the alias `name` pointing at `target`, or swaps an
+	// existing alias's target. The server validates that `target` is an
+	// existing, non-alias, enabled model. Deletion reuses DeleteModel.
+	SetAlias(ctx context.Context, name, target string) error
+	// ListAliases returns every configured alias and its target.
+	ListAliases(ctx context.Context) ([]AliasInfo, error)
+
 	// ---- Backends ----
 	// ListBackends returns installed backends. The shape stays a thin
 	// localaitools.Backend rather than gallery.SystemBackend because the
diff --git a/pkg/mcp/localaitools/coverage_test.go b/pkg/mcp/localaitools/coverage_test.go
index ddf5e9c1d..39a2ab544 100644
--- a/pkg/mcp/localaitools/coverage_test.go
+++ b/pkg/mcp/localaitools/coverage_test.go
@@ -41,6 +41,7 @@ var toolToHTTPRoute = map[string]string{
 	ToolGetPIIEvents:        "GET /api/pii/events",
 	ToolGetMiddlewareStatus: "GET /api/middleware/status",
 	ToolGetRouterDecisions:  "GET /api/router/decisions",
+	ToolListAliases:         "GET /api/aliases",
 
 	// Mutating tools.
 	ToolInstallModel:      "POST /models/apply",
@@ -53,6 +54,7 @@ var toolToHTTPRoute = map[string]string{
 	ToolToggleModelState:  "PUT /models/toggle-state/:name/:action",
 	ToolToggleModelPinned: "PUT /models/toggle-pinned/:name/:action",
 	ToolSetBranding:       "POST /api/settings (instance_name, instance_tagline)",
+	ToolSetAlias:          "PATCH /api/models/config-json/:name (swap) or POST /models/import (create)",
 }
 
 // allKnownTools is the union of expectedFullCatalog (defined in
diff --git a/pkg/mcp/localaitools/dto.go b/pkg/mcp/localaitools/dto.go
index 77e9a9065..f8aa98eee 100644
--- a/pkg/mcp/localaitools/dto.go
+++ b/pkg/mcp/localaitools/dto.go
@@ -52,6 +52,14 @@ type ModelConfigView struct {
 	JSON map[string]any `json:"json,omitempty"  jsonschema:"Parsed JSON view of the same config (convenience for diffing)."`
 }
 
+// AliasInfo is one alias -> target pair, the shape list_aliases returns and
+// GET /api/aliases emits. Kept aligned with localai.AliasInfo so the
+// MCP wire output matches the REST endpoint by construction.
+type AliasInfo struct {
+	Name   string `json:"name"`
+	Target string `json:"target"`
+}
+
 // InstallModelRequest is the input for install_model.
 type InstallModelRequest struct {
 	GalleryName string         `json:"gallery_name,omitempty" jsonschema:"The gallery the model lives in (from gallery_search). Optional when ModelName is unique across galleries."`
diff --git a/pkg/mcp/localaitools/fakes_test.go b/pkg/mcp/localaitools/fakes_test.go
index 3d76ae8b9..388245ad2 100644
--- a/pkg/mcp/localaitools/fakes_test.go
+++ b/pkg/mcp/localaitools/fakes_test.go
@@ -32,6 +32,8 @@ type fakeClient struct {
 	importModelURI      func(ImportModelURIRequest) (*ImportModelURIResponse, error)
 	deleteModel         func(string) error
 	editModelConfig     func(string, map[string]any) error
+	setAlias            func(string, string) error
+	listAliases         func() ([]AliasInfo, error)
 	reloadModels        func() error
 	listBackends        func() ([]Backend, error)
 	listKnownBackends   func() ([]schema.KnownBackend, error)
@@ -143,6 +145,22 @@ func (f *fakeClient) EditModelConfig(_ context.Context, name string, patch map[s
 	return nil
 }
 
+func (f *fakeClient) SetAlias(_ context.Context, name, target string) error {
+	f.record("SetAlias", []any{name, target})
+	if f.setAlias != nil {
+		return f.setAlias(name, target)
+	}
+	return nil
+}
+
+func (f *fakeClient) ListAliases(_ context.Context) ([]AliasInfo, error) {
+	f.record("ListAliases", nil)
+	if f.listAliases != nil {
+		return f.listAliases()
+	}
+	return []AliasInfo{}, nil
+}
+
 func (f *fakeClient) ReloadModels(_ context.Context) error {
 	f.record("ReloadModels", nil)
 	if f.reloadModels != nil {
diff --git a/pkg/mcp/localaitools/httpapi/client.go b/pkg/mcp/localaitools/httpapi/client.go
index d2947a5b1..90ec332e2 100644
--- a/pkg/mcp/localaitools/httpapi/client.go
+++ b/pkg/mcp/localaitools/httpapi/client.go
@@ -338,6 +338,42 @@ func (c *Client) ReloadModels(ctx context.Context) error {
 	return c.do(ctx, http.MethodPost, routeModelsReload, nil, nil)
 }
 
+// ---- Model aliases ----
+
+// SetAlias is swap-first: it PATCHes the alias config (a deep-merge that
+// validates the target and preserves any other fields), and only creates a
+// fresh config when the PATCH reports the model doesn't exist yet. We prefer
+// PATCH over POST /models/import for existing names because import rewrites
+// the whole file, whereas PATCH gives a reliable 404 not-found signal
+// (ErrHTTPNotFound) to branch on and never clobbers an existing config.
+func (c *Client) SetAlias(ctx context.Context, name, target string) error {
+	if name == "" {
+		return errors.New("name is required")
+	}
+	if target == "" {
+		return errors.New("target is required")
+	}
+	err := c.do(ctx, http.MethodPatch, routeModelConfigJSON(name), map[string]any{"alias": target}, nil)
+	if err == nil {
+		return nil
+	}
+	if !errors.Is(err, ErrHTTPNotFound) {
+		return err
+	}
+	// No such config yet: create it. The import endpoint validates the alias
+	// target server-side, same as the PATCH path.
+	return c.do(ctx, http.MethodPost, routeModelImport, map[string]any{"name": name, "alias": target}, nil)
+}
+
+func (c *Client) ListAliases(ctx context.Context) ([]localaitools.AliasInfo, error) {
+	// /api/aliases returns []{name,target} directly - pass it through.
+	var out []localaitools.AliasInfo
+	if err := c.do(ctx, http.MethodGet, routeAliases, nil, &out); err != nil {
+		return nil, err
+	}
+	return out, nil
+}
+
 // ---- Backends ----
 
 func (c *Client) ListBackends(ctx context.Context) ([]localaitools.Backend, error) {
diff --git a/pkg/mcp/localaitools/httpapi/client_test.go b/pkg/mcp/localaitools/httpapi/client_test.go
index 6e6fc3972..319ceffee 100644
--- a/pkg/mcp/localaitools/httpapi/client_test.go
+++ b/pkg/mcp/localaitools/httpapi/client_test.go
@@ -166,6 +166,92 @@ var _ = Describe("httpapi.Client against the LocalAI admin REST surface", func()
 	})
 })
 
+var _ = Describe("Model aliases", func() {
+	Describe("ListAliases", func() {
+		It("passes the GET /api/aliases payload through unchanged", func() {
+			srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+				Expect(r.Method).To(Equal(http.MethodGet))
+				Expect(r.URL.Path).To(Equal("/api/aliases"))
+				_ = json.NewEncoder(w).Encode([]map[string]any{
+					{"name": "gpt-4", "target": "qwen"},
+				})
+			}))
+			DeferCleanup(srv.Close)
+
+			out, err := New(srv.URL, "").ListAliases(context.Background())
+			Expect(err).ToNot(HaveOccurred())
+			Expect(out).To(HaveLen(1))
+			Expect(out[0].Name).To(Equal("gpt-4"))
+			Expect(out[0].Target).To(Equal("qwen"))
+		})
+	})
+
+	Describe("SetAlias", func() {
+		It("swaps an existing alias via PATCH without falling back to import", func() {
+			var patched, imported bool
+			srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+				switch {
+				case r.Method == http.MethodPatch && r.URL.Path == "/api/models/config-json/gpt-4":
+					patched = true
+					var body map[string]any
+					Expect(json.NewDecoder(r.Body).Decode(&body)).To(Succeed())
+					Expect(body).To(HaveKeyWithValue("alias", "qwen"))
+					_ = json.NewEncoder(w).Encode(map[string]any{"success": true})
+				case r.URL.Path == "/models/import":
+					imported = true
+					w.WriteHeader(http.StatusOK)
+				default:
+					http.Error(w, "unexpected", http.StatusTeapot)
+				}
+			}))
+			DeferCleanup(srv.Close)
+
+			Expect(New(srv.URL, "").SetAlias(context.Background(), "gpt-4", "qwen")).To(Succeed())
+			Expect(patched).To(BeTrue(), "PATCH should be attempted first")
+			Expect(imported).To(BeFalse(), "import must not run when PATCH succeeds")
+		})
+
+		It("creates a fresh alias via import when PATCH reports the model is missing", func() {
+			var imported bool
+			srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+				switch {
+				case r.Method == http.MethodPatch:
+					http.Error(w, "model configuration not found", http.StatusNotFound)
+				case r.Method == http.MethodPost && r.URL.Path == "/models/import":
+					imported = true
+					var body map[string]any
+					Expect(json.NewDecoder(r.Body).Decode(&body)).To(Succeed())
+					Expect(body).To(HaveKeyWithValue("name", "gpt-4"))
+					Expect(body).To(HaveKeyWithValue("alias", "qwen"))
+					_ = json.NewEncoder(w).Encode(map[string]any{"success": true})
+				default:
+					http.Error(w, "unexpected", http.StatusTeapot)
+				}
+			}))
+			DeferCleanup(srv.Close)
+
+			Expect(New(srv.URL, "").SetAlias(context.Background(), "gpt-4", "qwen")).To(Succeed())
+			Expect(imported).To(BeTrue(), "import should create the alias on a 404")
+		})
+
+		It("surfaces a non-404 PATCH error without attempting import", func() {
+			var imported bool
+			srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+				if r.URL.Path == "/models/import" {
+					imported = true
+				}
+				http.Error(w, "target is an alias", http.StatusBadRequest)
+			}))
+			DeferCleanup(srv.Close)
+
+			err := New(srv.URL, "").SetAlias(context.Background(), "gpt-4", "bad")
+			Expect(err).To(HaveOccurred())
+			Expect(err.Error()).To(ContainSubstring("target is an alias"))
+			Expect(imported).To(BeFalse(), "a 400 swap error must not trigger create")
+		})
+	})
+})
+
 var _ = Describe("ErrHTTPNotFound", func() {
 	Context("on a clean 404 status", func() {
 		var (
diff --git a/pkg/mcp/localaitools/httpapi/routes.go b/pkg/mcp/localaitools/httpapi/routes.go
index 79504dc1b..cc552b728 100644
--- a/pkg/mcp/localaitools/httpapi/routes.go
+++ b/pkg/mcp/localaitools/httpapi/routes.go
@@ -16,6 +16,8 @@ const (
 	routeModelsAvail     = "/models/available"
 	routeModelsGall      = "/models/galleries"
 	routeModelsImport    = "/models/import-uri"
+	routeModelImport     = "/models/import"
+	routeAliases         = "/api/aliases"
 	routeModelsReload    = "/models/reload"
 	routeBackends        = "/backends"
 	routeBackendsKnown   = "/backends/known"
diff --git a/pkg/mcp/localaitools/inproc/client.go b/pkg/mcp/localaitools/inproc/client.go
index 6e047d751..e62934ccc 100644
--- a/pkg/mcp/localaitools/inproc/client.go
+++ b/pkg/mcp/localaitools/inproc/client.go
@@ -9,6 +9,8 @@ import (
 	"encoding/json"
 	"errors"
 	"fmt"
+	"os"
+	"path/filepath"
 
 	"github.com/google/uuid"
 	"github.com/mudler/LocalAI/core/config"
@@ -25,7 +27,9 @@ import (
 	localaitools "github.com/mudler/LocalAI/pkg/mcp/localaitools"
 	"github.com/mudler/LocalAI/pkg/model"
 	"github.com/mudler/LocalAI/pkg/system"
+	"github.com/mudler/LocalAI/pkg/utils"
 	"github.com/mudler/LocalAI/pkg/vram"
+	"gopkg.in/yaml.v3"
 )
 
 // Client implements localaitools.LocalAIClient by calling LocalAI services
@@ -298,6 +302,80 @@ func (c *Client) ReloadModels(_ context.Context) error {
 	return c.ConfigLoader.LoadModelConfigsFromPath(c.SystemState.Model.ModelsPath)
 }
 
+// ---- Model aliases ----
+
+// SetAlias is swap-first to match the httpapi client: PatchConfig swaps an
+// existing alias's target (validating it and preserving other fields) and
+// returns ErrNotFound when the config doesn't exist yet, which is the signal
+// to create it. createAlias mirrors the create path of ImportModelEndpoint.
+func (c *Client) SetAlias(ctx context.Context, name, target string) error {
+	if name == "" {
+		return errors.New("name is required")
+	}
+	if target == "" {
+		return errors.New("target is required")
+	}
+	_, err := c.modelAdmin.PatchConfig(ctx, name, map[string]any{"alias": target})
+	if err == nil {
+		return nil
+	}
+	if !errors.Is(err, modeladmin.ErrNotFound) {
+		return err
+	}
+	return c.createAlias(name, target)
+}
+
+// createAlias writes a fresh `{name, alias}` config to disk and reloads,
+// mirroring localai.ImportModelEndpoint's create path: validate, validate the
+// alias target, verify the path is trusted, write, reload, best-effort preload.
+func (c *Client) createAlias(name, target string) error {
+	if c.SystemState == nil {
+		return errors.New("system state not available")
+	}
+	cfg := config.ModelConfig{Name: name, Alias: target}
+	if valid, vErr := cfg.Validate(); !valid {
+		if vErr != nil {
+			return vErr
+		}
+		return errors.New("invalid alias configuration")
+	}
+	if err := c.ConfigLoader.ValidateAliasTarget(&cfg); err != nil {
+		return err
+	}
+	modelsPath := c.SystemState.Model.ModelsPath
+	if err := utils.VerifyPath(name+".yaml", modelsPath); err != nil {
+		return fmt.Errorf("model path not trusted: %w", err)
+	}
+	// Marshal only the user-provided fields (not the full struct with Go
+	// zero values), matching what the import endpoint persists for an alias.
+	yamlData, err := yaml.Marshal(map[string]any{"name": name, "alias": target})
+	if err != nil {
+		return fmt.Errorf("marshal alias config: %w", err)
+	}
+	// 0600: the LocalAI process is the sole reader/writer of model configs,
+	// and a tighter mode keeps the gosec G306 scan clean for this new write.
+	if err := os.WriteFile(filepath.Join(modelsPath, name+".yaml"), yamlData, 0600); err != nil {
+		return fmt.Errorf("write alias config: %w", err)
+	}
+	if err := c.ConfigLoader.LoadModelConfigsFromPath(modelsPath, c.AppConfig.ToConfigLoaderOptions()...); err != nil {
+		return fmt.Errorf("reload configs: %w", err)
+	}
+	// Preload is best-effort - a failure here doesn't undo the create.
+	_ = c.ConfigLoader.Preload(modelsPath)
+	return nil
+}
+
+func (c *Client) ListAliases(_ context.Context) ([]localaitools.AliasInfo, error) {
+	// Mirror localai.ListAliasesEndpoint: every config whose Alias is set.
+	out := []localaitools.AliasInfo{}
+	for _, cfg := range c.ConfigLoader.GetAllModelsConfigs() {
+		if cfg.IsAlias() {
+			out = append(out, localaitools.AliasInfo{Name: cfg.Name, Target: cfg.Alias})
+		}
+	}
+	return out, nil
+}
+
 // ---- Backends ----
 
 func (c *Client) ListBackends(_ context.Context) ([]localaitools.Backend, error) {
diff --git a/pkg/mcp/localaitools/inproc/client_test.go b/pkg/mcp/localaitools/inproc/client_test.go
index 1da00602a..e385897c7 100644
--- a/pkg/mcp/localaitools/inproc/client_test.go
+++ b/pkg/mcp/localaitools/inproc/client_test.go
@@ -3,6 +3,8 @@ package inproc
 import (
 	"context"
 	"errors"
+	"os"
+	"path/filepath"
 	"time"
 
 	. "github.com/onsi/ginkgo/v2"
@@ -47,3 +49,78 @@ var _ = Describe("inproc.Client cancellation", func() {
 		Expect(errors.Is(err, context.Canceled)).To(BeTrue(), "got: %v", err)
 	})
 })
+
+var _ = Describe("inproc.Client model aliases", func() {
+	var (
+		ctx       context.Context
+		tempDir   string
+		cl        *config.ModelConfigLoader
+		c         *Client
+		seedModel func(name, body string)
+	)
+
+	BeforeEach(func() {
+		ctx = context.Background()
+		tempDir = GinkgoT().TempDir()
+		systemState, err := system.GetSystemState(system.WithModelPath(tempDir))
+		Expect(err).ToNot(HaveOccurred())
+		appConfig := config.NewApplicationConfig(config.WithSystemState(systemState))
+		cl = config.NewModelConfigLoader(tempDir)
+		// Gallery/model loaders are unused by the alias methods, so nil is fine.
+		c = New(appConfig, systemState, cl, nil, nil)
+
+		seedModel = func(name, body string) {
+			Expect(os.WriteFile(filepath.Join(tempDir, name+".yaml"), []byte(body), 0644)).To(Succeed())
+			Expect(cl.LoadModelConfigsFromPath(tempDir)).To(Succeed())
+		}
+	})
+
+	Describe("ListAliases", func() {
+		It("returns only configs whose alias field is set", func() {
+			seedModel("real", "name: real\nbackend: llama-cpp\n")
+			seedModel("gpt-4", "name: gpt-4\nalias: real\n")
+
+			out, err := c.ListAliases(ctx)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(out).To(ConsistOf(localaitools.AliasInfo{Name: "gpt-4", Target: "real"}))
+		})
+
+		It("returns an empty slice when there are no aliases", func() {
+			seedModel("real", "name: real\nbackend: llama-cpp\n")
+			out, err := c.ListAliases(ctx)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(out).To(BeEmpty())
+		})
+	})
+
+	Describe("SetAlias", func() {
+		It("creates a new alias config on disk when the name is unused", func() {
+			seedModel("real", "name: real\nbackend: llama-cpp\n")
+
+			Expect(c.SetAlias(ctx, "gpt-4", "real")).To(Succeed())
+
+			Expect(filepath.Join(tempDir, "gpt-4.yaml")).To(BeAnExistingFile())
+			out, err := c.ListAliases(ctx)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(out).To(ConsistOf(localaitools.AliasInfo{Name: "gpt-4", Target: "real"}))
+		})
+
+		It("swaps an existing alias's target in place", func() {
+			seedModel("real", "name: real\nbackend: llama-cpp\n")
+			seedModel("other", "name: other\nbackend: llama-cpp\n")
+			seedModel("gpt-4", "name: gpt-4\nalias: real\n")
+
+			Expect(c.SetAlias(ctx, "gpt-4", "other")).To(Succeed())
+
+			out, err := c.ListAliases(ctx)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(out).To(ConsistOf(localaitools.AliasInfo{Name: "gpt-4", Target: "other"}))
+		})
+
+		It("rejects an alias whose target does not exist", func() {
+			err := c.SetAlias(ctx, "gpt-4", "missing")
+			Expect(err).To(HaveOccurred())
+			Expect(filepath.Join(tempDir, "gpt-4.yaml")).ToNot(BeAnExistingFile())
+		})
+	})
+})
diff --git a/pkg/mcp/localaitools/server.go b/pkg/mcp/localaitools/server.go
index fd9f5da00..4b662f66b 100644
--- a/pkg/mcp/localaitools/server.go
+++ b/pkg/mcp/localaitools/server.go
@@ -43,6 +43,7 @@ func NewServer(client LocalAIClient, opts Options) *mcp.Server {
 	})
 
 	registerModelTools(srv, client, opts)
+	registerAliasTools(srv, client, opts)
 	registerBackendTools(srv, client, opts)
 	registerConfigTools(srv, client, opts)
 	registerSystemTools(srv, client, opts)
diff --git a/pkg/mcp/localaitools/server_test.go b/pkg/mcp/localaitools/server_test.go
index eb1579449..052ca1e8b 100644
--- a/pkg/mcp/localaitools/server_test.go
+++ b/pkg/mcp/localaitools/server_test.go
@@ -88,10 +88,12 @@ var expectedFullCatalog = sortedStrings(
 	ToolInstallModel,
 	ToolListBackends,
 	ToolListGalleries,
+	ToolListAliases,
 	ToolListInstalledModels,
 	ToolListKnownBackends,
 	ToolListNodes,
 	ToolReloadModels,
+	ToolSetAlias,
 	ToolSetBranding,
 	ToolSystemInfo,
 	ToolToggleModelPinned,
@@ -110,6 +112,7 @@ var expectedReadOnlyCatalog = sortedStrings(
 	ToolGetPIIEvents,
 	ToolGetRouterDecisions,
 	ToolGetUsageStats,
+	ToolListAliases,
 	ToolListBackends,
 	ToolListGalleries,
 	ToolListInstalledModels,
@@ -165,6 +168,8 @@ var _ = Describe("Tool dispatch", func() {
 		{ToolReloadModels, struct{}{}, "ReloadModels"},
 		{ToolToggleModelState, map[string]any{"name": "foo", "action": "enable"}, "ToggleModelState"},
 		{ToolToggleModelPinned, map[string]any{"name": "foo", "action": "pin"}, "ToggleModelPinned"},
+		{ToolSetAlias, map[string]any{"name": "gpt-4", "target": "real"}, "SetAlias"},
+		{ToolListAliases, struct{}{}, "ListAliases"},
 	}
 
 	for _, c := range cases {
diff --git a/pkg/mcp/localaitools/tools.go b/pkg/mcp/localaitools/tools.go
index c7bf620c3..263bd791e 100644
--- a/pkg/mcp/localaitools/tools.go
+++ b/pkg/mcp/localaitools/tools.go
@@ -36,6 +36,11 @@ const (
 	ToolToggleModelState  = "toggle_model_state"
 	ToolToggleModelPinned = "toggle_model_pinned"
 	ToolSetBranding       = "set_branding"
+	ToolSetAlias          = "set_alias"
+
+	// ToolListAliases is read-only but lives here so the alias tools stay
+	// grouped; the catalog tests assert its read-only placement.
+	ToolListAliases = "list_aliases"
 )
 
 // DefaultServerName is the MCP Implementation.Name surfaced when
diff --git a/pkg/mcp/localaitools/tools_aliases.go b/pkg/mcp/localaitools/tools_aliases.go
new file mode 100644
index 000000000..6b75619c1
--- /dev/null
+++ b/pkg/mcp/localaitools/tools_aliases.go
@@ -0,0 +1,48 @@
+package localaitools
+
+import (
+	"context"
+
+	"github.com/modelcontextprotocol/go-sdk/mcp"
+)
+
+// registerAliasTools wires the conversational alias-management tools. An
+// alias redirects all traffic for one model name to another configured
+// model; list_aliases enumerates them, set_alias creates or swaps the
+// target. Deletion reuses the existing delete_model tool, which works on
+// any config including an alias.
+func registerAliasTools(s *mcp.Server, client LocalAIClient, opts Options) {
+	mcp.AddTool(s, &mcp.Tool{
+		Name:        ToolListAliases,
+		Description: "List every configured model alias and the target model it routes to.",
+	}, func(ctx context.Context, _ *mcp.CallToolRequest, _ struct{}) (*mcp.CallToolResult, any, error) {
+		aliases, err := client.ListAliases(ctx)
+		if err != nil {
+			return errorResult(err), nil, nil
+		}
+		return jsonResult(aliases), nil, nil
+	})
+
+	if opts.DisableMutating {
+		return
+	}
+
+	mcp.AddTool(s, &mcp.Tool{
+		Name:        ToolSetAlias,
+		Description: "Create a model alias (name -> target) or swap an existing alias's target. The target must be an existing, non-alias, enabled model. Requires user confirmation per safety rule 1.",
+	}, func(ctx context.Context, _ *mcp.CallToolRequest, args struct {
+		Name   string `json:"name"   jsonschema:"The alias name clients will call."`
+		Target string `json:"target" jsonschema:"The existing model the alias routes to."`
+	}) (*mcp.CallToolResult, any, error) {
+		if args.Name == "" {
+			return errorResultf("name is required"), nil, nil
+		}
+		if args.Target == "" {
+			return errorResultf("target is required"), nil, nil
+		}
+		if err := client.SetAlias(ctx, args.Name, args.Target); err != nil {
+			return errorResult(err), nil, nil
+		}
+		return jsonResult(AliasInfo{Name: args.Name, Target: args.Target}), nil, nil
+	})
+}
diff --git a/swagger/docs.go b/swagger/docs.go
index 19cb95fd2..20a1f5a3f 100644
--- a/swagger/docs.go
+++ b/swagger/docs.go
@@ -500,6 +500,25 @@ const docTemplate = `{
                 }
             }
         },
+        "/api/aliases": {
+            "get": {
+                "tags": [
+                    "models"
+                ],
+                "summary": "List model aliases",
+                "responses": {
+                    "200": {
+                        "description": "OK",
+                        "schema": {
+                            "type": "array",
+                            "items": {
+                                "$ref": "#/definitions/localai.AliasInfo"
+                            }
+                        }
+                    }
+                }
+            }
+        },
         "/api/backend-logs": {
             "get": {
                 "description": "Returns a sorted list of model IDs that have captured backend process output",
@@ -3486,6 +3505,17 @@ const docTemplate = `{
                 }
             }
         },
+        "localai.AliasInfo": {
+            "type": "object",
+            "properties": {
+                "name": {
+                    "type": "string"
+                },
+                "target": {
+                    "type": "string"
+                }
+            }
+        },
         "localai.BrandingResponse": {
             "type": "object",
             "properties": {
diff --git a/swagger/swagger.json b/swagger/swagger.json
index e23b81cea..09e03581b 100644
--- a/swagger/swagger.json
+++ b/swagger/swagger.json
@@ -497,6 +497,25 @@
                 }
             }
         },
+        "/api/aliases": {
+            "get": {
+                "tags": [
+                    "models"
+                ],
+                "summary": "List model aliases",
+                "responses": {
+                    "200": {
+                        "description": "OK",
+                        "schema": {
+                            "type": "array",
+                            "items": {
+                                "$ref": "#/definitions/localai.AliasInfo"
+                            }
+                        }
+                    }
+                }
+            }
+        },
         "/api/backend-logs": {
             "get": {
                 "description": "Returns a sorted list of model IDs that have captured backend process output",
@@ -3483,6 +3502,17 @@
                 }
             }
         },
+        "localai.AliasInfo": {
+            "type": "object",
+            "properties": {
+                "name": {
+                    "type": "string"
+                },
+                "target": {
+                    "type": "string"
+                }
+            }
+        },
         "localai.BrandingResponse": {
             "type": "object",
             "properties": {
diff --git a/swagger/swagger.yaml b/swagger/swagger.yaml
index 719b72f6c..a25674539 100644
--- a/swagger/swagger.yaml
+++ b/swagger/swagger.yaml
@@ -281,6 +281,13 @@ definitions:
           type: string
         type: array
     type: object
+  localai.AliasInfo:
+    properties:
+      name:
+        type: string
+      target:
+        type: string
+    type: object
   localai.BrandingResponse:
     properties:
       favicon_url:
@@ -2780,6 +2787,18 @@ paths:
       summary: Execute an agent task by name
       tags:
       - agent-jobs
+  /api/aliases:
+    get:
+      responses:
+        "200":
+          description: OK
+          schema:
+            items:
+              $ref: '#/definitions/localai.AliasInfo'
+            type: array
+      summary: List model aliases
+      tags:
+      - models
   /api/backend-logs:
     get:
       description: Returns a sorted list of model IDs that have captured backend process

From aef10723c9d92f5ef8c3fa1219de4af91ce675a9 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Sat, 20 Jun 2026 22:44:44 +0200
Subject: [PATCH 18/99] feat(config): prefix caching default + consolidate
 scattered defaults (#10415)

* feat(config): enable cross-request prefix caching for serving (Phase 2)

The llama.cpp backend ships n_cache_reuse=0 (cross-request KV prefix reuse via
shifting disabled). Enable it by default (256) so repeated prefixes - system
prompts, RAG context, agent scaffolds, multi-turn chat - aren't recomputed. This
is the universally-useful part of 'paged attention' (shared-prefix reuse, which
the upstream maintainers themselves identify as where paged attn actually helps)
and needs none of the block-KV machinery.

Lives in a serving_defaults.go sibling to hardware_defaults.go (device-driven vs
serving-policy defaults); both run from SetDefaults and only fill unset values.
Explicit cache_reuse/n_cache_reuse always wins. Device-independent, so it
propagates to distributed nodes via the model options with no router change.
Shares the backendOptionSet helper with the Phase-1 parallel default.

Assisted-by: Claude:opus-4.8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* refactor(config): extract generic fallback defaults into ApplyGenericDefaults

Behavior-preserving: move the inline sampling-param + runtime-flag fallbacks out
of SetDefaults into ApplyGenericDefaults, completing the domain-grouped tiers
(ApplyInferenceDefaults=family, ApplyHardwareDefaults=device, ApplyServingDefaults
=serving, ApplyGenericDefaults=generic fallbacks). SetDefaults is now a clean
orchestrator. Same order (runs after the family/hardware/serving tiers so those
win) and same conditions (TopK gated on UsesLlamaSamplerDefaults, MMap on XPU).
No behavior change; full config suite green. (NGPULayers stays in the GGUF-read
path for now - it's device-driven but coupled to model-size detection; a separate
follow-up.)

Assisted-by: Claude:opus-4.8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
---
 core/config/generic_defaults.go      | 115 +++++++++++++++++++++++++++
 core/config/generic_defaults_test.go |  36 +++++++++
 core/config/hardware_defaults.go     |  14 +---
 core/config/model_config.go          | 104 ++----------------------
 core/config/serving_defaults.go      |  56 +++++++++++++
 core/config/serving_defaults_test.go |  30 +++++++
 6 files changed, 246 insertions(+), 109 deletions(-)
 create mode 100644 core/config/generic_defaults.go
 create mode 100644 core/config/generic_defaults_test.go
 create mode 100644 core/config/serving_defaults.go
 create mode 100644 core/config/serving_defaults_test.go

diff --git a/core/config/generic_defaults.go b/core/config/generic_defaults.go
new file mode 100644
index 000000000..57cfba514
--- /dev/null
+++ b/core/config/generic_defaults.go
@@ -0,0 +1,115 @@
+package config
+
+import "os"
+
+// ApplyGenericDefaults fills the generic fallback values applied after the
+// higher-priority tiers (ApplyInferenceDefaults for the model family,
+// ApplyHardwareDefaults for the device, ApplyServingDefaults for serving
+// policy): sampling parameters and a few runtime flags. Like the other tiers it
+// only fills values still left unset, so model-family / explicit config wins.
+func ApplyGenericDefaults(cfg *ModelConfig) {
+	if cfg == nil {
+		return
+	}
+
+	// https://github.com/ggerganov/llama.cpp/blob/75cd4c77292034ecec587ecb401366f57338f7c0/common/sampling.h#L22
+	defaultTopP := 0.95
+	defaultTopK := 40
+	defaultMinP := 0.0
+	defaultTemp := 0.9
+	// https://github.com/mudler/LocalAI/issues/2780
+	defaultMirostat := 0
+	defaultMirostatTAU := 5.0
+	defaultMirostatETA := 0.1
+	defaultTypicalP := 1.0
+	defaultTFZ := 1.0
+	defaultZero := 0
+
+	trueV := true
+	falseV := false
+
+	if cfg.Seed == nil {
+		//  random number generator seed
+		defaultSeed := RAND_SEED
+		cfg.Seed = &defaultSeed
+	}
+
+	// top_k=40 is llama.cpp's sampling default and is wrong for backends whose
+	// native default differs (issue #6632). Only inject it for the llama.cpp
+	// family and the empty/auto backend; leave TopK nil for known non-llama
+	// backends (e.g. mlx, whose intended default is top_k=0) so the wire value
+	// is 0 rather than a silently-changed 40.
+	if cfg.TopK == nil && UsesLlamaSamplerDefaults(cfg.Backend) {
+		cfg.TopK = &defaultTopK
+	}
+
+	if cfg.MinP == nil {
+		cfg.MinP = &defaultMinP
+	}
+
+	if cfg.TypicalP == nil {
+		cfg.TypicalP = &defaultTypicalP
+	}
+
+	if cfg.TFZ == nil {
+		cfg.TFZ = &defaultTFZ
+	}
+
+	if cfg.MMap == nil {
+		// MMap is enabled by default
+
+		// Only exception is for Intel GPUs
+		if os.Getenv("XPU") != "" {
+			cfg.MMap = &falseV
+		} else {
+			cfg.MMap = &trueV
+		}
+	}
+
+	if cfg.MMlock == nil {
+		// MMlock is disabled by default
+		cfg.MMlock = &falseV
+	}
+
+	if cfg.TopP == nil {
+		cfg.TopP = &defaultTopP
+	}
+	if cfg.Temperature == nil {
+		cfg.Temperature = &defaultTemp
+	}
+
+	if cfg.Maxtokens == nil {
+		cfg.Maxtokens = &defaultZero
+	}
+
+	if cfg.Mirostat == nil {
+		cfg.Mirostat = &defaultMirostat
+	}
+
+	if cfg.MirostatETA == nil {
+		cfg.MirostatETA = &defaultMirostatETA
+	}
+
+	if cfg.MirostatTAU == nil {
+		cfg.MirostatTAU = &defaultMirostatTAU
+	}
+
+	if cfg.LowVRAM == nil {
+		cfg.LowVRAM = &falseV
+	}
+
+	if cfg.Embeddings == nil {
+		cfg.Embeddings = &falseV
+	}
+
+	if cfg.Reranking == nil {
+		cfg.Reranking = &falseV
+	}
+
+	if cfg.PromptCacheAll == nil {
+		// Match upstream llama.cpp's default (common/common.h: cache_prompt = true)
+		// and let cache_idle_slots / kv_unified actually do useful work; users can
+		// opt out with an explicit `prompt_cache_all: false` in the model YAML.
+		cfg.PromptCacheAll = &trueV
+	}
+}
diff --git a/core/config/generic_defaults_test.go b/core/config/generic_defaults_test.go
new file mode 100644
index 000000000..7cb080c0b
--- /dev/null
+++ b/core/config/generic_defaults_test.go
@@ -0,0 +1,36 @@
+package config_test
+
+import (
+	. "github.com/mudler/LocalAI/core/config"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("ApplyGenericDefaults (generic fallback tier)", func() {
+	It("fills sampling + runtime fallbacks when unset", func() {
+		cfg := &ModelConfig{} // empty backend uses the llama sampler defaults
+		ApplyGenericDefaults(cfg)
+		Expect(cfg.TopP).ToNot(BeNil())
+		Expect(*cfg.TopP).To(Equal(0.95))
+		Expect(*cfg.TopK).To(Equal(40))
+		Expect(*cfg.Temperature).To(Equal(0.9))
+		Expect(*cfg.MMap).To(BeTrue())
+		Expect(*cfg.MMlock).To(BeFalse())
+		Expect(*cfg.PromptCacheAll).To(BeTrue())
+	})
+
+	It("never overrides explicit values", func() {
+		tk := 7
+		tp := 0.5
+		cfg := &ModelConfig{}
+		cfg.TopK = &tk
+		cfg.TopP = &tp
+		ApplyGenericDefaults(cfg)
+		Expect(*cfg.TopK).To(Equal(7))
+		Expect(*cfg.TopP).To(Equal(0.5))
+	})
+
+	It("no-ops on nil", func() {
+		Expect(func() { ApplyGenericDefaults(nil) }).ToNot(Panic())
+	})
+})
diff --git a/core/config/hardware_defaults.go b/core/config/hardware_defaults.go
index 2ed54265f..114785ce4 100644
--- a/core/config/hardware_defaults.go
+++ b/core/config/hardware_defaults.go
@@ -111,19 +111,9 @@ func EnsureParallelOption(opts []string, gpu GPU) []string {
 }
 
 // hasParallelOption reports whether the model already sets parallel/n_parallel
-// (backend options are "name:value" strings) so we never override an explicit value.
+// so we never override an explicit value (helper shared with serving_defaults.go).
 func hasParallelOption(opts []string) bool {
-	for _, o := range opts {
-		name := o
-		if i := strings.IndexByte(o, ':'); i >= 0 {
-			name = o[:i]
-		}
-		switch strings.TrimSpace(strings.ToLower(name)) {
-		case "parallel", "n_parallel":
-			return true
-		}
-	}
-	return false
+	return backendOptionSet(opts, "parallel", "n_parallel")
 }
 
 // localGPU builds a GPU descriptor from local detection, used by SetDefaults on
diff --git a/core/config/model_config.go b/core/config/model_config.go
index 50836b99e..9586beea3 100644
--- a/core/config/model_config.go
+++ b/core/config/model_config.go
@@ -1126,107 +1126,17 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
 	// heuristics for the selected node's GPU before loading. Explicit config wins.
 	ApplyHardwareDefaults(cfg, localGPU())
 
-	// https://github.com/ggerganov/llama.cpp/blob/75cd4c77292034ecec587ecb401366f57338f7c0/common/sampling.h#L22
-	defaultTopP := 0.95
-	defaultTopK := 40
-	defaultMinP := 0.0
-	defaultTemp := 0.9
-	// https://github.com/mudler/LocalAI/issues/2780
-	defaultMirostat := 0
-	defaultMirostatTAU := 5.0
-	defaultMirostatETA := 0.1
-	defaultTypicalP := 1.0
-	defaultTFZ := 1.0
-	defaultZero := 0
+	// Apply serving-policy defaults (device-independent): cross-request prefix
+	// caching. Propagates to distributed nodes via the model options.
+	ApplyServingDefaults(cfg)
+
+	// Generic fallback defaults (sampling params + runtime flags), applied after
+	// the model-family / hardware / serving tiers above. Only fills unset values.
+	ApplyGenericDefaults(cfg)
 
 	trueV := true
 	falseV := false
 
-	if cfg.Seed == nil {
-		//  random number generator seed
-		defaultSeed := RAND_SEED
-		cfg.Seed = &defaultSeed
-	}
-
-	// top_k=40 is llama.cpp's sampling default and is wrong for backends whose
-	// native default differs (issue #6632). Only inject it for the llama.cpp
-	// family and the empty/auto backend; leave TopK nil for known non-llama
-	// backends (e.g. mlx, whose intended default is top_k=0) so the wire value
-	// is 0 rather than a silently-changed 40.
-	if cfg.TopK == nil && UsesLlamaSamplerDefaults(cfg.Backend) {
-		cfg.TopK = &defaultTopK
-	}
-
-	if cfg.MinP == nil {
-		cfg.MinP = &defaultMinP
-	}
-
-	if cfg.TypicalP == nil {
-		cfg.TypicalP = &defaultTypicalP
-	}
-
-	if cfg.TFZ == nil {
-		cfg.TFZ = &defaultTFZ
-	}
-
-	if cfg.MMap == nil {
-		// MMap is enabled by default
-
-		// Only exception is for Intel GPUs
-		if os.Getenv("XPU") != "" {
-			cfg.MMap = &falseV
-		} else {
-			cfg.MMap = &trueV
-		}
-	}
-
-	if cfg.MMlock == nil {
-		// MMlock is disabled by default
-		cfg.MMlock = &falseV
-	}
-
-	if cfg.TopP == nil {
-		cfg.TopP = &defaultTopP
-	}
-	if cfg.Temperature == nil {
-		cfg.Temperature = &defaultTemp
-	}
-
-	if cfg.Maxtokens == nil {
-		cfg.Maxtokens = &defaultZero
-	}
-
-	if cfg.Mirostat == nil {
-		cfg.Mirostat = &defaultMirostat
-	}
-
-	if cfg.MirostatETA == nil {
-		cfg.MirostatETA = &defaultMirostatETA
-	}
-
-	if cfg.MirostatTAU == nil {
-		cfg.MirostatTAU = &defaultMirostatTAU
-	}
-
-	if cfg.LowVRAM == nil {
-		cfg.LowVRAM = &falseV
-	}
-
-	if cfg.Embeddings == nil {
-		cfg.Embeddings = &falseV
-	}
-
-	if cfg.Reranking == nil {
-		cfg.Reranking = &falseV
-	}
-
-	if cfg.PromptCacheAll == nil {
-		// Match upstream llama.cpp's default (common/common.h: cache_prompt = true)
-		// and let cache_idle_slots / kv_unified actually do useful work; users can
-		// opt out with an explicit `prompt_cache_all: false` in the model YAML.
-		cfg.PromptCacheAll = &trueV
-	}
-
 	if threads == 0 {
 		// Threads can't be 0
 		threads = 4
diff --git a/core/config/serving_defaults.go b/core/config/serving_defaults.go
new file mode 100644
index 000000000..3b10e7000
--- /dev/null
+++ b/core/config/serving_defaults.go
@@ -0,0 +1,56 @@
+package config
+
+import (
+	"fmt"
+	"strings"
+
+	"github.com/mudler/xlog"
+)
+
+// Serving-policy model-config defaults.
+//
+// Sibling to hardware_defaults.go: those fill values driven by the target
+// *device* (Blackwell batch, VRAM-scaled parallel slots); these fill values
+// that improve multi-request / multi-user *serving* regardless of the GPU. They
+// run together from SetDefaults and only ever fill values the user left unset.
+
+// DefaultCacheReuse is the minimum shared-prefix chunk (in tokens) the backend
+// reuses across requests via KV-cache shifting. The llama.cpp backend ships this
+// disabled (n_cache_reuse = 0); we enable it so repeated prefixes (system
+// prompts, RAG context, agent scaffolds, multi-turn chat) are not recomputed.
+// This is the universally-useful part of "paged attention" (cross-request prefix
+// sharing) and needs none of the block-KV machinery.
+const DefaultCacheReuse = 256
+
+// ApplyServingDefaults fills serving-policy ModelConfig values the user left
+// unset. Currently: enable cross-request prefix caching. Explicit
+// cache_reuse/n_cache_reuse in the model options always wins.
+func ApplyServingDefaults(cfg *ModelConfig) {
+	if cfg == nil {
+		return
+	}
+	if !backendOptionSet(cfg.Options, "cache_reuse", "n_cache_reuse") {
+		cfg.Options = append(cfg.Options, fmt.Sprintf("cache_reuse:%d", DefaultCacheReuse))
+		xlog.Debug("[serving_defaults] enabling cross-request prefix cache",
+			"cache_reuse", DefaultCacheReuse)
+	}
+}
+
+// backendOptionSet reports whether the backend options already set any of names.
+// Options are "name:value" strings (or bare "name"); used so we never override
+// an explicit value. Shared with hardware_defaults.go.
+func backendOptionSet(opts []string, names ...string) bool {
+	for _, o := range opts {
+		name := o
+		if i := strings.IndexByte(o, ':'); i >= 0 {
+			name = o[:i]
+		}
+		name = strings.TrimSpace(strings.ToLower(name))
+		for _, n := range names {
+			if name == n {
+				return true
+			}
+		}
+	}
+	return false
+}
diff --git a/core/config/serving_defaults_test.go b/core/config/serving_defaults_test.go
new file mode 100644
index 000000000..2a5bba72a
--- /dev/null
+++ b/core/config/serving_defaults_test.go
@@ -0,0 +1,30 @@
+package config_test
+
+import (
+	. "github.com/mudler/LocalAI/core/config"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("Serving-policy config defaults", func() {
+	Describe("ApplyServingDefaults (cross-request prefix cache)", func() {
+		It("enables cache_reuse when unset", func() {
+			cfg := &ModelConfig{}
+			ApplyServingDefaults(cfg)
+			Expect(cfg.Options).To(ContainElement("cache_reuse:256"))
+		})
+		It("never overrides an explicit cache_reuse", func() {
+			cfg := &ModelConfig{Options: []string{"cache_reuse:0"}}
+			ApplyServingDefaults(cfg)
+			Expect(cfg.Options).To(Equal([]string{"cache_reuse:0"}))
+		})
+		It("recognizes the n_cache_reuse alias", func() {
+			cfg := &ModelConfig{Options: []string{"n_cache_reuse:512"}}
+			ApplyServingDefaults(cfg)
+			Expect(cfg.Options).To(Equal([]string{"n_cache_reuse:512"}))
+		})
+		It("no-ops on nil", func() {
+			Expect(func() { ApplyServingDefaults(nil) }).ToNot(Panic())
+		})
+	})
+})

From 23f225260c530f1bac8fcba6c3321a022333060f Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Sat, 20 Jun 2026 22:58:36 +0200
Subject: [PATCH 19/99] refactor(config): single source of truth for default
 values (#10418)

refactor(config): single source of truth for default values across config + backend

Defaults were decided in two areas with duplicated/drifted literals: the config
SetDefaults tiers vs core/backend/options.go's grpcModelOpts (which translates a
ModelConfig to the backend wire format and supplied its own fallbacks). They had
drifted - n_gpu_layers 9999999 (options.go) vs 99999999 (gguf.go), two 512 batch
constants, context 1024 (gguf) vs 4096 (backend) scattered as bare literals.

Introduce core/config/defaults.go as the canonical home (DefaultContextSize=4096,
GGUFFallbackContextSize=1024, DefaultNGPULayers=99999999, DefaultFlashAttention=
auto). gguf.go / hooks_llamacpp.go use them directly; core/backend references them
(backend imports config, never the reverse) so DefaultContextSize/DefaultBatchSize
and the flash-attn / n_gpu_layers fallbacks resolve to one place. The two context
values (1024 GGUF-no-estimate vs 4096 general) are kept distinct but now named +
documented, not blind literals. Behavior-preserving; config + backend suites green.

Assisted-by: Claude:opus-4.8 [Claude Code]

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
---
 core/backend/options.go          | 11 ++++++-----
 core/config/defaults.go          | 30 ++++++++++++++++++++++++++++++
 core/config/gguf.go              |  9 ++-------
 core/config/hardware_defaults.go |  2 +-
 core/config/hooks_llamacpp.go    |  2 +-
 5 files changed, 40 insertions(+), 14 deletions(-)
 create mode 100644 core/config/defaults.go

diff --git a/core/backend/options.go b/core/backend/options.go
index efe6c649f..18c3b7f27 100644
--- a/core/backend/options.go
+++ b/core/backend/options.go
@@ -90,10 +90,11 @@ func getSeed(c config.ModelConfig) int32 {
 // DefaultContextSize and DefaultBatchSize are the backend's fallbacks when a
 // model config leaves them unset. Exported so callers that must respect the
 // effective decode window — notably the router's prompt trimmer — resolve the
-// same numbers grpcModelOpts does instead of guessing.
+// same numbers grpcModelOpts does instead of guessing. The values are owned by
+// core/config (single source of truth shared with the config default tiers).
 const (
-	DefaultContextSize = 4096
-	DefaultBatchSize   = 512
+	DefaultContextSize = config.DefaultContextSize
+	DefaultBatchSize   = config.DefaultPhysicalBatch
 )
 
 // EffectiveContextSize is the context window the backend will run with: the
@@ -129,7 +130,7 @@ func grpcModelOpts(c config.ModelConfig, modelPath string) *pb.ModelOptions {
 	ctxSize := EffectiveContextSize(c)
 	b := EffectiveBatchSize(c)
 
-	flashAttention := "auto"
+	flashAttention := config.DefaultFlashAttention
 
 	if c.FlashAttention != nil {
 		flashAttention = *c.FlashAttention
@@ -175,7 +176,7 @@ func grpcModelOpts(c config.ModelConfig, modelPath string) *pb.ModelOptions {
 		mmlock = *c.MMlock
 	}
 
-	nGPULayers := 9999999
+	nGPULayers := config.DefaultNGPULayers
 	if c.NGPULayers != nil {
 		nGPULayers = *c.NGPULayers
 	}
diff --git a/core/config/defaults.go b/core/config/defaults.go
new file mode 100644
index 000000000..18625fab3
--- /dev/null
+++ b/core/config/defaults.go
@@ -0,0 +1,30 @@
+package config
+
+// Canonical default values.
+//
+// These are owned here so the two layers that need them share a single source
+// of truth: the config tiers (ApplyInference/Hardware/Serving/Generic — which
+// *decide* defaults) and core/backend/options.go (which *translates* a
+// ModelConfig to the backend wire format and supplies the same fallbacks
+// defensively). Previously these were duplicated as literals across both
+// packages and had drifted (e.g. n_gpu_layers 9999999 vs 99999999, two batch
+// constants of 512). core/backend imports core/config, so backend references
+// these; config never imports backend.
+const (
+	// DefaultContextSize is the fallback context window when none is configured
+	// or estimable from the model.
+	DefaultContextSize = 4096
+
+	// GGUFFallbackContextSize is the context window for a GGUF model whose
+	// metadata yields no usable estimate (see guessGGUFFromFile). Deliberately
+	// smaller than DefaultContextSize to stay conservative on memory there.
+	GGUFFallbackContextSize = 1024
+
+	// DefaultNGPULayers means "offload all layers"; the backend (fit_params)
+	// clamps to what actually fits in device memory.
+	DefaultNGPULayers = 99999999
+
+	// DefaultFlashAttention is the flash-attention mode default; "auto" lets the
+	// backend enable it when the model + backend support it.
+	DefaultFlashAttention = "auto"
+)
diff --git a/core/config/gguf.go b/core/config/gguf.go
index 5e04f5693..16e43c914 100644
--- a/core/config/gguf.go
+++ b/core/config/gguf.go
@@ -14,11 +14,6 @@ import (
 	"github.com/gpustack/gguf-parser-go/util/ptr"
 )
 
-const (
-	defaultContextSize = 1024
-	defaultNGPULayers  = 99999999
-)
-
 // reservedNonChatModel reports whether the operator reserved this model for an
 // internal primitive — the router score classifier or the PII NER
 // token_classify tier. Such a model has no chat template and must not be
@@ -38,7 +33,7 @@ func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) {
 			cSize := int(ctxSize)
 			cfg.ContextSize = &cSize
 		} else {
-			defaultCtx = defaultContextSize
+			defaultCtx = GGUFFallbackContextSize
 			cfg.ContextSize = &defaultCtx
 		}
 	}
@@ -52,7 +47,7 @@ func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) {
 
 	if cfg.NGPULayers == nil {
 		// we assume we want to offload all layers
-		defaultHigh := defaultNGPULayers
+		defaultHigh := DefaultNGPULayers
 		cfg.NGPULayers = &defaultHigh
 	}
 
diff --git a/core/config/hardware_defaults.go b/core/config/hardware_defaults.go
index 114785ce4..18c321639 100644
--- a/core/config/hardware_defaults.go
+++ b/core/config/hardware_defaults.go
@@ -37,7 +37,7 @@ type GPU struct {
 // Physical batch (n_batch / n_ubatch) defaults.
 const (
 	// DefaultPhysicalBatch is the conservative default when no hardware-specific
-	// tuning applies. Matches backend.DefaultBatchSize.
+	// tuning applies. core/backend.DefaultBatchSize references this (single source).
 	DefaultPhysicalBatch = 512
 	// BlackwellPhysicalBatch is the default on NVIDIA Blackwell consumer GPUs
 	// (sm_12x: sm_120 RTX 50-series, sm_121 GB10 / DGX Spark). A larger physical
diff --git a/core/config/hooks_llamacpp.go b/core/config/hooks_llamacpp.go
index 4ced8a9b1..09bdbe868 100644
--- a/core/config/hooks_llamacpp.go
+++ b/core/config/hooks_llamacpp.go
@@ -34,7 +34,7 @@ func llamaCppDefaults(cfg *ModelConfig, modelPath string) {
 	// Default context size if not set, regardless of whether GGUF parsing succeeds
 	defer func() {
 		if cfg.ContextSize == nil {
-			ctx := defaultContextSize
+			ctx := GGUFFallbackContextSize
 			cfg.ContextSize = &ctx
 		}
 	}()

From 3e96d811b7c507dd5e093f0625c00fff16d4a514 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Sat, 20 Jun 2026 23:25:29 +0200
Subject: [PATCH 20/99] fix(ui): keep row action menu anchored and stop scroll
 snap on /app/manage (#10419)

Opening a model row's kebab (ActionMenu) on the Manage dashboard snapped the
page scroll to the top and rendered the menu detached from its trigger, making
it impossible to operate.

Two compounding causes:

- The menu auto-focus called el.focus() without preventScroll, so the browser
  scrolled the focused element into view, yanking the page to the top.
- The position:fixed Popover was rendered inline inside the table row. The
  editorial UI overhaul added hover transforms to rows/cards, and a transformed
  ancestor re-anchors position:fixed to itself instead of the viewport, so the
  menu (positioned from the trigger's viewport rect) landed in the wrong place.

Fix: portal the Popover to document.body so position:fixed always resolves
against the viewport, position it before paint with useLayoutEffect (no {0,0}
flash), and pass preventScroll:true to both focus calls.

Adds an e2e regression test that reproduces the symptom (scroll jumped from 564
to 0 on the old code) and asserts the menu tracks its trigger.


Assisted-by: Claude:claude-opus-4-8 [Claude Code]

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
---
 .../e2e/manage-action-menu-position.spec.js   | 50 +++++++++++++++++++
 .../react-ui/src/components/ActionMenu.jsx    |  6 ++-
 core/http/react-ui/src/components/Popover.jsx | 21 ++++++--
 3 files changed, 70 insertions(+), 7 deletions(-)
 create mode 100644 core/http/react-ui/e2e/manage-action-menu-position.spec.js

diff --git a/core/http/react-ui/e2e/manage-action-menu-position.spec.js b/core/http/react-ui/e2e/manage-action-menu-position.spec.js
new file mode 100644
index 000000000..3f4301abe
--- /dev/null
+++ b/core/http/react-ui/e2e/manage-action-menu-position.spec.js
@@ -0,0 +1,50 @@
+import { test, expect } from './coverage-fixtures.js'
+
+// Regression: opening a row's kebab (ActionMenu) on /app/manage used to snap
+// the page scroll to the top and render the menu detached from its trigger,
+// making it impossible to operate. Two causes: the menu auto-focus scrolled
+// the page (no preventScroll), and the position:fixed popover was rendered
+// inside a row whose hover `transform` re-anchored it. Fix portals the popover
+// to document.body, positions it before paint, and focuses without scrolling.
+test.describe('Manage Page - Action menu positioning', () => {
+  test('opening a row menu keeps scroll stable and places the menu by its trigger', async ({ page }) => {
+    // Small viewport so the page is scrollable and a scroll jump is observable.
+    await page.setViewportSize({ width: 1024, height: 500 })
+    await page.goto('/app/manage')
+    await expect(page.locator('.table')).toBeVisible({ timeout: 10_000 })
+
+    const trigger = page.locator('button.action-menu__trigger').first()
+    await expect(trigger).toBeVisible()
+
+    // Bring the trigger into view ourselves first, so the only scroll we then
+    // measure is the one the menu would (wrongly) cause - not Playwright's own
+    // scroll-into-view before the click.
+    await trigger.scrollIntoViewIfNeeded()
+    const scrollBefore = await page.evaluate(() => window.scrollY)
+    await trigger.click()
+
+    const menu = page.locator('[role="menu"]')
+    await expect(menu).toBeVisible()
+
+    // Behavioural symptom 1: focusing the menu must not yank the page scroll.
+    const scrollAfter = await page.evaluate(() => window.scrollY)
+    expect(scrollAfter).toBe(scrollBefore)
+
+    // Behavioural symptom 2: the menu must sit next to its trigger, not float
+    // at the top of the window where it can't be operated.
+    const triggerBox = await trigger.boundingBox()
+    const menuBox = await menu.boundingBox()
+    expect(triggerBox).not.toBeNull()
+    expect(menuBox).not.toBeNull()
+    // Menu top is within ~24px of the trigger's bottom (below) or above it
+    // (flipped) — in all cases it tracks the trigger, never floating at y≈0.
+    const tracksTrigger =
+      Math.abs(menuBox.y - (triggerBox.y + triggerBox.height)) < 24 ||
+      Math.abs((menuBox.y + menuBox.height) - triggerBox.y) < 24
+    expect(tracksTrigger).toBe(true)
+
+    // Mechanism: the popover must be portaled to document.body so position:fixed
+    // resolves against the viewport, not a transformed ancestor row.
+    await expect(page.locator('body > .popover')).toHaveCount(1)
+  })
+})
diff --git a/core/http/react-ui/src/components/ActionMenu.jsx b/core/http/react-ui/src/components/ActionMenu.jsx
index 5c58ecd78..55010102c 100644
--- a/core/http/react-ui/src/components/ActionMenu.jsx
+++ b/core/http/react-ui/src/components/ActionMenu.jsx
@@ -95,9 +95,11 @@ export default function ActionMenu({ items, ariaLabel = 'Actions', triggerLabel,
           className="action-menu"
           onKeyDown={handleMenuKeyDown}
           // Capture focus when the menu opens so arrow keys work without the
-          // user clicking inside first.
+          // user clicking inside first. preventScroll: the popover is portaled
+          // and positioned by the trigger rect, so focusing it must not scroll
+          // the page (that yanked the view to the top before it was placed).
           tabIndex={-1}
-          ref={el => { if (el && open) el.focus() }}
+          ref={el => { if (el && open) el.focus({ preventScroll: true }) }}
         >
           {visible.map((item, i) => {
             if (item.divider) {
diff --git a/core/http/react-ui/src/components/Popover.jsx b/core/http/react-ui/src/components/Popover.jsx
index 96a9e217e..7d002d348 100644
--- a/core/http/react-ui/src/components/Popover.jsx
+++ b/core/http/react-ui/src/components/Popover.jsx
@@ -1,10 +1,17 @@
-import { useEffect, useRef, useState, useCallback } from 'react'
+import { useEffect, useLayoutEffect, useRef, useState, useCallback } from 'react'
+import { createPortal } from 'react-dom'
 
 // Minimal popover: positions itself below-right of the trigger's bounding box,
 // flips above when there isn't room below, closes on outside click or Escape,
 // returns focus to the trigger. Uses the existing .card surface so it picks
 // up theme/border/shadow automatically — no new theming work.
 //
+// Rendered through a portal on document.body: the popover is position:fixed and
+// positioned from the trigger's viewport rect, so it must escape any ancestor
+// that establishes a containing block (a row/card with a hover `transform`
+// would otherwise re-anchor `position:fixed` to itself, throwing the menu to
+// the wrong spot and making it unusable).
+//
 // Props:
 //   anchor:    ref to the trigger DOMElement (required)
 //   open:      boolean
@@ -30,7 +37,9 @@ export default function Popover({ anchor, open, onClose, children, ariaLabel })
     setPos({ top, left: Math.max(8, left), flipped })
   }, [anchor])
 
-  useEffect(() => {
+  // useLayoutEffect so we measure + place the popover before the browser
+  // paints — otherwise it flashes at its initial {0,0} for a frame.
+  useLayoutEffect(() => {
     if (!open) return
     reposition()
     window.addEventListener('resize', reposition)
@@ -65,14 +74,15 @@ export default function Popover({ anchor, open, onClose, children, ariaLabel })
     if (!open && anchor?.current) {
       // requestAnimationFrame so the close is painted before focus jumps;
       // otherwise screen readers announce the trigger mid-transition.
-      const raf = requestAnimationFrame(() => anchor.current?.focus?.())
+      // preventScroll: focusing the trigger must not yank the page scroll.
+      const raf = requestAnimationFrame(() => anchor.current?.focus?.({ preventScroll: true }))
       return () => cancelAnimationFrame(raf)
     }
   }, [open, anchor])
 
   if (!open) return null
 
-  return (
+  return createPortal(
     <div
       ref={popoverRef}
       role="dialog"
@@ -81,6 +91,7 @@ export default function Popover({ anchor, open, onClose, children, ariaLabel })
       style={{ top: pos.top, left: pos.left }}
     >
       {children}
-    </div>
+    </div>,
+    document.body
   )
 }

From c6303104c77040c8d16e7ae226b3b783d25b1e3e Mon Sep 17 00:00:00 2001
From: pos-ei-don <1822533+pos-ei-don@users.noreply.github.com>
Date: Sun, 21 Jun 2026 17:02:31 +0200
Subject: [PATCH 21/99] fix(vllm): structured outputs silently ignored on vLLM
 >= 0.23 (GuidedDecodingParams removed) (#10343)

fix(vllm): structured outputs silently ignored on vLLM >= 0.23

vLLM >= 0.23 removed GuidedDecodingParams (now StructuredOutputsParams) and
renamed the SamplingParams field guided_decoding -> structured_outputs. The
import failed, HAS_GUIDED_DECODING became False, and the whole guided-decoding
block was skipped, so response_format / grammar constraints were silently
ignored. Adapt the existing request.Grammar path to the new class/field.

Signed-off-by: pos-ei-don <1822533+pos-ei-don@users.noreply.github.com>
---
 backend/python/vllm/backend.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/backend/python/vllm/backend.py b/backend/python/vllm/backend.py
index 5d5662857..20064e233 100644
--- a/backend/python/vllm/backend.py
+++ b/backend/python/vllm/backend.py
@@ -48,8 +48,10 @@ try:
 except ImportError:
     HAS_REASONING_PARSERS = False
 
+# vLLM >= 0.23 renamed GuidedDecodingParams -> StructuredOutputsParams and the
+# SamplingParams field guided_decoding -> structured_outputs.
 try:
-    from vllm.sampling_params import GuidedDecodingParams
+    from vllm.sampling_params import StructuredOutputsParams
     HAS_GUIDED_DECODING = True
 except ImportError:
     HAS_GUIDED_DECODING = False
@@ -536,13 +538,13 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                 if value not in (None, 0, [], False):
                     setattr(sampling_params, param_field, value)
 
-        # Guided decoding: use Grammar field to pass JSON schema or BNF
+        # Structured-output decoding: use Grammar field to pass JSON schema or BNF
         if HAS_GUIDED_DECODING and request.Grammar:
             try:
                 json.loads(request.Grammar)  # valid JSON = JSON schema
-                sampling_params.guided_decoding = GuidedDecodingParams(json=request.Grammar)
+                sampling_params.structured_outputs = StructuredOutputsParams(json=request.Grammar)
             except json.JSONDecodeError:
-                sampling_params.guided_decoding = GuidedDecodingParams(grammar=request.Grammar)
+                sampling_params.structured_outputs = StructuredOutputsParams(grammar=request.Grammar)
 
         # Extract image paths and process images
         prompt = request.Prompt

From cf7f9573a2a1e3d01927c3ebb785623e47822684 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=95=AA=E8=8C=84=E6=91=94=E6=88=90=E7=95=AA=E8=8C=84?=
 =?UTF-8?q?=E9=85=B1?= <68098251+fqscfqj@users.noreply.github.com>
Date: Sun, 21 Jun 2026 23:03:33 +0800
Subject: [PATCH 22/99] fix(crispasr): filter garbage words from parakeet
 word-level timestamps (#10421)

The parakeet-specific word accessors can return stale initialisation
data (model name, binary blobs) for segments with no real speech.
Add isValidWord() to filter out words that have:
- empty or whitespace-only text
- U+FFFD replacement characters (from binary data scrubbing)
- negative timestamps
- zero duration (end <= start)

Also skip empty segments entirely when they have no recognisable
content (empty text AND no valid words), preventing spurious subtitle
entries like '00:45:33,592 --> 00:45:33,592 parakeet@rH\u000b\ufffdI'.

Applies to both AudioTranscription and AudioTranscriptionStream.

Signed-off-by: fqscfqj <fqscfqj@outlook.com>
---
 backend/go/crispasr/gocrispasr.go | 54 +++++++++++++++++++++++++++----
 1 file changed, 48 insertions(+), 6 deletions(-)

diff --git a/backend/go/crispasr/gocrispasr.go b/backend/go/crispasr/gocrispasr.go
index af1f1a95c..2cbfb0d4a 100644
--- a/backend/go/crispasr/gocrispasr.go
+++ b/backend/go/crispasr/gocrispasr.go
@@ -224,6 +224,28 @@ func (w *CrispASR) VAD(req *pb.VADRequest) (pb.VADResponse, error) {
 	}, nil
 }
 
+// isValidWord reports whether a TranscriptWord contains recognisable speech
+// content. The parakeet-specific word accessors can return stale initialisation
+// data (model name, binary blobs) when a segment has no real speech. A word is
+// considered valid only when:
+//   - the text is non-empty after trimming,
+//   - it contains no U+FFFD replacement characters (from binary data scrubbing),
+//   - both timestamps are non-negative,
+//   - the word has positive duration (end > start).
+func isValidWord(w *pb.TranscriptWord) bool {
+	txt := strings.TrimSpace(w.Text)
+	if txt == "" {
+		return false
+	}
+	if strings.ContainsRune(txt, '\uFFFD') {
+		return false
+	}
+	if w.Start < 0 || w.End < 0 || w.End <= w.Start {
+		return false
+	}
+	return true
+}
+
 func (w *CrispASR) AudioTranscription(ctx context.Context, opts *pb.TranscriptRequest) (pb.TranscriptResult, error) {
 	if err := ctx.Err(); err != nil {
 		return pb.TranscriptResult{}, status.Error(codes.Canceled, "transcription cancelled")
@@ -311,22 +333,35 @@ func (w *CrispASR) AudioTranscription(ctx context.Context, opts *pb.TranscriptRe
 		if wordCount == 0 && i == 0 {
 			wordCount = CppGetParakeetWordCount()
 			for j := 0; j < wordCount; j++ {
-				words = append(words, &pb.TranscriptWord{
+				w := &pb.TranscriptWord{
 					Start: CppGetParakeetWordT0(j) * (10000000),
 					End:   CppGetParakeetWordT1(j) * (10000000),
 					Text:  strings.ToValidUTF8(strings.Clone(CppGetParakeetWordText(j)), "�"),
-				})
+				}
+				if isValidWord(w) {
+					words = append(words, w)
+				}
 			}
 		} else {
 			for j := 0; j < wordCount; j++ {
-				words = append(words, &pb.TranscriptWord{
+				w := &pb.TranscriptWord{
 					Start: CppGetWordT0(i, j) * (10000000),
 					End:   CppGetWordT1(i, j) * (10000000),
 					Text:  strings.ToValidUTF8(strings.Clone(CppGetWordText(i, j)), "�"),
-				})
+				}
+				if isValidWord(w) {
+					words = append(words, w)
+				}
 			}
 		}
 
+		// Skip empty segments with no recognisable content (e.g. trailing
+		// silence segments that parakeet emits with stale init data).
+		trimmed := strings.TrimSpace(txt)
+		if trimmed == "" && len(words) == 0 {
+			continue
+		}
+
 		segment := &pb.TranscriptSegment{
 			Id:    int32(i),
 			Text:  txt,
@@ -336,7 +371,7 @@ func (w *CrispASR) AudioTranscription(ctx context.Context, opts *pb.TranscriptRe
 
 		segments = append(segments, segment)
 
-		text += " " + strings.TrimSpace(txt)
+		text += " " + trimmed
 	}
 
 	return pb.TranscriptResult{
@@ -428,13 +463,20 @@ func (w *CrispASR) AudioTranscriptionStream(ctx context.Context, opts *pb.Transc
 		s := CppGetSegmentStart(i) * 10000000
 		t := CppGetSegmentEnd(i) * 10000000
 		txt := strings.ToValidUTF8(strings.Clone(CppGetSegmentText(i)), "�")
+
+		// Skip empty segments (e.g. trailing silence that parakeet emits
+		// with stale init data).
+		trimmed := strings.TrimSpace(txt)
+		if trimmed == "" && s == t {
+			continue
+		}
+
 		segments = append(segments, &pb.TranscriptSegment{
 			Id:    int32(i),
 			Text:  txt,
 			Start: s, End: t,
 		})
 
-		trimmed := strings.TrimSpace(txt)
 		if trimmed == "" {
 			continue
 		}

From 01fa12e0dee93fedf777922996aa63995fb7495a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=95=AA=E8=8C=84=E6=91=94=E6=88=90=E7=95=AA=E8=8C=84?=
 =?UTF-8?q?=E9=85=B1?= <68098251+fqscfqj@users.noreply.github.com>
Date: Sun, 21 Jun 2026 23:04:19 +0800
Subject: [PATCH 23/99] feat(nemo): enable word-level timestamps for ASR models
 (#10297)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* feat(nemo): enable word-level timestamps for ASR models

The nemo backend ignored timestamp_granularities and always returned a
single segment with start=0 end=0, making word-level timestamps
impossible to obtain even though the NeMo models (parakeet-tdt, etc.)
fully support them.

Changes:
- Add _get_stride_seconds() to compute frame duration from the model's
  preprocessor window_stride and encoder subsampling_factor.
- Add _build_segments_with_words() that extracts word offsets from the
  NeMo Hypothesis.timestamp dict and converts frame indices to
  nanosecond timestamps.
- Support 'word' granularity (one segment per word) and 'segment'
  granularity (merge at time-gap boundaries using a dynamic threshold).
- Populate TranscriptSegment.words with TranscriptWord entries so
  callers get both segment-level and word-level timing.
- Only request timestamps from NeMo when the caller actually asks for
  them (timestamp_granularities is non-empty), keeping the fast path
  unchanged for callers that don't need timestamps.

Tested with nvidia/parakeet-tdt-0.6b-v3 on the JFK "ask not" clip:
  curl -X POST /v1/audio/transcriptions \
    -F file=@jfk.wav -F model=nemo-parakeet-tdt-0.6b \
    -F 'timestamp_granularities[]=word' -F response_format=verbose_json
  → each word has correct start/end times in seconds.

Signed-off-by: fqscfqj <fqscfqj@outlook.com>

* fix(nemo): address Copilot review feedback

- Narrow exception handling in _get_stride_seconds to catch only
  AttributeError, KeyError, TypeError instead of bare Exception, and
  emit a warning when falling back to the hardcoded stride.
- Remove explicit return_hypotheses=False when timestamps are requested;
  timestamps=True already forces NeMo to return Hypothesis objects.
- Add a warning when NeMo does not return Hypothesis objects despite
  timestamps being requested.

Signed-off-by: fqscfqj <fqscfqj@outlook.com>

---------

Signed-off-by: fqscfqj <fqscfqj@outlook.com>
---
 backend/python/nemo/backend.py | 202 ++++++++++++++++++++++++++++++---
 1 file changed, 186 insertions(+), 16 deletions(-)

diff --git a/backend/python/nemo/backend.py b/backend/python/nemo/backend.py
index ccbff7cd2..a5c30694e 100644
--- a/backend/python/nemo/backend.py
+++ b/backend/python/nemo/backend.py
@@ -84,6 +84,135 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
 
         return backend_pb2.Result(message="Model loaded successfully", success=True)
 
+    def _get_stride_seconds(self):
+        """Compute the seconds-per-frame stride for the loaded model.
+
+        stride = preprocessor_window_stride * encoder_subsampling_factor
+        """
+        try:
+            preprocessor = self.model.preprocessor
+            window_stride = preprocessor._cfg.get('window_stride', 0.01)
+            subsampling_factor = getattr(self.model.encoder, 'subsampling_factor', 8)
+            return window_stride * subsampling_factor
+        except (AttributeError, KeyError, TypeError) as err:
+            print(
+                f"Warning: could not compute stride from model config ({err}), "
+                f"falling back to 0.08s/frame",
+                file=sys.stderr,
+            )
+            return 0.08
+
+    def _build_segments_with_words(self, hypothesis, stride, timestamp_granularities=None):
+        """Build TranscriptSegment list from a NeMo Hypothesis with timestamps.
+
+        Supports two granularity modes:
+          - "word": one TranscriptSegment per word, each with a single TranscriptWord entry
+          - "segment" (default): merge consecutive words into sentence-level segments,
+            splitting at word-level time gaps that exceed a dynamic threshold.
+        """
+        if not hypothesis or not isinstance(hypothesis.timestamp, dict):
+            return []
+
+        word_offsets = hypothesis.timestamp.get('word', [])
+        if not word_offsets:
+            return []
+
+        granularities = list(timestamp_granularities) if timestamp_granularities else []
+        granularity = "word" if "word" in granularities else "segment"
+
+        # Build a flat list of (text, start_ns, end_ns) from NeMo word offsets
+        transcript_words = []
+        for wo in word_offsets:
+            word_text = wo.get('word', '')
+            if not word_text:
+                continue
+            start_offset = wo.get('start_offset', 0)
+            end_offset = wo.get('end_offset', start_offset)
+            start_ns = int(start_offset * stride * 1_000_000_000)
+            end_ns = int(end_offset * stride * 1_000_000_000)
+            transcript_words.append({
+                'text': word_text,
+                'start': start_ns,
+                'end': end_ns,
+            })
+
+        if not transcript_words:
+            return []
+
+        if granularity == "word":
+            # One segment per word
+            result = []
+            for idx, tw in enumerate(transcript_words):
+                word = backend_pb2.TranscriptWord(
+                    start=tw['start'], end=tw['end'], text=tw['text']
+                )
+                result.append(backend_pb2.TranscriptSegment(
+                    id=idx,
+                    start=tw['start'],
+                    end=tw['end'],
+                    text=tw['text'],
+                    words=[word],
+                ))
+            return result
+
+        # segment mode — merge at word-level time-gap boundaries
+        # Compute gap threshold: median inter-word gap * 3, clamped to [0.3, 2.0]s
+        gaps = []
+        for i in range(1, len(transcript_words)):
+            gap = (transcript_words[i]['start'] - transcript_words[i - 1]['end']) / 1_000_000_000
+            if gap > 0:
+                gaps.append(gap)
+        if gaps:
+            gaps.sort()
+            median_gap = gaps[len(gaps) // 2]
+            threshold_ns = int(max(0.3, min(median_gap * 3, 2.0)) * 1_000_000_000)
+        else:
+            threshold_ns = int(0.5 * 1_000_000_000)
+
+        result = []
+        buf_words = []  # list of TranscriptWord protobuf
+        buf_start = None
+        buf_end = 0
+        buf_text = []
+        prev_end = None
+
+        for tw in transcript_words:
+            # Detect word-level time gap
+            if prev_end is not None and (tw['start'] - prev_end) >= threshold_ns and buf_text:
+                seg_text = ' '.join(buf_text)
+                result.append(backend_pb2.TranscriptSegment(
+                    id=len(result),
+                    start=buf_start,
+                    end=buf_end,
+                    text=seg_text,
+                    words=list(buf_words),
+                ))
+                buf_words = []
+                buf_text = []
+                buf_start = None
+
+            if buf_start is None:
+                buf_start = tw['start']
+            buf_end = tw['end']
+            buf_text.append(tw['text'])
+            buf_words.append(backend_pb2.TranscriptWord(
+                start=tw['start'], end=tw['end'], text=tw['text']
+            ))
+            prev_end = tw['end']
+
+        # flush remaining
+        if buf_text and buf_start is not None:
+            seg_text = ' '.join(buf_text)
+            result.append(backend_pb2.TranscriptSegment(
+                id=len(result),
+                start=buf_start,
+                end=buf_end,
+                text=seg_text,
+                words=list(buf_words),
+            ))
+
+        return result
+
     def AudioTranscription(self, request, context):
         result_segments = []
         text = ""
@@ -93,26 +222,67 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                 print(f"Error: Audio file not found: {audio_path}", file=sys.stderr)
                 return backend_pb2.TranscriptResult(segments=[], text="")
 
-            # NEMO's transcribe method accepts a list of audio paths and returns a list of transcripts
-            results = self.model.transcribe([audio_path])
+            # Determine requested timestamp granularity
+            timestamp_granularities = list(request.timestamp_granularities) if request.timestamp_granularities else []
+            want_timestamps = bool(timestamp_granularities)
 
-            if not results or len(results) == 0:
-                return backend_pb2.TranscriptResult(segments=[], text="")
+            if want_timestamps:
+                # Request timestamps from NeMo.
+                # timestamps=True forces NeMo to return Hypothesis objects with
+                # the timestamp dict populated, so we omit return_hypotheses to
+                # let NeMo choose the correct return type.
+                results = self.model.transcribe([audio_path], timestamps=True)
 
-            # Get the transcript text from the first result.
-            # CTC models return List[str], TDT/RNNT models return List[Hypothesis]
-            # where the actual text lives in Hypothesis.text.
-            result = results[0]
-            if isinstance(result, str):
-                text = result
+                if results and len(results) > 0:
+                    hypotheses = results[0] if isinstance(results[0], list) else results
+                    if hypotheses and len(hypotheses) > 0:
+                        hypothesis = hypotheses[0]
+
+                        # Hypothesis object should have .timestamp populated
+                        if not hasattr(hypothesis, 'timestamp') or not isinstance(hypothesis.timestamp, dict):
+                            print(
+                                "Warning: timestamps were requested but NeMo did not return "
+                                "Hypothesis objects; falling back to untimestamped output",
+                                file=sys.stderr,
+                            )
+
+                        # Extract text
+                        if hasattr(hypothesis, 'text'):
+                            text = hypothesis.text or ""
+                        elif isinstance(hypothesis, str):
+                            text = hypothesis
+
+                        # Build segments with word-level timestamps
+                        stride = self._get_stride_seconds()
+                        result_segments = self._build_segments_with_words(
+                            hypothesis, stride, timestamp_granularities
+                        )
+
+                        # If no word offsets but we have text, fall back to single segment
+                        if not result_segments and text:
+                            result_segments.append(backend_pb2.TranscriptSegment(
+                                id=0, start=0, end=0, text=text
+                            ))
             else:
-                text = getattr(result, 'text', None) or ""
+                # Simple transcription without timestamps
+                # NEMO's transcribe method accepts a list of audio paths and returns a list of transcripts
+                results = self.model.transcribe([audio_path])
 
-            if text:
-                # Create a single segment with the full transcription
-                result_segments.append(backend_pb2.TranscriptSegment(
-                    id=0, start=0, end=0, text=text
-                ))
+                if results and len(results) > 0:
+                    # Get the transcript text from the first result.
+                    # CTC models return List[str], TDT/RNNT models return List[Hypothesis]
+                    # where the actual text lives in Hypothesis.text.
+                    result = results[0]
+                    if isinstance(result, str):
+                        text = result
+                    else:
+                        text = getattr(result, 'text', None) or ""
+
+                    if text:
+                        # Create a single segment with the full transcription
+                        result_segments.append(backend_pb2.TranscriptSegment(
+                            id=0, start=0, end=0, text=text
+                        ))
 
         except Exception as err:
             print(f"Error in AudioTranscription: {err}", file=sys.stderr)

From b4c0dc67fe8471176bd7bb702333b27527e8d59f Mon Sep 17 00:00:00 2001
From: pos-ei-don <1822533+pos-ei-don@users.noreply.github.com>
Date: Sun, 21 Jun 2026 17:07:15 +0200
Subject: [PATCH 24/99] feat(vllm): progressive streaming via
 parser.extract_tool_calls_streaming (follow-up to #10346) (#10351)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix(vllm): don't stream raw tool-call markup as content when a tool parser is active

When a tool_parser is configured and the request carries tools, the streaming
loop emitted every text delta as delta.content — including the model's raw
tool-call markup (e.g. <tool_call>...) — because extract_tool_calls only runs
on the full output after the stream. Clients streaming a tool call therefore
saw the unparsed tool-call syntax as assistant content.

Buffer the text while a tool parser is active for the request; the existing
end-of-stream chat_delta already carries the parsed tool_calls (or the cleaned
content), which the Go side converts to SSE deltas. Non-tool-parser streaming
is unchanged.

Add a server-less regression test covering both the tool-call case (no raw
markup leaked as content) and the plain-text case (content delivered exactly
once — guards against double-emitting the buffered content).

Signed-off-by: pos-ei-don <1822533+pos-ei-don@users.noreply.github.com>

* test(vllm): add expectedFailure test for progressive streaming with tool parser (Case 3, #582)

Signed-off-by: pos-ei-don <1822533+pos-ei-don@users.noreply.github.com>

* test(vllm): add Cases 4+5 — marker split across chunks + false-positive prefix (TDD, Option B state machine, #582)

Signed-off-by: pos-ei-don <1822533+pos-ei-don@users.noreply.github.com>

* feat(vllm): progressive streaming via parser.extract_tool_calls_streaming

When a tool parser is active for a tool-enabled streaming request,
#10346 buffers the entire generation and surfaces it on the final
chunk to prevent raw tool-call markup from leaking as delta.content.
This is correct but turns the request into effectively non-streaming
for plain-text responses — the client sees nothing until the model
stops.

Every concrete tool parser shipped with vLLM 0.23+ already implements
extract_tool_calls_streaming (Granite4, Qwen3Coder, DeepSeekV31, Jamba,
Ernie45, Hermes2Pro, llama3_json, mistral, …). Use it: instantiate
the parser before the streaming loop and call its streaming method per
delta, emitting DeltaMessage(content=…) or DeltaMessage(tool_calls=[…])
when the parser is ready.

Falls back to the existing #10346 buffer path when:
  - the parser does not have extract_tool_calls_streaming, OR
  - extract_tool_calls_streaming raises mid-stream (logged, the
    rest of the request finishes via post-loop extract_tool_calls).

Tests (TestStreamingToolParser):
  1. Buffer path: no markup leaked, no content duplication
  2. Native streaming: plain-text response streams progressively
  3. Native streaming: tool_call structured, no markup leaked
  4. Native streaming exception → graceful fallback, no markup, no crash
  5. No tool parser → unchanged per-delta content stream

E2E verified against qwen3_coder on vLLM 0.23.0 (NVIDIA GB10 / arm64 / CUDA 13).

Signed-off-by: pos-ei-don <1822533+pos-ei-don@users.noreply.github.com>

* docs(vllm): add server-side TTFT benchmark for the streaming tool-parser path

Self-contained stdlib-only script that measures time-to-first-token (TTFT)
for the vLLM backend's two streaming scenarios:

  - tool_call:  request mentions a tool; model is expected to call it
  - plain_text: request offers a tool but explicitly asks for prose

Use this to compare:
  - the buffer-all path (#10346)         → plain_text TTFT ≈ total response time
  - the native-streaming path (this PR)  → plain_text TTFT ≈ true first-token time

  python examples/vllm-bench/ttft_streaming_tool_parser.py \\
      --url http://localhost:8080 --model my-coder --runs 3

Lives under examples/ so it does not interfere with the test suite.

Signed-off-by: pos-ei-don <1822533+pos-ei-don@users.noreply.github.com>

* examples/vllm-bench: add long-text scenario (8 paragraphs, 1500 tokens)

The long-text scenario shows the buffering vs streaming difference most
dramatically: with the buffer-all path, the client receives nothing for
20+ seconds and then the entire 1500-token response at once. With native
streaming, the first token arrives in tens of milliseconds and the
response flows progressively.

Signed-off-by: pos-ei-don <1822533+pos-ei-don@users.noreply.github.com>

---------

Signed-off-by: pos-ei-don <1822533+pos-ei-don@users.noreply.github.com>
Co-authored-by: Philipp Wacker <philipp.wacker@ibf-solutions.com>
---
 backend/python/vllm/backend.py                | 161 +++++++++--
 backend/python/vllm/test.py                   | 259 +++++++++++++++++-
 examples/vllm-bench/README.md                 |  54 ++++
 .../vllm-bench/ttft_streaming_tool_parser.py  | 175 ++++++++++++
 4 files changed, 631 insertions(+), 18 deletions(-)
 create mode 100644 examples/vllm-bench/README.md
 create mode 100755 examples/vllm-bench/ttft_streaming_tool_parser.py

diff --git a/backend/python/vllm/backend.py b/backend/python/vllm/backend.py
index 20064e233..a38849137 100644
--- a/backend/python/vllm/backend.py
+++ b/backend/python/vllm/backend.py
@@ -598,23 +598,124 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
 
         # Stream the results
         generated_text = ""
+        generated_token_ids: list[int] = []
         last_output = None
+
+        # Tool-parsing strategy decision (made once, before the loop):
+        #
+        # When a tool parser is active, the model's raw tool-call markup
+        # (e.g. <tool_call>...) must not be streamed verbatim as delta.content
+        # — clients would see the unparsed syntax. Two paths:
+        #
+        # (A) native streaming via parser.extract_tool_calls_streaming. All
+        #     concrete tool parsers shipped with vLLM 0.23+ implement this
+        #     (Granite4, Qwen3Coder, DeepSeekV31, Jamba, Ernie45, Hermes,
+        #     llama3_json, mistral, …). The parser decides per-delta whether
+        #     to emit content or suppress tool-call markup, and emits a
+        #     structured DeltaMessage(tool_calls=[...]) when a call is ready.
+        # (B) buffer fallback — used only when the parser surprisingly lacks
+        #     the streaming method or it raises mid-stream. The post-loop
+        #     extract_tool_calls assembles the final chat_delta. Same correctness
+        #     guarantee as a non-streaming response, at the cost of a delayed
+        #     final chunk.
+        has_tool_parser = bool(self.tool_parser_cls and request.Tools)
+        tp_instance = None
+        tp_request = None
+        native_streaming = False
+        native_streaming_error = False
+        if has_tool_parser:
+            try:
+                tools_for_parser = json.loads(request.Tools)
+            except json.JSONDecodeError:
+                tools_for_parser = []
+            try:
+                tp_instance = self.tool_parser_cls(self.tokenizer, tools=tools_for_parser)
+            except TypeError:
+                tp_instance = self.tool_parser_cls(self.tokenizer)
+            # Build a minimal ChatCompletionRequest so the streaming method
+            # sees the tools list. We do not need any other request fields —
+            # parsers only read .tools (and sometimes .tool_choice, which we
+            # leave at default).
+            try:
+                from vllm.entrypoints.openai.chat_completion.protocol import (
+                    ChatCompletionRequest as _CCR,
+                )
+                tp_request = _CCR(
+                    model="local",
+                    messages=[{"role": "user", "content": ""}],
+                    tools=tools_for_parser or None,
+                )
+            except Exception as e:
+                print(f"Could not build ChatCompletionRequest for streaming parser: {e}",
+                      file=sys.stderr)
+                tp_request = None
+            native_streaming = (
+                tp_request is not None
+                and hasattr(tp_instance, "extract_tool_calls_streaming")
+            )
+
         try:
             async for request_output in outputs:
                 iteration_text = request_output.outputs[0].text
                 last_output = request_output
 
                 if streaming:
-                    # Remove text already sent as vllm concatenates the text from previous yields
                     delta_iteration_text = iteration_text.removeprefix(generated_text)
-                    # Send the partial result
-                    yield backend_pb2.Reply(
-                        message=bytes(delta_iteration_text, encoding='utf-8'),
-                        chat_deltas=[backend_pb2.ChatDelta(content=delta_iteration_text)],
-                    )
+                    new_token_ids = list(request_output.outputs[0].token_ids)
+                    delta_token_ids = new_token_ids[len(generated_token_ids):]
 
-                # Keep track of text generated
+                    if not has_tool_parser:
+                        # Plain streaming — unchanged from pre-tool-parser path.
+                        yield backend_pb2.Reply(
+                            message=bytes(delta_iteration_text, encoding='utf-8'),
+                            chat_deltas=[backend_pb2.ChatDelta(content=delta_iteration_text)],
+                        )
+                    elif native_streaming and not native_streaming_error:
+                        # (A) Native vLLM extract_tool_calls_streaming.
+                        try:
+                            msg = tp_instance.extract_tool_calls_streaming(
+                                previous_text=generated_text,
+                                current_text=iteration_text,
+                                delta_text=delta_iteration_text,
+                                previous_token_ids=generated_token_ids,
+                                current_token_ids=new_token_ids,
+                                delta_token_ids=delta_token_ids,
+                                request=tp_request,
+                            )
+                        except Exception as e:
+                            print(f"Streaming tool parser error (falling back to "
+                                  f"buffer for the rest of the stream): {e}",
+                                  file=sys.stderr)
+                            native_streaming_error = True
+                            msg = None
+                        if msg is not None:
+                            tc_protos = []
+                            for tc in (msg.tool_calls or []):
+                                fn = tc.function or None
+                                tc_protos.append(backend_pb2.ToolCallDelta(
+                                    index=tc.index,
+                                    id=tc.id or "",
+                                    name=(fn.name if fn and fn.name else "") or "",
+                                    arguments=(fn.arguments if fn and fn.arguments else "") or "",
+                                ))
+                            cd_kwargs = {}
+                            if msg.content:
+                                cd_kwargs["content"] = msg.content
+                            if msg.reasoning:
+                                cd_kwargs["reasoning_content"] = msg.reasoning
+                            if tc_protos:
+                                cd_kwargs["tool_calls"] = tc_protos
+                            if cd_kwargs:
+                                yield backend_pb2.Reply(
+                                    message=bytes(msg.content or "", encoding='utf-8'),
+                                    chat_deltas=[backend_pb2.ChatDelta(**cd_kwargs)],
+                                )
+                    # (B) buffer fallback — emit nothing during the stream.
+                    # The post-loop extract_tool_calls block builds the final chunk.
+
+                # Keep track of text + token_ids generated
                 generated_text = iteration_text
+                generated_token_ids = list(request_output.outputs[0].token_ids)
         finally:
             await outputs.aclose()
 
@@ -639,16 +740,19 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
             except Exception as e:
                 print(f"Reasoning parser error: {e}", file=sys.stderr)
 
-        if self.tool_parser_cls and request.Tools:
+        # When (A) native streaming ran cleanly, per-delta yields above already
+        # delivered everything — do NOT extract again on the full text or we'd
+        # duplicate content/tool_calls into the final chunk.
+        if has_tool_parser and not (native_streaming and not native_streaming_error):
             try:
-                tools = json.loads(request.Tools)
-                # Some concrete parsers only accept the tokenizer; only the
-                # abstract base declares the tools kwarg. Try with tools first,
-                # fall back to tokenizer-only.
-                try:
-                    tp = self.tool_parser_cls(self.tokenizer, tools=tools)
-                except TypeError:
-                    tp = self.tool_parser_cls(self.tokenizer)
+                tp = tp_instance
+                if tp is None:
+                    # Defensive: tp_instance build failed earlier; reconstruct.
+                    tools = json.loads(request.Tools)
+                    try:
+                        tp = self.tool_parser_cls(self.tokenizer, tools=tools)
+                    except TypeError:
+                        tp = self.tool_parser_cls(self.tokenizer)
                 info = tp.extract_tool_calls(content, request=None)
                 if info.tools_called:
                     content = info.content or ""
@@ -661,6 +765,10 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                         ))
             except Exception as e:
                 print(f"Tool parser error: {e}", file=sys.stderr)
+        elif native_streaming and not native_streaming_error:
+            # Per-delta path already emitted content + tool_calls; the final
+            # chat_delta should carry only metadata (token counts, logprobs).
+            content = ""
 
         # Extract token counts
         prompt_tokens = 0
@@ -700,7 +808,26 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
         )
 
         if streaming:
-            # Final chunk with structured data
+            # Final chunk with structured data.
+            #
+            # If we used the buffer fallback (has_tool_parser=True AND native
+            # streaming did NOT run cleanly) and the parser found no tool call,
+            # flush the buffered content as ONE content delta — and clear the
+            # final chat_delta's content so the metadata chunk does not repeat
+            # what we just sent. This is the plain-text-with-tool-parser path.
+            buffered_fallback = (
+                has_tool_parser
+                and not (native_streaming and not native_streaming_error)
+            )
+            if buffered_fallback and not tool_calls_proto and content:
+                yield backend_pb2.Reply(
+                    message=bytes(content, encoding='utf-8'),
+                    chat_deltas=[backend_pb2.ChatDelta(content=content)],
+                )
+                chat_delta = backend_pb2.ChatDelta(
+                    reasoning_content=reasoning_content,
+                    tool_calls=tool_calls_proto,
+                )
             yield backend_pb2.Reply(
                 message=b"",
                 prompt_tokens=prompt_tokens,
diff --git a/backend/python/vllm/test.py b/backend/python/vllm/test.py
index 25a7f54e6..d00595f01 100644
--- a/backend/python/vllm/test.py
+++ b/backend/python/vllm/test.py
@@ -278,4 +278,261 @@ class TestBackendServicer(unittest.TestCase):
             print(err)
             self.fail("Embedding service failed")
         finally:
-            self.tearDown()
\ No newline at end of file
+            self.tearDown()
+
+
+class TestStreamingToolParser(unittest.TestCase):
+    """
+    Server-less unit tests for the streaming + tool-parser machinery in
+    BackendServicer._predict. These tests instantiate BackendServicer
+    directly and mock the vLLM engine + tool parser, so they do not need
+    a GPU, a model, or a running gRPC server. Kept in a separate class to
+    avoid the parent setUp() which spawns a subprocess.
+
+    Covers #582 (follow-up to #10346):
+      1. Markup-leak prevention with a non-streaming parser (buffer fallback)
+      2. No content duplication on the plain-text path with the buffer fallback
+      3. Native streaming progressive plain-text emission
+      4. Native streaming structured tool_call, no markup leak
+      5. Parser exception → graceful fallback to buffer, still no markup
+      6. No-tool-parser regression: unchanged per-delta content stream
+    """
+
+    @staticmethod
+    def _make_generate(chunks):
+        """Build a fake vLLM engine.generate that yields cumulative chunks."""
+        from types import SimpleNamespace
+        async def gen(*a, **k):
+            for i, t in enumerate(chunks):
+                yield SimpleNamespace(
+                    outputs=[SimpleNamespace(
+                        text=t,
+                        token_ids=list(range(i + 1)),
+                        logprobs=None,
+                    )],
+                    prompt_token_ids=[0],
+                )
+        return lambda *a, **k: gen()
+
+    @staticmethod
+    def _collect(servicer, req):
+        import asyncio
+        async def run():
+            return [r async for r in servicer._predict(req, None, streaming=True)]
+        return asyncio.run(run())
+
+    def _new_servicer(self):
+        import sys, os
+        sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+        from backend import BackendServicer
+        s = BackendServicer()
+        s.reasoning_parser_cls = None
+        s.tool_parser_cls = None
+        s.tokenizer = None
+        return s
+
+    # ── Case 1+2: parser without streaming method → buffer fallback ──
+    def test_buffer_path_no_markup_no_duplication(self):
+        from types import SimpleNamespace
+
+        def parser_cls(called, content_text, calls):
+            class _P:
+                def __init__(self, tokenizer, tools=None):
+                    pass
+                # NOTE: NO extract_tool_calls_streaming → takes the buffer path
+                def extract_tool_calls(self, c, request=None):
+                    return SimpleNamespace(
+                        tools_called=called, content=content_text, tool_calls=calls,
+                    )
+            return _P
+
+        tools_json = '[{"type":"function","function":{"name":"calc","parameters":{}}}]'
+
+        # Tool-call case: no raw markup in any delta.content
+        s = self._new_servicer()
+        s.llm = SimpleNamespace(generate=self._make_generate([
+            '<tool_call>\n{"name": "calc"',
+            '<tool_call>\n{"name": "calc", "arguments": {"x": 1}}\n</tool_call>',
+        ]))
+        call = SimpleNamespace(id="call_1",
+                               function=SimpleNamespace(name="calc", arguments='{"x": 1}'))
+        s.tool_parser_cls = parser_cls(True, "", [call])
+        req = backend_pb2.PredictOptions(Prompt="x", Tools=tools_json)
+        replies = self._collect(s, req)
+        contents = [cd.content for r in replies for cd in r.chat_deltas if cd.content]
+        self.assertFalse(
+            any("<tool_call" in c for c in contents),
+            f"markup leaked: {contents!r}",
+        )
+        names = [tc.name for r in replies for cd in r.chat_deltas for tc in cd.tool_calls]
+        self.assertIn("calc", names, "tool_call missing from final chunk")
+
+        # Plain-text-with-tools case: full content delivered exactly once
+        s2 = self._new_servicer()
+        s2.llm = SimpleNamespace(generate=self._make_generate([
+            "The capital ",
+            "The capital of France is Paris.",
+        ]))
+        s2.tool_parser_cls = parser_cls(False, "", [])
+        req2 = backend_pb2.PredictOptions(Prompt="x", Tools=tools_json)
+        joined = "".join(
+            cd.content for r in self._collect(s2, req2)
+            for cd in r.chat_deltas if cd.content
+        )
+        self.assertEqual(
+            joined.count("The capital of France is Paris."), 1,
+            f"buffered content duplicated: {joined!r}",
+        )
+
+    # ── Case 3: native streaming, progressive plain text ──
+    def test_native_streaming_progressive_plain_text(self):
+        from types import SimpleNamespace
+
+        class _DeltaMsg:
+            def __init__(self, content=None, reasoning=None, tool_calls=None):
+                self.content = content
+                self.reasoning = reasoning
+                self.tool_calls = tool_calls or []
+
+        class StreamingParser:
+            def __init__(self, tokenizer, tools=None):
+                pass
+            def extract_tool_calls(self, c, request=None):
+                # Should NOT be called when native streaming runs successfully.
+                raise AssertionError("extract_tool_calls invoked on native-streaming path")
+            def extract_tool_calls_streaming(
+                self, previous_text, current_text, delta_text,
+                previous_token_ids, current_token_ids, delta_token_ids, request,
+            ):
+                if not delta_text:
+                    return None
+                return _DeltaMsg(content=delta_text)
+
+        s = self._new_servicer()
+        s.llm = SimpleNamespace(generate=self._make_generate([
+            "Paris ",
+            "Paris is ",
+            "Paris is the capital of France.",
+        ]))
+        s.tool_parser_cls = StreamingParser
+        req = backend_pb2.PredictOptions(
+            Prompt="x",
+            Tools='[{"type":"function","function":{"name":"calc","parameters":{}}}]',
+        )
+        replies = self._collect(s, req)
+
+        intermediate_content = [
+            cd.content for r in replies[:-1] for cd in r.chat_deltas if cd.content
+        ]
+        self.assertTrue(
+            len(intermediate_content) > 0,
+            "Plain-text response not streamed progressively (native streaming inactive?)",
+        )
+        assembled = "".join(
+            cd.content for r in replies for cd in r.chat_deltas if cd.content
+        )
+        self.assertEqual(
+            assembled, "Paris is the capital of France.",
+            f"Assembled content wrong: {assembled!r}",
+        )
+
+    # ── Case 4: native streaming, structured tool_call, no markup ──
+    def test_native_streaming_tool_call_no_markup_leak(self):
+        from types import SimpleNamespace
+
+        class _DeltaMsg:
+            def __init__(self, content=None, reasoning=None, tool_calls=None):
+                self.content = content
+                self.reasoning = reasoning
+                self.tool_calls = tool_calls or []
+
+        class _ToolCallStreamer:
+            def __init__(self, tokenizer, tools=None):
+                self._emitted = False
+            def extract_tool_calls(self, c, request=None):
+                raise AssertionError("extract_tool_calls invoked on native-streaming path")
+            def extract_tool_calls_streaming(
+                self, previous_text, current_text, delta_text,
+                previous_token_ids, current_token_ids, delta_token_ids, request,
+            ):
+                if "</tool_call>" in current_text and not self._emitted:
+                    self._emitted = True
+                    fn = SimpleNamespace(name="calc", arguments='{"x": 1}')
+                    tc = SimpleNamespace(id="call_1", type="function", index=0, function=fn)
+                    return _DeltaMsg(tool_calls=[tc])
+                return None
+
+        s = self._new_servicer()
+        s.llm = SimpleNamespace(generate=self._make_generate([
+            '<tool_call>\n',
+            '<tool_call>\n{"name": "calc"',
+            '<tool_call>\n{"name": "calc", "arguments": {"x": 1}}\n</tool_call>',
+        ]))
+        s.tool_parser_cls = _ToolCallStreamer
+        req = backend_pb2.PredictOptions(
+            Prompt="x",
+            Tools='[{"type":"function","function":{"name":"calc","parameters":{}}}]',
+        )
+        replies = self._collect(s, req)
+
+        contents = [cd.content for r in replies for cd in r.chat_deltas if cd.content]
+        self.assertFalse(
+            any("<tool_call" in c or "</tool_call>" in c for c in contents),
+            f"markup leaked as content: {contents!r}",
+        )
+        names = [tc.name for r in replies for cd in r.chat_deltas for tc in cd.tool_calls if tc.name]
+        args  = [tc.arguments for r in replies for cd in r.chat_deltas for tc in cd.tool_calls if tc.arguments]
+        self.assertIn("calc", names, f"tool_call name missing; got {names!r}")
+        self.assertIn('{"x": 1}', args, f"tool_call args missing; got {args!r}")
+
+    # ── Case 5: parser exception → fallback to buffer, no leak ──
+    def test_native_streaming_parser_exception_falls_back_to_buffer(self):
+        from types import SimpleNamespace
+        call = SimpleNamespace(id="call_1",
+                               function=SimpleNamespace(name="calc", arguments='{"x": 1}'))
+
+        class _BrokenStreamer:
+            def __init__(self, tokenizer, tools=None):
+                pass
+            def extract_tool_calls(self, c, request=None):
+                return SimpleNamespace(tools_called=True, content="", tool_calls=[call])
+            def extract_tool_calls_streaming(self, *a, **kw):
+                raise RuntimeError("simulated parser bug")
+
+        s = self._new_servicer()
+        s.llm = SimpleNamespace(generate=self._make_generate([
+            '<tool_call>\n{"name": "calc"',
+            '<tool_call>\n{"name": "calc", "arguments": {"x": 1}}\n</tool_call>',
+        ]))
+        s.tool_parser_cls = _BrokenStreamer
+        req = backend_pb2.PredictOptions(
+            Prompt="x",
+            Tools='[{"type":"function","function":{"name":"calc","parameters":{}}}]',
+        )
+        replies = self._collect(s, req)
+
+        contents = [cd.content for r in replies for cd in r.chat_deltas if cd.content]
+        self.assertFalse(
+            any("<tool_call" in c for c in contents),
+            f"markup leaked after parser exception: {contents!r}",
+        )
+        names = [tc.name for r in replies for cd in r.chat_deltas for tc in cd.tool_calls]
+        self.assertIn("calc", names, "tool_call missing from final chunk after fallback")
+
+    # ── Case 6: no tool parser → unchanged per-delta content stream ──
+    def test_no_tool_parser_unchanged_per_delta_stream(self):
+        from types import SimpleNamespace
+        s = self._new_servicer()  # tool_parser_cls already None
+        s.llm = SimpleNamespace(generate=self._make_generate([
+            "Hello ", "Hello world", "Hello world!",
+        ]))
+        req = backend_pb2.PredictOptions(Prompt="x", Tools="")
+        replies = self._collect(s, req)
+
+        intermediate = [
+            cd.content for r in replies[:-1] for cd in r.chat_deltas if cd.content
+        ]
+        self.assertEqual(
+            intermediate, ["Hello ", "world", "!"],
+            f"plain streaming changed; got {intermediate!r}",
+        )
diff --git a/examples/vllm-bench/README.md b/examples/vllm-bench/README.md
new file mode 100644
index 000000000..00cd294bb
--- /dev/null
+++ b/examples/vllm-bench/README.md
@@ -0,0 +1,54 @@
+# vLLM streaming + tool-parser benchmark
+
+A small, self-contained Python script (stdlib only) that measures
+time-to-first-token (TTFT) for the vLLM backend's streaming path with
+a tool parser configured.
+
+## Why this exists
+
+When a vLLM tool parser is active and a streaming chat completion is requested,
+LocalAI used to buffer the full generation to prevent raw tool-call markup
+(e.g. `<tool_call>...`) from leaking as `delta.content`. That was correct
+for tool-call responses, but it turned plain-text responses into effectively
+non-streaming — the client received nothing until the model finished.
+
+With native parser-side streaming (`parser.extract_tool_calls_streaming`,
+implemented by every concrete vLLM 0.23+ tool parser), each delta can be
+classified per-token: emit as content, emit as a structured tool_call, or
+suppress.
+
+## Three scenarios
+
+| Scenario | Request | Expected outcome |
+|---|---|---|
+| `tool_call`         | "What is the weather in Paris? Please use the tool." | Model calls `get_weather`. `delta.tool_calls` chunks; no content leak. |
+| `plain_text_short`  | "Explain in 3 short sentences what a hash table is. Do NOT call any tool." | Model writes ~3 sentences. |
+| `plain_text_long`   | "Write a thorough 8-paragraph explanation of how Python's GIL works…" | Model writes ~1500 tokens of prose. |
+
+The **long scenario** is where the streaming/buffering difference is most
+dramatic: with the buffer-all path, the client sees nothing for 20+ seconds
+and then everything at once; with native streaming, the first token arrives
+in <100ms and the response flows progressively.
+
+## What the script reports
+
+For each scenario, across N runs:
+
+- `ttf_content_s` — time until the first `delta.content` chunk
+- `ttf_tool_s` — time until the first `delta.tool_calls` chunk
+- `n_content_chunks` — total content deltas (1 = bundled, >>1 = streamed)
+- `n_tool_chunks` — total tool_call deltas
+- `total_s` — total wall-clock until `[DONE]`
+- `finish_reason` — `tool_calls` / `stop` / `length`
+
+The big tell is **`n_content_chunks` vs `total_s` ratio**:
+- Buffer-all: `n_content_chunks` ≈ 1, `ttf_content_s` ≈ `total_s` (one chunk at end)
+- Streaming: `n_content_chunks` ≈ token count, `ttf_content_s` ≈ first-token latency
+
+## Usage
+
+```bash
+python ttft_streaming_tool_parser.py --url http://localhost:8080 --model my-coder --runs 3
+```
+
+JSON results are written to `ttft_bench_<label>.json` (default label: `run`).
diff --git a/examples/vllm-bench/ttft_streaming_tool_parser.py b/examples/vllm-bench/ttft_streaming_tool_parser.py
new file mode 100755
index 000000000..7f38307e5
--- /dev/null
+++ b/examples/vllm-bench/ttft_streaming_tool_parser.py
@@ -0,0 +1,175 @@
+#!/usr/bin/env python3
+"""
+TTFT benchmark for the vLLM backend's streaming + tool-parser path.
+
+Three scenarios:
+  1. tool_call        — request mentions a tool; model is expected to call it
+  2. plain_text_short — request offers a tool but explicitly asks for ~3 sentences
+  3. plain_text_long  — same as above but asks for ~8 paragraphs (1500 tokens)
+
+The long scenario shows the dramatic difference between buffering and
+streaming most clearly: with buffer-all, the client sees nothing for
+20+ seconds; with native streaming, the first token arrives in <100 ms.
+
+Usage:
+  python ttft_streaming_tool_parser.py \\
+      --url http://localhost:8080 --model my-coder --runs 3
+
+The script is self-contained (stdlib only — urllib, json, time, argparse).
+"""
+import argparse
+import json
+import sys
+import time
+import urllib.request
+
+DEFAULT_TOOLS = [{
+    "type": "function",
+    "function": {
+        "name": "get_weather",
+        "description": "Get current weather for a city",
+        "parameters": {
+            "type": "object",
+            "properties": {"city": {"type": "string"}},
+            "required": ["city"],
+        },
+    },
+}]
+
+SCENARIOS = [
+    {
+        "label": "tool_call",
+        "messages": [{"role": "user",
+                      "content": "What is the weather in Paris? Please use the tool."}],
+        "max_tokens": 80,
+    },
+    {
+        "label": "plain_text_short",
+        "messages": [{"role": "user",
+                      "content": "Explain in 3 short sentences what a hash table is. "
+                                 "Do NOT call any tool."}],
+        "max_tokens": 200,
+    },
+    {
+        "label": "plain_text_long",
+        "messages": [{"role": "user",
+                      "content": "Write a thorough 8-paragraph explanation of how "
+                                 "Python's GIL works, including history, current "
+                                 "state, no-GIL build, and alternatives. Be "
+                                 "detailed. Do NOT call any tool."}],
+        "max_tokens": 1500,
+    },
+]
+
+
+def bench_one(url, model, messages, tools, max_tokens, timeout):
+    body = json.dumps({
+        "model": model,
+        "stream": True,
+        "tools": tools,
+        "messages": messages,
+        "max_tokens": max_tokens,
+    }).encode()
+    req = urllib.request.Request(
+        f"{url.rstrip('/')}/v1/chat/completions",
+        data=body, headers={"Content-Type": "application/json"},
+    )
+
+    t0 = time.perf_counter()
+    first_content = None
+    first_tool = None
+    n_content = 0
+    n_tool = 0
+    last = None
+    finish = None
+    with urllib.request.urlopen(req, timeout=timeout) as resp:
+        for line in resp:
+            line = line.decode("utf-8", "replace").strip()
+            if not line.startswith("data: "):
+                continue
+            payload = line[6:]
+            if payload == "[DONE]":
+                break
+            try:
+                chunk = json.loads(payload)
+            except Exception:
+                continue
+            if not chunk.get("choices"):
+                continue
+            ch = chunk["choices"][0]
+            delta = ch.get("delta") or {}
+            now = time.perf_counter() - t0
+            if delta.get("content"):
+                if first_content is None:
+                    first_content = now
+                n_content += 1
+            if delta.get("tool_calls"):
+                if first_tool is None:
+                    first_tool = now
+                n_tool += 1
+            if ch.get("finish_reason"):
+                finish = ch["finish_reason"]
+            last = now
+    return {
+        "ttf_content_s": first_content,
+        "ttf_tool_s": first_tool,
+        "n_content_chunks": n_content,
+        "n_tool_chunks": n_tool,
+        "total_s": last,
+        "finish_reason": finish,
+    }
+
+
+def stats(values):
+    values = [v for v in values if v is not None]
+    if not values:
+        return "n/a"
+    return f"min={min(values):.3f}  avg={sum(values)/len(values):.3f}  max={max(values):.3f}"
+
+
+def main():
+    p = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+    p.add_argument("--url", default="http://localhost:8080",
+                   help="LocalAI base URL (default: %(default)s)")
+    p.add_argument("--model", default="coder", help="Model name (default: %(default)s)")
+    p.add_argument("--runs", type=int, default=3, help="Repetitions per scenario (default: %(default)s)")
+    p.add_argument("--timeout", type=int, default=180, help="Per-request timeout in seconds")
+    p.add_argument("--label", default="run",
+                   help="Tag for the JSON output file (default: %(default)s)")
+    args = p.parse_args()
+
+    print(f"=== TTFT Bench — {args.url}  model={args.model}  runs={args.runs} ===")
+    summary = {}
+    for sc in SCENARIOS:
+        print(f"\nScenario: {sc['label']}")
+        rows = []
+        for run in range(args.runs):
+            r = bench_one(args.url, args.model,
+                          sc["messages"], DEFAULT_TOOLS, sc["max_tokens"], args.timeout)
+            rows.append(r)
+            ttf_c = f"{r['ttf_content_s']:.3f}" if r["ttf_content_s"] is not None else "—"
+            ttf_t = f"{r['ttf_tool_s']:.3f}" if r["ttf_tool_s"] is not None else "—"
+            print(f"  run {run+1}/{args.runs}: "
+                  f"ttf_content={ttf_c}s  ttf_tool={ttf_t}s  "
+                  f"n_content={r['n_content_chunks']}  n_tool={r['n_tool_chunks']}  "
+                  f"total={r['total_s']:.2f}s  finish={r['finish_reason']}")
+        summary[sc["label"]] = rows
+
+    print("\n=== Summary (per scenario) ===")
+    for label, rows in summary.items():
+        print(f"[{label}]")
+        print(f"  ttf_content_s:    {stats(r['ttf_content_s'] for r in rows)}")
+        print(f"  ttf_tool_s:       {stats(r['ttf_tool_s']    for r in rows)}")
+        print(f"  n_content_chunks: {stats(r['n_content_chunks'] for r in rows)}")
+        print(f"  n_tool_chunks:    {stats(r['n_tool_chunks']    for r in rows)}")
+        print(f"  total_s:          {stats(r['total_s']        for r in rows)}")
+
+    out = f"ttft_bench_{args.label}.json"
+    with open(out, "w") as f:
+        json.dump(summary, f, indent=2)
+    print(f"\nSaved to {out}")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())

From b50b1fe418b5af4cf2b8cb22c1c32dce659ff8ff Mon Sep 17 00:00:00 2001
From: Leoy <leiyu.cs@outlook.com>
Date: Sun, 21 Jun 2026 23:17:04 +0800
Subject: [PATCH 25/99] feat(watchdog): add size-aware LRU eviction mode
 (#9527)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* feat(watchdog): add size-aware LRU eviction mode

When the model count hits the LRU limit or the memory reclaimer fires,
evict the largest model by on-disk file size first rather than the
least-recently-used one.  For GGUF models the file size is a reliable
proxy for GPU/RAM footprint, so evicting the largest candidate maximises
freed memory per eviction round while keeping small utility models
(embeddings, classifiers, rerankers) resident.

Changes:
- `pkg/model/watchdog.go`: add `sizeAwareEviction` flag and
  `modelSizes map[string]int64` to `WatchDog`; sort candidates by
  `sizeBytes` desc (LRU time as tiebreaker) when the flag is set;
  add `RegisterModelSize`, `SetSizeAwareEviction`, `GetSizeAwareEviction`
- `pkg/model/watchdog_options.go`: add `WithSizeAwareEviction` option
- `pkg/model/initializers.go`: stat model file after load and call
  `RegisterModelSize` so size data is available before the first eviction
- `core/config/application_config.go`, `runtime_settings.go`: add
  `SizeAwareEviction` field and `WithSizeAwareEviction` app option;
  expose via `ToRuntimeSettings` / `ApplyRuntimeSettings` for the
  `POST /api/settings` live-reload path
- `core/cli/run.go`: add `--size-aware-eviction` flag /
  `LOCALAI_SIZE_AWARE_EVICTION` env var
- `core/application/startup.go`, `watchdog.go`: wire the new option
  through to `NewWatchDog`
- `pkg/model/watchdog_test.go`: 5 new specs — option enable, dynamic
  toggle, largest-first ordering, equal-size LRU tiebreaker, no-size
  fallback to LRU, and size-map cleanup on eviction

Closes #9375

Signed-off-by: supermario_leo <leo.stack@outlook.com>

* refactor(watchdog): use vram estimation scaffolding for model size

Replace the brittle os.Stat(modelFile) approach with a proper call to
pkg/vram, which handles multi-file models (DownloadFiles, MMProj) and
all weight file types, not just single GGUF files.

- Add estimateModelSizeBytes() in core/backend/options.go that collects
  all weight file URIs from the model config, resolves them to file://
  URIs, and calls vram.Estimate() with the shared DefaultCachedSizeResolver
  (15-min TTL cache avoids redundant stat calls on repeated loads)
- Thread the result through via a new WithModelSizeBytes() loader option
- In initializers.go, consume the pre-computed size instead of calling
  os.Stat; if no size was supplied (e.g. for external/router-dispatched
  models) the registration is simply skipped

Signed-off-by: supermario_leo <leo.stack@outlook.com>

* refactor(watchdog): use EstimateModel with HF fallback for size estimation

Switch estimateModelSizeBytes from calling vram.Estimate directly to the
unified vram.EstimateModel entry point, which adds automatic fallbacks:
file-based GGUF metadata → HF API → size string.

Also extract the HuggingFace repo ID from model URIs (huggingface://,
hf://, https://huggingface.co/ and org/model short-form) and pass it
as ModelEstimateInput.HFRepo, so models not yet downloaded locally can
still get a size estimate via the HF API.

Addresses @mudler's review feedback: "better to rely on EstimateModel
and pass by the HF URL of the model extracted from the URI".

Signed-off-by: supermario_leo <leo.stack@outlook.com>

* feat(webui): add Size-Aware Eviction toggle to settings page

The size-aware eviction setting was wired through the CLI flag and the
RuntimeSettings live-reload path (POST /api/settings) but had no handle
on the React settings page, so it could not be toggled from the UI.

Add a Size-Aware Eviction toggle to the Watchdog section, next to the
existing Force Eviction When Busy / LRU eviction handles. The settings
page loads and saves the whole RuntimeSettings object, so the new
size_aware_eviction key is picked up with no extra plumbing.

Addresses @mudler's review feedback: the application config setting
should land on the same UI settings page as the other handles.

Signed-off-by: supermario_leo <leo.stack@outlook.com>

---------

Signed-off-by: supermario_leo <leo.stack@outlook.com>
---
 core/application/startup.go               |   7 ++
 core/application/watchdog.go              |   1 +
 core/backend/options.go                   |  68 ++++++++++++++
 core/cli/run.go                           |   4 +
 core/config/application_config.go         |  17 ++++
 core/config/runtime_settings.go           |   1 +
 core/http/react-ui/src/pages/Settings.jsx |   3 +
 pkg/model/initializers.go                 |   6 ++
 pkg/model/loader_options.go               |  11 +++
 pkg/model/watchdog.go                     | 103 ++++++++++++++++-----
 pkg/model/watchdog_options.go             |  14 +++
 pkg/model/watchdog_test.go                | 106 ++++++++++++++++++++++
 12 files changed, 319 insertions(+), 22 deletions(-)

diff --git a/core/application/startup.go b/core/application/startup.go
index fa5de5ede..352d66dab 100644
--- a/core/application/startup.go
+++ b/core/application/startup.go
@@ -644,6 +644,12 @@ func loadRuntimeSettingsFromFile(options *config.ApplicationConfig) {
 			options.ForceEvictionWhenBusy = *settings.ForceEvictionWhenBusy
 		}
 	}
+	if settings.SizeAwareEviction != nil {
+		// Only apply if current value is default (false), suggesting it wasn't set from env var
+		if !options.SizeAwareEviction {
+			options.SizeAwareEviction = *settings.SizeAwareEviction
+		}
+	}
 	if settings.LRUEvictionMaxRetries != nil {
 		// Only apply if current value is default (30), suggesting it wasn't set from env var
 		if options.LRUEvictionMaxRetries == 0 {
@@ -847,6 +853,7 @@ func initializeWatchdog(application *Application, options *config.ApplicationCon
 			model.WithLRULimit(lruLimit),
 			model.WithMemoryReclaimer(options.MemoryReclaimerEnabled, options.MemoryReclaimerThreshold),
 			model.WithForceEvictionWhenBusy(options.ForceEvictionWhenBusy),
+			model.WithSizeAwareEviction(options.SizeAwareEviction),
 		)
 		application.ModelLoader().SetWatchDog(wd)
 
diff --git a/core/application/watchdog.go b/core/application/watchdog.go
index 9658b5114..c71871d87 100644
--- a/core/application/watchdog.go
+++ b/core/application/watchdog.go
@@ -90,6 +90,7 @@ func (a *Application) startWatchdog() error {
 			model.WithLRULimit(lruLimit),
 			model.WithMemoryReclaimer(appConfig.MemoryReclaimerEnabled, appConfig.MemoryReclaimerThreshold),
 			model.WithForceEvictionWhenBusy(appConfig.ForceEvictionWhenBusy),
+			model.WithSizeAwareEviction(appConfig.SizeAwareEviction),
 		)
 
 		// Create new stop channel BEFORE setting up any goroutines
diff --git a/core/backend/options.go b/core/backend/options.go
index 18c3b7f27..3baab6902 100644
--- a/core/backend/options.go
+++ b/core/backend/options.go
@@ -1,6 +1,7 @@
 package backend
 
 import (
+	"context"
 	"encoding/json"
 	"fmt"
 	"math/rand/v2"
@@ -12,7 +13,9 @@ import (
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/trace"
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	"github.com/mudler/LocalAI/pkg/downloader"
 	"github.com/mudler/LocalAI/pkg/model"
+	"github.com/mudler/LocalAI/pkg/vram"
 	"github.com/mudler/xlog"
 )
 
@@ -33,6 +36,67 @@ func recordModelLoadFailure(appConfig *config.ApplicationConfig, modelName, back
 	})
 }
 
+// estimateModelSizeBytes uses the unified EstimateModel entry point to compute
+// the total weight-file size for a model config.  It collects all weight files
+// from DownloadFiles, Model, and MMProj, and also extracts the HuggingFace
+// repo ID so EstimateModel can fall back to the HF API when local file
+// metadata is unavailable (e.g. not-yet-downloaded models).
+func estimateModelSizeBytes(c config.ModelConfig, modelsPath string) int64 {
+	seen := make(map[string]bool)
+	input := vram.ModelEstimateInput{}
+
+	addFile := func(uri string) {
+		if !vram.IsWeightFile(uri) {
+			return
+		}
+		resolved := uri
+		if !strings.Contains(uri, "://") {
+			resolved = "file://" + filepath.Join(modelsPath, uri)
+		}
+		if seen[resolved] {
+			return
+		}
+		seen[resolved] = true
+		input.Files = append(input.Files, vram.FileInput{URI: resolved})
+	}
+
+	// tryHFRepo resolves any huggingface:// or hf:// URI to an HTTPS URL and
+	// then extracts the org/model repo ID for use as the HF fallback path.
+	tryHFRepo := func(uri string) {
+		if input.HFRepo != "" {
+			return
+		}
+		resolved := downloader.URI(uri).ResolveURL()
+		if repoID, ok := vram.ExtractHFRepoID(resolved); ok {
+			input.HFRepo = repoID
+		}
+	}
+
+	for _, f := range c.DownloadFiles {
+		uriStr := string(f.URI)
+		addFile(uriStr)
+		tryHFRepo(uriStr)
+	}
+	addFile(c.Model)
+	tryHFRepo(c.Model)
+	if c.MMProj != "" {
+		addFile(c.MMProj)
+	}
+
+	if len(input.Files) == 0 && input.HFRepo == "" {
+		return 0
+	}
+
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer cancel()
+
+	result, err := vram.EstimateModel(ctx, input)
+	if err != nil || result.SizeBytes == 0 {
+		return 0
+	}
+	return int64(result.SizeBytes)
+}
+
 func ModelOptions(c config.ModelConfig, so *config.ApplicationConfig, opts ...model.Option) []model.Option {
 	defOpts := []model.Option{
 		model.WithBackendString(c.Backend),
@@ -70,6 +134,10 @@ func ModelOptions(c config.ModelConfig, so *config.ApplicationConfig, opts ...mo
 		defOpts = append(defOpts, model.WithExternalBackend(k, v))
 	}
 
+	if sizeBytes := estimateModelSizeBytes(c, so.SystemState.Model.ModelsPath); sizeBytes > 0 {
+		defOpts = append(defOpts, model.WithModelSizeBytes(sizeBytes))
+	}
+
 	return append(defOpts, opts...)
 }
 
diff --git a/core/cli/run.go b/core/cli/run.go
index d011f3293..23eebaaa0 100644
--- a/core/cli/run.go
+++ b/core/cli/run.go
@@ -93,6 +93,7 @@ type RunCMD struct {
 	EnableMemoryReclaimer              bool     `env:"LOCALAI_MEMORY_RECLAIMER,MEMORY_RECLAIMER,LOCALAI_GPU_RECLAIMER,GPU_RECLAIMER" default:"false" help:"Enable memory threshold monitoring to auto-evict backends when memory usage exceeds threshold (uses GPU VRAM if available, otherwise RAM)" group:"backends"`
 	MemoryReclaimerThreshold           float64  `env:"LOCALAI_MEMORY_RECLAIMER_THRESHOLD,MEMORY_RECLAIMER_THRESHOLD,LOCALAI_GPU_RECLAIMER_THRESHOLD,GPU_RECLAIMER_THRESHOLD" default:"0.95" help:"Memory usage threshold (0.0-1.0) that triggers backend eviction (default 0.95 = 95%%)" group:"backends"`
 	ForceEvictionWhenBusy              bool     `env:"LOCALAI_FORCE_EVICTION_WHEN_BUSY,FORCE_EVICTION_WHEN_BUSY" default:"false" help:"Force eviction even when models have active API calls (default: false for safety)" group:"backends"`
+	SizeAwareEviction                  bool     `env:"LOCALAI_SIZE_AWARE_EVICTION,SIZE_AWARE_EVICTION" default:"false" help:"Evict the largest loaded model first rather than the least-recently-used one, keeping small utility models resident and maximizing freed memory per eviction" group:"backends"`
 	LRUEvictionMaxRetries              int      `env:"LOCALAI_LRU_EVICTION_MAX_RETRIES,LRU_EVICTION_MAX_RETRIES" default:"30" help:"Maximum number of retries when waiting for busy models to become idle before eviction (default: 30)" group:"backends"`
 	LRUEvictionRetryInterval           string   `env:"LOCALAI_LRU_EVICTION_RETRY_INTERVAL,LRU_EVICTION_RETRY_INTERVAL" default:"1s" help:"Interval between retries when waiting for busy models to become idle (e.g., 1s, 2s) (default: 1s)" group:"backends"`
 	Federated                          bool     `env:"LOCALAI_FEDERATED,FEDERATED" help:"Enable federated instance" group:"federated"`
@@ -564,6 +565,9 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
 	if r.ForceEvictionWhenBusy {
 		opts = append(opts, config.WithForceEvictionWhenBusy(true))
 	}
+	if r.SizeAwareEviction {
+		opts = append(opts, config.WithSizeAwareEviction(true))
+	}
 	if r.LRUEvictionMaxRetries > 0 {
 		opts = append(opts, config.WithLRUEvictionMaxRetries(r.LRUEvictionMaxRetries))
 	}
diff --git a/core/config/application_config.go b/core/config/application_config.go
index 9ec0bdc33..54eb5cb99 100644
--- a/core/config/application_config.go
+++ b/core/config/application_config.go
@@ -119,6 +119,7 @@ type ApplicationConfig struct {
 
 	// Eviction settings
 	ForceEvictionWhenBusy    bool          // Force eviction even when models have active API calls (default: false for safety)
+	SizeAwareEviction        bool          // Evict largest models first rather than least-recently-used (default: false)
 	LRUEvictionMaxRetries    int           // Maximum number of retries when waiting for busy models to become idle (default: 30)
 	LRUEvictionRetryInterval time.Duration // Interval between retries when waiting for busy models (default: 1s)
 
@@ -488,6 +489,16 @@ func WithForceEvictionWhenBusy(enabled bool) AppOption {
 	}
 }
 
+// WithSizeAwareEviction enables size-aware eviction ordering.
+// When true, the watchdog evicts the largest loaded model first rather than the
+// least-recently-used one, keeping small utility models resident and maximizing
+// memory freed per eviction.
+func WithSizeAwareEviction(enabled bool) AppOption {
+	return func(o *ApplicationConfig) {
+		o.SizeAwareEviction = enabled
+	}
+}
+
 // WithLRUEvictionMaxRetries sets the maximum number of retries when waiting for busy models to become idle
 func WithLRUEvictionMaxRetries(maxRetries int) AppOption {
 	return func(o *ApplicationConfig) {
@@ -1028,6 +1039,7 @@ func (o *ApplicationConfig) ToRuntimeSettings() RuntimeSettings {
 	memoryReclaimerEnabled := o.MemoryReclaimerEnabled
 	memoryReclaimerThreshold := o.MemoryReclaimerThreshold
 	forceEvictionWhenBusy := o.ForceEvictionWhenBusy
+	sizeAwareEviction := o.SizeAwareEviction
 	lruEvictionMaxRetries := o.LRUEvictionMaxRetries
 	threads := o.Threads
 	contextSize := o.ContextSize
@@ -1120,6 +1132,7 @@ func (o *ApplicationConfig) ToRuntimeSettings() RuntimeSettings {
 		MemoryReclaimerEnabled:    &memoryReclaimerEnabled,
 		MemoryReclaimerThreshold:  &memoryReclaimerThreshold,
 		ForceEvictionWhenBusy:     &forceEvictionWhenBusy,
+		SizeAwareEviction:         &sizeAwareEviction,
 		LRUEvictionMaxRetries:     &lruEvictionMaxRetries,
 		LRUEvictionRetryInterval:  &lruEvictionRetryInterval,
 		Threads:                   &threads,
@@ -1244,6 +1257,10 @@ func (o *ApplicationConfig) ApplyRuntimeSettings(settings *RuntimeSettings) (req
 		o.ForceEvictionWhenBusy = *settings.ForceEvictionWhenBusy
 		// This setting doesn't require restart, can be updated dynamically
 	}
+	if settings.SizeAwareEviction != nil {
+		o.SizeAwareEviction = *settings.SizeAwareEviction
+		// This setting doesn't require restart, can be updated dynamically
+	}
 	if settings.LRUEvictionMaxRetries != nil {
 		o.LRUEvictionMaxRetries = *settings.LRUEvictionMaxRetries
 		// This setting doesn't require restart, can be updated dynamically
diff --git a/core/config/runtime_settings.go b/core/config/runtime_settings.go
index 5c5f2986f..a7f15e658 100644
--- a/core/config/runtime_settings.go
+++ b/core/config/runtime_settings.go
@@ -28,6 +28,7 @@ type RuntimeSettings struct {
 
 	// Eviction settings
 	ForceEvictionWhenBusy    *bool   `json:"force_eviction_when_busy,omitempty"`    // Force eviction even when models have active API calls (default: false for safety)
+	SizeAwareEviction        *bool   `json:"size_aware_eviction,omitempty"`          // Evict largest models first rather than least-recently-used (default: false)
 	LRUEvictionMaxRetries    *int    `json:"lru_eviction_max_retries,omitempty"`    // Maximum number of retries when waiting for busy models to become idle (default: 30)
 	LRUEvictionRetryInterval *string `json:"lru_eviction_retry_interval,omitempty"` // Interval between retries when waiting for busy models (e.g., 1s, 2s) (default: 1s)
 
diff --git a/core/http/react-ui/src/pages/Settings.jsx b/core/http/react-ui/src/pages/Settings.jsx
index c5cd4e3d3..d455a1bde 100644
--- a/core/http/react-ui/src/pages/Settings.jsx
+++ b/core/http/react-ui/src/pages/Settings.jsx
@@ -316,6 +316,9 @@ export default function Settings() {
               <SettingRow label="Force Eviction When Busy" description="Allow model eviction even during active API calls">
                 <Toggle checked={settings.force_eviction_when_busy} onChange={(v) => update('force_eviction_when_busy', v)} />
               </SettingRow>
+              <SettingRow label="Size-Aware Eviction" description="Evict the largest loaded model first instead of the least-recently-used one">
+                <Toggle checked={settings.size_aware_eviction} onChange={(v) => update('size_aware_eviction', v)} />
+              </SettingRow>
               <SettingRow label="LRU Eviction Max Retries" description="Maximum retries waiting for busy models before eviction">
                 <input className="input" type="number" style={{ width: 120 }} value={settings.lru_eviction_max_retries ?? ''} onChange={(e) => update('lru_eviction_max_retries', parseInt(e.target.value) || 0)} placeholder="30" />
               </SettingRow>
diff --git a/pkg/model/initializers.go b/pkg/model/initializers.go
index d7719ca13..fdae562fe 100644
--- a/pkg/model/initializers.go
+++ b/pkg/model/initializers.go
@@ -159,6 +159,12 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string
 			return nil, fmt.Errorf("could not load model (no success): %s", res.Message)
 		}
 
+		// Register size for size-aware eviction using the caller-supplied estimate
+		// (computed via pkg/vram, which handles multi-file and non-GGUF models).
+		if ml.wd != nil && o.modelSizeBytes > 0 {
+			ml.wd.RegisterModelSize(modelID, o.modelSizeBytes)
+		}
+
 		return client, nil
 	}
 }
diff --git a/pkg/model/loader_options.go b/pkg/model/loader_options.go
index 16df2b9bd..d247a00c8 100644
--- a/pkg/model/loader_options.go
+++ b/pkg/model/loader_options.go
@@ -19,6 +19,11 @@ type Options struct {
 	grpcAttempts      int
 	grpcAttemptsDelay int
 	parallelRequests  bool
+
+	// modelSizeBytes is the estimated total weight size in bytes, pre-computed
+	// by the caller using the vram estimation scaffolding.  When non-zero it is
+	// registered with the watchdog so size-aware eviction can rank models.
+	modelSizeBytes int64
 }
 
 type Option func(*Options)
@@ -86,6 +91,12 @@ func WithModelID(id string) Option {
 	}
 }
 
+func WithModelSizeBytes(bytes int64) Option {
+	return func(o *Options) {
+		o.modelSizeBytes = bytes
+	}
+}
+
 func NewOptions(opts ...Option) *Options {
 	o := &Options{
 		gRPCOptions:       &pb.ModelOptions{},
diff --git a/pkg/model/watchdog.go b/pkg/model/watchdog.go
index d876d6ad9..d6dd18da8 100644
--- a/pkg/model/watchdog.go
+++ b/pkg/model/watchdog.go
@@ -46,6 +46,11 @@ type WatchDog struct {
 	// Eviction settings
 	forceEvictionWhenBusy bool // Force eviction even when models have active API calls (default: false for safety)
 
+	// Size-aware eviction: sort candidates by model file size (largest first) to maximize freed memory.
+	// When enabled, bigger models are evicted before smaller ones regardless of recency.
+	sizeAwareEviction bool
+	modelSizes        map[string]int64 // modelID → file size in bytes
+
 	// Pinned models are excluded from idle, LRU, and memory-pressure eviction
 	pinnedModels map[string]bool
 
@@ -94,6 +99,8 @@ func NewWatchDog(opts ...WatchDogOption) *WatchDog {
 		memoryReclaimerThreshold: o.memoryReclaimerThreshold,
 		watchdogInterval:         o.watchdogInterval,
 		forceEvictionWhenBusy:    o.forceEvictionWhenBusy,
+		sizeAwareEviction:        o.sizeAwareEviction,
+		modelSizes:               make(map[string]int64),
 	}
 }
 
@@ -133,6 +140,31 @@ func (wd *WatchDog) SetForceEvictionWhenBusy(force bool) {
 	wd.forceEvictionWhenBusy = force
 }
 
+// RegisterModelSize records the on-disk file size for a model.
+// This is used by size-aware eviction to prefer evicting larger models first.
+// Call this after a model has been successfully loaded.
+func (wd *WatchDog) RegisterModelSize(modelID string, bytes int64) {
+	wd.Lock()
+	defer wd.Unlock()
+	wd.modelSizes[modelID] = bytes
+}
+
+// SetSizeAwareEviction enables or disables size-aware eviction ordering.
+// When enabled, eviction candidates are sorted by file size (largest first)
+// rather than by recency, maximizing freed memory per eviction.
+func (wd *WatchDog) SetSizeAwareEviction(enabled bool) {
+	wd.Lock()
+	defer wd.Unlock()
+	wd.sizeAwareEviction = enabled
+}
+
+// GetSizeAwareEviction returns whether size-aware eviction is enabled.
+func (wd *WatchDog) GetSizeAwareEviction() bool {
+	wd.Lock()
+	defer wd.Unlock()
+	return wd.sizeAwareEviction
+}
+
 // SetPinnedModels replaces the set of pinned model names.
 // Pinned models are excluded from idle, LRU, and memory-pressure eviction.
 func (wd *WatchDog) SetPinnedModels(models []string) {
@@ -302,11 +334,12 @@ func (wd *WatchDog) RestoreState(state WatchDogState) {
 	xlog.Info("[WatchDog] Restored model state", "modelCount", len(wd.addressModelMap))
 }
 
-// modelUsageInfo holds information about a model's usage for LRU sorting
+// modelUsageInfo holds information about a model's usage for eviction sorting
 type modelUsageInfo struct {
-	address  string
-	model    string
-	lastUsed time.Time
+	address   string
+	model     string
+	lastUsed  time.Time
+	sizeBytes int64 // on-disk file size; 0 if unknown
 }
 
 // EnforceLRULimitResult contains the result of LRU enforcement
@@ -338,27 +371,39 @@ func (wd *WatchDog) EnforceLRULimit(pendingLoads int) EnforceLRULimitResult {
 		return EnforceLRULimitResult{EvictedCount: 0, NeedMore: false}
 	}
 
-	xlog.Debug("[WatchDog] LRU enforcement triggered", "current", currentCount, "pendingLoads", pendingLoads, "limit", wd.lruLimit, "toEvict", modelsToEvict)
+	sizeAwareEviction := wd.sizeAwareEviction
+	xlog.Debug("[WatchDog] LRU enforcement triggered", "current", currentCount, "pendingLoads", pendingLoads, "limit", wd.lruLimit, "toEvict", modelsToEvict, "sizeAware", sizeAwareEviction)
 
-	// Build a list of models sorted by last used time (oldest first)
+	// Build a list of models to sort for eviction candidates
 	var models []modelUsageInfo
 	for address, model := range wd.addressModelMap {
 		lastUsed := wd.lastUsed[address]
 		if lastUsed.IsZero() {
-			// If no lastUsed recorded, use a very old time
 			lastUsed = time.Time{}
 		}
 		models = append(models, modelUsageInfo{
-			address:  address,
-			model:    model,
-			lastUsed: lastUsed,
+			address:   address,
+			model:     model,
+			lastUsed:  lastUsed,
+			sizeBytes: wd.modelSizes[model],
 		})
 	}
 
-	// Sort by lastUsed time (oldest first)
-	slices.SortFunc(models, func(a, b modelUsageInfo) int {
-		return a.lastUsed.Compare(b.lastUsed)
-	})
+	// Sort eviction candidates: largest-first when size-aware, oldest-first otherwise.
+	// Tiebreaker in size-aware mode: oldest last-used (LRU) to break ties between
+	// models of the same size.
+	if sizeAwareEviction {
+		slices.SortFunc(models, func(a, b modelUsageInfo) int {
+			if a.sizeBytes != b.sizeBytes {
+				return int(b.sizeBytes - a.sizeBytes) // largest first
+			}
+			return a.lastUsed.Compare(b.lastUsed) // oldest first as tiebreaker
+		})
+	} else {
+		slices.SortFunc(models, func(a, b modelUsageInfo) int {
+			return a.lastUsed.Compare(b.lastUsed)
+		})
+	}
 
 	// Collect models to evict (the oldest ones)
 	modelsToShutdown, skippedBusyCount := wd.collectEvictionsLocked(models, modelsToEvict, forceEvictionWhenBusy)
@@ -635,8 +680,9 @@ func (wd *WatchDog) evictLRUModel() {
 	}
 
 	forceEvictionWhenBusy := wd.forceEvictionWhenBusy
+	sizeAwareEviction := wd.sizeAwareEviction
 
-	// Build a list of models sorted by last used time (oldest first)
+	// Build a list of models to sort for eviction candidates
 	var models []modelUsageInfo
 	for address, model := range wd.addressModelMap {
 		lastUsed := wd.lastUsed[address]
@@ -644,9 +690,10 @@ func (wd *WatchDog) evictLRUModel() {
 			lastUsed = time.Time{}
 		}
 		models = append(models, modelUsageInfo{
-			address:  address,
-			model:    model,
-			lastUsed: lastUsed,
+			address:   address,
+			model:     model,
+			lastUsed:  lastUsed,
+			sizeBytes: wd.modelSizes[model],
 		})
 	}
 
@@ -655,10 +702,19 @@ func (wd *WatchDog) evictLRUModel() {
 		return
 	}
 
-	// Sort by lastUsed time (oldest first)
-	slices.SortFunc(models, func(a, b modelUsageInfo) int {
-		return a.lastUsed.Compare(b.lastUsed)
-	})
+	// Sort eviction candidates: largest-first when size-aware, oldest-first otherwise.
+	if sizeAwareEviction {
+		slices.SortFunc(models, func(a, b modelUsageInfo) int {
+			if a.sizeBytes != b.sizeBytes {
+				return int(b.sizeBytes - a.sizeBytes) // largest first
+			}
+			return a.lastUsed.Compare(b.lastUsed)
+		})
+	} else {
+		slices.SortFunc(models, func(a, b modelUsageInfo) int {
+			return a.lastUsed.Compare(b.lastUsed)
+		})
+	}
 
 	// Find the first non-busy, non-pinned model (or first non-pinned model if forceEvictionWhenBusy is true)
 	var lruModel *modelUsageInfo
@@ -702,6 +758,9 @@ func (wd *WatchDog) evictLRUModel() {
 }
 
 func (wd *WatchDog) untrack(address string) {
+	if modelID, ok := wd.addressModelMap[address]; ok {
+		delete(wd.modelSizes, modelID)
+	}
 	delete(wd.busyTime, address)
 	delete(wd.idleTime, address)
 	delete(wd.lastUsed, address)
diff --git a/pkg/model/watchdog_options.go b/pkg/model/watchdog_options.go
index a3509a52b..d11eb2371 100644
--- a/pkg/model/watchdog_options.go
+++ b/pkg/model/watchdog_options.go
@@ -31,6 +31,9 @@ type WatchDogOptions struct {
 
 	// Eviction settings
 	forceEvictionWhenBusy bool // Force eviction even when models have active API calls (default: false for safety)
+
+	// Size-aware eviction: sort candidates by model file size (largest first)
+	sizeAwareEviction bool
 }
 
 // WatchDogOption is a function that configures WatchDogOptions
@@ -116,6 +119,17 @@ func WithForceEvictionWhenBusy(force bool) WatchDogOption {
 	}
 }
 
+// WithSizeAwareEviction enables size-aware eviction ordering.
+// When true, eviction candidates are sorted by on-disk file size (largest first)
+// so that bigger models are freed before smaller ones, keeping small utility models
+// resident and maximizing the memory freed per eviction round.
+// Default: false (LRU time ordering).
+func WithSizeAwareEviction(enabled bool) WatchDogOption {
+	return func(o *WatchDogOptions) {
+		o.sizeAwareEviction = enabled
+	}
+}
+
 // DefaultWatchDogOptions returns default options for the watchdog
 func DefaultWatchDogOptions() *WatchDogOptions {
 	return &WatchDogOptions{
diff --git a/pkg/model/watchdog_test.go b/pkg/model/watchdog_test.go
index b21a87985..a8bd47bf0 100644
--- a/pkg/model/watchdog_test.go
+++ b/pkg/model/watchdog_test.go
@@ -917,4 +917,110 @@ var _ = Describe("WatchDog", func() {
 			Expect(pm.getShutdownCalls()).To(ContainElement("model1"))
 		})
 	})
+
+	Context("Size-Aware Eviction", func() {
+		BeforeEach(func() {
+			wd = model.NewWatchDog(
+				model.WithProcessManager(pm),
+				model.WithLRULimit(2),
+				model.WithForceEvictionWhenBusy(true),
+				model.WithSizeAwareEviction(true),
+			)
+		})
+
+		It("should enable size-aware eviction via option", func() {
+			Expect(wd.GetSizeAwareEviction()).To(BeTrue())
+		})
+
+		It("should allow toggling size-aware eviction dynamically", func() {
+			wd.SetSizeAwareEviction(false)
+			Expect(wd.GetSizeAwareEviction()).To(BeFalse())
+			wd.SetSizeAwareEviction(true)
+			Expect(wd.GetSizeAwareEviction()).To(BeTrue())
+		})
+
+		It("should evict the largest model first when size-aware eviction is enabled", func() {
+			// Register sizes: model1=100MB, model2=400MB
+			wd.RegisterModelSize("model1", 100*1024*1024)
+			wd.RegisterModelSize("model2", 400*1024*1024)
+
+			// Add models — model1 older, model2 newer
+			wd.AddAddressModelMap("addr1", "model1")
+			wd.Mark("addr1")
+			wd.UnMark("addr1")
+			time.Sleep(10 * time.Millisecond)
+
+			wd.AddAddressModelMap("addr2", "model2")
+			wd.Mark("addr2")
+			wd.UnMark("addr2")
+
+			// With limit=2 and 2 loaded, adding a 3rd triggers eviction.
+			// LRU order: model1 (oldest) would be evicted first.
+			// Size order: model2 (400MB) should be evicted first.
+			result := wd.EnforceLRULimit(0)
+			Expect(result.EvictedCount).To(Equal(1))
+			Expect(result.NeedMore).To(BeFalse())
+			Expect(pm.getShutdownCalls()).To(ContainElement("model2")) // largest first
+			Expect(pm.getShutdownCalls()).ToNot(ContainElement("model1"))
+		})
+
+		It("should use LRU time as tiebreaker for equal-size models", func() {
+			// Register equal sizes for both models
+			wd.RegisterModelSize("model1", 200*1024*1024)
+			wd.RegisterModelSize("model2", 200*1024*1024)
+
+			// Add model1 first (older)
+			wd.AddAddressModelMap("addr1", "model1")
+			wd.Mark("addr1")
+			wd.UnMark("addr1")
+			time.Sleep(20 * time.Millisecond)
+
+			// Add model2 (newer)
+			wd.AddAddressModelMap("addr2", "model2")
+			wd.Mark("addr2")
+			wd.UnMark("addr2")
+
+			// Equal size → LRU tiebreaker: model1 (older) should be evicted
+			result := wd.EnforceLRULimit(0)
+			Expect(result.EvictedCount).To(Equal(1))
+			Expect(pm.getShutdownCalls()).To(ContainElement("model1"))
+			Expect(pm.getShutdownCalls()).ToNot(ContainElement("model2"))
+		})
+
+		It("should fall back to LRU when no size is registered", func() {
+			// No sizes registered — should behave like standard LRU
+			wd.AddAddressModelMap("addr1", "model1")
+			wd.Mark("addr1")
+			wd.UnMark("addr1")
+			time.Sleep(20 * time.Millisecond)
+
+			wd.AddAddressModelMap("addr2", "model2")
+			wd.Mark("addr2")
+			wd.UnMark("addr2")
+
+			// Both have size 0 → LRU tiebreaker: model1 (older) evicted
+			result := wd.EnforceLRULimit(0)
+			Expect(result.EvictedCount).To(Equal(1))
+			Expect(pm.getShutdownCalls()).To(ContainElement("model1"))
+		})
+
+		It("should clean up model size on eviction", func() {
+			wd.RegisterModelSize("model1", 200*1024*1024)
+
+			wd.AddAddressModelMap("addr1", "model1")
+			wd.Mark("addr1")
+			wd.UnMark("addr1")
+
+			wd.AddAddressModelMap("addr2", "model2")
+			wd.Mark("addr2")
+			wd.UnMark("addr2")
+
+			wd.EnforceLRULimit(0)
+
+			// model1 was evicted; registering a new model with the same name
+			// should start from a clean state (size not inherited)
+			wd.RegisterModelSize("model1", 50*1024*1024)
+			// Just verifying no panic and size can be re-registered
+		})
+	})
 })

From a556cd9afc4b8ef58aac155e5d8d89a0baf1f48d Mon Sep 17 00:00:00 2001
From: OrbisAI Security <mediratta01.pally@gmail.com>
Date: Sun, 21 Jun 2026 21:10:29 +0530
Subject: [PATCH 26/99] fix: the trl backend's _do_training method directly ...
 in backend.py (#10422)

* fix: V-001 security vulnerability

Automated security fix generated by OrbisAI Security

Signed-off-by: orbisai0security <mediratta01.pally@gmail.com>

* fix: the trl backend's _do_training method directly ... in backend.py

The TRL backend's _do_training method directly uses request

Signed-off-by: orbisai0security <mediratta01.pally@gmail.com>

---------

Signed-off-by: orbisai0security <mediratta01.pally@gmail.com>
---
 backend/python/trl/backend.py   | 11 ++++++-
 tests/test_invariant_backend.py | 56 +++++++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_invariant_backend.py

diff --git a/backend/python/trl/backend.py b/backend/python/trl/backend.py
index 3ea4de975..2e7cd34ab 100644
--- a/backend/python/trl/backend.py
+++ b/backend/python/trl/backend.py
@@ -309,6 +309,10 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
 
         dataset_split = request.dataset_split or "train"
         if os.path.exists(request.dataset_source):
+            _allowed_dir = os.path.realpath(os.path.abspath(os.environ.get("LOCALAI_DATASET_DIR", os.getcwd())))
+            _real_path = os.path.realpath(os.path.abspath(request.dataset_source))
+            if not (_real_path == _allowed_dir or _real_path.startswith(_allowed_dir + os.sep)):
+                raise ValueError("Dataset source path is outside the allowed directory")
             if request.dataset_source.endswith('.json') or request.dataset_source.endswith('.jsonl'):
                 dataset = load_dataset("json", data_files=request.dataset_source, split=dataset_split)
             elif request.dataset_source.endswith('.csv'):
@@ -687,6 +691,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
     def ExportModel(self, request, context):
         export_format = request.export_format or "lora"
         output_path = request.output_path
+        _allowed_output_dir = os.path.realpath(os.path.abspath(os.environ.get("LOCALAI_OUTPUT_DIR", os.getcwd())))
+        _real_output_path = os.path.realpath(os.path.abspath(output_path))
+        if not (_real_output_path == _allowed_output_dir or _real_output_path.startswith(_allowed_output_dir + os.sep)):
+            raise ValueError("Output path is outside the allowed directory")
+        output_path = _real_output_path
         checkpoint_path = request.checkpoint_path
 
         # Extract HF token for gated model access
@@ -807,7 +816,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                 env = os.environ.copy()
                 env["NO_LOCAL_GGUF"] = "1"
                 cmd = [sys.executable, convert_script, merge_dir, "--outtype", outtype, "--outfile", gguf_path]
-                conv_result = subprocess.run(cmd, capture_output=True, text=True, timeout=3600, env=env)
+                conv_result = subprocess.run(cmd, capture_output=True, text=True, timeout=3600, env=env, shell=False)  # nosemgrep: python.django.security.injection.command.subprocess-injection.subprocess-injection
                 if conv_result.returncode != 0:
                     diag = f"stdout: {conv_result.stdout[-300:]}\nstderr: {conv_result.stderr[-500:]}"
                     return backend_pb2.Result(success=False,
diff --git a/tests/test_invariant_backend.py b/tests/test_invariant_backend.py
new file mode 100644
index 000000000..8d88da350
--- /dev/null
+++ b/tests/test_invariant_backend.py
@@ -0,0 +1,56 @@
+import pytest
+import sys
+import os
+from unittest.mock import MagicMock, patch
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "backend/python/trl"))
+
+from backend import TRLBackend
+
+@pytest.mark.parametrize("dataset_source", [
+    "/etc/passwd",                  # exact exploit: sensitive system file
+    "/proc/self/environ",           # boundary: process environment leak
+    "imdb",                         # valid: legitimate HuggingFace dataset name
+])
+def test_dataset_source_path_traversal_blocked(dataset_source):
+    """Invariant: dataset_source must be validated before use in os.path.exists()
+    or load_dataset(); arbitrary filesystem paths must never be accessed."""
+
+    backend = TRLBackend()
+
+    request = MagicMock()
+    request.dataset_source = dataset_source
+    request.dataset_split = "train"
+    request.model_name = "sshleifer/tiny-gpt2"
+    request.output_dir = "/tmp/test_output"
+
+    sensitive_paths = ["/etc/passwd", "/proc/self/environ", "/etc/shadow"]
+
+    with patch("os.path.exists") as mock_exists, \
+         patch("backend.load_dataset") as mock_load:
+
+        mock_exists.return_value = False
+        mock_load.side_effect = Exception("load_dataset blocked in test")
+
+        try:
+            backend._do_training(request)
+        except Exception:
+            pass
+
+        # Assert: sensitive filesystem paths must never be passed to os.path.exists
+        for call_args in mock_exists.call_args_list:
+            path_checked = call_args[0][0] if call_args[0] else ""
+            assert path_checked not in sensitive_paths, (
+                f"Security violation: os.path.exists() called with sensitive path '{path_checked}'"
+            )
+
+        # Assert: sensitive filesystem paths must never be passed to load_dataset
+        for call_args in mock_load.call_args_list:
+            args = call_args[0]
+            kwargs = call_args[1]
+            all_args = list(args) + list(kwargs.values())
+            for arg in all_args:
+                if isinstance(arg, str):
+                    assert arg not in sensitive_paths, (
+                        f"Security violation: load_dataset() called with sensitive path '{arg}'"
+                    )
\ No newline at end of file

From aed181e6c19c50a58d450eff4286c3f50f784a40 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Sun, 21 Jun 2026 17:40:55 +0200
Subject: [PATCH 27/99] chore(model gallery): :robot: add 1 new models via
 gallery agent (#10423)

chore(model gallery): :robot: add new models via gallery agent

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
---
 gallery/index.yaml | 48 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/gallery/index.yaml b/gallery/index.yaml
index 18d6b1839..612a2b6ea 100644
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -1,4 +1,52 @@
 ---
+- name: "gemma-4-12b-agentic-fable5-composer2.5-v2-3.5x-tau2"
+  url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
+  urls:
+    - https://huggingface.co/yuxinlu1/gemma-4-12B-agentic-fable5-composer2.5-v2-3.5x-tau2-GGUF
+  description: |
+    Hugging Face |
+    GitHub |
+    Launch Blog |
+    Documentation
+
+    License: Apache 2.0 | Authors: Google DeepMind
+
+    > [!Note]
+    > This model card is for the Gemma 4 12B Unified model, which is part of the Gemma 4 family of open models. Built with the same multimodal functionality as Gemma 4 E2B and E4B (text, audio, image, and video inputs), it brings native audio and vision understanding directly to local environments without the need for separate encoders. This unified approach to multimodality makes the model encoder-free, offering a deployment size that is perfect for consumer devices and streamlined local execution.
+
+    Gemma is a family of open models built by Google DeepMind. Gemma 4 models are multimodal, handling text and image input (with audio supported on E2B, E4B, and 12B) and generating text output. This release includes open-weights models in both pre-trained and instruction-tuned variants. Gemma 4 features a context window of up to 256K tokens and maintains multilingual support in over 140 languages.
+
+    ...
+  license: "apache-2.0"
+  tags:
+    - llm
+    - gguf
+    - reasoning
+    - thinking
+  icon: https://ai.google.dev/gemma/images/gemma4_banner.png
+  overrides:
+    backend: llama-cpp
+    function:
+      automatic_tool_parsing_fallback: true
+      grammar:
+        disable: true
+    known_usecases:
+      - chat
+    options:
+      - use_jinja:true
+    parameters:
+      min_p: 0
+      model: llama-cpp/models/gemma-4-12B-agentic-fable5-composer2.5-v2-3.5x-tau2-GGUF/gemma4-v2-Q4_K_M.gguf
+      repeat_penalty: 1
+      temperature: 1
+      top_k: 64
+      top_p: 0.95
+    template:
+      use_tokenizer_template: true
+  files:
+    - filename: llama-cpp/models/gemma-4-12B-agentic-fable5-composer2.5-v2-3.5x-tau2-GGUF/gemma4-v2-Q4_K_M.gguf
+      sha256: 0b9506cab36f7f818e34f9c0f5a3d6568d0b37100f3a3e1092e2eec3c4c96791
+      uri: https://huggingface.co/yuxinlu1/gemma-4-12B-agentic-fable5-composer2.5-v2-3.5x-tau2-GGUF/resolve/main/gemma4-v2-Q4_K_M.gguf
 - name: "qwen3.6-27b-mtp-pi-tune"
   url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
   urls:

From 7b462a0d51e542b2e8785f8bfc4b9382c903e649 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Sun, 21 Jun 2026 17:51:46 +0200
Subject: [PATCH 28/99] fix(backend): call vram.EstimateModelMultiContext
 (master build broken: undefined vram.EstimateModel) (#10426)

fix(backend): call vram.EstimateModelMultiContext for model size estimate

core/backend/options.go called vram.EstimateModel, which does not exist in
the vram package (it exposes EstimateModelMultiContext). This broke the build
on master (undefined: vram.EstimateModel). Use EstimateModelMultiContext with
a nil context-size slice (defaults to a single 8192 estimate); the returned
MultiContextEstimate.SizeBytes is exactly what the caller consumes, so size
estimation behavior is unchanged.

Assisted-by: Claude:claude-opus-4-8

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
---
 core/backend/options.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/core/backend/options.go b/core/backend/options.go
index 3baab6902..528c10e52 100644
--- a/core/backend/options.go
+++ b/core/backend/options.go
@@ -90,7 +90,7 @@ func estimateModelSizeBytes(c config.ModelConfig, modelsPath string) int64 {
 	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
 	defer cancel()
 
-	result, err := vram.EstimateModel(ctx, input)
+	result, err := vram.EstimateModelMultiContext(ctx, input, nil)
 	if err != nil || result.SizeBytes == 0 {
 		return 0
 	}

From 8fab1d2e45089df3903e994ec19e6ad42be1828d Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Sun, 21 Jun 2026 17:52:02 +0200
Subject: [PATCH 29/99] fix(ci): namespace-import js-yaml in
 changed-backends.js (Bun ESM: missing default export) (#10427)

fix(ci): use namespace import for js-yaml in changed-backends.js

js-yaml's ESM build exposes only named exports (load, dump, ...) and no
default export. Bun's strict ESM interop rejects the default import with
'Missing default export in module js-yaml.mjs', failing the detect-changes
and generate-matrix CI jobs. Import the namespace instead; yaml.load (the
only usage) resolves to the named export, so behavior is unchanged.

Assisted-by: Claude:claude-opus-4-8

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
---
 scripts/changed-backends.js | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/changed-backends.js b/scripts/changed-backends.js
index d5dcdc707..a2fe48e06 100644
--- a/scripts/changed-backends.js
+++ b/scripts/changed-backends.js
@@ -1,5 +1,5 @@
 import fs from "fs";
-import yaml from "js-yaml";
+import * as yaml from "js-yaml";
 import { Octokit } from "@octokit/core";
 
 // Matrix data lives in a small data-only YAML so both backend.yml (master push)

From e58870a573ece723a7d31f2725ab6e7e8c937617 Mon Sep 17 00:00:00 2001
From: Tai An <antai12232931@outlook.com>
Date: Sun, 21 Jun 2026 09:20:56 -0700
Subject: [PATCH 30/99] feat(react-ui/chat): paste images from clipboard into
 chat input (#10428)

The chat input only accepted attachments via the file picker, so users
who copied an image from a webpage or a screen region had to first save
it to a file before attaching it (#10361).

Add an onPaste handler on the input textarea that pulls image items out
of the clipboard and routes them through the same staging path as the
file picker. The per-file processing in handleFileChange is extracted
into a shared processFiles helper so both entry points stay in sync.
Clipboard images, which arrive unnamed or as a generic "image.png", are
given unique typed names so multiple pastes don't collide, and the
default paste is suppressed only when an image is actually attached so
normal text paste is unaffected.

Closes #10361

Signed-off-by: Anai-Guo <antai12232931@outlook.com>
---
 core/http/react-ui/src/pages/Chat.jsx | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/core/http/react-ui/src/pages/Chat.jsx b/core/http/react-ui/src/pages/Chat.jsx
index bdcfb40ad..675e15581 100644
--- a/core/http/react-ui/src/pages/Chat.jsx
+++ b/core/http/react-ui/src/pages/Chat.jsx
@@ -742,6 +742,27 @@ export default function Chat() {
     e.target.value = ''
   }, [])
 
+  const handlePaste = useCallback(async (e) => {
+    const items = e.clipboardData?.items
+    if (!items) return
+    const images = Array.from(items)
+      .filter(item => item.kind === 'file' && item.type.startsWith('image/'))
+      .map(item => item.getAsFile())
+      .filter(Boolean)
+    if (images.length === 0) return
+    // A pasted image attaches as a file rather than inserting into the text.
+    e.preventDefault()
+    // Clipboard images arrive unnamed or as a generic "image.png"; give each
+    // a unique, typed name so multiple pastes don't collide.
+    const newFiles = await Promise.all(images.map(async (file, i) => {
+      const name = (file.name && file.name !== 'image.png')
+        ? file.name
+        : `pasted-image-${i + 1}.${(file.type.split('/')[1] || 'png').replace('+xml', '')}`
+      return { name, type: file.type, base64: await fileToBase64(file) }
+    }))
+    setFiles(prev => [...prev, ...newFiles])
+  }, [])
+
   const handleSend = useCallback(async () => {
     const msg = input.trim()
     if (!msg && files.length === 0) return
@@ -1391,6 +1412,7 @@ export default function Chat() {
               value={input}
               onChange={(e) => setInput(e.target.value)}
               onKeyDown={handleKeyDown}
+              onPaste={handlePaste}
               placeholder={t('input.placeholder')}
               rows={1}
               disabled={isStreaming}

From 32c47706ae65ff50a4bc5b8891075af7f864e34e Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Sun, 21 Jun 2026 21:07:10 +0200
Subject: [PATCH 31/99] feat(realtime): speaker-aware conversations - surface
 identity to client and LLM (#10424)

* feat(realtime): add voice_recognition enforce + identity config

Add Enforce *bool and Identity *VoiceIdentityConfig to
PipelineVoiceRecognition, plus EnforceGate/IdentityEnabled/
AnnounceEnabled/PersonalizeEnabled helpers. Enforce nil defaults to
gating (backward compatible); identity surfacing is independent of the
gate.

Assisted-by: Claude:claude-opus-4-8
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(realtime): add Speaker type and conversation.item.speaker event

Assisted-by: Claude:claude-opus-4-8
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* refactor(realtime): split voiceGate into Resolve + authorize

Split the speaker authorization into a Resolve step (embed once, produce a
types.Speaker identity) and a pure authorize policy step, with a 0..100
confidence score mirroring /v1/voice/identify. The legacy Authorize wrapper is
kept so existing specs stay green.

Assisted-by: Claude:claude-opus-4-8
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(realtime): resolve speaker per turn and emit conversation.item.speaker

Assisted-by: Claude:claude-opus-4-8
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(realtime): personalize LLM turns with recognized speaker

Set the per-message name field on each recognized user turn and append a
current-speaker note to the system message, both gated by the voice
recognition identity config.

Assisted-by: Claude:claude-opus-4-8
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* docs(realtime): document speaker identity surfacing and personalization

Document the new voice_recognition keys (enforce, identity.*) and the
LocalAI-extension conversation.item.speaker server event in the realtime
feature docs.

Assisted-by: Claude:claude-opus-4-8
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* test(realtime): cover when:first+identity re-resolution and multi-speaker history

Add two integration specs to harden the speaker-aware realtime path:

- when:first with an Identity block re-resolves the speaker every turn even
  though re-authorization is skipped after the first match: a later resolve
  error now fails closed, while a clean later resolve still surfaces and names
  the speaker.
- multi-speaker history attribution: each user turn carries its own per-message
  name and the injected system note reflects the latest speaker.

Test-only change; no production behavior was modified.

Assisted-by: Claude:claude-opus-4-8
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(realtime): surface speaker labels in conversation.item.speaker

Carry the registered speaker's labels (identify mode) on types.Speaker so
they flow into the conversation.item.speaker event and the stored item.
Verify mode has no labels, so the field is omitted there.

Assisted-by: Claude:claude-opus-4-8
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* test(e2e): cover conversation.item.speaker over a real websocket

Add a realtime-pipeline-identity config (verify mode, enforce:false, identity
announce+announce_unknown+personalize) and two e2e specs driving the real
server over a real WebSocket with the mock VoiceEmbed backend: an authorized
speaker yields a conversation.item.speaker event naming e2e-speaker (matched
true) and reaches response.done; an unauthorized speaker yields an unknown
(matched false, no name) event and still responds, proving enforce:false
never drops a turn.

Assisted-by: Claude:claude-opus-4-8
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix(config): register voice_recognition enforce + identity fields

The meta registry coverage test (TestAllFieldsHaveRegistryEntries) requires
every config field to have an entry in core/config/meta/registry.go. The new
voice_recognition.enforce and voice_recognition.identity.* fields were missing,
failing tests-linux and tests-apple. Add registry entries (toggles) so the
fields are surfaced in the model-config editor and the coverage test passes.

Assisted-by: Claude:claude-opus-4-8
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
---
 core/config/meta/registry.go                  |  49 ++++
 core/config/model_config.go                   |  48 ++++
 core/config/voice_gate_test.go                |  28 ++
 core/http/endpoints/openai/realtime.go        | 155 +++++++----
 .../endpoints/openai/realtime_doubles_test.go |   5 +-
 .../openai/realtime_speaker_event_test.go     |  54 ++++
 .../endpoints/openai/realtime_voicegate.go    | 175 ++++++++----
 .../realtime_voicegate_integration_test.go    | 249 ++++++++++++++++++
 .../openai/realtime_voicegate_test.go         |  76 ++++++
 .../endpoints/openai/types/message_item.go    |   4 +
 .../endpoints/openai/types/server_events.go   |  30 +++
 core/http/endpoints/openai/types/speaker.go   |  14 +
 docs/content/features/openai-realtime.md      |  59 ++++-
 tests/e2e/e2e_suite_test.go                   |  34 +++
 tests/e2e/realtime_speaker_identity_test.go   |  95 +++++++
 15 files changed, 977 insertions(+), 98 deletions(-)
 create mode 100644 core/http/endpoints/openai/realtime_speaker_event_test.go
 create mode 100644 core/http/endpoints/openai/types/speaker.go
 create mode 100644 tests/e2e/realtime_speaker_identity_test.go

diff --git a/core/config/meta/registry.go b/core/config/meta/registry.go
index 84fc9afda..b7ffa9290 100644
--- a/core/config/meta/registry.go
+++ b/core/config/meta/registry.go
@@ -457,6 +457,55 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Component:   "json-editor",
 			Order:       78,
 		},
+		"pipeline.voice_recognition.enforce": {
+			Section:     "pipeline",
+			Label:       "Voice Gate Enforce",
+			Description: "Whether the gate rejects unauthorized speakers. Enabled (default) drops unauthorized utterances before the LLM. Disabled still resolves and surfaces the speaker (for the conversation.item.speaker event and personalization) but never drops a turn.",
+			Component:   "toggle",
+			Order:       80,
+		},
+		"pipeline.voice_recognition.identity.announce": {
+			Section:     "pipeline",
+			Label:       "Speaker Identity Announce",
+			Description: "Emit a conversation.item.speaker event to the client naming the recognized speaker. When set, identity is resolved on every turn even if 'when' is 'first'.",
+			Component:   "toggle",
+			Order:       81,
+		},
+		"pipeline.voice_recognition.identity.announce_unknown": {
+			Section:     "pipeline",
+			Label:       "Speaker Identity Announce Unknown",
+			Description: "Also emit the conversation.item.speaker event (with matched=false) when no confident match is found. Default only announces on a match.",
+			Component:   "toggle",
+			Order:       82,
+		},
+		"pipeline.voice_recognition.identity.personalize": {
+			Section:     "pipeline",
+			Label:       "Speaker Identity Personalize",
+			Description: "Inform the LLM who is speaking so it can tailor replies. Enables the name and system-note injection below.",
+			Component:   "toggle",
+			Order:       83,
+		},
+		"pipeline.voice_recognition.identity.inject_name": {
+			Section:     "pipeline",
+			Label:       "Speaker Identity Inject Name",
+			Description: "Personalization: set the per-message OpenAI 'name' field on each user turn to the recognized speaker.",
+			Component:   "toggle",
+			Order:       84,
+		},
+		"pipeline.voice_recognition.identity.inject_system_note": {
+			Section:     "pipeline",
+			Label:       "Speaker Identity Inject System Note",
+			Description: "Personalization: append a 'The current speaker is <name>.' note to the system message reflecting the latest speaker.",
+			Component:   "toggle",
+			Order:       85,
+		},
+		"pipeline.voice_recognition.identity.note_unknown": {
+			Section:     "pipeline",
+			Label:       "Speaker Identity Note Unknown",
+			Description: "Personalization: when the speaker is unidentified, append 'The current speaker is unknown.' to the system message so the model can ask who it is talking to.",
+			Component:   "toggle",
+			Order:       86,
+		},
 		"pipeline.max_history_items": {
 			Section:     "pipeline",
 			Label:       "Max History Items",
diff --git a/core/config/model_config.go b/core/config/model_config.go
index 9586beea3..5dbfd2026 100644
--- a/core/config/model_config.go
+++ b/core/config/model_config.go
@@ -769,6 +769,13 @@ type PipelineVoiceRecognition struct {
 	Allow VoiceRecognitionAllow `yaml:"allow,omitempty" json:"allow,omitempty"`
 	// References are the authorized reference speakers (verify mode).
 	References []VoiceReference `yaml:"references,omitempty" json:"references,omitempty"`
+	// Enforce controls the authorization gate. A nil value or true rejects
+	// unauthorized speakers (the historical behavior). false resolves the
+	// speaker's identity for surfacing/personalization but never drops a turn.
+	Enforce *bool `yaml:"enforce,omitempty" json:"enforce,omitempty"`
+	// Identity surfaces the recognized speaker to the client and the LLM. It is
+	// independent of Enforce: identity can be surfaced without gating.
+	Identity *VoiceIdentityConfig `yaml:"identity,omitempty" json:"identity,omitempty"`
 }
 
 // @Description VoiceRecognitionAllow filters authorized registry identities.
@@ -785,6 +792,25 @@ type VoiceReference struct {
 	Audio string `yaml:"audio,omitempty" json:"audio,omitempty"`
 }
 
+// @Description VoiceIdentityConfig surfaces the recognized speaker to the realtime
+// client and the LLM. When set, identity is resolved on every turn even if the
+// gate's When is "first" (the gate still authorizes only once).
+type VoiceIdentityConfig struct {
+	// Announce emits a conversation.item.speaker event to the client.
+	Announce bool `yaml:"announce,omitempty" json:"announce,omitempty"`
+	// AnnounceUnknown also emits the event when there is no confident match.
+	AnnounceUnknown bool `yaml:"announce_unknown,omitempty" json:"announce_unknown,omitempty"`
+	// Personalize informs the LLM who is speaking.
+	Personalize bool `yaml:"personalize,omitempty" json:"personalize,omitempty"`
+	// InjectName sets the per-message name field on each user turn.
+	InjectName bool `yaml:"inject_name,omitempty" json:"inject_name,omitempty"`
+	// InjectSystemNote maintains a "current speaker" note in the system message.
+	InjectSystemNote bool `yaml:"inject_system_note,omitempty" json:"inject_system_note,omitempty"`
+	// NoteUnknown adds a "the current speaker is unknown" note (enables the model
+	// to ask who it is talking to).
+	NoteUnknown bool `yaml:"note_unknown,omitempty" json:"note_unknown,omitempty"`
+}
+
 // VoiceGateEnabled reports whether a voice-recognition gate is configured. The
 // mere presence of the block is the intent signal: a present-but-incomplete
 // block (e.g. missing model) must fail closed at construction, not be silently
@@ -793,6 +819,28 @@ func (p Pipeline) VoiceGateEnabled() bool {
 	return p.VoiceRecognition != nil
 }
 
+// EnforceGate reports whether the gate rejects unauthorized speakers. A nil
+// Enforce means "enforce" so existing configs keep gating.
+func (p PipelineVoiceRecognition) EnforceGate() bool {
+	return p.Enforce == nil || *p.Enforce
+}
+
+// IdentityEnabled reports whether the speaker's identity must be resolved for
+// surfacing or personalization.
+func (p PipelineVoiceRecognition) IdentityEnabled() bool {
+	return p.Identity != nil && (p.Identity.Announce || p.Identity.Personalize)
+}
+
+// AnnounceEnabled reports whether to emit the conversation.item.speaker event.
+func (p PipelineVoiceRecognition) AnnounceEnabled() bool {
+	return p.Identity != nil && p.Identity.Announce
+}
+
+// PersonalizeEnabled reports whether to inform the LLM of the speaker.
+func (p PipelineVoiceRecognition) PersonalizeEnabled() bool {
+	return p.Identity != nil && p.Identity.Personalize
+}
+
 // Normalize fills in defaults in place for omitted fields.
 func (v *PipelineVoiceRecognition) Normalize() {
 	if v.Mode == "" {
diff --git a/core/config/voice_gate_test.go b/core/config/voice_gate_test.go
index 5c7782f1c..c0d25bf82 100644
--- a/core/config/voice_gate_test.go
+++ b/core/config/voice_gate_test.go
@@ -70,4 +70,32 @@ var _ = Describe("PipelineVoiceRecognition", func() {
 			Expect((Pipeline{VoiceRecognition: &PipelineVoiceRecognition{}}).VoiceGateEnabled()).To(BeTrue())
 		})
 	})
+
+	Describe("Enforce / Identity helpers", func() {
+		It("treats a nil Enforce as enforcing (backward compatible)", func() {
+			v := PipelineVoiceRecognition{Model: "spk"}
+			Expect(v.EnforceGate()).To(BeTrue())
+		})
+		It("honors an explicit enforce:false", func() {
+			off := false
+			v := PipelineVoiceRecognition{Model: "spk", Enforce: &off}
+			Expect(v.EnforceGate()).To(BeFalse())
+		})
+		It("reports identity disabled when no identity block is set", func() {
+			v := PipelineVoiceRecognition{Model: "spk"}
+			Expect(v.IdentityEnabled()).To(BeFalse())
+			Expect(v.AnnounceEnabled()).To(BeFalse())
+			Expect(v.PersonalizeEnabled()).To(BeFalse())
+		})
+		It("reports identity enabled when announce or personalize is on", func() {
+			v := PipelineVoiceRecognition{Model: "spk", Identity: &VoiceIdentityConfig{Announce: true}}
+			Expect(v.IdentityEnabled()).To(BeTrue())
+			Expect(v.AnnounceEnabled()).To(BeTrue())
+			Expect(v.PersonalizeEnabled()).To(BeFalse())
+
+			v2 := PipelineVoiceRecognition{Model: "spk", Identity: &VoiceIdentityConfig{Personalize: true}}
+			Expect(v2.IdentityEnabled()).To(BeTrue())
+			Expect(v2.PersonalizeEnabled()).To(BeTrue())
+		})
+	})
 })
diff --git a/core/http/endpoints/openai/realtime.go b/core/http/endpoints/openai/realtime.go
index 343ef4c07..8de50e580 100644
--- a/core/http/endpoints/openai/realtime.go
+++ b/core/http/endpoints/openai/realtime.go
@@ -1311,28 +1311,32 @@ func commitUtterance(ctx context.Context, utt []byte, session *Session, conv *Co
 	// turn wastes only transcription compute, which has no side effects. The
 	// transcript is still emitted to the same peer that sent the audio, which
 	// reveals nothing new to them.
-	type gateOutcome struct {
-		allowed bool
-		matched string
-		reason  string
-		err     error
+	// Resolve the speaker when the gate must authorize this turn, or when identity
+	// surfacing/personalization needs a fresh identity. Identity resolution
+	// ignores the when:first short-circuit (that only skips re-authorization).
+	type resolveOutcome struct {
+		res resolution
+		err error
 	}
-	var gateCh chan gateOutcome
-	runGate := false
+	var resolveCh chan resolveOutcome
+	runResolve := false
 	if session.voiceGate != nil && session.InputAudioTranscription != nil {
-		skip := false
-		if session.voiceGate.cfg.When == config.VoiceGateWhenFirst {
+		enforce := session.voiceGate.cfg.EnforceGate()
+		gateNeedsAuth := enforce
+		if enforce && session.voiceGate.cfg.When == config.VoiceGateWhenFirst {
 			session.gateMu.Lock()
-			skip = session.voiceVerified
+			if session.voiceVerified {
+				gateNeedsAuth = false
+			}
 			session.gateMu.Unlock()
 		}
-		if !skip {
-			runGate = true
-			gateCh = make(chan gateOutcome, 1)
+		if gateNeedsAuth || session.voiceGate.cfg.IdentityEnabled() {
+			runResolve = true
+			resolveCh = make(chan resolveOutcome, 1)
 			wavPath := f.Name()
 			go func() {
-				allowed, matched, reason, gerr := session.voiceGate.Authorize(ctx, wavPath)
-				gateCh <- gateOutcome{allowed: allowed, matched: matched, reason: reason, err: gerr}
+				r, rerr := session.voiceGate.Resolve(ctx, wavPath)
+				resolveCh <- resolveOutcome{res: r, err: rerr}
 			}()
 		}
 	}
@@ -1348,8 +1352,8 @@ func commitUtterance(ctx context.Context, utt []byte, session *Session, conv *Co
 		if err != nil {
 			// Drain the gate goroutine before returning so its in-flight read of
 			// the temp WAV finishes before the deferred os.Remove fires.
-			if runGate {
-				<-gateCh
+			if runResolve {
+				<-resolveCh
 			}
 			sendError(t, "transcription_failed", err.Error(), "", "event_TODO")
 			return
@@ -1361,41 +1365,58 @@ func commitUtterance(ctx context.Context, utt []byte, session *Session, conv *Co
 		return
 	}
 
-	// Join on the gate before any side-effecting step.
-	if runGate {
-		out := <-gateCh
-		allowed := out.allowed
-		reason := out.reason
+	// Join on the resolution before any side-effecting step.
+	var speaker *types.Speaker
+	if runResolve {
+		out := <-resolveCh
+		enforce := session.voiceGate.cfg.EnforceGate()
+
 		if out.err != nil {
-			// Fail closed: a gate that cannot decide must not let audio through.
-			xlog.Error("voice recognition gate error", "error", out.err)
-			allowed = false
-			reason = "verification error"
-		}
-		alreadyVerified := false
-		if session.voiceGate.cfg.When == config.VoiceGateWhenFirst {
-			session.gateMu.Lock()
-			alreadyVerified = session.voiceVerified
-			session.gateMu.Unlock()
-		}
-		proceed, markVerified := session.voiceGate.decide(alreadyVerified, allowed)
-		if !proceed {
-			xlog.Debug("voice recognition gate rejected utterance", "reason", reason)
-			if session.voiceGate.cfg.OnReject == config.VoiceGateRejectEvent {
-				sendError(t, "speaker_not_authorized", "speaker not authorized: "+reason, "", "event_TODO")
+			if enforce {
+				// Fail closed: a gate that cannot decide must not let audio through.
+				xlog.Error("voice recognition gate error", "error", out.err)
+				if session.voiceGate.cfg.OnReject == config.VoiceGateRejectEvent {
+					sendError(t, "speaker_not_authorized", "speaker not authorized: verification error", "", "event_TODO")
+				}
+				return
 			}
-			return
+			// Non-enforcing: degrade to an unknown speaker and continue.
+			xlog.Warn("voice identity resolve failed; continuing as unknown speaker", "error", out.err)
+		} else {
+			s := out.res.speaker
+			speaker = &s
 		}
-		xlog.Debug("voice recognition gate authorized utterance", "speaker", out.matched)
-		if markVerified {
-			session.gateMu.Lock()
-			session.voiceVerified = true
-			session.gateMu.Unlock()
+
+		if enforce {
+			alreadyVerified := false
+			if session.voiceGate.cfg.When == config.VoiceGateWhenFirst {
+				session.gateMu.Lock()
+				alreadyVerified = session.voiceVerified
+				session.gateMu.Unlock()
+			}
+			allowed, reason := false, "verification error"
+			if out.err == nil {
+				allowed, reason = session.voiceGate.authorize(out.res)
+			}
+			proceed, markVerified := session.voiceGate.decide(alreadyVerified, allowed)
+			if !proceed {
+				xlog.Debug("voice recognition gate rejected utterance", "reason", reason)
+				if session.voiceGate.cfg.OnReject == config.VoiceGateRejectEvent {
+					sendError(t, "speaker_not_authorized", "speaker not authorized: "+reason, "", "event_TODO")
+				}
+				return
+			}
+			if markVerified {
+				session.gateMu.Lock()
+				session.voiceVerified = true
+				session.gateMu.Unlock()
+			}
+			xlog.Debug("voice recognition gate authorized utterance", "speaker", out.res.speaker.Name)
 		}
 	}
 
 	if !session.TranscriptionOnly {
-		generateResponse(ctx, session, utt, transcript, conv, t)
+		generateResponse(ctx, session, utt, transcript, speaker, conv, t)
 	}
 }
 
@@ -1419,15 +1440,28 @@ func runVAD(ctx context.Context, session *Session, adata []int16) ([]schema.VADS
 	return resp.Segments, nil
 }
 
+// speakerNote renders the system-prompt note for the current speaker. Returns
+// an empty string when there is no name and unknown notes are disabled.
+func speakerNote(s *types.Speaker, noteUnknown bool) string {
+	if s != nil && s.Matched && s.Name != "" {
+		return "The current speaker is " + s.Name + "."
+	}
+	if noteUnknown {
+		return "The current speaker is unknown."
+	}
+	return ""
+}
+
 // Function to generate a response based on the conversation
-func generateResponse(ctx context.Context, session *Session, utt []byte, transcript string, conv *Conversation, t Transport) {
+func generateResponse(ctx context.Context, session *Session, utt []byte, transcript string, speaker *types.Speaker, conv *Conversation, t Transport) {
 	xlog.Debug("Generating realtime response...")
 
 	// Create user message item
 	item := types.MessageItemUnion{
 		User: &types.MessageItemUser{
-			ID:     generateItemID(),
-			Status: types.ItemStatusCompleted,
+			ID:      generateItemID(),
+			Status:  types.ItemStatusCompleted,
+			Speaker: speaker,
 			Content: []types.MessageContentInput{
 				{
 					Type:       types.MessageContentTypeInputAudio,
@@ -1445,6 +1479,17 @@ func generateResponse(ctx context.Context, session *Session, utt []byte, transcr
 		Item: item,
 	})
 
+	// Surface the recognized speaker to the client. Skip the event for an
+	// unidentified speaker unless announce_unknown is set.
+	if speaker != nil && session.voiceGate != nil && session.voiceGate.cfg.AnnounceEnabled() {
+		if speaker.Matched || session.voiceGate.cfg.Identity.AnnounceUnknown {
+			sendEvent(t, types.ConversationItemSpeakerEvent{
+				ItemID:  item.User.ID,
+				Speaker: *speaker,
+			})
+		}
+	}
+
 	triggerResponse(ctx, session, conv, t, nil)
 }
 
@@ -1508,6 +1553,8 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
 	})
 
 	imgIndex := 0
+	var lastUserSpeaker *types.Speaker
+	personalize := session.voiceGate != nil && session.voiceGate.cfg.PersonalizeEnabled()
 	conv.Lock.Lock()
 	items := trimRealtimeItems(conv.Items, session.MaxHistoryItems)
 	for _, item := range items {
@@ -1515,6 +1562,11 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
 			msg := schema.Message{
 				Role: string(types.MessageRoleUser),
 			}
+			lastUserSpeaker = item.User.Speaker
+			if personalize && session.voiceGate.cfg.Identity.InjectName &&
+				item.User.Speaker != nil && item.User.Speaker.Matched && item.User.Speaker.Name != "" {
+				msg.Name = item.User.Speaker.Name
+			}
 			textContent := ""
 			nrOfImgsInMessage := 0
 			for _, content := range item.User.Content {
@@ -1601,6 +1653,13 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
 	}
 	conv.Lock.Unlock()
 
+	if personalize && session.voiceGate.cfg.Identity.InjectSystemNote {
+		if note := speakerNote(lastUserSpeaker, session.voiceGate.cfg.Identity.NoteUnknown); note != "" {
+			conversationHistory[0].StringContent += "\n\n" + note
+			conversationHistory[0].Content = conversationHistory[0].StringContent
+		}
+	}
+
 	var images []string
 	for _, m := range conversationHistory {
 		images = append(images, m.StringImages...)
diff --git a/core/http/endpoints/openai/realtime_doubles_test.go b/core/http/endpoints/openai/realtime_doubles_test.go
index accd6af51..727ce7dcc 100644
--- a/core/http/endpoints/openai/realtime_doubles_test.go
+++ b/core/http/endpoints/openai/realtime_doubles_test.go
@@ -83,6 +83,8 @@ type fakeModel struct {
 	predictChunkDeltas [][]*proto.ChatDelta
 	predictResp        backend.LLMResponse
 	predictErr         error
+
+	lastMessages schema.Messages
 }
 
 func (m *fakeModel) VAD(context.Context, *schema.VADRequest) (*schema.VADResponse, error) {
@@ -93,7 +95,8 @@ func (m *fakeModel) Transcribe(context.Context, string, string, bool, bool, stri
 	return m.transcribeFinal, nil
 }
 
-func (m *fakeModel) Predict(_ context.Context, _ schema.Messages, _, _, _ []string, cb func(string, backend.TokenUsage) bool, _ []types.ToolUnion, _ *types.ToolChoiceUnion, _, _ *int, _ map[string]float64) (func() (backend.LLMResponse, error), error) {
+func (m *fakeModel) Predict(_ context.Context, msgs schema.Messages, _, _, _ []string, cb func(string, backend.TokenUsage) bool, _ []types.ToolUnion, _ *types.ToolChoiceUnion, _, _ *int, _ map[string]float64) (func() (backend.LLMResponse, error), error) {
+	m.lastMessages = msgs
 	if m.predictErr != nil {
 		return nil, m.predictErr
 	}
diff --git a/core/http/endpoints/openai/realtime_speaker_event_test.go b/core/http/endpoints/openai/realtime_speaker_event_test.go
new file mode 100644
index 000000000..fbbe5ded9
--- /dev/null
+++ b/core/http/endpoints/openai/realtime_speaker_event_test.go
@@ -0,0 +1,54 @@
+package openai
+
+import (
+	"encoding/json"
+
+	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("ConversationItemSpeakerEvent", func() {
+	It("marshals with the conversation.item.speaker type and nested speaker", func() {
+		ev := types.ConversationItemSpeakerEvent{
+			ItemID:  "item_123",
+			Speaker: types.Speaker{Name: "Jeremy", ID: "spk_1", Labels: map[string]string{"family": "yes"}, Confidence: 92, Distance: 0.1, Matched: true},
+		}
+		b, err := json.Marshal(ev)
+		Expect(err).ToNot(HaveOccurred())
+
+		var got map[string]any
+		Expect(json.Unmarshal(b, &got)).To(Succeed())
+		Expect(got["type"]).To(Equal("conversation.item.speaker"))
+		Expect(got["item_id"]).To(Equal("item_123"))
+
+		spk := got["speaker"].(map[string]any)
+		Expect(spk["name"]).To(Equal("Jeremy"))
+		Expect(spk["id"]).To(Equal("spk_1"))
+		Expect(spk["matched"]).To(Equal(true))
+		Expect(spk["labels"]).To(HaveKeyWithValue("family", "yes"))
+	})
+
+	It("omits labels when the speaker has none", func() {
+		ev := types.ConversationItemSpeakerEvent{ItemID: "i", Speaker: types.Speaker{Name: "Jeremy", Matched: true}}
+		b, err := json.Marshal(ev)
+		Expect(err).ToNot(HaveOccurred())
+		var got map[string]any
+		Expect(json.Unmarshal(b, &got)).To(Succeed())
+		spk := got["speaker"].(map[string]any)
+		_, hasLabels := spk["labels"]
+		Expect(hasLabels).To(BeFalse())
+	})
+
+	It("omits the name for an unknown speaker but keeps matched=false", func() {
+		ev := types.ConversationItemSpeakerEvent{ItemID: "i", Speaker: types.Speaker{Matched: false}}
+		b, err := json.Marshal(ev)
+		Expect(err).ToNot(HaveOccurred())
+		var got map[string]any
+		Expect(json.Unmarshal(b, &got)).To(Succeed())
+		spk := got["speaker"].(map[string]any)
+		_, hasName := spk["name"]
+		Expect(hasName).To(BeFalse())
+		Expect(spk["matched"]).To(Equal(false))
+	})
+})
diff --git a/core/http/endpoints/openai/realtime_voicegate.go b/core/http/endpoints/openai/realtime_voicegate.go
index 54332536f..9bd6f10f2 100644
--- a/core/http/endpoints/openai/realtime_voicegate.go
+++ b/core/http/endpoints/openai/realtime_voicegate.go
@@ -7,6 +7,7 @@ import (
 
 	"github.com/mudler/LocalAI/core/backend"
 	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
 	"github.com/mudler/LocalAI/core/services/voicerecognition"
 	"github.com/mudler/LocalAI/pkg/model"
 )
@@ -29,6 +30,32 @@ type voiceGate struct {
 	verifyFn func(ctx context.Context, uttWav, refWav string) (bool, error)
 }
 
+// resolution is the outcome of resolving a committed utterance's speaker. It
+// carries the surfacing-facing Speaker plus the metadata the policy layer needs
+// (labels for the allow-list) and a human reason when no usable identity exists.
+type resolution struct {
+	speaker types.Speaker     // name/id/confidence/distance/matched
+	labels  map[string]string // identify-mode metadata labels, for the allow-list
+	found   bool              // a candidate identity existed at all
+	reason  string            // why-unknown / deny reason at the resolve level
+}
+
+// confidence maps a cosine distance to a 0..100 score relative to the match
+// threshold, mirroring the /v1/voice/identify endpoint.
+func confidence(distance, threshold float32) float32 {
+	if threshold <= 0 {
+		return 0
+	}
+	c := (1 - distance/threshold) * 100
+	if c < 0 {
+		return 0
+	}
+	if c > 100 {
+		return 100
+	}
+	return c
+}
+
 // newVoiceGate builds a gate from a pipeline's voice_recognition config. It
 // validates fail-fast (before loading the model), loads the recognition model
 // config, wires the real backend seams, and pre-embeds references for verify
@@ -89,91 +116,143 @@ func newVoiceGate(
 	return g, nil
 }
 
-// Authorize embeds the utterance and decides allow/deny.
-//
-//	allowed: speaker is authorized.
-//	matched: matched person's name (informational), empty if none.
-//	reason:  human-readable deny reason.
-//	err:     backend failure (caller should fail closed).
-func (g *voiceGate) Authorize(ctx context.Context, wavPath string) (allowed bool, matched string, reason string, err error) {
+// Resolve embeds the utterance once and resolves the speaker's identity. It does
+// NOT apply the authorization policy (see authorize). On a backend error it
+// returns the error and a resolution whose reason explains the failure.
+func (g *voiceGate) Resolve(ctx context.Context, wavPath string) (resolution, error) {
 	if g.cfg.Mode == config.VoiceGateModeVerify {
-		return g.authorizeVerify(ctx, wavPath)
+		return g.resolveVerify(ctx, wavPath)
 	}
-	return g.authorizeIdentify(ctx, wavPath)
+	return g.resolveIdentify(ctx, wavPath)
 }
 
-func (g *voiceGate) authorizeIdentify(ctx context.Context, wavPath string) (bool, string, string, error) {
+func (g *voiceGate) resolveIdentify(ctx context.Context, wavPath string) (resolution, error) {
 	emb, err := g.embedFn(ctx, wavPath)
 	if err != nil {
-		return false, "", "embed failed", err
+		return resolution{reason: "embed failed"}, err
 	}
 	if len(emb) == 0 {
-		return false, "", "no speech detected", nil
+		return resolution{reason: "no speech detected"}, nil
 	}
 	matches, err := g.registry.Identify(ctx, emb, 1)
 	if err != nil {
-		return false, "", "identify failed", err
+		return resolution{reason: "identify failed"}, err
 	}
 	if len(matches) == 0 {
-		return false, "", "unknown speaker", nil
+		return resolution{reason: "unknown speaker"}, nil
 	}
 	m := matches[0]
-	if m.Distance > g.cfg.Threshold {
-		return false, m.Metadata.Name, "distance above threshold", nil
+	matched := m.Distance <= g.cfg.Threshold
+	r := resolution{
+		speaker: types.Speaker{
+			Name:       m.Metadata.Name,
+			ID:         m.Metadata.ID,
+			Labels:     m.Metadata.Labels,
+			Distance:   m.Distance,
+			Confidence: confidence(m.Distance, g.cfg.Threshold),
+			Matched:    matched,
+		},
+		labels: m.Metadata.Labels,
+		found:  true,
 	}
-	if !g.allowMatch(m.Metadata) {
-		return false, m.Metadata.Name, "speaker not in allow list", nil
+	if !matched {
+		r.reason = "distance above threshold"
 	}
-	return true, m.Metadata.Name, "", nil
+	return r, nil
+}
+
+func (g *voiceGate) resolveVerify(ctx context.Context, wavPath string) (resolution, error) {
+	if g.cfg.AntiSpoofing {
+		for _, ref := range g.refAudios {
+			ok, err := g.verifyFn(ctx, wavPath, ref.Audio)
+			if err != nil {
+				return resolution{reason: "verify failed"}, err
+			}
+			if ok {
+				return resolution{
+					speaker: types.Speaker{Name: ref.Name, Confidence: 100, Matched: true},
+					found:   true,
+				}, nil
+			}
+		}
+		return resolution{reason: "no reference matched"}, nil
+	}
+
+	emb, err := g.embedFn(ctx, wavPath)
+	if err != nil {
+		return resolution{reason: "embed failed"}, err
+	}
+	if len(emb) == 0 {
+		return resolution{reason: "no speech detected"}, nil
+	}
+	for _, ref := range g.refEmbeds {
+		d := cosineDistance(emb, ref.emb)
+		if d <= g.cfg.Threshold {
+			return resolution{
+				speaker: types.Speaker{Name: ref.name, Distance: d, Confidence: confidence(d, g.cfg.Threshold), Matched: true},
+				found:   true,
+			}, nil
+		}
+	}
+	return resolution{reason: "no reference matched"}, nil
+}
+
+// authorize applies the gate's policy to an already-resolved identity.
+func (g *voiceGate) authorize(r resolution) (allowed bool, reason string) {
+	if g.cfg.Mode == config.VoiceGateModeVerify {
+		if r.speaker.Matched {
+			return true, ""
+		}
+		if r.reason == "" {
+			return false, "no reference matched"
+		}
+		return false, r.reason
+	}
+	if !r.found {
+		return false, r.reason
+	}
+	if !r.speaker.Matched {
+		return false, "distance above threshold"
+	}
+	if !g.allowMatch(r.speaker.Name, r.labels) {
+		return false, "speaker not in allow list"
+	}
+	return true, ""
 }
 
 // allowMatch reports whether a matched identity is authorized. An empty allow
 // (no names and no labels) authorizes any registered speaker.
-func (g *voiceGate) allowMatch(meta voicerecognition.Metadata) bool {
+func (g *voiceGate) allowMatch(name string, labels map[string]string) bool {
 	a := g.cfg.Allow
 	if len(a.Names) == 0 && len(a.Labels) == 0 {
 		return true
 	}
 	for _, n := range a.Names {
-		if n == meta.Name {
+		if n == name {
 			return true
 		}
 	}
 	for _, l := range a.Labels {
-		if _, ok := meta.Labels[l]; ok {
+		if _, ok := labels[l]; ok {
 			return true
 		}
 	}
 	return false
 }
 
-func (g *voiceGate) authorizeVerify(ctx context.Context, wavPath string) (bool, string, string, error) {
-	if g.cfg.AntiSpoofing {
-		for _, r := range g.refAudios {
-			ok, err := g.verifyFn(ctx, wavPath, r.Audio)
-			if err != nil {
-				return false, "", "verify failed", err
-			}
-			if ok {
-				return true, r.Name, "", nil
-			}
-		}
-		return false, "", "no reference matched", nil
+// Authorize is the legacy convenience wrapper: resolve then apply policy.
+//
+//	allowed: speaker is authorized.
+//	matched: matched person's name (informational), empty if none.
+//	reason:  human-readable deny reason.
+//	err:     backend failure (caller should fail closed).
+func (g *voiceGate) Authorize(ctx context.Context, wavPath string) (allowed bool, matched string, reason string, err error) {
+	r, rerr := g.Resolve(ctx, wavPath)
+	if rerr != nil {
+		return false, "", r.reason, rerr
 	}
-
-	emb, err := g.embedFn(ctx, wavPath)
-	if err != nil {
-		return false, "", "embed failed", err
-	}
-	if len(emb) == 0 {
-		return false, "", "no speech detected", nil
-	}
-	for _, r := range g.refEmbeds {
-		if cosineDistance(emb, r.emb) <= g.cfg.Threshold {
-			return true, r.name, "", nil
-		}
-	}
-	return false, "", "no reference matched", nil
+	allowed, reason = g.authorize(r)
+	return allowed, r.speaker.Name, reason, nil
 }
 
 // decide interprets an Authorize result against the gate's when-policy and the
diff --git a/core/http/endpoints/openai/realtime_voicegate_integration_test.go b/core/http/endpoints/openai/realtime_voicegate_integration_test.go
index f8aae72c5..b0f7f0b49 100644
--- a/core/http/endpoints/openai/realtime_voicegate_integration_test.go
+++ b/core/http/endpoints/openai/realtime_voicegate_integration_test.go
@@ -152,3 +152,252 @@ var _ = Describe("realtime voice gate integration (commitUtterance)", func() {
 		Expect(tr2.countEvents(types.ServerEventTypeResponseDone)).To(BeNumerically(">=", 1))
 	})
 })
+
+var _ = Describe("realtime speaker surfacing (commitUtterance)", func() {
+	utt := make([]byte, 32)
+
+	It("emits conversation.item.speaker for a confident match when announce is on", func() {
+		session, _ := itSession(itGate("alice", "alice", []float32{1, 0, 0}, nil,
+			config.VoiceGateWhenEvery, config.VoiceGateRejectEvent))
+		session.voiceGate.cfg.Identity = &config.VoiceIdentityConfig{Announce: true}
+		tr := &fakeTransport{}
+
+		commitUtterance(context.Background(), utt, session, &Conversation{}, tr)
+
+		Expect(tr.countEvents(types.ServerEventTypeConversationItemSpeaker)).To(Equal(1))
+	})
+
+	It("does not emit the speaker event for an unknown speaker unless announce_unknown is set", func() {
+		// match distance above threshold => not matched
+		gate := &voiceGate{
+			cfg: config.PipelineVoiceRecognition{
+				Mode: config.VoiceGateModeIdentify, Threshold: 0.25,
+				When: config.VoiceGateWhenEvery, OnReject: config.VoiceGateRejectEvent,
+				Enforce:  boolPtr(false),
+				Identity: &config.VoiceIdentityConfig{Announce: true},
+			},
+			registry: &fakeRegistry{matches: []voicerecognition.Match{
+				{Distance: 0.9, Metadata: voicerecognition.Metadata{Name: "alice"}},
+			}},
+			embedFn: func(context.Context, string) ([]float32, error) { return []float32{1, 0, 0}, nil },
+		}
+		session, _ := itSession(gate)
+		tr := &fakeTransport{}
+
+		commitUtterance(context.Background(), utt, session, &Conversation{}, tr)
+		Expect(tr.countEvents(types.ServerEventTypeConversationItemSpeaker)).To(Equal(0))
+
+		gate.cfg.Identity.AnnounceUnknown = true
+		tr2 := &fakeTransport{}
+		commitUtterance(context.Background(), utt, session, &Conversation{}, tr2)
+		Expect(tr2.countEvents(types.ServerEventTypeConversationItemSpeaker)).To(Equal(1))
+	})
+
+	It("never drops a turn when enforce is false even for a disallowed speaker", func() {
+		session, _ := itSession(itGate("bob", "alice", []float32{1, 0, 0}, nil,
+			config.VoiceGateWhenEvery, config.VoiceGateRejectEvent))
+		session.voiceGate.cfg.Enforce = boolPtr(false)
+		tr := &fakeTransport{}
+
+		commitUtterance(context.Background(), utt, session, &Conversation{}, tr)
+
+		Expect(hasSpeakerNotAuthorized(tr)).To(BeFalse())
+		Expect(tr.countEvents(types.ServerEventTypeResponseDone)).To(BeNumerically(">=", 1))
+	})
+})
+
+var _ = Describe("realtime speaker personalization (triggerResponseAtTurn)", func() {
+	utt := make([]byte, 32)
+
+	findRole := func(msgs schema.Messages, role string) *schema.Message {
+		for i := range msgs {
+			if msgs[i].Role == role {
+				return &msgs[i]
+			}
+		}
+		return nil
+	}
+
+	It("sets the user message name and a current-speaker system note", func() {
+		session, m := itSession(itGate("alice", "alice", []float32{1, 0, 0}, nil,
+			config.VoiceGateWhenEvery, config.VoiceGateRejectEvent))
+		session.voiceGate.cfg.Identity = &config.VoiceIdentityConfig{
+			Personalize: true, InjectName: true, InjectSystemNote: true,
+		}
+		session.Instructions = "You are helpful."
+		tr := &fakeTransport{}
+
+		commitUtterance(context.Background(), utt, session, &Conversation{}, tr)
+
+		user := findRole(m.lastMessages, "user")
+		Expect(user).ToNot(BeNil())
+		Expect(user.Name).To(Equal("alice"))
+		sys := findRole(m.lastMessages, "system")
+		Expect(sys).ToNot(BeNil())
+		Expect(sys.StringContent).To(ContainSubstring("The current speaker is alice."))
+	})
+
+	It("omits the unknown note unless note_unknown is set", func() {
+		base := func() (*Session, *fakeModel) {
+			gate := &voiceGate{
+				cfg: config.PipelineVoiceRecognition{
+					Mode: config.VoiceGateModeIdentify, Threshold: 0.25,
+					When: config.VoiceGateWhenEvery, OnReject: config.VoiceGateRejectEvent,
+					Enforce:  boolPtr(false),
+					Identity: &config.VoiceIdentityConfig{Personalize: true, InjectSystemNote: true},
+				},
+				registry: &fakeRegistry{matches: []voicerecognition.Match{
+					{Distance: 0.9, Metadata: voicerecognition.Metadata{Name: "alice"}},
+				}},
+				embedFn: func(context.Context, string) ([]float32, error) { return []float32{1, 0, 0}, nil },
+			}
+			s, m := itSession(gate)
+			s.Instructions = "You are helpful."
+			return s, m
+		}
+
+		s1, m1 := base()
+		commitUtterance(context.Background(), utt, s1, &Conversation{}, &fakeTransport{})
+		Expect(findRole(m1.lastMessages, "system").StringContent).ToNot(ContainSubstring("unknown"))
+
+		s2, m2 := base()
+		s2.voiceGate.cfg.Identity.NoteUnknown = true
+		commitUtterance(context.Background(), utt, s2, &Conversation{}, &fakeTransport{})
+		Expect(findRole(m2.lastMessages, "system").StringContent).To(ContainSubstring("The current speaker is unknown."))
+	})
+})
+
+var _ = Describe("realtime when:first with identity (commitUtterance)", func() {
+	utt := make([]byte, 32)
+
+	// statefulIdentityGate builds a when:first identify gate with an Identity
+	// block (so identity is resolved every turn) whose embedFn is driven by a
+	// per-turn counter: the failOnSecond flag makes the second and later embeds
+	// return an error, exercising the stricter fail-closed path on a re-resolve.
+	statefulIdentityGate := func(failOnSecond bool) *voiceGate {
+		calls := 0
+		return &voiceGate{
+			cfg: config.PipelineVoiceRecognition{
+				Mode:      config.VoiceGateModeIdentify,
+				Threshold: 0.25,
+				When:      config.VoiceGateWhenFirst,
+				OnReject:  config.VoiceGateRejectEvent,
+				Allow:     config.VoiceRecognitionAllow{Names: []string{"alice"}},
+				Identity:  &config.VoiceIdentityConfig{Announce: true, Personalize: true, InjectName: true},
+			},
+			registry: &fakeRegistry{matches: []voicerecognition.Match{
+				{Distance: 0.1, Metadata: voicerecognition.Metadata{Name: "alice"}},
+			}},
+			embedFn: func(context.Context, string) ([]float32, error) {
+				calls++
+				if failOnSecond && calls > 1 {
+					return nil, errors.New("embed backend down")
+				}
+				return []float32{1, 0, 0}, nil
+			},
+		}
+	}
+
+	It("re-resolves identity every turn and fails closed when a later embed errors", func() {
+		gate := statefulIdentityGate(true)
+		session, _ := itSession(gate)
+		conv := &Conversation{} // shared so voiceVerified persists across turns
+
+		// Turn 1: authorized; identity resolved, speaker surfaced, response runs.
+		tr1 := &fakeTransport{}
+		commitUtterance(context.Background(), utt, session, conv, tr1)
+		Expect(hasSpeakerNotAuthorized(tr1)).To(BeFalse())
+		Expect(tr1.countEvents(types.ServerEventTypeConversationItemSpeaker)).To(Equal(1))
+		Expect(tr1.countEvents(types.ServerEventTypeResponseDone)).To(BeNumerically(">=", 1))
+
+		// Turn 2: when:first would skip re-authorization, but the Identity block
+		// forces a fresh resolve. That resolve now errors, and because the gate
+		// enforces, the turn is dropped fail-closed rather than riding on the
+		// cached first verification.
+		tr2 := &fakeTransport{}
+		commitUtterance(context.Background(), utt, session, conv, tr2)
+		Expect(hasSpeakerNotAuthorized(tr2)).To(BeTrue())
+		Expect(tr2.countEvents(types.ServerEventTypeResponseDone)).To(Equal(0))
+	})
+
+	It("re-resolves identity every turn so a later turn still surfaces and names the speaker", func() {
+		gate := statefulIdentityGate(false)
+		session, m := itSession(gate)
+		conv := &Conversation{}
+
+		tr1 := &fakeTransport{}
+		commitUtterance(context.Background(), utt, session, conv, tr1)
+		Expect(hasSpeakerNotAuthorized(tr1)).To(BeFalse())
+		Expect(tr1.countEvents(types.ServerEventTypeResponseDone)).To(BeNumerically(">=", 1))
+
+		// Turn 2: authorization is skipped (when:first, already verified) but the
+		// speaker event still fires and the per-message name is set, proving the
+		// per-turn re-resolution (not the cached first verification) drove it.
+		tr2 := &fakeTransport{}
+		commitUtterance(context.Background(), utt, session, conv, tr2)
+		Expect(tr2.countEvents(types.ServerEventTypeConversationItemSpeaker)).To(Equal(1))
+		var lastUser *schema.Message
+		for i := range m.lastMessages {
+			if m.lastMessages[i].Role == "user" {
+				lastUser = &m.lastMessages[i]
+			}
+		}
+		Expect(lastUser).ToNot(BeNil())
+		Expect(lastUser.Name).To(Equal("alice"))
+	})
+})
+
+var _ = Describe("realtime multi-speaker history attribution (triggerResponse)", func() {
+	userAudioItem := func(name, transcript string) *types.MessageItemUnion {
+		return &types.MessageItemUnion{
+			User: &types.MessageItemUser{
+				ID:      generateItemID(),
+				Status:  types.ItemStatusCompleted,
+				Speaker: &types.Speaker{Name: name, Matched: true},
+				Content: []types.MessageContentInput{
+					{Type: types.MessageContentTypeInputAudio, Transcript: transcript},
+				},
+			},
+		}
+	}
+
+	It("attributes each user turn to its own speaker and notes the latest one", func() {
+		session, m := itSession(itGate("alice", "alice", []float32{1, 0, 0}, nil,
+			config.VoiceGateWhenEvery, config.VoiceGateRejectEvent))
+		session.Instructions = "You are helpful."
+		session.MaxHistoryItems = 10 // keep both items; 0 would mean "no trim" too
+		session.voiceGate.cfg.Identity = &config.VoiceIdentityConfig{
+			Personalize: true, InjectName: true, InjectSystemNote: true,
+		}
+
+		conv := &Conversation{Items: []*types.MessageItemUnion{
+			userAudioItem("alice", "hello there"),
+			userAudioItem("bob", "what is the weather"),
+		}}
+		tr := &fakeTransport{}
+
+		triggerResponse(context.Background(), session, conv, tr, nil)
+
+		var users []*schema.Message
+		var sys *schema.Message
+		for i := range m.lastMessages {
+			switch m.lastMessages[i].Role {
+			case "user":
+				users = append(users, &m.lastMessages[i])
+			case "system":
+				if sys == nil {
+					sys = &m.lastMessages[i]
+				}
+			}
+		}
+		Expect(users).To(HaveLen(2))
+		Expect(users[0].Name).To(Equal("alice"))
+		Expect(users[1].Name).To(Equal("bob"))
+
+		Expect(sys).ToNot(BeNil())
+		Expect(sys.StringContent).To(ContainSubstring("The current speaker is bob."))
+		Expect(sys.StringContent).ToNot(ContainSubstring("alice"))
+	})
+})
+
+func boolPtr(b bool) *bool { return &b }
diff --git a/core/http/endpoints/openai/realtime_voicegate_test.go b/core/http/endpoints/openai/realtime_voicegate_test.go
index bdbbc2f4a..3d9b458e1 100644
--- a/core/http/endpoints/openai/realtime_voicegate_test.go
+++ b/core/http/endpoints/openai/realtime_voicegate_test.go
@@ -10,6 +10,82 @@ import (
 	. "github.com/onsi/gomega"
 )
 
+var _ = Describe("voiceGate.Resolve + authorize", func() {
+	mkGate := func(allow []string) *voiceGate {
+		return &voiceGate{
+			cfg: config.PipelineVoiceRecognition{
+				Mode:      config.VoiceGateModeIdentify,
+				Threshold: 0.25,
+				Allow:     config.VoiceRecognitionAllow{Names: allow},
+			},
+			registry: &fakeRegistry{matches: []voicerecognition.Match{
+				{Distance: 0.1, Metadata: voicerecognition.Metadata{ID: "spk_1", Name: "alice", Labels: map[string]string{"family": "yes"}}},
+			}},
+			embedFn: func(context.Context, string) ([]float32, error) { return []float32{1, 0, 0}, nil },
+		}
+	}
+
+	It("resolves a confident identity with name, id and a 0..100 confidence", func() {
+		r, err := mkGate(nil).Resolve(context.Background(), "x.wav")
+		Expect(err).ToNot(HaveOccurred())
+		Expect(r.found).To(BeTrue())
+		Expect(r.speaker.Name).To(Equal("alice"))
+		Expect(r.speaker.ID).To(Equal("spk_1"))
+		Expect(r.speaker.Matched).To(BeTrue())
+		Expect(r.speaker.Confidence).To(BeNumerically(">", 0))
+		Expect(r.speaker.Confidence).To(BeNumerically("<=", 100))
+		Expect(r.speaker.Labels).To(HaveKeyWithValue("family", "yes"))
+	})
+
+	It("marks a candidate above the threshold as not matched", func() {
+		g := mkGate(nil)
+		g.registry = &fakeRegistry{matches: []voicerecognition.Match{
+			{Distance: 0.9, Metadata: voicerecognition.Metadata{Name: "alice"}},
+		}}
+		r, err := g.Resolve(context.Background(), "x.wav")
+		Expect(err).ToNot(HaveOccurred())
+		Expect(r.found).To(BeTrue())
+		Expect(r.speaker.Matched).To(BeFalse())
+		Expect(r.speaker.Name).To(Equal("alice")) // name still surfaced
+	})
+
+	It("authorize allows a confident match in the allow list", func() {
+		g := mkGate([]string{"alice"})
+		r, _ := g.Resolve(context.Background(), "x.wav")
+		allowed, reason := g.authorize(r)
+		Expect(allowed).To(BeTrue())
+		Expect(reason).To(BeEmpty())
+	})
+
+	It("authorize denies a confident match outside the allow list", func() {
+		g := mkGate([]string{"bob"})
+		r, _ := g.Resolve(context.Background(), "x.wav")
+		allowed, reason := g.authorize(r)
+		Expect(allowed).To(BeFalse())
+		Expect(reason).To(Equal("speaker not in allow list"))
+	})
+
+	It("authorize allows by label when names do not match", func() {
+		g := mkGate(nil)
+		g.cfg.Allow = config.VoiceRecognitionAllow{Labels: []string{"family"}}
+		r, _ := g.Resolve(context.Background(), "x.wav")
+		allowed, _ := g.authorize(r)
+		Expect(allowed).To(BeTrue())
+	})
+})
+
+var _ = Describe("confidence", func() {
+	It("is 100 at zero distance", func() {
+		Expect(confidence(0, 0.25)).To(BeNumerically("~", 100, 1e-4))
+	})
+	It("clamps to 0 above the threshold", func() {
+		Expect(confidence(0.5, 0.25)).To(BeNumerically("~", 0, 1e-4))
+	})
+	It("is 0 for a non-positive threshold", func() {
+		Expect(confidence(0.1, 0)).To(BeNumerically("~", 0, 1e-4))
+	})
+})
+
 var _ = Describe("cosineDistance", func() {
 	It("is 0 for identical vectors", func() {
 		Expect(cosineDistance([]float32{1, 0, 0}, []float32{1, 0, 0})).To(BeNumerically("~", 0, 1e-6))
diff --git a/core/http/endpoints/openai/types/message_item.go b/core/http/endpoints/openai/types/message_item.go
index 52997fe8c..88d680648 100644
--- a/core/http/endpoints/openai/types/message_item.go
+++ b/core/http/endpoints/openai/types/message_item.go
@@ -102,6 +102,10 @@ type MessageItemUser struct {
 
 	// The status of the item. Has no effect on the conversation.
 	Status ItemStatus `json:"status,omitempty"`
+
+	// Speaker is the recognized speaker for this audio turn (LocalAI extension).
+	// Used to attribute past turns when rebuilding the LLM message history.
+	Speaker *Speaker `json:"speaker,omitempty"`
 }
 
 func (m MessageItemUser) MessageItemType() MessageItemType {
diff --git a/core/http/endpoints/openai/types/server_events.go b/core/http/endpoints/openai/types/server_events.go
index bae680fd5..8183a8b78 100644
--- a/core/http/endpoints/openai/types/server_events.go
+++ b/core/http/endpoints/openai/types/server_events.go
@@ -20,6 +20,9 @@ const (
 	ServerEventTypeConversationItemInputAudioTranscriptionFailed    ServerEventType = "conversation.item.input_audio_transcription.failed"
 	ServerEventTypeConversationItemTruncated                        ServerEventType = "conversation.item.truncated"
 	ServerEventTypeConversationItemDeleted                          ServerEventType = "conversation.item.deleted"
+	// ServerEventTypeConversationItemSpeaker is a LocalAI extension: it reports
+	// the recognized speaker for a user audio item. OpenAI clients ignore it.
+	ServerEventTypeConversationItemSpeaker ServerEventType = "conversation.item.speaker"
 	ServerEventTypeInputAudioBufferCommitted                        ServerEventType = "input_audio_buffer.committed"
 	ServerEventTypeInputAudioBufferCleared                          ServerEventType = "input_audio_buffer.cleared"
 	ServerEventTypeInputAudioBufferSpeechStarted                    ServerEventType = "input_audio_buffer.speech_started"
@@ -335,6 +338,33 @@ func (m ConversationItemAddedEvent) MarshalJSON() ([]byte, error) {
 	return json.Marshal(shadow)
 }
 
+// ConversationItemSpeakerEvent reports the recognized speaker for a user audio
+// item. LocalAI extension; not part of the OpenAI Realtime API.
+type ConversationItemSpeakerEvent struct {
+	ServerEventBase
+	// ItemID is the conversation item this speaker belongs to.
+	ItemID string `json:"item_id"`
+	// Speaker is the recognized identity.
+	Speaker Speaker `json:"speaker"`
+}
+
+func (m ConversationItemSpeakerEvent) ServerEventType() ServerEventType {
+	return ServerEventTypeConversationItemSpeaker
+}
+
+func (m ConversationItemSpeakerEvent) MarshalJSON() ([]byte, error) {
+	type typeAlias ConversationItemSpeakerEvent
+	type typeWrapper struct {
+		typeAlias
+		Type ServerEventType `json:"type"`
+	}
+	shadow := typeWrapper{
+		typeAlias: typeAlias(m),
+		Type:      m.ServerEventType(),
+	}
+	return json.Marshal(shadow)
+}
+
 // Returned when a conversation item is finalized.
 //
 // The event will include the full content of the Item except for audio data, which can be retrieved separately with a `conversation.item.retrieve` event if needed.
diff --git a/core/http/endpoints/openai/types/speaker.go b/core/http/endpoints/openai/types/speaker.go
new file mode 100644
index 000000000..a5b02c927
--- /dev/null
+++ b/core/http/endpoints/openai/types/speaker.go
@@ -0,0 +1,14 @@
+package types
+
+// Speaker is the recognized speaker for a committed audio turn. It is a LocalAI
+// extension to the OpenAI Realtime schema, carried on the user conversation item
+// and surfaced via the conversation.item.speaker event. Confidence is a 0..100
+// score relative to the match threshold (same formula as /v1/voice/identify).
+type Speaker struct {
+	Name       string            `json:"name,omitempty"`
+	ID         string            `json:"id,omitempty"`
+	Labels     map[string]string `json:"labels,omitempty"`
+	Confidence float32           `json:"confidence"`
+	Distance   float32           `json:"distance"`
+	Matched    bool              `json:"matched"`
+}
diff --git a/docs/content/features/openai-realtime.md b/docs/content/features/openai-realtime.md
index 51f8960bf..48cfc9332 100644
--- a/docs/content/features/openai-realtime.md
+++ b/docs/content/features/openai-realtime.md
@@ -141,6 +141,8 @@ The API follows the OpenAI Realtime API protocol for handling sessions, audio bu
 
 A pipeline realtime model can require speaker verification before it responds. Add a `voice_recognition` block under `pipeline`. When present, each committed utterance is verified against authorized speakers; unauthorized utterances are dropped before the LLM runs (no LLM call, no tool execution, no TTS). The session stays open.
 
+The same block also drives two optional, independent behaviors: an authorization gate (`enforce`) and speaker surfacing/personalization (`identity`). Set `enforce: false` to keep recognizing the speaker without ever rejecting a turn.
+
 ```yaml
 name: my-realtime
 pipeline:
@@ -152,6 +154,7 @@ pipeline:
     model: speaker-recognition   # the speaker-recognition backend model
     mode: identify               # "identify" (registry) or "verify" (references)
     threshold: 0.25              # cosine distance; <= passes
+    enforce: true                # authorization gate (default true)
     when: every                  # "every" (default) or "first"
     on_reject: drop_event        # "drop_event" (default) or "drop_silent"
     anti_spoofing: false         # optional liveness check (verify mode)
@@ -170,19 +173,73 @@ pipeline:
         audio: /models/voices/bob.wav
 ```
 
+### Identifying speakers without gating
+
+To recognize who is speaking and surface it to the client and the LLM without ever rejecting a turn, set `enforce: false` and add an `identity` block. The `identity` block works with or without the gate; when it is set, the speaker is resolved on every turn even if `when: first`.
+
+```yaml
+name: my-realtime
+pipeline:
+  vad: silero-vad
+  transcription: whisper
+  llm: qwen
+  tts: kokoro
+  voice_recognition:
+    model: speaker-recognition
+    mode: identify
+    threshold: 0.25
+    # Authorization gate. Defaults to enforcing (rejects unauthorized speakers).
+    # Set enforce:false to identify the speaker WITHOUT rejecting anyone.
+    enforce: false
+    when: every
+    # Surface the recognized speaker to the client and the LLM. Works with or
+    # without enforce; when set, identity is resolved on every turn even if
+    # when:first.
+    identity:
+      announce: true            # emit the conversation.item.speaker event
+      announce_unknown: false   # also emit it when there is no confident match
+      personalize: true         # tell the LLM who is speaking
+      inject_name: true         # set the per-message OpenAI name field
+      inject_system_note: true  # append a "current speaker" line to the system message
+      note_unknown: false       # append a "speaker is unknown" note when unidentified
+```
+
 | Field | Meaning |
 |-------|---------|
 | `model` | Speaker-recognition backend model name. |
 | `mode` | `identify` matches against speakers registered via `/v1/voice/register`; `verify` matches against the `references` audios. |
 | `threshold` | Maximum cosine distance that still counts as a match (default ~0.25). |
-| `when` | `every` verifies each utterance; `first` verifies once then trusts the session. |
+| `enforce` | Authorization gate. `true` (or omitted) rejects unauthorized speakers (the gating behavior above). `false` resolves and surfaces the speaker without ever dropping a turn. |
+| `when` | `every` verifies each utterance; `first` verifies once then trusts the session. When an `identity` block is set, the speaker is still resolved on every turn even with `first`. |
 | `on_reject` | `drop_event` drops and emits a `speaker_not_authorized` error event; `drop_silent` drops quietly. |
 | `anti_spoofing` | Verify mode only: runs the backend liveness check (slower). |
 | `allow.names` / `allow.labels` | identify mode: which registry identities are authorized. Empty = any registered speaker. |
 | `references` | verify mode: authorized reference speakers; the utterance passes if it matches any. |
+| `identity.announce` | Emit the `conversation.item.speaker` event to the client (see below). |
+| `identity.announce_unknown` | Also emit that event when there is no confident match. By default the event is emitted only on a match. |
+| `identity.personalize` | Inform the LLM who is speaking. |
+| `identity.inject_name` | Set the per-message OpenAI `name` field on each user turn. |
+| `identity.inject_system_note` | Append a `The current speaker is <Name>.` line to the system message. |
+| `identity.note_unknown` | When unidentified, append `The current speaker is unknown.` (lets the model ask who it is talking to). |
 
 `identify` mode requires the voice registry (speakers registered through `/v1/voice/register`). `verify` mode needs no registry: reference audios are embedded once at model load.
 
+### The `conversation.item.speaker` event
+
+When `identity.announce` is enabled, the server emits a `conversation.item.speaker` event after the user conversation item, naming the recognized speaker:
+
+```json
+{
+  "type": "conversation.item.speaker",
+  "item_id": "item_abc",
+  "speaker": { "name": "Jeremy", "id": "spk_1", "labels": { "role": "owner" }, "confidence": 92.0, "distance": 0.1, "matched": true }
+}
+```
+
+`confidence` is a 0-100 score, `distance` is the cosine distance, and `matched` is `true` when a confident match was found. `labels` carries any labels attached to the registered speaker (identify mode); it is omitted when the speaker has none. The `name` and `id` fields are omitted when empty. By default the event is emitted only on a match; set `identity.announce_unknown: true` to also emit it (with `matched: false`) when no speaker is identified.
+
+This event is a LocalAI extension to the OpenAI Realtime API and is server-emitted only. Standard OpenAI Realtime clients ignore event types they do not recognize, so enabling it is non-breaking.
+
 ## Examples
 
 - [Realtime voice assistant demo (Go)](https://github.com/localai-org/localai-realtime-demo): a minimal Go client for the Realtime (WebSocket) API with a full talk-back voice loop and an example tool call. Ships a `docker compose` setup that brings up a realtime-capable LocalAI for you.
diff --git a/tests/e2e/e2e_suite_test.go b/tests/e2e/e2e_suite_test.go
index 72ac88b74..5a257bdb0 100644
--- a/tests/e2e/e2e_suite_test.go
+++ b/tests/e2e/e2e_suite_test.go
@@ -275,6 +275,40 @@ var _ = BeforeSuite(func() {
 	Expect(err).ToNot(HaveOccurred())
 	Expect(os.WriteFile(filepath.Join(modelsPath, "realtime-pipeline-gated.yaml"), gatedData, 0644)).To(Succeed())
 
+	// Identity-surfacing pipeline: the same speaker backend, but enforce:false
+	// (never drop a turn) plus an identity block so the server emits the
+	// conversation.item.speaker event and personalizes the LLM turn. Used by the
+	// speaker-identity e2e specs.
+	identityCfg := map[string]any{
+		"name": "realtime-pipeline-identity",
+		"pipeline": map[string]any{
+			"vad":           "mock-vad",
+			"transcription": "mock-stt",
+			"llm":           "mock-llm",
+			"tts":           "mock-tts",
+			"voice_recognition": map[string]any{
+				"model":     "mock-speaker",
+				"mode":      "verify",
+				"threshold": 0.25,
+				"when":      "every",
+				"enforce":   false,
+				"references": []map[string]any{
+					{"name": "e2e-speaker", "audio": voiceRefPath},
+				},
+				"identity": map[string]any{
+					"announce":           true,
+					"announce_unknown":   true,
+					"personalize":        true,
+					"inject_name":        true,
+					"inject_system_note": true,
+				},
+			},
+		},
+	}
+	identityData, err := yaml.Marshal(identityCfg)
+	Expect(err).ToNot(HaveOccurred())
+	Expect(os.WriteFile(filepath.Join(modelsPath, "realtime-pipeline-identity.yaml"), identityData, 0644)).To(Succeed())
+
 	// Router model setup: a score classifier (mock-backend Score) selects
 	// between two candidate chat models based on keyword matches against the
 	// candidate label fragments. Exercises the full RouteModel middleware path
diff --git a/tests/e2e/realtime_speaker_identity_test.go b/tests/e2e/realtime_speaker_identity_test.go
new file mode 100644
index 000000000..41a15f0fb
--- /dev/null
+++ b/tests/e2e/realtime_speaker_identity_test.go
@@ -0,0 +1,95 @@
+package e2e_test
+
+import (
+	"encoding/base64"
+	"time"
+
+	"github.com/gorilla/websocket"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// These specs drive the speaker-identity surfacing end to end against a real
+// LocalAI server over a real WebSocket, using the mock backend's VoiceEmbed
+// (DC-biased PCM -> one of two orthogonal speaker vectors). The pipeline is
+// realtime-pipeline-identity: verify mode with enforce:false plus an identity
+// block, so the server resolves the speaker, emits a conversation.item.speaker
+// event, and never drops a turn.
+var _ = Describe("Realtime speaker identity surfacing", Label("Realtime"), func() {
+	// open connects to the identity pipeline and disables server VAD so the
+	// test can commit the input buffer manually.
+	open := func() *websocket.Conn {
+		c := connectWS("realtime-pipeline-identity")
+		created := readServerEvent(c, 30*time.Second)
+		Expect(created["type"]).To(Equal("session.created"))
+		sendClientEvent(c, disableVADEvent())
+		drainUntil(c, "session.updated", 10*time.Second)
+		return c
+	}
+
+	commit := func(c *websocket.Conn, pcm []byte) {
+		sendClientEvent(c, map[string]any{
+			"type":  "input_audio_buffer.append",
+			"audio": base64.StdEncoding.EncodeToString(pcm),
+		})
+		sendClientEvent(c, map[string]any{"type": "input_audio_buffer.commit"})
+	}
+
+	// collectUntilDone reads events until response.done (or timeout), returning
+	// the conversation.item.speaker event (nil if none) and whether the turn
+	// reached response.done.
+	collectUntilDone := func(c *websocket.Conn, timeout time.Duration) (speaker map[string]any, gotDone bool) {
+		deadline := time.Now().Add(timeout)
+		for time.Now().Before(deadline) {
+			evt := readServerEvent(c, time.Until(deadline))
+			switch evt["type"] {
+			case "conversation.item.speaker":
+				speaker = evt
+			case "response.done":
+				return speaker, true
+			}
+		}
+		return speaker, false
+	}
+
+	It("emits conversation.item.speaker naming an authorized speaker and still responds", func() {
+		c := open()
+		defer func() { _ = c.Close() }()
+
+		// Positive DC bias matches the enrolled reference speaker.
+		commit(c, pcmWithDC(300, 16000, 1000, 8000))
+		drainUntil(c, "input_audio_buffer.committed", 30*time.Second)
+
+		speaker, gotDone := collectUntilDone(c, 60*time.Second)
+		Expect(speaker).ToNot(BeNil(), "expected a conversation.item.speaker event")
+		Expect(speaker["item_id"]).ToNot(BeEmpty())
+
+		spk, ok := speaker["speaker"].(map[string]any)
+		Expect(ok).To(BeTrue(), "speaker payload should be an object")
+		Expect(spk["matched"]).To(Equal(true))
+		Expect(spk["name"]).To(Equal("e2e-speaker"))
+
+		Expect(gotDone).To(BeTrue(), "enforce:false should let the turn reach response.done")
+	})
+
+	It("emits an unknown speaker event and still responds when enforce is false", func() {
+		c := open()
+		defer func() { _ = c.Close() }()
+
+		// Negative DC bias is a different speaker that matches no reference.
+		commit(c, pcmWithDC(300, 16000, 1000, -8000))
+		drainUntil(c, "input_audio_buffer.committed", 30*time.Second)
+
+		speaker, gotDone := collectUntilDone(c, 60*time.Second)
+		Expect(speaker).ToNot(BeNil(), "announce_unknown should still emit the event")
+
+		spk, ok := speaker["speaker"].(map[string]any)
+		Expect(ok).To(BeTrue(), "speaker payload should be an object")
+		Expect(spk["matched"]).To(Equal(false))
+		// name is omitted for an unidentified speaker.
+		_, hasName := spk["name"]
+		Expect(hasName).To(BeFalse())
+
+		Expect(gotDone).To(BeTrue(), "enforce:false must not drop an unauthorized speaker")
+	})
+})

From 64560a974b9221650f0e6948ec7c4b16b7c9fb57 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Sun, 21 Jun 2026 23:31:17 +0200
Subject: [PATCH 32/99] chore(model gallery): :robot: add 1 new models via
 gallery agent (#10432)

chore(model gallery): :robot: add new models via gallery agent

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
---
 gallery/index.yaml | 56 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)

diff --git a/gallery/index.yaml b/gallery/index.yaml
index 612a2b6ea..1840e5d01 100644
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -1,4 +1,60 @@
 ---
+- name: "qwen3.6-27b-nvfp4-mtp"
+  url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
+  urls:
+    - https://huggingface.co/michaelw9999/Qwen3.6-27B-NVFP4-MTP-GGUF
+  description: |
+    # Qwen3.6-27B
+
+    [](https://chat.qwen.ai)
+
+    > [!Note]
+    > This repository contains model weights and configuration files for the post-trained model in the Hugging Face Transformers format.
+    >
+    > These artifacts are compatible with Hugging Face Transformers, vLLM, SGLang, KTransformers, etc.
+
+    Following the February release of the Qwen3.5 series, we're pleased to share the first open-weight variant of Qwen3.6. Built on direct feedback from the community, Qwen3.6 prioritizes stability and real-world utility, offering developers a more intuitive, responsive, and genuinely productive coding experience.
+
+    ## Qwen3.6 Highlights
+
+    This release delivers substantial upgrades, particularly in
+
+      - **Agentic Coding:** the model now handles frontend workflows and repository-level reasoning with greater fluency and precision.
+      - **Thinking Preservation:** we've introduced a new option to retain reasoning context from historical messages, streamlining iterative development and reducing overhead.
+
+    For more details, please refer to our blog post Qwen3.6-27B.
+
+    ## Model Overview
+
+    ...
+  tags:
+    - llm
+    - gguf
+  icon: https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3.6/Figures/qwen3.6_27b_score.png
+  overrides:
+    backend: llama-cpp
+    function:
+      automatic_tool_parsing_fallback: true
+      grammar:
+        disable: true
+    known_usecases:
+      - chat
+    options:
+      - use_jinja:true
+    parameters:
+      min_p: 0
+      model: llama-cpp/models/Qwen3.6-27B-NVFP4-MTP-GGUF/Qwen3.6-27B-NVFP4-MTP-GGUF.gguf
+      presence_penalty: 1.5
+      repeat_penalty: 1
+      temperature: 0.7
+      top_k: 20
+      top_p: 0.8
+    template:
+      use_tokenizer_template: true
+  files:
+    - filename: llama-cpp/models/Qwen3.6-27B-NVFP4-MTP-GGUF/Qwen3.6-27B-NVFP4-MTP-GGUF.gguf
+      sha256: d088e57e8c35ff62c2a420cb888dad3fd53c8db3ed9ead4286bd383224f81b50
+      uri: https://huggingface.co/michaelw9999/Qwen3.6-27B-NVFP4-MTP-GGUF/resolve/main/Qwen3.6-27B-NVFP4-MTP-GGUF.gguf
 - name: "gemma-4-12b-agentic-fable5-composer2.5-v2-3.5x-tau2"
   url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
   urls:

From 1f29e960305ea27de8f8335c474468f22b37dc14 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Sun, 21 Jun 2026 23:51:43 +0200
Subject: [PATCH 33/99] chore(model gallery): :robot: add 1 new models via
 gallery agent (#10433)

chore(model gallery): :robot: add new models via gallery agent

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
---
 gallery/index.yaml | 48 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/gallery/index.yaml b/gallery/index.yaml
index 1840e5d01..ed6b57abe 100644
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -1,4 +1,52 @@
 ---
+- name: "qwopus3.6-27b-coder-mtp-nvfp4"
+  url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
+  urls:
+    - https://huggingface.co/michaelw9999/Qwopus3.6-27B-Coder-MTP-NVFP4-GGUF
+  description: |
+    🪐 Qwopus-3.6-27B-Coder
+    Coder SFT Release
+
+    Agentic Coding &amp; Tool-Use Reasoning Model Fine-Tuned on Qwopus3.6-27B-v2
+
+    🧬 Trace Inversion & Negentropy
+    🧠 27B Dense Model
+    ⚡ Agentic Coding
+    🛠️ Tool Calling & Agent
+    🏆 SWE-bench Verified: 67.0% (off-thinking)
+
+    💡 What is Qwopus-3.6-27B-Coder?
+    🪐 Qwopus-3.6-27B-Coder is a reasoning-enhanced agentic coding model built on top of Qwopus3.6-27B-v2. It inherits the powerful reasoning foundation of the v2 base — which achieved 87.43% MMLU-Pro (300ex) and 75.25% SWE-bench Verified — and further specializes it for agentic code generation, structured tool calling, debugging, and instruction-following in developer workflows. The model is designed to excel at repository-level coding tasks, multi-turn tool orchestration, and complex logical reasoning under realistic agent environments.
+
+    🧩 Agentic Coding
+    Optimized for repository-level coding, debugging, patch generation, and structured multi-step development workflows.
+
+    🛠️ Tool Calling
+    Learns from real agent trajectories with tool definitions, tool calls, and environment feedback for robust multi-turn execution.
+
+    ...
+  tags:
+    - llm
+    - gguf
+  icon: https://cdn-uploads.huggingface.co/production/uploads/66309bd090589b7c65950665/sGQKmrMc6L6guMoaB5_Y2.png
+  overrides:
+    backend: llama-cpp
+    function:
+      automatic_tool_parsing_fallback: true
+      grammar:
+        disable: true
+    known_usecases:
+      - chat
+    options:
+      - use_jinja:true
+    parameters:
+      model: llama-cpp/models/Qwopus3.6-27B-Coder-MTP-NVFP4-GGUF/Qwopus3.6-27B-Coder-MTP-NVFP4-TURBO.gguf
+    template:
+      use_tokenizer_template: true
+  files:
+    - filename: llama-cpp/models/Qwopus3.6-27B-Coder-MTP-NVFP4-GGUF/Qwopus3.6-27B-Coder-MTP-NVFP4-TURBO.gguf
+      sha256: 1c163f0e1f29485d432b466b9e5e0593ea9b10c5a62cf3eb71b77fcfe41db46c
+      uri: https://huggingface.co/michaelw9999/Qwopus3.6-27B-Coder-MTP-NVFP4-GGUF/resolve/main/Qwopus3.6-27B-Coder-MTP-NVFP4-TURBO.gguf
 - name: "qwen3.6-27b-nvfp4-mtp"
   url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
   urls:

From 1a1bd57469f407f62fbb85d3e4b81cb592566a88 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Mon, 22 Jun 2026 00:46:56 +0200
Subject: [PATCH 34/99] chore(model gallery): :robot: add 1 new models via
 gallery agent (#10436)

chore(model gallery): :robot: add new models via gallery agent

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
---
 gallery/index.yaml | 45 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/gallery/index.yaml b/gallery/index.yaml
index ed6b57abe..7699acb0b 100644
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -1,4 +1,49 @@
 ---
+- name: "qwopus3.6-27b-v2-mtp-nvfp4"
+  url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
+  urls:
+    - https://huggingface.co/michaelw9999/Qwopus3.6-27B-v2-MTP-NVFP4-GGUF
+  description: |
+    🪐 Qwopus3.6-27B-v2-MTP
+    MTP Release
+
+    Multi-Token Prediction reasoning model fine-tuned from Qwen3.6-27B
+
+    🧬 Trace Inversion & Negentropy
+    🧠 27B Parameters
+    ⚡ Speculative Decoding
+    🛠️ Coding / DevOps / Math
+
+    💡 What is Qwopus3.6-27B-v2-MTP?
+    🪐 Qwopus3.6-27B-v2-MTP is a speed-oriented reasoning release built on top of Qwen3.6-27B. It keeps the Qwopus line's focus on reconstructed reasoning traces, coding discipline, DevOps procedures, and mathematical derivations, while adding Multi-Token Prediction for faster generation. The goal is simple: preserve the depth and structure of a 27B reasoning model while making real interactive use noticeably faster.
+
+    ⚡ MTP DecodingAuxiliary future-token prediction improves throughput on long reasoning, code, math, and strict-format prompts.
+    🧩 Structured ReasoningInherits the Qwopus training recipe built around reconstructed step-by-step reasoning trajectories.
+    🧪 GB10 TestedValidated on a 30-question local benchmark across Logic, Coding, DevOps, Math, and Edge tasks.
+    🚀 Practical SpeedDesigned for workflows where strong answers matter, but waiting several extra minutes per task does not.
+
+    ...
+  tags:
+    - llm
+    - gguf
+  overrides:
+    backend: llama-cpp
+    function:
+      automatic_tool_parsing_fallback: true
+      grammar:
+        disable: true
+    known_usecases:
+      - chat
+    options:
+      - use_jinja:true
+    parameters:
+      model: llama-cpp/models/Qwopus3.6-27B-v2-MTP-NVFP4-GGUF/Qwopus3.6-27B-v2-MTP-NVFP4-GGUF.gguf
+    template:
+      use_tokenizer_template: true
+  files:
+    - filename: llama-cpp/models/Qwopus3.6-27B-v2-MTP-NVFP4-GGUF/Qwopus3.6-27B-v2-MTP-NVFP4-GGUF.gguf
+      sha256: 2a0a36fd10374c2a85356121c7c315bda725c7eaca0b3ae14838567629c6924a
+      uri: https://huggingface.co/michaelw9999/Qwopus3.6-27B-v2-MTP-NVFP4-GGUF/resolve/main/Qwopus3.6-27B-v2-MTP-NVFP4-GGUF.gguf
 - name: "qwopus3.6-27b-coder-mtp-nvfp4"
   url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
   urls:

From f45c6acc5489a51fc74f51ef5bfb218749351ab6 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Mon, 22 Jun 2026 00:57:08 +0200
Subject: [PATCH 35/99] chore(model gallery): :robot: add 1 new models via
 gallery agent (#10437)

chore(model gallery): :robot: add new models via gallery agent

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
---
 gallery/index.yaml | 57 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)

diff --git a/gallery/index.yaml b/gallery/index.yaml
index 7699acb0b..fcf180e13 100644
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -1,4 +1,61 @@
 ---
+- name: "qwen3.6-35b-a3b-nvfp4-mtp"
+  url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
+  urls:
+    - https://huggingface.co/michaelw9999/Qwen3.6-35B-A3B-NVFP4-MTP-GGUF
+  description: |
+    # Qwen3.6-35B-A3B
+
+    [](https://chat.qwen.ai)
+
+    > [!Note]
+    > This repository contains model weights and configuration files for the post-trained model in the Hugging Face Transformers format.
+    >
+    > These artifacts are compatible with Hugging Face Transformers, vLLM, SGLang, KTransformers, etc.
+
+    Following the February release of the Qwen3.5 series, we're pleased to share the first open-weight variant of Qwen3.6. Built on direct feedback from the community, Qwen3.6 prioritizes stability and real-world utility, offering developers a more intuitive, responsive, and genuinely productive coding experience.
+
+    ## Qwen3.6 Highlights
+
+    This release delivers substantial upgrades, particularly in
+
+      - **Agentic Coding:** the model now handles frontend workflows and repository-level reasoning with greater fluency and precision.
+      - **Thinking Preservation:** we've introduced a new option to retain reasoning context from historical messages, streamlining iterative development and reducing overhead.
+
+    For more details, please refer to our blog post Qwen3.6-35B-A3B.
+
+    ## Model Overview
+
+    ...
+  tags:
+    - llm
+    - gguf
+    - qwen
+  icon: https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3.6/Figures/qwen3.6_35b_a3b_score.png
+  overrides:
+    backend: llama-cpp
+    function:
+      automatic_tool_parsing_fallback: true
+      grammar:
+        disable: true
+    known_usecases:
+      - chat
+    options:
+      - use_jinja:true
+    parameters:
+      min_p: 0
+      model: llama-cpp/models/Qwen3.6-35B-A3B-NVFP4-MTP-GGUF/Qwen3.6-35B-A3B-NVFP4-MTP-TURBO.gguf
+      presence_penalty: 1.5
+      repeat_penalty: 1
+      temperature: 0.7
+      top_k: 20
+      top_p: 0.8
+    template:
+      use_tokenizer_template: true
+  files:
+    - filename: llama-cpp/models/Qwen3.6-35B-A3B-NVFP4-MTP-GGUF/Qwen3.6-35B-A3B-NVFP4-MTP-TURBO.gguf
+      sha256: f3d2fdc74e3ef19925ccbf794b04d7f6f11fb12eba7722b7749219d0cc5c36ed
+      uri: https://huggingface.co/michaelw9999/Qwen3.6-35B-A3B-NVFP4-MTP-GGUF/resolve/main/Qwen3.6-35B-A3B-NVFP4-MTP-TURBO.gguf
 - name: "qwopus3.6-27b-v2-mtp-nvfp4"
   url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
   urls:

From 1cf1bf32e1f42f5d039c2a8c505eeea0a06f254c Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Mon, 22 Jun 2026 00:57:33 +0200
Subject: [PATCH 36/99] chore: :arrow_up: Update leejet/stable-diffusion.cpp to
 `b12098f5d09fc83da36e65c784f7bdb16a5a5ebf` (#10429)

:arrow_up: Update leejet/stable-diffusion.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
---
 backend/go/stablediffusion-ggml/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/go/stablediffusion-ggml/Makefile b/backend/go/stablediffusion-ggml/Makefile
index d6d03adab..f77baccad 100644
--- a/backend/go/stablediffusion-ggml/Makefile
+++ b/backend/go/stablediffusion-ggml/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
 
 # stablediffusion.cpp (ggml)
 STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
-STABLEDIFFUSION_GGML_VERSION?=7f0e728b7d42f2490dfa5dd9539082d904f2f6b2
+STABLEDIFFUSION_GGML_VERSION?=b12098f5d09fc83da36e65c784f7bdb16a5a5ebf
 
 CMAKE_ARGS+=-DGGML_MAX_NAME=128
 

From a88d9d2de3cd48300678a2b787f0f3e40e9a2842 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Mon, 22 Jun 2026 00:57:49 +0200
Subject: [PATCH 37/99] chore: :arrow_up: Update ikawrakow/ik_llama.cpp to
 `6c00e87ac84404af588ad2e65935bd6f079c696f` (#10430)

:arrow_up: Update ikawrakow/ik_llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
---
 backend/cpp/ik-llama-cpp/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/cpp/ik-llama-cpp/Makefile b/backend/cpp/ik-llama-cpp/Makefile
index 39fa7fa4e..f7875c54d 100644
--- a/backend/cpp/ik-llama-cpp/Makefile
+++ b/backend/cpp/ik-llama-cpp/Makefile
@@ -1,5 +1,5 @@
 
-IK_LLAMA_VERSION?=d47f484d299cafad2e606afc0d31677a91b242d0
+IK_LLAMA_VERSION?=6c00e87ac84404af588ad2e65935bd6f079c696f
 LLAMA_REPO?=https://github.com/ikawrakow/ik_llama.cpp
 
 CMAKE_ARGS?=

From ce8a3e9266dfb39804ff647a4096ebe54e36a3ad Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Mon, 22 Jun 2026 01:00:10 +0200
Subject: [PATCH 38/99] chore: :arrow_up: Update ServeurpersoCom/qwentts.cpp to
 `4536dcdce27c3764a93a06d6bf64026b124962f5` (#10431)

:arrow_up: Update ServeurpersoCom/qwentts.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
---
 backend/go/qwen3-tts-cpp/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/go/qwen3-tts-cpp/Makefile b/backend/go/qwen3-tts-cpp/Makefile
index 84c543af6..4015f790e 100644
--- a/backend/go/qwen3-tts-cpp/Makefile
+++ b/backend/go/qwen3-tts-cpp/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
 
 # qwentts.cpp version
 QWEN3TTS_REPO?=https://github.com/ServeurpersoCom/qwentts.cpp
-QWEN3TTS_CPP_VERSION?=26fcea5468e4069bc72d1f2fcc812c985e7361bb
+QWEN3TTS_CPP_VERSION?=4536dcdce27c3764a93a06d6bf64026b124962f5
 SO_TARGET?=libgoqwen3ttscpp.so
 
 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF

From 600dafd20b624219e82c2b370fdaf5fa0ad11dcb Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Mon, 22 Jun 2026 01:00:28 +0200
Subject: [PATCH 39/99] feat(ced): sound-event classification backend (CED
 audio tagger) (#10425)

* feat(ced): sketch sound-classification backend (CED audio tagger)

Wires ced.cpp (CED, 527-class AudioSet sound-event tagger; baby cry,
footsteps, glass, alarms, dog bark) into LocalAI as a Go/purego backend.

SKETCH (backend skeleton real; core REST wiring + CI/gallery is a checklist
in DESIGN.md):
- backend/backend.proto: new SoundDetection rpc + SoundClass messages
  (run `make protogen-go` to regenerate pkg/grpc/proto).
- backend/go/ced: main.go (purego dlopen libced.so + ced_capi.h),
  goced.go (Ced gRPC backend: Load + SoundDetection), Makefile
  (clone-at-pin CED_VERSION, ggml static-PIC shared build), run.sh,
  package.sh, .gitignore.
- DESIGN.md: REST /v1/audio/classification wiring (handler/route/capability
  registration checklist), gallery/index + CI registration, and a scoping
  note for the realtime/websocket live-recognition path (sliding-window
  classify over the existing ws transport + voicegate; the ced C-API
  per-PCM entry point is already window-friendly).

Backend code does not compile until protogen-go regenerates the pb types
and a libced.so is built (Makefile clones+builds it).

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(ced): REST /v1/audio/classification endpoint + capability registration

Wires the ced sound-event classification backend (AudioSet audio tagger)
end to end through the REST surface, mirroring the transcription path.

- Handler: core/http/endpoints/openai/sound_classification.go parses the
  multipart audio upload, temp-files it, resolves the model config and
  calls the SoundDetection RPC; returns {model, detections[]} JSON.
- Backend wrapper: core/backend/sound_classification.go (ModelSoundDetection)
  loads the model and normalizes the proto response into schema types.
- Schema: core/schema/sound_classification.go (SoundClassificationResult).
- gRPC layer: SoundDetection wired through the LocalAI wrapper (interface,
  Backend client, Client, embed, server, base default) so the loader-typed
  client exposes the RPC; proto regenerated via make protogen-go.
- Route: POST /v1/audio/classification (+ /audio/classification alias) with
  the audio/multipart default-model middleware in routes/openai.go.
- Capability surfaces: swagger @Tags/@Router on the handler; FLAG_SOUND_
  CLASSIFICATION usecase flag + UsecaseSoundClassification + UsecaseInfoMap +
  GuessUsecases + ModalityGroups + GetAllModelConfigUsecases; meta usecase
  option; /api/instructions audio area updated; auth RouteFeatureRegistry +
  FeatureAudioClassification (APIFeatures, default ON) + FeatureMetas; UI
  usecaseFilters, capabilities.js CAP_SOUND_CLASSIFICATION, Models.jsx filter
  + i18n; docs page features/audio-classification.md + whats-new + crosslink.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(ced): realtime sound-event detection over the websocket API

When a realtime pipeline configures a sound-classification model, each
VAD-committed utterance (the same window the transcription path produces)
is also run through the CED sound-event classifier and the scored AudioSet
tags are emitted as a new server event. No new backend rpc is needed: the
SoundDetection gRPC method already exists on this branch.

- config: add Pipeline.SoundDetection (yaml/json sound_detection,omitempty)
  beside Transcription/VAD.
- realtime: add Model.SoundDetection(ctx, audio, topK, threshold) to the
  ModelInterface; implement it on wrappedModel and transcriptOnlyModel by
  calling backend.ModelSoundDetection with the session's sound-classification
  model config (mirrors how Transcribe dispatches). Load the optional config
  in newModel / newTranscriptionOnlyModel; nil config keeps it additive.
- types: add ConversationItemSoundDetectionEvent (item_id, content_index,
  detections[]{label,score,index}) with type conversation.item.sound_detection,
  its ServerEventType constant and MarshalJSON, mirroring the transcription
  completed event.
- realtime: add emitSoundDetection (unary path: classify the committed window,
  build the event, t.SendEvent) and wire it at the utterance-commit hook right
  after emitTranscription; gated on session.SoundDetectionEnabled (resolved
  from Pipeline.SoundDetection at session setup, defaults top_k=5, threshold=0).
  Its error is logged via xlog but never aborts the turn.
- test: Ginkgo specs for emitSoundDetection (tags emitted, empty detections,
  classifier error) plus a SoundDetection method on the fakeModel double.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix(ced): implement SoundDetection in nodes backend test doubles

The SoundDetection method added to the grpc backend interface left two
test doubles (fakeBackendClient, fakeGRPCBackend) incomplete, so
core/services/nodes failed to compile under `go vet`/`go test` (go build
missed it: the doubles live in _test.go). Add the method to both,
mirroring their existing Detect mock. Repairs CI for the nodes package.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(ced): decouple realtime sound detection from VAD (sound-only sessions)

Sound-event detection must activate on sounds, not speech, so it no longer
runs through the voice VAD/transcription path. A sound-detection-only
pipeline (sound_detection set, no transcription/LLM) now:

- is accepted by prepareRealtimeConfig (sound_detection counts as a pipeline
  stage),
- builds a lightweight model via newSoundDetectionOnlyModel (no VAD/STT/LLM/TTS
  loaded), and
- defaults the session to turn_detection none (no VAD) with no transcription
  stage, so the client drives windowing via input_audio_buffer.commit
  (option A: client-side sliding window). The per-PCM C-API already supports
  arbitrary windows.

commitUtterance gains a sound-only branch: it emits the
conversation.item.sound_detection event (scored AudioSet tags) and stops -
no transcription, no LLM response. generateResponse is now guarded on a
transcription stage being present, so a sound-only turn never invokes the LLM.

Existing transcription/VAD sessions are unchanged (additive). Added a
commitUtterance sound-only Ginkgo spec asserting it emits the sound event and
neither transcribes nor generates a response. go vet + golangci-lint
(new-from-merge-base) clean; openai suite green.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(ced): register sound-classification backend in gallery + CI

Mechanical backend-image registration for the ced sound-event classifier,
mirroring the parakeet-cpp Go/purego backend everywhere it is wired up.

- .github/backend-matrix.yml: add the ced build matrix, field-for-field copies
  of the parakeet-cpp entries (cpu amd64/arm64, cublas cuda 12/13 amd64,
  l4t cuda-13 arm64, l4t-jetpack cuda-12 arm64, sycl f32/f16, vulkan
  amd64/arm64, rocm hipblas, and the metal darwin entry), changing only
  backend and tag-suffix. dockerfile stays ./backend/Dockerfile.golang.
- backend/index.yaml: add the &ced meta anchor (capabilities map per platform)
  plus ced-development and the per-arch image entries, each uri/mirror
  tag-suffix matching the matrix exactly. The model gallery (GGUF) entry is
  intentionally deferred pending the HuggingFace publish (TODO note inline).
- scripts/changed-backends.js: add an explicit item.backend === "ced" branch in
  inferBackendPath mapping to backend/go/ced/, same mechanism and ordering as
  the parakeet-cpp branch (before the generic golang fallthrough).
- .github/workflows/bump_deps.yaml: register mudler/ced.cpp -> CED_VERSION in
  backend/go/ced/Makefile so the daily bot bumps the pin.
- swagger/{docs.go,swagger.json,swagger.yaml}: regenerated via make swagger so
  the existing /v1/audio/classification annotations land in the generated spec.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(ced): server-side windowing for realtime sound detection (option B)

Adds an optional server-driven sliding-window classifier so a sound-only
realtime client only has to stream audio (no input_audio_buffer.commit):

- Pipeline.sound_detection_window_ms / sound_detection_hop_ms config knobs.
  When both > 0 on a sound-only session, the server classifies the last
  window of streamed audio every hop and emits a conversation.item.sound_
  detection event; the input buffer is trimmed to one window so a long
  stream stays bounded. When unset, the session stays client-driven
  (option A). Runs independent of VAD (sound events are not speech).
- handleSoundWindow (ticker) + classifySoundWindow (one tick, extracted so
  it is unit-testable) + writeWindowWAV, which declares the true
  InputSampleRate (NewWAVHeaderWithRate) so the classifier resamples
  correctly. Goroutine is started after toggleVAD and torn down with the
  session (close + wg.Wait).
- Register pipeline.sound_detection (+window_ms/hop_ms) in the config meta
  registry; the earlier realtime commit added pipeline.sound_detection
  without a registry entry, failing TestAllFieldsHaveRegistryEntries. This
  fixes that and covers the two new knobs.

Tests: classifySoundWindow emits an event + trims the buffer to one window,
no-ops on too-little audio; writeWindowWAV declares the given sample rate.
go build/vet + golangci-lint (new-from-merge-base) clean; config + openai
suites green.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(ced): add ced-base GGUF model gallery entries (f16 + q8_0)

The ced-base weights are now published at mudler/ced-base-gguf (Apache-2.0,
converted from mispeech/ced-base). Adds gallery/ced.yaml (backend: ced +
known_usecases: sound_classification) and two gallery/index.yaml entries
(ced-base-f16 default, ced-base-q8 smallest) with sha256-pinned files, and
removes the now-resolved TODO from backend/index.yaml.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(ced): add tiny/mini/small GGUF model gallery entries

Publishes the rest of the CED family (same architecture, metadata-driven port
verified end-to-end on ced-tiny) to mudler/ced-{tiny,mini,small}-gguf and adds
their f16 + q8_0 gallery entries:

  ced-tiny  (5.5M, edge/Pi-class)  f16 11MB / q8_0 6MB
  ced-mini  (9.6M)                 f16 19MB / q8_0 11MB
  ced-small (22M)                  f16 42MB / q8_0 23MB

All sha256-pinned. ced-base remains the accuracy default.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* chore(ced): point gallery entries at the consolidated mudler/ced-gguf repo

All CED quantizations (tiny/mini/small/base, f16/q8_0) now live in a single
HuggingFace repo, mudler/ced-gguf, instead of per-model repos. Repoint the 8
gallery model entries' urls + file uris accordingly. sha256 and filenames are
unchanged.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* chore(ced): bump CED_VERSION to the short-clip fix

Pin the ced backend to ced.cpp 99c6ed3, which fixes a crash on any clip
shorter than target_length (~10.11s): time_pos_embed was added at its full
63-frame grid instead of being sliced to the clip's actual time grid, tripping
ggml_can_repeat in ggml_add. Surfaced by the live realtime e2e (sub-10s
windows) and gated with a short-clip parity test upstream.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* docs(ced): list ced.cpp as a LocalAI-team engine + backend-guide directive

- README.md: add ced.cpp to the "native C/C++/GGML engines developed and
  maintained by the LocalAI project" table.
- docs/content/features/backends.md: add a Sound Classification backend
  category (sound-event classification / audio tagging) listing ced.cpp.
- .agents/adding-backends.md: add a "Documenting the backend" section and two
  verification-checklist items requiring new backends to be documented in the
  backends.md category list, and in-house native engines to be added to the
  README maintained-engines table. This directive was missing.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* chore(ced): repin CED_VERSION to the v0.1.0 release commit

ced.cpp history was squashed into a single release commit (tagged v0.1.0), so
the previous pin (99c6ed3) no longer exists upstream. Pin to c04ac14, the
v0.1.0 release commit, so the backend builds against a commit that exists.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix(ced): silence gosec G304/G103 + govet unsafeptr on audited paths

- sound_classification.go: os.Create(dst) where dst = temp dir + path.Base of
  the upload (no traversal). #nosec G304, matching the depth-anything-cpp handler.
- goced.go: reading a NUL-terminated C string from a libced-owned buffer.
  #nosec G103 (gosec) + //nolint:govet (golangci-lint's unsafeptr check), since
  the uintptr is a C-owned malloc'd buffer, not Go-GC memory.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
---
 .agents/adding-backends.md                    |  23 ++
 .github/backend-matrix.yml                    | 152 +++++++++++
 .github/workflows/bump_deps.yaml              |   4 +
 README.md                                     |   1 +
 backend/backend.proto                         |  21 ++
 backend/go/ced/.gitignore                     |  11 +
 backend/go/ced/Makefile                       |  77 ++++++
 backend/go/ced/goced.go                       | 130 +++++++++
 backend/go/ced/main.go                        |  59 ++++
 backend/go/ced/package.sh                     |  60 ++++
 backend/go/ced/run.sh                         |  15 +
 backend/index.yaml                            | 146 ++++++++++
 core/backend/sound_classification.go          |  88 ++++++
 core/config/backend_capabilities.go           |  49 ++--
 core/config/meta/constants.go                 |   1 +
 core/config/meta/registry.go                  |  24 ++
 core/config/model_config.go                   |  88 ++++--
 core/http/auth/features.go                    |   5 +
 core/http/auth/permissions.go                 |  39 +--
 .../endpoints/localai/api_instructions.go     |   4 +-
 core/http/endpoints/openai/realtime.go        | 257 ++++++++++++++----
 .../endpoints/openai/realtime_doubles_test.go |  12 +
 core/http/endpoints/openai/realtime_model.go  | 100 ++++++-
 .../openai/realtime_sound_detection.go        |  48 ++++
 .../openai/realtime_sound_detection_test.go   | 170 ++++++++++++
 .../endpoints/openai/sound_classification.go  |  91 +++++++
 .../endpoints/openai/types/server_events.go   |  50 ++++
 .../react-ui/public/locales/en/models.json    |   1 +
 core/http/react-ui/src/pages/Models.jsx       |   1 +
 core/http/react-ui/src/utils/capabilities.js  |   1 +
 core/http/routes/localai.go                   |  17 +-
 core/http/routes/openai.go                    |  17 ++
 core/http/routes/ui_api.go                    |  31 ++-
 core/schema/sound_classification.go           |  19 ++
 core/services/nodes/health_mock_test.go       |   3 +
 core/services/nodes/inflight_test.go          |   3 +
 docs/content/features/audio-classification.md |  55 ++++
 docs/content/features/audio-diarization.md    |   4 +
 docs/content/features/backends.md             |   1 +
 docs/content/whats-new.md                     |   1 +
 gallery/ced.yaml                              |   7 +
 gallery/index.yaml                            | 184 +++++++++++++
 pkg/grpc/backend.go                           |   2 +
 pkg/grpc/base/base.go                         |   4 +
 pkg/grpc/client.go                            |  18 ++
 pkg/grpc/embed.go                             |   4 +
 pkg/grpc/interface.go                         |   1 +
 pkg/grpc/server.go                            |   8 +
 scripts/changed-backends.js                   |   7 +
 swagger/docs.go                               |  75 +++++
 swagger/swagger.json                          |  75 +++++
 swagger/swagger.yaml                          |  49 ++++
 52 files changed, 2161 insertions(+), 152 deletions(-)
 create mode 100644 backend/go/ced/.gitignore
 create mode 100644 backend/go/ced/Makefile
 create mode 100644 backend/go/ced/goced.go
 create mode 100644 backend/go/ced/main.go
 create mode 100755 backend/go/ced/package.sh
 create mode 100755 backend/go/ced/run.sh
 create mode 100644 core/backend/sound_classification.go
 create mode 100644 core/http/endpoints/openai/realtime_sound_detection.go
 create mode 100644 core/http/endpoints/openai/realtime_sound_detection_test.go
 create mode 100644 core/http/endpoints/openai/sound_classification.go
 create mode 100644 core/schema/sound_classification.go
 create mode 100644 docs/content/features/audio-classification.md
 create mode 100644 gallery/ced.yaml

diff --git a/.agents/adding-backends.md b/.agents/adding-backends.md
index 4a37a298e..ab965f789 100644
--- a/.agents/adding-backends.md
+++ b/.agents/adding-backends.md
@@ -198,6 +198,27 @@ docker-build-backends: ... docker-build-<backend-name>
 - If the backend is in `backend/python/<backend-name>/` but uses `.` as context in the workflow file, use `.` context
 - Check similar backends to determine the correct context
 
+## Documenting the backend (README + docs)
+
+A backend is not "added" until it is discoverable. Update the user-facing docs:
+
+- **`docs/content/features/backends.md`** - add the backend to the right
+  category in the "LocalAI supports various types of backends" list (and add a
+  new category if it introduces a new modality, e.g. sound classification).
+- If the backend introduces a **new API surface** (a new endpoint or a realtime
+  capability), document it under `docs/content/` where its area lives (audio,
+  vision, etc.) and follow the api-endpoints checklist in
+  [api-endpoints-and-auth.md](api-endpoints-and-auth.md).
+
+**If the backend is a native C/C++/GGML engine created and maintained by the
+LocalAI team** (a from-scratch port like `parakeet.cpp`, `ced.cpp`,
+`vibevoice.cpp`, `rf-detr.cpp`, not a wrapper around a third-party runtime), it
+ALSO belongs in the top-level **`README.md`** table under "native C/C++/GGML
+engines ... developed and maintained by the LocalAI project itself". Add a row
+linking the upstream engine repo with a one-line description. This is the
+project's showcase of its own engines; a new in-house backend that is missing
+from it is a documentation bug.
+
 ## 5. Verification Checklist
 
 After adding a new backend, verify:
@@ -211,6 +232,8 @@ After adding a new backend, verify:
 - [ ] No YAML syntax errors (check with linter)
 - [ ] No Makefile syntax errors (check with linter)
 - [ ] Follows the same pattern as similar backends (e.g., if it's a transcription backend, follow `faster-whisper` pattern)
+- [ ] Documented: added to the category list in `docs/content/features/backends.md` (and any new endpoint/realtime capability documented under `docs/content/`)
+- [ ] If it is an in-house native C/C++/GGML engine, added to the maintained-engines table in the top-level `README.md`
 
 ## Bundling runtime shared libraries (`package.sh`)
 
diff --git a/.github/backend-matrix.yml b/.github/backend-matrix.yml
index c2c6638ec..593e44cde 100644
--- a/.github/backend-matrix.yml
+++ b/.github/backend-matrix.yml
@@ -3575,6 +3575,154 @@ include:
     dockerfile: "./backend/Dockerfile.golang"
     context: "./"
     ubuntu-version: '2404'
+  # ced
+  - build-type: 'cublas'
+    cuda-major-version: "12"
+    cuda-minor-version: "8"
+    platforms: 'linux/amd64'
+    tag-latest: 'auto'
+    tag-suffix: '-gpu-nvidia-cuda-12-ced'
+    runs-on: 'ubuntu-latest'
+    base-image: "ubuntu:24.04"
+    skip-drivers: 'false'
+    backend: "ced"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
+  - build-type: 'cublas'
+    cuda-major-version: "13"
+    cuda-minor-version: "0"
+    platforms: 'linux/amd64'
+    tag-latest: 'auto'
+    tag-suffix: '-gpu-nvidia-cuda-13-ced'
+    runs-on: 'ubuntu-latest'
+    base-image: "ubuntu:24.04"
+    skip-drivers: 'false'
+    backend: "ced"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
+  - build-type: 'cublas'
+    cuda-major-version: "13"
+    cuda-minor-version: "0"
+    platforms: 'linux/arm64'
+    skip-drivers: 'false'
+    tag-latest: 'auto'
+    tag-suffix: '-nvidia-l4t-cuda-13-arm64-ced'
+    base-image: "ubuntu:24.04"
+    ubuntu-version: '2404'
+    runs-on: 'ubuntu-24.04-arm'
+    backend: "ced"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+  - build-type: ''
+    cuda-major-version: ""
+    cuda-minor-version: ""
+    platforms: 'linux/amd64'
+    platform-tag: 'amd64'
+    tag-latest: 'auto'
+    tag-suffix: '-cpu-ced'
+    runs-on: 'ubuntu-latest'
+    base-image: "ubuntu:24.04"
+    skip-drivers: 'false'
+    backend: "ced"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
+  - build-type: ''
+    cuda-major-version: ""
+    cuda-minor-version: ""
+    platforms: 'linux/arm64'
+    platform-tag: 'arm64'
+    tag-latest: 'auto'
+    tag-suffix: '-cpu-ced'
+    runs-on: 'ubuntu-24.04-arm'
+    base-image: "ubuntu:24.04"
+    skip-drivers: 'false'
+    backend: "ced"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
+  - build-type: 'sycl_f32'
+    cuda-major-version: ""
+    cuda-minor-version: ""
+    platforms: 'linux/amd64'
+    tag-latest: 'auto'
+    tag-suffix: '-gpu-intel-sycl-f32-ced'
+    runs-on: 'ubuntu-latest'
+    base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
+    skip-drivers: 'false'
+    backend: "ced"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
+  - build-type: 'sycl_f16'
+    cuda-major-version: ""
+    cuda-minor-version: ""
+    platforms: 'linux/amd64'
+    tag-latest: 'auto'
+    tag-suffix: '-gpu-intel-sycl-f16-ced'
+    runs-on: 'ubuntu-latest'
+    base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
+    skip-drivers: 'false'
+    backend: "ced"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
+  - build-type: 'vulkan'
+    cuda-major-version: ""
+    cuda-minor-version: ""
+    platforms: 'linux/amd64'
+    platform-tag: 'amd64'
+    tag-latest: 'auto'
+    tag-suffix: '-gpu-vulkan-ced'
+    runs-on: 'ubuntu-latest'
+    base-image: "ubuntu:24.04"
+    skip-drivers: 'false'
+    backend: "ced"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
+  - build-type: 'vulkan'
+    cuda-major-version: ""
+    cuda-minor-version: ""
+    platforms: 'linux/arm64'
+    platform-tag: 'arm64'
+    tag-latest: 'auto'
+    tag-suffix: '-gpu-vulkan-ced'
+    runs-on: 'ubuntu-24.04-arm'
+    base-image: "ubuntu:24.04"
+    skip-drivers: 'false'
+    backend: "ced"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
+  - build-type: 'cublas'
+    cuda-major-version: "12"
+    cuda-minor-version: "0"
+    platforms: 'linux/arm64'
+    skip-drivers: 'false'
+    tag-latest: 'auto'
+    tag-suffix: '-nvidia-l4t-arm64-ced'
+    base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
+    runs-on: 'ubuntu-24.04-arm'
+    backend: "ced"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2204'
+  - build-type: 'hipblas'
+    cuda-major-version: ""
+    cuda-minor-version: ""
+    platforms: 'linux/amd64'
+    tag-latest: 'auto'
+    tag-suffix: '-gpu-rocm-hipblas-ced'
+    base-image: "rocm/dev-ubuntu-24.04:7.2.1"
+    runs-on: 'ubuntu-latest'
+    skip-drivers: 'false'
+    backend: "ced"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
   # acestep-cpp
   - build-type: ''
     cuda-major-version: ""
@@ -4754,6 +4902,10 @@ includeDarwin:
     tag-suffix: "-metal-darwin-arm64-parakeet-cpp"
     build-type: "metal"
     lang: "go"
+  - backend: "ced"
+    tag-suffix: "-metal-darwin-arm64-ced"
+    build-type: "metal"
+    lang: "go"
   - backend: "acestep-cpp"
     tag-suffix: "-metal-darwin-arm64-acestep-cpp"
     build-type: "metal"
diff --git a/.github/workflows/bump_deps.yaml b/.github/workflows/bump_deps.yaml
index 6dbf8dcf2..481c9a609 100644
--- a/.github/workflows/bump_deps.yaml
+++ b/.github/workflows/bump_deps.yaml
@@ -42,6 +42,10 @@ jobs:
             variable: "PARAKEET_VERSION"
             branch: "master"
             file: "backend/go/parakeet-cpp/Makefile"
+          - repository: "mudler/ced.cpp"
+            variable: "CED_VERSION"
+            branch: "master"
+            file: "backend/go/ced/Makefile"
           - repository: "mudler/depth-anything.cpp"
             variable: "DEPTHANYTHING_VERSION"
             branch: "master"
diff --git a/README.md b/README.md
index 5fff7db69..f7843950d 100644
--- a/README.md
+++ b/README.md
@@ -231,6 +231,7 @@ Most backends wrap a best-in-class upstream engine. A handful of them are native
 | Backend | What it does |
 |---------|-------------|
 | [parakeet.cpp](https://github.com/mudler/parakeet.cpp) | C++/GGML port of NVIDIA NeMo Parakeet ASR (tdt/ctc/rnnt/hybrid), with cache-aware streaming transcription |
+| [ced.cpp](https://github.com/mudler/ced.cpp) | C++/GGML port of the CED audio-tagging models: sound-event classification (527-class AudioSet) over REST and the realtime API for live recognition |
 | [voxtral.c](https://github.com/mudler/voxtral.c) | Voxtral Realtime 4B speech-to-text in pure C |
 | [vibevoice.cpp](https://github.com/mudler/vibevoice.cpp) | Native port of Microsoft VibeVoice for TTS (voice cloning) and long-form ASR with speaker diarization |
 | [rf-detr.cpp](https://github.com/mudler/rf-detr.cpp) | Native RF-DETR object detection and instance segmentation |
diff --git a/backend/backend.proto b/backend/backend.proto
index 68db81e35..2a575426e 100644
--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -24,6 +24,9 @@ service Backend {
   rpc TokenizeString(PredictOptions) returns (TokenizationResponse) {}
   rpc Status(HealthMessage) returns (StatusResponse) {}
   rpc Detect(DetectOptions) returns (DetectResponse) {}
+  // SoundDetection runs an audio-tagging / sound-event-classification model
+  // (e.g. CED over the AudioSet ontology) on a clip and returns scored labels.
+  rpc SoundDetection(SoundDetectionRequest) returns (SoundDetectionResponse) {}
   rpc Depth(DepthRequest) returns (DepthResponse) {}
   rpc FaceVerify(FaceVerifyRequest) returns (FaceVerifyResponse) {}
   rpc FaceAnalyze(FaceAnalyzeRequest) returns (FaceAnalyzeResponse) {}
@@ -671,6 +674,24 @@ message DetectResponse {
   repeated Detection Detections = 1;
 }
 
+// --- Sound-event classification / audio tagging messages (CED) ---
+
+message SoundDetectionRequest {
+  string src = 1;       // audio file path (LocalAI writes the upload to disk)
+  int32 top_k = 2;      // number of top tags to return (0 = all classes)
+  float threshold = 3;  // optional: drop tags scoring below this
+}
+
+message SoundClass {
+  string label = 1;     // AudioSet class name, e.g. "Baby cry, infant cry"
+  float score = 2;      // per-class probability (multi-label, independent)
+  int32 index = 3;      // class index in the model ontology
+}
+
+message SoundDetectionResponse {
+  repeated SoundClass detections = 1;  // score-descending
+}
+
 // --- Depth estimation messages (Depth Anything 3) ---
 
 message DepthRequest {
diff --git a/backend/go/ced/.gitignore b/backend/go/ced/.gitignore
new file mode 100644
index 000000000..5e47da6c5
--- /dev/null
+++ b/backend/go/ced/.gitignore
@@ -0,0 +1,11 @@
+.cache/
+sources/
+build/
+package/
+ced-grpc
+# build artifacts staged in-tree by the Makefile (cp from sources/) or
+# symlinked for local dev; the real sources live in ced.cpp upstream.
+*.so
+*.so.*
+ced_capi.h
+compile_commands.json
diff --git a/backend/go/ced/Makefile b/backend/go/ced/Makefile
new file mode 100644
index 000000000..632c0e255
--- /dev/null
+++ b/backend/go/ced/Makefile
@@ -0,0 +1,77 @@
+# ced sound-classification backend Makefile.
+#
+# Upstream pin lives below as CED_VERSION?=<sha> so .github/bump_deps.sh can find
+# and update it (matches the parakeet-cpp / whisper.cpp convention).
+#
+# Local dev shortcut: symlink an out-of-tree ced.cpp shared build + header and
+# skip the clone/cmake steps entirely:
+#   ln -sf /path/to/ced.cpp/build-shared/libced.so .
+#   ln -sf /path/to/ced.cpp/include/ced_capi.h .
+#   go build -o ced-grpc .
+
+CED_VERSION?=c04ac14b7992d00584d9e812c9bb6268598a6ce7
+CED_REPO?=https://github.com/mudler/ced.cpp
+
+GOCMD?=go
+GO_TAGS?=
+JOBS?=$(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)
+
+BUILD_TYPE?=
+NATIVE?=false
+
+# Static-link ggml into libced.so (PIC) so the shared lib is self-contained:
+# dlopen needs no libggml*.so alongside it, only system libs the runtime image
+# already provides.
+CMAKE_ARGS?=-DCMAKE_BUILD_TYPE=Release -DCED_SHARED=ON -DCED_BUILD_CLI=OFF -DCED_BUILD_TESTS=OFF -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+
+ifeq ($(NATIVE),false)
+	CMAKE_ARGS+=-DGGML_NATIVE=OFF
+endif
+
+# ced.cpp gates its ggml backends behind CED_GGML_* options (set(... CACHE BOOL
+# "" FORCE)), so forward those instead of a bare -DGGML_CUDA=ON.
+ifeq ($(BUILD_TYPE),cublas)
+	CMAKE_ARGS+=-DCED_GGML_CUDA=ON -DGGML_CUDA_GRAPHS=ON
+else ifeq ($(BUILD_TYPE),openblas)
+	CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
+else ifeq ($(BUILD_TYPE),hipblas)
+	CMAKE_ARGS+=-DCED_GGML_HIP=ON
+else ifeq ($(BUILD_TYPE),vulkan)
+	CMAKE_ARGS+=-DCED_GGML_VULKAN=ON
+endif
+
+.PHONY: ced-grpc package build clean purge test all
+
+all: ced-grpc
+
+sources/ced.cpp:
+	mkdir -p sources/ced.cpp
+	cd sources/ced.cpp && \
+	git init -q && \
+	git remote add origin $(CED_REPO) && \
+	git fetch --depth 1 origin $(CED_VERSION) && \
+	git checkout FETCH_HEAD && \
+	git submodule update --init --recursive --depth 1 --single-branch
+
+libced.so: sources/ced.cpp
+	cmake -B sources/ced.cpp/build-shared -S sources/ced.cpp $(CMAKE_ARGS)
+	cmake --build sources/ced.cpp/build-shared --config Release -j$(JOBS)
+	cp -fv sources/ced.cpp/build-shared/libced.so* ./ 2>/dev/null || true
+	cp -fv sources/ced.cpp/include/ced_capi.h ./
+
+ced-grpc: libced.so main.go goced.go
+	CGO_ENABLED=0 $(GOCMD) build -tags "$(GO_TAGS)" -o ced-grpc .
+
+package: ced-grpc
+	bash package.sh
+
+build: package
+
+test:
+	LD_LIBRARY_PATH=$(CURDIR):$$LD_LIBRARY_PATH $(GOCMD) test ./... -count=1
+
+clean: purge
+	rm -rf libced.so* ced_capi.h package ced-grpc
+
+purge:
+	rm -rf sources/ced.cpp
diff --git a/backend/go/ced/goced.go b/backend/go/ced/goced.go
new file mode 100644
index 000000000..a405bf017
--- /dev/null
+++ b/backend/go/ced/goced.go
@@ -0,0 +1,130 @@
+package main
+
+// Go side of the ced backend: purego bindings over ced_capi.h plus the gRPC
+// SoundDetection implementation.
+//
+// SKETCH: the pb.SoundDetection* types come from backend.proto (regenerate with
+// `make protogen-go`). The C side is single-threaded per ctx, so we guard the
+// engine with engineMu; LocalAI also serializes via base.SingleThread.
+import (
+	"context"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"sort"
+	"sync"
+	"unsafe"
+
+	"github.com/mudler/LocalAI/pkg/grpc/base"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+)
+
+// purego-bound entry points from libced.so. Names match ced_capi.h exactly.
+var (
+	CppAbiVersion       func() int32
+	CppLoad             func(ggufPath string) uintptr
+	CppFree             func(ctx uintptr)
+	CppLastError        func(ctx uintptr) string
+	CppNumClasses       func(ctx uintptr) int32
+	CppSampleRate       func(ctx uintptr) int32
+	CppClassifyPathJSON func(ctx uintptr, wavPath string, topK int32) uintptr
+	CppClassifyPcmJSON  func(ctx uintptr, pcm []float32, nSamples int32, sampleRate int32, topK int32) uintptr
+	CppFreeString       func(s uintptr)
+)
+
+// cstr copies a malloc'd C string (returned as uintptr) into a Go string and
+// frees the original via ced_capi_free_string. Empty/0 -> "".
+func cstr(p uintptr) string {
+	if p == 0 {
+		return ""
+	}
+	defer CppFreeString(p)
+	var b []byte
+	for i := 0; ; i++ {
+		ch := *(*byte)(unsafe.Pointer(p + uintptr(i))) //nolint:govet // #nosec G103 -- C-owned NUL-terminated string from libced (not Go-GC memory)
+		if ch == 0 {
+			break
+		}
+		b = append(b, ch)
+	}
+	return string(b)
+}
+
+// Ced is the gRPC backend. One loaded CED model per instance.
+type Ced struct {
+	base.Base
+	ctxPtr   uintptr
+	engineMu sync.Mutex
+}
+
+// Load resolves the GGUF and opens the C-API context.
+func (c *Ced) Load(opts *pb.ModelOptions) error {
+	if opts.ModelFile == "" {
+		return errors.New("ced: ModelFile is required")
+	}
+	ctx := CppLoad(opts.ModelFile)
+	if ctx == 0 {
+		return fmt.Errorf("ced: ced_capi_load failed for %q: %s", opts.ModelFile, CppLastError(0))
+	}
+	c.ctxPtr = ctx
+	return nil
+}
+
+// jsonTag mirrors the ced_capi JSON tag objects.
+type jsonTag struct {
+	Index int     `json:"index"`
+	Score float32 `json:"score"`
+	Label string  `json:"label"`
+}
+
+// SoundDetection classifies the clip at req.Src and returns scored AudioSet tags.
+func (c *Ced) SoundDetection(ctx context.Context, req *pb.SoundDetectionRequest) (*pb.SoundDetectionResponse, error) {
+	if c.ctxPtr == 0 {
+		return nil, errors.New("ced: model not loaded")
+	}
+	if req.GetSrc() == "" {
+		return nil, errors.New("ced: SoundDetectionRequest.src (audio path) is required")
+	}
+	topK := req.GetTopK()
+	if topK <= 0 {
+		topK = 10 // sensible default for a tagging response
+	}
+
+	c.engineMu.Lock()
+	out := cstr(CppClassifyPathJSON(c.ctxPtr, req.GetSrc(), topK))
+	lastErr := CppLastError(c.ctxPtr)
+	c.engineMu.Unlock()
+
+	if out == "" {
+		return nil, fmt.Errorf("ced: classification failed: %s", lastErr)
+	}
+	var tags []jsonTag
+	if err := json.Unmarshal([]byte(out), &tags); err != nil {
+		return nil, fmt.Errorf("ced: bad classifier JSON: %w", err)
+	}
+
+	thr := req.GetThreshold()
+	resp := &pb.SoundDetectionResponse{}
+	for _, t := range tags {
+		if t.Score < thr {
+			continue
+		}
+		resp.Detections = append(resp.Detections, &pb.SoundClass{
+			Label: t.Label, Score: t.Score, Index: int32(t.Index),
+		})
+	}
+	sort.Slice(resp.Detections, func(i, j int) bool {
+		return resp.Detections[i].Score > resp.Detections[j].Score
+	})
+	return resp, nil
+}
+
+func (c *Ced) Free() error {
+	c.engineMu.Lock()
+	defer c.engineMu.Unlock()
+	if c.ctxPtr != 0 {
+		CppFree(c.ctxPtr)
+		c.ctxPtr = 0
+	}
+	return nil
+}
diff --git a/backend/go/ced/main.go b/backend/go/ced/main.go
new file mode 100644
index 000000000..ea8aa8549
--- /dev/null
+++ b/backend/go/ced/main.go
@@ -0,0 +1,59 @@
+package main
+
+// ced sound-classification backend. Started internally by LocalAI: one gRPC
+// server per loaded model. Loads libced.so via purego and registers the flat
+// C-API declared in ced_capi.h. The library name can be overridden with
+// CED_LIBRARY (mirrors PARAKEET_LIBRARY / WHISPER_LIBRARY); the default looks
+// for the .so next to this binary.
+//
+// SKETCH: requires `make protogen-go` after the backend.proto SoundDetection
+// addition, and a built libced.so (see Makefile). See DESIGN.md.
+import (
+	"flag"
+	"fmt"
+	"os"
+
+	"github.com/ebitengine/purego"
+	grpc "github.com/mudler/LocalAI/pkg/grpc"
+)
+
+var addr = flag.String("addr", "localhost:50051", "the address to connect to")
+
+type libFunc struct {
+	ptr  any
+	name string
+}
+
+func main() {
+	libName := os.Getenv("CED_LIBRARY")
+	if libName == "" {
+		libName = "libced.so"
+	}
+	lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
+	if err != nil {
+		panic(fmt.Errorf("ced: dlopen %q: %w", libName, err))
+	}
+
+	// Bound 1:1 to ced_capi.h. char*-returning functions are declared uintptr
+	// so we can free the same pointer with ced_capi_free_string after copying
+	// (purego's string return would copy and leak the original).
+	for _, lf := range []libFunc{
+		{&CppAbiVersion, "ced_capi_abi_version"},
+		{&CppLoad, "ced_capi_load"},
+		{&CppFree, "ced_capi_free"},
+		{&CppLastError, "ced_capi_last_error"},
+		{&CppNumClasses, "ced_capi_num_classes"},
+		{&CppSampleRate, "ced_capi_sample_rate"},
+		{&CppClassifyPathJSON, "ced_capi_classify_path_json"},
+		{&CppClassifyPcmJSON, "ced_capi_classify_pcm_json"},
+		{&CppFreeString, "ced_capi_free_string"},
+	} {
+		purego.RegisterLibFunc(lf.ptr, lib, lf.name)
+	}
+
+	fmt.Fprintf(os.Stderr, "[ced] ABI=%d\n", CppAbiVersion())
+	flag.Parse()
+	if err := grpc.StartServer(*addr, &Ced{}); err != nil {
+		panic(err)
+	}
+}
diff --git a/backend/go/ced/package.sh b/backend/go/ced/package.sh
new file mode 100755
index 000000000..bde0adad6
--- /dev/null
+++ b/backend/go/ced/package.sh
@@ -0,0 +1,60 @@
+#!/bin/bash
+#
+# Bundle the ced-grpc binary, libced.so, the core runtime libs (libc/libstdc++/
+# libgomp + ld.so) and the GPU runtime for the active BUILD_TYPE so the package
+# is self-contained. Mirrors backend/go/parakeet-cpp/package.sh; run.sh routes
+# the (CGO_ENABLED=0) binary through lib/ld.so so the packaged libc is used.
+
+set -e
+
+CURDIR=$(dirname "$(realpath "$0")")
+REPO_ROOT="${CURDIR}/../../.."
+
+mkdir -p "$CURDIR/package/lib"
+
+cp -avf "$CURDIR/ced-grpc" "$CURDIR/package/"
+cp -avf "$CURDIR/run.sh" "$CURDIR/package/"
+
+cp -avf "$CURDIR"/libced.so* "$CURDIR/package/lib/" 2>/dev/null || {
+	echo "ERROR: libced.so not found in $CURDIR, run 'make' first" >&2
+	exit 1
+}
+
+if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
+    echo "Detected x86_64 architecture, copying x86_64 libraries..."
+    cp -arfLv /lib64/ld-linux-x86-64.so.2 "$CURDIR/package/lib/ld.so"
+    cp -arfLv /lib/x86_64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6"
+    cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1"
+    cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6"
+    cp -arfLv /lib/x86_64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6"
+    cp -arfLv /lib/x86_64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1"
+    cp -arfLv /lib/x86_64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2"
+    cp -arfLv /lib/x86_64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1"
+    cp -arfLv /lib/x86_64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0"
+elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
+    echo "Detected ARM64 architecture, copying ARM64 libraries..."
+    cp -arfLv /lib/ld-linux-aarch64.so.1 "$CURDIR/package/lib/ld.so"
+    cp -arfLv /lib/aarch64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6"
+    cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1"
+    cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6"
+    cp -arfLv /lib/aarch64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6"
+    cp -arfLv /lib/aarch64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1"
+    cp -arfLv /lib/aarch64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2"
+    cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1"
+    cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0"
+elif [ "$(uname -s)" = "Darwin" ]; then
+    echo "Detected Darwin"
+else
+    echo "Error: Could not detect architecture"
+    exit 1
+fi
+
+GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
+if [ -f "$GPU_LIB_SCRIPT" ]; then
+    echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
+    source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib"
+    package_gpu_libs
+fi
+
+echo "Packaging completed successfully"
+ls -liah "$CURDIR/package/" "$CURDIR/package/lib/"
diff --git a/backend/go/ced/run.sh b/backend/go/ced/run.sh
new file mode 100755
index 000000000..bce6fec8e
--- /dev/null
+++ b/backend/go/ced/run.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+set -e
+
+CURDIR=$(dirname "$(realpath "$0")")
+
+export LD_LIBRARY_PATH="$CURDIR/lib:$CURDIR:${LD_LIBRARY_PATH:-}"
+
+# If a self-contained ld.so was packaged, route through it so the packaged
+# libc / libstdc++ are used instead of the host's (matches the sibling backends).
+if [ -f "$CURDIR/lib/ld.so" ]; then
+	echo "Using lib/ld.so"
+	exec "$CURDIR/lib/ld.so" "$CURDIR/ced-grpc" "$@"
+fi
+
+exec "$CURDIR/ced-grpc" "$@"
diff --git a/backend/index.yaml b/backend/index.yaml
index 97fd1eb28..3f61f7b4e 100644
--- a/backend/index.yaml
+++ b/backend/index.yaml
@@ -178,6 +178,37 @@
     nvidia-cuda-12: "cuda12-parakeet-cpp"
     nvidia-l4t-cuda-12: "nvidia-l4t-arm64-parakeet-cpp"
     nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-parakeet-cpp"
+- &ced
+  name: "ced"
+  alias: "ced"
+  license: mit
+  icon: https://avatars.githubusercontent.com/u/95302084
+  description: |
+    CED sound-event classification / audio tagging (527-class AudioSet).
+    ced.cpp is a C++/ggml port that performs audio tagging over the AudioSet
+    taxonomy, exposed through the SoundDetection gRPC rpc and the
+    /v1/audio/classification REST endpoint. It runs on CPU, NVIDIA CUDA,
+    AMD ROCm/HIP, Intel SYCL, Vulkan and NVIDIA Jetson (L4T) targets.
+  urls:
+    - https://github.com/mudler/ced.cpp
+  tags:
+    - audio-classification
+    - CPU
+    - GPU
+    - CUDA
+    - HIP
+  capabilities:
+    default: "cpu-ced"
+    nvidia: "cuda12-ced"
+    intel: "intel-sycl-f16-ced"
+    metal: "metal-ced"
+    amd: "rocm-ced"
+    vulkan: "vulkan-ced"
+    nvidia-l4t: "nvidia-l4t-arm64-ced"
+    nvidia-cuda-13: "cuda13-ced"
+    nvidia-cuda-12: "cuda12-ced"
+    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-ced"
+    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-ced"
 - &voxtral
   name: "voxtral"
   alias: "voxtral"
@@ -2650,6 +2681,121 @@
   uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-parakeet-cpp"
   mirrors:
     - localai/localai-backends:master-gpu-nvidia-cuda-13-parakeet-cpp
+## ced
+- !!merge <<: *ced
+  name: "ced-development"
+  capabilities:
+    default: "cpu-ced-development"
+    nvidia: "cuda12-ced-development"
+    intel: "intel-sycl-f16-ced-development"
+    metal: "metal-ced-development"
+    amd: "rocm-ced-development"
+    vulkan: "vulkan-ced-development"
+    nvidia-l4t: "nvidia-l4t-arm64-ced-development"
+    nvidia-cuda-13: "cuda13-ced-development"
+    nvidia-cuda-12: "cuda12-ced-development"
+    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-ced-development"
+    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-ced-development"
+- !!merge <<: *ced
+  name: "nvidia-l4t-arm64-ced"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-arm64-ced"
+  mirrors:
+    - localai/localai-backends:latest-nvidia-l4t-arm64-ced
+- !!merge <<: *ced
+  name: "nvidia-l4t-arm64-ced-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-arm64-ced"
+  mirrors:
+    - localai/localai-backends:master-nvidia-l4t-arm64-ced
+- !!merge <<: *ced
+  name: "cuda13-nvidia-l4t-arm64-ced"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-ced"
+  mirrors:
+    - localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-ced
+- !!merge <<: *ced
+  name: "cuda13-nvidia-l4t-arm64-ced-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-ced"
+  mirrors:
+    - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-ced
+- !!merge <<: *ced
+  name: "cpu-ced"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-ced"
+  mirrors:
+    - localai/localai-backends:latest-cpu-ced
+- !!merge <<: *ced
+  name: "cpu-ced-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-cpu-ced"
+  mirrors:
+    - localai/localai-backends:master-cpu-ced
+- !!merge <<: *ced
+  name: "metal-ced"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-ced"
+  mirrors:
+    - localai/localai-backends:latest-metal-darwin-arm64-ced
+- !!merge <<: *ced
+  name: "metal-ced-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-ced"
+  mirrors:
+    - localai/localai-backends:master-metal-darwin-arm64-ced
+- !!merge <<: *ced
+  name: "cuda12-ced"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-ced"
+  mirrors:
+    - localai/localai-backends:latest-gpu-nvidia-cuda-12-ced
+- !!merge <<: *ced
+  name: "cuda12-ced-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-ced"
+  mirrors:
+    - localai/localai-backends:master-gpu-nvidia-cuda-12-ced
+- !!merge <<: *ced
+  name: "rocm-ced"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-ced"
+  mirrors:
+    - localai/localai-backends:latest-gpu-rocm-hipblas-ced
+- !!merge <<: *ced
+  name: "rocm-ced-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-ced"
+  mirrors:
+    - localai/localai-backends:master-gpu-rocm-hipblas-ced
+- !!merge <<: *ced
+  name: "intel-sycl-f32-ced"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-ced"
+  mirrors:
+    - localai/localai-backends:latest-gpu-intel-sycl-f32-ced
+- !!merge <<: *ced
+  name: "intel-sycl-f32-ced-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f32-ced"
+  mirrors:
+    - localai/localai-backends:master-gpu-intel-sycl-f32-ced
+- !!merge <<: *ced
+  name: "intel-sycl-f16-ced"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-ced"
+  mirrors:
+    - localai/localai-backends:latest-gpu-intel-sycl-f16-ced
+- !!merge <<: *ced
+  name: "intel-sycl-f16-ced-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f16-ced"
+  mirrors:
+    - localai/localai-backends:master-gpu-intel-sycl-f16-ced
+- !!merge <<: *ced
+  name: "vulkan-ced"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-vulkan-ced"
+  mirrors:
+    - localai/localai-backends:latest-gpu-vulkan-ced
+- !!merge <<: *ced
+  name: "vulkan-ced-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-vulkan-ced"
+  mirrors:
+    - localai/localai-backends:master-gpu-vulkan-ced
+- !!merge <<: *ced
+  name: "cuda13-ced"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-ced"
+  mirrors:
+    - localai/localai-backends:latest-gpu-nvidia-cuda-13-ced
+- !!merge <<: *ced
+  name: "cuda13-ced-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-ced"
+  mirrors:
+    - localai/localai-backends:master-gpu-nvidia-cuda-13-ced
 ## stablediffusion-ggml
 - !!merge <<: *stablediffusionggml
   name: "cpu-stablediffusion-ggml"
diff --git a/core/backend/sound_classification.go b/core/backend/sound_classification.go
new file mode 100644
index 000000000..666c32321
--- /dev/null
+++ b/core/backend/sound_classification.go
@@ -0,0 +1,88 @@
+package backend
+
+import (
+	"context"
+	"fmt"
+	"sort"
+
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/schema"
+
+	grpcPkg "github.com/mudler/LocalAI/pkg/grpc"
+	"github.com/mudler/LocalAI/pkg/grpc/proto"
+	"github.com/mudler/LocalAI/pkg/model"
+)
+
+// SoundDetectionRequest carries the knobs the HTTP layer collects for an
+// audio-tagging / sound-event-classification call. Audio is the path to the
+// uploaded clip on disk; TopK and Threshold are optional (0 = backend default).
+type SoundDetectionRequest struct {
+	Audio     string
+	TopK      int32
+	Threshold float32
+}
+
+func (r *SoundDetectionRequest) toProto() *proto.SoundDetectionRequest {
+	return &proto.SoundDetectionRequest{
+		Src:       r.Audio,
+		TopK:      r.TopK,
+		Threshold: r.Threshold,
+	}
+}
+
+func loadSoundDetectionModel(ml *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig) (grpcPkg.Backend, error) {
+	if modelConfig.Backend == "" {
+		return nil, fmt.Errorf("sound classification: model %q has no backend set; supported backends include ced", modelConfig.Name)
+	}
+	opts := ModelOptions(modelConfig, appConfig)
+	m, err := ml.Load(opts...)
+	if err != nil {
+		recordModelLoadFailure(appConfig, modelConfig.Name, modelConfig.Backend, err, nil)
+		return nil, err
+	}
+	if m == nil {
+		return nil, fmt.Errorf("could not load sound classification model")
+	}
+	return m, nil
+}
+
+// ModelSoundDetection runs the SoundDetection RPC against the configured
+// backend and returns a normalized schema.SoundClassificationResult.
+func ModelSoundDetection(ctx context.Context, req SoundDetectionRequest, ml *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig) (*schema.SoundClassificationResult, error) {
+	m, err := loadSoundDetectionModel(ml, modelConfig, appConfig)
+	if err != nil {
+		return nil, err
+	}
+
+	r, err := m.SoundDetection(ctx, req.toProto())
+	if err != nil {
+		return nil, err
+	}
+	return soundClassificationResultFromProto(modelConfig.Name, r), nil
+}
+
+// soundClassificationResultFromProto maps the backend detections to the
+// HTTP-facing schema, keeping the backend's score-descending order.
+func soundClassificationResultFromProto(modelName string, r *proto.SoundDetectionResponse) *schema.SoundClassificationResult {
+	out := &schema.SoundClassificationResult{
+		Model:      modelName,
+		Detections: []schema.SoundClassification{},
+	}
+	if r == nil {
+		return out
+	}
+	for _, d := range r.Detections {
+		if d == nil {
+			continue
+		}
+		out.Detections = append(out.Detections, schema.SoundClassification{
+			Index: int(d.Index),
+			Label: d.Label,
+			Score: d.Score,
+		})
+	}
+	sort.SliceStable(out.Detections, func(i, j int) bool {
+		return out.Detections[i].Score > out.Detections[j].Score
+	})
+	return out
+}
diff --git a/core/config/backend_capabilities.go b/core/config/backend_capabilities.go
index eba8c3c37..cc9567887 100644
--- a/core/config/backend_capabilities.go
+++ b/core/config/backend_capabilities.go
@@ -8,27 +8,28 @@ import (
 // Usecase name constants — the canonical string values used in gallery entries,
 // model configs (known_usecases), and UsecaseInfoMap keys.
 const (
-	UsecaseChat               = "chat"
-	UsecaseCompletion         = "completion"
-	UsecaseEdit               = "edit"
-	UsecaseVision             = "vision"
-	UsecaseEmbeddings         = "embeddings"
-	UsecaseTokenize           = "tokenize"
-	UsecaseImage              = "image"
-	UsecaseVideo              = "video"
-	UsecaseTranscript         = "transcript"
-	UsecaseTTS                = "tts"
-	UsecaseSoundGeneration    = "sound_generation"
-	UsecaseRerank             = "rerank"
-	UsecaseDetection          = "detection"
-	UsecaseDepth              = "depth"
-	UsecaseVAD                = "vad"
-	UsecaseAudioTransform     = "audio_transform"
-	UsecaseDiarization        = "diarization"
-	UsecaseRealtimeAudio      = "realtime_audio"
-	UsecaseFaceRecognition    = "face_recognition"
-	UsecaseSpeakerRecognition = "speaker_recognition"
-	UsecaseTokenClassify      = "token_classify"
+	UsecaseChat                = "chat"
+	UsecaseCompletion          = "completion"
+	UsecaseEdit                = "edit"
+	UsecaseVision              = "vision"
+	UsecaseEmbeddings          = "embeddings"
+	UsecaseTokenize            = "tokenize"
+	UsecaseImage               = "image"
+	UsecaseVideo               = "video"
+	UsecaseTranscript          = "transcript"
+	UsecaseTTS                 = "tts"
+	UsecaseSoundGeneration     = "sound_generation"
+	UsecaseRerank              = "rerank"
+	UsecaseDetection           = "detection"
+	UsecaseDepth               = "depth"
+	UsecaseVAD                 = "vad"
+	UsecaseAudioTransform      = "audio_transform"
+	UsecaseDiarization         = "diarization"
+	UsecaseSoundClassification = "sound_classification"
+	UsecaseRealtimeAudio       = "realtime_audio"
+	UsecaseFaceRecognition     = "face_recognition"
+	UsecaseSpeakerRecognition  = "speaker_recognition"
+	UsecaseTokenClassify       = "token_classify"
 )
 
 // GRPCMethod identifies a Backend service RPC from backend.proto.
@@ -51,6 +52,7 @@ const (
 	MethodVAD                GRPCMethod = "VAD"
 	MethodAudioTransform     GRPCMethod = "AudioTransform"
 	MethodDiarize            GRPCMethod = "Diarize"
+	MethodSoundDetection     GRPCMethod = "SoundDetection"
 	MethodAudioToAudioStream GRPCMethod = "AudioToAudioStream"
 	MethodFaceVerify         GRPCMethod = "FaceVerify"
 	MethodFaceAnalyze        GRPCMethod = "FaceAnalyze"
@@ -165,6 +167,11 @@ var UsecaseInfoMap = map[string]UsecaseInfo{
 		GRPCMethod:  MethodDiarize,
 		Description: "Speaker diarization (who-spoke-when, per-speaker segments) via the Diarize RPC.",
 	},
+	UsecaseSoundClassification: {
+		Flag:        FLAG_SOUND_CLASSIFICATION,
+		GRPCMethod:  MethodSoundDetection,
+		Description: "Sound-event classification / audio tagging (scored AudioSet labels like baby cry, glass breaking, alarms) via the SoundDetection RPC.",
+	},
 	UsecaseRealtimeAudio: {
 		Flag:        FLAG_REALTIME_AUDIO,
 		GRPCMethod:  MethodAudioToAudioStream,
diff --git a/core/config/meta/constants.go b/core/config/meta/constants.go
index 72da2f99a..7fed6ba75 100644
--- a/core/config/meta/constants.go
+++ b/core/config/meta/constants.go
@@ -68,6 +68,7 @@ var UsecaseOptions = []FieldOption{
 	{Value: "face_recognition", Label: "Face Recognition"},
 	{Value: "transcript", Label: "Transcript"},
 	{Value: "diarization", Label: "Diarization"},
+	{Value: "sound_classification", Label: "Sound Classification"},
 	{Value: "speaker_recognition", Label: "Speaker Recognition"},
 	{Value: "tts", Label: "TTS"},
 	{Value: "sound_generation", Label: "Sound Generation"},
diff --git a/core/config/meta/registry.go b/core/config/meta/registry.go
index b7ffa9290..a1cfe4c9a 100644
--- a/core/config/meta/registry.go
+++ b/core/config/meta/registry.go
@@ -328,6 +328,30 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			AutocompleteProvider: ProviderModelsVAD,
 			Order:                63,
 		},
+		"pipeline.sound_detection": {
+			Section:              "pipeline",
+			Label:                "Sound Detection Model",
+			Description:          "Model to use for sound-event classification (audio tagging, e.g. ced) in the pipeline. When set, committed realtime audio is also classified and the scored AudioSet tags are emitted as a conversation.item.sound_detection event.",
+			Component:            "model-select",
+			AutocompleteProvider: ProviderModels,
+			Order:                64,
+		},
+		"pipeline.sound_detection_window_ms": {
+			Section:     "pipeline",
+			Label:       "Sound Detection Window (ms)",
+			Description: "Server-side windowing for a sound-only realtime session: length in ms of the audio window classified each hop. 0 = client-driven (the client commits windows).",
+			Component:   "number",
+			Min:         f64(0),
+			Order:       65,
+		},
+		"pipeline.sound_detection_hop_ms": {
+			Section:     "pipeline",
+			Label:       "Sound Detection Hop (ms)",
+			Description: "Server-side windowing hop in ms: how often the server classifies the last window. 0 = client-driven.",
+			Component:   "number",
+			Min:         f64(0),
+			Order:       66,
+		},
 		"pipeline.reasoning_effort": {
 			Section:     "pipeline",
 			Label:       "Reasoning Effort",
diff --git a/core/config/model_config.go b/core/config/model_config.go
index 5dbfd2026..cbb336838 100644
--- a/core/config/model_config.go
+++ b/core/config/model_config.go
@@ -604,6 +604,20 @@ type Pipeline struct {
 	LLM           string `yaml:"llm,omitempty" json:"llm,omitempty"`
 	Transcription string `yaml:"transcription,omitempty" json:"transcription,omitempty"`
 	VAD           string `yaml:"vad,omitempty" json:"vad,omitempty"`
+	// SoundDetection names a sound-event-classification model (e.g. ced). When
+	// set, each VAD-committed realtime utterance is also run through it and the
+	// scored AudioSet tags are emitted as a conversation.item.sound_detection
+	// server event, alongside (and independent of) transcription.
+	SoundDetection string `yaml:"sound_detection,omitempty" json:"sound_detection,omitempty"`
+
+	// SoundDetectionWindowMs / SoundDetectionHopMs enable server-side windowing
+	// for a sound-detection-only realtime session: instead of the client
+	// committing audio buffers, the server classifies the last WindowMs of
+	// streamed audio every HopMs and emits a sound_detection event per hop. Both
+	// must be > 0 to activate; otherwise the session stays client-driven (the
+	// client commits windows via input_audio_buffer.commit).
+	SoundDetectionWindowMs int `yaml:"sound_detection_window_ms,omitempty" json:"sound_detection_window_ms,omitempty"`
+	SoundDetectionHopMs    int `yaml:"sound_detection_hop_ms,omitempty" json:"sound_detection_hop_ms,omitempty"`
 
 	// ReasoningEffort sets the reasoning effort (none|minimal|low|medium|high) for
 	// the pipeline's LLM without editing the LLM model config. Overrides the LLM's
@@ -1452,6 +1466,11 @@ const (
 	// so it may combine freely with other usecases.
 	FLAG_TOKEN_CLASSIFY ModelConfigUsecase = 0b1000000000000000000000
 
+	// Marks a model as wired for the SoundDetection gRPC primitive
+	// (audio tagging / sound-event classification — scored AudioSet
+	// labels via the SoundDetection RPC, e.g. ced).
+	FLAG_SOUND_CLASSIFICATION ModelConfigUsecase = 0b10000000000000000000000
+
 	// Common Subsets
 	FLAG_LLM ModelConfigUsecase = FLAG_CHAT | FLAG_COMPLETION | FLAG_EDIT
 )
@@ -1460,12 +1479,12 @@ const (
 // Flags within the same group are NOT orthogonal (e.g., chat and completion are
 // both text/language). A model is multimodal when its usecases span 2+ groups.
 var ModalityGroups = []ModelConfigUsecase{
-	FLAG_CHAT | FLAG_COMPLETION | FLAG_EDIT,                // text/language
-	FLAG_VISION | FLAG_DETECTION,                           // visual understanding
-	FLAG_TRANSCRIPT | FLAG_REALTIME_AUDIO,                  // speech input — realtime_audio is any-to-any, so it counts here too
-	FLAG_TTS | FLAG_SOUND_GENERATION | FLAG_REALTIME_AUDIO, // audio output — and here, so a lone realtime_audio flag still reads as multimodal
-	FLAG_AUDIO_TRANSFORM,                                   // audio in/out transforms
-	FLAG_IMAGE | FLAG_VIDEO,                                // visual generation
+	FLAG_CHAT | FLAG_COMPLETION | FLAG_EDIT,                           // text/language
+	FLAG_VISION | FLAG_DETECTION,                                      // visual understanding
+	FLAG_TRANSCRIPT | FLAG_REALTIME_AUDIO | FLAG_SOUND_CLASSIFICATION, // audio input — realtime_audio is any-to-any, so it counts here too
+	FLAG_TTS | FLAG_SOUND_GENERATION | FLAG_REALTIME_AUDIO,            // audio output — and here, so a lone realtime_audio flag still reads as multimodal
+	FLAG_AUDIO_TRANSFORM,                                              // audio in/out transforms
+	FLAG_IMAGE | FLAG_VIDEO,                                           // visual generation
 }
 
 // IsMultimodal returns true if the given usecases span two or more orthogonal
@@ -1488,29 +1507,30 @@ func GetAllModelConfigUsecases() map[string]ModelConfigUsecase {
 	return map[string]ModelConfigUsecase{
 		// Note: FLAG_ANY is intentionally excluded from this map
 		// because it's 0 and would always match in HasUsecases checks
-		"FLAG_CHAT":                FLAG_CHAT,
-		"FLAG_COMPLETION":          FLAG_COMPLETION,
-		"FLAG_EDIT":                FLAG_EDIT,
-		"FLAG_EMBEDDINGS":          FLAG_EMBEDDINGS,
-		"FLAG_RERANK":              FLAG_RERANK,
-		"FLAG_IMAGE":               FLAG_IMAGE,
-		"FLAG_TRANSCRIPT":          FLAG_TRANSCRIPT,
-		"FLAG_TTS":                 FLAG_TTS,
-		"FLAG_SOUND_GENERATION":    FLAG_SOUND_GENERATION,
-		"FLAG_TOKENIZE":            FLAG_TOKENIZE,
-		"FLAG_VAD":                 FLAG_VAD,
-		"FLAG_LLM":                 FLAG_LLM,
-		"FLAG_VIDEO":               FLAG_VIDEO,
-		"FLAG_DETECTION":           FLAG_DETECTION,
-		"FLAG_VISION":              FLAG_VISION,
-		"FLAG_FACE_RECOGNITION":    FLAG_FACE_RECOGNITION,
-		"FLAG_SPEAKER_RECOGNITION": FLAG_SPEAKER_RECOGNITION,
-		"FLAG_AUDIO_TRANSFORM":     FLAG_AUDIO_TRANSFORM,
-		"FLAG_DIARIZATION":         FLAG_DIARIZATION,
-		"FLAG_REALTIME_AUDIO":      FLAG_REALTIME_AUDIO,
-		"FLAG_SCORE":               FLAG_SCORE,
-		"FLAG_DEPTH":               FLAG_DEPTH,
-		"FLAG_TOKEN_CLASSIFY":      FLAG_TOKEN_CLASSIFY,
+		"FLAG_CHAT":                 FLAG_CHAT,
+		"FLAG_COMPLETION":           FLAG_COMPLETION,
+		"FLAG_EDIT":                 FLAG_EDIT,
+		"FLAG_EMBEDDINGS":           FLAG_EMBEDDINGS,
+		"FLAG_RERANK":               FLAG_RERANK,
+		"FLAG_IMAGE":                FLAG_IMAGE,
+		"FLAG_TRANSCRIPT":           FLAG_TRANSCRIPT,
+		"FLAG_TTS":                  FLAG_TTS,
+		"FLAG_SOUND_GENERATION":     FLAG_SOUND_GENERATION,
+		"FLAG_TOKENIZE":             FLAG_TOKENIZE,
+		"FLAG_VAD":                  FLAG_VAD,
+		"FLAG_LLM":                  FLAG_LLM,
+		"FLAG_VIDEO":                FLAG_VIDEO,
+		"FLAG_DETECTION":            FLAG_DETECTION,
+		"FLAG_VISION":               FLAG_VISION,
+		"FLAG_FACE_RECOGNITION":     FLAG_FACE_RECOGNITION,
+		"FLAG_SPEAKER_RECOGNITION":  FLAG_SPEAKER_RECOGNITION,
+		"FLAG_AUDIO_TRANSFORM":      FLAG_AUDIO_TRANSFORM,
+		"FLAG_DIARIZATION":          FLAG_DIARIZATION,
+		"FLAG_SOUND_CLASSIFICATION": FLAG_SOUND_CLASSIFICATION,
+		"FLAG_REALTIME_AUDIO":       FLAG_REALTIME_AUDIO,
+		"FLAG_SCORE":                FLAG_SCORE,
+		"FLAG_DEPTH":                FLAG_DEPTH,
+		"FLAG_TOKEN_CLASSIFY":       FLAG_TOKEN_CLASSIFY,
 	}
 }
 
@@ -1713,6 +1733,16 @@ func (c *ModelConfig) GuessUsecases(u ModelConfigUsecase) bool {
 		}
 	}
 
+	if (u & FLAG_SOUND_CLASSIFICATION) == FLAG_SOUND_CLASSIFICATION {
+		// ced is a sound-event tagger (AudioSet labels) surfaced via the
+		// SoundDetection gRPC. Models without an explicit known_usecases
+		// still surface when they run on one of these backends.
+		soundClassificationBackends := []string{"ced"}
+		if !slices.Contains(soundClassificationBackends, c.Backend) {
+			return false
+		}
+	}
+
 	if (u & FLAG_REALTIME_AUDIO) == FLAG_REALTIME_AUDIO {
 		// Backends that own a single any-to-any loop and implement
 		// AudioToAudioStream — listed here so models without an explicit
diff --git a/core/http/auth/features.go b/core/http/auth/features.go
index 615e82a49..8dbb32a03 100644
--- a/core/http/auth/features.go
+++ b/core/http/auth/features.go
@@ -48,6 +48,10 @@ var RouteFeatureRegistry = []RouteFeature{
 	{"POST", "/v1/audio/diarization", FeatureAudioDiarization},
 	{"POST", "/audio/diarization", FeatureAudioDiarization},
 
+	// Audio classification (sound-event tagging)
+	{"POST", "/v1/audio/classification", FeatureAudioClassification},
+	{"POST", "/audio/classification", FeatureAudioClassification},
+
 	// Audio speech / TTS
 	{"POST", "/v1/audio/speech", FeatureAudioSpeech},
 	{"POST", "/audio/speech", FeatureAudioSpeech},
@@ -172,6 +176,7 @@ func APIFeatureMetas() []FeatureMeta {
 		{FeatureAudioSpeech, "Audio Speech / TTS", true},
 		{FeatureAudioTranscription, "Audio Transcription", true},
 		{FeatureAudioDiarization, "Audio Diarization", true},
+		{FeatureAudioClassification, "Audio Classification", true},
 		{FeatureVAD, "Voice Activity Detection", true},
 		{FeatureDetection, "Detection", true},
 		{FeatureVideo, "Video Generation", true},
diff --git a/core/http/auth/permissions.go b/core/http/auth/permissions.go
index 47c4d64e1..1795792f9 100644
--- a/core/http/auth/permissions.go
+++ b/core/http/auth/permissions.go
@@ -38,24 +38,25 @@ const (
 	FeatureQuantization = "quantization"
 
 	// API features (default ON for new users)
-	FeatureChat               = "chat"
-	FeatureImages             = "images"
-	FeatureAudioSpeech        = "audio_speech"
-	FeatureAudioTranscription = "audio_transcription"
-	FeatureAudioDiarization   = "audio_diarization"
-	FeatureVAD                = "vad"
-	FeatureDetection          = "detection"
-	FeatureVideo              = "video"
-	FeatureEmbeddings         = "embeddings"
-	FeatureSound              = "sound"
-	FeatureRealtime           = "realtime"
-	FeatureRerank             = "rerank"
-	FeatureTokenize           = "tokenize"
-	FeatureMCP                = "mcp"
-	FeatureStores             = "stores"
-	FeatureFaceRecognition    = "face_recognition"
-	FeatureVoiceRecognition   = "voice_recognition"
-	FeatureAudioTransform     = "audio_transform"
+	FeatureChat                = "chat"
+	FeatureImages              = "images"
+	FeatureAudioSpeech         = "audio_speech"
+	FeatureAudioTranscription  = "audio_transcription"
+	FeatureAudioDiarization    = "audio_diarization"
+	FeatureAudioClassification = "audio_classification"
+	FeatureVAD                 = "vad"
+	FeatureDetection           = "detection"
+	FeatureVideo               = "video"
+	FeatureEmbeddings          = "embeddings"
+	FeatureSound               = "sound"
+	FeatureRealtime            = "realtime"
+	FeatureRerank              = "rerank"
+	FeatureTokenize            = "tokenize"
+	FeatureMCP                 = "mcp"
+	FeatureStores              = "stores"
+	FeatureFaceRecognition     = "face_recognition"
+	FeatureVoiceRecognition    = "voice_recognition"
+	FeatureAudioTransform      = "audio_transform"
 	// FeaturePIIFilter gates the synchronous PII analyze/redact service
 	// (POST /api/pii/{analyze,redact}). Default ON like the other API
 	// features; the admin-only events log is gated separately in-handler.
@@ -71,7 +72,7 @@ var GeneralFeatures = []string{FeatureFineTuning, FeatureQuantization}
 // APIFeatures lists API endpoint features (default ON).
 var APIFeatures = []string{
 	FeatureChat, FeatureImages, FeatureAudioSpeech, FeatureAudioTranscription,
-	FeatureAudioDiarization,
+	FeatureAudioDiarization, FeatureAudioClassification,
 	FeatureVAD, FeatureDetection, FeatureVideo, FeatureEmbeddings, FeatureSound,
 	FeatureRealtime, FeatureRerank, FeatureTokenize, FeatureMCP, FeatureStores,
 	FeatureFaceRecognition, FeatureVoiceRecognition, FeatureAudioTransform,
diff --git a/core/http/endpoints/localai/api_instructions.go b/core/http/endpoints/localai/api_instructions.go
index 2ca856a62..405921e5e 100644
--- a/core/http/endpoints/localai/api_instructions.go
+++ b/core/http/endpoints/localai/api_instructions.go
@@ -32,9 +32,9 @@ var instructionDefs = []instructionDef{
 	},
 	{
 		Name:        "audio",
-		Description: "Text-to-speech, voice activity detection, transcription, speaker diarization, and sound generation",
+		Description: "Text-to-speech, voice activity detection, transcription, speaker diarization, sound classification, and sound generation",
 		Tags:        []string{"audio"},
-		Intro:       "Diarization (/v1/audio/diarization) returns speaker-labelled time segments. Backends with native ASR-diarization (vibevoice-cpp) can also emit per-segment text via include_text=true; backends with a dedicated pipeline (sherpa-onnx + pyannote) emit segmentation only. Response formats: json (default), verbose_json (adds speakers summary + text), rttm (NIST format).",
+		Intro:       "Diarization (/v1/audio/diarization) returns speaker-labelled time segments. Backends with native ASR-diarization (vibevoice-cpp) can also emit per-segment text via include_text=true; backends with a dedicated pipeline (sherpa-onnx + pyannote) emit segmentation only. Response formats: json (default), verbose_json (adds speakers summary + text), rttm (NIST format). Sound classification (/v1/audio/classification) returns scored AudioSet sound-event tags (audio tagging via the ced backend); top_k and threshold control the returned set.",
 	},
 	{
 		Name:        "images",
diff --git a/core/http/endpoints/openai/realtime.go b/core/http/endpoints/openai/realtime.go
index 8de50e580..1af4c6b75 100644
--- a/core/http/endpoints/openai/realtime.go
+++ b/core/http/endpoints/openai/realtime.go
@@ -93,16 +93,31 @@ type Session struct {
 	Voice                   string
 	TurnDetection           *types.TurnDetectionUnion // "server_vad", "semantic_vad" or "none"
 	InputAudioTranscription *types.AudioTranscription
-	Tools                   []types.ToolUnion
-	ToolChoice              *types.ToolChoiceUnion
-	Conversations           map[string]*Conversation
-	InputAudioBuffer        []byte
-	AudioBufferLock         sync.Mutex
-	OpusFrames              [][]byte
-	OpusFramesLock          sync.Mutex
-	Instructions            string
-	DefaultConversationID   string
-	ModelInterface          Model
+
+	// SoundDetectionEnabled is set when pipeline.sound_detection names a
+	// sound-event-classification model. When true, each committed utterance is
+	// also run through ModelInterface.SoundDetection and the scored tags are
+	// emitted as a conversation.item.sound_detection event. SoundDetectionTopK
+	// and SoundDetectionThreshold are the knobs passed to that call (defaults:
+	// top_k=5, threshold=0).
+	SoundDetectionEnabled   bool
+	SoundDetectionTopK      int
+	SoundDetectionThreshold float32
+	// SoundDetectionWindowMs / SoundDetectionHopMs, when both > 0, enable
+	// server-side windowing for a sound-only session: the server classifies the
+	// last WindowMs of streamed audio every HopMs (no client commits needed).
+	SoundDetectionWindowMs int
+	SoundDetectionHopMs    int
+	Tools                  []types.ToolUnion
+	ToolChoice             *types.ToolChoiceUnion
+	Conversations          map[string]*Conversation
+	InputAudioBuffer       []byte
+	AudioBufferLock        sync.Mutex
+	OpusFrames             [][]byte
+	OpusFramesLock         sync.Mutex
+	Instructions           string
+	DefaultConversationID  string
+	ModelInterface         Model
 	// The pipeline model config or the config for an any-to-any model
 	ModelConfig      *config.ModelConfig
 	InputSampleRate  int
@@ -250,6 +265,10 @@ type Model interface {
 	// TranscribeStream transcribes audio incrementally, invoking onDelta for each
 	// transcript text fragment and returning the final aggregated result.
 	TranscribeStream(ctx context.Context, audio, language string, translate, diarize bool, prompt string, onDelta func(text string)) (*schema.TranscriptionResult, error)
+	// SoundDetection classifies a committed audio window into scored AudioSet
+	// sound-event tags. topK caps the number of returned tags (0 = backend
+	// default), threshold drops tags below the given score (0 = keep all).
+	SoundDetection(ctx context.Context, audio string, topK int, threshold float32) (*schema.SoundClassificationResult, error)
 	PredictConfig() *config.ModelConfig
 }
 
@@ -399,7 +418,7 @@ func prepareRealtimeConfig(cfg *config.ModelConfig) (errCode, errMsg string, ok
 		return "", "", true
 	}
 
-	if cfg.Pipeline.VAD == "" && cfg.Pipeline.Transcription == "" && cfg.Pipeline.TTS == "" && cfg.Pipeline.LLM == "" {
+	if cfg.Pipeline.VAD == "" && cfg.Pipeline.Transcription == "" && cfg.Pipeline.TTS == "" && cfg.Pipeline.LLM == "" && cfg.Pipeline.SoundDetection == "" {
 		return "invalid_model", "Model is not a pipeline model", false
 	}
 	return "", "", true
@@ -469,6 +488,26 @@ func runRealtimeSession(application *application.Application, t Transport, model
 
 	sttModel := cfg.Pipeline.Transcription
 
+	// A sound-detection-only pipeline (sound_detection set, no transcription/LLM)
+	// activates on sounds, not speech, so it runs WITHOUT the voice VAD: the
+	// session defaults to turn_detection none and the client drives windowing via
+	// input_audio_buffer.commit. There is no transcription stage in that case.
+	soundOnly := cfg.Pipeline.SoundDetection != "" && cfg.Pipeline.Transcription == "" && cfg.Pipeline.LLM == ""
+
+	turnDetection := &types.TurnDetectionUnion{
+		ServerVad: &types.ServerVad{
+			Threshold:         0.5,
+			PrefixPaddingMs:   300,
+			SilenceDurationMs: 500,
+			CreateResponse:    true,
+		},
+	}
+	inputAudioTranscription := &types.AudioTranscription{Model: sttModel}
+	if soundOnly {
+		turnDetection = nil           // turn_detection none: no VAD
+		inputAudioTranscription = nil // no transcription stage
+	}
+
 	// Compose the system prompt: prepend the assistant prompt when we have
 	// one (it teaches the model the safety rules and tool recipes), then the
 	// session's default voice instructions. Order matches chat.go's
@@ -480,30 +519,26 @@ func runRealtimeSession(application *application.Application, t Transport, model
 
 	sessionID := generateSessionID()
 	session := &Session{
-		ID:                sessionID,
-		TranscriptionOnly: false,
-		Model:             model,
-		Voice:             cfg.TTSConfig.Voice,
-		Instructions:      instructions,
-		ModelConfig:       cfg,
-		Tools:             assistantTools,
-		AssistantTools:    assistantTools,
-		AssistantExecutor: assistantExecutor,
-		TurnDetection: &types.TurnDetectionUnion{
-			ServerVad: &types.ServerVad{
-				Threshold:         0.5,
-				PrefixPaddingMs:   300,
-				SilenceDurationMs: 500,
-				CreateResponse:    true,
-			},
-		},
-		InputAudioTranscription: &types.AudioTranscription{
-			Model: sttModel,
-		},
-		Conversations:    make(map[string]*Conversation),
-		InputSampleRate:  defaultRemoteSampleRate,
-		OutputSampleRate: defaultRemoteSampleRate,
-		MaxHistoryItems:  resolveMaxHistoryItems(cfg),
+		ID:                      sessionID,
+		TranscriptionOnly:       false,
+		Model:                   model,
+		Voice:                   cfg.TTSConfig.Voice,
+		Instructions:            instructions,
+		ModelConfig:             cfg,
+		Tools:                   assistantTools,
+		AssistantTools:          assistantTools,
+		AssistantExecutor:       assistantExecutor,
+		TurnDetection:           turnDetection,
+		InputAudioTranscription: inputAudioTranscription,
+		Conversations:           make(map[string]*Conversation),
+		InputSampleRate:         defaultRemoteSampleRate,
+		OutputSampleRate:        defaultRemoteSampleRate,
+		MaxHistoryItems:         resolveMaxHistoryItems(cfg),
+		SoundDetectionEnabled:   cfg.Pipeline.SoundDetection != "",
+		SoundDetectionTopK:      defaultSoundDetectionTopK,
+		SoundDetectionThreshold: 0,
+		SoundDetectionWindowMs:  cfg.Pipeline.SoundDetectionWindowMs,
+		SoundDetectionHopMs:     cfg.Pipeline.SoundDetectionHopMs,
 	}
 
 	// Create a default conversation
@@ -517,14 +552,24 @@ func runRealtimeSession(application *application.Application, t Transport, model
 	session.Conversations[conversationID] = conversation
 	session.DefaultConversationID = conversationID
 
-	m, err := newModel(
-		&cfg.Pipeline,
-		application.ModelConfigLoader(),
-		application.ModelLoader(),
-		application.ApplicationConfig(),
-		evaluator,
-		buildRealtimeRoutingContext(application, sessionID),
-	)
+	var m Model
+	if soundOnly {
+		m, err = newSoundDetectionOnlyModel(
+			&cfg.Pipeline,
+			application.ModelConfigLoader(),
+			application.ModelLoader(),
+			application.ApplicationConfig(),
+		)
+	} else {
+		m, err = newModel(
+			&cfg.Pipeline,
+			application.ModelConfigLoader(),
+			application.ModelLoader(),
+			application.ApplicationConfig(),
+			evaluator,
+			buildRealtimeRoutingContext(application, sessionID),
+		)
+	}
 	if err != nil {
 		xlog.Error("failed to load model", "error", err)
 		sendError(t, "model_load_error", "Failed to load model", "", "")
@@ -605,6 +650,20 @@ func runRealtimeSession(application *application.Application, t Transport, model
 
 	toggleVAD()
 
+	// Server-side sound-detection windowing (option B): for a sound-only session
+	// with window/hop configured, the server classifies the last window of
+	// streamed audio on a timer, so the client only has to stream (no commits).
+	// This runs independent of VAD (sound events are not speech).
+	var soundWindowDone chan struct{}
+	if soundOnly && session.SoundDetectionWindowMs > 0 && session.SoundDetectionHopMs > 0 {
+		soundWindowDone = make(chan struct{})
+		wg.Go(func() {
+			handleSoundWindow(session, t, soundWindowDone)
+		})
+		xlog.Debug("Starting server-side sound-detection windowing",
+			"window_ms", session.SoundDetectionWindowMs, "hop_ms", session.SoundDetectionHopMs)
+	}
+
 	for {
 		msg, err = t.ReadEvent()
 		if err != nil {
@@ -880,6 +939,10 @@ func runRealtimeSession(application *application.Application, t Transport, model
 	if vadServerStarted {
 		close(done)
 	}
+	// Stop the server-side sound-detection windowing goroutine (if running).
+	if soundWindowDone != nil {
+		close(soundWindowDone)
+	}
 	wg.Wait()
 
 	// Remove the session from the sessions map
@@ -971,6 +1034,10 @@ func updateTransSession(session *Session, update *types.SessionUnion, cl *config
 
 		session.ModelInterface = m
 		session.ModelConfig = cfg
+		session.SoundDetectionEnabled = cfg.Pipeline.SoundDetection != ""
+		if session.SoundDetectionTopK <= 0 {
+			session.SoundDetectionTopK = defaultSoundDetectionTopK
+		}
 	}
 
 	if trUpd != nil {
@@ -1343,7 +1410,8 @@ func commitUtterance(ctx context.Context, utt []byte, session *Session, conv *Co
 
 	// TODO: If we have a real any-to-any model then transcription is optional
 	var transcript string
-	if session.InputAudioTranscription != nil {
+	switch {
+	case session.InputAudioTranscription != nil:
 		// emitTranscription streams transcript deltas when
 		// pipeline.streaming.transcription is set, otherwise emits a single
 		// completed event; either way it returns the final transcript text.
@@ -1358,13 +1426,27 @@ func commitUtterance(ctx context.Context, utt []byte, session *Session, conv *Co
 			sendError(t, "transcription_failed", err.Error(), "", "event_TODO")
 			return
 		}
-	} else {
+	case session.SoundDetectionEnabled:
+		// Sound-detection-only session: no transcription and no LLM. The
+		// sound-detection emit below carries the result; there is no any-to-any
+		// path to fall into. Windowing is client-driven (turn_detection none +
+		// input_audio_buffer.commit), so this is not voice-gated.
+	default:
 		// The voice gate runs only on the transcription path above; if an
 		// any-to-any model path is added here, join the gate before responding.
 		sendNotImplemented(t, "any-to-any models")
 		return
 	}
 
+	// Sound-event detection is additive to transcription: classify the same
+	// committed window and emit its scored AudioSet tags as a separate event.
+	// A failure here is logged but must never abort the turn.
+	if session.SoundDetectionEnabled {
+		if sderr := emitSoundDetection(ctx, t, session, generateItemID(), f.Name()); sderr != nil {
+			xlog.Error("sound detection failed", "error", sderr)
+		}
+	}
+
 	// Join on the resolution before any side-effecting step.
 	var speaker *types.Speaker
 	if runResolve {
@@ -1415,11 +1497,94 @@ func commitUtterance(ctx context.Context, utt []byte, session *Session, conv *Co
 		}
 	}
 
-	if !session.TranscriptionOnly {
+	// Generate an LLM response only when there is a transcript to feed it. A
+	// sound-detection-only session (no transcription) has no LLM stage, so it
+	// stops here after emitting the sound-detection event.
+	if session.InputAudioTranscription != nil && !session.TranscriptionOnly {
 		generateResponse(ctx, session, utt, transcript, speaker, conv, t)
 	}
 }
 
+// handleSoundWindow runs server-side windowed sound-event detection (option B):
+// every HopMs it classifies the last WindowMs of streamed audio and emits a
+// sound_detection event, so a sound-only client only has to stream audio (no
+// input_audio_buffer.commit). It keeps the input buffer trimmed to one window
+// so a long stream stays bounded. Runs until done is closed. This is
+// independent of VAD: sound events are not speech.
+func handleSoundWindow(session *Session, t Transport, done chan struct{}) {
+	ticker := time.NewTicker(time.Duration(session.SoundDetectionHopMs) * time.Millisecond)
+	defer ticker.Stop()
+
+	for {
+		select {
+		case <-done:
+			return
+		case <-ticker.C:
+			classifySoundWindow(session, t)
+		}
+	}
+}
+
+// classifySoundWindow is one windowing tick: it snapshots the most recent
+// WindowMs of buffered audio (trimming the buffer so a long stream stays
+// bounded) and, when there is enough, classifies it and emits a sound_detection
+// event. Extracted from handleSoundWindow so it can be driven synchronously in
+// tests.
+func classifySoundWindow(session *Session, t Transport) {
+	const bytesPerSample = 2 // 16-bit mono PCM
+	sr := session.InputSampleRate
+	windowBytes := session.SoundDetectionWindowMs * sr / 1000 * bytesPerSample
+	minBytes := sr / 100 * bytesPerSample // ~10ms before classifying
+
+	session.AudioBufferLock.Lock()
+	// Keep only the most recent window so a long stream stays bounded.
+	if windowBytes > 0 && len(session.InputAudioBuffer) > windowBytes {
+		trimmed := make([]byte, windowBytes)
+		copy(trimmed, session.InputAudioBuffer[len(session.InputAudioBuffer)-windowBytes:])
+		session.InputAudioBuffer = trimmed
+	}
+	window := make([]byte, len(session.InputAudioBuffer))
+	copy(window, session.InputAudioBuffer)
+	session.AudioBufferLock.Unlock()
+
+	if len(window) < minBytes {
+		return // not enough audio buffered yet
+	}
+	path, err := writeWindowWAV(window, sr)
+	if err != nil {
+		xlog.Error("sound window: failed to write wav", "error", err)
+		return
+	}
+	if sderr := emitSoundDetection(context.Background(), t, session, generateItemID(), path); sderr != nil {
+		xlog.Error("sound window: detection failed", "error", sderr)
+	}
+	if rerr := os.Remove(path); rerr != nil {
+		xlog.Debug("sound window: temp cleanup failed", "error", rerr)
+	}
+}
+
+// writeWindowWAV writes mono 16-bit PCM to a temp WAV at the given sample rate
+// (the ced classifier reads the declared rate and resamples). Returns the path;
+// the caller removes it.
+func writeWindowWAV(pcm []byte, sampleRate int) (string, error) {
+	f, err := os.CreateTemp("", "realtime-sound-window-*.wav")
+	if err != nil {
+		return "", err
+	}
+	defer func() { _ = f.Close() }()
+	hdr := laudio.NewWAVHeaderWithRate(uint32(len(pcm)), uint32(sampleRate))
+	if err := hdr.Write(f); err != nil {
+		_ = os.Remove(f.Name())
+		return "", err
+	}
+	if _, err := f.Write(pcm); err != nil {
+		_ = os.Remove(f.Name())
+		return "", err
+	}
+	_ = f.Sync()
+	return f.Name(), nil
+}
+
 func runVAD(ctx context.Context, session *Session, adata []int16) ([]schema.VADSegment, error) {
 	soundIntBuffer := &audio.IntBuffer{
 		Format:         &audio.Format{SampleRate: localSampleRate, NumChannels: 1},
diff --git a/core/http/endpoints/openai/realtime_doubles_test.go b/core/http/endpoints/openai/realtime_doubles_test.go
index 727ce7dcc..10e608c17 100644
--- a/core/http/endpoints/openai/realtime_doubles_test.go
+++ b/core/http/endpoints/openai/realtime_doubles_test.go
@@ -75,6 +75,11 @@ type fakeModel struct {
 	transcribeDeltas []string
 	transcribeFinal  *schema.TranscriptionResult
 
+	// soundDetectionResult/soundDetectionErr drive the SoundDetection double so
+	// the sound-event path can be exercised deterministically.
+	soundDetectionResult *schema.SoundClassificationResult
+	soundDetectionErr    error
+
 	// Predict streaming: predictTokens are replayed through the token callback
 	// (simulating streamed LLM output); predictResp/predictErr are returned by
 	// the deferred predict function. predictChunkDeltas, when set, are delivered
@@ -95,6 +100,13 @@ func (m *fakeModel) Transcribe(context.Context, string, string, bool, bool, stri
 	return m.transcribeFinal, nil
 }
 
+func (m *fakeModel) SoundDetection(context.Context, string, int, float32) (*schema.SoundClassificationResult, error) {
+	if m.soundDetectionErr != nil {
+		return nil, m.soundDetectionErr
+	}
+	return m.soundDetectionResult, nil
+}
+
 func (m *fakeModel) Predict(_ context.Context, msgs schema.Messages, _, _, _ []string, cb func(string, backend.TokenUsage) bool, _ []types.ToolUnion, _ *types.ToolChoiceUnion, _, _ *int, _ map[string]float64) (func() (backend.LLMResponse, error), error) {
 	m.lastMessages = msgs
 	if m.predictErr != nil {
diff --git a/core/http/endpoints/openai/realtime_model.go b/core/http/endpoints/openai/realtime_model.go
index 789ce0a0d..6843a521d 100644
--- a/core/http/endpoints/openai/realtime_model.go
+++ b/core/http/endpoints/openai/realtime_model.go
@@ -31,10 +31,11 @@ var (
 // This means that we will fake an Any-to-Any model by overriding some of the gRPC client methods
 // which are for Any-To-Any models, but instead we will call a pipeline (for e.g STT->LLM->TTS)
 type wrappedModel struct {
-	TTSConfig           *config.ModelConfig
-	TranscriptionConfig *config.ModelConfig
-	LLMConfig           *config.ModelConfig
-	VADConfig           *config.ModelConfig
+	TTSConfig            *config.ModelConfig
+	TranscriptionConfig  *config.ModelConfig
+	LLMConfig            *config.ModelConfig
+	VADConfig            *config.ModelConfig
+	SoundDetectionConfig *config.ModelConfig
 
 	appConfig   *config.ApplicationConfig
 	modelLoader *model.ModelLoader
@@ -64,8 +65,9 @@ type anyToAnyModel struct {
 }
 
 type transcriptOnlyModel struct {
-	TranscriptionConfig *config.ModelConfig
-	VADConfig           *config.ModelConfig
+	TranscriptionConfig  *config.ModelConfig
+	VADConfig            *config.ModelConfig
+	SoundDetectionConfig *config.ModelConfig
 
 	appConfig   *config.ApplicationConfig
 	modelLoader *model.ModelLoader
@@ -80,6 +82,10 @@ func (m *transcriptOnlyModel) Transcribe(ctx context.Context, audio, language st
 	return backend.ModelTranscription(ctx, audio, language, translate, diarize, prompt, m.modelLoader, *m.TranscriptionConfig, m.appConfig)
 }
 
+func (m *transcriptOnlyModel) SoundDetection(ctx context.Context, audio string, topK int, threshold float32) (*schema.SoundClassificationResult, error) {
+	return modelSoundDetection(ctx, m.modelLoader, m.appConfig, m.SoundDetectionConfig, audio, topK, threshold)
+}
+
 func (m *transcriptOnlyModel) Predict(ctx context.Context, messages schema.Messages, images, videos, audios []string, tokenCallback func(string, backend.TokenUsage) bool, tools []types.ToolUnion, toolChoice *types.ToolChoiceUnion, logprobs *int, topLogprobs *int, logitBias map[string]float64) (func() (backend.LLMResponse, error), error) {
 	return nil, fmt.Errorf("predict operation not supported in transcript-only mode")
 }
@@ -108,6 +114,10 @@ func (m *wrappedModel) Transcribe(ctx context.Context, audio, language string, t
 	return backend.ModelTranscription(ctx, audio, language, translate, diarize, prompt, m.modelLoader, *m.TranscriptionConfig, m.appConfig)
 }
 
+func (m *wrappedModel) SoundDetection(ctx context.Context, audio string, topK int, threshold float32) (*schema.SoundClassificationResult, error) {
+	return modelSoundDetection(ctx, m.modelLoader, m.appConfig, m.SoundDetectionConfig, audio, topK, threshold)
+}
+
 func (m *wrappedModel) Predict(ctx context.Context, messages schema.Messages, images, videos, audios []string, tokenCallback func(string, backend.TokenUsage) bool, tools []types.ToolUnion, toolChoice *types.ToolChoiceUnion, logprobs *int, topLogprobs *int, logitBias map[string]float64) (func() (backend.LLMResponse, error), error) {
 	input := schema.OpenAIRequest{
 		Messages: messages,
@@ -399,6 +409,39 @@ func transcribeStream(ctx context.Context, ml *model.ModelLoader, transcriptionC
 	return final, nil
 }
 
+// modelSoundDetection runs sound-event classification against the session's
+// sound-classification model config, mirroring how Transcribe dispatches to
+// the transcription backend. Returns an error when no sound-detection model is
+// configured for the session.
+func modelSoundDetection(ctx context.Context, ml *model.ModelLoader, appConfig *config.ApplicationConfig, soundConfig *config.ModelConfig, audio string, topK int, threshold float32) (*schema.SoundClassificationResult, error) {
+	if soundConfig == nil {
+		return nil, fmt.Errorf("sound detection is not configured for this session")
+	}
+	return backend.ModelSoundDetection(ctx, backend.SoundDetectionRequest{
+		Audio:     audio,
+		TopK:      int32(topK),
+		Threshold: threshold,
+	}, ml, *soundConfig, appConfig)
+}
+
+// loadSoundDetectionConfig resolves the optional sound-classification model
+// config named by pipeline.sound_detection. Returns (nil, nil) when no model
+// is configured so sound detection stays additive and never blocks session
+// setup.
+func loadSoundDetectionConfig(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model.ModelLoader) (*config.ModelConfig, error) {
+	if pipeline.SoundDetection == "" {
+		return nil, nil
+	}
+	cfg, err := cl.LoadModelConfigFileByName(pipeline.SoundDetection, ml.ModelPath)
+	if err != nil {
+		return nil, fmt.Errorf("failed to load sound detection config: %w", err)
+	}
+	if valid, _ := cfg.Validate(); !valid {
+		return nil, fmt.Errorf("failed to validate sound detection config %q", pipeline.SoundDetection)
+	}
+	return cfg, nil
+}
+
 func newTranscriptionOnlyModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) (Model, *config.ModelConfig, error) {
 	cfgVAD, err := cl.LoadModelConfigFileByName(pipeline.VAD, ml.ModelPath)
 	if err != nil {
@@ -420,9 +463,15 @@ func newTranscriptionOnlyModel(pipeline *config.Pipeline, cl *config.ModelConfig
 		return nil, nil, fmt.Errorf("failed to validate config: %w", err)
 	}
 
+	cfgSound, err := loadSoundDetectionConfig(pipeline, cl, ml)
+	if err != nil {
+		return nil, nil, err
+	}
+
 	return &transcriptOnlyModel{
-		TranscriptionConfig: cfgSST,
-		VADConfig:           cfgVAD,
+		TranscriptionConfig:  cfgSST,
+		VADConfig:            cfgVAD,
+		SoundDetectionConfig: cfgSound,
 
 		confLoader:  cl,
 		modelLoader: ml,
@@ -430,6 +479,27 @@ func newTranscriptionOnlyModel(pipeline *config.Pipeline, cl *config.ModelConfig
 	}, cfgSST, nil
 }
 
+// newSoundDetectionOnlyModel builds a realtime model that only does sound-event
+// classification: no VAD, transcription, LLM or TTS stages are loaded. Used for
+// a sound-detection-only realtime session, which activates on sounds (not
+// speech) and is driven by client-side windowing (turn_detection none +
+// input_audio_buffer.commit) rather than the voice VAD loop.
+func newSoundDetectionOnlyModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) (Model, error) {
+	cfgSound, err := loadSoundDetectionConfig(pipeline, cl, ml)
+	if err != nil {
+		return nil, err
+	}
+	if cfgSound == nil {
+		return nil, fmt.Errorf("a sound-only realtime session requires pipeline.sound_detection")
+	}
+	return &transcriptOnlyModel{
+		SoundDetectionConfig: cfgSound,
+		confLoader:           cl,
+		modelLoader:          ml,
+		appConfig:            appConfig,
+	}, nil
+}
+
 // RealtimeRoutingContext is the bundle of routing dependencies the
 // realtime pipeline needs to consult router.Resolve per turn. nil-safe:
 // passing nil skips routing entirely and preserves the historical "one
@@ -544,11 +614,17 @@ func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model
 		return nil, fmt.Errorf("failed to validate config: %w", err)
 	}
 
+	cfgSound, err := loadSoundDetectionConfig(pipeline, cl, ml)
+	if err != nil {
+		return nil, err
+	}
+
 	wm := &wrappedModel{
-		TTSConfig:           cfgTTS,
-		TranscriptionConfig: cfgSST,
-		LLMConfig:           cfgLLM,
-		VADConfig:           cfgVAD,
+		TTSConfig:            cfgTTS,
+		TranscriptionConfig:  cfgSST,
+		LLMConfig:            cfgLLM,
+		VADConfig:            cfgVAD,
+		SoundDetectionConfig: cfgSound,
 
 		confLoader:  cl,
 		modelLoader: ml,
diff --git a/core/http/endpoints/openai/realtime_sound_detection.go b/core/http/endpoints/openai/realtime_sound_detection.go
new file mode 100644
index 000000000..6bc4efb47
--- /dev/null
+++ b/core/http/endpoints/openai/realtime_sound_detection.go
@@ -0,0 +1,48 @@
+package openai
+
+import (
+	"context"
+
+	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
+)
+
+// defaultSoundDetectionTopK is the number of scored tags requested per
+// committed utterance when the session does not pin its own top_k.
+const defaultSoundDetectionTopK = 5
+
+// emitSoundDetection classifies a committed utterance into sound-event tags and
+// emits a conversation.item.sound_detection event for it. It mirrors
+// emitTranscription's unary path: it calls the session's sound-event
+// classifier, maps the scored tags onto the server event, and sends it over
+// the transport. Sound detection is additive to transcription: its result is
+// emitted independently and a failure here is the caller's to log, never a
+// reason to abort the turn.
+func emitSoundDetection(ctx context.Context, t Transport, session *Session, itemID, audioPath string) error {
+	topK := session.SoundDetectionTopK
+	if topK <= 0 {
+		topK = defaultSoundDetectionTopK
+	}
+
+	result, err := session.ModelInterface.SoundDetection(ctx, audioPath, topK, session.SoundDetectionThreshold)
+	if err != nil {
+		return err
+	}
+
+	detections := make([]types.SoundDetectionTag, 0)
+	if result != nil {
+		for _, d := range result.Detections {
+			detections = append(detections, types.SoundDetectionTag{
+				Label: d.Label,
+				Score: d.Score,
+				Index: d.Index,
+			})
+		}
+	}
+
+	return t.SendEvent(types.ConversationItemSoundDetectionEvent{
+		ServerEventBase: types.ServerEventBase{EventID: "event_TODO"},
+		ItemID:          itemID,
+		ContentIndex:    0,
+		Detections:      detections,
+	})
+}
diff --git a/core/http/endpoints/openai/realtime_sound_detection_test.go b/core/http/endpoints/openai/realtime_sound_detection_test.go
new file mode 100644
index 000000000..e440e80c3
--- /dev/null
+++ b/core/http/endpoints/openai/realtime_sound_detection_test.go
@@ -0,0 +1,170 @@
+package openai
+
+import (
+	"context"
+	"encoding/binary"
+	"errors"
+	"os"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
+	"github.com/mudler/LocalAI/core/schema"
+)
+
+// emitSoundDetection classifies a committed utterance and emits a single
+// conversation.item.sound_detection event carrying the scored AudioSet tags.
+var _ = Describe("emitSoundDetection", func() {
+	It("emits a sound_detection event with the classifier's scored tags", func() {
+		session := &Session{
+			SoundDetectionEnabled: true,
+			SoundDetectionTopK:    5,
+			ModelInterface: &fakeModel{
+				soundDetectionResult: &schema.SoundClassificationResult{
+					Model: "ced",
+					Detections: []schema.SoundClassification{
+						{Index: 3, Label: "Baby cry, infant cry", Score: 0.91},
+						{Index: 7, Label: "Speech", Score: 0.42},
+					},
+				},
+			},
+		}
+		t := &fakeTransport{}
+
+		err := emitSoundDetection(context.Background(), t, session, "item1", "/tmp/x.wav")
+
+		Expect(err).ToNot(HaveOccurred())
+		Expect(t.countEvents(types.ServerEventTypeConversationItemSoundDetection)).To(Equal(1))
+
+		ev, ok := t.events[0].(types.ConversationItemSoundDetectionEvent)
+		Expect(ok).To(BeTrue())
+		Expect(ev.ItemID).To(Equal("item1"))
+		Expect(ev.ContentIndex).To(Equal(0))
+		Expect(ev.Detections).To(HaveLen(2))
+		Expect(ev.Detections[0].Label).To(Equal("Baby cry, infant cry"))
+		Expect(ev.Detections[0].Score).To(BeNumerically("~", 0.91, 1e-6))
+		Expect(ev.Detections[0].Index).To(Equal(3))
+		Expect(ev.Detections[1].Label).To(Equal("Speech"))
+	})
+
+	It("emits an event with no detections when the classifier returns none", func() {
+		session := &Session{
+			SoundDetectionEnabled: true,
+			ModelInterface: &fakeModel{
+				soundDetectionResult: &schema.SoundClassificationResult{Model: "ced"},
+			},
+		}
+		t := &fakeTransport{}
+
+		err := emitSoundDetection(context.Background(), t, session, "item1", "/tmp/x.wav")
+
+		Expect(err).ToNot(HaveOccurred())
+		Expect(t.countEvents(types.ServerEventTypeConversationItemSoundDetection)).To(Equal(1))
+		ev, ok := t.events[0].(types.ConversationItemSoundDetectionEvent)
+		Expect(ok).To(BeTrue())
+		Expect(ev.Detections).To(BeEmpty())
+	})
+
+	It("propagates the classifier error and emits no event", func() {
+		session := &Session{
+			SoundDetectionEnabled: true,
+			ModelInterface:        &fakeModel{soundDetectionErr: errors.New("boom")},
+		}
+		t := &fakeTransport{}
+
+		err := emitSoundDetection(context.Background(), t, session, "item1", "/tmp/x.wav")
+
+		Expect(err).To(HaveOccurred())
+		Expect(t.countEvents(types.ServerEventTypeConversationItemSoundDetection)).To(Equal(0))
+	})
+})
+
+// A sound-detection-only session (no transcription, no LLM) runs through
+// commitUtterance WITHOUT the voice/transcription path: it emits the
+// sound_detection event and stops - no transcription event, no LLM response.
+var _ = Describe("commitUtterance (sound-detection-only session)", func() {
+	It("emits sound detection and neither transcribes nor generates a response", func() {
+		session := &Session{
+			SoundDetectionEnabled:   true,
+			SoundDetectionTopK:      5,
+			InputAudioTranscription: nil, // sound-only: no transcription stage
+			ModelConfig:             &config.ModelConfig{},
+			ModelInterface: &fakeModel{
+				soundDetectionResult: &schema.SoundClassificationResult{
+					Model: "ced",
+					Detections: []schema.SoundClassification{
+						{Index: 23, Label: "Baby cry, infant cry", Score: 0.87},
+					},
+				},
+			},
+		}
+		tr := &fakeTransport{}
+		utt := make([]byte, 32) // non-empty PCM so commitUtterance proceeds
+
+		commitUtterance(context.Background(), utt, session, &Conversation{}, tr)
+
+		Expect(tr.countEvents(types.ServerEventTypeConversationItemSoundDetection)).To(Equal(1))
+		// No transcription happened.
+		Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionCompleted)).To(Equal(0))
+		// No LLM response was generated (sound-only has no LLM stage).
+		Expect(tr.countEvents(types.ServerEventTypeResponseDone)).To(Equal(0))
+	})
+})
+
+// Server-side windowing (option B): a sound-only session classifies the last
+// WindowMs of streamed audio per tick, with no client commit, and keeps the
+// input buffer trimmed to one window.
+var _ = Describe("classifySoundWindow (server-side windowing)", func() {
+	newSoundSession := func() (*Session, *fakeTransport) {
+		return &Session{
+			SoundDetectionEnabled:  true,
+			SoundDetectionTopK:     5,
+			SoundDetectionWindowMs: 200, // 200ms @ 16kHz mono16 = 6400 bytes
+			SoundDetectionHopMs:    20,
+			InputSampleRate:        16000,
+			ModelInterface: &fakeModel{
+				soundDetectionResult: &schema.SoundClassificationResult{
+					Model:      "ced",
+					Detections: []schema.SoundClassification{{Index: 23, Label: "Baby cry, infant cry", Score: 0.87}},
+				},
+			},
+		}, &fakeTransport{}
+	}
+
+	It("emits a sound_detection event and trims the buffer to one window", func() {
+		session, tr := newSoundSession()
+		session.InputAudioBuffer = make([]byte, 10000) // > 6400-byte window
+
+		classifySoundWindow(session, tr)
+
+		Expect(tr.countEvents(types.ServerEventTypeConversationItemSoundDetection)).To(Equal(1))
+		// buffer trimmed to exactly one window (200ms @ 16kHz mono 16-bit)
+		Expect(len(session.InputAudioBuffer)).To(Equal(6400))
+	})
+
+	It("does nothing when too little audio is buffered", func() {
+		session, tr := newSoundSession()
+		session.InputAudioBuffer = make([]byte, 100) // < ~10ms (320 bytes)
+
+		classifySoundWindow(session, tr)
+
+		Expect(tr.countEvents(types.ServerEventTypeConversationItemSoundDetection)).To(Equal(0))
+	})
+})
+
+var _ = Describe("writeWindowWAV", func() {
+	It("writes a mono 16-bit WAV header declaring the given sample rate", func() {
+		pcm := make([]byte, 640)
+		path, err := writeWindowWAV(pcm, 24000)
+		Expect(err).ToNot(HaveOccurred())
+		defer func() { _ = os.Remove(path) }()
+
+		data, err := os.ReadFile(path)
+		Expect(err).ToNot(HaveOccurred())
+		Expect(len(data)).To(BeNumerically(">=", 44+len(pcm)))
+		// SampleRate is a little-endian uint32 at byte offset 24 of a WAV header.
+		Expect(binary.LittleEndian.Uint32(data[24:28])).To(Equal(uint32(24000)))
+	})
+})
diff --git a/core/http/endpoints/openai/sound_classification.go b/core/http/endpoints/openai/sound_classification.go
new file mode 100644
index 000000000..b7e23f1b1
--- /dev/null
+++ b/core/http/endpoints/openai/sound_classification.go
@@ -0,0 +1,91 @@
+package openai
+
+import (
+	"io"
+	"net/http"
+	"os"
+	"path"
+	"path/filepath"
+
+	"github.com/labstack/echo/v4"
+	"github.com/mudler/LocalAI/core/backend"
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/http/middleware"
+	"github.com/mudler/LocalAI/core/schema"
+	model "github.com/mudler/LocalAI/pkg/model"
+
+	"github.com/mudler/xlog"
+)
+
+// SoundClassificationEndpoint runs an audio-tagging / sound-event
+// classification model (e.g. ced) over an uploaded clip and returns the
+// scored AudioSet tags in score-descending order. It mirrors the
+// transcription path: multipart audio upload -> temp file -> backend call.
+//
+// @Summary Classify sound events in audio (audio tagging).
+// @Tags audio
+// @accept multipart/form-data
+// @Param model formData string true "model"
+// @Param file formData file true "audio file"
+// @Param top_k formData int false "number of top tags to return (0 = backend default)"
+// @Param threshold formData number false "drop tags scoring below this value"
+// @Success 200 {object} schema.SoundClassificationResult
+// @Router /v1/audio/classification [post]
+func SoundClassificationEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) echo.HandlerFunc {
+	return func(c echo.Context) error {
+		input, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.OpenAIRequest)
+		if !ok || input.Model == "" {
+			return echo.ErrBadRequest
+		}
+
+		modelConfig, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.ModelConfig)
+		if !ok || modelConfig == nil {
+			return echo.ErrBadRequest
+		}
+
+		req := backend.SoundDetectionRequest{
+			TopK:      int32(parseFormInt(c, "top_k", 0)),
+			Threshold: float32(parseFormFloat(c, "threshold", 0)),
+		}
+
+		file, err := c.FormFile("file")
+		if err != nil {
+			return err
+		}
+		f, err := file.Open()
+		if err != nil {
+			return err
+		}
+		defer func() { _ = f.Close() }()
+
+		dir, err := os.MkdirTemp("", "sound-classification")
+		if err != nil {
+			return err
+		}
+		defer func() { _ = os.RemoveAll(dir) }()
+
+		dst := filepath.Join(dir, path.Base(file.Filename))
+		dstFile, err := os.Create(dst) // #nosec G304 -- dst is a server-created temp dir joined with path.Base of the upload name (no traversal)
+		if err != nil {
+			return err
+		}
+		if _, err := io.Copy(dstFile, f); err != nil {
+			xlog.Debug("Audio file copying error", "filename", file.Filename, "dst", dst, "error", err)
+			_ = dstFile.Close()
+			return err
+		}
+		_ = dstFile.Close()
+		req.Audio = dst
+
+		result, err := backend.ModelSoundDetection(c.Request().Context(), req, ml, *modelConfig, appConfig)
+		if err != nil {
+			xlog.Error("Sound classification failed",
+				"model", modelConfig.Name,
+				"audio", dst,
+				"error", err)
+			return err
+		}
+
+		return c.JSON(http.StatusOK, result)
+	}
+}
diff --git a/core/http/endpoints/openai/types/server_events.go b/core/http/endpoints/openai/types/server_events.go
index 8183a8b78..6b0a233ee 100644
--- a/core/http/endpoints/openai/types/server_events.go
+++ b/core/http/endpoints/openai/types/server_events.go
@@ -18,6 +18,7 @@ const (
 	ServerEventTypeConversationItemInputAudioTranscriptionDelta     ServerEventType = "conversation.item.input_audio_transcription.delta"
 	ServerEventTypeConversationItemInputAudioTranscriptionSegment   ServerEventType = "conversation.item.input_audio_transcription.segment"
 	ServerEventTypeConversationItemInputAudioTranscriptionFailed    ServerEventType = "conversation.item.input_audio_transcription.failed"
+	ServerEventTypeConversationItemSoundDetection                   ServerEventType = "conversation.item.sound_detection"
 	ServerEventTypeConversationItemTruncated                        ServerEventType = "conversation.item.truncated"
 	ServerEventTypeConversationItemDeleted                          ServerEventType = "conversation.item.deleted"
 	// ServerEventTypeConversationItemSpeaker is a LocalAI extension: it reports
@@ -473,6 +474,55 @@ func (m ConversationItemInputAudioTranscriptionCompletedEvent) MarshalJSON() ([]
 	return json.Marshal(shadow)
 }
 
+// SoundDetectionTag is one scored sound-event tag from the sound-event
+// classifier. Label is the human-readable AudioSet class name, Score is the
+// per-class probability (multi-label, independent), and Index is the class
+// index in the model ontology.
+type SoundDetectionTag struct {
+	// The human-readable AudioSet class name (e.g. "Baby cry, infant cry").
+	Label string `json:"label"`
+
+	// The per-class probability for this tag.
+	Score float32 `json:"score"`
+
+	// The class index in the model ontology.
+	Index int `json:"index"`
+}
+
+// Returned when a committed input audio window has been classified by a
+// sound-event-detection model. This is a LocalAI extension to the OpenAI
+// Realtime API: when a pipeline configures sound_detection, each VAD-committed
+// utterance is run through the classifier and the scored AudioSet tags are
+// emitted as this event, independent of (and alongside) transcription.
+type ConversationItemSoundDetectionEvent struct {
+	ServerEventBase
+	// The ID of the item.
+	ItemID string `json:"item_id"`
+
+	// The index of the content part in the item's content array.
+	ContentIndex int `json:"content_index"`
+
+	// The scored sound-event tags, in score-descending order.
+	Detections []SoundDetectionTag `json:"detections"`
+}
+
+func (m ConversationItemSoundDetectionEvent) ServerEventType() ServerEventType {
+	return ServerEventTypeConversationItemSoundDetection
+}
+
+func (m ConversationItemSoundDetectionEvent) MarshalJSON() ([]byte, error) {
+	type typeAlias ConversationItemSoundDetectionEvent
+	type typeWrapper struct {
+		typeAlias
+		Type ServerEventType `json:"type"`
+	}
+	shadow := typeWrapper{
+		typeAlias: typeAlias(m),
+		Type:      m.ServerEventType(),
+	}
+	return json.Marshal(shadow)
+}
+
 // Returned when the text value of an input audio transcription content part is updated with incremental transcription results.
 //
 // See https://platform.openai.com/docs/api-reference/realtime-server-events/conversation/item/input_audio_transcription/delta
diff --git a/core/http/react-ui/public/locales/en/models.json b/core/http/react-ui/public/locales/en/models.json
index 9af2d77a9..2bf7b018d 100644
--- a/core/http/react-ui/public/locales/en/models.json
+++ b/core/http/react-ui/public/locales/en/models.json
@@ -23,6 +23,7 @@
     "tts": "TTS",
     "stt": "STT",
     "diarization": "Diarization",
+    "soundClassification": "Sound Tagging",
     "soundGen": "Sound",
     "audioTransform": "Audio FX",
     "realtimeAudio": "Realtime Audio",
diff --git a/core/http/react-ui/src/pages/Models.jsx b/core/http/react-ui/src/pages/Models.jsx
index 3c40afc93..5f3a3908d 100644
--- a/core/http/react-ui/src/pages/Models.jsx
+++ b/core/http/react-ui/src/pages/Models.jsx
@@ -31,6 +31,7 @@ const FILTERS = [
   { key: 'tts', labelKey: 'filters.tts', icon: 'fa-microphone' },
   { key: 'transcript', labelKey: 'filters.stt', icon: 'fa-headphones' },
   { key: 'diarization', labelKey: 'filters.diarization', icon: 'fa-users' },
+  { key: 'sound_classification', labelKey: 'filters.soundClassification', icon: 'fa-ear-listen' },
   { key: 'sound_generation', labelKey: 'filters.soundGen', icon: 'fa-music' },
   { key: 'audio_transform', labelKey: 'filters.audioTransform', icon: 'fa-sliders' },
   { key: 'realtime_audio', labelKey: 'filters.realtimeAudio', icon: 'fa-tower-broadcast' },
diff --git a/core/http/react-ui/src/utils/capabilities.js b/core/http/react-ui/src/utils/capabilities.js
index 95dd4bb7a..5d30a472d 100644
--- a/core/http/react-ui/src/utils/capabilities.js
+++ b/core/http/react-ui/src/utils/capabilities.js
@@ -15,6 +15,7 @@ export const CAP_SOUND_GENERATION = 'FLAG_SOUND_GENERATION'
 export const CAP_TOKENIZE = 'FLAG_TOKENIZE'
 export const CAP_VAD = 'FLAG_VAD'
 export const CAP_DIARIZATION = 'FLAG_DIARIZATION'
+export const CAP_SOUND_CLASSIFICATION = 'FLAG_SOUND_CLASSIFICATION'
 export const CAP_VIDEO = 'FLAG_VIDEO'
 export const CAP_DETECTION = 'FLAG_DETECTION'
 export const CAP_FACE_RECOGNITION = 'FLAG_FACE_RECOGNITION'
diff --git a/core/http/routes/localai.go b/core/http/routes/localai.go
index 1df1d5d8c..212f379f0 100644
--- a/core/http/routes/localai.go
+++ b/core/http/routes/localai.go
@@ -284,13 +284,14 @@ func RegisterLocalAIRoutes(router *echo.Echo,
 			// Categorized endpoint groups for structured discovery
 			"endpoint_groups": map[string]any{
 				"openai_compatible": map[string]string{
-					"models":           "/v1/models",
-					"chat_completions": "/v1/chat/completions",
-					"completions":      "/v1/completions",
-					"embeddings":       "/v1/embeddings",
-					"transcription":    "/v1/audio/transcriptions",
-					"diarization":      "/v1/audio/diarization",
-					"image_generation": "/v1/images/generations",
+					"models":               "/v1/models",
+					"chat_completions":     "/v1/chat/completions",
+					"completions":          "/v1/completions",
+					"embeddings":           "/v1/embeddings",
+					"transcription":        "/v1/audio/transcriptions",
+					"diarization":          "/v1/audio/diarization",
+					"sound_classification": "/v1/audio/classification",
+					"image_generation":     "/v1/images/generations",
 				},
 				"config_management": map[string]string{
 					"config_metadata": "/api/models/config-metadata",
@@ -342,7 +343,7 @@ func RegisterLocalAIRoutes(router *echo.Echo,
 					"delete": "/stores/delete",
 				},
 				"docs": map[string]string{
-					"swagger": "/swagger/index.html",
+					"swagger":      "/swagger/index.html",
 					"instructions": "/api/instructions",
 				},
 			},
diff --git a/core/http/routes/openai.go b/core/http/routes/openai.go
index 5252edfdd..32603f567 100644
--- a/core/http/routes/openai.go
+++ b/core/http/routes/openai.go
@@ -200,6 +200,23 @@ func RegisterOpenAIRoutes(app *echo.Echo,
 	app.POST("/v1/audio/diarization", diarizationHandler, diarizationMiddleware...)
 	app.POST("/audio/diarization", diarizationHandler, diarizationMiddleware...)
 
+	soundClassificationHandler := openai.SoundClassificationEndpoint(application.ModelConfigLoader(), application.ModelLoader(), application.ApplicationConfig())
+	soundClassificationMiddleware := []echo.MiddlewareFunc{
+		traceMiddleware,
+		re.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_SOUND_CLASSIFICATION)),
+		re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.OpenAIRequest) }),
+		func(next echo.HandlerFunc) echo.HandlerFunc {
+			return func(c echo.Context) error {
+				if err := re.SetOpenAIRequest(c); err != nil {
+					return err
+				}
+				return next(c)
+			}
+		},
+	}
+	app.POST("/v1/audio/classification", soundClassificationHandler, soundClassificationMiddleware...)
+	app.POST("/audio/classification", soundClassificationHandler, soundClassificationMiddleware...)
+
 	audioSpeechHandler := localai.TTSEndpoint(application.ModelConfigLoader(), application.ModelLoader(), application.ApplicationConfig())
 	audioSpeechMiddleware := []echo.MiddlewareFunc{
 		nodeHeaderMiddleware,
diff --git a/core/http/routes/ui_api.go b/core/http/routes/ui_api.go
index f398d71cd..e26894273 100644
--- a/core/http/routes/ui_api.go
+++ b/core/http/routes/ui_api.go
@@ -42,21 +42,22 @@ const (
 // usecaseFilters maps UI filter keys to ModelConfigUsecase flags for
 // capability-based gallery filtering.
 var usecaseFilters = map[string]config.ModelConfigUsecase{
-	config.UsecaseChat:            config.FLAG_CHAT,
-	config.UsecaseImage:           config.FLAG_IMAGE,
-	config.UsecaseVideo:           config.FLAG_VIDEO,
-	config.UsecaseVision:          config.FLAG_VISION,
-	config.UsecaseTTS:             config.FLAG_TTS,
-	config.UsecaseTranscript:      config.FLAG_TRANSCRIPT,
-	config.UsecaseSoundGeneration: config.FLAG_SOUND_GENERATION,
-	config.UsecaseEmbeddings:      config.FLAG_EMBEDDINGS,
-	config.UsecaseRerank:          config.FLAG_RERANK,
-	config.UsecaseDetection:       config.FLAG_DETECTION,
-	config.UsecaseVAD:             config.FLAG_VAD,
-	config.UsecaseAudioTransform:  config.FLAG_AUDIO_TRANSFORM,
-	config.UsecaseDiarization:     config.FLAG_DIARIZATION,
-	config.UsecaseRealtimeAudio:   config.FLAG_REALTIME_AUDIO,
-	config.UsecaseTokenClassify:   config.FLAG_TOKEN_CLASSIFY,
+	config.UsecaseChat:                config.FLAG_CHAT,
+	config.UsecaseImage:               config.FLAG_IMAGE,
+	config.UsecaseVideo:               config.FLAG_VIDEO,
+	config.UsecaseVision:              config.FLAG_VISION,
+	config.UsecaseTTS:                 config.FLAG_TTS,
+	config.UsecaseTranscript:          config.FLAG_TRANSCRIPT,
+	config.UsecaseSoundGeneration:     config.FLAG_SOUND_GENERATION,
+	config.UsecaseEmbeddings:          config.FLAG_EMBEDDINGS,
+	config.UsecaseRerank:              config.FLAG_RERANK,
+	config.UsecaseDetection:           config.FLAG_DETECTION,
+	config.UsecaseVAD:                 config.FLAG_VAD,
+	config.UsecaseAudioTransform:      config.FLAG_AUDIO_TRANSFORM,
+	config.UsecaseDiarization:         config.FLAG_DIARIZATION,
+	config.UsecaseSoundClassification: config.FLAG_SOUND_CLASSIFICATION,
+	config.UsecaseRealtimeAudio:       config.FLAG_REALTIME_AUDIO,
+	config.UsecaseTokenClassify:       config.FLAG_TOKEN_CLASSIFY,
 }
 
 // extractHFRepo tries to find a HuggingFace repo ID from model overrides or URLs.
diff --git a/core/schema/sound_classification.go b/core/schema/sound_classification.go
new file mode 100644
index 000000000..decd7c7e3
--- /dev/null
+++ b/core/schema/sound_classification.go
@@ -0,0 +1,19 @@
+package schema
+
+// SoundClassification is one scored sound-event tag. Score is the
+// per-class probability (multi-label, independent), Index is the class
+// index in the model ontology, and Label is the human-readable AudioSet
+// class name (e.g. "Baby cry, infant cry").
+type SoundClassification struct {
+	Index int     `json:"index"`
+	Label string  `json:"label"`
+	Score float32 `json:"score"`
+}
+
+// SoundClassificationResult is the JSON response of the
+// /v1/audio/classification endpoint: the model name and the scored tags
+// in score-descending order.
+type SoundClassificationResult struct {
+	Model      string                `json:"model"`
+	Detections []SoundClassification `json:"detections"`
+}
diff --git a/core/services/nodes/health_mock_test.go b/core/services/nodes/health_mock_test.go
index f14dd133d..86ac5cdcb 100644
--- a/core/services/nodes/health_mock_test.go
+++ b/core/services/nodes/health_mock_test.go
@@ -169,6 +169,9 @@ func (c *fakeBackendClient) SoundGeneration(_ context.Context, _ *pb.SoundGenera
 func (c *fakeBackendClient) Detect(_ context.Context, _ *pb.DetectOptions, _ ...ggrpc.CallOption) (*pb.DetectResponse, error) {
 	return nil, nil
 }
+func (c *fakeBackendClient) SoundDetection(_ context.Context, _ *pb.SoundDetectionRequest, _ ...ggrpc.CallOption) (*pb.SoundDetectionResponse, error) {
+	return nil, nil
+}
 func (c *fakeBackendClient) Depth(_ context.Context, _ *pb.DepthRequest, _ ...ggrpc.CallOption) (*pb.DepthResponse, error) {
 	return nil, nil
 }
diff --git a/core/services/nodes/inflight_test.go b/core/services/nodes/inflight_test.go
index 85de0ac8e..2eb90f9c6 100644
--- a/core/services/nodes/inflight_test.go
+++ b/core/services/nodes/inflight_test.go
@@ -99,6 +99,9 @@ func (f *fakeGRPCBackend) SoundGeneration(_ context.Context, _ *pb.SoundGenerati
 func (f *fakeGRPCBackend) Detect(_ context.Context, _ *pb.DetectOptions, _ ...ggrpc.CallOption) (*pb.DetectResponse, error) {
 	return &pb.DetectResponse{}, nil
 }
+func (f *fakeGRPCBackend) SoundDetection(_ context.Context, _ *pb.SoundDetectionRequest, _ ...ggrpc.CallOption) (*pb.SoundDetectionResponse, error) {
+	return &pb.SoundDetectionResponse{}, nil
+}
 
 func (f *fakeGRPCBackend) Depth(_ context.Context, _ *pb.DepthRequest, _ ...ggrpc.CallOption) (*pb.DepthResponse, error) {
 	return &pb.DepthResponse{}, nil
diff --git a/docs/content/features/audio-classification.md b/docs/content/features/audio-classification.md
new file mode 100644
index 000000000..f70674dc9
--- /dev/null
+++ b/docs/content/features/audio-classification.md
@@ -0,0 +1,55 @@
++++
+disableToc = false
+title = "Sound Classification"
+weight = 18
+url = "/features/audio-classification/"
++++
+
+Sound-event classification (audio tagging) answers the question **"what am I hearing?"** - given an audio clip, it returns a list of scored [AudioSet](https://research.google.com/audioset/) labels (e.g. *Baby cry, infant cry*, *Glass breaking*, *Dog bark*, *Alarm*).
+
+LocalAI exposes this through the `/v1/audio/classification` endpoint, modelled after `/v1/audio/transcriptions`. The reference backend is **[ced.cpp](https://github.com/mudler/ced.cpp)** (CED, a 527-class AudioSet tagger), a small ViT over a log-mel spectrogram ported to ggml with full PyTorch parity. Apache-2.0 weights are redistributable as GGUF.
+
+Because classification is exposed as a regular OpenAI-style endpoint, any HTTP client works - there is no Python dependency on the consumer side.
+
+## Endpoint
+
+```
+POST /v1/audio/classification
+Content-Type: multipart/form-data
+```
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `file` | file (required) | audio file in any format `ffmpeg` accepts |
+| `model` | string (required) | name of the sound-classification-capable model (e.g. `ced-base`) |
+| `top_k` | int | number of top tags to return (0 = backend default) |
+| `threshold` | float | drop tags scoring below this value |
+
+### Response
+
+```json
+{
+  "model": "ced-base",
+  "detections": [
+    {"index": 23, "label": "Baby cry, infant cry", "score": 0.87},
+    {"index": 22, "label": "Crying, sobbing", "score": 0.41}
+  ]
+}
+```
+
+Detections are returned in score-descending order. Scores are per-class probabilities (multi-label, independent), so they do not sum to 1.
+
+## Example
+
+```bash
+curl http://localhost:8080/v1/audio/classification \
+  -H "Content-Type: multipart/form-data" \
+  -F file="@/path/to/clip.wav" \
+  -F model="ced-base" \
+  -F top_k=10
+```
+
+## See also
+
+- [Audio to Text]({{% relref "audio-to-text" %}}) - speech transcription
+- [Speaker Diarization]({{% relref "audio-diarization" %}}) - who spoke when
diff --git a/docs/content/features/audio-diarization.md b/docs/content/features/audio-diarization.md
index 36d9437dc..b2cfa32b0 100644
--- a/docs/content/features/audio-diarization.md
+++ b/docs/content/features/audio-diarization.md
@@ -152,3 +152,7 @@ curl http://localhost:8080/v1/audio/diarization \
 - **Speaker identity across files**: speaker IDs (`SPEAKER_00`, `SPEAKER_01`, …) are local to each request. To track the same person across multiple recordings, combine `/v1/audio/diarization` with `/v1/voice/embed` (speaker embedding) and maintain your own embedding store.
 - **Hints vs. forces**: `num_speakers` overrides clustering when set; `min_speakers` / `max_speakers` are advisory and only honored by backends that expose a range hint. vibevoice.cpp ignores them — its model picks the count itself.
 - **Sample rate**: input is automatically converted to 16 kHz mono via ffmpeg before the backend sees it; sherpa-onnx pyannote-3.0 requires 16 kHz.
+
+## See also
+
+- [Sound Classification]({{% relref "audio-classification" %}}) - tag non-speech sound events (alarms, glass breaking, baby cry) in a clip.
diff --git a/docs/content/features/backends.md b/docs/content/features/backends.md
index 1713fabfb..4b7445a98 100644
--- a/docs/content/features/backends.md
+++ b/docs/content/features/backends.md
@@ -128,6 +128,7 @@ LocalAI supports various types of backends:
 - **Speech-to-Text Backends**: For transcription (e.g., whisper.cpp, parakeet.cpp, faster-whisper, NeMo)
 - **Text-to-Speech Backends**: For speech synthesis (e.g., piper, Kokoro, VibeVoice, Qwen3-TTS)
 - **Sound Generation Backends**: For music and audio generation (e.g., ACE-Step)
+- **Sound Classification Backends**: For sound-event classification / audio tagging - identifying everyday sounds like baby cry, glass breaking, alarms (e.g., ced.cpp)
 - **Image & Video Generation Backends**: For diffusion models (e.g., stable-diffusion.cpp, diffusers)
 - **Vision & Detection Backends**: For object detection, segmentation, depth, and face/voice recognition (e.g., rf-detr.cpp, locate-anything.cpp, sam3.cpp, insightface)
 - **Audio Processing Backends**: For voice activity detection and audio enhancement (e.g., Silero VAD, LocalVQE)
diff --git a/docs/content/whats-new.md b/docs/content/whats-new.md
index 170ccae98..6ff7979cc 100644
--- a/docs/content/whats-new.md
+++ b/docs/content/whats-new.md
@@ -15,6 +15,7 @@ You can see the release notes [here](https://github.com/mudler/LocalAI/releases)
 - **April 2026**: [Audio Transform](/features/audio-transform/) — generic audio-in / audio-out endpoint with optional reference signal. First implementation: [LocalVQE](https://github.com/localai-org/LocalVQE) C++ backend (joint AEC + noise suppression + dereverberation, DeepVQE-style). Both batch (`POST /audio/transformations`) and bidirectional WebSocket streaming (`/audio/transformations/stream`). Studio "Transform" tab with synchronized waveform players for input / reference / output.
 - **April 2026**: [Face recognition backend](/features/face-recognition/) — `insightface`-powered 1:1 verification, 1:N identification, face embedding, face detection, and demographic analysis. Ships both a non-commercial `buffalo_l` model and an Apache 2.0 OpenCV Zoo alternative.
 - **May 2026**: [Speaker diarization](/features/audio-diarization/) — new `/v1/audio/diarization` endpoint returning "who spoke when" segments. Backed by `sherpa-onnx` (pyannote-3.0 + speaker embeddings + clustering) for pure diarization, and `vibevoice-cpp` for diarization bundled with long-form ASR. Supports `json` / `verbose_json` / `rttm` response formats.
+- **June 2026**: [Sound classification](/features/audio-classification/) — new `/v1/audio/classification` endpoint for audio tagging / sound-event classification, returning scored [AudioSet](https://research.google.com/audioset/) labels (baby cry, glass breaking, alarms, ...). Backed by [ced.cpp](https://github.com/mudler/ced.cpp), a 527-class AudioSet tagger ported to ggml.
 - **June 2026**: [PII analyze / redact API](/features/middleware/#analyze--redact-api) — the PII detection pipeline (NER + restricted-regex pattern tiers) is now a standalone service: `POST /api/pii/analyze` returns detected entity spans and `POST /api/pii/redact` returns the sanitised text (or `400 pii_blocked`), without routing a chat request through the middleware. Events gain an `origin` (`middleware` / `proxy` / `pii_analyze` / `pii_redact`) so `/api/pii/events` can be filtered by source.
 - **June 2026**: Concurrent scoring and PII NER on llama.cpp — the `Score` (router classifier) and `TokenClassify` (PII NER) primitives now ride llama.cpp's server task queue instead of locking the context, so they run concurrently with chat/completion/embedding traffic and with each other. The `known_usecases` restriction that forced dedicated scorer/NER model configs on llama-cpp is lifted, repeated scoring calls reuse the prompt KV cache across candidates, and scoring inputs are no longer capped by the physical batch size.
 
diff --git a/gallery/ced.yaml b/gallery/ced.yaml
new file mode 100644
index 000000000..171b0d0d8
--- /dev/null
+++ b/gallery/ced.yaml
@@ -0,0 +1,7 @@
+---
+name: "ced-sound-classification"
+
+config_file: |
+  backend: ced
+  known_usecases:
+    - sound_classification
diff --git a/gallery/index.yaml b/gallery/index.yaml
index fcf180e13..cde505d72 100644
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -3077,6 +3077,190 @@
       - transcript
     parameters:
       model: tiny
+- name: ced-base-f16
+  url: github:mudler/LocalAI/gallery/ced.yaml@master
+  urls:
+    - https://huggingface.co/mudler/ced-gguf
+    - https://huggingface.co/mispeech/ced-base
+  description: |
+    CED (Consistent Ensemble Distillation, Xiaomi) is a sound-event classifier that tags everyday sounds (baby cry, footsteps, glass breaking, alarms, dog bark, ...) into the 527-class AudioSet ontology. This is the f16 GGUF for the ced backend (a standalone C++/ggml port). Recommended default: fastest on CPU and near-lossless. Use POST /v1/audio/classification, or the realtime websocket API for live recognition.
+  license: apache-2.0
+  tags:
+    - audio-classification
+    - sound-event-detection
+    - audio-tagging
+    - audioset
+    - ced
+    - gguf
+    - f16
+  overrides:
+    parameters:
+      model: ced-base-f16.gguf
+  files:
+    - filename: ced-base-f16.gguf
+      sha256: 5c058d9f7b737167195fa54eae4a2ae17658ac2c0a8073f7f116ba006b2ab32c
+      uri: https://huggingface.co/mudler/ced-gguf/resolve/main/ced-base-f16.gguf
+- name: ced-base-q8
+  url: github:mudler/LocalAI/gallery/ced.yaml@master
+  urls:
+    - https://huggingface.co/mudler/ced-gguf
+    - https://huggingface.co/mispeech/ced-base
+  description: |
+    CED (Consistent Ensemble Distillation, Xiaomi) sound-event classifier over the 527-class AudioSet ontology (baby cry, footsteps, glass breaking, alarms, dog bark, ...). This is the q8_0 GGUF for the ced backend: smallest footprint (~88 MB, ~6.5x less memory than the PyTorch reference) and near-lossless (identical top-5 tags). Use POST /v1/audio/classification, or the realtime websocket API for live recognition.
+  license: apache-2.0
+  tags:
+    - audio-classification
+    - sound-event-detection
+    - audio-tagging
+    - audioset
+    - ced
+    - gguf
+    - q8
+  overrides:
+    parameters:
+      model: ced-base-q8_0.gguf
+  files:
+    - filename: ced-base-q8_0.gguf
+      sha256: bd34a7710169f0047fea17267965d211f967828ab25ba6fb9d3768481393f6e2
+      uri: https://huggingface.co/mudler/ced-gguf/resolve/main/ced-base-q8_0.gguf
+- name: ced-tiny-f16
+  url: github:mudler/LocalAI/gallery/ced.yaml@master
+  urls:
+    - https://huggingface.co/mudler/ced-gguf
+    - https://huggingface.co/mispeech/ced-tiny
+  description: |
+    CED-tiny (5.5M params, Pi-class / edge) sound-event classifier over the 527-class AudioSet ontology (baby cry, footsteps, glass breaking, alarms, dog bark, ...). f16 GGUF for the ced backend (recommended (fastest on CPU)). Use POST /v1/audio/classification, or the realtime websocket API for live recognition.
+  license: apache-2.0
+  tags:
+    - audio-classification
+    - sound-event-detection
+    - audio-tagging
+    - audioset
+    - ced
+    - gguf
+    - f16
+  overrides:
+    parameters:
+      model: ced-tiny-f16.gguf
+  files:
+    - filename: ced-tiny-f16.gguf
+      sha256: af8b81c67bae50bfca4ea83dbba77b3bae4fa6180d36c17d6877f7700aeeb77b
+      uri: https://huggingface.co/mudler/ced-gguf/resolve/main/ced-tiny-f16.gguf
+- name: ced-tiny-q8
+  url: github:mudler/LocalAI/gallery/ced.yaml@master
+  urls:
+    - https://huggingface.co/mudler/ced-gguf
+    - https://huggingface.co/mispeech/ced-tiny
+  description: |
+    CED-tiny (5.5M params, Pi-class / edge) sound-event classifier over the 527-class AudioSet ontology (baby cry, footsteps, glass breaking, alarms, dog bark, ...). q8_0 GGUF for the ced backend (smallest footprint, near-lossless). Use POST /v1/audio/classification, or the realtime websocket API for live recognition.
+  license: apache-2.0
+  tags:
+    - audio-classification
+    - sound-event-detection
+    - audio-tagging
+    - audioset
+    - ced
+    - gguf
+    - q8
+  overrides:
+    parameters:
+      model: ced-tiny-q8_0.gguf
+  files:
+    - filename: ced-tiny-q8_0.gguf
+      sha256: 48bee4e2fc3cc85d7806e03471db24e77fda6c2a2e81ffe9ef67caebaf2bd674
+      uri: https://huggingface.co/mudler/ced-gguf/resolve/main/ced-tiny-q8_0.gguf
+- name: ced-mini-f16
+  url: github:mudler/LocalAI/gallery/ced.yaml@master
+  urls:
+    - https://huggingface.co/mudler/ced-gguf
+    - https://huggingface.co/mispeech/ced-mini
+  description: |
+    CED-mini (9.6M params, low-power) sound-event classifier over the 527-class AudioSet ontology (baby cry, footsteps, glass breaking, alarms, dog bark, ...). f16 GGUF for the ced backend (recommended (fastest on CPU)). Use POST /v1/audio/classification, or the realtime websocket API for live recognition.
+  license: apache-2.0
+  tags:
+    - audio-classification
+    - sound-event-detection
+    - audio-tagging
+    - audioset
+    - ced
+    - gguf
+    - f16
+  overrides:
+    parameters:
+      model: ced-mini-f16.gguf
+  files:
+    - filename: ced-mini-f16.gguf
+      sha256: 3c6a8936c77312f07a9ecb7b4bbbcb1f93ad137920ca6656bae9306571fb0c03
+      uri: https://huggingface.co/mudler/ced-gguf/resolve/main/ced-mini-f16.gguf
+- name: ced-mini-q8
+  url: github:mudler/LocalAI/gallery/ced.yaml@master
+  urls:
+    - https://huggingface.co/mudler/ced-gguf
+    - https://huggingface.co/mispeech/ced-mini
+  description: |
+    CED-mini (9.6M params, low-power) sound-event classifier over the 527-class AudioSet ontology (baby cry, footsteps, glass breaking, alarms, dog bark, ...). q8_0 GGUF for the ced backend (smallest footprint, near-lossless). Use POST /v1/audio/classification, or the realtime websocket API for live recognition.
+  license: apache-2.0
+  tags:
+    - audio-classification
+    - sound-event-detection
+    - audio-tagging
+    - audioset
+    - ced
+    - gguf
+    - q8
+  overrides:
+    parameters:
+      model: ced-mini-q8_0.gguf
+  files:
+    - filename: ced-mini-q8_0.gguf
+      sha256: 7062cef9ca31459f339ce24a5914f3b65bde76ffd9ca4fc924a040327ff292bd
+      uri: https://huggingface.co/mudler/ced-gguf/resolve/main/ced-mini-q8_0.gguf
+- name: ced-small-f16
+  url: github:mudler/LocalAI/gallery/ced.yaml@master
+  urls:
+    - https://huggingface.co/mudler/ced-gguf
+    - https://huggingface.co/mispeech/ced-small
+  description: |
+    CED-small (22M params, balanced size/accuracy) sound-event classifier over the 527-class AudioSet ontology (baby cry, footsteps, glass breaking, alarms, dog bark, ...). f16 GGUF for the ced backend (recommended (fastest on CPU)). Use POST /v1/audio/classification, or the realtime websocket API for live recognition.
+  license: apache-2.0
+  tags:
+    - audio-classification
+    - sound-event-detection
+    - audio-tagging
+    - audioset
+    - ced
+    - gguf
+    - f16
+  overrides:
+    parameters:
+      model: ced-small-f16.gguf
+  files:
+    - filename: ced-small-f16.gguf
+      sha256: c391ed8697a1b08d7c1a463e4940a5c3a2f670e0544ab0d8ee23b544583602a8
+      uri: https://huggingface.co/mudler/ced-gguf/resolve/main/ced-small-f16.gguf
+- name: ced-small-q8
+  url: github:mudler/LocalAI/gallery/ced.yaml@master
+  urls:
+    - https://huggingface.co/mudler/ced-gguf
+    - https://huggingface.co/mispeech/ced-small
+  description: |
+    CED-small (22M params, balanced size/accuracy) sound-event classifier over the 527-class AudioSet ontology (baby cry, footsteps, glass breaking, alarms, dog bark, ...). q8_0 GGUF for the ced backend (smallest footprint, near-lossless). Use POST /v1/audio/classification, or the realtime websocket API for live recognition.
+  license: apache-2.0
+  tags:
+    - audio-classification
+    - sound-event-detection
+    - audio-tagging
+    - audioset
+    - ced
+    - gguf
+    - q8
+  overrides:
+    parameters:
+      model: ced-small-q8_0.gguf
+  files:
+    - filename: ced-small-q8_0.gguf
+      sha256: 888275fe43491cf832fb7b8125eccba34d1120745166f40cc12e93b79dea8efe
+      uri: https://huggingface.co/mudler/ced-gguf/resolve/main/ced-small-q8_0.gguf
 - name: omnilingual-0.3b-ctc-q8-sherpa
   url: github:mudler/LocalAI/gallery/sherpa-onnx-asr.yaml@master
   urls:
diff --git a/pkg/grpc/backend.go b/pkg/grpc/backend.go
index 44912c04b..f4cd511ac 100644
--- a/pkg/grpc/backend.go
+++ b/pkg/grpc/backend.go
@@ -82,6 +82,8 @@ type Backend interface {
 
 	Diarize(ctx context.Context, in *pb.DiarizeRequest, opts ...grpc.CallOption) (*pb.DiarizeResponse, error)
 
+	SoundDetection(ctx context.Context, in *pb.SoundDetectionRequest, opts ...grpc.CallOption) (*pb.SoundDetectionResponse, error)
+
 	AudioEncode(ctx context.Context, in *pb.AudioEncodeRequest, opts ...grpc.CallOption) (*pb.AudioEncodeResult, error)
 	AudioDecode(ctx context.Context, in *pb.AudioDecodeRequest, opts ...grpc.CallOption) (*pb.AudioDecodeResult, error)
 
diff --git a/pkg/grpc/base/base.go b/pkg/grpc/base/base.go
index c67c832a7..55b0d96b6 100644
--- a/pkg/grpc/base/base.go
+++ b/pkg/grpc/base/base.go
@@ -110,6 +110,10 @@ func (llm *Base) Diarize(*pb.DiarizeRequest) (pb.DiarizeResponse, error) {
 	return pb.DiarizeResponse{}, fmt.Errorf("unimplemented")
 }
 
+func (llm *Base) SoundDetection(context.Context, *pb.SoundDetectionRequest) (*pb.SoundDetectionResponse, error) {
+	return nil, fmt.Errorf("unimplemented")
+}
+
 func (llm *Base) TokenizeString(opts *pb.PredictOptions) (pb.TokenizationResponse, error) {
 	return pb.TokenizationResponse{}, fmt.Errorf("unimplemented")
 }
diff --git a/pkg/grpc/client.go b/pkg/grpc/client.go
index 8dd2b2c2e..b80c74bcd 100644
--- a/pkg/grpc/client.go
+++ b/pkg/grpc/client.go
@@ -616,6 +616,24 @@ func (c *Client) Diarize(ctx context.Context, in *pb.DiarizeRequest, opts ...grp
 	return client.Diarize(ctx, in, opts...)
 }
 
+func (c *Client) SoundDetection(ctx context.Context, in *pb.SoundDetectionRequest, opts ...grpc.CallOption) (*pb.SoundDetectionResponse, error) {
+	if !c.parallel {
+		c.opMutex.Lock()
+		defer c.opMutex.Unlock()
+	}
+	c.setBusy(true)
+	defer c.setBusy(false)
+	c.wdMark()
+	defer c.wdUnMark()
+	conn, err := c.dial()
+	if err != nil {
+		return nil, err
+	}
+	defer func() { _ = conn.Close() }()
+	client := pb.NewBackendClient(conn)
+	return client.SoundDetection(ctx, in, opts...)
+}
+
 func (c *Client) Detect(ctx context.Context, in *pb.DetectOptions, opts ...grpc.CallOption) (*pb.DetectResponse, error) {
 	if !c.parallel {
 		c.opMutex.Lock()
diff --git a/pkg/grpc/embed.go b/pkg/grpc/embed.go
index c7c6406ca..2251dc707 100644
--- a/pkg/grpc/embed.go
+++ b/pkg/grpc/embed.go
@@ -153,6 +153,10 @@ func (e *embedBackend) Diarize(ctx context.Context, in *pb.DiarizeRequest, opts
 	return e.s.Diarize(ctx, in)
 }
 
+func (e *embedBackend) SoundDetection(ctx context.Context, in *pb.SoundDetectionRequest, opts ...grpc.CallOption) (*pb.SoundDetectionResponse, error) {
+	return e.s.SoundDetection(ctx, in)
+}
+
 func (e *embedBackend) AudioEncode(ctx context.Context, in *pb.AudioEncodeRequest, opts ...grpc.CallOption) (*pb.AudioEncodeResult, error) {
 	return e.s.AudioEncode(ctx, in)
 }
diff --git a/pkg/grpc/interface.go b/pkg/grpc/interface.go
index 888e36a0c..282735612 100644
--- a/pkg/grpc/interface.go
+++ b/pkg/grpc/interface.go
@@ -40,6 +40,7 @@ type AIModel interface {
 
 	VAD(*pb.VADRequest) (pb.VADResponse, error)
 	Diarize(*pb.DiarizeRequest) (pb.DiarizeResponse, error)
+	SoundDetection(context.Context, *pb.SoundDetectionRequest) (*pb.SoundDetectionResponse, error)
 
 	AudioEncode(*pb.AudioEncodeRequest) (*pb.AudioEncodeResult, error)
 	AudioDecode(*pb.AudioDecodeRequest) (*pb.AudioDecodeResult, error)
diff --git a/pkg/grpc/server.go b/pkg/grpc/server.go
index 35afb502c..53522f114 100644
--- a/pkg/grpc/server.go
+++ b/pkg/grpc/server.go
@@ -435,6 +435,14 @@ func (s *server) Diarize(ctx context.Context, in *pb.DiarizeRequest) (*pb.Diariz
 	return &res, nil
 }
 
+func (s *server) SoundDetection(ctx context.Context, in *pb.SoundDetectionRequest) (*pb.SoundDetectionResponse, error) {
+	if s.llm.Locking() {
+		s.llm.Lock()
+		defer s.llm.Unlock()
+	}
+	return s.llm.SoundDetection(ctx, in)
+}
+
 func (s *server) AudioEncode(ctx context.Context, in *pb.AudioEncodeRequest) (*pb.AudioEncodeResult, error) {
 	if s.llm.Locking() {
 		s.llm.Lock()
diff --git a/scripts/changed-backends.js b/scripts/changed-backends.js
index a2fe48e06..5690e00f5 100644
--- a/scripts/changed-backends.js
+++ b/scripts/changed-backends.js
@@ -26,6 +26,13 @@ function inferBackendPath(item) {
   if (item.backend === "parakeet-cpp") {
     return `backend/go/parakeet-cpp/`;
   }
+  // ced is a Go backend (Dockerfile.golang) wrapping the ced.cpp ggml port via
+  // purego, living in backend/go/ced/. Same explicit-branch rationale as
+  // parakeet-cpp above: the generic golang fallthrough would also resolve it,
+  // but this documents the mapping and guards a future dockerfile-suffix change.
+  if (item.backend === "ced") {
+    return `backend/go/ced/`;
+  }
   if (item.dockerfile.endsWith("golang")) {
     return `backend/go/${item.backend}/`;
   }
diff --git a/swagger/docs.go b/swagger/docs.go
index 20a1f5a3f..e01761643 100644
--- a/swagger/docs.go
+++ b/swagger/docs.go
@@ -1939,6 +1939,53 @@ const docTemplate = `{
                 }
             }
         },
+        "/v1/audio/classification": {
+            "post": {
+                "consumes": [
+                    "multipart/form-data"
+                ],
+                "tags": [
+                    "audio"
+                ],
+                "summary": "Classify sound events in audio (audio tagging).",
+                "parameters": [
+                    {
+                        "type": "string",
+                        "description": "model",
+                        "name": "model",
+                        "in": "formData",
+                        "required": true
+                    },
+                    {
+                        "type": "file",
+                        "description": "audio file",
+                        "name": "file",
+                        "in": "formData",
+                        "required": true
+                    },
+                    {
+                        "type": "integer",
+                        "description": "number of top tags to return (0 = backend default)",
+                        "name": "top_k",
+                        "in": "formData"
+                    },
+                    {
+                        "type": "number",
+                        "description": "drop tags scoring below this value",
+                        "name": "threshold",
+                        "in": "formData"
+                    }
+                ],
+                "responses": {
+                    "200": {
+                        "description": "OK",
+                        "schema": {
+                            "$ref": "#/definitions/schema.SoundClassificationResult"
+                        }
+                    }
+                }
+            }
+        },
         "/v1/audio/diarization": {
             "post": {
                 "consumes": [
@@ -6084,6 +6131,34 @@ const docTemplate = `{
                 }
             }
         },
+        "schema.SoundClassification": {
+            "type": "object",
+            "properties": {
+                "index": {
+                    "type": "integer"
+                },
+                "label": {
+                    "type": "string"
+                },
+                "score": {
+                    "type": "number"
+                }
+            }
+        },
+        "schema.SoundClassificationResult": {
+            "type": "object",
+            "properties": {
+                "detections": {
+                    "type": "array",
+                    "items": {
+                        "$ref": "#/definitions/schema.SoundClassification"
+                    }
+                },
+                "model": {
+                    "type": "string"
+                }
+            }
+        },
         "schema.StreamOptions": {
             "type": "object",
             "properties": {
diff --git a/swagger/swagger.json b/swagger/swagger.json
index 09e03581b..5fc4ac638 100644
--- a/swagger/swagger.json
+++ b/swagger/swagger.json
@@ -1936,6 +1936,53 @@
                 }
             }
         },
+        "/v1/audio/classification": {
+            "post": {
+                "consumes": [
+                    "multipart/form-data"
+                ],
+                "tags": [
+                    "audio"
+                ],
+                "summary": "Classify sound events in audio (audio tagging).",
+                "parameters": [
+                    {
+                        "type": "string",
+                        "description": "model",
+                        "name": "model",
+                        "in": "formData",
+                        "required": true
+                    },
+                    {
+                        "type": "file",
+                        "description": "audio file",
+                        "name": "file",
+                        "in": "formData",
+                        "required": true
+                    },
+                    {
+                        "type": "integer",
+                        "description": "number of top tags to return (0 = backend default)",
+                        "name": "top_k",
+                        "in": "formData"
+                    },
+                    {
+                        "type": "number",
+                        "description": "drop tags scoring below this value",
+                        "name": "threshold",
+                        "in": "formData"
+                    }
+                ],
+                "responses": {
+                    "200": {
+                        "description": "OK",
+                        "schema": {
+                            "$ref": "#/definitions/schema.SoundClassificationResult"
+                        }
+                    }
+                }
+            }
+        },
         "/v1/audio/diarization": {
             "post": {
                 "consumes": [
@@ -6081,6 +6128,34 @@
                 }
             }
         },
+        "schema.SoundClassification": {
+            "type": "object",
+            "properties": {
+                "index": {
+                    "type": "integer"
+                },
+                "label": {
+                    "type": "string"
+                },
+                "score": {
+                    "type": "number"
+                }
+            }
+        },
+        "schema.SoundClassificationResult": {
+            "type": "object",
+            "properties": {
+                "detections": {
+                    "type": "array",
+                    "items": {
+                        "$ref": "#/definitions/schema.SoundClassification"
+                    }
+                },
+                "model": {
+                    "type": "string"
+                }
+            }
+        },
         "schema.StreamOptions": {
             "type": "object",
             "properties": {
diff --git a/swagger/swagger.yaml b/swagger/swagger.yaml
index a25674539..f83ef14e8 100644
--- a/swagger/swagger.yaml
+++ b/swagger/swagger.yaml
@@ -2087,6 +2087,24 @@ definitions:
           classifier-side confidence signal).
         type: number
     type: object
+  schema.SoundClassification:
+    properties:
+      index:
+        type: integer
+      label:
+        type: string
+      score:
+        type: number
+    type: object
+  schema.SoundClassificationResult:
+    properties:
+      detections:
+        items:
+          $ref: '#/definitions/schema.SoundClassification'
+        type: array
+      model:
+        type: string
+    type: object
   schema.StreamOptions:
     properties:
       include_usage:
@@ -3770,6 +3788,37 @@ paths:
       summary: Generates audio from the input text.
       tags:
       - audio
+  /v1/audio/classification:
+    post:
+      consumes:
+      - multipart/form-data
+      parameters:
+      - description: model
+        in: formData
+        name: model
+        required: true
+        type: string
+      - description: audio file
+        in: formData
+        name: file
+        required: true
+        type: file
+      - description: number of top tags to return (0 = backend default)
+        in: formData
+        name: top_k
+        type: integer
+      - description: drop tags scoring below this value
+        in: formData
+        name: threshold
+        type: number
+      responses:
+        "200":
+          description: OK
+          schema:
+            $ref: '#/definitions/schema.SoundClassificationResult'
+      summary: Classify sound events in audio (audio tagging).
+      tags:
+      - audio
   /v1/audio/diarization:
     post:
       consumes:

From b7d67f57796de8b7a2f2f2243c95cd3bbb4c9698 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Mon, 22 Jun 2026 08:43:40 +0200
Subject: [PATCH 40/99] chore: :arrow_up: Update ggml-org/llama.cpp to
 `7c082bc417bbe53210a83df4ba5b49e18ce6193c` (#10417)

:arrow_up: Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
---
 backend/cpp/llama-cpp/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/cpp/llama-cpp/Makefile b/backend/cpp/llama-cpp/Makefile
index bf9f4f608..f6e89c5ea 100644
--- a/backend/cpp/llama-cpp/Makefile
+++ b/backend/cpp/llama-cpp/Makefile
@@ -1,5 +1,5 @@
 
-LLAMA_VERSION?=e475fa2b5f9fb50c3d6fc3e7c6fdf1e004465b62
+LLAMA_VERSION?=7c082bc417bbe53210a83df4ba5b49e18ce6193c
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
 
 CMAKE_ARGS?=

From 64a4351f3aba05bb220babc345fe006159d2ee24 Mon Sep 17 00:00:00 2001
From: VJSai <vijaysaijnv@gmail.com>
Date: Mon, 22 Jun 2026 12:14:12 +0530
Subject: [PATCH 41/99] feat: send a LocalAI User-Agent on registry pulls
 (#10434)

LocalAI pulls models from OCI registries (via go-containerregistry), the
Ollama registry, and OCI blob stores (via oras), but every request went
out with the underlying library's generic User-Agent, so registry
operators had no way to attribute traffic to LocalAI.

Add an oci.UserAgent() helper that returns "LocalAI" (or
"LocalAI/<version>" when the binary is built with a version stamp via
internal.Version) and wire it into all three pull paths:

- pkg/oci/image.go: remote.WithUserAgent on the go-containerregistry
  image and digest requests
- pkg/oci/ollama.go: a User-Agent header on the Ollama manifest request
- pkg/oci/blob.go: a LocalAI User-Agent on the oras blob client. This
  mirrors oras' auth.DefaultClient (same retry.DefaultClient policy);
  only the advertised User-Agent changes.

Implements #6258.


Assisted-by: Claude:claude-opus-4-8 golangci-lint

Signed-off-by: Vijay Sai <vijaysaijnv@gmail.com>
---
 docs/content/getting-started/models.md |  4 ++++
 pkg/oci/blob.go                        | 12 ++++++++++
 pkg/oci/image.go                       |  2 ++
 pkg/oci/ollama.go                      |  1 +
 pkg/oci/useragent.go                   | 19 +++++++++++++++
 pkg/oci/useragent_test.go              | 32 ++++++++++++++++++++++++++
 6 files changed, 70 insertions(+)
 create mode 100644 pkg/oci/useragent.go
 create mode 100644 pkg/oci/useragent_test.go

diff --git a/docs/content/getting-started/models.md b/docs/content/getting-started/models.md
index cf949f715..b05f05728 100644
--- a/docs/content/getting-started/models.md
+++ b/docs/content/getting-started/models.md
@@ -131,6 +131,10 @@ local-ai run ollama://gemma:2b
 local-ai run oci://localai/phi-2:latest
 ```
 
+{{% notice note %}}
+When pulling models from Ollama or OCI registries, LocalAI identifies itself with a `LocalAI/<version>` `User-Agent` header so registry operators can attribute usage to LocalAI.
+{{% /notice %}}
+
 ### Run Models via URI
 
 To run models via URI, specify a URI to a model file or a configuration file when starting LocalAI. Valid syntax includes:
diff --git a/pkg/oci/blob.go b/pkg/oci/blob.go
index 0f5a2cf66..e034c4162 100644
--- a/pkg/oci/blob.go
+++ b/pkg/oci/blob.go
@@ -11,6 +11,8 @@ import (
 
 	oras "oras.land/oras-go/v2"
 	"oras.land/oras-go/v2/registry/remote"
+	"oras.land/oras-go/v2/registry/remote/auth"
+	"oras.land/oras-go/v2/registry/remote/retry"
 )
 
 func FetchImageBlob(ctx context.Context, r, reference, dst string, statusReader func(ocispec.Descriptor) io.Writer) error {
@@ -28,6 +30,16 @@ func FetchImageBlob(ctx context.Context, r, reference, dst string, statusReader
 	}
 	repo.SkipReferrersGC = true
 
+	// Identify LocalAI to the registry. This mirrors oras' auth.DefaultClient
+	// (same retry policy) but advertises a LocalAI User-Agent instead of the
+	// library default.
+	client := &auth.Client{
+		Client: retry.DefaultClient,
+		Cache:  auth.NewCache(),
+	}
+	client.SetUserAgent(UserAgent())
+	repo.Client = client
+
 	// https://github.com/oras-project/oras/blob/main/cmd/oras/internal/option/remote.go#L364
 	// https://github.com/oras-project/oras/blob/main/cmd/oras/root/blob/fetch.go#L136
 	desc, reader, err := oras.Fetch(ctx, repo.Blobs(), reference, oras.DefaultFetchOptions)
diff --git a/pkg/oci/image.go b/pkg/oci/image.go
index 2d00c3479..4dad02c7d 100644
--- a/pkg/oci/image.go
+++ b/pkg/oci/image.go
@@ -176,6 +176,7 @@ func GetImage(targetImage, targetPlatform string, auth *registrytypes.AuthConfig
 	opts := []remote.Option{
 		remote.WithTransport(tr),
 		remote.WithPlatform(*platform),
+		remote.WithUserAgent(UserAgent()),
 	}
 	if auth != nil {
 		opts = append(opts, remote.WithAuth(staticAuth{auth}))
@@ -223,6 +224,7 @@ func GetImageDigest(targetImage, targetPlatform string, auth *registrytypes.Auth
 	opts := []remote.Option{
 		remote.WithTransport(tr),
 		remote.WithPlatform(*platform),
+		remote.WithUserAgent(UserAgent()),
 	}
 	if auth != nil {
 		opts = append(opts, remote.WithAuth(staticAuth{auth}))
diff --git a/pkg/oci/ollama.go b/pkg/oci/ollama.go
index 2fb928281..f0a874013 100644
--- a/pkg/oci/ollama.go
+++ b/pkg/oci/ollama.go
@@ -47,6 +47,7 @@ func OllamaModelManifest(image string) (*Manifest, error) {
 		return nil, err
 	}
 	req.Header.Set("Accept", "application/vnd.docker.distribution.manifest.v2+json")
+	req.Header.Set("User-Agent", UserAgent())
 	client := httpclient.New(httpclient.WithFollowRedirects())
 	resp, err := client.Do(req)
 	if err != nil {
diff --git a/pkg/oci/useragent.go b/pkg/oci/useragent.go
new file mode 100644
index 000000000..82277c70d
--- /dev/null
+++ b/pkg/oci/useragent.go
@@ -0,0 +1,19 @@
+package oci
+
+import (
+	"fmt"
+
+	"github.com/mudler/LocalAI/internal"
+)
+
+// UserAgent returns the User-Agent string LocalAI sends on outbound registry
+// requests (OCI registries and Ollama). It identifies the client as LocalAI
+// and, when the binary was built with a version stamp, appends it so registries
+// can attribute client-side usage to LocalAI rather than to the generic
+// User-Agent of the underlying transport library.
+func UserAgent() string {
+	if internal.Version == "" {
+		return "LocalAI"
+	}
+	return fmt.Sprintf("LocalAI/%s", internal.Version)
+}
diff --git a/pkg/oci/useragent_test.go b/pkg/oci/useragent_test.go
new file mode 100644
index 000000000..14a10534c
--- /dev/null
+++ b/pkg/oci/useragent_test.go
@@ -0,0 +1,32 @@
+package oci_test
+
+import (
+	"github.com/mudler/LocalAI/internal"
+	. "github.com/mudler/LocalAI/pkg/oci"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("OCI", func() {
+	Context("UserAgent", func() {
+		var savedVersion string
+
+		BeforeEach(func() {
+			savedVersion = internal.Version
+		})
+
+		AfterEach(func() {
+			internal.Version = savedVersion
+		})
+
+		It("identifies as LocalAI when no version is stamped", func() {
+			internal.Version = ""
+			Expect(UserAgent()).To(Equal("LocalAI"))
+		})
+
+		It("appends the build version when one is stamped", func() {
+			internal.Version = "v3.2.1"
+			Expect(UserAgent()).To(Equal("LocalAI/v3.2.1"))
+		})
+	})
+})

From 20c643e1f6ce5e1e572404ead7123dc97b735f31 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Mon, 22 Jun 2026 08:46:34 +0200
Subject: [PATCH 42/99] chore(model gallery): :robot: add 1 new models via
 gallery agent (#10439)

chore(model gallery): :robot: add new models via gallery agent

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
---
 gallery/index.yaml | 92 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 92 insertions(+)

diff --git a/gallery/index.yaml b/gallery/index.yaml
index cde505d72..710bc2740 100644
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -1,4 +1,96 @@
 ---
+- name: "glm-5.2"
+  url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
+  urls:
+    - https://huggingface.co/unsloth/GLM-5.2-GGUF
+  description: |
+    # GLM-5.2
+
+    👋 Join our WeChat or Discord community.
+
+    📖 Check out the GLM-5.2 blog and GLM-5 Technical report.
+
+    📍 Use GLM-5.2 API services on Z.ai API Platform.
+
+    🔜 Try GLM-5.2 here.
+
+    [Paper]
+    [GitHub]
+
+    ## Introduction
+
+    We're introducing GLM-5.2, our latest flagship model for long-horizon tasks. It marks a substantial leap in long-horizon task capability over its predecessor GLM-5.1 and, for the first time, delivers that capability on a **solid 1M-token context**. GLM-5.2's new capabilities include:
+      - **Solid 1M Context:** A solid 1M-token context that stably sustains long-horizon work
+      - **Advanced Coding with Flexible Effort**: Stronger coding capabilities with multiple thinking effort levels to balance performance and latency
+      - **Improved Architecture**: We propose IndexShare, which reuses the same indexer across every four sparse attention layers, reducing per-token FLOPs by 2.9× at a 1M context length. We also improve GLM-5.2’s MTP layer for speculative decoding, increasing the acceptance length by up to 20%
+      - **Pure Open**: An MIT open-source license — no regional limits, technical access without borders
+
+    ## Benchmark
+
+    ## Serve GLM-5.2 Locally
+
+    ...
+  license: "mit"
+  tags:
+    - llm
+    - gguf
+  icon: https://raw.githubusercontent.com/zai-org/GLM-5/refs/heads/main/resources/bench_52.png
+  overrides:
+    backend: llama-cpp
+    function:
+      automatic_tool_parsing_fallback: true
+      grammar:
+        disable: true
+    known_usecases:
+      - chat
+    options:
+      - use_jinja:true
+      - spec_type:draft-mtp
+      - spec_n_max:6
+      - spec_p_min:0.75
+    parameters:
+      min_p: 0.01
+      model: llama-cpp/models/GLM-5.2-GGUF/GLM-5.2-UD-Q4_K_M-00001-of-00011.gguf
+      repeat_penalty: 1
+      temperature: 1
+      top_k: -1
+      top_p: 0.95
+    template:
+      use_tokenizer_template: true
+  files:
+    - filename: llama-cpp/models/GLM-5.2-GGUF/GLM-5.2-UD-Q4_K_M-00001-of-00011.gguf
+      sha256: 3256ac8c290273f0965ff39e93a8bcd07dc99bcd23e923bd4b7306ef39061038
+      uri: https://huggingface.co/unsloth/GLM-5.2-GGUF/resolve/main/UD-Q4_K_M/GLM-5.2-UD-Q4_K_M-00001-of-00011.gguf
+    - filename: llama-cpp/models/GLM-5.2-GGUF/GLM-5.2-UD-Q4_K_M-00002-of-00011.gguf
+      sha256: 1020105e78d862988a6cabb3a78eafa75f29666ab8a5fd10de1b9b8c8a6bc5e8
+      uri: https://huggingface.co/unsloth/GLM-5.2-GGUF/resolve/main/UD-Q4_K_M/GLM-5.2-UD-Q4_K_M-00002-of-00011.gguf
+    - filename: llama-cpp/models/GLM-5.2-GGUF/GLM-5.2-UD-Q4_K_M-00003-of-00011.gguf
+      sha256: 0b36f406e120759290894ea4960d5086f9b362a8c8f9c7fcaad24b4471172efb
+      uri: https://huggingface.co/unsloth/GLM-5.2-GGUF/resolve/main/UD-Q4_K_M/GLM-5.2-UD-Q4_K_M-00003-of-00011.gguf
+    - filename: llama-cpp/models/GLM-5.2-GGUF/GLM-5.2-UD-Q4_K_M-00004-of-00011.gguf
+      sha256: 04b19199f52ba29e7f9966b15df3fbc2d1e5c56cd6343c405076be7174d49d32
+      uri: https://huggingface.co/unsloth/GLM-5.2-GGUF/resolve/main/UD-Q4_K_M/GLM-5.2-UD-Q4_K_M-00004-of-00011.gguf
+    - filename: llama-cpp/models/GLM-5.2-GGUF/GLM-5.2-UD-Q4_K_M-00005-of-00011.gguf
+      sha256: 5cb76d724ee16e80c1cb6aba29aacd76161e7a6f147079be3447501c06d95f2c
+      uri: https://huggingface.co/unsloth/GLM-5.2-GGUF/resolve/main/UD-Q4_K_M/GLM-5.2-UD-Q4_K_M-00005-of-00011.gguf
+    - filename: llama-cpp/models/GLM-5.2-GGUF/GLM-5.2-UD-Q4_K_M-00006-of-00011.gguf
+      sha256: ec2c65255c834b686f066e350bc5b8d8a7020cd1133f0ee9e819d2fb5d3afad0
+      uri: https://huggingface.co/unsloth/GLM-5.2-GGUF/resolve/main/UD-Q4_K_M/GLM-5.2-UD-Q4_K_M-00006-of-00011.gguf
+    - filename: llama-cpp/models/GLM-5.2-GGUF/GLM-5.2-UD-Q4_K_M-00007-of-00011.gguf
+      sha256: 53c8328852ca0b6791a9a9243bcc56157305adca8526a646054389845e7445a9
+      uri: https://huggingface.co/unsloth/GLM-5.2-GGUF/resolve/main/UD-Q4_K_M/GLM-5.2-UD-Q4_K_M-00007-of-00011.gguf
+    - filename: llama-cpp/models/GLM-5.2-GGUF/GLM-5.2-UD-Q4_K_M-00008-of-00011.gguf
+      sha256: 9a23bfb21c5f6fcc94b0329c108ec1ef3fdbd815c57eeb0bf105d26861d7271e
+      uri: https://huggingface.co/unsloth/GLM-5.2-GGUF/resolve/main/UD-Q4_K_M/GLM-5.2-UD-Q4_K_M-00008-of-00011.gguf
+    - filename: llama-cpp/models/GLM-5.2-GGUF/GLM-5.2-UD-Q4_K_M-00009-of-00011.gguf
+      sha256: 71088054fb1a09a4f38e2ee8a726526790660a4f77ead817f75cb7a484bdb0b8
+      uri: https://huggingface.co/unsloth/GLM-5.2-GGUF/resolve/main/UD-Q4_K_M/GLM-5.2-UD-Q4_K_M-00009-of-00011.gguf
+    - filename: llama-cpp/models/GLM-5.2-GGUF/GLM-5.2-UD-Q4_K_M-00010-of-00011.gguf
+      sha256: 848db99658faf24971df23638281305a15bdc187cbcaed968952ed9e9c835b50
+      uri: https://huggingface.co/unsloth/GLM-5.2-GGUF/resolve/main/UD-Q4_K_M/GLM-5.2-UD-Q4_K_M-00010-of-00011.gguf
+    - filename: llama-cpp/models/GLM-5.2-GGUF/GLM-5.2-UD-Q4_K_M-00011-of-00011.gguf
+      sha256: 629e23bce250fb500d9a190de7249c2882af524aacc112ce507a871ed5bebf90
+      uri: https://huggingface.co/unsloth/GLM-5.2-GGUF/resolve/main/UD-Q4_K_M/GLM-5.2-UD-Q4_K_M-00011-of-00011.gguf
 - name: "qwen3.6-35b-a3b-nvfp4-mtp"
   url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
   urls:

From 682fb2718c9ead61ae3fd5e0a4525fa2e273abba Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Mon, 22 Jun 2026 09:06:20 +0200
Subject: [PATCH 43/99] fix(distributed): detach cold-load staging from the
 request context (#10438)

A model not yet loaded on a worker is staged lazily on the inference
request path. Staging a multi-GB model takes minutes - far longer than
any client keeps its HTTP request open - so a browser refresh, an
ingress/LB idle-timeout, or a round-robined retry landing on another
frontend replica cancels the request context and aborts the upload with
"context canceled" mid-transfer. Large models then never finish staging,
so they never load (observed in a 2-replica deployment: both frontends
repeatedly failed to stage a 15.7 GB GGUF, each attempt dying at a
different offset).

Bind the cold load (staging + LoadModel + the per-model advisory lock) to
context.WithoutCancel(ctx): it keeps the request's values (prefix chain)
but drops cancellation/deadline. Each long step keeps its own bound (the
file stager's resume budget, LoadModel's 5m timeout), and the advisory
lock still de-dupes concurrent loaders across replicas.


Assisted-by: Claude:claude-opus-4-8 [Claude Code]

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
---
 core/services/nodes/router.go                 | 23 ++++--
 .../nodes/router_staging_context_test.go      | 80 +++++++++++++++++++
 2 files changed, 98 insertions(+), 5 deletions(-)
 create mode 100644 core/services/nodes/router_staging_context_test.go

diff --git a/core/services/nodes/router.go b/core/services/nodes/router.go
index ccbf48f43..f26fea2b9 100644
--- a/core/services/nodes/router.go
+++ b/core/services/nodes/router.go
@@ -359,8 +359,21 @@ func (r *SmartRouter) Route(ctx context.Context, modelID, modelName, backendType
 		}
 	}
 
-	// Step 2: Model not loaded — schedule loading with distributed lock to prevent duplicates
-	loadModel := func() (*RouteResult, error) {
+	// Step 2: Model not loaded — schedule loading with distributed lock to prevent duplicates.
+	//
+	// Detach the cold-load from the caller's context. Staging a model can
+	// transfer multiple GB to a worker, which takes far longer than any client
+	// keeps its HTTP request open — a browser refresh, an ingress/LB idle
+	// timeout, or a round-robined retry landing on another replica all cancel
+	// the request context. If staging were bound to it, the multi-GB upload
+	// aborts with "context canceled" mid-transfer and large models can never
+	// finish staging (the model-load outage). WithoutCancel keeps the request's
+	// values (prefix chain, etc.) but drops its cancellation/deadline. Each
+	// long step still has its own bound (the file stager's resume budget,
+	// LoadModel's 5m timeout), and the per-model advisory lock below de-dupes
+	// concurrent loaders across replicas.
+	loadCtx := context.WithoutCancel(ctx)
+	loadModel := func(ctx context.Context) (*RouteResult, error) {
 		// Re-check after acquiring lock — another request may have loaded it
 		node, nm, err := r.registry.FindAndLockNodeWithModel(ctx, trackingKey, candidateNodeIDs, pref)
 		if err == nil && node != nil {
@@ -433,9 +446,9 @@ func (r *SmartRouter) Route(ctx context.Context, modelID, modelName, backendType
 	if r.db != nil {
 		lockKey := advisorylock.KeyFromString("model-load:" + trackingKey)
 		var result *RouteResult
-		lockErr := advisorylock.WithLockCtx(ctx, r.db, lockKey, func() error {
+		lockErr := advisorylock.WithLockCtx(loadCtx, r.db, lockKey, func() error {
 			var err error
-			result, err = loadModel()
+			result, err = loadModel(loadCtx)
 			return err
 		})
 		if lockErr != nil {
@@ -444,7 +457,7 @@ func (r *SmartRouter) Route(ctx context.Context, modelID, modelName, backendType
 		return result, nil
 	}
 	// No DB (non-distributed) — proceed without lock
-	return loadModel()
+	return loadModel(loadCtx)
 }
 
 // parseSelectorJSON decodes a JSON node selector string into a map.
diff --git a/core/services/nodes/router_staging_context_test.go b/core/services/nodes/router_staging_context_test.go
new file mode 100644
index 000000000..6fa892689
--- /dev/null
+++ b/core/services/nodes/router_staging_context_test.go
@@ -0,0 +1,80 @@
+package nodes
+
+import (
+	"context"
+	"errors"
+	"os"
+	"path/filepath"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	"github.com/mudler/LocalAI/core/services/messaging"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+)
+
+// cancelOnStageStager simulates the triggering HTTP request being abandoned
+// (client disconnect, ingress idle-timeout) the moment a multi-GB file starts
+// staging. It cancels the request context and records whether the context the
+// stager itself received was cancelled as a result.
+type cancelOnStageStager struct {
+	fakeFileStager
+	cancelRequest context.CancelFunc
+	staged        bool
+	ctxErrOnStage error
+}
+
+func (s *cancelOnStageStager) EnsureRemote(ctx context.Context, _, _, key string) (string, error) {
+	s.staged = true
+	// Mid-transfer: the client gives up on the (minutes-long) request.
+	if s.cancelRequest != nil {
+		s.cancelRequest()
+	}
+	// A multi-GB upload must survive this. If staging were bound to the
+	// request context, ctx is now cancelled and the real HTTP stager would
+	// abort with "context canceled" — exactly the production outage.
+	s.ctxErrOnStage = ctx.Err()
+	return "/remote/" + key, nil
+}
+
+var _ = Describe("Route cold-load staging context", func() {
+	It("detaches staging from the request context so a client disconnect cannot abort a multi-GB transfer", func() {
+		// A real model file so stageModelFiles actually calls the stager
+		// (non-existent paths are skipped).
+		tmp := GinkgoT().TempDir()
+		modelFile := filepath.Join(tmp, "big.gguf")
+		Expect(os.WriteFile(modelFile, []byte("weights"), 0o644)).To(Succeed())
+
+		reg := &fakeModelRouter{
+			findAndLockErr: errors.New("not loaded"),
+			findIdleNode:   &BackendNode{ID: "n1", Name: "worker-1", Address: "10.0.0.1:50051"},
+		}
+		backend := &stubBackend{loadResult: &pb.Result{Success: true}}
+		factory := &stubClientFactory{client: backend}
+		unloader := &fakeUnloader{installReply: &messaging.BackendInstallReply{
+			Success: true,
+			Address: "10.0.0.1:9001",
+		}}
+		stager := &cancelOnStageStager{}
+
+		router := NewSmartRouter(reg, SmartRouterOptions{
+			Unloader:      unloader,
+			ClientFactory: factory,
+			FileStager:    stager,
+			// DB nil: no advisory lock, exercises the same detached load ctx.
+		})
+
+		ctx, cancel := context.WithCancel(context.Background())
+		stager.cancelRequest = cancel
+		defer cancel()
+
+		result, err := router.Route(ctx, "big-model", filepath.Join("models", "big.gguf"), "llama-cpp",
+			&pb.ModelOptions{Model: "big.gguf", ModelFile: modelFile}, false)
+
+		Expect(err).ToNot(HaveOccurred())
+		Expect(result).ToNot(BeNil())
+		Expect(stager.staged).To(BeTrue(), "staging must have been attempted")
+		Expect(stager.ctxErrOnStage).ToNot(HaveOccurred(),
+			"staging context must survive cancellation of the triggering request")
+	})
+})

From 569d9bbd9e31a81d072094f806fc07c2c5bf64fc Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Mon, 22 Jun 2026 09:28:07 +0200
Subject: [PATCH 44/99] fix(distributed): broadcast file-staging progress
 across replicas (#10440)

File-staging progress lived only in the SmartRouter's in-memory
StagingTracker on the replica performing the transfer. In a multi-replica
deployment behind a round-robin load balancer, a /api/operations poll
that lands on any other replica saw no staging row, so the progress
("processing file ... Total ... Current ...") flickered in and out as
polls rotated between frontends.

Mirror the pattern already used for gallery-install progress: the origin
replica broadcasts staging ticks over NATS (SubjectStagingProgress, a
new staging.<model>.progress subject), and peers merge them via
ApplyRemote (SubscribeBroadcasts on the wildcard). Byte-level ticks are
leading-edge debounced (~1/s); Start/FileComplete/Complete always
publish. A locally-owned op stays authoritative so the origin's own echo
and stray peer events can't clobber it, and mirrored remote ops expire
after a TTL so a missed Done event can't leave a phantom row. The UI read
path (StagingTracker.GetAll) is unchanged.


Assisted-by: Claude:claude-opus-4-8 [Claude Code]

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
---
 core/application/distributed.go               |   9 +
 core/services/messaging/subjects.go           |  16 ++
 core/services/nodes/staging_progress.go       | 217 +++++++++++++++---
 .../nodes/staging_progress_broadcast_test.go  | 109 +++++++++
 4 files changed, 317 insertions(+), 34 deletions(-)
 create mode 100644 core/services/nodes/staging_progress_broadcast_test.go

diff --git a/core/application/distributed.go b/core/application/distributed.go
index 00c39422d..3235e4304 100644
--- a/core/application/distributed.go
+++ b/core/application/distributed.go
@@ -357,6 +357,15 @@ func initDistributed(cfg *config.ApplicationConfig, authDB *gorm.DB, configLoade
 		Pressure:         pressure,
 	})
 
+	// Wire staging-progress broadcasting so file-staging shows up on every
+	// replica, not just the one performing the transfer. Without this, a
+	// /api/operations poll that round-robins onto a peer sees no staging row and
+	// the progress flickers. The origin publishes; peers mirror via the wildcard.
+	router.StagingTracker().SetPublisher(natsClient)
+	if _, err := router.StagingTracker().SubscribeBroadcasts(natsClient); err != nil {
+		xlog.Warn("Failed to subscribe to staging progress broadcasts", "error", err)
+	}
+
 	// Create ReplicaReconciler for auto-scaling model replicas. Adapter +
 	// RegistrationToken feed the state-reconciliation passes: pending op
 	// drain uses the adapter, and model health probes use the token to auth
diff --git a/core/services/messaging/subjects.go b/core/services/messaging/subjects.go
index 1bee20f44..7d099460c 100644
--- a/core/services/messaging/subjects.go
+++ b/core/services/messaging/subjects.go
@@ -64,6 +64,22 @@ func SubjectGalleryProgress(opID string) string {
 	return subjectGalleryPrefix + sanitizeSubjectToken(opID) + ".progress"
 }
 
+// SubjectStagingProgress returns the NATS subject a frontend replica publishes
+// file-staging progress on. Staging progress is otherwise per-process state
+// (the SmartRouter's in-memory StagingTracker), so without this broadcast a
+// /api/operations poll that round-robins onto a replica that did not originate
+// the staging op sees nothing - the progress row flickers in multi-replica
+// deployments. Peers subscribe to the wildcard and merge.
+func SubjectStagingProgress(modelID string) string {
+	return subjectStagingPrefix + sanitizeSubjectToken(modelID) + ".progress"
+}
+
+const subjectStagingPrefix = "staging."
+
+// SubjectStagingProgressWildcard matches every replica's staging-progress
+// broadcasts so a peer can mirror staging ops it did not originate.
+const SubjectStagingProgressWildcard = "staging.*.progress"
+
 // SubjectGalleryOpStart and SubjectGalleryOpEnd are broadcast subjects for the
 // in-memory OpCache lifecycle. Frontend replicas publish to these when an
 // admin admits a new install/delete (Start) and when an operation is
diff --git a/core/services/nodes/staging_progress.go b/core/services/nodes/staging_progress.go
index 3d066c0fa..0a6ddc50e 100644
--- a/core/services/nodes/staging_progress.go
+++ b/core/services/nodes/staging_progress.go
@@ -5,58 +5,138 @@ import (
 	"fmt"
 	"sync"
 	"time"
+
+	"github.com/mudler/LocalAI/core/services/messaging"
 )
 
 // StagingStatus represents the current progress of a model staging operation.
 type StagingStatus struct {
-	ModelID    string  `json:"model_id"`
-	NodeName   string  `json:"node_name"`
-	FileName   string  `json:"file_name"`
-	BytesSent  int64   `json:"bytes_sent"`
-	TotalBytes int64   `json:"total_bytes"`
-	Progress   float64 `json:"progress"` // 0-100 overall progress
-	Speed      string  `json:"speed"`
-	FileIndex  int     `json:"file_index"`
-	TotalFiles int     `json:"total_files"`
-	Message    string  `json:"message"`
+	ModelID    string    `json:"model_id"`
+	NodeName   string    `json:"node_name"`
+	FileName   string    `json:"file_name"`
+	BytesSent  int64     `json:"bytes_sent"`
+	TotalBytes int64     `json:"total_bytes"`
+	Progress   float64   `json:"progress"` // 0-100 overall progress
+	Speed      string    `json:"speed"`
+	FileIndex  int       `json:"file_index"`
+	TotalFiles int       `json:"total_files"`
+	Message    string    `json:"message"`
 	StartedAt  time.Time `json:"started_at"`
 }
 
+const (
+	// stagingBroadcastInterval bounds how often byte-level UpdateFile ticks are
+	// re-broadcast to peers (leading-edge debounce). State transitions (Start,
+	// FileComplete, Complete) always publish so peers never miss them.
+	stagingBroadcastInterval = time.Second
+	// stagingRemoteTTL drops a mirrored (remote) op whose last update is older
+	// than this. NATS pub/sub is fire-and-forget, so a missed Done event would
+	// otherwise leave a phantom staging row on a peer forever; a live op
+	// refreshes its mirror at least every stagingBroadcastInterval.
+	stagingRemoteTTL = 60 * time.Second
+)
+
+// stagingEntry wraps a StagingStatus with the bookkeeping needed to keep peer
+// replicas consistent: whether this op is mirrored from a peer (remote) vs.
+// owned locally, when it was last updated (for remote-mirror expiry), and when
+// its byte progress was last broadcast (for debounce).
+type stagingEntry struct {
+	status    StagingStatus
+	remote    bool
+	updatedAt time.Time
+	lastPub   time.Time
+}
+
 // StagingTracker tracks active file staging operations in-memory.
 // Used by SmartRouter to publish progress and by /api/operations to surface it.
+//
+// In distributed mode each frontend replica runs its own tracker. The replica
+// performing a transfer owns the op locally and broadcasts progress over NATS
+// (SetPublisher); peers mirror it via ApplyRemote (SubscribeBroadcasts) so a
+// /api/operations poll that round-robins onto any replica surfaces the op.
 type StagingTracker struct {
-	mu     sync.RWMutex
-	active map[string]*StagingStatus
+	mu        sync.RWMutex
+	active    map[string]*stagingEntry
+	publisher messaging.Publisher
+}
+
+// StagingProgressEvent is the wire payload a frontend replica broadcasts on
+// SubjectStagingProgress so peer replicas can mirror a staging op they did not
+// originate. Done signals the op finished (peers drop their mirrored copy).
+type StagingProgressEvent struct {
+	ModelID string         `json:"model_id"`
+	Status  *StagingStatus `json:"status,omitempty"`
+	Done    bool           `json:"done"`
 }
 
 // NewStagingTracker creates a new tracker.
 func NewStagingTracker() *StagingTracker {
 	return &StagingTracker{
-		active: make(map[string]*StagingStatus),
+		active: make(map[string]*stagingEntry),
 	}
 }
 
+// SetPublisher wires the NATS publisher used to broadcast staging progress to
+// peer replicas. No-op publisher (nil) keeps the tracker standalone.
+func (t *StagingTracker) SetPublisher(p messaging.Publisher) {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	t.publisher = p
+}
+
+// SubscribeBroadcasts subscribes to peer replicas' staging-progress broadcasts
+// and mirrors them into this tracker, so /api/operations on any replica surfaces
+// staging ops it did not originate. Returns the subscription for cleanup.
+func (t *StagingTracker) SubscribeBroadcasts(nc messaging.MessagingClient) (messaging.Subscription, error) {
+	return messaging.SubscribeJSON(nc, messaging.SubjectStagingProgressWildcard, func(evt StagingProgressEvent) {
+		if evt.ModelID == "" {
+			return
+		}
+		t.ApplyRemote(evt)
+	})
+}
+
+// publishStaging emits an event to the per-model staging subject. The publisher
+// is captured by the caller under the lock and passed in, so publishing happens
+// outside the lock (a slow NATS link must not stall the staging copy loop).
+func publishStaging(p messaging.Publisher, evt StagingProgressEvent) {
+	if p == nil {
+		return
+	}
+	_ = p.Publish(messaging.SubjectStagingProgress(evt.ModelID), evt)
+}
+
 // Start registers a new staging operation for the given model.
 func (t *StagingTracker) Start(modelID, nodeName string, totalFiles int) {
 	t.mu.Lock()
-	defer t.mu.Unlock()
-	t.active[modelID] = &StagingStatus{
-		ModelID:    modelID,
-		NodeName:   nodeName,
-		TotalFiles: totalFiles,
-		StartedAt:  time.Now(),
-		Message:    "Preparing to stage model files",
+	e := &stagingEntry{
+		status: StagingStatus{
+			ModelID:    modelID,
+			NodeName:   nodeName,
+			TotalFiles: totalFiles,
+			StartedAt:  time.Now(),
+			Message:    "Preparing to stage model files",
+		},
+		updatedAt: time.Now(),
+		// lastPub stays zero so the first UpdateFile tick always broadcasts.
 	}
+	t.active[modelID] = e
+	pub := t.publisher
+	snap := e.status
+	t.mu.Unlock()
+
+	publishStaging(pub, StagingProgressEvent{ModelID: modelID, Status: &snap})
 }
 
 // UpdateFile updates the tracker with current file transfer progress.
 func (t *StagingTracker) UpdateFile(modelID, fileName string, fileIndex int, bytesSent, totalBytes int64, speed string) {
 	t.mu.Lock()
-	defer t.mu.Unlock()
-	s, ok := t.active[modelID]
+	e, ok := t.active[modelID]
 	if !ok {
+		t.mu.Unlock()
 		return
 	}
+	s := &e.status
 	s.FileName = fileName
 	s.FileIndex = fileIndex
 	s.BytesSent = bytesSent
@@ -79,52 +159,121 @@ func (t *StagingTracker) UpdateFile(modelID, fileName string, fileIndex int, byt
 	} else {
 		s.Message = fmt.Sprintf("Staging %s", fileName)
 	}
+
+	e.updatedAt = time.Now()
+	// Leading-edge debounce: byte ticks fire many times per second; only
+	// re-broadcast at most once per stagingBroadcastInterval.
+	var pub messaging.Publisher
+	var snap StagingStatus
+	if time.Since(e.lastPub) >= stagingBroadcastInterval {
+		e.lastPub = time.Now()
+		pub = t.publisher
+		snap = e.status
+	}
+	t.mu.Unlock()
+
+	if pub != nil {
+		publishStaging(pub, StagingProgressEvent{ModelID: modelID, Status: &snap})
+	}
 }
 
 // FileComplete marks a single file as done within a staging operation.
 func (t *StagingTracker) FileComplete(modelID string, fileIndex, totalFiles int) {
 	t.mu.Lock()
-	defer t.mu.Unlock()
-	s, ok := t.active[modelID]
+	e, ok := t.active[modelID]
 	if !ok {
+		t.mu.Unlock()
 		return
 	}
+	s := &e.status
 	if totalFiles > 0 {
 		s.Progress = float64(fileIndex) / float64(totalFiles) * 100
 	}
 	s.BytesSent = 0
 	s.TotalBytes = 0
 	s.Speed = ""
+	e.updatedAt = time.Now()
+	e.lastPub = time.Now()
+	pub := t.publisher
+	snap := e.status
+	t.mu.Unlock()
+
+	// Always broadcast a per-file completion so peers' progress bars advance.
+	publishStaging(pub, StagingProgressEvent{ModelID: modelID, Status: &snap})
 }
 
 // Complete removes a staging operation (it's done).
 func (t *StagingTracker) Complete(modelID string) {
 	t.mu.Lock()
-	defer t.mu.Unlock()
+	_, ok := t.active[modelID]
 	delete(t.active, modelID)
+	pub := t.publisher
+	t.mu.Unlock()
+
+	if ok {
+		// Tell peers to drop their mirrored copy.
+		publishStaging(pub, StagingProgressEvent{ModelID: modelID, Done: true})
+	}
 }
 
-// GetAll returns a snapshot of all active staging operations.
+// ApplyRemote merges a peer replica's staging broadcast into this tracker. It
+// never re-broadcasts (no echo loop). A locally-owned op is authoritative: a
+// remote event for the same model is ignored, so the origin replica receiving
+// its own broadcast (and any stray peer event) cannot clobber or delete it.
+func (t *StagingTracker) ApplyRemote(evt StagingProgressEvent) {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+
+	if existing, ok := t.active[evt.ModelID]; ok && !existing.remote {
+		// We own this op locally — ignore peer chatter about it.
+		return
+	}
+	if evt.Done {
+		delete(t.active, evt.ModelID)
+		return
+	}
+	if evt.Status == nil {
+		return
+	}
+	t.active[evt.ModelID] = &stagingEntry{
+		status:    *evt.Status,
+		remote:    true,
+		updatedAt: time.Now(),
+	}
+}
+
+// GetAll returns a snapshot of all active staging operations. Stale remote
+// mirrors (a peer op whose Done event was missed) are pruned here so they don't
+// linger in the UI.
 func (t *StagingTracker) GetAll() map[string]StagingStatus {
-	t.mu.RLock()
-	defer t.mu.RUnlock()
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	now := time.Now()
 	result := make(map[string]StagingStatus, len(t.active))
-	for k, v := range t.active {
-		result[k] = *v
+	for k, e := range t.active {
+		if e.remote && now.Sub(e.updatedAt) > stagingRemoteTTL {
+			delete(t.active, k)
+			continue
+		}
+		result[k] = e.status
 	}
 	return result
 }
 
-// Get returns the status of a specific staging operation, or nil if not active.
+// Get returns the status of a specific staging operation, or nil if not active
+// (or a stale remote mirror).
 func (t *StagingTracker) Get(modelID string) *StagingStatus {
 	t.mu.RLock()
 	defer t.mu.RUnlock()
-	s, ok := t.active[modelID]
+	e, ok := t.active[modelID]
 	if !ok {
 		return nil
 	}
-	copy := *s
-	return &copy
+	if e.remote && time.Since(e.updatedAt) > stagingRemoteTTL {
+		return nil
+	}
+	s := e.status
+	return &s
 }
 
 // StagingProgressCallback is called by file stagers to report byte-level progress.
diff --git a/core/services/nodes/staging_progress_broadcast_test.go b/core/services/nodes/staging_progress_broadcast_test.go
new file mode 100644
index 000000000..0f0f0db1e
--- /dev/null
+++ b/core/services/nodes/staging_progress_broadcast_test.go
@@ -0,0 +1,109 @@
+package nodes
+
+import (
+	"encoding/json"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	"github.com/mudler/LocalAI/core/services/messaging"
+)
+
+// decodeStagingEvents extracts every StagingProgressEvent the fake messaging
+// client captured, in publish order.
+func decodeStagingEvents(mc *fakeMessagingClient) []StagingProgressEvent {
+	mc.mu.Lock()
+	defer mc.mu.Unlock()
+	var out []StagingProgressEvent
+	for _, p := range mc.published {
+		var evt StagingProgressEvent
+		if err := json.Unmarshal(p.Data, &evt); err != nil {
+			continue
+		}
+		if evt.ModelID == "" {
+			continue
+		}
+		out = append(out, evt)
+	}
+	return out
+}
+
+var _ = Describe("StagingTracker cross-replica broadcast", func() {
+	Context("when a publisher is wired (distributed mode)", func() {
+		It("broadcasts staging progress so a peer replica surfaces an op it did not originate", func() {
+			mc := &fakeMessagingClient{}
+			origin := NewStagingTracker()
+			origin.SetPublisher(mc)
+
+			origin.Start("model-x", "worker-1", 1)
+			origin.UpdateFile("model-x", "weights.gguf", 1, 5<<30, 10<<30, "100 MiB/s")
+
+			events := decodeStagingEvents(mc)
+			Expect(events).ToNot(BeEmpty(), "writes must be broadcast over NATS")
+			Expect(mc.published[0].Subject).To(Equal(messaging.SubjectStagingProgress("model-x")))
+
+			// A peer replica that never ran the op merges the broadcast.
+			peer := NewStagingTracker()
+			for _, evt := range events {
+				peer.ApplyRemote(evt)
+			}
+
+			all := peer.GetAll()
+			Expect(all).To(HaveKey("model-x"))
+			Expect(all["model-x"].NodeName).To(Equal("worker-1"))
+			Expect(all["model-x"].FileName).To(Equal("weights.gguf"))
+			Expect(all["model-x"].TotalBytes).To(Equal(int64(10 << 30)))
+		})
+
+		It("removes the op from the peer when the origin completes it", func() {
+			mc := &fakeMessagingClient{}
+			origin := NewStagingTracker()
+			origin.SetPublisher(mc)
+
+			origin.Start("model-x", "worker-1", 1)
+			origin.Complete("model-x")
+
+			peer := NewStagingTracker()
+			for _, evt := range decodeStagingEvents(mc) {
+				peer.ApplyRemote(evt)
+			}
+			Expect(peer.GetAll()).ToNot(HaveKey("model-x"))
+		})
+
+		It("does not let a peer broadcast clobber an op this replica is itself running", func() {
+			local := NewStagingTracker()
+			local.Start("model-x", "worker-local", 2)
+			local.UpdateFile("model-x", "weights.gguf", 1, 9<<30, 10<<30, "")
+
+			// A stray/older remote event for the SAME modelID must not overwrite
+			// the authoritative local state, nor delete it.
+			local.ApplyRemote(StagingProgressEvent{
+				ModelID: "model-x",
+				Status:  &StagingStatus{ModelID: "model-x", NodeName: "worker-other", FileName: "stale.gguf"},
+			})
+			local.ApplyRemote(StagingProgressEvent{ModelID: "model-x", Done: true})
+
+			all := local.GetAll()
+			Expect(all).To(HaveKey("model-x"))
+			Expect(all["model-x"].NodeName).To(Equal("worker-local"))
+			Expect(all["model-x"].FileName).To(Equal("weights.gguf"))
+		})
+	})
+
+	Context("when no publisher is wired (standalone mode)", func() {
+		It("does not broadcast", func() {
+			mc := &fakeMessagingClient{}
+			t := NewStagingTracker()
+			t.Start("model-x", "worker-1", 1)
+			t.UpdateFile("model-x", "weights.gguf", 1, 1<<30, 10<<30, "")
+			Expect(mc.published).To(BeEmpty())
+		})
+	})
+})
+
+var _ = Describe("SubjectStagingProgress", func() {
+	It("namespaces by model id and matches the wildcard prefix", func() {
+		Expect(messaging.SubjectStagingProgress("model-x")).To(Equal("staging.model-x.progress"))
+		Expect(messaging.SubjectStagingProgressWildcard).To(Equal("staging.*.progress"))
+	})
+})

From 7226bb9f30d734c5e77f04de40f003cd4a08d4b3 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Mon, 22 Jun 2026 12:21:58 +0200
Subject: [PATCH 45/99] chore: :arrow_up: Update CrispStrobe/CrispASR to
 `7a8cb80907341c0204bd0488c1244764f4163883` (#10315)

:arrow_up: Update CrispStrobe/CrispASR

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
---
 backend/go/crispasr/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/go/crispasr/Makefile b/backend/go/crispasr/Makefile
index bbc84f1de..21f66c240 100644
--- a/backend/go/crispasr/Makefile
+++ b/backend/go/crispasr/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
 
 # CrispASR version (release tag)
 CRISPASR_REPO?=https://github.com/CrispStrobe/CrispASR
-CRISPASR_VERSION?=d745bda4386ae0f9d1d2f23fff8ec95d76428221
+CRISPASR_VERSION?=7a8cb80907341c0204bd0488c1244764f4163883
 SO_TARGET?=libgocrispasr.so
 
 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF

From 62c99c10b3dcd312112f6555aeb231bc727266c3 Mon Sep 17 00:00:00 2001
From: Adira <dennisadira@gmail.com>
Date: Mon, 22 Jun 2026 13:38:06 +0300
Subject: [PATCH 46/99] fix(diffusers): pin diffusers and transformers to a
 known-good pair (#9979) (#10442)

fix(diffusers): pin diffusers and transformers to a known-good pair

The diffusers backend tracked git+https://github.com/huggingface/diffusers
(main) with an unpinned transformers. transformers v5 restructured
CLIPTextModel and removed the .text_model attribute that diffusers' single
-file loader reads, so loading any single-file Stable Diffusion checkpoint
fails:

    create_diffusers_clip_model_from_ldm (single_file_utils.py)
    position_embedding_dim = model.text_model.embeddings.position_embedding...
    AttributeError: 'CLIPTextModel' object has no attribute 'text_model'

No released diffusers (<=0.38.0) supports transformers v5 - only unreleased
diffusers main does. Because the requirements tracked main plus an unpinned
transformers, every backend image froze whichever pair existed at build
time, and images built once transformers v5 shipped but before diffusers
main caught up are permanently broken.

Pin the last known-good released pair across all requirements files:
diffusers==0.38.0 and transformers==4.57.6. 0.38.0 still exposes every
pipeline backend.py imports (Flux, Wan, Sana, LTX2, Qwen, GGUF), so no
functionality is lost, and builds become reproducible instead of drifting
into the broken window.

Fixes #9979

Assisted-by: Claude:claude-opus-4-8 [Claude Code]

Signed-off-by: Adira Denis Muhando <dennisadira@gmail.com>
---
 backend/python/diffusers/requirements-cpu.txt | 22 ++++++++++++-------
 .../diffusers/requirements-cublas12.txt       | 22 ++++++++++++-------
 .../diffusers/requirements-cublas13.txt       | 22 ++++++++++++-------
 .../python/diffusers/requirements-hipblas.txt | 22 ++++++++++++-------
 .../python/diffusers/requirements-intel.txt   | 22 ++++++++++++-------
 .../python/diffusers/requirements-l4t12.txt   | 22 ++++++++++++-------
 .../python/diffusers/requirements-l4t13.txt   | 22 ++++++++++++-------
 backend/python/diffusers/requirements-mps.txt | 22 ++++++++++++-------
 8 files changed, 112 insertions(+), 64 deletions(-)

diff --git a/backend/python/diffusers/requirements-cpu.txt b/backend/python/diffusers/requirements-cpu.txt
index 8db419b29..46959222c 100644
--- a/backend/python/diffusers/requirements-cpu.txt
+++ b/backend/python/diffusers/requirements-cpu.txt
@@ -1,7 +1,7 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
-git+https://github.com/huggingface/diffusers
+diffusers==0.38.0
 opencv-python
-transformers
+transformers==4.57.6
 torchvision==0.22.1
 accelerate
 git+https://github.com/xhinker/sd_embed
@@ -10,9 +10,15 @@ sentencepiece
 torch==2.7.1
 optimum-quanto
 ftfy
-# TODO: re-add compel once it supports transformers >= 5.
-# Tracking: https://github.com/damian0815/compel/pull/129
-#           https://github.com/damian0815/compel/issues/128
-# compel currently pins transformers~=4.25, which forced pip into multi-hour
-# resolver backtracking storms in CI. backend.py imports it lazily and gates
-# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
\ No newline at end of file
+# diffusers and transformers are pinned together on purpose. transformers v5
+# restructured CLIPTextModel and dropped the `.text_model` attribute, which
+# breaks single-file Stable Diffusion loading on every released diffusers
+# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
+# main via git froze whichever broken pair existed at image-build time. Pin the
+# last known-good released pair so builds are reproducible and can't drift into
+# the broken window. See https://github.com/mudler/LocalAI/issues/9979
+#
+# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
+# with this pin and previously forced pip into multi-hour resolver backtracking
+# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
+# the import succeeding, so dropping it here is safe.
\ No newline at end of file
diff --git a/backend/python/diffusers/requirements-cublas12.txt b/backend/python/diffusers/requirements-cublas12.txt
index e3351ae75..5e6852cc7 100644
--- a/backend/python/diffusers/requirements-cublas12.txt
+++ b/backend/python/diffusers/requirements-cublas12.txt
@@ -1,7 +1,7 @@
 --extra-index-url https://download.pytorch.org/whl/cu121
-git+https://github.com/huggingface/diffusers
+diffusers==0.38.0
 opencv-python
-transformers
+transformers==4.57.6
 torchvision
 accelerate
 git+https://github.com/xhinker/sd_embed
@@ -10,9 +10,15 @@ sentencepiece
 torch
 ftfy
 optimum-quanto
-# TODO: re-add compel once it supports transformers >= 5.
-# Tracking: https://github.com/damian0815/compel/pull/129
-#           https://github.com/damian0815/compel/issues/128
-# compel currently pins transformers~=4.25, which forced pip into multi-hour
-# resolver backtracking storms in CI. backend.py imports it lazily and gates
-# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
+# diffusers and transformers are pinned together on purpose. transformers v5
+# restructured CLIPTextModel and dropped the `.text_model` attribute, which
+# breaks single-file Stable Diffusion loading on every released diffusers
+# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
+# main via git froze whichever broken pair existed at image-build time. Pin the
+# last known-good released pair so builds are reproducible and can't drift into
+# the broken window. See https://github.com/mudler/LocalAI/issues/9979
+#
+# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
+# with this pin and previously forced pip into multi-hour resolver backtracking
+# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
+# the import succeeding, so dropping it here is safe.
diff --git a/backend/python/diffusers/requirements-cublas13.txt b/backend/python/diffusers/requirements-cublas13.txt
index 546998ba4..ce77b6e6e 100644
--- a/backend/python/diffusers/requirements-cublas13.txt
+++ b/backend/python/diffusers/requirements-cublas13.txt
@@ -1,7 +1,7 @@
 --extra-index-url https://download.pytorch.org/whl/cu130
-git+https://github.com/huggingface/diffusers
+diffusers==0.38.0
 opencv-python
-transformers
+transformers==4.57.6
 torchvision
 accelerate
 git+https://github.com/xhinker/sd_embed
@@ -10,9 +10,15 @@ sentencepiece
 torch
 ftfy
 optimum-quanto
-# TODO: re-add compel once it supports transformers >= 5.
-# Tracking: https://github.com/damian0815/compel/pull/129
-#           https://github.com/damian0815/compel/issues/128
-# compel currently pins transformers~=4.25, which forced pip into multi-hour
-# resolver backtracking storms in CI. backend.py imports it lazily and gates
-# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
+# diffusers and transformers are pinned together on purpose. transformers v5
+# restructured CLIPTextModel and dropped the `.text_model` attribute, which
+# breaks single-file Stable Diffusion loading on every released diffusers
+# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
+# main via git froze whichever broken pair existed at image-build time. Pin the
+# last known-good released pair so builds are reproducible and can't drift into
+# the broken window. See https://github.com/mudler/LocalAI/issues/9979
+#
+# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
+# with this pin and previously forced pip into multi-hour resolver backtracking
+# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
+# the import succeeding, so dropping it here is safe.
diff --git a/backend/python/diffusers/requirements-hipblas.txt b/backend/python/diffusers/requirements-hipblas.txt
index 3480d1fd6..f3666d5f5 100644
--- a/backend/python/diffusers/requirements-hipblas.txt
+++ b/backend/python/diffusers/requirements-hipblas.txt
@@ -1,17 +1,23 @@
 --extra-index-url https://download.pytorch.org/whl/rocm7.0
 torch==2.10.0+rocm7.0
 torchvision==0.25.0+rocm7.0
-git+https://github.com/huggingface/diffusers
+diffusers==0.38.0
 opencv-python
-transformers
+transformers==4.57.6
 accelerate
 peft
 sentencepiece
 optimum-quanto
 ftfy
-# TODO: re-add compel once it supports transformers >= 5.
-# Tracking: https://github.com/damian0815/compel/pull/129
-#           https://github.com/damian0815/compel/issues/128
-# compel currently pins transformers~=4.25, which forced pip into multi-hour
-# resolver backtracking storms in CI. backend.py imports it lazily and gates
-# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
\ No newline at end of file
+# diffusers and transformers are pinned together on purpose. transformers v5
+# restructured CLIPTextModel and dropped the `.text_model` attribute, which
+# breaks single-file Stable Diffusion loading on every released diffusers
+# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
+# main via git froze whichever broken pair existed at image-build time. Pin the
+# last known-good released pair so builds are reproducible and can't drift into
+# the broken window. See https://github.com/mudler/LocalAI/issues/9979
+#
+# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
+# with this pin and previously forced pip into multi-hour resolver backtracking
+# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
+# the import succeeding, so dropping it here is safe.
\ No newline at end of file
diff --git a/backend/python/diffusers/requirements-intel.txt b/backend/python/diffusers/requirements-intel.txt
index c78f5ef23..73ab5b3b8 100644
--- a/backend/python/diffusers/requirements-intel.txt
+++ b/backend/python/diffusers/requirements-intel.txt
@@ -3,18 +3,24 @@ torch
 torchvision
 optimum[openvino]
 setuptools
-git+https://github.com/huggingface/diffusers
+diffusers==0.38.0
 opencv-python
-transformers
+transformers==4.57.6
 accelerate
 git+https://github.com/xhinker/sd_embed
 peft
 sentencepiece
 optimum-quanto
 ftfy
-# TODO: re-add compel once it supports transformers >= 5.
-# Tracking: https://github.com/damian0815/compel/pull/129
-#           https://github.com/damian0815/compel/issues/128
-# compel currently pins transformers~=4.25, which forced pip into multi-hour
-# resolver backtracking storms in CI. backend.py imports it lazily and gates
-# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
\ No newline at end of file
+# diffusers and transformers are pinned together on purpose. transformers v5
+# restructured CLIPTextModel and dropped the `.text_model` attribute, which
+# breaks single-file Stable Diffusion loading on every released diffusers
+# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
+# main via git froze whichever broken pair existed at image-build time. Pin the
+# last known-good released pair so builds are reproducible and can't drift into
+# the broken window. See https://github.com/mudler/LocalAI/issues/9979
+#
+# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
+# with this pin and previously forced pip into multi-hour resolver backtracking
+# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
+# the import succeeding, so dropping it here is safe.
\ No newline at end of file
diff --git a/backend/python/diffusers/requirements-l4t12.txt b/backend/python/diffusers/requirements-l4t12.txt
index 15857c4b0..9a9cdb0df 100644
--- a/backend/python/diffusers/requirements-l4t12.txt
+++ b/backend/python/diffusers/requirements-l4t12.txt
@@ -1,7 +1,7 @@
 --extra-index-url https://pypi.jetson-ai-lab.io/jp6/cu129/
 torch
-git+https://github.com/huggingface/diffusers
-transformers
+diffusers==0.38.0
+transformers==4.57.6
 accelerate
 peft
 optimum-quanto
@@ -9,9 +9,15 @@ numpy<2
 sentencepiece
 torchvision
 ftfy
-# TODO: re-add compel once it supports transformers >= 5.
-# Tracking: https://github.com/damian0815/compel/pull/129
-#           https://github.com/damian0815/compel/issues/128
-# compel currently pins transformers~=4.25, which forced pip into multi-hour
-# resolver backtracking storms in CI. backend.py imports it lazily and gates
-# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
+# diffusers and transformers are pinned together on purpose. transformers v5
+# restructured CLIPTextModel and dropped the `.text_model` attribute, which
+# breaks single-file Stable Diffusion loading on every released diffusers
+# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
+# main via git froze whichever broken pair existed at image-build time. Pin the
+# last known-good released pair so builds are reproducible and can't drift into
+# the broken window. See https://github.com/mudler/LocalAI/issues/9979
+#
+# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
+# with this pin and previously forced pip into multi-hour resolver backtracking
+# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
+# the import succeeding, so dropping it here is safe.
diff --git a/backend/python/diffusers/requirements-l4t13.txt b/backend/python/diffusers/requirements-l4t13.txt
index 226033a61..964c9c9f2 100644
--- a/backend/python/diffusers/requirements-l4t13.txt
+++ b/backend/python/diffusers/requirements-l4t13.txt
@@ -1,7 +1,7 @@
 --extra-index-url https://download.pytorch.org/whl/cu130
 torch
-git+https://github.com/huggingface/diffusers
-transformers
+diffusers==0.38.0
+transformers==4.57.6
 accelerate
 peft
 optimum-quanto
@@ -10,9 +10,15 @@ sentencepiece
 torchvision
 ftfy
 chardet
-# TODO: re-add compel once it supports transformers >= 5.
-# Tracking: https://github.com/damian0815/compel/pull/129
-#           https://github.com/damian0815/compel/issues/128
-# compel currently pins transformers~=4.25, which forced pip into multi-hour
-# resolver backtracking storms in CI. backend.py imports it lazily and gates
-# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
+# diffusers and transformers are pinned together on purpose. transformers v5
+# restructured CLIPTextModel and dropped the `.text_model` attribute, which
+# breaks single-file Stable Diffusion loading on every released diffusers
+# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
+# main via git froze whichever broken pair existed at image-build time. Pin the
+# last known-good released pair so builds are reproducible and can't drift into
+# the broken window. See https://github.com/mudler/LocalAI/issues/9979
+#
+# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
+# with this pin and previously forced pip into multi-hour resolver backtracking
+# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
+# the import succeeding, so dropping it here is safe.
diff --git a/backend/python/diffusers/requirements-mps.txt b/backend/python/diffusers/requirements-mps.txt
index 58eb65f02..eeea59ddd 100644
--- a/backend/python/diffusers/requirements-mps.txt
+++ b/backend/python/diffusers/requirements-mps.txt
@@ -1,16 +1,22 @@
 torch==2.7.1
 torchvision==0.22.1
-git+https://github.com/huggingface/diffusers
+diffusers==0.38.0
 opencv-python
-transformers
+transformers==4.57.6
 accelerate
 peft
 sentencepiece
 optimum-quanto
 ftfy
-# TODO: re-add compel once it supports transformers >= 5.
-# Tracking: https://github.com/damian0815/compel/pull/129
-#           https://github.com/damian0815/compel/issues/128
-# compel currently pins transformers~=4.25, which forced pip into multi-hour
-# resolver backtracking storms in CI. backend.py imports it lazily and gates
-# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
\ No newline at end of file
+# diffusers and transformers are pinned together on purpose. transformers v5
+# restructured CLIPTextModel and dropped the `.text_model` attribute, which
+# breaks single-file Stable Diffusion loading on every released diffusers
+# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
+# main via git froze whichever broken pair existed at image-build time. Pin the
+# last known-good released pair so builds are reproducible and can't drift into
+# the broken window. See https://github.com/mudler/LocalAI/issues/9979
+#
+# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
+# with this pin and previously forced pip into multi-hour resolver backtracking
+# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
+# the import succeeding, so dropping it here is safe.
\ No newline at end of file

From f2abcc7503c86b254dd96eb383009ffe97cdb5cb Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Mon, 22 Jun 2026 16:09:16 +0200
Subject: [PATCH 47/99] chore(model gallery): :robot: add 1 new models via
 gallery agent (#10445)

chore(model gallery): :robot: add new models via gallery agent

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
---
 gallery/index.yaml | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/gallery/index.yaml b/gallery/index.yaml
index 710bc2740..c0fd9c3c3 100644
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -1,4 +1,50 @@
 ---
+- name: "qwythos-9b-claude-mythos-5-1m"
+  url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
+  urls:
+    - https://huggingface.co/empero-ai/Qwythos-9B-Claude-Mythos-5-1M-GGUF
+  description: |
+    # Qwythos-9B
+
+    **Developed by Empero**
+
+    **Qwythos-9B** is a full-parameter reasoning model built on top of a **deeply uncensored Qwen3.5-9B base** and post-trained on **over 500 million tokens** of high-quality Claude Mythos and Claude Fable traces, with chain-of-thought generated in-house by Empero AI's internal tool **rethink**.
+
+    The result is a compact, fast, **dramatically more capable** 9B reasoning model. Headline capabilities:
+
+    ...
+  license: "apache-2.0"
+  tags:
+    - llm
+    - gguf
+    - vision
+    - multimodal
+    - reasoning
+  overrides:
+    backend: llama-cpp
+    function:
+      automatic_tool_parsing_fallback: true
+      grammar:
+        disable: true
+    known_usecases:
+      - chat
+    mmproj: llama-cpp/mmproj/Qwythos-9B-Claude-Mythos-5-1M-GGUF/mmproj-Qwythos-9B-Claude-Mythos-5-1M-f16.gguf
+    options:
+      - use_jinja:true
+      - spec_type:draft-mtp
+      - spec_n_max:6
+      - spec_p_min:0.75
+    parameters:
+      model: llama-cpp/models/Qwythos-9B-Claude-Mythos-5-1M-GGUF/Qwythos-9B-Claude-Mythos-5-1M-MTP-Q4_K_M.gguf
+    template:
+      use_tokenizer_template: true
+  files:
+    - filename: llama-cpp/models/Qwythos-9B-Claude-Mythos-5-1M-GGUF/Qwythos-9B-Claude-Mythos-5-1M-MTP-Q4_K_M.gguf
+      sha256: 24ee22e0f5d9f0d3d615809607f365c728d9b0c3f3fb6eb19d8bd83a1c2933d8
+      uri: https://huggingface.co/empero-ai/Qwythos-9B-Claude-Mythos-5-1M-GGUF/resolve/main/Qwythos-9B-Claude-Mythos-5-1M-MTP-Q4_K_M.gguf
+    - filename: llama-cpp/mmproj/Qwythos-9B-Claude-Mythos-5-1M-GGUF/mmproj-Qwythos-9B-Claude-Mythos-5-1M-f16.gguf
+      sha256: f70dc3509053962b0d0d3ee8a7eacebf5d60aa560cad78254ae8698516ae029f
+      uri: https://huggingface.co/empero-ai/Qwythos-9B-Claude-Mythos-5-1M-GGUF/resolve/main/mmproj-Qwythos-9B-Claude-Mythos-5-1M-f16.gguf
 - name: "glm-5.2"
   url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
   urls:

From 95b058e1c5df627069cc1152c27317a77a5a6c3a Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Mon, 22 Jun 2026 18:24:29 +0200
Subject: [PATCH 48/99] feat(ui): restructure Cluster Nodes view (pulse + panel
 roster + detail page) (#10447)

* chore: gitignore SDD scratch directory

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* feat(nodes): add GET /api/nodes/models cluster-wide loaded-models endpoint

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* feat(ui): add nodesApi.allModels() for cluster-wide model roster

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* feat(ui): move Scheduling to its own page and nav item

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* feat(ui): replace nodes stat-card strip with cluster pulse + attention callout

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* feat(ui): node-panel roster with inline model chips and segmented filter

Replace the Nodes table with a full-width node-panel roster that shows
each backend node's running-model chips without an expand click, plus an
All/Backend/Agent segmented filter. Per-node detail (models, backends,
labels, capacity) moves to the node detail page.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* feat(ui): add deep-linkable node detail page at /app/nodes/:id

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* fix(ui): remove em-dash from CapacityEditor comment; align detail spec backend mock

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* chore(ui): nodes page cleanup, hover/chip polish, docs for restructured cluster view

Nodes.jsx dead-code sweep confirmed clean (no StatCard/table/expand
state/scheduling-form leftovers). Two App.css polish fixes: move the
node-panel hover border-color onto the bordered element so hover gives
real feedback, and add the missing .model-chip__state rule the
ModelChip component already emits. Update distributed-mode docs prose to
describe the restructured cluster view (cluster pulse, attention
callout, node-panel roster with inline model chips, All/Backend/Agent
filter, node detail page at /app/nodes/:id, Scheduling as its own page).

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* chore(ui): drop unused gpuVendorLabel export from nodeStatus

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
---
 .gitignore                                    |    3 +
 core/http/endpoints/localai/nodes.go          |   17 +
 core/http/endpoints/localai/nodes_test.go     |   40 +
 core/http/react-ui/e2e/nodes-detail.spec.js   |   34 +
 .../nodes-per-node-backend-actions.spec.js    |   69 +-
 core/http/react-ui/e2e/nodes-roster.spec.js   |   47 +
 .../react-ui/e2e/page-render-smoke.spec.js    |    1 +
 core/http/react-ui/e2e/scheduling.spec.js     |   16 +
 .../react-ui/public/locales/de/admin.json     |    4 +
 core/http/react-ui/public/locales/de/nav.json |    1 +
 .../react-ui/public/locales/en/admin.json     |    4 +
 core/http/react-ui/public/locales/en/nav.json |    1 +
 .../react-ui/public/locales/es/admin.json     |    4 +
 core/http/react-ui/public/locales/es/nav.json |    1 +
 .../react-ui/public/locales/id/admin.json     |    4 +
 core/http/react-ui/public/locales/id/nav.json |    1 +
 .../react-ui/public/locales/it/admin.json     |    4 +
 core/http/react-ui/public/locales/it/nav.json |    1 +
 .../react-ui/public/locales/ko/admin.json     |    4 +
 core/http/react-ui/public/locales/ko/nav.json |    1 +
 .../react-ui/public/locales/zh-CN/admin.json  |    4 +
 .../react-ui/public/locales/zh-CN/nav.json    |    1 +
 core/http/react-ui/src/App.css                |   53 +
 .../src/components/console/consoleConfig.js   |    1 +
 .../src/components/nodes/AttentionCallout.jsx |   31 +
 .../src/components/nodes/CapacityEditor.jsx   |  196 +++
 .../src/components/nodes/ClusterPulse.jsx     |   18 +
 .../src/components/nodes/KeyValueChips.jsx    |   98 ++
 .../src/components/nodes/ModelChip.jsx        |   12 +
 .../src/components/nodes/NodePanel.jsx        |   60 +
 .../src/components/nodes/StatusPill.jsx       |   11 +
 .../src/components/nodes/nodeStatus.js        |   34 +
 core/http/react-ui/src/pages/NodeDetail.jsx   |  352 ++++
 core/http/react-ui/src/pages/Nodes.jsx        | 1451 +----------------
 core/http/react-ui/src/pages/Scheduling.jsx   |  438 +++++
 core/http/react-ui/src/router.jsx             |    4 +
 core/http/react-ui/src/utils/api.js           |    1 +
 core/http/react-ui/src/utils/config.js        |    1 +
 core/http/routes/nodes.go                     |    3 +
 docs/content/features/distributed-mode.md     |    4 +-
 swagger/docs.go                               |   65 +
 swagger/swagger.json                          |   65 +
 swagger/swagger.yaml                          |   44 +
 43 files changed, 1761 insertions(+), 1443 deletions(-)
 create mode 100644 core/http/react-ui/e2e/nodes-detail.spec.js
 create mode 100644 core/http/react-ui/e2e/nodes-roster.spec.js
 create mode 100644 core/http/react-ui/e2e/scheduling.spec.js
 create mode 100644 core/http/react-ui/src/components/nodes/AttentionCallout.jsx
 create mode 100644 core/http/react-ui/src/components/nodes/CapacityEditor.jsx
 create mode 100644 core/http/react-ui/src/components/nodes/ClusterPulse.jsx
 create mode 100644 core/http/react-ui/src/components/nodes/KeyValueChips.jsx
 create mode 100644 core/http/react-ui/src/components/nodes/ModelChip.jsx
 create mode 100644 core/http/react-ui/src/components/nodes/NodePanel.jsx
 create mode 100644 core/http/react-ui/src/components/nodes/StatusPill.jsx
 create mode 100644 core/http/react-ui/src/components/nodes/nodeStatus.js
 create mode 100644 core/http/react-ui/src/pages/NodeDetail.jsx
 create mode 100644 core/http/react-ui/src/pages/Scheduling.jsx

diff --git a/.gitignore b/.gitignore
index cc7d25fa6..177c79cba 100644
--- a/.gitignore
+++ b/.gitignore
@@ -91,3 +91,6 @@ core/http/react-ui/test-results/
 
 # Local worktrees
 .worktrees/
+
+# SDD / brainstorm scratch (agent-driven development)
+.superpowers/
diff --git a/core/http/endpoints/localai/nodes.go b/core/http/endpoints/localai/nodes.go
index 820cb137f..d6c44e383 100644
--- a/core/http/endpoints/localai/nodes.go
+++ b/core/http/endpoints/localai/nodes.go
@@ -385,6 +385,23 @@ func GetNodeModelsEndpoint(registry *nodes.NodeRegistry) echo.HandlerFunc {
 	}
 }
 
+// ListAllNodeModelsEndpoint returns all loaded models across all healthy nodes.
+// @Summary List all loaded models cluster-wide
+// @Tags Nodes
+// @Success 200 {array} nodes.NodeModel
+// @Router /api/nodes/models [get]
+func ListAllNodeModelsEndpoint(registry *nodes.NodeRegistry) echo.HandlerFunc {
+	return func(c echo.Context) error {
+		ctx := c.Request().Context()
+		models, err := registry.ListAllLoadedModels(ctx)
+		if err != nil {
+			xlog.Error("Failed to list all node models", "error", err)
+			return c.JSON(http.StatusInternalServerError, nodeError(http.StatusInternalServerError, "failed to list node models"))
+		}
+		return c.JSON(http.StatusOK, models)
+	}
+}
+
 // DrainNodeEndpoint sets a node to draining status (no new requests).
 func DrainNodeEndpoint(registry *nodes.NodeRegistry) echo.HandlerFunc {
 	return func(c echo.Context) error {
diff --git a/core/http/endpoints/localai/nodes_test.go b/core/http/endpoints/localai/nodes_test.go
index bca6f42bf..52cef6f03 100644
--- a/core/http/endpoints/localai/nodes_test.go
+++ b/core/http/endpoints/localai/nodes_test.go
@@ -407,4 +407,44 @@ var _ = Describe("Node HTTP handlers", func() {
 			Expect(names).To(ConsistOf("alpha", "beta"))
 		})
 	})
+
+	Describe("ListAllNodeModelsEndpoint", func() {
+		It("returns an empty list when no models are loaded", func() {
+			e := echo.New()
+			req := httptest.NewRequest(http.MethodGet, "/", nil)
+			rec := httptest.NewRecorder()
+			c := e.NewContext(req, rec)
+
+			handler := ListAllNodeModelsEndpoint(registry)
+			Expect(handler(c)).To(Succeed())
+			Expect(rec.Code).To(Equal(http.StatusOK))
+
+			var list []nodes.NodeModel
+			Expect(json.Unmarshal(rec.Body.Bytes(), &list)).To(Succeed())
+			Expect(list).To(BeEmpty())
+		})
+
+		It("returns loaded models across healthy nodes", func() {
+			ctx := context.Background()
+			Expect(registry.Register(ctx, &nodes.BackendNode{
+				ID: "n1", Name: "alpha", Address: "10.0.0.1:50051", Status: nodes.StatusHealthy,
+			}, true)).To(Succeed())
+			Expect(registry.SetNodeModel(ctx, "n1", "llama-3.3", 0, "loaded", "10.0.0.1:50051", 0)).To(Succeed())
+
+			e := echo.New()
+			req := httptest.NewRequest(http.MethodGet, "/", nil)
+			rec := httptest.NewRecorder()
+			c := e.NewContext(req, rec)
+
+			handler := ListAllNodeModelsEndpoint(registry)
+			Expect(handler(c)).To(Succeed())
+			Expect(rec.Code).To(Equal(http.StatusOK))
+
+			var list []nodes.NodeModel
+			Expect(json.Unmarshal(rec.Body.Bytes(), &list)).To(Succeed())
+			Expect(list).To(HaveLen(1))
+			Expect(list[0].ModelName).To(Equal("llama-3.3"))
+			Expect(list[0].NodeID).To(Equal("n1"))
+		})
+	})
 })
diff --git a/core/http/react-ui/e2e/nodes-detail.spec.js b/core/http/react-ui/e2e/nodes-detail.spec.js
new file mode 100644
index 000000000..65690ba49
--- /dev/null
+++ b/core/http/react-ui/e2e/nodes-detail.spec.js
@@ -0,0 +1,34 @@
+import { test, expect } from './coverage-fixtures.js'
+
+const ID = 'n1'
+async function mockNode(page) {
+  await page.route(`**/api/nodes/${ID}`, r => r.fulfill({ status: 200, contentType: 'application/json',
+    body: JSON.stringify({ id: ID, name: 'alpha', node_type: 'backend', address: '10.0.0.1:50051', status: 'healthy', total_vram: 24e9, available_vram: 12e9, max_replicas_per_model: 1, labels: { env: 'prod' } }) }))
+  await page.route(`**/api/nodes/${ID}/models`, r => r.fulfill({ status: 200, contentType: 'application/json',
+    body: JSON.stringify([{ node_id: ID, model_name: 'llama-3.3', state: 'loaded', in_flight: 0, replica_index: 0 }]) }))
+  await page.route(`**/api/nodes/${ID}/backends`, r => r.fulfill({ status: 200, contentType: 'application/json',
+    body: JSON.stringify([{ name: 'llama-cpp', is_system: true, installed_at: '2026-06-01T00:00:00Z' }]) }))
+}
+
+test.describe('Node detail page', () => {
+  test('renders sections for a node', async ({ page }) => {
+    await mockNode(page)
+    await page.goto(`/app/nodes/${ID}`)
+    await expect(page.locator('.page-title').first()).toBeVisible({ timeout: 15_000 })
+    await expect(page.getByText('alpha')).toBeVisible()
+    await expect(page.getByText('llama-3.3')).toBeVisible()
+    await expect(page.getByText('llama-cpp')).toBeVisible()
+    await expect(page.getByText('env=prod')).toBeVisible()
+  })
+
+  test('is reachable by clicking a roster panel', async ({ page }) => {
+    await page.route('**/api/nodes', r => r.fulfill({ status: 200, contentType: 'application/json',
+      body: JSON.stringify([{ id: ID, name: 'alpha', node_type: 'backend', address: '10.0.0.1:50051', status: 'healthy' }]) }))
+    await page.route('**/api/nodes/models', r => r.fulfill({ status: 200, contentType: 'application/json', body: '[]' }))
+    await page.route('**/api/nodes/scheduling', r => r.fulfill({ status: 200, contentType: 'application/json', body: '[]' }))
+    await mockNode(page)
+    await page.goto('/app/nodes')
+    await page.locator('.node-panel').filter({ hasText: 'alpha' }).getByText('alpha').click()
+    await expect(page).toHaveURL(new RegExp(`/app/nodes/${ID}$`))
+  })
+})
diff --git a/core/http/react-ui/e2e/nodes-per-node-backend-actions.spec.js b/core/http/react-ui/e2e/nodes-per-node-backend-actions.spec.js
index 76855437f..9ad92932c 100644
--- a/core/http/react-ui/e2e/nodes-per-node-backend-actions.spec.js
+++ b/core/http/react-ui/e2e/nodes-per-node-backend-actions.spec.js
@@ -12,28 +12,37 @@ const NODE_NAME = 'worker-test'
 const BACKEND_NAME = 'cuda12-vllm-development'
 
 async function mockDistributedNodes(page, { onDelete } = {}) {
+  const nodeRecord = {
+    id: NODE_ID,
+    name: NODE_NAME,
+    node_type: 'backend',
+    address: '10.0.0.1:50051',
+    http_address: '10.0.0.1:8090',
+    status: 'healthy',
+    total_vram: 0,
+    available_vram: 0,
+    total_ram: 8_000_000_000,
+    available_ram: 4_000_000_000,
+    gpu_vendor: '',
+    last_heartbeat: new Date().toISOString(),
+    created_at: new Date().toISOString(),
+    updated_at: new Date().toISOString(),
+  }
+
   await page.route('**/api/nodes', (route) => {
     route.fulfill({
       status: 200,
       contentType: 'application/json',
-      body: JSON.stringify([
-        {
-          id: NODE_ID,
-          name: NODE_NAME,
-          node_type: 'backend',
-          address: '10.0.0.1:50051',
-          http_address: '10.0.0.1:8090',
-          status: 'healthy',
-          total_vram: 0,
-          available_vram: 0,
-          total_ram: 8_000_000_000,
-          available_ram: 4_000_000_000,
-          gpu_vendor: '',
-          last_heartbeat: new Date().toISOString(),
-          created_at: new Date().toISOString(),
-          updated_at: new Date().toISOString(),
-        },
-      ]),
+      body: JSON.stringify([nodeRecord]),
+    })
+  })
+
+  // The detail page fetches the single node via nodesApi.get(id).
+  await page.route(`**/api/nodes/${NODE_ID}`, (route) => {
+    route.fulfill({
+      status: 200,
+      contentType: 'application/json',
+      body: JSON.stringify(nodeRecord),
     })
   })
 
@@ -80,24 +89,18 @@ async function mockDistributedNodes(page, { onDelete } = {}) {
   })
 }
 
-async function expandNodeAndWaitForBackends(page) {
-  await page.goto('/app/nodes')
-  // Click the row to expand it. The chevron toggle and the row both work,
-  // but clicking the name cell is the most user-like.
-  await page.getByText(NODE_NAME).first().click()
-  // Backends, Capacity and Labels live behind a "Manage" <details>
-  // disclosure (the drawer was distilled to keep at-a-glance content
-  // lean — see distill refactor in the multi-replica branch). Open it
-  // by clicking the summary inside the .node-manage scope so the
-  // per-node backend table is in the DOM before assertions run.
-  await page.locator('.node-manage > summary').first().click()
+async function openNodeDetail(page) {
+  // The per-node backend table now lives on the deep-linkable detail page
+  // at /app/nodes/:id (the old expand-row + "Manage" disclosure was removed
+  // when the roster was restructured). Navigate straight there.
+  await page.goto(`/app/nodes/${NODE_ID}`)
   await expect(page.getByRole('cell', { name: BACKEND_NAME, exact: true })).toBeVisible({ timeout: 10_000 })
 }
 
 test.describe('Nodes page — per-node backend actions', () => {
   test('upgrade affordance is self-explanatory (not "Reinstall backend" with a sync icon)', async ({ page }) => {
     await mockDistributedNodes(page)
-    await expandNodeAndWaitForBackends(page)
+    await openNodeDetail(page)
 
     // Negative: the old, ambiguous wording must not be used.
     await expect(page.locator('button[title="Reinstall backend"]')).toHaveCount(0)
@@ -114,7 +117,7 @@ test.describe('Nodes page — per-node backend actions', () => {
 
   test('per-node backend row shows a delete (trash) button next to upgrade', async ({ page }) => {
     await mockDistributedNodes(page)
-    await expandNodeAndWaitForBackends(page)
+    await openNodeDetail(page)
 
     const deleteBtn = page.locator('button[title="Delete backend from this node"]')
     await expect(deleteBtn).toBeVisible()
@@ -128,7 +131,7 @@ test.describe('Nodes page — per-node backend actions', () => {
         postedBody = route.request().postDataJSON()
       },
     })
-    await expandNodeAndWaitForBackends(page)
+    await openNodeDetail(page)
 
     await page.locator('button[title="Delete backend from this node"]').click()
 
@@ -150,7 +153,7 @@ test.describe('Nodes page — per-node backend actions', () => {
         deleteCalls += 1
       },
     })
-    await expandNodeAndWaitForBackends(page)
+    await openNodeDetail(page)
 
     await page.locator('button[title="Delete backend from this node"]').click()
 
diff --git a/core/http/react-ui/e2e/nodes-roster.spec.js b/core/http/react-ui/e2e/nodes-roster.spec.js
new file mode 100644
index 000000000..861b94441
--- /dev/null
+++ b/core/http/react-ui/e2e/nodes-roster.spec.js
@@ -0,0 +1,47 @@
+import { test, expect } from './coverage-fixtures.js'
+
+async function mockCluster(page, nodes) {
+  await page.route('**/api/nodes', r => r.fulfill({ status: 200, contentType: 'application/json', body: JSON.stringify(nodes) }))
+  await page.route('**/api/nodes/models', r => r.fulfill({ status: 200, contentType: 'application/json', body: '[]' }))
+  await page.route('**/api/nodes/scheduling', r => r.fulfill({ status: 200, contentType: 'application/json', body: '[]' }))
+}
+
+test.describe('Nodes roster header', () => {
+  test('shows a cluster pulse line and no stat-card grid', async ({ page }) => {
+    await mockCluster(page, [
+      { id: 'n1', name: 'alpha', node_type: 'backend', address: '10.0.0.1:50051', status: 'healthy' },
+      { id: 'n2', name: 'beta', node_type: 'backend', address: '10.0.0.2:50051', status: 'draining' },
+    ])
+    await page.goto('/app/nodes')
+    await expect(page.locator('.cluster-pulse')).toBeVisible({ timeout: 15_000 })
+    await expect(page.locator('.cluster-pulse')).toContainText('2 nodes')
+    await expect(page.locator('.stat-grid')).toHaveCount(0)
+  })
+
+  test('shows an approval callout for pending nodes', async ({ page }) => {
+    await mockCluster(page, [{ id: 'n3', name: 'gamma', node_type: 'backend', address: '10.0.0.3:50051', status: 'pending' }])
+    await page.goto('/app/nodes')
+    await expect(page.locator('.attention-callout')).toContainText('approval', { timeout: 15_000 })
+  })
+})
+
+test.describe('Nodes roster panels', () => {
+  test('shows model chips without clicking and filters by type', async ({ page }) => {
+    await page.route('**/api/nodes', r => r.fulfill({ status: 200, contentType: 'application/json', body: JSON.stringify([
+      { id: 'n1', name: 'alpha', node_type: 'backend', address: '10.0.0.1:50051', status: 'healthy' },
+      { id: 'a1', name: 'agent-1', node_type: 'agent', address: '10.0.0.9:50051', status: 'healthy' },
+    ]) }))
+    await page.route('**/api/nodes/models', r => r.fulfill({ status: 200, contentType: 'application/json', body: JSON.stringify([
+      { node_id: 'n1', model_name: 'llama-3.3', state: 'loaded', in_flight: 2, replica_index: 0 },
+    ]) }))
+    await page.route('**/api/nodes/scheduling', r => r.fulfill({ status: 200, contentType: 'application/json', body: '[]' }))
+
+    await page.goto('/app/nodes')
+    // model chip visible without any expand click
+    await expect(page.locator('.node-panel').filter({ hasText: 'alpha' }).getByText('llama-3.3')).toBeVisible({ timeout: 15_000 })
+    // segmented filter: Agent shows the agent node, hides the backend node
+    await page.getByRole('radio', { name: /Agent/ }).click()
+    await expect(page.getByText('agent-1')).toBeVisible()
+    await expect(page.getByText('alpha')).toHaveCount(0)
+  })
+})
diff --git a/core/http/react-ui/e2e/page-render-smoke.spec.js b/core/http/react-ui/e2e/page-render-smoke.spec.js
index 40cfa1897..5f89764e0 100644
--- a/core/http/react-ui/e2e/page-render-smoke.spec.js
+++ b/core/http/react-ui/e2e/page-render-smoke.spec.js
@@ -21,6 +21,7 @@ const PAGES = [
   ['/app/backends', 'Backends'],
   ['/app/settings', 'Settings'],
   ['/app/nodes', 'Nodes'],
+  ['/app/scheduling', 'Scheduling'],
   ['/app/face', 'Face recognition'],
   ['/app/voice', 'Voice recognition'],
   ['/app/fine-tune', 'Fine-tuning'],
diff --git a/core/http/react-ui/e2e/scheduling.spec.js b/core/http/react-ui/e2e/scheduling.spec.js
new file mode 100644
index 000000000..a43f11be7
--- /dev/null
+++ b/core/http/react-ui/e2e/scheduling.spec.js
@@ -0,0 +1,16 @@
+import { test, expect } from './coverage-fixtures.js'
+
+test.describe('Scheduling page', () => {
+  test('renders at /app/scheduling with rules from the API', async ({ page }) => {
+    await page.route('**/api/nodes/scheduling', (route) => {
+      route.fulfill({
+        status: 200, contentType: 'application/json',
+        body: JSON.stringify([{ model_name: 'llama-3.3', spread_all: true, min_replicas: 0, max_replicas: 0 }]),
+      })
+    })
+    await page.goto('/app/scheduling')
+    await expect(page.locator('.page-title').first()).toBeVisible({ timeout: 15_000 })
+    await expect(page).toHaveURL(/\/app\/scheduling$/)
+    await expect(page.getByText('llama-3.3')).toBeVisible()
+  })
+})
diff --git a/core/http/react-ui/public/locales/de/admin.json b/core/http/react-ui/public/locales/de/admin.json
index 88582b5a2..3bf9daa68 100644
--- a/core/http/react-ui/public/locales/de/admin.json
+++ b/core/http/react-ui/public/locales/de/admin.json
@@ -43,6 +43,10 @@
     "title": "Verteilte Knoten",
     "subtitle": "Backend- und Agenten-Worker-Knoten verwalten"
   },
+  "scheduling": {
+    "title": "Planung",
+    "subtitle": "Modellplatzierung und Replikat-Regeln im gesamten Cluster"
+  },
   "p2p": {
     "title": "Verteilte KI-Berechnung",
     "subtitle": "Skalieren Sie Ihre KI-Workloads über mehrere Geräte mit Peer-to-Peer-Verteilung"
diff --git a/core/http/react-ui/public/locales/de/nav.json b/core/http/react-ui/public/locales/de/nav.json
index 29f5c65d6..f2950da2d 100644
--- a/core/http/react-ui/public/locales/de/nav.json
+++ b/core/http/react-ui/public/locales/de/nav.json
@@ -50,6 +50,7 @@
     "backends": "Backends",
     "traces": "Traces",
     "nodes": "Knoten",
+    "scheduling": "Planung",
     "swarm": "Swarm",
     "system": "System",
     "settings": "Einstellungen",
diff --git a/core/http/react-ui/public/locales/en/admin.json b/core/http/react-ui/public/locales/en/admin.json
index f4a380ae3..05155dd25 100644
--- a/core/http/react-ui/public/locales/en/admin.json
+++ b/core/http/react-ui/public/locales/en/admin.json
@@ -43,6 +43,10 @@
     "title": "Distributed Nodes",
     "subtitle": "Manage backend and agent worker nodes"
   },
+  "scheduling": {
+    "title": "Scheduling",
+    "subtitle": "Model placement and replica rules across the cluster"
+  },
   "p2p": {
     "title": "Distributed AI Computing",
     "subtitle": "Scale your AI workloads across multiple devices with peer-to-peer distribution"
diff --git a/core/http/react-ui/public/locales/en/nav.json b/core/http/react-ui/public/locales/en/nav.json
index 20c8e1599..5423438f9 100644
--- a/core/http/react-ui/public/locales/en/nav.json
+++ b/core/http/react-ui/public/locales/en/nav.json
@@ -51,6 +51,7 @@
     "backends": "Backends",
     "traces": "Traces",
     "nodes": "Nodes",
+    "scheduling": "Scheduling",
     "swarm": "Swarm",
     "system": "System",
     "settings": "Settings",
diff --git a/core/http/react-ui/public/locales/es/admin.json b/core/http/react-ui/public/locales/es/admin.json
index fee37c1ab..1d4b61180 100644
--- a/core/http/react-ui/public/locales/es/admin.json
+++ b/core/http/react-ui/public/locales/es/admin.json
@@ -43,6 +43,10 @@
     "title": "Nodos distribuidos",
     "subtitle": "Administra nodos worker de backends y agentes"
   },
+  "scheduling": {
+    "title": "Planificación",
+    "subtitle": "Reglas de ubicación de modelos y réplicas en el clúster"
+  },
   "p2p": {
     "title": "Computación de IA distribuida",
     "subtitle": "Escala tus cargas de trabajo de IA en múltiples dispositivos con distribución peer-to-peer"
diff --git a/core/http/react-ui/public/locales/es/nav.json b/core/http/react-ui/public/locales/es/nav.json
index bbb2084fe..a1ed97bca 100644
--- a/core/http/react-ui/public/locales/es/nav.json
+++ b/core/http/react-ui/public/locales/es/nav.json
@@ -50,6 +50,7 @@
     "backends": "Backends",
     "traces": "Trazas",
     "nodes": "Nodos",
+    "scheduling": "Planificación",
     "swarm": "Swarm",
     "system": "Sistema",
     "settings": "Configuración",
diff --git a/core/http/react-ui/public/locales/id/admin.json b/core/http/react-ui/public/locales/id/admin.json
index 5e83eb37f..17da13570 100644
--- a/core/http/react-ui/public/locales/id/admin.json
+++ b/core/http/react-ui/public/locales/id/admin.json
@@ -43,6 +43,10 @@
     "title": "Node Terdistribusi",
     "subtitle": "Kelola node backend dan node worker"
   },
+  "scheduling": {
+    "title": "Penjadwalan",
+    "subtitle": "Aturan penempatan model dan replika di seluruh klaster"
+  },
   "p2p": {
     "title": "Komputasi AI Terdistribusi",
     "subtitle": "Skalakan beban kerja AI Anda ke beberapa perangkat dengan distribusi peer-to-peer"
diff --git a/core/http/react-ui/public/locales/id/nav.json b/core/http/react-ui/public/locales/id/nav.json
index 34d025277..225fc59f7 100644
--- a/core/http/react-ui/public/locales/id/nav.json
+++ b/core/http/react-ui/public/locales/id/nav.json
@@ -51,6 +51,7 @@
     "backends": "Backend",
     "traces": "Trace",
     "nodes": "Node",
+    "scheduling": "Penjadwalan",
     "swarm": "Swarm",
     "system": "Sistem",
     "settings": "Pengaturan",
diff --git a/core/http/react-ui/public/locales/it/admin.json b/core/http/react-ui/public/locales/it/admin.json
index 2bd575b66..323bae421 100644
--- a/core/http/react-ui/public/locales/it/admin.json
+++ b/core/http/react-ui/public/locales/it/admin.json
@@ -43,6 +43,10 @@
     "title": "Nodi distribuiti",
     "subtitle": "Gestisci i nodi worker dei backend e degli agenti"
   },
+  "scheduling": {
+    "title": "Pianificazione",
+    "subtitle": "Regole di posizionamento dei modelli e delle repliche nel cluster"
+  },
   "p2p": {
     "title": "Calcolo AI distribuito",
     "subtitle": "Scala i tuoi carichi di lavoro AI su più dispositivi con la distribuzione peer-to-peer"
diff --git a/core/http/react-ui/public/locales/it/nav.json b/core/http/react-ui/public/locales/it/nav.json
index 492f4b8db..c54171f39 100644
--- a/core/http/react-ui/public/locales/it/nav.json
+++ b/core/http/react-ui/public/locales/it/nav.json
@@ -50,6 +50,7 @@
     "backends": "Backend",
     "traces": "Tracce",
     "nodes": "Nodi",
+    "scheduling": "Pianificazione",
     "swarm": "Swarm",
     "system": "Sistema",
     "settings": "Impostazioni",
diff --git a/core/http/react-ui/public/locales/ko/admin.json b/core/http/react-ui/public/locales/ko/admin.json
index 726eaed65..1b6676571 100644
--- a/core/http/react-ui/public/locales/ko/admin.json
+++ b/core/http/react-ui/public/locales/ko/admin.json
@@ -43,6 +43,10 @@
     "title": "분산 노드",
     "subtitle": "백엔드 및 에이전트 워커 노드를 관리합니다"
   },
+  "scheduling": {
+    "title": "스케줄링",
+    "subtitle": "클러스터 전반의 모델 배치 및 복제본 규칙"
+  },
   "p2p": {
     "title": "분산 AI 컴퓨팅",
     "subtitle": "피어 투 피어 분산으로 여러 기기에 걸쳐 AI 워크로드를 확장합니다"
diff --git a/core/http/react-ui/public/locales/ko/nav.json b/core/http/react-ui/public/locales/ko/nav.json
index 98902880d..dbd2016cc 100644
--- a/core/http/react-ui/public/locales/ko/nav.json
+++ b/core/http/react-ui/public/locales/ko/nav.json
@@ -51,6 +51,7 @@
     "backends": "백엔드",
     "traces": "트레이스",
     "nodes": "노드",
+    "scheduling": "스케줄링",
     "swarm": "Swarm",
     "system": "시스템",
     "settings": "설정",
diff --git a/core/http/react-ui/public/locales/zh-CN/admin.json b/core/http/react-ui/public/locales/zh-CN/admin.json
index d55487e69..c5d9db452 100644
--- a/core/http/react-ui/public/locales/zh-CN/admin.json
+++ b/core/http/react-ui/public/locales/zh-CN/admin.json
@@ -43,6 +43,10 @@
     "title": "分布式节点",
     "subtitle": "管理后端和智能体工作节点"
   },
+  "scheduling": {
+    "title": "调度",
+    "subtitle": "集群中的模型放置和副本规则"
+  },
   "p2p": {
     "title": "分布式 AI 计算",
     "subtitle": "通过点对点分发将您的 AI 工作负载扩展到多个设备"
diff --git a/core/http/react-ui/public/locales/zh-CN/nav.json b/core/http/react-ui/public/locales/zh-CN/nav.json
index 58805eec1..730791ddd 100644
--- a/core/http/react-ui/public/locales/zh-CN/nav.json
+++ b/core/http/react-ui/public/locales/zh-CN/nav.json
@@ -50,6 +50,7 @@
     "backends": "后端",
     "traces": "追踪",
     "nodes": "节点",
+    "scheduling": "调度",
     "swarm": "Swarm",
     "system": "系统",
     "settings": "设置",
diff --git a/core/http/react-ui/src/App.css b/core/http/react-ui/src/App.css
index 81d225080..cf1a46bd3 100644
--- a/core/http/react-ui/src/App.css
+++ b/core/http/react-ui/src/App.css
@@ -8471,3 +8471,56 @@ select.input {
 .status-pill--error   .status-pill__dot { background: var(--color-error); }
 .status-pill--info    .status-pill__dot { background: var(--color-info); }
 .status-pill--muted   .status-pill__dot { background: var(--color-text-muted); }
+
+/* Nodes: cluster pulse + attention callout (replaces the stat-card strip) */
+.cluster-pulse {
+  font-size: var(--text-sm);
+  color: var(--color-text-muted);
+  margin: 0 0 var(--spacing-lg);
+}
+.cluster-pulse__strong { color: var(--color-text-primary); font-weight: 600; }
+
+.attention-callout {
+  display: flex;
+  align-items: center;
+  justify-content: space-between;
+  gap: var(--spacing-md);
+  padding: var(--spacing-sm) var(--spacing-md);
+  border-radius: var(--radius-md);
+  margin-bottom: var(--spacing-lg);
+  font-size: var(--text-sm);
+}
+.attention-callout--warn {
+  background: var(--color-warning-light);
+  border: 1px solid var(--color-warning-border);
+  color: var(--color-text-primary);
+}
+.attention-callout--error {
+  background: var(--color-error-light);
+  border: 1px solid var(--color-error-border);
+  color: var(--color-text-primary);
+}
+
+/* Node roster panels (Nodes page) */
+.node-roster { display: flex; flex-direction: column; gap: var(--spacing-sm); }
+.node-panel {
+  background: var(--color-bg-secondary);
+  border: 1px solid var(--color-border-subtle);
+  border-radius: var(--radius-lg);
+}
+.node-panel__main { padding: var(--spacing-md) var(--spacing-lg); cursor: pointer; }
+.node-panel:hover { border-color: var(--color-border); }
+.node-panel__head { display: flex; align-items: flex-start; justify-content: space-between; gap: var(--spacing-md); }
+.node-panel__id { display: flex; align-items: center; gap: var(--spacing-sm); flex-wrap: wrap; }
+.node-panel__name { font-weight: 600; }
+.node-panel__meta { display: flex; gap: var(--spacing-lg); margin-top: var(--spacing-sm); color: var(--color-text-muted); font-size: var(--text-xs); }
+.node-panel__models { display: flex; flex-wrap: wrap; gap: 6px; margin-top: var(--spacing-sm); }
+.model-chip {
+  display: inline-flex; align-items: center; gap: 5px;
+  font-family: var(--font-mono); font-size: 0.6875rem;
+  padding: 2px 8px; border-radius: var(--radius-sm); border: 1px solid;
+}
+.model-chip__dot { width: 6px; height: 6px; border-radius: 50%; }
+.model-chip__state { opacity: 0.85; font-style: normal; }
+.node-filter { margin-bottom: var(--spacing-lg); }
+.node-detail__metrics { display: flex; gap: var(--spacing-xl); margin: var(--spacing-md) 0 var(--spacing-lg); flex-wrap: wrap; }
diff --git a/core/http/react-ui/src/components/console/consoleConfig.js b/core/http/react-ui/src/components/console/consoleConfig.js
index e71f1b959..d207ee7f4 100644
--- a/core/http/react-ui/src/components/console/consoleConfig.js
+++ b/core/http/react-ui/src/components/console/consoleConfig.js
@@ -59,6 +59,7 @@ export const operateConsole = {
       titleKey: 'operate.cluster',
       items: [
         { path: '/app/nodes', icon: 'fas fa-network-wired', labelKey: 'items.nodes', adminOnly: true, feature: 'distributed' },
+        { path: '/app/scheduling', icon: 'fas fa-calendar-alt', labelKey: 'items.scheduling', adminOnly: true, feature: 'distributed' },
         { path: '/app/p2p', icon: 'fas fa-circle-nodes', labelKey: 'items.swarm', adminOnly: true },
       ],
     },
diff --git a/core/http/react-ui/src/components/nodes/AttentionCallout.jsx b/core/http/react-ui/src/components/nodes/AttentionCallout.jsx
new file mode 100644
index 000000000..b3e22169a
--- /dev/null
+++ b/core/http/react-ui/src/components/nodes/AttentionCallout.jsx
@@ -0,0 +1,31 @@
+export default function AttentionCallout({ nodes, onApprove }) {
+  const pending = nodes.filter(n => n.status === 'pending')
+  const unhealthy = nodes.filter(n => n.status === 'unhealthy' || n.status === 'offline')
+  if (pending.length === 0 && unhealthy.length === 0) return null
+
+  if (pending.length > 0) {
+    const first = pending[0]
+    const extra = pending.length - 1
+    return (
+      <div className="attention-callout attention-callout--warn">
+        <span>
+          <i className="fas fa-exclamation-circle" />{' '}
+          <strong>{pending.length} node{pending.length > 1 ? 's' : ''} awaiting approval</strong>
+          {' - '}{first.name}{extra > 0 ? ` +${extra} more` : ''}
+        </span>
+        <button className="btn btn-primary btn-sm" onClick={() => onApprove(first.id)}>
+          <i className="fas fa-check" /> Approve {first.name}
+        </button>
+      </div>
+    )
+  }
+  return (
+    <div className="attention-callout attention-callout--error">
+      <span>
+        <i className="fas fa-exclamation-triangle" />{' '}
+        <strong>{unhealthy.length} node{unhealthy.length > 1 ? 's' : ''} unhealthy</strong>
+        {' - '}{unhealthy.map(n => n.name).slice(0, 3).join(', ')}
+      </span>
+    </div>
+  )
+}
diff --git a/core/http/react-ui/src/components/nodes/CapacityEditor.jsx b/core/http/react-ui/src/components/nodes/CapacityEditor.jsx
new file mode 100644
index 000000000..9bdfe356a
--- /dev/null
+++ b/core/http/react-ui/src/components/nodes/CapacityEditor.jsx
@@ -0,0 +1,196 @@
+import { useState, useEffect, useCallback } from 'react'
+import { nodesApi } from '../../utils/api'
+import LoadingSpinner from '../LoadingSpinner'
+
+/**
+ * Inline editor for a node's per-model replica capacity.
+ *
+ * UX intent: discoverable affordance (pencil icon) that opens an inline
+ * input - never a modal for a single field. Source-of-truth note is shown
+ * inline so operators understand a worker re-registration will overwrite
+ * their override; surfacing this in a tooltip would hide too important a
+ * caveat.
+ *
+ * `confirmShrink` is a hook the parent provides so the page can render its
+ * own confirm dialog (it has access to all nodes and can phrase the message
+ * with full context).
+ */
+export default function CapacityEditor({ node, loadedModelCounts, onUpdate, confirmShrink, addToast }) {
+  const current = node.max_replicas_per_model || 1
+  const isOverride = !!node.max_replicas_per_model_manually_set
+  const [editing, setEditing] = useState(false)
+  const [draft, setDraft] = useState(String(current))
+  const [saving, setSaving] = useState(false)
+  const [resetting, setResetting] = useState(false)
+
+  // Reset draft when current value changes (server response, etc.)
+  useEffect(() => {
+    if (!editing) setDraft(String(current))
+  }, [current, editing])
+
+  const cancel = useCallback(() => {
+    setEditing(false)
+    setDraft(String(current))
+  }, [current])
+
+  const save = useCallback(async () => {
+    const value = parseInt(draft, 10)
+    if (!Number.isFinite(value) || value < 1) {
+      addToast('Replica capacity must be 1 or higher', 'error')
+      return
+    }
+    if (value === current) {
+      setEditing(false)
+      return
+    }
+    // Reducing the cap below current loaded replicas: confirm so the operator
+    // sees the consequence (running replicas keep going until idle eviction).
+    const maxLoadedAcrossModels = Math.max(0, ...Object.values(loadedModelCounts || {}))
+    if (value < maxLoadedAcrossModels) {
+      const proceed = await confirmShrink({ node, newValue: value, currentLoaded: maxLoadedAcrossModels })
+      if (!proceed) return
+    }
+    setSaving(true)
+    try {
+      await nodesApi.updateMaxReplicasPerModel(node.id, value)
+      addToast(`Replica capacity set to ${value} on ${node.name}`, 'success')
+      setEditing(false)
+      onUpdate?.(value)
+    } catch (err) {
+      addToast(`Could not change replica capacity: ${err.message || err}`, 'error')
+    } finally {
+      setSaving(false)
+    }
+  }, [draft, current, node, loadedModelCounts, confirmShrink, onUpdate, addToast])
+
+  const onKeyDown = (e) => {
+    if (e.key === 'Enter') { e.preventDefault(); save() }
+    else if (e.key === 'Escape') { e.preventDefault(); cancel() }
+  }
+
+  const reset = useCallback(async () => {
+    setResetting(true)
+    try {
+      await nodesApi.resetMaxReplicasPerModel(node.id)
+      addToast(`Override cleared on ${node.name}; worker flag will apply on next re-registration`, 'success')
+      onUpdate?.(null)
+    } catch (err) {
+      addToast(`Could not reset override: ${err.message || err}`, 'error')
+    } finally {
+      setResetting(false)
+    }
+  }, [node, onUpdate, addToast])
+
+  return (
+    <div style={{
+      display: 'flex', alignItems: 'flex-start', gap: 'var(--spacing-md)',
+    }}>
+      <i className="fas fa-layer-group" style={{ color: 'var(--color-text-muted)', marginTop: 3 }} aria-hidden="true" />
+      <div style={{ flex: 1, minWidth: 0 }}>
+        <div style={{ display: 'flex', alignItems: 'center', gap: 'var(--spacing-sm)', flexWrap: 'wrap' }}>
+          <label
+            htmlFor={`capacity-${node.id}`}
+            style={{ fontSize: '0.8125rem', fontWeight: 600, color: 'var(--color-text-primary)' }}
+          >
+            Max replicas per model
+          </label>
+          {editing ? (
+            <>
+              <input
+                id={`capacity-${node.id}`}
+                type="number"
+                min={1}
+                value={draft}
+                disabled={saving}
+                onChange={(e) => setDraft(e.target.value)}
+                onKeyDown={onKeyDown}
+                autoFocus
+                aria-describedby={`capacity-hint-${node.id}`}
+                style={{
+                  width: 72, padding: '4px 8px', borderRadius: 'var(--radius-sm)',
+                  border: '1px solid var(--color-border)', background: 'var(--color-bg-primary)',
+                  fontFamily: 'var(--font-mono)', fontSize: '0.8125rem',
+                  color: 'var(--color-text-primary)',
+                }}
+              />
+              <button
+                className="btn btn-primary btn-sm"
+                onClick={save}
+                disabled={saving}
+                style={{ minHeight: 32 }}
+                aria-label="Save replica capacity"
+              >
+                {saving ? <LoadingSpinner size="xs" /> : <><i className="fas fa-check" /> Save</>}
+              </button>
+              <button
+                className="btn btn-secondary btn-sm"
+                onClick={cancel}
+                disabled={saving}
+                style={{ minHeight: 32 }}
+                aria-label="Cancel"
+              >
+                Cancel
+              </button>
+            </>
+          ) : (
+            <>
+              <span
+                className="cell-mono"
+                style={{ fontSize: '0.8125rem', color: 'var(--color-text-secondary)' }}
+              >
+                {current}
+              </span>
+              {isOverride && (
+                <span
+                  title="This value was set from the UI. It will persist across worker restarts until you click Reset."
+                  style={{
+                    display: 'inline-block', fontSize: '0.6875rem', padding: '1px 6px',
+                    borderRadius: 'var(--radius-sm)', fontWeight: 500,
+                    background: 'var(--color-bg-primary)',
+                    border: '1px solid var(--color-warning, #d97706)',
+                    color: 'var(--color-warning, #d97706)',
+                  }}
+                >
+                  override
+                </span>
+              )}
+              <button
+                onClick={() => setEditing(true)}
+                aria-label={`Edit replica capacity (currently ${current})`}
+                title="Change replica capacity for this node"
+                style={{
+                  display: 'inline-flex', alignItems: 'center', justifyContent: 'center',
+                  minWidth: 32, minHeight: 32, padding: 4, borderRadius: 'var(--radius-sm)',
+                  border: '1px solid var(--color-border-subtle)',
+                  background: 'transparent', color: 'var(--color-text-muted)', cursor: 'pointer',
+                }}
+              >
+                <i className="fas fa-pencil-alt" />
+              </button>
+              {isOverride && (
+                <button
+                  onClick={reset}
+                  disabled={resetting}
+                  aria-label="Clear admin override and let the worker flag apply"
+                  title="Clear override; the worker's --max-replicas-per-model flag will apply on the next re-registration"
+                  className="btn btn-secondary btn-sm"
+                  style={{ minHeight: 32 }}
+                >
+                  {resetting ? <LoadingSpinner size="xs" /> : <><i className="fas fa-undo" /> Reset</>}
+                </button>
+              )}
+            </>
+          )}
+        </div>
+        <div
+          id={`capacity-hint-${node.id}`}
+          style={{ fontSize: '0.75rem', color: 'var(--color-text-muted)', marginTop: 4, lineHeight: 1.4 }}
+        >
+          {isOverride
+            ? <>Set from here. <strong>Reset</strong> to use the worker's default.</>
+            : <>Saved values stick across worker restarts.</>}
+        </div>
+      </div>
+    </div>
+  )
+}
diff --git a/core/http/react-ui/src/components/nodes/ClusterPulse.jsx b/core/http/react-ui/src/components/nodes/ClusterPulse.jsx
new file mode 100644
index 000000000..c465f3b72
--- /dev/null
+++ b/core/http/react-ui/src/components/nodes/ClusterPulse.jsx
@@ -0,0 +1,18 @@
+import { formatVRAM } from './nodeStatus'
+
+export default function ClusterPulse({ nodes }) {
+  const total = nodes.length
+  const healthy = nodes.filter(n => n.status === 'healthy').length
+  const draining = nodes.filter(n => n.status === 'draining').length
+  const usedVRAM = nodes.reduce((s, n) =>
+    (n.total_vram && n.available_vram != null) ? s + (n.total_vram - n.available_vram) : s, 0)
+  const vramStr = formatVRAM(usedVRAM)
+  return (
+    <p className="cluster-pulse">
+      <span className="cluster-pulse__strong">{total} {total === 1 ? 'node' : 'nodes'}</span>
+      {' · '}<span style={{ color: 'var(--color-success)' }}>{healthy} healthy</span>
+      {draining > 0 && <>{' · '}<span style={{ color: 'var(--color-warning)' }}>{draining} draining</span></>}
+      {vramStr && <>{' · '}{vramStr} VRAM in use</>}
+    </p>
+  )
+}
diff --git a/core/http/react-ui/src/components/nodes/KeyValueChips.jsx b/core/http/react-ui/src/components/nodes/KeyValueChips.jsx
new file mode 100644
index 000000000..f4b53a7c7
--- /dev/null
+++ b/core/http/react-ui/src/components/nodes/KeyValueChips.jsx
@@ -0,0 +1,98 @@
+import { useState } from 'react'
+
+/**
+ * Controlled chip-builder for { key: value } maps. Replaces the prior
+ * comma-separated-string Node Selector input AND the bespoke Labels editor
+ * in the node drawer - both were rendering the same chip pattern with
+ * subtly different markup.
+ *
+ * Fully controlled: parent owns the map and decides what onAdd/onRemove
+ * does (form state for the scheduling form; API calls for the live
+ * labels editor). The component just renders chips and a key/value input
+ * row.
+ *
+ * Props:
+ *   pairs       - current map of key -> value
+ *   onAdd(k,v)  - called when the user adds a pair (parent handles dedup
+ *                 and persistence side effects)
+ *   onRemove(k) - called when a chip's × is clicked
+ *   placeholderKey, placeholderValue - input hints
+ *   ariaLabel   - accessible name for the section
+ */
+export default function KeyValueChips({ pairs, onAdd, onRemove, placeholderKey = 'key', placeholderValue = 'value', ariaLabel }) {
+  const [k, setK] = useState('')
+  const [v, setV] = useState('')
+
+  const add = () => {
+    const key = k.trim()
+    if (!key) return
+    onAdd(key, v.trim())
+    setK(''); setV('')
+  }
+  const onKeyDown = (e) => {
+    if (e.key === 'Enter') { e.preventDefault(); add() }
+  }
+
+  const entries = pairs ? Object.entries(pairs) : []
+  return (
+    <div aria-label={ariaLabel}>
+      {entries.length > 0 && (
+        <div style={{ display: 'flex', flexWrap: 'wrap', gap: 4, marginBottom: 'var(--spacing-xs)' }}>
+          {entries.map(([key, val]) => (
+            <span key={key} style={{
+              display: 'inline-flex', alignItems: 'center', gap: 4,
+              fontSize: '0.75rem', padding: '2px 8px',
+              borderRadius: 'var(--radius-sm)',
+              background: 'var(--color-bg-tertiary)',
+              border: '1px solid var(--color-border-subtle)',
+              fontFamily: 'var(--font-mono)',
+            }}>
+              {key}={val}
+              <button
+                type="button"
+                onClick={(e) => { e.stopPropagation(); onRemove(key) }}
+                aria-label={`Remove ${key}`}
+                title="Remove"
+                style={{
+                  background: 'none', border: 'none', cursor: 'pointer',
+                  color: 'var(--color-text-muted)', fontSize: '0.625rem', padding: 0,
+                }}
+              >
+                <i className="fas fa-times" />
+              </button>
+            </span>
+          ))}
+        </div>
+      )}
+      <div style={{ display: 'flex', gap: 'var(--spacing-xs)', alignItems: 'stretch' }}>
+        <input
+          className="input"
+          type="text"
+          placeholder={placeholderKey}
+          value={k}
+          onChange={e => setK(e.target.value)}
+          onKeyDown={onKeyDown}
+          style={{ flex: 1 }}
+        />
+        <input
+          className="input"
+          type="text"
+          placeholder={placeholderValue}
+          value={v}
+          onChange={e => setV(e.target.value)}
+          onKeyDown={onKeyDown}
+          style={{ flex: 1 }}
+        />
+        <button
+          type="button"
+          className="btn btn-secondary btn-sm"
+          onClick={add}
+          disabled={!k.trim()}
+          style={{ minHeight: 36 }}
+        >
+          <i className="fas fa-plus" /> Add
+        </button>
+      </div>
+    </div>
+  )
+}
diff --git a/core/http/react-ui/src/components/nodes/ModelChip.jsx b/core/http/react-ui/src/components/nodes/ModelChip.jsx
new file mode 100644
index 000000000..04d735f06
--- /dev/null
+++ b/core/http/react-ui/src/components/nodes/ModelChip.jsx
@@ -0,0 +1,12 @@
+import { modelStateConfig } from './nodeStatus'
+
+export default function ModelChip({ model }) {
+  const cfg = modelStateConfig[model.state] || modelStateConfig.idle
+  return (
+    <span className="model-chip" style={{ background: cfg.bg, color: cfg.color, borderColor: cfg.border }}>
+      <span className="model-chip__dot" style={{ background: cfg.color }} />
+      {model.model_name}
+      {model.state !== 'loaded' && <span className="model-chip__state"> {model.state}</span>}
+    </span>
+  )
+}
diff --git a/core/http/react-ui/src/components/nodes/NodePanel.jsx b/core/http/react-ui/src/components/nodes/NodePanel.jsx
new file mode 100644
index 000000000..0a2b76578
--- /dev/null
+++ b/core/http/react-ui/src/components/nodes/NodePanel.jsx
@@ -0,0 +1,60 @@
+import { useNavigate } from 'react-router-dom'
+import StatusPill from './StatusPill'
+import ModelChip from './ModelChip'
+import ActionMenu from '../ActionMenu'
+import { formatVRAM } from './nodeStatus'
+
+export default function NodePanel({ node, models = [], onApprove, onDrain, onResume, onRemove }) {
+  const navigate = useNavigate()
+  const isAgent = node.node_type === 'agent'
+  const open = () => navigate(`/app/nodes/${node.id}`)
+  const usedVRAM = node.total_vram && node.available_vram != null ? node.total_vram - node.available_vram : null
+
+  return (
+    <div className="node-panel">
+      <div className="node-panel__main" onClick={open} role="button" tabIndex={0}
+        onKeyDown={(e) => { if (e.key === 'Enter') open() }}>
+        <div className="node-panel__head">
+          <div className="node-panel__id">
+            <StatusPill status={node.status} />
+            <span className="node-panel__name">{node.name}</span>
+            <span className="cell-mono cell-muted">{node.address}</span>
+          </div>
+          <div className="node-panel__actions" onClick={(e) => e.stopPropagation()}>
+            {node.status === 'pending' && (
+              <button className="btn btn-primary btn-sm" onClick={() => onApprove(node.id)}>
+                <i className="fas fa-check" /> Approve
+              </button>
+            )}
+            <ActionMenu
+              ariaLabel={`Actions for ${node.name}`}
+              triggerLabel={`Actions for ${node.name}`}
+              items={[
+                { key: 'resume', icon: 'fa-play', label: 'Resume', hidden: node.status !== 'draining', onClick: () => onResume(node.id) },
+                { key: 'drain', icon: 'fa-pause', label: 'Drain', hidden: node.status === 'draining' || node.status === 'pending', onClick: () => onDrain(node.id) },
+                { divider: true, hidden: node.status === 'pending' },
+                { key: 'remove', icon: 'fa-trash', label: 'Remove from cluster', danger: true, onClick: () => onRemove(node) },
+              ]}
+            />
+          </div>
+        </div>
+
+        {!isAgent && (
+          <>
+            <div className="node-panel__meta">
+              {node.total_vram > 0 && (
+                <span className="cell-mono">VRAM {formatVRAM(usedVRAM) || '0'} / {formatVRAM(node.total_vram)}</span>
+              )}
+              <span className="cell-mono">{node.in_flight_count || 0} in-flight</span>
+            </div>
+            <div className="node-panel__models">
+              {models.length === 0
+                ? <span className="cell-muted">No models loaded</span>
+                : models.map(m => <ModelChip key={`${m.model_name}-${m.replica_index ?? 0}`} model={m} />)}
+            </div>
+          </>
+        )}
+      </div>
+    </div>
+  )
+}
diff --git a/core/http/react-ui/src/components/nodes/StatusPill.jsx b/core/http/react-ui/src/components/nodes/StatusPill.jsx
new file mode 100644
index 000000000..4e4c4c5b5
--- /dev/null
+++ b/core/http/react-ui/src/components/nodes/StatusPill.jsx
@@ -0,0 +1,11 @@
+import { statusConfig } from './nodeStatus'
+
+export default function StatusPill({ status }) {
+  const cfg = statusConfig[status] || statusConfig.unhealthy
+  return (
+    <span className="node-status" style={{ color: cfg.color }}>
+      <span className="node-status__dot" style={{ background: cfg.color }} />
+      {cfg.label}
+    </span>
+  )
+}
diff --git a/core/http/react-ui/src/components/nodes/nodeStatus.js b/core/http/react-ui/src/components/nodes/nodeStatus.js
new file mode 100644
index 000000000..799a7c13f
--- /dev/null
+++ b/core/http/react-ui/src/components/nodes/nodeStatus.js
@@ -0,0 +1,34 @@
+export const statusConfig = {
+  healthy: { color: 'var(--color-success)', label: 'Healthy' },
+  unhealthy: { color: 'var(--color-error)', label: 'Unhealthy' },
+  offline: { color: 'var(--color-error)', label: 'Offline' },
+  registering: { color: 'var(--color-primary)', label: 'Registering' },
+  draining: { color: 'var(--color-warning)', label: 'Draining' },
+  pending: { color: 'var(--color-warning)', label: 'Pending Approval' },
+}
+
+export const modelStateConfig = {
+  loaded: { bg: 'var(--color-success-light)', color: 'var(--color-success)', border: 'var(--color-success-border)' },
+  loading: { bg: 'var(--color-primary-light)', color: 'var(--color-primary)', border: 'var(--color-primary-border)' },
+  unloading: { bg: 'var(--color-warning-light)', color: 'var(--color-warning)', border: 'var(--color-warning-border)' },
+  idle: { bg: 'var(--color-bg-tertiary)', color: 'var(--color-text-muted)', border: 'var(--color-border-subtle)' },
+}
+
+export function formatVRAM(bytes) {
+  if (!bytes || bytes === 0) return null
+  const gb = bytes / (1024 * 1024 * 1024)
+  return gb >= 1 ? `${gb.toFixed(1)} GB` : `${(bytes / (1024 * 1024)).toFixed(0)} MB`
+}
+
+export function timeAgo(dateString) {
+  if (!dateString) return 'never'
+  const seconds = Math.floor((Date.now() - new Date(dateString).getTime()) / 1000)
+  if (seconds < 0) return 'just now'
+  if (seconds < 60) return `${seconds}s ago`
+  const minutes = Math.floor(seconds / 60)
+  if (minutes < 60) return `${minutes}m ago`
+  const hours = Math.floor(minutes / 60)
+  if (hours < 24) return `${hours}h ago`
+  const days = Math.floor(hours / 24)
+  return `${days}d ago`
+}
diff --git a/core/http/react-ui/src/pages/NodeDetail.jsx b/core/http/react-ui/src/pages/NodeDetail.jsx
new file mode 100644
index 000000000..f61ba0576
--- /dev/null
+++ b/core/http/react-ui/src/pages/NodeDetail.jsx
@@ -0,0 +1,352 @@
+import { useState, useEffect, useCallback } from 'react'
+import { useParams, useNavigate, useOutletContext } from 'react-router-dom'
+import { nodesApi } from '../utils/api'
+import PageHeader from '../components/PageHeader'
+import LoadingSpinner from '../components/LoadingSpinner'
+import ConfirmDialog from '../components/ConfirmDialog'
+import StatusPill from '../components/nodes/StatusPill'
+import CapacityEditor from '../components/nodes/CapacityEditor'
+import KeyValueChips from '../components/nodes/KeyValueChips'
+import { formatVRAM, modelStateConfig, timeAgo } from '../components/nodes/nodeStatus'
+
+// Deep-linkable node management home. Reached by clicking a roster panel on
+// /app/nodes. Surfaces what's running here plus the management affordances
+// (capacity, backends, labels, drain/resume/remove) that previously lived in
+// the expanded-row "Manage" drawer.
+export default function NodeDetail() {
+  const { id } = useParams()
+  const navigate = useNavigate()
+  const { addToast } = useOutletContext()
+  const [node, setNode] = useState(null)
+  const [models, setModels] = useState([])
+  const [backends, setBackends] = useState([])
+  const [loading, setLoading] = useState(true)
+  const [confirmRemove, setConfirmRemove] = useState(false)
+  const [confirmUnload, setConfirmUnload] = useState(null)
+  const [confirmDeleteBackend, setConfirmDeleteBackend] = useState(null)
+  // Promise-based shrink confirmation: CapacityEditor awaits this hook so the
+  // page owns the dialog (it can phrase the message with full node context).
+  const [confirmShrinkState, setConfirmShrinkState] = useState(null)
+
+  const refresh = useCallback(async () => {
+    try {
+      const n = await nodesApi.get(id)
+      setNode(n)
+      const [m, b] = await Promise.all([nodesApi.getModels(id), nodesApi.getBackends(id)])
+      setModels(Array.isArray(m) ? m : [])
+      setBackends(Array.isArray(b) ? b : [])
+    } catch (err) {
+      addToast(`Failed to load node: ${err.message}`, 'error')
+    } finally {
+      setLoading(false)
+    }
+  }, [id, addToast])
+
+  useEffect(() => { refresh() }, [refresh])
+
+  const confirmShrink = useCallback((ctx) => new Promise((resolve) => {
+    setConfirmShrinkState({ ...ctx, resolve })
+  }), [])
+
+  if (loading) return <div className="page page--wide" style={{ display: 'flex', justifyContent: 'center', padding: 'var(--spacing-xl)' }}><LoadingSpinner size="lg" /></div>
+  if (!node) return <div className="page page--wide"><PageHeader title="Node not found" /></div>
+
+  const drain = async () => { try { await nodesApi.drain(id); addToast('Node set to draining', 'success'); refresh() } catch (e) { addToast(e.message, 'error') } }
+  const resume = async () => { try { await nodesApi.resume(id); addToast('Node resumed', 'success'); refresh() } catch (e) { addToast(e.message, 'error') } }
+  const remove = async () => { try { await nodesApi.delete(id); addToast('Node removed', 'success'); navigate('/app/nodes') } catch (e) { addToast(e.message, 'error') } }
+  const unload = async (name) => { try { await nodesApi.unloadModel(id, name); addToast(`Model "${name}" unloaded`, 'success'); refresh() } catch (e) { addToast(e.message, 'error') } }
+  const upgradeBackend = async (name) => { try { await nodesApi.installBackend(id, name); addToast(`Backend "${name}" upgraded`, 'success'); refresh() } catch (e) { addToast(e.message, 'error') } }
+  const deleteBackend = async (name) => { try { await nodesApi.deleteBackend(id, name); addToast(`Backend "${name}" deleted`, 'success'); refresh() } catch (e) { addToast(e.message, 'error') } }
+  const addLabel = async (k, v) => { try { await nodesApi.mergeLabels(id, { [k]: v }); refresh() } catch (e) { addToast(e.message, 'error') } }
+  const delLabel = async (k) => { try { await nodesApi.deleteLabel(id, k); refresh() } catch (e) { addToast(e.message, 'error') } }
+
+  const usedVRAM = node.total_vram && node.available_vram != null ? node.total_vram - node.available_vram : 0
+  // {modelName: replicaCount} of loaded models so the shrink confirm can warn
+  // if the new cap is below the actual count of any single model on this node.
+  const loadedModelCounts = (() => {
+    const counts = {}
+    models.forEach(m => { if (m.state === 'loaded') counts[m.model_name] = (counts[m.model_name] || 0) + 1 })
+    return counts
+  })()
+
+  return (
+    <div className="page page--wide">
+      <PageHeader
+        eyebrow={<a onClick={() => navigate('/app/nodes')} style={{ cursor: 'pointer', color: 'var(--color-primary)' }}><i className="fas fa-arrow-left" style={{ marginRight: 6 }} aria-hidden="true" />Cluster</a>}
+        title={<><StatusPill status={node.status} /> {node.name}</>}
+        supporting={node.address}
+        actions={
+          <>
+            {node.status === 'draining'
+              ? <button className="btn btn-secondary btn-sm" onClick={resume}><i className="fas fa-play" /> Resume</button>
+              : <button className="btn btn-secondary btn-sm" onClick={drain}><i className="fas fa-pause" /> Drain</button>}
+            <button className="btn btn-danger btn-sm" onClick={() => setConfirmRemove(true)}><i className="fas fa-trash" /> Remove</button>
+          </>
+        }
+      />
+
+      {/* Inline metrics row: VRAM / in-flight - no boxes, just labelled values. */}
+      <div className="node-detail__metrics">
+        {node.total_vram > 0 && (
+          <div>
+            <div className="drawer-eyebrow">VRAM</div>
+            <span className="cell-mono">{formatVRAM(usedVRAM) || '0'} / {formatVRAM(node.total_vram)}</span>
+          </div>
+        )}
+        <div>
+          <div className="drawer-eyebrow">In-flight</div>
+          <span className="cell-mono">{node.in_flight_count || 0}</span>
+        </div>
+        {node.node_type !== 'agent' && (
+          <div style={{ minWidth: 0 }}>
+            <div className="drawer-eyebrow">Capacity</div>
+            <CapacityEditor
+              node={node}
+              loadedModelCounts={loadedModelCounts}
+              confirmShrink={confirmShrink}
+              addToast={addToast}
+              onUpdate={() => refresh()}
+            />
+          </div>
+        )}
+      </div>
+
+      {/* Running models */}
+      <div style={{ marginTop: 'var(--spacing-lg)' }}>
+        <div className="drawer-eyebrow">Running models</div>
+        {models.length === 0 ? (
+          <p style={{ fontSize: '0.8125rem', color: 'var(--color-text-muted)', margin: '0 0 var(--spacing-md) 0' }}>
+            <i className="fas fa-cube" style={{ marginRight: 6, opacity: 0.6 }} aria-hidden="true" />
+            No models loaded yet - they'll appear here when scheduled to this node.
+          </p>
+        ) : (
+          <table className="table" style={{ margin: 0 }}>
+            <thead>
+              <tr>
+                <th>Model</th>
+                <th>State</th>
+                <th>In-Flight</th>
+                <th style={{ width: 40 }}>Logs</th>
+                <th style={{ textAlign: 'right' }}>Actions</th>
+              </tr>
+            </thead>
+            <tbody>
+              {(() => {
+                // Pre-compute per-model replica counts so the disambiguation
+                // pill only renders when this node actually hosts >1 replica
+                // of the same model. Single-replica deployments stay clean.
+                const replicaCounts = {}
+                models.forEach(m => { replicaCounts[m.model_name] = (replicaCounts[m.model_name] || 0) + 1 })
+                return models.map(m => {
+                  const stCfg = modelStateConfig[m.state] || modelStateConfig.idle
+                  const showReplica = (replicaCounts[m.model_name] || 0) > 1
+                  // Per-replica process key - what the worker stores logs under and what the
+                  // store's GetLines/Subscribe match on for replica-scoped filtering.
+                  const processKey = `${m.model_name}#${m.replica_index ?? 0}`
+                  return (
+                    <tr key={m.id || `${m.model_name}#${m.replica_index ?? 0}`}>
+                      <td style={{ fontFamily: 'var(--font-mono)', fontSize: '0.8125rem' }}>
+                        {m.model_name}
+                        {showReplica && (
+                          <span
+                            className="cell-mono"
+                            aria-label={`replica ${m.replica_index ?? 0}`}
+                            title={`Replica ${m.replica_index ?? 0} on this node`}
+                            style={{
+                              marginLeft: 8, padding: '1px 6px', borderRadius: 'var(--radius-sm)',
+                              background: 'var(--color-bg-tertiary)',
+                              border: '1px solid var(--color-border-subtle)',
+                              fontSize: '0.6875rem', fontWeight: 500,
+                              color: 'var(--color-text-secondary)',
+                            }}
+                          >
+                            rep {m.replica_index ?? 0}
+                          </span>
+                        )}
+                      </td>
+                      <td>
+                        <span style={{
+                          display: 'inline-block', padding: '2px 8px', borderRadius: 'var(--radius-sm)',
+                          fontSize: '0.75rem', fontWeight: 500,
+                          background: stCfg.bg, color: stCfg.color, border: `1px solid ${stCfg.border}`,
+                        }}>
+                          {m.state}
+                        </span>
+                      </td>
+                      <td style={{ fontFamily: 'var(--font-mono)', fontSize: '0.8125rem' }}>
+                        {m.in_flight ?? 0}
+                      </td>
+                      <td>
+                        <a
+                          href="#"
+                          onClick={(e) => {
+                            e.preventDefault()
+                            // Send the replica-scoped process key (modelName#replicaIndex).
+                            navigate(`/app/node-backend-logs/${id}/${encodeURIComponent(processKey)}`)
+                          }}
+                          style={{ fontSize: '0.75rem', color: 'var(--color-primary)' }}
+                          title={showReplica ? `View backend logs for replica ${m.replica_index ?? 0}` : 'View backend logs'}
+                        >
+                          <i className="fas fa-terminal" />
+                        </a>
+                      </td>
+                      <td style={{ textAlign: 'right' }}>
+                        <button
+                          className="btn btn-danger btn-sm"
+                          title={m.in_flight > 0 ? 'Unload model (has in-flight requests)' : 'Unload model'}
+                          onClick={() => setConfirmUnload({ modelName: m.model_name, inFlight: m.in_flight ?? 0 })}
+                        >
+                          <i className="fas fa-stop" />
+                        </button>
+                      </td>
+                    </tr>
+                  )
+                })
+              })()}
+            </tbody>
+          </table>
+        )}
+      </div>
+
+      {/* Installed backends */}
+      <div style={{ marginTop: 'var(--spacing-lg)' }}>
+        <div style={{
+          display: 'flex', alignItems: 'center', justifyContent: 'space-between',
+          marginBottom: 'var(--spacing-sm)',
+        }}>
+          <div className="drawer-eyebrow" style={{ margin: 0 }}>Installed backends</div>
+          <button
+            type="button"
+            className="btn btn-secondary btn-sm"
+            onClick={() => navigate(`/app/backends?target=${encodeURIComponent(id)}`)}
+            title={`Install a backend on ${node.name}`}
+          >
+            <i className="fas fa-plus" /> Add backend
+          </button>
+        </div>
+        {backends.length === 0 ? (
+          <p style={{ fontSize: '0.8125rem', color: 'var(--color-text-muted)', margin: 0 }}>
+            None installed. <a href="#" style={{ color: 'var(--color-primary)' }} onClick={(e) => { e.preventDefault(); navigate(`/app/backends?target=${encodeURIComponent(id)}`) }}>Install one from the gallery</a> to schedule models here.
+          </p>
+        ) : (
+          <table className="table" style={{ margin: 0 }}>
+            <thead>
+              <tr>
+                <th>Name</th>
+                <th>Type</th>
+                <th>Installed At</th>
+                <th style={{ textAlign: 'right' }}>Actions</th>
+              </tr>
+            </thead>
+            <tbody>
+              {backends.map(b => (
+                <tr key={b.name}>
+                  <td style={{ fontFamily: 'var(--font-mono)', fontSize: '0.8125rem' }}>
+                    {b.name}
+                  </td>
+                  <td>
+                    <span style={{
+                      display: 'inline-block', padding: '2px 8px', borderRadius: 'var(--radius-sm)',
+                      fontSize: '0.75rem', fontWeight: 500,
+                      background: b.is_system ? 'var(--color-bg-tertiary)' : 'var(--color-primary-light)',
+                      color: b.is_system ? 'var(--color-text-muted)' : 'var(--color-primary)',
+                      border: `1px solid ${b.is_system ? 'var(--color-border-subtle)' : 'var(--color-primary-border)'}`,
+                    }}>
+                      {b.is_system ? 'system' : 'gallery'}
+                    </span>
+                  </td>
+                  <td style={{ fontSize: '0.8125rem', color: 'var(--color-text-muted)' }}>
+                    {b.installed_at ? timeAgo(b.installed_at) : '-'}
+                  </td>
+                  <td style={{ textAlign: 'right' }}>
+                    {!b.is_system && (
+                      <div style={{ display: 'inline-flex', gap: 'var(--spacing-xs)' }}>
+                        <button
+                          className="btn btn-secondary btn-sm"
+                          onClick={() => upgradeBackend(b.name)}
+                          title="Upgrade backend on this node"
+                        >
+                          <i className="fas fa-arrow-up" />
+                        </button>
+                        <button
+                          className="btn btn-danger-ghost btn-sm"
+                          onClick={() => setConfirmDeleteBackend({ backend: b.name })}
+                          title="Delete backend from this node"
+                        >
+                          <i className="fas fa-trash" />
+                        </button>
+                      </div>
+                    )}
+                  </td>
+                </tr>
+              ))}
+            </tbody>
+          </table>
+        )}
+      </div>
+
+      {/* Labels - node.replica-slots is filtered out so the Capacity editor
+          stays the single source of truth for that label. */}
+      <div style={{ marginTop: 'var(--spacing-lg)' }}>
+        <div className="drawer-eyebrow">Labels</div>
+        <KeyValueChips
+          pairs={Object.fromEntries(Object.entries(node.labels || {}).filter(([k]) => k !== 'node.replica-slots'))}
+          onAdd={addLabel}
+          onRemove={delLabel}
+          placeholderKey="key"
+          placeholderValue="value"
+          ariaLabel="Node labels"
+        />
+      </div>
+
+      <ConfirmDialog
+        open={confirmRemove}
+        title="Remove node"
+        message={`Remove "${node.name}" from the cluster? This will deregister it.`}
+        confirmLabel="Remove"
+        danger
+        onConfirm={() => { remove(); setConfirmRemove(false) }}
+        onCancel={() => setConfirmRemove(false)}
+      />
+
+      <ConfirmDialog
+        open={!!confirmUnload}
+        title="Unload Model"
+        message={
+          confirmUnload
+            ? confirmUnload.inFlight > 0
+              ? `"${confirmUnload.modelName}" currently has ${confirmUnload.inFlight} in-flight request(s). Unloading will interrupt them. Continue?`
+              : `Unload "${confirmUnload.modelName}" from ${node.name}?`
+            : ''
+        }
+        confirmLabel="Unload"
+        danger={confirmUnload?.inFlight > 0}
+        onConfirm={() => { if (confirmUnload) unload(confirmUnload.modelName); setConfirmUnload(null) }}
+        onCancel={() => setConfirmUnload(null)}
+      />
+
+      <ConfirmDialog
+        open={!!confirmDeleteBackend}
+        title="Delete Backend"
+        message={confirmDeleteBackend ? `Delete "${confirmDeleteBackend.backend}" from ${node.name}? This removes the backend files from this node only.` : ''}
+        confirmLabel="Delete"
+        danger
+        onConfirm={() => { if (confirmDeleteBackend) deleteBackend(confirmDeleteBackend.backend); setConfirmDeleteBackend(null) }}
+        onCancel={() => setConfirmDeleteBackend(null)}
+      />
+
+      <ConfirmDialog
+        open={!!confirmShrinkState}
+        title="Reduce replica capacity"
+        message={
+          confirmShrinkState
+            ? `${node.name} currently has ${confirmShrinkState.currentLoaded} replica(s) of at least one model loaded. Reducing the cap to ${confirmShrinkState.newValue} won't evict anything immediately - running replicas keep going, but the reconciler will trim down on the next idle window. Continue?`
+            : ''
+        }
+        confirmLabel="Reduce"
+        onConfirm={() => { confirmShrinkState?.resolve(true); setConfirmShrinkState(null) }}
+        onCancel={() => { confirmShrinkState?.resolve(false); setConfirmShrinkState(null) }}
+      />
+    </div>
+  )
+}
diff --git a/core/http/react-ui/src/pages/Nodes.jsx b/core/http/react-ui/src/pages/Nodes.jsx
index 5f6649467..c6eb9b294 100644
--- a/core/http/react-ui/src/pages/Nodes.jsx
+++ b/core/http/react-ui/src/pages/Nodes.jsx
@@ -1,253 +1,15 @@
-import { useState, useEffect, useCallback, Fragment } from 'react'
-import { useOutletContext, useNavigate } from 'react-router-dom'
+import { useState, useEffect, useCallback, useMemo } from 'react'
+import { useOutletContext } from 'react-router-dom'
 import { useTranslation } from 'react-i18next'
 import { nodesApi } from '../utils/api'
 import LoadingSpinner from '../components/LoadingSpinner'
 import PageHeader from '../components/PageHeader'
 import ConfirmDialog from '../components/ConfirmDialog'
-import ActionMenu from '../components/ActionMenu'
-import SearchableModelSelect from '../components/SearchableModelSelect'
 import ImageSelector, { useImageSelector, dockerImage, dockerFlags } from '../components/ImageSelector'
-import StatCard from '../components/StatCard'
-import ResponsiveTable from '../components/ResponsiveTable'
+import ClusterPulse from '../components/nodes/ClusterPulse'
+import AttentionCallout from '../components/nodes/AttentionCallout'
+import NodePanel from '../components/nodes/NodePanel'
 
-function timeAgo(dateString) {
-  if (!dateString) return 'never'
-  const seconds = Math.floor((Date.now() - new Date(dateString).getTime()) / 1000)
-  if (seconds < 0) return 'just now'
-  if (seconds < 60) return `${seconds}s ago`
-  const minutes = Math.floor(seconds / 60)
-  if (minutes < 60) return `${minutes}m ago`
-  const hours = Math.floor(minutes / 60)
-  if (hours < 24) return `${hours}h ago`
-  const days = Math.floor(hours / 24)
-  return `${days}d ago`
-}
-
-function formatVRAM(bytes) {
-  if (!bytes || bytes === 0) return null
-  const gb = bytes / (1024 * 1024 * 1024)
-  return gb >= 1 ? `${gb.toFixed(1)} GB` : `${(bytes / (1024 * 1024)).toFixed(0)} MB`
-}
-
-/**
- * Inline editor for a node's per-model replica capacity.
- *
- * UX intent: discoverable affordance (pencil icon) that opens an inline
- * input — never a modal for a single field. Source-of-truth note is shown
- * inline so operators understand a worker re-registration will overwrite
- * their override; surfacing this in a tooltip would hide too important a
- * caveat.
- *
- * `confirmShrink` is a hook the parent provides so the page can render its
- * own confirm dialog (it has access to all nodes and can phrase the message
- * with full context).
- */
-function CapacityEditor({ node, loadedModelCounts, onUpdate, confirmShrink, addToast }) {
-  const current = node.max_replicas_per_model || 1
-  const isOverride = !!node.max_replicas_per_model_manually_set
-  const [editing, setEditing] = useState(false)
-  const [draft, setDraft] = useState(String(current))
-  const [saving, setSaving] = useState(false)
-  const [resetting, setResetting] = useState(false)
-
-  // Reset draft when current value changes (server response, etc.)
-  useEffect(() => {
-    if (!editing) setDraft(String(current))
-  }, [current, editing])
-
-  const cancel = useCallback(() => {
-    setEditing(false)
-    setDraft(String(current))
-  }, [current])
-
-  const save = useCallback(async () => {
-    const value = parseInt(draft, 10)
-    if (!Number.isFinite(value) || value < 1) {
-      addToast('Replica capacity must be 1 or higher', 'error')
-      return
-    }
-    if (value === current) {
-      setEditing(false)
-      return
-    }
-    // Reducing the cap below current loaded replicas: confirm so the operator
-    // sees the consequence (running replicas keep going until idle eviction).
-    const maxLoadedAcrossModels = Math.max(0, ...Object.values(loadedModelCounts || {}))
-    if (value < maxLoadedAcrossModels) {
-      const proceed = await confirmShrink({ node, newValue: value, currentLoaded: maxLoadedAcrossModels })
-      if (!proceed) return
-    }
-    setSaving(true)
-    try {
-      await nodesApi.updateMaxReplicasPerModel(node.id, value)
-      addToast(`Replica capacity set to ${value} on ${node.name}`, 'success')
-      setEditing(false)
-      onUpdate?.(value)
-    } catch (err) {
-      addToast(`Could not change replica capacity: ${err.message || err}`, 'error')
-    } finally {
-      setSaving(false)
-    }
-  }, [draft, current, node, loadedModelCounts, confirmShrink, onUpdate, addToast])
-
-  const onKeyDown = (e) => {
-    if (e.key === 'Enter') { e.preventDefault(); save() }
-    else if (e.key === 'Escape') { e.preventDefault(); cancel() }
-  }
-
-  const reset = useCallback(async () => {
-    setResetting(true)
-    try {
-      await nodesApi.resetMaxReplicasPerModel(node.id)
-      addToast(`Override cleared on ${node.name}; worker flag will apply on next re-registration`, 'success')
-      onUpdate?.(null)
-    } catch (err) {
-      addToast(`Could not reset override: ${err.message || err}`, 'error')
-    } finally {
-      setResetting(false)
-    }
-  }, [node, onUpdate, addToast])
-
-  return (
-    <div style={{
-      display: 'flex', alignItems: 'flex-start', gap: 'var(--spacing-md)',
-    }}>
-      <i className="fas fa-layer-group" style={{ color: 'var(--color-text-muted)', marginTop: 3 }} aria-hidden="true" />
-      <div style={{ flex: 1, minWidth: 0 }}>
-        <div style={{ display: 'flex', alignItems: 'center', gap: 'var(--spacing-sm)', flexWrap: 'wrap' }}>
-          <label
-            htmlFor={`capacity-${node.id}`}
-            style={{ fontSize: '0.8125rem', fontWeight: 600, color: 'var(--color-text-primary)' }}
-          >
-            Max replicas per model
-          </label>
-          {editing ? (
-            <>
-              <input
-                id={`capacity-${node.id}`}
-                type="number"
-                min={1}
-                value={draft}
-                disabled={saving}
-                onChange={(e) => setDraft(e.target.value)}
-                onKeyDown={onKeyDown}
-                autoFocus
-                aria-describedby={`capacity-hint-${node.id}`}
-                style={{
-                  width: 72, padding: '4px 8px', borderRadius: 'var(--radius-sm)',
-                  border: '1px solid var(--color-border)', background: 'var(--color-bg-primary)',
-                  fontFamily: 'var(--font-mono)', fontSize: '0.8125rem',
-                  color: 'var(--color-text-primary)',
-                }}
-              />
-              <button
-                className="btn btn-primary btn-sm"
-                onClick={save}
-                disabled={saving}
-                style={{ minHeight: 32 }}
-                aria-label="Save replica capacity"
-              >
-                {saving ? <LoadingSpinner size="xs" /> : <><i className="fas fa-check" /> Save</>}
-              </button>
-              <button
-                className="btn btn-secondary btn-sm"
-                onClick={cancel}
-                disabled={saving}
-                style={{ minHeight: 32 }}
-                aria-label="Cancel"
-              >
-                Cancel
-              </button>
-            </>
-          ) : (
-            <>
-              <span
-                className="cell-mono"
-                style={{ fontSize: '0.8125rem', color: 'var(--color-text-secondary)' }}
-              >
-                {current}
-              </span>
-              {isOverride && (
-                <span
-                  title="This value was set from the UI. It will persist across worker restarts until you click Reset."
-                  style={{
-                    display: 'inline-block', fontSize: '0.6875rem', padding: '1px 6px',
-                    borderRadius: 'var(--radius-sm)', fontWeight: 500,
-                    background: 'var(--color-bg-primary)',
-                    border: '1px solid var(--color-warning, #d97706)',
-                    color: 'var(--color-warning, #d97706)',
-                  }}
-                >
-                  override
-                </span>
-              )}
-              <button
-                onClick={() => setEditing(true)}
-                aria-label={`Edit replica capacity (currently ${current})`}
-                title="Change replica capacity for this node"
-                style={{
-                  display: 'inline-flex', alignItems: 'center', justifyContent: 'center',
-                  minWidth: 32, minHeight: 32, padding: 4, borderRadius: 'var(--radius-sm)',
-                  border: '1px solid var(--color-border-subtle)',
-                  background: 'transparent', color: 'var(--color-text-muted)', cursor: 'pointer',
-                }}
-              >
-                <i className="fas fa-pencil-alt" />
-              </button>
-              {isOverride && (
-                <button
-                  onClick={reset}
-                  disabled={resetting}
-                  aria-label="Clear admin override and let the worker flag apply"
-                  title="Clear override; the worker's --max-replicas-per-model flag will apply on the next re-registration"
-                  className="btn btn-secondary btn-sm"
-                  style={{ minHeight: 32 }}
-                >
-                  {resetting ? <LoadingSpinner size="xs" /> : <><i className="fas fa-undo" /> Reset</>}
-                </button>
-              )}
-            </>
-          )}
-        </div>
-        <div
-          id={`capacity-hint-${node.id}`}
-          style={{ fontSize: '0.75rem', color: 'var(--color-text-muted)', marginTop: 4, lineHeight: 1.4 }}
-        >
-          {isOverride
-            ? <>Set from here. <strong>Reset</strong> to use the worker's default.</>
-            : <>Saved values stick across worker restarts.</>}
-        </div>
-      </div>
-    </div>
-  )
-}
-
-function gpuVendorLabel(vendor) {
-  const labels = {
-    nvidia: 'NVIDIA',
-    amd: 'AMD',
-    intel: 'Intel',
-    vulkan: 'Vulkan',
-  }
-  return labels[vendor] || null
-}
-
-const statusConfig = {
-  healthy: { color: 'var(--color-success)', label: 'Healthy' },
-  unhealthy: { color: 'var(--color-error)', label: 'Unhealthy' },
-  offline: { color: 'var(--color-error)', label: 'Offline' },
-  registering: { color: 'var(--color-primary)', label: 'Registering' },
-  draining: { color: 'var(--color-warning)', label: 'Draining' },
-  pending: { color: 'var(--color-warning)', label: 'Pending Approval' },
-}
-
-const modelStateConfig = {
-  loaded: { bg: 'var(--color-success-light)', color: 'var(--color-success)', border: 'var(--color-success-border)' },
-  loading: { bg: 'var(--color-primary-light)', color: 'var(--color-primary)', border: 'var(--color-primary-border)' },
-  unloading: { bg: 'var(--color-warning-light)', color: 'var(--color-warning)', border: 'var(--color-warning-border)' },
-  idle: { bg: 'var(--color-bg-tertiary)', color: 'var(--color-text-muted)', border: 'var(--color-border-subtle)' },
-}
 
 function StepNumber({ n, bg, color }) {
   return (
@@ -345,389 +107,16 @@ function WorkerHintCard({ addToast, activeTab, hasWorkers }) {
   )
 }
 
-// Numeric input with quick-pick preset chips. Picked over a slider because
-// replica counts are exact specs (operator math), not fuzzy estimates. The
-// chips give one-click access to common values without the slider's
-// precision/special-value problems (e.g. MaxReplicas=0 = "no limit").
-function ReplicaInput({ id, label, value, onChange, presets }) {
-  return (
-    <div style={{ flex: 1 }}>
-      <label className="form-label" htmlFor={id}>{label}</label>
-      <input
-        id={id}
-        className="input"
-        type="number"
-        min={0}
-        value={value}
-        onChange={e => onChange(parseInt(e.target.value) || 0)}
-      />
-      <div style={{ display: 'flex', gap: 4, flexWrap: 'wrap', marginTop: 6 }}>
-        {presets.map(({ v, l }) => {
-          const active = value === v
-          return (
-            <button
-              key={v}
-              type="button"
-              onClick={() => onChange(v)}
-              aria-pressed={active}
-              className="cell-mono"
-              style={{
-                padding: '2px 8px',
-                borderRadius: 'var(--radius-sm)',
-                fontSize: '0.6875rem',
-                fontWeight: 500,
-                cursor: 'pointer',
-                background: active ? 'var(--color-primary-light)' : 'transparent',
-                border: `1px solid ${active ? 'var(--color-primary-border)' : 'var(--color-border-subtle)'}`,
-                color: active ? 'var(--color-primary)' : 'var(--color-text-muted)',
-              }}
-            >{l || v}</button>
-          )
-        })}
-      </div>
-    </div>
-  )
-}
-
-/**
- * Controlled chip-builder for { key: value } maps. Replaces the prior
- * comma-separated-string Node Selector input AND the bespoke Labels editor
- * in the node drawer — both were rendering the same chip pattern with
- * subtly different markup.
- *
- * Fully controlled: parent owns the map and decides what onAdd/onRemove
- * does (form state for the scheduling form; API calls for the live
- * labels editor). The component just renders chips and a key/value input
- * row.
- *
- * Props:
- *   pairs       — current map of key → value
- *   onAdd(k,v)  — called when the user adds a pair (parent handles dedup
- *                 and persistence side effects)
- *   onRemove(k) — called when a chip's × is clicked
- *   placeholderKey, placeholderValue — input hints
- *   ariaLabel   — accessible name for the section
- */
-function KeyValueChips({ pairs, onAdd, onRemove, placeholderKey = 'key', placeholderValue = 'value', ariaLabel }) {
-  const [k, setK] = useState('')
-  const [v, setV] = useState('')
-
-  const add = () => {
-    const key = k.trim()
-    if (!key) return
-    onAdd(key, v.trim())
-    setK(''); setV('')
-  }
-  const onKeyDown = (e) => {
-    if (e.key === 'Enter') { e.preventDefault(); add() }
-  }
-
-  const entries = pairs ? Object.entries(pairs) : []
-  return (
-    <div aria-label={ariaLabel}>
-      {entries.length > 0 && (
-        <div style={{ display: 'flex', flexWrap: 'wrap', gap: 4, marginBottom: 'var(--spacing-xs)' }}>
-          {entries.map(([key, val]) => (
-            <span key={key} style={{
-              display: 'inline-flex', alignItems: 'center', gap: 4,
-              fontSize: '0.75rem', padding: '2px 8px',
-              borderRadius: 'var(--radius-sm)',
-              background: 'var(--color-bg-tertiary)',
-              border: '1px solid var(--color-border-subtle)',
-              fontFamily: 'var(--font-mono)',
-            }}>
-              {key}={val}
-              <button
-                type="button"
-                onClick={(e) => { e.stopPropagation(); onRemove(key) }}
-                aria-label={`Remove ${key}`}
-                title="Remove"
-                style={{
-                  background: 'none', border: 'none', cursor: 'pointer',
-                  color: 'var(--color-text-muted)', fontSize: '0.625rem', padding: 0,
-                }}
-              >
-                <i className="fas fa-times" />
-              </button>
-            </span>
-          ))}
-        </div>
-      )}
-      <div style={{ display: 'flex', gap: 'var(--spacing-xs)', alignItems: 'stretch' }}>
-        <input
-          className="input"
-          type="text"
-          placeholder={placeholderKey}
-          value={k}
-          onChange={e => setK(e.target.value)}
-          onKeyDown={onKeyDown}
-          style={{ flex: 1 }}
-        />
-        <input
-          className="input"
-          type="text"
-          placeholder={placeholderValue}
-          value={v}
-          onChange={e => setV(e.target.value)}
-          onKeyDown={onKeyDown}
-          style={{ flex: 1 }}
-        />
-        <button
-          type="button"
-          className="btn btn-secondary btn-sm"
-          onClick={add}
-          disabled={!k.trim()}
-          style={{ minHeight: 36 }}
-        >
-          <i className="fas fa-plus" /> Add
-        </button>
-      </div>
-    </div>
-  )
-}
-
-function SchedulingForm({ onSave, onCancel }) {
-  const [mode, setMode] = useState('placement')
-  const [modelName, setModelName] = useState('')
-  // Selector is now a chip-builder map instead of a comma-separated string.
-  // Operators were copying syntax from docs and missing commas; the chip UI
-  // makes the key=value structure self-documenting.
-  const [selector, setSelector] = useState({})
-  const [minReplicas, setMinReplicas] = useState(1)
-  const [maxReplicas, setMaxReplicas] = useState(0)
-  // Prefix-cache routing controls. Empty routePolicy means "inherit the
-  // cluster default"; the three thresholds at 0 likewise inherit, so they
-  // stay out of the POST body's effective override only when explicitly set.
-  const [routePolicy, setRoutePolicy] = useState('')
-  const [balanceAbsThreshold, setBalanceAbsThreshold] = useState(0)
-  const [balanceRelThreshold, setBalanceRelThreshold] = useState(0)
-  const [minPrefixMatch, setMinPrefixMatch] = useState(0)
-
-  const hasSelector = Object.keys(selector).length > 0
-
-  const isValid = () => {
-    if (!modelName) return false
-    if (mode === 'placement') return hasSelector
-    if (mode === 'spread') return true
-    return minReplicas > 0 || maxReplicas > 0
-  }
-
-  const handleSubmit = () => {
-    onSave({
-      model_name: modelName,
-      node_selector: hasSelector ? selector : undefined,
-      min_replicas: mode === 'autoscaling' ? minReplicas : 0,
-      max_replicas: mode === 'autoscaling' ? maxReplicas : 0,
-      spread_all: mode === 'spread',
-      route_policy: routePolicy,
-      balance_abs_threshold: balanceAbsThreshold,
-      balance_rel_threshold: balanceRelThreshold,
-      min_prefix_match: minPrefixMatch,
-    })
-  }
-
-  return (
-    <div className="card" style={{ padding: 'var(--spacing-lg)', marginBottom: 'var(--spacing-md)' }}>
-      {/* Mode selector \u2014 uses the project's segmented control instead of two
-          50%-width filled buttons that competed visually with the actual
-          primary action (Save). */}
-      <div role="radiogroup" aria-label="Scheduling mode" className="segmented" style={{ marginBottom: 'var(--spacing-xs)' }}>
-        <button
-          type="button" role="radio" aria-checked={mode === 'placement'}
-          className={`segmented__item${mode === 'placement' ? ' is-active' : ''}`}
-          onClick={() => setMode('placement')}
-        >
-          <i className="fas fa-thumbtack" aria-hidden="true" /> Pin to nodes
-        </button>
-        <button
-          type="button" role="radio" aria-checked={mode === 'autoscaling'}
-          className={`segmented__item${mode === 'autoscaling' ? ' is-active' : ''}`}
-          onClick={() => setMode('autoscaling')}
-        >
-          <i className="fas fa-arrows-up-down" aria-hidden="true" /> Auto-scale
-        </button>
-        <button
-          type="button" role="radio" aria-checked={mode === 'spread'}
-          className={`segmented__item${mode === 'spread' ? ' is-active' : ''}`}
-          onClick={() => setMode('spread')}
-        >
-          <i className="fas fa-network-wired" aria-hidden="true" /> Spread to all
-        </button>
-      </div>
-      <p style={{ fontSize: '0.8125rem', color: 'var(--color-text-muted)', margin: '0 0 var(--spacing-lg) 0' }}>
-        {mode === 'placement'
-          ? 'Restrict this model to specific nodes. Loaded on demand, evictable when idle.'
-          : mode === 'spread'
-          ? 'Run one replica on every node matching the selector (all healthy nodes when empty). Tracks nodes joining and leaving.'
-          : 'Maintain a target replica count across the cluster. Min \u2265 1 protects from eviction.'}
-      </p>
-
-      {/* Linear vertical flow \u2014 model picker is the visual focus, then the
-          mode-specific fields below. No 2-column grid (the mismatched widths
-          made the form look raw). */}
-      <div style={{ display: 'flex', flexDirection: 'column', gap: 'var(--spacing-md)' }}>
-        <div>
-          <label className="form-label" htmlFor="sched-model">Model</label>
-          {/* Searchable combobox so a long gallery doesn't force the operator
-              to scroll through hundreds of entries. Free-text is allowed —
-              you can pre-create a rule for a model that hasn't been
-              installed yet, which is a real workflow when standing up a new
-              node and pre-staging its scheduling policy. */}
-          <SearchableModelSelect
-            value={modelName}
-            onChange={setModelName}
-            placeholder="Type to search models, or paste a name..."
-          />
-        </div>
-
-        <div>
-          <label className="form-label">
-            Node selector{mode === 'placement' ? '' : ' (optional)'}
-          </label>
-          <KeyValueChips
-            pairs={selector}
-            onAdd={(k, v) => setSelector(prev => ({ ...prev, [k]: v }))}
-            onRemove={(k) => setSelector(prev => { const n = { ...prev }; delete n[k]; return n })}
-            placeholderKey="key (e.g. gpu.vendor)"
-            placeholderValue="value (e.g. nvidia)"
-            ariaLabel="Node selector"
-          />
-          <span style={{ fontSize: '0.75rem', color: 'var(--color-text-muted)', display: 'block', marginTop: 6 }}>
-            {mode === 'placement'
-              ? 'Models will load only on nodes that match all listed labels.'
-              : (hasSelector ? 'Replicas land only on matching nodes.' : 'Empty = any healthy node.')}
-          </span>
-        </div>
-
-        {mode === 'autoscaling' && (
-          <div style={{ display: 'flex', gap: 'var(--spacing-md)' }}>
-            <ReplicaInput
-              id="sched-min"
-              label="Min replicas"
-              value={minReplicas}
-              onChange={setMinReplicas}
-              presets={[{ v: 1 }, { v: 2 }, { v: 3 }, { v: 4 }]}
-            />
-            <ReplicaInput
-              id="sched-max"
-              label="Max replicas"
-              value={maxReplicas}
-              onChange={setMaxReplicas}
-              presets={[{ v: 0, l: 'no limit' }, { v: 2 }, { v: 4 }, { v: 8 }]}
-            />
-          </div>
-        )}
-
-        {/* Per-model routing policy. Left empty/zero these inherit the
-            cluster-wide defaults; set them to override how requests for this
-            model are spread across replicas. */}
-        <div>
-          <label className="form-label" htmlFor="sched-route-policy">Routing policy</label>
-          <select
-            id="sched-route-policy"
-            className="input"
-            value={routePolicy}
-            onChange={e => setRoutePolicy(e.target.value)}
-          >
-            <option value="">Default (cluster setting)</option>
-            <option value="round_robin">Round Robin</option>
-            <option value="prefix_cache">Prefix Cache</option>
-          </select>
-          <span style={{ fontSize: '0.75rem', color: 'var(--color-text-muted)', display: 'block', marginTop: 6 }}>
-            Prefix Cache routes shared-prefix requests to the same replica to reuse its KV cache, falling back to round-robin when replicas are imbalanced.
-          </span>
-        </div>
-
-        {routePolicy === 'prefix_cache' && (
-          <div style={{ display: 'flex', gap: 'var(--spacing-md)' }}>
-            <div style={{ flex: 1 }}>
-              <label className="form-label" htmlFor="sched-min-prefix-match">Min prefix match</label>
-              <input
-                id="sched-min-prefix-match"
-                className="input"
-                type="number"
-                step="0.05"
-                min="0"
-                max="1"
-                value={minPrefixMatch}
-                onChange={e => setMinPrefixMatch(parseFloat(e.target.value) || 0)}
-              />
-              <span style={{ fontSize: '0.75rem', color: 'var(--color-text-muted)', display: 'block', marginTop: 6 }}>
-                Fraction of the prompt (0..1) that must match a cached prefix before affinity kicks in. 0 inherits the default.
-              </span>
-            </div>
-            <div style={{ flex: 1 }}>
-              <label className="form-label" htmlFor="sched-balance-abs">Balance abs threshold</label>
-              <input
-                id="sched-balance-abs"
-                className="input"
-                type="number"
-                min="0"
-                value={balanceAbsThreshold}
-                onChange={e => setBalanceAbsThreshold(parseInt(e.target.value) || 0)}
-              />
-              <span style={{ fontSize: '0.75rem', color: 'var(--color-text-muted)', display: 'block', marginTop: 6 }}>
-                Max absolute in-flight gap allowed before falling back to round-robin. 0 inherits the default.
-              </span>
-            </div>
-            <div style={{ flex: 1 }}>
-              <label className="form-label" htmlFor="sched-balance-rel">Balance rel threshold</label>
-              <input
-                id="sched-balance-rel"
-                className="input"
-                type="number"
-                step="0.1"
-                min="0"
-                value={balanceRelThreshold}
-                onChange={e => setBalanceRelThreshold(parseFloat(e.target.value) || 0)}
-              />
-              <span style={{ fontSize: '0.75rem', color: 'var(--color-text-muted)', display: 'block', marginTop: 6 }}>
-                Max relative in-flight ratio (&gt;= 1) allowed before falling back to round-robin. 0 inherits the default.
-              </span>
-            </div>
-          </div>
-        )}
-      </div>
-
-      {/* Hairline divider above the actions, matching the project's form pattern. */}
-      <div style={{
-        display: 'flex', gap: 'var(--spacing-sm)', justifyContent: 'flex-end',
-        marginTop: 'var(--spacing-lg)', paddingTop: 'var(--spacing-md)',
-        borderTop: '1px solid var(--color-border-subtle)',
-      }}>
-        <button className="btn btn-secondary btn-sm" onClick={onCancel}>Cancel</button>
-        <button className="btn btn-primary btn-sm" onClick={handleSubmit} disabled={!isValid()}>Save rule</button>
-      </div>
-    </div>
-  )
-}
-
 export default function Nodes() {
   const { addToast } = useOutletContext()
-  const navigate = useNavigate()
   const { t } = useTranslation('admin')
   const [nodesList, setNodesList] = useState([])
+  const [allModels, setAllModels] = useState([])
   const [loading, setLoading] = useState(true)
   const [enabled, setEnabled] = useState(true)
-  const [expandedNodeId, setExpandedNodeId] = useState(null)
-  const [nodeModels, setNodeModels] = useState({})
-  const [nodeBackends, setNodeBackends] = useState({})
   const [confirmDelete, setConfirmDelete] = useState(null)
-  const [confirmUnload, setConfirmUnload] = useState(null)
-  const [confirmDeleteBackend, setConfirmDeleteBackend] = useState(null)
-  // Capacity-shrink confirm uses a Promise resolver so the editor can `await`
-  // the user's choice. Pattern matches the rest of the page where confirms
-  // open a ConfirmDialog and the action proceeds in onConfirm.
-  const [confirmShrinkState, setConfirmShrinkState] = useState(null)
-  const confirmShrink = useCallback(({ node, newValue, currentLoaded }) => {
-    return new Promise((resolve) => {
-      setConfirmShrinkState({ node, newValue, currentLoaded, resolve })
-    })
-  }, [])
   const [showTips, setShowTips] = useState(false)
-  const [activeTab, setActiveTab] = useState('backend') // 'backend', 'agent', or 'scheduling'
-  const [schedulingConfigs, setSchedulingConfigs] = useState([])
-  const [showSchedulingForm, setShowSchedulingForm] = useState(false)
+  const [activeTab, setActiveTab] = useState('all') // 'all' | 'backend' | 'agent'
 
   const fetchNodes = useCallback(async () => {
     try {
@@ -743,71 +132,32 @@ export default function Nodes() {
     }
   }, [])
 
-  const fetchScheduling = useCallback(async () => {
+  // Roster model fetch: drives the inline model chips on each backend panel
+  // without an expand click. Grouped by node below.
+  const fetchAllModels = useCallback(async () => {
     try {
-      const data = await nodesApi.listScheduling()
-      setSchedulingConfigs(Array.isArray(data) ? data : [])
-    } catch { setSchedulingConfigs([]) }
+      const d = await nodesApi.allModels()
+      setAllModels(Array.isArray(d) ? d : [])
+    } catch {
+      setAllModels([])
+    }
   }, [])
 
   useEffect(() => {
     fetchNodes()
-    fetchScheduling()
-    const interval = setInterval(fetchNodes, 5000)
+    fetchAllModels()
+    const interval = setInterval(() => {
+      fetchNodes()
+      fetchAllModels()
+    }, 5000)
     return () => clearInterval(interval)
-  }, [fetchNodes, fetchScheduling])
+  }, [fetchNodes, fetchAllModels])
 
-  const fetchModels = useCallback(async (nodeId) => {
-    try {
-      const data = await nodesApi.getModels(nodeId)
-      setNodeModels(prev => ({ ...prev, [nodeId]: Array.isArray(data) ? data : [] }))
-    } catch {
-      setNodeModels(prev => ({ ...prev, [nodeId]: [] }))
-    }
-  }, [])
-
-  const fetchBackends = useCallback(async (nodeId) => {
-    try {
-      const data = await nodesApi.getBackends(nodeId)
-      setNodeBackends(prev => ({ ...prev, [nodeId]: Array.isArray(data) ? data : [] }))
-    } catch {
-      setNodeBackends(prev => ({ ...prev, [nodeId]: [] }))
-    }
-  }, [])
-
-  const toggleExpand = (nodeId) => {
-    if (expandedNodeId === nodeId) {
-      setExpandedNodeId(null)
-    } else {
-      setExpandedNodeId(nodeId)
-      if (!nodeModels[nodeId]) {
-        fetchModels(nodeId)
-      }
-      if (!nodeBackends[nodeId]) {
-        fetchBackends(nodeId)
-      }
-    }
-  }
-
-  const handleUpgradeBackend = async (nodeId, backendName) => {
-    try {
-      await nodesApi.installBackend(nodeId, backendName)
-      addToast(`Backend "${backendName}" upgraded`, 'success')
-      fetchBackends(nodeId)
-    } catch (err) {
-      addToast(`Failed to upgrade backend: ${err.message}`, 'error')
-    }
-  }
-
-  const handleDeleteBackendOnNode = async (nodeId, backendName) => {
-    try {
-      await nodesApi.deleteBackend(nodeId, backendName)
-      addToast(`Backend "${backendName}" deleted`, 'success')
-      fetchBackends(nodeId)
-    } catch (err) {
-      addToast(`Failed to delete backend: ${err.message}`, 'error')
-    }
-  }
+  const modelsByNode = useMemo(() => {
+    const m = {}
+    for (const x of allModels) (m[x.node_id] ||= []).push(x)
+    return m
+  }, [allModels])
 
   const handleDrain = async (nodeId) => {
     try {
@@ -839,42 +189,11 @@ export default function Nodes() {
     }
   }
 
-  const handleUnloadModel = async (nodeId, modelName) => {
-    try {
-      await nodesApi.unloadModel(nodeId, modelName)
-      addToast(`Model "${modelName}" unloaded`, 'success')
-      fetchModels(nodeId)
-    } catch (err) {
-      addToast(`Failed to unload model: ${err.message}`, 'error')
-    }
-  }
-
-  const handleAddLabel = async (nodeId, key, value) => {
-    try {
-      await nodesApi.mergeLabels(nodeId, { [key]: value })
-      addToast(`Label "${key}=${value}" added`, 'success')
-      fetchNodes()
-    } catch (err) {
-      addToast(`Failed to add label: ${err.message}`, 'error')
-    }
-  }
-
-  const handleDeleteLabel = async (nodeId, key) => {
-    try {
-      await nodesApi.deleteLabel(nodeId, key)
-      addToast(`Label "${key}" removed`, 'success')
-      fetchNodes()
-    } catch (err) {
-      addToast(`Failed to remove label: ${err.message}`, 'error')
-    }
-  }
-
   const handleDelete = async (nodeId) => {
     try {
       await nodesApi.delete(nodeId)
       addToast('Node removed', 'success')
       setConfirmDelete(null)
-      if (expandedNodeId === nodeId) setExpandedNodeId(null)
       fetchNodes()
     } catch (err) {
       addToast(`Failed to remove node: ${err.message}`, 'error')
@@ -986,14 +305,8 @@ export default function Nodes() {
   // Split nodes by type
   const backendNodes = nodesList.filter(n => !n.node_type || n.node_type === 'backend')
   const agentNodes = nodesList.filter(n => n.node_type === 'agent')
-  const filteredNodes = activeTab === 'agent' ? agentNodes : backendNodes
-
-  // Compute stats for current tab
-  const total = filteredNodes.length
-  const healthy = filteredNodes.filter(n => n.status === 'healthy').length
-  const unhealthy = filteredNodes.filter(n => n.status === 'unhealthy' || n.status === 'offline').length
-  const draining = filteredNodes.filter(n => n.status === 'draining').length
-  const pending = filteredNodes.filter(n => n.status === 'pending').length
+  const filteredNodes = activeTab === 'all' ? nodesList
+    : activeTab === 'agent' ? agentNodes : backendNodes
 
   return (
     <div className="page page--wide">
@@ -1007,65 +320,16 @@ export default function Nodes() {
         supporting={t('nodes.subtitle')}
       />
 
-      {/* Tabs */}
-      <div className="tabs" style={{ marginBottom: 'var(--spacing-lg)' }}>
-        <button
-          onClick={() => setActiveTab('backend')}
-          className={`tab ${activeTab === 'backend' ? 'tab-active' : ''}`}
-        >
-          <i className="fas fa-server" style={{ marginRight: 6 }} />
-          Backend Workers ({backendNodes.length})
-        </button>
-        <button
-          onClick={() => setActiveTab('agent')}
-          className={`tab ${activeTab === 'agent' ? 'tab-active' : ''}`}
-        >
-          <i className="fas fa-robot" style={{ marginRight: 6 }} />
-          Agent Workers ({agentNodes.length})
-        </button>
-        <button
-          onClick={() => setActiveTab('scheduling')}
-          className={`tab ${activeTab === 'scheduling' ? 'tab-active' : ''}`}
-        >
-          <i className="fas fa-calendar-alt" style={{ marginRight: 6 }} />
-          Scheduling ({schedulingConfigs.length})
-        </button>
-      </div>
+      <ClusterPulse nodes={nodesList} />
+      <AttentionCallout nodes={nodesList} onApprove={handleApprove} />
 
-      {activeTab !== 'scheduling' && <>
-      {/* Stat cards */}
-      <div className="stat-grid">
-        <StatCard icon={activeTab === 'agent' ? 'fas fa-robot' : 'fas fa-server'}
-          label={`Total ${activeTab === 'agent' ? 'Agent' : 'Backend'} Workers`} value={total} />
-        <StatCard icon="fas fa-check-circle" label="Healthy" value={healthy}
-          accentVar={healthy > 0 ? '--color-success' : undefined} />
-        <StatCard icon="fas fa-exclamation-circle" label="Unhealthy" value={unhealthy}
-          accentVar={unhealthy > 0 ? '--color-error' : undefined} />
-        <StatCard icon="fas fa-hourglass-half" label="Draining" value={draining}
-          accentVar={draining > 0 ? '--color-warning' : undefined} />
-        {pending > 0 && (
-          <StatCard icon="fas fa-clock" label="Pending" value={pending} accentVar="--color-warning" />
-        )}
-        {activeTab === 'backend' && (() => {
-          const clusterTotalVRAM = backendNodes.reduce((sum, n) => sum + (n.total_vram || 0), 0)
-          const clusterUsedVRAM = backendNodes.reduce((sum, n) => {
-            if (n.total_vram && n.available_vram != null) return sum + (n.total_vram - n.available_vram)
-            return sum
-          }, 0)
-          const totalModelsLoaded = backendNodes.reduce((sum, n) => sum + (n.model_count || 0), 0)
-          const totalInFlight = backendNodes.reduce((sum, n) => sum + (n.in_flight_count || 0), 0)
-          return (
-            <>
-              {clusterTotalVRAM > 0 && (
-                <StatCard icon="fas fa-microchip" label="Cluster VRAM"
-                  value={`${formatVRAM(clusterUsedVRAM) || '0'} / ${formatVRAM(clusterTotalVRAM)}`} />
-              )}
-              <StatCard icon="fas fa-cube" label="Models Loaded" value={totalModelsLoaded} />
-              <StatCard icon="fas fa-exchange-alt" label="In-Flight Requests" value={totalInFlight}
-                accentVar={totalInFlight > 0 ? '--color-primary' : undefined} />
-            </>
-          )
-        })()}
+      {/* Node-type filter */}
+      <div role="radiogroup" aria-label="Node type" className="segmented node-filter">
+        {[['all', 'All'], ['backend', 'Backend'], ['agent', 'Agent']].map(([key, label]) => (
+          <button key={key} type="button" role="radio" aria-checked={activeTab === key}
+            className={`segmented__item${activeTab === key ? ' is-active' : ''}`}
+            onClick={() => setActiveTab(key)}>{label}</button>
+        ))}
       </div>
 
       {/* Worker tips */}
@@ -1085,588 +349,13 @@ export default function Nodes() {
         </>
       )}
 
-      {/* Node table */}
       {filteredNodes.length > 0 && (
-        <ResponsiveTable>
-            <thead>
-              <tr>
-                <th>Name</th>
-                <th>Status</th>
-                <th>GPU / VRAM</th>
-                <th>Last Heartbeat</th>
-                <th style={{ textAlign: 'right' }}>Actions</th>
-              </tr>
-            </thead>
-            <tbody>
-              {filteredNodes.map(node => {
-                const status = statusConfig[node.status] || statusConfig.unhealthy
-                const isExpanded = expandedNodeId === node.id
-                const models = nodeModels[node.id]
-                const backends = nodeBackends[node.id]
-                const vendorLabel = gpuVendorLabel(node.gpu_vendor)
-                const totalVRAMStr = formatVRAM(node.total_vram)
-                const availVRAMStr = formatVRAM(node.available_vram)
-                const usedVRAM = node.total_vram && node.available_vram != null
-                  ? node.total_vram - node.available_vram
-                  : null
-                const usedVRAMStr = usedVRAM != null ? formatVRAM(usedVRAM) : null
-
-                // RAM fallback for CPU-only workers
-                const hasGPU = node.total_vram > 0
-                const totalRAMStr = formatVRAM(node.total_ram)
-                const usedRAM = node.total_ram && node.available_ram != null
-                  ? node.total_ram - node.available_ram
-                  : null
-                const usedRAMStr = usedRAM != null ? formatVRAM(usedRAM) : null
-
-                const canExpand = activeTab !== 'agent'
-                return (
-                  <Fragment key={node.id}>
-                    <tr
-                      onClick={canExpand ? () => toggleExpand(node.id) : undefined}
-                      style={{ cursor: canExpand ? 'pointer' : 'default' }}
-                    >
-                      <td>
-                        <div style={{ display: 'flex', alignItems: 'center', gap: 'var(--spacing-sm)' }}>
-                          {canExpand && (
-                            <span className={`row-chevron${isExpanded ? ' is-expanded' : ''}`} aria-hidden="true">
-                              <i className="fas fa-chevron-right" />
-                            </span>
-                          )}
-                          <i className="fas fa-server" style={{ color: 'var(--color-text-muted)', fontSize: 'var(--text-sm)' }} />
-                          <div>
-                            <div style={{ fontWeight: 600, fontSize: 'var(--text-sm)' }}>
-                              {node.name}
-                              {node.node_type !== 'agent' && (() => {
-                                // Slot count only applies to backend workers — agents don't
-                                // load models. Always render for backend nodes so operators
-                                // discover the field; muted (border-only) at the default of 1,
-                                // accented when > 1 so fat nodes stand out at a glance.
-                                const slots = node.max_replicas_per_model || 1
-                                const isMulti = slots > 1
-                                return (
-                                  <span
-                                    className="cell-mono"
-                                    title={isMulti
-                                      ? `Up to ${slots} replicas of any one model can run on this node`
-                                      : 'Single replica per model (default). Click the row to expand and change.'}
-                                    style={{
-                                      marginLeft: 8, padding: '1px 6px', borderRadius: 'var(--radius-sm)',
-                                      background: isMulti ? 'var(--color-bg-tertiary)' : 'transparent',
-                                      border: `1px solid ${isMulti ? 'var(--color-border)' : 'var(--color-border-subtle)'}`,
-                                      fontSize: '0.6875rem', fontWeight: 500,
-                                      color: isMulti ? 'var(--color-text-secondary)' : 'var(--color-text-muted)',
-                                    }}
-                                  >
-                                    <i className="fas fa-layer-group" style={{ marginRight: 4 }} />
-                                    {slots}× slots
-                                  </span>
-                                )
-                              })()}
-                            </div>
-                            <div className="cell-mono cell-muted">
-                              {node.address}
-                            </div>
-                            {node.labels && Object.keys(node.labels).length > 0 && (() => {
-                              // node.replica-slots is already shown structurally by the
-                              // slot badge above; surfacing it again as a label is noise.
-                              const visible = Object.entries(node.labels).filter(([k]) => k !== 'node.replica-slots')
-                              if (visible.length === 0) return null
-                              return (
-                              <div style={{ display: 'flex', flexWrap: 'wrap', gap: 3, marginTop: 3 }}>
-                                {visible.slice(0, 5).map(([k, v]) => (
-                                  <span key={k} className="cell-mono" style={{
-                                    padding: '1px 5px', borderRadius: "var(--radius-sm)",
-                                    background: 'var(--color-bg-tertiary)',
-                                    border: '1px solid var(--color-border-subtle)',
-                                  }}>{k}={v}</span>
-                                ))}
-                                {visible.length > 5 && (
-                                  <span className="cell-muted">
-                                    +{visible.length - 5} more
-                                  </span>
-                                )}
-                              </div>
-                              )
-                            })()}
-                          </div>
-                        </div>
-                      </td>
-                      <td>
-                        <span className="node-status" style={{ color: status.color }}>
-                          <span className="node-status__dot" style={{ background: status.color }} />
-                          {status.label}
-                        </span>
-                      </td>
-                      <td>
-                        {hasGPU && totalVRAMStr ? (
-                          <div style={{ fontSize: '0.8125rem', fontFamily: 'var(--font-mono)' }}>
-                            {vendorLabel && (
-                              <span style={{ color: 'var(--color-text-secondary)', marginRight: 4 }}>{vendorLabel}</span>
-                            )}
-                            <span style={{ color: 'var(--color-text-muted)' }}>
-                              {usedVRAMStr || '0'} / {totalVRAMStr}
-                            </span>
-                            {/* In-tick soft reservation: deducted at scheduling time, reset by the worker's next heartbeat. */}
-                            {node.reserved_vram > 0 && (
-                              <span
-                                title={`${formatVRAM(node.reserved_vram)} reserved by in-flight scheduling decisions; resets on next heartbeat`}
-                                style={{ color: 'var(--color-warning, #d97706)', marginLeft: 6 }}
-                              >
-                                +{formatVRAM(node.reserved_vram)} reserved
-                              </span>
-                            )}
-                          </div>
-                        ) : totalRAMStr ? (
-                          <div style={{ fontSize: '0.8125rem', fontFamily: 'var(--font-mono)' }}>
-                            <span style={{ color: 'var(--color-text-secondary)', marginRight: 4 }}>CPU</span>
-                            <span style={{ color: 'var(--color-text-muted)' }}>
-                              {usedRAMStr || '0'} / {totalRAMStr} RAM
-                            </span>
-                          </div>
-                        ) : (
-                          <span style={{ fontSize: '0.8125rem', color: 'var(--color-text-muted)' }}>-</span>
-                        )}
-                      </td>
-                      <td>
-                        <span style={{ fontSize: '0.8125rem', fontFamily: 'var(--font-mono)', color: 'var(--color-text-secondary)' }}>
-                          {timeAgo(node.last_heartbeat)}
-                        </span>
-                      </td>
-                      <td style={{ textAlign: 'right' }}>
-                        <div className="row-actions" onClick={e => e.stopPropagation()}>
-                          {/* Approve stays as a prominent primary button — it's
-                              a stateful admission gate, not a routine action,
-                              and matches how /manage surfaces install-time
-                              decisions outside the kebab menu. */}
-                          {node.status === 'pending' && (
-                            <button
-                              className="btn btn-primary btn-sm"
-                              onClick={() => handleApprove(node.id)}
-                            >
-                              <i className="fas fa-check" /> Approve
-                            </button>
-                          )}
-                          <ActionMenu
-                            ariaLabel={`Actions for ${node.name}`}
-                            triggerLabel={`Actions for ${node.name}`}
-                            items={[
-                              { key: 'resume', icon: 'fa-play', label: 'Resume',
-                                onClick: () => handleResume(node.id),
-                                hidden: node.status !== 'draining' },
-                              { key: 'drain', icon: 'fa-pause', label: 'Drain',
-                                onClick: () => handleDrain(node.id),
-                                hidden: node.status === 'draining' || node.status === 'pending' },
-                              { divider: true, hidden: node.status === 'pending' },
-                              { key: 'remove', icon: 'fa-trash', label: 'Remove from cluster',
-                                onClick: () => setConfirmDelete(node), danger: true },
-                            ]}
-                          />
-                        </div>
-                      </td>
-                    </tr>
-                    {isExpanded && canExpand && (
-                      <tr>
-                        <td colSpan={5} style={{ padding: 0, background: 'var(--color-bg-secondary)' }}>
-                          <div style={{ padding: 'var(--spacing-md) var(--spacing-lg)' }}>
-                            {/* The at-a-glance: what's running here? Empty
-                                state is a single thin line so an empty node
-                                doesn't render a giant placeholder box; the
-                                row's slot badge already conveys the
-                                node-level state. */}
-                            {!models ? (
-                              <LoadingSpinner size="sm" />
-                            ) : models.length === 0 ? (
-                              <p style={{ fontSize: '0.8125rem', color: 'var(--color-text-muted)', margin: '0 0 var(--spacing-md) 0' }}>
-                                <i className="fas fa-cube" style={{ marginRight: 6, opacity: 0.6 }} aria-hidden="true" />
-                                No models loaded yet — they'll appear here when scheduled to this node.
-                              </p>
-                            ) : (
-                              <table className="table" style={{ margin: 0 }}>
-                                <thead>
-                                  <tr>
-                                    <th>Model</th>
-                                    <th>State</th>
-                                    <th>In-Flight</th>
-                                    <th style={{ width: 40 }}>Logs</th>
-                                    <th style={{ textAlign: 'right' }}>Actions</th>
-                                  </tr>
-                                </thead>
-                                <tbody>
-                                  {(() => {
-                                    // Pre-compute per-model replica counts so the disambiguation
-                                    // pill only renders when this node actually hosts >1 replica
-                                    // of the same model. Single-replica deployments stay clean.
-                                    const replicaCounts = {}
-                                    models.forEach(m => { replicaCounts[m.model_name] = (replicaCounts[m.model_name] || 0) + 1 })
-                                    return models.map(m => {
-                                      const stCfg = modelStateConfig[m.state] || modelStateConfig.idle
-                                      const showReplica = (replicaCounts[m.model_name] || 0) > 1
-                                      // Per-replica process key — what the worker stores logs under and what the
-                                      // store's GetLines/Subscribe match on for replica-scoped filtering.
-                                      const processKey = `${m.model_name}#${m.replica_index ?? 0}`
-                                      return (
-                                      <tr key={m.id || `${m.model_name}#${m.replica_index ?? 0}`}>
-                                        <td style={{ fontFamily: 'var(--font-mono)', fontSize: '0.8125rem' }}>
-                                          {m.model_name}
-                                          {showReplica && (
-                                            <span
-                                              className="cell-mono"
-                                              aria-label={`replica ${m.replica_index ?? 0}`}
-                                              title={`Replica ${m.replica_index ?? 0} on this node`}
-                                              style={{
-                                                marginLeft: 8, padding: '1px 6px', borderRadius: 'var(--radius-sm)',
-                                                background: 'var(--color-bg-tertiary)',
-                                                border: '1px solid var(--color-border-subtle)',
-                                                fontSize: '0.6875rem', fontWeight: 500,
-                                                color: 'var(--color-text-secondary)',
-                                              }}
-                                            >
-                                              rep {m.replica_index ?? 0}
-                                            </span>
-                                          )}
-                                        </td>
-                                        <td>
-                                          <span style={{
-                                            display: 'inline-block', padding: '2px 8px', borderRadius: 'var(--radius-sm)',
-                                            fontSize: '0.75rem', fontWeight: 500,
-                                            background: stCfg.bg, color: stCfg.color, border: `1px solid ${stCfg.border}`,
-                                          }}>
-                                            {m.state}
-                                          </span>
-                                        </td>
-                                        <td style={{ fontFamily: 'var(--font-mono)', fontSize: '0.8125rem' }}>
-                                          {m.in_flight ?? 0}
-                                        </td>
-                                        <td>
-                                          <a
-                                            href="#"
-                                            onClick={(e) => {
-                                              e.preventDefault()
-                                              // Send the replica-scoped process key (modelName#replicaIndex).
-                                              // The worker's BackendLogStore returns only this replica's lines
-                                              // when given the full key; a future "merged" toggle in the logs
-                                              // page can navigate to the bare modelName URL to use aggregation.
-                                              navigate(`/app/node-backend-logs/${node.id}/${encodeURIComponent(processKey)}`)
-                                            }}
-                                            style={{ fontSize: '0.75rem', color: 'var(--color-primary)' }}
-                                            title={showReplica ? `View backend logs for replica ${m.replica_index ?? 0}` : 'View backend logs'}
-                                          >
-                                            <i className="fas fa-terminal" />
-                                          </a>
-                                        </td>
-                                        <td style={{ textAlign: 'right' }}>
-                                          <button
-                                            className="btn btn-danger btn-sm"
-                                            title={m.in_flight > 0 ? 'Unload model (has in-flight requests)' : 'Unload model'}
-                                            onClick={(e) => {
-                                              e.stopPropagation()
-                                              setConfirmUnload({
-                                                nodeId: node.id,
-                                                nodeName: node.name,
-                                                modelName: m.model_name,
-                                                inFlight: m.in_flight ?? 0,
-                                              })
-                                            }}
-                                          >
-                                            <i className="fas fa-stop" />
-                                          </button>
-                                        </td>
-                                      </tr>
-                                    )
-                                  })
-                                  })()}
-                                </tbody>
-                              </table>
-                            )}
-
-                            {/* Manage drawer: collapses three rarely-touched
-                                config zones (capacity, backends, labels)
-                                behind one disclosure so routine inspections
-                                stay focused on what's loaded above. Each
-                                zone gets a small eyebrow label instead of an
-                                h4 to avoid creating parallel hierarchies
-                                inside the disclosed area. */}
-                            <details className="node-manage" style={{ marginTop: 'var(--spacing-md)' }} onClick={e => e.stopPropagation()}>
-                              <summary style={{
-                                cursor: 'pointer', listStyle: 'none',
-                                fontSize: '0.8125rem', fontWeight: 600,
-                                color: 'var(--color-text-secondary)',
-                                padding: 'var(--spacing-xs) 0',
-                                display: 'inline-flex', alignItems: 'center', gap: 'var(--spacing-xs)',
-                              }}>
-                                <i className="fas fa-chevron-right node-manage__chevron" aria-hidden="true" />
-                                <i className="fas fa-sliders" aria-hidden="true" />
-                                Manage
-                              </summary>
-                              <div style={{ paddingTop: 'var(--spacing-md)', display: 'flex', flexDirection: 'column', gap: 'var(--spacing-lg)' }}>
-                                {/* Capacity */}
-                                <div>
-                                  <div className="drawer-eyebrow">Capacity</div>
-                                  <CapacityEditor
-                                    node={node}
-                                    loadedModelCounts={(() => {
-                                      // {modelName: replicaCount} so confirm-shrink
-                                      // can warn if reducing the cap below the actual
-                                      // count of any single model on this node.
-                                      const counts = {}
-                                      ;(models || []).forEach(m => {
-                                        if (m.state === 'loaded') counts[m.model_name] = (counts[m.model_name] || 0) + 1
-                                      })
-                                      return counts
-                                    })()}
-                                    confirmShrink={confirmShrink}
-                                    addToast={addToast}
-                                    onUpdate={() => fetchNodes()}
-                                  />
-                                </div>
-
-                                {/* Backends */}
-                                <div>
-                                  <div style={{
-                                    display: 'flex', alignItems: 'center', justifyContent: 'space-between',
-                                    marginBottom: 'var(--spacing-sm)',
-                                  }}>
-                                    <div className="drawer-eyebrow" style={{ margin: 0 }}>Backends</div>
-                                    <button
-                                      type="button"
-                                      className="btn btn-secondary btn-sm"
-                                      onClick={(e) => {
-                                        e.stopPropagation()
-                                        // Hand off to the gallery in target-node mode.
-                                        // The Backends page reads ?target=<id> and
-                                        // scopes its install action to this node —
-                                        // one gallery, two scopes, no duplicate UI.
-                                        navigate(`/app/backends?target=${encodeURIComponent(node.id)}`)
-                                      }}
-                                      title={`Install a backend on ${node.name}`}
-                                    >
-                                      <i className="fas fa-plus" /> Add backend
-                                    </button>
-                                  </div>
-                                  {!backends ? (
-                                    <LoadingSpinner size="sm" />
-                                  ) : backends.length === 0 ? (
-                                    <p style={{ fontSize: '0.8125rem', color: 'var(--color-text-muted)', margin: 0 }}>
-                                      None installed. <a href="#" style={{ color: 'var(--color-primary)' }} onClick={(e) => { e.preventDefault(); e.stopPropagation(); navigate(`/app/backends?target=${encodeURIComponent(node.id)}`) }}>Install one from the gallery</a> to schedule models here.
-                                    </p>
-                                  ) : (
-                                    <table className="table" style={{ margin: 0 }}>
-                                      <thead>
-                                        <tr>
-                                          <th>Name</th>
-                                          <th>Type</th>
-                                          <th>Installed At</th>
-                                          <th style={{ textAlign: 'right' }}>Actions</th>
-                                        </tr>
-                                      </thead>
-                                      <tbody>
-                                        {backends.map(b => (
-                                          <tr key={b.name}>
-                                            <td style={{ fontFamily: 'var(--font-mono)', fontSize: '0.8125rem' }}>
-                                              {b.name}
-                                            </td>
-                                            <td>
-                                              <span style={{
-                                                display: 'inline-block', padding: '2px 8px', borderRadius: 'var(--radius-sm)',
-                                                fontSize: '0.75rem', fontWeight: 500,
-                                                background: b.is_system ? 'var(--color-bg-tertiary)' : 'var(--color-primary-light)',
-                                                color: b.is_system ? 'var(--color-text-muted)' : 'var(--color-primary)',
-                                                border: `1px solid ${b.is_system ? 'var(--color-border-subtle)' : 'var(--color-primary-border)'}`,
-                                              }}>
-                                                {b.is_system ? 'system' : 'gallery'}
-                                              </span>
-                                            </td>
-                                            <td style={{ fontSize: '0.8125rem', color: 'var(--color-text-muted)' }}>
-                                              {b.installed_at ? timeAgo(b.installed_at) : '-'}
-                                            </td>
-                                            <td style={{ textAlign: 'right' }}>
-                                              {!b.is_system && (
-                                                <div style={{ display: 'inline-flex', gap: 'var(--spacing-xs)' }}>
-                                                  <button
-                                                    className="btn btn-secondary btn-sm"
-                                                    onClick={() => handleUpgradeBackend(node.id, b.name)}
-                                                    title="Upgrade backend on this node"
-                                                  >
-                                                    <i className="fas fa-arrow-up" />
-                                                  </button>
-                                                  <button
-                                                    className="btn btn-danger-ghost btn-sm"
-                                                    onClick={() => setConfirmDeleteBackend({ nodeId: node.id, nodeName: node.name, backend: b.name })}
-                                                    title="Delete backend from this node"
-                                                  >
-                                                    <i className="fas fa-trash" />
-                                                  </button>
-                                                </div>
-                                              )}
-                                            </td>
-                                          </tr>
-                                        ))}
-                                      </tbody>
-                                    </table>
-                                  )}
-                                </div>
-
-                                {/* Labels — same chip-builder as the scheduling
-                                    form, but onAdd/onRemove fire API calls
-                                    instead of mutating form state. node.replica-slots
-                                    is filtered out so the Capacity editor stays
-                                    the single source of truth for that label. */}
-                                <div>
-                                  <div className="drawer-eyebrow">Labels</div>
-                                  <KeyValueChips
-                                    pairs={node.labels ? Object.fromEntries(Object.entries(node.labels).filter(([k]) => k !== 'node.replica-slots')) : {}}
-                                    onAdd={(k, v) => handleAddLabel(node.id, k, v)}
-                                    onRemove={(k) => handleDeleteLabel(node.id, k)}
-                                    placeholderKey="key"
-                                    placeholderValue="value"
-                                    ariaLabel={`Labels for ${node.name}`}
-                                  />
-                                </div>
-                              </div>
-                            </details>
-                          </div>
-                        </td>
-                      </tr>
-                    )}
-                  </Fragment>
-                )
-              })}
-            </tbody>
-        </ResponsiveTable>
-      )}
-      </>}
-
-      {activeTab === 'scheduling' && (
-        <div>
-          <button className="btn btn-primary btn-sm" style={{ marginBottom: 'var(--spacing-md)' }}
-            onClick={() => setShowSchedulingForm(f => !f)}>
-            <i className="fas fa-plus" style={{ marginRight: 6 }} />
-            Add Scheduling Rule
-          </button>
-          {showSchedulingForm && <SchedulingForm onSave={async (config) => {
-            try {
-              await nodesApi.setScheduling(config)
-              fetchScheduling()
-              setShowSchedulingForm(false)
-              addToast('Scheduling rule saved', 'success')
-            } catch (err) {
-              addToast(`Failed to save rule: ${err.message}`, 'error')
-            }
-          }} onCancel={() => setShowSchedulingForm(false)} />}
-          {schedulingConfigs.length === 0 && !showSchedulingForm ? (
-            <p style={{ fontSize: '0.875rem', color: 'var(--color-text-muted)', textAlign: 'center', padding: 'var(--spacing-xl) 0' }}>
-              No scheduling rules configured. Add a rule to control how models are placed on nodes.
-            </p>
-          ) : schedulingConfigs.length > 0 && (
-            <ResponsiveTable>
-                <thead><tr>
-                  <th>Model</th>
-                  <th>Mode</th>
-                  <th>Node Selector</th>
-                  <th>Min Replicas</th>
-                  <th>Max Replicas</th>
-                  <th>Routing</th>
-                  <th>Thresholds</th>
-                  <th>Status</th>
-                  <th style={{ textAlign: 'right' }}>Actions</th>
-                </tr></thead>
-                <tbody>
-                  {schedulingConfigs.map(cfg => {
-                    const isSpread = !!cfg.spread_all
-                    const isAutoScaling = !isSpread && (cfg.min_replicas > 0 || cfg.max_replicas > 0)
-                    const hasSelector = !!cfg.node_selector
-                    const modeLabel = isSpread ? 'Spread' : isAutoScaling ? 'Auto-scaling' : hasSelector ? 'Placement' : 'Inactive'
-                    const modeColor = isSpread ? 'var(--color-warning)' : isAutoScaling ? 'var(--color-success)' : hasSelector ? 'var(--color-primary)' : 'var(--color-text-muted)'
-                    // Cooldown: reconciler tripped the circuit breaker because cluster
-                    // capacity is exhausted. Surface so the operator sees it instead
-                    // of the model silently failing to scale.
-                    const unsatisfiableUntil = cfg.unsatisfiable_until ? new Date(cfg.unsatisfiable_until) : null
-                    const isUnsatisfiable = unsatisfiableUntil && unsatisfiableUntil.getTime() > Date.now()
-                    return (
-                    <tr key={cfg.id || cfg.model_name}>
-                      <td style={{ fontWeight: 600, fontSize: '0.875rem' }}>{cfg.model_name}</td>
-                      <td>
-                        <span style={{
-                          display: 'inline-block', fontSize: '0.75rem', padding: '2px 8px', borderRadius: "var(--radius-sm)",
-                          background: 'var(--color-bg-tertiary)', border: `1px solid ${modeColor}`,
-                          color: modeColor, fontWeight: 600,
-                        }}>{modeLabel}</span>
-                      </td>
-                      <td>
-                        {cfg.node_selector ? (() => {
-                          try {
-                            const sel = typeof cfg.node_selector === 'string' ? JSON.parse(cfg.node_selector) : cfg.node_selector
-                            return Object.entries(sel).map(([k,v]) => (
-                              <span key={k} style={{
-                                display: 'inline-block', fontSize: '0.75rem', padding: '2px 6px', borderRadius: "var(--radius-sm)",
-                                background: 'var(--color-bg-tertiary)', border: '1px solid var(--color-border-subtle)',
-                                fontFamily: 'var(--font-mono)', marginRight: 4,
-                              }}>{k}={v}</span>
-                            ))
-                          } catch { return <span style={{ color: 'var(--color-text-muted)', fontSize: '0.8125rem' }}>{cfg.node_selector}</span> }
-                        })() : <span style={{ color: 'var(--color-text-muted)', fontSize: '0.8125rem' }}>Any node</span>}
-                      </td>
-                      <td style={{ fontFamily: 'var(--font-mono)' }}>
-                        {isSpread
-                          ? <span style={{
-                              display: 'inline-block', fontSize: '0.75rem', padding: '2px 8px', borderRadius: "var(--radius-sm)",
-                              background: 'var(--color-bg-tertiary)', border: '1px solid var(--color-warning)',
-                              color: 'var(--color-warning)', fontWeight: 600, fontFamily: 'var(--font-sans)',
-                            }}>Spread: all matching nodes</span>
-                          : isAutoScaling ? cfg.min_replicas : '-'}
-                      </td>
-                      <td style={{ fontFamily: 'var(--font-mono)' }}>
-                        {isSpread ? '-' : isAutoScaling ? (cfg.max_replicas || 'no limit') : '-'}
-                      </td>
-                      <td style={{ fontSize: '0.8125rem' }}>
-                        {cfg.route_policy || 'default'}
-                      </td>
-                      <td style={{ fontFamily: 'var(--font-mono)', fontSize: '0.75rem', color: 'var(--color-text-muted)' }}>
-                        {cfg.route_policy === 'prefix_cache' ? (
-                          <>
-                            <div>match: {cfg.min_prefix_match ? cfg.min_prefix_match : 'inherit'}</div>
-                            <div>abs: {cfg.balance_abs_threshold ? cfg.balance_abs_threshold : 'inherit'}</div>
-                            <div>rel: {cfg.balance_rel_threshold ? cfg.balance_rel_threshold : 'inherit'}</div>
-                          </>
-                        ) : '-'}
-                      </td>
-                      <td>
-                        {isUnsatisfiable ? (
-                          <span
-                            title={`Reconciler couldn't satisfy this rule (capacity exhausted). Will retry by ${unsatisfiableUntil.toLocaleString()}, or sooner on a node lifecycle change.`}
-                            style={{
-                              display: 'inline-block', fontSize: '0.75rem', padding: '2px 8px',
-                              borderRadius: 'var(--radius-sm)', fontWeight: 600,
-                              background: 'var(--color-bg-tertiary)',
-                              border: '1px solid var(--color-warning, #d97706)',
-                              color: 'var(--color-warning, #d97706)',
-                            }}
-                          >
-                            <i className="fas fa-exclamation-triangle" style={{ marginRight: 4 }} />
-                            Unsatisfiable until {unsatisfiableUntil.toLocaleTimeString([], { hour: '2-digit', minute: '2-digit' })}
-                          </span>
-                        ) : (
-                          <span style={{ fontSize: '0.8125rem', color: 'var(--color-text-muted)' }}>OK</span>
-                        )}
-                      </td>
-                      <td style={{ textAlign: 'right' }}>
-                        <button className="btn btn-danger btn-sm" onClick={async () => {
-                          try {
-                            await nodesApi.deleteScheduling(cfg.model_name)
-                            fetchScheduling()
-                            addToast('Rule deleted', 'success')
-                          } catch (err) {
-                            addToast(`Failed to delete rule: ${err.message}`, 'error')
-                          }
-                        }}><i className="fas fa-trash" /></button>
-                      </td>
-                    </tr>
-                    )
-                  })}
-                </tbody>
-            </ResponsiveTable>
-          )}
+        <div className="node-roster">
+          {filteredNodes.map(node => (
+            <NodePanel key={node.id} node={node} models={modelsByNode[node.id] || []}
+              onApprove={handleApprove} onDrain={handleDrain} onResume={handleResume}
+              onRemove={(n) => setConfirmDelete(n)} />
+          ))}
         </div>
       )}
 
@@ -1680,60 +369,6 @@ export default function Nodes() {
         onCancel={() => setConfirmDelete(null)}
       />
 
-      <ConfirmDialog
-        open={!!confirmDeleteBackend}
-        title="Delete Backend"
-        message={confirmDeleteBackend ? `Delete "${confirmDeleteBackend.backend}" from ${confirmDeleteBackend.nodeName}? This removes the backend files from this node only.` : ''}
-        confirmLabel="Delete"
-        danger
-        onConfirm={() => {
-          if (confirmDeleteBackend) {
-            handleDeleteBackendOnNode(confirmDeleteBackend.nodeId, confirmDeleteBackend.backend)
-          }
-          setConfirmDeleteBackend(null)
-        }}
-        onCancel={() => setConfirmDeleteBackend(null)}
-      />
-
-      <ConfirmDialog
-        open={!!confirmUnload}
-        title="Unload Model"
-        message={
-          confirmUnload
-            ? confirmUnload.inFlight > 0
-              ? `"${confirmUnload.modelName}" on ${confirmUnload.nodeName} currently has ${confirmUnload.inFlight} in-flight request(s). Unloading will interrupt them. Continue?`
-              : `Unload "${confirmUnload.modelName}" from ${confirmUnload.nodeName}?`
-            : ''
-        }
-        confirmLabel="Unload"
-        danger={confirmUnload?.inFlight > 0}
-        onConfirm={() => {
-          if (confirmUnload) {
-            handleUnloadModel(confirmUnload.nodeId, confirmUnload.modelName)
-          }
-          setConfirmUnload(null)
-        }}
-        onCancel={() => setConfirmUnload(null)}
-      />
-
-      <ConfirmDialog
-        open={!!confirmShrinkState}
-        title="Reduce replica capacity"
-        message={
-          confirmShrinkState
-            ? `${confirmShrinkState.node.name} currently has ${confirmShrinkState.currentLoaded} replica(s) of at least one model loaded. Reducing the cap to ${confirmShrinkState.newValue} won't evict anything immediately — running replicas keep going, but the reconciler will trim down on the next idle window. Continue?`
-            : ''
-        }
-        confirmLabel="Reduce"
-        onConfirm={() => {
-          confirmShrinkState?.resolve(true)
-          setConfirmShrinkState(null)
-        }}
-        onCancel={() => {
-          confirmShrinkState?.resolve(false)
-          setConfirmShrinkState(null)
-        }}
-      />
     </div>
   )
 }
diff --git a/core/http/react-ui/src/pages/Scheduling.jsx b/core/http/react-ui/src/pages/Scheduling.jsx
new file mode 100644
index 000000000..576b2215b
--- /dev/null
+++ b/core/http/react-ui/src/pages/Scheduling.jsx
@@ -0,0 +1,438 @@
+import { useState, useEffect, useCallback } from 'react'
+import { useOutletContext } from 'react-router-dom'
+import { useTranslation } from 'react-i18next'
+import { nodesApi } from '../utils/api'
+import PageHeader from '../components/PageHeader'
+import ConfirmDialog from '../components/ConfirmDialog'
+import ResponsiveTable from '../components/ResponsiveTable'
+import SearchableModelSelect from '../components/SearchableModelSelect'
+import KeyValueChips from '../components/nodes/KeyValueChips'
+
+// Numeric input with quick-pick preset chips. Picked over a slider because
+// replica counts are exact specs (operator math), not fuzzy estimates. The
+// chips give one-click access to common values without the slider's
+// precision/special-value problems (e.g. MaxReplicas=0 = "no limit").
+function ReplicaInput({ id, label, value, onChange, presets }) {
+  return (
+    <div style={{ flex: 1 }}>
+      <label className="form-label" htmlFor={id}>{label}</label>
+      <input
+        id={id}
+        className="input"
+        type="number"
+        min={0}
+        value={value}
+        onChange={e => onChange(parseInt(e.target.value) || 0)}
+      />
+      <div style={{ display: 'flex', gap: 4, flexWrap: 'wrap', marginTop: 6 }}>
+        {presets.map(({ v, l }) => {
+          const active = value === v
+          return (
+            <button
+              key={v}
+              type="button"
+              onClick={() => onChange(v)}
+              aria-pressed={active}
+              className="cell-mono"
+              style={{
+                padding: '2px 8px',
+                borderRadius: 'var(--radius-sm)',
+                fontSize: '0.6875rem',
+                fontWeight: 500,
+                cursor: 'pointer',
+                background: active ? 'var(--color-primary-light)' : 'transparent',
+                border: `1px solid ${active ? 'var(--color-primary-border)' : 'var(--color-border-subtle)'}`,
+                color: active ? 'var(--color-primary)' : 'var(--color-text-muted)',
+              }}
+            >{l || v}</button>
+          )
+        })}
+      </div>
+    </div>
+  )
+}
+
+function SchedulingForm({ onSave, onCancel }) {
+  const [mode, setMode] = useState('placement')
+  const [modelName, setModelName] = useState('')
+  // Selector is now a chip-builder map instead of a comma-separated string.
+  // Operators were copying syntax from docs and missing commas; the chip UI
+  // makes the key=value structure self-documenting.
+  const [selector, setSelector] = useState({})
+  const [minReplicas, setMinReplicas] = useState(1)
+  const [maxReplicas, setMaxReplicas] = useState(0)
+  // Prefix-cache routing controls. Empty routePolicy means "inherit the
+  // cluster default"; the three thresholds at 0 likewise inherit, so they
+  // stay out of the POST body's effective override only when explicitly set.
+  const [routePolicy, setRoutePolicy] = useState('')
+  const [balanceAbsThreshold, setBalanceAbsThreshold] = useState(0)
+  const [balanceRelThreshold, setBalanceRelThreshold] = useState(0)
+  const [minPrefixMatch, setMinPrefixMatch] = useState(0)
+
+  const hasSelector = Object.keys(selector).length > 0
+
+  const isValid = () => {
+    if (!modelName) return false
+    if (mode === 'placement') return hasSelector
+    if (mode === 'spread') return true
+    return minReplicas > 0 || maxReplicas > 0
+  }
+
+  const handleSubmit = () => {
+    onSave({
+      model_name: modelName,
+      node_selector: hasSelector ? selector : undefined,
+      min_replicas: mode === 'autoscaling' ? minReplicas : 0,
+      max_replicas: mode === 'autoscaling' ? maxReplicas : 0,
+      spread_all: mode === 'spread',
+      route_policy: routePolicy,
+      balance_abs_threshold: balanceAbsThreshold,
+      balance_rel_threshold: balanceRelThreshold,
+      min_prefix_match: minPrefixMatch,
+    })
+  }
+
+  return (
+    <div className="card" style={{ padding: 'var(--spacing-lg)', marginBottom: 'var(--spacing-md)' }}>
+      {/* Mode selector — uses the project's segmented control instead of two
+          50%-width filled buttons that competed visually with the actual
+          primary action (Save). */}
+      <div role="radiogroup" aria-label="Scheduling mode" className="segmented" style={{ marginBottom: 'var(--spacing-xs)' }}>
+        <button
+          type="button" role="radio" aria-checked={mode === 'placement'}
+          className={`segmented__item${mode === 'placement' ? ' is-active' : ''}`}
+          onClick={() => setMode('placement')}
+        >
+          <i className="fas fa-thumbtack" aria-hidden="true" /> Pin to nodes
+        </button>
+        <button
+          type="button" role="radio" aria-checked={mode === 'autoscaling'}
+          className={`segmented__item${mode === 'autoscaling' ? ' is-active' : ''}`}
+          onClick={() => setMode('autoscaling')}
+        >
+          <i className="fas fa-arrows-up-down" aria-hidden="true" /> Auto-scale
+        </button>
+        <button
+          type="button" role="radio" aria-checked={mode === 'spread'}
+          className={`segmented__item${mode === 'spread' ? ' is-active' : ''}`}
+          onClick={() => setMode('spread')}
+        >
+          <i className="fas fa-network-wired" aria-hidden="true" /> Spread to all
+        </button>
+      </div>
+      <p style={{ fontSize: '0.8125rem', color: 'var(--color-text-muted)', margin: '0 0 var(--spacing-lg) 0' }}>
+        {mode === 'placement'
+          ? 'Restrict this model to specific nodes. Loaded on demand, evictable when idle.'
+          : mode === 'spread'
+          ? 'Run one replica on every node matching the selector (all healthy nodes when empty). Tracks nodes joining and leaving.'
+          : 'Maintain a target replica count across the cluster. Min ≥ 1 protects from eviction.'}
+      </p>
+
+      {/* Linear vertical flow — model picker is the visual focus, then the
+          mode-specific fields below. No 2-column grid (the mismatched widths
+          made the form look raw). */}
+      <div style={{ display: 'flex', flexDirection: 'column', gap: 'var(--spacing-md)' }}>
+        <div>
+          <label className="form-label" htmlFor="sched-model">Model</label>
+          {/* Searchable combobox so a long gallery doesn't force the operator
+              to scroll through hundreds of entries. Free-text is allowed —
+              you can pre-create a rule for a model that hasn't been
+              installed yet, which is a real workflow when standing up a new
+              node and pre-staging its scheduling policy. */}
+          <SearchableModelSelect
+            value={modelName}
+            onChange={setModelName}
+            placeholder="Type to search models, or paste a name..."
+          />
+        </div>
+
+        <div>
+          <label className="form-label">
+            Node selector{mode === 'placement' ? '' : ' (optional)'}
+          </label>
+          <KeyValueChips
+            pairs={selector}
+            onAdd={(k, v) => setSelector(prev => ({ ...prev, [k]: v }))}
+            onRemove={(k) => setSelector(prev => { const n = { ...prev }; delete n[k]; return n })}
+            placeholderKey="key (e.g. gpu.vendor)"
+            placeholderValue="value (e.g. nvidia)"
+            ariaLabel="Node selector"
+          />
+          <span style={{ fontSize: '0.75rem', color: 'var(--color-text-muted)', display: 'block', marginTop: 6 }}>
+            {mode === 'placement'
+              ? 'Models will load only on nodes that match all listed labels.'
+              : (hasSelector ? 'Replicas land only on matching nodes.' : 'Empty = any healthy node.')}
+          </span>
+        </div>
+
+        {mode === 'autoscaling' && (
+          <div style={{ display: 'flex', gap: 'var(--spacing-md)' }}>
+            <ReplicaInput
+              id="sched-min"
+              label="Min replicas"
+              value={minReplicas}
+              onChange={setMinReplicas}
+              presets={[{ v: 1 }, { v: 2 }, { v: 3 }, { v: 4 }]}
+            />
+            <ReplicaInput
+              id="sched-max"
+              label="Max replicas"
+              value={maxReplicas}
+              onChange={setMaxReplicas}
+              presets={[{ v: 0, l: 'no limit' }, { v: 2 }, { v: 4 }, { v: 8 }]}
+            />
+          </div>
+        )}
+
+        {/* Per-model routing policy. Left empty/zero these inherit the
+            cluster-wide defaults; set them to override how requests for this
+            model are spread across replicas. */}
+        <div>
+          <label className="form-label" htmlFor="sched-route-policy">Routing policy</label>
+          <select
+            id="sched-route-policy"
+            className="input"
+            value={routePolicy}
+            onChange={e => setRoutePolicy(e.target.value)}
+          >
+            <option value="">Default (cluster setting)</option>
+            <option value="round_robin">Round Robin</option>
+            <option value="prefix_cache">Prefix Cache</option>
+          </select>
+          <span style={{ fontSize: '0.75rem', color: 'var(--color-text-muted)', display: 'block', marginTop: 6 }}>
+            Prefix Cache routes shared-prefix requests to the same replica to reuse its KV cache, falling back to round-robin when replicas are imbalanced.
+          </span>
+        </div>
+
+        {routePolicy === 'prefix_cache' && (
+          <div style={{ display: 'flex', gap: 'var(--spacing-md)' }}>
+            <div style={{ flex: 1 }}>
+              <label className="form-label" htmlFor="sched-min-prefix-match">Min prefix match</label>
+              <input
+                id="sched-min-prefix-match"
+                className="input"
+                type="number"
+                step="0.05"
+                min="0"
+                max="1"
+                value={minPrefixMatch}
+                onChange={e => setMinPrefixMatch(parseFloat(e.target.value) || 0)}
+              />
+              <span style={{ fontSize: '0.75rem', color: 'var(--color-text-muted)', display: 'block', marginTop: 6 }}>
+                Fraction of the prompt (0..1) that must match a cached prefix before affinity kicks in. 0 inherits the default.
+              </span>
+            </div>
+            <div style={{ flex: 1 }}>
+              <label className="form-label" htmlFor="sched-balance-abs">Balance abs threshold</label>
+              <input
+                id="sched-balance-abs"
+                className="input"
+                type="number"
+                min="0"
+                value={balanceAbsThreshold}
+                onChange={e => setBalanceAbsThreshold(parseInt(e.target.value) || 0)}
+              />
+              <span style={{ fontSize: '0.75rem', color: 'var(--color-text-muted)', display: 'block', marginTop: 6 }}>
+                Max absolute in-flight gap allowed before falling back to round-robin. 0 inherits the default.
+              </span>
+            </div>
+            <div style={{ flex: 1 }}>
+              <label className="form-label" htmlFor="sched-balance-rel">Balance rel threshold</label>
+              <input
+                id="sched-balance-rel"
+                className="input"
+                type="number"
+                step="0.1"
+                min="0"
+                value={balanceRelThreshold}
+                onChange={e => setBalanceRelThreshold(parseFloat(e.target.value) || 0)}
+              />
+              <span style={{ fontSize: '0.75rem', color: 'var(--color-text-muted)', display: 'block', marginTop: 6 }}>
+                Max relative in-flight ratio (&gt;= 1) allowed before falling back to round-robin. 0 inherits the default.
+              </span>
+            </div>
+          </div>
+        )}
+      </div>
+
+      {/* Hairline divider above the actions, matching the project's form pattern. */}
+      <div style={{
+        display: 'flex', gap: 'var(--spacing-sm)', justifyContent: 'flex-end',
+        marginTop: 'var(--spacing-lg)', paddingTop: 'var(--spacing-md)',
+        borderTop: '1px solid var(--color-border-subtle)',
+      }}>
+        <button className="btn btn-secondary btn-sm" onClick={onCancel}>Cancel</button>
+        <button className="btn btn-primary btn-sm" onClick={handleSubmit} disabled={!isValid()}>Save rule</button>
+      </div>
+    </div>
+  )
+}
+
+export default function Scheduling() {
+  const { addToast } = useOutletContext()
+  const { t } = useTranslation('admin')
+  const [schedulingConfigs, setSchedulingConfigs] = useState([])
+  const [showForm, setShowForm] = useState(false)
+  const [confirmDelete, setConfirmDelete] = useState(null)
+
+  const fetchScheduling = useCallback(async () => {
+    try {
+      const data = await nodesApi.listScheduling()
+      setSchedulingConfigs(Array.isArray(data) ? data : [])
+    } catch { setSchedulingConfigs([]) }
+  }, [])
+
+  useEffect(() => { fetchScheduling() }, [fetchScheduling])
+
+  const handleSave = async (config) => {
+    try {
+      await nodesApi.setScheduling(config)
+      addToast('Scheduling rule saved', 'success')
+      setShowForm(false)
+      fetchScheduling()
+    } catch (err) { addToast(`Failed to save rule: ${err.message}`, 'error') }
+  }
+
+  const handleDelete = async (model) => {
+    try {
+      await nodesApi.deleteScheduling(model)
+      addToast('Scheduling rule removed', 'success')
+      setConfirmDelete(null)
+      fetchScheduling()
+    } catch (err) { addToast(`Failed to remove rule: ${err.message}`, 'error') }
+  }
+
+  return (
+    <div className="page page--wide">
+      <PageHeader
+        title={<><i className="fas fa-calendar-alt" style={{ marginRight: 'var(--spacing-sm)' }} />{t('scheduling.title')}</>}
+        supporting={t('scheduling.subtitle')}
+      />
+      <div>
+        <button className="btn btn-primary btn-sm" style={{ marginBottom: 'var(--spacing-md)' }}
+          onClick={() => setShowForm(f => !f)}>
+          <i className="fas fa-plus" style={{ marginRight: 6 }} />
+          Add Scheduling Rule
+        </button>
+        {showForm && <SchedulingForm onSave={handleSave} onCancel={() => setShowForm(false)} />}
+        {schedulingConfigs.length === 0 && !showForm ? (
+          <p style={{ fontSize: '0.875rem', color: 'var(--color-text-muted)', textAlign: 'center', padding: 'var(--spacing-xl) 0' }}>
+            No scheduling rules configured. Add a rule to control how models are placed on nodes.
+          </p>
+        ) : schedulingConfigs.length > 0 && (
+          <ResponsiveTable>
+              <thead><tr>
+                <th>Model</th>
+                <th>Mode</th>
+                <th>Node Selector</th>
+                <th>Min Replicas</th>
+                <th>Max Replicas</th>
+                <th>Routing</th>
+                <th>Thresholds</th>
+                <th>Status</th>
+                <th style={{ textAlign: 'right' }}>Actions</th>
+              </tr></thead>
+              <tbody>
+                {schedulingConfigs.map(cfg => {
+                  const isSpread = !!cfg.spread_all
+                  const isAutoScaling = !isSpread && (cfg.min_replicas > 0 || cfg.max_replicas > 0)
+                  const hasSelector = !!cfg.node_selector
+                  const modeLabel = isSpread ? 'Spread' : isAutoScaling ? 'Auto-scaling' : hasSelector ? 'Placement' : 'Inactive'
+                  const modeColor = isSpread ? 'var(--color-warning)' : isAutoScaling ? 'var(--color-success)' : hasSelector ? 'var(--color-primary)' : 'var(--color-text-muted)'
+                  // Cooldown: reconciler tripped the circuit breaker because cluster
+                  // capacity is exhausted. Surface so the operator sees it instead
+                  // of the model silently failing to scale.
+                  const unsatisfiableUntil = cfg.unsatisfiable_until ? new Date(cfg.unsatisfiable_until) : null
+                  const isUnsatisfiable = unsatisfiableUntil && unsatisfiableUntil.getTime() > Date.now()
+                  return (
+                  <tr key={cfg.id || cfg.model_name}>
+                    <td style={{ fontWeight: 600, fontSize: '0.875rem' }}>{cfg.model_name}</td>
+                    <td>
+                      <span style={{
+                        display: 'inline-block', fontSize: '0.75rem', padding: '2px 8px', borderRadius: "var(--radius-sm)",
+                        background: 'var(--color-bg-tertiary)', border: `1px solid ${modeColor}`,
+                        color: modeColor, fontWeight: 600,
+                      }}>{modeLabel}</span>
+                    </td>
+                    <td>
+                      {cfg.node_selector ? (() => {
+                        try {
+                          const sel = typeof cfg.node_selector === 'string' ? JSON.parse(cfg.node_selector) : cfg.node_selector
+                          return Object.entries(sel).map(([k,v]) => (
+                            <span key={k} style={{
+                              display: 'inline-block', fontSize: '0.75rem', padding: '2px 6px', borderRadius: "var(--radius-sm)",
+                              background: 'var(--color-bg-tertiary)', border: '1px solid var(--color-border-subtle)',
+                              fontFamily: 'var(--font-mono)', marginRight: 4,
+                            }}>{k}={v}</span>
+                          ))
+                        } catch { return <span style={{ color: 'var(--color-text-muted)', fontSize: '0.8125rem' }}>{cfg.node_selector}</span> }
+                      })() : <span style={{ color: 'var(--color-text-muted)', fontSize: '0.8125rem' }}>Any node</span>}
+                    </td>
+                    <td style={{ fontFamily: 'var(--font-mono)' }}>
+                      {isSpread
+                        ? <span style={{
+                            display: 'inline-block', fontSize: '0.75rem', padding: '2px 8px', borderRadius: "var(--radius-sm)",
+                            background: 'var(--color-bg-tertiary)', border: '1px solid var(--color-warning)',
+                            color: 'var(--color-warning)', fontWeight: 600, fontFamily: 'var(--font-sans)',
+                          }}>Spread: all matching nodes</span>
+                        : isAutoScaling ? cfg.min_replicas : '-'}
+                    </td>
+                    <td style={{ fontFamily: 'var(--font-mono)' }}>
+                      {isSpread ? '-' : isAutoScaling ? (cfg.max_replicas || 'no limit') : '-'}
+                    </td>
+                    <td style={{ fontSize: '0.8125rem' }}>
+                      {cfg.route_policy || 'default'}
+                    </td>
+                    <td style={{ fontFamily: 'var(--font-mono)', fontSize: '0.75rem', color: 'var(--color-text-muted)' }}>
+                      {cfg.route_policy === 'prefix_cache' ? (
+                        <>
+                          <div>match: {cfg.min_prefix_match ? cfg.min_prefix_match : 'inherit'}</div>
+                          <div>abs: {cfg.balance_abs_threshold ? cfg.balance_abs_threshold : 'inherit'}</div>
+                          <div>rel: {cfg.balance_rel_threshold ? cfg.balance_rel_threshold : 'inherit'}</div>
+                        </>
+                      ) : '-'}
+                    </td>
+                    <td>
+                      {isUnsatisfiable ? (
+                        <span
+                          title={`Reconciler couldn't satisfy this rule (capacity exhausted). Will retry by ${unsatisfiableUntil.toLocaleString()}, or sooner on a node lifecycle change.`}
+                          style={{
+                            display: 'inline-block', fontSize: '0.75rem', padding: '2px 8px',
+                            borderRadius: 'var(--radius-sm)', fontWeight: 600,
+                            background: 'var(--color-bg-tertiary)',
+                            border: '1px solid var(--color-warning, #d97706)',
+                            color: 'var(--color-warning, #d97706)',
+                          }}
+                        >
+                          <i className="fas fa-exclamation-triangle" style={{ marginRight: 4 }} />
+                          Unsatisfiable until {unsatisfiableUntil.toLocaleTimeString([], { hour: '2-digit', minute: '2-digit' })}
+                        </span>
+                      ) : (
+                        <span style={{ fontSize: '0.8125rem', color: 'var(--color-text-muted)' }}>OK</span>
+                      )}
+                    </td>
+                    <td style={{ textAlign: 'right' }}>
+                      <button className="btn btn-danger btn-sm" onClick={() => setConfirmDelete(cfg.model_name)}>
+                        <i className="fas fa-trash" />
+                      </button>
+                    </td>
+                  </tr>
+                  )
+                })}
+              </tbody>
+          </ResponsiveTable>
+        )}
+      </div>
+
+      <ConfirmDialog
+        open={!!confirmDelete}
+        title="Remove scheduling rule"
+        message={confirmDelete ? `Remove the scheduling rule for "${confirmDelete}"?` : ''}
+        confirmLabel="Remove"
+        danger
+        onConfirm={() => confirmDelete && handleDelete(confirmDelete)}
+        onCancel={() => setConfirmDelete(null)}
+      />
+    </div>
+  )
+}
diff --git a/core/http/react-ui/src/router.jsx b/core/http/react-ui/src/router.jsx
index d8e9b34d4..03962d27e 100644
--- a/core/http/react-ui/src/router.jsx
+++ b/core/http/react-ui/src/router.jsx
@@ -69,7 +69,9 @@ const Studio = page('studio', () => import('./pages/Studio'))
 const FaceRecognition = page('face', () => import('./pages/FaceRecognition'))
 const VoiceRecognition = page('voice', () => import('./pages/VoiceRecognition'))
 const Nodes = page('nodes', () => import('./pages/Nodes'))
+const Scheduling = page('scheduling', () => import('./pages/Scheduling'))
 const NodeBackendLogs = page(null, () => import('./pages/NodeBackendLogs'))
+const NodeDetail = page(null, () => import('./pages/NodeDetail'))
 const NotFound = page(null, () => import('./pages/NotFound'))
 const Usage = page('usage', () => import('./pages/Usage'))
 const Users = page('users', () => import('./pages/Users'))
@@ -152,6 +154,8 @@ const appChildren = [
       { path: 'backend-logs/:modelId', element: <Admin><BackendLogs /></Admin> },
       { path: 'p2p', element: <Admin><P2P /></Admin> },
       { path: 'nodes', element: <Admin><Nodes /></Admin> },
+      { path: 'nodes/:id', element: <Admin><NodeDetail /></Admin> },
+      { path: 'scheduling', element: <Admin><Scheduling /></Admin> },
       { path: 'node-backend-logs/:nodeId/:modelId', element: <Admin><NodeBackendLogs /></Admin> },
       { path: 'usage', element: <Usage /> },
       { path: 'users', element: <RequireAuthEnabled><Admin><Users /></Admin></RequireAuthEnabled> },
diff --git a/core/http/react-ui/src/utils/api.js b/core/http/react-ui/src/utils/api.js
index 20bb90363..8da0bffbd 100644
--- a/core/http/react-ui/src/utils/api.js
+++ b/core/http/react-ui/src/utils/api.js
@@ -568,6 +568,7 @@ export const nodesApi = {
     method: 'DELETE',
   }),
   listScheduling: () => fetchJSON(API_CONFIG.endpoints.nodesScheduling),
+  allModels: () => fetchJSON(API_CONFIG.endpoints.nodesModels),
   setScheduling: (config) => postJSON(API_CONFIG.endpoints.nodesScheduling, config),
   deleteScheduling: (model) => fetchJSON(API_CONFIG.endpoints.nodesSchedulingModel(model), { method: 'DELETE' }),
 }
diff --git a/core/http/react-ui/src/utils/config.js b/core/http/react-ui/src/utils/config.js
index 65797fe41..d3db6ce2a 100644
--- a/core/http/react-ui/src/utils/config.js
+++ b/core/http/react-ui/src/utils/config.js
@@ -144,6 +144,7 @@ export const API_CONFIG = {
     nodeLabelKey: (id, key) => `/api/nodes/${id}/labels/${key}`,
     nodeMaxReplicasPerModel: (id) => `/api/nodes/${id}/max-replicas-per-model`,
     nodesScheduling: '/api/nodes/scheduling',
+    nodesModels: '/api/nodes/models',
     nodesSchedulingModel: (model) => `/api/nodes/scheduling/${encodeURIComponent(model)}`,
   },
 }
diff --git a/core/http/routes/nodes.go b/core/http/routes/nodes.go
index d6f5b8dab..f6a2124b8 100644
--- a/core/http/routes/nodes.go
+++ b/core/http/routes/nodes.go
@@ -71,6 +71,9 @@ func RegisterNodeAdminRoutes(e *echo.Echo, registry *nodes.NodeRegistry, unloade
 	admin := e.Group("/api/nodes", readyMw, adminMw)
 	admin.GET("", localai.ListNodesEndpoint(registry))
 
+	// Cluster-wide loaded models (registered before /:id to avoid route conflicts)
+	admin.GET("/models", localai.ListAllNodeModelsEndpoint(registry))
+
 	// Model scheduling (registered before /:id to avoid route conflicts)
 	admin.GET("/scheduling", localai.ListSchedulingEndpoint(registry))
 	admin.GET("/scheduling/:model", localai.GetSchedulingEndpoint(registry))
diff --git a/docs/content/features/distributed-mode.md b/docs/content/features/distributed-mode.md
index 8add5f859..e5a0b790a 100644
--- a/docs/content/features/distributed-mode.md
+++ b/docs/content/features/distributed-mode.md
@@ -311,7 +311,7 @@ Used by the WebUI and admin API consumers. Requires admin authentication.
 | `POST` | `/api/nodes/:id/models/unload` | Unload a model from a worker |
 | `POST` | `/api/nodes/:id/models/delete` | Delete model files from a worker |
 
-The **Nodes** page in the React WebUI provides a visual overview of all registered workers, their statuses, and loaded models.
+The **Nodes** page in the React WebUI provides a visual overview of all registered workers, their statuses, and loaded models. The page opens with a one-line **cluster pulse** summarising node health and an **attention callout** that surfaces nodes needing action (for example pending approvals). Below that, a roster of **node panels** lists each worker with its inline model chips (no expand click needed), filtered by an **All / Backend / Agent** segmented control. Selecting a panel opens a dedicated **node detail page** at `/app/nodes/:id` with per-node metrics, models, and backend actions. Model scheduling lives on its own **Scheduling** page (separate nav item), not as a tab on the Nodes page.
 
 ## Node Approval
 
@@ -554,7 +554,7 @@ local-ai worker \
 
 ## Model Scheduling
 
-Model scheduling controls where models are placed and how many replicas are maintained. It combines two optional features:
+Model scheduling controls where models are placed and how many replicas are maintained. In the React WebUI it has its own **Scheduling** page (a top-level nav item, separate from the Nodes page). It combines two optional features:
 
 ### Node Selectors
 
diff --git a/swagger/docs.go b/swagger/docs.go
index e01761643..e7b6b9acf 100644
--- a/swagger/docs.go
+++ b/swagger/docs.go
@@ -1021,6 +1021,25 @@ const docTemplate = `{
                 }
             }
         },
+        "/api/nodes/models": {
+            "get": {
+                "tags": [
+                    "Nodes"
+                ],
+                "summary": "List all loaded models cluster-wide",
+                "responses": {
+                    "200": {
+                        "description": "OK",
+                        "schema": {
+                            "type": "array",
+                            "items": {
+                                "$ref": "#/definitions/nodes.NodeModel"
+                            }
+                        }
+                    }
+                }
+            }
+        },
         "/api/nodes/{id}/max-replicas-per-model": {
             "put": {
                 "tags": [
@@ -3754,6 +3773,52 @@ const docTemplate = `{
                 }
             }
         },
+        "nodes.NodeModel": {
+            "type": "object",
+            "properties": {
+                "address": {
+                    "description": "gRPC address for this replica's backend process",
+                    "type": "string"
+                },
+                "backend_type": {
+                    "description": "e.g. \"llama-cpp\"; used by reconciler to replicate loads",
+                    "type": "string"
+                },
+                "created_at": {
+                    "type": "string"
+                },
+                "id": {
+                    "type": "string"
+                },
+                "in_flight": {
+                    "description": "number of active requests on this replica",
+                    "type": "integer"
+                },
+                "last_used": {
+                    "type": "string"
+                },
+                "loading_by": {
+                    "description": "frontend ID that triggered loading",
+                    "type": "string"
+                },
+                "model_name": {
+                    "type": "string"
+                },
+                "node_id": {
+                    "type": "string"
+                },
+                "replica_index": {
+                    "type": "integer"
+                },
+                "state": {
+                    "description": "loading, loaded, unloading, idle",
+                    "type": "string"
+                },
+                "updated_at": {
+                    "type": "string"
+                }
+            }
+        },
         "proto.MemoryUsageData": {
             "type": "object",
             "properties": {
diff --git a/swagger/swagger.json b/swagger/swagger.json
index 5fc4ac638..4f9695bb1 100644
--- a/swagger/swagger.json
+++ b/swagger/swagger.json
@@ -1018,6 +1018,25 @@
                 }
             }
         },
+        "/api/nodes/models": {
+            "get": {
+                "tags": [
+                    "Nodes"
+                ],
+                "summary": "List all loaded models cluster-wide",
+                "responses": {
+                    "200": {
+                        "description": "OK",
+                        "schema": {
+                            "type": "array",
+                            "items": {
+                                "$ref": "#/definitions/nodes.NodeModel"
+                            }
+                        }
+                    }
+                }
+            }
+        },
         "/api/nodes/{id}/max-replicas-per-model": {
             "put": {
                 "tags": [
@@ -3751,6 +3770,52 @@
                 }
             }
         },
+        "nodes.NodeModel": {
+            "type": "object",
+            "properties": {
+                "address": {
+                    "description": "gRPC address for this replica's backend process",
+                    "type": "string"
+                },
+                "backend_type": {
+                    "description": "e.g. \"llama-cpp\"; used by reconciler to replicate loads",
+                    "type": "string"
+                },
+                "created_at": {
+                    "type": "string"
+                },
+                "id": {
+                    "type": "string"
+                },
+                "in_flight": {
+                    "description": "number of active requests on this replica",
+                    "type": "integer"
+                },
+                "last_used": {
+                    "type": "string"
+                },
+                "loading_by": {
+                    "description": "frontend ID that triggered loading",
+                    "type": "string"
+                },
+                "model_name": {
+                    "type": "string"
+                },
+                "node_id": {
+                    "type": "string"
+                },
+                "replica_index": {
+                    "type": "integer"
+                },
+                "state": {
+                    "description": "loading, loaded, unloading, idle",
+                    "type": "string"
+                },
+                "updated_at": {
+                    "type": "string"
+                }
+            }
+        },
         "proto.MemoryUsageData": {
             "type": "object",
             "properties": {
diff --git a/swagger/swagger.yaml b/swagger/swagger.yaml
index f83ef14e8..cbb17d719 100644
--- a/swagger/swagger.yaml
+++ b/swagger/swagger.yaml
@@ -422,6 +422,38 @@ definitions:
       vram_display:
         type: string
     type: object
+  nodes.NodeModel:
+    properties:
+      address:
+        description: gRPC address for this replica's backend process
+        type: string
+      backend_type:
+        description: e.g. "llama-cpp"; used by reconciler to replicate loads
+        type: string
+      created_at:
+        type: string
+      id:
+        type: string
+      in_flight:
+        description: number of active requests on this replica
+        type: integer
+      last_used:
+        type: string
+      loading_by:
+        description: frontend ID that triggered loading
+        type: string
+      model_name:
+        type: string
+      node_id:
+        type: string
+      replica_index:
+        type: integer
+      state:
+        description: loading, loaded, unloading, idle
+        type: string
+      updated_at:
+        type: string
+    type: object
   proto.MemoryUsageData:
     properties:
       breakdown:
@@ -3221,6 +3253,18 @@ paths:
       summary: Update a node's max replicas per model
       tags:
       - Nodes
+  /api/nodes/models:
+    get:
+      responses:
+        "200":
+          description: OK
+          schema:
+            items:
+              $ref: '#/definitions/nodes.NodeModel'
+            type: array
+      summary: List all loaded models cluster-wide
+      tags:
+      - Nodes
   /api/p2p:
     get:
       responses:

From 63bcbf6c12b400a6fd0990b1953d9340701629f0 Mon Sep 17 00:00:00 2001
From: Richard Palethorpe <io@richiejp.com>
Date: Mon, 22 Jun 2026 17:26:19 +0100
Subject: [PATCH 49/99] fix(pii): post-merge review fixes + live NER e2e for
 the privacy-filter tier (#10401)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix(pii): post-merge review fixes + live NER e2e for the privacy-filter tier

Follow-up to the NER tier engine (#10360), already on master. This carries
only the incremental review fixes and tests that postdate that merge — the
feature itself is not re-introduced.

Review fixes:
- openai_completion.go: remove the dead `elem >= 0` conjunct in applyAnyText
  (the `elem < 0` guard above already returns).
- application.go: collapse ResolvePIIPolicy's inline re-implementation of
  PIIIsEnabled to a single cfg.PIIIsEnabled() call (sole source of the
  "explicit pii.enabled wins, else cloud-proxy default" rule) and return true
  past the !enabled guard where it is provable.
- pattern.go: hoist the triple `appConfig != nil && EnableTracing` check in
  patternDetector.Detect into one local.
- grammar.go: MaxQuantifier was 4096, but Go's regexp/syntax rejects repeat
  bounds above 1000 at Parse time, so walk()'s {n,m} guard could never fire —
  dead code shadowed by the parser. Lower it to 512 so a bound in (512,1000]
  is rejected here with an actionable error; >1000 still fails closed via
  Parse. Specs pin the relationship so the guard can't silently revert.
- PatternListEditor.jsx: clamp a directly-typed negative min_len to >=0 and
  force the DOM value back when clamping (min={0} only constrained the spinner,
  so a negative reached saved config and silently disabled the length filter).

Tests:
- piipattern_test.go: MaxQuantifier guard specs (must stay live, not dead).
- model-config.spec.js: assert the min_len clamp, and that entity_actions
  collapses a duplicate group to a single row (map semantics; regression guard
  against emitting an array that drops a row on save).
- tests/e2e-backends: token_classify capability driving the TokenClassify gRPC
  RPC against the backend image, asserting byte-correct, UTF-8 rune-aligned
  spans (entity.Text == text[start:end]) at threshold 0. Verified on CPU via
  `make test-extra-backend-privacy-filter` (3/3 specs).
- Makefile: test-extra-backend-privacy-filter wrapper.
- tests/e2e: e2e_pii_ner_test.go drives /api/pii/analyze + /api/pii/redact
  (mask + block) through the full HTTP -> detector -> redactor path; gated on
  PII_NER_MODEL_GGUF so the default suite is unaffected.
- .github/workflows/tests-pii-ner-e2e.yml: path-filtered / nightly CI job
  running the container harness on CPU.

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Richard Palethorpe <io@richiejp.com>

* feat(gallery): add privacy-filter-nemotron (f16 + q8)

GGUF conversions of OpenMed/privacy-filter-nemotron — a fine-grained English
PII token-classifier (55 categories / 221 BIOES classes), fine-tuned from
openai/privacy-filter on NVIDIA's Nemotron-PII dataset. Sibling to the existing
privacy-filter-multilingual entry, trading language breadth for category depth.

- privacy-filter-nemotron: F16 reference artifact (~2.8 GB).
- privacy-filter-nemotron-q8: Q8_0 quant (~1.64 GB) for RAM-constrained / edge
  use; description notes the size/speed tradeoff and to validate on your own
  data (a single dropped span is a PII leak).

Both run on the privacy-filter backend with known_usecases [token_classify] and
a default mask policy (min_score 0.5); operators add per-category entity_actions
as needed. sha256s taken from the HF repo's LFS object ids.

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Richard Palethorpe <io@richiejp.com>

---------

Signed-off-by: Richard Palethorpe <io@richiejp.com>
---
 .github/workflows/tests-pii-ner-e2e.yml       |  97 +++++++++
 Makefile                                      |  10 +
 core/application/application.go               |  10 +-
 core/http/react-ui/e2e/model-config.spec.js   |  48 +++++
 .../src/components/PatternListEditor.jsx      |  13 +-
 .../routing/piiadapter/openai_completion.go   |   2 +-
 core/services/routing/piidetector/pattern.go  |   7 +-
 core/services/routing/piipattern/grammar.go   |  14 +-
 .../routing/piipattern/piipattern_test.go     |  40 ++++
 gallery/index.yaml                            |  92 +++++++++
 tests/e2e-backends/backend_test.go            |  81 ++++++--
 tests/e2e/e2e_pii_ner_test.go                 | 186 ++++++++++++++++++
 tests/e2e/e2e_suite_test.go                   |  43 ++++
 13 files changed, 608 insertions(+), 35 deletions(-)
 create mode 100644 .github/workflows/tests-pii-ner-e2e.yml
 create mode 100644 tests/e2e/e2e_pii_ner_test.go

diff --git a/.github/workflows/tests-pii-ner-e2e.yml b/.github/workflows/tests-pii-ner-e2e.yml
new file mode 100644
index 000000000..2f95f3f46
--- /dev/null
+++ b/.github/workflows/tests-pii-ner-e2e.yml
@@ -0,0 +1,97 @@
+---
+name: 'PII NER tier E2E (live GGUF, CPU)'
+
+# Runs the real privacy-filter GGUF NER tier end-to-end on CPU — the gap the
+# hermetic tests/e2e suite cannot cover (it only exercises the in-process
+# pattern tier). Heavy (builds the C++ backend image + downloads a ~2.7 GB
+# GGUF), so it is path-filtered on PRs and otherwise runs nightly / on demand.
+#
+# This drives the container-level harness (tests/e2e-backends) via
+# `make test-extra-backend-privacy-filter`: it builds the privacy-filter image,
+# downloads the model, loads it on CPU, and asserts byte-correct, UTF-8-aligned
+# TokenClassify spans. The complementary HTTP-path specs in tests/e2e
+# (e2e_pii_ner_test.go) Skip unless PII_NER_MODEL_GGUF is wired.
+
+on:
+  workflow_dispatch:
+  schedule:
+    - cron: '0 3 * * *'
+  push:
+    branches:
+      - master
+    paths:
+      - 'backend/cpp/privacy-filter/**'
+      - 'backend/Dockerfile.privacy-filter'
+      - 'core/services/routing/pii/**'
+      - 'core/services/routing/piidetector/**'
+      - 'core/backend/token_classify.go'
+      - 'core/http/endpoints/localai/pii.go'
+      - 'core/schema/pii.go'
+      - 'tests/e2e-backends/**'
+      - 'tests/e2e/e2e_pii_ner_test.go'
+      - 'tests/e2e/e2e_suite_test.go'
+      - '.github/workflows/tests-pii-ner-e2e.yml'
+  pull_request:
+    paths:
+      - 'backend/cpp/privacy-filter/**'
+      - 'backend/Dockerfile.privacy-filter'
+      - 'core/services/routing/pii/**'
+      - 'core/services/routing/piidetector/**'
+      - 'core/backend/token_classify.go'
+      - 'core/http/endpoints/localai/pii.go'
+      - 'core/schema/pii.go'
+      - 'tests/e2e-backends/**'
+      - 'tests/e2e/e2e_pii_ner_test.go'
+      - 'tests/e2e/e2e_suite_test.go'
+      - '.github/workflows/tests-pii-ner-e2e.yml'
+
+concurrency:
+  group: ci-tests-pii-ner-e2e-${{ github.event.pull_request.number || github.sha }}-${{ github.repository }}
+  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
+
+jobs:
+  tests-pii-ner-e2e:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        go-version: ['1.25.x']
+    steps:
+      - name: Clone
+        uses: actions/checkout@v6
+        with:
+          submodules: true
+      - name: Free disk space
+        run: |
+          sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache/CodeQL || true
+          sudo docker image prune --all --force || true
+          df -h
+      - name: Configure apt mirror on runner
+        uses: ./.github/actions/configure-apt-mirror
+      - name: Setup Go ${{ matrix.go-version }}
+        uses: actions/setup-go@v5
+        with:
+          go-version: ${{ matrix.go-version }}
+          cache: false
+      - name: Proto Dependencies
+        run: |
+          curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v26.1/protoc-26.1-linux-x86_64.zip -o protoc.zip && \
+          unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
+          rm protoc.zip
+          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
+          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
+          PATH="$PATH:$HOME/go/bin" make protogen-go
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y build-essential
+      # Builds local-ai-backend:privacy-filter, downloads the GGUF, loads it on
+      # CPU and runs the token_classify capability spec (byte-offset contract).
+      - name: Run live PII NER backend E2E
+        run: PATH="$PATH:$HOME/go/bin" make test-extra-backend-privacy-filter
+      - name: Setup tmate session if tests fail
+        if: ${{ failure() }}
+        uses: mxschmitt/action-tmate@v3.23
+        with:
+          detached: true
+          connect-timeout-seconds: 180
+          limit-access-to-actor: true
diff --git a/Makefile b/Makefile
index 8da9aacee..be0711b47 100644
--- a/Makefile
+++ b/Makefile
@@ -690,6 +690,16 @@ test-extra-backend-llama-cpp-transcription: docker-build-llama-cpp
 	BACKEND_TEST_CTX_SIZE=2048 \
 	$(MAKE) test-extra-backend
 
+## privacy-filter: the PII/NER token-classification backend. Exercises the
+## TokenClassify RPC and asserts byte-correct, UTF-8-aligned span offsets
+## against the openai-privacy-filter multilingual GGUF (CPU-runnable, ~50M
+## active params). This is the live-backend coverage for the PII NER tier.
+test-extra-backend-privacy-filter: docker-build-privacy-filter
+	BACKEND_IMAGE=local-ai-backend:privacy-filter \
+	BACKEND_TEST_MODEL_URL=https://huggingface.co/LocalAI-io/privacy-filter-multilingual-GGUF/resolve/main/privacy-filter-multilingual-f16.gguf \
+	BACKEND_TEST_CAPS=health,load,token_classify \
+	$(MAKE) test-extra-backend
+
 ## vllm is resolved from a HuggingFace model id (no file download) and
 ## exercises Predict + streaming + tool-call extraction via the hermes parser.
 ## Requires a host CPU with the SIMD instructions the prebuilt vllm CPU
diff --git a/core/application/application.go b/core/application/application.go
index d5c286318..9bbf26bb8 100644
--- a/core/application/application.go
+++ b/core/application/application.go
@@ -341,11 +341,9 @@ func (a *Application) ResolvePIIPolicy(cfg *config.ModelConfig) (enabled bool, d
 	}
 	appCfg := a.ApplicationConfig()
 
-	if cfg.PII.Enabled != nil {
-		enabled = *cfg.PII.Enabled
-	} else {
-		enabled = cfg.PIIIsEnabled() // backend default (cloud-proxy)
-	}
+	// PIIIsEnabled already encodes "explicit pii.enabled wins, else backend
+	// default (cloud-proxy)" — the single source of that rule.
+	enabled = cfg.PIIIsEnabled()
 	if !enabled {
 		return false, nil
 	}
@@ -354,7 +352,7 @@ func (a *Application) ResolvePIIPolicy(cfg *config.ModelConfig) (enabled bool, d
 	if len(detectors) == 0 {
 		detectors = append([]string(nil), appCfg.PIIDefaultDetectors...)
 	}
-	return enabled, detectors
+	return true, detectors // enabled is necessarily true past the !enabled guard
 }
 
 // PIIPolicyResolver adapts ResolvePIIPolicy to pii.PolicyResolver for
diff --git a/core/http/react-ui/e2e/model-config.spec.js b/core/http/react-ui/e2e/model-config.spec.js
index 2d7f0f8bd..96a73b543 100644
--- a/core/http/react-ui/e2e/model-config.spec.js
+++ b/core/http/react-ui/e2e/model-config.spec.js
@@ -288,6 +288,21 @@ test.describe('Model Editor - Interactive Tab', () => {
     await expect(page.locator('input[placeholder^="match,"]')).toBeVisible()
   })
 
+  test('pattern min_len clamps a directly-typed negative to 0', async ({ page }) => {
+    const searchInput = page.locator('input[placeholder="Search fields to add..."]')
+    await searchInput.fill('Custom Secret Patterns')
+    const dropdown = searchInput.locator('..').locator('..')
+    await dropdown.locator('div', { hasText: 'Custom Secret Patterns' }).first().click()
+
+    await page.locator('button', { hasText: 'Add pattern' }).click()
+    // The number input's min={0} only limits the spinner arrows, not keyboard
+    // entry; the editor must sanitise a typed negative so a meaningless
+    // negative length floor never reaches the saved config.
+    const minLen = page.locator('input[aria-label="Minimum length"]')
+    await minLen.fill('-5')
+    await expect(minLen).toHaveValue('0')
+  })
+
   // Regression: a map-typed field (entity_actions) present in the loaded YAML
   // must render WITH its values. flattenConfig used to recurse into the map,
   // scattering it across pii_detection.entity_actions.<GROUP> paths that match
@@ -329,4 +344,37 @@ test.describe('Model Editor - Interactive Tab', () => {
     await expect(page.getByText(/block —/i).first()).toBeVisible()
   })
 
+  // A map cannot hold two values for one key, so renaming a row to an existing
+  // group must collapse to a single row (Object.fromEntries, last write wins)
+  // rather than rendering two conflicting rows that silently lose one on save.
+  test('entity_actions collapses a duplicate group to a single row', async ({ page }) => {
+    await page.route('**/api/models/edit/ner-model', (route) => {
+      route.fulfill({
+        contentType: 'application/json',
+        body: JSON.stringify({
+          name: 'ner-model',
+          config: [
+            'name: ner-model',
+            'backend: llama-cpp',
+            'pii_detection:',
+            '    entity_actions:',
+            '        SSN: block',
+            '        EMAIL: mask',
+            '',
+          ].join('\n'),
+        }),
+      })
+    })
+
+    await page.goto('/app/model-editor/ner-model')
+
+    const groupInputs = page.locator('input[aria-label="Entity group"]')
+    await expect(groupInputs).toHaveCount(2)
+
+    // Rename the EMAIL row to duplicate SSN; the editor collapses to one SSN row.
+    await groupInputs.nth(1).fill('SSN')
+    await expect(groupInputs).toHaveCount(1)
+    await expect(groupInputs.nth(0)).toHaveValue('SSN')
+  })
+
 })
diff --git a/core/http/react-ui/src/components/PatternListEditor.jsx b/core/http/react-ui/src/components/PatternListEditor.jsx
index f5a82148a..a8965246c 100644
--- a/core/http/react-ui/src/components/PatternListEditor.jsx
+++ b/core/http/react-ui/src/components/PatternListEditor.jsx
@@ -74,7 +74,18 @@ export default function PatternListEditor({ value, onChange }) {
             min={0}
             value={r.min_len || 0}
             title="Minimum match length (0 = no floor)"
-            onChange={e => update(i, { min_len: parseInt(e.target.value, 10) || 0 })}
+            // min={0} only constrains the spinner, not keyboard entry. Clamp a
+            // typed negative to 0 (a negative floor is meaningless and would
+            // disable the length filter). When we clamp, force the DOM value
+            // too: the resulting 0->0 state change is a no-op, so React's
+            // controlled input would otherwise keep displaying the rejected
+            // "-5" even though the saved value is 0.
+            onChange={e => {
+              const parsed = parseInt(e.target.value, 10)
+              const n = Math.max(0, parsed || 0)
+              if (parsed < 0) e.target.value = String(n)
+              update(i, { min_len: n })
+            }}
             style={{ width: 80, fontSize: '0.8125rem' }}
             aria-label="Minimum length"
           />
diff --git a/core/services/routing/piiadapter/openai_completion.go b/core/services/routing/piiadapter/openai_completion.go
index 53e158fd9..ee9568290 100644
--- a/core/services/routing/piiadapter/openai_completion.go
+++ b/core/services/routing/piiadapter/openai_completion.go
@@ -44,7 +44,7 @@ func applyAnyText(v any, elem int, text string) any {
 	if elem < 0 {
 		return text
 	}
-	if arr, ok := v.([]any); ok && elem >= 0 && elem < len(arr) {
+	if arr, ok := v.([]any); ok && elem < len(arr) {
 		arr[elem] = text
 	}
 	return v
diff --git a/core/services/routing/piidetector/pattern.go b/core/services/routing/piidetector/pattern.go
index 1f4e01d1d..347defb92 100644
--- a/core/services/routing/piidetector/pattern.go
+++ b/core/services/routing/piidetector/pattern.go
@@ -39,8 +39,9 @@ type patternDetector struct {
 // When tracing is enabled it records a pattern_pii BackendTrace so the matches
 // (group, byte range, text) show in the Traces UI alongside NER detections.
 func (d *patternDetector) Detect(_ context.Context, text string) ([]pii.NEREntity, error) {
+	tracing := d.appConfig != nil && d.appConfig.EnableTracing
 	var start time.Time
-	if d.appConfig != nil && d.appConfig.EnableTracing {
+	if tracing {
 		trace.InitBackendTracingIfEnabled(d.appConfig.TracingMaxItems, d.appConfig.TracingMaxBodyBytes)
 		start = time.Now()
 	}
@@ -50,12 +51,12 @@ func (d *patternDetector) Detect(_ context.Context, text string) ([]pii.NEREntit
 	var traceEnts []backend.TokenEntity
 	for _, mt := range matches {
 		out = append(out, pii.NEREntity{Group: mt.Group, Start: mt.Start, End: mt.End, Score: 1.0, Text: mt.Text})
-		if d.appConfig != nil && d.appConfig.EnableTracing {
+		if tracing {
 			traceEnts = append(traceEnts, backend.TokenEntity{Group: mt.Group, Start: mt.Start, End: mt.End, Score: 1.0, Text: mt.Text})
 		}
 	}
 
-	if d.appConfig != nil && d.appConfig.EnableTracing {
+	if tracing {
 		trace.RecordBackendTrace(patternPIITrace(d.modelName, text, traceEnts, start))
 	}
 	return out, nil
diff --git a/core/services/routing/piipattern/grammar.go b/core/services/routing/piipattern/grammar.go
index 93ca34c72..5171e533f 100644
--- a/core/services/routing/piipattern/grammar.go
+++ b/core/services/routing/piipattern/grammar.go
@@ -28,10 +28,16 @@ const (
 	// credential shape, small enough that the compiled program stays tiny.
 	MaxPatternLen = 256
 	// MaxQuantifier caps an explicit {n,m} upper bound. RE2 expands a bounded
-	// repeat into that many copies, so an uncapped {0,1000000} would blow up
-	// the compiled program's memory. Unbounded {n,} (no upper) is a loop, not
-	// an expansion, and is allowed.
-	MaxQuantifier = 4096
+	// repeat into that many copies, so a large bound inflates the compiled
+	// program. Go's regexp/syntax independently rejects any bound above 1000
+	// at Parse time, so this cap MUST stay strictly below 1000 to be a live
+	// guard rather than dead code shadowed by the parser: a bound in
+	// (MaxQuantifier, 1000] reaches walk and is rejected here with an
+	// actionable error, while >1000 is caught earlier by Parse. 512 is far
+	// larger than any real credential token yet keeps the guard meaningful and
+	// is defence in depth should the stdlib cap ever rise. Unbounded {n,} (no
+	// upper) is a loop, not an expansion, and is allowed.
+	MaxQuantifier = 512
 	// MaxAlternation caps the arms of a single `a|b|c` alternation.
 	MaxAlternation = 64
 	// MaxAST bounds recursion depth so a pathologically nested pattern can't
diff --git a/core/services/routing/piipattern/piipattern_test.go b/core/services/routing/piipattern/piipattern_test.go
index ef38a4992..590142c3d 100644
--- a/core/services/routing/piipattern/piipattern_test.go
+++ b/core/services/routing/piipattern/piipattern_test.go
@@ -1,6 +1,7 @@
 package piipattern
 
 import (
+	"fmt"
 	"strings"
 	"testing"
 
@@ -36,6 +37,45 @@ var _ = Describe("ValidatePattern", func() {
 	)
 })
 
+var _ = Describe("MaxQuantifier guard (must stay live, not dead code)", func() {
+	// Go's regexp/syntax hard-caps repeat bounds at 1000 and rejects anything
+	// larger at Parse time, before walk() runs. So the walk() {n,m} guard only
+	// fires for bounds in (MaxQuantifier, 1000]; if MaxQuantifier ever creeps
+	// to >= 1000 the guard becomes unreachable dead code. These specs pin the
+	// relationship and prove the guard is the binding constraint in that band.
+	const stdlibRepeatCap = 1000
+
+	It("is strictly below the stdlib repeat cap so the guard is reachable", func() {
+		Expect(MaxQuantifier).To(BeNumerically("<", stdlibRepeatCap),
+			"MaxQuantifier must be < %d or walk()'s {n,m} guard is dead code (Parse rejects larger bounds first)", stdlibRepeatCap)
+	})
+
+	It("accepts a bound at exactly MaxQuantifier", func() {
+		Expect(ValidatePattern(fmt.Sprintf(`sk-ant-[A-Za-z0-9]{%d}`, MaxQuantifier))).To(Succeed())
+	})
+
+	It("rejects a bound just above MaxQuantifier with our actionable error (proves the guard runs)", func() {
+		// MaxQuantifier+1 is still parseable (<= stdlib cap), so it reaches
+		// walk(), where our guard — not the parser — rejects it.
+		err := ValidatePattern(fmt.Sprintf(`sk-ant-[A-Za-z0-9]{%d}`, MaxQuantifier+1))
+		Expect(err).To(HaveOccurred())
+		Expect(err.Error()).To(ContainSubstring("bound is too large"),
+			"a bound in (MaxQuantifier, stdlib cap] must be rejected by walk(), not the parser")
+	})
+
+	It("rejects an unbounded {n,} whose lower bound exceeds MaxQuantifier", func() {
+		err := ValidatePattern(fmt.Sprintf(`sk-ant-[A-Za-z0-9]{%d,}`, MaxQuantifier+1))
+		Expect(err).To(HaveOccurred())
+		Expect(err.Error()).To(ContainSubstring("bound is too large"))
+	})
+
+	It("still fails closed above the stdlib cap (Parse rejects before walk)", func() {
+		// >1000: caught by syntax.Parse; the message is the parser's, but it
+		// still fails closed — defence in depth.
+		Expect(ValidatePattern(fmt.Sprintf(`sk-ant-[A-Za-z0-9]{%d}`, stdlibRepeatCap+1))).NotTo(Succeed())
+	})
+})
+
 var _ = Describe("Compile", func() {
 	It("compiles a valid pattern with leftmost-longest semantics", func() {
 		re, err := Compile(`sk-ant-[A-Za-z0-9_-]{4,}`)
diff --git a/gallery/index.yaml b/gallery/index.yaml
index c0fd9c3c3..23135a510 100644
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -1252,6 +1252,98 @@
     - filename: privacy-filter/models/privacy-filter-multilingual/privacy-filter-multilingual-f16.gguf
       sha256: 01b76572f80b7d2ebee80a27cb9c3699c26b04cae1c402eee7664fc17a4b5ce6
       uri: https://huggingface.co/LocalAI-io/privacy-filter-multilingual-GGUF/resolve/main/privacy-filter-multilingual-f16.gguf
+- name: "privacy-filter-nemotron"
+  url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
+  icon: https://cdn-avatars.huggingface.co/v1/production/uploads/5fd5e18a90b6dc4633f6d292/QPiv8pt4JNxr0FdGnpFef.png
+  urls:
+    - https://huggingface.co/OpenMed/privacy-filter-nemotron
+    - https://huggingface.co/LocalAI-io/privacy-filter-nemotron-GGUF
+  description: |
+    A fine-grained English PII token-classification model: a fine-tune of
+    openai/privacy-filter by OpenMed on NVIDIA's Nemotron-PII dataset. It labels
+    every token with a BIOES tag over 55 PII categories (221 classes), trading
+    the multilingual sibling's language breadth for category depth - identity,
+    contact, address, dates, government IDs, financial, healthcare, enterprise,
+    vehicle and digital entities (including api_key, ipv4/ipv6 and mac_address).
+    For multilingual text prefer privacy-filter-multilingual instead.
+
+    In LocalAI this is a PII detector for the NER redactor tier: set
+    known_usecases to [token_classify] (as below), and any model opts into
+    redaction by listing this one under pii.detectors. The detection policy
+    (which categories to mask vs block, and the score threshold) lives on this
+    model's own pii_detection block - see the overrides below. It runs locally
+    with no Python, served by the standalone privacy-filter backend's
+    TokenClassify RPC (constrained BIOES Viterbi decode into UTF-8 byte-offset
+    entity spans).
+
+    Architecture: gpt-oss-style sparse MoE (8 layers, d_model 640, 128 experts
+    top-4, ~1.5B total / ~50M active per token), bidirectional banded attention,
+    o200k tokenizer and a 221-way token-classification head; served via the
+    openai-privacy-filter architecture. F16, ~2.8 GB. (A smaller Q8_0 quant
+    exists on the GGUF repo for RAM-constrained use - validate it on your own
+    data, since for PII a single dropped span is a leak.)
+  license: apache-2.0
+  tags:
+    - token-classification
+    - ner
+    - pii
+    - privacy
+    - nemotron
+    - gguf
+  overrides:
+    backend: privacy-filter
+    embeddings: true
+    known_usecases:
+      - token_classify
+    parameters:
+      model: privacy-filter/models/privacy-filter-nemotron/privacy-filter-nemotron-f16.gguf
+    pii_detection:
+      min_score: 0.5
+      default_action: mask
+  files:
+    - filename: privacy-filter/models/privacy-filter-nemotron/privacy-filter-nemotron-f16.gguf
+      sha256: 70dfe91ff220ff04594168a83e296dcc2054449cde77f98d0e782edbb6a31f5a
+      uri: https://huggingface.co/LocalAI-io/privacy-filter-nemotron-GGUF/resolve/main/privacy-filter-nemotron-f16.gguf
+- name: "privacy-filter-nemotron-q8"
+  url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
+  icon: https://cdn-avatars.huggingface.co/v1/production/uploads/5fd5e18a90b6dc4633f6d292/QPiv8pt4JNxr0FdGnpFef.png
+  urls:
+    - https://huggingface.co/OpenMed/privacy-filter-nemotron
+    - https://huggingface.co/LocalAI-io/privacy-filter-nemotron-GGUF
+  description: |
+    Q8_0 quant of privacy-filter-nemotron (~1.64 GB, vs ~2.8 GB for F16) for
+    RAM-constrained / edge use (e.g. a 4 GB Raspberry Pi 5). The MoE expert
+    weights are stored 8-bit; attention, embeddings and the classifier head
+    stay F16. Same model, policy and runtime as the F16 entry - see
+    privacy-filter-nemotron for the full description.
+
+    Prefer the F16 entry when you can afford it: it is the reference artifact.
+    On a mixed-PII document the publisher measured q8 matching F16 on 99.93% of
+    token labels with an identical span set at threshold 0.5 - but one token
+    flipped, and for PII a single dropped span is a leak. Treat q8 as a
+    deliberate size/speed tradeoff and validate it on your own data.
+  license: apache-2.0
+  tags:
+    - token-classification
+    - ner
+    - pii
+    - privacy
+    - nemotron
+    - gguf
+  overrides:
+    backend: privacy-filter
+    embeddings: true
+    known_usecases:
+      - token_classify
+    parameters:
+      model: privacy-filter/models/privacy-filter-nemotron/privacy-filter-nemotron-q8.gguf
+    pii_detection:
+      min_score: 0.5
+      default_action: mask
+  files:
+    - filename: privacy-filter/models/privacy-filter-nemotron/privacy-filter-nemotron-q8.gguf
+      sha256: 2ec11c154e572a2686f4d77e861b7f74e6917e09638fe9bd27156d48bd99e21a
+      uri: https://huggingface.co/LocalAI-io/privacy-filter-nemotron-GGUF/resolve/main/privacy-filter-nemotron-q8.gguf
 - name: "secret-filter"
   url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
   description: |
diff --git a/tests/e2e-backends/backend_test.go b/tests/e2e-backends/backend_test.go
index 6d0d27276..c60077fd7 100644
--- a/tests/e2e-backends/backend_test.go
+++ b/tests/e2e-backends/backend_test.go
@@ -11,6 +11,7 @@ import (
 	"path/filepath"
 	"strings"
 	"time"
+	"unicode/utf8"
 
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 	. "github.com/onsi/ginkgo/v2"
@@ -85,27 +86,28 @@ import (
 // file path to LoadModel, so GGUF, ONNX, safetensors, .bin etc. all work so
 // long as the backend under test accepts that format.
 const (
-	capHealth        = "health"
-	capLoad          = "load"
-	capPredict       = "predict"
-	capStream        = "stream"
-	capEmbeddings    = "embeddings"
-	capTools         = "tools"
-	capTranscription = "transcription"
-	capTTS           = "tts"
-	capImage         = "image"
-	capFaceDetect    = "face_detect"
-	capFaceEmbed     = "face_embed"
-	capFaceVerify    = "face_verify"
-	capFaceAnalyze   = "face_analyze"
-	capFaceAntispoof = "face_antispoof"
-	capVoiceEmbed    = "voice_embed"
-	capVoiceVerify   = "voice_verify"
-	capVoiceAnalyze  = "voice_analyze"
+	capHealth         = "health"
+	capLoad           = "load"
+	capPredict        = "predict"
+	capStream         = "stream"
+	capEmbeddings     = "embeddings"
+	capTools          = "tools"
+	capTranscription  = "transcription"
+	capTTS            = "tts"
+	capImage          = "image"
+	capFaceDetect     = "face_detect"
+	capFaceEmbed      = "face_embed"
+	capFaceVerify     = "face_verify"
+	capFaceAnalyze    = "face_analyze"
+	capFaceAntispoof  = "face_antispoof"
+	capVoiceEmbed     = "voice_embed"
+	capVoiceVerify    = "voice_verify"
+	capVoiceAnalyze   = "voice_analyze"
 	capAudioTransform = "audio_transform"
-	capLogprobs      = "logprobs"
-	capLogitBias     = "logit_bias"
-	capTokenize      = "tokenize"
+	capLogprobs       = "logprobs"
+	capLogitBias      = "logit_bias"
+	capTokenize       = "tokenize"
+	capTokenClassify  = "token_classify"
 
 	defaultPrompt             = "The capital of France is"
 	streamPrompt              = "Once upon a time"
@@ -550,6 +552,45 @@ var _ = Describe("Backend container", Ordered, func() {
 		GinkgoWriter.Printf("Embedding: %d dims\n", len(res.GetEmbeddings()))
 	})
 
+	// TokenClassify is the PII-NER RPC (privacy-filter backend). The crown-jewel
+	// invariant is byte-offset correctness: Start/End are half-open BYTE offsets
+	// into the original UTF-8 text, and the backend's emitted text for a span must
+	// equal text[Start:End]. We run at Threshold 0 (raw, unfiltered) and assert
+	// every returned span is in range, rune-aligned, and self-consistent. The
+	// prompt carries multibyte runes BEFORE the PII so a rune/byte confusion in
+	// the engine would surface as a shifted slice here. Override the text with
+	// BACKEND_TEST_TOKEN_CLASSIFY_TEXT for a model that detects a different class.
+	It("classifies PII spans with byte-correct offsets via TokenClassify", func() {
+		if !caps[capTokenClassify] {
+			Skip("token_classify capability not enabled")
+		}
+		text := os.Getenv("BACKEND_TEST_TOKEN_CLASSIFY_TEXT")
+		if text == "" {
+			text = "Müller paid at café in Zürich; reach john.doe@example.com tomorrow."
+		}
+		ctx, cancel := context.WithTimeout(context.Background(), 120*time.Second)
+		defer cancel()
+		res, err := client.TokenClassify(ctx, &pb.TokenClassifyRequest{Text: text, Threshold: 0})
+		Expect(err).NotTo(HaveOccurred(), "TokenClassify RPC failed")
+		ents := res.GetEntities()
+		Expect(ents).NotTo(BeEmpty(), "TokenClassify returned no entities for an obvious-PII sentence")
+		for _, e := range ents {
+			start, end := int(e.GetStart()), int(e.GetEnd())
+			Expect(start).To(BeNumerically(">=", 0))
+			Expect(end).To(BeNumerically(">", start))
+			Expect(end).To(BeNumerically("<=", len(text)))
+			Expect(utf8.RuneStart(text[start])).To(BeTrue(), "start %d is mid-rune in %q", start, text)
+			if end < len(text) {
+				Expect(utf8.RuneStart(text[end])).To(BeTrue(), "end %d is mid-rune in %q", end, text)
+			}
+			slice := text[start:end]
+			Expect(utf8.ValidString(slice)).To(BeTrue(), "span %q is not valid UTF-8", slice)
+			Expect(e.GetText()).To(Equal(slice), "entity text must equal text[start:end]")
+			GinkgoWriter.Printf("TokenClassify: %q [%d:%d] %s score=%.3f\n",
+				slice, start, end, e.GetEntityGroup(), e.GetScore())
+		}
+	})
+
 	It("generates an image via GenerateImage", func() {
 		if !caps[capImage] {
 			Skip("image capability not enabled")
diff --git a/tests/e2e/e2e_pii_ner_test.go b/tests/e2e/e2e_pii_ner_test.go
new file mode 100644
index 000000000..ec8c6954c
--- /dev/null
+++ b/tests/e2e/e2e_pii_ner_test.go
@@ -0,0 +1,186 @@
+package e2e_test
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"io"
+	"net/http"
+	"unicode/utf8"
+
+	"github.com/mudler/LocalAI/core/backend"
+	"github.com/mudler/LocalAI/core/schema"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// Live PII NER tier e2e. These specs run the real privacy-filter GGUF on CPU
+// through the full TokenClassify path — the gap the hermetic suite cannot
+// cover (it only exercises the in-process pattern tier). They Skip unless
+// PII_NER_MODEL_GGUF is wired in BeforeSuite, so the default PR suite is
+// unaffected; the dedicated CI job sets it.
+//
+// The crown-jewel invariant is byte-offset correctness: entity Start/End are
+// half-open BYTE offsets into the original UTF-8 text, and the model's emitted
+// text for a span must equal the corresponding byte slice. We assert that two
+// ways — directly against ModelTokenClassify (raw, Threshold 0, no redactor
+// merge) and against the /api/pii/analyze HTTP contract (post-merge,
+// post-MinScore). The multibyte case proves offsets are bytes, not runes.
+var _ = Describe("PII NER tier (live privacy-filter GGUF)", func() {
+	const (
+		// Reliable, unambiguous PII the multilingual NER model detects.
+		emailText = "Please contact John Doe at john.doe@example.com about invoice 4421."
+		// Multibyte chars BEFORE the email push its byte offset past its rune
+		// offset, so a rune/byte confusion in the engine or the Go bridge would
+		// surface as a mismatched slice here but not in the ASCII case above.
+		multibyteText = "Müller paid at café in Zürich; reach john.doe@example.com tomorrow."
+	)
+
+	BeforeEach(func() {
+		if piiNERModel == "" {
+			Skip("live PII NER model not wired (set PII_NER_MODEL_GGUF + REALTIME_BACKENDS_PATH; see tests-pii-ner-e2e.yml)")
+		}
+	})
+
+	Context("raw TokenClassify (byte-offset contract)", func() {
+		It("returns byte-correct, rune-aligned spans for an ASCII email", func() {
+			ents := tokenClassify(emailText)
+			Expect(ents).NotTo(BeEmpty(), "model must detect at least one entity in an obvious-PII sentence")
+			for _, e := range ents {
+				assertByteCorrectSpan(emailText, e.Start, e.End, e.Text)
+			}
+			Expect(spanCoversSubstring(emailText, ents, "john.doe@example.com")).To(BeTrue(),
+				"some detected span must cover the email address")
+		})
+
+		It("keeps byte offsets correct when multibyte runes precede the PII", func() {
+			ents := tokenClassify(multibyteText)
+			Expect(ents).NotTo(BeEmpty())
+			for _, e := range ents {
+				// This is the assertion that fails if offsets were computed in
+				// runes rather than bytes: the slice would be shifted left.
+				assertByteCorrectSpan(multibyteText, e.Start, e.End, e.Text)
+			}
+			Expect(spanCoversSubstring(multibyteText, ents, "john.doe@example.com")).To(BeTrue())
+		})
+	})
+
+	Context("HTTP /api/pii/analyze", func() {
+		It("reports ner-source entities with byte-correct offsets", func() {
+			status, resp := analyze(schema.PIIAnalyzeRequest{
+				Text:      emailText,
+				Detectors: []string{piiNERModel},
+			})
+			Expect(status).To(Equal(http.StatusOK))
+			Expect(resp.Entities).NotTo(BeEmpty())
+			for _, e := range resp.Entities {
+				Expect(e.Source).To(Equal("ner"), "privacy-filter detections must be tagged source=ner")
+				Expect(e.Action).To(Equal("mask"), "default_action mask must propagate to each entity")
+				assertByteCorrectSpan(emailText, e.Start, e.End, emailText[e.Start:e.End])
+				Expect(e.Score).To(BeNumerically(">=", 0.5), "below-MinScore spans are dropped before the response")
+			}
+		})
+	})
+
+	Context("HTTP /api/pii/redact", func() {
+		It("masks detected PII out of the returned text", func() {
+			status, body := redact(schema.PIIAnalyzeRequest{
+				Text:      emailText,
+				Detectors: []string{piiNERModel},
+			})
+			Expect(status).To(Equal(http.StatusOK))
+			var resp schema.PIIRedactResponse
+			Expect(json.Unmarshal(body, &resp)).To(Succeed())
+			Expect(resp.Masked).To(BeTrue())
+			Expect(resp.RedactedText).NotTo(Equal(emailText))
+			Expect(resp.RedactedText).NotTo(ContainSubstring("john.doe@example.com"),
+				"the masked email must not survive in the redacted body")
+		})
+
+		It("rejects the request with pii_blocked when an entity action is block", func() {
+			status, body := redact(schema.PIIAnalyzeRequest{
+				Text:      emailText,
+				Detectors: []string{piiNERBlockModel},
+			})
+			Expect(status).To(Equal(http.StatusBadRequest))
+			Expect(string(body)).To(ContainSubstring("pii_blocked"))
+			Expect(string(body)).NotTo(ContainSubstring("john.doe@example.com"),
+				"a blocked response must never echo the raw secret")
+		})
+	})
+})
+
+// tokenClassify drives core/backend.ModelTokenClassify against the live model
+// with the loader/config the running server uses — the same path the NER
+// detector takes, but at Threshold 0 so we see the raw, unmerged spans.
+func tokenClassify(text string) []backend.TokenEntity {
+	GinkgoHelper()
+	cfg, ok := localAIApp.ModelConfigLoader().GetModelConfig(piiNERModel)
+	Expect(ok).To(BeTrue(), "model config %q must be loaded", piiNERModel)
+	fn, err := backend.ModelTokenClassify(text, backend.TokenClassifyOptions{},
+		localAIApp.ModelLoader(), cfg, localAIApp.ApplicationConfig())
+	Expect(err).NotTo(HaveOccurred())
+	ents, err := fn(context.TODO())
+	Expect(err).NotTo(HaveOccurred())
+	return ents
+}
+
+// assertByteCorrectSpan is the shared byte-offset invariant: a half-open byte
+// range within text, aligned to UTF-8 rune boundaries, whose slice equals the
+// entity's own reported text.
+func assertByteCorrectSpan(text string, start, end int, got string) {
+	GinkgoHelper()
+	Expect(start).To(BeNumerically(">=", 0))
+	Expect(end).To(BeNumerically(">", start))
+	Expect(end).To(BeNumerically("<=", len(text)))
+	Expect(utf8.RuneStart(text[start])).To(BeTrue(), "start %d is mid-rune in %q", start, text)
+	if end < len(text) {
+		Expect(utf8.RuneStart(text[end])).To(BeTrue(), "end %d is mid-rune in %q", end, text)
+	}
+	slice := text[start:end]
+	Expect(utf8.ValidString(slice)).To(BeTrue(), "span %q is not valid UTF-8", slice)
+	Expect(slice).To(Equal(got), "entity text must equal text[start:end]")
+}
+
+func spanCoversSubstring(text string, ents []backend.TokenEntity, sub string) bool {
+	lo := bytes.Index([]byte(text), []byte(sub))
+	if lo < 0 {
+		return false
+	}
+	hi := lo + len(sub)
+	for _, e := range ents {
+		// any overlap with [lo,hi)
+		if e.Start < hi && e.End > lo {
+			return true
+		}
+	}
+	return false
+}
+
+func analyze(req schema.PIIAnalyzeRequest) (int, schema.PIIAnalyzeResponse) {
+	GinkgoHelper()
+	status, body := postJSON("/api/pii/analyze", req)
+	var resp schema.PIIAnalyzeResponse
+	if status == http.StatusOK {
+		Expect(json.Unmarshal(body, &resp)).To(Succeed())
+	}
+	return status, resp
+}
+
+func redact(req schema.PIIAnalyzeRequest) (int, []byte) {
+	GinkgoHelper()
+	return postJSON("/api/pii/redact", req)
+}
+
+func postJSON(path string, payload any) (int, []byte) {
+	GinkgoHelper()
+	data, err := json.Marshal(payload)
+	Expect(err).NotTo(HaveOccurred())
+	httpResp, err := http.Post(anthropicBaseURL+path, "application/json", bytes.NewReader(data))
+	Expect(err).NotTo(HaveOccurred())
+	defer func() { _ = httpResp.Body.Close() }()
+	body, err := io.ReadAll(httpResp.Body)
+	Expect(err).NotTo(HaveOccurred())
+	return httpResp.StatusCode, body
+}
diff --git a/tests/e2e/e2e_suite_test.go b/tests/e2e/e2e_suite_test.go
index 5a257bdb0..38e49f1cc 100644
--- a/tests/e2e/e2e_suite_test.go
+++ b/tests/e2e/e2e_suite_test.go
@@ -47,6 +47,15 @@ var (
 	// cloud-proxy model YAMLs can point at their URLs at startup time.
 	cpOpenAIUpstream    *fakeOpenAIUpstreamServer
 	cpAnthropicUpstream *fakeAnthropicUpstreamServer
+
+	// Live PII NER tier. Set only when PII_NER_MODEL_GGUF points at a
+	// privacy-filter GGUF and the privacy-filter backend is discoverable
+	// (REALTIME_BACKENDS_PATH). Empty => the NER specs Skip, exactly like the
+	// cloud-proxy specs Skip without their binary. This is what the hermetic
+	// suite cannot do (e2e_suite_test.go comment at the cp-translate detector):
+	// run the real GGUF NER tier instead of only the in-process pattern tier.
+	piiNERModel      string
+	piiNERBlockModel string
 )
 
 var _ = BeforeSuite(func() {
@@ -535,6 +544,40 @@ var _ = BeforeSuite(func() {
 		}
 	}
 
+	// Live PII NER tier. When PII_NER_MODEL_GGUF points at a downloaded
+	// privacy-filter GGUF, register two detector models that drive the real
+	// gRPC TokenClassify path on the privacy-filter backend (discovered via
+	// REALTIME_BACKENDS_PATH). Two models so we can exercise both policy
+	// outcomes against the same weights: mask (redact) and block (reject).
+	// NOTE: no pii_detection.builtins/patterns here — that would flip the
+	// detector to the in-process regex tier instead of the GGUF NER tier.
+	if gguf := os.Getenv("PII_NER_MODEL_GGUF"); gguf != "" {
+		piiNERModel = "privacy-filter-ner"
+		piiNERBlockModel = "privacy-filter-ner-block"
+		nerModelConfig := func(name, defaultAction string) map[string]any {
+			return map[string]any{
+				"name":           name,
+				"backend":        "privacy-filter",
+				"embeddings":     true, // required: TOKEN_CLS pooling loads via the embeddings flag
+				"known_usecases": []string{"token_classify"},
+				"parameters":     map[string]any{"model": gguf},
+				"pii_detection": map[string]any{
+					"min_score":      0.5,
+					"default_action": defaultAction,
+				},
+			}
+		}
+		for _, cfg := range []map[string]any{
+			nerModelConfig(piiNERModel, "mask"),
+			nerModelConfig(piiNERBlockModel, "block"),
+		} {
+			data, err := yaml.Marshal(cfg)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(os.WriteFile(filepath.Join(modelsPath, cfg["name"].(string)+".yaml"), data, 0644)).To(Succeed())
+		}
+		xlog.Info("wired live PII NER models", "gguf", gguf, "models", []string{piiNERModel, piiNERBlockModel})
+	}
+
 	systemState, err := system.GetSystemState(systemOpts...)
 	Expect(err).ToNot(HaveOccurred())
 

From 9d54a599b0e5cc6bf1ce75603229bfbb73a0af92 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Mon, 22 Jun 2026 21:27:43 +0200
Subject: [PATCH 50/99] feat(ui): role and deployment-mode adaptive UI
 (landing, sidebar, top navbar) (#10449)

* feat(ui): add shared DeploymentContext (features + p2p signal)

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* refactor(ui): extract launchAssistantChat shared helper

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(ui): role/mode-aware landing redirect at /app

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(ui): pin Cluster group and collapse Create for cluster admins

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(ui): desktop top navbar with mode pill and admin-via-chat jump

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(ui): admin token-usage meter in the top navbar

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix(ui): top-navbar breakpoint handoff + assistant jump from chat page

M1: the desktop .top-navbar was hidden at max-width 768px while the
.mobile-header only appears at max-width 639px, leaving 640-768px with
neither bar so admins lost the mode pill, token meter and admin-via-chat
jump. Hide the top bar at 639px instead so it covers every width the rail
sidebar is shown and hands off to the mobile-header exactly at 639px.

M2: the navbar 'Admin via chat' button wrote localStorage and called
navigate('/app/chat'), but when already on the chat page Chat does not
remount so its mount-time payload reader never fired and the click was a
no-op until reload. The payload consume logic is factored into a shared
callback; the launcher now dispatches a localai-open-assistant event that
the mounted Chat listens for to re-consume the payload. Mount behavior is
unchanged.

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
---
 .../react-ui/e2e/role-mode-adaptive.spec.js   | 100 +++++++++++++++++
 core/http/react-ui/public/locales/en/nav.json |  10 ++
 core/http/react-ui/src/App.css                |  44 ++++++++
 core/http/react-ui/src/App.jsx                |   2 +
 .../react-ui/src/components/HomeRoute.jsx     |  28 +++++
 core/http/react-ui/src/components/Sidebar.jsx |  58 ++++++++--
 .../react-ui/src/components/TopNavbar.jsx     |  96 ++++++++++++++++
 .../src/components/navbar/TokenUsageMeter.jsx |  52 +++++++++
 .../src/contexts/DeploymentContext.jsx        |  55 +++++++++
 core/http/react-ui/src/main.jsx               |   9 +-
 core/http/react-ui/src/pages/Chat.jsx         | 105 ++++++++++--------
 core/http/react-ui/src/pages/Home.jsx         |  11 +-
 core/http/react-ui/src/router.jsx             |   5 +-
 .../react-ui/src/utils/launchAssistantChat.js |  19 ++++
 core/http/react-ui/src/utils/resolveHome.js   |  11 ++
 core/http/react-ui/src/utils/sidebarPolicy.js |  20 ++++
 16 files changed, 556 insertions(+), 69 deletions(-)
 create mode 100644 core/http/react-ui/e2e/role-mode-adaptive.spec.js
 create mode 100644 core/http/react-ui/src/components/HomeRoute.jsx
 create mode 100644 core/http/react-ui/src/components/TopNavbar.jsx
 create mode 100644 core/http/react-ui/src/components/navbar/TokenUsageMeter.jsx
 create mode 100644 core/http/react-ui/src/contexts/DeploymentContext.jsx
 create mode 100644 core/http/react-ui/src/utils/launchAssistantChat.js
 create mode 100644 core/http/react-ui/src/utils/resolveHome.js
 create mode 100644 core/http/react-ui/src/utils/sidebarPolicy.js

diff --git a/core/http/react-ui/e2e/role-mode-adaptive.spec.js b/core/http/react-ui/e2e/role-mode-adaptive.spec.js
new file mode 100644
index 000000000..0e2e1b37b
--- /dev/null
+++ b/core/http/react-ui/e2e/role-mode-adaptive.spec.js
@@ -0,0 +1,100 @@
+import { test, expect } from './coverage-fixtures.js'
+
+// These specs stub /api/features and /api/auth/status per cell. The test server
+// disables auth (isAdmin=true) and reports its own features, so we intercept
+// before navigation to simulate each role x mode cell.
+
+function stubFeatures(page, features) {
+  return page.route('**/api/features', route =>
+    route.fulfill({ contentType: 'application/json', body: JSON.stringify(features) }))
+}
+
+function stubNoP2P(page) {
+  // P2P token endpoint returns empty -> p2pEnabled=false.
+  return page.route('**/api/p2p/token', route =>
+    route.fulfill({ contentType: 'text/plain', body: '' }))
+}
+
+test.describe('Adaptive landing (HomeRoute)', () => {
+  test('admin + distributed redirects /app to Nodes', async ({ page }) => {
+    await stubFeatures(page, { distributed: true })
+    await stubNoP2P(page)
+    await page.goto('/app')
+    await expect(page).toHaveURL(/\/app\/nodes$/)
+    await expect(page.locator('.page-title').first()).toBeVisible({ timeout: 15_000 })
+  })
+
+  test('admin + single-node stays on Home', async ({ page }) => {
+    await stubFeatures(page, { distributed: false })
+    await stubNoP2P(page)
+    await page.goto('/app')
+    await expect(page).toHaveURL(/\/app$/)
+    await expect(page.locator('.home-greeting')).toBeVisible({ timeout: 15_000 })
+  })
+})
+
+test.describe('Adaptive sidebar', () => {
+  test('distributed pins the Cluster group with Nodes at the top', async ({ page }) => {
+    await stubFeatures(page, { distributed: true })
+    await stubNoP2P(page)
+    await page.goto('/app/chat') // any in-app page so the sidebar is mounted
+    const pinned = page.locator('.sidebar-nav .sidebar-section-items').first()
+    await expect(pinned.getByText('Nodes', { exact: false })).toBeVisible({ timeout: 15_000 })
+  })
+
+  test('single-node does not pin a Cluster group', async ({ page }) => {
+    await stubFeatures(page, { distributed: false })
+    await stubNoP2P(page)
+    await page.goto('/app/chat')
+    // Nodes is reachable only via the Operate rail, not pinned at the top.
+    await expect(page.locator('.sidebar-nav')).toBeVisible({ timeout: 15_000 })
+    await expect(page.locator('.sidebar-nav .sidebar-section-items').first()
+      .getByText('Nodes', { exact: false })).toHaveCount(0)
+  })
+})
+
+test.describe('Top navbar', () => {
+  test('admin sees the mode pill and settings cog', async ({ page }) => {
+    await stubFeatures(page, { distributed: true })
+    await stubNoP2P(page)
+    await page.goto('/app/chat')
+    await expect(page.locator('.top-navbar__mode')).toBeVisible({ timeout: 15_000 })
+    await expect(page.locator('.top-navbar__icon[aria-label]')).not.toHaveCount(0)
+  })
+
+  test('admin-via-chat jump shows when localai_assistant is enabled', async ({ page }) => {
+    await stubFeatures(page, { distributed: false, localai_assistant: true })
+    await stubNoP2P(page)
+    await page.goto('/app/chat')
+    await expect(page.locator('.top-navbar__assistant')).toBeVisible({ timeout: 15_000 })
+  })
+
+  test('admin-via-chat jump hidden when localai_assistant is off', async ({ page }) => {
+    await stubFeatures(page, { distributed: false, localai_assistant: false })
+    await stubNoP2P(page)
+    await page.goto('/app/chat')
+    await expect(page.locator('.top-navbar__assistant')).toHaveCount(0)
+  })
+})
+
+test.describe('Token usage meter', () => {
+  test('renders when admin usage has data', async ({ page }) => {
+    await stubFeatures(page, { distributed: false })
+    await stubNoP2P(page)
+    await page.route('**/api/auth/admin/usage**', route =>
+      route.fulfill({ contentType: 'application/json',
+        body: JSON.stringify({ buckets: [{ total_tokens: 1234 }] }) }))
+    await page.goto('/app/chat')
+    await expect(page.locator('.top-navbar__meter')).toBeVisible({ timeout: 15_000 })
+  })
+
+  test('hidden when admin usage is empty (graceful degrade)', async ({ page }) => {
+    await stubFeatures(page, { distributed: false })
+    await stubNoP2P(page)
+    await page.route('**/api/auth/admin/usage**', route =>
+      route.fulfill({ contentType: 'application/json', body: JSON.stringify({ buckets: [] }) }))
+    await page.goto('/app/chat')
+    await expect(page.locator('.top-navbar')).toBeVisible({ timeout: 15_000 })
+    await expect(page.locator('.top-navbar__meter')).toHaveCount(0)
+  })
+})
diff --git a/core/http/react-ui/public/locales/en/nav.json b/core/http/react-ui/public/locales/en/nav.json
index 5423438f9..7317c74cd 100644
--- a/core/http/react-ui/public/locales/en/nav.json
+++ b/core/http/react-ui/public/locales/en/nav.json
@@ -12,6 +12,16 @@
   "accountSettings": "Account settings",
   "account": "Account",
   "accountFor": "Account: {{name}}",
+  "topbar": {
+    "label": "Top bar",
+    "modeDistributed": "Distributed",
+    "modeSwarm": "Swarm",
+    "modeSingle": "Single-node",
+    "pickModel": "Models",
+    "adminViaChat": "Admin via chat",
+    "tokensToday": "Tokens today",
+    "usageDetail": "View usage detail"
+  },
   "sections": {
     "create": "Create",
     "recognition": "Recognition",
diff --git a/core/http/react-ui/src/App.css b/core/http/react-ui/src/App.css
index cf1a46bd3..0238f2fb1 100644
--- a/core/http/react-ui/src/App.css
+++ b/core/http/react-ui/src/App.css
@@ -184,6 +184,50 @@
   font-size: 1.5rem;
 }
 
+/* Desktop top bar: deployment + admin affordances on wide screens. Hidden on
+   mobile, where .mobile-header carries the equivalent actions. */
+.top-navbar {
+  display: flex;
+  align-items: center;
+  justify-content: space-between;
+  gap: var(--spacing-md);
+  padding: var(--spacing-sm) var(--spacing-lg);
+  border-bottom: 1px solid var(--color-border-default);
+  background: var(--color-bg-secondary);
+}
+.top-navbar__right { display: flex; align-items: center; gap: var(--spacing-sm); }
+.top-navbar__mode {
+  font-size: 0.75rem;
+  padding: 2px 10px;
+  border-radius: 999px;
+  border: 1px solid var(--color-border-default);
+  color: var(--color-text-secondary);
+}
+.top-navbar__mode.is-active { color: var(--color-success); border-color: var(--color-success); }
+.top-navbar__btn {
+  display: inline-flex; align-items: center; gap: 6px;
+  font-size: 0.8125rem; padding: 5px 10px; border-radius: 8px;
+  border: 1px solid var(--color-border-default); background: var(--color-bg-tertiary);
+  color: var(--color-text-primary); cursor: pointer;
+}
+.top-navbar__icon {
+  width: 32px; height: 32px; display: inline-flex; align-items: center;
+  justify-content: center; border-radius: 8px; border: 1px solid var(--color-border-default);
+  background: var(--color-bg-tertiary); color: var(--color-text-secondary); cursor: pointer;
+}
+.top-navbar__avatar img { width: 100%; height: 100%; border-radius: 50%; object-fit: cover; }
+.top-navbar__meter {
+  display: inline-flex; flex-direction: column; gap: 3px; align-items: flex-start;
+  padding: 4px 10px; border-radius: 8px; border: 1px solid var(--color-border-default);
+  background: var(--color-bg-tertiary); cursor: pointer; min-width: 150px;
+}
+.top-navbar__meter-label { font-size: 0.6875rem; color: var(--color-text-secondary); }
+.top-navbar__meter-bar { width: 100%; height: 5px; border-radius: 3px; background: var(--color-bg-secondary); overflow: hidden; }
+.top-navbar__meter-bar i { display: block; height: 100%; background: var(--color-primary); }
+@media (max-width: 639px) {
+  .top-navbar { display: none; }
+}
+
 /* Sidebar */
 .sidebar {
   position: fixed;
diff --git a/core/http/react-ui/src/App.jsx b/core/http/react-ui/src/App.jsx
index b922499b5..37ebf384f 100644
--- a/core/http/react-ui/src/App.jsx
+++ b/core/http/react-ui/src/App.jsx
@@ -3,6 +3,7 @@ import { Outlet, useLocation, useNavigate } from 'react-router-dom'
 import { useTranslation } from 'react-i18next'
 import Sidebar from './components/Sidebar'
 import OperationsBar from './components/OperationsBar'
+import TopNavbar from './components/TopNavbar'
 import { ToastContainer, useToast } from './components/Toast'
 import { systemApi } from './utils/api'
 import { useTheme } from './contexts/ThemeContext'
@@ -98,6 +99,7 @@ export default function App() {
       <Sidebar isOpen={sidebarOpen} onClose={() => setSidebarOpen(false)} />
       <main className="main-content" {...(sidebarOpen ? { 'aria-hidden': 'true', inert: '' } : {})}>
         <OperationsBar />
+        <TopNavbar />
         {/* Mobile header — primary actions reachable without opening the
             drawer. Hamburger is the only way to expand the nav on phones;
             theme toggle and account avatar are mirrored from the sidebar
diff --git a/core/http/react-ui/src/components/HomeRoute.jsx b/core/http/react-ui/src/components/HomeRoute.jsx
new file mode 100644
index 000000000..6e0008d8f
--- /dev/null
+++ b/core/http/react-ui/src/components/HomeRoute.jsx
@@ -0,0 +1,28 @@
+import { lazy, Suspense } from 'react'
+import { Navigate } from 'react-router-dom'
+import { useAuth } from '../context/AuthContext'
+import { useDeployment } from '../contexts/DeploymentContext'
+import { resolveHome } from '../utils/resolveHome'
+import RouteFallback from './RouteFallback'
+
+const Home = lazy(() => import('../pages/Home'))
+
+// Index-route element. Waits for auth + deployment signals to load (so we never
+// flash the wrong landing), then either renders Home or redirects to the cell's
+// landing page. Redirecting (rather than rendering Nodes/Chat inline at /app)
+// keeps each target's own route guard, active-nav state, and deep-linkability.
+export default function HomeRoute() {
+  const { isAdmin, loading: authLoading } = useAuth()
+  const { distributed, p2pEnabled, loading: deployLoading } = useDeployment()
+
+  if (authLoading || deployLoading) return <RouteFallback />
+
+  const target = resolveHome({ isAdmin, distributed, p2pEnabled })
+  if (target) return <Navigate to={target} replace />
+
+  return (
+    <Suspense fallback={<RouteFallback />}>
+      <Home />
+    </Suspense>
+  )
+}
diff --git a/core/http/react-ui/src/components/Sidebar.jsx b/core/http/react-ui/src/components/Sidebar.jsx
index 58438fd51..679897e33 100644
--- a/core/http/react-ui/src/components/Sidebar.jsx
+++ b/core/http/react-ui/src/components/Sidebar.jsx
@@ -5,9 +5,11 @@ import ThemeToggle from './ThemeToggle'
 import LanguageSwitcher from './LanguageSwitcher'
 import { useAuth } from '../context/AuthContext'
 import { useBranding } from '../contexts/BrandingContext'
+import { useDeployment } from '../contexts/DeploymentContext'
 import { apiUrl } from '../utils/basePath'
 import { preloadRoute } from '../router'
 import { consoles, firstVisiblePath, consolePaths } from './console/consoleConfig'
+import { clusterPinItems, shouldCollapseCreate } from '../utils/sidebarPolicy'
 
 const COLLAPSED_KEY = 'localai_sidebar_collapsed'
 const SECTIONS_KEY = 'localai_sidebar_sections'
@@ -58,11 +60,13 @@ function NavItem({ item, onClose, collapsed }) {
   )
 }
 
-function loadSectionState() {
-  // Tiers render expanded by default (the redesign favours showing the few
-  // intent groups up front); users can still collapse any tier and the choice
-  // is persisted. Stored values override the defaults so a saved collapse wins.
+function loadSectionState(collapseCreate = false) {
+  // Tiers render expanded by default; users can collapse any tier and the
+  // choice persists (stored values override defaults). In cluster cells we
+  // start Create collapsed so the pinned cluster group leads - but only when
+  // the user has not already expressed a preference.
   const defaults = Object.fromEntries(sections.map(s => [s.id, true]))
+  if (collapseCreate) defaults.create = false
   try {
     const stored = localStorage.getItem(SECTIONS_KEY)
     return stored ? { ...defaults, ...JSON.parse(stored) } : defaults
@@ -77,20 +81,34 @@ function saveSectionState(state) {
 
 export default function Sidebar({ isOpen, onClose }) {
   const { t } = useTranslation('nav')
-  const [features, setFeatures] = useState({})
+  const { isAdmin, authEnabled, user, logout, hasFeature } = useAuth()
+  // Deployment shape (server features + p2p) drives the adaptive sidebar; the
+  // shared context replaces the sidebar's own /api/features fetch so the
+  // landing resolver, navbar, and this policy agree on one snapshot.
+  const deployment = useDeployment()
+  const features = deployment.features
+  // Shared shape for the console gating helpers (consoleConfig.js); in scope for
+  // both the pinned cluster group and the console-tier rendering below.
+  const auth = { isAdmin, authEnabled, hasFeature, features }
+  const collapseCreate = shouldCollapseCreate(auth, deployment)
   const [collapsed, setCollapsed] = useState(() => {
     try { return localStorage.getItem(COLLAPSED_KEY) === 'true' } catch (_) { return false }
   })
   const [openSections, setOpenSections] = useState(loadSectionState)
-  const { isAdmin, authEnabled, user, logout, hasFeature } = useAuth()
   const branding = useBranding()
   const navigate = useNavigate()
   const location = useLocation()
   const closeBtnRef = useRef(null)
 
+  // Apply the cluster-cell Create-collapse default once, only when the user has
+  // no stored section preference (so we never override an explicit choice).
   useEffect(() => {
-    fetch(apiUrl('/api/features')).then(r => r.json()).then(setFeatures).catch(() => {})
-  }, [])
+    if (deployment.loading) return
+    let hasStored = false
+    try { hasStored = !!localStorage.getItem(SECTIONS_KEY) } catch { hasStored = false }
+    if (hasStored || !collapseCreate) return
+    setOpenSections(prev => (prev.create === false ? prev : { ...prev, create: false }))
+  }, [deployment.loading, collapseCreate])
 
   // Stay in sync with external collapse dispatches (e.g. the chat
   // page's focus mode). The collapse-toggle button still owns the
@@ -157,8 +175,6 @@ export default function Sidebar({ isOpen, onClose }) {
   }
 
   const visibleTopItems = topItems.filter(filterItem)
-  // Shared shape for the console gating helpers (consoleConfig.js).
-  const auth = { isAdmin, authEnabled, hasFeature, features }
 
   // Inline sections (Create) carry no gating; a plain filterItem pass suffices.
   const getVisibleSectionItems = (section) => section.items.filter(filterItem)
@@ -199,6 +215,28 @@ export default function Sidebar({ isOpen, onClose }) {
             ))}
           </div>
 
+          {/* Pinned Cluster quick-access (admin + distributed/p2p). Same gate
+              as the Operate rail; surfaced at the top for cluster operators. */}
+          {(() => {
+            const pinned = clusterPinItems(auth, deployment)
+            if (pinned.length === 0) return null
+            return (
+              <div className="sidebar-section">
+                <div className="sidebar-section-title">{t('operate.cluster')}</div>
+                <div className="sidebar-section-items">
+                  {pinned.map(item => (
+                    <NavItem
+                      key={item.path}
+                      item={{ path: item.path, icon: item.icon, labelKey: item.labelKey }}
+                      onClose={onClose}
+                      collapsed={collapsed}
+                    />
+                  ))}
+                </div>
+              </div>
+            )
+          })()}
+
           {/* Collapsible sections */}
           {sections.map(section => {
             const visibleItems = getVisibleSectionItems(section)
diff --git a/core/http/react-ui/src/components/TopNavbar.jsx b/core/http/react-ui/src/components/TopNavbar.jsx
new file mode 100644
index 000000000..a1227b0a9
--- /dev/null
+++ b/core/http/react-ui/src/components/TopNavbar.jsx
@@ -0,0 +1,96 @@
+import { useNavigate } from 'react-router-dom'
+import { useTranslation } from 'react-i18next'
+import { useAuth } from '../context/AuthContext'
+import { useDeployment } from '../contexts/DeploymentContext'
+import { useTheme } from '../contexts/ThemeContext'
+import { launchAssistantChat } from '../utils/launchAssistantChat'
+import TokenUsageMeter from './navbar/TokenUsageMeter'
+
+// Desktop top bar. Complementary to the mobile-only header in App.jsx: this is
+// hidden on small screens (see .top-navbar CSS) and shows deployment/admin
+// affordances on wide screens where the sidebar footer is far from the content.
+export default function TopNavbar() {
+  const { t } = useTranslation('nav')
+  const navigate = useNavigate()
+  const { isAdmin, authEnabled, user } = useAuth()
+  const { features, distributed, p2pEnabled } = useDeployment()
+  const { theme, toggleTheme } = useTheme()
+
+  const modeLabel = distributed
+    ? t('topbar.modeDistributed')
+    : p2pEnabled
+      ? t('topbar.modeSwarm')
+      : t('topbar.modeSingle')
+
+  const showAssistantJump = isAdmin && !!features.localai_assistant
+  const showAvatar = authEnabled && user
+  const themeLabel = theme === 'dark' ? t('switchToLightMode') : t('switchToDarkMode')
+
+  return (
+    <div className="top-navbar" role="navigation" aria-label={t('topbar.label')}>
+      <div className="top-navbar__left">
+        {isAdmin && (
+          <span className={`top-navbar__mode ${distributed || p2pEnabled ? 'is-active' : ''}`}>
+            <i className="fas fa-circle-nodes" aria-hidden="true" /> {modeLabel}
+          </span>
+        )}
+      </div>
+      <div className="top-navbar__right">
+        {!isAdmin && (
+          <button
+            type="button"
+            className="top-navbar__btn"
+            onClick={() => navigate('/app/chat')}
+            title={t('topbar.pickModel')}
+          >
+            <i className="fas fa-cube" aria-hidden="true" /> {t('topbar.pickModel')}
+          </button>
+        )}
+        {showAssistantJump && (
+          <button
+            type="button"
+            className="top-navbar__btn top-navbar__assistant"
+            onClick={() => launchAssistantChat(navigate)}
+            title={t('topbar.adminViaChat')}
+          >
+            <i className="fas fa-user-shield" aria-hidden="true" /> {t('topbar.adminViaChat')}
+          </button>
+        )}
+        {isAdmin && <TokenUsageMeter />}
+        {isAdmin && (
+          <button
+            type="button"
+            className="top-navbar__icon"
+            onClick={() => navigate('/app/settings')}
+            aria-label={t('items.settings')}
+            title={t('items.settings')}
+          >
+            <i className="fas fa-cog" aria-hidden="true" />
+          </button>
+        )}
+        <button
+          type="button"
+          className="top-navbar__icon"
+          onClick={toggleTheme}
+          aria-label={themeLabel}
+          title={themeLabel}
+        >
+          <i className={`fas ${theme === 'dark' ? 'fa-sun' : 'fa-moon'}`} aria-hidden="true" />
+        </button>
+        {showAvatar && (
+          <button
+            type="button"
+            className="top-navbar__icon top-navbar__avatar"
+            onClick={() => navigate('/app/account')}
+            aria-label={user.name || user.email}
+            title={user.name || user.email}
+          >
+            {user.avatarUrl
+              ? <img src={user.avatarUrl} alt="" />
+              : <i className="fas fa-user-circle" aria-hidden="true" />}
+          </button>
+        )}
+      </div>
+    </div>
+  )
+}
diff --git a/core/http/react-ui/src/components/navbar/TokenUsageMeter.jsx b/core/http/react-ui/src/components/navbar/TokenUsageMeter.jsx
new file mode 100644
index 000000000..3213abc59
--- /dev/null
+++ b/core/http/react-ui/src/components/navbar/TokenUsageMeter.jsx
@@ -0,0 +1,52 @@
+import { useState, useEffect } from 'react'
+import { useNavigate } from 'react-router-dom'
+import { useTranslation } from 'react-i18next'
+import { usageApi } from '../../utils/api'
+
+// Compact admin-only usage glance: today's total tokens, optionally against a
+// quota cap, linking to the full /app/usage page. Self-contained data fetch so
+// a usage-API failure cannot break the navbar - it just renders nothing.
+function sumTotalTokens(res) {
+  const buckets = res?.buckets || res?.usage || (Array.isArray(res) ? res : [])
+  if (!Array.isArray(buckets) || buckets.length === 0) return null
+  return buckets.reduce((s, b) => s + (b.total_tokens || 0), 0)
+}
+
+export default function TokenUsageMeter() {
+  const { t } = useTranslation('nav')
+  const navigate = useNavigate()
+  const [tokens, setTokens] = useState(null)
+  const [cap, setCap] = useState(null)
+
+  useEffect(() => {
+    let cancelled = false
+    usageApi.getAdminUsage('day')
+      .then(res => { if (!cancelled) setTokens(sumTotalTokens(res)) })
+      .catch(() => { if (!cancelled) setTokens(null) })
+    usageApi.getMyQuotas()
+      .then(q => { if (!cancelled) setCap(q?.token_limit || q?.tokens?.limit || null) })
+      .catch(() => { if (!cancelled) setCap(null) })
+    return () => { cancelled = true }
+  }, [])
+
+  if (tokens === null) return null
+
+  const pct = cap ? Math.min(100, Math.round((tokens / cap) * 100)) : null
+
+  return (
+    <button
+      type="button"
+      className="top-navbar__meter"
+      onClick={() => navigate('/app/usage')}
+      title={t('topbar.usageDetail')}
+    >
+      <span className="top-navbar__meter-label">
+        {t('topbar.tokensToday')}: {Intl.NumberFormat().format(tokens)}
+        {cap ? ` / ${Intl.NumberFormat().format(cap)}` : ''}
+      </span>
+      {pct !== null && (
+        <span className="top-navbar__meter-bar"><i style={{ width: `${pct}%` }} /></span>
+      )}
+    </button>
+  )
+}
diff --git a/core/http/react-ui/src/contexts/DeploymentContext.jsx b/core/http/react-ui/src/contexts/DeploymentContext.jsx
new file mode 100644
index 000000000..ae0373620
--- /dev/null
+++ b/core/http/react-ui/src/contexts/DeploymentContext.jsx
@@ -0,0 +1,55 @@
+import { createContext, useContext, useState, useEffect } from 'react'
+import { apiUrl } from '../utils/basePath'
+import { p2pApi } from '../utils/api'
+
+const DeploymentContext = createContext(null)
+
+// One shared fetch of the deployment-shape signals the adaptive UI keys off:
+// server features (/api/features) and whether a P2P network token exists.
+// Components used to fetch /api/features independently (Sidebar, Home); this
+// centralises it so the landing resolver, sidebar policy, and navbar agree on
+// one snapshot and we issue a single request.
+export function DeploymentProvider({ children }) {
+  const [features, setFeatures] = useState({})
+  const [p2pEnabled, setP2pEnabled] = useState(false)
+  const [loading, setLoading] = useState(true)
+
+  useEffect(() => {
+    let cancelled = false
+    const featuresP = fetch(apiUrl('/api/features'))
+      .then(r => r.json())
+      .catch(() => ({}))
+    // P2P has no /api/features flag: it is "enabled" when a network token
+    // exists (mirrors pages/P2P.jsx). A 404/disabled endpoint throws and we
+    // treat that as not-enabled.
+    const p2pP = p2pApi.getToken()
+      .then(tok => (typeof tok === 'string' ? tok : (tok?.token || '')).trim())
+      .catch(() => '')
+    Promise.all([featuresP, p2pP]).then(([f, tok]) => {
+      if (cancelled) return
+      setFeatures(f || {})
+      setP2pEnabled(!!tok)
+      setLoading(false)
+    })
+    return () => { cancelled = true }
+  }, [])
+
+  const value = {
+    features,
+    distributed: !!features.distributed,
+    p2pEnabled,
+    loading,
+  }
+
+  return (
+    <DeploymentContext.Provider value={value}>
+      {children}
+    </DeploymentContext.Provider>
+  )
+}
+
+export function useDeployment() {
+  const ctx = useContext(DeploymentContext)
+  if (!ctx) throw new Error('useDeployment must be used within DeploymentProvider')
+  return ctx
+}
diff --git a/core/http/react-ui/src/main.jsx b/core/http/react-ui/src/main.jsx
index 28ce5fde1..8f1c025c4 100644
--- a/core/http/react-ui/src/main.jsx
+++ b/core/http/react-ui/src/main.jsx
@@ -4,6 +4,7 @@ import { RouterProvider } from 'react-router-dom'
 import { ThemeProvider } from './contexts/ThemeContext'
 import { BrandingProvider } from './contexts/BrandingContext'
 import { AuthProvider } from './context/AuthContext'
+import { DeploymentProvider } from './contexts/DeploymentContext'
 import { OperationsProvider } from './contexts/OperationsContext'
 import { router } from './router'
 import './i18n'
@@ -32,9 +33,11 @@ createRoot(document.getElementById('root')).render(
       <ThemeProvider>
         <BrandingProvider>
           <AuthProvider>
-            <OperationsProvider>
-              <RouterProvider router={router} />
-            </OperationsProvider>
+            <DeploymentProvider>
+              <OperationsProvider>
+                <RouterProvider router={router} />
+              </OperationsProvider>
+            </DeploymentProvider>
           </AuthProvider>
         </BrandingProvider>
       </ThemeProvider>
diff --git a/core/http/react-ui/src/pages/Chat.jsx b/core/http/react-ui/src/pages/Chat.jsx
index 675e15581..296dd284a 100644
--- a/core/http/react-ui/src/pages/Chat.jsx
+++ b/core/http/react-ui/src/pages/Chat.jsx
@@ -541,58 +541,73 @@ export default function Chat() {
     updateChatSettings(activeChat.id, { clientMCPServers: next })
   }, [activeChat, updateChatSettings])
 
-  // Load initial message from home page
+  // Load initial message / assistant launch from the Home page or the navbar
+  // quick-jump. Factored into a callback so both the mount-time reader and the
+  // navbar re-trigger event below consume the same payload through one path.
   const homeDataProcessed = useRef(false)
-  useEffect(() => {
-    if (homeDataProcessed.current) return
+  const consumeHomeChatData = useCallback(() => {
     const stored = localStorage.getItem('localai_index_chat_data')
-    if (stored) {
-      homeDataProcessed.current = true
-      try {
-        const data = JSON.parse(stored)
-        localStorage.removeItem('localai_index_chat_data')
+    if (!stored) return
+    try {
+      const data = JSON.parse(stored)
+      localStorage.removeItem('localai_index_chat_data')
 
-        // Two entry shapes from Home:
-        //   - "compose-and-send": data.message present → open new chat,
-        //     prefill the composer, click submit.
-        //   - "open-assistant": no message, just data.localaiAssistant → open
-        //     a fresh chat already in admin mode so the wizard can fire.
-        const hasMessage = !!data.message
-        const wantsAssistant = !!data.localaiAssistant
+      // Two entry shapes from Home:
+      //   - "compose-and-send": data.message present → open new chat,
+      //     prefill the composer, click submit.
+      //   - "open-assistant": no message, just data.localaiAssistant → open
+      //     a fresh chat already in admin mode so the wizard can fire.
+      const hasMessage = !!data.message
+      const wantsAssistant = !!data.localaiAssistant
 
-        if (hasMessage || wantsAssistant) {
-          let targetChat = activeChat
-          if (data.newChat) {
-            targetChat = addChat(data.model || '', '', data.mcpMode || false)
-          } else {
-            if (data.model && activeChat) {
-              updateChatSettings(activeChat.id, { model: data.model })
-            }
-            if (data.mcpMode && activeChat) {
-              updateChatSettings(activeChat.id, { mcpMode: true })
-            }
+      if (hasMessage || wantsAssistant) {
+        let targetChat = activeChat
+        if (data.newChat) {
+          targetChat = addChat(data.model || '', '', data.mcpMode || false)
+        } else {
+          if (data.model && activeChat) {
+            updateChatSettings(activeChat.id, { model: data.model })
           }
-          if (data.mcpServers?.length > 0 && targetChat) {
-            updateChatSettings(targetChat.id, { mcpServers: data.mcpServers })
-          }
-          if (data.clientMCPServers?.length > 0 && targetChat) {
-            updateChatSettings(targetChat.id, { clientMCPServers: data.clientMCPServers })
-          }
-          if (wantsAssistant && targetChat) {
-            updateChatSettings(targetChat.id, { localaiAssistant: true })
-          }
-          if (hasMessage) {
-            setInput(data.message)
-            if (data.files) setFiles(data.files)
-            setTimeout(() => {
-              const submitBtn = document.getElementById('chat-submit-btn')
-              submitBtn?.click()
-            }, 100)
+          if (data.mcpMode && activeChat) {
+            updateChatSettings(activeChat.id, { mcpMode: true })
           }
         }
-      } catch (_e) { /* ignore */ }
-    }
-  }, [])
+        if (data.mcpServers?.length > 0 && targetChat) {
+          updateChatSettings(targetChat.id, { mcpServers: data.mcpServers })
+        }
+        if (data.clientMCPServers?.length > 0 && targetChat) {
+          updateChatSettings(targetChat.id, { clientMCPServers: data.clientMCPServers })
+        }
+        if (wantsAssistant && targetChat) {
+          updateChatSettings(targetChat.id, { localaiAssistant: true })
+        }
+        if (hasMessage) {
+          setInput(data.message)
+          if (data.files) setFiles(data.files)
+          setTimeout(() => {
+            const submitBtn = document.getElementById('chat-submit-btn')
+            submitBtn?.click()
+          }, 100)
+        }
+      }
+    } catch (_e) { /* ignore */ }
+  }, [activeChat, addChat, updateChatSettings])
+
+  useEffect(() => {
+    if (homeDataProcessed.current) return
+    homeDataProcessed.current = true
+    consumeHomeChatData()
+  }, [consumeHomeChatData])
+
+  // Admins can re-trigger the assistant jump from the navbar while already on
+  // the chat page; navigate('/app/chat') does not remount Chat, so the
+  // mount-time reader above never fires. The launcher dispatches this event
+  // after writing the payload so we re-consume it and open a fresh assistant.
+  useEffect(() => {
+    const onOpenAssistant = () => consumeHomeChatData()
+    window.addEventListener('localai-open-assistant', onOpenAssistant)
+    return () => window.removeEventListener('localai-open-assistant', onOpenAssistant)
+  }, [consumeHomeChatData])
 
   // Track whether the user is pinned to the bottom. If they scroll up
   // while a response is streaming, stop forcing them back down.
diff --git a/core/http/react-ui/src/pages/Home.jsx b/core/http/react-ui/src/pages/Home.jsx
index 2c36b03a6..9e46a7254 100644
--- a/core/http/react-ui/src/pages/Home.jsx
+++ b/core/http/react-ui/src/pages/Home.jsx
@@ -13,6 +13,7 @@ import { useResources } from '../hooks/useResources'
 import { fileToBase64, backendControlApi, systemApi, modelsApi, mcpApi, nodesApi } from '../utils/api'
 import { API_CONFIG } from '../utils/config'
 import { greetingKey } from '../utils/greeting'
+import { launchAssistantChat } from '../utils/launchAssistantChat'
 import StatusPill from '../components/StatusPill'
 import Skeleton from '../components/Skeleton'
 import SectionHeading from '../components/SectionHeading'
@@ -228,16 +229,8 @@ export default function Home() {
   // requiring an initial message or model selection. Useful when an admin
   // wants to start the assistant from a cold home page.
   const openAssistantChat = useCallback(() => {
-    const chatData = {
-      model: selectedModel || '',
-      mcpMode: false,
-      localaiAssistant: true,
-      newChat: true,
-    }
-    localStorage.setItem('localai_index_chat_data', JSON.stringify(chatData))
-    try { localStorage.setItem('localai_assistant_used', '1') } catch { /* ignore */ }
+    launchAssistantChat(navigate, selectedModel)
     setAssistantUsed(true)
-    navigate('/app/chat')
   }, [navigate, selectedModel])
 
   const handleSubmit = (e) => {
diff --git a/core/http/react-ui/src/router.jsx b/core/http/react-ui/src/router.jsx
index 03962d27e..e985c4195 100644
--- a/core/http/react-ui/src/router.jsx
+++ b/core/http/react-ui/src/router.jsx
@@ -6,6 +6,7 @@ import RequireAdmin from './components/RequireAdmin'
 import RequireAuth from './components/RequireAuth'
 import RequireAuthEnabled from './components/RequireAuthEnabled'
 import RequireFeature from './components/RequireFeature'
+import HomeRoute from './components/HomeRoute'
 
 // Pages are code-split: each becomes its own chunk loaded on demand, so a route
 // no longer drags every other page (and its heavy deps — CodeMirror, the MCP
@@ -32,7 +33,7 @@ export function preloadRoute(path) {
   preloaders[m[1] ?? '']?.().catch(() => { /* network blip — real click will retry */ })
 }
 
-const Home = page('', () => import('./pages/Home'))
+page('', () => import('./pages/Home'))
 const Chat = page('chat', () => import('./pages/Chat'))
 const Models = page('models', () => import('./pages/Models'))
 const Manage = page('manage', () => import('./pages/Manage'))
@@ -96,7 +97,7 @@ function Feature({ feature, children }) {
 }
 
 const appChildren = [
-  { index: true, element: <Home /> },
+  { index: true, element: <HomeRoute /> },
   { path: 'chat', element: <Chat /> },
   { path: 'chat/:model', element: <Chat /> },
   { path: 'image', element: <ImageGen /> },
diff --git a/core/http/react-ui/src/utils/launchAssistantChat.js b/core/http/react-ui/src/utils/launchAssistantChat.js
new file mode 100644
index 000000000..833d2ec56
--- /dev/null
+++ b/core/http/react-ui/src/utils/launchAssistantChat.js
@@ -0,0 +1,19 @@
+// Opens a fresh chat already in LocalAI Assistant ("manage") mode. Chat.jsx
+// reads localai_index_chat_data on mount and enables localaiAssistant for the
+// new chat. Shared by the Home CTA and the top navbar quick-jump so there is
+// one definition of how the assistant is launched.
+export function launchAssistantChat(navigate, model = '') {
+  const chatData = {
+    model: model || '',
+    mcpMode: false,
+    localaiAssistant: true,
+    newChat: true,
+  }
+  try { localStorage.setItem('localai_index_chat_data', JSON.stringify(chatData)) } catch { /* ignore */ }
+  try { localStorage.setItem('localai_assistant_used', '1') } catch { /* ignore */ }
+  navigate('/app/chat')
+  // When already on /app/chat, navigate() does not remount Chat, so its
+  // mount-time reader would never see the payload above. Signal the mounted
+  // Chat to re-consume it; harmless elsewhere since Chat reads on mount anyway.
+  try { window.dispatchEvent(new CustomEvent('localai-open-assistant')) } catch { /* ignore */ }
+}
diff --git a/core/http/react-ui/src/utils/resolveHome.js b/core/http/react-ui/src/utils/resolveHome.js
new file mode 100644
index 000000000..2353db5d8
--- /dev/null
+++ b/core/http/react-ui/src/utils/resolveHome.js
@@ -0,0 +1,11 @@
+// Pure landing-page resolver for the index route. Returns a target path, or ''
+// meaning "render the default Home". Admin precedence is distributed > p2p >
+// plain; non-admins always go to Chat (distributed/p2p are admin-only and
+// invisible to them). Visibility gates are enforced elsewhere - this only
+// chooses where /app lands.
+export function resolveHome({ isAdmin, distributed, p2pEnabled }) {
+  if (!isAdmin) return '/app/chat'
+  if (distributed) return '/app/nodes'
+  if (p2pEnabled) return '/app/p2p'
+  return ''
+}
diff --git a/core/http/react-ui/src/utils/sidebarPolicy.js b/core/http/react-ui/src/utils/sidebarPolicy.js
new file mode 100644
index 000000000..da4cd392e
--- /dev/null
+++ b/core/http/react-ui/src/utils/sidebarPolicy.js
@@ -0,0 +1,20 @@
+import { operateConsole, isConsoleItemVisible } from '../components/console/consoleConfig'
+
+// The Operate > Cluster group, surfaced as a pinned top-of-sidebar quick-access
+// group when the admin is running a cluster (NATS-distributed) or a P2P swarm.
+// Items are filtered through the SAME gate as everywhere else, so e.g. in a
+// p2p-only deployment Nodes/Scheduling (feature: 'distributed') drop out and
+// only Swarm remains. Returns [] when the pin does not apply.
+export function clusterPinItems(auth, deployment) {
+  if (!auth.isAdmin) return []
+  if (!deployment.distributed && !deployment.p2pEnabled) return []
+  const group = operateConsole.groups.find(g => g.titleKey === 'operate.cluster')
+  if (!group) return []
+  return group.items.filter(item => isConsoleItemVisible(item, auth))
+}
+
+// In the cluster cells the Create group defaults collapsed so the pinned
+// cluster group leads. Users can still expand it; their stored choice wins.
+export function shouldCollapseCreate(auth, deployment) {
+  return !!auth.isAdmin && (!!deployment.distributed || !!deployment.p2pEnabled)
+}

From fdf475ec5f00ad2a9eb0f0eb804d1265e3ea60c7 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Mon, 22 Jun 2026 21:28:49 +0200
Subject: [PATCH 51/99] feat(realtime): conversation compaction
 (summarize-then-drop) + OpenAI item.delete/truncate/clear (#10446)

* feat(realtime): add pipeline.compaction config + resolution

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* refactor(realtime): extract itemID helper, reuse in item.retrieve

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* test(realtime): drop duplicate Ginkgo bootstrap, fold specs into openai suite

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(realtime): implement conversation.item.delete

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(realtime): implement input_audio_buffer.clear

Add a handler for the input_audio_buffer.clear client event that discards
a partially-captured utterance (raw PCM + buffered Opus frames) via a
unit-tested clearInputAudio helper, then acks with input_audio_buffer.cleared.

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(realtime): implement conversation.item.truncate (text)

Clears both .Text and .Transcript of the assistant content part at
contentIndex so barge-in truncation also works for audio turns whose
spoken words live in .Transcript.

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(realtime): add Conversation.Memory + pair-safe compactionCut

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix(realtime): compactionCut returns 0 for keep<=0 (no-cap sentinel, avoids panic)

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* style(realtime): gofmt compaction test helper closures

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(realtime): inject rolling memory into the prompt + summary builders

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(realtime): server-side summarize-then-drop compactor

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* test(realtime): unit-test prefixMatches eviction-safety predicate

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(realtime): resolve summarizer model + schedule compaction per turn

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* docs(realtime): document conversation compaction + new item events

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix(realtime): resolve summary model inside compaction goroutine (lazy, off-path)

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* refactor(realtime): reuse reasoning.ExtractReasoningComplete for summary stripping

Replace the bespoke <think> regex in the compactor with the shared
pkg/reasoning extractor (via spokenReasoningConfig), matching the rest of
the realtime path and covering all reasoning tag families, not just <think>.

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix(config): register pipeline.compaction fields in meta registry

TestAllFieldsHaveRegistryEntries requires every ModelConfig field to have
a UI/meta registry entry; add the four pipeline.compaction.* leaves so they
render with proper labels/descriptions instead of the reflection fallback.

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
---
 core/config/meta/registry.go                  |  30 ++
 core/config/model_config.go                   |  21 ++
 core/http/endpoints/openai/realtime.go        |  99 +++++-
 .../endpoints/openai/realtime_compaction.go   | 326 ++++++++++++++++++
 .../openai/realtime_compaction_test.go        | 308 +++++++++++++++++
 docs/content/features/openai-realtime.md      |  27 ++
 6 files changed, 792 insertions(+), 19 deletions(-)
 create mode 100644 core/http/endpoints/openai/realtime_compaction.go
 create mode 100644 core/http/endpoints/openai/realtime_compaction_test.go

diff --git a/core/config/meta/registry.go b/core/config/meta/registry.go
index a1cfe4c9a..3476076e1 100644
--- a/core/config/meta/registry.go
+++ b/core/config/meta/registry.go
@@ -537,6 +537,36 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Component:   "number",
 			Order:       79,
 		},
+		"pipeline.compaction.enabled": {
+			Section:     "pipeline",
+			Label:       "Compaction Enabled",
+			Description: "Fold conversation items that age out of the live window (Max History Items) into a rolling summary instead of dropping them, so long realtime sessions stay cheap without losing earlier context. Off by default.",
+			Component:   "toggle",
+			Order:       80,
+		},
+		"pipeline.compaction.trigger_items": {
+			Section:     "pipeline",
+			Label:       "Compaction Trigger Items",
+			Description: "High-water mark: once the live conversation exceeds this many items, the overflow above Max History Items is summarized and evicted. Must be greater than Max History Items; defaults to twice it. The gap controls how often summarization runs.",
+			Component:   "number",
+			Order:       81,
+		},
+		"pipeline.compaction.summary_model": {
+			Section:     "pipeline",
+			Label:       "Compaction Summary Model",
+			Description: "Optional smaller/cheaper model used to produce the rolling summary. Empty reuses the pipeline's own LLM. On CPU, a tiny model here keeps compaction from competing with the conversation LLM.",
+			Component:   "input",
+			Advanced:    true,
+			Order:       82,
+		},
+		"pipeline.compaction.max_summary_tokens": {
+			Section:     "pipeline",
+			Label:       "Compaction Max Summary Tokens",
+			Description: "Advisory cap on the rolling summary length (fed to the summarizer prompt). Defaults to 512.",
+			Component:   "number",
+			Advanced:    true,
+			Order:       83,
+		},
 
 		// --- Functions ---
 		"function.grammar.parallel_calls": {
diff --git a/core/config/model_config.go b/core/config/model_config.go
index cbb336838..8886ddfd5 100644
--- a/core/config/model_config.go
+++ b/core/config/model_config.go
@@ -641,11 +641,32 @@ type Pipeline struct {
 	// context fills.
 	MaxHistoryItems *int `yaml:"max_history_items,omitempty" json:"max_history_items,omitempty"`
 
+	// Compaction folds conversation items that age out of the live window
+	// (max_history_items) into a rolling summary instead of dropping them, so
+	// long realtime sessions stay cheap without losing earlier context. Nil
+	// (block absent) means disabled, preserving existing behavior.
+	Compaction *PipelineCompaction `yaml:"compaction,omitempty" json:"compaction,omitempty"`
+
 	// VoiceRecognition gates the pipeline behind speaker verification. Nil
 	// (block absent) means no gate, preserving existing behavior.
 	VoiceRecognition *PipelineVoiceRecognition `yaml:"voice_recognition,omitempty" json:"voice_recognition,omitempty"`
 }
 
+// PipelineCompaction configures summarize-then-drop for a realtime pipeline.
+type PipelineCompaction struct {
+	// Enabled turns summarize-then-drop on. Default false.
+	Enabled bool `yaml:"enabled,omitempty" json:"enabled,omitempty"`
+	// TriggerItems is the high-water mark: once live items exceed it, overflow
+	// above max_history_items is summarized and evicted. Must exceed
+	// max_history_items; clamped up if not. Default: 2x max_history_items.
+	TriggerItems int `yaml:"trigger_items,omitempty" json:"trigger_items,omitempty"`
+	// SummaryModel optionally names a smaller/cheaper model for the summary
+	// call. Empty uses the pipeline's own LLM.
+	SummaryModel string `yaml:"summary_model,omitempty" json:"summary_model,omitempty"`
+	// MaxSummaryTokens advises the summary length (fed to the prompt). Default 512.
+	MaxSummaryTokens int `yaml:"max_summary_tokens,omitempty" json:"max_summary_tokens,omitempty"`
+}
+
 // ApplyReasoningEffort resolves the effective reasoning effort — a per-request
 // value (requestEffort) overrides the config's own ReasoningEffort default —
 // stores it on the config so gRPCPredictOpts forwards it to the backend as the
diff --git a/core/http/endpoints/openai/realtime.go b/core/http/endpoints/openai/realtime.go
index 1af4c6b75..d4d6a0ac4 100644
--- a/core/http/endpoints/openai/realtime.go
+++ b/core/http/endpoints/openai/realtime.go
@@ -12,6 +12,7 @@ import (
 	"os"
 	"strconv"
 	"sync"
+	"sync/atomic"
 	"time"
 
 	"net/http"
@@ -134,6 +135,18 @@ type Session struct {
 	// pairs are kept together so we never feed an orphaned tool result.
 	MaxHistoryItems int
 
+	// Compaction settings resolved from pipeline.compaction (see resolveCompaction).
+	CompactionEnabled bool
+	CompactionTrigger int
+	SummaryModel      string
+	MaxSummaryTokens  int
+
+	// summarizerFactory lazily builds the model used for compaction summaries
+	// when summary_model is configured; nil means reuse the pipeline LLM.
+	summarizerFactory func() (Model, error)
+	summarizerOnce    sync.Once
+	summarizerCached  Model
+
 	// AssistantExecutor is non-nil when the session opted into the in-process
 	// LocalAI Assistant tool surface. Tool calls whose name matches this
 	// executor's catalog are run inproc and their output is fed back to the
@@ -241,6 +254,12 @@ type Conversation struct {
 	ID    string
 	Items []*types.MessageItemUnion
 	Lock  sync.Mutex
+	// Memory is the rolling summary of items already evicted by compaction. It
+	// is kept out of Items (so trimRealtimeItems never drops it) and rendered
+	// as a system message right after the session instructions.
+	Memory string
+	// compacting ensures at most one background compaction runs per conversation.
+	compacting atomic.Bool
 }
 
 func (c *Conversation) ToServer() types.Conversation {
@@ -540,13 +559,12 @@ func runRealtimeSession(application *application.Application, t Transport, model
 		SoundDetectionWindowMs:  cfg.Pipeline.SoundDetectionWindowMs,
 		SoundDetectionHopMs:     cfg.Pipeline.SoundDetectionHopMs,
 	}
+	session.CompactionEnabled, session.CompactionTrigger, session.MaxSummaryTokens, session.SummaryModel = resolveCompaction(cfg, session.MaxHistoryItems)
 
 	// Create a default conversation
 	conversationID := generateConversationID()
 	conversation := &Conversation{
-		ID: conversationID,
-		// TODO: We need to truncate the conversation items when a new item is added and we have run out of space. There are multiple places where items
-		//       can be added so we could use a datastructure here that enforces truncation upon addition
+		ID:    conversationID,
 		Items: []*types.MessageItemUnion{},
 	}
 	session.Conversations[conversationID] = conversation
@@ -577,6 +595,18 @@ func runRealtimeSession(application *application.Application, t Transport, model
 	}
 	session.ModelInterface = m
 
+	if session.SummaryModel != "" {
+		summaryModelName := session.SummaryModel
+		sid := sessionID
+		session.summarizerFactory = func() (Model, error) {
+			summaryCfg, lerr := application.ModelConfigLoader().LoadModelConfigFileByNameDefaultOptions(summaryModelName, application.ApplicationConfig())
+			if lerr != nil {
+				return nil, fmt.Errorf("load summary model config %q: %w", summaryModelName, lerr)
+			}
+			return newModel(&summaryCfg.Pipeline, application.ModelConfigLoader(), application.ModelLoader(), application.ApplicationConfig(), evaluator, buildRealtimeRoutingContext(application, sid))
+		}
+	}
+
 	if cfg.Pipeline.VoiceGateEnabled() {
 		gate, gerr := newVoiceGate(
 			*cfg.Pipeline.VoiceRecognition,
@@ -807,6 +837,15 @@ func runRealtimeSession(application *application.Application, t Transport, model
 				commitUtterance(respCtx, allAudio, session, conversation, t)
 			}()
 
+		case types.InputAudioBufferClearEvent:
+			xlog.Debug("recv", "message", string(msg))
+			// Discard a partially-captured utterance so the client can restart
+			// input cleanly without the stale buffer leaking into the next commit.
+			clearInputAudio(session)
+			sendEvent(t, types.InputAudioBufferClearedEvent{
+				ServerEventBase: types.ServerEventBase{EventID: e.EventID},
+			})
+
 		case types.ConversationItemCreateEvent:
 			xlog.Debug("recv", "message", string(msg))
 			// Add the item to the conversation
@@ -841,7 +880,39 @@ func runRealtimeSession(application *application.Application, t Transport, model
 			})
 
 		case types.ConversationItemDeleteEvent:
-			sendError(t, "not_implemented", "Deleting items not implemented", "", "event_TODO")
+			xlog.Debug("recv", "message", string(msg))
+			if e.ItemID == "" {
+				sendError(t, "invalid_item_id", "Need item_id, but none specified", "", "event_TODO")
+				continue
+			}
+			conversation.Lock.Lock()
+			updated, ok := deleteItem(conversation.Items, e.ItemID)
+			conversation.Items = updated
+			conversation.Lock.Unlock()
+			if !ok {
+				sendError(t, "invalid_item_id", "Item to delete not found", "", "event_TODO")
+				continue
+			}
+			sendEvent(t, types.ConversationItemDeletedEvent{
+				ServerEventBase: types.ServerEventBase{EventID: e.EventID},
+				ItemID:          e.ItemID,
+			})
+
+		case types.ConversationItemTruncateEvent:
+			xlog.Debug("recv", "message", string(msg))
+			conversation.Lock.Lock()
+			ok := truncateAssistantText(conversation.Items, e.ItemID, e.ContentIndex)
+			conversation.Lock.Unlock()
+			if !ok {
+				sendError(t, "invalid_item_id", "Item to truncate not found", "", "event_TODO")
+				continue
+			}
+			sendEvent(t, types.ConversationItemTruncatedEvent{
+				ServerEventBase: types.ServerEventBase{EventID: e.EventID},
+				ItemID:          e.ItemID,
+				ContentIndex:    e.ContentIndex,
+				AudioEndMs:      e.AudioEndMs,
+			})
 
 		case types.ConversationItemRetrieveEvent:
 			xlog.Debug("recv", "message", string(msg))
@@ -854,21 +925,7 @@ func runRealtimeSession(application *application.Application, t Transport, model
 			conversation.Lock.Lock()
 			var retrievedItem types.MessageItemUnion
 			for _, item := range conversation.Items {
-				// We need to check ID in the union
-				var id string
-				if item.System != nil {
-					id = item.System.ID
-				} else if item.User != nil {
-					id = item.User.ID
-				} else if item.Assistant != nil {
-					id = item.Assistant.ID
-				} else if item.FunctionCall != nil {
-					id = item.FunctionCall.ID
-				} else if item.FunctionCallOutput != nil {
-					id = item.FunctionCallOutput.ID
-				}
-
-				if id == e.ItemID {
+				if itemID(item) == e.ItemID {
 					retrievedItem = *item
 					break
 				}
@@ -1666,6 +1723,9 @@ const maxAssistantToolTurns = 10
 
 func triggerResponse(ctx context.Context, session *Session, conv *Conversation, t Transport, overrides *types.ResponseCreateParams) {
 	triggerResponseAtTurn(ctx, session, conv, t, overrides, 0)
+	// Fold aged-out turns into the rolling memory off the critical path; the
+	// next turn reaps the smaller buffer.
+	session.maybeCompact(conv)
 }
 
 func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversation, t Transport, overrides *types.ResponseCreateParams, toolTurn int) {
@@ -1721,6 +1781,7 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
 	var lastUserSpeaker *types.Speaker
 	personalize := session.voiceGate != nil && session.voiceGate.cfg.PersonalizeEnabled()
 	conv.Lock.Lock()
+	conversationHistory = withMemory(conversationHistory, conv.Memory)
 	items := trimRealtimeItems(conv.Items, session.MaxHistoryItems)
 	for _, item := range items {
 		if item.User != nil {
diff --git a/core/http/endpoints/openai/realtime_compaction.go b/core/http/endpoints/openai/realtime_compaction.go
new file mode 100644
index 000000000..f79a2d7a2
--- /dev/null
+++ b/core/http/endpoints/openai/realtime_compaction.go
@@ -0,0 +1,326 @@
+package openai
+
+import (
+	"context"
+	"fmt"
+	"strings"
+	"time"
+
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
+	"github.com/mudler/LocalAI/core/schema"
+	"github.com/mudler/LocalAI/pkg/reasoning"
+	"github.com/mudler/xlog"
+)
+
+const (
+	defaultMaxSummaryTokens = 512
+	memoryPrefix            = "Summary of earlier conversation:\n"
+	// compactionTimeout bounds the summarizer call so a stuck model can't pin the
+	// compacting flag (and thus block all further compaction) forever.
+	compactionTimeout = 60 * time.Second
+)
+
+// withMemory inserts the rolling summary as a system message after the existing
+// (instructions) history. No-op when memory is empty.
+func withMemory(history schema.Messages, memory string) schema.Messages {
+	if memory == "" {
+		return history
+	}
+	content := memoryPrefix + memory
+	return append(history, schema.Message{
+		Role:          string(types.MessageRoleSystem),
+		StringContent: content,
+		Content:       content,
+	})
+}
+
+// renderItemsTranscript renders conversation items as a plain "role: text"
+// transcript for summarization. Non-text items (bare tool calls) are labelled
+// so the summarizer keeps track of actions taken.
+func renderItemsTranscript(items []*types.MessageItemUnion) string {
+	var b strings.Builder
+	for _, item := range items {
+		switch {
+		case item.User != nil:
+			b.WriteString("user: ")
+			for _, c := range item.User.Content {
+				if c.Text != "" {
+					b.WriteString(c.Text)
+				}
+				if c.Transcript != "" {
+					b.WriteString(c.Transcript)
+				}
+			}
+			b.WriteString("\n")
+		case item.Assistant != nil:
+			b.WriteString("assistant: ")
+			// Realtime assistant *audio* turns store the spoken words in
+			// .Transcript (not .Text), so emit both or spoken turns are dropped.
+			for _, c := range item.Assistant.Content {
+				if c.Text != "" {
+					b.WriteString(c.Text)
+				}
+				if c.Transcript != "" {
+					b.WriteString(c.Transcript)
+				}
+			}
+			b.WriteString("\n")
+		case item.FunctionCall != nil:
+			b.WriteString(fmt.Sprintf("assistant called tool %s(%s)\n", item.FunctionCall.Name, item.FunctionCall.Arguments))
+		case item.FunctionCallOutput != nil:
+			b.WriteString(fmt.Sprintf("tool result: %s\n", item.FunctionCallOutput.Output))
+		}
+	}
+	return strings.TrimSpace(b.String())
+}
+
+// buildSummaryMessages builds the chat messages for the summarizer LLM: a system
+// instruction plus prior memory and the new transcript to fold in. maxTokens is
+// advisory (fed to the prompt; not hard-enforced in v1).
+func buildSummaryMessages(priorMemory, transcript string, maxTokens int) schema.Messages {
+	system := fmt.Sprintf("You maintain a running memory of a live voice conversation. "+
+		"Merge the prior memory with the new exchanges into an updated memory. "+
+		"Keep names, decisions, facts, preferences, and open threads. Be concise "+
+		"(under ~%d tokens). Output only the updated memory, with no reasoning or tags.", maxTokens)
+	var user strings.Builder
+	if priorMemory != "" {
+		user.WriteString("Prior memory:\n")
+		user.WriteString(priorMemory)
+		user.WriteString("\n\n")
+	}
+	user.WriteString("New exchanges to fold in:\n")
+	user.WriteString(transcript)
+	return schema.Messages{
+		{Role: string(types.MessageRoleSystem), StringContent: system, Content: system},
+		{Role: string(types.MessageRoleUser), StringContent: user.String(), Content: user.String()},
+	}
+}
+
+// clearInputAudio resets the session's pending input audio buffer (the raw
+// PCM and any buffered Opus frames). Used by the input_audio_buffer.clear
+// realtime event so a client can discard a partially-captured utterance.
+func clearInputAudio(s *Session) {
+	s.AudioBufferLock.Lock()
+	s.InputAudioBuffer = nil
+	s.AudioBufferLock.Unlock()
+	s.OpusFramesLock.Lock()
+	s.OpusFrames = nil
+	s.OpusFramesLock.Unlock()
+}
+
+// itemID extracts the id from any MessageItemUnion variant ("" if none).
+func itemID(item *types.MessageItemUnion) string {
+	switch {
+	case item == nil:
+		return ""
+	case item.System != nil:
+		return item.System.ID
+	case item.User != nil:
+		return item.User.ID
+	case item.Assistant != nil:
+		return item.Assistant.ID
+	case item.FunctionCall != nil:
+		return item.FunctionCall.ID
+	case item.FunctionCallOutput != nil:
+		return item.FunctionCallOutput.ID
+	default:
+		return ""
+	}
+}
+
+// deleteItem removes the item with id from items, returning the new slice and
+// whether it was found.
+func deleteItem(items []*types.MessageItemUnion, id string) ([]*types.MessageItemUnion, bool) {
+	for i, item := range items {
+		if itemID(item) == id {
+			return append(items[:i:i], items[i+1:]...), true
+		}
+	}
+	return items, false
+}
+
+// truncateAssistantText clears the text of the assistant item's content part at
+// contentIndex. Minimal truncate: used to discard an interrupted/barge-in
+// response tail. Both .Text and .Transcript are cleared because realtime audio
+// turns store the spoken words in .Transcript (clearing only .Text would no-op).
+func truncateAssistantText(items []*types.MessageItemUnion, id string, contentIndex int) bool {
+	for _, item := range items {
+		if itemID(item) != id || item.Assistant == nil {
+			continue
+		}
+		if contentIndex >= 0 && contentIndex < len(item.Assistant.Content) {
+			item.Assistant.Content[contentIndex].Text = ""
+			item.Assistant.Content[contentIndex].Transcript = ""
+		}
+		return true
+	}
+	return false
+}
+
+// compactionCut returns the index splitting items into overflow (items[:cut],
+// to be summarized+evicted) and the kept live tail (items[cut:]), keeping the
+// last `keep` items. It mirrors trimRealtimeItems' pair-safety: the cut is
+// pulled left so a function_call and its function_call_output are never split
+// across the boundary (the whole pair lands in the kept tail). Returns 0 when
+// there is nothing to cut.
+func compactionCut(items []*types.MessageItemUnion, keep int) int {
+	// keep <= 0 means no live-window cap (the "unlimited history" sentinel, as
+	// in trimRealtimeItems): there is nothing to evict, so cut nothing. This
+	// also avoids indexing items[len(items)] in the pair-safety loop below.
+	if keep <= 0 {
+		return 0
+	}
+	cut := len(items) - keep
+	if cut <= 0 {
+		return 0
+	}
+	for cut > 0 && items[cut] != nil && items[cut].FunctionCallOutput != nil {
+		cut--
+	}
+	return cut
+}
+
+// resolveCompaction reads the pipeline.compaction block, applying defaults and
+// the trigger>max_history invariant. maxHistory is the already-resolved live
+// window size. Returns enabled=false (and zero values) when compaction is off.
+func resolveCompaction(cfg *config.ModelConfig, maxHistory int) (enabled bool, trigger, maxSummaryTokens int, summaryModel string) {
+	if cfg == nil || cfg.Pipeline.Compaction == nil || !cfg.Pipeline.Compaction.Enabled {
+		return false, 0, 0, ""
+	}
+	c := cfg.Pipeline.Compaction
+	trigger = c.TriggerItems
+	if trigger <= 0 {
+		trigger = maxHistory * 2
+	}
+	if trigger <= maxHistory {
+		trigger = maxHistory + 1
+	}
+	maxSummaryTokens = c.MaxSummaryTokens
+	if maxSummaryTokens <= 0 {
+		maxSummaryTokens = defaultMaxSummaryTokens
+	}
+	return true, trigger, maxSummaryTokens, c.SummaryModel
+}
+
+// prefixMatches reports whether items begins with the same ids, in order, as
+// snapshot — i.e. the overflow we summarized is still at the head (no concurrent
+// client delete reshuffled it).
+func prefixMatches(items, snapshot []*types.MessageItemUnion) bool {
+	if len(items) < len(snapshot) {
+		return false
+	}
+	for i := range snapshot {
+		if itemID(items[i]) != itemID(snapshot[i]) {
+			return false
+		}
+	}
+	return true
+}
+
+// compact folds overflow items into conv.Memory and evicts them. It never holds
+// conv.Lock across the summarizer call: snapshot under lock, summarize unlocked,
+// commit under lock (re-validating the head is unchanged). On any error it
+// leaves the conversation untouched — items are never dropped without a summary.
+func (s *Session) compact(conv *Conversation, model Model) {
+	if model == nil {
+		return
+	}
+	// Snapshot.
+	conv.Lock.Lock()
+	if len(conv.Items) <= s.CompactionTrigger {
+		conv.Lock.Unlock()
+		return
+	}
+	cut := compactionCut(conv.Items, s.MaxHistoryItems)
+	if cut <= 0 {
+		conv.Lock.Unlock()
+		return
+	}
+	overflow := append([]*types.MessageItemUnion(nil), conv.Items[:cut]...)
+	prior := conv.Memory
+	conv.Lock.Unlock()
+
+	// Summarize (unlocked).
+	msgs := buildSummaryMessages(prior, renderItemsTranscript(overflow), s.MaxSummaryTokens)
+	ctx, cancel := context.WithTimeout(context.Background(), compactionTimeout)
+	defer cancel()
+	predFunc, err := model.Predict(ctx, msgs, nil, nil, nil, nil, nil, nil, nil, nil, nil)
+	if err != nil {
+		xlog.Warn("realtime compaction: summarizer predict failed", "error", err)
+		return
+	}
+	pred, err := predFunc()
+	if err != nil {
+		xlog.Warn("realtime compaction: summarizer inference failed", "error", err)
+		return
+	}
+	// Strip any leaked reasoning/thinking spans using the same extractor the
+	// rest of the realtime path uses, rather than a bespoke regex.
+	rcfg := reasoning.Config{}
+	if mc := model.PredictConfig(); mc != nil {
+		rcfg = spokenReasoningConfig(mc.ReasoningConfig)
+	}
+	_, summary := reasoning.ExtractReasoningComplete(pred.Response, "", rcfg)
+	summary = strings.TrimSpace(summary)
+	if summary == "" {
+		xlog.Warn("realtime compaction: empty summary, skipping eviction")
+		return
+	}
+
+	// Commit.
+	conv.Lock.Lock()
+	defer conv.Lock.Unlock()
+	if !prefixMatches(conv.Items, overflow) {
+		xlog.Debug("realtime compaction: head changed during summary, skipping")
+		return
+	}
+	conv.Memory = summary
+	conv.Items = conv.Items[len(overflow):]
+	xlog.Debug("realtime compaction: evicted items into memory", "evicted", len(overflow), "remaining", len(conv.Items))
+}
+
+// summarizerModel resolves the model used to produce compaction summaries.
+// Without a configured summary_model (or factory) it reuses the pipeline LLM.
+func (s *Session) summarizerModel() Model {
+	if s.SummaryModel == "" || s.summarizerFactory == nil {
+		return s.ModelInterface
+	}
+	s.summarizerOnce.Do(func() {
+		m, err := s.summarizerFactory()
+		if err != nil {
+			xlog.Warn("realtime compaction: summary_model load failed, falling back to pipeline LLM", "model", s.SummaryModel, "error", err)
+			m = s.ModelInterface
+		}
+		s.summarizerCached = m
+	})
+	return s.summarizerCached
+}
+
+// maybeCompact schedules a background compaction when the live buffer has grown
+// past the trigger and none is already running. Returns immediately.
+func (s *Session) maybeCompact(conv *Conversation) {
+	if !s.CompactionEnabled {
+		return
+	}
+	conv.Lock.Lock()
+	over := len(conv.Items) > s.CompactionTrigger
+	conv.Lock.Unlock()
+	if !over {
+		return
+	}
+	if !conv.compacting.CompareAndSwap(false, true) {
+		return
+	}
+	go func() {
+		defer conv.compacting.Store(false)
+		// Resolve (and, for a configured summary_model, lazily load) the
+		// summarizer only when a compaction actually runs, off the response
+		// path — so the model load never blocks a user turn.
+		model := s.summarizerModel()
+		if model == nil {
+			return
+		}
+		s.compact(conv, model)
+	}()
+}
diff --git a/core/http/endpoints/openai/realtime_compaction_test.go b/core/http/endpoints/openai/realtime_compaction_test.go
new file mode 100644
index 000000000..5b19a8259
--- /dev/null
+++ b/core/http/endpoints/openai/realtime_compaction_test.go
@@ -0,0 +1,308 @@
+package openai
+
+import (
+	"errors"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	"github.com/mudler/LocalAI/core/backend"
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
+	"github.com/mudler/LocalAI/core/schema"
+)
+
+var _ = Describe("resolveCompaction", func() {
+	It("disables when the block is absent", func() {
+		enabled, _, _, _ := resolveCompaction(&config.ModelConfig{}, 6)
+		Expect(enabled).To(BeFalse())
+	})
+
+	It("defaults trigger to 2x max history and tokens to 512", func() {
+		cfg := &config.ModelConfig{Pipeline: config.Pipeline{Compaction: &config.PipelineCompaction{Enabled: true}}}
+		enabled, trigger, maxTok, _ := resolveCompaction(cfg, 6)
+		Expect(enabled).To(BeTrue())
+		Expect(trigger).To(Equal(12))
+		Expect(maxTok).To(Equal(512))
+	})
+
+	It("clamps trigger to max history + 1 when misconfigured", func() {
+		cfg := &config.ModelConfig{Pipeline: config.Pipeline{Compaction: &config.PipelineCompaction{Enabled: true, TriggerItems: 4}}}
+		_, trigger, _, _ := resolveCompaction(cfg, 6)
+		Expect(trigger).To(Equal(7))
+	})
+
+	It("honors explicit values", func() {
+		cfg := &config.ModelConfig{Pipeline: config.Pipeline{Compaction: &config.PipelineCompaction{
+			Enabled: true, TriggerItems: 20, MaxSummaryTokens: 256, SummaryModel: "tiny"}}}
+		enabled, trigger, maxTok, model := resolveCompaction(cfg, 6)
+		Expect(enabled).To(BeTrue())
+		Expect(trigger).To(Equal(20))
+		Expect(maxTok).To(Equal(256))
+		Expect(model).To(Equal("tiny"))
+	})
+})
+
+var _ = Describe("deleteItem", func() {
+	mk := func(ids ...string) []*types.MessageItemUnion {
+		out := make([]*types.MessageItemUnion, len(ids))
+		for i, id := range ids {
+			out[i] = &types.MessageItemUnion{User: &types.MessageItemUser{ID: id}}
+		}
+		return out
+	}
+
+	It("removes the item with the given id", func() {
+		items, ok := deleteItem(mk("a", "b", "c"), "b")
+		Expect(ok).To(BeTrue())
+		Expect(len(items)).To(Equal(2))
+		Expect(itemID(items[0])).To(Equal("a"))
+		Expect(itemID(items[1])).To(Equal("c"))
+	})
+
+	It("reports not found for an unknown id", func() {
+		_, ok := deleteItem(mk("a"), "zzz")
+		Expect(ok).To(BeFalse())
+	})
+})
+
+var _ = Describe("clearInputAudio", func() {
+	It("resets the pending PCM and buffered Opus frames", func() {
+		s := &Session{InputAudioBuffer: []byte{1, 2, 3}, OpusFrames: [][]byte{{9}}}
+		clearInputAudio(s)
+		Expect(s.InputAudioBuffer).To(BeNil())
+		Expect(s.OpusFrames).To(BeNil())
+	})
+})
+
+var _ = Describe("truncateAssistantText", func() {
+	It("clears the text of the assistant content part at the index", func() {
+		items := []*types.MessageItemUnion{{Assistant: &types.MessageItemAssistant{
+			ID:      "a1",
+			Content: []types.MessageContentOutput{{Type: types.MessageContentTypeText, Text: "hello world"}},
+		}}}
+		ok := truncateAssistantText(items, "a1", 0)
+		Expect(ok).To(BeTrue())
+		Expect(items[0].Assistant.Content[0].Text).To(Equal(""))
+	})
+
+	// Realtime assistant *audio* turns store the spoken words in .Transcript, not
+	// .Text, so a barge-in truncate must clear .Transcript too or it would no-op.
+	It("clears the transcript of an assistant audio content part", func() {
+		items := []*types.MessageItemUnion{{Assistant: &types.MessageItemAssistant{
+			ID:      "a1",
+			Content: []types.MessageContentOutput{{Type: types.MessageContentTypeAudio, Transcript: "hello world"}},
+		}}}
+		ok := truncateAssistantText(items, "a1", 0)
+		Expect(ok).To(BeTrue())
+		Expect(items[0].Assistant.Content[0].Transcript).To(Equal(""))
+	})
+
+	It("returns false for an unknown id", func() {
+		Expect(truncateAssistantText(nil, "nope", 0)).To(BeFalse())
+	})
+})
+
+var _ = Describe("compactionCut", func() {
+	user := func(id string) *types.MessageItemUnion {
+		return &types.MessageItemUnion{User: &types.MessageItemUser{ID: id}}
+	}
+	call := func(id string) *types.MessageItemUnion {
+		return &types.MessageItemUnion{FunctionCall: &types.MessageItemFunctionCall{ID: id}}
+	}
+	out := func(id string) *types.MessageItemUnion {
+		return &types.MessageItemUnion{FunctionCallOutput: &types.MessageItemFunctionCallOutput{ID: id}}
+	}
+
+	It("cuts exactly len-keep when no pairs straddle the boundary", func() {
+		items := []*types.MessageItemUnion{user("1"), user("2"), user("3"), user("4")}
+		Expect(compactionCut(items, 2)).To(Equal(2))
+	})
+
+	It("returns 0 when nothing to cut", func() {
+		Expect(compactionCut([]*types.MessageItemUnion{user("1")}, 2)).To(Equal(0))
+	})
+
+	It("returns 0 (cuts nothing) when keep is 0 — the unlimited-window sentinel", func() {
+		items := []*types.MessageItemUnion{user("1"), user("2"), user("3")}
+		Expect(compactionCut(items, 0)).To(Equal(0))
+	})
+
+	It("moves the boundary so a call/output pair is not split", func() {
+		// keep=2 -> naive cut=2, but items[2] is the output of items[1]'s call;
+		// pull the cut right so the whole pair stays in the kept tail.
+		items := []*types.MessageItemUnion{user("1"), call("c"), out("c"), user("4")}
+		Expect(compactionCut(items, 2)).To(Equal(1))
+	})
+})
+
+var _ = Describe("withMemory", func() {
+	It("inserts a memory system message when memory is non-empty", func() {
+		base := schema.Messages{{Role: "system", StringContent: "instructions"}}
+		out := withMemory(base, "user is Bob; wants pizza")
+		Expect(len(out)).To(Equal(2))
+		Expect(out[1].Role).To(Equal("system"))
+		Expect(out[1].StringContent).To(ContainSubstring("user is Bob"))
+		Expect(out[1].StringContent).To(ContainSubstring("Summary of earlier conversation"))
+	})
+
+	It("is a no-op when memory is empty", func() {
+		base := schema.Messages{{Role: "system", StringContent: "instructions"}}
+		Expect(withMemory(base, "")).To(HaveLen(1))
+	})
+})
+
+var _ = Describe("renderItemsTranscript", func() {
+	It("renders user and assistant text turns", func() {
+		items := []*types.MessageItemUnion{
+			{User: &types.MessageItemUser{Content: []types.MessageContentInput{{Type: types.MessageContentTypeInputText, Text: "hi"}}}},
+			{Assistant: &types.MessageItemAssistant{Content: []types.MessageContentOutput{{Type: types.MessageContentTypeText, Text: "hello"}}}},
+		}
+		out := renderItemsTranscript(items)
+		Expect(out).To(ContainSubstring("user: hi"))
+		Expect(out).To(ContainSubstring("assistant: hello"))
+	})
+
+	// Realtime assistant *audio* turns store the spoken words in .Transcript, not
+	// .Text, so the transcript builder must emit .Transcript too or spoken turns
+	// would be dropped from the summary.
+	It("renders an assistant audio turn from its transcript", func() {
+		items := []*types.MessageItemUnion{
+			{Assistant: &types.MessageItemAssistant{Content: []types.MessageContentOutput{{Type: types.MessageContentTypeAudio, Transcript: "spoken words"}}}},
+		}
+		Expect(renderItemsTranscript(items)).To(ContainSubstring("assistant: spoken words"))
+	})
+})
+
+var _ = Describe("buildSummaryMessages", func() {
+	It("includes prior memory and the new transcript", func() {
+		msgs := buildSummaryMessages("prior facts", "user: hi", 512)
+		Expect(len(msgs)).To(Equal(2))
+		Expect(msgs[0].Role).To(Equal("system"))
+		Expect(msgs[1].StringContent).To(ContainSubstring("prior facts"))
+		Expect(msgs[1].StringContent).To(ContainSubstring("user: hi"))
+	})
+})
+
+var _ = Describe("compact", func() {
+	user := func(id, text string) *types.MessageItemUnion {
+		return &types.MessageItemUnion{User: &types.MessageItemUser{ID: id,
+			Content: []types.MessageContentInput{{Type: types.MessageContentTypeInputText, Text: text}}}}
+	}
+
+	It("summarizes overflow into Memory and evicts it, keeping the live tail", func() {
+		conv := &Conversation{Items: []*types.MessageItemUnion{
+			user("1", "a"), user("2", "b"), user("3", "c"), user("4", "d"),
+			user("5", "e"), user("6", "f"), user("7", "g"), user("8", "h"),
+		}}
+		s := &Session{CompactionEnabled: true, CompactionTrigger: 7, MaxHistoryItems: 4, MaxSummaryTokens: 512}
+		m := &fakeModel{predictResp: backend.LLMResponse{Response: "ROLLED UP"}}
+
+		s.compact(conv, m)
+
+		Expect(conv.Memory).To(Equal("ROLLED UP"))
+		Expect(len(conv.Items)).To(Equal(4))
+		Expect(itemID(conv.Items[0])).To(Equal("5"))
+		// The summarizer saw the evicted turns.
+		Expect(m.lastMessages[1].StringContent).To(ContainSubstring("a"))
+	})
+
+	It("leaves Items and Memory untouched when the summarizer errors", func() {
+		items := []*types.MessageItemUnion{user("1", "a"), user("2", "b"), user("3", "c")}
+		conv := &Conversation{Items: items}
+		s := &Session{CompactionEnabled: true, CompactionTrigger: 2, MaxHistoryItems: 1, MaxSummaryTokens: 512}
+		m := &fakeModel{predictErr: errors.New("boom")}
+
+		s.compact(conv, m)
+
+		Expect(conv.Memory).To(Equal(""))
+		Expect(len(conv.Items)).To(Equal(3))
+	})
+
+	It("strips leaked reasoning tags from the summary via the shared extractor", func() {
+		conv := &Conversation{Items: []*types.MessageItemUnion{
+			user("1", "a"), user("2", "b"), user("3", "c"), user("4", "d"),
+			user("5", "e"), user("6", "f"), user("7", "g"), user("8", "h"),
+		}}
+		s := &Session{CompactionEnabled: true, CompactionTrigger: 7, MaxHistoryItems: 4, MaxSummaryTokens: 512}
+		m := &fakeModel{predictResp: backend.LLMResponse{Response: "<think>planning the summary</think>CLEAN SUMMARY"}}
+
+		s.compact(conv, m)
+
+		Expect(conv.Memory).To(Equal("CLEAN SUMMARY"))
+		Expect(conv.Memory).ToNot(ContainSubstring("planning"))
+	})
+
+	It("does nothing when items are at or below the trigger", func() {
+		conv := &Conversation{Items: []*types.MessageItemUnion{user("1", "a")}}
+		s := &Session{CompactionEnabled: true, CompactionTrigger: 7, MaxHistoryItems: 4}
+		s.compact(conv, &fakeModel{predictResp: backend.LLMResponse{Response: "x"}})
+		Expect(conv.Memory).To(Equal(""))
+		Expect(len(conv.Items)).To(Equal(1))
+	})
+})
+
+var _ = Describe("prefixMatches", func() {
+	user := func(id string) *types.MessageItemUnion {
+		return &types.MessageItemUnion{User: &types.MessageItemUser{ID: id}}
+	}
+
+	It("matches when items begins with the snapshot ids in order", func() {
+		items := []*types.MessageItemUnion{user("1"), user("2"), user("3")}
+		snap := []*types.MessageItemUnion{user("1"), user("2")}
+		Expect(prefixMatches(items, snap)).To(BeTrue())
+	})
+
+	It("matches an empty snapshot", func() {
+		Expect(prefixMatches([]*types.MessageItemUnion{user("1")}, nil)).To(BeTrue())
+	})
+
+	It("fails when items is shorter than the snapshot (a concurrent delete shrank the head)", func() {
+		items := []*types.MessageItemUnion{user("1")}
+		snap := []*types.MessageItemUnion{user("1"), user("2")}
+		Expect(prefixMatches(items, snap)).To(BeFalse())
+	})
+
+	It("fails when the head ids differ (a concurrent delete reordered the head)", func() {
+		items := []*types.MessageItemUnion{user("2"), user("3")}
+		snap := []*types.MessageItemUnion{user("1"), user("2")}
+		Expect(prefixMatches(items, snap)).To(BeFalse())
+	})
+})
+
+var _ = Describe("summarizerModel", func() {
+	It("returns the pipeline model when no summary_model is set", func() {
+		m := &fakeModel{}
+		s := &Session{ModelInterface: m}
+		Expect(s.summarizerModel()).To(Equal(m))
+	})
+
+	It("uses the factory (once) when summary_model is set", func() {
+		pipeline := &fakeModel{}
+		small := &fakeModel{}
+		calls := 0
+		s := &Session{ModelInterface: pipeline, SummaryModel: "tiny",
+			summarizerFactory: func() (Model, error) { calls++; return small, nil }}
+		Expect(s.summarizerModel()).To(Equal(small))
+		Expect(s.summarizerModel()).To(Equal(small))
+		Expect(calls).To(Equal(1))
+	})
+
+	It("falls back to the pipeline model when the factory errors", func() {
+		pipeline := &fakeModel{}
+		s := &Session{ModelInterface: pipeline, SummaryModel: "tiny",
+			summarizerFactory: func() (Model, error) { return nil, errors.New("nope") }}
+		Expect(s.summarizerModel()).To(Equal(pipeline))
+	})
+})
+
+var _ = Describe("itemID", func() {
+	It("returns the id for each variant and empty for nil", func() {
+		Expect(itemID(nil)).To(Equal(""))
+		Expect(itemID(&types.MessageItemUnion{User: &types.MessageItemUser{ID: "u1"}})).To(Equal("u1"))
+		Expect(itemID(&types.MessageItemUnion{Assistant: &types.MessageItemAssistant{ID: "a1"}})).To(Equal("a1"))
+		Expect(itemID(&types.MessageItemUnion{System: &types.MessageItemSystem{ID: "s1"}})).To(Equal("s1"))
+		Expect(itemID(&types.MessageItemUnion{FunctionCall: &types.MessageItemFunctionCall{ID: "f1"}})).To(Equal("f1"))
+		Expect(itemID(&types.MessageItemUnion{FunctionCallOutput: &types.MessageItemFunctionCallOutput{ID: "o1"}})).To(Equal("o1"))
+	})
+})
diff --git a/docs/content/features/openai-realtime.md b/docs/content/features/openai-realtime.md
index 48cfc9332..a6e99267e 100644
--- a/docs/content/features/openai-realtime.md
+++ b/docs/content/features/openai-realtime.md
@@ -68,6 +68,33 @@ pipeline:
 
 This is applied only to the realtime session's copy of the LLM config, so it does not affect other users of the same model. Leave it unset to use the LLM model config's own reasoning settings.
 
+### Conversation compaction (long sessions on CPU)
+
+By default a realtime session feeds only the last `max_history_items` turns to the LLM; older turns are dropped and forgotten. On CPU, long calls also grow expensive as the prompt fills with verbatim history. Enable `compaction` to instead fold older turns into a rolling summary, so long calls stay cheap without losing earlier context.
+
+Compaction works with two numbers:
+
+- **`max_history_items`** is the *live window* — the recent turns kept verbatim in the prompt.
+- **`compaction.trigger_items`** is the *high-water mark* — let the buffer grow to here, then summarize the overflow (everything above `max_history_items`) into a rolling memory and evict it. It must be greater than `max_history_items`; if it is not, it is clamped up.
+
+The gap between the two controls how often summarization runs: a summary call fires roughly every `(trigger_items - max_history_items)` turns (here, about every 6 turns).
+
+```yaml
+pipeline:
+  max_history_items: 6        # live window — recent turns kept verbatim
+  compaction:
+    enabled: true
+    trigger_items: 12         # summarize overflow back down to max_history_items
+    summary_model: ""         # optional: a small model for the summary (CPU); default = pipeline LLM
+    max_summary_tokens: 512
+```
+
+{{% notice tip %}}
+On CPU, set `summary_model` to a small, fast model so compaction never competes with the conversation LLM for compute. Left empty, the pipeline's own LLM produces the summary.
+{{% /notice %}}
+
+Clients can also manage history directly via the now-supported `conversation.item.delete`, `conversation.item.truncate`, and `input_audio_buffer.clear` realtime events.
+
 ## Transports
 
 The Realtime API supports two transports: **WebSocket** and **WebRTC**.

From 10184b5e2869ea67ebaec88500ff227c937859ac Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 22 Jun 2026 21:38:37 +0200
Subject: [PATCH 52/99] chore(deps): bump actions/checkout from 6 to 7 (#10451)

Bumps [actions/checkout](https://github.com/actions/checkout) from 6 to 7.
- [Release notes](https://github.com/actions/checkout/releases)
- [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md)
- [Commits](https://github.com/actions/checkout/compare/v6...v7)

---
updated-dependencies:
- dependency-name: actions/checkout
  dependency-version: '7'
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 .github/workflows/backend.yml                 |  2 +-
 .github/workflows/backend_build.yml           |  2 +-
 .github/workflows/backend_build_darwin.yml    |  2 +-
 .github/workflows/backend_merge.yml           |  2 +-
 .github/workflows/backend_pr.yml              |  2 +-
 .github/workflows/base-images.yml             |  2 +-
 .github/workflows/build-test.yaml             |  6 +-
 .github/workflows/bump-inference-defaults.yml |  2 +-
 .github/workflows/bump_deps.yaml              |  4 +-
 .github/workflows/bump_docs.yaml              |  2 +-
 .github/workflows/checksum_checker.yaml       |  2 +-
 .github/workflows/deploy-explorer.yaml        |  2 +-
 .github/workflows/gallery-agent.yaml          |  2 +-
 .github/workflows/generate_intel_image.yaml   |  2 +-
 .github/workflows/gh-pages.yml                |  2 +-
 .github/workflows/image_build.yml             |  2 +-
 .github/workflows/image_merge.yml             |  2 +-
 .github/workflows/lint.yml                    |  2 +-
 .github/workflows/release.yaml                |  6 +-
 .github/workflows/secscan.yaml                |  2 +-
 .github/workflows/test-extra.yml              | 86 +++++++++----------
 .github/workflows/test.yml                    |  4 +-
 .github/workflows/tests-aio.yml               |  2 +-
 .github/workflows/tests-e2e.yml               |  2 +-
 .github/workflows/tests-pii-ner-e2e.yml       |  2 +-
 .github/workflows/tests-ui-e2e.yml            |  2 +-
 .github/workflows/update_swagger.yaml         |  2 +-
 27 files changed, 75 insertions(+), 75 deletions(-)

diff --git a/.github/workflows/backend.yml b/.github/workflows/backend.yml
index b41c3d4dd..705768b59 100644
--- a/.github/workflows/backend.yml
+++ b/.github/workflows/backend.yml
@@ -44,7 +44,7 @@ jobs:
       has-merges-singlearch: ${{ steps.set-matrix.outputs['has-merges-singlearch'] }}
     steps:
       - name: Checkout repository
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
 
       - name: Setup Bun
         uses: oven-sh/setup-bun@v2
diff --git a/.github/workflows/backend_build.yml b/.github/workflows/backend_build.yml
index b3e177bd1..05d50cf82 100644
--- a/.github/workflows/backend_build.yml
+++ b/.github/workflows/backend_build.yml
@@ -101,7 +101,7 @@ jobs:
     steps:
 
       - name: Checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           submodules: true
 
diff --git a/.github/workflows/backend_build_darwin.yml b/.github/workflows/backend_build_darwin.yml
index 61f87eff6..749ffd4de 100644
--- a/.github/workflows/backend_build_darwin.yml
+++ b/.github/workflows/backend_build_darwin.yml
@@ -57,7 +57,7 @@ jobs:
       HOMEBREW_NO_ANALYTICS: '1'
     steps:
       - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           submodules: true
 
diff --git a/.github/workflows/backend_merge.yml b/.github/workflows/backend_merge.yml
index c05fece8d..37f606aa9 100644
--- a/.github/workflows/backend_merge.yml
+++ b/.github/workflows/backend_merge.yml
@@ -49,7 +49,7 @@ jobs:
       # Sparse checkout: the merge job needs `.github/scripts/` (for the
       # keepalive cleanup script) but none of the source tree.
       - name: Checkout (.github/scripts only)
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           sparse-checkout: |
             .github/scripts
diff --git a/.github/workflows/backend_pr.yml b/.github/workflows/backend_pr.yml
index e9520a548..9517651e4 100644
--- a/.github/workflows/backend_pr.yml
+++ b/.github/workflows/backend_pr.yml
@@ -23,7 +23,7 @@ jobs:
       has-merges-singlearch: ${{ steps.set-matrix.outputs['has-merges-singlearch'] }}
     steps:
       - name: Checkout repository
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
 
       - name: Setup Bun
         uses: oven-sh/setup-bun@v2
diff --git a/.github/workflows/base-images.yml b/.github/workflows/base-images.yml
index 6152e1c56..637b603a5 100644
--- a/.github/workflows/base-images.yml
+++ b/.github/workflows/base-images.yml
@@ -127,7 +127,7 @@ jobs:
             # the original l4t matrix entry which set skip-drivers: 'true'.
             skip-drivers: 'true'
     steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v7
         with:
           submodules: false
       - name: Free disk space
diff --git a/.github/workflows/build-test.yaml b/.github/workflows/build-test.yaml
index e634848eb..5b23ddf77 100644
--- a/.github/workflows/build-test.yaml
+++ b/.github/workflows/build-test.yaml
@@ -11,7 +11,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           fetch-depth: 0
       - name: Set up Go
@@ -25,7 +25,7 @@ jobs:
     runs-on: macos-latest
     steps:
       - name: Checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           fetch-depth: 0
       - name: Set up Go
@@ -47,7 +47,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           fetch-depth: 0
       - name: Configure apt mirror on runner
diff --git a/.github/workflows/bump-inference-defaults.yml b/.github/workflows/bump-inference-defaults.yml
index 50485b5f1..0b3afa454 100644
--- a/.github/workflows/bump-inference-defaults.yml
+++ b/.github/workflows/bump-inference-defaults.yml
@@ -14,7 +14,7 @@ jobs:
   bump:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v7
 
       - uses: actions/setup-go@v5
         with:
diff --git a/.github/workflows/bump_deps.yaml b/.github/workflows/bump_deps.yaml
index 481c9a609..aa4b21af7 100644
--- a/.github/workflows/bump_deps.yaml
+++ b/.github/workflows/bump_deps.yaml
@@ -92,7 +92,7 @@ jobs:
             file: "backend/go/vibevoice-cpp/Makefile"
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v7
       - name: Bump dependencies 🔧
         id: bump
         run: |
@@ -128,7 +128,7 @@ jobs:
     if: github.repository == 'mudler/LocalAI'
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v7
       - name: Bump vLLM cu130 wheel pin 🔧
         id: bump
         run: |
diff --git a/.github/workflows/bump_docs.yaml b/.github/workflows/bump_docs.yaml
index 1fe355580..444f7fed7 100644
--- a/.github/workflows/bump_docs.yaml
+++ b/.github/workflows/bump_docs.yaml
@@ -13,7 +13,7 @@ jobs:
           - repository: "mudler/LocalAI"
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v7
       - name: Bump dependencies 🔧
         run: |
           bash .github/bump_docs.sh ${{ matrix.repository }}
diff --git a/.github/workflows/checksum_checker.yaml b/.github/workflows/checksum_checker.yaml
index 4952f69c5..3652c65bb 100644
--- a/.github/workflows/checksum_checker.yaml
+++ b/.github/workflows/checksum_checker.yaml
@@ -8,7 +8,7 @@ jobs:
     if: github.repository == 'mudler/LocalAI'
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v7
       - name: Configure apt mirror on runner
         uses: ./.github/actions/configure-apt-mirror
       - name: Install dependencies
diff --git a/.github/workflows/deploy-explorer.yaml b/.github/workflows/deploy-explorer.yaml
index 5c2a0e354..7914d2054 100644
--- a/.github/workflows/deploy-explorer.yaml
+++ b/.github/workflows/deploy-explorer.yaml
@@ -16,7 +16,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           submodules: true
       - uses: actions/setup-go@v5
diff --git a/.github/workflows/gallery-agent.yaml b/.github/workflows/gallery-agent.yaml
index ceb87b2d8..dfe9d40fa 100644
--- a/.github/workflows/gallery-agent.yaml
+++ b/.github/workflows/gallery-agent.yaml
@@ -31,7 +31,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout repository
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           token: ${{ secrets.GITHUB_TOKEN }}
 
diff --git a/.github/workflows/generate_intel_image.yaml b/.github/workflows/generate_intel_image.yaml
index 27f1a0a3e..22627387f 100644
--- a/.github/workflows/generate_intel_image.yaml
+++ b/.github/workflows/generate_intel_image.yaml
@@ -44,7 +44,7 @@ jobs:
         uses: docker/setup-buildx-action@master
 
       - name: Checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
 
       - name: Cache Intel images
         uses: docker/build-push-action@v7
diff --git a/.github/workflows/gh-pages.yml b/.github/workflows/gh-pages.yml
index b21627ae1..9df4e4f6f 100644
--- a/.github/workflows/gh-pages.yml
+++ b/.github/workflows/gh-pages.yml
@@ -28,7 +28,7 @@ jobs:
       HUGO_VERSION: "0.146.3"
     steps:
       - name: Checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           fetch-depth: 0  # needed for enableGitInfo
           submodules: true
diff --git a/.github/workflows/image_build.yml b/.github/workflows/image_build.yml
index b953ddbb2..89bc4124f 100644
--- a/.github/workflows/image_build.yml
+++ b/.github/workflows/image_build.yml
@@ -80,7 +80,7 @@ jobs:
     steps:
 
       - name: Checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
 
       - name: Configure apt mirror on runner
         id: apt_mirror
diff --git a/.github/workflows/image_merge.yml b/.github/workflows/image_merge.yml
index 47b3f48a8..18d64d407 100644
--- a/.github/workflows/image_merge.yml
+++ b/.github/workflows/image_merge.yml
@@ -36,7 +36,7 @@ jobs:
       # Sparse checkout: needed for .github/scripts/ (the keepalive cleanup
       # script). Skips the rest of the source tree.
       - name: Checkout (.github/scripts only)
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           sparse-checkout: |
             .github/scripts
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index f9913229a..54d0a740c 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -20,7 +20,7 @@ jobs:
   golangci-lint:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v7
         with:
           # Full history so golangci-lint's new-from-merge-base can reach
           # origin/master and compute the diff against it.
diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index a94c64fd4..614c1de3e 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -10,7 +10,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           fetch-depth: 0
       - name: Set up Go
@@ -28,7 +28,7 @@ jobs:
     runs-on: macos-latest
     steps:
       - name: Checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           fetch-depth: 0
       - name: Set up Go
@@ -46,7 +46,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           fetch-depth: 0
       - name: Configure apt mirror on runner
diff --git a/.github/workflows/secscan.yaml b/.github/workflows/secscan.yaml
index a8bac30dd..b5bf8e2be 100644
--- a/.github/workflows/secscan.yaml
+++ b/.github/workflows/secscan.yaml
@@ -14,7 +14,7 @@ jobs:
       GO111MODULE: on
     steps:
       - name: Checkout Source
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         if: ${{ github.actor != 'dependabot[bot]' }}
       - name: Run Gosec Security Scanner
         if: ${{ github.actor != 'dependabot[bot]' }}
diff --git a/.github/workflows/test-extra.yml b/.github/workflows/test-extra.yml
index c02dcec44..650f464a2 100644
--- a/.github/workflows/test-extra.yml
+++ b/.github/workflows/test-extra.yml
@@ -50,7 +50,7 @@ jobs:
       parakeet-cpp: ${{ steps.detect.outputs.parakeet-cpp }}
     steps:
       - name: Checkout repository
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
       - name: Setup Bun
         uses: oven-sh/setup-bun@v2
       - name: Install dependencies
@@ -67,7 +67,7 @@ jobs:
   #   runs-on: ubuntu-latest
   #   steps:
   #     - name: Clone
-  #       uses: actions/checkout@v6
+  #       uses: actions/checkout@v7
   #       with:
   #         submodules: true
   #     - name: Dependencies
@@ -90,7 +90,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           submodules: true
       - name: Dependencies
@@ -113,7 +113,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           submodules: true
       - name: Dependencies
@@ -137,7 +137,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           submodules: true
       - name: Dependencies
@@ -158,7 +158,7 @@ jobs:
   #  runs-on: ubuntu-latest
   #  steps:
   #    - name: Clone
-  #      uses: actions/checkout@v6
+  #      uses: actions/checkout@v7
   #      with:
   #        submodules: true
   #    - name: Dependencies
@@ -178,7 +178,7 @@ jobs:
   #   runs-on: ubuntu-latest
   #   steps:
   #     - name: Clone
-  #       uses: actions/checkout@v6
+  #       uses: actions/checkout@v7
   #       with:
   #         submodules: true
   #     - name: Dependencies
@@ -240,7 +240,7 @@ jobs:
   #           sudo rm -rf "$AGENT_TOOLSDIRECTORY" || true
   #           df -h
   #     - name: Clone
-  #       uses: actions/checkout@v6
+  #       uses: actions/checkout@v7
   #       with:
   #         submodules: true
   #     - name: Dependencies
@@ -265,7 +265,7 @@ jobs:
   #   runs-on: ubuntu-latest
   #   steps:
   #     - name: Clone
-  #       uses: actions/checkout@v6
+  #       uses: actions/checkout@v7
   #       with:
   #         submodules: true
   #     - name: Dependencies
@@ -288,7 +288,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           submodules: true
       - name: Dependencies
@@ -309,7 +309,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           submodules: true
       - name: Dependencies
@@ -330,7 +330,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           submodules: true
       - name: Dependencies
@@ -351,7 +351,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           submodules: true
       - name: Dependencies
@@ -373,7 +373,7 @@ jobs:
   #   timeout-minutes: 45
   #   steps:
   #     - name: Clone
-  #       uses: actions/checkout@v6
+  #       uses: actions/checkout@v7
   #       with:
   #         submodules: true
   #     - name: Dependencies
@@ -394,7 +394,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           submodules: true
       - name: Dependencies
@@ -415,7 +415,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           submodules: true
       - name: Dependencies
@@ -436,7 +436,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           submodules: true
       - name: Dependencies
@@ -462,7 +462,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           submodules: true
       - name: Dependencies
@@ -484,7 +484,7 @@ jobs:
     timeout-minutes: 30
     steps:
       - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           submodules: true
       - name: Dependencies
@@ -513,7 +513,7 @@ jobs:
     timeout-minutes: 90
     steps:
       - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           submodules: true
       - name: Setup Go
@@ -530,7 +530,7 @@ jobs:
     timeout-minutes: 90
     steps:
       - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           submodules: true
       - name: Setup Go
@@ -552,7 +552,7 @@ jobs:
     timeout-minutes: 20
     steps:
       - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           submodules: true
       - name: Setup Go
@@ -579,7 +579,7 @@ jobs:
     timeout-minutes: 90
     steps:
       - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           submodules: true
       - name: Setup Go
@@ -604,7 +604,7 @@ jobs:
     timeout-minutes: 90
     steps:
       - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           submodules: true
       - name: Setup Go
@@ -625,7 +625,7 @@ jobs:
     timeout-minutes: 90
     steps:
       - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           submodules: true
       - name: Setup Go
@@ -645,7 +645,7 @@ jobs:
     timeout-minutes: 90
     steps:
       - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           submodules: true
       - name: Setup Go
@@ -664,7 +664,7 @@ jobs:
     timeout-minutes: 90
     steps:
       - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           submodules: true
       - name: Setup Go
@@ -681,7 +681,7 @@ jobs:
     timeout-minutes: 90
     steps:
       - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           submodules: true
       - name: Setup Go
@@ -698,7 +698,7 @@ jobs:
     timeout-minutes: 90
     steps:
       - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           submodules: true
       - name: Setup Go
@@ -741,7 +741,7 @@ jobs:
   #   timeout-minutes: 90
   #   steps:
   #     - name: Clone
-  #       uses: actions/checkout@v6
+  #       uses: actions/checkout@v7
   #       with:
   #         submodules: true
   #     - name: Dependencies
@@ -783,7 +783,7 @@ jobs:
   #   timeout-minutes: 90
   #   steps:
   #     - name: Clone
-  #       uses: actions/checkout@v6
+  #       uses: actions/checkout@v7
   #       with:
   #         submodules: true
   #     - name: Dependencies
@@ -808,7 +808,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           submodules: true
       - name: Dependencies
@@ -840,7 +840,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           submodules: true
       - name: Dependencies
@@ -876,7 +876,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           submodules: true
       - name: Dependencies
@@ -915,7 +915,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           submodules: true
       - name: Dependencies
@@ -952,7 +952,7 @@ jobs:
     timeout-minutes: 90
     steps:
       - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           submodules: true
       - name: Dependencies
@@ -987,7 +987,7 @@ jobs:
     timeout-minutes: 90
     steps:
       - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           submodules: true
       - name: Setup Go
@@ -1013,7 +1013,7 @@ jobs:
     timeout-minutes: 150
     steps:
       - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           submodules: true
       - name: Dependencies
@@ -1042,7 +1042,7 @@ jobs:
     timeout-minutes: 60
     steps:
       - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           submodules: true
       - name: Setup Go
@@ -1058,7 +1058,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           submodules: true
       - name: Dependencies
@@ -1091,7 +1091,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           submodules: true
       - name: Dependencies
@@ -1114,7 +1114,7 @@ jobs:
     timeout-minutes: 90
     steps:
       - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           submodules: true
       - name: Dependencies
@@ -1140,7 +1140,7 @@ jobs:
     timeout-minutes: 90
     steps:
       - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           submodules: true
       - name: Dependencies
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index e727261c1..df5512283 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -21,7 +21,7 @@ jobs:
         go-version: ['1.26.x']
     steps:
       - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           submodules: true
       - name: Free disk space
@@ -84,7 +84,7 @@ jobs:
         go-version: ['1.26.x']
     steps:
       - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           submodules: true
       - name: Setup Go ${{ matrix.go-version }}
diff --git a/.github/workflows/tests-aio.yml b/.github/workflows/tests-aio.yml
index 162389df5..f8d3d34f0 100644
--- a/.github/workflows/tests-aio.yml
+++ b/.github/workflows/tests-aio.yml
@@ -62,7 +62,7 @@ jobs:
           sudo rm -rfv build || true
           df -h
       - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           submodules: true
       - name: Dependencies
diff --git a/.github/workflows/tests-e2e.yml b/.github/workflows/tests-e2e.yml
index 90d7392d9..fb76a51e3 100644
--- a/.github/workflows/tests-e2e.yml
+++ b/.github/workflows/tests-e2e.yml
@@ -21,7 +21,7 @@ jobs:
         go-version: ['1.25.x']
     steps:
       - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           submodules: true
       - name: Configure apt mirror on runner
diff --git a/.github/workflows/tests-pii-ner-e2e.yml b/.github/workflows/tests-pii-ner-e2e.yml
index 2f95f3f46..800f67190 100644
--- a/.github/workflows/tests-pii-ner-e2e.yml
+++ b/.github/workflows/tests-pii-ner-e2e.yml
@@ -57,7 +57,7 @@ jobs:
         go-version: ['1.25.x']
     steps:
       - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           submodules: true
       - name: Free disk space
diff --git a/.github/workflows/tests-ui-e2e.yml b/.github/workflows/tests-ui-e2e.yml
index 99bb61e57..3c72f9dc0 100644
--- a/.github/workflows/tests-ui-e2e.yml
+++ b/.github/workflows/tests-ui-e2e.yml
@@ -23,7 +23,7 @@ jobs:
         go-version: ['1.26.x']
     steps:
       - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
         with:
           submodules: true
       - name: Configure apt mirror on runner
diff --git a/.github/workflows/update_swagger.yaml b/.github/workflows/update_swagger.yaml
index 4b8590f05..649722dbb 100644
--- a/.github/workflows/update_swagger.yaml
+++ b/.github/workflows/update_swagger.yaml
@@ -10,7 +10,7 @@ jobs:
       fail-fast: false
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v7
       - name: Configure apt mirror on runner
         uses: ./.github/actions/configure-apt-mirror
       - uses: actions/setup-go@v5

From 4755d676a39ea5ce236076934a80a15f04b0b0f4 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto <mudler@users.noreply.github.com>
Date: Mon, 22 Jun 2026 21:59:05 +0200
Subject: [PATCH 53/99] Revert "feat(ui): role and deployment-mode adaptive UI
 (landing, sidebar, top navbar)" (#10453)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Revert "feat(ui): role and deployment-mode adaptive UI (landing, sidebar, top…"

This reverts commit 9d54a599b0e5cc6bf1ce75603229bfbb73a0af92.
---
 .../react-ui/e2e/role-mode-adaptive.spec.js   | 100 ----------------
 core/http/react-ui/public/locales/en/nav.json |  10 --
 core/http/react-ui/src/App.css                |  44 -------
 core/http/react-ui/src/App.jsx                |   2 -
 .../react-ui/src/components/HomeRoute.jsx     |  28 -----
 core/http/react-ui/src/components/Sidebar.jsx |  58 ++-------
 .../react-ui/src/components/TopNavbar.jsx     |  96 ---------------
 .../src/components/navbar/TokenUsageMeter.jsx |  52 --------
 .../src/contexts/DeploymentContext.jsx        |  55 ---------
 core/http/react-ui/src/main.jsx               |   9 +-
 core/http/react-ui/src/pages/Chat.jsx         | 111 ++++++++----------
 core/http/react-ui/src/pages/Home.jsx         |  11 +-
 core/http/react-ui/src/router.jsx             |   5 +-
 .../react-ui/src/utils/launchAssistantChat.js |  19 ---
 core/http/react-ui/src/utils/resolveHome.js   |  11 --
 core/http/react-ui/src/utils/sidebarPolicy.js |  20 ----
 16 files changed, 72 insertions(+), 559 deletions(-)
 delete mode 100644 core/http/react-ui/e2e/role-mode-adaptive.spec.js
 delete mode 100644 core/http/react-ui/src/components/HomeRoute.jsx
 delete mode 100644 core/http/react-ui/src/components/TopNavbar.jsx
 delete mode 100644 core/http/react-ui/src/components/navbar/TokenUsageMeter.jsx
 delete mode 100644 core/http/react-ui/src/contexts/DeploymentContext.jsx
 delete mode 100644 core/http/react-ui/src/utils/launchAssistantChat.js
 delete mode 100644 core/http/react-ui/src/utils/resolveHome.js
 delete mode 100644 core/http/react-ui/src/utils/sidebarPolicy.js

diff --git a/core/http/react-ui/e2e/role-mode-adaptive.spec.js b/core/http/react-ui/e2e/role-mode-adaptive.spec.js
deleted file mode 100644
index 0e2e1b37b..000000000
--- a/core/http/react-ui/e2e/role-mode-adaptive.spec.js
+++ /dev/null
@@ -1,100 +0,0 @@
-import { test, expect } from './coverage-fixtures.js'
-
-// These specs stub /api/features and /api/auth/status per cell. The test server
-// disables auth (isAdmin=true) and reports its own features, so we intercept
-// before navigation to simulate each role x mode cell.
-
-function stubFeatures(page, features) {
-  return page.route('**/api/features', route =>
-    route.fulfill({ contentType: 'application/json', body: JSON.stringify(features) }))
-}
-
-function stubNoP2P(page) {
-  // P2P token endpoint returns empty -> p2pEnabled=false.
-  return page.route('**/api/p2p/token', route =>
-    route.fulfill({ contentType: 'text/plain', body: '' }))
-}
-
-test.describe('Adaptive landing (HomeRoute)', () => {
-  test('admin + distributed redirects /app to Nodes', async ({ page }) => {
-    await stubFeatures(page, { distributed: true })
-    await stubNoP2P(page)
-    await page.goto('/app')
-    await expect(page).toHaveURL(/\/app\/nodes$/)
-    await expect(page.locator('.page-title').first()).toBeVisible({ timeout: 15_000 })
-  })
-
-  test('admin + single-node stays on Home', async ({ page }) => {
-    await stubFeatures(page, { distributed: false })
-    await stubNoP2P(page)
-    await page.goto('/app')
-    await expect(page).toHaveURL(/\/app$/)
-    await expect(page.locator('.home-greeting')).toBeVisible({ timeout: 15_000 })
-  })
-})
-
-test.describe('Adaptive sidebar', () => {
-  test('distributed pins the Cluster group with Nodes at the top', async ({ page }) => {
-    await stubFeatures(page, { distributed: true })
-    await stubNoP2P(page)
-    await page.goto('/app/chat') // any in-app page so the sidebar is mounted
-    const pinned = page.locator('.sidebar-nav .sidebar-section-items').first()
-    await expect(pinned.getByText('Nodes', { exact: false })).toBeVisible({ timeout: 15_000 })
-  })
-
-  test('single-node does not pin a Cluster group', async ({ page }) => {
-    await stubFeatures(page, { distributed: false })
-    await stubNoP2P(page)
-    await page.goto('/app/chat')
-    // Nodes is reachable only via the Operate rail, not pinned at the top.
-    await expect(page.locator('.sidebar-nav')).toBeVisible({ timeout: 15_000 })
-    await expect(page.locator('.sidebar-nav .sidebar-section-items').first()
-      .getByText('Nodes', { exact: false })).toHaveCount(0)
-  })
-})
-
-test.describe('Top navbar', () => {
-  test('admin sees the mode pill and settings cog', async ({ page }) => {
-    await stubFeatures(page, { distributed: true })
-    await stubNoP2P(page)
-    await page.goto('/app/chat')
-    await expect(page.locator('.top-navbar__mode')).toBeVisible({ timeout: 15_000 })
-    await expect(page.locator('.top-navbar__icon[aria-label]')).not.toHaveCount(0)
-  })
-
-  test('admin-via-chat jump shows when localai_assistant is enabled', async ({ page }) => {
-    await stubFeatures(page, { distributed: false, localai_assistant: true })
-    await stubNoP2P(page)
-    await page.goto('/app/chat')
-    await expect(page.locator('.top-navbar__assistant')).toBeVisible({ timeout: 15_000 })
-  })
-
-  test('admin-via-chat jump hidden when localai_assistant is off', async ({ page }) => {
-    await stubFeatures(page, { distributed: false, localai_assistant: false })
-    await stubNoP2P(page)
-    await page.goto('/app/chat')
-    await expect(page.locator('.top-navbar__assistant')).toHaveCount(0)
-  })
-})
-
-test.describe('Token usage meter', () => {
-  test('renders when admin usage has data', async ({ page }) => {
-    await stubFeatures(page, { distributed: false })
-    await stubNoP2P(page)
-    await page.route('**/api/auth/admin/usage**', route =>
-      route.fulfill({ contentType: 'application/json',
-        body: JSON.stringify({ buckets: [{ total_tokens: 1234 }] }) }))
-    await page.goto('/app/chat')
-    await expect(page.locator('.top-navbar__meter')).toBeVisible({ timeout: 15_000 })
-  })
-
-  test('hidden when admin usage is empty (graceful degrade)', async ({ page }) => {
-    await stubFeatures(page, { distributed: false })
-    await stubNoP2P(page)
-    await page.route('**/api/auth/admin/usage**', route =>
-      route.fulfill({ contentType: 'application/json', body: JSON.stringify({ buckets: [] }) }))
-    await page.goto('/app/chat')
-    await expect(page.locator('.top-navbar')).toBeVisible({ timeout: 15_000 })
-    await expect(page.locator('.top-navbar__meter')).toHaveCount(0)
-  })
-})
diff --git a/core/http/react-ui/public/locales/en/nav.json b/core/http/react-ui/public/locales/en/nav.json
index 7317c74cd..5423438f9 100644
--- a/core/http/react-ui/public/locales/en/nav.json
+++ b/core/http/react-ui/public/locales/en/nav.json
@@ -12,16 +12,6 @@
   "accountSettings": "Account settings",
   "account": "Account",
   "accountFor": "Account: {{name}}",
-  "topbar": {
-    "label": "Top bar",
-    "modeDistributed": "Distributed",
-    "modeSwarm": "Swarm",
-    "modeSingle": "Single-node",
-    "pickModel": "Models",
-    "adminViaChat": "Admin via chat",
-    "tokensToday": "Tokens today",
-    "usageDetail": "View usage detail"
-  },
   "sections": {
     "create": "Create",
     "recognition": "Recognition",
diff --git a/core/http/react-ui/src/App.css b/core/http/react-ui/src/App.css
index 0238f2fb1..cf1a46bd3 100644
--- a/core/http/react-ui/src/App.css
+++ b/core/http/react-ui/src/App.css
@@ -184,50 +184,6 @@
   font-size: 1.5rem;
 }
 
-/* Desktop top bar: deployment + admin affordances on wide screens. Hidden on
-   mobile, where .mobile-header carries the equivalent actions. */
-.top-navbar {
-  display: flex;
-  align-items: center;
-  justify-content: space-between;
-  gap: var(--spacing-md);
-  padding: var(--spacing-sm) var(--spacing-lg);
-  border-bottom: 1px solid var(--color-border-default);
-  background: var(--color-bg-secondary);
-}
-.top-navbar__right { display: flex; align-items: center; gap: var(--spacing-sm); }
-.top-navbar__mode {
-  font-size: 0.75rem;
-  padding: 2px 10px;
-  border-radius: 999px;
-  border: 1px solid var(--color-border-default);
-  color: var(--color-text-secondary);
-}
-.top-navbar__mode.is-active { color: var(--color-success); border-color: var(--color-success); }
-.top-navbar__btn {
-  display: inline-flex; align-items: center; gap: 6px;
-  font-size: 0.8125rem; padding: 5px 10px; border-radius: 8px;
-  border: 1px solid var(--color-border-default); background: var(--color-bg-tertiary);
-  color: var(--color-text-primary); cursor: pointer;
-}
-.top-navbar__icon {
-  width: 32px; height: 32px; display: inline-flex; align-items: center;
-  justify-content: center; border-radius: 8px; border: 1px solid var(--color-border-default);
-  background: var(--color-bg-tertiary); color: var(--color-text-secondary); cursor: pointer;
-}
-.top-navbar__avatar img { width: 100%; height: 100%; border-radius: 50%; object-fit: cover; }
-.top-navbar__meter {
-  display: inline-flex; flex-direction: column; gap: 3px; align-items: flex-start;
-  padding: 4px 10px; border-radius: 8px; border: 1px solid var(--color-border-default);
-  background: var(--color-bg-tertiary); cursor: pointer; min-width: 150px;
-}
-.top-navbar__meter-label { font-size: 0.6875rem; color: var(--color-text-secondary); }
-.top-navbar__meter-bar { width: 100%; height: 5px; border-radius: 3px; background: var(--color-bg-secondary); overflow: hidden; }
-.top-navbar__meter-bar i { display: block; height: 100%; background: var(--color-primary); }
-@media (max-width: 639px) {
-  .top-navbar { display: none; }
-}
-
 /* Sidebar */
 .sidebar {
   position: fixed;
diff --git a/core/http/react-ui/src/App.jsx b/core/http/react-ui/src/App.jsx
index 37ebf384f..b922499b5 100644
--- a/core/http/react-ui/src/App.jsx
+++ b/core/http/react-ui/src/App.jsx
@@ -3,7 +3,6 @@ import { Outlet, useLocation, useNavigate } from 'react-router-dom'
 import { useTranslation } from 'react-i18next'
 import Sidebar from './components/Sidebar'
 import OperationsBar from './components/OperationsBar'
-import TopNavbar from './components/TopNavbar'
 import { ToastContainer, useToast } from './components/Toast'
 import { systemApi } from './utils/api'
 import { useTheme } from './contexts/ThemeContext'
@@ -99,7 +98,6 @@ export default function App() {
       <Sidebar isOpen={sidebarOpen} onClose={() => setSidebarOpen(false)} />
       <main className="main-content" {...(sidebarOpen ? { 'aria-hidden': 'true', inert: '' } : {})}>
         <OperationsBar />
-        <TopNavbar />
         {/* Mobile header — primary actions reachable without opening the
             drawer. Hamburger is the only way to expand the nav on phones;
             theme toggle and account avatar are mirrored from the sidebar
diff --git a/core/http/react-ui/src/components/HomeRoute.jsx b/core/http/react-ui/src/components/HomeRoute.jsx
deleted file mode 100644
index 6e0008d8f..000000000
--- a/core/http/react-ui/src/components/HomeRoute.jsx
+++ /dev/null
@@ -1,28 +0,0 @@
-import { lazy, Suspense } from 'react'
-import { Navigate } from 'react-router-dom'
-import { useAuth } from '../context/AuthContext'
-import { useDeployment } from '../contexts/DeploymentContext'
-import { resolveHome } from '../utils/resolveHome'
-import RouteFallback from './RouteFallback'
-
-const Home = lazy(() => import('../pages/Home'))
-
-// Index-route element. Waits for auth + deployment signals to load (so we never
-// flash the wrong landing), then either renders Home or redirects to the cell's
-// landing page. Redirecting (rather than rendering Nodes/Chat inline at /app)
-// keeps each target's own route guard, active-nav state, and deep-linkability.
-export default function HomeRoute() {
-  const { isAdmin, loading: authLoading } = useAuth()
-  const { distributed, p2pEnabled, loading: deployLoading } = useDeployment()
-
-  if (authLoading || deployLoading) return <RouteFallback />
-
-  const target = resolveHome({ isAdmin, distributed, p2pEnabled })
-  if (target) return <Navigate to={target} replace />
-
-  return (
-    <Suspense fallback={<RouteFallback />}>
-      <Home />
-    </Suspense>
-  )
-}
diff --git a/core/http/react-ui/src/components/Sidebar.jsx b/core/http/react-ui/src/components/Sidebar.jsx
index 679897e33..58438fd51 100644
--- a/core/http/react-ui/src/components/Sidebar.jsx
+++ b/core/http/react-ui/src/components/Sidebar.jsx
@@ -5,11 +5,9 @@ import ThemeToggle from './ThemeToggle'
 import LanguageSwitcher from './LanguageSwitcher'
 import { useAuth } from '../context/AuthContext'
 import { useBranding } from '../contexts/BrandingContext'
-import { useDeployment } from '../contexts/DeploymentContext'
 import { apiUrl } from '../utils/basePath'
 import { preloadRoute } from '../router'
 import { consoles, firstVisiblePath, consolePaths } from './console/consoleConfig'
-import { clusterPinItems, shouldCollapseCreate } from '../utils/sidebarPolicy'
 
 const COLLAPSED_KEY = 'localai_sidebar_collapsed'
 const SECTIONS_KEY = 'localai_sidebar_sections'
@@ -60,13 +58,11 @@ function NavItem({ item, onClose, collapsed }) {
   )
 }
 
-function loadSectionState(collapseCreate = false) {
-  // Tiers render expanded by default; users can collapse any tier and the
-  // choice persists (stored values override defaults). In cluster cells we
-  // start Create collapsed so the pinned cluster group leads - but only when
-  // the user has not already expressed a preference.
+function loadSectionState() {
+  // Tiers render expanded by default (the redesign favours showing the few
+  // intent groups up front); users can still collapse any tier and the choice
+  // is persisted. Stored values override the defaults so a saved collapse wins.
   const defaults = Object.fromEntries(sections.map(s => [s.id, true]))
-  if (collapseCreate) defaults.create = false
   try {
     const stored = localStorage.getItem(SECTIONS_KEY)
     return stored ? { ...defaults, ...JSON.parse(stored) } : defaults
@@ -81,34 +77,20 @@ function saveSectionState(state) {
 
 export default function Sidebar({ isOpen, onClose }) {
   const { t } = useTranslation('nav')
-  const { isAdmin, authEnabled, user, logout, hasFeature } = useAuth()
-  // Deployment shape (server features + p2p) drives the adaptive sidebar; the
-  // shared context replaces the sidebar's own /api/features fetch so the
-  // landing resolver, navbar, and this policy agree on one snapshot.
-  const deployment = useDeployment()
-  const features = deployment.features
-  // Shared shape for the console gating helpers (consoleConfig.js); in scope for
-  // both the pinned cluster group and the console-tier rendering below.
-  const auth = { isAdmin, authEnabled, hasFeature, features }
-  const collapseCreate = shouldCollapseCreate(auth, deployment)
+  const [features, setFeatures] = useState({})
   const [collapsed, setCollapsed] = useState(() => {
     try { return localStorage.getItem(COLLAPSED_KEY) === 'true' } catch (_) { return false }
   })
   const [openSections, setOpenSections] = useState(loadSectionState)
+  const { isAdmin, authEnabled, user, logout, hasFeature } = useAuth()
   const branding = useBranding()
   const navigate = useNavigate()
   const location = useLocation()
   const closeBtnRef = useRef(null)
 
-  // Apply the cluster-cell Create-collapse default once, only when the user has
-  // no stored section preference (so we never override an explicit choice).
   useEffect(() => {
-    if (deployment.loading) return
-    let hasStored = false
-    try { hasStored = !!localStorage.getItem(SECTIONS_KEY) } catch { hasStored = false }
-    if (hasStored || !collapseCreate) return
-    setOpenSections(prev => (prev.create === false ? prev : { ...prev, create: false }))
-  }, [deployment.loading, collapseCreate])
+    fetch(apiUrl('/api/features')).then(r => r.json()).then(setFeatures).catch(() => {})
+  }, [])
 
   // Stay in sync with external collapse dispatches (e.g. the chat
   // page's focus mode). The collapse-toggle button still owns the
@@ -175,6 +157,8 @@ export default function Sidebar({ isOpen, onClose }) {
   }
 
   const visibleTopItems = topItems.filter(filterItem)
+  // Shared shape for the console gating helpers (consoleConfig.js).
+  const auth = { isAdmin, authEnabled, hasFeature, features }
 
   // Inline sections (Create) carry no gating; a plain filterItem pass suffices.
   const getVisibleSectionItems = (section) => section.items.filter(filterItem)
@@ -215,28 +199,6 @@ export default function Sidebar({ isOpen, onClose }) {
             ))}
           </div>
 
-          {/* Pinned Cluster quick-access (admin + distributed/p2p). Same gate
-              as the Operate rail; surfaced at the top for cluster operators. */}
-          {(() => {
-            const pinned = clusterPinItems(auth, deployment)
-            if (pinned.length === 0) return null
-            return (
-              <div className="sidebar-section">
-                <div className="sidebar-section-title">{t('operate.cluster')}</div>
-                <div className="sidebar-section-items">
-                  {pinned.map(item => (
-                    <NavItem
-                      key={item.path}
-                      item={{ path: item.path, icon: item.icon, labelKey: item.labelKey }}
-                      onClose={onClose}
-                      collapsed={collapsed}
-                    />
-                  ))}
-                </div>
-              </div>
-            )
-          })()}
-
           {/* Collapsible sections */}
           {sections.map(section => {
             const visibleItems = getVisibleSectionItems(section)
diff --git a/core/http/react-ui/src/components/TopNavbar.jsx b/core/http/react-ui/src/components/TopNavbar.jsx
deleted file mode 100644
index a1227b0a9..000000000
--- a/core/http/react-ui/src/components/TopNavbar.jsx
+++ /dev/null
@@ -1,96 +0,0 @@
-import { useNavigate } from 'react-router-dom'
-import { useTranslation } from 'react-i18next'
-import { useAuth } from '../context/AuthContext'
-import { useDeployment } from '../contexts/DeploymentContext'
-import { useTheme } from '../contexts/ThemeContext'
-import { launchAssistantChat } from '../utils/launchAssistantChat'
-import TokenUsageMeter from './navbar/TokenUsageMeter'
-
-// Desktop top bar. Complementary to the mobile-only header in App.jsx: this is
-// hidden on small screens (see .top-navbar CSS) and shows deployment/admin
-// affordances on wide screens where the sidebar footer is far from the content.
-export default function TopNavbar() {
-  const { t } = useTranslation('nav')
-  const navigate = useNavigate()
-  const { isAdmin, authEnabled, user } = useAuth()
-  const { features, distributed, p2pEnabled } = useDeployment()
-  const { theme, toggleTheme } = useTheme()
-
-  const modeLabel = distributed
-    ? t('topbar.modeDistributed')
-    : p2pEnabled
-      ? t('topbar.modeSwarm')
-      : t('topbar.modeSingle')
-
-  const showAssistantJump = isAdmin && !!features.localai_assistant
-  const showAvatar = authEnabled && user
-  const themeLabel = theme === 'dark' ? t('switchToLightMode') : t('switchToDarkMode')
-
-  return (
-    <div className="top-navbar" role="navigation" aria-label={t('topbar.label')}>
-      <div className="top-navbar__left">
-        {isAdmin && (
-          <span className={`top-navbar__mode ${distributed || p2pEnabled ? 'is-active' : ''}`}>
-            <i className="fas fa-circle-nodes" aria-hidden="true" /> {modeLabel}
-          </span>
-        )}
-      </div>
-      <div className="top-navbar__right">
-        {!isAdmin && (
-          <button
-            type="button"
-            className="top-navbar__btn"
-            onClick={() => navigate('/app/chat')}
-            title={t('topbar.pickModel')}
-          >
-            <i className="fas fa-cube" aria-hidden="true" /> {t('topbar.pickModel')}
-          </button>
-        )}
-        {showAssistantJump && (
-          <button
-            type="button"
-            className="top-navbar__btn top-navbar__assistant"
-            onClick={() => launchAssistantChat(navigate)}
-            title={t('topbar.adminViaChat')}
-          >
-            <i className="fas fa-user-shield" aria-hidden="true" /> {t('topbar.adminViaChat')}
-          </button>
-        )}
-        {isAdmin && <TokenUsageMeter />}
-        {isAdmin && (
-          <button
-            type="button"
-            className="top-navbar__icon"
-            onClick={() => navigate('/app/settings')}
-            aria-label={t('items.settings')}
-            title={t('items.settings')}
-          >
-            <i className="fas fa-cog" aria-hidden="true" />
-          </button>
-        )}
-        <button
-          type="button"
-          className="top-navbar__icon"
-          onClick={toggleTheme}
-          aria-label={themeLabel}
-          title={themeLabel}
-        >
-          <i className={`fas ${theme === 'dark' ? 'fa-sun' : 'fa-moon'}`} aria-hidden="true" />
-        </button>
-        {showAvatar && (
-          <button
-            type="button"
-            className="top-navbar__icon top-navbar__avatar"
-            onClick={() => navigate('/app/account')}
-            aria-label={user.name || user.email}
-            title={user.name || user.email}
-          >
-            {user.avatarUrl
-              ? <img src={user.avatarUrl} alt="" />
-              : <i className="fas fa-user-circle" aria-hidden="true" />}
-          </button>
-        )}
-      </div>
-    </div>
-  )
-}
diff --git a/core/http/react-ui/src/components/navbar/TokenUsageMeter.jsx b/core/http/react-ui/src/components/navbar/TokenUsageMeter.jsx
deleted file mode 100644
index 3213abc59..000000000
--- a/core/http/react-ui/src/components/navbar/TokenUsageMeter.jsx
+++ /dev/null
@@ -1,52 +0,0 @@
-import { useState, useEffect } from 'react'
-import { useNavigate } from 'react-router-dom'
-import { useTranslation } from 'react-i18next'
-import { usageApi } from '../../utils/api'
-
-// Compact admin-only usage glance: today's total tokens, optionally against a
-// quota cap, linking to the full /app/usage page. Self-contained data fetch so
-// a usage-API failure cannot break the navbar - it just renders nothing.
-function sumTotalTokens(res) {
-  const buckets = res?.buckets || res?.usage || (Array.isArray(res) ? res : [])
-  if (!Array.isArray(buckets) || buckets.length === 0) return null
-  return buckets.reduce((s, b) => s + (b.total_tokens || 0), 0)
-}
-
-export default function TokenUsageMeter() {
-  const { t } = useTranslation('nav')
-  const navigate = useNavigate()
-  const [tokens, setTokens] = useState(null)
-  const [cap, setCap] = useState(null)
-
-  useEffect(() => {
-    let cancelled = false
-    usageApi.getAdminUsage('day')
-      .then(res => { if (!cancelled) setTokens(sumTotalTokens(res)) })
-      .catch(() => { if (!cancelled) setTokens(null) })
-    usageApi.getMyQuotas()
-      .then(q => { if (!cancelled) setCap(q?.token_limit || q?.tokens?.limit || null) })
-      .catch(() => { if (!cancelled) setCap(null) })
-    return () => { cancelled = true }
-  }, [])
-
-  if (tokens === null) return null
-
-  const pct = cap ? Math.min(100, Math.round((tokens / cap) * 100)) : null
-
-  return (
-    <button
-      type="button"
-      className="top-navbar__meter"
-      onClick={() => navigate('/app/usage')}
-      title={t('topbar.usageDetail')}
-    >
-      <span className="top-navbar__meter-label">
-        {t('topbar.tokensToday')}: {Intl.NumberFormat().format(tokens)}
-        {cap ? ` / ${Intl.NumberFormat().format(cap)}` : ''}
-      </span>
-      {pct !== null && (
-        <span className="top-navbar__meter-bar"><i style={{ width: `${pct}%` }} /></span>
-      )}
-    </button>
-  )
-}
diff --git a/core/http/react-ui/src/contexts/DeploymentContext.jsx b/core/http/react-ui/src/contexts/DeploymentContext.jsx
deleted file mode 100644
index ae0373620..000000000
--- a/core/http/react-ui/src/contexts/DeploymentContext.jsx
+++ /dev/null
@@ -1,55 +0,0 @@
-import { createContext, useContext, useState, useEffect } from 'react'
-import { apiUrl } from '../utils/basePath'
-import { p2pApi } from '../utils/api'
-
-const DeploymentContext = createContext(null)
-
-// One shared fetch of the deployment-shape signals the adaptive UI keys off:
-// server features (/api/features) and whether a P2P network token exists.
-// Components used to fetch /api/features independently (Sidebar, Home); this
-// centralises it so the landing resolver, sidebar policy, and navbar agree on
-// one snapshot and we issue a single request.
-export function DeploymentProvider({ children }) {
-  const [features, setFeatures] = useState({})
-  const [p2pEnabled, setP2pEnabled] = useState(false)
-  const [loading, setLoading] = useState(true)
-
-  useEffect(() => {
-    let cancelled = false
-    const featuresP = fetch(apiUrl('/api/features'))
-      .then(r => r.json())
-      .catch(() => ({}))
-    // P2P has no /api/features flag: it is "enabled" when a network token
-    // exists (mirrors pages/P2P.jsx). A 404/disabled endpoint throws and we
-    // treat that as not-enabled.
-    const p2pP = p2pApi.getToken()
-      .then(tok => (typeof tok === 'string' ? tok : (tok?.token || '')).trim())
-      .catch(() => '')
-    Promise.all([featuresP, p2pP]).then(([f, tok]) => {
-      if (cancelled) return
-      setFeatures(f || {})
-      setP2pEnabled(!!tok)
-      setLoading(false)
-    })
-    return () => { cancelled = true }
-  }, [])
-
-  const value = {
-    features,
-    distributed: !!features.distributed,
-    p2pEnabled,
-    loading,
-  }
-
-  return (
-    <DeploymentContext.Provider value={value}>
-      {children}
-    </DeploymentContext.Provider>
-  )
-}
-
-export function useDeployment() {
-  const ctx = useContext(DeploymentContext)
-  if (!ctx) throw new Error('useDeployment must be used within DeploymentProvider')
-  return ctx
-}
diff --git a/core/http/react-ui/src/main.jsx b/core/http/react-ui/src/main.jsx
index 8f1c025c4..28ce5fde1 100644
--- a/core/http/react-ui/src/main.jsx
+++ b/core/http/react-ui/src/main.jsx
@@ -4,7 +4,6 @@ import { RouterProvider } from 'react-router-dom'
 import { ThemeProvider } from './contexts/ThemeContext'
 import { BrandingProvider } from './contexts/BrandingContext'
 import { AuthProvider } from './context/AuthContext'
-import { DeploymentProvider } from './contexts/DeploymentContext'
 import { OperationsProvider } from './contexts/OperationsContext'
 import { router } from './router'
 import './i18n'
@@ -33,11 +32,9 @@ createRoot(document.getElementById('root')).render(
       <ThemeProvider>
         <BrandingProvider>
           <AuthProvider>
-            <DeploymentProvider>
-              <OperationsProvider>
-                <RouterProvider router={router} />
-              </OperationsProvider>
-            </DeploymentProvider>
+            <OperationsProvider>
+              <RouterProvider router={router} />
+            </OperationsProvider>
           </AuthProvider>
         </BrandingProvider>
       </ThemeProvider>
diff --git a/core/http/react-ui/src/pages/Chat.jsx b/core/http/react-ui/src/pages/Chat.jsx
index 296dd284a..675e15581 100644
--- a/core/http/react-ui/src/pages/Chat.jsx
+++ b/core/http/react-ui/src/pages/Chat.jsx
@@ -541,73 +541,58 @@ export default function Chat() {
     updateChatSettings(activeChat.id, { clientMCPServers: next })
   }, [activeChat, updateChatSettings])
 
-  // Load initial message / assistant launch from the Home page or the navbar
-  // quick-jump. Factored into a callback so both the mount-time reader and the
-  // navbar re-trigger event below consume the same payload through one path.
+  // Load initial message from home page
   const homeDataProcessed = useRef(false)
-  const consumeHomeChatData = useCallback(() => {
-    const stored = localStorage.getItem('localai_index_chat_data')
-    if (!stored) return
-    try {
-      const data = JSON.parse(stored)
-      localStorage.removeItem('localai_index_chat_data')
-
-      // Two entry shapes from Home:
-      //   - "compose-and-send": data.message present → open new chat,
-      //     prefill the composer, click submit.
-      //   - "open-assistant": no message, just data.localaiAssistant → open
-      //     a fresh chat already in admin mode so the wizard can fire.
-      const hasMessage = !!data.message
-      const wantsAssistant = !!data.localaiAssistant
-
-      if (hasMessage || wantsAssistant) {
-        let targetChat = activeChat
-        if (data.newChat) {
-          targetChat = addChat(data.model || '', '', data.mcpMode || false)
-        } else {
-          if (data.model && activeChat) {
-            updateChatSettings(activeChat.id, { model: data.model })
-          }
-          if (data.mcpMode && activeChat) {
-            updateChatSettings(activeChat.id, { mcpMode: true })
-          }
-        }
-        if (data.mcpServers?.length > 0 && targetChat) {
-          updateChatSettings(targetChat.id, { mcpServers: data.mcpServers })
-        }
-        if (data.clientMCPServers?.length > 0 && targetChat) {
-          updateChatSettings(targetChat.id, { clientMCPServers: data.clientMCPServers })
-        }
-        if (wantsAssistant && targetChat) {
-          updateChatSettings(targetChat.id, { localaiAssistant: true })
-        }
-        if (hasMessage) {
-          setInput(data.message)
-          if (data.files) setFiles(data.files)
-          setTimeout(() => {
-            const submitBtn = document.getElementById('chat-submit-btn')
-            submitBtn?.click()
-          }, 100)
-        }
-      }
-    } catch (_e) { /* ignore */ }
-  }, [activeChat, addChat, updateChatSettings])
-
   useEffect(() => {
     if (homeDataProcessed.current) return
-    homeDataProcessed.current = true
-    consumeHomeChatData()
-  }, [consumeHomeChatData])
+    const stored = localStorage.getItem('localai_index_chat_data')
+    if (stored) {
+      homeDataProcessed.current = true
+      try {
+        const data = JSON.parse(stored)
+        localStorage.removeItem('localai_index_chat_data')
 
-  // Admins can re-trigger the assistant jump from the navbar while already on
-  // the chat page; navigate('/app/chat') does not remount Chat, so the
-  // mount-time reader above never fires. The launcher dispatches this event
-  // after writing the payload so we re-consume it and open a fresh assistant.
-  useEffect(() => {
-    const onOpenAssistant = () => consumeHomeChatData()
-    window.addEventListener('localai-open-assistant', onOpenAssistant)
-    return () => window.removeEventListener('localai-open-assistant', onOpenAssistant)
-  }, [consumeHomeChatData])
+        // Two entry shapes from Home:
+        //   - "compose-and-send": data.message present → open new chat,
+        //     prefill the composer, click submit.
+        //   - "open-assistant": no message, just data.localaiAssistant → open
+        //     a fresh chat already in admin mode so the wizard can fire.
+        const hasMessage = !!data.message
+        const wantsAssistant = !!data.localaiAssistant
+
+        if (hasMessage || wantsAssistant) {
+          let targetChat = activeChat
+          if (data.newChat) {
+            targetChat = addChat(data.model || '', '', data.mcpMode || false)
+          } else {
+            if (data.model && activeChat) {
+              updateChatSettings(activeChat.id, { model: data.model })
+            }
+            if (data.mcpMode && activeChat) {
+              updateChatSettings(activeChat.id, { mcpMode: true })
+            }
+          }
+          if (data.mcpServers?.length > 0 && targetChat) {
+            updateChatSettings(targetChat.id, { mcpServers: data.mcpServers })
+          }
+          if (data.clientMCPServers?.length > 0 && targetChat) {
+            updateChatSettings(targetChat.id, { clientMCPServers: data.clientMCPServers })
+          }
+          if (wantsAssistant && targetChat) {
+            updateChatSettings(targetChat.id, { localaiAssistant: true })
+          }
+          if (hasMessage) {
+            setInput(data.message)
+            if (data.files) setFiles(data.files)
+            setTimeout(() => {
+              const submitBtn = document.getElementById('chat-submit-btn')
+              submitBtn?.click()
+            }, 100)
+          }
+        }
+      } catch (_e) { /* ignore */ }
+    }
+  }, [])
 
   // Track whether the user is pinned to the bottom. If they scroll up
   // while a response is streaming, stop forcing them back down.
diff --git a/core/http/react-ui/src/pages/Home.jsx b/core/http/react-ui/src/pages/Home.jsx
index 9e46a7254..2c36b03a6 100644
--- a/core/http/react-ui/src/pages/Home.jsx
+++ b/core/http/react-ui/src/pages/Home.jsx
@@ -13,7 +13,6 @@ import { useResources } from '../hooks/useResources'
 import { fileToBase64, backendControlApi, systemApi, modelsApi, mcpApi, nodesApi } from '../utils/api'
 import { API_CONFIG } from '../utils/config'
 import { greetingKey } from '../utils/greeting'
-import { launchAssistantChat } from '../utils/launchAssistantChat'
 import StatusPill from '../components/StatusPill'
 import Skeleton from '../components/Skeleton'
 import SectionHeading from '../components/SectionHeading'
@@ -229,8 +228,16 @@ export default function Home() {
   // requiring an initial message or model selection. Useful when an admin
   // wants to start the assistant from a cold home page.
   const openAssistantChat = useCallback(() => {
-    launchAssistantChat(navigate, selectedModel)
+    const chatData = {
+      model: selectedModel || '',
+      mcpMode: false,
+      localaiAssistant: true,
+      newChat: true,
+    }
+    localStorage.setItem('localai_index_chat_data', JSON.stringify(chatData))
+    try { localStorage.setItem('localai_assistant_used', '1') } catch { /* ignore */ }
     setAssistantUsed(true)
+    navigate('/app/chat')
   }, [navigate, selectedModel])
 
   const handleSubmit = (e) => {
diff --git a/core/http/react-ui/src/router.jsx b/core/http/react-ui/src/router.jsx
index e985c4195..03962d27e 100644
--- a/core/http/react-ui/src/router.jsx
+++ b/core/http/react-ui/src/router.jsx
@@ -6,7 +6,6 @@ import RequireAdmin from './components/RequireAdmin'
 import RequireAuth from './components/RequireAuth'
 import RequireAuthEnabled from './components/RequireAuthEnabled'
 import RequireFeature from './components/RequireFeature'
-import HomeRoute from './components/HomeRoute'
 
 // Pages are code-split: each becomes its own chunk loaded on demand, so a route
 // no longer drags every other page (and its heavy deps — CodeMirror, the MCP
@@ -33,7 +32,7 @@ export function preloadRoute(path) {
   preloaders[m[1] ?? '']?.().catch(() => { /* network blip — real click will retry */ })
 }
 
-page('', () => import('./pages/Home'))
+const Home = page('', () => import('./pages/Home'))
 const Chat = page('chat', () => import('./pages/Chat'))
 const Models = page('models', () => import('./pages/Models'))
 const Manage = page('manage', () => import('./pages/Manage'))
@@ -97,7 +96,7 @@ function Feature({ feature, children }) {
 }
 
 const appChildren = [
-  { index: true, element: <HomeRoute /> },
+  { index: true, element: <Home /> },
   { path: 'chat', element: <Chat /> },
   { path: 'chat/:model', element: <Chat /> },
   { path: 'image', element: <ImageGen /> },
diff --git a/core/http/react-ui/src/utils/launchAssistantChat.js b/core/http/react-ui/src/utils/launchAssistantChat.js
deleted file mode 100644
index 833d2ec56..000000000
--- a/core/http/react-ui/src/utils/launchAssistantChat.js
+++ /dev/null
@@ -1,19 +0,0 @@
-// Opens a fresh chat already in LocalAI Assistant ("manage") mode. Chat.jsx
-// reads localai_index_chat_data on mount and enables localaiAssistant for the
-// new chat. Shared by the Home CTA and the top navbar quick-jump so there is
-// one definition of how the assistant is launched.
-export function launchAssistantChat(navigate, model = '') {
-  const chatData = {
-    model: model || '',
-    mcpMode: false,
-    localaiAssistant: true,
-    newChat: true,
-  }
-  try { localStorage.setItem('localai_index_chat_data', JSON.stringify(chatData)) } catch { /* ignore */ }
-  try { localStorage.setItem('localai_assistant_used', '1') } catch { /* ignore */ }
-  navigate('/app/chat')
-  // When already on /app/chat, navigate() does not remount Chat, so its
-  // mount-time reader would never see the payload above. Signal the mounted
-  // Chat to re-consume it; harmless elsewhere since Chat reads on mount anyway.
-  try { window.dispatchEvent(new CustomEvent('localai-open-assistant')) } catch { /* ignore */ }
-}
diff --git a/core/http/react-ui/src/utils/resolveHome.js b/core/http/react-ui/src/utils/resolveHome.js
deleted file mode 100644
index 2353db5d8..000000000
--- a/core/http/react-ui/src/utils/resolveHome.js
+++ /dev/null
@@ -1,11 +0,0 @@
-// Pure landing-page resolver for the index route. Returns a target path, or ''
-// meaning "render the default Home". Admin precedence is distributed > p2p >
-// plain; non-admins always go to Chat (distributed/p2p are admin-only and
-// invisible to them). Visibility gates are enforced elsewhere - this only
-// chooses where /app lands.
-export function resolveHome({ isAdmin, distributed, p2pEnabled }) {
-  if (!isAdmin) return '/app/chat'
-  if (distributed) return '/app/nodes'
-  if (p2pEnabled) return '/app/p2p'
-  return ''
-}
diff --git a/core/http/react-ui/src/utils/sidebarPolicy.js b/core/http/react-ui/src/utils/sidebarPolicy.js
deleted file mode 100644
index da4cd392e..000000000
--- a/core/http/react-ui/src/utils/sidebarPolicy.js
+++ /dev/null
@@ -1,20 +0,0 @@
-import { operateConsole, isConsoleItemVisible } from '../components/console/consoleConfig'
-
-// The Operate > Cluster group, surfaced as a pinned top-of-sidebar quick-access
-// group when the admin is running a cluster (NATS-distributed) or a P2P swarm.
-// Items are filtered through the SAME gate as everywhere else, so e.g. in a
-// p2p-only deployment Nodes/Scheduling (feature: 'distributed') drop out and
-// only Swarm remains. Returns [] when the pin does not apply.
-export function clusterPinItems(auth, deployment) {
-  if (!auth.isAdmin) return []
-  if (!deployment.distributed && !deployment.p2pEnabled) return []
-  const group = operateConsole.groups.find(g => g.titleKey === 'operate.cluster')
-  if (!group) return []
-  return group.items.filter(item => isConsoleItemVisible(item, auth))
-}
-
-// In the cluster cells the Create group defaults collapsed so the pinned
-// cluster group leads. Users can still expand it; their stored choice wins.
-export function shouldCollapseCreate(auth, deployment) {
-  return !!auth.isAdmin && (!!deployment.distributed || !!deployment.p2pEnabled)
-}

From 56f8a6623f1d422c60a4d116f9cea6499ca7a860 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Mon, 22 Jun 2026 22:41:16 +0200
Subject: [PATCH 54/99] fix(galleryop): persist cancellable so restarted
 in-flight ops stay cancellable (#10454)

In distributed mode a model/backend install marks OpStatus.Cancellable=true
while downloading, but the gallery_operations row never recorded it:
UpdateStatus persisted only progress/status and Create left the cancellable
column at its zero value. After a replica restart Hydrate rebuilt the op with
cancellable=false, /api/operations reported false, and the UI hid the cancel
button - the orphaned op then lingered until the 30-minute stale reaper
expired it ("stays there on restart, can't cancel, after a bit it expires").

Persist the flag on every progress tick and at row creation (installs are
cancellable, deletes are not), and clear it on terminal transitions. A
rehydrated in-flight op is now cancellable, so an admin can dismiss the
orphaned op immediately instead of waiting out the reaper. The functional
cancel path already survived restart (CancelOperation persists store.Cancel
even with no live CancelFunc); this restores the UI affordance that drives it.


Assisted-by: Claude:claude-opus-4-8 [Claude Code]

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
---
 core/services/distributed/gallery.go          | 18 ++++--
 .../galleryop/cancellable_persist_test.go     | 56 +++++++++++++++++++
 core/services/galleryop/service.go            |  5 +-
 3 files changed, 73 insertions(+), 6 deletions(-)
 create mode 100644 core/services/galleryop/cancellable_persist_test.go

diff --git a/core/services/distributed/gallery.go b/core/services/distributed/gallery.go
index 7b1239e5a..d85fd76a8 100644
--- a/core/services/distributed/gallery.go
+++ b/core/services/distributed/gallery.go
@@ -79,21 +79,29 @@ func (s *GalleryStore) Create(op *GalleryOperationRecord) error {
 	}).Create(op).Error
 }
 
-// UpdateProgress updates progress for an operation.
-func (s *GalleryStore) UpdateProgress(id string, progress float64, message, downloadedSize string) error {
+// UpdateProgress updates progress for an operation. The cancellable flag is
+// persisted on every tick so a replica that restarts mid-install rehydrates the
+// op as still cancellable — otherwise the column keeps its Create-time zero
+// value (false), the UI hides the cancel button, and the orphaned op can only
+// be dismissed by waiting for the 30-minute stale reaper.
+func (s *GalleryStore) UpdateProgress(id string, progress float64, message, downloadedSize string, cancellable bool) error {
 	return s.db.Model(&GalleryOperationRecord{}).Where("id = ?", id).Updates(map[string]any{
 		"progress":             progress,
 		"message":              message,
 		"downloaded_file_size": downloadedSize,
+		"cancellable":          cancellable,
 		"updated_at":           time.Now(),
 	}).Error
 }
 
-// UpdateStatus updates the status of an operation.
+// UpdateStatus updates the status of an operation. A terminal status is never
+// cancellable, so the flag is cleared here to keep the persisted row consistent
+// with what the UI should offer.
 func (s *GalleryStore) UpdateStatus(id, status, errMsg string) error {
 	updates := map[string]any{
-		"status":     status,
-		"updated_at": time.Now(),
+		"status":      status,
+		"cancellable": false,
+		"updated_at":  time.Now(),
 	}
 	if errMsg != "" {
 		updates["error"] = errMsg
diff --git a/core/services/galleryop/cancellable_persist_test.go b/core/services/galleryop/cancellable_persist_test.go
new file mode 100644
index 000000000..6879dee92
--- /dev/null
+++ b/core/services/galleryop/cancellable_persist_test.go
@@ -0,0 +1,56 @@
+package galleryop_test
+
+import (
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/services/distributed"
+	"github.com/mudler/LocalAI/core/services/galleryop"
+	"github.com/mudler/LocalAI/core/services/testutil"
+)
+
+// Reproduces "an in-flight install can't be cancelled after a restart". The
+// live install path marks OpStatus.Cancellable=true on every progress tick, but
+// UpdateStatus persisted progress/status to the gallery store WITHOUT the
+// cancellable flag, and Create defaulted it to false. So after a replica
+// restart Hydrate rebuilt the op with Cancellable=false, /api/operations
+// reported cancellable:false, and the UI hid the cancel button — the orphaned
+// op lingered until the 30-minute stale reaper expired it. The cancellable
+// state must be persisted so a rehydrated in-flight op stays cancellable.
+var _ = Describe("GalleryService cancellable persistence across restart", func() {
+	It("rehydrates an in-flight op as still cancellable", func() {
+		db := testutil.SetupTestDB()
+		store, err := distributed.NewGalleryStore(db)
+		Expect(err).ToNot(HaveOccurred())
+
+		svc := galleryop.NewGalleryService(&config.ApplicationConfig{}, nil)
+		svc.SetGalleryStore(store)
+
+		// Seed the in-flight op row as the worker goroutine does on admission.
+		Expect(store.Create(&distributed.GalleryOperationRecord{
+			ID:                 "op-inflight",
+			GalleryElementName: "llama-cpp-development",
+			OpType:             "backend_install",
+			Status:             "pending",
+		})).To(Succeed())
+
+		// Simulate a progress tick: the live path always marks installs
+		// cancellable while they are downloading/processing.
+		svc.UpdateStatus("op-inflight", &galleryop.OpStatus{
+			Message:     "downloading",
+			Progress:    25,
+			Cancellable: true,
+		})
+
+		// A fresh replica boots and hydrates from the store.
+		fresh := galleryop.NewGalleryService(&config.ApplicationConfig{}, nil)
+		fresh.SetGalleryStore(store)
+		Expect(fresh.Hydrate()).To(Succeed())
+
+		st := fresh.GetStatus("op-inflight")
+		Expect(st).ToNot(BeNil(), "the in-flight op must hydrate after a restart")
+		Expect(st.Cancellable).To(BeTrue(),
+			"a still-active install must rehydrate as cancellable so the admin can dismiss it")
+	})
+})
diff --git a/core/services/galleryop/service.go b/core/services/galleryop/service.go
index 5b611d41e..d01d9cc19 100644
--- a/core/services/galleryop/service.go
+++ b/core/services/galleryop/service.go
@@ -167,7 +167,7 @@ func (g *GalleryService) UpdateStatus(s string, op *OpStatus) {
 				xlog.Warn("Failed to persist gallery operation status", "op_id", s, "error", err)
 			}
 		} else {
-			if err := store.UpdateProgress(s, op.Progress, op.Message, op.DownloadedFileSize); err != nil {
+			if err := store.UpdateProgress(s, op.Progress, op.Message, op.DownloadedFileSize, op.Cancellable); err != nil {
 				xlog.Warn("Failed to persist gallery operation progress", "op_id", s, "error", err)
 			}
 		}
@@ -467,6 +467,7 @@ func (g *GalleryService) Start(c context.Context, cl *config.ModelConfigLoader,
 						GalleryElementName: op.GalleryElementName,
 						OpType:             "backend_install",
 						Status:             "pending",
+						Cancellable:        true,
 					})
 				}
 				err := g.backendHandler(&op, systemState)
@@ -499,6 +500,8 @@ func (g *GalleryService) Start(c context.Context, cl *config.ModelConfigLoader,
 						GalleryElementName: op.GalleryElementName,
 						OpType:             opType,
 						Status:             "pending",
+						// A delete is not cancellable; an install is.
+						Cancellable: !op.Delete,
 					})
 				}
 				err := g.modelHandler(&op, cl, systemState)

From 69c16481c8f42c483afa92eb5a85312234fe594c Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Mon, 22 Jun 2026 23:45:22 +0200
Subject: [PATCH 55/99] fix(test): update e2e UpdateProgress calls for new
 cancellable arg (#10460)

PR #10454 added a `cancellable bool` parameter to GalleryStore.UpdateProgress
but missed two callers under tests/e2e/distributed, breaking the build on
master (golangci-lint and tests-e2e-backend both failed to compile with
"not enough arguments in call to ... UpdateProgress").

Pass cancellable=true (both ops are downloading installs, which are
cancellable) and assert the flag is persisted, exercising the new behavior.


Assisted-by: Claude:claude-opus-4-8 [Claude Code]

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
---
 tests/e2e/distributed/gallery_distributed_test.go | 5 +++--
 tests/e2e/distributed/phase4_test.go              | 3 ++-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/tests/e2e/distributed/gallery_distributed_test.go b/tests/e2e/distributed/gallery_distributed_test.go
index d975719d7..100063ed5 100644
--- a/tests/e2e/distributed/gallery_distributed_test.go
+++ b/tests/e2e/distributed/gallery_distributed_test.go
@@ -53,12 +53,13 @@ var _ = Describe("Gallery Distributed", Label("Distributed"), func() {
 			Expect(retrieved.Status).To(Equal("downloading"))
 			Expect(retrieved.FrontendID).To(Equal("f1"))
 
-			// Update progress
-			Expect(galleryStore.UpdateProgress(op.ID, 0.75, "75% complete", "6GB")).To(Succeed())
+			// Update progress (cancellable: a downloading install can be cancelled)
+			Expect(galleryStore.UpdateProgress(op.ID, 0.75, "75% complete", "6GB", true)).To(Succeed())
 
 			updated, _ := galleryStore.Get(op.ID)
 			Expect(updated.Progress).To(BeNumerically("~", 0.75, 0.01))
 			Expect(updated.Message).To(Equal("75% complete"))
+			Expect(updated.Cancellable).To(BeTrue())
 
 			// Complete
 			Expect(galleryStore.UpdateStatus(op.ID, "completed", "")).To(Succeed())
diff --git a/tests/e2e/distributed/phase4_test.go b/tests/e2e/distributed/phase4_test.go
index bfd03b9c3..5863b64e0 100644
--- a/tests/e2e/distributed/phase4_test.go
+++ b/tests/e2e/distributed/phase4_test.go
@@ -104,11 +104,12 @@ var _ = Describe("Phase 4: MCP, Skills, Gallery, Fine-Tuning", Label("Distribute
 			}
 			stores.Gallery.Create(op)
 
-			Expect(stores.Gallery.UpdateProgress(op.ID, 0.5, "50% complete", "2GB")).To(Succeed())
+			Expect(stores.Gallery.UpdateProgress(op.ID, 0.5, "50% complete", "2GB", true)).To(Succeed())
 
 			updated, _ := stores.Gallery.Get(op.ID)
 			Expect(updated.Progress).To(BeNumerically("~", 0.5, 0.01))
 			Expect(updated.Message).To(Equal("50% complete"))
+			Expect(updated.Cancellable).To(BeTrue())
 		})
 
 		It("should deduplicate concurrent downloads", func() {

From 9eedbf537ac1118ef6336838f7619ff254786e1f Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Tue, 23 Jun 2026 08:04:46 +0200
Subject: [PATCH 56/99] chore(model gallery): :robot: add 1 new models via
 gallery agent (#10461)

chore(model gallery): :robot: add new models via gallery agent

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
---
 gallery/index.yaml | 133 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 133 insertions(+)

diff --git a/gallery/index.yaml b/gallery/index.yaml
index 23135a510..bc0053289 100644
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -1,4 +1,137 @@
 ---
+- name: "kimi-k2.7-code"
+  url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
+  urls:
+    - https://huggingface.co/unsloth/Kimi-K2.7-Code-GGUF
+  description: |
+    ## 1. Model Introduction
+
+    Kimi K2.7 Code is a coding-focused agentic model built upon Kimi K2.6. With substantial improvements on real-world long-horizon coding tasks, it strengthens end-to-end task completion across complex software engineering workflows while improving token efficiency, reducing thinking-token usage by approximately 30% compared with Kimi K2.6.
+
+    ## 2. Model Summary
+
+    ## 3. Evaluation Results
+
+    Benchmark
+    Kimi K2.6
+    Kimi K2.7 Code
+    GPT-5.5
+    Claude Opus 4.8
+
+    Coding
+
+    Kimi Code Bench v2
+    50.9
+    62.0
+    69.0
+    67.4
+
+    Program Bench
+    48.3
+    53.6
+    69.1
+    63.8
+
+    MLS Bench Lite
+    26.7
+    35.1
+    35.5
+    42.8
+
+    Agentic
+
+    Kimi Claw 24/7 Bench
+    42.9
+    46.9
+    52.8
+    50.4
+
+    MCP Atlas
+    69.4
+    76.0
+    79.4
+    81.3
+
+    MCP Mark Verified
+    72.8
+    81.1
+    92.9
+    76.4
+
+    Footnotes
+
+    ...
+  license: "other"
+  tags:
+    - llm
+    - gguf
+  icon: https://huggingface.co/moonshotai/Kimi-K2.7-Code/resolve/main/figures/kimi-logo.png
+  overrides:
+    backend: llama-cpp
+    function:
+      automatic_tool_parsing_fallback: true
+      grammar:
+        disable: true
+    known_usecases:
+      - chat
+    mmproj: llama-cpp/mmproj/Kimi-K2.7-Code-GGUF/mmproj-F32.gguf
+    options:
+      - use_jinja:true
+    parameters:
+      min_p: 0.01
+      model: llama-cpp/models/Kimi-K2.7-Code-GGUF/Kimi-K2.7-Code-UD-Q8_K_XL-00001-of-00014.gguf
+      repeat_penalty: 1
+      temperature: 0.6
+      top_k: -1
+      top_p: 0.95
+    template:
+      use_tokenizer_template: true
+  files:
+    - filename: llama-cpp/models/Kimi-K2.7-Code-GGUF/Kimi-K2.7-Code-UD-Q8_K_XL-00001-of-00014.gguf
+      sha256: 65f0aca336f876902323a90e2aff32cac76d071b2cdd818c6a8d78be8fc2c680
+      uri: https://huggingface.co/unsloth/Kimi-K2.7-Code-GGUF/resolve/main/UD-Q8_K_XL/Kimi-K2.7-Code-UD-Q8_K_XL-00001-of-00014.gguf
+    - filename: llama-cpp/models/Kimi-K2.7-Code-GGUF/Kimi-K2.7-Code-UD-Q8_K_XL-00002-of-00014.gguf
+      sha256: 40f4416c130827a11502778891f4ef95b2144db90f51d63aa3548d0952a39683
+      uri: https://huggingface.co/unsloth/Kimi-K2.7-Code-GGUF/resolve/main/UD-Q8_K_XL/Kimi-K2.7-Code-UD-Q8_K_XL-00002-of-00014.gguf
+    - filename: llama-cpp/models/Kimi-K2.7-Code-GGUF/Kimi-K2.7-Code-UD-Q8_K_XL-00003-of-00014.gguf
+      sha256: ba2ba0b5168784ace7c752ecadfc3631279b2bb023824cb0fe9e2dab3dd28f22
+      uri: https://huggingface.co/unsloth/Kimi-K2.7-Code-GGUF/resolve/main/UD-Q8_K_XL/Kimi-K2.7-Code-UD-Q8_K_XL-00003-of-00014.gguf
+    - filename: llama-cpp/models/Kimi-K2.7-Code-GGUF/Kimi-K2.7-Code-UD-Q8_K_XL-00004-of-00014.gguf
+      sha256: 10298a6c98b13ef49be286fefbea8663e16473fb69bbeabe153bc80c60ae116e
+      uri: https://huggingface.co/unsloth/Kimi-K2.7-Code-GGUF/resolve/main/UD-Q8_K_XL/Kimi-K2.7-Code-UD-Q8_K_XL-00004-of-00014.gguf
+    - filename: llama-cpp/models/Kimi-K2.7-Code-GGUF/Kimi-K2.7-Code-UD-Q8_K_XL-00005-of-00014.gguf
+      sha256: 8e9e4c8e35d34fc4fef6bfb65a715ad7defbd196970d833c1df6924d701c88b3
+      uri: https://huggingface.co/unsloth/Kimi-K2.7-Code-GGUF/resolve/main/UD-Q8_K_XL/Kimi-K2.7-Code-UD-Q8_K_XL-00005-of-00014.gguf
+    - filename: llama-cpp/models/Kimi-K2.7-Code-GGUF/Kimi-K2.7-Code-UD-Q8_K_XL-00006-of-00014.gguf
+      sha256: ccff6e7f299742f82cf6f51a871e3eb3167511efaee967477cc8387f54d16442
+      uri: https://huggingface.co/unsloth/Kimi-K2.7-Code-GGUF/resolve/main/UD-Q8_K_XL/Kimi-K2.7-Code-UD-Q8_K_XL-00006-of-00014.gguf
+    - filename: llama-cpp/models/Kimi-K2.7-Code-GGUF/Kimi-K2.7-Code-UD-Q8_K_XL-00007-of-00014.gguf
+      sha256: 1a3b639633a2d22f71156a9f643ded2329cdd969cc21177b644b5741bac1af8e
+      uri: https://huggingface.co/unsloth/Kimi-K2.7-Code-GGUF/resolve/main/UD-Q8_K_XL/Kimi-K2.7-Code-UD-Q8_K_XL-00007-of-00014.gguf
+    - filename: llama-cpp/models/Kimi-K2.7-Code-GGUF/Kimi-K2.7-Code-UD-Q8_K_XL-00008-of-00014.gguf
+      sha256: bde28f682a1eab973538b2102007d952f37a13c1f7d55e2ed99177445ddc4282
+      uri: https://huggingface.co/unsloth/Kimi-K2.7-Code-GGUF/resolve/main/UD-Q8_K_XL/Kimi-K2.7-Code-UD-Q8_K_XL-00008-of-00014.gguf
+    - filename: llama-cpp/models/Kimi-K2.7-Code-GGUF/Kimi-K2.7-Code-UD-Q8_K_XL-00009-of-00014.gguf
+      sha256: b6a23a95b61e100f7593fa75e2363966323fa767b7e4fdf45d963b59e8fdc69f
+      uri: https://huggingface.co/unsloth/Kimi-K2.7-Code-GGUF/resolve/main/UD-Q8_K_XL/Kimi-K2.7-Code-UD-Q8_K_XL-00009-of-00014.gguf
+    - filename: llama-cpp/models/Kimi-K2.7-Code-GGUF/Kimi-K2.7-Code-UD-Q8_K_XL-00010-of-00014.gguf
+      sha256: fb10231c2e6d76921d40f22690f4aa08a8090c708edeaf7e581abafc24d3b25c
+      uri: https://huggingface.co/unsloth/Kimi-K2.7-Code-GGUF/resolve/main/UD-Q8_K_XL/Kimi-K2.7-Code-UD-Q8_K_XL-00010-of-00014.gguf
+    - filename: llama-cpp/models/Kimi-K2.7-Code-GGUF/Kimi-K2.7-Code-UD-Q8_K_XL-00011-of-00014.gguf
+      sha256: d2290be7ed1a22ac1f9f8a4813389689e075ce2ab8abc3aaaa1157a3cb1462d8
+      uri: https://huggingface.co/unsloth/Kimi-K2.7-Code-GGUF/resolve/main/UD-Q8_K_XL/Kimi-K2.7-Code-UD-Q8_K_XL-00011-of-00014.gguf
+    - filename: llama-cpp/models/Kimi-K2.7-Code-GGUF/Kimi-K2.7-Code-UD-Q8_K_XL-00012-of-00014.gguf
+      sha256: ce0d028314aa3fc783082dbca097e1055d69686a17ab8306574e2949568f26a5
+      uri: https://huggingface.co/unsloth/Kimi-K2.7-Code-GGUF/resolve/main/UD-Q8_K_XL/Kimi-K2.7-Code-UD-Q8_K_XL-00012-of-00014.gguf
+    - filename: llama-cpp/models/Kimi-K2.7-Code-GGUF/Kimi-K2.7-Code-UD-Q8_K_XL-00013-of-00014.gguf
+      sha256: 217864ce63a1d130ab39dcb0996b6097e1aa78eb896e38efaefdbbac3a00b7ec
+      uri: https://huggingface.co/unsloth/Kimi-K2.7-Code-GGUF/resolve/main/UD-Q8_K_XL/Kimi-K2.7-Code-UD-Q8_K_XL-00013-of-00014.gguf
+    - filename: llama-cpp/models/Kimi-K2.7-Code-GGUF/Kimi-K2.7-Code-UD-Q8_K_XL-00014-of-00014.gguf
+      sha256: eb7582ad7066c5eaa01bde95acb00b4ad9cd7b07cd50a6cf5c9ee427258bc9dd
+      uri: https://huggingface.co/unsloth/Kimi-K2.7-Code-GGUF/resolve/main/UD-Q8_K_XL/Kimi-K2.7-Code-UD-Q8_K_XL-00014-of-00014.gguf
+    - filename: llama-cpp/mmproj/Kimi-K2.7-Code-GGUF/mmproj-F32.gguf
+      sha256: b2cc50c8c13fe70fc4968a83332f31e9007ea09ebb9ae91d46a4e4cd2a3053cd
+      uri: https://huggingface.co/unsloth/Kimi-K2.7-Code-GGUF/resolve/main/mmproj-F32.gguf
 - name: "qwythos-9b-claude-mythos-5-1m"
   url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
   urls:

From 78880679146cddb63746569dcdfd0cff86b53757 Mon Sep 17 00:00:00 2001
From: Richard Palethorpe <io@richiejp.com>
Date: Tue, 23 Jun 2026 12:27:34 +0100
Subject: [PATCH 57/99] fix(settings): merge partial /api/settings updates
 instead of overwriting (#10463)

POST /api/settings rebuilt runtime_settings.json from only the request
body, so a focused admin page that submits a single field wiped every
other persisted setting. The Middleware proxy tab (mitm_listen) and
detector table (pii_default_detectors), plus the MCP SetBranding tool
(instance_name/instance_tagline), all POST partial bodies; the
no-omitempty api_keys and pii_default_detectors fields even round-tripped
as JSON null.

Read the persisted settings and overlay only the fields the request set
(RuntimeSettings.MergeNonNil) before writing. Every field is a pointer, so
the reflection-based merge is total over the struct and any field added
later is preserved automatically. Absent or null fields are now kept;
clearing a setting is done by sending its explicit empty/zero value
(api_keys [], mitm_listen "", etc.), unchanged from before. The full
Settings page sends every field, so its Save behaves identically.

Assisted-by: Claude:claude-opus-4-8 Claude-Code

Signed-off-by: Richard Palethorpe <io@richiejp.com>
---
 core/config/runtime_settings_persist.go      | 30 ++++++++++++++
 core/config/runtime_settings_persist_test.go | 42 +++++++++++++++++++
 core/http/endpoints/localai/settings.go      | 43 ++++++++++++--------
 core/http/endpoints/localai/settings_test.go | 37 +++++++++++++++++
 4 files changed, 135 insertions(+), 17 deletions(-)

diff --git a/core/config/runtime_settings_persist.go b/core/config/runtime_settings_persist.go
index bb5a4110b..98a923b40 100644
--- a/core/config/runtime_settings_persist.go
+++ b/core/config/runtime_settings_persist.go
@@ -5,6 +5,7 @@ import (
 	"errors"
 	"os"
 	"path/filepath"
+	"reflect"
 )
 
 // runtimeSettingsFile is the on-disk filename inside DynamicConfigsDir.
@@ -33,6 +34,35 @@ func (o *ApplicationConfig) ReadPersistedSettings() (RuntimeSettings, error) {
 	return settings, nil
 }
 
+// MergeNonNil overlays every set (non-nil) field of overlay onto the
+// receiver, leaving the receiver's value untouched wherever overlay left a
+// field unset. Every RuntimeSettings field is a pointer precisely so "set"
+// can be told apart from "absent" (see the type doc), which makes this a
+// faithful partial update: a caller that submits only the field it owns
+// changes exactly that field and never clobbers unrelated settings.
+//
+// This is the read-modify-write contract the persistence helpers exist for.
+// UpdateSettingsEndpoint reads the on-disk settings, merges the request body
+// on top, and writes the result — so a focused admin page that POSTs only its
+// own field (the Middleware page sends only mitm_listen; the detector table
+// only pii_default_detectors) no longer nulls every other setting.
+//
+// Reflection keeps the merge total over the struct: a field added to
+// RuntimeSettings later is merged automatically, so the persistence path can
+// never silently drop a new setting the way a hand-maintained field list
+// would. Non-pointer fields (none today) are skipped — they cannot express
+// "absent", so the receiver wins.
+func (s *RuntimeSettings) MergeNonNil(overlay RuntimeSettings) {
+	dst := reflect.ValueOf(s).Elem()
+	src := reflect.ValueOf(overlay)
+	for i := 0; i < src.NumField(); i++ {
+		f := src.Field(i)
+		if f.Kind() == reflect.Pointer && !f.IsNil() {
+			dst.Field(i).Set(f)
+		}
+	}
+}
+
 // WritePersistedSettings serialises the given RuntimeSettings to
 // runtime_settings.json with restricted permissions (it may carry API
 // keys and P2P tokens).
diff --git a/core/config/runtime_settings_persist_test.go b/core/config/runtime_settings_persist_test.go
index a36acb0d2..7f5eb07a8 100644
--- a/core/config/runtime_settings_persist_test.go
+++ b/core/config/runtime_settings_persist_test.go
@@ -12,6 +12,7 @@ import (
 )
 
 func strPtr(s string) *string { return &s }
+func boolPtr(b bool) *bool     { return &b }
 
 var _ = Describe("RuntimeSettings persistence helpers", func() {
 	var (
@@ -51,6 +52,47 @@ var _ = Describe("RuntimeSettings persistence helpers", func() {
 		})
 	})
 
+	// MergeNonNil is the partial-update primitive UpdateSettingsEndpoint
+	// relies on: a focused admin page POSTs only the field it owns, and the
+	// handler reads the on-disk settings and overlays the request on top.
+	// Without it, the body would be written verbatim and every field the
+	// caller omitted would be nulled (the reported regression: changing
+	// mitm_listen wiped the galleries, api keys, watchdog config, etc.).
+	Describe("MergeNonNil partial update", func() {
+		It("overlays set fields and preserves unset ones", func() {
+			base := config.RuntimeSettings{
+				MITMListen:          strPtr(":9000"),
+				Galleries:           &[]config.Gallery{{Name: "g1", URL: "http://example/g1"}},
+				WatchdogIdleEnabled: boolPtr(true),
+				ApiKeys:             &[]string{"persisted-key"},
+				PIIDefaultDetectors: &[]string{"det-a"},
+			}
+
+			// Simulate the Middleware proxy tab: only mitm_listen is sent.
+			overlay := config.RuntimeSettings{MITMListen: strPtr(":8443")}
+			base.MergeNonNil(overlay)
+
+			Expect(base.MITMListen).ToNot(BeNil())
+			Expect(*base.MITMListen).To(Equal(":8443"), "set field should be overlaid")
+			// Everything the overlay left unset must survive untouched.
+			Expect(base.Galleries).ToNot(BeNil(), "galleries were clobbered")
+			Expect(*base.Galleries).To(HaveLen(1))
+			Expect(base.WatchdogIdleEnabled).ToNot(BeNil())
+			Expect(*base.WatchdogIdleEnabled).To(BeTrue())
+			Expect(base.ApiKeys).ToNot(BeNil(), "api_keys were clobbered")
+			Expect(*base.ApiKeys).To(Equal([]string{"persisted-key"}))
+			Expect(base.PIIDefaultDetectors).ToNot(BeNil(), "pii_default_detectors were clobbered")
+			Expect(*base.PIIDefaultDetectors).To(Equal([]string{"det-a"}))
+		})
+
+		It("lets an explicit empty slice clear a field", func() {
+			base := config.RuntimeSettings{PIIDefaultDetectors: &[]string{"det-a"}}
+			base.MergeNonNil(config.RuntimeSettings{PIIDefaultDetectors: &[]string{}})
+			Expect(base.PIIDefaultDetectors).ToNot(BeNil())
+			Expect(*base.PIIDefaultDetectors).To(BeEmpty(), "an explicit empty slice should clear, not preserve")
+		})
+	})
+
 	// MITM round trip pins the contract that loadRuntimeSettingsFromFile
 	// MITM listener address must survive a write/read round trip so the
 	// next process restart can bring the listener back up. (Intercept
diff --git a/core/http/endpoints/localai/settings.go b/core/http/endpoints/localai/settings.go
index 7d970f820..be6358939 100644
--- a/core/http/endpoints/localai/settings.go
+++ b/core/http/endpoints/localai/settings.go
@@ -4,8 +4,6 @@ import (
 	"encoding/json"
 	"io"
 	"net/http"
-	"os"
-	"path/filepath"
 	"time"
 
 	"github.com/labstack/echo/v4"
@@ -110,6 +108,18 @@ func UpdateSettingsEndpoint(app *application.Application) echo.HandlerFunc {
 			})
 		}
 
+		// Read whatever is already persisted: it is both the source of truth
+		// for branding asset filenames (below) and the base we merge this
+		// request onto before writing. A read failure must not let a Save
+		// silently discard the existing settings — surface it instead.
+		persisted, err := appConfig.ReadPersistedSettings()
+		if err != nil {
+			return c.JSON(http.StatusInternalServerError, schema.SettingsResponse{
+				Success: false,
+				Error:   "Failed to read existing settings: " + err.Error(),
+			})
+		}
+
 		// Branding asset filenames are owned exclusively by
 		// /api/branding/asset/{kind} (upload/delete). The Settings page also
 		// round-trips them via GET /api/settings, but its local state is stale
@@ -118,11 +128,9 @@ func UpdateSettingsEndpoint(app *application.Application) echo.HandlerFunc {
 		// at page open. Replace whatever the body sent for these three fields
 		// with the values currently on disk so /api/settings can never
 		// regress them.
-		if existing, err := appConfig.ReadPersistedSettings(); err == nil {
-			settings.LogoFile = existing.LogoFile
-			settings.LogoHorizontalFile = existing.LogoHorizontalFile
-			settings.FaviconFile = existing.FaviconFile
-		}
+		settings.LogoFile = persisted.LogoFile
+		settings.LogoHorizontalFile = persisted.LogoHorizontalFile
+		settings.FaviconFile = persisted.FaviconFile
 
 		// The UI reads ApiKeys from GET /api/settings, which already returns the
 		// merged env+runtime list. When the user clicks Save, the same merged
@@ -145,16 +153,17 @@ func UpdateSettingsEndpoint(app *application.Application) echo.HandlerFunc {
 			settings.ApiKeys = &runtimeOnly
 		}
 
-		settingsFile := filepath.Join(appConfig.DynamicConfigsDir, "runtime_settings.json")
-		settingsJSON, err := json.MarshalIndent(settings, "", "  ")
-		if err != nil {
-			return c.JSON(http.StatusInternalServerError, schema.SettingsResponse{
-				Success: false,
-				Error:   "Failed to marshal settings: " + err.Error(),
-			})
-		}
-
-		if err := os.WriteFile(settingsFile, settingsJSON, 0600); err != nil {
+		// Persist as a partial update: overlay only the fields this request set
+		// onto the settings already on disk. Focused admin pages POST just the
+		// keys they own (the Middleware proxy tab sends only mitm_listen; the
+		// detector table only pii_default_detectors), so writing the request
+		// body verbatim would null every unrelated setting (the no-omitempty
+		// api_keys / pii_default_detectors fields even round-trip as JSON
+		// null). The full Settings page still round-trips every field, so its
+		// Save is unchanged.
+		toPersist := persisted
+		toPersist.MergeNonNil(settings)
+		if err := appConfig.WritePersistedSettings(toPersist); err != nil {
 			return c.JSON(http.StatusInternalServerError, schema.SettingsResponse{
 				Success: false,
 				Error:   "Failed to write settings file: " + err.Error(),
diff --git a/core/http/endpoints/localai/settings_test.go b/core/http/endpoints/localai/settings_test.go
index 25c84e1b7..7ba82e1a3 100644
--- a/core/http/endpoints/localai/settings_test.go
+++ b/core/http/endpoints/localai/settings_test.go
@@ -52,6 +52,10 @@ var _ = Describe("Settings endpoints", func() {
 		// Settings are persisted here; set after construction since there's no
 		// dedicated AppOption for it.
 		app.ApplicationConfig().DynamicConfigsDir = tmp
+		// Contain the MITM CA inside tmp too. The partial-save spec flips
+		// mitm_listen, which starts the listener and writes a CA; without this
+		// it defaults to ./mitm-ca and litters the package source tree.
+		app.ApplicationConfig().MITMCADir = filepath.Join(tmp, "mitm-ca")
 
 		e = echo.New()
 		e.GET("/api/settings", GetSettingsEndpoint(app))
@@ -109,6 +113,39 @@ var _ = Describe("Settings endpoints", func() {
 		Expect(err).ToNot(HaveOccurred())
 	})
 
+	// Regression: a focused admin page (the Middleware proxy tab) POSTs only
+	// the one field it owns — mitm_listen. The old handler wrote the request
+	// body verbatim, so every other persisted setting was dropped (and
+	// api_keys / pii_default_detectors, which lack omitempty, were written as
+	// null). A partial POST must now merge onto what is already on disk.
+	It("preserves unrelated persisted settings when a partial POST sets only mitm_listen", func() {
+		// First save establishes a fuller settings file (as the full Settings
+		// page would): galleries, an API key, and the MITM listener. The
+		// listener restart binds a real socket, so use 127.0.0.1:0 for an
+		// ephemeral free port rather than a fixed one that may be in use.
+		rec := post(`{"mitm_listen":"127.0.0.1:0","galleries":[{"name":"g1","url":"http://example/g1"}],"api_keys":["k1"],"pii_default_detectors":["det-a"]}`)
+		Expect(rec.Code).To(Equal(http.StatusOK), rec.Body.String())
+
+		// The Middleware proxy tab then changes only the listen address — the
+		// exact partial body that nulled everything else before the fix.
+		rec = post(`{"mitm_listen":"127.0.0.1:0"}`)
+		Expect(rec.Code).To(Equal(http.StatusOK), rec.Body.String())
+
+		raw, err := os.ReadFile(filepath.Join(tmp, "runtime_settings.json"))
+		Expect(err).ToNot(HaveOccurred())
+		var ondisk config.RuntimeSettings
+		Expect(json.Unmarshal(raw, &ondisk)).To(Succeed())
+
+		Expect(ondisk.MITMListen).ToNot(BeNil())
+		Expect(*ondisk.MITMListen).To(Equal("127.0.0.1:0"), "the changed field should be saved")
+		Expect(ondisk.Galleries).ToNot(BeNil(), "galleries were clobbered by the partial save")
+		Expect(*ondisk.Galleries).To(HaveLen(1))
+		Expect(ondisk.ApiKeys).ToNot(BeNil(), "api_keys were nulled by the partial save")
+		Expect(*ondisk.ApiKeys).To(Equal([]string{"k1"}))
+		Expect(ondisk.PIIDefaultDetectors).ToNot(BeNil(), "pii_default_detectors were nulled by the partial save")
+		Expect(*ondisk.PIIDefaultDetectors).To(Equal([]string{"det-a"}))
+	})
+
 	// Residual #9125: enabling the watchdog from a cold (off) state via the
 	// React master toggle must start the live watchdog immediately, without a
 	// restart. The toggle posts watchdog_idle_enabled/busy_enabled=true while

From 2edc4e25b3b395816fb09cbf40cfd208f63aee81 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Tue, 23 Jun 2026 13:27:51 +0200
Subject: [PATCH 58/99] chore: :arrow_up: Update ggml-org/whisper.cpp to
 `bae6bc02b1940bbfb87b6a0299c565e563b916d1` (#10459)

:arrow_up: Update ggml-org/whisper.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
---
 backend/go/whisper/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/go/whisper/Makefile b/backend/go/whisper/Makefile
index 9858b1d07..65c3d97d1 100644
--- a/backend/go/whisper/Makefile
+++ b/backend/go/whisper/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
 
 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp
-WHISPER_CPP_VERSION?=5ed76e9a079962f1c85cfce44edd325c27ef1f97
+WHISPER_CPP_VERSION?=bae6bc02b1940bbfb87b6a0299c565e563b916d1
 SO_TARGET?=libgowhisper.so
 
 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF

From 1d49041c85d936fed61529818b023518fd7069cd Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Tue, 23 Jun 2026 13:28:09 +0200
Subject: [PATCH 59/99] chore: :arrow_up: Update ggml-org/llama.cpp to
 `73618f27a801c0b8614ceaf3547d3c2a99baae14` (#10458)

:arrow_up: Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
---
 backend/cpp/llama-cpp/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/cpp/llama-cpp/Makefile b/backend/cpp/llama-cpp/Makefile
index f6e89c5ea..122a30c71 100644
--- a/backend/cpp/llama-cpp/Makefile
+++ b/backend/cpp/llama-cpp/Makefile
@@ -1,5 +1,5 @@
 
-LLAMA_VERSION?=7c082bc417bbe53210a83df4ba5b49e18ce6193c
+LLAMA_VERSION?=73618f27a801c0b8614ceaf3547d3c2a99baae14
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
 
 CMAKE_ARGS?=

From 67c88898662e388bbb29e7546121d1a28f0ef77e Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Tue, 23 Jun 2026 13:28:49 +0200
Subject: [PATCH 60/99] chore: :arrow_up: Update CrispStrobe/CrispASR to
 `63b57289255267edf66e43e33bc3911e04a2e92d` (#10455)

:arrow_up: Update CrispStrobe/CrispASR

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
---
 backend/go/crispasr/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/go/crispasr/Makefile b/backend/go/crispasr/Makefile
index 21f66c240..27e3b30d2 100644
--- a/backend/go/crispasr/Makefile
+++ b/backend/go/crispasr/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
 
 # CrispASR version (release tag)
 CRISPASR_REPO?=https://github.com/CrispStrobe/CrispASR
-CRISPASR_VERSION?=7a8cb80907341c0204bd0488c1244764f4163883
+CRISPASR_VERSION?=63b57289255267edf66e43e33bc3911e04a2e92d
 SO_TARGET?=libgocrispasr.so
 
 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF

From 06a7b6cadb82b1691c4823c5d1abe4ff76b9c17f Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Tue, 23 Jun 2026 13:29:07 +0200
Subject: [PATCH 61/99] chore: :arrow_up: Update leejet/stable-diffusion.cpp to
 `f440ad9c29dd8bc34e5d1f4b863832b96d6ea05f` (#10457)

:arrow_up: Update leejet/stable-diffusion.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
---
 backend/go/stablediffusion-ggml/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/go/stablediffusion-ggml/Makefile b/backend/go/stablediffusion-ggml/Makefile
index f77baccad..05b57b254 100644
--- a/backend/go/stablediffusion-ggml/Makefile
+++ b/backend/go/stablediffusion-ggml/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
 
 # stablediffusion.cpp (ggml)
 STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
-STABLEDIFFUSION_GGML_VERSION?=b12098f5d09fc83da36e65c784f7bdb16a5a5ebf
+STABLEDIFFUSION_GGML_VERSION?=f440ad9c29dd8bc34e5d1f4b863832b96d6ea05f
 
 CMAKE_ARGS+=-DGGML_MAX_NAME=128
 

From dd8c8778e24f07754deec428ac768b0277e9c454 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Tue, 23 Jun 2026 15:43:21 +0200
Subject: [PATCH 62/99] chore(model gallery): :robot: add 1 new models via
 gallery agent (#10464)

chore(model gallery): :robot: add new models via gallery agent

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
---
 gallery/index.yaml | 59 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)

diff --git a/gallery/index.yaml b/gallery/index.yaml
index bc0053289..29f284ad8 100644
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -1,4 +1,63 @@
 ---
+- name: "qwopus3.6-27b-coder-compat-mtp"
+  url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
+  urls:
+    - https://huggingface.co/Jackrong/Qwopus3.6-27B-Coder-Compat-MTP-GGUF
+  description: |
+    🪐 Qwopus-3.6-27B-Coder
+    Coder SFT Release
+
+    Agentic Coding &amp; Tool-Use Reasoning Model Fine-Tuned on Qwopus3.6-27B-v2
+
+    🧬 Trace Inversion & Negentropy
+    🧠 27B Dense Model
+    ⚡ Agentic Coding
+    🛠️ Tool Calling & Agent
+    🏆 SWE-bench Verified: 67.0% (off-thinking)
+
+    💡 What is Qwopus-3.6-27B-Coder?
+    🪐 Qwopus-3.6-27B-Coder is a reasoning-enhanced agentic coding model built on top of Qwopus3.6-27B-v2. It inherits the powerful reasoning foundation of the v2 base — which achieved 87.43% MMLU-Pro and 75.25% SWE-bench Verified — and further specializes it for agentic code generation, structured tool calling, debugging, and instruction-following in developer workflows. The model is designed to excel at repository-level coding tasks, multi-turn tool orchestration, and complex logical reasoning under realistic agent environments.
+
+    🧩 Agentic Coding
+    Optimized for repository-level coding, debugging, patch generation, and structured multi-step development workflows.
+
+    🛠️ Tool Calling
+    Learns from real agent trajectories with tool definitions, tool calls, and environment feedback for robust multi-turn execution.
+
+    ...
+  license: "apache-2.0"
+  tags:
+    - llm
+    - gguf
+    - vision
+    - multimodal
+    - reasoning
+  icon: https://cdn-uploads.huggingface.co/production/uploads/66309bd090589b7c65950665/sGQKmrMc6L6guMoaB5_Y2.png
+  overrides:
+    backend: llama-cpp
+    function:
+      automatic_tool_parsing_fallback: true
+      grammar:
+        disable: true
+    known_usecases:
+      - chat
+    mmproj: llama-cpp/mmproj/Qwopus3.6-27B-Coder-Compat-MTP-GGUF/mmproj-F32.gguf
+    options:
+      - use_jinja:true
+      - spec_type:draft-mtp
+      - spec_n_max:6
+      - spec_p_min:0.75
+    parameters:
+      model: llama-cpp/models/Qwopus3.6-27B-Coder-Compat-MTP-GGUF/Qwopus3.6-27B-Coder-Compat-MTP-Q4_K_M.gguf
+    template:
+      use_tokenizer_template: true
+  files:
+    - filename: llama-cpp/models/Qwopus3.6-27B-Coder-Compat-MTP-GGUF/Qwopus3.6-27B-Coder-Compat-MTP-Q4_K_M.gguf
+      sha256: f893632170124da60e159b7bcc9d91e1cda3014b2c6b8ad9c6cde38a1fcd2f6f
+      uri: https://huggingface.co/Jackrong/Qwopus3.6-27B-Coder-Compat-MTP-GGUF/resolve/main/Qwopus3.6-27B-Coder-Compat-MTP-Q4_K_M.gguf
+    - filename: llama-cpp/mmproj/Qwopus3.6-27B-Coder-Compat-MTP-GGUF/mmproj-F32.gguf
+      sha256: 32f7ea0600c07272547da401d460f8abbd980f3a57b69d6df87be0e2505e0b9c
+      uri: https://huggingface.co/Jackrong/Qwopus3.6-27B-Coder-Compat-MTP-GGUF/resolve/main/mmproj-F32.gguf
 - name: "kimi-k2.7-code"
   url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
   urls:

From deb430f3ecec3a726ed8a377ef9a1aeef2f2747e Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Tue, 23 Jun 2026 23:15:47 +0200
Subject: [PATCH 63/99] chore(model-gallery): :arrow_up: update checksum
 (#10469)

:arrow_up: Checksum updates in gallery/index.yaml

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
---
 gallery/index.yaml | 97 +++-------------------------------------------
 1 file changed, 5 insertions(+), 92 deletions(-)

diff --git a/gallery/index.yaml b/gallery/index.yaml
index 29f284ad8..255e828f5 100644
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -3,28 +3,7 @@
   url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
   urls:
     - https://huggingface.co/Jackrong/Qwopus3.6-27B-Coder-Compat-MTP-GGUF
-  description: |
-    🪐 Qwopus-3.6-27B-Coder
-    Coder SFT Release
-
-    Agentic Coding &amp; Tool-Use Reasoning Model Fine-Tuned on Qwopus3.6-27B-v2
-
-    🧬 Trace Inversion & Negentropy
-    🧠 27B Dense Model
-    ⚡ Agentic Coding
-    🛠️ Tool Calling & Agent
-    🏆 SWE-bench Verified: 67.0% (off-thinking)
-
-    💡 What is Qwopus-3.6-27B-Coder?
-    🪐 Qwopus-3.6-27B-Coder is a reasoning-enhanced agentic coding model built on top of Qwopus3.6-27B-v2. It inherits the powerful reasoning foundation of the v2 base — which achieved 87.43% MMLU-Pro and 75.25% SWE-bench Verified — and further specializes it for agentic code generation, structured tool calling, debugging, and instruction-following in developer workflows. The model is designed to excel at repository-level coding tasks, multi-turn tool orchestration, and complex logical reasoning under realistic agent environments.
-
-    🧩 Agentic Coding
-    Optimized for repository-level coding, debugging, patch generation, and structured multi-step development workflows.
-
-    🛠️ Tool Calling
-    Learns from real agent trajectories with tool definitions, tool calls, and environment feedback for robust multi-turn execution.
-
-    ...
+  description: "\U0001FA90 Qwopus-3.6-27B-Coder\nCoder SFT Release\n\nAgentic Coding &amp; Tool-Use Reasoning Model Fine-Tuned on Qwopus3.6-27B-v2\n\n\U0001F9EC Trace Inversion & Negentropy\n\U0001F9E0 27B Dense Model\n⚡ Agentic Coding\n\U0001F6E0️ Tool Calling & Agent\n\U0001F3C6 SWE-bench Verified: 67.0% (off-thinking)\n\n\U0001F4A1 What is Qwopus-3.6-27B-Coder?\n\U0001FA90 Qwopus-3.6-27B-Coder is a reasoning-enhanced agentic coding model built on top of Qwopus3.6-27B-v2. It inherits the powerful reasoning foundation of the v2 base — which achieved 87.43% MMLU-Pro and 75.25% SWE-bench Verified — and further specializes it for agentic code generation, structured tool calling, debugging, and instruction-following in developer workflows. The model is designed to excel at repository-level coding tasks, multi-turn tool orchestration, and complex logical reasoning under realistic agent environments.\n\n\U0001F9E9 Agentic Coding\nOptimized for repository-level coding, debugging, patch generation, and structured multi-step development workflows.\n\n\U0001F6E0️ Tool Calling\nLearns from real agent trajectories with tool definitions, tool calls, and environment feedback for robust multi-turn execution.\n\n...\n"
   license: "apache-2.0"
   tags:
     - llm
@@ -241,33 +220,7 @@
   url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
   urls:
     - https://huggingface.co/unsloth/GLM-5.2-GGUF
-  description: |
-    # GLM-5.2
-
-    👋 Join our WeChat or Discord community.
-
-    📖 Check out the GLM-5.2 blog and GLM-5 Technical report.
-
-    📍 Use GLM-5.2 API services on Z.ai API Platform.
-
-    🔜 Try GLM-5.2 here.
-
-    [Paper]
-    [GitHub]
-
-    ## Introduction
-
-    We're introducing GLM-5.2, our latest flagship model for long-horizon tasks. It marks a substantial leap in long-horizon task capability over its predecessor GLM-5.1 and, for the first time, delivers that capability on a **solid 1M-token context**. GLM-5.2's new capabilities include:
-      - **Solid 1M Context:** A solid 1M-token context that stably sustains long-horizon work
-      - **Advanced Coding with Flexible Effort**: Stronger coding capabilities with multiple thinking effort levels to balance performance and latency
-      - **Improved Architecture**: We propose IndexShare, which reuses the same indexer across every four sparse attention layers, reducing per-token FLOPs by 2.9× at a 1M context length. We also improve GLM-5.2’s MTP layer for speculative decoding, increasing the acceptance length by up to 20%
-      - **Pure Open**: An MIT open-source license — no regional limits, technical access without borders
-
-    ## Benchmark
-
-    ## Serve GLM-5.2 Locally
-
-    ...
+  description: "# GLM-5.2\n\n\U0001F44B Join our WeChat or Discord community.\n\n\U0001F4D6 Check out the GLM-5.2 blog and GLM-5 Technical report.\n\n\U0001F4CD Use GLM-5.2 API services on Z.ai API Platform.\n\n\U0001F51C Try GLM-5.2 here.\n\n[Paper]\n[GitHub]\n\n## Introduction\n\nWe're introducing GLM-5.2, our latest flagship model for long-horizon tasks. It marks a substantial leap in long-horizon task capability over its predecessor GLM-5.1 and, for the first time, delivers that capability on a **solid 1M-token context**. GLM-5.2's new capabilities include:\n  - **Solid 1M Context:** A solid 1M-token context that stably sustains long-horizon work\n  - **Advanced Coding with Flexible Effort**: Stronger coding capabilities with multiple thinking effort levels to balance performance and latency\n  - **Improved Architecture**: We propose IndexShare, which reuses the same indexer across every four sparse attention layers, reducing per-token FLOPs by 2.9× at a 1M context length. We also improve GLM-5.2’s MTP layer for speculative decoding, increasing the acceptance length by up to 20%\n  - **Pure Open**: An MIT open-source license — no regional limits, technical access without borders\n\n## Benchmark\n\n## Serve GLM-5.2 Locally\n\n...\n"
   license: "mit"
   tags:
     - llm
@@ -390,26 +343,7 @@
   url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
   urls:
     - https://huggingface.co/michaelw9999/Qwopus3.6-27B-v2-MTP-NVFP4-GGUF
-  description: |
-    🪐 Qwopus3.6-27B-v2-MTP
-    MTP Release
-
-    Multi-Token Prediction reasoning model fine-tuned from Qwen3.6-27B
-
-    🧬 Trace Inversion & Negentropy
-    🧠 27B Parameters
-    ⚡ Speculative Decoding
-    🛠️ Coding / DevOps / Math
-
-    💡 What is Qwopus3.6-27B-v2-MTP?
-    🪐 Qwopus3.6-27B-v2-MTP is a speed-oriented reasoning release built on top of Qwen3.6-27B. It keeps the Qwopus line's focus on reconstructed reasoning traces, coding discipline, DevOps procedures, and mathematical derivations, while adding Multi-Token Prediction for faster generation. The goal is simple: preserve the depth and structure of a 27B reasoning model while making real interactive use noticeably faster.
-
-    ⚡ MTP DecodingAuxiliary future-token prediction improves throughput on long reasoning, code, math, and strict-format prompts.
-    🧩 Structured ReasoningInherits the Qwopus training recipe built around reconstructed step-by-step reasoning trajectories.
-    🧪 GB10 TestedValidated on a 30-question local benchmark across Logic, Coding, DevOps, Math, and Edge tasks.
-    🚀 Practical SpeedDesigned for workflows where strong answers matter, but waiting several extra minutes per task does not.
-
-    ...
+  description: "\U0001FA90 Qwopus3.6-27B-v2-MTP\nMTP Release\n\nMulti-Token Prediction reasoning model fine-tuned from Qwen3.6-27B\n\n\U0001F9EC Trace Inversion & Negentropy\n\U0001F9E0 27B Parameters\n⚡ Speculative Decoding\n\U0001F6E0️ Coding / DevOps / Math\n\n\U0001F4A1 What is Qwopus3.6-27B-v2-MTP?\n\U0001FA90 Qwopus3.6-27B-v2-MTP is a speed-oriented reasoning release built on top of Qwen3.6-27B. It keeps the Qwopus line's focus on reconstructed reasoning traces, coding discipline, DevOps procedures, and mathematical derivations, while adding Multi-Token Prediction for faster generation. The goal is simple: preserve the depth and structure of a 27B reasoning model while making real interactive use noticeably faster.\n\n⚡ MTP DecodingAuxiliary future-token prediction improves throughput on long reasoning, code, math, and strict-format prompts.\n\U0001F9E9 Structured ReasoningInherits the Qwopus training recipe built around reconstructed step-by-step reasoning trajectories.\n\U0001F9EA GB10 TestedValidated on a 30-question local benchmark across Logic, Coding, DevOps, Math, and Edge tasks.\n\U0001F680 Practical SpeedDesigned for workflows where strong answers matter, but waiting several extra minutes per task does not.\n\n...\n"
   tags:
     - llm
     - gguf
@@ -435,28 +369,7 @@
   url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
   urls:
     - https://huggingface.co/michaelw9999/Qwopus3.6-27B-Coder-MTP-NVFP4-GGUF
-  description: |
-    🪐 Qwopus-3.6-27B-Coder
-    Coder SFT Release
-
-    Agentic Coding &amp; Tool-Use Reasoning Model Fine-Tuned on Qwopus3.6-27B-v2
-
-    🧬 Trace Inversion & Negentropy
-    🧠 27B Dense Model
-    ⚡ Agentic Coding
-    🛠️ Tool Calling & Agent
-    🏆 SWE-bench Verified: 67.0% (off-thinking)
-
-    💡 What is Qwopus-3.6-27B-Coder?
-    🪐 Qwopus-3.6-27B-Coder is a reasoning-enhanced agentic coding model built on top of Qwopus3.6-27B-v2. It inherits the powerful reasoning foundation of the v2 base — which achieved 87.43% MMLU-Pro (300ex) and 75.25% SWE-bench Verified — and further specializes it for agentic code generation, structured tool calling, debugging, and instruction-following in developer workflows. The model is designed to excel at repository-level coding tasks, multi-turn tool orchestration, and complex logical reasoning under realistic agent environments.
-
-    🧩 Agentic Coding
-    Optimized for repository-level coding, debugging, patch generation, and structured multi-step development workflows.
-
-    🛠️ Tool Calling
-    Learns from real agent trajectories with tool definitions, tool calls, and environment feedback for robust multi-turn execution.
-
-    ...
+  description: "\U0001FA90 Qwopus-3.6-27B-Coder\nCoder SFT Release\n\nAgentic Coding &amp; Tool-Use Reasoning Model Fine-Tuned on Qwopus3.6-27B-v2\n\n\U0001F9EC Trace Inversion & Negentropy\n\U0001F9E0 27B Dense Model\n⚡ Agentic Coding\n\U0001F6E0️ Tool Calling & Agent\n\U0001F3C6 SWE-bench Verified: 67.0% (off-thinking)\n\n\U0001F4A1 What is Qwopus-3.6-27B-Coder?\n\U0001FA90 Qwopus-3.6-27B-Coder is a reasoning-enhanced agentic coding model built on top of Qwopus3.6-27B-v2. It inherits the powerful reasoning foundation of the v2 base — which achieved 87.43% MMLU-Pro (300ex) and 75.25% SWE-bench Verified — and further specializes it for agentic code generation, structured tool calling, debugging, and instruction-following in developer workflows. The model is designed to excel at repository-level coding tasks, multi-turn tool orchestration, and complex logical reasoning under realistic agent environments.\n\n\U0001F9E9 Agentic Coding\nOptimized for repository-level coding, debugging, patch generation, and structured multi-step development workflows.\n\n\U0001F6E0️ Tool Calling\nLearns from real agent trajectories with tool definitions, tool calls, and environment feedback for robust multi-turn execution.\n\n...\n"
   tags:
     - llm
     - gguf
@@ -1676,8 +1589,8 @@
       use_tokenizer_template: true
   files:
     - filename: llama-cpp/models/Qwopus3.6-27B-v2-MTP-GGUF/Qwopus3.6-27B-v2-MTP-Q4_K_M.gguf
-      sha256: 818d68223be4d8518dac0b3b5604dde633cbbcbae1f491d842a3e26711c6606d
       uri: https://huggingface.co/Jackrong/Qwopus3.6-27B-v2-MTP-GGUF/resolve/main/Qwopus3.6-27B-v2-MTP-Q4_K_M.gguf
+      sha256: 31cf5fc2406a0c7aaebcc26d440bf0df94e215d0589d5205bf319649c052b50a
 - name: "qwen3.6-40b-claude-4.6-opus-deckard-heretic-uncensored-thinking-neo-code-di-imatrix-max"
   url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
   urls:

From 4dbf69f889591a9da9c064924406985a8d75317c Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Wed, 24 Jun 2026 00:00:26 +0200
Subject: [PATCH 64/99] chore(model gallery): :robot: add 1 new models via
 gallery agent (#10472)

chore(model gallery): :robot: add new models via gallery agent

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
---
 gallery/index.yaml | 50 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/gallery/index.yaml b/gallery/index.yaml
index 255e828f5..e26f2a1f5 100644
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -1,4 +1,54 @@
 ---
+- name: "lfm2.5-1.2b-instruct"
+  url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
+  urls:
+    - https://huggingface.co/LiquidAI/LFM2.5-1.2B-Instruct-GGUF
+  description: |
+    Try LFM • Docs • LEAP • Discord
+
+    # LFM2.5-1.2B-Instruct
+
+    LFM2.5 is a new family of hybrid models designed for **on-device deployment**. It builds on the LFM2 architecture with extended pre-training and reinforcement learning.
+
+      - **Best-in-class performance**: A 1.2B model rivaling much larger models, bringing high-quality AI to your pocket.
+      - **Fast edge inference**: 239 tok/s decode on AMD CPU, 82 tok/s on mobile NPU. Runs under 1GB of memory with day-one support for llama.cpp, MLX, and vLLM.
+      - **Scaled training**: Extended pre-training from 10T to 28T tokens and large-scale multi-stage reinforcement learning.
+
+    Find more information about LFM2.5 in our blog post.
+
+    ## 🗒️ Model Details
+
+    LFM2.5-1.2B-Instruct is a general-purpose text-only model with the following features:
+
+    ...
+  license: "other"
+  tags:
+    - llm
+    - gguf
+  icon: https://cdn-uploads.huggingface.co/production/uploads/61b8e2ba285851687028d395/dxnYF2fuLpulismtFSGFi.png
+  overrides:
+    backend: llama-cpp
+    function:
+      automatic_tool_parsing_fallback: true
+      grammar:
+        disable: true
+    known_usecases:
+      - chat
+    options:
+      - use_jinja:true
+    parameters:
+      min_p: 0.15
+      model: llama-cpp/models/LFM2.5-1.2B-Instruct-GGUF/LFM2.5-1.2B-Instruct-Q4_K_M.gguf
+      repeat_penalty: 1.05
+      temperature: 0.1
+      top_k: 50
+      top_p: 0.1
+    template:
+      use_tokenizer_template: true
+  files:
+    - filename: llama-cpp/models/LFM2.5-1.2B-Instruct-GGUF/LFM2.5-1.2B-Instruct-Q4_K_M.gguf
+      sha256: b1b3de114215d9507409a662a501a631095a479a419584e8a2ded6304b19b4f5
+      uri: https://huggingface.co/LiquidAI/LFM2.5-1.2B-Instruct-GGUF/resolve/main/LFM2.5-1.2B-Instruct-Q4_K_M.gguf
 - name: "qwopus3.6-27b-coder-compat-mtp"
   url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
   urls:

From c93190de5017aa7aa0d8b6f0fe9df0547334c7a7 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Wed, 24 Jun 2026 09:40:13 +0200
Subject: [PATCH 65/99] chore: :arrow_up: Update ikawrakow/ik_llama.cpp to
 `7ccf1d209588962b96eacca325b37e9b3e8faf5e` (#10456)

:arrow_up: Update ikawrakow/ik_llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
---
 backend/cpp/ik-llama-cpp/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/cpp/ik-llama-cpp/Makefile b/backend/cpp/ik-llama-cpp/Makefile
index f7875c54d..0fbcf0bdb 100644
--- a/backend/cpp/ik-llama-cpp/Makefile
+++ b/backend/cpp/ik-llama-cpp/Makefile
@@ -1,5 +1,5 @@
 
-IK_LLAMA_VERSION?=6c00e87ac84404af588ad2e65935bd6f079c696f
+IK_LLAMA_VERSION?=7ccf1d209588962b96eacca325b37e9b3e8faf5e
 LLAMA_REPO?=https://github.com/ikawrakow/ik_llama.cpp
 
 CMAKE_ARGS?=

From dba9cd7ca4eab42d8dbb35bd770fda1b57b3532e Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Wed, 24 Jun 2026 09:40:34 +0200
Subject: [PATCH 66/99] chore: :arrow_up: Update CrispStrobe/CrispASR to
 `96b2a6ee31d30389fed8a7ef1a54239b75231ddc` (#10465)

:arrow_up: Update CrispStrobe/CrispASR

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
---
 backend/go/crispasr/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/go/crispasr/Makefile b/backend/go/crispasr/Makefile
index 27e3b30d2..ba55b485e 100644
--- a/backend/go/crispasr/Makefile
+++ b/backend/go/crispasr/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
 
 # CrispASR version (release tag)
 CRISPASR_REPO?=https://github.com/CrispStrobe/CrispASR
-CRISPASR_VERSION?=63b57289255267edf66e43e33bc3911e04a2e92d
+CRISPASR_VERSION?=96b2a6ee31d30389fed8a7ef1a54239b75231ddc
 SO_TARGET?=libgocrispasr.so
 
 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF

From a5e28942a68f45941c88a7702f8ce2c7a3d46b87 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Wed, 24 Jun 2026 09:40:54 +0200
Subject: [PATCH 67/99] chore: :arrow_up: Update ggml-org/llama.cpp to
 `be4a6a63eb2b848e19c277bdcf2bd399e8af76d9` (#10467)

:arrow_up: Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
---
 backend/cpp/llama-cpp/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/cpp/llama-cpp/Makefile b/backend/cpp/llama-cpp/Makefile
index 122a30c71..24f1f215d 100644
--- a/backend/cpp/llama-cpp/Makefile
+++ b/backend/cpp/llama-cpp/Makefile
@@ -1,5 +1,5 @@
 
-LLAMA_VERSION?=73618f27a801c0b8614ceaf3547d3c2a99baae14
+LLAMA_VERSION?=be4a6a63eb2b848e19c277bdcf2bd399e8af76d9
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
 
 CMAKE_ARGS?=

From 4b6f911835218710d348c25b70541bf6f5d331ca Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Wed, 24 Jun 2026 09:41:14 +0200
Subject: [PATCH 68/99] chore: :arrow_up: Update ggml-org/whisper.cpp to
 `43d78af5be58f41d6ffbc227d608f104577741ea` (#10466)

:arrow_up: Update ggml-org/whisper.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
---
 backend/go/whisper/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/go/whisper/Makefile b/backend/go/whisper/Makefile
index 65c3d97d1..e8ad8545f 100644
--- a/backend/go/whisper/Makefile
+++ b/backend/go/whisper/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
 
 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp
-WHISPER_CPP_VERSION?=bae6bc02b1940bbfb87b6a0299c565e563b916d1
+WHISPER_CPP_VERSION?=43d78af5be58f41d6ffbc227d608f104577741ea
 SO_TARGET?=libgowhisper.so
 
 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF

From 0f3b24436d92e36e1df193444567134fe43dac7a Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Wed, 24 Jun 2026 09:41:43 +0200
Subject: [PATCH 69/99] chore: :arrow_up: Update mudler/parakeet.cpp to
 `89f5e2977b4d8bccd45e7bcc6f2ef7c4ed49e89a` (#10468)

:arrow_up: Update mudler/parakeet.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
---
 backend/go/parakeet-cpp/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/backend/go/parakeet-cpp/Makefile b/backend/go/parakeet-cpp/Makefile
index 9a781d634..f9848dc34 100644
--- a/backend/go/parakeet-cpp/Makefile
+++ b/backend/go/parakeet-cpp/Makefile
@@ -1,6 +1,6 @@
 # parakeet-cpp backend Makefile.
 #
-# Upstream pin lives below as PARAKEET_VERSION?=db755a78d39f789bb7d4e3935158a9e8105dbe36
+# Upstream pin lives below as PARAKEET_VERSION?=89f5e2977b4d8bccd45e7bcc6f2ef7c4ed49e89a
 # (.github/bump_deps.sh) can find and update it - matches the
 # whisper.cpp / ds4 / vibevoice-cpp convention.
 #
@@ -15,7 +15,7 @@
 # That's what the L0 smoke test uses. The default target below does the
 # proper clone-at-pin + cmake build so CI doesn't need a side-checkout.
 
-PARAKEET_VERSION?=db755a78d39f789bb7d4e3935158a9e8105dbe36
+PARAKEET_VERSION?=89f5e2977b4d8bccd45e7bcc6f2ef7c4ed49e89a
 PARAKEET_REPO?=https://github.com/mudler/parakeet.cpp
 
 GOCMD?=go

From e6042080c0da69f39baa6139314ef2f7776a4bf6 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Wed, 24 Jun 2026 09:42:09 +0200
Subject: [PATCH 70/99] fix(agents): URL-decode collection/agent name path
 params (#10443) (#10471)

fix(agents): URL-decode collection/agent name path params

Collection and agent names carry a "legacy-api-key:" prefix, so the ':'
arrives percent-encoded as %3A in the request path. Echo routes such
paths via URL.RawPath and stores the matched path-param value still
escaped, so c.Param("name") returned "legacy-api-key%3ALiteraryResearch"
and the store lookup 404'd ("collection not found").

This was second-order fallout of #10375/#10387: once colons became valid
in names, the URL-decode gap surfaced on every name-bearing endpoint.

Add a decodedParam helper that url.PathUnescape's the param (falling back
to the raw value on invalid encoding) and wire it into all collection
endpoints and the agent :name endpoints, which share the identical
prefix. The entry endpoints already unescaped c.Param("*"); this closes
the same gap for :name.

Fixes #10443


Assisted-by: Claude:claude-opus-4-8 [Claude Code]

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
---
 .../endpoints/localai/agent_collections.go    | 20 ++++----
 .../localai/agent_collections_param_test.go   | 49 +++++++++++++++++++
 core/http/endpoints/localai/agents.go         | 41 +++++++++++-----
 3 files changed, 88 insertions(+), 22 deletions(-)
 create mode 100644 core/http/endpoints/localai/agent_collections_param_test.go

diff --git a/core/http/endpoints/localai/agent_collections.go b/core/http/endpoints/localai/agent_collections.go
index 17997bcfe..98850d6d1 100644
--- a/core/http/endpoints/localai/agent_collections.go
+++ b/core/http/endpoints/localai/agent_collections.go
@@ -70,7 +70,7 @@ func UploadToCollectionEndpoint(app *application.Application) echo.HandlerFunc {
 	return func(c echo.Context) error {
 		svc := app.AgentPoolService()
 		userID := effectiveUserID(c)
-		name := c.Param("name")
+		name := decodedParam(c, "name")
 		file, err := c.FormFile("file")
 		if err != nil {
 			return c.JSON(http.StatusBadRequest, map[string]string{"error": "file required"})
@@ -116,7 +116,7 @@ func ListCollectionEntriesEndpoint(app *application.Application) echo.HandlerFun
 	return func(c echo.Context) error {
 		svc := app.AgentPoolService()
 		userID := effectiveUserID(c)
-		entries, err := svc.ListCollectionEntriesForUser(userID, c.Param("name"))
+		entries, err := svc.ListCollectionEntriesForUser(userID, decodedParam(c, "name"))
 		if err != nil {
 			if strings.Contains(err.Error(), "not found") {
 				return c.JSON(http.StatusNotFound, map[string]string{"error": err.Error()})
@@ -139,7 +139,7 @@ func GetCollectionEntryContentEndpoint(app *application.Application) echo.Handle
 		if err != nil {
 			entry = entryParam
 		}
-		content, chunkCount, err := svc.GetCollectionEntryContentForUser(userID, c.Param("name"), entry)
+		content, chunkCount, err := svc.GetCollectionEntryContentForUser(userID, decodedParam(c, "name"), entry)
 		if err != nil {
 			if strings.Contains(err.Error(), "not found") {
 				return c.JSON(http.StatusNotFound, map[string]string{"error": err.Error()})
@@ -164,7 +164,7 @@ func SearchCollectionEndpoint(app *application.Application) echo.HandlerFunc {
 		if err := c.Bind(&payload); err != nil {
 			return c.JSON(http.StatusBadRequest, map[string]string{"error": err.Error()})
 		}
-		results, err := svc.SearchCollectionForUser(userID, c.Param("name"), payload.Query, payload.MaxResults)
+		results, err := svc.SearchCollectionForUser(userID, decodedParam(c, "name"), payload.Query, payload.MaxResults)
 		if err != nil {
 			if strings.Contains(err.Error(), "not found") {
 				return c.JSON(http.StatusNotFound, map[string]string{"error": err.Error()})
@@ -182,7 +182,7 @@ func ResetCollectionEndpoint(app *application.Application) echo.HandlerFunc {
 	return func(c echo.Context) error {
 		svc := app.AgentPoolService()
 		userID := effectiveUserID(c)
-		if err := svc.ResetCollectionForUser(userID, c.Param("name")); err != nil {
+		if err := svc.ResetCollectionForUser(userID, decodedParam(c, "name")); err != nil {
 			if strings.Contains(err.Error(), "not found") {
 				return c.JSON(http.StatusNotFound, map[string]string{"error": err.Error()})
 			}
@@ -202,7 +202,7 @@ func DeleteCollectionEntryEndpoint(app *application.Application) echo.HandlerFun
 		if err := c.Bind(&payload); err != nil {
 			return c.JSON(http.StatusBadRequest, map[string]string{"error": err.Error()})
 		}
-		remaining, err := svc.DeleteCollectionEntryForUser(userID, c.Param("name"), payload.Entry)
+		remaining, err := svc.DeleteCollectionEntryForUser(userID, decodedParam(c, "name"), payload.Entry)
 		if err != nil {
 			if strings.Contains(err.Error(), "not found") {
 				return c.JSON(http.StatusNotFound, map[string]string{"error": err.Error()})
@@ -230,7 +230,7 @@ func AddCollectionSourceEndpoint(app *application.Application) echo.HandlerFunc
 		if payload.UpdateInterval < 1 {
 			payload.UpdateInterval = 60
 		}
-		if err := svc.AddCollectionSourceForUser(userID, c.Param("name"), payload.URL, payload.UpdateInterval); err != nil {
+		if err := svc.AddCollectionSourceForUser(userID, decodedParam(c, "name"), payload.URL, payload.UpdateInterval); err != nil {
 			if strings.Contains(err.Error(), "not found") {
 				return c.JSON(http.StatusNotFound, map[string]string{"error": err.Error()})
 			}
@@ -250,7 +250,7 @@ func RemoveCollectionSourceEndpoint(app *application.Application) echo.HandlerFu
 		if err := c.Bind(&payload); err != nil {
 			return c.JSON(http.StatusBadRequest, map[string]string{"error": err.Error()})
 		}
-		if err := svc.RemoveCollectionSourceForUser(userID, c.Param("name"), payload.URL); err != nil {
+		if err := svc.RemoveCollectionSourceForUser(userID, decodedParam(c, "name"), payload.URL); err != nil {
 			return c.JSON(http.StatusInternalServerError, map[string]string{"error": err.Error()})
 		}
 		return c.JSON(http.StatusOK, map[string]string{"status": "ok"})
@@ -267,7 +267,7 @@ func GetCollectionEntryRawFileEndpoint(app *application.Application) echo.Handle
 		if err != nil {
 			entry = entryParam
 		}
-		fpath, err := svc.GetCollectionEntryFilePathForUser(userID, c.Param("name"), entry)
+		fpath, err := svc.GetCollectionEntryFilePathForUser(userID, decodedParam(c, "name"), entry)
 		if err != nil {
 			if strings.Contains(err.Error(), "not found") {
 				return c.JSON(http.StatusNotFound, map[string]string{"error": err.Error()})
@@ -282,7 +282,7 @@ func ListCollectionSourcesEndpoint(app *application.Application) echo.HandlerFun
 	return func(c echo.Context) error {
 		svc := app.AgentPoolService()
 		userID := effectiveUserID(c)
-		sources, err := svc.ListCollectionSourcesForUser(userID, c.Param("name"))
+		sources, err := svc.ListCollectionSourcesForUser(userID, decodedParam(c, "name"))
 		if err != nil {
 			if strings.Contains(err.Error(), "not found") {
 				return c.JSON(http.StatusNotFound, map[string]string{"error": err.Error()})
diff --git a/core/http/endpoints/localai/agent_collections_param_test.go b/core/http/endpoints/localai/agent_collections_param_test.go
new file mode 100644
index 000000000..e77ab9561
--- /dev/null
+++ b/core/http/endpoints/localai/agent_collections_param_test.go
@@ -0,0 +1,49 @@
+package localai
+
+import (
+	"net/http"
+	"net/http/httptest"
+
+	"github.com/labstack/echo/v4"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// Regression for #10443: agent/collection names carry a "legacy-api-key:"
+// prefix, so the ':' is percent-encoded as %3A in the request path. Echo routes
+// such paths via URL.RawPath and stores the path-param value still escaped, so
+// handlers must URL-decode it before looking the collection up in the store -
+// otherwise the lookup sees "legacy-api-key%3ALiteraryResearch" and 404s.
+var _ = Describe("decodedParam", func() {
+	var e *echo.Echo
+
+	BeforeEach(func() {
+		e = echo.New()
+	})
+
+	// route runs a request through Echo's real router so the path param is
+	// populated exactly as it would be in production, then returns the decoded
+	// value the handler would observe.
+	route := func(rawPath string) string {
+		var got string
+		e.GET("/api/agents/collections/:name/upload", func(c echo.Context) error {
+			got = decodedParam(c, "name")
+			return c.NoContent(http.StatusOK)
+		})
+		req := httptest.NewRequest(http.MethodGet, rawPath, nil)
+		rec := httptest.NewRecorder()
+		e.ServeHTTP(rec, req)
+		Expect(rec.Code).To(Equal(http.StatusOK))
+		return got
+	}
+
+	It("decodes a percent-encoded colon in the collection name", func() {
+		got := route("/api/agents/collections/legacy-api-key%3ALiteraryResearch/upload")
+		Expect(got).To(Equal("legacy-api-key:LiteraryResearch"))
+	})
+
+	It("leaves an unencoded name untouched", func() {
+		got := route("/api/agents/collections/PlainCollection/upload")
+		Expect(got).To(Equal("PlainCollection"))
+	})
+})
diff --git a/core/http/endpoints/localai/agents.go b/core/http/endpoints/localai/agents.go
index 2bf2b3263..fa09b557e 100644
--- a/core/http/endpoints/localai/agents.go
+++ b/core/http/endpoints/localai/agents.go
@@ -6,6 +6,7 @@ import (
 	"io"
 	"maps"
 	"net/http"
+	"net/url"
 	"os"
 	"path/filepath"
 	"slices"
@@ -33,6 +34,22 @@ func getUserID(c echo.Context) string {
 	return user.ID
 }
 
+// decodedParam returns the named path parameter, URL-decoding it.
+//
+// Echo routes a request via URL.RawPath whenever the path contains
+// percent-encoded characters (e.g. %3A for ':'), and in that case stores the
+// matched path-param value raw/escaped. Agent and collection names carry a
+// "legacy-api-key:" prefix, so the ':' arrives as %3A and the raw param no
+// longer matches the stored name. Callers must unescape before lookups.
+// Falls back to the raw value if it isn't valid percent-encoding.
+func decodedParam(c echo.Context, name string) string {
+	raw := c.Param(name)
+	if decoded, err := url.PathUnescape(raw); err == nil {
+		return decoded
+	}
+	return raw
+}
+
 // isAdminUser returns true if the authenticated user has admin role.
 func isAdminUser(c echo.Context) bool {
 	user := auth.GetUser(c)
@@ -127,7 +144,7 @@ func GetAgentEndpoint(app *application.Application) echo.HandlerFunc {
 	return func(c echo.Context) error {
 		svc := app.AgentPoolService()
 		userID := effectiveUserID(c)
-		name := c.Param("name")
+		name := decodedParam(c, "name")
 
 		statuses := svc.ListAgentsForUser(userID)
 		active, exists := statuses[name]
@@ -142,7 +159,7 @@ func UpdateAgentEndpoint(app *application.Application) echo.HandlerFunc {
 	return func(c echo.Context) error {
 		svc := app.AgentPoolService()
 		userID := effectiveUserID(c)
-		name := c.Param("name")
+		name := decodedParam(c, "name")
 		var cfg state.AgentConfig
 		if err := c.Bind(&cfg); err != nil {
 			return c.JSON(http.StatusBadRequest, map[string]string{"error": err.Error()})
@@ -161,7 +178,7 @@ func DeleteAgentEndpoint(app *application.Application) echo.HandlerFunc {
 	return func(c echo.Context) error {
 		svc := app.AgentPoolService()
 		userID := effectiveUserID(c)
-		name := c.Param("name")
+		name := decodedParam(c, "name")
 		if err := svc.DeleteAgentForUser(userID, name); err != nil {
 			return c.JSON(http.StatusInternalServerError, map[string]string{"error": err.Error()})
 		}
@@ -173,7 +190,7 @@ func GetAgentConfigEndpoint(app *application.Application) echo.HandlerFunc {
 	return func(c echo.Context) error {
 		svc := app.AgentPoolService()
 		userID := effectiveUserID(c)
-		name := c.Param("name")
+		name := decodedParam(c, "name")
 		cfg := svc.GetAgentConfigForUser(userID, name)
 		if cfg == nil {
 			return c.JSON(http.StatusNotFound, map[string]string{"error": "Agent not found"})
@@ -186,7 +203,7 @@ func PauseAgentEndpoint(app *application.Application) echo.HandlerFunc {
 	return func(c echo.Context) error {
 		svc := app.AgentPoolService()
 		userID := effectiveUserID(c)
-		if err := svc.PauseAgentForUser(userID, c.Param("name")); err != nil {
+		if err := svc.PauseAgentForUser(userID, decodedParam(c, "name")); err != nil {
 			return c.JSON(http.StatusNotFound, map[string]string{"error": err.Error()})
 		}
 		return c.JSON(http.StatusOK, map[string]string{"status": "ok"})
@@ -197,7 +214,7 @@ func ResumeAgentEndpoint(app *application.Application) echo.HandlerFunc {
 	return func(c echo.Context) error {
 		svc := app.AgentPoolService()
 		userID := effectiveUserID(c)
-		if err := svc.ResumeAgentForUser(userID, c.Param("name")); err != nil {
+		if err := svc.ResumeAgentForUser(userID, decodedParam(c, "name")); err != nil {
 			return c.JSON(http.StatusNotFound, map[string]string{"error": err.Error()})
 		}
 		return c.JSON(http.StatusOK, map[string]string{"status": "ok"})
@@ -208,7 +225,7 @@ func GetAgentStatusEndpoint(app *application.Application) echo.HandlerFunc {
 	return func(c echo.Context) error {
 		svc := app.AgentPoolService()
 		userID := effectiveUserID(c)
-		name := c.Param("name")
+		name := decodedParam(c, "name")
 
 		history := svc.GetAgentStatusForUser(userID, name)
 		if history == nil {
@@ -241,7 +258,7 @@ func GetAgentObservablesEndpoint(app *application.Application) echo.HandlerFunc
 	return func(c echo.Context) error {
 		svc := app.AgentPoolService()
 		userID := effectiveUserID(c)
-		name := c.Param("name")
+		name := decodedParam(c, "name")
 
 		history, err := svc.GetAgentObservablesForUser(userID, name)
 		if err != nil {
@@ -261,7 +278,7 @@ func ClearAgentObservablesEndpoint(app *application.Application) echo.HandlerFun
 	return func(c echo.Context) error {
 		svc := app.AgentPoolService()
 		userID := effectiveUserID(c)
-		name := c.Param("name")
+		name := decodedParam(c, "name")
 		if err := svc.ClearAgentObservablesForUser(userID, name); err != nil {
 			return c.JSON(http.StatusNotFound, map[string]string{"error": err.Error()})
 		}
@@ -273,7 +290,7 @@ func ChatWithAgentEndpoint(app *application.Application) echo.HandlerFunc {
 	return func(c echo.Context) error {
 		svc := app.AgentPoolService()
 		userID := effectiveUserID(c)
-		name := c.Param("name")
+		name := decodedParam(c, "name")
 		var payload struct {
 			Message string `json:"message"`
 		}
@@ -302,7 +319,7 @@ func AgentSSEEndpoint(app *application.Application) echo.HandlerFunc {
 	return func(c echo.Context) error {
 		svc := app.AgentPoolService()
 		userID := effectiveUserID(c)
-		name := c.Param("name")
+		name := decodedParam(c, "name")
 
 		// Try local SSE manager first
 		manager := svc.GetSSEManagerForUser(userID, name)
@@ -334,7 +351,7 @@ func ExportAgentEndpoint(app *application.Application) echo.HandlerFunc {
 	return func(c echo.Context) error {
 		svc := app.AgentPoolService()
 		userID := effectiveUserID(c)
-		name := c.Param("name")
+		name := decodedParam(c, "name")
 		data, err := svc.ExportAgentForUser(userID, name)
 		if err != nil {
 			return c.JSON(http.StatusNotFound, map[string]string{"error": err.Error()})

From fc618dcee6befb61dfb3867989bb4f358786aa8d Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Wed, 24 Jun 2026 10:13:37 +0200
Subject: [PATCH 71/99] fix(distributed): track in-flight for SoundDetection
 requests (#10475)

The distributed router wraps backend clients in InFlightTrackingClient so
the eviction logic knows which replicas are actively serving. Every
inference method must be wrapped: track() increments in-flight on entry
and decrements (plus fires onFirstComplete, which releases the load-time
reservation) on return.

SoundDetection was added after the tracking client and never got a
wrapper, so its calls fell through to the embedded passthrough Backend.
The increment/decrement never ran and, critically, onFirstComplete never
fired, so the reservation set at model load was never released - leaving
in-flight stuck at 1 and the replica permanently ineligible for eviction.

Wrap SoundDetection like the other non-LLM methods and cover it in the
"non-LLM inference methods track in-flight" table test.


Assisted-by: Claude:claude-opus-4-8 [Claude Code]

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
---
 core/services/nodes/inflight.go      | 6 ++++++
 core/services/nodes/inflight_test.go | 7 +++++++
 2 files changed, 13 insertions(+)

diff --git a/core/services/nodes/inflight.go b/core/services/nodes/inflight.go
index 85b10f71c..b51ef6001 100644
--- a/core/services/nodes/inflight.go
+++ b/core/services/nodes/inflight.go
@@ -218,6 +218,12 @@ func (c *InFlightTrackingClient) Score(ctx context.Context, in *pb.ScoreRequest,
 	return res, c.reconcile(err)
 }
 
+func (c *InFlightTrackingClient) SoundDetection(ctx context.Context, in *pb.SoundDetectionRequest, opts ...ggrpc.CallOption) (*pb.SoundDetectionResponse, error) {
+	defer c.track(ctx)()
+	res, err := c.Backend.SoundDetection(ctx, in, opts...)
+	return res, c.reconcile(err)
+}
+
 func (c *InFlightTrackingClient) AudioEncode(ctx context.Context, in *pb.AudioEncodeRequest, opts ...ggrpc.CallOption) (*pb.AudioEncodeResult, error) {
 	defer c.track(ctx)()
 	res, err := c.Backend.AudioEncode(ctx, in, opts...)
diff --git a/core/services/nodes/inflight_test.go b/core/services/nodes/inflight_test.go
index 2eb90f9c6..5fc9820e7 100644
--- a/core/services/nodes/inflight_test.go
+++ b/core/services/nodes/inflight_test.go
@@ -408,6 +408,13 @@ var _ = Describe("InFlightTrackingClient", func() {
 				return err
 			})
 		})
+
+		It("SoundDetection", func() {
+			assertTracked(func() error {
+				_, err := client.SoundDetection(context.Background(), &pb.SoundDetectionRequest{})
+				return err
+			})
+		})
 	})
 
 	Describe("stale model reload (self-heal)", func() {

From e5620989dd6dc839759364111f4c1c984179802e Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Wed, 24 Jun 2026 11:08:29 +0200
Subject: [PATCH 72/99] refactor(distributed): make in-flight tracking coverage
 a compile-time contract (#10476)

PR #10475 fixed SoundDetection in-flight tracking, but the underlying trap
remains: InFlightTrackingClient embedded the whole grpc.Backend interface
"for passthrough of untracked methods", so any newly added inference method
is silently satisfied by the embedded passthrough and never wrapped with
track(). That leaves onFirstComplete unfired and in-flight stuck at 1 - the
exact SoundDetection bug, waiting to recur for the next backend method.

Close the gap at the type level instead of relying on reviewers to remember:

- Split grpc.Backend into two composed sub-interfaces: InferenceBackend
  (methods that are one discrete inference call and must be tracked) and
  ControlBackend (control-plane calls plus the streaming constructors whose
  work spans the returned stream, safe to pass through). The classification
  now lives next to the interface it documents.
- InFlightTrackingClient embeds only grpc.ControlBackend and implements every
  InferenceBackend method explicitly, delegating to an inner InferenceBackend.
  A `var _ grpc.Backend = (*InFlightTrackingClient)(nil)` assertion makes the
  package fail to compile if any inference method is left unwrapped.

Now adding a method to InferenceBackend is a build error (at the assertion and
every call site: "does not implement grpc.Backend (missing method X)"), not a
silent runtime leak - and the obvious fix is to copy a neighbouring wrapper,
which calls track(). No runtime guard or reviewer vigilance required.

Pure refactor: the composed Backend interface is identical to the old flat
one, so all implementers and consumers are unaffected (verified with a full
`go build ./...`). Behaviour is unchanged; the existing nodes suite passes.


Assisted-by: Claude:claude-opus-4-8 [Claude Code]

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
---
 core/services/nodes/inflight.go | 101 +++++++++++++++++++-------------
 pkg/grpc/backend.go             |  71 +++++++++++++++-------
 2 files changed, 109 insertions(+), 63 deletions(-)

diff --git a/core/services/nodes/inflight.go b/core/services/nodes/inflight.go
index b51ef6001..bfc71b999 100644
--- a/core/services/nodes/inflight.go
+++ b/core/services/nodes/inflight.go
@@ -19,25 +19,40 @@ import (
 // Per-replica: a single tracker instance is bound to (nodeID, modelName, replicaIndex).
 // The router constructs one tracker per Route() result, so each in-flight tick lands
 // on the correct row even when multiple replicas of the same model live on the same node.
+//
+// Embedding only grpc.ControlBackend (not the whole grpc.Backend) is what makes
+// the in-flight accounting safe by construction: the control-plane methods pass
+// through untracked, while every grpc.InferenceBackend method must be declared
+// explicitly below to satisfy grpc.Backend. Adding an inference method to the
+// interface therefore breaks this file's build (see the var assertion below)
+// until it is wrapped with track() - so a new inference path can't be added
+// without an in-flight accounting decision.
 type InFlightTrackingClient struct {
-	grpc.Backend // embed for passthrough of untracked methods
-	registry     InFlightTracker
-	nodeID       string
-	modelName    string
-	replicaIndex int
+	grpc.ControlBackend                       // passthrough for control-plane / streaming-constructor methods
+	inner               grpc.InferenceBackend // tracked inference methods delegate here
+	registry            InFlightTracker
+	nodeID              string
+	modelName           string
+	replicaIndex        int
 
 	firstOnce       sync.Once // guards onFirstComplete
 	onFirstComplete func()    // called once after the first tracked inference call completes
 }
 
+// Compile-time contract: *InFlightTrackingClient must implement the FULL backend
+// surface. Because it embeds only ControlBackend, this fails to compile if any
+// InferenceBackend method is left unwrapped.
+var _ grpc.Backend = (*InFlightTrackingClient)(nil)
+
 // NewInFlightTrackingClient wraps a gRPC backend client with in-flight tracking.
 func NewInFlightTrackingClient(inner grpc.Backend, registry InFlightTracker, nodeID, modelName string, replicaIndex int) *InFlightTrackingClient {
 	return &InFlightTrackingClient{
-		Backend:      inner,
-		registry:     registry,
-		nodeID:       nodeID,
-		modelName:    modelName,
-		replicaIndex: replicaIndex,
+		ControlBackend: inner,
+		inner:          inner,
+		registry:       registry,
+		nodeID:         nodeID,
+		modelName:      modelName,
+		replicaIndex:   replicaIndex,
 	}
 }
 
@@ -91,160 +106,162 @@ func (c *InFlightTrackingClient) reconcile(err error) error {
 
 func (c *InFlightTrackingClient) Predict(ctx context.Context, in *pb.PredictOptions, opts ...ggrpc.CallOption) (*pb.Reply, error) {
 	defer c.track(ctx)()
-	reply, err := c.Backend.Predict(ctx, in, opts...)
+	reply, err := c.inner.Predict(ctx, in, opts...)
 	return reply, c.reconcile(err)
 }
 
 func (c *InFlightTrackingClient) PredictStream(ctx context.Context, in *pb.PredictOptions, f func(reply *pb.Reply), opts ...ggrpc.CallOption) error {
 	defer c.track(ctx)()
-	return c.reconcile(c.Backend.PredictStream(ctx, in, f, opts...))
+	return c.reconcile(c.inner.PredictStream(ctx, in, f, opts...))
 }
 
 func (c *InFlightTrackingClient) Embeddings(ctx context.Context, in *pb.PredictOptions, opts ...ggrpc.CallOption) (*pb.EmbeddingResult, error) {
 	defer c.track(ctx)()
-	res, err := c.Backend.Embeddings(ctx, in, opts...)
+	res, err := c.inner.Embeddings(ctx, in, opts...)
 	return res, c.reconcile(err)
 }
 
 func (c *InFlightTrackingClient) GenerateImage(ctx context.Context, in *pb.GenerateImageRequest, opts ...ggrpc.CallOption) (*pb.Result, error) {
 	defer c.track(ctx)()
-	res, err := c.Backend.GenerateImage(ctx, in, opts...)
+	res, err := c.inner.GenerateImage(ctx, in, opts...)
 	return res, c.reconcile(err)
 }
 
 func (c *InFlightTrackingClient) GenerateVideo(ctx context.Context, in *pb.GenerateVideoRequest, opts ...ggrpc.CallOption) (*pb.Result, error) {
 	defer c.track(ctx)()
-	res, err := c.Backend.GenerateVideo(ctx, in, opts...)
+	res, err := c.inner.GenerateVideo(ctx, in, opts...)
 	return res, c.reconcile(err)
 }
 
 func (c *InFlightTrackingClient) TTS(ctx context.Context, in *pb.TTSRequest, opts ...ggrpc.CallOption) (*pb.Result, error) {
 	defer c.track(ctx)()
-	res, err := c.Backend.TTS(ctx, in, opts...)
+	res, err := c.inner.TTS(ctx, in, opts...)
 	return res, c.reconcile(err)
 }
 
 func (c *InFlightTrackingClient) TTSStream(ctx context.Context, in *pb.TTSRequest, f func(reply *pb.Reply), opts ...ggrpc.CallOption) error {
 	defer c.track(ctx)()
-	return c.reconcile(c.Backend.TTSStream(ctx, in, f, opts...))
+	return c.reconcile(c.inner.TTSStream(ctx, in, f, opts...))
 }
 
 func (c *InFlightTrackingClient) SoundGeneration(ctx context.Context, in *pb.SoundGenerationRequest, opts ...ggrpc.CallOption) (*pb.Result, error) {
 	defer c.track(ctx)()
-	res, err := c.Backend.SoundGeneration(ctx, in, opts...)
+	res, err := c.inner.SoundGeneration(ctx, in, opts...)
 	return res, c.reconcile(err)
 }
 
 func (c *InFlightTrackingClient) AudioTranscription(ctx context.Context, in *pb.TranscriptRequest, opts ...ggrpc.CallOption) (*pb.TranscriptResult, error) {
 	defer c.track(ctx)()
-	res, err := c.Backend.AudioTranscription(ctx, in, opts...)
+	res, err := c.inner.AudioTranscription(ctx, in, opts...)
 	return res, c.reconcile(err)
 }
 
 func (c *InFlightTrackingClient) AudioTranscriptionStream(ctx context.Context, in *pb.TranscriptRequest, f func(chunk *pb.TranscriptStreamResponse), opts ...ggrpc.CallOption) error {
 	defer c.track(ctx)()
-	return c.reconcile(c.Backend.AudioTranscriptionStream(ctx, in, f, opts...))
+	return c.reconcile(c.inner.AudioTranscriptionStream(ctx, in, f, opts...))
 }
 
 func (c *InFlightTrackingClient) Detect(ctx context.Context, in *pb.DetectOptions, opts ...ggrpc.CallOption) (*pb.DetectResponse, error) {
 	defer c.track(ctx)()
-	res, err := c.Backend.Detect(ctx, in, opts...)
+	res, err := c.inner.Detect(ctx, in, opts...)
 	return res, c.reconcile(err)
 }
 
 func (c *InFlightTrackingClient) Depth(ctx context.Context, in *pb.DepthRequest, opts ...ggrpc.CallOption) (*pb.DepthResponse, error) {
 	defer c.track(ctx)()
-	res, err := c.Backend.Depth(ctx, in, opts...)
+	res, err := c.inner.Depth(ctx, in, opts...)
 	return res, c.reconcile(err)
 }
 
 func (c *InFlightTrackingClient) Rerank(ctx context.Context, in *pb.RerankRequest, opts ...ggrpc.CallOption) (*pb.RerankResult, error) {
 	defer c.track(ctx)()
-	res, err := c.Backend.Rerank(ctx, in, opts...)
+	res, err := c.inner.Rerank(ctx, in, opts...)
 	return res, c.reconcile(err)
 }
 
 func (c *InFlightTrackingClient) VAD(ctx context.Context, in *pb.VADRequest, opts ...ggrpc.CallOption) (*pb.VADResponse, error) {
 	defer c.track(ctx)()
-	res, err := c.Backend.VAD(ctx, in, opts...)
+	res, err := c.inner.VAD(ctx, in, opts...)
 	return res, c.reconcile(err)
 }
 
 func (c *InFlightTrackingClient) Diarize(ctx context.Context, in *pb.DiarizeRequest, opts ...ggrpc.CallOption) (*pb.DiarizeResponse, error) {
 	defer c.track(ctx)()
-	res, err := c.Backend.Diarize(ctx, in, opts...)
+	res, err := c.inner.Diarize(ctx, in, opts...)
 	return res, c.reconcile(err)
 }
 
 func (c *InFlightTrackingClient) FaceVerify(ctx context.Context, in *pb.FaceVerifyRequest, opts ...ggrpc.CallOption) (*pb.FaceVerifyResponse, error) {
 	defer c.track(ctx)()
-	res, err := c.Backend.FaceVerify(ctx, in, opts...)
+	res, err := c.inner.FaceVerify(ctx, in, opts...)
 	return res, c.reconcile(err)
 }
 
 func (c *InFlightTrackingClient) FaceAnalyze(ctx context.Context, in *pb.FaceAnalyzeRequest, opts ...ggrpc.CallOption) (*pb.FaceAnalyzeResponse, error) {
 	defer c.track(ctx)()
-	res, err := c.Backend.FaceAnalyze(ctx, in, opts...)
+	res, err := c.inner.FaceAnalyze(ctx, in, opts...)
 	return res, c.reconcile(err)
 }
 
 func (c *InFlightTrackingClient) VoiceVerify(ctx context.Context, in *pb.VoiceVerifyRequest, opts ...ggrpc.CallOption) (*pb.VoiceVerifyResponse, error) {
 	defer c.track(ctx)()
-	res, err := c.Backend.VoiceVerify(ctx, in, opts...)
+	res, err := c.inner.VoiceVerify(ctx, in, opts...)
 	return res, c.reconcile(err)
 }
 
 func (c *InFlightTrackingClient) VoiceAnalyze(ctx context.Context, in *pb.VoiceAnalyzeRequest, opts ...ggrpc.CallOption) (*pb.VoiceAnalyzeResponse, error) {
 	defer c.track(ctx)()
-	res, err := c.Backend.VoiceAnalyze(ctx, in, opts...)
+	res, err := c.inner.VoiceAnalyze(ctx, in, opts...)
 	return res, c.reconcile(err)
 }
 
 func (c *InFlightTrackingClient) VoiceEmbed(ctx context.Context, in *pb.VoiceEmbedRequest, opts ...ggrpc.CallOption) (*pb.VoiceEmbedResponse, error) {
 	defer c.track(ctx)()
-	res, err := c.Backend.VoiceEmbed(ctx, in, opts...)
+	res, err := c.inner.VoiceEmbed(ctx, in, opts...)
 	return res, c.reconcile(err)
 }
 
 func (c *InFlightTrackingClient) TokenClassify(ctx context.Context, in *pb.TokenClassifyRequest, opts ...ggrpc.CallOption) (*pb.TokenClassifyResponse, error) {
 	defer c.track(ctx)()
-	res, err := c.Backend.TokenClassify(ctx, in, opts...)
+	res, err := c.inner.TokenClassify(ctx, in, opts...)
 	return res, c.reconcile(err)
 }
 
 func (c *InFlightTrackingClient) Score(ctx context.Context, in *pb.ScoreRequest, opts ...ggrpc.CallOption) (*pb.ScoreResponse, error) {
 	defer c.track(ctx)()
-	res, err := c.Backend.Score(ctx, in, opts...)
+	res, err := c.inner.Score(ctx, in, opts...)
 	return res, c.reconcile(err)
 }
 
 func (c *InFlightTrackingClient) SoundDetection(ctx context.Context, in *pb.SoundDetectionRequest, opts ...ggrpc.CallOption) (*pb.SoundDetectionResponse, error) {
 	defer c.track(ctx)()
-	res, err := c.Backend.SoundDetection(ctx, in, opts...)
+	res, err := c.inner.SoundDetection(ctx, in, opts...)
 	return res, c.reconcile(err)
 }
 
 func (c *InFlightTrackingClient) AudioEncode(ctx context.Context, in *pb.AudioEncodeRequest, opts ...ggrpc.CallOption) (*pb.AudioEncodeResult, error) {
 	defer c.track(ctx)()
-	res, err := c.Backend.AudioEncode(ctx, in, opts...)
+	res, err := c.inner.AudioEncode(ctx, in, opts...)
 	return res, c.reconcile(err)
 }
 
 func (c *InFlightTrackingClient) AudioDecode(ctx context.Context, in *pb.AudioDecodeRequest, opts ...ggrpc.CallOption) (*pb.AudioDecodeResult, error) {
 	defer c.track(ctx)()
-	res, err := c.Backend.AudioDecode(ctx, in, opts...)
+	res, err := c.inner.AudioDecode(ctx, in, opts...)
 	return res, c.reconcile(err)
 }
 
 func (c *InFlightTrackingClient) AudioTransform(ctx context.Context, in *pb.AudioTransformRequest, opts ...ggrpc.CallOption) (*pb.AudioTransformResult, error) {
 	defer c.track(ctx)()
-	res, err := c.Backend.AudioTransform(ctx, in, opts...)
+	res, err := c.inner.AudioTransform(ctx, in, opts...)
 	return res, c.reconcile(err)
 }
 
-// AudioTransformStream, AudioToAudioStream and Forward are deliberately left as
-// embedded passthrough: they return a stream client and the inference spans the
-// stream's lifetime, not the constructor call. Wrapping the constructor with
-// track() would increment and immediately decrement (and fire onFirstComplete)
-// before any audio flows. Tracking those correctly needs the done() func tied to
-// stream close, which the current Backend interface doesn't surface here.
+// AudioTransformStream, AudioToAudioStream and Forward live in grpc.ControlBackend
+// and are passed through via the embedded field, NOT tracked: they return a stream
+// client and the inference spans the stream's lifetime, not the constructor call.
+// Wrapping the constructor with track() would increment and immediately decrement
+// (and fire onFirstComplete) before any audio flows. Tracking those correctly needs
+// the done() func tied to stream close, which the Backend interface doesn't surface
+// here. If they ever need tracking, move them to grpc.InferenceBackend - the build
+// will then force an explicit wrapper here.
diff --git a/pkg/grpc/backend.go b/pkg/grpc/backend.go
index f4cd511ac..838ab9865 100644
--- a/pkg/grpc/backend.go
+++ b/pkg/grpc/backend.go
@@ -41,11 +41,34 @@ func buildClient(address string, parallel bool, wd WatchDog, enableWatchDog bool
 	}
 }
 
+// Backend is the full client surface of a model backend. It is deliberately
+// composed of two sub-interfaces so that wrappers can get a COMPILE-TIME
+// guarantee about which methods they must account for:
+//
+//   - InferenceBackend - methods that each perform one discrete inference call
+//     (the call begins on entry and ends on return). A wrapper that does
+//     per-call accounting - e.g. the distributed router's in-flight tracker,
+//     core/services/nodes.InFlightTrackingClient - embeds only ControlBackend
+//     and implements every InferenceBackend method explicitly. Adding a method
+//     to InferenceBackend therefore breaks that wrapper's build until it is
+//     implemented: inference can't be added without an accounting decision.
+//   - ControlBackend - everything that is NOT a discrete inference call:
+//     lifecycle/control-plane operations and the streaming constructors whose
+//     work spans the returned stream rather than the constructor call. These
+//     are safe to pass through untracked.
+//
+// Keep the two sets disjoint; every backend method belongs to exactly one.
 type Backend interface {
-	IsBusy() bool
-	HealthCheck(ctx context.Context) (bool, error)
+	InferenceBackend
+	ControlBackend
+}
+
+// InferenceBackend is the subset of Backend whose methods each map to a single
+// inference call. Wrappers that account for in-flight work must implement these
+// explicitly (see Backend). Do NOT add methods that return a stream client or
+// that are control-plane only - those belong in ControlBackend.
+type InferenceBackend interface {
 	Embeddings(ctx context.Context, in *pb.PredictOptions, opts ...grpc.CallOption) (*pb.EmbeddingResult, error)
-	LoadModel(ctx context.Context, in *pb.ModelOptions, opts ...grpc.CallOption) (*pb.Result, error)
 	PredictStream(ctx context.Context, in *pb.PredictOptions, f func(reply *pb.Reply), opts ...grpc.CallOption) error
 	Predict(ctx context.Context, in *pb.PredictOptions, opts ...grpc.CallOption) (*pb.Reply, error)
 	GenerateImage(ctx context.Context, in *pb.GenerateImageRequest, opts ...grpc.CallOption) (*pb.Result, error)
@@ -53,6 +76,8 @@ type Backend interface {
 	TTS(ctx context.Context, in *pb.TTSRequest, opts ...grpc.CallOption) (*pb.Result, error)
 	TTSStream(ctx context.Context, in *pb.TTSRequest, f func(reply *pb.Reply), opts ...grpc.CallOption) error
 	SoundGeneration(ctx context.Context, in *pb.SoundGenerationRequest, opts ...grpc.CallOption) (*pb.Result, error)
+	AudioTranscription(ctx context.Context, in *pb.TranscriptRequest, opts ...grpc.CallOption) (*pb.TranscriptResult, error)
+	AudioTranscriptionStream(ctx context.Context, in *pb.TranscriptRequest, f func(chunk *pb.TranscriptStreamResponse), opts ...grpc.CallOption) error
 	Detect(ctx context.Context, in *pb.DetectOptions, opts ...grpc.CallOption) (*pb.DetectResponse, error)
 	Depth(ctx context.Context, in *pb.DepthRequest, opts ...grpc.CallOption) (*pb.DepthResponse, error)
 	FaceVerify(ctx context.Context, in *pb.FaceVerifyRequest, opts ...grpc.CallOption) (*pb.FaceVerifyResponse, error)
@@ -60,8 +85,25 @@ type Backend interface {
 	VoiceVerify(ctx context.Context, in *pb.VoiceVerifyRequest, opts ...grpc.CallOption) (*pb.VoiceVerifyResponse, error)
 	VoiceAnalyze(ctx context.Context, in *pb.VoiceAnalyzeRequest, opts ...grpc.CallOption) (*pb.VoiceAnalyzeResponse, error)
 	VoiceEmbed(ctx context.Context, in *pb.VoiceEmbedRequest, opts ...grpc.CallOption) (*pb.VoiceEmbedResponse, error)
-	AudioTranscription(ctx context.Context, in *pb.TranscriptRequest, opts ...grpc.CallOption) (*pb.TranscriptResult, error)
-	AudioTranscriptionStream(ctx context.Context, in *pb.TranscriptRequest, f func(chunk *pb.TranscriptStreamResponse), opts ...grpc.CallOption) error
+	Rerank(ctx context.Context, in *pb.RerankRequest, opts ...grpc.CallOption) (*pb.RerankResult, error)
+	TokenClassify(ctx context.Context, in *pb.TokenClassifyRequest, opts ...grpc.CallOption) (*pb.TokenClassifyResponse, error)
+	Score(ctx context.Context, in *pb.ScoreRequest, opts ...grpc.CallOption) (*pb.ScoreResponse, error)
+	VAD(ctx context.Context, in *pb.VADRequest, opts ...grpc.CallOption) (*pb.VADResponse, error)
+	Diarize(ctx context.Context, in *pb.DiarizeRequest, opts ...grpc.CallOption) (*pb.DiarizeResponse, error)
+	SoundDetection(ctx context.Context, in *pb.SoundDetectionRequest, opts ...grpc.CallOption) (*pb.SoundDetectionResponse, error)
+	AudioEncode(ctx context.Context, in *pb.AudioEncodeRequest, opts ...grpc.CallOption) (*pb.AudioEncodeResult, error)
+	AudioDecode(ctx context.Context, in *pb.AudioDecodeRequest, opts ...grpc.CallOption) (*pb.AudioDecodeResult, error)
+	AudioTransform(ctx context.Context, in *pb.AudioTransformRequest, opts ...grpc.CallOption) (*pb.AudioTransformResult, error)
+}
+
+// ControlBackend is the subset of Backend that is NOT per-call inference:
+// lifecycle/control-plane operations and the streaming constructors whose work
+// spans the returned stream rather than the constructor call. In-flight-tracking
+// wrappers embed this directly and pass it through untracked (see Backend).
+type ControlBackend interface {
+	IsBusy() bool
+	HealthCheck(ctx context.Context) (bool, error)
+	LoadModel(ctx context.Context, in *pb.ModelOptions, opts ...grpc.CallOption) (*pb.Result, error)
 	TokenizeString(ctx context.Context, in *pb.PredictOptions, opts ...grpc.CallOption) (*pb.TokenizationResponse, error)
 	Status(ctx context.Context) (*pb.StatusResponse, error)
 
@@ -70,24 +112,11 @@ type Backend interface {
 	StoresGet(ctx context.Context, in *pb.StoresGetOptions, opts ...grpc.CallOption) (*pb.StoresGetResult, error)
 	StoresFind(ctx context.Context, in *pb.StoresFindOptions, opts ...grpc.CallOption) (*pb.StoresFindResult, error)
 
-	Rerank(ctx context.Context, in *pb.RerankRequest, opts ...grpc.CallOption) (*pb.RerankResult, error)
-
-	TokenClassify(ctx context.Context, in *pb.TokenClassifyRequest, opts ...grpc.CallOption) (*pb.TokenClassifyResponse, error)
-
-	Score(ctx context.Context, in *pb.ScoreRequest, opts ...grpc.CallOption) (*pb.ScoreResponse, error)
-
 	GetTokenMetrics(ctx context.Context, in *pb.MetricsRequest, opts ...grpc.CallOption) (*pb.MetricsResponse, error)
 
-	VAD(ctx context.Context, in *pb.VADRequest, opts ...grpc.CallOption) (*pb.VADResponse, error)
-
-	Diarize(ctx context.Context, in *pb.DiarizeRequest, opts ...grpc.CallOption) (*pb.DiarizeResponse, error)
-
-	SoundDetection(ctx context.Context, in *pb.SoundDetectionRequest, opts ...grpc.CallOption) (*pb.SoundDetectionResponse, error)
-
-	AudioEncode(ctx context.Context, in *pb.AudioEncodeRequest, opts ...grpc.CallOption) (*pb.AudioEncodeResult, error)
-	AudioDecode(ctx context.Context, in *pb.AudioDecodeRequest, opts ...grpc.CallOption) (*pb.AudioDecodeResult, error)
-
-	AudioTransform(ctx context.Context, in *pb.AudioTransformRequest, opts ...grpc.CallOption) (*pb.AudioTransformResult, error)
+	// Streaming constructors: these return a stream client immediately; the
+	// actual inference spans the stream's lifetime, not this call, so they are
+	// NOT tracked as a single in-flight unit.
 	AudioTransformStream(ctx context.Context, opts ...grpc.CallOption) (AudioTransformStreamClient, error)
 	AudioToAudioStream(ctx context.Context, opts ...grpc.CallOption) (AudioToAudioStreamClient, error)
 

From e1994579f8a85b4bad5bcd7162f248b999b28b9e Mon Sep 17 00:00:00 2001
From: Richard Palethorpe <io@richiejp.com>
Date: Wed, 24 Jun 2026 10:08:57 +0100
Subject: [PATCH 73/99] fix(pii): load default detectors at startup + add
 LOCALAI_PII_DEFAULT_DETECTORS (#10474)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

pii_default_detectors was applied to the live config only by a live
POST /api/settings (ApplyRuntimeSettings) — neither the startup loader nor
the config file watcher read it back. So after a restart the persisted
default detectors were dropped, and the cloud-proxy MITM listener (which
resolves each intercept host's detectors once at start via ResolvePIIPolicy)
came up with an empty set and forwarded intercepted traffic unredacted, even
though the MITM model had pii.enabled:true and the defaults were on disk.
Request-side default redaction broke the same way.

- startup.go: loadRuntimeSettingsFromFile now applies pii_default_detectors,
  before startMITMIfConfigured, with env > file precedence.
- config_file_watcher.go: apply pii_default_detectors on live file edits,
  matching the existing env-guard pattern used for the other fields.
- settings endpoint: rebuild the MITM listener when pii_default_detectors
  changes (its per-host detector map is frozen at listener start), not only
  on a mitm_listen change — so toggling a default detector takes effect on
  cloud-proxy traffic immediately.
- new LOCALAI_PII_DEFAULT_DETECTORS env var / CLI flag (WithPIIDefaultDetectors)
  so the default detector set can be pinned at boot for immutable deployments.

Assisted-by: Claude:claude-opus-4-8 Claude-Code

Signed-off-by: Richard Palethorpe <io@richiejp.com>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
---
 core/application/config_file_watcher.go       | 10 ++++
 .../runtime_settings_branding_test.go         | 46 +++++++++++++++++++
 core/application/startup.go                   | 14 ++++++
 core/cli/run.go                               |  3 ++
 core/config/application_config.go             | 12 +++++
 core/http/endpoints/localai/settings.go       |  9 +++-
 core/http/endpoints/localai/settings_test.go  | 18 ++++++++
 docs/content/features/middleware.md           |  7 +++
 8 files changed, 118 insertions(+), 1 deletion(-)

diff --git a/core/application/config_file_watcher.go b/core/application/config_file_watcher.go
index e99de63e6..a5f7d5f48 100644
--- a/core/application/config_file_watcher.go
+++ b/core/application/config_file_watcher.go
@@ -215,6 +215,7 @@ func readRuntimeSettingsJson(startupAppConfig config.ApplicationConfig) fileHand
 		envBackendGalleries := slices.Equal(appConfig.BackendGalleries, startupAppConfig.BackendGalleries)
 		envAutoloadGalleries := appConfig.AutoloadGalleries == startupAppConfig.AutoloadGalleries
 		envAutoloadBackendGalleries := appConfig.AutoloadBackendGalleries == startupAppConfig.AutoloadBackendGalleries
+		envPIIDefaultDetectors := slices.Equal(appConfig.PIIDefaultDetectors, startupAppConfig.PIIDefaultDetectors)
 		envAgentJobRetentionDays := appConfig.AgentJobRetentionDays == startupAppConfig.AgentJobRetentionDays
 		envForceEvictionWhenBusy := appConfig.ForceEvictionWhenBusy == startupAppConfig.ForceEvictionWhenBusy
 		envLRUEvictionMaxRetries := appConfig.LRUEvictionMaxRetries == startupAppConfig.LRUEvictionMaxRetries
@@ -335,6 +336,15 @@ func readRuntimeSettingsJson(startupAppConfig config.ApplicationConfig) fileHand
 			if settings.AutoloadBackendGalleries != nil && !envAutoloadBackendGalleries {
 				appConfig.AutoloadBackendGalleries = *settings.AutoloadBackendGalleries
 			}
+			if settings.PIIDefaultDetectors != nil && !envPIIDefaultDetectors {
+				// Request-side default redaction reads this live via
+				// ResolvePIIPolicy, so a file edit takes effect on the next chat
+				// request. The MITM listener resolves its per-host detector map
+				// once at start, so a raw file edit reaches cloud-proxy traffic
+				// only after a restart or a POST /api/settings (which rebuilds
+				// the listener) — the admin UI uses the latter.
+				appConfig.PIIDefaultDetectors = append([]string(nil), (*settings.PIIDefaultDetectors)...)
+			}
 			if settings.AutoUpgradeBackends != nil {
 				appConfig.AutoUpgradeBackends = *settings.AutoUpgradeBackends
 			}
diff --git a/core/application/runtime_settings_branding_test.go b/core/application/runtime_settings_branding_test.go
index 6300f4456..763ede4b1 100644
--- a/core/application/runtime_settings_branding_test.go
+++ b/core/application/runtime_settings_branding_test.go
@@ -109,6 +109,52 @@ var _ = Describe("loadRuntimeSettingsFromFile", func() {
 		})
 	})
 
+	// Instance-wide default PII detectors. The file is the only source (no
+	// env var), and the loader runs immediately before startMITMIfConfigured,
+	// so a regression here means the cloud-proxy MITM listener resolves an
+	// empty detector set at boot and forwards intercepted traffic unredacted —
+	// even though pii_default_detectors is on disk and the MITM model has PII
+	// enabled. It also breaks request-side default redaction the same way.
+	Describe("PII default detectors", func() {
+		It("loads pii_default_detectors from the file", func() {
+			cfg := &config.ApplicationConfig{DynamicConfigsDir: seedSettings(`{"pii_default_detectors": ["privacy-filter-nemotron", "secret-filter"]}`)}
+			loadRuntimeSettingsFromFile(cfg)
+			Expect(cfg.PIIDefaultDetectors).To(Equal([]string{"privacy-filter-nemotron", "secret-filter"}))
+		})
+
+		It("does not override an env/CLI-set value (LOCALAI_PII_DEFAULT_DETECTORS)", func() {
+			cfg := &config.ApplicationConfig{
+				DynamicConfigsDir:   seedSettings(`{"pii_default_detectors": ["from-file"]}`),
+				PIIDefaultDetectors: []string{"from-env"}, // simulate WithPIIDefaultDetectors(env)
+			}
+			loadRuntimeSettingsFromFile(cfg)
+			Expect(cfg.PIIDefaultDetectors).To(Equal([]string{"from-env"}), "env var must win over the persisted file value")
+		})
+	})
+
+	// The live file watcher applies pii_default_detectors on a runtime change
+	// the same way it handles galleries/threads/etc.: env-set values (current
+	// == startup snapshot) are left alone, otherwise the file value is applied
+	// to the live config so request-side default redaction picks it up without
+	// a restart.
+	Describe("file watcher: pii_default_detectors", func() {
+		It("applies a changed file value to the live config", func() {
+			startup := config.ApplicationConfig{} // no env baseline
+			live := &config.ApplicationConfig{PIIDefaultDetectors: []string{"old"}}
+			handler := readRuntimeSettingsJson(startup)
+			Expect(handler([]byte(`{"pii_default_detectors":["new-a","new-b"]}`), live)).To(Succeed())
+			Expect(live.PIIDefaultDetectors).To(Equal([]string{"new-a", "new-b"}))
+		})
+
+		It("leaves an env-controlled value untouched", func() {
+			startup := config.ApplicationConfig{PIIDefaultDetectors: []string{"from-env"}}
+			live := &config.ApplicationConfig{PIIDefaultDetectors: []string{"from-env"}}
+			handler := readRuntimeSettingsJson(startup)
+			Expect(handler([]byte(`{"pii_default_detectors":["from-file"]}`), live)).To(Succeed())
+			Expect(live.PIIDefaultDetectors).To(Equal([]string{"from-env"}), "env-controlled detectors must not be overwritten by the file")
+		})
+	})
+
 	// The Agent Pool block has a mix of zero and non-zero defaults
 	// (Enabled=true, EmbeddingModel="granite-...", MaxChunkingSize=400,
 	// VectorEngine="chromem", AgentHubURL="https://agenthub.localai.io").
diff --git a/core/application/startup.go b/core/application/startup.go
index 352d66dab..1e5a7a73b 100644
--- a/core/application/startup.go
+++ b/core/application/startup.go
@@ -750,6 +750,20 @@ func loadRuntimeSettingsFromFile(options *config.ApplicationConfig) {
 		options.MITMListen = *settings.MITMListen
 	}
 
+	// Instance-wide default PII detectors. LOCALAI_PII_DEFAULT_DETECTORS (via
+	// WithPIIDefaultDetectors) wins when set; otherwise the file is the source
+	// — apply it only when the env/CLI left the value empty, mirroring the
+	// "env > file" precedence used for the other fields. This must land before
+	// startMITMIfConfigured (called right after this loader): the cloud-proxy
+	// listener resolves each intercept host's detectors once at start via
+	// ResolvePIIPolicy, and a MITM model that names no detectors of its own
+	// falls back to these defaults. Without it the listener (and request-side
+	// default redaction) starts with an empty detector set and forwards
+	// traffic unredacted even though pii_default_detectors is on disk.
+	if settings.PIIDefaultDetectors != nil && len(options.PIIDefaultDetectors) == 0 {
+		options.PIIDefaultDetectors = append([]string(nil), (*settings.PIIDefaultDetectors)...)
+	}
+
 	// Backend upgrade flags
 	if settings.AutoUpgradeBackends != nil {
 		if !options.AutoUpgradeBackends {
diff --git a/core/cli/run.go b/core/cli/run.go
index 23eebaaa0..abb0cdbf1 100644
--- a/core/cli/run.go
+++ b/core/cli/run.go
@@ -181,6 +181,8 @@ type RunCMD struct {
 	// Cloud-proxy MITM listener (off by default).
 	MITMListen string `env:"LOCALAI_MITM_LISTEN" help:"Address (host:port) for the cloudproxy MITM listener. Empty = disabled. Clients set HTTPS_PROXY=http://<this>:<port>. Intercept hosts are declared per-model via the model YAML mitm.hosts: block; create one from the Add Model UI." group:"middleware"`
 	MITMCADir  string `env:"LOCALAI_MITM_CA_DIR" type:"path" help:"Directory holding the MITM proxy CA cert + key. Defaults to <data-path>/mitm-ca." group:"middleware"`
+
+	PIIDefaultDetectors []string `env:"LOCALAI_PII_DEFAULT_DETECTORS" help:"Instance-wide default PII/secret detector model names applied to any PII-enabled model (chiefly cloud-proxy / MITM models) that names no pii.detectors of its own. Comma-separated, e.g. privacy-filter-nemotron,secret-filter. Takes precedence over the value persisted via the Middleware UI." group:"middleware"`
 }
 
 func (r *RunCMD) Run(ctx *cliContext.Context) error {
@@ -243,6 +245,7 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
 		config.WithAPIAddress(r.Address),
 		config.WithMITMListen(r.MITMListen),
 		config.WithMITMCADir(r.MITMCADir),
+		config.WithPIIDefaultDetectors(r.PIIDefaultDetectors),
 		config.WithAgentJobRetentionDays(r.AgentJobRetentionDays),
 		config.WithLlamaCPPTunnelCallback(func(tunnels []string) {
 			tunnelEnvVar := strings.Join(tunnels, ",")
diff --git a/core/config/application_config.go b/core/config/application_config.go
index 54eb5cb99..87acd6bd5 100644
--- a/core/config/application_config.go
+++ b/core/config/application_config.go
@@ -712,6 +712,18 @@ func WithMITMCADir(dir string) AppOption {
 	}
 }
 
+// WithPIIDefaultDetectors sets the instance-wide default PII/secret detector
+// model names applied to any PII-enabled model (chiefly cloud-proxy / MITM
+// models) that names no pii.detectors of its own. CLI/env:
+// LOCALAI_PII_DEFAULT_DETECTORS. Empty leaves the value to
+// runtime_settings.json / the Middleware UI; a non-empty value takes
+// precedence over the file (env > file).
+func WithPIIDefaultDetectors(detectors []string) AppOption {
+	return func(o *ApplicationConfig) {
+		o.PIIDefaultDetectors = detectors
+	}
+}
+
 func WithDynamicConfigDir(dynamicConfigsDir string) AppOption {
 	return func(o *ApplicationConfig) {
 		o.DynamicConfigsDir = dynamicConfigsDir
diff --git a/core/http/endpoints/localai/settings.go b/core/http/endpoints/localai/settings.go
index be6358939..8033f07d5 100644
--- a/core/http/endpoints/localai/settings.go
+++ b/core/http/endpoints/localai/settings.go
@@ -271,7 +271,14 @@ func UpdateSettingsEndpoint(app *application.Application) echo.HandlerFunc {
 			}
 		}
 
-		if settings.MITMListen != nil {
+		// Rebuild the MITM listener when its address OR the instance-wide
+		// default detectors change. The per-host detector map is resolved once
+		// at listener start (startMITMLocked → ResolvePIIPolicy), so a
+		// default-detector change is otherwise invisible to cloud-proxy traffic
+		// until the next restart — an admin toggling a default detector would
+		// see no redaction. RestartMITM is a no-op when the listener is
+		// disabled (empty address).
+		if settings.MITMListen != nil || settings.PIIDefaultDetectors != nil {
 			if err := app.RestartMITM(); err != nil {
 				xlog.Error("Failed to restart MITM proxy", "error", err)
 				return c.JSON(http.StatusInternalServerError, schema.SettingsResponse{
diff --git a/core/http/endpoints/localai/settings_test.go b/core/http/endpoints/localai/settings_test.go
index 7ba82e1a3..3974c5045 100644
--- a/core/http/endpoints/localai/settings_test.go
+++ b/core/http/endpoints/localai/settings_test.go
@@ -146,6 +146,24 @@ var _ = Describe("Settings endpoints", func() {
 		Expect(*ondisk.PIIDefaultDetectors).To(Equal([]string{"det-a"}))
 	})
 
+	// The MITM listener resolves its per-host PII detectors once at start
+	// (startMITMLocked → ResolvePIIPolicy), and the handler used to restart it
+	// only when mitm_listen changed. So an admin toggling a default detector
+	// (the Middleware detector table POSTs only pii_default_detectors) left
+	// cloud-proxy traffic unredacted until the next reboot. A
+	// pii_default_detectors change must now rebuild the listener.
+	It("rebuilds the MITM listener when only pii_default_detectors changes", func() {
+		rec := post(`{"mitm_listen":"127.0.0.1:0"}`)
+		Expect(rec.Code).To(Equal(http.StatusOK), rec.Body.String())
+		srv1 := app.MITMServer()
+		Expect(srv1).ToNot(BeNil(), "listener should be running after mitm_listen is set")
+
+		rec = post(`{"pii_default_detectors":["det-a"]}`)
+		Expect(rec.Code).To(Equal(http.StatusOK), rec.Body.String())
+		Expect(app.MITMServer()).ToNot(BeIdenticalTo(srv1),
+			"a default-detector change must restart the listener so it picks up the new detectors")
+	})
+
 	// Residual #9125: enabling the watchdog from a cold (off) state via the
 	// React master toggle must start the live watchdog immediately, without a
 	// restart. The toggle posts watchdog_idle_enabled/busy_enabled=true while
diff --git a/docs/content/features/middleware.md b/docs/content/features/middleware.md
index 5f03cb925..397af3c92 100644
--- a/docs/content/features/middleware.md
+++ b/docs/content/features/middleware.md
@@ -185,6 +185,13 @@ It is persisted through `POST /api/settings` and read live, so a change takes
 effect on the next request without a restart. A default that names a model no
 longer loaded still appears (marked *not loaded*) so it can be toggled off.
 
+The default set can also be supplied out-of-band with the
+`LOCALAI_PII_DEFAULT_DETECTORS` environment variable (comma-separated model
+names, e.g. `privacy-filter-nemotron,secret-filter`). When set it takes
+precedence over the value persisted via the UI (env > file), which is the
+right behaviour for immutable container deployments that pin filtering policy
+at boot rather than via the admin UI.
+
 This is what makes `cloud-proxy` / MITM redaction work out of the box: those
 backends default to PII-enabled but ship no detector list, so without a
 default detector the filter runs with nothing to scan. Set one here and

From e8ae88a2a01899e5de795a8d3bb5a40aed2beed1 Mon Sep 17 00:00:00 2001
From: "Dedy F. Setyawan" <dedyfajars@gmail.com>
Date: Wed, 24 Jun 2026 23:35:21 +0700
Subject: [PATCH 74/99] i18n(id): update and complete Indonesian translations
 (#10480)

- translate remaining English strings in chat, common, home, and media locales.
- fix typo and improve wording consistency (e.g., klaster -> kluster, otomasi -> automasi).

Signed-off-by: Dedy F. Setyawan <dedyfajars@gmail.com>
---
 .../react-ui/public/locales/id/admin.json     |  4 ++--
 .../http/react-ui/public/locales/id/chat.json |  4 ++--
 .../react-ui/public/locales/id/common.json    |  6 ++---
 .../http/react-ui/public/locales/id/home.json | 22 +++++++++----------
 .../react-ui/public/locales/id/media.json     |  6 ++---
 core/http/react-ui/public/locales/id/nav.json | 12 +++++-----
 6 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/core/http/react-ui/public/locales/id/admin.json b/core/http/react-ui/public/locales/id/admin.json
index 17da13570..28fa5829c 100644
--- a/core/http/react-ui/public/locales/id/admin.json
+++ b/core/http/react-ui/public/locales/id/admin.json
@@ -45,7 +45,7 @@
   },
   "scheduling": {
     "title": "Penjadwalan",
-    "subtitle": "Aturan penempatan model dan replika di seluruh klaster"
+    "subtitle": "Aturan penempatan model dan replika di seluruh kluster"
   },
   "p2p": {
     "title": "Komputasi AI Terdistribusi",
@@ -86,4 +86,4 @@
     "title": "Penjelajah",
     "subtitle": "Jelajahi file dan konfigurasi"
   }
-}
\ No newline at end of file
+}
diff --git a/core/http/react-ui/public/locales/id/chat.json b/core/http/react-ui/public/locales/id/chat.json
index c79edaeb4..b9216e325 100644
--- a/core/http/react-ui/public/locales/id/chat.json
+++ b/core/http/react-ui/public/locales/id/chat.json
@@ -72,7 +72,7 @@
   "actions": {
     "copy": "Salin",
     "regenerate": "Hasilkan ulang",
-    "jumpToLatest": "Jump to latest"
+    "jumpToLatest": "Lompat ke terbaru"
   },
   "streaming": {
     "transferring": "Mentransfer model...",
@@ -115,4 +115,4 @@
     "clearAll": "Hapus semua",
     "deleteAllTitle": "Hapus semua percakapan"
   }
-}
\ No newline at end of file
+}
diff --git a/core/http/react-ui/public/locales/id/common.json b/core/http/react-ui/public/locales/id/common.json
index 711b056df..3fc28806d 100644
--- a/core/http/react-ui/public/locales/id/common.json
+++ b/core/http/react-ui/public/locales/id/common.json
@@ -1,8 +1,8 @@
 {
   "unsaved": {
-    "title": "Discard unsaved changes?",
-    "message": "You have unsaved changes that will be lost if you leave this page.",
-    "leave": "Leave"
+    "title": "Buang perubahan yang belum disimpan?",
+    "message": "Anda memiliki perubahan yang belum disimpan. Perubahan tersebut akan hilang jika Anda meninggalkan halaman ini.",
+    "leave": "Tinggalkan Halaman"
   },
   "actions": {
     "save": "Simpan",
diff --git a/core/http/react-ui/public/locales/id/home.json b/core/http/react-ui/public/locales/id/home.json
index 368a40709..4e2aafdcb 100644
--- a/core/http/react-ui/public/locales/id/home.json
+++ b/core/http/react-ui/public/locales/id/home.json
@@ -7,15 +7,15 @@
   "resourceGpu": "GPU",
   "resourceRam": "RAM",
   "greeting": {
-    "morning": "Good morning",
-    "afternoon": "Good afternoon",
-    "evening": "Good evening",
-    "night": "Working late"
+    "morning": "Selamat pagi",
+    "afternoon": "Selamat siang",
+    "evening": "Selamat malam",
+    "night": "Selamat lembur"
   },
   "statusLine": {
-    "modelsLoaded_one": "{{count}} model loaded",
-    "modelsLoaded_other": "{{count}} models loaded",
-    "noModelsLoaded": "No models loaded",
+    "modelsLoaded_one": "{{count}} model dimuat",
+    "modelsLoaded_other": "{{count}} model dimuat",
+    "noModelsLoaded": "Tidak ada model yang dimuat",
     "nodes_one": "{{count}} node",
     "nodes_other": "{{count}} nodes"
   },
@@ -79,14 +79,14 @@
   },
   "connect": {
     "title": "Satu endpoint, semua API",
-    "subtitle": "LocalAI menyediakan API miliknya sendiri yang lengkap — pembuatan gambar & video, depth, deteksi objek, reranking, audio, pengenalan wajah & suara, serta suara realtime melalui WebRTC dan WebSocket. Di atas itu, lapisan kompatibilitas drop-in membuat aplikasi apa pun yang dibuat untuk OpenAI, Anthropic, Ollama, atau OpenAI Responses bekerja tanpa perubahan.",
+    "subtitle": "LocalAI menyediakan API miliknya sendiri yang lengkap — pembuatan gambar & video, depth, deteksi objek, reranking, audio, pengenalan wajah & suara, serta suara realtime melalui WebRTC dan WebSocket. Selain itu, lapisan kompatibilitas drop-in membuat aplikasi apa pun yang dibuat untuk OpenAI, Anthropic, Ollama, atau OpenAI Responses bekerja tanpa perubahan.",
     "nativeTitle": "API native",
     "compatTitle": "Kompatibilitas drop-in",
     "apiReference": "Referensi API lengkap",
     "copy": "Salin",
     "copied": "Disalin",
-    "browse": "Browse the API",
-    "hide": "Hide endpoints",
-    "dismiss": "Dismiss"
+    "browse": "Jelajahi API",
+    "hide": "Sembunyikan endpoint",
+    "dismiss": "Abaikan"
   }
 }
diff --git a/core/http/react-ui/public/locales/id/media.json b/core/http/react-ui/public/locales/id/media.json
index b49670c63..10350967b 100644
--- a/core/http/react-ui/public/locales/id/media.json
+++ b/core/http/react-ui/public/locales/id/media.json
@@ -5,7 +5,7 @@
       "video": "Video",
       "tts": "TTS",
       "sound": "Suara",
-      "transform": "Transform"
+      "transform": "Transformasi"
     }
   },
   "image": {
@@ -30,7 +30,7 @@
       "refImagesAdded_other": "{{count}} gambar ditambahkan"
     },
     "actions": {
-      "view": "View",
+      "view": "Lihat",
       "generate": "Hasilkan",
       "generating": "Menghasilkan..."
     },
@@ -153,4 +153,4 @@
     "clearConfirm": "Hapus",
     "cleared": "Riwayat dihapus"
   }
-}
\ No newline at end of file
+}
diff --git a/core/http/react-ui/public/locales/id/nav.json b/core/http/react-ui/public/locales/id/nav.json
index 225fc59f7..c13c197d9 100644
--- a/core/http/react-ui/public/locales/id/nav.json
+++ b/core/http/react-ui/public/locales/id/nav.json
@@ -19,11 +19,11 @@
     "operate": "Operasikan"
   },
   "operate": {
-    "inference": "Inference",
-    "cluster": "Cluster",
-    "observability": "Observability",
-    "access": "Access",
-    "system": "System"
+    "inference": "Inferensi",
+    "cluster": "Kluster",
+    "observability": "Observabilitas",
+    "access": "Akses",
+    "system": "Sistem"
   },
   "items": {
     "home": "Beranda",
@@ -64,7 +64,7 @@
     "copyright": "© 2023-{{year}} {{author}}"
   },
   "console": {
-    "automation": "Otomasi",
+    "automation": "Automasi",
     "training": "Pelatihan"
   }
 }

From 482314c623c86d9fb362a828a371295c1d417f85 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Wed, 24 Jun 2026 21:50:44 +0200
Subject: [PATCH 75/99] fix(realtime): resolve model aliases for pipeline
 sub-models (#10484)

Realtime pipeline sub-models (llm/transcription/tts/vad/sound-detection)
were loaded via cl.LoadModelConfigFileByName without alias resolution,
unlike top-level API requests which resolve aliases in
core/http/middleware/request.go. So a pipeline that references an alias
(e.g. `pipeline.llm: default`, where `default` is an alias for a real
LLM) reached model loading as the alias stub with an empty Backend.

This was silently broken on a single host (it failed downstream) and a
hard error in distributed/p2p mode:

    routing model : loading model default: ... installing backend on
    node X: backend name is empty

Fix by routing every pipeline sub-model load through a small helper that
follows a single alias hop (mirroring the top-level resolution), so
non-alias sub-models behave identically and aliased ones get the
target's full config (Backend, Model, ...).

Assisted-by: Claude:claude-opus-4-8

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
---
 core/http/endpoints/openai/realtime_model.go  | 33 +++++++++---
 .../openai/realtime_model_alias_test.go       | 52 +++++++++++++++++++
 2 files changed, 78 insertions(+), 7 deletions(-)
 create mode 100644 core/http/endpoints/openai/realtime_model_alias_test.go

diff --git a/core/http/endpoints/openai/realtime_model.go b/core/http/endpoints/openai/realtime_model.go
index 6843a521d..0dafa0a35 100644
--- a/core/http/endpoints/openai/realtime_model.go
+++ b/core/http/endpoints/openai/realtime_model.go
@@ -432,7 +432,7 @@ func loadSoundDetectionConfig(pipeline *config.Pipeline, cl *config.ModelConfigL
 	if pipeline.SoundDetection == "" {
 		return nil, nil
 	}
-	cfg, err := cl.LoadModelConfigFileByName(pipeline.SoundDetection, ml.ModelPath)
+	cfg, err := loadPipelineSubModel(cl, pipeline.SoundDetection, ml.ModelPath)
 	if err != nil {
 		return nil, fmt.Errorf("failed to load sound detection config: %w", err)
 	}
@@ -443,7 +443,7 @@ func loadSoundDetectionConfig(pipeline *config.Pipeline, cl *config.ModelConfigL
 }
 
 func newTranscriptionOnlyModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) (Model, *config.ModelConfig, error) {
-	cfgVAD, err := cl.LoadModelConfigFileByName(pipeline.VAD, ml.ModelPath)
+	cfgVAD, err := loadPipelineSubModel(cl, pipeline.VAD, ml.ModelPath)
 	if err != nil {
 
 		return nil, nil, fmt.Errorf("failed to load backend config: %w", err)
@@ -453,7 +453,7 @@ func newTranscriptionOnlyModel(pipeline *config.Pipeline, cl *config.ModelConfig
 		return nil, nil, fmt.Errorf("failed to validate config: %w", err)
 	}
 
-	cfgSST, err := cl.LoadModelConfigFileByName(pipeline.Transcription, ml.ModelPath)
+	cfgSST, err := loadPipelineSubModel(cl, pipeline.Transcription, ml.ModelPath)
 	if err != nil {
 
 		return nil, nil, fmt.Errorf("failed to load backend config: %w", err)
@@ -542,11 +542,30 @@ func buildRealtimeRoutingContext(a *application.Application, sessionID string) *
 	}
 }
 
+// loadPipelineSubModel loads a pipeline sub-model config by name and follows a
+// single alias hop, so a pipeline that references an alias (e.g. `llm: default`)
+// gets the alias target's full config (Backend, Model, ...) rather than the
+// alias stub with an empty Backend. Without this the alias survives unresolved
+// into model loading and fails downstream — notably in distributed mode with
+// "backend name is empty". Mirrors the top-level alias resolution in
+// core/http/middleware/request.go.
+func loadPipelineSubModel(cl *config.ModelConfigLoader, name, modelPath string) (*config.ModelConfig, error) {
+	cfg, err := cl.LoadModelConfigFileByName(name, modelPath)
+	if err != nil {
+		return nil, err
+	}
+	resolved, _, err := cl.ResolveAlias(cfg)
+	if err != nil {
+		return nil, err
+	}
+	return resolved, nil
+}
+
 // returns and loads either a wrapped model or a model that support audio-to-audio
 func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig, evaluator *templates.Evaluator, routing *RealtimeRoutingContext) (Model, error) {
 	xlog.Debug("Creating new model pipeline model", "pipeline", pipeline)
 
-	cfgVAD, err := cl.LoadModelConfigFileByName(pipeline.VAD, ml.ModelPath)
+	cfgVAD, err := loadPipelineSubModel(cl, pipeline.VAD, ml.ModelPath)
 	if err != nil {
 
 		return nil, fmt.Errorf("failed to load backend config: %w", err)
@@ -557,7 +576,7 @@ func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model
 	}
 
 	// TODO: Do we always need a transcription model? It can be disabled. Note that any-to-any instruction following models don't transcribe as such, so if transcription is required it is a separate process
-	cfgSST, err := cl.LoadModelConfigFileByName(pipeline.Transcription, ml.ModelPath)
+	cfgSST, err := loadPipelineSubModel(cl, pipeline.Transcription, ml.ModelPath)
 	if err != nil {
 
 		return nil, fmt.Errorf("failed to load backend config: %w", err)
@@ -589,7 +608,7 @@ func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model
 	xlog.Debug("Loading a wrapped model")
 
 	// Otherwise we want to return a wrapped model, which is a "virtual" model that re-uses other models to perform operations
-	cfgLLM, err := cl.LoadModelConfigFileByName(pipeline.LLM, ml.ModelPath)
+	cfgLLM, err := loadPipelineSubModel(cl, pipeline.LLM, ml.ModelPath)
 	if err != nil {
 
 		return nil, fmt.Errorf("failed to load backend config: %w", err)
@@ -604,7 +623,7 @@ func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model
 	applyPipelineReasoning(cfgLLM, *pipeline)
 	applyPipelineThinking(cfgLLM, *pipeline)
 
-	cfgTTS, err := cl.LoadModelConfigFileByName(pipeline.TTS, ml.ModelPath)
+	cfgTTS, err := loadPipelineSubModel(cl, pipeline.TTS, ml.ModelPath)
 	if err != nil {
 
 		return nil, fmt.Errorf("failed to load backend config: %w", err)
diff --git a/core/http/endpoints/openai/realtime_model_alias_test.go b/core/http/endpoints/openai/realtime_model_alias_test.go
new file mode 100644
index 000000000..77179d963
--- /dev/null
+++ b/core/http/endpoints/openai/realtime_model_alias_test.go
@@ -0,0 +1,52 @@
+package openai
+
+import (
+	"os"
+	"path/filepath"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	"github.com/mudler/LocalAI/core/config"
+)
+
+// loadPipelineSubModel must resolve a pipeline sub-model that references an
+// alias (e.g. `llm: default`) one hop to the alias target's full config — so
+// the effective backend is the target's backend, not the empty backend of the
+// alias stub. This mirrors the top-level alias resolution done in
+// core/http/middleware/request.go, which the realtime pipeline previously
+// skipped (failing in distributed mode with "backend name is empty").
+var _ = Describe("loadPipelineSubModel", func() {
+	It("resolves a sub-model alias one hop to the target's config", func() {
+		tmpDir := GinkgoT().TempDir()
+
+		// A real model config with a concrete backend.
+		realLLM := `name: real-llm
+backend: llama-cpp
+parameters:
+  model: real-llm.gguf
+`
+		Expect(os.WriteFile(filepath.Join(tmpDir, "real-llm.yaml"), []byte(realLLM), 0644)).To(Succeed())
+
+		// An alias pointing at the real model.
+		aliasCfg := `name: default
+alias: real-llm
+`
+		Expect(os.WriteFile(filepath.Join(tmpDir, "default.yaml"), []byte(aliasCfg), 0644)).To(Succeed())
+
+		cl := config.NewModelConfigLoader(tmpDir)
+		Expect(cl.LoadModelConfigsFromPath(tmpDir)).To(Succeed())
+
+		// Resolving the alias must follow the hop to the target's full config.
+		resolved, err := loadPipelineSubModel(cl, "default", tmpDir)
+		Expect(err).NotTo(HaveOccurred())
+		Expect(resolved.IsAlias()).To(BeFalse())
+		Expect(resolved.Backend).To(Equal("llama-cpp"))
+
+		// A non-alias name must load unchanged.
+		direct, err := loadPipelineSubModel(cl, "real-llm", tmpDir)
+		Expect(err).NotTo(HaveOccurred())
+		Expect(direct.Backend).To(Equal("llama-cpp"))
+		Expect(direct.Name).To(Equal("real-llm"))
+	})
+})

From 193d0e6aefa2ffe051206c870f626148e3402f33 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Wed, 24 Jun 2026 22:19:03 +0200
Subject: [PATCH 76/99] fix(backends): darwin/metal support for supertonic
 (#10488)

The supertonic Go TTS backend dlopens ONNX Runtime, but its runtime and
packaging scripts were Linux-only: run.sh exported LD_LIBRARY_PATH, pointed
ONNXRUNTIME_LIB_PATH at libonnxruntime.so, and always tried the ld.so exec
path, while package.sh hard-failed on any non-Linux host. On macOS dyld has
no ld.so loader, uses DYLD_LIBRARY_PATH, and ONNX Runtime ships as a .dylib.

This applies the same purego .dylib/DYLD_LIBRARY_PATH fix that PR #10481
landed for 15 other ONNX/purego backends (sherpa-onnx, silero-vad, etc.) but
which omitted supertonic:

- run.sh: on darwin export DYLD_LIBRARY_PATH and point ONNXRUNTIME_LIB_PATH
  at libonnxruntime.dylib; guard the ld.so exec path to Linux only.
- package.sh: recognize Darwin instead of erroring out; the bundled .dylib is
  resolved via DYLD_LIBRARY_PATH, no glibc/ld.so to bundle.
- helper.go: platform-native default library extension (dylib on darwin) for
  the last-resort dlopen fallback.

It also wires the darwin CI build and gallery entries, resolving the
inconsistency where backend/index.yaml advertised metal for supertonic but no
includeDarwin matrix entry built the image:

- .github/backend-matrix.yml: add the -metal-darwin-arm64-supertonic Go entry.
- backend/index.yaml: declare metal capabilities and add the concrete
  metal-supertonic / metal-supertonic-development child entries.

The Makefile already detects Darwin/osx/arm64 and stages the per-OS ONNX
Runtime tarball, mirroring sherpa-onnx, so no Makefile change is required.


Assisted-by: Claude:opus-4.8 [Claude Code]

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
---
 .github/backend-matrix.yml       |  4 ++++
 backend/go/supertonic/helper.go  |  9 ++++++++-
 backend/go/supertonic/package.sh |  4 ++++
 backend/go/supertonic/run.sh     | 17 ++++++++++++-----
 backend/index.yaml               | 12 ++++++++++++
 5 files changed, 40 insertions(+), 6 deletions(-)

diff --git a/.github/backend-matrix.yml b/.github/backend-matrix.yml
index 593e44cde..4cfc937ac 100644
--- a/.github/backend-matrix.yml
+++ b/.github/backend-matrix.yml
@@ -4990,6 +4990,10 @@ includeDarwin:
     tag-suffix: "-metal-darwin-arm64-sherpa-onnx"
     build-type: "metal"
     lang: "go"
+  - backend: "supertonic"
+    tag-suffix: "-metal-darwin-arm64-supertonic"
+    build-type: "metal"
+    lang: "go"
   - backend: "local-store"
     tag-suffix: "-metal-darwin-arm64-local-store"
     build-type: "metal"
diff --git a/backend/go/supertonic/helper.go b/backend/go/supertonic/helper.go
index 9f927d5d3..884077e75 100644
--- a/backend/go/supertonic/helper.go
+++ b/backend/go/supertonic/helper.go
@@ -16,6 +16,7 @@ import (
 	"os"
 	"path/filepath"
 	"regexp"
+	"runtime"
 	"strings"
 	"time"
 	"unicode"
@@ -943,7 +944,13 @@ func InitializeONNXRuntime() error {
 			}
 		}
 		if libPath == "" {
-			libPath = "/usr/local/lib/libonnxruntime.so"
+			// LocalAI: default to the platform-native shared library
+			// extension when nothing else is found (dyld vs ld.so).
+			if runtime.GOOS == "darwin" {
+				libPath = "/usr/local/lib/libonnxruntime.dylib"
+			} else {
+				libPath = "/usr/local/lib/libonnxruntime.so"
+			}
 		}
 	}
 	ort.SetSharedLibraryPath(libPath)
diff --git a/backend/go/supertonic/package.sh b/backend/go/supertonic/package.sh
index 9e2a01625..678ca5ead 100755
--- a/backend/go/supertonic/package.sh
+++ b/backend/go/supertonic/package.sh
@@ -32,6 +32,10 @@ elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
     cp -arfLv /lib/aarch64-linux-gnu/libdl.so.2 $CURDIR/package/lib/libdl.so.2
     cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 $CURDIR/package/lib/librt.so.1
     cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 $CURDIR/package/lib/libpthread.so.0
+elif [ $(uname -s) = "Darwin" ]; then
+    # macOS: dyld resolves the bundled .dylib via DYLD_LIBRARY_PATH (set in
+    # run.sh); there is no ld.so loader nor glibc to bundle.
+    echo "Detected Darwin"
 else
     echo "Error: Could not detect architecture"
     exit 1
diff --git a/backend/go/supertonic/run.sh b/backend/go/supertonic/run.sh
index 2dabf7eb3..683c52ab2 100755
--- a/backend/go/supertonic/run.sh
+++ b/backend/go/supertonic/run.sh
@@ -3,12 +3,19 @@ set -ex
 
 CURDIR=$(dirname "$(realpath $0)")
 
-export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
-export ONNXRUNTIME_LIB_PATH=$CURDIR/lib/libonnxruntime.so
+if [ "$(uname)" = "Darwin" ]; then
+	# macOS uses dyld: there is no ld.so loader, and the search path env
+	# var is DYLD_LIBRARY_PATH. ONNX Runtime ships as a .dylib here.
+	export DYLD_LIBRARY_PATH=$CURDIR/lib:$DYLD_LIBRARY_PATH
+	export ONNXRUNTIME_LIB_PATH=$CURDIR/lib/libonnxruntime.dylib
+else
+	export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
+	export ONNXRUNTIME_LIB_PATH=$CURDIR/lib/libonnxruntime.so
 
-if [ -f $CURDIR/lib/ld.so ]; then
-	echo "Using lib/ld.so"
-	exec $CURDIR/lib/ld.so $CURDIR/supertonic "$@"
+	if [ -f $CURDIR/lib/ld.so ]; then
+		echo "Using lib/ld.so"
+		exec $CURDIR/lib/ld.so $CURDIR/supertonic "$@"
+	fi
 fi
 
 exec $CURDIR/supertonic "$@"
diff --git a/backend/index.yaml b/backend/index.yaml
index 3f61f7b4e..592c8fd6b 100644
--- a/backend/index.yaml
+++ b/backend/index.yaml
@@ -1569,6 +1569,7 @@
     - TTS
   capabilities:
     default: "cpu-supertonic"
+    metal: "metal-supertonic"
 - !!merge <<: *neutts
   name: "neutts-development"
   capabilities:
@@ -5484,6 +5485,7 @@
   name: "supertonic-development"
   capabilities:
     default: "cpu-supertonic-development"
+    metal: "metal-supertonic-development"
 - !!merge <<: *supertonic
   name: "cpu-supertonic"
   uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-supertonic"
@@ -5494,3 +5496,13 @@
   uri: "quay.io/go-skynet/local-ai-backends:master-cpu-supertonic"
   mirrors:
     - localai/localai-backends:master-cpu-supertonic
+- !!merge <<: *supertonic
+  name: "metal-supertonic"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-supertonic"
+  mirrors:
+    - localai/localai-backends:latest-metal-darwin-arm64-supertonic
+- !!merge <<: *supertonic
+  name: "metal-supertonic-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-supertonic"
+  mirrors:
+    - localai/localai-backends:master-metal-darwin-arm64-supertonic

From 62b14fd6354acf931f83b0f4655e32a9d57321ef Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Wed, 24 Jun 2026 23:16:27 +0200
Subject: [PATCH 77/99] feat(backends): add darwin/metal build for liquid-audio
 (#10486)

* feat(backends): add darwin/metal build for liquid-audio

Wire the already-MPS-ready liquid-audio backend (it ships
requirements-mps.txt) into the darwin CI matrix and the gallery so
metal-darwin-arm64 images are built and selectable.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:opus-4.8 [Claude Code]

* ci(liquid-audio): trigger darwin build via requirements-mps note

The changed-backends path filter only builds a backend when a file under
its directory changes. The metal wiring lived in index.yaml + the matrix,
so the darwin job was skipped. Add a documenting comment to the MPS
requirements so CI actually exercises the darwin build.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:opus-4.8 [Claude Code]

* fix(liquid-audio): guard uv-only --index-strategy for the pip/darwin path

Same fix as trl: the darwin/MPS build installs with pip (USE_PIP=true), which
rejects the uv-only --index-strategy flag and failed the darwin backend build.
Add it only on the uv path; Linux/CUDA resolution is unchanged.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:opus-4.8 [Claude Code]

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
---
 .github/backend-matrix.yml                       |  3 +++
 backend/index.yaml                               | 12 ++++++++++++
 backend/python/liquid-audio/install.sh           |  8 +++++++-
 backend/python/liquid-audio/requirements-mps.txt |  1 +
 4 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/.github/backend-matrix.yml b/.github/backend-matrix.yml
index 4cfc937ac..17d436cc1 100644
--- a/.github/backend-matrix.yml
+++ b/.github/backend-matrix.yml
@@ -4974,6 +4974,9 @@ includeDarwin:
   - backend: "kitten-tts"
     tag-suffix: "-metal-darwin-arm64-kitten-tts"
     build-type: "mps"
+  - backend: "liquid-audio"
+    tag-suffix: "-metal-darwin-arm64-liquid-audio"
+    build-type: "mps"
   - backend: "piper"
     tag-suffix: "-metal-darwin-arm64-piper"
     build-type: "metal"
diff --git a/backend/index.yaml b/backend/index.yaml
index 592c8fd6b..f3a2b892d 100644
--- a/backend/index.yaml
+++ b/backend/index.yaml
@@ -1284,6 +1284,7 @@
     nvidia-cuda-13: "cuda13-liquid-audio"
     nvidia-cuda-12: "cuda12-liquid-audio"
     nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-liquid-audio"
+    metal: "metal-liquid-audio"
   icon: https://cdn-avatars.huggingface.co/v1/production/uploads/61b8e2ba285851687028d395/7_6D7rWrLxp2hb6OHSV1p.png
 - &qwen-tts
   urls:
@@ -4613,6 +4614,7 @@
     nvidia-cuda-13: "cuda13-liquid-audio-development"
     nvidia-cuda-12: "cuda12-liquid-audio-development"
     nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-liquid-audio-development"
+    metal: "metal-liquid-audio-development"
 - !!merge <<: *liquid-audio
   name: "cpu-liquid-audio"
   uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-liquid-audio"
@@ -4623,6 +4625,16 @@
   uri: "quay.io/go-skynet/local-ai-backends:master-cpu-liquid-audio"
   mirrors:
     - localai/localai-backends:master-cpu-liquid-audio
+- !!merge <<: *liquid-audio
+  name: "metal-liquid-audio"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-liquid-audio"
+  mirrors:
+    - localai/localai-backends:latest-metal-darwin-arm64-liquid-audio
+- !!merge <<: *liquid-audio
+  name: "metal-liquid-audio-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-liquid-audio"
+  mirrors:
+    - localai/localai-backends:master-metal-darwin-arm64-liquid-audio
 - !!merge <<: *liquid-audio
   name: "cuda12-liquid-audio"
   uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-liquid-audio"
diff --git a/backend/python/liquid-audio/install.sh b/backend/python/liquid-audio/install.sh
index c7ed8eaa8..fe0f9caad 100755
--- a/backend/python/liquid-audio/install.sh
+++ b/backend/python/liquid-audio/install.sh
@@ -14,5 +14,11 @@ else
 fi
 
 # liquid-audio's torch wheels are large; allow upgrades to satisfy transitive pins
-EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
+EXTRA_PIP_INSTALL_FLAGS+=" --upgrade"
+# --index-strategy is a uv-only flag. The darwin/MPS build installs with pip
+# (USE_PIP=true in scripts/build/python-darwin.sh), which rejects it. Only add
+# it on the uv path; Linux/CUDA resolution is unchanged.
+if [ "x${USE_PIP:-}" != "xtrue" ]; then
+    EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-first-match"
+fi
 installRequirements
diff --git a/backend/python/liquid-audio/requirements-mps.txt b/backend/python/liquid-audio/requirements-mps.txt
index f57687f29..3c9c36cca 100644
--- a/backend/python/liquid-audio/requirements-mps.txt
+++ b/backend/python/liquid-audio/requirements-mps.txt
@@ -1,3 +1,4 @@
+# MPS (Apple Silicon / Metal) build profile - installed by the darwin CI job.
 torch>=2.8.0
 torchaudio>=2.8.0
 torchcodec>=0.9.1

From 75ba2daba1e7ac12c91489d6c425df5f86ded598 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Wed, 24 Jun 2026 23:18:04 +0200
Subject: [PATCH 78/99] chore(model-gallery): :arrow_up: update checksum
 (#10495)

:arrow_up: Checksum updates in gallery/index.yaml

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
---
 gallery/index.yaml | 21 ++-------------------
 1 file changed, 2 insertions(+), 19 deletions(-)

diff --git a/gallery/index.yaml b/gallery/index.yaml
index e26f2a1f5..52f23a771 100644
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -3,24 +3,7 @@
   url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
   urls:
     - https://huggingface.co/LiquidAI/LFM2.5-1.2B-Instruct-GGUF
-  description: |
-    Try LFM • Docs • LEAP • Discord
-
-    # LFM2.5-1.2B-Instruct
-
-    LFM2.5 is a new family of hybrid models designed for **on-device deployment**. It builds on the LFM2 architecture with extended pre-training and reinforcement learning.
-
-      - **Best-in-class performance**: A 1.2B model rivaling much larger models, bringing high-quality AI to your pocket.
-      - **Fast edge inference**: 239 tok/s decode on AMD CPU, 82 tok/s on mobile NPU. Runs under 1GB of memory with day-one support for llama.cpp, MLX, and vLLM.
-      - **Scaled training**: Extended pre-training from 10T to 28T tokens and large-scale multi-stage reinforcement learning.
-
-    Find more information about LFM2.5 in our blog post.
-
-    ## 🗒️ Model Details
-
-    LFM2.5-1.2B-Instruct is a general-purpose text-only model with the following features:
-
-    ...
+  description: "Try LFM • Docs • LEAP • Discord\n\n# LFM2.5-1.2B-Instruct\n\nLFM2.5 is a new family of hybrid models designed for **on-device deployment**. It builds on the LFM2 architecture with extended pre-training and reinforcement learning.\n\n  - **Best-in-class performance**: A 1.2B model rivaling much larger models, bringing high-quality AI to your pocket.\n  - **Fast edge inference**: 239 tok/s decode on AMD CPU, 82 tok/s on mobile NPU. Runs under 1GB of memory with day-one support for llama.cpp, MLX, and vLLM.\n  - **Scaled training**: Extended pre-training from 10T to 28T tokens and large-scale multi-stage reinforcement learning.\n\nFind more information about LFM2.5 in our blog post.\n\n## \U0001F5D2️ Model Details\n\nLFM2.5-1.2B-Instruct is a general-purpose text-only model with the following features:\n\n...\n"
   license: "other"
   tags:
     - llm
@@ -842,8 +825,8 @@
       use_tokenizer_template: true
   files:
     - filename: llama-cpp/models/Qwopus3.6-27B-Coder-MTP-GGUF/Qwopus3.6-27B-Coder-MTP-Q4_K_M.gguf
-      sha256: b2898667ed7b2388f0ab7691393833ae777f247492bbe62fdb4b2bd3e3cf3f79
       uri: https://huggingface.co/Jackrong/Qwopus3.6-27B-Coder-MTP-GGUF/resolve/main/Qwopus3.6-27B-Coder-MTP-Q4_K_M.gguf
+      sha256: b2b9180093496da2e00439e3fa23227c591355901bfa579bc6897bbc01b755ef
     - filename: llama-cpp/mmproj/Qwopus3.6-27B-Coder-MTP-GGUF/mmproj-F32.gguf
       sha256: 32f7ea0600c07272547da401d460f8abbd980f3a57b69d6df87be0e2505e0b9c
       uri: https://huggingface.co/Jackrong/Qwopus3.6-27B-Coder-MTP-GGUF/resolve/main/mmproj-F32.gguf

From 764b0352b938aebe21a91efa39867920adb7f26a Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Wed, 24 Jun 2026 23:18:24 +0200
Subject: [PATCH 79/99] docs: :arrow_up: update docs version mudler/LocalAI
 (#10491)

:arrow_up: Update docs version mudler/LocalAI

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
---
 docs/data/version.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/data/version.json b/docs/data/version.json
index 0abaf0d54..f8cc39cee 100644
--- a/docs/data/version.json
+++ b/docs/data/version.json
@@ -1,3 +1,3 @@
 {
-  "version": "v4.4.3"
+  "version": "v4.5.0"
 }

From 5c3d48ab5093d49c67c60f0577f52173096ae27c Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Wed, 24 Jun 2026 23:30:08 +0200
Subject: [PATCH 80/99] feat(ui): usage & UX enhancements (last-used model,
 polling, starter models, usage cost, a11y) (#10496)

* feat(ui): remember last-used model per capability

ModelSelector auto-selected the first option whenever the bound value was
empty or stale, so every visit to the Home chat box, Image, TTS or Talk
pages reset the choice to whatever sorted first. Persist the user's pick
in localStorage keyed by capability and prefer it on auto-select when the
model is still available, falling back to the first option otherwise.

Because every modality picker funnels through ModelSelector, this fixes
the friction everywhere at once. External-options callers pass no
capability and keep the previous first-item behaviour.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* feat(ui): add visibility-aware polling hook

The app had 26 hand-rolled setInterval polls, none of which paused when
the browser tab was hidden, so backgrounded dashboards kept hitting the
server every few seconds for data nobody was looking at.

Add usePolling: runs immediately, polls on a fixed interval, pauses while
document.hidden, fires a catch-up poll on return, and guards against
overlapping slow requests. Route useResources (the highest-frequency
shared poll) through it. Further callers can be migrated incrementally.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* feat(ui): hardware-aware starter models on empty home

A fresh install dropped admins straight into a 1000+ model gallery with
no guidance. Add a StarterModels widget to the empty-state wizard that
recommends a small, curated set tuned to the detected hardware:

- CPU-only machines (no GPU VRAM) are steered to genuinely small models
  (1-4B, Q4) that stay responsive without a GPU.
- GPU machines get suggestions scaled to available VRAM.

Curated names are real gallery entries, intersected against the live
gallery at render time so a trimmed/custom gallery degrades gracefully.
Install is one click via the existing model-install API.

Also routes Home's cluster and system-info polls through usePolling so a
backgrounded home page stops fetching.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* feat(ui): optional token-cost estimates on usage dashboard

The usage dashboard tracked tokens but had no monetary view. Multi-user
deployments that bill back or budget compute had to export and compute
cost elsewhere.

Add an opt-in pricing control: admins set $ per 1M prompt/completion
tokens (stored per-browser). When set, an estimated-cost summary card and
per-model / per-user cost columns appear, computed from recorded token
counts. The entire cost surface stays hidden until a price is entered, so
the default view is unchanged. Cost is clearly labelled an estimate -
LocalAI itself has no notion of price.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* fix(ui): label icon-only send buttons for screen readers

The chat and agent-chat send buttons were a bare paper-plane icon with
no accessible name, so screen readers announced only "button". Add an
aria-label/title ("Send message") and mark the icon aria-hidden. An audit
of all icon-only buttons found these were the only two unlabeled controls;
the rest already carry visible text.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
---
 .../http/react-ui/public/locales/en/chat.json |   1 +
 .../http/react-ui/public/locales/en/home.json |  14 ++
 core/http/react-ui/src/App.css                |  53 +++++++
 .../react-ui/src/components/ModelSelector.jsx |  36 ++++-
 .../react-ui/src/components/StarterModels.jsx | 129 ++++++++++++++++++
 core/http/react-ui/src/hooks/usePolling.js    |  66 +++++++++
 core/http/react-ui/src/hooks/useResources.js  |  17 +--
 core/http/react-ui/src/pages/AgentChat.jsx    |   4 +-
 core/http/react-ui/src/pages/Chat.jsx         |   4 +-
 core/http/react-ui/src/pages/Home.jsx         |  74 +++++-----
 core/http/react-ui/src/pages/Usage.jsx        |  86 +++++++++++-
 11 files changed, 426 insertions(+), 58 deletions(-)
 create mode 100644 core/http/react-ui/src/components/StarterModels.jsx
 create mode 100644 core/http/react-ui/src/hooks/usePolling.js

diff --git a/core/http/react-ui/public/locales/en/chat.json b/core/http/react-ui/public/locales/en/chat.json
index de9d0507d..ffda226db 100644
--- a/core/http/react-ui/public/locales/en/chat.json
+++ b/core/http/react-ui/public/locales/en/chat.json
@@ -86,6 +86,7 @@
   "input": {
     "placeholder": "Message...",
     "attachFile": "Attach file",
+    "send": "Send message",
     "stopGenerating": "Stop generating",
     "canvasTitle": "Canvas — extract code blocks and media into a side panel for preview, copy, and download",
     "canvasLabel": "Canvas",
diff --git a/core/http/react-ui/public/locales/en/home.json b/core/http/react-ui/public/locales/en/home.json
index fabd9e9dd..142767999 100644
--- a/core/http/react-ui/public/locales/en/home.json
+++ b/core/http/react-ui/public/locales/en/home.json
@@ -77,6 +77,20 @@
     "noModelsTitle": "No Models Available",
     "noModelsBody": "There are no models installed yet. Ask your administrator to set up models so you can start chatting."
   },
+  "starters": {
+    "title": "Recommended for your hardware",
+    "tier": {
+      "cpu": "CPU-only",
+      "gpu-small": "GPU",
+      "gpu-large": "GPU"
+    },
+    "cpuNote": "No GPU detected — these small models stay responsive on CPU.",
+    "gpuNote": "Picked to fit your available VRAM with room for context.",
+    "install": "Install",
+    "installing": "Installing",
+    "installStarted": "Installing {{model}}…",
+    "installFailed": "Install failed: {{message}}"
+  },
   "connect": {
     "title": "One endpoint, every API",
     "subtitle": "LocalAI serves its own full API — image & video generation, depth, object detection, reranking, audio, face & voice recognition, and realtime voice over WebRTC and WebSocket. On top of that, a drop-in compatibility layer lets any app built for OpenAI, Anthropic, Ollama or OpenAI Responses talk to it unchanged.",
diff --git a/core/http/react-ui/src/App.css b/core/http/react-ui/src/App.css
index cf1a46bd3..40eddc2e9 100644
--- a/core/http/react-ui/src/App.css
+++ b/core/http/react-ui/src/App.css
@@ -6363,6 +6363,59 @@ select.input {
   justify-content: center;
 }
 
+/* ──────────────────── Home: hardware-aware starter models ──────────────────── */
+
+.home-starters {
+  margin: var(--spacing-lg) 0;
+  padding: var(--spacing-lg);
+}
+.home-starters-head {
+  display: flex;
+  align-items: center;
+  justify-content: space-between;
+  gap: var(--spacing-md);
+}
+.home-starters-head strong {
+  font-size: 0.9375rem;
+}
+.home-starters-tier {
+  display: inline-flex;
+  align-items: center;
+  gap: var(--spacing-xs);
+  font-size: 0.75rem;
+  color: var(--color-text-muted);
+}
+.home-starters-sub {
+  margin: var(--spacing-xs) 0 var(--spacing-md);
+  font-size: 0.8125rem;
+  color: var(--color-text-secondary);
+}
+.home-starters-list {
+  list-style: none;
+  margin: 0;
+  padding: 0;
+  display: flex;
+  flex-direction: column;
+  gap: var(--spacing-xs);
+}
+.home-starters-item {
+  display: flex;
+  align-items: center;
+  gap: var(--spacing-md);
+  padding: var(--spacing-xs) 0;
+}
+.home-starters-name {
+  font-weight: 500;
+  font-size: 0.875rem;
+  word-break: break-all;
+}
+.home-starters-size {
+  margin-left: auto;
+  font-size: 0.75rem;
+  color: var(--color-text-muted);
+  white-space: nowrap;
+}
+
 /* ──────────────────── Home: drop-in endpoint / API compatibility ──────────────────── */
 
 .home-connect {
diff --git a/core/http/react-ui/src/components/ModelSelector.jsx b/core/http/react-ui/src/components/ModelSelector.jsx
index 9009524ee..76a118ec9 100644
--- a/core/http/react-ui/src/components/ModelSelector.jsx
+++ b/core/http/react-ui/src/components/ModelSelector.jsx
@@ -1,8 +1,25 @@
-import { useEffect, useMemo } from 'react'
+import { useEffect, useMemo, useCallback } from 'react'
 import { useModels } from '../hooks/useModels'
 import SearchableSelect from './SearchableSelect'
 import { useTranslation } from 'react-i18next'
 
+// Remember the last model the user picked, keyed by capability, so returning to
+// a page (Home chat box, Image, TTS, Talk...) defaults to that model instead of
+// whatever happens to sort first. Only persisted when a capability key exists —
+// `externalOptions` callers pass no capability and get the old first-item
+// behaviour. localStorage access is wrapped because private-browsing modes throw.
+const LAST_MODEL_PREFIX = 'localai_last_model:'
+
+function readLastModel(capability) {
+  if (!capability) return null
+  try { return localStorage.getItem(LAST_MODEL_PREFIX + capability) } catch { return null }
+}
+
+function writeLastModel(capability, model) {
+  if (!capability || !model) return
+  try { localStorage.setItem(LAST_MODEL_PREFIX + capability, model) } catch { /* ignore */ }
+}
+
 export default function ModelSelector({
   value, onChange, capability, className = '',
   options: externalOptions, loading: externalLoading,
@@ -19,16 +36,27 @@ export default function ModelSelector({
   const isLoading = externalOptions ? (externalLoading || false) : hookLoading
   const isDisabled = isLoading || (externalDisabled || false)
 
+  // Persist genuine selections so the next visit can restore them.
+  const handleChange = useCallback((next) => {
+    writeLastModel(capability, next)
+    onChange(next)
+  }, [capability, onChange])
+
   useEffect(() => {
     if (modelNames.length > 0 && (!value || !modelNames.includes(value))) {
-      onChange(modelNames[0])
+      // Prefer the remembered model when it's still available; otherwise fall
+      // back to the first option. Don't re-persist here — auto-select is not a
+      // user choice, and writing back the stored value would be a harmless but
+      // pointless round-trip.
+      const remembered = readLastModel(capability)
+      onChange(remembered && modelNames.includes(remembered) ? remembered : modelNames[0])
     }
-  }, [modelNames, value, onChange])
+  }, [modelNames, value, onChange, capability])
 
   return (
     <SearchableSelect
       value={value || ''}
-      onChange={onChange}
+      onChange={handleChange}
       options={modelNames}
       placeholder={isLoading ? t('selector.loading') : (modelNames.length === 0 ? t('selector.noModels') : t('selector.selectModel'))}
       searchPlaceholder={searchPlaceholder || t('selector.searchPlaceholder')}
diff --git a/core/http/react-ui/src/components/StarterModels.jsx b/core/http/react-ui/src/components/StarterModels.jsx
new file mode 100644
index 000000000..9273ae147
--- /dev/null
+++ b/core/http/react-ui/src/components/StarterModels.jsx
@@ -0,0 +1,129 @@
+import { useState, useEffect, useMemo } from 'react'
+import { useTranslation } from 'react-i18next'
+import { modelsApi } from '../utils/api'
+import { useResources } from '../hooks/useResources'
+
+// Curated, hardware-tiered starter models for the empty-state onboarding. Names
+// are real gallery entries (gallery/index.yaml); we intersect them against the
+// live gallery at render time so a custom/trimmed gallery degrades gracefully
+// (unmatched entries simply don't render).
+//
+// The guiding rule the maintainer asked for: CPU-only machines should be
+// steered to genuinely small models (1-4B, Q4) that stay responsive without a
+// GPU. GPU tiers scale the suggestion up with available VRAM.
+const SMALL = [
+  { name: 'llama-3.2-1b-instruct:q4_k_m', size: '~0.8 GB' },
+  { name: 'llama-3.2-3b-instruct:q4_k_m', size: '~2 GB' },
+  { name: 'qwen3-1.7b', size: '~1.4 GB' },
+  { name: 'gemma-3-1b-it', size: '~0.8 GB' },
+]
+const MID = [
+  { name: 'qwen3-4b', size: '~2.5 GB' },
+  { name: 'gemma-3-4b-it', size: '~3 GB' },
+  { name: 'llama-3.2-3b-instruct:q4_k_m', size: '~2 GB' },
+]
+const LARGE = [
+  { name: 'meta-llama-3.1-8b-instruct', size: '~5 GB' },
+  { name: 'qwen3-4b', size: '~2.5 GB' },
+  { name: 'mistral-7b-instruct-v0.3', size: '~4 GB' },
+]
+
+const GB = 1024 * 1024 * 1024
+
+// Pick a tier from detected hardware. total_memory is GPU VRAM in bytes (0 when
+// CPU-only). Thresholds are deliberately conservative so a suggestion that
+// "fits" really does.
+function pickTier(resources) {
+  const isGpu = resources?.type === 'gpu'
+  const vram = resources?.aggregate?.total_memory || 0
+  if (!isGpu || vram <= 0) return { id: 'cpu', list: SMALL }
+  if (vram < 8 * GB) return { id: 'gpu-small', list: MID }
+  return { id: 'gpu-large', list: LARGE }
+}
+
+export default function StarterModels({ addToast, onInstallStarted }) {
+  const { t } = useTranslation('home')
+  const { resources } = useResources()
+  const [available, setAvailable] = useState(null) // Set of gallery names, or null while loading
+  const [installing, setInstalling] = useState(() => new Set())
+
+  const tier = useMemo(() => pickTier(resources), [resources])
+  const candidates = tier.list
+
+  // Verify candidates exist in the live gallery. One search per name (the tier
+  // has at most a handful) keeps this resilient to gallery customization.
+  useEffect(() => {
+    let cancelled = false
+    const names = [...new Set(candidates.map(c => c.name))]
+    Promise.all(names.map(name =>
+      modelsApi.list({ search: name, page: 1 })
+        .then(data => (data?.models || []).some(m => (m.name || m.id) === name) ? name : null)
+        .catch(() => null)
+    )).then(found => {
+      if (cancelled) return
+      const hits = found.filter(Boolean)
+      // If verification yielded nothing (e.g. gallery unreachable), fall back to
+      // showing the curated list rather than an empty widget.
+      setAvailable(hits.length > 0 ? new Set(hits) : null)
+    })
+    return () => { cancelled = true }
+  }, [candidates])
+
+  const visible = available === null
+    ? candidates
+    : candidates.filter(c => available.has(c.name))
+
+  if (visible.length === 0) return null
+
+  const install = async (name) => {
+    setInstalling(prev => new Set(prev).add(name))
+    try {
+      await modelsApi.install(name)
+      addToast?.(t('starters.installStarted', { model: name }), 'success')
+      onInstallStarted?.(name)
+    } catch (err) {
+      addToast?.(t('starters.installFailed', { message: err.message }), 'error')
+      setInstalling(prev => {
+        const next = new Set(prev)
+        next.delete(name)
+        return next
+      })
+    }
+  }
+
+  return (
+    <section className="home-starters card">
+      <div className="home-starters-head">
+        <strong>{t('starters.title')}</strong>
+        <span className="home-starters-tier">
+          <i className={`fas ${tier.id === 'cpu' ? 'fa-memory' : 'fa-microchip'}`} aria-hidden="true" />
+          {t(`starters.tier.${tier.id}`)}
+        </span>
+      </div>
+      <p className="home-starters-sub">
+        {tier.id === 'cpu' ? t('starters.cpuNote') : t('starters.gpuNote')}
+      </p>
+      <ul className="home-starters-list">
+        {visible.map(c => {
+          const busy = installing.has(c.name)
+          return (
+            <li key={c.name} className="home-starters-item">
+              <span className="home-starters-name">{c.name}</span>
+              <span className="home-starters-size">{c.size}</span>
+              <button
+                type="button"
+                className="btn btn-primary btn-sm"
+                disabled={busy}
+                onClick={() => install(c.name)}
+              >
+                {busy
+                  ? (<><i className="fas fa-spinner fa-spin" aria-hidden="true" /> {t('starters.installing')}</>)
+                  : (<><i className="fas fa-download" aria-hidden="true" /> {t('starters.install')}</>)}
+              </button>
+            </li>
+          )
+        })}
+      </ul>
+    </section>
+  )
+}
diff --git a/core/http/react-ui/src/hooks/usePolling.js b/core/http/react-ui/src/hooks/usePolling.js
new file mode 100644
index 000000000..e75de5441
--- /dev/null
+++ b/core/http/react-ui/src/hooks/usePolling.js
@@ -0,0 +1,66 @@
+import { useEffect, useRef, useCallback } from 'react'
+
+// usePolling runs `fn` immediately and then on a fixed interval, with two
+// behaviours every hand-rolled setInterval in this app was missing:
+//
+//   1. Visibility-aware: the timer pauses while the tab is hidden
+//      (document.hidden) and fires an immediate catch-up poll when the tab
+//      becomes visible again. A backgrounded dashboard no longer hammers the
+//      server every few seconds for data nobody is looking at.
+//   2. Non-overlapping: if `fn` returns a promise that takes longer than the
+//      interval, the next tick waits for it instead of stacking requests.
+//
+// `enabled: false` stops polling entirely (one-shot or gated polls). The
+// returned `refetch` runs `fn` on demand and is stable across renders.
+export function usePolling(fn, intervalMs = 5000, { enabled = true, immediate = true } = {}) {
+  const fnRef = useRef(fn)
+  fnRef.current = fn
+
+  const runningRef = useRef(false)
+  const refetch = useCallback(async () => {
+    // Guard against overlap: a slow poll shouldn't pile up behind a fast timer.
+    if (runningRef.current) return
+    runningRef.current = true
+    try {
+      return await fnRef.current()
+    } finally {
+      runningRef.current = false
+    }
+  }, [])
+
+  useEffect(() => {
+    if (!enabled) return
+    let timer = null
+
+    const tick = () => { refetch() }
+
+    const start = () => {
+      if (timer != null) return
+      timer = setInterval(tick, intervalMs)
+    }
+    const stop = () => {
+      if (timer != null) { clearInterval(timer); timer = null }
+    }
+
+    const onVisibility = () => {
+      if (document.hidden) {
+        stop()
+      } else {
+        // Catch up immediately on return, then resume the cadence.
+        tick()
+        start()
+      }
+    }
+
+    if (immediate) tick()
+    if (!document.hidden) start()
+    document.addEventListener('visibilitychange', onVisibility)
+
+    return () => {
+      stop()
+      document.removeEventListener('visibilitychange', onVisibility)
+    }
+  }, [enabled, intervalMs, immediate, refetch])
+
+  return { refetch }
+}
diff --git a/core/http/react-ui/src/hooks/useResources.js b/core/http/react-ui/src/hooks/useResources.js
index 688ea0d74..7cd6c7b16 100644
--- a/core/http/react-ui/src/hooks/useResources.js
+++ b/core/http/react-ui/src/hooks/useResources.js
@@ -1,11 +1,11 @@
-import { useState, useEffect, useCallback, useRef } from 'react'
+import { useState, useCallback } from 'react'
 import { resourcesApi } from '../utils/api'
+import { usePolling } from './usePolling'
 
 export function useResources(pollInterval = 5000) {
   const [resources, setResources] = useState(null)
   const [loading, setLoading] = useState(true)
   const [error, setError] = useState(null)
-  const intervalRef = useRef(null)
 
   const fetchResources = useCallback(async () => {
     try {
@@ -19,13 +19,10 @@ export function useResources(pollInterval = 5000) {
     }
   }, [])
 
-  useEffect(() => {
-    fetchResources()
-    intervalRef.current = setInterval(fetchResources, pollInterval)
-    return () => {
-      if (intervalRef.current) clearInterval(intervalRef.current)
-    }
-  }, [fetchResources, pollInterval])
+  // Visibility-aware polling: pauses while the tab is hidden and catches up on
+  // return (see usePolling). Resource stats are pure dashboard data, so there's
+  // no reason to keep fetching them for a backgrounded tab.
+  const { refetch } = usePolling(fetchResources, pollInterval)
 
-  return { resources, loading, error, refetch: fetchResources }
+  return { resources, loading, error, refetch }
 }
diff --git a/core/http/react-ui/src/pages/AgentChat.jsx b/core/http/react-ui/src/pages/AgentChat.jsx
index ba503bc81..5bece394f 100644
--- a/core/http/react-ui/src/pages/AgentChat.jsx
+++ b/core/http/react-ui/src/pages/AgentChat.jsx
@@ -765,8 +765,10 @@ export default function AgentChat() {
             className="chat-send-btn"
             onClick={handleSend}
             disabled={processing || !input.trim()}
+            aria-label="Send message"
+            title="Send message"
           >
-            <i className="fas fa-paper-plane" />
+            <i className="fas fa-paper-plane" aria-hidden="true" />
           </button>
         </div>
       </div>
diff --git a/core/http/react-ui/src/pages/Chat.jsx b/core/http/react-ui/src/pages/Chat.jsx
index 675e15581..40c30d721 100644
--- a/core/http/react-ui/src/pages/Chat.jsx
+++ b/core/http/react-ui/src/pages/Chat.jsx
@@ -1427,8 +1427,10 @@ export default function Chat() {
                 className="chat-send-btn"
                 onClick={handleSend}
                 disabled={!input.trim() && files.length === 0}
+                aria-label={t('input.send')}
+                title={t('input.send')}
               >
-                <i className="fas fa-paper-plane" />
+                <i className="fas fa-paper-plane" aria-hidden="true" />
               </button>
             )}
           </div>
diff --git a/core/http/react-ui/src/pages/Home.jsx b/core/http/react-ui/src/pages/Home.jsx
index 2c36b03a6..52753bdf8 100644
--- a/core/http/react-ui/src/pages/Home.jsx
+++ b/core/http/react-ui/src/pages/Home.jsx
@@ -10,6 +10,7 @@ import UnifiedMCPDropdown from '../components/UnifiedMCPDropdown'
 import ConfirmDialog from '../components/ConfirmDialog'
 import HomeConnect from '../components/HomeConnect'
 import { useResources } from '../hooks/useResources'
+import { usePolling } from '../hooks/usePolling'
 import { fileToBase64, backendControlApi, systemApi, modelsApi, mcpApi, nodesApi } from '../utils/api'
 import { API_CONFIG } from '../utils/config'
 import { greetingKey } from '../utils/greeting'
@@ -17,6 +18,7 @@ import StatusPill from '../components/StatusPill'
 import Skeleton from '../components/Skeleton'
 import SectionHeading from '../components/SectionHeading'
 import EmptyState from '../components/EmptyState'
+import StarterModels from '../components/StarterModels'
 import { staggerStyle } from '../hooks/useStagger'
 
 export default function Home() {
@@ -68,40 +70,36 @@ export default function Home() {
       .catch(() => {})
   }, [])
 
-  // Poll cluster node data in distributed mode
-  useEffect(() => {
-    if (!distributedMode) return
-    const fetchCluster = async () => {
-      try {
-        const data = await nodesApi.list()
-        const nodes = Array.isArray(data) ? data : []
-        const backendNodes = nodes.filter(n => !n.node_type || n.node_type === 'backend')
-        const totalVRAM = backendNodes.reduce((sum, n) => sum + (n.total_vram || 0), 0)
-        const usedVRAM = backendNodes.reduce((sum, n) => {
-          if (n.total_vram && n.available_vram != null) return sum + (n.total_vram - n.available_vram)
-          return sum
-        }, 0)
-        const totalRAM = backendNodes.reduce((sum, n) => sum + (n.total_ram || 0), 0)
-        const usedRAM = backendNodes.reduce((sum, n) => {
-          if (n.total_ram && n.available_ram != null) return sum + (n.total_ram - n.available_ram)
-          return sum
-        }, 0)
-        const isGPU = totalVRAM > 0
-        const healthyCount = backendNodes.filter(n => n.status === 'healthy').length
-        const totalCount = backendNodes.length
-        setClusterData({
-          totalMem: isGPU ? totalVRAM : totalRAM,
-          usedMem: isGPU ? usedVRAM : usedRAM,
-          isGPU,
-          healthyCount,
-          totalCount,
-        })
-      } catch { setClusterData(null) }
-    }
-    fetchCluster()
-    const interval = setInterval(fetchCluster, 5000)
-    return () => clearInterval(interval)
-  }, [distributedMode])
+  // Poll cluster node data in distributed mode. Visibility-aware + gated on
+  // distributedMode so a non-distributed or backgrounded tab makes no calls.
+  const fetchCluster = useCallback(async () => {
+    try {
+      const data = await nodesApi.list()
+      const nodes = Array.isArray(data) ? data : []
+      const backendNodes = nodes.filter(n => !n.node_type || n.node_type === 'backend')
+      const totalVRAM = backendNodes.reduce((sum, n) => sum + (n.total_vram || 0), 0)
+      const usedVRAM = backendNodes.reduce((sum, n) => {
+        if (n.total_vram && n.available_vram != null) return sum + (n.total_vram - n.available_vram)
+        return sum
+      }, 0)
+      const totalRAM = backendNodes.reduce((sum, n) => sum + (n.total_ram || 0), 0)
+      const usedRAM = backendNodes.reduce((sum, n) => {
+        if (n.total_ram && n.available_ram != null) return sum + (n.total_ram - n.available_ram)
+        return sum
+      }, 0)
+      const isGPU = totalVRAM > 0
+      const healthyCount = backendNodes.filter(n => n.status === 'healthy').length
+      const totalCount = backendNodes.length
+      setClusterData({
+        totalMem: isGPU ? totalVRAM : totalRAM,
+        usedMem: isGPU ? usedVRAM : usedRAM,
+        isGPU,
+        healthyCount,
+        totalCount,
+      })
+    } catch { setClusterData(null) }
+  }, [])
+  usePolling(fetchCluster, 5000, { enabled: distributedMode })
 
   // Fetch configured models (to know if any exist) and loaded models (currently running)
   const fetchSystemInfo = useCallback(async () => {
@@ -123,11 +121,7 @@ export default function Home() {
     }
   }, [])
 
-  useEffect(() => {
-    fetchSystemInfo()
-    const interval = setInterval(fetchSystemInfo, 5000)
-    return () => clearInterval(interval)
-  }, [fetchSystemInfo])
+  usePolling(fetchSystemInfo, 5000)
 
   // Check MCP availability when selected model changes
   useEffect(() => {
@@ -523,6 +517,8 @@ export default function Home() {
             </div>
           </div>
 
+          <StarterModels addToast={addToast} onInstallStarted={fetchSystemInfo} />
+
           <div className="home-wizard-actions">
             <button className="btn btn-primary" onClick={() => navigate('/app/models')}>
               <i className="fas fa-store" /> {t('wizard.browseGallery')}
diff --git a/core/http/react-ui/src/pages/Usage.jsx b/core/http/react-ui/src/pages/Usage.jsx
index 50f499ada..a7a38ca46 100644
--- a/core/http/react-ui/src/pages/Usage.jsx
+++ b/core/http/react-ui/src/pages/Usage.jsx
@@ -24,7 +24,37 @@ function formatNumber(n) {
   return String(n)
 }
 
-function StatCard({ icon, label, value, muted }) {
+// Opt-in token pricing. LocalAI is self-hosted and has no inherent monetary
+// cost, but multi-user deployments use estimated cost for chargeback/budgeting.
+// Prices are admin-supplied $ per 1M tokens, stored locally (per-browser), and
+// the whole cost surface stays hidden until a non-zero price is set.
+const TOKEN_PRICING_KEY = 'localai_token_pricing'
+
+function loadPricing() {
+  try {
+    const p = JSON.parse(localStorage.getItem(TOKEN_PRICING_KEY) || '{}')
+    return { prompt: Number(p.prompt) || 0, completion: Number(p.completion) || 0 }
+  } catch { return { prompt: 0, completion: 0 } }
+}
+
+function savePricing(p) {
+  try { localStorage.setItem(TOKEN_PRICING_KEY, JSON.stringify(p)) } catch { /* ignore */ }
+}
+
+function pricingEnabled(p) { return (p?.prompt || 0) > 0 || (p?.completion || 0) > 0 }
+
+function costOf(row, p) {
+  return (row.prompt_tokens / 1_000_000) * (p.prompt || 0)
+       + (row.completion_tokens / 1_000_000) * (p.completion || 0)
+}
+
+function formatCost(n) {
+  if (!n) return '$0.00'
+  if (n < 0.01) return '<$0.01'
+  return '$' + n.toFixed(2)
+}
+
+function StatCard({ icon, label, value, muted, text }) {
   return (
     <div className="card" style={{ padding: 'var(--spacing-sm) var(--spacing-md)', flex: '1 1 0', minWidth: 120, opacity: muted ? 0.7 : 1 }}>
       <div style={{ display: 'flex', alignItems: 'center', gap: 6, marginBottom: 2 }}>
@@ -32,7 +62,7 @@ function StatCard({ icon, label, value, muted }) {
         <span style={{ fontSize: '0.6875rem', color: 'var(--color-text-muted)', fontWeight: 500, textTransform: 'uppercase', letterSpacing: '0.03em' }}>{label}</span>
       </div>
       <div style={{ fontSize: '1.375rem', fontWeight: 700, fontFamily: 'var(--font-mono)', color: muted ? 'var(--color-text-secondary)' : 'var(--color-text-primary)' }}>
-        {muted ? '~' : ''}{formatNumber(value)}
+        {text != null ? text : `${muted ? '~' : ''}${formatNumber(value)}`}
       </div>
     </div>
   )
@@ -642,6 +672,10 @@ export default function Usage() {
   const [activeTab, setActiveTab] = useState('models')
   const [quotas, setQuotas] = useState([])
   const [selectedUserId, setSelectedUserId] = useState(null)
+  const [pricing, setPricingState] = useState(loadPricing)
+  const [showPricing, setShowPricing] = useState(false)
+  const setPricing = (p) => { setPricingState(p); savePricing(p) }
+  const costEnabled = pricingEnabled(pricing)
 
   const fetchUsage = useCallback(async () => {
     setLoading(true)
@@ -743,11 +777,50 @@ export default function Usage() {
           <i className="fas fa-key" style={{ fontSize: '0.7rem' }} /> {t('usage.sources.tab')}
         </button>
         <div style={{ flex: 1 }} />
+        <button
+          className={`btn btn-sm ${costEnabled ? 'btn-primary' : 'btn-secondary'}`}
+          onClick={() => setShowPricing(v => !v)}
+          style={{ gap: 4 }}
+          title="Set token pricing to estimate cost"
+        >
+          <i className="fas fa-dollar-sign" /> {costEnabled ? 'Pricing' : 'Set pricing'}
+        </button>
         <button className="btn btn-secondary btn-sm" onClick={fetchUsage} disabled={loading} style={{ gap: 4 }}>
           <i className={`fas fa-rotate${loading ? ' fa-spin' : ''}`} /> Refresh
         </button>
       </div>
 
+      {showPricing && (
+        <div className="card" style={{ display: 'flex', alignItems: 'flex-end', gap: 'var(--spacing-md)', flexWrap: 'wrap', padding: 'var(--spacing-md)', marginBottom: 'var(--spacing-md)' }}>
+          <div style={{ display: 'flex', flexDirection: 'column', gap: 2 }}>
+            <label style={{ fontSize: '0.6875rem', color: 'var(--color-text-muted)', textTransform: 'uppercase', letterSpacing: '0.03em' }}>Prompt $/1M tokens</label>
+            <input
+              className="input" type="number" min="0" step="0.01" style={{ width: 140 }}
+              value={pricing.prompt || ''}
+              placeholder="0.00"
+              onChange={e => setPricing({ ...pricing, prompt: Number(e.target.value) || 0 })}
+            />
+          </div>
+          <div style={{ display: 'flex', flexDirection: 'column', gap: 2 }}>
+            <label style={{ fontSize: '0.6875rem', color: 'var(--color-text-muted)', textTransform: 'uppercase', letterSpacing: '0.03em' }}>Completion $/1M tokens</label>
+            <input
+              className="input" type="number" min="0" step="0.01" style={{ width: 140 }}
+              value={pricing.completion || ''}
+              placeholder="0.00"
+              onChange={e => setPricing({ ...pricing, completion: Number(e.target.value) || 0 })}
+            />
+          </div>
+          {costEnabled && (
+            <button className="btn btn-secondary btn-sm" onClick={() => setPricing({ prompt: 0, completion: 0 })} style={{ gap: 4 }}>
+              <i className="fas fa-times" /> Clear
+            </button>
+          )}
+          <span style={{ fontSize: '0.75rem', color: 'var(--color-text-muted)', flex: '1 1 200px' }}>
+            Estimated cost only. Prices are stored in this browser and applied to recorded token counts.
+          </span>
+        </div>
+      )}
+
       {loading ? (
         <div style={{ display: 'flex', justifyContent: 'center', padding: 'var(--spacing-xl)' }}>
           <LoadingSpinner size="lg" />
@@ -760,6 +833,9 @@ export default function Usage() {
             <StatCard icon="fas fa-arrow-up" label="Prompt" value={displayTotals.prompt_tokens} />
             <StatCard icon="fas fa-arrow-down" label="Completion" value={displayTotals.completion_tokens} />
             <StatCard icon="fas fa-coins" label="Total" value={displayTotals.total_tokens} />
+            {costEnabled && (
+              <StatCard icon="fas fa-dollar-sign" label="Est. Cost" text={formatCost(costOf(displayTotals, pricing))} />
+            )}
           </div>
 
           {/* Predictions */}
@@ -789,6 +865,7 @@ export default function Usage() {
                       <th style={{ width: 110 }}>Prompt</th>
                       <th style={{ width: 110 }}>Completion</th>
                       <th style={{ width: 110 }}>Total</th>
+                      {costEnabled && <th style={{ width: 100 }}>Est. Cost</th>}
                       <th style={{ width: 140 }}></th>
                     </tr>
                   </thead>
@@ -800,6 +877,7 @@ export default function Usage() {
                         <td style={monoCell}>{formatNumber(row.prompt_tokens)}</td>
                         <td style={monoCell}>{formatNumber(row.completion_tokens)}</td>
                         <td style={{ ...monoCell, fontWeight: 600 }}>{formatNumber(row.total_tokens)}</td>
+                        {costEnabled && <td style={monoCell}>{formatCost(costOf(row, pricing))}</td>}
                         <td><UsageBar value={row.total_tokens} max={maxTokens} /></td>
                       </tr>
                     ))}
@@ -827,6 +905,7 @@ export default function Usage() {
                       <th style={{ width: 110 }}>Prompt</th>
                       <th style={{ width: 110 }}>Completion</th>
                       <th style={{ width: 110 }}>Total</th>
+                      {costEnabled && <th style={{ width: 100 }}>Est. Cost</th>}
                       <th style={{ width: 110 }}>Proj. Total</th>
                       <th style={{ width: 140 }}></th>
                     </tr>
@@ -849,6 +928,7 @@ export default function Usage() {
                             <td style={monoCell}>{formatNumber(row.prompt_tokens)}</td>
                             <td style={monoCell}>{formatNumber(row.completion_tokens)}</td>
                             <td style={{ ...monoCell, fontWeight: 600 }}>{formatNumber(row.total_tokens)}</td>
+                            {costEnabled && <td style={monoCell}>{formatCost(costOf(row, pricing))}</td>}
                             <td style={{ ...monoCell, color: 'var(--color-text-muted)', fontStyle: 'italic' }}>
                               {up?.predictions ? `~${formatNumber(up.predictions.projectedTotals.total_tokens)}` : '-'}
                             </td>
@@ -856,7 +936,7 @@ export default function Usage() {
                           </tr>
                           {isExpanded && up && (
                             <tr>
-                              <td colSpan={8} style={{ padding: 0, background: 'var(--color-bg-secondary)' }}>
+                              <td colSpan={costEnabled ? 9 : 8} style={{ padding: 0, background: 'var(--color-bg-secondary)' }}>
                                 <div style={{ padding: 'var(--spacing-md)' }}>
                                   {up.predictions && (
                                     <div style={{ display: 'grid', gridTemplateColumns: 'repeat(auto-fit, minmax(100px, 1fr))', gap: 'var(--spacing-xs)', marginBottom: 'var(--spacing-sm)' }}>

From 0d6de15ae961eb6a80092f439c5c5ef6ae493630 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Thu, 25 Jun 2026 00:07:48 +0200
Subject: [PATCH 81/99] fix(config): per-device VRAM headroom for Blackwell
 defaults (#10485) (#10494)

The hardware-tuned defaults from #10411 were measured on a GB10 / DGX Spark
(128 GiB unified memory) and over-provisioned multi-GPU consumer Blackwell
(e.g. 2x16 GiB RTX 50-series) into CUDA OOM during model init:

  - The Blackwell physical batch (512 -> 2048) sets both n_batch and n_ubatch.
    The compute buffer scales ~n_ubatch * n_ctx and is allocated PER DEVICE
    (it can't be split across GPUs), so a large context turns ub2048 into
    multi-GiB of scratch that must fit one 16 GiB card.
  - The VRAM-scaled parallel-slot default tiered off TotalAvailableVRAM(),
    which SUMS all GPUs (2x16 -> "32 GiB" -> 8 slots), but the allocations
    are per-device.

Make both decisions per-device and context-aware:

  - xsysinfo.MinPerGPUVRAM() reports the smallest device's VRAM; localGPU()
    uses it so the parallel tier and batch guard reason about one card.
  - PhysicalBatchForContext(gpu, ctx) raises the batch only when the extra
    compute buffer fits VRAM/4 at this model's context (16 GiB crosses over
    ~174k ctx, 32 GiB ~349k; GB10 reports system RAM so it still clears it).
  - Apply hardware defaults AFTER runBackendHooks in SetDefaults so the
    GGUF-guessed context is resolved before the batch decision.
  - The distributed router gates the node batch the same way.

Unified-memory devices (GB10, Apple) report system RAM as their single
device's VRAM, so they keep the prefill win.


Assisted-by: Claude:opus-4.8 [Claude Code]

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
---
 core/config/hardware_defaults.go              | 80 +++++++++++++++++--
 .../config/hardware_defaults_internal_test.go | 19 ++++-
 core/config/hardware_defaults_test.go         | 45 +++++++++--
 core/config/model_config.go                   | 15 ++--
 core/services/nodes/router.go                 |  5 +-
 .../nodes/router_hardware_internal_test.go    | 13 ++-
 pkg/xsysinfo/gpu.go                           | 55 +++++++++++++
 pkg/xsysinfo/minvram_internal_test.go         | 37 +++++++++
 8 files changed, 244 insertions(+), 25 deletions(-)
 create mode 100644 pkg/xsysinfo/minvram_internal_test.go

diff --git a/core/config/hardware_defaults.go b/core/config/hardware_defaults.go
index 18c321639..b4e0e74c6 100644
--- a/core/config/hardware_defaults.go
+++ b/core/config/hardware_defaults.go
@@ -54,8 +54,35 @@ func (g GPU) IsNVIDIABlackwell() bool {
 	return maj >= 12
 }
 
+// Compute-buffer headroom guard for the raised physical batch.
+//
+// Raising n_ubatch grows the CUDA *compute buffer* (the scratch for the forward
+// graph), which is allocated PER DEVICE — it does not benefit from a second GPU
+// the way weights or KV (which are split across devices) do. The buffer scales
+// ~linearly with n_ubatch * n_ctx, so a large context turns the GB10-tuned
+// ub2048 into multi-GiB of extra scratch that must fit on a SINGLE card. On a
+// 16 GiB consumer Blackwell with a 200k context that overflows (issue #10485),
+// even though the GB10 it was measured on (128 GiB unified memory) had room.
+//
+// These constants size a conservative guard: only raise the batch when the
+// extra scratch fits the per-device VRAM ceiling.
+const (
+	// computeBufferBytesPerCell approximates the CUDA compute-buffer cost of one
+	// (n_ubatch * n_ctx) cell. Derived from an observed allocation (ub2048 *
+	// ctx204800 ~= 4.5 GiB => ~11 B/cell) and rounded up to 16 for margin, since
+	// the real cost also grows with model width (heads / embedding dim) which we
+	// don't know at config time.
+	computeBufferBytesPerCell = 16
+	// blackwellBatchHeadroomDivisor caps the extra compute buffer from raising the
+	// physical batch at VRAM/divisor. /4 keeps the bulk of a device for weights +
+	// KV, which already dominate VRAM use.
+	blackwellBatchHeadroomDivisor = 4
+)
+
 // PhysicalBatch returns the canonical physical batch (n_batch/n_ubatch) for the
-// given hardware, used when the model config leaves batch unset.
+// given hardware class, ignoring context/VRAM headroom. Use
+// PhysicalBatchForContext when a model context and per-device VRAM are known
+// (the load paths) so the raised batch can't overflow a single device.
 func PhysicalBatch(g GPU) int {
 	if g.IsNVIDIABlackwell() {
 		return BlackwellPhysicalBatch
@@ -63,6 +90,32 @@ func PhysicalBatch(g GPU) int {
 	return DefaultPhysicalBatch
 }
 
+// PhysicalBatchForContext is PhysicalBatch gated on per-device VRAM headroom for
+// the given context: it only raises the batch above the conservative default
+// when the extra compute buffer (which is allocated on a single device and grows
+// with n_ubatch * n_ctx) fits within blackwellBatchHeadroomDivisor of the GPU's
+// VRAM. g.VRAM must be the PER-DEVICE ceiling (the smallest device on a
+// multi-GPU host), not the summed total — the compute buffer can't be split.
+//
+// VRAM 0 (unknown) stays conservative rather than risk a per-device OOM; the
+// GB10 / unified-memory path reports system RAM, so it still clears the guard.
+func PhysicalBatchForContext(g GPU, ctx int) int {
+	if !g.IsNVIDIABlackwell() {
+		return DefaultPhysicalBatch
+	}
+	if ctx <= 0 {
+		ctx = DefaultContextSize
+	}
+	if g.VRAM == 0 {
+		return DefaultPhysicalBatch
+	}
+	extra := uint64(ctx) * uint64(BlackwellPhysicalBatch-DefaultPhysicalBatch) * computeBufferBytesPerCell
+	if extra <= g.VRAM/blackwellBatchHeadroomDivisor {
+		return BlackwellPhysicalBatch
+	}
+	return DefaultPhysicalBatch
+}
+
 // IsManagedPhysicalBatch reports whether n is a value PhysicalBatch assigns.
 // Callers that re-tune a value chosen by an upstream host (the distributed
 // router correcting the frontend's guess) use this to avoid clobbering an
@@ -122,7 +175,12 @@ func hasParallelOption(opts []string) bool {
 // deterministic device — detection does a live nvidia-smi call.
 var localGPU = func() GPU {
 	vendor, _ := xsysinfo.DetectGPUVendor()
-	vram, _ := xsysinfo.TotalAvailableVRAM()
+	// Use the SMALLEST device's VRAM, not the summed total: the parallel-slot
+	// tier and the batch headroom guard both reason about what fits on a single
+	// card, and per-device compute buffers can't be split across GPUs. Summing
+	// two 16 GiB cards into "32 GiB" is what over-provisioned multi-GPU hosts
+	// into OOM (issue #10485).
+	vram, _ := xsysinfo.MinPerGPUVRAM()
 	return GPU{
 		Vendor:            vendor,
 		ComputeCapability: xsysinfo.NVIDIAComputeCapability(),
@@ -137,10 +195,20 @@ func ApplyHardwareDefaults(cfg *ModelConfig, gpu GPU) {
 	if cfg == nil {
 		return
 	}
-	if cfg.Batch == 0 && gpu.IsNVIDIABlackwell() {
-		cfg.Batch = BlackwellPhysicalBatch
-		xlog.Debug("[hardware_defaults] Blackwell GPU: defaulting physical batch",
-			"batch", cfg.Batch, "compute_cap", gpu.ComputeCapability)
+	// Raise the physical batch on Blackwell only when the resulting compute
+	// buffer fits the per-device VRAM at THIS model's context. Leaving Batch at 0
+	// (rather than writing the default 512) preserves the downstream single-pass
+	// sizing in core/backend.EffectiveBatchSize for embedding/score/rerank.
+	if cfg.Batch == 0 {
+		ctx := DefaultContextSize
+		if cfg.ContextSize != nil {
+			ctx = *cfg.ContextSize
+		}
+		if PhysicalBatchForContext(gpu, ctx) == BlackwellPhysicalBatch {
+			cfg.Batch = BlackwellPhysicalBatch
+			xlog.Debug("[hardware_defaults] Blackwell GPU: defaulting physical batch",
+				"batch", cfg.Batch, "compute_cap", gpu.ComputeCapability, "context", ctx, "vram_gib", gpu.VRAM>>30)
+		}
 	}
 
 	// Enable concurrent serving by default on a capable GPU: without this the
diff --git a/core/config/hardware_defaults_internal_test.go b/core/config/hardware_defaults_internal_test.go
index 52c674c2d..d6878c86e 100644
--- a/core/config/hardware_defaults_internal_test.go
+++ b/core/config/hardware_defaults_internal_test.go
@@ -9,26 +9,37 @@ import (
 // GPU. The detection seam (localGPU) is injected so the path is deterministic
 // without a real GPU.
 var _ = Describe("SetDefaults hardware defaults (single-instance)", func() {
+	const gib = uint64(1) << 30
+
 	var orig func() GPU
 	BeforeEach(func() { orig = localGPU })
 	AfterEach(func() { localGPU = orig })
 
-	It("sets the physical batch on a local Blackwell GPU", func() {
-		localGPU = func() GPU { return GPU{ComputeCapability: "12.1"} }
+	It("sets the physical batch on a local Blackwell GPU with headroom", func() {
+		localGPU = func() GPU { return GPU{ComputeCapability: "12.1", VRAM: 119 * gib} }
 		cfg := &ModelConfig{}
 		cfg.SetDefaults()
 		Expect(cfg.Batch).To(Equal(BlackwellPhysicalBatch))
 	})
 
+	It("leaves batch unset when a large context would overflow the device", func() {
+		// Regression guard for issue #10485: 16 GiB consumer Blackwell + ~200k ctx.
+		localGPU = func() GPU { return GPU{ComputeCapability: "12.0", VRAM: 16 * gib} }
+		ctx := 204800
+		cfg := &ModelConfig{LLMConfig: LLMConfig{ContextSize: &ctx}}
+		cfg.SetDefaults()
+		Expect(cfg.Batch).To(Equal(0))
+	})
+
 	It("leaves batch unset on a non-Blackwell local GPU", func() {
-		localGPU = func() GPU { return GPU{ComputeCapability: "8.9"} }
+		localGPU = func() GPU { return GPU{ComputeCapability: "8.9", VRAM: 119 * gib} }
 		cfg := &ModelConfig{}
 		cfg.SetDefaults()
 		Expect(cfg.Batch).To(Equal(0))
 	})
 
 	It("never overrides an explicit batch", func() {
-		localGPU = func() GPU { return GPU{ComputeCapability: "12.1"} }
+		localGPU = func() GPU { return GPU{ComputeCapability: "12.1", VRAM: 119 * gib} }
 		cfg := &ModelConfig{}
 		cfg.Batch = 1024
 		cfg.SetDefaults()
diff --git a/core/config/hardware_defaults_test.go b/core/config/hardware_defaults_test.go
index ae7bf3964..3bc1bf297 100644
--- a/core/config/hardware_defaults_test.go
+++ b/core/config/hardware_defaults_test.go
@@ -7,6 +7,8 @@ import (
 )
 
 var _ = Describe("Hardware-driven config defaults", func() {
+	const gib = uint64(1) << 30
+
 	DescribeTable("GPU.IsNVIDIABlackwell (sm_12x consumer family)",
 		func(cc string, want bool) {
 			Expect(GPU{ComputeCapability: cc}.IsNVIDIABlackwell()).To(Equal(want))
@@ -35,21 +37,54 @@ var _ = Describe("Hardware-driven config defaults", func() {
 		})
 	})
 
+	Describe("PhysicalBatchForContext (per-device VRAM headroom)", func() {
+		It("raises the batch when the compute buffer fits the device", func() {
+			// 16 GiB Blackwell with a small context: the extra scratch is tiny.
+			Expect(PhysicalBatchForContext(GPU{ComputeCapability: "12.0", VRAM: 16 * gib}, 8192)).
+				To(Equal(BlackwellPhysicalBatch))
+		})
+		It("keeps the default batch when a large context would overflow one device", func() {
+			// The issue #10485 case: 16 GiB consumer Blackwell, ~200k context.
+			Expect(PhysicalBatchForContext(GPU{ComputeCapability: "12.0", VRAM: 16 * gib}, 204800)).
+				To(Equal(DefaultPhysicalBatch))
+		})
+		It("still raises the batch on a large unified-memory device (GB10)", func() {
+			// GB10 reports system RAM (~119 GiB) as its single device's VRAM.
+			Expect(PhysicalBatchForContext(GPU{ComputeCapability: "12.1", VRAM: 119 * gib}, 204800)).
+				To(Equal(BlackwellPhysicalBatch))
+		})
+		It("stays conservative when VRAM is unknown", func() {
+			Expect(PhysicalBatchForContext(GPU{ComputeCapability: "12.1"}, 8192)).
+				To(Equal(DefaultPhysicalBatch))
+		})
+		It("never raises the batch on non-Blackwell", func() {
+			Expect(PhysicalBatchForContext(GPU{ComputeCapability: "9.0", VRAM: 80 * gib}, 8192)).
+				To(Equal(DefaultPhysicalBatch))
+		})
+	})
+
 	Describe("ApplyHardwareDefaults", func() {
-		It("raises an unset batch to 2048 on Blackwell", func() {
+		It("raises an unset batch to 2048 on Blackwell with headroom", func() {
 			cfg := &ModelConfig{}
-			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1"})
+			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1", VRAM: 119 * gib})
 			Expect(cfg.Batch).To(Equal(BlackwellPhysicalBatch))
 		})
+		It("leaves batch unset when a large context would overflow one device", func() {
+			// Regression guard for issue #10485: 16 GiB card + ~200k context.
+			ctx := 204800
+			cfg := &ModelConfig{LLMConfig: LLMConfig{ContextSize: &ctx}}
+			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.0", VRAM: 16 * gib})
+			Expect(cfg.Batch).To(Equal(0))
+		})
 		It("leaves batch unset on non-Blackwell", func() {
 			cfg := &ModelConfig{}
-			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "9.0"})
+			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "9.0", VRAM: 119 * gib})
 			Expect(cfg.Batch).To(Equal(0))
 		})
 		It("never overrides an explicit batch", func() {
 			cfg := &ModelConfig{}
 			cfg.Batch = 1024
-			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1"})
+			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1", VRAM: 119 * gib})
 			Expect(cfg.Batch).To(Equal(1024))
 		})
 		It("no-ops on nil", func() {
@@ -57,8 +92,6 @@ var _ = Describe("Hardware-driven config defaults", func() {
 		})
 	})
 
-	const gib = uint64(1) << 30
-
 	DescribeTable("DefaultParallelSlots (by VRAM)",
 		func(vramGiB uint64, want int) {
 			Expect(DefaultParallelSlots(GPU{VRAM: vramGiB * gib})).To(Equal(want))
diff --git a/core/config/model_config.go b/core/config/model_config.go
index 8886ddfd5..2d1e18cc7 100644
--- a/core/config/model_config.go
+++ b/core/config/model_config.go
@@ -1204,11 +1204,6 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
 	// This ensures gallery-installed and runtime-loaded models get optimal parameters.
 	ApplyInferenceDefaults(cfg, cfg.Name, cfg.Model)
 
-	// Apply hardware-driven defaults (e.g. a larger physical batch on Blackwell).
-	// Uses the local GPU here; in distributed mode the router re-applies the same
-	// heuristics for the selected node's GPU before loading. Explicit config wins.
-	ApplyHardwareDefaults(cfg, localGPU())
-
 	// Apply serving-policy defaults (device-independent): cross-request prefix
 	// caching. Propagates to distributed nodes via the model options.
 	ApplyServingDefaults(cfg)
@@ -1247,6 +1242,16 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
 		cfg.ContextSize = &ctx
 	}
 	runBackendHooks(cfg, lo.modelPath)
+
+	// Apply hardware-driven defaults (e.g. a larger physical batch on Blackwell)
+	// LAST, after the context size is fully resolved (explicit config, LoadOptions,
+	// then the GGUF guess inside runBackendHooks): the Blackwell batch guard sizes
+	// the per-device compute buffer against this model's context, so it must see
+	// the final value, not a pre-guess nil. Uses the local GPU here; in distributed
+	// mode the router re-applies the same heuristics for the selected node's GPU
+	// before loading. Explicit config always wins.
+	ApplyHardwareDefaults(cfg, localGPU())
+
 	cfg.syncKnownUsecasesFromString()
 }
 
diff --git a/core/services/nodes/router.go b/core/services/nodes/router.go
index f26fea2b9..6ad550cf1 100644
--- a/core/services/nodes/router.go
+++ b/core/services/nodes/router.go
@@ -156,7 +156,10 @@ func applyNodeHardwareDefaults(opts *pb.ModelOptions, node *BackendNode) {
 		VRAM:              node.TotalVRAM,
 	}
 	if config.IsManagedPhysicalBatch(int(opts.NBatch)) {
-		opts.NBatch = int32(config.PhysicalBatch(gpu))
+		// Gate the raised batch on the selected node's per-device VRAM at this
+		// model's context, so a large context can't overflow the node's compute
+		// buffer (issue #10485). node.TotalVRAM is the node's reported ceiling.
+		opts.NBatch = int32(config.PhysicalBatchForContext(gpu, int(opts.ContextSize)))
 	}
 	// Default concurrent serving for the selected node (the frontend that built
 	// the options may have no GPU). Only adds when no parallel option is set.
diff --git a/core/services/nodes/router_hardware_internal_test.go b/core/services/nodes/router_hardware_internal_test.go
index 2418bf444..d8576c4e4 100644
--- a/core/services/nodes/router_hardware_internal_test.go
+++ b/core/services/nodes/router_hardware_internal_test.go
@@ -8,12 +8,19 @@ import (
 )
 
 var _ = Describe("applyNodeHardwareDefaults", func() {
-	It("raises a managed default batch on a Blackwell node", func() {
-		opts := &pb.ModelOptions{NBatch: config.DefaultPhysicalBatch}
-		applyNodeHardwareDefaults(opts, &BackendNode{GPUComputeCapability: "12.1"})
+	It("raises a managed default batch on a Blackwell node with headroom", func() {
+		opts := &pb.ModelOptions{NBatch: config.DefaultPhysicalBatch, ContextSize: 8192}
+		applyNodeHardwareDefaults(opts, &BackendNode{GPUComputeCapability: "12.1", TotalVRAM: 119 << 30})
 		Expect(opts.NBatch).To(BeEquivalentTo(config.BlackwellPhysicalBatch))
 	})
 
+	It("keeps the default batch when a large context would overflow the node", func() {
+		// Regression guard for issue #10485 on the distributed path.
+		opts := &pb.ModelOptions{NBatch: config.DefaultPhysicalBatch, ContextSize: 204800}
+		applyNodeHardwareDefaults(opts, &BackendNode{GPUComputeCapability: "12.0", TotalVRAM: 16 << 30})
+		Expect(opts.NBatch).To(BeEquivalentTo(config.DefaultPhysicalBatch))
+	})
+
 	It("resets a Blackwell guess on a non-Blackwell node", func() {
 		// frontend (Blackwell) guessed high, but the selected node is not Blackwell
 		opts := &pb.ModelOptions{NBatch: config.BlackwellPhysicalBatch}
diff --git a/pkg/xsysinfo/gpu.go b/pkg/xsysinfo/gpu.go
index f0185ddeb..da183212f 100644
--- a/pkg/xsysinfo/gpu.go
+++ b/pkg/xsysinfo/gpu.go
@@ -129,6 +129,61 @@ func TotalAvailableVRAM() (uint64, error) {
 	return 0, nil
 }
 
+// MinPerGPUVRAM returns the total VRAM of the SMALLEST GPU on the host (in
+// bytes), or 0 when no per-device VRAM is known. Unlike TotalAvailableVRAM
+// (which sums across devices) this reports a single device's ceiling, which is
+// the right figure for decisions about what must fit on one card: the compute
+// buffer (sized by n_ubatch) and the parallel-slot tier. Summing a multi-GPU
+// host's VRAM over-provisions those into a per-device OOM (issue #10485).
+//
+// Unified-memory devices (GB10, Apple) report system RAM as their single
+// device's VRAM, so they are unaffected.
+func MinPerGPUVRAM() (uint64, error) {
+	// Prefer per-device binary detection (nvidia-smi/rocm-smi report true
+	// per-card VRAM); ghw's per-card memory can reflect NUMA node RAM on some
+	// hosts, which is why TotalAvailableVRAM treats it as a sum.
+	if infos := GetGPUMemoryUsage(); len(infos) > 0 {
+		if v := minNonZeroVRAM(infos); v > 0 {
+			return v, nil
+		}
+	}
+
+	// Fallback: ghw per-card memory, taking the minimum non-zero card.
+	if gpus, err := GPUs(); err == nil {
+		var min uint64
+		for _, gpu := range gpus {
+			if gpu == nil || gpu.Node == nil || gpu.Node.Memory == nil {
+				continue
+			}
+			if b := gpu.Node.Memory.TotalUsableBytes; b > 0 {
+				if u := uint64(b); min == 0 || u < min {
+					min = u
+				}
+			}
+		}
+		if min > 0 {
+			return min, nil
+		}
+	}
+
+	return 0, nil
+}
+
+// minNonZeroVRAM returns the smallest non-zero TotalVRAM across the given GPUs,
+// or 0 when none report VRAM.
+func minNonZeroVRAM(infos []GPUMemoryInfo) uint64 {
+	var min uint64
+	for _, g := range infos {
+		if g.TotalVRAM == 0 {
+			continue
+		}
+		if min == 0 || g.TotalVRAM < min {
+			min = g.TotalVRAM
+		}
+	}
+	return min
+}
+
 func HasGPU(vendor string) bool {
 	gpus, err := GPUs()
 	if err != nil {
diff --git a/pkg/xsysinfo/minvram_internal_test.go b/pkg/xsysinfo/minvram_internal_test.go
new file mode 100644
index 000000000..ccd72dd5e
--- /dev/null
+++ b/pkg/xsysinfo/minvram_internal_test.go
@@ -0,0 +1,37 @@
+package xsysinfo
+
+import (
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("minNonZeroVRAM", func() {
+	const gib = uint64(1) << 30
+
+	It("returns the smallest device on a multi-GPU host", func() {
+		// Two unequal cards (e.g. RTX 5070 Ti + 5060 Ti, both 16 GiB, or a
+		// mixed pair): the smallest device is the per-card allocation ceiling.
+		infos := []GPUMemoryInfo{
+			{TotalVRAM: 16 * gib},
+			{TotalVRAM: 12 * gib},
+		}
+		Expect(minNonZeroVRAM(infos)).To(Equal(12 * gib))
+	})
+
+	It("ignores devices that report zero VRAM", func() {
+		infos := []GPUMemoryInfo{
+			{TotalVRAM: 0},
+			{TotalVRAM: 24 * gib},
+		}
+		Expect(minNonZeroVRAM(infos)).To(Equal(24 * gib))
+	})
+
+	It("returns the single device's VRAM on a one-GPU host", func() {
+		Expect(minNonZeroVRAM([]GPUMemoryInfo{{TotalVRAM: 16 * gib}})).To(Equal(16 * gib))
+	})
+
+	It("returns 0 when no device reports VRAM", func() {
+		Expect(minNonZeroVRAM([]GPUMemoryInfo{{TotalVRAM: 0}})).To(BeZero())
+		Expect(minNonZeroVRAM(nil)).To(BeZero())
+	})
+})

From f88981cdce96cd0056119e97500a4b8f31679d67 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Thu, 25 Jun 2026 00:22:45 +0200
Subject: [PATCH 82/99] feat(ui): data-driven hardware model recommendations +
 gallery surfacing (#10500)

* feat(ui): make hardware starter models data-driven

The empty-state starter widget recommended from a hardcoded list, which
drifts as the gallery evolves. Add useRecommendedModels: it queries the
live gallery for chat-capable models (their natural curated order, since
the gallery exposes no popularity signal), estimates size/VRAM for the top
candidates via the existing estimate endpoint, and ranks by hardware fit -
smallest on CPU-only boxes, largest-that-fits on GPUs.

StarterModels now renders those live picks and keeps the curated static
list only as an offline/trimmed-gallery fallback.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* feat(ui): recommend models for your hardware in the gallery

Hardware-aware recommendations were only shown on the first-run empty
state. Surface them on the main Models gallery too: a dismissible
"Recommended for your hardware" strip at the top, sharing the
useRecommendedModels fit-ranking with the starter widget. CPU-only boxes
get small models; GPUs get the largest picks that fit VRAM, with size and
VRAM shown per card. One-click install; dismissal persists per browser.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* feat(ui): gpu-mid tier + NVIDIA NVFP4 model recommendations

Refine the hardware recommendation tiers and curated picks:

- Add a gpu-mid tier (8-24GB VRAM) between gpu-small and gpu-large, so
  ~27B-class models are suggested separately from the 30B+ large tier.
- Detect NVIDIA GPUs (resources.gpus[].vendor) and, on NVIDIA only, prefer
  NVFP4 + MTP variants (Blackwell-optimised); NVFP4 models are filtered out
  of recommendations on non-NVIDIA hardware where they can't run. This
  applies to both the live ranking and the static fallback, with an NVFP4
  badge shown on those picks.
- Refresh the curated fallback to current models: Gemma-4 QAT Q4 builds at
  every tier, low qwen3.5 (4B distilled / 9B) on CPU/small, qwen3.6-27b
  and MTP variants at mid, qwen3.6/qwen3.5 35B-A3B apex/distilled at large.
  All names verified against gallery/index.yaml.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
---
 .../http/react-ui/public/locales/en/home.json |   1 +
 .../react-ui/public/locales/en/models.json    |  10 ++
 core/http/react-ui/src/App.css                |  71 ++++++++++
 .../src/components/RecommendedModels.jsx      |  86 ++++++++++++
 .../react-ui/src/components/StarterModels.jsx | 130 +++++++++---------
 .../src/hooks/useRecommendedModels.js         | 108 +++++++++++++++
 core/http/react-ui/src/pages/Models.jsx       |   3 +
 7 files changed, 344 insertions(+), 65 deletions(-)
 create mode 100644 core/http/react-ui/src/components/RecommendedModels.jsx
 create mode 100644 core/http/react-ui/src/hooks/useRecommendedModels.js

diff --git a/core/http/react-ui/public/locales/en/home.json b/core/http/react-ui/public/locales/en/home.json
index 142767999..35533a5a8 100644
--- a/core/http/react-ui/public/locales/en/home.json
+++ b/core/http/react-ui/public/locales/en/home.json
@@ -82,6 +82,7 @@
     "tier": {
       "cpu": "CPU-only",
       "gpu-small": "GPU",
+      "gpu-mid": "GPU",
       "gpu-large": "GPU"
     },
     "cpuNote": "No GPU detected — these small models stay responsive on CPU.",
diff --git a/core/http/react-ui/public/locales/en/models.json b/core/http/react-ui/public/locales/en/models.json
index 2bf7b018d..bd23d389e 100644
--- a/core/http/react-ui/public/locales/en/models.json
+++ b/core/http/react-ui/public/locales/en/models.json
@@ -2,6 +2,16 @@
   "title": "Install Models",
   "subtitle": "Browse and install AI models from the gallery",
   "models": "Models",
+  "recommended": {
+    "title": "Recommended for your hardware",
+    "cpuNote": "No GPU detected - small models that stay responsive on CPU.",
+    "gpuNote": "Sized to fit your available VRAM with room for context.",
+    "install": "Install",
+    "installing": "Installing",
+    "installStarted": "Installing {{model}}…",
+    "installFailed": "Install failed: {{message}}",
+    "dismiss": "Dismiss recommendations"
+  },
   "stats": {
     "available": "Available",
     "installed": "Installed"
diff --git a/core/http/react-ui/src/App.css b/core/http/react-ui/src/App.css
index 40eddc2e9..4578a3dd8 100644
--- a/core/http/react-ui/src/App.css
+++ b/core/http/react-ui/src/App.css
@@ -6409,6 +6409,9 @@ select.input {
   font-size: 0.875rem;
   word-break: break-all;
 }
+.home-starters-badge {
+  font-size: 0.625rem;
+}
 .home-starters-size {
   margin-left: auto;
   font-size: 0.75rem;
@@ -6416,6 +6419,74 @@ select.input {
   white-space: nowrap;
 }
 
+/* ──────────────────── Models gallery: recommended-for-your-hardware strip ──────────────────── */
+
+.rec-models {
+  margin-bottom: var(--spacing-md);
+  padding: var(--spacing-md) var(--spacing-lg);
+}
+.rec-models-head {
+  display: flex;
+  align-items: flex-start;
+  justify-content: space-between;
+  gap: var(--spacing-md);
+}
+.rec-models-title {
+  display: flex;
+  align-items: center;
+  gap: var(--spacing-sm);
+  flex-wrap: wrap;
+}
+.rec-models-title i {
+  color: var(--color-primary);
+}
+.rec-models-note {
+  font-size: 0.8125rem;
+  color: var(--color-text-secondary);
+}
+.rec-models-dismiss {
+  background: none;
+  border: none;
+  color: var(--color-text-muted);
+  cursor: pointer;
+  padding: 4px;
+  flex-shrink: 0;
+}
+.rec-models-dismiss:hover {
+  color: var(--color-text-primary);
+}
+.rec-models-grid {
+  display: grid;
+  grid-template-columns: repeat(auto-fill, minmax(220px, 1fr));
+  gap: var(--spacing-sm);
+  margin-top: var(--spacing-md);
+}
+.rec-models-item {
+  display: flex;
+  flex-direction: column;
+  gap: var(--spacing-xs);
+  padding: var(--spacing-sm) var(--spacing-md);
+  border: 1px solid var(--color-border-subtle);
+  border-radius: var(--radius-md);
+  background: var(--color-bg-primary);
+}
+.rec-models-item-name {
+  font-weight: 500;
+  font-size: 0.8125rem;
+  word-break: break-all;
+}
+.rec-models-item-meta {
+  display: flex;
+  gap: var(--spacing-sm);
+  font-size: 0.75rem;
+  color: var(--color-text-muted);
+}
+.rec-models-item-fit {
+  display: inline-flex;
+  align-items: center;
+  gap: 4px;
+}
+
 /* ──────────────────── Home: drop-in endpoint / API compatibility ──────────────────── */
 
 .home-connect {
diff --git a/core/http/react-ui/src/components/RecommendedModels.jsx b/core/http/react-ui/src/components/RecommendedModels.jsx
new file mode 100644
index 000000000..7620406c8
--- /dev/null
+++ b/core/http/react-ui/src/components/RecommendedModels.jsx
@@ -0,0 +1,86 @@
+import { useState } from 'react'
+import { useTranslation } from 'react-i18next'
+import { modelsApi } from '../utils/api'
+import { useRecommendedModels, isNvfp4Name } from '../hooks/useRecommendedModels'
+
+const DISMISS_KEY = 'localai_rec_models_dismissed'
+
+// "Recommended for your hardware" strip at the top of the Models gallery. Shares
+// the hardware-fit ranking with the empty-state starter widget via
+// useRecommendedModels, but styled for the gallery page and dismissible (the
+// gallery is a repeat-visit surface, so it shouldn't nag).
+export default function RecommendedModels({ addToast }) {
+  const { t } = useTranslation('models')
+  const { recommended, tier, loading } = useRecommendedModels({ count: 4 })
+  const [installing, setInstalling] = useState(() => new Set())
+  const [dismissed, setDismissed] = useState(() => {
+    try { return localStorage.getItem(DISMISS_KEY) === '1' } catch { return false }
+  })
+
+  if (loading || dismissed) return null
+  if (!recommended || recommended.length === 0) return null
+
+  const dismiss = () => {
+    try { localStorage.setItem(DISMISS_KEY, '1') } catch { /* ignore */ }
+    setDismissed(true)
+  }
+
+  const install = async (name) => {
+    setInstalling(prev => new Set(prev).add(name))
+    try {
+      await modelsApi.install(name)
+      addToast?.(t('recommended.installStarted', { model: name }), 'success')
+    } catch (err) {
+      addToast?.(t('recommended.installFailed', { message: err.message }), 'error')
+      setInstalling(prev => {
+        const next = new Set(prev)
+        next.delete(name)
+        return next
+      })
+    }
+  }
+
+  const isGpu = tier.id !== 'cpu'
+
+  return (
+    <div className="rec-models card">
+      <div className="rec-models-head">
+        <div className="rec-models-title">
+          <i className={`fas ${isGpu ? 'fa-microchip' : 'fa-memory'}`} aria-hidden="true" />
+          <strong>{t('recommended.title')}</strong>
+          <span className="rec-models-note">{isGpu ? t('recommended.gpuNote') : t('recommended.cpuNote')}</span>
+        </div>
+        <button type="button" className="rec-models-dismiss" onClick={dismiss} aria-label={t('recommended.dismiss')} title={t('recommended.dismiss')}>
+          <i className="fas fa-times" aria-hidden="true" />
+        </button>
+      </div>
+      <div className="rec-models-grid">
+        {recommended.map(m => {
+          const busy = installing.has(m.name)
+          return (
+            <div key={m.name} className="rec-models-item">
+              <div className="rec-models-item-name">{m.name}</div>
+              <div className="rec-models-item-meta">
+                {isNvfp4Name(m.name) && <span className="badge badge-info">NVFP4</span>}
+                {m.sizeDisplay && <span>{m.sizeDisplay}</span>}
+                {isGpu && m.vramDisplay && (
+                  <span className="rec-models-item-fit"><i className="fas fa-microchip" aria-hidden="true" /> {m.vramDisplay}</span>
+                )}
+              </div>
+              <button
+                type="button"
+                className="btn btn-primary btn-sm"
+                disabled={busy}
+                onClick={() => install(m.name)}
+              >
+                {busy
+                  ? (<><i className="fas fa-spinner fa-spin" aria-hidden="true" /> {t('recommended.installing')}</>)
+                  : (<><i className="fas fa-download" aria-hidden="true" /> {t('recommended.install')}</>)}
+              </button>
+            </div>
+          )
+        })}
+      </div>
+    </div>
+  )
+}
diff --git a/core/http/react-ui/src/components/StarterModels.jsx b/core/http/react-ui/src/components/StarterModels.jsx
index 9273ae147..d5f8122b6 100644
--- a/core/http/react-ui/src/components/StarterModels.jsx
+++ b/core/http/react-ui/src/components/StarterModels.jsx
@@ -1,79 +1,78 @@
-import { useState, useEffect, useMemo } from 'react'
+import { useState } from 'react'
 import { useTranslation } from 'react-i18next'
 import { modelsApi } from '../utils/api'
-import { useResources } from '../hooks/useResources'
+import { useRecommendedModels, isNvfp4Name } from '../hooks/useRecommendedModels'
 
-// Curated, hardware-tiered starter models for the empty-state onboarding. Names
-// are real gallery entries (gallery/index.yaml); we intersect them against the
-// live gallery at render time so a custom/trimmed gallery degrades gracefully
-// (unmatched entries simply don't render).
-//
-// The guiding rule the maintainer asked for: CPU-only machines should be
-// steered to genuinely small models (1-4B, Q4) that stay responsive without a
-// GPU. GPU tiers scale the suggestion up with available VRAM.
-const SMALL = [
-  { name: 'llama-3.2-1b-instruct:q4_k_m', size: '~0.8 GB' },
-  { name: 'llama-3.2-3b-instruct:q4_k_m', size: '~2 GB' },
-  { name: 'qwen3-1.7b', size: '~1.4 GB' },
-  { name: 'gemma-3-1b-it', size: '~0.8 GB' },
-]
-const MID = [
-  { name: 'qwen3-4b', size: '~2.5 GB' },
-  { name: 'gemma-3-4b-it', size: '~3 GB' },
-  { name: 'llama-3.2-3b-instruct:q4_k_m', size: '~2 GB' },
-]
-const LARGE = [
-  { name: 'meta-llama-3.1-8b-instruct', size: '~5 GB' },
-  { name: 'qwen3-4b', size: '~2.5 GB' },
-  { name: 'mistral-7b-instruct-v0.3', size: '~4 GB' },
-]
+// Static fallback used only when the live gallery / estimates can't be reached
+// (offline, trimmed gallery). The hook is the primary, data-driven path; these
+// are real gallery names kept as a safety net so onboarding never shows nothing.
+// Gemma picks use the QAT (quantization-aware-trained) Q4 builds. NVIDIA boxes
+// get NVFP4 + MTP variants at the mid/large tiers (see NVIDIA below).
+const BASE = {
+  cpu: [
+    { name: 'gemma-4-e2b-it-qat-q4_0', size: '~1.5 GB' },
+    { name: 'qwen3.5-4b-claude-4.6-opus-reasoning-distilled', size: '~2.5 GB' },
+    { name: 'gemma-4-e4b-it-qat-q4_0', size: '~3 GB' },
+    { name: 'lfm2.5-1.2b-instruct', size: '~0.8 GB' },
+  ],
+  'gpu-small': [
+    { name: 'gemma-4-e4b-it-qat-q4_0', size: '~3 GB' },
+    { name: 'lfm2.5-8b-a1b', size: '~5 GB' },
+    { name: 'qwen3.5-9b', size: '~5.5 GB' },
+    { name: 'gemma-4-12b-it-qat-q4_0', size: '~7 GB' },
+  ],
+  'gpu-mid': [
+    { name: 'qwen3.6-27b', size: '~16 GB' },
+    { name: 'qwen3.6-27b-mtp-pi-tune', size: '~16 GB' },
+    { name: 'gemma-4-26b-a4b-it-qat-q4_0', size: '~16 GB' },
+    { name: 'qwen3.5-27b', size: '~16 GB' },
+  ],
+  'gpu-large': [
+    { name: 'qwen3.6-35b-a3b-apex', size: '~20 GB' },
+    { name: 'qwen3.6-35b-a3b-claude-4.6-opus-reasoning-distilled', size: '~20 GB' },
+    { name: 'gemma-4-31b-it-qat-q4_0', size: '~18 GB' },
+    { name: 'qwen3.5-35b-a3b-apex', size: '~20 GB' },
+  ],
+}
 
-const GB = 1024 * 1024 * 1024
+// NVIDIA-only overrides: NVFP4 is a Blackwell-optimised 4-bit format paired with
+// MTP (multi-token prediction) for speed. Only the mid/large tiers have these.
+const NVIDIA = {
+  'gpu-mid': [
+    { name: 'qwen3.6-27b-nvfp4-mtp', size: '~14 GB' },
+    { name: 'qwen3.6-27b-mtp-pi-tune', size: '~16 GB' },
+    { name: 'gemma-4-26b-a4b-it-qat-q4_0', size: '~16 GB' },
+    { name: 'qwen3.6-27b', size: '~16 GB' },
+  ],
+  'gpu-large': [
+    { name: 'qwen3.6-35b-a3b-nvfp4-mtp', size: '~18 GB' },
+    { name: 'qwen3.6-27b-nvfp4-mtp', size: '~14 GB' },
+    { name: 'qwen3.6-35b-a3b-apex', size: '~20 GB' },
+    { name: 'gemma-4-31b-it-qat-q4_0', size: '~18 GB' },
+  ],
+}
 
-// Pick a tier from detected hardware. total_memory is GPU VRAM in bytes (0 when
-// CPU-only). Thresholds are deliberately conservative so a suggestion that
-// "fits" really does.
-function pickTier(resources) {
-  const isGpu = resources?.type === 'gpu'
-  const vram = resources?.aggregate?.total_memory || 0
-  if (!isGpu || vram <= 0) return { id: 'cpu', list: SMALL }
-  if (vram < 8 * GB) return { id: 'gpu-small', list: MID }
-  return { id: 'gpu-large', list: LARGE }
+function fallbackFor(tierId, isNvidia) {
+  if (isNvidia && NVIDIA[tierId]) return NVIDIA[tierId]
+  return BASE[tierId] || BASE.cpu
 }
 
 export default function StarterModels({ addToast, onInstallStarted }) {
   const { t } = useTranslation('home')
-  const { resources } = useResources()
-  const [available, setAvailable] = useState(null) // Set of gallery names, or null while loading
+  const { recommended, tier, isNvidia, loading } = useRecommendedModels({ count: 4 })
   const [installing, setInstalling] = useState(() => new Set())
 
-  const tier = useMemo(() => pickTier(resources), [resources])
-  const candidates = tier.list
+  // While the hardware probe + gallery query are in flight, render nothing
+  // rather than flashing fallback content that may be replaced a moment later.
+  if (loading) return null
 
-  // Verify candidates exist in the live gallery. One search per name (the tier
-  // has at most a handful) keeps this resilient to gallery customization.
-  useEffect(() => {
-    let cancelled = false
-    const names = [...new Set(candidates.map(c => c.name))]
-    Promise.all(names.map(name =>
-      modelsApi.list({ search: name, page: 1 })
-        .then(data => (data?.models || []).some(m => (m.name || m.id) === name) ? name : null)
-        .catch(() => null)
-    )).then(found => {
-      if (cancelled) return
-      const hits = found.filter(Boolean)
-      // If verification yielded nothing (e.g. gallery unreachable), fall back to
-      // showing the curated list rather than an empty widget.
-      setAvailable(hits.length > 0 ? new Set(hits) : null)
-    })
-    return () => { cancelled = true }
-  }, [candidates])
+  // Prefer live recommendations; fall back to the static list only when the
+  // gallery yielded nothing.
+  const items = (recommended && recommended.length > 0)
+    ? recommended.map(r => ({ name: r.name, size: r.sizeDisplay }))
+    : fallbackFor(tier.id, isNvidia)
 
-  const visible = available === null
-    ? candidates
-    : candidates.filter(c => available.has(c.name))
-
-  if (visible.length === 0) return null
+  if (items.length === 0) return null
 
   const install = async (name) => {
     setInstalling(prev => new Set(prev).add(name))
@@ -104,12 +103,13 @@ export default function StarterModels({ addToast, onInstallStarted }) {
         {tier.id === 'cpu' ? t('starters.cpuNote') : t('starters.gpuNote')}
       </p>
       <ul className="home-starters-list">
-        {visible.map(c => {
+        {items.map(c => {
           const busy = installing.has(c.name)
           return (
             <li key={c.name} className="home-starters-item">
               <span className="home-starters-name">{c.name}</span>
-              <span className="home-starters-size">{c.size}</span>
+              {isNvfp4Name(c.name) && <span className="badge badge-info home-starters-badge">NVFP4</span>}
+              {c.size && <span className="home-starters-size">{c.size}</span>}
               <button
                 type="button"
                 className="btn btn-primary btn-sm"
diff --git a/core/http/react-ui/src/hooks/useRecommendedModels.js b/core/http/react-ui/src/hooks/useRecommendedModels.js
new file mode 100644
index 000000000..ca6090177
--- /dev/null
+++ b/core/http/react-ui/src/hooks/useRecommendedModels.js
@@ -0,0 +1,108 @@
+import { useState, useEffect } from 'react'
+import { modelsApi } from '../utils/api'
+import { useResources } from './useResources'
+
+// Data-driven "recommended for your hardware" model picks. The gallery exposes
+// no popularity/download signal and the list response carries no size, so we:
+//   1. ask the server for chat-capable models in their natural (curated) order,
+//   2. estimate size/VRAM for the top candidates (same endpoint the Models page
+//      uses), and
+//   3. rank by hardware fit — smallest on CPU-only boxes, largest-that-fits on
+//      GPUs (bigger == better quality while still fitting VRAM).
+//
+// Returns `recommended === null` while loading, `[]` when nothing could be
+// resolved (gallery/estimates unavailable) so callers can fall back.
+
+const GB = 1024 * 1024 * 1024
+const DEFAULT_CTX = 4096
+
+// NVFP4 is a Blackwell/NVIDIA-specific 4-bit format — only worth suggesting on
+// NVIDIA hardware, and to be filtered out elsewhere.
+export const isNvfp4Name = (name) => /nvfp4/i.test(name || '')
+
+export function hasNvidiaGpu(resources) {
+  return Array.isArray(resources?.gpus) &&
+    resources.gpus.some(g => (g?.vendor || '').toLowerCase() === 'nvidia')
+}
+
+export function recommendTier(resources) {
+  const isGpu = resources?.type === 'gpu'
+  const vram = resources?.aggregate?.total_memory || 0
+  if (!isGpu || vram <= 0) return { id: 'cpu', vram: 0 }
+  if (vram < 8 * GB) return { id: 'gpu-small', vram }
+  if (vram < 24 * GB) return { id: 'gpu-mid', vram }
+  return { id: 'gpu-large', vram }
+}
+
+function rank(candidates, tier, count, isNvidia) {
+  // NVFP4 only runs on NVIDIA (Blackwell) — drop it everywhere else, and prefer
+  // it on NVIDIA boxes where it's the fastest path.
+  const pool = candidates.filter(c => c.sizeBytes != null && (isNvidia || !isNvfp4Name(c.name)))
+  if (tier.id === 'cpu') {
+    // No GPU: smallest models stay responsive on CPU.
+    return [...pool].sort((a, b) => a.sizeBytes - b.sizeBytes).slice(0, count)
+  }
+  const limit = tier.vram * 0.95
+  const fits = pool.filter(c => c.vramBytes != null && c.vramBytes <= limit)
+  const base = fits.length > 0 ? fits : pool // tiny GPU where nothing fits → fall through to smallest
+  const byPreference = (a, b) => {
+    // On NVIDIA, surface NVFP4 first; then largest-that-fits (best quality).
+    if (isNvidia) {
+      const an = isNvfp4Name(a.name), bn = isNvfp4Name(b.name)
+      if (an !== bn) return an ? -1 : 1
+    }
+    return fits.length > 0 ? b.sizeBytes - a.sizeBytes : a.sizeBytes - b.sizeBytes
+  }
+  return [...base].sort(byPreference).slice(0, count)
+}
+
+export function useRecommendedModels({ count = 4, candidatePool = 10 } = {}) {
+  const { resources } = useResources()
+  const [recommended, setRecommended] = useState(null)
+  const [error, setError] = useState(null)
+
+  const resReady = resources !== null
+  const tier = recommendTier(resources)
+  const isNvidia = hasNvidiaGpu(resources)
+
+  useEffect(() => {
+    if (!resReady) return
+    let cancelled = false
+    setRecommended(null)
+    setError(null)
+    ;(async () => {
+      try {
+        const data = await modelsApi.list({ tag: 'chat', items: candidatePool, page: 1 })
+        // Recommend models the user hasn't installed yet.
+        const models = (data?.models || []).filter(m => !m.installed)
+        const estimated = await Promise.all(models.map(async (m) => {
+          const name = m.name || m.id
+          try {
+            const e = await modelsApi.estimate(name, [DEFAULT_CTX])
+            const ctx = e?.estimates?.[String(DEFAULT_CTX)]
+            return {
+              name,
+              description: m.description,
+              sizeBytes: e?.sizeBytes ?? null,
+              sizeDisplay: e?.sizeDisplay ?? null,
+              vramBytes: ctx?.vramBytes ?? null,
+              vramDisplay: ctx?.vramDisplay ?? null,
+            }
+          } catch {
+            return { name, sizeBytes: null }
+          }
+        }))
+        if (cancelled) return
+        setRecommended(rank(estimated, tier, count, isNvidia))
+      } catch (e) {
+        if (cancelled) return
+        setError(e.message)
+        setRecommended([])
+      }
+    })()
+    return () => { cancelled = true }
+    // tier.id / tier.vram / isNvidia are primitives, so resource polling doesn't re-run this.
+  }, [resReady, tier.id, tier.vram, isNvidia, count, candidatePool])
+
+  return { recommended, tier, isNvidia, error, loading: recommended === null }
+}
diff --git a/core/http/react-ui/src/pages/Models.jsx b/core/http/react-ui/src/pages/Models.jsx
index 5f3a3908d..cf10fdfb5 100644
--- a/core/http/react-ui/src/pages/Models.jsx
+++ b/core/http/react-ui/src/pages/Models.jsx
@@ -13,6 +13,7 @@ import ConfirmDialog from '../components/ConfirmDialog'
 import GalleryLoader from '../components/GalleryLoader'
 import Toggle from '../components/Toggle'
 import ResponsiveTable from '../components/ResponsiveTable'
+import RecommendedModels from '../components/RecommendedModels'
 import React from 'react'
 
 
@@ -301,6 +302,8 @@ export default function Models() {
         }
       />
 
+      <RecommendedModels addToast={addToast} />
+
       {/* Search */}
       <div className="search-bar" style={{ marginBottom: 'var(--spacing-md)' }}>
         <i className="fas fa-search search-icon" />

From 3f647a2764749dff7a240019fae1f89f7a2580bb Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Thu, 25 Jun 2026 00:57:42 +0200
Subject: [PATCH 83/99] chore: :arrow_up: Update ikawrakow/ik_llama.cpp to
 `d5507e33ae7ee2b7b41475f08044d3bde3b839ee` (#10498)

:arrow_up: Update ikawrakow/ik_llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
---
 backend/cpp/ik-llama-cpp/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/cpp/ik-llama-cpp/Makefile b/backend/cpp/ik-llama-cpp/Makefile
index 0fbcf0bdb..860606253 100644
--- a/backend/cpp/ik-llama-cpp/Makefile
+++ b/backend/cpp/ik-llama-cpp/Makefile
@@ -1,5 +1,5 @@
 
-IK_LLAMA_VERSION?=7ccf1d209588962b96eacca325b37e9b3e8faf5e
+IK_LLAMA_VERSION?=d5507e33ae7ee2b7b41475f08044d3bde3b839ee
 LLAMA_REPO?=https://github.com/ikawrakow/ik_llama.cpp
 
 CMAKE_ARGS?=

From 3c63431e467f99d7e544215fe59597cf4a69c23b Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Thu, 25 Jun 2026 00:57:58 +0200
Subject: [PATCH 84/99] chore: :arrow_up: Update ServeurpersoCom/omnivoice.cpp
 to `0f37401bebe9b20c0160a888e592108fc1d17607` (#10492)

:arrow_up: Update ServeurpersoCom/omnivoice.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
---
 backend/go/omnivoice-cpp/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/go/omnivoice-cpp/Makefile b/backend/go/omnivoice-cpp/Makefile
index b42610aac..c245acf58 100644
--- a/backend/go/omnivoice-cpp/Makefile
+++ b/backend/go/omnivoice-cpp/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
 
 # omnivoice.cpp version
 OMNIVOICE_REPO?=https://github.com/ServeurpersoCom/omnivoice.cpp
-OMNIVOICE_VERSION?=96d30169afd5e6bb3fd6a0e9be0eb505bfe81fcd
+OMNIVOICE_VERSION?=0f37401bebe9b20c0160a888e592108fc1d17607
 SO_TARGET?=libgomnivoicecpp.so
 
 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF

From c678530cf0e3ff273678841cd24dbc73a072ab2b Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Thu, 25 Jun 2026 08:09:18 +0200
Subject: [PATCH 85/99] fix(backends): darwin/metal support across purego Go
 backends (#10481)

* fix(parakeet-cpp): darwin/metal support (libparakeet.dylib + DYLD path)

The parakeet-cpp backend had no macOS support and panicked at startup on
Apple/Metal nodes when purego.Dlopen could not find "libparakeet.so".
Fix it across the same four layers the sibling voxtral backend already
handles correctly:

- main.go: default the dlopen target to libparakeet.dylib on darwin
  (runtime.GOOS), libparakeet.so elsewhere; PARAKEET_LIBRARY still wins.
- Makefile: also stage the built libparakeet.dylib next to the Go sources.
- package.sh: accept either the Linux .so[.X.Y] or the macOS .dylib when
  bundling instead of hard-failing when no .so is present (the macOS case);
  note that on Darwin only system frameworks are linked.
- run.sh: on Darwin set DYLD_LIBRARY_PATH and PARAKEET_LIBRARY to the
  packaged .dylib; keep LD_LIBRARY_PATH + .so on Linux.

Mirrors backend/go/voxtral.

Assisted-by: Claude:claude-opus-4-8
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix(backends): darwin/metal support across purego Go backends

The parakeet-cpp fix in the previous commit was an instance of a bug
shared by nearly every purego/dlopen Go backend: the dlopen target was
hardcoded to a .so name and run.sh exported only LD_LIBRARY_PATH, so the
backend panicked at startup on macOS/Apple-Metal nodes (dyld needs the
.dylib name and DYLD_LIBRARY_PATH). voxtral was the only backend handling
this correctly.

Apply the same four-layer fix (mirroring backend/go/voxtral) to the
remaining affected backends:

  whisper, sherpa-onnx, ced, stablediffusion-ggml, vibevoice-cpp,
  qwen3-tts-cpp, omnivoice-cpp, crispasr, acestep-cpp, locate-anything-cpp,
  depth-anything-cpp, rfdetr-cpp, sam3-cpp, localvqe

Per backend:
- main.go (sherpa-onnx: backend.go, two libraries): default the dlopen
  target to the .dylib on darwin (runtime.GOOS), .so elsewhere; the
  existing <BACKEND>_LIBRARY env override still wins.
- run.sh: on Darwin set DYLD_LIBRARY_PATH and point <BACKEND>_LIBRARY at
  the packaged .dylib; keep LD_LIBRARY_PATH + the Linux CPU-variant
  (avx/avx2/avx512) selection unchanged in the else branch.
- package.sh: also bundle the .dylib and stop hard-failing when no .so is
  present (the macOS case).
- Makefile: also stage the built .dylib.

Notes:
- stablediffusion-ggml and acestep-cpp build their lib as a CMake MODULE,
  which emits .so (not .dylib) on macOS; run.sh prefers .dylib and falls
  back to .so so both layouts work.
- sherpa-onnx was already partly darwin-aware (Makefile/package.sh); only
  run.sh and the two dlopen defaults needed fixing.

Linux behavior is unchanged. Verified gofmt-clean and
`CGO_ENABLED=0 go build` for every backend.

Assisted-by: Claude:claude-opus-4-8
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
---
 backend/go/acestep-cpp/Makefile            |  3 ++-
 backend/go/acestep-cpp/main.go             |  7 ++++++-
 backend/go/acestep-cpp/package.sh          |  1 +
 backend/go/acestep-cpp/run.sh              | 17 ++++++++++++++---
 backend/go/ced/Makefile                    |  1 +
 backend/go/ced/main.go                     |  7 ++++++-
 backend/go/ced/package.sh                  |  8 +++++---
 backend/go/ced/run.sh                      |  7 ++++++-
 backend/go/crispasr/Makefile               | 15 ++++++++++++---
 backend/go/crispasr/main.go                |  7 ++++++-
 backend/go/crispasr/package.sh             |  3 ++-
 backend/go/crispasr/run.sh                 | 11 ++++++++---
 backend/go/depth-anything-cpp/Makefile     | 15 ++++++++++++---
 backend/go/depth-anything-cpp/main.go      |  7 ++++++-
 backend/go/depth-anything-cpp/package.sh   |  3 ++-
 backend/go/depth-anything-cpp/run.sh       | 11 ++++++++---
 backend/go/localvqe/Makefile               |  3 ++-
 backend/go/localvqe/main.go                |  7 ++++++-
 backend/go/localvqe/package.sh             |  2 ++
 backend/go/localvqe/run.sh                 | 15 +++++++++++++--
 backend/go/locate-anything-cpp/Makefile    | 15 ++++++++++++---
 backend/go/locate-anything-cpp/main.go     |  7 ++++++-
 backend/go/locate-anything-cpp/package.sh  |  3 ++-
 backend/go/locate-anything-cpp/run.sh      | 11 ++++++++---
 backend/go/omnivoice-cpp/Makefile          | 14 +++++++++++---
 backend/go/omnivoice-cpp/main.go           |  7 ++++++-
 backend/go/omnivoice-cpp/package.sh        |  3 ++-
 backend/go/omnivoice-cpp/run.sh            | 11 ++++++++---
 backend/go/parakeet-cpp/Makefile           |  1 +
 backend/go/parakeet-cpp/main.go            | 18 ++++++++++++------
 backend/go/parakeet-cpp/package.sh         | 15 +++++++++------
 backend/go/parakeet-cpp/run.sh             | 10 ++++++++--
 backend/go/qwen3-tts-cpp/Makefile          | 15 +++++++++++----
 backend/go/qwen3-tts-cpp/main.go           |  7 ++++++-
 backend/go/qwen3-tts-cpp/package.sh        |  3 ++-
 backend/go/qwen3-tts-cpp/run.sh            | 11 ++++++++---
 backend/go/rfdetr-cpp/Makefile             | 15 ++++++++++++---
 backend/go/rfdetr-cpp/main.go              |  7 ++++++-
 backend/go/rfdetr-cpp/package.sh           |  3 ++-
 backend/go/rfdetr-cpp/run.sh               | 11 ++++++++---
 backend/go/sam3-cpp/Makefile               | 15 ++++++++++++---
 backend/go/sam3-cpp/main.go                |  7 ++++++-
 backend/go/sam3-cpp/package.sh             |  3 ++-
 backend/go/sam3-cpp/run.sh                 | 11 ++++++++---
 backend/go/sherpa-onnx/backend.go          | 13 +++++++++++--
 backend/go/sherpa-onnx/run.sh              |  8 +++++++-
 backend/go/stablediffusion-ggml/Makefile   |  3 ++-
 backend/go/stablediffusion-ggml/main.go    |  7 ++++++-
 backend/go/stablediffusion-ggml/package.sh |  1 +
 backend/go/stablediffusion-ggml/run.sh     | 16 +++++++++++++---
 backend/go/vibevoice-cpp/Makefile          | 16 ++++++++++++----
 backend/go/vibevoice-cpp/main.go           |  7 ++++++-
 backend/go/vibevoice-cpp/package.sh        |  3 ++-
 backend/go/vibevoice-cpp/run.sh            | 11 ++++++++---
 backend/go/whisper/Makefile                |  3 ++-
 backend/go/whisper/main.go                 |  7 ++++++-
 backend/go/whisper/package.sh              |  3 ++-
 backend/go/whisper/run.sh                  | 11 ++++++++---
 58 files changed, 374 insertions(+), 108 deletions(-)

diff --git a/backend/go/acestep-cpp/Makefile b/backend/go/acestep-cpp/Makefile
index 0b1929b94..3332ce1b6 100644
--- a/backend/go/acestep-cpp/Makefile
+++ b/backend/go/acestep-cpp/Makefile
@@ -117,7 +117,8 @@ libgoacestepcpp-custom: CMakeLists.txt cpp/goacestepcpp.cpp cpp/goacestepcpp.h
 	cmake .. $(CMAKE_ARGS) && \
 	cmake --build . --config Release -j$(JOBS) --target goacestepcpp && \
 	cd .. && \
-	mv build-$(SO_TARGET)/libgoacestepcpp.so ./$(SO_TARGET)
+	(mv build-$(SO_TARGET)/libgoacestepcpp.so ./$(SO_TARGET) 2>/dev/null || \
+	 mv build-$(SO_TARGET)/libgoacestepcpp.dylib ./$(SO_TARGET) 2>/dev/null)
 
 test: acestep-cpp
 	@echo "Running acestep-cpp tests..."
diff --git a/backend/go/acestep-cpp/main.go b/backend/go/acestep-cpp/main.go
index c65afb335..e4c1378b8 100644
--- a/backend/go/acestep-cpp/main.go
+++ b/backend/go/acestep-cpp/main.go
@@ -4,6 +4,7 @@ package main
 import (
 	"flag"
 	"os"
+	"runtime"
 
 	"github.com/ebitengine/purego"
 	grpc "github.com/mudler/LocalAI/pkg/grpc"
@@ -22,7 +23,11 @@ func main() {
 	// Get library name from environment variable, default to fallback
 	libName := os.Getenv("ACESTEP_LIBRARY")
 	if libName == "" {
-		libName = "./libgoacestepcpp-fallback.so"
+		if runtime.GOOS == "darwin" {
+			libName = "./libgoacestepcpp-fallback.dylib"
+		} else {
+			libName = "./libgoacestepcpp-fallback.so"
+		}
 	}
 
 	gosd, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
diff --git a/backend/go/acestep-cpp/package.sh b/backend/go/acestep-cpp/package.sh
index d922c5b86..5fecf3455 100755
--- a/backend/go/acestep-cpp/package.sh
+++ b/backend/go/acestep-cpp/package.sh
@@ -13,6 +13,7 @@ mkdir -p $CURDIR/package/lib
 
 cp -avf $CURDIR/acestep-cpp $CURDIR/package/
 cp -fv $CURDIR/libgoacestepcpp-*.so $CURDIR/package/
+cp -fv $CURDIR/libgoacestepcpp-*.dylib $CURDIR/package/ 2>/dev/null || true
 cp -fv $CURDIR/run.sh $CURDIR/package/
 
 # Detect architecture and copy appropriate libraries
diff --git a/backend/go/acestep-cpp/run.sh b/backend/go/acestep-cpp/run.sh
index d901e2c85..bcdfbc09e 100755
--- a/backend/go/acestep-cpp/run.sh
+++ b/backend/go/acestep-cpp/run.sh
@@ -12,9 +12,19 @@ if [ "$(uname)" != "Darwin" ]; then
 	grep -e "flags" /proc/cpuinfo | head -1
 fi
 
-LIBRARY="$CURDIR/libgoacestepcpp-fallback.so"
+if [ "$(uname)" = "Darwin" ]; then
+	# macOS: single library variant (Metal or Accelerate). The goacestepcpp
+	# target is built as a CMake MODULE, which emits a .dylib for a SHARED
+	# build but a .so for a MODULE build on Apple, so prefer .dylib and fall
+	# back to .so.
+	LIBRARY="$CURDIR/libgoacestepcpp-fallback.dylib"
+	if [ ! -e "$LIBRARY" ]; then
+		LIBRARY="$CURDIR/libgoacestepcpp-fallback.so"
+	fi
+	export DYLD_LIBRARY_PATH=$CURDIR/lib:$DYLD_LIBRARY_PATH
+else
+	LIBRARY="$CURDIR/libgoacestepcpp-fallback.so"
 
-if [ "$(uname)" != "Darwin" ]; then
 	if grep -q -e "\savx\s" /proc/cpuinfo ; then
 		echo "CPU:    AVX    found OK"
 		if [ -e $CURDIR/libgoacestepcpp-avx.so ]; then
@@ -36,9 +46,10 @@ if [ "$(uname)" != "Darwin" ]; then
 			LIBRARY="$CURDIR/libgoacestepcpp-avx512.so"
 		fi
 	fi
+
+	export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
 fi
 
-export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
 export ACESTEP_LIBRARY=$LIBRARY
 
 # If there is a lib/ld.so, use it
diff --git a/backend/go/ced/Makefile b/backend/go/ced/Makefile
index 632c0e255..2b15990ec 100644
--- a/backend/go/ced/Makefile
+++ b/backend/go/ced/Makefile
@@ -57,6 +57,7 @@ libced.so: sources/ced.cpp
 	cmake -B sources/ced.cpp/build-shared -S sources/ced.cpp $(CMAKE_ARGS)
 	cmake --build sources/ced.cpp/build-shared --config Release -j$(JOBS)
 	cp -fv sources/ced.cpp/build-shared/libced.so* ./ 2>/dev/null || true
+	cp -fv sources/ced.cpp/build-shared/libced.dylib ./ 2>/dev/null || true
 	cp -fv sources/ced.cpp/include/ced_capi.h ./
 
 ced-grpc: libced.so main.go goced.go
diff --git a/backend/go/ced/main.go b/backend/go/ced/main.go
index ea8aa8549..b6c93a9f9 100644
--- a/backend/go/ced/main.go
+++ b/backend/go/ced/main.go
@@ -12,6 +12,7 @@ import (
 	"flag"
 	"fmt"
 	"os"
+	"runtime"
 
 	"github.com/ebitengine/purego"
 	grpc "github.com/mudler/LocalAI/pkg/grpc"
@@ -27,7 +28,11 @@ type libFunc struct {
 func main() {
 	libName := os.Getenv("CED_LIBRARY")
 	if libName == "" {
-		libName = "libced.so"
+		if runtime.GOOS == "darwin" {
+			libName = "libced.dylib"
+		} else {
+			libName = "libced.so"
+		}
 	}
 	lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
 	if err != nil {
diff --git a/backend/go/ced/package.sh b/backend/go/ced/package.sh
index bde0adad6..ff20d727f 100755
--- a/backend/go/ced/package.sh
+++ b/backend/go/ced/package.sh
@@ -15,10 +15,12 @@ mkdir -p "$CURDIR/package/lib"
 cp -avf "$CURDIR/ced-grpc" "$CURDIR/package/"
 cp -avf "$CURDIR/run.sh" "$CURDIR/package/"
 
-cp -avf "$CURDIR"/libced.so* "$CURDIR/package/lib/" 2>/dev/null || {
-	echo "ERROR: libced.so not found in $CURDIR, run 'make' first" >&2
+cp -avf "$CURDIR"/libced.so* "$CURDIR/package/lib/" 2>/dev/null || true
+cp -avf "$CURDIR"/libced.dylib "$CURDIR/package/lib/" 2>/dev/null || true
+if ! ls "$CURDIR"/package/lib/libced.* >/dev/null 2>&1; then
+	echo "ERROR: libced shared library not found in $CURDIR, run 'make' first" >&2
 	exit 1
-}
+fi
 
 if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
     echo "Detected x86_64 architecture, copying x86_64 libraries..."
diff --git a/backend/go/ced/run.sh b/backend/go/ced/run.sh
index bce6fec8e..1f95f748f 100755
--- a/backend/go/ced/run.sh
+++ b/backend/go/ced/run.sh
@@ -3,7 +3,12 @@ set -e
 
 CURDIR=$(dirname "$(realpath "$0")")
 
-export LD_LIBRARY_PATH="$CURDIR/lib:$CURDIR:${LD_LIBRARY_PATH:-}"
+if [ "$(uname)" = "Darwin" ]; then
+	export DYLD_LIBRARY_PATH="$CURDIR/lib:$CURDIR:${DYLD_LIBRARY_PATH:-}"
+	export CED_LIBRARY="$CURDIR/lib/libced.dylib"
+else
+	export LD_LIBRARY_PATH="$CURDIR/lib:$CURDIR:${LD_LIBRARY_PATH:-}"
+fi
 
 # If a self-contained ld.so was packaged, route through it so the packaged
 # libc / libstdc++ are used instead of the host's (matches the sibling backends).
diff --git a/backend/go/crispasr/Makefile b/backend/go/crispasr/Makefile
index ba55b485e..1b32240e3 100644
--- a/backend/go/crispasr/Makefile
+++ b/backend/go/crispasr/Makefile
@@ -75,7 +75,8 @@ UNAME_S := $(shell uname -s)
 ifeq ($(UNAME_S),Linux)
 	VARIANT_TARGETS = libgocrispasr-avx.so libgocrispasr-avx2.so libgocrispasr-avx512.so libgocrispasr-fallback.so
 else
-	VARIANT_TARGETS = libgocrispasr-fallback.so
+	# On non-Linux (e.g., Darwin), build only fallback variant (as a dylib)
+	VARIANT_TARGETS = libgocrispasr-fallback.dylib
 endif
 
 crispasr: main.go gocrispasr.go $(VARIANT_TARGETS)
@@ -87,7 +88,7 @@ package: crispasr
 build: package
 
 clean: purge
-	rm -rf libgocrispasr*.so package sources/CrispASR crispasr
+	rm -rf libgocrispasr*.so libgocrispasr*.dylib package sources/CrispASR crispasr
 
 purge:
 	rm -rf build*
@@ -118,13 +119,21 @@ libgocrispasr-fallback.so: sources/CrispASR
 	SO_TARGET=libgocrispasr-fallback.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) libgocrispasr-custom
 	rm -rfv build*
 
+# Build fallback variant as a dylib (Darwin)
+libgocrispasr-fallback.dylib: sources/CrispASR
+	$(MAKE) purge
+	$(info ${GREEN}I crispasr build info:fallback (dylib)${RESET})
+	SO_TARGET=libgocrispasr-fallback.dylib CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) libgocrispasr-custom
+	rm -rfv build*
+
 libgocrispasr-custom: CMakeLists.txt cpp/crispasr_shim.cpp cpp/crispasr_shim.h
 	mkdir -p build-$(SO_TARGET) && \
 	cd build-$(SO_TARGET) && \
 	cmake .. $(CMAKE_ARGS) && \
 	cmake --build . --config Release -j$(JOBS) && \
 	cd .. && \
-	mv build-$(SO_TARGET)/libgocrispasr.so ./$(SO_TARGET)
+	(mv build-$(SO_TARGET)/libgocrispasr.so ./$(SO_TARGET) 2>/dev/null || \
+	 mv build-$(SO_TARGET)/libgocrispasr.dylib ./$(SO_TARGET) 2>/dev/null)
 
 test: crispasr
 	CGO_ENABLED=0 $(GOCMD) test -v ./...
diff --git a/backend/go/crispasr/main.go b/backend/go/crispasr/main.go
index 9f3ef14d0..a1f132cc5 100644
--- a/backend/go/crispasr/main.go
+++ b/backend/go/crispasr/main.go
@@ -4,6 +4,7 @@ package main
 import (
 	"flag"
 	"os"
+	"runtime"
 
 	"github.com/ebitengine/purego"
 	grpc "github.com/mudler/LocalAI/pkg/grpc"
@@ -21,7 +22,11 @@ type LibFuncs struct {
 func main() {
 	libName := os.Getenv("CRISPASR_LIBRARY")
 	if libName == "" {
-		libName = "./libgocrispasr-fallback.so"
+		if runtime.GOOS == "darwin" {
+			libName = "./libgocrispasr-fallback.dylib"
+		} else {
+			libName = "./libgocrispasr-fallback.so"
+		}
 	}
 
 	lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
diff --git a/backend/go/crispasr/package.sh b/backend/go/crispasr/package.sh
index baee12944..9b89dad1b 100755
--- a/backend/go/crispasr/package.sh
+++ b/backend/go/crispasr/package.sh
@@ -12,7 +12,8 @@ REPO_ROOT="${CURDIR}/../../.."
 mkdir -p $CURDIR/package/lib
 
 cp -avf $CURDIR/crispasr $CURDIR/package/
-cp -fv $CURDIR/libgocrispasr-*.so $CURDIR/package/
+cp -fv $CURDIR/libgocrispasr-*.so $CURDIR/package/ 2>/dev/null || true
+cp -fv $CURDIR/libgocrispasr-*.dylib $CURDIR/package/ 2>/dev/null || true
 cp -fv $CURDIR/run.sh $CURDIR/package/
 
 # Detect architecture and copy appropriate libraries
diff --git a/backend/go/crispasr/run.sh b/backend/go/crispasr/run.sh
index ccb264833..6d3c4b216 100755
--- a/backend/go/crispasr/run.sh
+++ b/backend/go/crispasr/run.sh
@@ -12,9 +12,13 @@ if [ "$(uname)" != "Darwin" ]; then
 	grep -e "flags" /proc/cpuinfo | head -1
 fi
 
-LIBRARY="$CURDIR/libgocrispasr-fallback.so"
+if [ "$(uname)" = "Darwin" ]; then
+	# macOS: single dylib variant (Metal or Accelerate)
+	LIBRARY="$CURDIR/libgocrispasr-fallback.dylib"
+	export DYLD_LIBRARY_PATH=$CURDIR/lib:$DYLD_LIBRARY_PATH
+else
+	LIBRARY="$CURDIR/libgocrispasr-fallback.so"
 
-if [ "$(uname)" != "Darwin" ]; then
 	if grep -q -e "\savx\s" /proc/cpuinfo ; then
 		echo "CPU:    AVX    found OK"
 		if [ -e $CURDIR/libgocrispasr-avx.so ]; then
@@ -36,9 +40,10 @@ if [ "$(uname)" != "Darwin" ]; then
 			LIBRARY="$CURDIR/libgocrispasr-avx512.so"
 		fi
 	fi
+
+	export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
 fi
 
-export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
 export CRISPASR_LIBRARY=$LIBRARY
 
 # Point piper's espeak-ng phonemizer at the bundled voice data. The variable
diff --git a/backend/go/depth-anything-cpp/Makefile b/backend/go/depth-anything-cpp/Makefile
index f1a0b9f97..efe99a626 100644
--- a/backend/go/depth-anything-cpp/Makefile
+++ b/backend/go/depth-anything-cpp/Makefile
@@ -77,7 +77,7 @@ ifeq ($(UNAME_S),Linux)
 	VARIANT_TARGETS = libdepthanythingcpp-avx.so libdepthanythingcpp-avx2.so libdepthanythingcpp-avx512.so libdepthanythingcpp-fallback.so
 else
 	# On non-Linux (e.g., Darwin), build only fallback variant
-	VARIANT_TARGETS = libdepthanythingcpp-fallback.so
+	VARIANT_TARGETS = libdepthanythingcpp-fallback.dylib
 endif
 
 depth-anything-cpp: main.go godepthanythingcpp.go $(VARIANT_TARGETS)
@@ -89,7 +89,7 @@ package: depth-anything-cpp
 build: package
 
 clean: purge
-	rm -rf libdepthanythingcpp*.so depth-anything-cpp package sources
+	rm -rf libdepthanythingcpp*.so libdepthanythingcpp*.dylib depth-anything-cpp package sources
 
 purge:
 	rm -rf build*
@@ -116,11 +116,19 @@ libdepthanythingcpp-avx512.so: sources/depth-anything.cpp
 endif
 
 # Build fallback variant (all platforms)
+ifeq ($(UNAME_S),Darwin)
+libdepthanythingcpp-fallback.dylib: sources/depth-anything.cpp
+	rm -rfv build-$@
+	$(info ${GREEN}I depth-anything-cpp build info:fallback${RESET})
+	SO_TARGET=$@ CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) libdepthanythingcpp-custom
+	rm -rfv build-$@
+else
 libdepthanythingcpp-fallback.so: sources/depth-anything.cpp
 	rm -rfv build-$@
 	$(info ${GREEN}I depth-anything-cpp build info:fallback${RESET})
 	SO_TARGET=$@ CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) libdepthanythingcpp-custom
 	rm -rfv build-$@
+endif
 
 libdepthanythingcpp-custom: CMakeLists.txt
 	mkdir -p build-$(SO_TARGET) && \
@@ -128,7 +136,8 @@ libdepthanythingcpp-custom: CMakeLists.txt
 	cmake .. $(CMAKE_ARGS) && \
 	cmake --build . --config Release -j$(JOBS) && \
 	cd .. && \
-	mv build-$(SO_TARGET)/libdepthanything.so ./$(SO_TARGET)
+	(mv build-$(SO_TARGET)/libdepthanything.so ./$(SO_TARGET) 2>/dev/null || \
+	 mv build-$(SO_TARGET)/libdepthanything.dylib ./$(SO_TARGET) 2>/dev/null)
 
 all: depth-anything-cpp package
 
diff --git a/backend/go/depth-anything-cpp/main.go b/backend/go/depth-anything-cpp/main.go
index 4c4546797..cfad88b23 100644
--- a/backend/go/depth-anything-cpp/main.go
+++ b/backend/go/depth-anything-cpp/main.go
@@ -9,6 +9,7 @@ package main
 import (
 	"flag"
 	"os"
+	"runtime"
 
 	"github.com/ebitengine/purego"
 	grpc "github.com/mudler/LocalAI/pkg/grpc"
@@ -27,7 +28,11 @@ func main() {
 	// Get library name from environment variable, default to fallback
 	libName := os.Getenv("DEPTHANYTHING_LIBRARY")
 	if libName == "" {
-		libName = "./libdepthanythingcpp-fallback.so"
+		if runtime.GOOS == "darwin" {
+			libName = "./libdepthanythingcpp-fallback.dylib"
+		} else {
+			libName = "./libdepthanythingcpp-fallback.so"
+		}
 	}
 
 	lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
diff --git a/backend/go/depth-anything-cpp/package.sh b/backend/go/depth-anything-cpp/package.sh
index 4690555ea..5bbd5559b 100755
--- a/backend/go/depth-anything-cpp/package.sh
+++ b/backend/go/depth-anything-cpp/package.sh
@@ -10,7 +10,8 @@ REPO_ROOT="${CURDIR}/../../.."
 # Create lib directory
 mkdir -p $CURDIR/package/lib
 
-cp -avf $CURDIR/libdepthanythingcpp-*.so $CURDIR/package/
+cp -fv $CURDIR/libdepthanythingcpp-*.so $CURDIR/package/ 2>/dev/null || true
+cp -fv $CURDIR/libdepthanythingcpp-*.dylib $CURDIR/package/ 2>/dev/null || true
 cp -avf $CURDIR/depth-anything-cpp $CURDIR/package/
 cp -fv $CURDIR/run.sh $CURDIR/package/
 
diff --git a/backend/go/depth-anything-cpp/run.sh b/backend/go/depth-anything-cpp/run.sh
index 984aa5849..cbff6b0b5 100755
--- a/backend/go/depth-anything-cpp/run.sh
+++ b/backend/go/depth-anything-cpp/run.sh
@@ -12,9 +12,13 @@ if [ "$(uname)" != "Darwin" ]; then
 	grep -e "flags" /proc/cpuinfo | head -1
 fi
 
-LIBRARY="$CURDIR/libdepthanythingcpp-fallback.so"
+if [ "$(uname)" = "Darwin" ]; then
+	# macOS: single dylib variant (Metal or Accelerate)
+	LIBRARY="$CURDIR/libdepthanythingcpp-fallback.dylib"
+	export DYLD_LIBRARY_PATH=$CURDIR/lib:$DYLD_LIBRARY_PATH
+else
+	LIBRARY="$CURDIR/libdepthanythingcpp-fallback.so"
 
-if [ "$(uname)" != "Darwin" ]; then
 	if grep -q -e "\savx\s" /proc/cpuinfo ; then
 		echo "CPU:    AVX    found OK"
 		if [ -e $CURDIR/libdepthanythingcpp-avx.so ]; then
@@ -36,9 +40,10 @@ if [ "$(uname)" != "Darwin" ]; then
 			LIBRARY="$CURDIR/libdepthanythingcpp-avx512.so"
 		fi
 	fi
+
+	export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
 fi
 
-export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
 export DEPTHANYTHING_LIBRARY=$LIBRARY
 
 # If there is a lib/ld.so, use it
diff --git a/backend/go/localvqe/Makefile b/backend/go/localvqe/Makefile
index 7b66e9371..049da0cdd 100644
--- a/backend/go/localvqe/Makefile
+++ b/backend/go/localvqe/Makefile
@@ -67,8 +67,9 @@ $(LIB_SENTINEL): sources/LocalVQE
 	# that the loader picks at runtime. We must build every target — the
 	# default `--target localvqe_shared` drops these. CMAKE_LIBRARY_OUTPUT_DIRECTORY
 	# routes all of them into build/bin; copy them out next to the binary.
-	cp -P build/bin/liblocalvqe.so* . 2>/dev/null || cp -P build/liblocalvqe.so* .
+	cp -P build/bin/liblocalvqe.so* . 2>/dev/null || cp -P build/bin/liblocalvqe.dylib . 2>/dev/null || cp -P build/liblocalvqe.so* . 2>/dev/null || cp -P build/liblocalvqe.dylib .
 	cp -P build/bin/libggml*.so* . 2>/dev/null || true
+	cp -P build/bin/libggml*.dylib . 2>/dev/null || true
 	touch $(LIB_SENTINEL)
 
 liblocalvqe.so: $(LIB_SENTINEL)
diff --git a/backend/go/localvqe/main.go b/backend/go/localvqe/main.go
index 56ed2de2f..cbaa2a134 100644
--- a/backend/go/localvqe/main.go
+++ b/backend/go/localvqe/main.go
@@ -4,6 +4,7 @@ package main
 import (
 	"flag"
 	"os"
+	"runtime"
 
 	"github.com/ebitengine/purego"
 	grpc "github.com/mudler/LocalAI/pkg/grpc"
@@ -21,7 +22,11 @@ type LibFuncs struct {
 func main() {
 	libName := os.Getenv("LOCALVQE_LIBRARY")
 	if libName == "" {
-		libName = "./liblocalvqe.so"
+		if runtime.GOOS == "darwin" {
+			libName = "./liblocalvqe.dylib"
+		} else {
+			libName = "./liblocalvqe.so"
+		}
 	}
 
 	lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
diff --git a/backend/go/localvqe/package.sh b/backend/go/localvqe/package.sh
index ca8dfd3ab..9f9f2533d 100755
--- a/backend/go/localvqe/package.sh
+++ b/backend/go/localvqe/package.sh
@@ -15,7 +15,9 @@ cp -avf $CURDIR/localvqe $CURDIR/package/
 # liblocalvqe.so* (with SOVERSION symlinks) and the libggml-*.so runtime
 # variants — LocalVQE picks the matching CPU variant at load time.
 cp -P $CURDIR/liblocalvqe.so* $CURDIR/package/ 2>/dev/null || true
+cp -P $CURDIR/liblocalvqe.dylib $CURDIR/package/ 2>/dev/null || true
 cp -P $CURDIR/libggml*.so* $CURDIR/package/ 2>/dev/null || true
+cp -P $CURDIR/libggml*.dylib $CURDIR/package/ 2>/dev/null || true
 cp -fv $CURDIR/run.sh $CURDIR/package/
 
 # Detect architecture and copy appropriate libraries
diff --git a/backend/go/localvqe/run.sh b/backend/go/localvqe/run.sh
index 0f3192e31..d14d427c4 100755
--- a/backend/go/localvqe/run.sh
+++ b/backend/go/localvqe/run.sh
@@ -10,8 +10,19 @@ CURDIR=$(dirname "$(realpath $0)")
 # exec'ing the binary.
 cd "$CURDIR"
 
-export LD_LIBRARY_PATH=$CURDIR:$CURDIR/lib:$LD_LIBRARY_PATH
-export LOCALVQE_LIBRARY=$CURDIR/liblocalvqe.so
+if [ "$(uname)" = "Darwin" ]; then
+	# macOS: LocalVQE is built as a SHARED library, so dyld needs the .dylib +
+	# DYLD_LIBRARY_PATH. Prefer .dylib and fall back to .so just in case.
+	export DYLD_LIBRARY_PATH=$CURDIR:$CURDIR/lib:$DYLD_LIBRARY_PATH
+	LOCALVQE_LIBRARY=$CURDIR/liblocalvqe.dylib
+	if [ ! -e "$LOCALVQE_LIBRARY" ]; then
+		LOCALVQE_LIBRARY=$CURDIR/liblocalvqe.so
+	fi
+	export LOCALVQE_LIBRARY
+else
+	export LD_LIBRARY_PATH=$CURDIR:$CURDIR/lib:$LD_LIBRARY_PATH
+	export LOCALVQE_LIBRARY=$CURDIR/liblocalvqe.so
+fi
 
 if [ -f $CURDIR/lib/ld.so ]; then
 	echo "Using lib/ld.so"
diff --git a/backend/go/locate-anything-cpp/Makefile b/backend/go/locate-anything-cpp/Makefile
index 91dbc41c2..ba12c7195 100644
--- a/backend/go/locate-anything-cpp/Makefile
+++ b/backend/go/locate-anything-cpp/Makefile
@@ -70,7 +70,7 @@ ifeq ($(UNAME_S),Linux)
 	VARIANT_TARGETS = liblocateanythingcpp-avx.so liblocateanythingcpp-avx2.so liblocateanythingcpp-avx512.so liblocateanythingcpp-fallback.so
 else
 	# On non-Linux (e.g., Darwin), build only fallback variant
-	VARIANT_TARGETS = liblocateanythingcpp-fallback.so
+	VARIANT_TARGETS = liblocateanythingcpp-fallback.dylib
 endif
 
 locate-anything-cpp: main.go golocateanythingcpp.go $(VARIANT_TARGETS)
@@ -82,7 +82,7 @@ package: locate-anything-cpp
 build: package
 
 clean: purge
-	rm -rf liblocateanythingcpp*.so locate-anything-cpp package sources
+	rm -rf liblocateanythingcpp*.so liblocateanythingcpp*.dylib locate-anything-cpp package sources
 
 purge:
 	rm -rf build*
@@ -109,11 +109,19 @@ liblocateanythingcpp-avx512.so: sources/locate-anything.cpp
 endif
 
 # Build fallback variant (all platforms)
+ifeq ($(UNAME_S),Darwin)
+liblocateanythingcpp-fallback.dylib: sources/locate-anything.cpp
+	rm -rfv build-$@
+	$(info ${GREEN}I locate-anything-cpp build info:fallback${RESET})
+	SO_TARGET=$@ CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) liblocateanythingcpp-custom
+	rm -rfv build-$@
+else
 liblocateanythingcpp-fallback.so: sources/locate-anything.cpp
 	rm -rfv build-$@
 	$(info ${GREEN}I locate-anything-cpp build info:fallback${RESET})
 	SO_TARGET=$@ CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) liblocateanythingcpp-custom
 	rm -rfv build-$@
+endif
 
 liblocateanythingcpp-custom: CMakeLists.txt
 	mkdir -p build-$(SO_TARGET) && \
@@ -121,7 +129,8 @@ liblocateanythingcpp-custom: CMakeLists.txt
 	cmake .. $(CMAKE_ARGS) && \
 	cmake --build . --config Release -j$(JOBS) && \
 	cd .. && \
-	mv build-$(SO_TARGET)/liblocateanythingcpp.so ./$(SO_TARGET)
+	(mv build-$(SO_TARGET)/liblocateanythingcpp.so ./$(SO_TARGET) 2>/dev/null || \
+	 mv build-$(SO_TARGET)/liblocateanythingcpp.dylib ./$(SO_TARGET) 2>/dev/null)
 
 all: locate-anything-cpp package
 
diff --git a/backend/go/locate-anything-cpp/main.go b/backend/go/locate-anything-cpp/main.go
index 91ccaf38e..77e53bb95 100644
--- a/backend/go/locate-anything-cpp/main.go
+++ b/backend/go/locate-anything-cpp/main.go
@@ -9,6 +9,7 @@ package main
 import (
 	"flag"
 	"os"
+	"runtime"
 
 	"github.com/ebitengine/purego"
 	grpc "github.com/mudler/LocalAI/pkg/grpc"
@@ -27,7 +28,11 @@ func main() {
 	// Get library name from environment variable, default to fallback
 	libName := os.Getenv("LOCATEANYTHING_LIBRARY")
 	if libName == "" {
-		libName = "./liblocateanythingcpp-fallback.so"
+		if runtime.GOOS == "darwin" {
+			libName = "./liblocateanythingcpp-fallback.dylib"
+		} else {
+			libName = "./liblocateanythingcpp-fallback.so"
+		}
 	}
 
 	lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
diff --git a/backend/go/locate-anything-cpp/package.sh b/backend/go/locate-anything-cpp/package.sh
index 3b1f13428..1e6cbee80 100755
--- a/backend/go/locate-anything-cpp/package.sh
+++ b/backend/go/locate-anything-cpp/package.sh
@@ -10,7 +10,8 @@ REPO_ROOT="${CURDIR}/../../.."
 # Create lib directory
 mkdir -p $CURDIR/package/lib
 
-cp -avf $CURDIR/liblocateanythingcpp-*.so $CURDIR/package/
+cp -fv $CURDIR/liblocateanythingcpp-*.so $CURDIR/package/ 2>/dev/null || true
+cp -fv $CURDIR/liblocateanythingcpp-*.dylib $CURDIR/package/ 2>/dev/null || true
 cp -avf $CURDIR/locate-anything-cpp $CURDIR/package/
 cp -fv $CURDIR/run.sh $CURDIR/package/
 
diff --git a/backend/go/locate-anything-cpp/run.sh b/backend/go/locate-anything-cpp/run.sh
index cefbff629..4eebb3c63 100755
--- a/backend/go/locate-anything-cpp/run.sh
+++ b/backend/go/locate-anything-cpp/run.sh
@@ -12,9 +12,13 @@ if [ "$(uname)" != "Darwin" ]; then
 	grep -e "flags" /proc/cpuinfo | head -1
 fi
 
-LIBRARY="$CURDIR/liblocateanythingcpp-fallback.so"
+if [ "$(uname)" = "Darwin" ]; then
+	# macOS: single dylib variant (Metal or Accelerate)
+	LIBRARY="$CURDIR/liblocateanythingcpp-fallback.dylib"
+	export DYLD_LIBRARY_PATH=$CURDIR/lib:$DYLD_LIBRARY_PATH
+else
+	LIBRARY="$CURDIR/liblocateanythingcpp-fallback.so"
 
-if [ "$(uname)" != "Darwin" ]; then
 	if grep -q -e "\savx\s" /proc/cpuinfo ; then
 		echo "CPU:    AVX    found OK"
 		if [ -e $CURDIR/liblocateanythingcpp-avx.so ]; then
@@ -36,9 +40,10 @@ if [ "$(uname)" != "Darwin" ]; then
 			LIBRARY="$CURDIR/liblocateanythingcpp-avx512.so"
 		fi
 	fi
+
+	export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
 fi
 
-export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
 export LOCATEANYTHING_LIBRARY=$LIBRARY
 
 # If there is a lib/ld.so, use it
diff --git a/backend/go/omnivoice-cpp/Makefile b/backend/go/omnivoice-cpp/Makefile
index c245acf58..36b447b13 100644
--- a/backend/go/omnivoice-cpp/Makefile
+++ b/backend/go/omnivoice-cpp/Makefile
@@ -65,7 +65,8 @@ UNAME_S := $(shell uname -s)
 ifeq ($(UNAME_S),Linux)
 	VARIANT_TARGETS = libgomnivoicecpp-avx.so libgomnivoicecpp-avx2.so libgomnivoicecpp-avx512.so libgomnivoicecpp-fallback.so
 else
-	VARIANT_TARGETS = libgomnivoicecpp-fallback.so
+	# On non-Linux (e.g., Darwin), build only fallback variant (as a dylib)
+	VARIANT_TARGETS = libgomnivoicecpp-fallback.dylib
 endif
 
 omnivoice-cpp: main.go gomnivoicecpp.go $(VARIANT_TARGETS)
@@ -77,7 +78,7 @@ package: omnivoice-cpp
 build: package
 
 clean: purge
-	rm -rf libgomnivoicecpp*.so package sources/omnivoice.cpp omnivoice-cpp
+	rm -rf libgomnivoicecpp*.so libgomnivoicecpp*.dylib package sources/omnivoice.cpp omnivoice-cpp
 
 purge:
 	rm -rf build*
@@ -106,13 +107,20 @@ libgomnivoicecpp-fallback.so: sources/omnivoice.cpp
 	SO_TARGET=libgomnivoicecpp-fallback.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) libgomnivoicecpp-custom
 	rm -rf build-libgomnivoicecpp-fallback.so
 
+# Build fallback variant as a dylib (Darwin)
+libgomnivoicecpp-fallback.dylib: sources/omnivoice.cpp
+	$(info ${GREEN}I omnivoice-cpp build info:fallback (dylib)${RESET})
+	SO_TARGET=libgomnivoicecpp-fallback.dylib CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) libgomnivoicecpp-custom
+	rm -rf build-libgomnivoicecpp-fallback.dylib
+
 libgomnivoicecpp-custom: CMakeLists.txt cpp/gomnivoicecpp.cpp cpp/gomnivoicecpp.h
 	mkdir -p build-$(SO_TARGET) && \
 	cd build-$(SO_TARGET) && \
 	cmake .. $(CMAKE_ARGS) && \
 	cmake --build . --config Release -j$(JOBS) --target gomnivoicecpp && \
 	cd .. && \
-	mv build-$(SO_TARGET)/libgomnivoicecpp.so ./$(SO_TARGET)
+	(mv build-$(SO_TARGET)/libgomnivoicecpp.so ./$(SO_TARGET) 2>/dev/null || \
+	 mv build-$(SO_TARGET)/libgomnivoicecpp.dylib ./$(SO_TARGET) 2>/dev/null)
 
 test: omnivoice-cpp
 	@echo "Running omnivoice-cpp tests..."
diff --git a/backend/go/omnivoice-cpp/main.go b/backend/go/omnivoice-cpp/main.go
index 891201f49..f44eb31a7 100644
--- a/backend/go/omnivoice-cpp/main.go
+++ b/backend/go/omnivoice-cpp/main.go
@@ -4,6 +4,7 @@ package main
 import (
 	"flag"
 	"os"
+	"runtime"
 
 	"github.com/ebitengine/purego"
 	grpc "github.com/mudler/LocalAI/pkg/grpc"
@@ -21,7 +22,11 @@ type LibFuncs struct {
 func main() {
 	libName := os.Getenv("OMNIVOICE_LIBRARY")
 	if libName == "" {
-		libName = "./libgomnivoicecpp-fallback.so"
+		if runtime.GOOS == "darwin" {
+			libName = "./libgomnivoicecpp-fallback.dylib"
+		} else {
+			libName = "./libgomnivoicecpp-fallback.so"
+		}
 	}
 
 	lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
diff --git a/backend/go/omnivoice-cpp/package.sh b/backend/go/omnivoice-cpp/package.sh
index b8313d9d7..97a8d7809 100755
--- a/backend/go/omnivoice-cpp/package.sh
+++ b/backend/go/omnivoice-cpp/package.sh
@@ -12,7 +12,8 @@ REPO_ROOT="${CURDIR}/../../.."
 mkdir -p $CURDIR/package/lib
 
 cp -avf $CURDIR/omnivoice-cpp $CURDIR/package/
-cp -fv $CURDIR/libgomnivoicecpp-*.so $CURDIR/package/
+cp -fv $CURDIR/libgomnivoicecpp-*.so $CURDIR/package/ 2>/dev/null || true
+cp -fv $CURDIR/libgomnivoicecpp-*.dylib $CURDIR/package/ 2>/dev/null || true
 cp -fv $CURDIR/run.sh $CURDIR/package/
 
 # Detect architecture and copy appropriate libraries
diff --git a/backend/go/omnivoice-cpp/run.sh b/backend/go/omnivoice-cpp/run.sh
index f677ca21c..81ea2b719 100755
--- a/backend/go/omnivoice-cpp/run.sh
+++ b/backend/go/omnivoice-cpp/run.sh
@@ -12,9 +12,13 @@ if [ "$(uname)" != "Darwin" ]; then
 	grep -e "flags" /proc/cpuinfo | head -1
 fi
 
-LIBRARY="$CURDIR/libgomnivoicecpp-fallback.so"
+if [ "$(uname)" = "Darwin" ]; then
+	# macOS: single dylib variant (Metal or Accelerate)
+	LIBRARY="$CURDIR/libgomnivoicecpp-fallback.dylib"
+	export DYLD_LIBRARY_PATH=$CURDIR/lib:$DYLD_LIBRARY_PATH
+else
+	LIBRARY="$CURDIR/libgomnivoicecpp-fallback.so"
 
-if [ "$(uname)" != "Darwin" ]; then
 	if grep -q -e "\savx\s" /proc/cpuinfo ; then
 		echo "CPU:    AVX    found OK"
 		if [ -e $CURDIR/libgomnivoicecpp-avx.so ]; then
@@ -36,9 +40,10 @@ if [ "$(uname)" != "Darwin" ]; then
 			LIBRARY="$CURDIR/libgomnivoicecpp-avx512.so"
 		fi
 	fi
+
+	export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
 fi
 
-export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
 export OMNIVOICE_LIBRARY=$LIBRARY
 
 # If there is a lib/ld.so, use it
diff --git a/backend/go/parakeet-cpp/Makefile b/backend/go/parakeet-cpp/Makefile
index f9848dc34..7fc46f8e2 100644
--- a/backend/go/parakeet-cpp/Makefile
+++ b/backend/go/parakeet-cpp/Makefile
@@ -74,6 +74,7 @@ libparakeet.so: sources/parakeet.cpp
 	cmake -B sources/parakeet.cpp/build-shared -S sources/parakeet.cpp $(CMAKE_ARGS)
 	cmake --build sources/parakeet.cpp/build-shared --config Release -j$(JOBS)
 	cp -fv sources/parakeet.cpp/build-shared/libparakeet.so* ./ 2>/dev/null || true
+	cp -fv sources/parakeet.cpp/build-shared/libparakeet.dylib ./ 2>/dev/null || true
 	cp -fv sources/parakeet.cpp/include/parakeet_capi.h ./
 
 parakeet-cpp-grpc: libparakeet.so main.go goparakeetcpp.go
diff --git a/backend/go/parakeet-cpp/main.go b/backend/go/parakeet-cpp/main.go
index 963056e23..9c6466b13 100644
--- a/backend/go/parakeet-cpp/main.go
+++ b/backend/go/parakeet-cpp/main.go
@@ -2,15 +2,17 @@ package main
 
 // Started internally by LocalAI - one gRPC server per loaded model.
 //
-// Loads libparakeet.so via purego and registers the flat C-API entry
-// points declared in parakeet_capi.h. The library name can be overridden
-// with PARAKEET_LIBRARY (mirrors the WHISPER_LIBRARY / VIBEVOICECPP_LIBRARY
-// convention in the sibling backends); the default looks for the .so next
-// to this binary.
+// Loads the parakeet shared library via purego and registers the flat
+// C-API entry points declared in parakeet_capi.h. The library name can be
+// overridden with PARAKEET_LIBRARY (mirrors the WHISPER_LIBRARY /
+// VIBEVOICECPP_LIBRARY convention in the sibling backends); the default
+// looks next to this binary for libparakeet.so on Linux and
+// libparakeet.dylib on macOS.
 import (
 	"flag"
 	"fmt"
 	"os"
+	"runtime"
 
 	"github.com/ebitengine/purego"
 	grpc "github.com/mudler/LocalAI/pkg/grpc"
@@ -28,7 +30,11 @@ type LibFuncs struct {
 func main() {
 	libName := os.Getenv("PARAKEET_LIBRARY")
 	if libName == "" {
-		libName = "libparakeet.so"
+		if runtime.GOOS == "darwin" {
+			libName = "libparakeet.dylib"
+		} else {
+			libName = "libparakeet.so"
+		}
 	}
 
 	lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
diff --git a/backend/go/parakeet-cpp/package.sh b/backend/go/parakeet-cpp/package.sh
index 0b580324c..af8e6b9e1 100755
--- a/backend/go/parakeet-cpp/package.sh
+++ b/backend/go/parakeet-cpp/package.sh
@@ -16,12 +16,15 @@ mkdir -p "$CURDIR/package/lib"
 cp -avf "$CURDIR/parakeet-cpp-grpc" "$CURDIR/package/"
 cp -avf "$CURDIR/run.sh" "$CURDIR/package/"
 
-# libparakeet.so + any soname symlinks (libparakeet.so.X[.Y]). purego.Dlopen
-# resolves it via LD_LIBRARY_PATH, which run.sh points at lib/.
-cp -avf "$CURDIR"/libparakeet.so* "$CURDIR/package/lib/" 2>/dev/null || {
-	echo "ERROR: libparakeet.so not found in $CURDIR, run 'make' first" >&2
+# libparakeet shared lib + any soname symlinks. On Linux this is
+# libparakeet.so[.X.Y]; on macOS it is libparakeet.dylib. purego.Dlopen
+# resolves it via the *_LIBRARY_PATH that run.sh points at lib/.
+cp -avf "$CURDIR"/libparakeet.so* "$CURDIR/package/lib/" 2>/dev/null || true
+cp -avf "$CURDIR"/libparakeet.dylib "$CURDIR/package/lib/" 2>/dev/null || true
+if ! ls "$CURDIR"/package/lib/libparakeet.* >/dev/null 2>&1; then
+	echo "ERROR: libparakeet shared library not found in $CURDIR, run 'make' first" >&2
 	exit 1
-}
+fi
 
 # Detect architecture and copy the core runtime libs libparakeet.so links
 # against, plus the matching dynamic loader as lib/ld.so.
@@ -48,7 +51,7 @@ elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
     cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1"
     cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0"
 elif [ "$(uname -s)" = "Darwin" ]; then
-    echo "Detected Darwin"
+    echo "Detected Darwin — system frameworks linked dynamically, no bundled libs needed"
 else
     echo "Error: Could not detect architecture"
     exit 1
diff --git a/backend/go/parakeet-cpp/run.sh b/backend/go/parakeet-cpp/run.sh
index 6f371d4f0..be859f381 100755
--- a/backend/go/parakeet-cpp/run.sh
+++ b/backend/go/parakeet-cpp/run.sh
@@ -3,11 +3,17 @@ set -e
 
 CURDIR=$(dirname "$(realpath "$0")")
 
-export LD_LIBRARY_PATH="$CURDIR/lib:$CURDIR:${LD_LIBRARY_PATH:-}"
+if [ "$(uname)" = "Darwin" ]; then
+	export DYLD_LIBRARY_PATH="$CURDIR/lib:$CURDIR:${DYLD_LIBRARY_PATH:-}"
+	export PARAKEET_LIBRARY="$CURDIR/lib/libparakeet.dylib"
+else
+	export LD_LIBRARY_PATH="$CURDIR/lib:$CURDIR:${LD_LIBRARY_PATH:-}"
+	export PARAKEET_LIBRARY="$CURDIR/lib/libparakeet.so"
+fi
 
 # If a self-contained ld.so was packaged, route through it so the
 # packaged libc / libstdc++ are used instead of the host's (matches the
-# whisper backend's runtime layout).
+# whisper backend's runtime layout). Linux only.
 if [ -f "$CURDIR/lib/ld.so" ]; then
 	echo "Using lib/ld.so"
 	exec "$CURDIR/lib/ld.so" "$CURDIR/parakeet-cpp-grpc" "$@"
diff --git a/backend/go/qwen3-tts-cpp/Makefile b/backend/go/qwen3-tts-cpp/Makefile
index 4015f790e..3311f93c3 100644
--- a/backend/go/qwen3-tts-cpp/Makefile
+++ b/backend/go/qwen3-tts-cpp/Makefile
@@ -65,8 +65,8 @@ UNAME_S := $(shell uname -s)
 ifeq ($(UNAME_S),Linux)
 	VARIANT_TARGETS = libgoqwen3ttscpp-avx.so libgoqwen3ttscpp-avx2.so libgoqwen3ttscpp-avx512.so libgoqwen3ttscpp-fallback.so
 else
-	# On non-Linux (e.g., Darwin), build only fallback variant
-	VARIANT_TARGETS = libgoqwen3ttscpp-fallback.so
+	# On non-Linux (e.g., Darwin), build only fallback variant (as a dylib)
+	VARIANT_TARGETS = libgoqwen3ttscpp-fallback.dylib
 endif
 
 qwen3-tts-cpp: main.go goqwen3ttscpp.go $(VARIANT_TARGETS)
@@ -78,7 +78,7 @@ package: qwen3-tts-cpp
 build: package
 
 clean: purge
-	rm -rf libgoqwen3ttscpp*.so package sources/qwentts.cpp qwen3-tts-cpp
+	rm -rf libgoqwen3ttscpp*.so libgoqwen3ttscpp*.dylib package sources/qwentts.cpp qwen3-tts-cpp
 
 purge:
 	rm -rf build*
@@ -110,13 +110,20 @@ libgoqwen3ttscpp-fallback.so: sources/qwentts.cpp
 	SO_TARGET=libgoqwen3ttscpp-fallback.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) libgoqwen3ttscpp-custom
 	rm -rf build-libgoqwen3ttscpp-fallback.so
 
+# Build fallback variant as a dylib (Darwin)
+libgoqwen3ttscpp-fallback.dylib: sources/qwentts.cpp
+	$(info ${GREEN}I qwen3-tts-cpp build info:fallback (dylib)${RESET})
+	SO_TARGET=libgoqwen3ttscpp-fallback.dylib CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) libgoqwen3ttscpp-custom
+	rm -rf build-libgoqwen3ttscpp-fallback.dylib
+
 libgoqwen3ttscpp-custom: CMakeLists.txt cpp/goqwen3ttscpp.cpp cpp/goqwen3ttscpp.h
 	mkdir -p build-$(SO_TARGET) && \
 	cd build-$(SO_TARGET) && \
 	cmake .. $(CMAKE_ARGS) && \
 	cmake --build . --config Release -j$(JOBS) --target goqwen3ttscpp && \
 	cd .. && \
-	mv build-$(SO_TARGET)/libgoqwen3ttscpp.so ./$(SO_TARGET)
+	(mv build-$(SO_TARGET)/libgoqwen3ttscpp.so ./$(SO_TARGET) 2>/dev/null || \
+	 mv build-$(SO_TARGET)/libgoqwen3ttscpp.dylib ./$(SO_TARGET) 2>/dev/null)
 
 test: qwen3-tts-cpp
 	@echo "Running qwen3-tts-cpp tests..."
diff --git a/backend/go/qwen3-tts-cpp/main.go b/backend/go/qwen3-tts-cpp/main.go
index b788229cd..041a23ad0 100644
--- a/backend/go/qwen3-tts-cpp/main.go
+++ b/backend/go/qwen3-tts-cpp/main.go
@@ -4,6 +4,7 @@ package main
 import (
 	"flag"
 	"os"
+	"runtime"
 
 	"github.com/ebitengine/purego"
 	grpc "github.com/mudler/LocalAI/pkg/grpc"
@@ -21,7 +22,11 @@ type LibFuncs struct {
 func main() {
 	libName := os.Getenv("QWEN3TTS_LIBRARY")
 	if libName == "" {
-		libName = "./libgoqwen3ttscpp-fallback.so"
+		if runtime.GOOS == "darwin" {
+			libName = "./libgoqwen3ttscpp-fallback.dylib"
+		} else {
+			libName = "./libgoqwen3ttscpp-fallback.so"
+		}
 	}
 
 	lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
diff --git a/backend/go/qwen3-tts-cpp/package.sh b/backend/go/qwen3-tts-cpp/package.sh
index bb73df968..11d4c57c3 100755
--- a/backend/go/qwen3-tts-cpp/package.sh
+++ b/backend/go/qwen3-tts-cpp/package.sh
@@ -12,7 +12,8 @@ REPO_ROOT="${CURDIR}/../../.."
 mkdir -p $CURDIR/package/lib
 
 cp -avf $CURDIR/qwen3-tts-cpp $CURDIR/package/
-cp -fv $CURDIR/libgoqwen3ttscpp-*.so $CURDIR/package/
+cp -fv $CURDIR/libgoqwen3ttscpp-*.so $CURDIR/package/ 2>/dev/null || true
+cp -fv $CURDIR/libgoqwen3ttscpp-*.dylib $CURDIR/package/ 2>/dev/null || true
 cp -fv $CURDIR/run.sh $CURDIR/package/
 
 # Detect architecture and copy appropriate libraries
diff --git a/backend/go/qwen3-tts-cpp/run.sh b/backend/go/qwen3-tts-cpp/run.sh
index 6416779fa..638cf9661 100755
--- a/backend/go/qwen3-tts-cpp/run.sh
+++ b/backend/go/qwen3-tts-cpp/run.sh
@@ -12,9 +12,13 @@ if [ "$(uname)" != "Darwin" ]; then
 	grep -e "flags" /proc/cpuinfo | head -1
 fi
 
-LIBRARY="$CURDIR/libgoqwen3ttscpp-fallback.so"
+if [ "$(uname)" = "Darwin" ]; then
+	# macOS: single dylib variant (Metal or Accelerate)
+	LIBRARY="$CURDIR/libgoqwen3ttscpp-fallback.dylib"
+	export DYLD_LIBRARY_PATH=$CURDIR/lib:$DYLD_LIBRARY_PATH
+else
+	LIBRARY="$CURDIR/libgoqwen3ttscpp-fallback.so"
 
-if [ "$(uname)" != "Darwin" ]; then
 	if grep -q -e "\savx\s" /proc/cpuinfo ; then
 		echo "CPU:    AVX    found OK"
 		if [ -e $CURDIR/libgoqwen3ttscpp-avx.so ]; then
@@ -36,9 +40,10 @@ if [ "$(uname)" != "Darwin" ]; then
 			LIBRARY="$CURDIR/libgoqwen3ttscpp-avx512.so"
 		fi
 	fi
+
+	export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
 fi
 
-export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
 export QWEN3TTS_LIBRARY=$LIBRARY
 
 # If there is a lib/ld.so, use it
diff --git a/backend/go/rfdetr-cpp/Makefile b/backend/go/rfdetr-cpp/Makefile
index 7c598f732..3282720ff 100644
--- a/backend/go/rfdetr-cpp/Makefile
+++ b/backend/go/rfdetr-cpp/Makefile
@@ -71,7 +71,7 @@ ifeq ($(UNAME_S),Linux)
 	VARIANT_TARGETS = librfdetrcpp-avx.so librfdetrcpp-avx2.so librfdetrcpp-avx512.so librfdetrcpp-fallback.so
 else
 	# On non-Linux (e.g., Darwin), build only fallback variant
-	VARIANT_TARGETS = librfdetrcpp-fallback.so
+	VARIANT_TARGETS = librfdetrcpp-fallback.dylib
 endif
 
 rfdetr-cpp: main.go gorfdetrcpp.go $(VARIANT_TARGETS)
@@ -83,7 +83,7 @@ package: rfdetr-cpp
 build: package
 
 clean: purge
-	rm -rf librfdetrcpp*.so rfdetr-cpp package sources
+	rm -rf librfdetrcpp*.so librfdetrcpp*.dylib rfdetr-cpp package sources
 
 purge:
 	rm -rf build*
@@ -110,11 +110,19 @@ librfdetrcpp-avx512.so: sources/rt-detr.cpp
 endif
 
 # Build fallback variant (all platforms)
+ifeq ($(UNAME_S),Darwin)
+librfdetrcpp-fallback.dylib: sources/rt-detr.cpp
+	rm -rfv build-$@
+	$(info ${GREEN}I rfdetr-cpp build info:fallback${RESET})
+	SO_TARGET=$@ CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) librfdetrcpp-custom
+	rm -rfv build-$@
+else
 librfdetrcpp-fallback.so: sources/rt-detr.cpp
 	rm -rfv build-$@
 	$(info ${GREEN}I rfdetr-cpp build info:fallback${RESET})
 	SO_TARGET=$@ CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) librfdetrcpp-custom
 	rm -rfv build-$@
+endif
 
 librfdetrcpp-custom: CMakeLists.txt
 	mkdir -p build-$(SO_TARGET) && \
@@ -122,7 +130,8 @@ librfdetrcpp-custom: CMakeLists.txt
 	cmake .. $(CMAKE_ARGS) && \
 	cmake --build . --config Release -j$(JOBS) && \
 	cd .. && \
-	mv build-$(SO_TARGET)/librfdetrcpp.so ./$(SO_TARGET)
+	(mv build-$(SO_TARGET)/librfdetrcpp.so ./$(SO_TARGET) 2>/dev/null || \
+	 mv build-$(SO_TARGET)/librfdetrcpp.dylib ./$(SO_TARGET) 2>/dev/null)
 
 all: rfdetr-cpp package
 
diff --git a/backend/go/rfdetr-cpp/main.go b/backend/go/rfdetr-cpp/main.go
index 3c95df1c2..58637122a 100644
--- a/backend/go/rfdetr-cpp/main.go
+++ b/backend/go/rfdetr-cpp/main.go
@@ -9,6 +9,7 @@ package main
 import (
 	"flag"
 	"os"
+	"runtime"
 
 	"github.com/ebitengine/purego"
 	grpc "github.com/mudler/LocalAI/pkg/grpc"
@@ -27,7 +28,11 @@ func main() {
 	// Get library name from environment variable, default to fallback
 	libName := os.Getenv("RFDETR_LIBRARY")
 	if libName == "" {
-		libName = "./librfdetrcpp-fallback.so"
+		if runtime.GOOS == "darwin" {
+			libName = "./librfdetrcpp-fallback.dylib"
+		} else {
+			libName = "./librfdetrcpp-fallback.so"
+		}
 	}
 
 	rfdetrLib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
diff --git a/backend/go/rfdetr-cpp/package.sh b/backend/go/rfdetr-cpp/package.sh
index 9591b79dc..17319bf27 100755
--- a/backend/go/rfdetr-cpp/package.sh
+++ b/backend/go/rfdetr-cpp/package.sh
@@ -10,7 +10,8 @@ REPO_ROOT="${CURDIR}/../../.."
 # Create lib directory
 mkdir -p $CURDIR/package/lib
 
-cp -avf $CURDIR/librfdetrcpp-*.so $CURDIR/package/
+cp -fv $CURDIR/librfdetrcpp-*.so $CURDIR/package/ 2>/dev/null || true
+cp -fv $CURDIR/librfdetrcpp-*.dylib $CURDIR/package/ 2>/dev/null || true
 cp -avf $CURDIR/rfdetr-cpp $CURDIR/package/
 cp -fv $CURDIR/run.sh $CURDIR/package/
 
diff --git a/backend/go/rfdetr-cpp/run.sh b/backend/go/rfdetr-cpp/run.sh
index 042904e45..ffbd604dd 100755
--- a/backend/go/rfdetr-cpp/run.sh
+++ b/backend/go/rfdetr-cpp/run.sh
@@ -12,9 +12,13 @@ if [ "$(uname)" != "Darwin" ]; then
 	grep -e "flags" /proc/cpuinfo | head -1
 fi
 
-LIBRARY="$CURDIR/librfdetrcpp-fallback.so"
+if [ "$(uname)" = "Darwin" ]; then
+	# macOS: single dylib variant (Metal or Accelerate)
+	LIBRARY="$CURDIR/librfdetrcpp-fallback.dylib"
+	export DYLD_LIBRARY_PATH=$CURDIR/lib:$DYLD_LIBRARY_PATH
+else
+	LIBRARY="$CURDIR/librfdetrcpp-fallback.so"
 
-if [ "$(uname)" != "Darwin" ]; then
 	if grep -q -e "\savx\s" /proc/cpuinfo ; then
 		echo "CPU:    AVX    found OK"
 		if [ -e $CURDIR/librfdetrcpp-avx.so ]; then
@@ -36,9 +40,10 @@ if [ "$(uname)" != "Darwin" ]; then
 			LIBRARY="$CURDIR/librfdetrcpp-avx512.so"
 		fi
 	fi
+
+	export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
 fi
 
-export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
 export RFDETR_LIBRARY=$LIBRARY
 
 # If there is a lib/ld.so, use it
diff --git a/backend/go/sam3-cpp/Makefile b/backend/go/sam3-cpp/Makefile
index 53b0dfb5e..27b6cedf7 100644
--- a/backend/go/sam3-cpp/Makefile
+++ b/backend/go/sam3-cpp/Makefile
@@ -66,7 +66,7 @@ ifeq ($(UNAME_S),Linux)
 	VARIANT_TARGETS = libgosam3-avx.so libgosam3-avx2.so libgosam3-avx512.so libgosam3-fallback.so
 else
 	# On non-Linux (e.g., Darwin), build only fallback variant
-	VARIANT_TARGETS = libgosam3-fallback.so
+	VARIANT_TARGETS = libgosam3-fallback.dylib
 endif
 
 sam3-cpp: main.go gosam3.go $(VARIANT_TARGETS)
@@ -78,7 +78,7 @@ package: sam3-cpp
 build: package
 
 clean: purge
-	rm -rf libgosam3*.so sam3-cpp package sources
+	rm -rf libgosam3*.so libgosam3*.dylib sam3-cpp package sources
 
 purge:
 	rm -rf build*
@@ -105,11 +105,19 @@ libgosam3-avx512.so: sources/sam3.cpp
 endif
 
 # Build fallback variant (all platforms)
+ifeq ($(UNAME_S),Darwin)
+libgosam3-fallback.dylib: sources/sam3.cpp
+	$(MAKE) purge
+	$(info ${GREEN}I sam3-cpp build info:fallback${RESET})
+	SO_TARGET=libgosam3-fallback.dylib CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) libgosam3-custom
+	rm -rfv build*
+else
 libgosam3-fallback.so: sources/sam3.cpp
 	$(MAKE) purge
 	$(info ${GREEN}I sam3-cpp build info:fallback${RESET})
 	SO_TARGET=libgosam3-fallback.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) libgosam3-custom
 	rm -rfv build*
+endif
 
 libgosam3-custom: CMakeLists.txt cpp/gosam3.cpp cpp/gosam3.h
 	mkdir -p build-$(SO_TARGET) && \
@@ -117,6 +125,7 @@ libgosam3-custom: CMakeLists.txt cpp/gosam3.cpp cpp/gosam3.h
 	cmake .. $(CMAKE_ARGS) && \
 	cmake --build . --config Release -j$(JOBS) && \
 	cd .. && \
-	mv build-$(SO_TARGET)/libgosam3.so ./$(SO_TARGET)
+	(mv build-$(SO_TARGET)/libgosam3.so ./$(SO_TARGET) 2>/dev/null || \
+	 mv build-$(SO_TARGET)/libgosam3.dylib ./$(SO_TARGET) 2>/dev/null)
 
 all: sam3-cpp package
diff --git a/backend/go/sam3-cpp/main.go b/backend/go/sam3-cpp/main.go
index c83a59285..e36849f69 100644
--- a/backend/go/sam3-cpp/main.go
+++ b/backend/go/sam3-cpp/main.go
@@ -3,6 +3,7 @@ package main
 import (
 	"flag"
 	"os"
+	"runtime"
 
 	"github.com/ebitengine/purego"
 	grpc "github.com/mudler/LocalAI/pkg/grpc"
@@ -21,7 +22,11 @@ func main() {
 	// Get library name from environment variable, default to fallback
 	libName := os.Getenv("SAM3_LIBRARY")
 	if libName == "" {
-		libName = "./libgosam3-fallback.so"
+		if runtime.GOOS == "darwin" {
+			libName = "./libgosam3-fallback.dylib"
+		} else {
+			libName = "./libgosam3-fallback.so"
+		}
 	}
 
 	gosamLib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
diff --git a/backend/go/sam3-cpp/package.sh b/backend/go/sam3-cpp/package.sh
index 254aef286..a648ee93c 100755
--- a/backend/go/sam3-cpp/package.sh
+++ b/backend/go/sam3-cpp/package.sh
@@ -10,7 +10,8 @@ REPO_ROOT="${CURDIR}/../../.."
 # Create lib directory
 mkdir -p $CURDIR/package/lib
 
-cp -avf $CURDIR/libgosam3-*.so $CURDIR/package/
+cp -fv $CURDIR/libgosam3-*.so $CURDIR/package/ 2>/dev/null || true
+cp -fv $CURDIR/libgosam3-*.dylib $CURDIR/package/ 2>/dev/null || true
 cp -avf $CURDIR/sam3-cpp $CURDIR/package/
 cp -fv $CURDIR/run.sh $CURDIR/package/
 
diff --git a/backend/go/sam3-cpp/run.sh b/backend/go/sam3-cpp/run.sh
index 423ed9199..7bff52df6 100755
--- a/backend/go/sam3-cpp/run.sh
+++ b/backend/go/sam3-cpp/run.sh
@@ -12,9 +12,13 @@ if [ "$(uname)" != "Darwin" ]; then
 	grep -e "flags" /proc/cpuinfo | head -1
 fi
 
-LIBRARY="$CURDIR/libgosam3-fallback.so"
+if [ "$(uname)" = "Darwin" ]; then
+	# macOS: single dylib variant (Metal or Accelerate)
+	LIBRARY="$CURDIR/libgosam3-fallback.dylib"
+	export DYLD_LIBRARY_PATH=$CURDIR/lib:$DYLD_LIBRARY_PATH
+else
+	LIBRARY="$CURDIR/libgosam3-fallback.so"
 
-if [ "$(uname)" != "Darwin" ]; then
 	if grep -q -e "\savx\s" /proc/cpuinfo ; then
 		echo "CPU:    AVX    found OK"
 		if [ -e $CURDIR/libgosam3-avx.so ]; then
@@ -36,9 +40,10 @@ if [ "$(uname)" != "Darwin" ]; then
 			LIBRARY="$CURDIR/libgosam3-avx512.so"
 		fi
 	fi
+
+	export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
 fi
 
-export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
 export SAM3_LIBRARY=$LIBRARY
 
 # If there is a lib/ld.so, use it
diff --git a/backend/go/sherpa-onnx/backend.go b/backend/go/sherpa-onnx/backend.go
index 0a092acf7..8bfe5e75c 100644
--- a/backend/go/sherpa-onnx/backend.go
+++ b/backend/go/sherpa-onnx/backend.go
@@ -7,6 +7,7 @@ import (
 	"fmt"
 	"os"
 	"path/filepath"
+	"runtime"
 	"strconv"
 	"strings"
 	"sync"
@@ -238,11 +239,19 @@ func loadSherpaLibs() error {
 func loadSherpaLibsOnce() error {
 	shimLib := os.Getenv("SHERPA_SHIM_LIBRARY")
 	if shimLib == "" {
-		shimLib = "libsherpa-shim.so"
+		if runtime.GOOS == "darwin" {
+			shimLib = "libsherpa-shim.dylib"
+		} else {
+			shimLib = "libsherpa-shim.so"
+		}
 	}
 	capiLib := os.Getenv("SHERPA_ONNX_LIBRARY")
 	if capiLib == "" {
-		capiLib = "libsherpa-onnx-c-api.so"
+		if runtime.GOOS == "darwin" {
+			capiLib = "libsherpa-onnx-c-api.dylib"
+		} else {
+			capiLib = "libsherpa-onnx-c-api.so"
+		}
 	}
 
 	shim, err := purego.Dlopen(shimLib, purego.RTLD_NOW|purego.RTLD_GLOBAL)
diff --git a/backend/go/sherpa-onnx/run.sh b/backend/go/sherpa-onnx/run.sh
index b703e5155..771324326 100755
--- a/backend/go/sherpa-onnx/run.sh
+++ b/backend/go/sherpa-onnx/run.sh
@@ -3,7 +3,13 @@ set -ex
 
 CURDIR=$(dirname "$(realpath $0)")
 
-export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
+if [ "$(uname)" = "Darwin" ]; then
+	export DYLD_LIBRARY_PATH=$CURDIR/lib:$DYLD_LIBRARY_PATH
+	export SHERPA_SHIM_LIBRARY=$CURDIR/lib/libsherpa-shim.dylib
+	export SHERPA_ONNX_LIBRARY=$CURDIR/lib/libsherpa-onnx-c-api.dylib
+else
+	export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
+fi
 
 if [ -f $CURDIR/lib/ld.so ]; then
 	echo "Using lib/ld.so"
diff --git a/backend/go/stablediffusion-ggml/Makefile b/backend/go/stablediffusion-ggml/Makefile
index 05b57b254..d161a5b47 100644
--- a/backend/go/stablediffusion-ggml/Makefile
+++ b/backend/go/stablediffusion-ggml/Makefile
@@ -131,6 +131,7 @@ libgosd-custom: CMakeLists.txt cpp/gosd.cpp cpp/gosd.h
 	cmake .. $(CMAKE_ARGS) && \
 	cmake --build . --config Release -j$(JOBS) && \
 	cd .. && \
-	mv build-$(SO_TARGET)/libgosd.so ./$(SO_TARGET)
+	(mv build-$(SO_TARGET)/libgosd.so ./$(SO_TARGET) 2>/dev/null || \
+	 mv build-$(SO_TARGET)/libgosd.dylib ./$(SO_TARGET) 2>/dev/null)
 
 all: stablediffusion-ggml package
\ No newline at end of file
diff --git a/backend/go/stablediffusion-ggml/main.go b/backend/go/stablediffusion-ggml/main.go
index 998f2a5ab..b509c6a2b 100644
--- a/backend/go/stablediffusion-ggml/main.go
+++ b/backend/go/stablediffusion-ggml/main.go
@@ -3,6 +3,7 @@ package main
 import (
 	"flag"
 	"os"
+	"runtime"
 
 	"github.com/ebitengine/purego"
 	grpc "github.com/mudler/LocalAI/pkg/grpc"
@@ -21,7 +22,11 @@ func main() {
 	// Get library name from environment variable, default to fallback
 	libName := os.Getenv("SD_LIBRARY")
 	if libName == "" {
-		libName = "./libgosd-fallback.so"
+		if runtime.GOOS == "darwin" {
+			libName = "./libgosd-fallback.dylib"
+		} else {
+			libName = "./libgosd-fallback.so"
+		}
 	}
 
 	gosd, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
diff --git a/backend/go/stablediffusion-ggml/package.sh b/backend/go/stablediffusion-ggml/package.sh
index 8006e081f..922fb71ea 100755
--- a/backend/go/stablediffusion-ggml/package.sh
+++ b/backend/go/stablediffusion-ggml/package.sh
@@ -12,6 +12,7 @@ REPO_ROOT="${CURDIR}/../../.."
 mkdir -p $CURDIR/package/lib
 
 cp -avf $CURDIR/libgosd-*.so $CURDIR/package/
+cp -fv $CURDIR/libgosd-*.dylib $CURDIR/package/ 2>/dev/null || true
 cp -avf $CURDIR/stablediffusion-ggml $CURDIR/package/
 cp -fv $CURDIR/run.sh $CURDIR/package/
 
diff --git a/backend/go/stablediffusion-ggml/run.sh b/backend/go/stablediffusion-ggml/run.sh
index 71342e43b..e026b4b28 100755
--- a/backend/go/stablediffusion-ggml/run.sh
+++ b/backend/go/stablediffusion-ggml/run.sh
@@ -12,9 +12,18 @@ if [ "$(uname)" != "Darwin" ]; then
 	grep -e "flags" /proc/cpuinfo | head -1
 fi
 
-LIBRARY="$CURDIR/libgosd-fallback.so"
+if [ "$(uname)" = "Darwin" ]; then
+	# macOS: single library variant (Metal or Accelerate). The gosd target is
+	# built as a CMake MODULE, which emits a .dylib for a SHARED build but a
+	# .so for a MODULE build on Apple, so prefer .dylib and fall back to .so.
+	LIBRARY="$CURDIR/libgosd-fallback.dylib"
+	if [ ! -e "$LIBRARY" ]; then
+		LIBRARY="$CURDIR/libgosd-fallback.so"
+	fi
+	export DYLD_LIBRARY_PATH=$CURDIR/lib:$DYLD_LIBRARY_PATH
+else
+	LIBRARY="$CURDIR/libgosd-fallback.so"
 
-if [ "$(uname)" != "Darwin" ]; then
 	if grep -q -e "\savx\s" /proc/cpuinfo ; then
 		echo "CPU:    AVX    found OK"
 		if [ -e $CURDIR/libgosd-avx.so ]; then
@@ -36,9 +45,10 @@ if [ "$(uname)" != "Darwin" ]; then
 			LIBRARY="$CURDIR/libgosd-avx512.so"
 		fi
 	fi
+
+	export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
 fi
 
-export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
 export SD_LIBRARY=$LIBRARY
 
 # If there is a lib/ld.so, use it
diff --git a/backend/go/vibevoice-cpp/Makefile b/backend/go/vibevoice-cpp/Makefile
index 199df9cc4..dc71eaa5d 100644
--- a/backend/go/vibevoice-cpp/Makefile
+++ b/backend/go/vibevoice-cpp/Makefile
@@ -70,8 +70,8 @@ UNAME_S := $(shell uname -s)
 ifeq ($(UNAME_S),Linux)
 	VARIANT_TARGETS = libgovibevoicecpp-avx.so libgovibevoicecpp-avx2.so libgovibevoicecpp-avx512.so libgovibevoicecpp-fallback.so
 else
-	# On non-Linux (e.g., Darwin), build only fallback variant
-	VARIANT_TARGETS = libgovibevoicecpp-fallback.so
+	# On non-Linux (e.g., Darwin), build only fallback variant (as a dylib)
+	VARIANT_TARGETS = libgovibevoicecpp-fallback.dylib
 endif
 
 vibevoice-cpp: main.go govibevoicecpp.go $(VARIANT_TARGETS)
@@ -83,7 +83,7 @@ package: vibevoice-cpp
 build: package
 
 clean: purge
-	rm -rf libgovibevoicecpp*.so package sources/vibevoice.cpp vibevoice-cpp
+	rm -rf libgovibevoicecpp*.so libgovibevoicecpp*.dylib package sources/vibevoice.cpp vibevoice-cpp
 
 purge:
 	rm -rf build*
@@ -119,13 +119,21 @@ libgovibevoicecpp-fallback.so: sources/vibevoice.cpp
 	SO_TARGET=libgovibevoicecpp-fallback.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) libgovibevoicecpp-custom
 	rm -rfv build*
 
+# Build fallback variant as a dylib (Darwin)
+libgovibevoicecpp-fallback.dylib: sources/vibevoice.cpp
+	$(MAKE) purge
+	$(info ${GREEN}I vibevoice-cpp build info:fallback (dylib)${RESET})
+	SO_TARGET=libgovibevoicecpp-fallback.dylib CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) libgovibevoicecpp-custom
+	rm -rfv build*
+
 libgovibevoicecpp-custom: CMakeLists.txt cpp/govibevoicecpp.cpp cpp/govibevoicecpp.h
 	mkdir -p build-$(SO_TARGET) && \
 	cd build-$(SO_TARGET) && \
 	cmake .. $(CMAKE_ARGS) && \
 	cmake --build . --config Release -j$(JOBS) --target govibevoicecpp && \
 	cd .. && \
-	mv build-$(SO_TARGET)/libgovibevoicecpp.so ./$(SO_TARGET)
+	(mv build-$(SO_TARGET)/libgovibevoicecpp.so ./$(SO_TARGET) 2>/dev/null || \
+	 mv build-$(SO_TARGET)/libgovibevoicecpp.dylib ./$(SO_TARGET) 2>/dev/null)
 
 test: vibevoice-cpp
 	@echo "Running vibevoice-cpp tests..."
diff --git a/backend/go/vibevoice-cpp/main.go b/backend/go/vibevoice-cpp/main.go
index dd1f1ba43..b9a696d82 100644
--- a/backend/go/vibevoice-cpp/main.go
+++ b/backend/go/vibevoice-cpp/main.go
@@ -4,6 +4,7 @@ package main
 import (
 	"flag"
 	"os"
+	"runtime"
 
 	"github.com/ebitengine/purego"
 	grpc "github.com/mudler/LocalAI/pkg/grpc"
@@ -21,7 +22,11 @@ type LibFuncs struct {
 func main() {
 	libName := os.Getenv("VIBEVOICECPP_LIBRARY")
 	if libName == "" {
-		libName = "./libgovibevoicecpp-fallback.so"
+		if runtime.GOOS == "darwin" {
+			libName = "./libgovibevoicecpp-fallback.dylib"
+		} else {
+			libName = "./libgovibevoicecpp-fallback.so"
+		}
 	}
 
 	lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
diff --git a/backend/go/vibevoice-cpp/package.sh b/backend/go/vibevoice-cpp/package.sh
index 88010846f..62860b8d6 100755
--- a/backend/go/vibevoice-cpp/package.sh
+++ b/backend/go/vibevoice-cpp/package.sh
@@ -12,7 +12,8 @@ REPO_ROOT="${CURDIR}/../../.."
 mkdir -p $CURDIR/package/lib
 
 cp -avf $CURDIR/vibevoice-cpp $CURDIR/package/
-cp -fv $CURDIR/libgovibevoicecpp-*.so $CURDIR/package/
+cp -fv $CURDIR/libgovibevoicecpp-*.so $CURDIR/package/ 2>/dev/null || true
+cp -fv $CURDIR/libgovibevoicecpp-*.dylib $CURDIR/package/ 2>/dev/null || true
 cp -fv $CURDIR/run.sh $CURDIR/package/
 
 # Detect architecture and copy appropriate libraries
diff --git a/backend/go/vibevoice-cpp/run.sh b/backend/go/vibevoice-cpp/run.sh
index 93e92d5b8..ec5a39c14 100755
--- a/backend/go/vibevoice-cpp/run.sh
+++ b/backend/go/vibevoice-cpp/run.sh
@@ -11,9 +11,13 @@ if [ "$(uname)" != "Darwin" ]; then
 	grep -e "flags" /proc/cpuinfo | head -1
 fi
 
-LIBRARY="$CURDIR/libgovibevoicecpp-fallback.so"
+if [ "$(uname)" = "Darwin" ]; then
+	# macOS: single dylib variant (Metal or Accelerate)
+	LIBRARY="$CURDIR/libgovibevoicecpp-fallback.dylib"
+	export DYLD_LIBRARY_PATH=$CURDIR/lib:$DYLD_LIBRARY_PATH
+else
+	LIBRARY="$CURDIR/libgovibevoicecpp-fallback.so"
 
-if [ "$(uname)" != "Darwin" ]; then
 	if grep -q -e "\savx\s" /proc/cpuinfo ; then
 		echo "CPU:    AVX    found OK"
 		if [ -e $CURDIR/libgovibevoicecpp-avx.so ]; then
@@ -34,9 +38,10 @@ if [ "$(uname)" != "Darwin" ]; then
 			LIBRARY="$CURDIR/libgovibevoicecpp-avx512.so"
 		fi
 	fi
+
+	export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
 fi
 
-export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
 export VIBEVOICECPP_LIBRARY=$LIBRARY
 
 if [ -f $CURDIR/lib/ld.so ]; then
diff --git a/backend/go/whisper/Makefile b/backend/go/whisper/Makefile
index e8ad8545f..6dd13dd2c 100644
--- a/backend/go/whisper/Makefile
+++ b/backend/go/whisper/Makefile
@@ -117,6 +117,7 @@ libgowhisper-custom: CMakeLists.txt cpp/gowhisper.cpp cpp/gowhisper.h
 	cmake .. $(CMAKE_ARGS) && \
 	cmake --build . --config Release -j$(JOBS) && \
 	cd .. && \
-	mv build-$(SO_TARGET)/libgowhisper.so ./$(SO_TARGET)
+	mv build-$(SO_TARGET)/libgowhisper.so ./$(SO_TARGET) 2>/dev/null || \
+		mv build-$(SO_TARGET)/libgowhisper.dylib ./$(SO_TARGET:.so=.dylib)
 
 all: whisper package
diff --git a/backend/go/whisper/main.go b/backend/go/whisper/main.go
index e48b24519..ab102f4c4 100644
--- a/backend/go/whisper/main.go
+++ b/backend/go/whisper/main.go
@@ -4,6 +4,7 @@ package main
 import (
 	"flag"
 	"os"
+	"runtime"
 
 	"github.com/ebitengine/purego"
 	grpc "github.com/mudler/LocalAI/pkg/grpc"
@@ -22,7 +23,11 @@ func main() {
 	// Get library name from environment variable, default to fallback
 	libName := os.Getenv("WHISPER_LIBRARY")
 	if libName == "" {
-		libName = "./libgowhisper-fallback.so"
+		if runtime.GOOS == "darwin" {
+			libName = "./libgowhisper-fallback.dylib"
+		} else {
+			libName = "./libgowhisper-fallback.so"
+		}
 	}
 
 	gosd, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
diff --git a/backend/go/whisper/package.sh b/backend/go/whisper/package.sh
index dfecdf5c6..efeaa7009 100755
--- a/backend/go/whisper/package.sh
+++ b/backend/go/whisper/package.sh
@@ -12,7 +12,8 @@ REPO_ROOT="${CURDIR}/../../.."
 mkdir -p $CURDIR/package/lib
 
 cp -avf $CURDIR/whisper $CURDIR/package/
-cp -fv $CURDIR/libgowhisper-*.so $CURDIR/package/
+cp -fv $CURDIR/libgowhisper-*.so $CURDIR/package/ 2>/dev/null || true
+cp -fv $CURDIR/libgowhisper-*.dylib $CURDIR/package/ 2>/dev/null || true
 cp -fv $CURDIR/run.sh $CURDIR/package/
 
 # Detect architecture and copy appropriate libraries
diff --git a/backend/go/whisper/run.sh b/backend/go/whisper/run.sh
index 1af2c0535..0e2bd7eb0 100755
--- a/backend/go/whisper/run.sh
+++ b/backend/go/whisper/run.sh
@@ -12,9 +12,13 @@ if [ "$(uname)" != "Darwin" ]; then
 	grep -e "flags" /proc/cpuinfo | head -1
 fi
 
-LIBRARY="$CURDIR/libgowhisper-fallback.so"
+if [ "$(uname)" = "Darwin" ]; then
+	# macOS: single dylib variant (Metal or Accelerate)
+	LIBRARY="$CURDIR/libgowhisper-fallback.dylib"
+	export DYLD_LIBRARY_PATH=$CURDIR/lib:$DYLD_LIBRARY_PATH
+else
+	LIBRARY="$CURDIR/libgowhisper-fallback.so"
 
-if [ "$(uname)" != "Darwin" ]; then
 	if grep -q -e "\savx\s" /proc/cpuinfo ; then
 		echo "CPU:    AVX    found OK"
 		if [ -e $CURDIR/libgowhisper-avx.so ]; then
@@ -36,9 +40,10 @@ if [ "$(uname)" != "Darwin" ]; then
 			LIBRARY="$CURDIR/libgowhisper-avx512.so"
 		fi
 	fi
+
+	export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
 fi
 
-export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
 export WHISPER_LIBRARY=$LIBRARY
 
 # If there is a lib/ld.so, use it

From a7fec9a49db3f33dc6c879c0f41b993c4afd4635 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Thu, 25 Jun 2026 08:09:36 +0200
Subject: [PATCH 86/99] feat(backends): add darwin/metal (MPS) build for trl
 (#10487)

* feat(backends): add darwin/metal (MPS) build for trl

Authors backend/python/trl/requirements-mps.txt and wires trl into the
darwin CI matrix and gallery so the MPS training path can be built and
validated on Apple Silicon. The MPS variant installs plain PyPI torch
wheels (MPS-capable on macOS arm64) and the trl training stack; bitsandbytes
is omitted as it is a CUDA-only dependency with poor Apple Silicon support.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:opus-4.8 [Claude Code]

* fix(trl): guard uv-only --index-strategy for the pip/darwin path

The darwin/MPS build installs with pip (USE_PIP=true), which rejects the
uv-only --index-strategy flag and failed the darwin backend build. Add it
only on the uv path; Linux/CUDA resolution is unchanged.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:opus-4.8 [Claude Code]

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
---
 .github/backend-matrix.yml              |  3 +++
 backend/index.yaml                      | 11 +++++++++++
 backend/python/trl/install.sh           |  8 +++++++-
 backend/python/trl/requirements-mps.txt | 12 ++++++++++++
 4 files changed, 33 insertions(+), 1 deletion(-)
 create mode 100644 backend/python/trl/requirements-mps.txt

diff --git a/.github/backend-matrix.yml b/.github/backend-matrix.yml
index 17d436cc1..f34921db9 100644
--- a/.github/backend-matrix.yml
+++ b/.github/backend-matrix.yml
@@ -4974,6 +4974,9 @@ includeDarwin:
   - backend: "kitten-tts"
     tag-suffix: "-metal-darwin-arm64-kitten-tts"
     build-type: "mps"
+  - backend: "trl"
+    tag-suffix: "-metal-darwin-arm64-trl"
+    build-type: "mps"
   - backend: "liquid-audio"
     tag-suffix: "-metal-darwin-arm64-liquid-audio"
     build-type: "mps"
diff --git a/backend/index.yaml b/backend/index.yaml
index f3a2b892d..381aa073b 100644
--- a/backend/index.yaml
+++ b/backend/index.yaml
@@ -5295,6 +5295,7 @@
     nvidia: "cuda12-trl"
     nvidia-cuda-12: "cuda12-trl"
     nvidia-cuda-13: "cuda13-trl"
+    metal: "metal-trl"
 ## TRL backend images
 - !!merge <<: *trl
   name: "cpu-trl"
@@ -5326,6 +5327,16 @@
   uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-trl"
   mirrors:
     - localai/localai-backends:master-gpu-nvidia-cuda-13-trl
+- !!merge <<: *trl
+  name: "metal-trl"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-trl"
+  mirrors:
+    - localai/localai-backends:latest-metal-darwin-arm64-trl
+- !!merge <<: *trl
+  name: "metal-trl-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-trl"
+  mirrors:
+    - localai/localai-backends:master-metal-darwin-arm64-trl
 ## llama.cpp quantization backend
 - &llama-cpp-quantization
   name: "llama-cpp-quantization"
diff --git a/backend/python/trl/install.sh b/backend/python/trl/install.sh
index 6963e60ed..ce0552f87 100644
--- a/backend/python/trl/install.sh
+++ b/backend/python/trl/install.sh
@@ -8,7 +8,13 @@ else
     source $backend_dir/../common/libbackend.sh
 fi
 
-EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
+EXTRA_PIP_INSTALL_FLAGS+=" --upgrade"
+# --index-strategy is a uv-only flag. The darwin/MPS build installs with pip
+# (USE_PIP=true in scripts/build/python-darwin.sh), which rejects it. Only add
+# it when uv is the installer, keeping the Linux/CUDA resolution unchanged.
+if [ "x${USE_PIP:-}" != "xtrue" ]; then
+    EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-first-match"
+fi
 installRequirements
 
 # Fetch convert_hf_to_gguf.py and gguf package from the same llama.cpp version
diff --git a/backend/python/trl/requirements-mps.txt b/backend/python/trl/requirements-mps.txt
new file mode 100644
index 000000000..fbdfb6536
--- /dev/null
+++ b/backend/python/trl/requirements-mps.txt
@@ -0,0 +1,12 @@
+torch==2.10.0
+trl
+peft
+datasets>=3.0.0
+transformers>=4.56.2
+accelerate>=1.4.0
+huggingface-hub>=1.3.0
+sentencepiece
+# Note: bitsandbytes is intentionally omitted on MPS. It is only used by the
+# CUDA (cublas) variants for 8-bit/4-bit quantization and has poor support on
+# Apple Silicon. torch here uses the plain PyPI wheels, which ship MPS support
+# on macOS arm64.

From 066abf82c08d966dc1ca254c32e64d64e676e4f0 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Thu, 25 Jun 2026 08:10:08 +0200
Subject: [PATCH 87/99] feat(llama-cpp): cpu_moe/n_cpu_moe options + generic
 upstream-flag passthrough (#10490)

* feat(llama-cpp): add main-model cpu_moe/n_cpu_moe options

Mirror the existing draft_cpu_moe/draft_n_cpu_moe siblings for the main
model, matching upstream --cpu-moe / --n-cpu-moe (common/arg.cpp). Lets
users keep MoE expert weights on CPU to manage VRAM on large MoE models.

Closes part of #10483

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(llama-cpp): forward unknown '-' options to upstream arg parser

Any options: entry starting with '-' is collected and passed verbatim to
llama.cpp's own common_params_parse (LLAMA_EXAMPLE_SERVER) at the end of
params_parse, so every upstream llama-server flag works without a new
hand-wired branch. Passthrough runs last and wins on overlap; n_parallel is
snapshotted to survive parser_init's SERVER reset, and help/usage/completion
flags are skipped to avoid exiting the backend.

Closes #10483

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* docs(llama-cpp): document cpu_moe/n_cpu_moe and option passthrough

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix(llama-cpp): terminate tensor/kv override vectors after passthrough

The tensor_buft_overrides padding and the kv/draft override terminators
ran before the generic option passthrough, so a passthrough flag
(--cpu-moe, --override-tensor, --override-kv, ...) appended a real entry
after the null sentinel - tripping the model loader's
back().pattern == nullptr assertion (crash) or being silently dropped.
Move all three termination/padding blocks to the end of params_parse,
after both the named-option loop and common_params_parse have pushed
their real entries. Also widen the exit()-flag skip list so --version,
--license, --list-devices and --cache-list cannot terminate the backend.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
---
 backend/cpp/llama-cpp/grpc-server.cpp        | 138 ++++++++++++++++---
 docs/content/advanced/model-configuration.md |  33 +++++
 2 files changed, 150 insertions(+), 21 deletions(-)

diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp
index c2e7f22e4..6907b9122 100644
--- a/backend/cpp/llama-cpp/grpc-server.cpp
+++ b/backend/cpp/llama-cpp/grpc-server.cpp
@@ -37,6 +37,7 @@
 #include "backend.pb.h"
 #include "backend.grpc.pb.h"
 #include "common.h"
+#include "arg.h"
 #include "chat-auto-parser.h"
 #include <getopt.h>
 #include <grpcpp/ext/proto_server_reflection_plugin.h>
@@ -592,6 +593,10 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
     params.checkpoint_min_step = 256;
 #endif
 
+    // Raw upstream llama-server flags collected from any option entry that
+    // starts with '-'. Applied once after the loop via common_params_parse.
+    std::vector<std::string> extra_argv;
+
      // decode options. Options are in form optname:optvale, or if booleans only optname.
     for (int i = 0; i < request->options_size(); i++) {
         std::string opt = request->options(i);
@@ -1080,6 +1085,31 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
                 } catch (...) {}
             }
 
+        // --- main model MoE on CPU (upstream --cpu-moe / --n-cpu-moe) ---
+        } else if (!strcmp(optname, "cpu_moe")) {
+            // Bool-style flag: keep all MoE expert weights on CPU.
+            const bool enable = (optval == NULL) ||
+                optval_str == "true" || optval_str == "1" || optval_str == "yes" ||
+                optval_str == "on" || optval_str == "enabled";
+            if (enable) {
+                params.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
+            }
+        } else if (!strcmp(optname, "n_cpu_moe")) {
+            if (optval != NULL) {
+                try {
+                    int n = std::stoi(optval_str);
+                    if (n < 0) n = 0;
+                    // Keep override-name storage alive for the lifetime of the
+                    // params struct (mirrors upstream arg.cpp's function-local static).
+                    static std::list<std::string> buft_overrides_main;
+                    for (int i = 0; i < n; ++i) {
+                        buft_overrides_main.push_back(llm_ffn_exps_block_regex(i));
+                        params.tensor_buft_overrides.push_back(
+                            {buft_overrides_main.back().c_str(), ggml_backend_cpu_buffer_type()});
+                    }
+                } catch (...) {}
+            }
+
         // --- draft model tensor buffer overrides (upstream --spec-draft-override-tensor) ---
         } else if (!strcmp(optname, "draft_override_tensor") || !strcmp(optname, "spec_draft_override_tensor")) {
             // Format: <tensor regex>=<buffer type>,<tensor regex>=<buffer type>,...
@@ -1111,6 +1141,30 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
                 else { cur.push_back(c); }
             }
             if (!cur.empty()) flush(cur);
+
+        // --- generic passthrough: any entry starting with '-' is a raw
+        //     upstream llama-server flag, forwarded verbatim to the parser. ---
+        } else if (optname[0] == '-') {
+            std::string flag = optname;
+            // These flags make upstream's parser exit() (printing usage /
+            // completion), which would kill the backend process. Skip them.
+            if (flag == "-h" || flag == "--help" || flag == "--usage" ||
+                flag == "--version" || flag == "--license" ||
+                flag == "--list-devices" || flag == "-cl" ||
+                flag == "--cache-list" ||
+                flag.rfind("--completion", 0) == 0) {
+                fprintf(stderr,
+                    "[llama-cpp] ignoring passthrough flag that would exit: %s\n",
+                    flag.c_str());
+            } else {
+                extra_argv.push_back(flag);
+                // Preserve the whole value after the first ':' so embedded
+                // colons (e.g. host:port) survive strtok's truncation of optval.
+                auto colon = opt.find(':');
+                if (colon != std::string::npos) {
+                    extra_argv.push_back(opt.substr(colon + 1));
+                }
+            }
         }
     }
 
@@ -1146,27 +1200,6 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
         }
     }
 
-    if (!params.kv_overrides.empty()) {
-        params.kv_overrides.emplace_back();
-        params.kv_overrides.back().key[0] = 0;
-    }
-
-    // tensor_buft_overrides sentinel termination (mirrors upstream common/arg.cpp).
-    // Real entries are pushed during option parsing; here we pad/terminate so the
-    // model loader sees back().pattern == nullptr (GGML_ASSERT at common.cpp:1543)
-    // and so llama_params_fit has the placeholder slots it requires.
-    {
-        const size_t ntbo = llama_max_tensor_buft_overrides();
-        while (params.tensor_buft_overrides.size() < ntbo) {
-            params.tensor_buft_overrides.push_back({nullptr, nullptr});
-        }
-    }
-    // Terminate the draft tensor_buft_overrides list with a sentinel, mirroring
-    // the main-model handling above.
-    if (!params.speculative.draft.tensor_buft_overrides.empty()) {
-        params.speculative.draft.tensor_buft_overrides.push_back({nullptr, nullptr});
-    }
-
     // TODO: Add yarn
 
     if (!request->tensorsplit().empty()) {
@@ -1259,6 +1292,69 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
             params.sampling.grammar_triggers.push_back(std::move(trigger));
         }
     }
+
+    // Apply any raw upstream flags last so an explicit passthrough flag wins
+    // over the LocalAI-resolved field it maps to (e.g. --ctx-size beats
+    // context_size). This is the same parser llama-server itself uses.
+    if (!extra_argv.empty()) {
+        // common_params_parser_init resets a few fields for the SERVER example
+        // (n_parallel -> -1, use_color). Snapshot n_parallel so an unrelated
+        // passthrough flag can't silently clobber LocalAI's resolved value.
+        const int saved_n_parallel = params.n_parallel;
+
+        std::vector<char *> argv;
+        std::string prog = "llama-server";
+        argv.push_back(prog.data());
+        for (auto & a : extra_argv) {
+            argv.push_back(a.data());
+        }
+
+        // ctx_arg.params is a reference, so this overlays the given flags onto
+        // `params` in place. Returns false on a recoverable parse error (and
+        // self-restores params); may exit() on a hard error, exactly as
+        // passing the same bad flag to llama-server would.
+        if (!common_params_parse((int)argv.size(), argv.data(), params,
+                                 LLAMA_EXAMPLE_SERVER)) {
+            fprintf(stderr,
+                "[llama-cpp] failed to parse passthrough options; ignoring them\n");
+        }
+
+        // Restore n_parallel unless a passthrough flag explicitly set it
+        // (parser_init's reset sentinel for SERVER is -1).
+        if (params.n_parallel == -1) {
+            params.n_parallel = saved_n_parallel;
+        }
+    }
+
+    // Terminate/pad the override vectors only after BOTH the named-option loop
+    // and the generic passthrough (common_params_parse above) have pushed their
+    // real entries, so back() is the null sentinel the model loader asserts on.
+    // Running these before the passthrough let a passthrough flag (--cpu-moe,
+    // --override-tensor, --override-kv, ...) append a real entry after the
+    // sentinel: a GGML_ASSERT crash for tensor_buft_overrides, a silent drop for
+    // kv_overrides. Double-termination is harmless (the while is a no-op if the
+    // passthrough parse already padded; an extra trailing null is ignored).
+
+    if (!params.kv_overrides.empty()) {
+        params.kv_overrides.emplace_back();
+        params.kv_overrides.back().key[0] = 0;
+    }
+
+    // tensor_buft_overrides sentinel termination (mirrors upstream common/arg.cpp).
+    // Real entries are pushed during option parsing; here we pad/terminate so the
+    // model loader sees back().pattern == nullptr (GGML_ASSERT at common.cpp:1543)
+    // and so llama_params_fit has the placeholder slots it requires.
+    {
+        const size_t ntbo = llama_max_tensor_buft_overrides();
+        while (params.tensor_buft_overrides.size() < ntbo) {
+            params.tensor_buft_overrides.push_back({nullptr, nullptr});
+        }
+    }
+    // Terminate the draft tensor_buft_overrides list with a sentinel, mirroring
+    // the main-model handling above.
+    if (!params.speculative.draft.tensor_buft_overrides.empty()) {
+        params.speculative.draft.tensor_buft_overrides.push_back({nullptr, nullptr});
+    }
 }
 
 
diff --git a/docs/content/advanced/model-configuration.md b/docs/content/advanced/model-configuration.md
index 55e435b12..8092c162a 100644
--- a/docs/content/advanced/model-configuration.md
+++ b/docs/content/advanced/model-configuration.md
@@ -494,6 +494,39 @@ These llama.cpp options are passed through the `options:` array.
 | `direct_io` / `use_direct_io` | bool | `false` | Open the model with `O_DIRECT` (faster cold loads on NVMe; ignored if not supported). |
 | `verbosity` | int | `3` | llama.cpp internal log verbosity threshold. Higher = more verbose. |
 | `override_tensor` / `tensor_buft_overrides` | string | "" | Per-tensor buffer-type overrides for the main model. Format: `<tensor regex>=<buffer type>,<tensor regex>=<buffer type>,...`. Mirrors the existing `draft_override_tensor` syntax for the draft model. |
+| `cpu_moe` | bool | false | Keep all MoE expert weights of the main model on CPU (upstream `--cpu-moe`). Frees VRAM on large MoE models (DeepSeek, Qwen3 `*-A3B`). |
+| `n_cpu_moe` | int | 0 | Keep MoE expert weights of the first N main-model layers on CPU (upstream `--n-cpu-moe`). |
+
+#### Generic option passthrough
+
+Any `options:` entry whose name starts with `-` is forwarded **verbatim** to
+upstream llama.cpp's own `llama-server` argument parser. This means any flag the
+bundled llama.cpp supports works without LocalAI needing a dedicated option,
+even ones added after your LocalAI version was built. See the upstream
+[server flags reference](https://github.com/ggml-org/llama.cpp/blob/master/tools/server/README.md).
+
+Format mirrors the rest of the array - `--flag` for a boolean, or `--flag:value`
+for a flag that takes a value. Everything after the first `:` is the value, so
+embedded colons (e.g. `host:port`) are preserved:
+
+```yaml
+options:
+  - "--cpu-moe"                 # boolean flag
+  - "--n-cpu-moe:4"             # flag with a value
+  - "--override-tensor:exps=CPU"
+```
+
+Notes:
+
+- **Precedence:** passthrough flags are applied last, so an explicit flag
+  overrides the LocalAI option it maps to (e.g. `--ctx-size:8192` overrides
+  `context_size`).
+- **Power-user territory:** an invalid flag or value is rejected by the upstream
+  parser exactly as it would be by `llama-server`, which can fail model loading.
+  Prefer the named options above when one exists.
+- Flags that would terminate the process (such as `--help`, `--usage`,
+  `--version`, `--license`, `--list-devices`, `--cache-list`, and
+  `--completion*`) are ignored.
 
 ### Prompt Caching
 

From fae9f6356f1e447438d25a399c0345d32a51b002 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Thu, 25 Jun 2026 08:10:41 +0200
Subject: [PATCH 88/99] chore: :arrow_up: Update ServeurpersoCom/qwentts.cpp to
 `9dbe7ea26a01b30fccb117ae5e86807c1dc23d42` (#10499)

:arrow_up: Update ServeurpersoCom/qwentts.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
---
 backend/go/qwen3-tts-cpp/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/go/qwen3-tts-cpp/Makefile b/backend/go/qwen3-tts-cpp/Makefile
index 3311f93c3..c2bc6de34 100644
--- a/backend/go/qwen3-tts-cpp/Makefile
+++ b/backend/go/qwen3-tts-cpp/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
 
 # qwentts.cpp version
 QWEN3TTS_REPO?=https://github.com/ServeurpersoCom/qwentts.cpp
-QWEN3TTS_CPP_VERSION?=4536dcdce27c3764a93a06d6bf64026b124962f5
+QWEN3TTS_CPP_VERSION?=9dbe7ea26a01b30fccb117ae5e86807c1dc23d42
 SO_TARGET?=libgoqwen3ttscpp.so
 
 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF

From fe4f425fb5818e3e1a315fa056033dee764f7fb2 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Thu, 25 Jun 2026 08:10:59 +0200
Subject: [PATCH 89/99] fix: correct scheme/host on self-referential URLs
 behind an HTTPS reverse proxy (#10482) (#10504)

* fix(http): harden BaseURL proxy scheme/host detection

Split comma-separated X-Forwarded-Proto and honor the RFC 7239 Forwarded
header so generated links use https behind common reverse-proxy setups.

Refs #10482

Assisted-by: Claude:claude-opus-4-8
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(http): honor explicit external base URL in BaseURL

When _external_base_url is set in the request context it dictates the
origin (scheme+host+port); the proxy path prefix is still appended.

Refs #10482

Assisted-by: Claude:claude-opus-4-8
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(config): generalize LOCALAI_BASE_URL to ExternalBaseURL

LOCALAI_BASE_URL now sets a single instance-wide external base URL used
for OAuth callbacks and all self-referential links. A Pre middleware
stamps it into the request context for middleware.BaseURL.

Refs #10482

Assisted-by: Claude:claude-opus-4-8
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* docs: document LOCALAI_BASE_URL and reverse-proxy headers

Refs #10482

Assisted-by: Claude:claude-opus-4-8
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* test(http): cover parseForwarded edge cases; clarify base-url flag group

Adds direct unit coverage for quoted/malformed/multi-element Forwarded
headers and regroups the external base URL flag away from auth-only.

Refs #10482

Assisted-by: Claude:claude-opus-4-8
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
---
 core/cli/run.go                            |  11 +-
 core/config/application_config.go          |  12 +-
 core/http/app.go                           |  12 ++
 core/http/middleware/baseurl.go            |  57 ++++++++-
 core/http/middleware/baseurl_test.go       | 134 +++++++++++++++++++++
 core/http/routes/auth.go                   |   2 +-
 docs/content/advanced/reverse-proxy-tls.md |  20 +++
 7 files changed, 238 insertions(+), 10 deletions(-)

diff --git a/core/cli/run.go b/core/cli/run.go
index abb0cdbf1..fd7ba8cd9 100644
--- a/core/cli/run.go
+++ b/core/cli/run.go
@@ -140,7 +140,7 @@ type RunCMD struct {
 	OIDCIssuer           string `env:"LOCALAI_OIDC_ISSUER" help:"OIDC issuer URL for auto-discovery" group:"auth"`
 	OIDCClientID         string `env:"LOCALAI_OIDC_CLIENT_ID" help:"OIDC Client ID (auto-enables auth)" group:"auth"`
 	OIDCClientSecret     string `env:"LOCALAI_OIDC_CLIENT_SECRET" help:"OIDC Client Secret" group:"auth"`
-	AuthBaseURL          string `env:"LOCALAI_BASE_URL" help:"Base URL for OAuth callbacks (e.g. http://localhost:8080)" group:"auth"`
+	ExternalBaseURL      string `env:"LOCALAI_BASE_URL" help:"External base URL of this instance (e.g. https://localhost:8080). Used for OAuth callbacks and self-referential links (generated images/videos, job status). When unset, derived from X-Forwarded-Proto/Host or Forwarded headers." group:"api"`
 	AuthAdminEmail       string `env:"LOCALAI_ADMIN_EMAIL" help:"Email address to auto-promote to admin role" group:"auth"`
 	AuthRegistrationMode string `env:"LOCALAI_REGISTRATION_MODE" default:"open" help:"Registration mode: 'open' (default), 'approval', or 'invite' (invite code required)" group:"auth"`
 	DisableLocalAuth     bool   `env:"LOCALAI_DISABLE_LOCAL_AUTH" default:"false" help:"Disable local email/password registration and login (use with OAuth/OIDC-only setups)" group:"auth"`
@@ -503,9 +503,6 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
 			opts = append(opts, config.WithAuthOIDCClientID(r.OIDCClientID))
 			opts = append(opts, config.WithAuthOIDCClientSecret(r.OIDCClientSecret))
 		}
-		if r.AuthBaseURL != "" {
-			opts = append(opts, config.WithAuthBaseURL(r.AuthBaseURL))
-		}
 		if r.AuthAdminEmail != "" {
 			opts = append(opts, config.WithAuthAdminEmail(r.AuthAdminEmail))
 		}
@@ -523,6 +520,12 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
 		}
 	}
 
+	// Applied unconditionally: the external base URL governs all self-referential
+	// links (not just OAuth callbacks), so it must take effect even when auth is off.
+	if r.ExternalBaseURL != "" {
+		opts = append(opts, config.WithExternalBaseURL(r.ExternalBaseURL))
+	}
+
 	if idleWatchDog || busyWatchDog {
 		opts = append(opts, config.EnableWatchDog)
 		if idleWatchDog {
diff --git a/core/config/application_config.go b/core/config/application_config.go
index 87acd6bd5..1821a8441 100644
--- a/core/config/application_config.go
+++ b/core/config/application_config.go
@@ -49,6 +49,13 @@ type ApplicationConfig struct {
 	P2PNetworkID                  string
 	Federated                     bool
 
+	// ExternalBaseURL is the externally visible base URL of this instance
+	// (scheme+host[:port]), set via LOCALAI_BASE_URL. When non-empty it is
+	// authoritative for every self-referential URL LocalAI emits (OAuth
+	// callbacks, generated image/video links, async job StatusURLs),
+	// overriding proxy-header detection. Empty = derive from request headers.
+	ExternalBaseURL string
+
 	// DisableStats turns off per-request token tracking. By default the
 	// routing module's billing recorder runs in every mode (including
 	// no-auth single-user) so dashboards and `/api/usage` are immediately
@@ -196,7 +203,6 @@ type AuthConfig struct {
 	OIDCIssuer          string // OIDC issuer URL for auto-discovery (e.g. https://accounts.google.com)
 	OIDCClientID        string
 	OIDCClientSecret    string
-	BaseURL             string // for OAuth callback URLs (e.g. "http://localhost:8080")
 	AdminEmail          string // auto-promote to admin on login
 	RegistrationMode    string // "open", "approval" (default when empty), "invite"
 	DisableLocalAuth    bool   // disable local email/password registration and login
@@ -950,9 +956,9 @@ func WithAuthGitHubClientSecret(clientSecret string) AppOption {
 	}
 }
 
-func WithAuthBaseURL(baseURL string) AppOption {
+func WithExternalBaseURL(url string) AppOption {
 	return func(o *ApplicationConfig) {
-		o.Auth.BaseURL = baseURL
+		o.ExternalBaseURL = url
 	}
 }
 
diff --git a/core/http/app.go b/core/http/app.go
index 9ec0711fb..ee5cd99eb 100644
--- a/core/http/app.go
+++ b/core/http/app.go
@@ -149,6 +149,18 @@ func API(application *application.Application) (*echo.Echo, error) {
 	// Middleware - StripPathPrefix must be registered early as it uses Rewrite which runs before routing
 	e.Pre(httpMiddleware.StripPathPrefix())
 
+	// Stamp the configured external base URL into each request context so
+	// middleware.BaseURL can treat it as authoritative for self-referential
+	// links. Registered as Pre so it runs before routing and handlers.
+	if extBaseURL := application.ApplicationConfig().ExternalBaseURL; extBaseURL != "" {
+		e.Pre(func(next echo.HandlerFunc) echo.HandlerFunc {
+			return func(c echo.Context) error {
+				c.Set("_external_base_url", extBaseURL)
+				return next(c)
+			}
+		})
+	}
+
 	e.Pre(middleware.RemoveTrailingSlash())
 
 	if application.ApplicationConfig().MachineTag != "" {
diff --git a/core/http/middleware/baseurl.go b/core/http/middleware/baseurl.go
index a1e1844ae..84f72cf69 100644
--- a/core/http/middleware/baseurl.go
+++ b/core/http/middleware/baseurl.go
@@ -55,17 +55,70 @@ func BasePathPrefix(c echo.Context) string {
 // The returned URL is guaranteed to end with `/`.
 // The method should be used in conjunction with the StripPathPrefix middleware.
 func BaseURL(c echo.Context) string {
+	// An explicit external base URL (LOCALAI_BASE_URL) is authoritative for
+	// the origin. The proxy-derived path prefix is still appended so a
+	// reverse-proxy mount point keeps working. Trailing slashes are
+	// normalized via BasePathPrefix, which always starts and ends with "/".
+	if ext, ok := c.Get("_external_base_url").(string); ok && ext != "" {
+		return strings.TrimRight(ext, "/") + BasePathPrefix(c)
+	}
+
+	fwdProto, fwdHost := parseForwarded(c.Request().Header.Get("Forwarded"))
+
 	scheme := "http"
-	if c.Request().Header.Get("X-Forwarded-Proto") == "https" {
+	switch {
+	case c.Request().TLS != nil:
 		scheme = "https"
-	} else if c.Request().TLS != nil {
+	case strings.EqualFold(firstToken(c.Request().Header.Get("X-Forwarded-Proto")), "https"):
+		scheme = "https"
+	case strings.EqualFold(fwdProto, "https"):
 		scheme = "https"
 	}
 
 	host := c.Request().Host
 	if forwardedHost := c.Request().Header.Get("X-Forwarded-Host"); forwardedHost != "" {
 		host = forwardedHost
+	} else if fwdHost != "" {
+		host = fwdHost
 	}
 
 	return scheme + "://" + host + BasePathPrefix(c)
 }
+
+// firstToken returns the first comma-separated token of v, trimmed of spaces.
+// Reverse-proxy chains can emit X-Forwarded-Proto as "https,http"; only the
+// first hop (closest to the client) is meaningful for scheme detection.
+func firstToken(v string) string {
+	if i := strings.IndexByte(v, ','); i >= 0 {
+		v = v[:i]
+	}
+	return strings.TrimSpace(v)
+}
+
+// parseForwarded extracts the proto and host directives from the first element
+// of an RFC 7239 Forwarded header (e.g. `for=x;proto=https;host=h, for=y`).
+// Values may be quoted. Returns empty strings when absent or malformed so the
+// caller can fall through to other signals.
+func parseForwarded(header string) (proto, host string) {
+	if header == "" {
+		return "", ""
+	}
+	// Only the first element (closest proxy to the client) matters here.
+	if i := strings.IndexByte(header, ','); i >= 0 {
+		header = header[:i]
+	}
+	for _, directive := range strings.Split(header, ";") {
+		key, value, ok := strings.Cut(strings.TrimSpace(directive), "=")
+		if !ok {
+			continue
+		}
+		value = strings.Trim(strings.TrimSpace(value), `"`)
+		switch strings.ToLower(strings.TrimSpace(key)) {
+		case "proto":
+			proto = value
+		case "host":
+			host = value
+		}
+	}
+	return proto, host
+}
diff --git a/core/http/middleware/baseurl_test.go b/core/http/middleware/baseurl_test.go
index 4f6dbb1d1..6a132514b 100644
--- a/core/http/middleware/baseurl_test.go
+++ b/core/http/middleware/baseurl_test.go
@@ -135,4 +135,138 @@ var _ = Describe("BaseURL", func() {
 			Entry("missing leading slash", "evil"),
 		)
 	})
+
+	Context("scheme detection hardening", func() {
+		It("treats comma-separated X-Forwarded-Proto as https when first token is https", func() {
+			app := echo.New()
+			actualURL := ""
+			app.GET("/x", func(c echo.Context) error {
+				actualURL = BaseURL(c)
+				return nil
+			})
+			req := httptest.NewRequest("GET", "/x", nil)
+			req.Header.Set("X-Forwarded-Proto", "https,http")
+			rec := httptest.NewRecorder()
+			app.ServeHTTP(rec, req)
+			Expect(actualURL).To(Equal("https://example.com/"))
+		})
+
+		It("derives https from the RFC 7239 Forwarded proto directive", func() {
+			app := echo.New()
+			actualURL := ""
+			app.GET("/x", func(c echo.Context) error {
+				actualURL = BaseURL(c)
+				return nil
+			})
+			req := httptest.NewRequest("GET", "/x", nil)
+			req.Header.Set("Forwarded", "for=192.0.2.1;proto=https;host=proxy.example")
+			rec := httptest.NewRecorder()
+			app.ServeHTTP(rec, req)
+			Expect(actualURL).To(Equal("https://proxy.example/"))
+		})
+
+		It("prefers X-Forwarded-Host over the Forwarded host directive", func() {
+			app := echo.New()
+			actualURL := ""
+			app.GET("/x", func(c echo.Context) error {
+				actualURL = BaseURL(c)
+				return nil
+			})
+			req := httptest.NewRequest("GET", "/x", nil)
+			req.Header.Set("X-Forwarded-Host", "xfh.example")
+			req.Header.Set("Forwarded", "host=fwd.example;proto=https")
+			rec := httptest.NewRecorder()
+			app.ServeHTTP(rec, req)
+			Expect(actualURL).To(Equal("https://xfh.example/"))
+		})
+	})
+
+	Context("explicit external base URL override", func() {
+		It("uses the configured origin over conflicting forwarded headers", func() {
+			app := echo.New()
+			actualURL := ""
+			app.GET("/x", func(c echo.Context) error {
+				c.Set("_external_base_url", "https://192.168.0.13:34567")
+				actualURL = BaseURL(c)
+				return nil
+			})
+			req := httptest.NewRequest("GET", "/x", nil)
+			req.Header.Set("X-Forwarded-Proto", "http")
+			req.Header.Set("X-Forwarded-Host", "internal:8080")
+			rec := httptest.NewRecorder()
+			app.ServeHTTP(rec, req)
+			Expect(actualURL).To(Equal("https://192.168.0.13:34567/"))
+		})
+
+		It("combines the configured origin with a detected path prefix", func() {
+			app := echo.New()
+			actualURL := ""
+			app.GET("/hello", func(c echo.Context) error {
+				c.Set("_original_path", "/localai/hello")
+				c.Set("_external_base_url", "https://ext.example")
+				actualURL = BaseURL(c)
+				return nil
+			})
+			req := httptest.NewRequest("GET", "/hello", nil)
+			rec := httptest.NewRecorder()
+			app.ServeHTTP(rec, req)
+			Expect(actualURL).To(Equal("https://ext.example/localai/"))
+		})
+
+		It("ignores an empty override", func() {
+			app := echo.New()
+			actualURL := ""
+			app.GET("/x", func(c echo.Context) error {
+				c.Set("_external_base_url", "")
+				actualURL = BaseURL(c)
+				return nil
+			})
+			req := httptest.NewRequest("GET", "/x", nil)
+			rec := httptest.NewRecorder()
+			app.ServeHTTP(rec, req)
+			Expect(actualURL).To(Equal("http://example.com/"))
+		})
+	})
+
+	Context("parseForwarded helper", func() {
+		It("parses unquoted proto and host", func() {
+			proto, host := parseForwarded("for=192.0.2.1;proto=https;host=h.example")
+			Expect(proto).To(Equal("https"))
+			Expect(host).To(Equal("h.example"))
+		})
+
+		It("strips quotes around values", func() {
+			proto, host := parseForwarded(`proto="https";host="h.example"`)
+			Expect(proto).To(Equal("https"))
+			Expect(host).To(Equal("h.example"))
+		})
+
+		It("uses only the first element of a multi-element header", func() {
+			proto, host := parseForwarded("proto=https;host=first.example, proto=http;host=second.example")
+			Expect(proto).To(Equal("https"))
+			Expect(host).To(Equal("first.example"))
+		})
+
+		It("returns empty strings for an empty header", func() {
+			proto, host := parseForwarded("")
+			Expect(proto).To(BeEmpty())
+			Expect(host).To(BeEmpty())
+		})
+
+		It("skips directives without a value", func() {
+			proto, host := parseForwarded("proto;host=h.example")
+			Expect(proto).To(BeEmpty())
+			Expect(host).To(Equal("h.example"))
+		})
+	})
+
+	Context("firstToken helper", func() {
+		It("returns the whole trimmed string when there is no comma", func() {
+			Expect(firstToken("  https  ")).To(Equal("https"))
+		})
+
+		It("returns the first trimmed token when there is a comma", func() {
+			Expect(firstToken("https , http")).To(Equal("https"))
+		})
+	})
 })
diff --git a/core/http/routes/auth.go b/core/http/routes/auth.go
index ef8372fff..b4144e0a1 100644
--- a/core/http/routes/auth.go
+++ b/core/http/routes/auth.go
@@ -268,7 +268,7 @@ func RegisterAuthRoutes(e *echo.Echo, app *application.Application) {
 	// Set up OAuth manager when any OAuth/OIDC provider is configured
 	if appConfig.Auth.GitHubClientID != "" || appConfig.Auth.OIDCClientID != "" {
 		oauthMgr, err := auth.NewOAuthManager(
-			appConfig.Auth.BaseURL,
+			appConfig.ExternalBaseURL,
 			auth.OAuthParams{
 				GitHubClientID:     appConfig.Auth.GitHubClientID,
 				GitHubClientSecret: appConfig.Auth.GitHubClientSecret,
diff --git a/docs/content/advanced/reverse-proxy-tls.md b/docs/content/advanced/reverse-proxy-tls.md
index 24af55c62..d36a64ae4 100644
--- a/docs/content/advanced/reverse-proxy-tls.md
+++ b/docs/content/advanced/reverse-proxy-tls.md
@@ -14,6 +14,26 @@ When running LocalAI behind a TLS termination reverse proxy, the Web UI may fail
 
 LocalAI uses the `X-Forwarded-Proto` HTTP header to determine the protocol used by clients. When this header is set to `https`, LocalAI will generate HTTPS URLs for static assets in the Web UI.
 
+## Running behind a reverse proxy (HTTPS / subpath)
+
+LocalAI does not terminate TLS itself, so HTTPS is provided by a reverse
+proxy in front of it. Self-referential links (generated image and video
+URLs, async job status URLs, OAuth callbacks) need the externally visible
+scheme, host and port.
+
+LocalAI determines these in this order:
+
+1. `LOCALAI_BASE_URL` - if set, it is authoritative for the origin. Set it to
+   the externally visible base URL, e.g. `LOCALAI_BASE_URL=https://localai.example.com`
+   or `https://192.168.0.13:34567`. Recommended whenever links come back with
+   the wrong scheme or host.
+2. Otherwise, the `X-Forwarded-Proto` and `X-Forwarded-Host` headers (or the
+   RFC 7239 `Forwarded` header) sent by the proxy. Ensure your proxy forwards
+   `X-Forwarded-Proto: https`.
+
+A reverse-proxy subpath mount is supported via `X-Forwarded-Prefix`; it is
+appended to `LOCALAI_BASE_URL` when both are present.
+
 ## Required Headers
 
 Your reverse proxy must forward these headers to LocalAI:

From 93d6255de393f10bcec662623f6b4636f4973b71 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Thu, 25 Jun 2026 08:11:17 +0200
Subject: [PATCH 90/99] chore: :arrow_up: Update ggml-org/llama.cpp to
 `8be759e6f70d629638a7eb70db3824cbdcea370b` (#10501)

:arrow_up: Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
---
 backend/cpp/llama-cpp/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/cpp/llama-cpp/Makefile b/backend/cpp/llama-cpp/Makefile
index 24f1f215d..f00fad518 100644
--- a/backend/cpp/llama-cpp/Makefile
+++ b/backend/cpp/llama-cpp/Makefile
@@ -1,5 +1,5 @@
 
-LLAMA_VERSION?=be4a6a63eb2b848e19c277bdcf2bd399e8af76d9
+LLAMA_VERSION?=8be759e6f70d629638a7eb70db3824cbdcea370b
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
 
 CMAKE_ARGS?=

From f1e50713216c4badb532a3e65061196838138ba8 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Thu, 25 Jun 2026 08:11:31 +0200
Subject: [PATCH 91/99] chore: :arrow_up: Update leejet/stable-diffusion.cpp to
 `8caa3f908ae6d4a4bef531e73b9a969f266a3d1f` (#10493)

:arrow_up: Update leejet/stable-diffusion.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
---
 backend/go/stablediffusion-ggml/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/go/stablediffusion-ggml/Makefile b/backend/go/stablediffusion-ggml/Makefile
index d161a5b47..7a9917ea8 100644
--- a/backend/go/stablediffusion-ggml/Makefile
+++ b/backend/go/stablediffusion-ggml/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
 
 # stablediffusion.cpp (ggml)
 STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
-STABLEDIFFUSION_GGML_VERSION?=f440ad9c29dd8bc34e5d1f4b863832b96d6ea05f
+STABLEDIFFUSION_GGML_VERSION?=8caa3f908ae6d4a4bef531e73b9a969f266a3d1f
 
 CMAKE_ARGS+=-DGGML_MAX_NAME=128
 

From 693e3eec050cd507f4369800a1843ba0bb41448b Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Thu, 25 Jun 2026 08:11:52 +0200
Subject: [PATCH 92/99] chore(model gallery): :robot: add 1 new models via
 gallery agent (#10505)

chore(model gallery): :robot: add new models via gallery agent

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
---
 gallery/index.yaml | 73 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)

diff --git a/gallery/index.yaml b/gallery/index.yaml
index 52f23a771..25a6e781d 100644
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -1,4 +1,77 @@
 ---
+- name: "gemmable-4-12b-mtp"
+  url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
+  urls:
+    - https://huggingface.co/Mia-AiLab/Gemmable-4-12B-MTP-GGUF
+  description: |
+    ## Gemmable 4 12B
+
+    Gemmable 4 12B is a GGUF export of Gemma 4 12B fine-tuned on Fable-5 style
+    reasoning and assistant traces.
+
+    ## Highlights
+
+      - Base model: `google/gemma-4-12B`
+      - Format: GGUF
+      - Training style: Fable-5 style reasoning and assistant traces
+      - Distribution: fp16 GGUF plus matching assistant GGUFs for each quant
+      - Intended use: local inference, coding, reasoning, and assistant workflows
+
+    ## How to use
+
+    ### llama.cpp
+
+    Standard load:
+
+    ```bash
+    llama-server -m "gemmable-4-12b-fp16.gguf"
+    ```
+
+    Speculative / draft-MTP load:
+
+    ```bash
+    llama-server -m "gemmable-4-12b-Q4_K_M.gguf" \
+      --spec-draft-model "gemmable-4-12b-Q4_K_M-mtp.gguf" \
+      --spec-type draft-mtp \
+      --spec-draft-n-max 4
+    ```
+
+    Use the matching fp16 or quantized main file with its `-mtp` companion.
+
+    ### LM Studio
+
+    1.  Search this repo, download target + mtp file.
+    2.  Load target.
+    3.  Load settings → Speculative Decoding → select mtp file file.
+
+    (Requires LM Studio with am17an's PR merged or custom llama.cpp runtime. As of 2026-05, mainline LM Studio runtime doesn't yet have `draft-mtp` for Gemma-4 — track upstream merge.)
+
+    ## GGUF / local inference notes
+
+    ...
+  tags:
+    - llm
+    - gguf
+    - reasoning
+  icon: https://storage.ko-fi.com/cdn/kofi6.png
+  overrides:
+    backend: llama-cpp
+    function:
+      automatic_tool_parsing_fallback: true
+      grammar:
+        disable: true
+    known_usecases:
+      - chat
+    options:
+      - use_jinja:true
+    parameters:
+      model: llama-cpp/models/Gemmable-4-12B-MTP-GGUF/gemmable-4-12b-Q4_K_M-mtp.gguf
+    template:
+      use_tokenizer_template: true
+  files:
+    - filename: llama-cpp/models/Gemmable-4-12B-MTP-GGUF/gemmable-4-12b-Q4_K_M-mtp.gguf
+      sha256: 217dc0ed177ecc733f801a851c3e3854cf1b17a1f86cd5430c0a7f82d93027bc
+      uri: https://huggingface.co/Mia-AiLab/Gemmable-4-12B-MTP-GGUF/resolve/main/gemmable-4-12b-Q4_K_M-mtp.gguf
 - name: "lfm2.5-1.2b-instruct"
   url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
   urls:

From 3a87d9e48f1f9af12f0ff52786fa9720fc1be9e0 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Thu, 25 Jun 2026 15:46:19 +0200
Subject: [PATCH 93/99] feat(vllm): macOS/Metal support via vllm-metal (MLX)
 (#10489)

* feat(vllm): macOS/Metal support via vllm-metal (MLX)

Add an additive Apple-Silicon path to the existing vllm Python backend so
vLLM runs on macOS via vllm-metal (github.com/vllm-project/vllm-metal).

Spike outcome (proven on a real M4 / macOS 26.5, Qwen3-0.6B):
- vllm-metal registers through vLLM's platform-plugin entry point
  (metal -> vllm_metal:register); MetalPlatform activates and runs on the
  GPU through MLX.
- LocalAI's backend.py is UNCHANGED: AsyncEngineArgs(...) ->
  AsyncLLMEngine.from_engine_args transparently resolves to vLLM 0.23's v1
  AsyncLLM MLX engine, and async generate produced correct output.
- backend.py is NOT touched: its only empty_cache() call is CUDA-only
  (guarded by torch.cuda.is_available()), so the benign shutdown-only
  "Allocator for mps is not a DeviceAllocator" noise comes from vLLM's
  internal EngineCore teardown, not from our code.

Changes (all gated behind a darwin condition; Linux/CUDA/ROCm/Intel paths
are byte-for-byte unchanged):
- install.sh: darwin branch forces PYTHON_VERSION=3.12 (vllm-metal
  requirement), creates/activates LocalAI's managed venv via ensureVenv,
  then reproduces vllm-metal's installer INTO that venv (build vLLM 0.23.0
  from the release source tarball against requirements/cpu.txt, then install
  the prebuilt vllm-metal wheel from its latest GitHub release), and runs
  runProtogen. installRequirements is skipped on darwin.
- backend-matrix.yml: add a vllm includeDarwin entry (mps, python).
- index.yaml: add metal capability + concrete metal-vllm /
  metal-vllm-development child entries mirroring the metal-kitten-tts
  template.

Version coupling: vllm-metal pins vLLM 0.23.0, equal to LocalAI's current
vllm pin. Bumping vllm must be coordinated with a supporting vllm-metal
release; documented in install.sh and requirements-cublas13-after.txt.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:opus-4.8 [Claude Code]

* chore(vllm): track the darwin vllm-metal pin via the autobumper

The Apple Silicon build pinned vLLM 0.23.0 as a hidden string in install.sh
while floating the vllm-metal wheel on releases/latest - the two could drift
apart silently. Make both a tracked, reproducible pair (VLLM_METAL_VERSION +
VLLM_VERSION), fetch the wheel by tag, and add .github/bump_vllm_metal.sh wired
into bump_deps.yaml. It tracks vllm-project/vllm-metal (not vllm/vllm latest),
reading the coupled vLLM source version from vllm-metal's own installer, and
opens a bump PR - mirroring the existing bump_vllm_wheel.sh for the cu130 wheel.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:opus-4.8 [Claude Code]

* chore(vllm): derive the darwin vLLM version, drop the second pin

Follow-up: VLLM_VERSION was still a hardcoded string duplicating what
VLLM_METAL_VERSION already determines. Derive it at install time from
vllm-metal's own installer (vllm_v=) at the pinned tag - one source of truth,
no second value to drift. The bumper now touches only VLLM_METAL_VERSION;
the derivation is immutable per tag, so builds stay reproducible.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:opus-4.8 [Claude Code]

* fix(vllm): fetch the vllm-metal wheel without the GitHub API

The darwin build resolved the wheel URL via api.github.com, whose
unauthenticated rate limit (60/hr per IP) 403s on shared macOS runners
(observed after the 9-min vLLM source build). Construct the release-asset
download URL deterministically from the pinned tag and the cp312/arm64 wheel
name instead - no API call, no rate limit. Verified the URL resolves (200).

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:opus-4.8 [Claude Code]

* fix(vllm): fail Score cleanly when the engine returns no prompt_logprobs

Audit of the Score path against vllm-metal (MLX on macOS): the engine accepts
SamplingParams(prompt_logprobs=1) but returns an all-None prompt_logprobs list
rather than computing it, so scoring is not supported there. The old guard
treated the truthy [None] list as valid and silently scored every candidate as
0. Detect the all-None case and return UNIMPLEMENTED instead. No-op on
Linux/CUDA, which populate real entries.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:opus-4.8 [Claude Code]

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
---
 .github/backend-matrix.yml                    |  7 ++
 .github/bump_vllm_metal.sh                    | 55 +++++++++++
 .github/workflows/bump_deps.yaml              | 36 +++++++
 backend/index.yaml                            | 12 +++
 backend/python/vllm/backend.py                | 11 ++-
 backend/python/vllm/install.sh                | 96 ++++++++++++++++++-
 .../vllm/requirements-cublas13-after.txt      |  3 +
 7 files changed, 216 insertions(+), 4 deletions(-)
 create mode 100755 .github/bump_vllm_metal.sh

diff --git a/.github/backend-matrix.yml b/.github/backend-matrix.yml
index f34921db9..5ad6d9e16 100644
--- a/.github/backend-matrix.yml
+++ b/.github/backend-matrix.yml
@@ -4974,6 +4974,13 @@ includeDarwin:
   - backend: "kitten-tts"
     tag-suffix: "-metal-darwin-arm64-kitten-tts"
     build-type: "mps"
+  # vLLM on Apple Silicon via vllm-metal (MLX). The install is custom
+  # (backend/python/vllm/install.sh has a darwin branch); lang stays python so
+  # backend_build_darwin.yml drives it through build-darwin-python-backend ->
+  # scripts/build/python-darwin.sh, which runs the backend's install.sh.
+  - backend: "vllm"
+    tag-suffix: "-metal-darwin-arm64-vllm"
+    build-type: "mps"
   - backend: "trl"
     tag-suffix: "-metal-darwin-arm64-trl"
     build-type: "mps"
diff --git a/.github/bump_vllm_metal.sh b/.github/bump_vllm_metal.sh
new file mode 100755
index 000000000..f842680d5
--- /dev/null
+++ b/.github/bump_vllm_metal.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+# Bump the single vllm-metal pin (VLLM_METAL_VERSION) in the vLLM backend's
+# darwin (Apple Silicon) install path. The macOS/Metal build
+# (backend/python/vllm/install.sh, Darwin branch) installs vllm-metal, which is
+# version-locked to a specific vLLM source release. install.sh derives that vLLM
+# version at build time from vllm-metal's own installer (`vllm_v=`) at the pinned
+# tag, so there is only ONE value to bump here -- mirroring bump_vllm_wheel.sh,
+# which bumps the Linux cu130 wheel pin.
+#
+# This deliberately tracks vllm-project/vllm-metal, NOT vllm-project/vllm: the
+# darwin build can only use the exact vLLM version vllm-metal supports, so it may
+# lag the Linux pin (requirements-cublas13-after.txt) until vllm-metal catches up.
+set -xe
+REPO=$1   # vllm-project/vllm-metal
+FILE=$2   # backend/python/vllm/install.sh
+VAR=$3    # VLLM_METAL_VERSION (used for the workflow's output file names)
+
+if [ -z "$FILE" ] || [ -z "$REPO" ] || [ -z "$VAR" ]; then
+    echo "usage: $0 <repo> <install-file> <var-name>" >&2
+    exit 1
+fi
+
+# vllm-metal ships frequent dev releases, all flagged as non-prerelease, so
+# /releases/latest returns the newest one (with its cp312 wheel asset).
+LATEST_TAG=$(curl -sS -H "Accept: application/vnd.github+json" \
+    "https://api.github.com/repos/$REPO/releases/latest" \
+    | python3 -c "import json,sys; print(json.load(sys.stdin)['tag_name'])")
+
+# The coupled vLLM source version lives in vllm-metal's installer at that tag.
+NEW_VLLM_VERSION=$(curl -fsSL \
+    "https://raw.githubusercontent.com/$REPO/$LATEST_TAG/install.sh" \
+    | grep -oE 'vllm_v="[0-9]+\.[0-9]+\.[0-9]+"' | head -1 | cut -d'"' -f2)
+
+if [ -z "$LATEST_TAG" ] || [ -z "$NEW_VLLM_VERSION" ]; then
+    echo "Could not resolve vllm-metal tag ($LATEST_TAG) or its vllm_v ($NEW_VLLM_VERSION)." >&2
+    exit 1
+fi
+
+set +e
+CURRENT_TAG=$(grep -oE 'VLLM_METAL_VERSION="[^"]*"' "$FILE" | head -1 | cut -d'"' -f2)
+set -e
+
+# Rewrite the single pin. install.sh derives VLLM_VERSION from this tag at build
+# time, so there is nothing else to touch. peter-evans/create-pull-request opens
+# no PR on a clean tree, so a no-op rewrite (already current) is safe.
+sed -i "$FILE" \
+    -e "s|VLLM_METAL_VERSION=\"[^\"]*\"|VLLM_METAL_VERSION=\"$LATEST_TAG\"|"
+
+if [ -z "$CURRENT_TAG" ]; then
+    echo "Could not find VLLM_METAL_VERSION=\"...\" in $FILE." >&2
+    exit 0
+fi
+
+echo "vllm-metal ${CURRENT_TAG} -> ${LATEST_TAG} (builds vLLM ${NEW_VLLM_VERSION}): https://github.com/$REPO/releases/tag/${LATEST_TAG}" >> "${VAR}_message.txt"
+echo "${LATEST_TAG}" >> "${VAR}_commit.txt"
diff --git a/.github/workflows/bump_deps.yaml b/.github/workflows/bump_deps.yaml
index aa4b21af7..a2c37881f 100644
--- a/.github/workflows/bump_deps.yaml
+++ b/.github/workflows/bump_deps.yaml
@@ -154,3 +154,39 @@ jobs:
           branch: "update/VLLM_VERSION"
           body: ${{ steps.bump.outputs.message }}
           signoff: true
+
+  bump-vllm-metal:
+    # The darwin (Apple Silicon) vLLM build installs vllm-metal, which is locked
+    # to a specific vLLM source release. install.sh pins both VLLM_METAL_VERSION
+    # (the wheel release) and VLLM_VERSION (the vLLM it builds against); this job
+    # tracks vllm-project/vllm-metal and rewrites both atomically. Separate from
+    # bump-vllm-wheel because darwin follows vllm-metal, not vllm/vllm latest.
+    if: github.repository == 'mudler/LocalAI'
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v7
+      - name: Bump vllm-metal pin 🔧
+        id: bump
+        run: |
+          bash .github/bump_vllm_metal.sh vllm-project/vllm-metal backend/python/vllm/install.sh VLLM_METAL_VERSION
+          {
+            echo 'message<<EOF'
+            cat "VLLM_METAL_VERSION_message.txt"
+            echo EOF
+          } >> "$GITHUB_OUTPUT"
+          {
+            echo 'commit<<EOF'
+            cat "VLLM_METAL_VERSION_commit.txt"
+            echo EOF
+          } >> "$GITHUB_OUTPUT"
+          rm -rfv VLLM_METAL_VERSION_message.txt VLLM_METAL_VERSION_commit.txt
+      - name: Create Pull Request
+        uses: peter-evans/create-pull-request@v8
+        with:
+          token: ${{ secrets.UPDATE_BOT_TOKEN }}
+          push-to-fork: ci-forks/LocalAI
+          commit-message: ':arrow_up: Update vllm-project/vllm-metal (darwin)'
+          title: 'chore: :arrow_up: Update vllm-metal (darwin) to `${{ steps.bump.outputs.commit }}`'
+          branch: "update/VLLM_METAL_VERSION"
+          body: ${{ steps.bump.outputs.message }}
+          signoff: true
diff --git a/backend/index.yaml b/backend/index.yaml
index 381aa073b..4a7a07d82 100644
--- a/backend/index.yaml
+++ b/backend/index.yaml
@@ -645,6 +645,7 @@
     nvidia-cuda-13: "cuda13-vllm"
     nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-vllm"
     cpu: "cpu-vllm"
+    metal: "metal-vllm"
 - &sglang
   name: "sglang"
   license: apache-2.0
@@ -2929,6 +2930,17 @@
     nvidia-cuda-13: "cuda13-vllm-development"
     nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-vllm-development"
     cpu: "cpu-vllm-development"
+    metal: "metal-vllm-development"
+- !!merge <<: *vllm
+  name: "metal-vllm"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-vllm"
+  mirrors:
+    - localai/localai-backends:latest-metal-darwin-arm64-vllm
+- !!merge <<: *vllm
+  name: "metal-vllm-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-vllm"
+  mirrors:
+    - localai/localai-backends:master-metal-darwin-arm64-vllm
 - !!merge <<: *vllm
   name: "cuda12-vllm"
   uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-vllm"
diff --git a/backend/python/vllm/backend.py b/backend/python/vllm/backend.py
index a38849137..1e93f26e2 100644
--- a/backend/python/vllm/backend.py
+++ b/backend/python/vllm/backend.py
@@ -457,9 +457,14 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                     except Exception:
                         pass
 
-                if last_output is None or not getattr(last_output, "prompt_logprobs", None):
-                    context.set_code(grpc.StatusCode.INTERNAL)
-                    context.set_details("vLLM did not return prompt_logprobs")
+                _pl = getattr(last_output, "prompt_logprobs", None) if last_output is not None else None
+                # Some engines accept the prompt_logprobs request but return a
+                # list of all-None entries instead of computing them (observed
+                # with vllm-metal's MLX backend on macOS). Treat that as
+                # unsupported rather than silently scoring every candidate as 0.
+                if not _pl or all(e is None for e in _pl):
+                    context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+                    context.set_details("This backend did not return prompt_logprobs; scoring is unsupported on this engine (e.g. vllm-metal / MLX on macOS).")
                     return backend_pb2.ScoreResponse()
 
                 prompt_logprobs = last_output.prompt_logprobs
diff --git a/backend/python/vllm/install.sh b/backend/python/vllm/install.sh
index 320ef6772..85c1e97b0 100755
--- a/backend/python/vllm/install.sh
+++ b/backend/python/vllm/install.sh
@@ -43,6 +43,24 @@ if [ "x${BUILD_PROFILE}" == "xcublas13" ]; then
     EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-best-match"
 fi
 
+# Apple Silicon (Metal/MLX) via vllm-metal.
+# vllm-metal (github.com/vllm-project/vllm-metal) brings vLLM to macOS on Apple
+# Silicon: it registers through vLLM's platform-plugin entry point
+# (metal -> vllm_metal:register), MetalPlatform activates, and the vLLM v1
+# AsyncLLM engine runs on the GPU through MLX. LocalAI's backend.py is UNCHANGED
+# on darwin — AsyncEngineArgs(...) -> AsyncLLMEngine.from_engine_args transparently
+# resolves to the MLX engine (proven on a real M4 / macOS 26.5 against Qwen3-0.6B).
+#
+# vllm-metal REQUIRES Python 3.12, so force the portable CPython before the venv
+# is created (ensureVenv reads PYTHON_VERSION/PYTHON_PATCH/PY_STANDALONE_TAG).
+# The patch + standalone tag mirror the l4t13 cp312 pin — a known-good
+# python-build-standalone release that also ships an aarch64-apple-darwin asset.
+if [ "$(uname -s)" = "Darwin" ]; then
+    PYTHON_VERSION="3.12"
+    PYTHON_PATCH="12"
+    PY_STANDALONE_TAG="20251120"
+fi
+
 # JetPack 7 / L4T arm64 vllm + torch wheels come straight from PyPI now
 # (torch 2.11+ ships aarch64 + cu130 manylinux wheels and vllm 0.20+ ships
 # an aarch64 wheel pinned to that torch). They're cp312-only, so bump the
@@ -57,11 +75,87 @@ if [ "x${BUILD_PROFILE}" == "xl4t13" ]; then
     PY_STANDALONE_TAG="20251120"
 fi
 
+# ===================== Apple Silicon (Metal/MLX) =====================
+# Reproduce vllm-metal's upstream installer
+# (curl -fsSL https://raw.githubusercontent.com/vllm-project/vllm-metal/main/install.sh)
+# but INTO LocalAI's managed venv (ensureVenv) instead of a throwaway
+# ~/.venv-vllm-metal, so the backend integrates with LocalAI's venv lifecycle
+# (portable CPython, _makeVenvPortable relocation, runtime activation). The
+# normal CUDA/CPU installRequirements is skipped on darwin — there is no
+# macOS/arm64 vLLM wheel on PyPI; vLLM is built from source and the MLX engine
+# is layered on by the vllm-metal wheel.
+if [ "$(uname -s)" = "Darwin" ]; then
+    # Create/activate the portable 3.12 venv. On darwin USE_PIP=true and
+    # PORTABLE_PYTHON=true (set by scripts/build/python-darwin.sh), so this is a
+    # `python -m venv` based, relocatable venv.
+    ensureVenv
+
+    # vllm-metal's installer drives everything through `uv`: building vLLM from
+    # the CPU requirements needs `--index-strategy unsafe-best-match` (mixes the
+    # pytorch CPU channel with PyPI), a flag plain pip does not have. The darwin
+    # venv is pip-based, so bootstrap uv into it. uv honours $VIRTUAL_ENV (set by
+    # libbackend's _activateVenv) and installs into THIS venv — same pattern the
+    # intel branch below relies on.
+    pip install uv
+
+    # The ONLY darwin version pin -- AUTO-BUMPED by .github/bump_vllm_metal.sh,
+    # which tracks vllm-project/vllm-metal releases (NOT vllm/vllm latest). Keep
+    # it as a plain double-quoted assignment on its own line so the bumper's sed
+    # can rewrite it. Darwin therefore follows vllm-metal and can lag the Linux
+    # vllm pin (requirements-cublas13-after.txt, bumped independently against
+    # vllm/vllm) until vllm-metal supports a newer vLLM.
+    VLLM_METAL_VERSION="v0.3.0.dev20260622062346"
+
+    # The coupled vLLM source version is whatever this vllm-metal release builds
+    # against -- it declares it in its own installer as `vllm_v=`. Derive it from
+    # the PINNED tag rather than hardcoding a second value that could drift. The
+    # tag is immutable, so this stays reproducible across rebuilds.
+    VLLM_VERSION=$(curl -fsSL "https://raw.githubusercontent.com/vllm-project/vllm-metal/${VLLM_METAL_VERSION}/install.sh" \
+        | grep -oE 'vllm_v="[0-9]+\.[0-9]+\.[0-9]+"' | head -n1 | cut -d'"' -f2)
+    if [ -z "${VLLM_VERSION}" ]; then
+        echo "ERROR: could not derive the vLLM version from vllm-metal ${VLLM_METAL_VERSION}" >&2
+        exit 1
+    fi
+    echo "vllm-metal ${VLLM_METAL_VERSION} builds against vLLM ${VLLM_VERSION}"
+
+    _vllm_src=$(mktemp -d)
+    trap 'rm -rf "${_vllm_src}"' EXIT
+    pushd "${_vllm_src}"
+        # 1) Build vLLM ${VLLM_VERSION} from the release source tarball against
+        #    the CPU requirements. vllm-metal layers its MLX platform plugin on
+        #    top of this exact build.
+        curl -fsSL -o "vllm-${VLLM_VERSION}.tar.gz" \
+            "https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}.tar.gz"
+        tar -xzf "vllm-${VLLM_VERSION}.tar.gz"
+        pushd "vllm-${VLLM_VERSION}"
+            uv pip install -r requirements/cpu.txt --index-strategy unsafe-best-match
+            # -Wno-parentheses: clang on macOS treats one of vLLM's C++ warnings
+            # as an error without it (matches the upstream installer's CXXFLAGS).
+            CXXFLAGS="-Wno-parentheses" uv pip install .
+        popd
+    popd
+
+    # 2) Install the prebuilt vllm-metal wheel for the PINNED release. It pulls
+    #    mlx / mlx-metal as deps and registers the `metal` platform plugin that
+    #    backend.py resolves to at engine-init time. Build the release-asset URL
+    #    deterministically (tag + the cp312/arm64 wheel name) rather than querying
+    #    api.github.com, whose unauthenticated rate limit (60/hr per IP) 403s on
+    #    shared CI runners. The wheel version is the tag without its leading 'v'.
+    _metal_wheel="vllm_metal-${VLLM_METAL_VERSION#v}-cp312-cp312-macosx_11_0_arm64.whl"
+    _metal_wheel_url="https://github.com/vllm-project/vllm-metal/releases/download/${VLLM_METAL_VERSION}/${_metal_wheel}"
+    echo "Installing vllm-metal wheel: ${_metal_wheel_url}"
+    uv pip install "${_metal_wheel_url}"
+
+    # Generate the gRPC stubs (backend_pb2*). installRequirements normally does
+    # this via runProtogen at the end; we skipped installRequirements on darwin,
+    # so call it explicitly here.
+    runProtogen
+
 # Intel XPU has no upstream-published vllm wheels, so we always build vllm
 # from source against torch-xpu and replace the default triton with
 # triton-xpu (matching torch 2.11). Mirrors the upstream procedure:
 # https://github.com/vllm-project/vllm/blob/main/docs/getting_started/installation/gpu.xpu.inc.md
-if [ "x${BUILD_TYPE}" == "xintel" ]; then
+elif [ "x${BUILD_TYPE}" == "xintel" ]; then
     # Hide requirements-intel-after.txt so installRequirements doesn't
     # try `pip install vllm` (would either fail or grab a non-XPU wheel).
     _intel_after="${backend_dir}/requirements-intel-after.txt"
diff --git a/backend/python/vllm/requirements-cublas13-after.txt b/backend/python/vllm/requirements-cublas13-after.txt
index 62c486139..c04a25ab1 100644
--- a/backend/python/vllm/requirements-cublas13-after.txt
+++ b/backend/python/vllm/requirements-cublas13-after.txt
@@ -4,4 +4,7 @@
 # instead — the cublas13 case in install.sh adds --index-strategy=unsafe-best-match
 # so uv consults this index alongside PyPI.
 --extra-index-url https://wheels.vllm.ai/0.23.0/cu130
+# VERSION COUPLING: darwin/Apple-Silicon builds use vllm-metal (see install.sh),
+# which pins this exact vLLM version. Bumping vllm here means coordinating with a
+# vllm-metal release that supports the new version, or macOS/Metal builds break.
 vllm==0.23.0

From 4ac67d255dcddb6b42b53a48652078058facc9a4 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Thu, 25 Jun 2026 15:47:03 +0200
Subject: [PATCH 94/99] feat: single-build ggml CPU_ALL_VARIANTS for llama-cpp
 + turboquant (x86/arm64/apple) (#10497)

* feat(llama-cpp): single x86 CPU build via ggml CPU_ALL_VARIANTS

Replace the per-microarch avx/avx2/avx512/fallback multi-binary build on
x86 with a single grpc-server plus the dlopen-able libggml-cpu-*.so set
that ggml's backend registry selects at runtime by probing host CPU
features. One build instead of four, broader microarch coverage (adds
alderlake AVX-VNNI, zen4 AVX512-BF16, sapphirerapids AMX), and the
shell-side /proc/cpuinfo probing in run.sh goes away.

Build/link notes:
- CPU_ALL_VARIANTS requires GGML_BACKEND_DL + BUILD_SHARED_LIBS=ON, so
  ggml/llama become shared objects. SHARED_LIBS is now a make variable
  (default OFF) so the override survives the recursive sub-make into the
  VARIANT build dir instead of being re-clobbered by the base flags.
- The cpu-all target also builds "--target ggml": the per-microarch
  backends are runtime-dlopened, not link deps, so they only compile via
  ggml's add_dependencies().
- hw_grpc_proto is pinned STATIC. Under BUILD_SHARED_LIBS=ON it would
  otherwise become a DSO referencing hidden-visibility symbols in the
  static libprotobuf.a, which fails to link ("hidden symbol ... is
  referenced by DSO"). Keeping it static links gRPC/protobuf into the
  executable while only ggml/llama stay shared, so no PIC or base-image
  change is required.
- package.sh bundles the libggml-*.so set into package/lib; ggml finds
  them by scanning the bundled ld.so directory (/proc/self/exe), which
  run.sh launches from.

Scope: x86 only. arm64/darwin keep the single fallback build. The
ik-llama-cpp / turboquant forks and the other ggml C++ backends are
unchanged; the same recipe applies but is out of scope here.

Validated with a full docker build plus a live inference smoke test:
the model loads, ggml selects the AVX512_BF16 variant on a Zen-class
host, and tokens generate correctly.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* feat(llama-cpp,turboquant): extend CPU_ALL_VARIANTS to arm64 + turboquant

- llama-cpp: x86 AND arm64 now use the single llama-cpp-cpu-all build
  (only hipblas keeps the fallback build). ggml's arm64 variant table
  (armv8.x / armv9.x, plus apple_m* on darwin) is selected at runtime.
- turboquant: same recipe via a turboquant-cpu-all target. turboquant
  copies backend/cpp/llama-cpp's CMakeLists.txt + Makefile per flavor, so
  the hw_grpc_proto STATIC fix and the SHARED_LIBS / EXTRA_CMAKE_ARGS
  make-vars are inherited; the target just passes SHARED_LIBS=ON, the DL
  flags and --target ggml through, then collects the .so set. run.sh and
  package.sh updated to ship/select turboquant-cpu-all.
- Makefile lib-collection find now also matches *.dylib (for the darwin
  build, which emits dylibs rather than .so).

ik-llama-cpp is intentionally left unchanged: its pinned ggml has no
CPU_ALL_VARIANTS support and its IQK kernels require AVX2, so the
per-microarch dynamic backend set does not apply.

Scope still excludes the darwin packaging wiring (separate change).

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* feat(llama-cpp,turboquant): arm64 gcc-14 for SME variants + darwin cpu-all packaging

- arm64: ggml CPU_ALL_VARIANTS builds armv9.2 SME variants whose -march=...+sme
  is rejected by the Ubuntu 24.04 default gcc-13. Build the arm64 variants with
  gcc-14 (installed in the compile step). The host only selects a variant it
  actually supports at runtime, but every variant must still compile.
- darwin: scripts/build/llama-cpp-darwin.sh builds llama-cpp-cpu-all instead of
  the fallback binary, keeps Metal (GGML_METAL stays ON; --target ggml also builds
  ggml-metal). The per-microarch libggml-cpu-*.dylib are placed in the package
  root next to the binary (darwin has no bundled ld.so, so ggml's executable-dir
  scan looks there), while the other shared dylibs go in lib/ for DYLD_LIBRARY_PATH.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* fix(llama-cpp-darwin): distribute ggml backends by suffix (.so root, .dylib lib)

ggml emits its loadable backends (per-microarch CPU variants, metal, blas) with a
.so suffix even on darwin, while the core libraries (ggml-base/ggml/llama/
llama-common/mtmd) use .dylib. Split the distribution by suffix: .so DL backends
go in the package root for ggml's executable-directory scan, .dylib core libs go
in lib/ for DYLD_LIBRARY_PATH. The previous .dylib name-pattern matched none of the
variants.

Verified on an M4: ggml loads the apple_m4 CPU variant (SME=1) and Metal, model
loads and generates correct tokens.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* fix(llama-cpp,turboquant): only CPU_ALL_VARIANTS for pure-CPU builds, GPU uses fallback

The previous gate sent every non-hipblas build through llama-cpp-cpu-all, so the
GPU image builds (cublas, sycl_f16/f32, vulkan, nvidia l4t) compiled the whole CPU
microarch variant matrix on top of their already-huge GPU backend - blowing the
build time (the sycl job was only 59% done after 2h11m) - and the arm64 l4t build
failed at `apt-get install gcc-14` (exit 100) on the Jetson base.

Gate on an empty BUILD_TYPE instead: only the pure CPU image (build-type: '' in
.github/backend-matrix.yml) builds the CPU_ALL_VARIANTS set; every GPU build gets a
single fallback CPU grpc-server, since the accelerator does the compute. This also
confines the arm64 gcc-14 step (needed for the armv9.2 SME variants) to the CPU
build, away from the GPU base images.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* docs(llama-cpp): correct run.sh comment for arm64/darwin cpu-all

arm64 and darwin CPU images now also ship llama-cpp-cpu-all (not fallback-only);
only GPU images ship fallback-only. Fix the stale comment to match.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
---
 .docker/llama-cpp-compile.sh         | 32 ++++++++++++++++---------
 .docker/turboquant-compile.sh        | 22 ++++++++++-------
 backend/cpp/llama-cpp/CMakeLists.txt |  9 +++++--
 backend/cpp/llama-cpp/Makefile       | 36 ++++++++++++++++++++++++++--
 backend/cpp/llama-cpp/package.sh     | 16 +++++++++++++
 backend/cpp/llama-cpp/run.sh         | 26 +++++---------------
 backend/cpp/turboquant/Makefile      | 23 ++++++++++++++++++
 backend/cpp/turboquant/package.sh    |  9 +++++++
 backend/cpp/turboquant/run.sh        | 25 ++++---------------
 scripts/build/llama-cpp-darwin.sh    | 28 +++++++++++++++-------
 10 files changed, 154 insertions(+), 72 deletions(-)

diff --git a/.docker/llama-cpp-compile.sh b/.docker/llama-cpp-compile.sh
index bbc9aa21f..647a1c448 100755
--- a/.docker/llama-cpp-compile.sh
+++ b/.docker/llama-cpp-compile.sh
@@ -17,19 +17,29 @@ if [[ -n "${CUDA_DOCKER_ARCH:-}" ]]; then
   rm -rf /LocalAI/backend/cpp/llama-cpp-*-build
 fi
 
-if [ "${TARGETARCH}" = "arm64" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then
-  cd /LocalAI/backend/cpp/llama-cpp
-  make llama-cpp-fallback
-  make llama-cpp-grpc
-  make llama-cpp-rpc-server
+cd /LocalAI/backend/cpp/llama-cpp
+if [ -z "${BUILD_TYPE:-}" ]; then
+  # Pure CPU image (BUILD_TYPE empty): one build with ggml CPU_ALL_VARIANTS replaces the
+  # per-microarch binaries (x86: avx/avx2/avx512/fallback; arm64: armv8.x/armv9.x). ggml
+  # dlopens the best libggml-cpu-*.so at runtime by probing host CPU features.
+  #
+  # arm64: the CPU_ALL_VARIANTS table includes armv9.2 SME variants whose -march=...+sme is
+  # rejected by the Ubuntu 24.04 default gcc-13. gcc-14 accepts it, so build the arm64
+  # variants with it (the host never *selects* SME unless it has it, but every variant must
+  # still compile).
+  if [ "${TARGETARCH}" = "arm64" ]; then
+    apt-get update -qq && apt-get install -y -qq gcc-14 g++-14
+    export CC=gcc-14 CXX=g++-14
+  fi
+  make llama-cpp-cpu-all
 else
-  cd /LocalAI/backend/cpp/llama-cpp
-  make llama-cpp-avx
-  make llama-cpp-avx2
-  make llama-cpp-avx512
+  # GPU build (cublas/hipblas/sycl/vulkan/...): the accelerator does the compute, so a
+  # single fallback CPU build is enough - no per-microarch CPU variants needed. (This also
+  # keeps the heavy GPU backend compile from also building the whole CPU variant matrix,
+  # and avoids the gcc-14 apt step on GPU base images such as nvidia l4t.)
   make llama-cpp-fallback
-  make llama-cpp-grpc
-  make llama-cpp-rpc-server
 fi
+make llama-cpp-grpc
+make llama-cpp-rpc-server
 
 ccache -s || true
diff --git a/.docker/turboquant-compile.sh b/.docker/turboquant-compile.sh
index 7468bc1a7..ca6cf2690 100755
--- a/.docker/turboquant-compile.sh
+++ b/.docker/turboquant-compile.sh
@@ -19,17 +19,21 @@ fi
 
 cd /LocalAI/backend/cpp/turboquant
 
-if [ "${TARGETARCH}" = "arm64" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then
-  make turboquant-fallback
-  make turboquant-grpc
-  make turboquant-rpc-server
+if [ -z "${BUILD_TYPE:-}" ]; then
+  # Pure CPU image: one ggml CPU_ALL_VARIANTS build replaces the per-microarch binaries.
+  # arm64: the armv9.2 SME variants need gcc-14 (gcc-13 rejects +sme).
+  if [ "${TARGETARCH}" = "arm64" ]; then
+    apt-get update -qq && apt-get install -y -qq gcc-14 g++-14
+    export CC=gcc-14 CXX=g++-14
+  fi
+  make turboquant-cpu-all
 else
-  make turboquant-avx
-  make turboquant-avx2
-  make turboquant-avx512
+  # GPU build (cublas/hipblas/sycl/vulkan/...): single fallback CPU build, the accelerator
+  # does the compute. Keeps the GPU compile from also building the CPU variant matrix and
+  # avoids the gcc-14 apt step on GPU base images such as nvidia l4t.
   make turboquant-fallback
-  make turboquant-grpc
-  make turboquant-rpc-server
 fi
+make turboquant-grpc
+make turboquant-rpc-server
 
 ccache -s || true
diff --git a/backend/cpp/llama-cpp/CMakeLists.txt b/backend/cpp/llama-cpp/CMakeLists.txt
index cb1f5298c..bdf20802a 100644
--- a/backend/cpp/llama-cpp/CMakeLists.txt
+++ b/backend/cpp/llama-cpp/CMakeLists.txt
@@ -50,8 +50,13 @@ add_custom_command(
         "${hw_proto}"
       DEPENDS "${hw_proto}")
 
-# hw_grpc_proto
-add_library(hw_grpc_proto
+# hw_grpc_proto: force STATIC. Under the CPU_ALL_VARIANTS build BUILD_SHARED_LIBS=ON
+# (ggml/llama become shared), which would otherwise make this glue library a DSO. As a
+# DSO it references the hidden-visibility symbols in the static libprotobuf.a, which the
+# linker cannot satisfy ("hidden symbol ... in libprotobuf.a is referenced by DSO").
+# Keeping it STATIC links protobuf/gRPC directly into the grpc-server executable while
+# only ggml/llama stay shared. No effect on the static variants (already BUILD_SHARED_LIBS=OFF).
+add_library(hw_grpc_proto STATIC
   ${hw_grpc_srcs}
   ${hw_grpc_hdrs}
   ${hw_proto_srcs}
diff --git a/backend/cpp/llama-cpp/Makefile b/backend/cpp/llama-cpp/Makefile
index f00fad518..b0fa0423c 100644
--- a/backend/cpp/llama-cpp/Makefile
+++ b/backend/cpp/llama-cpp/Makefile
@@ -10,8 +10,16 @@ TARGET?=--target grpc-server
 JOBS?=$(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 1)
 ARCH?=$(shell uname -m)
 
-# Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
-CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF -DLLAMA_CURL=OFF
+# Shared libs default to OFF: we link static gRPC and the avx/avx2/avx512/fallback
+# variants are fully static. The CPU_ALL_VARIANTS build flips SHARED_LIBS=ON (ggml/llama
+# become shared so the dynamic CPU backends work; gRPC stays static via its imported
+# targets). SHARED_LIBS is a make variable, not an appended -D, so it survives the
+# recursive sub-make into the VARIANT build dir (which re-parses this Makefile) instead
+# of being re-clobbered by a second -DBUILD_SHARED_LIBS=OFF. EXTRA_CMAKE_ARGS is the hook
+# the CPU_ALL_VARIANTS target uses to inject -DGGML_BACKEND_DL/-DGGML_CPU_ALL_VARIANTS.
+SHARED_LIBS?=OFF
+EXTRA_CMAKE_ARGS?=
+CMAKE_ARGS+=-DBUILD_SHARED_LIBS=$(SHARED_LIBS) -DLLAMA_CURL=OFF $(EXTRA_CMAKE_ARGS)
 
 CURRENT_MAKEFILE_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
 ifeq ($(NATIVE),false)
@@ -120,6 +128,30 @@ llama-cpp-fallback: llama.cpp
 	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) VARIANT="llama-cpp-fallback-build" build-llama-cpp-grpc-server
 	cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-fallback-build/grpc-server llama-cpp-fallback
 
+# Single-build CPU backend using ggml's CPU_ALL_VARIANTS. Produces ONE grpc-server
+# plus a set of dlopen-able libggml-cpu-*.so (sandybridge/haswell/skylakex/...) that
+# ggml's backend registry selects from at runtime by probing host CPU features.
+# Replaces the avx/avx2/avx512/fallback multi-binary build on x86.
+#
+# CPU_ALL_VARIANTS requires GGML_BACKEND_DL, which requires BUILD_SHARED_LIBS=ON, so we
+# pass SHARED_LIBS=ON and the DL flags as make variables (NOT pre-expanded into the
+# CMAKE_ARGS env string): command-line make variables propagate through every recursive
+# sub-make, so the deepest VARIANT-dir build computes BUILD_SHARED_LIBS=ON consistently.
+# Only ggml/llama go shared - gRPC is found via its static imported targets, so the
+# grpc-server binary keeps static gRPC and only dynamically links ggml.
+#
+# TARGET adds "ggml": the per-microarch backends are runtime-dlopened, not link deps of
+# grpc-server, so they only build because each is an add_dependencies() of the ggml target.
+llama-cpp-cpu-all: llama.cpp
+	cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-cpu-all-build
+	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-cpu-all-build purge
+	$(info ${GREEN}I llama-cpp build info:cpu-all-variants${RESET})
+	$(MAKE) SHARED_LIBS=ON EXTRA_CMAKE_ARGS="-DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON" TARGET="--target grpc-server --target ggml" VARIANT="llama-cpp-cpu-all-build" build-llama-cpp-grpc-server
+	cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-cpu-all-build/grpc-server llama-cpp-cpu-all
+	rm -rf ggml-shared-libs && mkdir -p ggml-shared-libs
+	find $(CURRENT_MAKEFILE_DIR)/../llama-cpp-cpu-all-build/llama.cpp/build \( -name '*.so*' -o -name '*.dylib' \) -exec cp -av {} ggml-shared-libs/ \;
+	@echo "Collected ggml shared backends:" && ls -la ggml-shared-libs/
+
 llama-cpp-grpc: llama.cpp
 	cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build
 	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build purge
diff --git a/backend/cpp/llama-cpp/package.sh b/backend/cpp/llama-cpp/package.sh
index d1897e6be..5d2b18c5b 100755
--- a/backend/cpp/llama-cpp/package.sh
+++ b/backend/cpp/llama-cpp/package.sh
@@ -14,6 +14,22 @@ mkdir -p $CURDIR/package/lib
 cp -avrf $CURDIR/llama-cpp-* $CURDIR/package/
 cp -rfv $CURDIR/run.sh $CURDIR/package/
 
+# Bundle the ggml shared backends produced by the CPU_ALL_VARIANTS build (libggml-base.so,
+# libggml.so, libllama.so and the per-microarch libggml-cpu-*.so), all into package/lib.
+#
+# Two distinct resolution mechanisms both land here:
+#   - NEEDED deps (libggml-base/libggml/libllama): resolved by the dynamic linker via the
+#     LD_LIBRARY_PATH=$CURDIR/lib that run.sh exports.
+#   - The per-microarch libggml-cpu-*.so are NOT linked; ggml *discovers* them at runtime by
+#     scanning the executable's own directory (readlink /proc/self/exe). run.sh launches via
+#     the bundled $CURDIR/lib/ld.so, so /proc/self/exe -> .../lib/ld.so and ggml scans lib/.
+#     That is why the variants must sit in lib/ (next to ld.so), not just on the link path.
+# No-op on builds (arm64/darwin) that don't produce the all-variants set.
+if [ -d "$CURDIR/ggml-shared-libs" ]; then
+    echo "Bundling ggml shared backends (CPU_ALL_VARIANTS)..."
+    cp -avf $CURDIR/ggml-shared-libs/*.so* $CURDIR/package/lib/
+fi
+
 # Detect architecture and copy appropriate libraries
 if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
     # x86_64 architecture
diff --git a/backend/cpp/llama-cpp/run.sh b/backend/cpp/llama-cpp/run.sh
index 553faeb27..db8498f4b 100755
--- a/backend/cpp/llama-cpp/run.sh
+++ b/backend/cpp/llama-cpp/run.sh
@@ -12,26 +12,12 @@ grep -e "flags" /proc/cpuinfo | head -1
 
 BINARY=llama-cpp-fallback
 
-if grep -q -e "\savx\s" /proc/cpuinfo ; then
-	echo "CPU:    AVX    found OK"
-	if [ -e $CURDIR/llama-cpp-avx ]; then
-		BINARY=llama-cpp-avx
-	fi
-fi
-
-if grep -q -e "\savx2\s" /proc/cpuinfo ; then
-	echo "CPU:    AVX2   found OK"
-	if [ -e $CURDIR/llama-cpp-avx2 ]; then
-		BINARY=llama-cpp-avx2
-	fi
-fi
-
-# Check avx 512
-if grep -q -e "\savx512f\s" /proc/cpuinfo ; then
-	echo "CPU:    AVX512F found OK"
-	if [ -e $CURDIR/llama-cpp-avx512 ]; then
-		BINARY=llama-cpp-avx512
-	fi
+# CPU images (x86, arm64, darwin) ship a single llama-cpp-cpu-all built with ggml
+# CPU_ALL_VARIANTS: ggml's backend registry dlopens the best libggml-cpu-*.so for this
+# host, so no shell-side AVX probing. GPU images (cublas/sycl/vulkan/hipblas) ship only
+# llama-cpp-fallback (the accelerator does the compute), so fall back to it when absent.
+if [ -e $CURDIR/llama-cpp-cpu-all ]; then
+	BINARY=llama-cpp-cpu-all
 fi
 
 if [ -n "$LLAMACPP_GRPC_SERVERS" ]; then
diff --git a/backend/cpp/turboquant/Makefile b/backend/cpp/turboquant/Makefile
index 98f5e4978..a32adf0b6 100644
--- a/backend/cpp/turboquant/Makefile
+++ b/backend/cpp/turboquant/Makefile
@@ -65,6 +65,29 @@ turboquant-avx:
 turboquant-fallback:
 	$(call turboquant-build,fallback,-DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off,--target grpc-server)
 
+# Single-build CPU backend via ggml CPU_ALL_VARIANTS (mirrors llama-cpp-cpu-all).
+# turboquant reuses backend/cpp/llama-cpp's CMakeLists.txt (hw_grpc_proto STATIC) and
+# Makefile (SHARED_LIBS make-var + EXTRA_CMAKE_ARGS), so this passes the same overrides
+# through to the copied build: SHARED_LIBS=ON, the DL flags, and --target ggml (which
+# pulls in the per-microarch libggml-cpu-*.so via ggml's add_dependencies). The .so set
+# is collected for package.sh to bundle into package/lib.
+turboquant-cpu-all:
+	rm -rf $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build
+	cp -rf $(LLAMA_CPP_DIR) $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build
+	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build purge
+	bash $(CURRENT_MAKEFILE_DIR)/patch-grpc-server.sh $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build/grpc-server.cpp
+	$(info $(GREEN)I turboquant build info:cpu-all-variants$(RESET))
+	LLAMA_REPO=$(LLAMA_REPO) LLAMA_VERSION=$(TURBOQUANT_VERSION) \
+	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build llama.cpp
+	bash $(CURRENT_MAKEFILE_DIR)/apply-patches.sh $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build/llama.cpp $(PATCHES_DIR)
+	SHARED_LIBS=ON EXTRA_CMAKE_ARGS="-DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON" TARGET="--target grpc-server --target ggml" \
+	LLAMA_REPO=$(LLAMA_REPO) LLAMA_VERSION=$(TURBOQUANT_VERSION) \
+	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build grpc-server
+	cp -rfv $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build/grpc-server turboquant-cpu-all
+	rm -rf ggml-shared-libs && mkdir -p ggml-shared-libs
+	find $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build/llama.cpp/build \( -name '*.so*' -o -name '*.dylib' \) -exec cp -av {} ggml-shared-libs/ \;
+	@echo "Collected ggml shared backends:" && ls -la ggml-shared-libs/
+
 turboquant-grpc:
 	$(call turboquant-build,grpc,-DGGML_RPC=ON -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off,--target grpc-server --target rpc-server)
 
diff --git a/backend/cpp/turboquant/package.sh b/backend/cpp/turboquant/package.sh
index d5402fc31..c4559a68d 100755
--- a/backend/cpp/turboquant/package.sh
+++ b/backend/cpp/turboquant/package.sh
@@ -14,6 +14,15 @@ mkdir -p $CURDIR/package/lib
 cp -avrf $CURDIR/turboquant-* $CURDIR/package/
 cp -rfv $CURDIR/run.sh $CURDIR/package/
 
+# Bundle the ggml shared backends from the CPU_ALL_VARIANTS build into package/lib. ggml
+# discovers the per-microarch libggml-cpu-*.so by scanning the executable directory, which
+# (via the bundled lib/ld.so that run.sh launches through) resolves to lib/. See the
+# matching comment in backend/cpp/llama-cpp/package.sh. No-op on the fallback/ROCm builds.
+if [ -d "$CURDIR/ggml-shared-libs" ]; then
+    echo "Bundling ggml shared backends (CPU_ALL_VARIANTS)..."
+    cp -avf $CURDIR/ggml-shared-libs/*.so* $CURDIR/package/lib/
+fi
+
 # Detect architecture and copy appropriate libraries
 if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
     # x86_64 architecture
diff --git a/backend/cpp/turboquant/run.sh b/backend/cpp/turboquant/run.sh
index b0239e237..cd41a0f7f 100755
--- a/backend/cpp/turboquant/run.sh
+++ b/backend/cpp/turboquant/run.sh
@@ -12,26 +12,11 @@ grep -e "flags" /proc/cpuinfo | head -1
 
 BINARY=turboquant-fallback
 
-if grep -q -e "\savx\s" /proc/cpuinfo ; then
-	echo "CPU:    AVX    found OK"
-	if [ -e $CURDIR/turboquant-avx ]; then
-		BINARY=turboquant-avx
-	fi
-fi
-
-if grep -q -e "\savx2\s" /proc/cpuinfo ; then
-	echo "CPU:    AVX2   found OK"
-	if [ -e $CURDIR/turboquant-avx2 ]; then
-		BINARY=turboquant-avx2
-	fi
-fi
-
-# Check avx 512
-if grep -q -e "\savx512f\s" /proc/cpuinfo ; then
-	echo "CPU:    AVX512F found OK"
-	if [ -e $CURDIR/turboquant-avx512 ]; then
-		BINARY=turboquant-avx512
-	fi
+# x86/arm64 ship a single turboquant-cpu-all built with ggml CPU_ALL_VARIANTS: ggml's
+# backend registry dlopens the best libggml-cpu-*.so for this host, so no shell-side
+# probing. ROCm ships only turboquant-fallback, so fall back to it when cpu-all is absent.
+if [ -e $CURDIR/turboquant-cpu-all ]; then
+	BINARY=turboquant-cpu-all
 fi
 
 if [ -n "$LLAMACPP_GRPC_SERVERS" ]; then
diff --git a/scripts/build/llama-cpp-darwin.sh b/scripts/build/llama-cpp-darwin.sh
index 9bdf36875..adec88f04 100644
--- a/scripts/build/llama-cpp-darwin.sh
+++ b/scripts/build/llama-cpp-darwin.sh
@@ -6,10 +6,11 @@ IMAGE_NAME="${IMAGE_NAME:-localai/llama-cpp-darwin}"
 
 pushd backend/cpp/llama-cpp
 
-# make llama-cpp-avx && \
-# make llama-cpp-avx2 && \
-# make llama-cpp-avx512 && \
-make llama-cpp-fallback && \
+# Single build via ggml CPU_ALL_VARIANTS: one binary plus the per-microarch Apple/arm
+# dylibs (apple_m1/m2_m3/m4, armv8.x) that ggml selects at runtime. GGML_METAL stays ON
+# and --target ggml also builds ggml-metal (via add_dependencies), so the Metal GPU
+# backend is still produced as a loadable libggml-metal.dylib.
+make llama-cpp-cpu-all && \
 make llama-cpp-grpc && \
 make llama-cpp-rpc-server
 
@@ -19,13 +20,24 @@ mkdir -p build/darwin
 mkdir -p backend-images
 mkdir -p build/darwin/lib
 
-# cp -rf backend/cpp/llama-cpp/llama-cpp-avx build/darwin/
-# cp -rf backend/cpp/llama-cpp/llama-cpp-avx2 build/darwin/
-# cp -rf backend/cpp/llama-cpp/llama-cpp-avx512 build/darwin/
-cp -rf backend/cpp/llama-cpp/llama-cpp-fallback build/darwin/
+cp -rf backend/cpp/llama-cpp/llama-cpp-cpu-all build/darwin/
 cp -rf backend/cpp/llama-cpp/llama-cpp-grpc build/darwin/
 cp -rf backend/cpp/llama-cpp/llama-cpp-rpc-server build/darwin/
 
+# Distribute the shared ggml/llama libraries from the CPU_ALL_VARIANTS build. Unlike the
+# old fully-static fallback build, these have @rpath install names, so the otool loop below
+# (which only copies deps that exist on disk) will not pick them up. The split is by suffix:
+#  - ggml emits its loadable backends (per-microarch CPU variants, metal, blas) with a .so
+#    suffix EVEN ON DARWIN. These go in the package ROOT next to the binary, because darwin
+#    run.sh execs the binary directly (no bundled ld.so) so ggml's executable-directory
+#    scan looks there.
+#  - the core libraries (libggml-base/libggml/libllama/libllama-common/libmtmd) use the
+#    platform .dylib suffix and are NEEDED deps; they go in lib/, resolved at load time via
+#    the DYLD_LIBRARY_PATH=lib that run.sh exports. -a preserves the version symlinks.
+SHLIBS=backend/cpp/llama-cpp/ggml-shared-libs
+cp -a $SHLIBS/*.so build/darwin/
+cp -a $SHLIBS/*.dylib build/darwin/lib/
+
 # Set default additional libs only for Darwin on M chips (arm64)
 if [[ "$(uname -s)" == "Darwin" && "$(uname -m)" == "arm64" ]]; then
     ADDITIONAL_LIBS=${ADDITIONAL_LIBS:-$(ls /opt/homebrew/Cellar/protobuf/**/lib/libutf8_validity*.dylib 2>/dev/null)}

From 79783120dda0c8e24ecf39b2ac87c5dfbffaac84 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Thu, 25 Jun 2026 15:48:23 +0200
Subject: [PATCH 95/99] fix(config): gate parallel-slot default on per-device
 VRAM too (#10485) (#10507)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The first #10485 fix (#10494) made the Blackwell physical-batch boost
per-device/context-aware, which neutralized the big compute-buffer OOM, but
the reporter's 2x16 GiB consumer Blackwell still OOM'd. Tracing the post-fix
log: the model now loads its weights, builds the main context and warms up
fine, and dies only on the *last* allocation — the MTP draft context's 800 MiB
KV cache on the tighter device.

#10411 changed only two defaults: the physical batch (now gated) and a
VRAM-scaled parallel-slot count. The KV cache is unified (n_ctx_seq == full
context proves slots share the budget, so parallel doesn't multiply KV), but
n_seq_max=4 still adds per-slot compute-graph / context-checkpoint / output
scratch. On a device packed ~99% by a 27B model spanning both cards, that
overhead is the few-hundred-MiB straw — which is why reverting #10411 (and only
#10411) restores a working load.

Gate the parallel-slot default on the same per-device headroom predicate as the
batch boost: when a large context already fills a single card
(largeContextForDevice), keep n_parallel=1. A user running one big-context model
that barely fits across two consumer GPUs is not serving four concurrent
tenants. Small contexts and large unified-memory devices (GB10) keep full
concurrency. Applied on both the single-host path and the distributed router.

Also make the auto-tuning visible and reversible (the debugging here needed
DEBUG logs and a git bisect):

  - Log the effective performance-relevant runtime options at INFO once per
    model load ("effective runtime tuning …": context, n_batch, n_gpu_layers,
    parallel, flash_attention, f16) so an admin can see what will run and pin or
    override any value in the model YAML.
  - LOCALAI_DISABLE_HARDWARE_DEFAULTS=true skips the hardware auto-tuning
    entirely (mirrors LOCALAI_DISABLE_GUESSING) for stock llama.cpp behavior.


Assisted-by: Claude:opus-4.8 [Claude Code]

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
---
 core/config/hardware_defaults.go              | 111 ++++++++++++++----
 core/config/hardware_defaults_test.go         |  43 +++++++
 core/services/nodes/router.go                 |   9 +-
 .../nodes/router_hardware_internal_test.go    |   8 ++
 docs/content/features/text-generation.md      |  10 ++
 pkg/model/initializers.go                     |  30 +++++
 pkg/model/initializers_internal_test.go       |  19 +++
 7 files changed, 205 insertions(+), 25 deletions(-)
 create mode 100644 pkg/model/initializers_internal_test.go

diff --git a/core/config/hardware_defaults.go b/core/config/hardware_defaults.go
index b4e0e74c6..81bc9fc7f 100644
--- a/core/config/hardware_defaults.go
+++ b/core/config/hardware_defaults.go
@@ -2,6 +2,7 @@ package config
 
 import (
 	"fmt"
+	"os"
 	"strconv"
 	"strings"
 
@@ -9,6 +10,19 @@ import (
 	"github.com/mudler/xlog"
 )
 
+// HardwareDefaultsDisabled reports whether hardware auto-tuning is turned off via
+// LOCALAI_DISABLE_HARDWARE_DEFAULTS=true (mirrors LOCALAI_DISABLE_GUESSING). When
+// set, ApplyHardwareDefaults and the distributed router's node tuning are
+// skipped entirely, so the backend runs llama.cpp's stock batch/parallel
+// behavior — an escape hatch for users who want predictable, un-tuned defaults.
+func HardwareDefaultsDisabled() bool {
+	// Read directly like the sibling LOCALAI_DISABLE_GUESSING toggle in
+	// hooks_llamacpp.go: these config-layer heuristic switches run deep in the
+	// defaults pipeline with no ApplicationConfig in scope to plumb through.
+	//nolint:forbidigo // config-layer heuristic toggle, mirrors LOCALAI_DISABLE_GUESSING
+	return os.Getenv("LOCALAI_DISABLE_HARDWARE_DEFAULTS") == "true"
+}
+
 // Hardware-driven model-config defaults.
 //
 // This sits alongside the other config overriders (ApplyInferenceDefaults for
@@ -103,17 +117,36 @@ func PhysicalBatchForContext(g GPU, ctx int) int {
 	if !g.IsNVIDIABlackwell() {
 		return DefaultPhysicalBatch
 	}
-	if ctx <= 0 {
-		ctx = DefaultContextSize
-	}
 	if g.VRAM == 0 {
 		return DefaultPhysicalBatch
 	}
-	extra := uint64(ctx) * uint64(BlackwellPhysicalBatch-DefaultPhysicalBatch) * computeBufferBytesPerCell
-	if extra <= g.VRAM/blackwellBatchHeadroomDivisor {
-		return BlackwellPhysicalBatch
+	if largeContextForDevice(g, ctx) {
+		return DefaultPhysicalBatch
 	}
-	return DefaultPhysicalBatch
+	return BlackwellPhysicalBatch
+}
+
+// largeContextForDevice reports whether the given context is large relative to
+// the per-device VRAM ceiling — the shared "tight single-model fit" signal that
+// suppresses BOTH throughput-oriented defaults (the Blackwell batch boost and
+// the concurrency slot count). It sizes the extra compute-buffer scratch a
+// raised batch would need at this context (which grows ~n_ubatch * n_ctx and
+// is allocated per device) and asks whether it overflows a fraction of the
+// device VRAM; when it does, the device has no headroom to spend on throughput
+// and the conservative defaults must hold (issue #10485).
+//
+// g.VRAM must be the PER-DEVICE ceiling (the smallest device on a multi-GPU
+// host). VRAM 0 (unknown) is treated as not-large so detection gaps don't
+// silently disable the defaults.
+func largeContextForDevice(g GPU, ctx int) bool {
+	if g.VRAM == 0 {
+		return false
+	}
+	if ctx <= 0 {
+		ctx = DefaultContextSize
+	}
+	extra := uint64(ctx) * uint64(BlackwellPhysicalBatch-DefaultPhysicalBatch) * computeBufferBytesPerCell
+	return extra > g.VRAM/blackwellBatchHeadroomDivisor
 }
 
 // IsManagedPhysicalBatch reports whether n is a value PhysicalBatch assigns.
@@ -152,17 +185,50 @@ func DefaultParallelSlots(g GPU) int {
 	}
 }
 
-// EnsureParallelOption appends a VRAM-scaled "parallel:N" backend option when the
-// model doesn't already set one (and the GPU warrants concurrency). Returns the
-// possibly-extended options. Shared by the single-host config path
-// (ApplyHardwareDefaults) and the distributed router (per selected node).
-func EnsureParallelOption(opts []string, gpu GPU) []string {
-	if slots := DefaultParallelSlots(gpu); slots > 1 && !hasParallelOption(opts) {
+// ParallelSlotsForContext is DefaultParallelSlots gated on per-device VRAM
+// headroom for the given context. A large context already claims most of a
+// single device's VRAM (the KV cache plus the per-slot compute/checkpoint
+// scratch that scales with n_seq_max), so defaulting multiple slots there
+// pushes a tight single-model fit into per-device CUDA OOM (issue #10485): the
+// model loads but the final allocation (e.g. an MTP draft context's KV cache)
+// overflows the tighter card by a few hundred MiB. Returns 1 (no concurrency)
+// in that tight regime, otherwise the VRAM-scaled DefaultParallelSlots.
+//
+// g.VRAM must be the PER-DEVICE ceiling (smallest device on a multi-GPU host).
+// It shares largeContextForDevice with the batch boost so both throughput
+// defaults are suppressed together; the GB10 / unified-memory path reports
+// system RAM and so keeps full concurrency even at large contexts.
+func ParallelSlotsForContext(g GPU, ctx int) int {
+	slots := DefaultParallelSlots(g)
+	if slots <= 1 || g.VRAM == 0 {
+		return slots
+	}
+	if largeContextForDevice(g, ctx) {
+		return 1
+	}
+	return slots
+}
+
+// EnsureParallelOptionForContext appends a VRAM-scaled "parallel:N" backend
+// option when the model doesn't already set one and the GPU warrants (and has
+// headroom for) concurrency at this context. Returns the possibly-extended
+// options. Shared by the single-host config path (ApplyHardwareDefaults) and
+// the distributed router (per selected node).
+func EnsureParallelOptionForContext(opts []string, gpu GPU, ctx int) []string {
+	if slots := ParallelSlotsForContext(gpu, ctx); slots > 1 && !hasParallelOption(opts) {
 		return append(opts, fmt.Sprintf("parallel:%d", slots))
 	}
 	return opts
 }
 
+// EnsureParallelOption is EnsureParallelOptionForContext with no known context
+// (defaults to DefaultContextSize, which clears the headroom gate on any device
+// large enough to warrant concurrency). Kept for callers without a model
+// context.
+func EnsureParallelOption(opts []string, gpu GPU) []string {
+	return EnsureParallelOptionForContext(opts, gpu, 0)
+}
+
 // hasParallelOption reports whether the model already sets parallel/n_parallel
 // so we never override an explicit value (helper shared with serving_defaults.go).
 func hasParallelOption(opts []string) bool {
@@ -192,18 +258,18 @@ var localGPU = func() GPU {
 // and were left unset by the user. Currently: a larger physical batch on
 // Blackwell. Explicit config always wins (we only touch zero values).
 func ApplyHardwareDefaults(cfg *ModelConfig, gpu GPU) {
-	if cfg == nil {
+	if cfg == nil || HardwareDefaultsDisabled() {
 		return
 	}
 	// Raise the physical batch on Blackwell only when the resulting compute
 	// buffer fits the per-device VRAM at THIS model's context. Leaving Batch at 0
 	// (rather than writing the default 512) preserves the downstream single-pass
 	// sizing in core/backend.EffectiveBatchSize for embedding/score/rerank.
+	ctx := DefaultContextSize
+	if cfg.ContextSize != nil {
+		ctx = *cfg.ContextSize
+	}
 	if cfg.Batch == 0 {
-		ctx := DefaultContextSize
-		if cfg.ContextSize != nil {
-			ctx = *cfg.ContextSize
-		}
 		if PhysicalBatchForContext(gpu, ctx) == BlackwellPhysicalBatch {
 			cfg.Batch = BlackwellPhysicalBatch
 			xlog.Debug("[hardware_defaults] Blackwell GPU: defaulting physical batch",
@@ -214,13 +280,14 @@ func ApplyHardwareDefaults(cfg *ModelConfig, gpu GPU) {
 	// Enable concurrent serving by default on a capable GPU: without this the
 	// llama.cpp backend runs n_parallel=1 and serializes multi-user requests
 	// (continuous batching stays off). Unified KV means the slots share the
-	// context budget, so this is concurrency without extra KV memory. Explicit
-	// parallel/n_parallel in the model options always wins.
+	// context budget, but a context large enough to fill a single device leaves
+	// no room for the per-slot scratch, so the slot count is gated on per-device
+	// headroom too (issue #10485). Explicit parallel/n_parallel always wins.
 	if before := len(cfg.Options); true {
-		cfg.Options = EnsureParallelOption(cfg.Options, gpu)
+		cfg.Options = EnsureParallelOptionForContext(cfg.Options, gpu, ctx)
 		if len(cfg.Options) > before {
 			xlog.Debug("[hardware_defaults] defaulting parallel slots for concurrent serving",
-				"option", cfg.Options[len(cfg.Options)-1], "vram_gib", gpu.VRAM>>30)
+				"option", cfg.Options[len(cfg.Options)-1], "context", ctx, "vram_gib", gpu.VRAM>>30)
 		}
 	}
 }
diff --git a/core/config/hardware_defaults_test.go b/core/config/hardware_defaults_test.go
index 3bc1bf297..452a5a884 100644
--- a/core/config/hardware_defaults_test.go
+++ b/core/config/hardware_defaults_test.go
@@ -90,6 +90,15 @@ var _ = Describe("Hardware-driven config defaults", func() {
 		It("no-ops on nil", func() {
 			Expect(func() { ApplyHardwareDefaults(nil, GPU{ComputeCapability: "12.1"}) }).ToNot(Panic())
 		})
+
+		It("applies nothing when hardware defaults are disabled via env", func() {
+			GinkgoT().Setenv("LOCALAI_DISABLE_HARDWARE_DEFAULTS", "true")
+			Expect(HardwareDefaultsDisabled()).To(BeTrue())
+			cfg := &ModelConfig{}
+			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1", VRAM: 119 * gib})
+			Expect(cfg.Batch).To(Equal(0))
+			Expect(cfg.Options).To(BeEmpty())
+		})
 	})
 
 	DescribeTable("DefaultParallelSlots (by VRAM)",
@@ -105,12 +114,46 @@ var _ = Describe("Hardware-driven config defaults", func() {
 		Entry("unknown 0", uint64(0), 1),
 	)
 
+	Describe("ParallelSlotsForContext (per-device VRAM headroom)", func() {
+		It("keeps the VRAM-scaled slot count when the context fits the device", func() {
+			// 16 GiB card, small context: plenty of room for concurrency.
+			Expect(ParallelSlotsForContext(GPU{VRAM: 16 * gib}, 8192)).To(Equal(4))
+		})
+		It("drops to a single slot when a large context already fills the device", func() {
+			// Regression guard for issue #10485: 16 GiB consumer Blackwell, ~200k
+			// context. Even with unified KV, the per-slot compute/checkpoint
+			// scratch from 4 slots is the straw that overflows the tighter device.
+			Expect(ParallelSlotsForContext(GPU{VRAM: 16 * gib}, 204800)).To(Equal(1))
+		})
+		It("keeps concurrency on a large unified-memory device (GB10)", func() {
+			// GB10 reports system RAM (~119 GiB): a 200k context leaves headroom.
+			Expect(ParallelSlotsForContext(GPU{VRAM: 119 * gib}, 204800)).To(Equal(8))
+		})
+		It("keeps concurrency on a big datacenter card with a large context", func() {
+			// 80 GiB A100: 200k context is a small fraction, concurrency stays.
+			Expect(ParallelSlotsForContext(GPU{VRAM: 80 * gib}, 204800)).To(Equal(8))
+		})
+		It("stays a single slot on small/unknown VRAM regardless of context", func() {
+			Expect(ParallelSlotsForContext(GPU{VRAM: 2 * gib}, 8192)).To(Equal(1))
+			Expect(ParallelSlotsForContext(GPU{}, 8192)).To(Equal(1))
+		})
+	})
+
 	Describe("ApplyHardwareDefaults parallel slots", func() {
 		It("adds a VRAM-scaled parallel option on a capable GPU", func() {
 			cfg := &ModelConfig{}
 			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1", VRAM: 119 * gib})
 			Expect(cfg.Options).To(ContainElement("parallel:8"))
 		})
+		It("adds no parallel option when a large context already fills one device", func() {
+			// Regression guard for issue #10485: 16 GiB card + ~200k context. The
+			// model barely fits; defaulting concurrency tips the tighter GPU into
+			// CUDA OOM during the final (MTP draft) KV allocation.
+			ctx := 204800
+			cfg := &ModelConfig{LLMConfig: LLMConfig{ContextSize: &ctx}}
+			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.0", VRAM: 16 * gib})
+			Expect(cfg.Options).ToNot(ContainElement(ContainSubstring("parallel")))
+		})
 		It("scales the slot count down with VRAM", func() {
 			cfg := &ModelConfig{}
 			ApplyHardwareDefaults(cfg, GPU{VRAM: 24 * gib})
diff --git a/core/services/nodes/router.go b/core/services/nodes/router.go
index 6ad550cf1..ce3de3290 100644
--- a/core/services/nodes/router.go
+++ b/core/services/nodes/router.go
@@ -147,7 +147,7 @@ type scheduleLoadResult struct {
 // Only values the heuristics themselves manage are touched, so an explicit user
 // batch (e.g. 1024) is never overridden.
 func applyNodeHardwareDefaults(opts *pb.ModelOptions, node *BackendNode) {
-	if opts == nil || node == nil {
+	if opts == nil || node == nil || config.HardwareDefaultsDisabled() {
 		return
 	}
 	gpu := config.GPU{
@@ -162,8 +162,11 @@ func applyNodeHardwareDefaults(opts *pb.ModelOptions, node *BackendNode) {
 		opts.NBatch = int32(config.PhysicalBatchForContext(gpu, int(opts.ContextSize)))
 	}
 	// Default concurrent serving for the selected node (the frontend that built
-	// the options may have no GPU). Only adds when no parallel option is set.
-	opts.Options = config.EnsureParallelOption(opts.Options, gpu)
+	// the options may have no GPU). Gated on the node's per-device VRAM at this
+	// model's context, so a large context that already fills the device can't
+	// tip it into OOM by adding slot scratch (issue #10485). Only adds when no
+	// parallel option is set.
+	opts.Options = config.EnsureParallelOptionForContext(opts.Options, gpu, int(opts.ContextSize))
 }
 
 // scheduleAndLoad is the shared core for loading a model on a new node.
diff --git a/core/services/nodes/router_hardware_internal_test.go b/core/services/nodes/router_hardware_internal_test.go
index d8576c4e4..084222fee 100644
--- a/core/services/nodes/router_hardware_internal_test.go
+++ b/core/services/nodes/router_hardware_internal_test.go
@@ -41,6 +41,14 @@ var _ = Describe("applyNodeHardwareDefaults", func() {
 		Expect(opts.Options).To(ContainElement("parallel:8"))
 	})
 
+	It("adds no parallel option when a large context already fills the node device", func() {
+		// Regression guard for issue #10485: a 16 GiB node with a ~200k context
+		// is a tight single-model fit — the slot scratch would tip it into OOM.
+		opts := &pb.ModelOptions{NBatch: config.DefaultPhysicalBatch, ContextSize: 204800}
+		applyNodeHardwareDefaults(opts, &BackendNode{GPUComputeCapability: "12.0", TotalVRAM: 16 << 30})
+		Expect(opts.Options).ToNot(ContainElement(ContainSubstring("parallel")))
+	})
+
 	It("never overrides an explicit parallel option on the node path", func() {
 		opts := &pb.ModelOptions{NBatch: config.DefaultPhysicalBatch, Options: []string{"parallel:2"}}
 		applyNodeHardwareDefaults(opts, &BackendNode{GPUComputeCapability: "12.1", TotalVRAM: 119 << 30})
diff --git a/docs/content/features/text-generation.md b/docs/content/features/text-generation.md
index c09717a3f..cadc67808 100644
--- a/docs/content/features/text-generation.md
+++ b/docs/content/features/text-generation.md
@@ -537,6 +537,16 @@ options:
 
 **Note:** The `parallel` option can also be set via the `LLAMACPP_PARALLEL` environment variable, and `grpc_servers` can be set via the `LLAMACPP_GRPC_SERVERS` environment variable. Options specified in the YAML file take precedence over environment variables.
 
+##### Hardware auto-tuning (and how to override it)
+
+On a detected GPU, LocalAI fills a few performance-relevant defaults the model config leaves unset — a larger physical batch on NVIDIA Blackwell, and a VRAM-scaled `parallel` slot count for concurrent serving. Both are gated on **per-device** VRAM at the model's context: when a large context already fills a single card (e.g. a 27B model with a 200k context across 2×16 GiB), the batch boost and the extra parallel slots are suppressed so they can't tip the tighter GPU into CUDA out-of-memory.
+
+Anything you set explicitly in the model YAML always wins, so to pin a value just set it (e.g. `batch: 512` or `options: ["parallel:1"]`). The effective values are logged at `INFO` when a model loads (`effective runtime tuning …`). To turn the hardware auto-tuning off entirely and run llama.cpp's stock behavior, set:
+
+```
+LOCALAI_DISABLE_HARDWARE_DEFAULTS=true
+```
+
 ##### Server-side prompt cache (repeated system prompts)
 
 Agents, coding assistants, and Anthropic/OpenAI-compatible CLIs typically resend the same large system prompt on every turn. The llama.cpp server can short-circuit prefill for the matching prefix by stashing idle slot KV states in host RAM and reloading them on a hit. Three settings interact:
diff --git a/pkg/model/initializers.go b/pkg/model/initializers.go
index fdae562fe..509e58e68 100644
--- a/pkg/model/initializers.go
+++ b/pkg/model/initializers.go
@@ -169,11 +169,41 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string
 	}
 }
 
+// parallelSlotsFromOptions returns the effective n_parallel from the backend
+// option strings ("parallel:N" / "n_parallel:N"), or "1" when unset — the
+// llama.cpp default. Used only for the effective-tuning load log.
+func parallelSlotsFromOptions(opts []string) string {
+	for _, o := range opts {
+		k, v, ok := strings.Cut(o, ":")
+		if ok && (k == "parallel" || k == "n_parallel") {
+			return strings.TrimSpace(v)
+		}
+	}
+	return "1"
+}
+
 func (ml *ModelLoader) backendLoader(opts ...Option) (client grpc.Backend, err error) {
 	o := NewOptions(opts...)
 
 	xlog.Info("BackendLoader starting", "modelID", o.modelID, "backend", o.backendString, "model", o.model)
 
+	// Surface the effective performance-relevant runtime options at load (some of
+	// these are auto-tuned for the detected hardware). Logged once per load so an
+	// admin can see what will actually run and pin or override any value in the
+	// model YAML — or set LOCALAI_DISABLE_HARDWARE_DEFAULTS=true to turn the
+	// hardware auto-tuning off entirely. Gated on an LLM-ish load (context set) so
+	// TTS/audio/other backends stay quiet.
+	if opt := o.gRPCOptions; opt != nil && opt.ContextSize > 0 {
+		xlog.Info("effective runtime tuning (override in the model YAML; LOCALAI_DISABLE_HARDWARE_DEFAULTS=true disables hardware auto-tuning)",
+			"modelID", o.modelID,
+			"context", opt.ContextSize,
+			"n_batch", opt.NBatch,
+			"n_gpu_layers", opt.NGPULayers,
+			"parallel", parallelSlotsFromOptions(opt.Options),
+			"flash_attention", opt.FlashAttention,
+			"f16", opt.F16Memory)
+	}
+
 	backend := strings.ToLower(o.backendString)
 	if realBackend, exists := Aliases[backend]; exists {
 		typeAlias, exists := TypeAlias[backend]
diff --git a/pkg/model/initializers_internal_test.go b/pkg/model/initializers_internal_test.go
new file mode 100644
index 000000000..6988f1aa2
--- /dev/null
+++ b/pkg/model/initializers_internal_test.go
@@ -0,0 +1,19 @@
+package model
+
+import (
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("parallelSlotsFromOptions", func() {
+	It("reads the parallel slot count from the backend options", func() {
+		Expect(parallelSlotsFromOptions([]string{"use_jinja:true", "parallel:4"})).To(Equal("4"))
+	})
+	It("accepts the n_parallel alias", func() {
+		Expect(parallelSlotsFromOptions([]string{"n_parallel:8"})).To(Equal("8"))
+	})
+	It("defaults to a single slot when unset", func() {
+		Expect(parallelSlotsFromOptions([]string{"use_jinja:true"})).To(Equal("1"))
+		Expect(parallelSlotsFromOptions(nil)).To(Equal("1"))
+	})
+})

From f72046b5b51ff5690cd7589a00dcbb416d9aa715 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Thu, 25 Jun 2026 17:18:55 +0200
Subject: [PATCH 96/99] fix(auth): make advisory locks dialect-aware and harden
 SQLite DSN (#10509)

* fix(auth): make advisory locks dialect-aware and harden SQLite DSN

Fixes #10506.

Two failures hit deployments that use the default SQLite auth database:

1. advisorylock executed PostgreSQL-only SQL (pg_advisory_lock /
   pg_try_advisory_lock) unconditionally. On a SQLite auth DB the job
   store, agent store and node registry migrations failed with
   "no such function: pg_advisory_lock". WithLockCtx/TryWithLockCtx now
   branch on the gorm dialect: PostgreSQL keeps the cross-process advisory
   lock, every other dialect uses a context-aware, per-key in-process lock
   (a SQLite auth DB is effectively single-process, so serializing within
   the process is sufficient).

2. The SQLite auth DSN set no busy timeout, so transient SQLITE_BUSY over
   network-backed storage (SMB/CIFS/NFS, e.g. Azure Files) failed the auth
   migration immediately with "database is locked". The DSN now sets
   _busy_timeout=5000 and _txlock=immediate (caller-supplied values are
   preserved). WAL is intentionally not enabled since its shared-memory
   mmap does not work over network filesystems. Docs note that PostgreSQL
   should be used when the data directory lives on shared storage.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* test(jobs): regression test for #10506 SQLite job store migration

Exercises the exact caller chain that failed in the issue:
auth.InitDB(sqlite) -> jobs.NewJobStore -> advisorylock.WithLockCtx ->
AutoMigrate. Before the dialect-aware advisory lock fix this failed with
"no such function: pg_advisory_lock"; the test now asserts it migrates
cleanly on a SQLite auth DB.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
---
 core/http/auth/db_sqlite.go                   |  43 +++++-
 core/http/auth/db_sqlite_test.go              |  57 ++++++++
 core/services/advisorylock/advisorylock.go    |  77 ++++++++++-
 .../advisorylock/advisorylock_sqlite_test.go  | 129 ++++++++++++++++++
 core/services/jobs/sqlite_e2e_test.go         |  24 ++++
 docs/content/features/authentication.md       |   2 +
 6 files changed, 326 insertions(+), 6 deletions(-)
 create mode 100644 core/http/auth/db_sqlite_test.go
 create mode 100644 core/services/advisorylock/advisorylock_sqlite_test.go
 create mode 100644 core/services/jobs/sqlite_e2e_test.go

diff --git a/core/http/auth/db_sqlite.go b/core/http/auth/db_sqlite.go
index 5c13ecf05..eecabe4a5 100644
--- a/core/http/auth/db_sqlite.go
+++ b/core/http/auth/db_sqlite.go
@@ -3,10 +3,51 @@
 package auth
 
 import (
+	"net/url"
+	"strings"
+
 	"gorm.io/driver/sqlite"
 	"gorm.io/gorm"
 )
 
 func openSQLiteDialector(path string) (gorm.Dialector, error) {
-	return sqlite.Open(path), nil
+	return sqlite.Open(buildSQLiteDSN(path)), nil
+}
+
+// buildSQLiteDSN augments a SQLite file path with connection pragmas that make
+// the auth DB resilient on slow or contended storage.
+//
+//   - _busy_timeout=5000 makes SQLite retry for up to 5s on SQLITE_BUSY instead
+//     of failing immediately. Network-backed storage (SMB/CIFS/NFS, e.g. Azure
+//     Files) is prone to transient lock contention during migration (see #10506).
+//   - _txlock=immediate takes the write lock at BEGIN, avoiding deadlocks when a
+//     read transaction later upgrades to a write during AutoMigrate.
+//
+// We deliberately do NOT set WAL journal mode: WAL relies on a shared-memory
+// mmap that does not work over SMB/NFS, which is exactly the failing case here.
+//
+// Caller-supplied values for either pragma are preserved.
+func buildSQLiteDSN(path string) string {
+	base := path
+	rawQuery := ""
+	if i := strings.IndexByte(path, '?'); i >= 0 {
+		base = path[:i]
+		rawQuery = path[i+1:]
+	}
+
+	values, err := url.ParseQuery(rawQuery)
+	if err != nil {
+		// An unparseable query string means a hand-crafted DSN we should not
+		// risk corrupting; leave it untouched.
+		return path
+	}
+
+	if values.Get("_busy_timeout") == "" {
+		values.Set("_busy_timeout", "5000")
+	}
+	if values.Get("_txlock") == "" {
+		values.Set("_txlock", "immediate")
+	}
+
+	return base + "?" + values.Encode()
 }
diff --git a/core/http/auth/db_sqlite_test.go b/core/http/auth/db_sqlite_test.go
new file mode 100644
index 000000000..f1dc4e404
--- /dev/null
+++ b/core/http/auth/db_sqlite_test.go
@@ -0,0 +1,57 @@
+//go:build auth
+
+package auth
+
+import (
+	"net/url"
+	"strings"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// parseDSN splits a "base?query" DSN into its base and decoded query values so
+// assertions don't depend on url.Values.Encode()'s key ordering.
+func parseDSN(dsn string) (string, url.Values) {
+	base := dsn
+	rawQuery := ""
+	if i := strings.IndexByte(dsn, '?'); i >= 0 {
+		base = dsn[:i]
+		rawQuery = dsn[i+1:]
+	}
+	values, err := url.ParseQuery(rawQuery)
+	Expect(err).ToNot(HaveOccurred())
+	return base, values
+}
+
+var _ = Describe("buildSQLiteDSN", func() {
+	It("adds busy_timeout and txlock to a plain file path", func() {
+		base, values := parseDSN(buildSQLiteDSN("/data/database.db"))
+		Expect(base).To(Equal("/data/database.db"))
+		Expect(values.Get("_busy_timeout")).To(Equal("5000"))
+		Expect(values.Get("_txlock")).To(Equal("immediate"))
+	})
+
+	It("adds pragmas to an in-memory database", func() {
+		base, values := parseDSN(buildSQLiteDSN(":memory:"))
+		Expect(base).To(Equal(":memory:"))
+		Expect(values.Get("_busy_timeout")).To(Equal("5000"))
+		Expect(values.Get("_txlock")).To(Equal("immediate"))
+	})
+
+	It("preserves an existing query string", func() {
+		base, values := parseDSN(buildSQLiteDSN("/data/database.db?cache=shared"))
+		Expect(base).To(Equal("/data/database.db"))
+		Expect(values.Get("cache")).To(Equal("shared"))
+		Expect(values.Get("_busy_timeout")).To(Equal("5000"))
+		Expect(values.Get("_txlock")).To(Equal("immediate"))
+	})
+
+	It("does not override a caller-supplied busy_timeout or txlock", func() {
+		_, values := parseDSN(buildSQLiteDSN("/data/database.db?_busy_timeout=1000&_txlock=deferred"))
+		Expect(values["_busy_timeout"]).To(HaveLen(1), "_busy_timeout should not be duplicated")
+		Expect(values.Get("_busy_timeout")).To(Equal("1000"))
+		Expect(values["_txlock"]).To(HaveLen(1), "_txlock should not be duplicated")
+		Expect(values.Get("_txlock")).To(Equal("deferred"))
+	})
+})
diff --git a/core/services/advisorylock/advisorylock.go b/core/services/advisorylock/advisorylock.go
index 6cc0afb2a..f51a6357e 100644
--- a/core/services/advisorylock/advisorylock.go
+++ b/core/services/advisorylock/advisorylock.go
@@ -4,14 +4,59 @@ import (
 	"context"
 	"fmt"
 	"hash/fnv"
+	"strings"
+	"sync"
 
 	"gorm.io/gorm"
 )
 
-// TryWithLockCtx attempts to acquire a PostgreSQL advisory lock using the provided context.
-// Returns (true, nil) if the lock was acquired and fn executed, (false, nil) if the lock
-// was already held, or (false, error) on failure.
+// localLocks holds one buffered channel (capacity 1) per lock key, used as an
+// in-process mutex for non-PostgreSQL dialects (SQLite). A SQLite auth DB is
+// effectively single-process, so serializing guarded sections within this
+// process is sufficient - we cannot and need not coordinate across processes
+// the way a PostgreSQL advisory lock does.
+var (
+	localLocksMu sync.Mutex
+	localLocks   = map[int64]chan struct{}{}
+)
+
+// localLockChan returns the per-key buffered channel, creating it on first use.
+func localLockChan(key int64) chan struct{} {
+	localLocksMu.Lock()
+	defer localLocksMu.Unlock()
+	ch, ok := localLocks[key]
+	if !ok {
+		ch = make(chan struct{}, 1)
+		localLocks[key] = ch
+	}
+	return ch
+}
+
+// isPostgres reports whether the gorm dialect is PostgreSQL. Anything else
+// (SQLite and any non-postgres dialect) uses the in-process fallback, because
+// the pg_* advisory lock functions only exist on PostgreSQL.
+func isPostgres(db *gorm.DB) bool {
+	return strings.Contains(db.Dialector.Name(), "postgres")
+}
+
+// TryWithLockCtx attempts to acquire a lock and run fn without blocking.
+// Returns (true, nil) if the lock was acquired and fn executed, (false, nil) if
+// the lock was already held, or (false, error) on failure.
+//
+// On PostgreSQL it uses pg_try_advisory_lock (cross-process). On other dialects
+// (SQLite) it uses a non-blocking in-process lock keyed by key.
 func TryWithLockCtx(ctx context.Context, db *gorm.DB, key int64, fn func() error) (bool, error) {
+	if !isPostgres(db) {
+		ch := localLockChan(key)
+		select {
+		case ch <- struct{}{}:
+			defer func() { <-ch }()
+			return true, fn()
+		default:
+			return false, nil
+		}
+	}
+
 	sqlDB, err := db.DB()
 	if err != nil {
 		return false, fmt.Errorf("get sql.DB: %w", err)
@@ -50,9 +95,31 @@ func KeyFromString(s string) int64 {
 	return int64(h.Sum64()>>1) | 0x100000000
 }
 
-// WithLockCtx is like WithLock but respects context cancellation.
-// If ctx is cancelled while waiting for the lock, the function returns ctx.Err().
+// WithLockCtx acquires a lock for key, runs fn, then releases it, respecting
+// context cancellation. If ctx is cancelled while waiting for the lock, the
+// function returns ctx.Err().
+//
+// On PostgreSQL it uses pg_advisory_lock (cross-process). On other dialects
+// (SQLite) it falls back to a blocking in-process lock keyed by key, which is
+// sufficient because a SQLite auth DB is effectively single-process.
 func WithLockCtx(ctx context.Context, db *gorm.DB, key int64, fn func() error) error {
+	if !isPostgres(db) {
+		// Honor an already-cancelled context before attempting acquisition:
+		// select picks a ready case at random, so without this an already-free
+		// lock could be taken despite a cancelled ctx.
+		if err := ctx.Err(); err != nil {
+			return err
+		}
+		ch := localLockChan(key)
+		select {
+		case ch <- struct{}{}:
+			defer func() { <-ch }()
+			return fn()
+		case <-ctx.Done():
+			return ctx.Err()
+		}
+	}
+
 	sqlDB, err := db.DB()
 	if err != nil {
 		return fmt.Errorf("advisorylock: getting sql.DB: %w", err)
diff --git a/core/services/advisorylock/advisorylock_sqlite_test.go b/core/services/advisorylock/advisorylock_sqlite_test.go
new file mode 100644
index 000000000..9e9b6abfd
--- /dev/null
+++ b/core/services/advisorylock/advisorylock_sqlite_test.go
@@ -0,0 +1,129 @@
+package advisorylock
+
+import (
+	"context"
+	"sync"
+	"sync/atomic"
+	"time"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	"gorm.io/driver/sqlite"
+	"gorm.io/gorm"
+)
+
+// These specs run against an in-memory SQLite DB and therefore do NOT require
+// Docker, unlike the PostgreSQL testcontainer specs.
+var _ = Describe("AdvisoryLock (SQLite fallback)", Label("sqlite"), func() {
+	var db *gorm.DB
+
+	BeforeEach(func() {
+		var err error
+		db, err = gorm.Open(sqlite.Open("file::memory:?cache=shared"), &gorm.Config{})
+		Expect(err).ToNot(HaveOccurred())
+		Expect(db.Dialector.Name()).To(ContainSubstring("sqlite"))
+	})
+
+	It("WithLockCtx executes fn and returns no error on SQLite", func() {
+		const lockKey int64 = 12001
+		executed := false
+
+		err := WithLockCtx(context.Background(), db, lockKey, func() error {
+			executed = true
+			return nil
+		})
+		Expect(err).ToNot(HaveOccurred())
+		Expect(executed).To(BeTrue(), "function should have run under the in-process lock")
+	})
+
+	It("WithLockCtx serializes concurrent goroutines on the same key", func() {
+		const lockKey int64 = 12002
+
+		var (
+			mu          sync.Mutex
+			maxRunning  int32
+			running     int32
+			concurrency int32
+		)
+
+		var wg sync.WaitGroup
+
+		for range 2 {
+			wg.Go(func() {
+				defer GinkgoRecover()
+				err := WithLockCtx(context.Background(), db, lockKey, func() error {
+					cur := atomic.AddInt32(&running, 1)
+					mu.Lock()
+					if cur > maxRunning {
+						maxRunning = cur
+					}
+					if cur > 1 {
+						atomic.AddInt32(&concurrency, 1)
+					}
+					mu.Unlock()
+
+					time.Sleep(50 * time.Millisecond)
+
+					atomic.AddInt32(&running, -1)
+					return nil
+				})
+				Expect(err).ToNot(HaveOccurred())
+			})
+		}
+
+		wg.Wait()
+
+		Expect(maxRunning).To(BeNumerically("<=", 1), "expected max 1 goroutine inside lock at a time")
+		Expect(concurrency).To(BeZero(), "detected concurrent execution inside advisory lock")
+	})
+
+	It("WithLockCtx returns an error and does not run fn with an already-cancelled context", func() {
+		const lockKey int64 = 12003
+		ctx, cancel := context.WithCancel(context.Background())
+		cancel()
+
+		err := WithLockCtx(ctx, db, lockKey, func() error {
+			Fail("function should not run with a cancelled context")
+			return nil
+		})
+		Expect(err).To(HaveOccurred())
+	})
+
+	It("TryWithLockCtx returns (true, nil) when free and (false, nil) when held", func() {
+		const lockKey int64 = 12004
+
+		acquired, err := TryWithLockCtx(context.Background(), db, lockKey, func() error {
+			return nil
+		})
+		Expect(err).ToNot(HaveOccurred())
+		Expect(acquired).To(BeTrue(), "expected TryWithLockCtx to acquire the free lock")
+
+		// Hold the lock in one goroutine while a concurrent TryWithLockCtx
+		// attempts to acquire the same key.
+		held := make(chan struct{})
+		release := make(chan struct{})
+		var wg sync.WaitGroup
+		wg.Go(func() {
+			defer GinkgoRecover()
+			ok, err := TryWithLockCtx(context.Background(), db, lockKey, func() error {
+				close(held)
+				<-release
+				return nil
+			})
+			Expect(err).ToNot(HaveOccurred())
+			Expect(ok).To(BeTrue())
+		})
+
+		<-held
+		ok, err := TryWithLockCtx(context.Background(), db, lockKey, func() error {
+			Fail("function should not run while lock is held")
+			return nil
+		})
+		Expect(err).ToNot(HaveOccurred())
+		Expect(ok).To(BeFalse(), "expected TryWithLockCtx to fail to acquire a held lock")
+
+		close(release)
+		wg.Wait()
+	})
+})
diff --git a/core/services/jobs/sqlite_e2e_test.go b/core/services/jobs/sqlite_e2e_test.go
new file mode 100644
index 000000000..26cb5a669
--- /dev/null
+++ b/core/services/jobs/sqlite_e2e_test.go
@@ -0,0 +1,24 @@
+//go:build auth
+
+package jobs_test
+
+import (
+	"github.com/mudler/LocalAI/core/http/auth"
+	"github.com/mudler/LocalAI/core/services/jobs"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// Reproduces the #10506 caller chain: auth.InitDB(sqlite) -> jobs.NewJobStore,
+// which previously failed with "no such function: pg_advisory_lock".
+var _ = Describe("NewJobStore on a SQLite auth DB (#10506)", func() {
+	It("migrates without pg_advisory_lock errors", func() {
+		db, err := auth.InitDB(":memory:")
+		Expect(err).ToNot(HaveOccurred())
+
+		store, err := jobs.NewJobStore(db)
+		Expect(err).ToNot(HaveOccurred())
+		Expect(store).ToNot(BeNil())
+	})
+})
diff --git a/docs/content/features/authentication.md b/docs/content/features/authentication.md
index 35f3cc9ae..ffaa43b34 100644
--- a/docs/content/features/authentication.md
+++ b/docs/content/features/authentication.md
@@ -85,6 +85,8 @@ localai run
 | `LOCALAI_REGISTRATION_MODE` | `approval` | Registration mode: `open`, `approval`, or `invite` |
 | `LOCALAI_DISABLE_LOCAL_AUTH` | `false` | Disable local email/password registration and login (for OAuth/OIDC-only deployments) |
 
+> **Note: network-backed storage.** File-based SQLite relies on POSIX file locking, which is unreliable over network filesystems (SMB/CIFS/NFS, e.g. Azure Files / Azure Container Apps shared volumes). On such storage the auth DB can fail to migrate with `database is locked`. Use PostgreSQL (`LOCALAI_AUTH_DATABASE_URL=postgres://...`) when the data directory lives on shared or network storage, or place `database.db` on a local volume.
+
 ### Disabling Local Authentication
 
 If you want to enforce OAuth/OIDC-only login and prevent users from registering or logging in with email/password, set `LOCALAI_DISABLE_LOCAL_AUTH=true` (or pass `--disable-local-auth`):

From d1a9d59917922928dbe5d919fbcb6ea7844e35b2 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Thu, 25 Jun 2026 22:07:56 +0200
Subject: [PATCH 97/99] feat(backends): darwin/Metal builds for vision C++/ggml
 backends (depth-anything, locate-anything, rfdetr-cpp, sam3-cpp) (#10511)

feat(backends): darwin/Metal builds for the vision C++/ggml backends

depth-anything-cpp, locate-anything-cpp, rfdetr-cpp and sam3-cpp already carry
a Darwin/Metal path in their Makefiles (GGML_METAL=ON when build-type=metal),
but were never wired into CI, so no Metal image was published and Apple Silicon
could not install them.

- .github/backend-matrix.yml: add the four to includeDarwin (build-type metal,
  lang go), matching the other go+ggml *-cpp Metal entries.
- backend/index.yaml: add metal: to each backend's capabilities map (main and
  -development) plus concrete metal-<backend>(-development) entries pointing at
  the latest/master -metal-darwin-arm64-<backend> images.
- backend/go/*/Makefile: a one-line note on the existing Darwin branch (also
  the per-backend change the CI path filter needs to actually build them here).


Assisted-by: Claude:opus-4.8 [Claude Code]

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
---
 .github/backend-matrix.yml              | 19 ++++++++++
 backend/go/depth-anything-cpp/Makefile  |  2 ++
 backend/go/locate-anything-cpp/Makefile |  2 ++
 backend/go/rfdetr-cpp/Makefile          |  2 ++
 backend/go/sam3-cpp/Makefile            |  2 ++
 backend/index.yaml                      | 48 +++++++++++++++++++++++++
 6 files changed, 75 insertions(+)

diff --git a/.github/backend-matrix.yml b/.github/backend-matrix.yml
index 5ad6d9e16..dc12daf97 100644
--- a/.github/backend-matrix.yml
+++ b/.github/backend-matrix.yml
@@ -4922,6 +4922,25 @@ includeDarwin:
     tag-suffix: "-metal-darwin-arm64-vibevoice-cpp"
     build-type: "metal"
     lang: "go"
+  # Vision/utility C++/ggml backends (go+cgo). Their Makefiles already carry a
+  # Darwin/Metal path (GGML_METAL=ON when build-type=metal); this just builds and
+  # publishes the metal image so Apple Silicon can install them.
+  - backend: "depth-anything-cpp"
+    tag-suffix: "-metal-darwin-arm64-depth-anything-cpp"
+    build-type: "metal"
+    lang: "go"
+  - backend: "locate-anything-cpp"
+    tag-suffix: "-metal-darwin-arm64-locate-anything-cpp"
+    build-type: "metal"
+    lang: "go"
+  - backend: "rfdetr-cpp"
+    tag-suffix: "-metal-darwin-arm64-rfdetr-cpp"
+    build-type: "metal"
+    lang: "go"
+  - backend: "sam3-cpp"
+    tag-suffix: "-metal-darwin-arm64-sam3-cpp"
+    build-type: "metal"
+    lang: "go"
   - backend: "voxtral"
     tag-suffix: "-metal-darwin-arm64-voxtral"
     build-type: "metal"
diff --git a/backend/go/depth-anything-cpp/Makefile b/backend/go/depth-anything-cpp/Makefile
index efe99a626..e142607ab 100644
--- a/backend/go/depth-anything-cpp/Makefile
+++ b/backend/go/depth-anything-cpp/Makefile
@@ -40,6 +40,8 @@ else ifeq ($(BUILD_TYPE),hipblas)
 else ifeq ($(BUILD_TYPE),vulkan)
 	CMAKE_ARGS+=-DGGML_VULKAN=ON -DDA_GGML_VULKAN=ON
 else ifeq ($(OS),Darwin)
+	# macOS/Metal: built + published as an OCI image by CI (includeDarwin in
+	# .github/backend-matrix.yml) so Apple Silicon users can install this backend.
 	ifneq ($(BUILD_TYPE),metal)
 		CMAKE_ARGS+=-DGGML_METAL=OFF
 	else
diff --git a/backend/go/locate-anything-cpp/Makefile b/backend/go/locate-anything-cpp/Makefile
index ba12c7195..c66d57764 100644
--- a/backend/go/locate-anything-cpp/Makefile
+++ b/backend/go/locate-anything-cpp/Makefile
@@ -33,6 +33,8 @@ else ifeq ($(BUILD_TYPE),hipblas)
 else ifeq ($(BUILD_TYPE),vulkan)
 	CMAKE_ARGS+=-DGGML_VULKAN=ON -DLA_GGML_VULKAN=ON
 else ifeq ($(OS),Darwin)
+	# macOS/Metal: built + published as an OCI image by CI (includeDarwin in
+	# .github/backend-matrix.yml) so Apple Silicon users can install this backend.
 	ifneq ($(BUILD_TYPE),metal)
 		CMAKE_ARGS+=-DGGML_METAL=OFF
 	else
diff --git a/backend/go/rfdetr-cpp/Makefile b/backend/go/rfdetr-cpp/Makefile
index 3282720ff..448a8e78b 100644
--- a/backend/go/rfdetr-cpp/Makefile
+++ b/backend/go/rfdetr-cpp/Makefile
@@ -34,6 +34,8 @@ else ifeq ($(BUILD_TYPE),hipblas)
 else ifeq ($(BUILD_TYPE),vulkan)
 	CMAKE_ARGS+=-DGGML_VULKAN=ON -DRFDETR_GGML_VULKAN=ON
 else ifeq ($(OS),Darwin)
+	# macOS/Metal: built + published as an OCI image by CI (includeDarwin in
+	# .github/backend-matrix.yml) so Apple Silicon users can install this backend.
 	ifneq ($(BUILD_TYPE),metal)
 		CMAKE_ARGS+=-DGGML_METAL=OFF
 	else
diff --git a/backend/go/sam3-cpp/Makefile b/backend/go/sam3-cpp/Makefile
index 27b6cedf7..f91bb356a 100644
--- a/backend/go/sam3-cpp/Makefile
+++ b/backend/go/sam3-cpp/Makefile
@@ -31,6 +31,8 @@ else ifeq ($(BUILD_TYPE),hipblas)
 else ifeq ($(BUILD_TYPE),vulkan)
 	CMAKE_ARGS+=-DGGML_VULKAN=ON
 else ifeq ($(OS),Darwin)
+	# macOS/Metal: built + published as an OCI image by CI (includeDarwin in
+	# .github/backend-matrix.yml) so Apple Silicon users can install this backend.
 	ifneq ($(BUILD_TYPE),metal)
 		CMAKE_ARGS+=-DGGML_METAL=OFF
 	else
diff --git a/backend/index.yaml b/backend/index.yaml
index 4a7a07d82..bc51cf68e 100644
--- a/backend/index.yaml
+++ b/backend/index.yaml
@@ -340,6 +340,7 @@
     nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-sam3-cpp"
     intel: "intel-sycl-f32-sam3-cpp"
     vulkan: "vulkan-sam3-cpp"
+    metal: "metal-sam3-cpp"
 - &rfdetrcpp
   name: "rfdetr-cpp"
   alias: "rfdetr-cpp"
@@ -368,6 +369,7 @@
     nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-rfdetr-cpp"
     intel: "intel-sycl-f32-rfdetr-cpp"
     vulkan: "vulkan-rfdetr-cpp"
+    metal: "metal-rfdetr-cpp"
 - &locateanything
   name: "locate-anything"
   alias: "locate-anything"
@@ -397,6 +399,7 @@
     nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-locate-anything-cpp"
     intel: "intel-sycl-f32-locate-anything-cpp"
     vulkan: "vulkan-locate-anything-cpp"
+    metal: "metal-locate-anything-cpp"
 - !!merge <<: *locateanything
   name: "locate-anything-development"
   capabilities:
@@ -409,6 +412,7 @@
     nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-locate-anything-cpp-development"
     intel: "intel-sycl-f32-locate-anything-cpp-development"
     vulkan: "vulkan-locate-anything-cpp-development"
+    metal: "metal-locate-anything-cpp-development"
 - !!merge <<: *locateanything
   name: "cpu-locate-anything-cpp"
   uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-locate-anything-cpp"
@@ -419,6 +423,16 @@
   uri: "quay.io/go-skynet/local-ai-backends:master-cpu-locate-anything-cpp"
   mirrors:
     - localai/localai-backends:master-cpu-locate-anything-cpp
+- !!merge <<: *locateanything
+  name: "metal-locate-anything-cpp"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-locate-anything-cpp"
+  mirrors:
+    - localai/localai-backends:latest-metal-darwin-arm64-locate-anything-cpp
+- !!merge <<: *locateanything
+  name: "metal-locate-anything-cpp-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-locate-anything-cpp"
+  mirrors:
+    - localai/localai-backends:master-metal-darwin-arm64-locate-anything-cpp
 - !!merge <<: *locateanything
   name: "cuda12-locate-anything-cpp"
   uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-locate-anything-cpp"
@@ -517,6 +531,7 @@
     nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-depth-anything-cpp"
     intel: "intel-sycl-f32-depth-anything-cpp"
     vulkan: "vulkan-depth-anything-cpp"
+    metal: "metal-depth-anything-cpp"
 - !!merge <<: *depthanything
   name: "depth-anything-development"
   capabilities:
@@ -529,6 +544,7 @@
     nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-depth-anything-cpp-development"
     intel: "intel-sycl-f32-depth-anything-cpp-development"
     vulkan: "vulkan-depth-anything-cpp-development"
+    metal: "metal-depth-anything-cpp-development"
 - !!merge <<: *depthanything
   name: "cpu-depth-anything-cpp"
   uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-depth-anything-cpp"
@@ -539,6 +555,16 @@
   uri: "quay.io/go-skynet/local-ai-backends:master-cpu-depth-anything-cpp"
   mirrors:
     - localai/localai-backends:master-cpu-depth-anything-cpp
+- !!merge <<: *depthanything
+  name: "metal-depth-anything-cpp"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-depth-anything-cpp"
+  mirrors:
+    - localai/localai-backends:latest-metal-darwin-arm64-depth-anything-cpp
+- !!merge <<: *depthanything
+  name: "metal-depth-anything-cpp-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-depth-anything-cpp"
+  mirrors:
+    - localai/localai-backends:master-metal-darwin-arm64-depth-anything-cpp
 - !!merge <<: *depthanything
   name: "cuda12-depth-anything-cpp"
   uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-depth-anything-cpp"
@@ -3220,6 +3246,7 @@
     nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-sam3-cpp-development"
     intel: "intel-sycl-f32-sam3-cpp-development"
     vulkan: "vulkan-sam3-cpp-development"
+    metal: "metal-sam3-cpp-development"
 - !!merge <<: *sam3cpp
   name: "cpu-sam3-cpp"
   uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-sam3-cpp"
@@ -3230,6 +3257,16 @@
   uri: "quay.io/go-skynet/local-ai-backends:master-cpu-sam3-cpp"
   mirrors:
     - localai/localai-backends:master-cpu-sam3-cpp
+- !!merge <<: *sam3cpp
+  name: "metal-sam3-cpp"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-sam3-cpp"
+  mirrors:
+    - localai/localai-backends:latest-metal-darwin-arm64-sam3-cpp
+- !!merge <<: *sam3cpp
+  name: "metal-sam3-cpp-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-sam3-cpp"
+  mirrors:
+    - localai/localai-backends:master-metal-darwin-arm64-sam3-cpp
 - !!merge <<: *sam3cpp
   name: "cuda12-sam3-cpp"
   uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-sam3-cpp"
@@ -3303,6 +3340,7 @@
     nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-rfdetr-cpp-development"
     intel: "intel-sycl-f32-rfdetr-cpp-development"
     vulkan: "vulkan-rfdetr-cpp-development"
+    metal: "metal-rfdetr-cpp-development"
 - !!merge <<: *rfdetrcpp
   name: "cpu-rfdetr-cpp"
   uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-rfdetr-cpp"
@@ -3313,6 +3351,16 @@
   uri: "quay.io/go-skynet/local-ai-backends:master-cpu-rfdetr-cpp"
   mirrors:
     - localai/localai-backends:master-cpu-rfdetr-cpp
+- !!merge <<: *rfdetrcpp
+  name: "metal-rfdetr-cpp"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-rfdetr-cpp"
+  mirrors:
+    - localai/localai-backends:latest-metal-darwin-arm64-rfdetr-cpp
+- !!merge <<: *rfdetrcpp
+  name: "metal-rfdetr-cpp-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-rfdetr-cpp"
+  mirrors:
+    - localai/localai-backends:master-metal-darwin-arm64-rfdetr-cpp
 - !!merge <<: *rfdetrcpp
   name: "cuda12-rfdetr-cpp"
   uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-rfdetr-cpp"

From 286c508ce012dfb33106c55d8dbc7d9787d28094 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Thu, 25 Jun 2026 22:54:36 +0200
Subject: [PATCH 98/99] feat(backends): darwin build for the localvqe backend
 (acoustic echo cancellation) (#10512)

feat(backends): darwin build for the localvqe backend

LocalVQE (acoustic echo cancellation / noise suppression / dereverberation)
already builds on Darwin - its Makefile takes the OS=Darwin branch with
GGML_METAL=OFF (upstream is CPU + Vulkan only), producing a native arm64 CPU
image. It was just never wired into CI.

- .github/backend-matrix.yml: add localvqe to includeDarwin (build-type metal,
  lang go) - the darwin/arm64 build profile; the backend itself stays CPU.
- backend/index.yaml: metal: capability + concrete metal-localvqe(-development)
  entries pointing at the -metal-darwin-arm64-localvqe images.
- backend/go/localvqe/Makefile: note on the existing Darwin branch (also the
  per-backend change the CI path filter needs to build it here).


Assisted-by: Claude:opus-4.8 [Claude Code]

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
---
 .github/backend-matrix.yml   |  6 ++++++
 backend/go/localvqe/Makefile |  2 ++
 backend/index.yaml           | 12 ++++++++++++
 3 files changed, 20 insertions(+)

diff --git a/.github/backend-matrix.yml b/.github/backend-matrix.yml
index dc12daf97..4d2b85977 100644
--- a/.github/backend-matrix.yml
+++ b/.github/backend-matrix.yml
@@ -4941,6 +4941,12 @@ includeDarwin:
     tag-suffix: "-metal-darwin-arm64-sam3-cpp"
     build-type: "metal"
     lang: "go"
+  # LocalVQE has no Metal path; on Apple Silicon it builds CPU-only (GGML_METAL
+  # OFF) but is still a native arm64 image. Uses the darwin/metal build profile.
+  - backend: "localvqe"
+    tag-suffix: "-metal-darwin-arm64-localvqe"
+    build-type: "metal"
+    lang: "go"
   - backend: "voxtral"
     tag-suffix: "-metal-darwin-arm64-voxtral"
     build-type: "metal"
diff --git a/backend/go/localvqe/Makefile b/backend/go/localvqe/Makefile
index 049da0cdd..58b73c3b9 100644
--- a/backend/go/localvqe/Makefile
+++ b/backend/go/localvqe/Makefile
@@ -32,6 +32,8 @@ endif
 ifeq ($(BUILD_TYPE),vulkan)
 	CMAKE_ARGS+=-DGGML_VULKAN=ON -DLOCALVQE_VULKAN=ON
 else ifeq ($(OS),Darwin)
+	# Apple Silicon: CPU-only (no Metal upstream); built + published as an arm64
+	# image by CI (includeDarwin in .github/backend-matrix.yml) for macOS install.
 	CMAKE_ARGS+=-DGGML_METAL=OFF
 endif
 
diff --git a/backend/index.yaml b/backend/index.yaml
index bc51cf68e..a7399e20d 100644
--- a/backend/index.yaml
+++ b/backend/index.yaml
@@ -1057,6 +1057,8 @@
     nvidia-l4t: "vulkan-localvqe"
     nvidia-l4t-cuda-12: "vulkan-localvqe"
     nvidia-l4t-cuda-13: "vulkan-localvqe"
+    # Apple Silicon: CPU build (LocalVQE has no Metal path); still arm64-native.
+    metal: "metal-localvqe"
 - &privacyfilter
   name: "privacy-filter"
   alias: "privacy-filter"
@@ -4149,6 +4151,16 @@
   uri: "quay.io/go-skynet/local-ai-backends:master-gpu-vulkan-localvqe"
   mirrors:
     - localai/localai-backends:master-gpu-vulkan-localvqe
+- !!merge <<: *localvqecpp
+  name: "metal-localvqe"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-localvqe"
+  mirrors:
+    - localai/localai-backends:latest-metal-darwin-arm64-localvqe
+- !!merge <<: *localvqecpp
+  name: "metal-localvqe-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-localvqe"
+  mirrors:
+    - localai/localai-backends:master-metal-darwin-arm64-localvqe
 ## kokoro
 - !!merge <<: *kokoro
   name: "kokoro-development"

From f2ed63e39a60901d6b414b34ec23e72774cf26b1 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Thu, 25 Jun 2026 23:26:39 +0200
Subject: [PATCH 99/99] docs(backends): make OS coverage explicit + require
 darwin support for new backends (#10516)

docs(backends): make OS coverage explicit + require darwin for new backends

The backend matrix is the source of truth for which OS a backend ships on, but
that was never written down, so backends were landing Linux-only by default even
when the engine builds fine on macOS.

- .github/backend-matrix.yml: header block documenting the two matrices
  (include = Linux, includeDarwin = macOS/Apple Silicon) and the policy that new
  backends target every OS they can build for.
- .agents/adding-backends.md: a 'Cover every OS' subsection in step 2 (full darwin
  wiring: includeDarwin entry, index.yaml metal: + metal-<backend> entries,
  run.sh DYLD branch + inferBackendPathDarwin case for C++ backends, the
  hw_grpc_proto protobuf/grpc link gotcha, and the path-filter touch) plus a
  verification-checklist item.
- AGENTS.md (CLAUDE.md): Quick Reference pointer so it surfaces every session.


Assisted-by: Claude:opus-4.8 [Claude Code]

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
---
 .agents/adding-backends.md | 19 +++++++++++++++++++
 .github/backend-matrix.yml | 22 ++++++++++++++++++++++
 AGENTS.md                  |  1 +
 3 files changed, 42 insertions(+)

diff --git a/.agents/adding-backends.md b/.agents/adding-backends.md
index ab965f789..fb98c55f2 100644
--- a/.agents/adding-backends.md
+++ b/.agents/adding-backends.md
@@ -102,6 +102,24 @@ Multi-arch backends are NOT a single matrix entry with `platforms: 'linux/amd64,
 
 Entries whose `dockerfile` is `./backend/Dockerfile.{llama-cpp,ik-llama-cpp,turboquant}` must also set a `builder-base-image` field pointing at a prebuilt base from `quay.io/go-skynet/ci-cache:base-grpc-*` (CI builds these via `.github/workflows/base-images.yml`). The mapping is by `(build-type, platforms)` — see existing entries for the pattern. CI uses these prebuilt bases to skip the gRPC compile (~25–35 min cold). Local `make backends/<name>` ignores `builder-base-image` and uses the from-source path inside the Dockerfile, so you don't need quay access for local builds.
 
+### Cover every OS the project supports (Linux **and** Darwin)
+
+`.github/backend-matrix.yml` has two matrices, and they are the source of truth for which OS a backend ships on:
+
+- `include:` — the **Linux** matrix (x86_64 + arm64; CPU and CUDA / ROCm / SYCL / Vulkan).
+- `includeDarwin:` — the **macOS / Apple Silicon** matrix (arm64; Metal where the engine supports it, otherwise a native arm64 CPU build).
+
+**A new backend must target every OS it can build for — do not ship Linux-only by default.** A backend that appears only under `include:` is silently unavailable on macOS even when its code would run there. Most C/C++/GGML engines build on Darwin out of the box (ggml defaults `GGML_METAL=ON` on Apple, so a plain build is Metal-enabled), and many Python backends do too (CPU / MPS wheels). If a backend genuinely cannot support an OS (e.g. CUDA-only, no CPU variant), state that in the PR description instead of omitting it silently.
+
+Wiring a backend into `includeDarwin:` is more than the matrix entry:
+
+1. **`includeDarwin:` entry** — `tag-suffix: "-metal-darwin-arm64-<backend>"`, `build-type: "metal"`, `lang: "go"` for go+ggml backends; omit `build-type` for the bespoke C++ ones (llama-cpp / ds4 / privacy-filter). Match an existing entry of the same shape.
+2. **`backend/index.yaml`** — add `metal:` to the backend's `capabilities` map (main and `-development`) and concrete `metal-<backend>` / `metal-<backend>-development` image entries pointing at the `-metal-darwin-arm64-<backend>` images.
+3. **C/C++ backends only** — add an `inferBackendPathDarwin` case in `scripts/changed-backends.js` returning `backend/cpp/<backend>/` (the generic fallthrough assumes `backend/<lang>/`, which is wrong for a C++ source tree driven with `lang: go`), and give `run.sh` a Darwin branch that exports `DYLD_LIBRARY_PATH` instead of `LD_LIBRARY_PATH`. If the build is bespoke (single `grpc-server` + dylib bundling), model it on `scripts/build/ds4-darwin.sh` and add a `backends/<backend>-darwin` make target plus a gated step in `.github/workflows/backend_build_darwin.yml`.
+4. **C++ proto gotcha** — if the backend compiles the generated gRPC/protobuf in a separate CMake target (e.g. `hw_grpc_proto`), that target must link `protobuf::libprotobuf` + `gRPC::grpc++` so the Homebrew include dirs propagate; otherwise macOS fails with `google/protobuf/runtime_version.h not found` (Linux hides this because apt headers sit in `/usr/include`).
+
+The CI path filter only builds a backend on a PR when a file under its directory changes, so a darwin-only YAML edit builds nothing — touch a file under `backend/<lang>/<backend>/` (a one-line comment is enough) in the same PR.
+
 ## 3. Add Backend Metadata to `backend/index.yaml`
 
 **Step 3a: Add Meta Definition**
@@ -225,6 +243,7 @@ After adding a new backend, verify:
 
 - [ ] Backend directory structure is complete with all necessary files
 - [ ] Build configurations added to `.github/backend-matrix.yml` for all desired platforms (per-arch entries with `platform-tag` for multi-arch; `builder-base-image` for llama-cpp / ik-llama-cpp / turboquant)
+- [ ] **OS coverage considered**: added to `includeDarwin:` (macOS/Apple Silicon) if the backend can build there — with the `backend/index.yaml` `metal:` capability + `metal-<backend>` image entries, a `run.sh` Darwin/DYLD branch and `inferBackendPathDarwin` case for C++ backends — or the PR explains why an OS is unsupported. Do not ship Linux-only by default.
 - [ ] Meta definition added to `backend/index.yaml` in the `## metas` section
 - [ ] Image entries added to `backend/index.yaml` for all build variants (latest + development)
 - [ ] Tag suffixes match between workflow file and index.yaml
diff --git a/.github/backend-matrix.yml b/.github/backend-matrix.yml
index 4d2b85977..b66f1bbf3 100644
--- a/.github/backend-matrix.yml
+++ b/.github/backend-matrix.yml
@@ -2,6 +2,28 @@
 # Matrix data for backend container image builds.
 # Consumed by scripts/changed-backends.js for both backend.yml and backend_pr.yml.
 # This file is NOT a workflow — it has no top-level 'on:' or 'jobs:'.
+#
+# OS / platform coverage — READ THIS WHEN ADDING A BACKEND
+# --------------------------------------------------------
+# This file is the source of truth for which OS each backend is built and
+# published for. A backend ships ONLY for the matrices it appears in:
+#   - Linux  -> the `include:` matrix below (x86_64 + arm64; CPU and
+#               CUDA / ROCm / SYCL / Vulkan variants).
+#   - macOS  -> the `includeDarwin:` matrix (Apple Silicon / arm64; Metal where
+#               the engine supports it, otherwise a native arm64 CPU build).
+#
+# New backends must target EVERY OS they can build for, not just Linux. A backend
+# listed only under `include:` is silently unavailable on macOS even when its code
+# would run there. Most C/C++/GGML engines build on Darwin (ggml defaults
+# GGML_METAL=ON on Apple, so a plain build is Metal-enabled), and many Python
+# backends do too (CPU / MPS). If a backend genuinely cannot support an OS, say so
+# in its PR description rather than silently omitting it.
+#
+# Adding a backend to `includeDarwin:` is more than one line — see the darwin
+# checklist in .agents/adding-backends.md (includeDarwin entry, the index.yaml
+# `metal:` capability + `metal-<backend>` image entries, a `run.sh` Darwin/DYLD
+# branch for C/C++ backends, and the inferBackendPathDarwin case in
+# scripts/changed-backends.js so the path filter actually builds it).
 
 # Linux matrix (consumed by backend-jobs).
 include:
diff --git a/AGENTS.md b/AGENTS.md
index 9f397e613..1095ef531 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -43,4 +43,5 @@ LocalAI follows the Linux kernel project's [guidelines for AI coding assistants]
 - **New API endpoints**: LocalAI advertises its capability surface in several independent places — swagger `@Tags`, `/api/instructions` registry, auth `RouteFeatureRegistry`, React UI `capabilities.js`, docs. Read [.agents/api-endpoints-and-auth.md](.agents/api-endpoints-and-auth.md) and follow its checklist — missing any surface means clients, admins, and the UI won't know the endpoint exists.
 - **Admin endpoints → MCP tool**: every admin endpoint that an admin would manage conversationally (install/list/edit/toggle/upgrade) MUST also be exposed as an MCP tool in `pkg/mcp/localaitools/`. The LocalAI Assistant chat modality and the standalone `local-ai mcp-server` consume that package; drift between REST and MCP is a real risk. Read [.agents/localai-assistant-mcp.md](.agents/localai-assistant-mcp.md) — the `TestToolHTTPRouteMappingComplete` test fails until you wire the new tool and update the route map.
 - **Build**: Inspect `Makefile` and `.github/workflows/` — ask the user before running long builds
+- **Backend OS coverage**: a new backend must target every OS it can build for, not just Linux. `.github/backend-matrix.yml` has two matrices — `include:` (Linux) and `includeDarwin:` (macOS / Apple Silicon). Most C/C++/GGML and many Python backends build on Darwin too — wire the `includeDarwin` entry + `backend/index.yaml` `metal:` entries, or say in the PR why an OS is unsupported. See the darwin checklist in [.agents/adding-backends.md](.agents/adding-backends.md).
 - **UI**: The active UI is the React app in `core/http/react-ui/`. The older Alpine.js/HTML UI in `core/http/static/` is pending deprecation — all new UI work goes in the React UI