chore(deps): bump mxschmitt/action-tmate from 3.23 to 3.24

Bumps [mxschmitt/action-tmate](https://github.com/mxschmitt/action-tmate) from 3.23 to 3.24.
- [Release notes](https://github.com/mxschmitt/action-tmate/releases)
- [Changelog](https://github.com/mxschmitt/action-tmate/blob/master/RELEASE.md)
- [Commits](https://github.com/mxschmitt/action-tmate/compare/v3.23...v3.24)

---
updated-dependencies:
- dependency-name: mxschmitt/action-tmate
  dependency-version: '3.24'
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
chore(deps): bump actions/checkout from 6 to 7 (#10451)
2026-06-23 16:19:07 -04:00 · 2026-06-22 19:42:05 +00:00 · 2026-06-22 21:38:37 +02:00 · 2026-06-22 21:28:49 +02:00 · 2026-06-22 21:27:43 +02:00 · 2026-06-22 18:26:19 +02:00
147 changed files with 4548 additions and 4724 deletions
--- a/.github/backend-matrix.yml
+++ b/.github/backend-matrix.yml
@@ -3723,302 +3723,6 @@ include:
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
-  # voice-detect
-  - build-type: 'cublas'
-    cuda-major-version: "12"
-    cuda-minor-version: "8"
-    platforms: 'linux/amd64'
-    tag-latest: 'auto'
-    tag-suffix: '-gpu-nvidia-cuda-12-voice-detect'
-    runs-on: 'ubuntu-latest'
-    base-image: "ubuntu:24.04"
-    skip-drivers: 'false'
-    backend: "voice-detect"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2404'
-  - build-type: 'cublas'
-    cuda-major-version: "13"
-    cuda-minor-version: "0"
-    platforms: 'linux/amd64'
-    tag-latest: 'auto'
-    tag-suffix: '-gpu-nvidia-cuda-13-voice-detect'
-    runs-on: 'ubuntu-latest'
-    base-image: "ubuntu:24.04"
-    skip-drivers: 'false'
-    backend: "voice-detect"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2404'
-  - build-type: 'cublas'
-    cuda-major-version: "13"
-    cuda-minor-version: "0"
-    platforms: 'linux/arm64'
-    skip-drivers: 'false'
-    tag-latest: 'auto'
-    tag-suffix: '-nvidia-l4t-cuda-13-arm64-voice-detect'
-    base-image: "ubuntu:24.04"
-    ubuntu-version: '2404'
-    runs-on: 'ubuntu-24.04-arm'
-    backend: "voice-detect"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-  - build-type: ''
-    cuda-major-version: ""
-    cuda-minor-version: ""
-    platforms: 'linux/amd64'
-    platform-tag: 'amd64'
-    tag-latest: 'auto'
-    tag-suffix: '-cpu-voice-detect'
-    runs-on: 'ubuntu-latest'
-    base-image: "ubuntu:24.04"
-    skip-drivers: 'false'
-    backend: "voice-detect"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2404'
-  - build-type: ''
-    cuda-major-version: ""
-    cuda-minor-version: ""
-    platforms: 'linux/arm64'
-    platform-tag: 'arm64'
-    tag-latest: 'auto'
-    tag-suffix: '-cpu-voice-detect'
-    runs-on: 'ubuntu-24.04-arm'
-    base-image: "ubuntu:24.04"
-    skip-drivers: 'false'
-    backend: "voice-detect"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2404'
-  - build-type: 'sycl_f32'
-    cuda-major-version: ""
-    cuda-minor-version: ""
-    platforms: 'linux/amd64'
-    tag-latest: 'auto'
-    tag-suffix: '-gpu-intel-sycl-f32-voice-detect'
-    runs-on: 'ubuntu-latest'
-    base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
-    skip-drivers: 'false'
-    backend: "voice-detect"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2404'
-  - build-type: 'sycl_f16'
-    cuda-major-version: ""
-    cuda-minor-version: ""
-    platforms: 'linux/amd64'
-    tag-latest: 'auto'
-    tag-suffix: '-gpu-intel-sycl-f16-voice-detect'
-    runs-on: 'ubuntu-latest'
-    base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
-    skip-drivers: 'false'
-    backend: "voice-detect"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2404'
-  - build-type: 'vulkan'
-    cuda-major-version: ""
-    cuda-minor-version: ""
-    platforms: 'linux/amd64'
-    platform-tag: 'amd64'
-    tag-latest: 'auto'
-    tag-suffix: '-gpu-vulkan-voice-detect'
-    runs-on: 'ubuntu-latest'
-    base-image: "ubuntu:24.04"
-    skip-drivers: 'false'
-    backend: "voice-detect"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2404'
-  - build-type: 'vulkan'
-    cuda-major-version: ""
-    cuda-minor-version: ""
-    platforms: 'linux/arm64'
-    platform-tag: 'arm64'
-    tag-latest: 'auto'
-    tag-suffix: '-gpu-vulkan-voice-detect'
-    runs-on: 'ubuntu-24.04-arm'
-    base-image: "ubuntu:24.04"
-    skip-drivers: 'false'
-    backend: "voice-detect"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2404'
-  - build-type: 'cublas'
-    cuda-major-version: "12"
-    cuda-minor-version: "0"
-    platforms: 'linux/arm64'
-    skip-drivers: 'false'
-    tag-latest: 'auto'
-    tag-suffix: '-nvidia-l4t-arm64-voice-detect'
-    base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
-    runs-on: 'ubuntu-24.04-arm'
-    backend: "voice-detect"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2204'
-  - build-type: 'hipblas'
-    cuda-major-version: ""
-    cuda-minor-version: ""
-    platforms: 'linux/amd64'
-    tag-latest: 'auto'
-    tag-suffix: '-gpu-rocm-hipblas-voice-detect'
-    base-image: "rocm/dev-ubuntu-24.04:7.2.1"
-    runs-on: 'ubuntu-latest'
-    skip-drivers: 'false'
-    backend: "voice-detect"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2404'
-  # face-detect
-  - build-type: 'cublas'
-    cuda-major-version: "12"
-    cuda-minor-version: "8"
-    platforms: 'linux/amd64'
-    tag-latest: 'auto'
-    tag-suffix: '-gpu-nvidia-cuda-12-face-detect'
-    runs-on: 'ubuntu-latest'
-    base-image: "ubuntu:24.04"
-    skip-drivers: 'false'
-    backend: "face-detect"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2404'
-  - build-type: 'cublas'
-    cuda-major-version: "13"
-    cuda-minor-version: "0"
-    platforms: 'linux/amd64'
-    tag-latest: 'auto'
-    tag-suffix: '-gpu-nvidia-cuda-13-face-detect'
-    runs-on: 'ubuntu-latest'
-    base-image: "ubuntu:24.04"
-    skip-drivers: 'false'
-    backend: "face-detect"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2404'
-  - build-type: 'cublas'
-    cuda-major-version: "13"
-    cuda-minor-version: "0"
-    platforms: 'linux/arm64'
-    skip-drivers: 'false'
-    tag-latest: 'auto'
-    tag-suffix: '-nvidia-l4t-cuda-13-arm64-face-detect'
-    base-image: "ubuntu:24.04"
-    ubuntu-version: '2404'
-    runs-on: 'ubuntu-24.04-arm'
-    backend: "face-detect"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-  - build-type: ''
-    cuda-major-version: ""
-    cuda-minor-version: ""
-    platforms: 'linux/amd64'
-    platform-tag: 'amd64'
-    tag-latest: 'auto'
-    tag-suffix: '-cpu-face-detect'
-    runs-on: 'ubuntu-latest'
-    base-image: "ubuntu:24.04"
-    skip-drivers: 'false'
-    backend: "face-detect"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2404'
-  - build-type: ''
-    cuda-major-version: ""
-    cuda-minor-version: ""
-    platforms: 'linux/arm64'
-    platform-tag: 'arm64'
-    tag-latest: 'auto'
-    tag-suffix: '-cpu-face-detect'
-    runs-on: 'ubuntu-24.04-arm'
-    base-image: "ubuntu:24.04"
-    skip-drivers: 'false'
-    backend: "face-detect"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2404'
-  - build-type: 'sycl_f32'
-    cuda-major-version: ""
-    cuda-minor-version: ""
-    platforms: 'linux/amd64'
-    tag-latest: 'auto'
-    tag-suffix: '-gpu-intel-sycl-f32-face-detect'
-    runs-on: 'ubuntu-latest'
-    base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
-    skip-drivers: 'false'
-    backend: "face-detect"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2404'
-  - build-type: 'sycl_f16'
-    cuda-major-version: ""
-    cuda-minor-version: ""
-    platforms: 'linux/amd64'
-    tag-latest: 'auto'
-    tag-suffix: '-gpu-intel-sycl-f16-face-detect'
-    runs-on: 'ubuntu-latest'
-    base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
-    skip-drivers: 'false'
-    backend: "face-detect"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2404'
-  - build-type: 'vulkan'
-    cuda-major-version: ""
-    cuda-minor-version: ""
-    platforms: 'linux/amd64'
-    platform-tag: 'amd64'
-    tag-latest: 'auto'
-    tag-suffix: '-gpu-vulkan-face-detect'
-    runs-on: 'ubuntu-latest'
-    base-image: "ubuntu:24.04"
-    skip-drivers: 'false'
-    backend: "face-detect"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2404'
-  - build-type: 'vulkan'
-    cuda-major-version: ""
-    cuda-minor-version: ""
-    platforms: 'linux/arm64'
-    platform-tag: 'arm64'
-    tag-latest: 'auto'
-    tag-suffix: '-gpu-vulkan-face-detect'
-    runs-on: 'ubuntu-24.04-arm'
-    base-image: "ubuntu:24.04"
-    skip-drivers: 'false'
-    backend: "face-detect"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2404'
-  - build-type: 'cublas'
-    cuda-major-version: "12"
-    cuda-minor-version: "0"
-    platforms: 'linux/arm64'
-    skip-drivers: 'false'
-    tag-latest: 'auto'
-    tag-suffix: '-nvidia-l4t-arm64-face-detect'
-    base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
-    runs-on: 'ubuntu-24.04-arm'
-    backend: "face-detect"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2204'
-  - build-type: 'hipblas'
-    cuda-major-version: ""
-    cuda-minor-version: ""
-    platforms: 'linux/amd64'
-    tag-latest: 'auto'
-    tag-suffix: '-gpu-rocm-hipblas-face-detect'
-    base-image: "rocm/dev-ubuntu-24.04:7.2.1"
-    runs-on: 'ubuntu-latest'
-    skip-drivers: 'false'
-    backend: "face-detect"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2404'
  # acestep-cpp
  - build-type: ''
    cuda-major-version: ""
@@ -5202,14 +4906,6 @@ includeDarwin:
    tag-suffix: "-metal-darwin-arm64-ced"
    build-type: "metal"
    lang: "go"
-  - backend: "voice-detect"
-    tag-suffix: "-metal-darwin-arm64-voice-detect"
-    build-type: "metal"
-    lang: "go"
-  - backend: "face-detect"
-    tag-suffix: "-metal-darwin-arm64-face-detect"
-    build-type: "metal"
-    lang: "go"
  - backend: "acestep-cpp"
    tag-suffix: "-metal-darwin-arm64-acestep-cpp"
    build-type: "metal"
--- a/.github/workflows/backend.yml
+++ b/.github/workflows/backend.yml
@@ -44,7 +44,7 @@ jobs:
      has-merges-singlearch: ${{ steps.set-matrix.outputs['has-merges-singlearch'] }}
    steps:
      - name: Checkout repository
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7

      - name: Setup Bun
        uses: oven-sh/setup-bun@v2
--- a/.github/workflows/backend_build.yml
+++ b/.github/workflows/backend_build.yml
@@ -101,7 +101,7 @@ jobs:
    steps:

      - name: Checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true

--- a/.github/workflows/backend_build_darwin.yml
+++ b/.github/workflows/backend_build_darwin.yml
@@ -57,7 +57,7 @@ jobs:
      HOMEBREW_NO_ANALYTICS: '1'
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true

--- a/.github/workflows/backend_merge.yml
+++ b/.github/workflows/backend_merge.yml
@@ -49,7 +49,7 @@ jobs:
      # Sparse checkout: the merge job needs `.github/scripts/` (for the
      # keepalive cleanup script) but none of the source tree.
      - name: Checkout (.github/scripts only)
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          sparse-checkout: |
            .github/scripts
--- a/.github/workflows/backend_pr.yml
+++ b/.github/workflows/backend_pr.yml
@@ -23,7 +23,7 @@ jobs:
      has-merges-singlearch: ${{ steps.set-matrix.outputs['has-merges-singlearch'] }}
    steps:
      - name: Checkout repository
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7

      - name: Setup Bun
        uses: oven-sh/setup-bun@v2
--- a/.github/workflows/base-images.yml
+++ b/.github/workflows/base-images.yml
@@ -127,7 +127,7 @@ jobs:
            # the original l4t matrix entry which set skip-drivers: 'true'.
            skip-drivers: 'true'
    steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v7
        with:
          submodules: false
      - name: Free disk space
--- a/.github/workflows/build-test.yaml
+++ b/.github/workflows/build-test.yaml
@@ -11,7 +11,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          fetch-depth: 0
      - name: Set up Go
@@ -25,7 +25,7 @@ jobs:
    runs-on: macos-latest
    steps:
      - name: Checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          fetch-depth: 0
      - name: Set up Go
@@ -47,7 +47,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          fetch-depth: 0
      - name: Configure apt mirror on runner
--- a/.github/workflows/bump-inference-defaults.yml
+++ b/.github/workflows/bump-inference-defaults.yml
@@ -14,7 +14,7 @@ jobs:
  bump:
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v7

      - uses: actions/setup-go@v5
        with:
--- a/.github/workflows/bump_deps.yaml
+++ b/.github/workflows/bump_deps.yaml
@@ -46,14 +46,6 @@ jobs:
            variable: "CED_VERSION"
            branch: "master"
            file: "backend/go/ced/Makefile"
-          - repository: "mudler/voice-detect.cpp"
-            variable: "VOICEDETECT_VERSION"
-            branch: "master"
-            file: "backend/go/voice-detect/Makefile"
-          - repository: "mudler/face-detect.cpp"
-            variable: "FACEDETECT_VERSION"
-            branch: "master"
-            file: "backend/go/face-detect/Makefile"
          - repository: "mudler/depth-anything.cpp"
            variable: "DEPTHANYTHING_VERSION"
            branch: "master"
@@ -100,7 +92,7 @@ jobs:
            file: "backend/go/vibevoice-cpp/Makefile"
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v7
      - name: Bump dependencies 🔧
        id: bump
        run: |
@@ -136,7 +128,7 @@ jobs:
    if: github.repository == 'mudler/LocalAI'
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v7
      - name: Bump vLLM cu130 wheel pin 🔧
        id: bump
        run: |
--- a/.github/workflows/bump_docs.yaml
+++ b/.github/workflows/bump_docs.yaml
@@ -13,7 +13,7 @@ jobs:
          - repository: "mudler/LocalAI"
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v7
      - name: Bump dependencies 🔧
        run: |
          bash .github/bump_docs.sh ${{ matrix.repository }}
--- a/.github/workflows/checksum_checker.yaml
+++ b/.github/workflows/checksum_checker.yaml
@@ -8,7 +8,7 @@ jobs:
    if: github.repository == 'mudler/LocalAI'
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v7
      - name: Configure apt mirror on runner
        uses: ./.github/actions/configure-apt-mirror
      - name: Install dependencies
--- a/.github/workflows/deploy-explorer.yaml
+++ b/.github/workflows/deploy-explorer.yaml
@@ -16,7 +16,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - uses: actions/setup-go@v5
--- a/.github/workflows/gallery-agent.yaml
+++ b/.github/workflows/gallery-agent.yaml
@@ -31,7 +31,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          token: ${{ secrets.GITHUB_TOKEN }}

--- a/.github/workflows/generate_intel_image.yaml
+++ b/.github/workflows/generate_intel_image.yaml
@@ -44,7 +44,7 @@ jobs:
        uses: docker/setup-buildx-action@master

      - name: Checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7

      - name: Cache Intel images
        uses: docker/build-push-action@v7
--- a/.github/workflows/gh-pages.yml
+++ b/.github/workflows/gh-pages.yml
@@ -28,7 +28,7 @@ jobs:
      HUGO_VERSION: "0.146.3"
    steps:
      - name: Checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          fetch-depth: 0  # needed for enableGitInfo
          submodules: true
--- a/.github/workflows/image_build.yml
+++ b/.github/workflows/image_build.yml
@@ -80,7 +80,7 @@ jobs:
    steps:

      - name: Checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7

      - name: Configure apt mirror on runner
        id: apt_mirror
--- a/.github/workflows/image_merge.yml
+++ b/.github/workflows/image_merge.yml
@@ -36,7 +36,7 @@ jobs:
      # Sparse checkout: needed for .github/scripts/ (the keepalive cleanup
      # script). Skips the rest of the source tree.
      - name: Checkout (.github/scripts only)
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          sparse-checkout: |
            .github/scripts
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -20,7 +20,7 @@ jobs:
  golangci-lint:
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v7
        with:
          # Full history so golangci-lint's new-from-merge-base can reach
          # origin/master and compute the diff against it.
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -10,7 +10,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          fetch-depth: 0
      - name: Set up Go
@@ -28,7 +28,7 @@ jobs:
    runs-on: macos-latest
    steps:
      - name: Checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          fetch-depth: 0
      - name: Set up Go
@@ -46,7 +46,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          fetch-depth: 0
      - name: Configure apt mirror on runner
--- a/.github/workflows/secscan.yaml
+++ b/.github/workflows/secscan.yaml
@@ -14,7 +14,7 @@ jobs:
      GO111MODULE: on
    steps:
      - name: Checkout Source
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        if: ${{ github.actor != 'dependabot[bot]' }}
      - name: Run Gosec Security Scanner
        if: ${{ github.actor != 'dependabot[bot]' }}
--- a/.github/workflows/test-extra.yml
+++ b/.github/workflows/test-extra.yml
@@ -50,7 +50,7 @@ jobs:
      parakeet-cpp: ${{ steps.detect.outputs.parakeet-cpp }}
    steps:
      - name: Checkout repository
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
      - name: Setup Bun
        uses: oven-sh/setup-bun@v2
      - name: Install dependencies
@@ -67,7 +67,7 @@ jobs:
  #   runs-on: ubuntu-latest
  #   steps:
  #     - name: Clone
-  #       uses: actions/checkout@v6
+  #       uses: actions/checkout@v7
  #       with:
  #         submodules: true
  #     - name: Dependencies
@@ -90,7 +90,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -113,7 +113,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -137,7 +137,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -158,7 +158,7 @@ jobs:
  #  runs-on: ubuntu-latest
  #  steps:
  #    - name: Clone
-  #      uses: actions/checkout@v6
+  #      uses: actions/checkout@v7
  #      with:
  #        submodules: true
  #    - name: Dependencies
@@ -178,7 +178,7 @@ jobs:
  #   runs-on: ubuntu-latest
  #   steps:
  #     - name: Clone
-  #       uses: actions/checkout@v6
+  #       uses: actions/checkout@v7
  #       with:
  #         submodules: true
  #     - name: Dependencies
@@ -240,7 +240,7 @@ jobs:
  #           sudo rm -rf "$AGENT_TOOLSDIRECTORY" || true
  #           df -h
  #     - name: Clone
-  #       uses: actions/checkout@v6
+  #       uses: actions/checkout@v7
  #       with:
  #         submodules: true
  #     - name: Dependencies
@@ -265,7 +265,7 @@ jobs:
  #   runs-on: ubuntu-latest
  #   steps:
  #     - name: Clone
-  #       uses: actions/checkout@v6
+  #       uses: actions/checkout@v7
  #       with:
  #         submodules: true
  #     - name: Dependencies
@@ -288,7 +288,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -309,7 +309,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -330,7 +330,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -351,7 +351,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -373,7 +373,7 @@ jobs:
  #   timeout-minutes: 45
  #   steps:
  #     - name: Clone
-  #       uses: actions/checkout@v6
+  #       uses: actions/checkout@v7
  #       with:
  #         submodules: true
  #     - name: Dependencies
@@ -394,7 +394,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -415,7 +415,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -436,7 +436,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -462,7 +462,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -484,7 +484,7 @@ jobs:
    timeout-minutes: 30
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -513,7 +513,7 @@ jobs:
    timeout-minutes: 90
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Setup Go
@@ -530,7 +530,7 @@ jobs:
    timeout-minutes: 90
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Setup Go
@@ -552,7 +552,7 @@ jobs:
    timeout-minutes: 20
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Setup Go
@@ -579,7 +579,7 @@ jobs:
    timeout-minutes: 90
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Setup Go
@@ -604,7 +604,7 @@ jobs:
    timeout-minutes: 90
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Setup Go
@@ -625,7 +625,7 @@ jobs:
    timeout-minutes: 90
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Setup Go
@@ -645,7 +645,7 @@ jobs:
    timeout-minutes: 90
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Setup Go
@@ -664,7 +664,7 @@ jobs:
    timeout-minutes: 90
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Setup Go
@@ -681,7 +681,7 @@ jobs:
    timeout-minutes: 90
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Setup Go
@@ -698,7 +698,7 @@ jobs:
    timeout-minutes: 90
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Setup Go
@@ -741,7 +741,7 @@ jobs:
  #   timeout-minutes: 90
  #   steps:
  #     - name: Clone
-  #       uses: actions/checkout@v6
+  #       uses: actions/checkout@v7
  #       with:
  #         submodules: true
  #     - name: Dependencies
@@ -783,7 +783,7 @@ jobs:
  #   timeout-minutes: 90
  #   steps:
  #     - name: Clone
-  #       uses: actions/checkout@v6
+  #       uses: actions/checkout@v7
  #       with:
  #         submodules: true
  #     - name: Dependencies
@@ -808,7 +808,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -840,7 +840,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -876,7 +876,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -915,7 +915,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -952,7 +952,7 @@ jobs:
    timeout-minutes: 90
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -987,7 +987,7 @@ jobs:
    timeout-minutes: 90
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Setup Go
@@ -1013,7 +1013,7 @@ jobs:
    timeout-minutes: 150
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -1042,7 +1042,7 @@ jobs:
    timeout-minutes: 60
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Setup Go
@@ -1058,7 +1058,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -1091,7 +1091,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -1114,7 +1114,7 @@ jobs:
    timeout-minutes: 90
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -1140,7 +1140,7 @@ jobs:
    timeout-minutes: 90
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -21,7 +21,7 @@ jobs:
        go-version: ['1.26.x']
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Free disk space
@@ -71,7 +71,7 @@ jobs:
          if-no-files-found: ignore
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
-        uses: mxschmitt/action-tmate@v3.23
+        uses: mxschmitt/action-tmate@v3.24
        with:
          detached: true
          connect-timeout-seconds: 180
@@ -84,7 +84,7 @@ jobs:
        go-version: ['1.26.x']
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Setup Go ${{ matrix.go-version }}
@@ -116,7 +116,7 @@ jobs:
          PATH="$PATH:$HOME/go/bin" BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF" make --jobs 4 --output-sync=target test
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
-        uses: mxschmitt/action-tmate@v3.23
+        uses: mxschmitt/action-tmate@v3.24
        with:
          detached: true
          connect-timeout-seconds: 180
--- a/.github/workflows/tests-aio.yml
+++ b/.github/workflows/tests-aio.yml
@@ -62,7 +62,7 @@ jobs:
          sudo rm -rfv build || true
          df -h
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -79,7 +79,7 @@ jobs:
            PATH="$PATH:$HOME/go/bin" make backends/local-store backends/silero-vad backends/llama-cpp backends/whisper backends/piper backends/stablediffusion-ggml docker-build-e2e e2e-aio
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
-        uses: mxschmitt/action-tmate@v3.23
+        uses: mxschmitt/action-tmate@v3.24
        with:
          detached: true
          connect-timeout-seconds: 180
--- a/.github/workflows/tests-e2e.yml
+++ b/.github/workflows/tests-e2e.yml
@@ -21,7 +21,7 @@ jobs:
        go-version: ['1.25.x']
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Configure apt mirror on runner
@@ -57,7 +57,7 @@ jobs:
          PATH="$PATH:$HOME/go/bin" make build-mock-backend test-e2e
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
-        uses: mxschmitt/action-tmate@v3.23
+        uses: mxschmitt/action-tmate@v3.24
        with:
          detached: true
          connect-timeout-seconds: 180
--- a/.github/workflows/tests-pii-ner-e2e.yml
+++ b/.github/workflows/tests-pii-ner-e2e.yml
@@ -0,0 +1,105 @@
+---
+name: 'PII NER tier E2E (live GGUF, CPU)'
+
+# Runs the real privacy-filter GGUF NER tier end-to-end on CPU — the gap the
+# hermetic tests/e2e suite cannot cover (it only exercises the in-process
+# pattern tier). Heavy (builds the C++ backend image + downloads a ~2.7 GB
+# GGUF), so it is path-filtered on PRs and on pushes to master, and otherwise
+# runs nightly / on demand.
+#
+# This drives the container-level harness (tests/e2e-backends) via
+# `make test-extra-backend-privacy-filter`: it builds the privacy-filter image,
+# downloads the model, loads it on CPU, and asserts byte-correct, UTF-8-aligned
+# TokenClassify spans. The complementary HTTP-path specs in tests/e2e
+# (e2e_pii_ner_test.go) Skip unless PII_NER_MODEL_GGUF is wired.
+
+on:
+  workflow_dispatch:
+  schedule:
+    - cron: '0 3 * * *'  # nightly at 03:00 UTC
+  push:
+    branches:
+      - master
+    paths:
+      - 'backend/cpp/privacy-filter/**'
+      - 'backend/Dockerfile.privacy-filter'
+      - 'core/services/routing/pii/**'
+      - 'core/services/routing/piidetector/**'
+      - 'core/backend/token_classify.go'
+      - 'core/http/endpoints/localai/pii.go'
+      - 'core/schema/pii.go'
+      - 'tests/e2e-backends/**'
+      - 'tests/e2e/e2e_pii_ner_test.go'
+      - 'tests/e2e/e2e_suite_test.go'
+      - '.github/workflows/tests-pii-ner-e2e.yml'
+  pull_request:
+    paths:
+      - 'backend/cpp/privacy-filter/**'
+      - 'backend/Dockerfile.privacy-filter'
+      - 'core/services/routing/pii/**'
+      - 'core/services/routing/piidetector/**'
+      - 'core/backend/token_classify.go'
+      - 'core/http/endpoints/localai/pii.go'
+      - 'core/schema/pii.go'
+      - 'tests/e2e-backends/**'
+      - 'tests/e2e/e2e_pii_ner_test.go'
+      - 'tests/e2e/e2e_suite_test.go'
+      - '.github/workflows/tests-pii-ner-e2e.yml'
+
+# Least privilege: the job only needs to read the repository contents.
+permissions:
+  contents: read
+
+# One run per PR (or per commit outside PRs); only pull_request runs cancel
+# an in-progress predecessor in the same group.
+concurrency:
+  group: ci-tests-pii-ner-e2e-${{ github.event.pull_request.number || github.sha }}-${{ github.repository }}
+  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
+
+jobs:
+  tests-pii-ner-e2e:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        go-version: ['1.25.x']
+    steps:
+      - name: Clone
+        uses: actions/checkout@v7
+        with:
+          submodules: true
+      - name: Free disk space
+        run: |
+          sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache/CodeQL || true
+          sudo docker image prune --all --force || true
+          df -h
+      - name: Configure apt mirror on runner
+        uses: ./.github/actions/configure-apt-mirror
+      - name: Setup Go ${{ matrix.go-version }}
+        uses: actions/setup-go@v5
+        with:
+          go-version: ${{ matrix.go-version }}
+          cache: false
+      # curl -f aborts on an HTTP error instead of saving the error page as protoc.zip.
+      - name: Proto Dependencies
+        run: |
+          curl -fsSL https://github.com/protocolbuffers/protobuf/releases/download/v26.1/protoc-26.1-linux-x86_64.zip -o protoc.zip && \
+          unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
+          rm protoc.zip
+          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
+          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
+          PATH="$PATH:$HOME/go/bin" make protogen-go
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y build-essential
+      # Builds local-ai-backend:privacy-filter, downloads the GGUF, loads it on
+      # CPU and runs the token_classify capability spec (byte-offset contract).
+      - name: Run live PII NER backend E2E
+        run: PATH="$PATH:$HOME/go/bin" make test-extra-backend-privacy-filter
+      - name: Setup tmate session if tests fail
+        if: ${{ failure() }}
+        uses: mxschmitt/action-tmate@v3.24
+        with:
+          detached: true
+          connect-timeout-seconds: 180
+          limit-access-to-actor: true
--- a/.github/workflows/tests-ui-e2e.yml
+++ b/.github/workflows/tests-ui-e2e.yml
@@ -23,7 +23,7 @@ jobs:
        go-version: ['1.26.x']
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Configure apt mirror on runner
@@ -75,7 +75,7 @@ jobs:
          retention-days: 7
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
-        uses: mxschmitt/action-tmate@v3.23
+        uses: mxschmitt/action-tmate@v3.24
        with:
          detached: true
          connect-timeout-seconds: 180
--- a/.github/workflows/update_swagger.yaml
+++ b/.github/workflows/update_swagger.yaml
@@ -10,7 +10,7 @@ jobs:
      fail-fast: false
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v7
      - name: Configure apt mirror on runner
        uses: ./.github/actions/configure-apt-mirror
      - uses: actions/setup-go@v5
--- a/.gitignore
+++ b/.gitignore
@@ -91,3 +91,6 @@ core/http/react-ui/test-results/

 # Local worktrees
 .worktrees/
+
+# SDD / brainstorm scratch (agent-driven development)
+.superpowers/
--- a/Makefile
+++ b/Makefile
@@ -690,6 +690,16 @@ test-extra-backend-llama-cpp-transcription: docker-build-llama-cpp
 	BACKEND_TEST_CTX_SIZE=2048 \
 	$(MAKE) test-extra-backend

+## privacy-filter: the PII/NER token-classification backend. Exercises the
+## TokenClassify RPC and asserts byte-correct, UTF-8-aligned span offsets
+## against the openai-privacy-filter multilingual GGUF (CPU-runnable, ~50M
+## active params). This is the live-backend coverage for the PII NER tier.
+test-extra-backend-privacy-filter: docker-build-privacy-filter
+	BACKEND_IMAGE=local-ai-backend:privacy-filter \
+	BACKEND_TEST_MODEL_URL=https://huggingface.co/LocalAI-io/privacy-filter-multilingual-GGUF/resolve/main/privacy-filter-multilingual-f16.gguf \
+	BACKEND_TEST_CAPS=health,load,token_classify \
+	$(MAKE) test-extra-backend
+
 ## vllm is resolved from a HuggingFace model id (no file download) and
 ## exercises Predict + streaming + tool-call extraction via the hermes parser.
 ## Requires a host CPU with the SIMD instructions the prebuilt vllm CPU
--- a/backend/cpp/llama-cpp/Makefile
+++ b/backend/cpp/llama-cpp/Makefile
@@ -1,5 +1,5 @@

-LLAMA_VERSION?=e475fa2b5f9fb50c3d6fc3e7c6fdf1e004465b62
+LLAMA_VERSION?=7c082bc417bbe53210a83df4ba5b49e18ce6193c
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp

 CMAKE_ARGS?=
--- a/backend/go/crispasr/Makefile
+++ b/backend/go/crispasr/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # CrispASR version (release tag)
 CRISPASR_REPO?=https://github.com/CrispStrobe/CrispASR
-CRISPASR_VERSION?=d745bda4386ae0f9d1d2f23fff8ec95d76428221
+CRISPASR_VERSION?=7a8cb80907341c0204bd0488c1244764f4163883
 SO_TARGET?=libgocrispasr.so

 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
--- a/backend/go/face-detect/.gitignore
+++ b/backend/go/face-detect/.gitignore
@@ -1,18 +0,0 @@
-# Fetched upstream sources
-sources/
-
-# CMake build directories
-build*/
-
-# build artifacts staged in-tree by the Makefile (cp from sources/) or
-# symlinked for local dev; the real sources live in face-detect.cpp upstream.
-*.so
-*.so.*
-facedetect_capi.h
-compile_commands.json
-
-# Compiled backend binary
-face-detect-grpc
-
-# Packaging output
-package/
--- a/backend/go/face-detect/Makefile
+++ b/backend/go/face-detect/Makefile
@@ -1,97 +0,0 @@
-# face-detect backend Makefile.
-#
-# Upstream pin lives below as FACEDETECT_VERSION?=be22d67... (.github/bump_deps.sh
-# can find and update it - matches the voice-detect / parakeet.cpp / whisper.cpp
-# convention).
-#
-# Local dev shortcut: if you already have an out-of-tree face-detect.cpp build,
-# symlink the .so + header into this directory and skip the clone/cmake steps:
-#
-#   ln -sf /path/to/face-detect.cpp/build-shared/libfacedetect.so .
-#   ln -sf /path/to/face-detect.cpp/include/facedetect_capi.h .
-#   go build -o face-detect-grpc .
-#
-# The default target below does the proper clone-at-pin + cmake build so CI does
-# not need a side-checkout.
-
-FACEDETECT_VERSION?=be22d67145a8bcd879f45ad33fbea03131c5922b
-FACEDETECT_REPO?=https://github.com/mudler/face-detect.cpp
-
-GOCMD?=go
-GO_TAGS?=
-JOBS?=$(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)
-
-BUILD_TYPE?=
-NATIVE?=false
-
-# Build ggml + the vendored libjpeg-turbo statically into libfacedetect.so (PIC)
-# so the shared lib is self-contained: dlopen needs no libggml*.so alongside it,
-# only system libs (libstdc++/libgomp/libc) the runtime image already provides.
-# The vendored jpeg symbols are hidden via -Wl,--exclude-libs,ALL on the C++
-# side, so only the facedetect_capi_* surface is exported.
-CMAKE_ARGS?=-DCMAKE_BUILD_TYPE=Release -DFACEDETECT_SHARED=ON -DFACEDETECT_BUILD_CLI=OFF -DFACEDETECT_BUILD_TESTS=OFF -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-
-ifeq ($(NATIVE),false)
-	CMAKE_ARGS+=-DGGML_NATIVE=OFF
-endif
-
-# face-detect.cpp gates its GGML backends behind FACEDETECT_GGML_* options and
-# does set(GGML_CUDA ${FACEDETECT_GGML_CUDA} CACHE BOOL "" FORCE), so a bare
-# -DGGML_CUDA=ON is overwritten back to OFF. Forward the FACEDETECT_GGML_*
-# options instead. (openblas is not gated, so -DGGML_BLAS passes through.)
-ifeq ($(BUILD_TYPE),cublas)
-	CMAKE_ARGS+=-DFACEDETECT_GGML_CUDA=ON
-else ifeq ($(BUILD_TYPE),openblas)
-	CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
-else ifeq ($(BUILD_TYPE),hipblas)
-	CMAKE_ARGS+=-DFACEDETECT_GGML_HIP=ON
-else ifeq ($(BUILD_TYPE),vulkan)
-	CMAKE_ARGS+=-DFACEDETECT_GGML_VULKAN=ON
-else ifeq ($(BUILD_TYPE),metal)
-	CMAKE_ARGS+=-DFACEDETECT_GGML_METAL=ON
-endif
-
-.PHONY: face-detect-grpc package build clean purge test all
-
-all: face-detect-grpc
-
-# Clone the upstream face-detect.cpp source at the pinned commit. Directory acts
-# as the target so make only re-clones when missing. After a FACEDETECT_VERSION
-# bump, run 'make purge && make' to refetch.
-sources/face-detect.cpp:
-	mkdir -p sources/face-detect.cpp
-	cd sources/face-detect.cpp && \
-	git init -q && \
-	git remote add origin $(FACEDETECT_REPO) && \
-	git fetch --depth 1 origin $(FACEDETECT_VERSION) && \
-	git checkout FETCH_HEAD && \
-	git submodule update --init --recursive --depth 1 --single-branch
-
-# Build the shared lib + header out-of-tree, then stage them next to the Go
-# sources so purego.Dlopen("libfacedetect.so") and the cgo-less build both pick
-# them up.
-libfacedetect.so: sources/face-detect.cpp
-	cmake -B sources/face-detect.cpp/build-shared -S sources/face-detect.cpp $(CMAKE_ARGS)
-	cmake --build sources/face-detect.cpp/build-shared --config Release -j$(JOBS) --target facedetect
-	cp -fv sources/face-detect.cpp/build-shared/libfacedetect.so* ./ 2>/dev/null || true
-	cp -fv sources/face-detect.cpp/include/facedetect_capi.h ./
-
-face-detect-grpc: libfacedetect.so main.go gofacedetect.go options.go
-	CGO_ENABLED=0 $(GOCMD) build -tags "$(GO_TAGS)" -o face-detect-grpc .
-
-package: face-detect-grpc
-	bash package.sh
-
-build: package
-
-# Test target. The embed/detect/verify/analyze smoke specs are gated on
-# FACEDETECT_BACKEND_TEST_MODEL + FACEDETECT_BACKEND_TEST_IMAGE; without them the
-# heavy specs auto-skip and only the pure-Go parsing specs run.
-test:
-	LD_LIBRARY_PATH=$(CURDIR):$$LD_LIBRARY_PATH $(GOCMD) test ./... -count=1
-
-clean: purge
-	rm -rf libfacedetect.so* facedetect_capi.h package face-detect-grpc
-
-purge:
-	rm -rf sources/face-detect.cpp
--- a/backend/go/face-detect/gofacedetect.go
+++ b/backend/go/face-detect/gofacedetect.go
@@ -1,431 +0,0 @@
-package main
-
-import (
-	"encoding/base64"
-	"encoding/json"
-	"errors"
-	"fmt"
-	"math"
-	"os"
-	"path/filepath"
-	"strconv"
-	"strings"
-	"time"
-	"unsafe"
-
-	"github.com/mudler/LocalAI/pkg/grpc/base"
-	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
-	"github.com/mudler/xlog"
-)
-
-// purego-bound entry points from libfacedetect.so. Names match
-// facedetect_capi.h exactly so a `nm libfacedetect.so | grep facedetect_capi`
-// is enough to spot drift.
-//
-// The opaque ctx and the malloc'd char*/float* return values are declared as
-// uintptr so we get the raw pointer back and can release it via the matching
-// capi free function. purego's native string/[]float32 returns would copy and
-// forget the original pointer, leaking the C-owned buffer on every call.
-var (
-	CppAbiVersion  func() int32
-	CppLoad        func(ggufPath string) uintptr
-	CppFree        func(ctx uintptr)
-	CppLastError   func(ctx uintptr) string
-	CppFreeString  func(s uintptr)
-	CppFreeVec     func(v uintptr)
-	CppEmbedPath   func(ctx uintptr, imagePath string, outVec, outDim unsafe.Pointer) int32
-	CppEmbedRGB    func(ctx uintptr, rgb []byte, width, height int32, outVec, outDim unsafe.Pointer) int32
-	CppDetectJSON  func(ctx uintptr, imagePath string) uintptr
-	CppVerifyPaths func(ctx uintptr, a, b string, threshold float32, antiSpoof int32, outDistance, outVerified unsafe.Pointer) int32
-	CppAnalyzeJSON func(ctx uintptr, imagePath string) uintptr
-)
-
-// FaceDetect implements the face-recognition (biometric) subset of the Backend
-// gRPC service over libfacedetect.so. The C side keeps a single loaded model
-// pack plus a per-ctx last-error buffer and is not reentrant, so
-// base.SingleThread serializes every call.
-type FaceDetect struct {
-	base.SingleThread
-	opts   loadOptions
-	ctxPtr uintptr
-}
-
-func (f *FaceDetect) Load(opts *pb.ModelOptions) error {
-	model := opts.ModelFile
-	if model == "" {
-		model = opts.ModelPath
-	}
-	if !filepath.IsAbs(model) && opts.ModelPath != "" {
-		model = filepath.Join(opts.ModelPath, model)
-	}
-	if model == "" {
-		return errors.New("face-detect: ModelFile is required")
-	}
-
-	f.opts = parseOptions(opts.Options)
-	if f.opts.modelName == "" {
-		f.opts.modelName = filepath.Base(model)
-	}
-
-	// Propagate LocalAI's per-model thread budget to the engine. LocalAI spawns
-	// one backend process per model and serves requests concurrently, so the
-	// engine's own min(hardware_concurrency, 8) default can oversubscribe cores.
-	// FACEDETECT_THREADS is read by the engine at backend construction, so it
-	// must be set before the capi load. A non-positive Threads means "unset":
-	// leave the env alone so the engine keeps its sane default.
-	threads := opts.Threads
-	if threads > 0 {
-		if err := os.Setenv("FACEDETECT_THREADS", strconv.Itoa(int(threads))); err != nil {
-			return fmt.Errorf("face-detect: set FACEDETECT_THREADS: %w", err)
-		}
-		xlog.Info("face-detect: applying LocalAI thread budget", "threads", threads)
-	}
-
-	xlog.Info("face-detect: loading model", "model", model,
-		"verify_threshold", f.opts.verifyThreshold, "abi", CppAbiVersion())
-
-	ctx := CppLoad(model)
-	if ctx == 0 {
-		// The last-error buffer lives on the ctx that was never returned, so
-		// surface the path the operator tried to load instead.
-		return fmt.Errorf("face-detect: facedetect_capi_load failed for %q", model)
-	}
-	f.ctxPtr = ctx
-	return nil
-}
-
-// Embeddings returns the L2-normalized ArcFace embedding of the primary face in
-// the supplied image. Mirroring the Python face backend, the image is read from
-// Images[0] as a base64 payload; materializeImage decodes it to a temp file so
-// the path-based C-API can run its own decode (cv2.imread parity). The gRPC
-// server wraps the returned slice in an EmbeddingResult.
-func (f *FaceDetect) Embeddings(req *pb.PredictOptions) ([]float32, error) {
-	if f.ctxPtr == 0 {
-		return nil, errors.New("face-detect: model not loaded")
-	}
-	if len(req.Images) == 0 || req.Images[0] == "" {
-		return nil, errors.New("face-detect: Embedding requires Images[0] to be a base64 image")
-	}
-
-	path, cleanup, err := materializeImage(req.Images[0])
-	if err != nil {
-		return nil, err
-	}
-	defer cleanup()
-
-	return f.embedPath(path)
-}
-
-func (f *FaceDetect) embedPath(path string) ([]float32, error) {
-	var vec uintptr
-	var dim int32
-	rc := CppEmbedPath(f.ctxPtr, path, unsafe.Pointer(&vec), unsafe.Pointer(&dim))
-	if rc != 0 || vec == 0 || dim <= 0 {
-		return nil, f.lastErr("embed", path)
-	}
-	defer CppFreeVec(vec)
-	// Copy out of the C-owned malloc'd buffer before freeing it. The
-	// uintptr->Pointer conversion trips vet's unsafeptr check, which can't tell
-	// a C heap pointer from Go-managed memory; safe here, the GC neither tracks
-	// nor moves this buffer and we copy immediately.
-	src := unsafe.Slice((*float32)(unsafe.Pointer(vec)), int(dim)) //nolint:govet // C-owned malloc'd vector, copied out before free
-	out := make([]float32, int(dim))
-	copy(out, src)
-	return out, nil
-}
-
-// Detect runs SCRFD over the image and returns one Detection per face. The
-// C-API emits a box as [x1,y1,x2,y2] in pixels; the proto carries x/y plus
-// width/height, so the corners are converted. The 5 facial landmarks the engine
-// also returns are dropped: the Detection message has no field for them.
-func (f *FaceDetect) Detect(req *pb.DetectOptions) (pb.DetectResponse, error) {
-	if f.ctxPtr == 0 {
-		return pb.DetectResponse{}, errors.New("face-detect: model not loaded")
-	}
-	if req.Src == "" {
-		return pb.DetectResponse{}, errors.New("face-detect: src image is required")
-	}
-
-	path, cleanup, err := materializeImage(req.Src)
-	if err != nil {
-		return pb.DetectResponse{}, err
-	}
-	defer cleanup()
-
-	faces, err := f.detectFaces(path)
-	if err != nil {
-		return pb.DetectResponse{}, err
-	}
-
-	dets := make([]*pb.Detection, 0, len(faces))
-	for _, fc := range faces {
-		if req.Threshold > 0 && fc.Score < req.Threshold {
-			continue
-		}
-		x, y, w, h := fc.xywh()
-		dets = append(dets, &pb.Detection{
-			X:          x,
-			Y:          y,
-			Width:      w,
-			Height:     h,
-			Confidence: fc.Score,
-			ClassName:  "face",
-		})
-	}
-	return pb.DetectResponse{Detections: dets}, nil
-}
-
-// FaceVerify embeds the primary face in each image and reports whether they are
-// the same identity by cosine distance against a threshold. A request threshold
-// <= 0 falls back to the model-configured default (verify_threshold option,
-// 0.35 if unset). When anti_spoofing is set, the C-API applies a MiniFASNet
-// veto internally (verified forced false on a spoof); the per-image liveness
-// scores are not exposed by the verify entry point, so img*_is_real /
-// img*_antispoof_score stay at their zero values.
-func (f *FaceDetect) FaceVerify(req *pb.FaceVerifyRequest) (pb.FaceVerifyResponse, error) {
-	if f.ctxPtr == 0 {
-		return pb.FaceVerifyResponse{}, errors.New("face-detect: model not loaded")
-	}
-	if req.Img1 == "" || req.Img2 == "" {
-		return pb.FaceVerifyResponse{}, errors.New("face-detect: img1 and img2 are required")
-	}
-
-	path1, cleanup1, err := materializeImage(req.Img1)
-	if err != nil {
-		return pb.FaceVerifyResponse{}, err
-	}
-	defer cleanup1()
-	path2, cleanup2, err := materializeImage(req.Img2)
-	if err != nil {
-		return pb.FaceVerifyResponse{}, err
-	}
-	defer cleanup2()
-
-	threshold := req.Threshold
-	if threshold <= 0 {
-		threshold = f.opts.verifyThreshold
-	}
-
-	antiSpoof := int32(0)
-	if req.AntiSpoofing {
-		antiSpoof = 1
-	}
-
-	started := time.Now()
-	var distance float32
-	var verified int32
-	rc := CppVerifyPaths(f.ctxPtr, path1, path2, threshold, antiSpoof,
-		unsafe.Pointer(&distance), unsafe.Pointer(&verified))
-	if rc != 0 {
-		return pb.FaceVerifyResponse{}, f.lastErr("verify", req.Img1[:min(8, len(req.Img1))]+"...")
-	}
-	elapsedMs := float32(time.Since(started).Seconds() * 1000.0)
-
-	// Confidence decays linearly from 100 at distance 0 to 0 at the threshold,
-	// matching the Python face backend's reporting.
-	confidence := float32(0)
-	if threshold > 0 {
-		confidence = float32(math.Max(0, math.Min(100, (1.0-float64(distance)/float64(threshold))*100.0)))
-	}
-
-	return pb.FaceVerifyResponse{
-		Verified:         verified != 0,
-		Distance:         distance,
-		Threshold:        threshold,
-		Confidence:       confidence,
-		Model:            f.opts.modelName,
-		Img1Area:         f.bestArea(path1),
-		Img2Area:         f.bestArea(path2),
-		ProcessingTimeMs: elapsedMs,
-	}, nil
-}
-
-// FaceAnalyze runs the genderage head on every detected face. The C-API returns
-// "M"/"F" gender labels and a rounded age; the labels are normalized to the
-// "Man"/"Woman" values the proto documents.
-func (f *FaceDetect) FaceAnalyze(req *pb.FaceAnalyzeRequest) (pb.FaceAnalyzeResponse, error) {
-	if f.ctxPtr == 0 {
-		return pb.FaceAnalyzeResponse{}, errors.New("face-detect: model not loaded")
-	}
-	if req.Img == "" {
-		return pb.FaceAnalyzeResponse{}, errors.New("face-detect: img is required")
-	}
-
-	path, cleanup, err := materializeImage(req.Img)
-	if err != nil {
-		return pb.FaceAnalyzeResponse{}, err
-	}
-	defer cleanup()
-
-	ptr := CppAnalyzeJSON(f.ctxPtr, path)
-	if ptr == 0 {
-		return pb.FaceAnalyzeResponse{}, f.lastErr("analyze", path)
-	}
-	defer CppFreeString(ptr)
-
-	faces, err := parseAnalyzeJSON(goStringFromCPtr(ptr))
-	if err != nil {
-		return pb.FaceAnalyzeResponse{}, fmt.Errorf("face-detect: analyze JSON: %w", err)
-	}
-	return pb.FaceAnalyzeResponse{Faces: faces}, nil
-}
-
-// faceBox is one entry of the detect/analyze JSON documents the engine emits.
-type faceBox struct {
-	Score  float32   `json:"score"`
-	Box    []float32 `json:"box"`
-	Age    float32   `json:"age"`
-	Gender string    `json:"gender"`
-}
-
-// xywh converts the engine's [x1,y1,x2,y2] box into the x/y/width/height the
-// proto carries. A short or missing box yields zeros.
-func (b faceBox) xywh() (x, y, w, h float32) {
-	if len(b.Box) < 4 {
-		return 0, 0, 0, 0
-	}
-	return b.Box[0], b.Box[1], b.Box[2] - b.Box[0], b.Box[3] - b.Box[1]
-}
-
-type facesJSON struct {
-	Faces []faceBox `json:"faces"`
-}
-
-func (f *FaceDetect) detectFaces(path string) ([]faceBox, error) {
-	ptr := CppDetectJSON(f.ctxPtr, path)
-	if ptr == 0 {
-		return nil, f.lastErr("detect", path)
-	}
-	defer CppFreeString(ptr)
-
-	var doc facesJSON
-	if err := json.Unmarshal([]byte(goStringFromCPtr(ptr)), &doc); err != nil {
-		return nil, fmt.Errorf("face-detect: detect JSON: %w", err)
-	}
-	return doc.Faces, nil
-}
-
-// bestArea returns the FacialArea of the highest-scoring face in an image, or an
-// empty area when detection fails or finds nothing. Best-effort: verify already
-// succeeded, so a missing region must not turn a valid match into an error.
-func (f *FaceDetect) bestArea(path string) *pb.FacialArea {
-	faces, err := f.detectFaces(path)
-	if err != nil || len(faces) == 0 {
-		return &pb.FacialArea{}
-	}
-	best := faces[0]
-	for _, fc := range faces[1:] {
-		if fc.Score > best.Score {
-			best = fc
-		}
-	}
-	x, y, w, h := best.xywh()
-	return &pb.FacialArea{X: x, Y: y, W: w, H: h}
-}
-
-// parseAnalyzeJSON maps the engine's analyze document onto FaceAnalysis entries.
-// The engine reports gender as "M"/"F"; both the dominant label and the score
-// map are filled with the "Man"/"Woman" form the proto documents.
-func parseAnalyzeJSON(doc string) ([]*pb.FaceAnalysis, error) {
-	var parsed facesJSON
-	if err := json.Unmarshal([]byte(doc), &parsed); err != nil {
-		return nil, err
-	}
-
-	out := make([]*pb.FaceAnalysis, 0, len(parsed.Faces))
-	for _, fc := range parsed.Faces {
-		x, y, w, h := fc.xywh()
-		fa := &pb.FaceAnalysis{
-			Region:         &pb.FacialArea{X: x, Y: y, W: w, H: h},
-			FaceConfidence: fc.Score,
-			Age:            fc.Age,
-		}
-		if label := normalizeGender(fc.Gender); label != "" {
-			fa.DominantGender = label
-			fa.Gender = map[string]float32{label: 1.0}
-		}
-		out = append(out, fa)
-	}
-	return out, nil
-}
-
-// normalizeGender maps the engine's "M"/"F" code to the "Man"/"Woman" labels the
-// proto documents. Unknown codes pass through unchanged.
-func normalizeGender(g string) string {
-	switch strings.ToUpper(strings.TrimSpace(g)) {
-	case "M":
-		return "Man"
-	case "F":
-		return "Woman"
-	case "":
-		return ""
-	default:
-		return g
-	}
-}
-
-// materializeImage decodes a base64 image payload into a temp file and returns
-// its path plus a cleanup func. As a convenience for callers that already pass a
-// filesystem path (e.g. a test fixture), an existing path is used as-is with a
-// no-op cleanup. data: URI prefixes are stripped before decoding.
-func materializeImage(src string) (path string, cleanup func(), err error) {
-	noop := func() {}
-	if src == "" {
-		return "", noop, errors.New("face-detect: empty image input")
-	}
-	if _, statErr := os.Stat(src); statErr == nil {
-		return src, noop, nil
-	}
-
-	payload := src
-	if i := strings.Index(payload, ","); strings.HasPrefix(payload, "data:") && i >= 0 {
-		payload = payload[i+1:]
-	}
-	data, decErr := base64.StdEncoding.DecodeString(strings.TrimSpace(payload))
-	if decErr != nil || len(data) == 0 {
-		return "", noop, errors.New("face-detect: image is neither an existing path nor valid base64")
-	}
-
-	tmp, createErr := os.CreateTemp("", "face-detect-*.img")
-	if createErr != nil {
-		return "", noop, fmt.Errorf("face-detect: create temp image: %w", createErr)
-	}
-	cleanup = func() { _ = os.Remove(tmp.Name()) }
-	if _, wErr := tmp.Write(data); wErr != nil {
-		_ = tmp.Close()
-		cleanup()
-		return "", noop, fmt.Errorf("face-detect: write temp image: %w", wErr)
-	}
-	if cErr := tmp.Close(); cErr != nil {
-		cleanup()
-		return "", noop, fmt.Errorf("face-detect: close temp image: %w", cErr)
-	}
-	return tmp.Name(), cleanup, nil
-}
-
-// lastErr wraps the C-API's per-ctx last-error buffer into a Go error.
-func (f *FaceDetect) lastErr(op, subject string) error {
-	msg := strings.TrimSpace(CppLastError(f.ctxPtr))
-	if msg == "" {
-		msg = "no error detail"
-	}
-	return fmt.Errorf("face-detect: %s failed for %q: %s", op, subject, msg)
-}
-
-// goStringFromCPtr copies a NUL-terminated C string into Go memory. cptr is a
-// malloc'd buffer the caller owns; release it via CppFreeString after the copy.
-//
-// The uintptr->Pointer conversion trips vet's unsafeptr check, which can't tell
-// a C heap pointer from Go-managed memory. Safe here: the GC neither tracks nor
-// moves the buffer and we dereference it immediately to copy the bytes out.
-func goStringFromCPtr(cptr uintptr) string {
-	if cptr == 0 {
-		return ""
-	}
-	p := unsafe.Pointer(cptr) //nolint:govet // C-owned malloc'd buffer, not Go-GC memory (see doc above)
-	n := 0
-	for *(*byte)(unsafe.Add(p, n)) != 0 {
-		n++
-	}
-	return string(unsafe.Slice((*byte)(p), n))
-}
--- a/backend/go/face-detect/gofacedetect_test.go
+++ b/backend/go/face-detect/gofacedetect_test.go
@@ -1,230 +0,0 @@
-package main
-
-import (
-	"encoding/base64"
-	"os"
-	"sync"
-	"testing"
-
-	"github.com/ebitengine/purego"
-	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-func TestFaceDetect(t *testing.T) {
-	RegisterFailHandler(Fail)
-	RunSpecs(t, "face-detect Backend Suite")
-}
-
-var (
-	libLoadOnce sync.Once
-	libLoadErr  error
-)
-
-// ensureLibLoaded mirrors main.go's bootstrap so a Go test can drive the C-API
-// bridge without spinning up the gRPC server. Records the error (the smoke
-// specs skip themselves) when libfacedetect.so is not loadable from cwd
-// (LD_LIBRARY_PATH or a symlink in ./).
-func ensureLibLoaded() error {
-	libLoadOnce.Do(func() {
-		libName := os.Getenv("FACEDETECT_LIBRARY")
-		if libName == "" {
-			libName = "libfacedetect.so"
-		}
-		lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
-		if err != nil {
-			libLoadErr = err
-			return
-		}
-		purego.RegisterLibFunc(&CppAbiVersion, lib, "facedetect_capi_abi_version")
-		purego.RegisterLibFunc(&CppLoad, lib, "facedetect_capi_load")
-		purego.RegisterLibFunc(&CppFree, lib, "facedetect_capi_free")
-		purego.RegisterLibFunc(&CppLastError, lib, "facedetect_capi_last_error")
-		purego.RegisterLibFunc(&CppFreeString, lib, "facedetect_capi_free_string")
-		purego.RegisterLibFunc(&CppFreeVec, lib, "facedetect_capi_free_vec")
-		purego.RegisterLibFunc(&CppEmbedPath, lib, "facedetect_capi_embed_path")
-		purego.RegisterLibFunc(&CppEmbedRGB, lib, "facedetect_capi_embed_rgb")
-		purego.RegisterLibFunc(&CppDetectJSON, lib, "facedetect_capi_detect_path_json")
-		purego.RegisterLibFunc(&CppVerifyPaths, lib, "facedetect_capi_verify_paths")
-		purego.RegisterLibFunc(&CppAnalyzeJSON, lib, "facedetect_capi_analyze_path_json")
-	})
-	return libLoadErr
-}
-
-var _ = Describe("parseOptions", func() {
-	It("defaults verify_threshold to 0.35", func() {
-		o := parseOptions(nil)
-		Expect(o.verifyThreshold).To(Equal(float32(0.35)))
-		Expect(o.modelName).To(Equal(""))
-	})
-
-	It("parses verify_threshold, threshold alias and model_name", func() {
-		o := parseOptions([]string{"verify_threshold:0.4", "model_name:buffalo_l", "unknown:x"})
-		Expect(o.verifyThreshold).To(Equal(float32(0.4)))
-		Expect(o.modelName).To(Equal("buffalo_l"))
-
-		o2 := parseOptions([]string{"threshold:0.3"})
-		Expect(o2.verifyThreshold).To(Equal(float32(0.3)))
-	})
-
-	It("ignores non-positive thresholds and keeps the default", func() {
-		o := parseOptions([]string{"verify_threshold:0", "threshold:-1"})
-		Expect(o.verifyThreshold).To(Equal(float32(0.35)))
-	})
-})
-
-var _ = Describe("normalizeGender", func() {
-	It("maps M/F codes to Man/Woman", func() {
-		Expect(normalizeGender("M")).To(Equal("Man"))
-		Expect(normalizeGender("f")).To(Equal("Woman"))
-		Expect(normalizeGender(" m ")).To(Equal("Man"))
-	})
-
-	It("passes empty and unknown codes through", func() {
-		Expect(normalizeGender("")).To(Equal(""))
-		Expect(normalizeGender("nonbinary")).To(Equal("nonbinary"))
-	})
-})
-
-var _ = Describe("faceBox.xywh", func() {
-	It("converts an [x1,y1,x2,y2] box to x/y/width/height", func() {
-		b := faceBox{Box: []float32{10, 20, 50, 80}}
-		x, y, w, h := b.xywh()
-		Expect(x).To(Equal(float32(10)))
-		Expect(y).To(Equal(float32(20)))
-		Expect(w).To(Equal(float32(40)))
-		Expect(h).To(Equal(float32(60)))
-	})
-
-	It("returns zeros for a short box", func() {
-		x, y, w, h := faceBox{Box: []float32{1, 2}}.xywh()
-		Expect([]float32{x, y, w, h}).To(Equal([]float32{0, 0, 0, 0}))
-	})
-})
-
-var _ = Describe("parseAnalyzeJSON", func() {
-	It("maps region, age and gender for each face", func() {
-		doc := `{"faces":[
-			{"score":0.997,"box":[10,20,50,80],"age":31,"gender":"M"},
-			{"score":0.81,"box":[0,0,40,40],"age":24,"gender":"F"}]}`
-		faces, err := parseAnalyzeJSON(doc)
-		Expect(err).ToNot(HaveOccurred())
-		Expect(faces).To(HaveLen(2))
-
-		Expect(faces[0].FaceConfidence).To(BeNumerically("~", 0.997, 1e-4))
-		Expect(faces[0].Age).To(BeNumerically("~", 31, 1e-4))
-		Expect(faces[0].DominantGender).To(Equal("Man"))
-		Expect(faces[0].Gender).To(HaveKeyWithValue("Man", float32(1.0)))
-		Expect(faces[0].Region.W).To(Equal(float32(40)))
-		Expect(faces[0].Region.H).To(Equal(float32(60)))
-
-		Expect(faces[1].DominantGender).To(Equal("Woman"))
-	})
-
-	It("tolerates a missing gender field", func() {
-		faces, err := parseAnalyzeJSON(`{"faces":[{"score":0.5,"box":[0,0,10,10],"age":40}]}`)
-		Expect(err).ToNot(HaveOccurred())
-		Expect(faces).To(HaveLen(1))
-		Expect(faces[0].DominantGender).To(Equal(""))
-		Expect(faces[0].Gender).To(BeEmpty())
-	})
-
-	It("returns no faces for an empty document", func() {
-		faces, err := parseAnalyzeJSON(`{"faces":[]}`)
-		Expect(err).ToNot(HaveOccurred())
-		Expect(faces).To(BeEmpty())
-	})
-
-	It("returns an error on malformed JSON", func() {
-		_, err := parseAnalyzeJSON(`{not-json`)
-		Expect(err).To(HaveOccurred())
-	})
-})
-
-var _ = Describe("materializeImage", func() {
-	It("decodes a base64 payload to a temp file", func() {
-		payload := base64.StdEncoding.EncodeToString([]byte("\xff\xd8\xff\xe0fake-jpeg"))
-		path, cleanup, err := materializeImage(payload)
-		Expect(err).ToNot(HaveOccurred())
-		defer cleanup()
-		data, rerr := os.ReadFile(path)
-		Expect(rerr).ToNot(HaveOccurred())
-		Expect(data).To(Equal([]byte("\xff\xd8\xff\xe0fake-jpeg")))
-	})
-
-	It("strips a data: URI prefix before decoding", func() {
-		payload := "data:image/png;base64," + base64.StdEncoding.EncodeToString([]byte("hello"))
-		path, cleanup, err := materializeImage(payload)
-		Expect(err).ToNot(HaveOccurred())
-		defer cleanup()
-		data, rerr := os.ReadFile(path)
-		Expect(rerr).ToNot(HaveOccurred())
-		Expect(data).To(Equal([]byte("hello")))
-	})
-
-	It("uses an existing path as-is", func() {
-		tmp, err := os.CreateTemp("", "face-detect-fixture-*.bin")
-		Expect(err).ToNot(HaveOccurred())
-		defer func() { _ = os.Remove(tmp.Name()) }()
-		Expect(tmp.Close()).To(Succeed())
-
-		path, cleanup, err := materializeImage(tmp.Name())
-		Expect(err).ToNot(HaveOccurred())
-		defer cleanup()
-		Expect(path).To(Equal(tmp.Name()))
-	})
-
-	It("errors on input that is neither a path nor base64", func() {
-		_, _, err := materializeImage("not base64!!!")
-		Expect(err).To(HaveOccurred())
-	})
-})
-
-// The specs below exercise the real C-API end to end. They run only when both a
-// model GGUF and a test image are provided, and skip cleanly otherwise so the
-// suite stays green without large assets.
-var _ = Describe("FaceDetect end-to-end", Ordered, func() {
-	var (
-		f         *FaceDetect
-		modelPath = os.Getenv("FACEDETECT_BACKEND_TEST_MODEL")
-		imagePath = os.Getenv("FACEDETECT_BACKEND_TEST_IMAGE")
-	)
-
-	BeforeAll(func() {
-		if modelPath == "" || imagePath == "" {
-			Skip("set FACEDETECT_BACKEND_TEST_MODEL and FACEDETECT_BACKEND_TEST_IMAGE to run the e2e specs")
-		}
-		if err := ensureLibLoaded(); err != nil {
-			Skip("libfacedetect.so not loadable: " + err.Error())
-		}
-		f = &FaceDetect{}
-		Expect(f.Load(&pb.ModelOptions{ModelFile: modelPath})).To(Succeed())
-	})
-
-	It("embeds the primary face in an image", func() {
-		emb, err := f.Embeddings(&pb.PredictOptions{Images: []string{imagePath}})
-		Expect(err).ToNot(HaveOccurred())
-		Expect(emb).ToNot(BeEmpty())
-	})
-
-	It("detects at least one face", func() {
-		resp, err := f.Detect(&pb.DetectOptions{Src: imagePath})
-		Expect(err).ToNot(HaveOccurred())
-		Expect(resp.Detections).ToNot(BeEmpty())
-		Expect(resp.Detections[0].ClassName).To(Equal("face"))
-	})
-
-	It("verifies an image against itself as the same identity", func() {
-		resp, err := f.FaceVerify(&pb.FaceVerifyRequest{Img1: imagePath, Img2: imagePath})
-		Expect(err).ToNot(HaveOccurred())
-		Expect(resp.Verified).To(BeTrue())
-		Expect(resp.Distance).To(BeNumerically("<=", resp.Threshold))
-	})
-
-	It("analyzes age/gender for each face", func() {
-		resp, err := f.FaceAnalyze(&pb.FaceAnalyzeRequest{Img: imagePath})
-		Expect(err).ToNot(HaveOccurred())
-		Expect(resp.Faces).ToNot(BeEmpty())
-	})
-})
--- a/backend/go/face-detect/main.go
+++ b/backend/go/face-detect/main.go
@@ -1,65 +0,0 @@
-package main
-
-// Started internally by LocalAI - one gRPC server per loaded model.
-//
-// Loads libfacedetect.so via purego and registers the flat C-API entry points
-// declared in facedetect_capi.h. The library name can be overridden with
-// FACEDETECT_LIBRARY (mirrors the VOICEDETECT_LIBRARY / PARAKEET_LIBRARY
-// convention in the sibling backends); the default looks for the .so next to
-// this binary (resolved via LD_LIBRARY_PATH by run.sh).
-import (
-	"flag"
-	"fmt"
-	"os"
-
-	"github.com/ebitengine/purego"
-	grpc "github.com/mudler/LocalAI/pkg/grpc"
-)
-
-var (
-	addr = flag.String("addr", "localhost:50051", "the address to connect to")
-)
-
-type LibFuncs struct {
-	FuncPtr any
-	Name    string
-}
-
-func main() {
-	libName := os.Getenv("FACEDETECT_LIBRARY")
-	if libName == "" {
-		libName = "libfacedetect.so"
-	}
-
-	lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
-	if err != nil {
-		panic(fmt.Errorf("face-detect: dlopen %q: %w", libName, err))
-	}
-
-	// Bound 1:1 to facedetect_capi.h. char*/float* returns are registered as
-	// uintptr so the raw pointer can be freed via the matching capi free fn.
-	libFuncs := []LibFuncs{
-		{&CppAbiVersion, "facedetect_capi_abi_version"},
-		{&CppLoad, "facedetect_capi_load"},
-		{&CppFree, "facedetect_capi_free"},
-		{&CppLastError, "facedetect_capi_last_error"},
-		{&CppFreeString, "facedetect_capi_free_string"},
-		{&CppFreeVec, "facedetect_capi_free_vec"},
-		{&CppEmbedPath, "facedetect_capi_embed_path"},
-		{&CppEmbedRGB, "facedetect_capi_embed_rgb"},
-		{&CppDetectJSON, "facedetect_capi_detect_path_json"},
-		{&CppVerifyPaths, "facedetect_capi_verify_paths"},
-		{&CppAnalyzeJSON, "facedetect_capi_analyze_path_json"},
-	}
-	for _, lf := range libFuncs {
-		purego.RegisterLibFunc(lf.FuncPtr, lib, lf.Name)
-	}
-
-	fmt.Fprintf(os.Stderr, "[face-detect] ABI=%d\n", CppAbiVersion())
-
-	flag.Parse()
-
-	if err := grpc.StartServer(*addr, &FaceDetect{}); err != nil {
-		panic(err)
-	}
-}
--- a/backend/go/face-detect/options.go
+++ b/backend/go/face-detect/options.go
@@ -1,47 +0,0 @@
-package main
-
-import (
-	"strconv"
-	"strings"
-)
-
-// defaultVerifyThreshold is the cosine-distance cutoff used when a request does
-// not set one. Matches the insightface buffalo_l ArcFace R50 default the Python
-// face backend ships with so the two implementations agree on verdicts out of
-// the box.
-const defaultVerifyThreshold float32 = 0.35
-
-// loadOptions holds the parsed model-level options for face-detect.
-type loadOptions struct {
-	verifyThreshold float32
-	modelName       string
-}
-
-func splitOption(o string) (key, value string, ok bool) {
-	i := strings.Index(o, ":")
-	if i < 0 {
-		return "", "", false
-	}
-	return strings.TrimSpace(o[:i]), strings.TrimSpace(o[i+1:]), true
-}
-
-// parseOptions reads the backend "key:value" option slice. Unknown keys are
-// ignored. Defaults: verify_threshold 0.35, model_name derived from the file.
-func parseOptions(opts []string) loadOptions {
-	o := loadOptions{verifyThreshold: defaultVerifyThreshold}
-	for _, oo := range opts {
-		key, value, ok := splitOption(oo)
-		if !ok {
-			continue
-		}
-		switch key {
-		case "verify_threshold", "threshold":
-			if f, err := strconv.ParseFloat(value, 32); err == nil && f > 0 {
-				o.verifyThreshold = float32(f)
-			}
-		case "model_name":
-			o.modelName = value
-		}
-	}
-	return o
-}
--- a/backend/go/face-detect/package.sh
+++ b/backend/go/face-detect/package.sh
@@ -1,68 +0,0 @@
-#!/bin/bash
-#
-# Bundle the face-detect-grpc binary, libfacedetect.so, the core runtime libs
-# (libc/libstdc++/libgomp + ld.so) and the GPU runtime for the active BUILD_TYPE
-# so the package is self-contained. Mirrors backend/go/voice-detect/package.sh;
-# run.sh routes the (CGO_ENABLED=0) binary through lib/ld.so so the packaged libc
-# is used instead of the host's.
-
-set -e
-
-CURDIR=$(dirname "$(realpath "$0")")
-REPO_ROOT="${CURDIR}/../../.."
-
-mkdir -p "$CURDIR/package/lib"
-
-cp -avf "$CURDIR/face-detect-grpc" "$CURDIR/package/"
-cp -avf "$CURDIR/run.sh" "$CURDIR/package/"
-
-# libfacedetect.so + any soname symlinks. purego.Dlopen resolves it via
-# LD_LIBRARY_PATH, which run.sh points at lib/.
-cp -avf "$CURDIR"/libfacedetect.so* "$CURDIR/package/lib/" 2>/dev/null || {
-	echo "ERROR: libfacedetect.so not found in $CURDIR, run 'make' first" >&2
-	exit 1
-}
-
-# Detect architecture and copy the core runtime libs libfacedetect.so links
-# against, plus the matching dynamic loader as lib/ld.so.
-if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
-    echo "Detected x86_64 architecture, copying x86_64 libraries..."
-    cp -arfLv /lib64/ld-linux-x86-64.so.2 "$CURDIR/package/lib/ld.so"
-    cp -arfLv /lib/x86_64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6"
-    cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1"
-    cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6"
-    cp -arfLv /lib/x86_64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6"
-    cp -arfLv /lib/x86_64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1"
-    cp -arfLv /lib/x86_64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2"
-    cp -arfLv /lib/x86_64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1"
-    cp -arfLv /lib/x86_64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0"
-elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
-    echo "Detected ARM64 architecture, copying ARM64 libraries..."
-    cp -arfLv /lib/ld-linux-aarch64.so.1 "$CURDIR/package/lib/ld.so"
-    cp -arfLv /lib/aarch64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6"
-    cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1"
-    cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6"
-    cp -arfLv /lib/aarch64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6"
-    cp -arfLv /lib/aarch64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1"
-    cp -arfLv /lib/aarch64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2"
-    cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1"
-    cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0"
-elif [ "$(uname -s)" = "Darwin" ]; then
-    echo "Detected Darwin"
-else
-    echo "Error: Could not detect architecture"
-    exit 1
-fi
-
-# Package GPU libraries (CUDA/ROCm/Intel/Vulkan loader + ICDs + drivers) based on
-# BUILD_TYPE so the backend can reach the GPU without the runtime base image
-# shipping those drivers.
-GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
-if [ -f "$GPU_LIB_SCRIPT" ]; then
-    echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
-    source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib"
-    package_gpu_libs
-fi
-
-echo "Packaging completed successfully"
-ls -liah "$CURDIR/package/" "$CURDIR/package/lib/"
--- a/backend/go/face-detect/run.sh
+++ b/backend/go/face-detect/run.sh
@@ -1,16 +0,0 @@
-#!/bin/bash
-set -e
-
-CURDIR=$(dirname "$(realpath "$0")")
-
-export LD_LIBRARY_PATH="$CURDIR/lib:$CURDIR:${LD_LIBRARY_PATH:-}"
-
-# If a self-contained ld.so was packaged, route through it so the packaged
-# libc / libstdc++ are used instead of the host's (matches the voice-detect /
-# whisper / parakeet backends' runtime layout).
-if [ -f "$CURDIR/lib/ld.so" ]; then
-	echo "Using lib/ld.so"
-	exec "$CURDIR/lib/ld.so" "$CURDIR/face-detect-grpc" "$@"
-fi
-
-exec "$CURDIR/face-detect-grpc" "$@"
--- a/backend/go/face-detect/test.sh
+++ b/backend/go/face-detect/test.sh
@@ -1,15 +0,0 @@
-#!/bin/bash
-set -e
-
-CURDIR=$(dirname "$(realpath "$0")")
-cd "$CURDIR"
-
-echo "Running face-detect backend tests..."
-
-# The pure-Go parsing specs always run. The embed/detect/verify/analyze smoke
-# specs run only when a model + image are provided via
-# FACEDETECT_BACKEND_TEST_MODEL and FACEDETECT_BACKEND_TEST_IMAGE; otherwise they
-# auto-skip.
-LD_LIBRARY_PATH="$CURDIR:${LD_LIBRARY_PATH:-}" go test -v -timeout 1200s .
-
-echo "face-detect tests completed."
--- a/backend/go/voice-detect/.gitignore
+++ b/backend/go/voice-detect/.gitignore
@@ -1,18 +0,0 @@
-# Fetched upstream sources
-sources/
-
-# CMake build directories
-build*/
-
-# build artifacts staged in-tree by the Makefile (cp from sources/) or
-# symlinked for local dev; the real sources live in voice-detect.cpp upstream.
-*.so
-*.so.*
-voicedetect_capi.h
-compile_commands.json
-
-# Compiled backend binary
-voice-detect-grpc
-
-# Packaging output
-package/
--- a/backend/go/voice-detect/Makefile
+++ b/backend/go/voice-detect/Makefile
@@ -1,94 +0,0 @@
-# voice-detect backend Makefile.
-#
-# Upstream pin lives below as VOICEDETECT_VERSION?=f4e7eef... (.github/bump_deps.sh
-# can find and update it - matches the parakeet.cpp / whisper.cpp / ds4 convention).
-#
-# Local dev shortcut: if you already have an out-of-tree voice-detect.cpp build,
-# symlink the .so + header into this directory and skip the clone/cmake steps:
-#
-#   ln -sf /path/to/voice-detect.cpp/build-shared/libvoicedetect.so .
-#   ln -sf /path/to/voice-detect.cpp/include/voicedetect_capi.h .
-#   go build -o voice-detect-grpc .
-#
-# The default target below does the proper clone-at-pin + cmake build so CI does
-# not need a side-checkout.
-
-VOICEDETECT_VERSION?=f4e7eefcbd7396566845fa3824d2b470760ad418
-VOICEDETECT_REPO?=https://github.com/mudler/voice-detect.cpp
-
-GOCMD?=go
-GO_TAGS?=
-JOBS?=$(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)
-
-BUILD_TYPE?=
-NATIVE?=false
-
-# Build ggml statically into libvoicedetect.so (PIC) so the shared lib is
-# self-contained: dlopen needs no libggml*.so alongside it, only system libs
-# (libstdc++/libgomp/libc) that the runtime image already provides.
-CMAKE_ARGS?=-DCMAKE_BUILD_TYPE=Release -DVOICEDETECT_SHARED=ON -DVOICEDETECT_BUILD_CLI=OFF -DVOICEDETECT_BUILD_TESTS=OFF -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-
-ifeq ($(NATIVE),false)
-	CMAKE_ARGS+=-DGGML_NATIVE=OFF
-endif
-
-# voice-detect.cpp gates its GGML backends behind VOICEDETECT_GGML_* options and
-# does set(GGML_CUDA ${VOICEDETECT_GGML_CUDA} CACHE BOOL "" FORCE), so a bare
-# -DGGML_CUDA=ON is overwritten back to OFF. Forward the VOICEDETECT_GGML_*
-# options instead. (openblas is not gated, so -DGGML_BLAS passes through.)
-ifeq ($(BUILD_TYPE),cublas)
-	CMAKE_ARGS+=-DVOICEDETECT_GGML_CUDA=ON
-else ifeq ($(BUILD_TYPE),openblas)
-	CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
-else ifeq ($(BUILD_TYPE),hipblas)
-	CMAKE_ARGS+=-DVOICEDETECT_GGML_HIP=ON
-else ifeq ($(BUILD_TYPE),vulkan)
-	CMAKE_ARGS+=-DVOICEDETECT_GGML_VULKAN=ON
-else ifeq ($(BUILD_TYPE),metal)
-	CMAKE_ARGS+=-DVOICEDETECT_GGML_METAL=ON
-endif
-
-.PHONY: voice-detect-grpc package build clean purge test all
-
-all: voice-detect-grpc
-
-# Clone the upstream voice-detect.cpp source at the pinned commit. Directory acts
-# as the target so make only re-clones when missing. After a VOICEDETECT_VERSION
-# bump, run 'make purge && make' to refetch.
-sources/voice-detect.cpp:
-	mkdir -p sources/voice-detect.cpp
-	cd sources/voice-detect.cpp && \
-	git init -q && \
-	git remote add origin $(VOICEDETECT_REPO) && \
-	git fetch --depth 1 origin $(VOICEDETECT_VERSION) && \
-	git checkout FETCH_HEAD && \
-	git submodule update --init --recursive --depth 1 --single-branch
-
-# Build the shared lib + header out-of-tree, then stage them next to the Go
-# sources so purego.Dlopen("libvoicedetect.so") and the cgo-less build both pick
-# them up.
-libvoicedetect.so: sources/voice-detect.cpp
-	cmake -B sources/voice-detect.cpp/build-shared -S sources/voice-detect.cpp $(CMAKE_ARGS)
-	cmake --build sources/voice-detect.cpp/build-shared --config Release -j$(JOBS) --target voicedetect
-	cp -fv sources/voice-detect.cpp/build-shared/libvoicedetect.so* ./ 2>/dev/null || true
-	cp -fv sources/voice-detect.cpp/include/voicedetect_capi.h ./
-
-voice-detect-grpc: libvoicedetect.so main.go govoicedetect.go options.go
-	CGO_ENABLED=0 $(GOCMD) build -tags "$(GO_TAGS)" -o voice-detect-grpc .
-
-package: voice-detect-grpc
-	bash package.sh
-
-build: package
-
-# Test target. The embed/verify/analyze smoke specs are gated on
-# VOICEDETECT_BACKEND_TEST_MODEL + VOICEDETECT_BACKEND_TEST_WAV; without them the
-# heavy specs auto-skip and only the pure-Go parsing specs run.
-test:
-	LD_LIBRARY_PATH=$(CURDIR):$$LD_LIBRARY_PATH $(GOCMD) test ./... -count=1
-
-clean: purge
-	rm -rf libvoicedetect.so* voicedetect_capi.h package voice-detect-grpc
-
-purge:
-	rm -rf sources/voice-detect.cpp
--- a/backend/go/voice-detect/govoicedetect.go
+++ b/backend/go/voice-detect/govoicedetect.go
@@ -1,273 +0,0 @@
-package main
-
-import (
-	"encoding/json"
-	"errors"
-	"fmt"
-	"math"
-	"os"
-	"path/filepath"
-	"strconv"
-	"strings"
-	"time"
-	"unsafe"
-
-	"github.com/mudler/LocalAI/pkg/grpc/base"
-	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
-	"github.com/mudler/xlog"
-)
-
-// purego-bound entry points from libvoicedetect.so. Names match
-// voicedetect_capi.h exactly so a `nm libvoicedetect.so | grep voicedetect_capi`
-// is enough to spot drift.
-//
-// The opaque ctx and the malloc'd char*/float* return values are declared as
-// uintptr so we get the raw pointer back and can release it via the matching
-// capi free function. purego's native string/[]float32 returns would copy and
-// forget the original pointer, leaking the C-owned buffer on every call.
-var (
-	CppAbiVersion  func() int32
-	CppLoad        func(ggufPath string) uintptr
-	CppFree        func(ctx uintptr)
-	CppLastError   func(ctx uintptr) string
-	CppFreeString  func(s uintptr)
-	CppFreeVec     func(v uintptr)
-	CppEmbedPath   func(ctx uintptr, wavPath string, outVec, outDim unsafe.Pointer) int32
-	CppEmbedPCM    func(ctx uintptr, pcm []float32, nSamples, sampleRate int32, outVec, outDim unsafe.Pointer) int32
-	CppVerifyPaths func(ctx uintptr, a, b string, threshold float32, outDistance, outVerified unsafe.Pointer) int32
-	CppAnalyzeJSON func(ctx uintptr, wavPath string) uintptr
-)
-
-// VoiceDetect implements the speaker-recognition voice subset of the Backend
-// gRPC service over libvoicedetect.so. The C side keeps a single loaded model
-// plus a per-ctx last-error buffer and is not reentrant, so base.SingleThread
-// serializes every call.
-type VoiceDetect struct {
-	base.SingleThread
-	opts   loadOptions
-	ctxPtr uintptr
-}
-
-func (v *VoiceDetect) Load(opts *pb.ModelOptions) error {
-	model := opts.ModelFile
-	if model == "" {
-		model = opts.ModelPath
-	}
-	if !filepath.IsAbs(model) && opts.ModelPath != "" {
-		model = filepath.Join(opts.ModelPath, model)
-	}
-	if model == "" {
-		return errors.New("voice-detect: ModelFile is required")
-	}
-
-	v.opts = parseOptions(opts.Options)
-	if v.opts.modelName == "" {
-		v.opts.modelName = filepath.Base(model)
-	}
-
-	// Propagate LocalAI's per-model thread budget to the engine. LocalAI spawns
-	// one backend process per model and serves requests concurrently, so the
-	// engine's own min(hardware_concurrency, 8) default can oversubscribe cores.
-	// VOICEDETECT_THREADS is read by the engine at backend construction, so it
-	// must be set before the capi load. A non-positive Threads means "unset":
-	// leave the env alone so the engine keeps its sane default.
-	threads := opts.Threads
-	if threads > 0 {
-		if err := os.Setenv("VOICEDETECT_THREADS", strconv.Itoa(int(threads))); err != nil {
-			return fmt.Errorf("voice-detect: set VOICEDETECT_THREADS: %w", err)
-		}
-		xlog.Info("voice-detect: applying LocalAI thread budget", "threads", threads)
-	}
-
-	xlog.Info("voice-detect: loading model", "model", model,
-		"verify_threshold", v.opts.verifyThreshold, "abi", CppAbiVersion())
-
-	ctx := CppLoad(model)
-	if ctx == 0 {
-		// The last-error buffer lives on the ctx that was never returned, so
-		// surface the path the operator tried to load instead.
-		return fmt.Errorf("voice-detect: voicedetect_capi_load failed for %q", model)
-	}
-	v.ctxPtr = ctx
-	return nil
-}
-
-// VoiceEmbed returns the L2-normalized speaker embedding for an audio clip.
-// The request carries a filesystem PATH; the HTTP layer materializes
-// base64/URL/data-URI inputs to a temp file before the gRPC call.
-func (v *VoiceDetect) VoiceEmbed(req *pb.VoiceEmbedRequest) (pb.VoiceEmbedResponse, error) {
-	if v.ctxPtr == 0 {
-		return pb.VoiceEmbedResponse{}, errors.New("voice-detect: model not loaded")
-	}
-	if req.Audio == "" {
-		return pb.VoiceEmbedResponse{}, errors.New("voice-detect: audio path is required")
-	}
-	emb, err := v.embedPath(req.Audio)
-	if err != nil {
-		return pb.VoiceEmbedResponse{}, err
-	}
-	return pb.VoiceEmbedResponse{Embedding: emb, Model: v.opts.modelName}, nil
-}
-
-func (v *VoiceDetect) embedPath(path string) ([]float32, error) {
-	var vec uintptr
-	var dim int32
-	rc := CppEmbedPath(v.ctxPtr, path, unsafe.Pointer(&vec), unsafe.Pointer(&dim))
-	if rc != 0 || vec == 0 || dim <= 0 {
-		return nil, v.lastErr("embed", path)
-	}
-	defer CppFreeVec(vec)
-	// Copy out of the C-owned malloc'd buffer before freeing it. The
-	// uintptr->Pointer conversion trips vet's unsafeptr check, which can't tell
-	// a C heap pointer from Go-managed memory; safe here, the GC neither tracks
-	// nor moves this buffer and we copy immediately.
-	src := unsafe.Slice((*float32)(unsafe.Pointer(vec)), int(dim)) //nolint:govet // C-owned malloc'd vector, copied out before free
-	out := make([]float32, int(dim))
-	copy(out, src)
-	return out, nil
-}
-
-// VoiceVerify embeds two clips and reports whether they are the same speaker by
-// cosine distance against a threshold. A request threshold <= 0 falls back to
-// the model-configured default (verify_threshold option, 0.25 if unset).
-func (v *VoiceDetect) VoiceVerify(req *pb.VoiceVerifyRequest) (pb.VoiceVerifyResponse, error) {
-	if v.ctxPtr == 0 {
-		return pb.VoiceVerifyResponse{}, errors.New("voice-detect: model not loaded")
-	}
-	if req.Audio1 == "" || req.Audio2 == "" {
-		return pb.VoiceVerifyResponse{}, errors.New("voice-detect: audio1 and audio2 are required")
-	}
-
-	threshold := req.Threshold
-	if threshold <= 0 {
-		threshold = v.opts.verifyThreshold
-	}
-
-	started := time.Now()
-	var distance float32
-	var verified int32
-	rc := CppVerifyPaths(v.ctxPtr, req.Audio1, req.Audio2, threshold,
-		unsafe.Pointer(&distance), unsafe.Pointer(&verified))
-	if rc != 0 {
-		return pb.VoiceVerifyResponse{}, v.lastErr("verify", req.Audio1+","+req.Audio2)
-	}
-	elapsedMs := float32(time.Since(started).Seconds() * 1000.0)
-
-	// Confidence decays linearly from 100 at distance 0 to 0 at the threshold,
-	// matching the Python speaker-recognition backend's reporting.
-	confidence := float32(0)
-	if threshold > 0 {
-		confidence = float32(math.Max(0, math.Min(100, (1.0-float64(distance)/float64(threshold))*100.0)))
-	}
-
-	return pb.VoiceVerifyResponse{
-		Verified:         verified != 0,
-		Distance:         distance,
-		Threshold:        threshold,
-		Confidence:       confidence,
-		Model:            v.opts.modelName,
-		ProcessingTimeMs: elapsedMs,
-	}, nil
-}
-
-// VoiceAnalyze runs the age/gender/emotion heads on a single clip. The C-API
-// always evaluates every supported head, so the request's actions filter is
-// advisory and the full analysis is returned as a single segment (the engine
-// does not produce time-bounded segments).
-func (v *VoiceDetect) VoiceAnalyze(req *pb.VoiceAnalyzeRequest) (pb.VoiceAnalyzeResponse, error) {
-	if v.ctxPtr == 0 {
-		return pb.VoiceAnalyzeResponse{}, errors.New("voice-detect: model not loaded")
-	}
-	if req.Audio == "" {
-		return pb.VoiceAnalyzeResponse{}, errors.New("voice-detect: audio path is required")
-	}
-
-	ptr := CppAnalyzeJSON(v.ctxPtr, req.Audio)
-	if ptr == 0 {
-		return pb.VoiceAnalyzeResponse{}, v.lastErr("analyze", req.Audio)
-	}
-	defer CppFreeString(ptr)
-
-	seg, err := parseAnalyzeJSON(goStringFromCPtr(ptr))
-	if err != nil {
-		return pb.VoiceAnalyzeResponse{}, fmt.Errorf("voice-detect: analyze JSON for %q: %w", req.Audio, err)
-	}
-	return pb.VoiceAnalyzeResponse{Segments: []*pb.VoiceAnalysis{seg}}, nil
-}
-
-// analyzeJSON mirrors the document returned by voicedetect_capi_analyze_path_json:
-//
-//	{"age":42.0,
-//	 "gender":{"label":"female","female":0.88,"male":0.12},
-//	 "emotion":{"label":"neutral","scores":{"neutral":0.7, ...}}}
-//
-// gender is a mixed object (a "label" string plus per-class float scores), so
-// it is decoded into raw messages and split in parseAnalyzeJSON.
-type analyzeJSON struct {
-	Age     float32                    `json:"age"`
-	Gender  map[string]json.RawMessage `json:"gender"`
-	Emotion struct {
-		Label  string             `json:"label"`
-		Scores map[string]float32 `json:"scores"`
-	} `json:"emotion"`
-}
-
-// parseAnalyzeJSON maps the engine's analyze document onto a VoiceAnalysis.
-// start/end stay 0: the model emits a single whole-utterance result, not
-// time-bounded segments.
-func parseAnalyzeJSON(doc string) (*pb.VoiceAnalysis, error) {
-	var a analyzeJSON
-	if err := json.Unmarshal([]byte(doc), &a); err != nil {
-		return nil, err
-	}
-
-	seg := &pb.VoiceAnalysis{
-		Age:             a.Age,
-		DominantEmotion: a.Emotion.Label,
-		Emotion:         a.Emotion.Scores,
-	}
-
-	if len(a.Gender) > 0 {
-		gender := make(map[string]float32, len(a.Gender))
-		for k, raw := range a.Gender {
-			if k == "label" {
-				_ = json.Unmarshal(raw, &seg.DominantGender)
-				continue
-			}
-			var score float32
-			if err := json.Unmarshal(raw, &score); err == nil {
-				gender[k] = score
-			}
-		}
-		seg.Gender = gender
-	}
-
-	return seg, nil
-}
-
-// lastErr wraps the C-API's per-ctx last-error buffer into a Go error.
-func (v *VoiceDetect) lastErr(op, subject string) error {
-	msg := strings.TrimSpace(CppLastError(v.ctxPtr))
-	if msg == "" {
-		msg = "no error detail"
-	}
-	return fmt.Errorf("voice-detect: %s failed for %q: %s", op, subject, msg)
-}
-
-// goStringFromCPtr copies a NUL-terminated C string into Go memory. cptr is a
-// malloc'd buffer the caller owns; release it via CppFreeString after the copy.
-//
-// The uintptr->Pointer conversion trips vet's unsafeptr check, which can't tell
-// a C heap pointer from Go-managed memory. Safe here: the GC neither tracks nor
-// moves the buffer and we dereference it immediately to copy the bytes out.
-func goStringFromCPtr(cptr uintptr) string {
-	if cptr == 0 {
-		return ""
-	}
-	p := unsafe.Pointer(cptr) //nolint:govet // C-owned malloc'd buffer, not Go-GC memory (see doc above)
-	n := 0
-	for *(*byte)(unsafe.Add(p, n)) != 0 {
-		n++
-	}
-	return string(unsafe.Slice((*byte)(p), n))
-}
--- a/backend/go/voice-detect/govoicedetect_test.go
+++ b/backend/go/voice-detect/govoicedetect_test.go
@@ -1,144 +0,0 @@
-package main
-
-import (
-	"os"
-	"sync"
-	"testing"
-
-	"github.com/ebitengine/purego"
-	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-func TestVoiceDetect(t *testing.T) {
-	RegisterFailHandler(Fail)
-	RunSpecs(t, "voice-detect Backend Suite")
-}
-
-var (
-	libLoadOnce sync.Once
-	libLoadErr  error
-)
-
-// ensureLibLoaded mirrors main.go's bootstrap so a Go test can drive the C-API
-// bridge without spinning up the gRPC server. Records the error (the smoke
-// specs skip themselves) when libvoicedetect.so is not loadable from cwd
-// (LD_LIBRARY_PATH or a symlink in ./).
-func ensureLibLoaded() error {
-	libLoadOnce.Do(func() {
-		libName := os.Getenv("VOICEDETECT_LIBRARY")
-		if libName == "" {
-			libName = "libvoicedetect.so"
-		}
-		lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
-		if err != nil {
-			libLoadErr = err
-			return
-		}
-		purego.RegisterLibFunc(&CppAbiVersion, lib, "voicedetect_capi_abi_version")
-		purego.RegisterLibFunc(&CppLoad, lib, "voicedetect_capi_load")
-		purego.RegisterLibFunc(&CppFree, lib, "voicedetect_capi_free")
-		purego.RegisterLibFunc(&CppLastError, lib, "voicedetect_capi_last_error")
-		purego.RegisterLibFunc(&CppFreeString, lib, "voicedetect_capi_free_string")
-		purego.RegisterLibFunc(&CppFreeVec, lib, "voicedetect_capi_free_vec")
-		purego.RegisterLibFunc(&CppEmbedPath, lib, "voicedetect_capi_embed_path")
-		purego.RegisterLibFunc(&CppEmbedPCM, lib, "voicedetect_capi_embed_pcm")
-		purego.RegisterLibFunc(&CppVerifyPaths, lib, "voicedetect_capi_verify_paths")
-		purego.RegisterLibFunc(&CppAnalyzeJSON, lib, "voicedetect_capi_analyze_path_json")
-	})
-	return libLoadErr
-}
-
-var _ = Describe("parseOptions", func() {
-	It("defaults verify_threshold to 0.25", func() {
-		o := parseOptions(nil)
-		Expect(o.verifyThreshold).To(Equal(float32(0.25)))
-		Expect(o.modelName).To(Equal(""))
-	})
-
-	It("parses verify_threshold, threshold alias and model_name", func() {
-		o := parseOptions([]string{"verify_threshold:0.4", "model_name:ecapa", "unknown:x"})
-		Expect(o.verifyThreshold).To(Equal(float32(0.4)))
-		Expect(o.modelName).To(Equal("ecapa"))
-
-		o2 := parseOptions([]string{"threshold:0.3"})
-		Expect(o2.verifyThreshold).To(Equal(float32(0.3)))
-	})
-
-	It("ignores non-positive thresholds and keeps the default", func() {
-		o := parseOptions([]string{"verify_threshold:0", "threshold:-1"})
-		Expect(o.verifyThreshold).To(Equal(float32(0.25)))
-	})
-})
-
-var _ = Describe("parseAnalyzeJSON", func() {
-	It("maps age, gender label+scores and emotion label+scores", func() {
-		doc := `{"age":42.0,
-			"gender":{"label":"female","female":0.88,"male":0.12},
-			"emotion":{"label":"neutral","scores":{"neutral":0.7,"happy":0.2,"sad":0.1}}}`
-		seg, err := parseAnalyzeJSON(doc)
-		Expect(err).ToNot(HaveOccurred())
-		Expect(seg.Age).To(BeNumerically("~", 42.0, 1e-4))
-		Expect(seg.Start).To(Equal(float32(0)))
-		Expect(seg.End).To(Equal(float32(0)))
-
-		Expect(seg.DominantGender).To(Equal("female"))
-		Expect(seg.Gender).To(HaveKeyWithValue("female", BeNumerically("~", 0.88, 1e-4)))
-		Expect(seg.Gender).To(HaveKeyWithValue("male", BeNumerically("~", 0.12, 1e-4)))
-		// The "label" entry is consumed into DominantGender, not the score map.
-		Expect(seg.Gender).ToNot(HaveKey("label"))
-
-		Expect(seg.DominantEmotion).To(Equal("neutral"))
-		Expect(seg.Emotion).To(HaveKeyWithValue("neutral", BeNumerically("~", 0.7, 1e-4)))
-		Expect(seg.Emotion).To(HaveKeyWithValue("happy", BeNumerically("~", 0.2, 1e-4)))
-	})
-
-	It("tolerates a missing gender block", func() {
-		seg, err := parseAnalyzeJSON(`{"age":30.0,"emotion":{"label":"happy","scores":{"happy":1.0}}}`)
-		Expect(err).ToNot(HaveOccurred())
-		Expect(seg.DominantGender).To(Equal(""))
-		Expect(seg.DominantEmotion).To(Equal("happy"))
-	})
-
-	It("returns an error on malformed JSON", func() {
-		_, err := parseAnalyzeJSON(`{not-json`)
-		Expect(err).To(HaveOccurred())
-	})
-})
-
-// The specs below exercise the real C-API end to end. They run only when both a
-// model GGUF and a test WAV are provided, and skip cleanly otherwise so the
-// suite stays green without large assets.
-var _ = Describe("VoiceDetect end-to-end", Ordered, func() {
-	var (
-		v         *VoiceDetect
-		modelPath = os.Getenv("VOICEDETECT_BACKEND_TEST_MODEL")
-		wavPath   = os.Getenv("VOICEDETECT_BACKEND_TEST_WAV")
-	)
-
-	BeforeAll(func() {
-		if modelPath == "" || wavPath == "" {
-			Skip("set VOICEDETECT_BACKEND_TEST_MODEL and VOICEDETECT_BACKEND_TEST_WAV to run the e2e specs")
-		}
-		if err := ensureLibLoaded(); err != nil {
-			Skip("libvoicedetect.so not loadable: " + err.Error())
-		}
-		v = &VoiceDetect{}
-		Expect(v.Load(&pb.ModelOptions{ModelFile: modelPath})).To(Succeed())
-	})
-
-	It("embeds an audio clip", func() {
-		resp, err := v.VoiceEmbed(&pb.VoiceEmbedRequest{Audio: wavPath})
-		Expect(err).ToNot(HaveOccurred())
-		Expect(resp.Embedding).ToNot(BeEmpty())
-		Expect(resp.Model).ToNot(BeEmpty())
-	})
-
-	It("verifies a clip against itself as the same speaker", func() {
-		resp, err := v.VoiceVerify(&pb.VoiceVerifyRequest{Audio1: wavPath, Audio2: wavPath})
-		Expect(err).ToNot(HaveOccurred())
-		Expect(resp.Verified).To(BeTrue())
-		Expect(resp.Distance).To(BeNumerically("<=", resp.Threshold))
-	})
-})
--- a/backend/go/voice-detect/main.go
+++ b/backend/go/voice-detect/main.go
@@ -1,64 +0,0 @@
-package main
-
-// Started internally by LocalAI - one gRPC server per loaded model.
-//
-// Loads libvoicedetect.so via purego and registers the flat C-API entry points
-// declared in voicedetect_capi.h. The library name can be overridden with
-// VOICEDETECT_LIBRARY (mirrors the PARAKEET_LIBRARY / OMNIVOICE_LIBRARY
-// convention in the sibling backends); the default looks for the .so next to
-// this binary (resolved via LD_LIBRARY_PATH by run.sh).
-import (
-	"flag"
-	"fmt"
-	"os"
-
-	"github.com/ebitengine/purego"
-	grpc "github.com/mudler/LocalAI/pkg/grpc"
-)
-
-var (
-	addr = flag.String("addr", "localhost:50051", "the address to connect to")
-)
-
-type LibFuncs struct {
-	FuncPtr any
-	Name    string
-}
-
-func main() {
-	libName := os.Getenv("VOICEDETECT_LIBRARY")
-	if libName == "" {
-		libName = "libvoicedetect.so"
-	}
-
-	lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
-	if err != nil {
-		panic(fmt.Errorf("voice-detect: dlopen %q: %w", libName, err))
-	}
-
-	// Bound 1:1 to voicedetect_capi.h. char*/float* returns are registered as
-	// uintptr so the raw pointer can be freed via the matching capi free fn.
-	libFuncs := []LibFuncs{
-		{&CppAbiVersion, "voicedetect_capi_abi_version"},
-		{&CppLoad, "voicedetect_capi_load"},
-		{&CppFree, "voicedetect_capi_free"},
-		{&CppLastError, "voicedetect_capi_last_error"},
-		{&CppFreeString, "voicedetect_capi_free_string"},
-		{&CppFreeVec, "voicedetect_capi_free_vec"},
-		{&CppEmbedPath, "voicedetect_capi_embed_path"},
-		{&CppEmbedPCM, "voicedetect_capi_embed_pcm"},
-		{&CppVerifyPaths, "voicedetect_capi_verify_paths"},
-		{&CppAnalyzeJSON, "voicedetect_capi_analyze_path_json"},
-	}
-	for _, lf := range libFuncs {
-		purego.RegisterLibFunc(lf.FuncPtr, lib, lf.Name)
-	}
-
-	fmt.Fprintf(os.Stderr, "[voice-detect] ABI=%d\n", CppAbiVersion())
-
-	flag.Parse()
-
-	if err := grpc.StartServer(*addr, &VoiceDetect{}); err != nil {
-		panic(err)
-	}
-}
--- a/backend/go/voice-detect/options.go
+++ b/backend/go/voice-detect/options.go
@@ -1,46 +0,0 @@
-package main
-
-import (
-	"strconv"
-	"strings"
-)
-
-// defaultVerifyThreshold is the cosine-distance cutoff used when a request does
-// not set one. Matches the Python speaker-recognition backend's default so the
-// two implementations agree on verdicts out of the box.
-const defaultVerifyThreshold float32 = 0.25
-
-// loadOptions holds the parsed model-level options for voice-detect.
-type loadOptions struct {
-	verifyThreshold float32
-	modelName       string
-}
-
-func splitOption(o string) (key, value string, ok bool) {
-	i := strings.Index(o, ":")
-	if i < 0 {
-		return "", "", false
-	}
-	return strings.TrimSpace(o[:i]), strings.TrimSpace(o[i+1:]), true
-}
-
-// parseOptions reads the backend "key:value" option slice. Unknown keys are
-// ignored. Defaults: verify_threshold 0.25, model_name derived from the file.
-func parseOptions(opts []string) loadOptions {
-	o := loadOptions{verifyThreshold: defaultVerifyThreshold}
-	for _, oo := range opts {
-		key, value, ok := splitOption(oo)
-		if !ok {
-			continue
-		}
-		switch key {
-		case "verify_threshold", "threshold":
-			if f, err := strconv.ParseFloat(value, 32); err == nil && f > 0 {
-				o.verifyThreshold = float32(f)
-			}
-		case "model_name":
-			o.modelName = value
-		}
-	}
-	return o
-}
--- a/backend/go/voice-detect/package.sh
+++ b/backend/go/voice-detect/package.sh
@@ -1,68 +0,0 @@
-#!/bin/bash
-#
-# Bundle the voice-detect-grpc binary, libvoicedetect.so, the core runtime libs
-# (libc/libstdc++/libgomp + ld.so) and the GPU runtime for the active BUILD_TYPE
-# so the package is self-contained. Mirrors backend/go/parakeet-cpp/package.sh;
-# run.sh routes the (CGO_ENABLED=0) binary through lib/ld.so so the packaged libc
-# is used instead of the host's.
-
-set -e
-
-CURDIR=$(dirname "$(realpath "$0")")
-REPO_ROOT="${CURDIR}/../../.."
-
-mkdir -p "$CURDIR/package/lib"
-
-cp -avf "$CURDIR/voice-detect-grpc" "$CURDIR/package/"
-cp -avf "$CURDIR/run.sh" "$CURDIR/package/"
-
-# libvoicedetect.so + any soname symlinks. purego.Dlopen resolves it via
-# LD_LIBRARY_PATH, which run.sh points at lib/.
-cp -avf "$CURDIR"/libvoicedetect.so* "$CURDIR/package/lib/" 2>/dev/null || {
-	echo "ERROR: libvoicedetect.so not found in $CURDIR, run 'make' first" >&2
-	exit 1
-}
-
-# Detect architecture and copy the core runtime libs libvoicedetect.so links
-# against, plus the matching dynamic loader as lib/ld.so.
-if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
-    echo "Detected x86_64 architecture, copying x86_64 libraries..."
-    cp -arfLv /lib64/ld-linux-x86-64.so.2 "$CURDIR/package/lib/ld.so"
-    cp -arfLv /lib/x86_64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6"
-    cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1"
-    cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6"
-    cp -arfLv /lib/x86_64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6"
-    cp -arfLv /lib/x86_64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1"
-    cp -arfLv /lib/x86_64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2"
-    cp -arfLv /lib/x86_64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1"
-    cp -arfLv /lib/x86_64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0"
-elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
-    echo "Detected ARM64 architecture, copying ARM64 libraries..."
-    cp -arfLv /lib/ld-linux-aarch64.so.1 "$CURDIR/package/lib/ld.so"
-    cp -arfLv /lib/aarch64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6"
-    cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1"
-    cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6"
-    cp -arfLv /lib/aarch64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6"
-    cp -arfLv /lib/aarch64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1"
-    cp -arfLv /lib/aarch64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2"
-    cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1"
-    cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0"
-elif [ "$(uname -s)" = "Darwin" ]; then
-    echo "Detected Darwin"
-else
-    echo "Error: Could not detect architecture"
-    exit 1
-fi
-
-# Package GPU libraries (CUDA/ROCm/Intel/Vulkan loader + ICDs + drivers) based on
-# BUILD_TYPE so the backend can reach the GPU without the runtime base image
-# shipping those drivers.
-GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
-if [ -f "$GPU_LIB_SCRIPT" ]; then
-    echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
-    source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib"
-    package_gpu_libs
-fi
-
-echo "Packaging completed successfully"
-ls -liah "$CURDIR/package/" "$CURDIR/package/lib/"
--- a/backend/go/voice-detect/run.sh
+++ b/backend/go/voice-detect/run.sh
@@ -1,16 +0,0 @@
-#!/bin/bash
-set -e
-
-CURDIR=$(dirname "$(realpath "$0")")
-
-export LD_LIBRARY_PATH="$CURDIR/lib:$CURDIR:${LD_LIBRARY_PATH:-}"
-
-# If a self-contained ld.so was packaged, route through it so the packaged
-# libc / libstdc++ are used instead of the host's (matches the whisper /
-# parakeet backends' runtime layout).
-if [ -f "$CURDIR/lib/ld.so" ]; then
-	echo "Using lib/ld.so"
-	exec "$CURDIR/lib/ld.so" "$CURDIR/voice-detect-grpc" "$@"
-fi
-
-exec "$CURDIR/voice-detect-grpc" "$@"
--- a/backend/go/voice-detect/test.sh
+++ b/backend/go/voice-detect/test.sh
@@ -1,14 +0,0 @@
-#!/bin/bash
-set -e
-
-CURDIR=$(dirname "$(realpath "$0")")
-cd "$CURDIR"
-
-echo "Running voice-detect backend tests..."
-
-# The pure-Go parsing specs always run. The embed/verify/analyze smoke specs run
-# only when a model + WAV are provided via VOICEDETECT_BACKEND_TEST_MODEL and
-# VOICEDETECT_BACKEND_TEST_WAV; otherwise they auto-skip.
-LD_LIBRARY_PATH="$CURDIR:${LD_LIBRARY_PATH:-}" go test -v -timeout 1200s .
-
-echo "voice-detect tests completed."
--- a/backend/index.yaml
+++ b/backend/index.yaml
@@ -209,78 +209,6 @@
    nvidia-cuda-12: "cuda12-ced"
    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-ced"
    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-ced"
- &voicedetect
-  name: "voice-detect"
-  alias: "voice-detect"
-  license: mit
-  icon: https://avatars.githubusercontent.com/u/95302084
-  description: |
-    voice-detect speaker recognition and voice analysis.
-    voice-detect.cpp is a C++/ggml engine that produces L2-normalised
-    speaker embeddings (ECAPA-TDNN, WeSpeaker ResNet34, 3D-Speaker
-    ERes2Net, CAM++) for voice verification and 1:N identification, plus
-    a wav2vec2 age / gender / emotion analysis head. It replaces the
-    Python speaker-recognition backend and is exposed through the Voice*
-    gRPC rpcs and the /v1/voice/* REST endpoints. It runs on CPU, NVIDIA
-    CUDA, AMD ROCm/HIP, Intel SYCL, Vulkan and NVIDIA Jetson (L4T) targets.
-  urls:
-    - https://github.com/mudler/voice-detect.cpp
-  tags:
-    - voice-recognition
-    - speaker-verification
-    - speaker-embedding
-    - CPU
-    - GPU
-    - CUDA
-    - HIP
-  capabilities:
-    default: "cpu-voice-detect"
-    nvidia: "cuda12-voice-detect"
-    intel: "intel-sycl-f16-voice-detect"
-    metal: "metal-voice-detect"
-    amd: "rocm-voice-detect"
-    vulkan: "vulkan-voice-detect"
-    nvidia-l4t: "nvidia-l4t-arm64-voice-detect"
-    nvidia-cuda-13: "cuda13-voice-detect"
-    nvidia-cuda-12: "cuda12-voice-detect"
-    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-voice-detect"
-    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-voice-detect"
- &facedetect
-  name: "face-detect"
-  alias: "face-detect"
-  license: mit
-  icon: https://avatars.githubusercontent.com/u/95302084
-  description: |
-    face-detect face detection, embedding, verification and analysis.
-    face-detect.cpp is a C++/ggml engine that runs SCRFD / YuNet face
-    detection and ArcFace / SFace 512-d (or 128-d) L2-normalised face
-    embeddings for verification and 1:N identification, plus a landmark /
-    age / gender analysis head. It replaces the Python insightface backend
-    and is exposed through the Embedding, Detect and Face* gRPC rpcs and
-    the /v1/face/* REST endpoints. It runs on CPU, NVIDIA CUDA, AMD
-    ROCm/HIP, Intel SYCL, Vulkan and NVIDIA Jetson (L4T) targets.
-  urls:
-    - https://github.com/mudler/face-detect.cpp
-  tags:
-    - face-recognition
-    - face-verification
-    - face-embedding
-    - CPU
-    - GPU
-    - CUDA
-    - HIP
-  capabilities:
-    default: "cpu-face-detect"
-    nvidia: "cuda12-face-detect"
-    intel: "intel-sycl-f16-face-detect"
-    metal: "metal-face-detect"
-    amd: "rocm-face-detect"
-    vulkan: "vulkan-face-detect"
-    nvidia-l4t: "nvidia-l4t-arm64-face-detect"
-    nvidia-cuda-13: "cuda13-face-detect"
-    nvidia-cuda-12: "cuda12-face-detect"
-    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-face-detect"
-    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-face-detect"
 - &voxtral
  name: "voxtral"
  alias: "voxtral"
@@ -2868,236 +2796,6 @@
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-ced"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-13-ced
-## voice-detect
- !!merge <<: *voicedetect
-  name: "voice-detect-development"
-  capabilities:
-    default: "cpu-voice-detect-development"
-    nvidia: "cuda12-voice-detect-development"
-    intel: "intel-sycl-f16-voice-detect-development"
-    metal: "metal-voice-detect-development"
-    amd: "rocm-voice-detect-development"
-    vulkan: "vulkan-voice-detect-development"
-    nvidia-l4t: "nvidia-l4t-arm64-voice-detect-development"
-    nvidia-cuda-13: "cuda13-voice-detect-development"
-    nvidia-cuda-12: "cuda12-voice-detect-development"
-    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-voice-detect-development"
-    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-voice-detect-development"
- !!merge <<: *voicedetect
-  name: "nvidia-l4t-arm64-voice-detect"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-arm64-voice-detect"
-  mirrors:
-    - localai/localai-backends:latest-nvidia-l4t-arm64-voice-detect
- !!merge <<: *voicedetect
-  name: "nvidia-l4t-arm64-voice-detect-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-arm64-voice-detect"
-  mirrors:
-    - localai/localai-backends:master-nvidia-l4t-arm64-voice-detect
- !!merge <<: *voicedetect
-  name: "cuda13-nvidia-l4t-arm64-voice-detect"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-voice-detect"
-  mirrors:
-    - localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-voice-detect
- !!merge <<: *voicedetect
-  name: "cuda13-nvidia-l4t-arm64-voice-detect-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-voice-detect"
-  mirrors:
-    - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-voice-detect
- !!merge <<: *voicedetect
-  name: "cpu-voice-detect"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-voice-detect"
-  mirrors:
-    - localai/localai-backends:latest-cpu-voice-detect
- !!merge <<: *voicedetect
-  name: "cpu-voice-detect-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-cpu-voice-detect"
-  mirrors:
-    - localai/localai-backends:master-cpu-voice-detect
- !!merge <<: *voicedetect
-  name: "metal-voice-detect"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-voice-detect"
-  mirrors:
-    - localai/localai-backends:latest-metal-darwin-arm64-voice-detect
- !!merge <<: *voicedetect
-  name: "metal-voice-detect-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-voice-detect"
-  mirrors:
-    - localai/localai-backends:master-metal-darwin-arm64-voice-detect
- !!merge <<: *voicedetect
-  name: "cuda12-voice-detect"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-voice-detect"
-  mirrors:
-    - localai/localai-backends:latest-gpu-nvidia-cuda-12-voice-detect
- !!merge <<: *voicedetect
-  name: "cuda12-voice-detect-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-voice-detect"
-  mirrors:
-    - localai/localai-backends:master-gpu-nvidia-cuda-12-voice-detect
- !!merge <<: *voicedetect
-  name: "rocm-voice-detect"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-voice-detect"
-  mirrors:
-    - localai/localai-backends:latest-gpu-rocm-hipblas-voice-detect
- !!merge <<: *voicedetect
-  name: "rocm-voice-detect-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-voice-detect"
-  mirrors:
-    - localai/localai-backends:master-gpu-rocm-hipblas-voice-detect
- !!merge <<: *voicedetect
-  name: "intel-sycl-f32-voice-detect"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-voice-detect"
-  mirrors:
-    - localai/localai-backends:latest-gpu-intel-sycl-f32-voice-detect
- !!merge <<: *voicedetect
-  name: "intel-sycl-f32-voice-detect-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f32-voice-detect"
-  mirrors:
-    - localai/localai-backends:master-gpu-intel-sycl-f32-voice-detect
- !!merge <<: *voicedetect
-  name: "intel-sycl-f16-voice-detect"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-voice-detect"
-  mirrors:
-    - localai/localai-backends:latest-gpu-intel-sycl-f16-voice-detect
- !!merge <<: *voicedetect
-  name: "intel-sycl-f16-voice-detect-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f16-voice-detect"
-  mirrors:
-    - localai/localai-backends:master-gpu-intel-sycl-f16-voice-detect
- !!merge <<: *voicedetect
-  name: "vulkan-voice-detect"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-vulkan-voice-detect"
-  mirrors:
-    - localai/localai-backends:latest-gpu-vulkan-voice-detect
- !!merge <<: *voicedetect
-  name: "vulkan-voice-detect-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-vulkan-voice-detect"
-  mirrors:
-    - localai/localai-backends:master-gpu-vulkan-voice-detect
- !!merge <<: *voicedetect
-  name: "cuda13-voice-detect"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-voice-detect"
-  mirrors:
-    - localai/localai-backends:latest-gpu-nvidia-cuda-13-voice-detect
- !!merge <<: *voicedetect
-  name: "cuda13-voice-detect-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-voice-detect"
-  mirrors:
-    - localai/localai-backends:master-gpu-nvidia-cuda-13-voice-detect
-## face-detect
- !!merge <<: *facedetect
-  name: "face-detect-development"
-  capabilities:
-    default: "cpu-face-detect-development"
-    nvidia: "cuda12-face-detect-development"
-    intel: "intel-sycl-f16-face-detect-development"
-    metal: "metal-face-detect-development"
-    amd: "rocm-face-detect-development"
-    vulkan: "vulkan-face-detect-development"
-    nvidia-l4t: "nvidia-l4t-arm64-face-detect-development"
-    nvidia-cuda-13: "cuda13-face-detect-development"
-    nvidia-cuda-12: "cuda12-face-detect-development"
-    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-face-detect-development"
-    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-face-detect-development"
- !!merge <<: *facedetect
-  name: "nvidia-l4t-arm64-face-detect"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-arm64-face-detect"
-  mirrors:
-    - localai/localai-backends:latest-nvidia-l4t-arm64-face-detect
- !!merge <<: *facedetect
-  name: "nvidia-l4t-arm64-face-detect-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-arm64-face-detect"
-  mirrors:
-    - localai/localai-backends:master-nvidia-l4t-arm64-face-detect
- !!merge <<: *facedetect
-  name: "cuda13-nvidia-l4t-arm64-face-detect"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-face-detect"
-  mirrors:
-    - localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-face-detect
- !!merge <<: *facedetect
-  name: "cuda13-nvidia-l4t-arm64-face-detect-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-face-detect"
-  mirrors:
-    - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-face-detect
- !!merge <<: *facedetect
-  name: "cpu-face-detect"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-face-detect"
-  mirrors:
-    - localai/localai-backends:latest-cpu-face-detect
- !!merge <<: *facedetect
-  name: "cpu-face-detect-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-cpu-face-detect"
-  mirrors:
-    - localai/localai-backends:master-cpu-face-detect
- !!merge <<: *facedetect
-  name: "metal-face-detect"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-face-detect"
-  mirrors:
-    - localai/localai-backends:latest-metal-darwin-arm64-face-detect
- !!merge <<: *facedetect
-  name: "metal-face-detect-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-face-detect"
-  mirrors:
-    - localai/localai-backends:master-metal-darwin-arm64-face-detect
- !!merge <<: *facedetect
-  name: "cuda12-face-detect"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-face-detect"
-  mirrors:
-    - localai/localai-backends:latest-gpu-nvidia-cuda-12-face-detect
- !!merge <<: *facedetect
-  name: "cuda12-face-detect-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-face-detect"
-  mirrors:
-    - localai/localai-backends:master-gpu-nvidia-cuda-12-face-detect
- !!merge <<: *facedetect
-  name: "rocm-face-detect"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-face-detect"
-  mirrors:
-    - localai/localai-backends:latest-gpu-rocm-hipblas-face-detect
- !!merge <<: *facedetect
-  name: "rocm-face-detect-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-face-detect"
-  mirrors:
-    - localai/localai-backends:master-gpu-rocm-hipblas-face-detect
- !!merge <<: *facedetect
-  name: "intel-sycl-f32-face-detect"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-face-detect"
-  mirrors:
-    - localai/localai-backends:latest-gpu-intel-sycl-f32-face-detect
- !!merge <<: *facedetect
-  name: "intel-sycl-f32-face-detect-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f32-face-detect"
-  mirrors:
-    - localai/localai-backends:master-gpu-intel-sycl-f32-face-detect
- !!merge <<: *facedetect
-  name: "intel-sycl-f16-face-detect"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-face-detect"
-  mirrors:
-    - localai/localai-backends:latest-gpu-intel-sycl-f16-face-detect
- !!merge <<: *facedetect
-  name: "intel-sycl-f16-face-detect-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f16-face-detect"
-  mirrors:
-    - localai/localai-backends:master-gpu-intel-sycl-f16-face-detect
- !!merge <<: *facedetect
-  name: "vulkan-face-detect"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-vulkan-face-detect"
-  mirrors:
-    - localai/localai-backends:latest-gpu-vulkan-face-detect
- !!merge <<: *facedetect
-  name: "vulkan-face-detect-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-vulkan-face-detect"
-  mirrors:
-    - localai/localai-backends:master-gpu-vulkan-face-detect
- !!merge <<: *facedetect
-  name: "cuda13-face-detect"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-face-detect"
-  mirrors:
-    - localai/localai-backends:latest-gpu-nvidia-cuda-13-face-detect
- !!merge <<: *facedetect
-  name: "cuda13-face-detect-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-face-detect"
-  mirrors:
-    - localai/localai-backends:master-gpu-nvidia-cuda-13-face-detect
 ## stablediffusion-ggml
 - !!merge <<: *stablediffusionggml
  name: "cpu-stablediffusion-ggml"
--- a/backend/python/diffusers/requirements-cpu.txt
+++ b/backend/python/diffusers/requirements-cpu.txt
@@ -1,7 +1,7 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
-git+https://github.com/huggingface/diffusers
+diffusers==0.38.0
 opencv-python
-transformers
+transformers==4.57.6
 torchvision==0.22.1
 accelerate
 git+https://github.com/xhinker/sd_embed
@@ -10,9 +10,15 @@ sentencepiece
 torch==2.7.1
 optimum-quanto
 ftfy
-# TODO: re-add compel once it supports transformers >= 5.
-# Tracking: https://github.com/damian0815/compel/pull/129
-#           https://github.com/damian0815/compel/issues/128
-# compel currently pins transformers~=4.25, which forced pip into multi-hour
-# resolver backtracking storms in CI. backend.py imports it lazily and gates
-# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
+# diffusers and transformers are pinned together on purpose. transformers v5
+# restructured CLIPTextModel and dropped the `.text_model` attribute, which
+# breaks single-file Stable Diffusion loading on every released diffusers
+# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
+# main via git froze whichever broken pair existed at image-build time. Pin the
+# last known-good released pair so builds are reproducible and can't drift into
+# the broken window. See https://github.com/mudler/LocalAI/issues/9979
+#
+# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
+# with this pin and previously forced pip into multi-hour resolver backtracking
+# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
+# the import succeeding, so dropping it here is safe.
--- a/backend/python/diffusers/requirements-cublas12.txt
+++ b/backend/python/diffusers/requirements-cublas12.txt
@@ -1,7 +1,7 @@
 --extra-index-url https://download.pytorch.org/whl/cu121
-git+https://github.com/huggingface/diffusers
+diffusers==0.38.0
 opencv-python
-transformers
+transformers==4.57.6
 torchvision
 accelerate
 git+https://github.com/xhinker/sd_embed
@@ -10,9 +10,15 @@ sentencepiece
 torch
 ftfy
 optimum-quanto
-# TODO: re-add compel once it supports transformers >= 5.
-# Tracking: https://github.com/damian0815/compel/pull/129
-#           https://github.com/damian0815/compel/issues/128
-# compel currently pins transformers~=4.25, which forced pip into multi-hour
-# resolver backtracking storms in CI. backend.py imports it lazily and gates
-# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
+# diffusers and transformers are pinned together on purpose. transformers v5
+# restructured CLIPTextModel and dropped the `.text_model` attribute, which
+# breaks single-file Stable Diffusion loading on every released diffusers
+# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
+# main via git froze whichever broken pair existed at image-build time. Pin the
+# last known-good released pair so builds are reproducible and can't drift into
+# the broken window. See https://github.com/mudler/LocalAI/issues/9979
+#
+# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
+# with this pin and previously forced pip into multi-hour resolver backtracking
+# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
+# the import succeeding, so dropping it here is safe.
--- a/backend/python/diffusers/requirements-cublas13.txt
+++ b/backend/python/diffusers/requirements-cublas13.txt
@@ -1,7 +1,7 @@
 --extra-index-url https://download.pytorch.org/whl/cu130
-git+https://github.com/huggingface/diffusers
+diffusers==0.38.0
 opencv-python
-transformers
+transformers==4.57.6
 torchvision
 accelerate
 git+https://github.com/xhinker/sd_embed
@@ -10,9 +10,15 @@ sentencepiece
 torch
 ftfy
 optimum-quanto
-# TODO: re-add compel once it supports transformers >= 5.
-# Tracking: https://github.com/damian0815/compel/pull/129
-#           https://github.com/damian0815/compel/issues/128
-# compel currently pins transformers~=4.25, which forced pip into multi-hour
-# resolver backtracking storms in CI. backend.py imports it lazily and gates
-# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
+# diffusers and transformers are pinned together on purpose. transformers v5
+# restructured CLIPTextModel and dropped the `.text_model` attribute, which
+# breaks single-file Stable Diffusion loading on every released diffusers
+# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
+# main via git froze whichever broken pair existed at image-build time. Pin the
+# last known-good released pair so builds are reproducible and can't drift into
+# the broken window. See https://github.com/mudler/LocalAI/issues/9979
+#
+# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
+# with this pin and previously forced pip into multi-hour resolver backtracking
+# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
+# the import succeeding, so dropping it here is safe.
--- a/backend/python/diffusers/requirements-hipblas.txt
+++ b/backend/python/diffusers/requirements-hipblas.txt
@@ -1,17 +1,23 @@
 --extra-index-url https://download.pytorch.org/whl/rocm7.0
 torch==2.10.0+rocm7.0
 torchvision==0.25.0+rocm7.0
-git+https://github.com/huggingface/diffusers
+diffusers==0.38.0
 opencv-python
-transformers
+transformers==4.57.6
 accelerate
 peft
 sentencepiece
 optimum-quanto
 ftfy
-# TODO: re-add compel once it supports transformers >= 5.
-# Tracking: https://github.com/damian0815/compel/pull/129
-#           https://github.com/damian0815/compel/issues/128
-# compel currently pins transformers~=4.25, which forced pip into multi-hour
-# resolver backtracking storms in CI. backend.py imports it lazily and gates
-# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
+# diffusers and transformers are pinned together on purpose. transformers v5
+# restructured CLIPTextModel and dropped the `.text_model` attribute, which
+# breaks single-file Stable Diffusion loading on every released diffusers
+# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
+# main via git froze whichever broken pair existed at image-build time. Pin the
+# last known-good released pair so builds are reproducible and can't drift into
+# the broken window. See https://github.com/mudler/LocalAI/issues/9979
+#
+# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
+# with this pin and previously forced pip into multi-hour resolver backtracking
+# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
+# the import succeeding, so dropping it here is safe.
--- a/backend/python/diffusers/requirements-intel.txt
+++ b/backend/python/diffusers/requirements-intel.txt
@@ -3,18 +3,24 @@ torch
 torchvision
 optimum[openvino]
 setuptools
-git+https://github.com/huggingface/diffusers
+diffusers==0.38.0
 opencv-python
-transformers
+transformers==4.57.6
 accelerate
 git+https://github.com/xhinker/sd_embed
 peft
 sentencepiece
 optimum-quanto
 ftfy
-# TODO: re-add compel once it supports transformers >= 5.
-# Tracking: https://github.com/damian0815/compel/pull/129
-#           https://github.com/damian0815/compel/issues/128
-# compel currently pins transformers~=4.25, which forced pip into multi-hour
-# resolver backtracking storms in CI. backend.py imports it lazily and gates
-# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
+# diffusers and transformers are pinned together on purpose. transformers v5
+# restructured CLIPTextModel and dropped the `.text_model` attribute, which
+# breaks single-file Stable Diffusion loading on every released diffusers
+# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
+# main via git froze whichever broken pair existed at image-build time. Pin the
+# last known-good released pair so builds are reproducible and can't drift into
+# the broken window. See https://github.com/mudler/LocalAI/issues/9979
+#
+# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
+# with this pin and previously forced pip into multi-hour resolver backtracking
+# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
+# the import succeeding, so dropping it here is safe.
--- a/backend/python/diffusers/requirements-l4t12.txt
+++ b/backend/python/diffusers/requirements-l4t12.txt
@@ -1,7 +1,7 @@
 --extra-index-url https://pypi.jetson-ai-lab.io/jp6/cu129/
 torch
-git+https://github.com/huggingface/diffusers
-transformers
+diffusers==0.38.0
+transformers==4.57.6
 accelerate
 peft
 optimum-quanto
@@ -9,9 +9,15 @@ numpy<2
 sentencepiece
 torchvision
 ftfy
-# TODO: re-add compel once it supports transformers >= 5.
-# Tracking: https://github.com/damian0815/compel/pull/129
-#           https://github.com/damian0815/compel/issues/128
-# compel currently pins transformers~=4.25, which forced pip into multi-hour
-# resolver backtracking storms in CI. backend.py imports it lazily and gates
-# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
+# diffusers and transformers are pinned together on purpose. transformers v5
+# restructured CLIPTextModel and dropped the `.text_model` attribute, which
+# breaks single-file Stable Diffusion loading on every released diffusers
+# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
+# main via git froze whichever broken pair existed at image-build time. Pin the
+# last known-good released pair so builds are reproducible and can't drift into
+# the broken window. See https://github.com/mudler/LocalAI/issues/9979
+#
+# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
+# with this pin and previously forced pip into multi-hour resolver backtracking
+# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
+# the import succeeding, so dropping it here is safe.
--- a/backend/python/diffusers/requirements-l4t13.txt
+++ b/backend/python/diffusers/requirements-l4t13.txt
@@ -1,7 +1,7 @@
 --extra-index-url https://download.pytorch.org/whl/cu130
 torch
-git+https://github.com/huggingface/diffusers
-transformers
+diffusers==0.38.0
+transformers==4.57.6
 accelerate
 peft
 optimum-quanto
@@ -10,9 +10,15 @@ sentencepiece
 torchvision
 ftfy
 chardet
-# TODO: re-add compel once it supports transformers >= 5.
-# Tracking: https://github.com/damian0815/compel/pull/129
-#           https://github.com/damian0815/compel/issues/128
-# compel currently pins transformers~=4.25, which forced pip into multi-hour
-# resolver backtracking storms in CI. backend.py imports it lazily and gates
-# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
+# diffusers and transformers are pinned together on purpose. transformers v5
+# restructured CLIPTextModel and dropped the `.text_model` attribute, which
+# breaks single-file Stable Diffusion loading on every released diffusers
+# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
+# main via git froze whichever broken pair existed at image-build time. Pin the
+# last known-good released pair so builds are reproducible and can't drift into
+# the broken window. See https://github.com/mudler/LocalAI/issues/9979
+#
+# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
+# with this pin and previously forced pip into multi-hour resolver backtracking
+# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
+# the import succeeding, so dropping it here is safe.
--- a/backend/python/diffusers/requirements-mps.txt
+++ b/backend/python/diffusers/requirements-mps.txt
@@ -1,16 +1,22 @@
 torch==2.7.1
 torchvision==0.22.1
-git+https://github.com/huggingface/diffusers
+diffusers==0.38.0
 opencv-python
-transformers
+transformers==4.57.6
 accelerate
 peft
 sentencepiece
 optimum-quanto
 ftfy
-# TODO: re-add compel once it supports transformers >= 5.
-# Tracking: https://github.com/damian0815/compel/pull/129
-#           https://github.com/damian0815/compel/issues/128
-# compel currently pins transformers~=4.25, which forced pip into multi-hour
-# resolver backtracking storms in CI. backend.py imports it lazily and gates
-# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
+# diffusers and transformers are pinned together on purpose. transformers v5
+# restructured CLIPTextModel and dropped the `.text_model` attribute, which
+# breaks single-file Stable Diffusion loading on every released diffusers
+# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
+# main via git froze whichever broken pair existed at image-build time. Pin the
+# last known-good released pair so builds are reproducible and can't drift into
+# the broken window. See https://github.com/mudler/LocalAI/issues/9979
+#
+# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
+# with this pin and previously forced pip into multi-hour resolver backtracking
+# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
+# the import succeeding, so dropping it here is safe.
--- a/core/application/application.go
+++ b/core/application/application.go
@@ -341,11 +341,9 @@ func (a *Application) ResolvePIIPolicy(cfg *config.ModelConfig) (enabled bool, d
 	}
 	appCfg := a.ApplicationConfig()

-	if cfg.PII.Enabled != nil {
-		enabled = *cfg.PII.Enabled
-	} else {
-		enabled = cfg.PIIIsEnabled() // backend default (cloud-proxy)
-	}
+	// PIIIsEnabled already encodes "explicit pii.enabled wins, else backend
+	// default (cloud-proxy)" — the single source of that rule.
+	enabled = cfg.PIIIsEnabled()
 	if !enabled {
 		return false, nil
 	}
@@ -354,7 +352,7 @@ func (a *Application) ResolvePIIPolicy(cfg *config.ModelConfig) (enabled bool, d
 	if len(detectors) == 0 {
 		detectors = append([]string(nil), appCfg.PIIDefaultDetectors...)
 	}
-	return enabled, detectors
+	return true, detectors // enabled is necessarily true past the !enabled guard
 }

 // PIIPolicyResolver adapts ResolvePIIPolicy to pii.PolicyResolver for
--- a/core/application/distributed.go
+++ b/core/application/distributed.go
@@ -357,6 +357,15 @@ func initDistributed(cfg *config.ApplicationConfig, authDB *gorm.DB, configLoade
 		Pressure:         pressure,
 	})

+	// Wire staging-progress broadcasting so file-staging shows up on every
+	// replica, not just the one performing the transfer. Without this, a
+	// /api/operations poll that round-robins onto a peer sees no staging row and
+	// the progress flickers. The origin publishes; peers mirror via the wildcard.
+	router.StagingTracker().SetPublisher(natsClient)
+	if _, err := router.StagingTracker().SubscribeBroadcasts(natsClient); err != nil {
+		xlog.Warn("Failed to subscribe to staging progress broadcasts", "error", err)
+	}
+
 	// Create ReplicaReconciler for auto-scaling model replicas. Adapter +
 	// RegistrationToken feed the state-reconciliation passes: pending op
 	// drain uses the adapter, and model health probes use the token to auth
--- a/core/config/backend_capabilities.go
+++ b/core/config/backend_capabilities.go
@@ -542,19 +542,6 @@ var BackendCapabilities = map[string]BackendCapability{
 		DefaultUsecases:  []string{UsecaseSpeakerRecognition},
 		Description:      "Speaker recognition — voice identity verification and analysis",
 	},
-	"voice-detect": {
-		GRPCMethods:      []GRPCMethod{MethodVoiceVerify, MethodVoiceEmbed, MethodVoiceAnalyze},
-		PossibleUsecases: []string{UsecaseSpeakerRecognition},
-		DefaultUsecases:  []string{UsecaseSpeakerRecognition},
-		Description:      "voice-detect.cpp: C++/ggml speaker embedding, verification and voice analysis (age/gender/emotion)",
-	},
-	"face-detect": {
-		GRPCMethods:      []GRPCMethod{MethodEmbedding, MethodDetect, MethodFaceVerify, MethodFaceAnalyze},
-		PossibleUsecases: []string{UsecaseEmbeddings, UsecaseDetection, UsecaseFaceRecognition},
-		DefaultUsecases:  []string{UsecaseFaceRecognition},
-		AcceptsImages:    true,
-		Description:      "face-detect.cpp: C++/ggml face detection, embedding, verification and attribute analysis",
-	},
 	"silero-vad": {
 		GRPCMethods:      []GRPCMethod{MethodVAD},
 		PossibleUsecases: []string{UsecaseVAD},
--- a/core/config/meta/registry.go
+++ b/core/config/meta/registry.go
@@ -537,6 +537,36 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Component:   "number",
 			Order:       79,
 		},
+		"pipeline.compaction.enabled": {
+			Section:     "pipeline",
+			Label:       "Compaction Enabled",
+			Description: "Fold conversation items that age out of the live window (Max History Items) into a rolling summary instead of dropping them, so long realtime sessions stay cheap without losing earlier context. Off by default.",
+			Component:   "toggle",
+			Order:       80,
+		},
+		"pipeline.compaction.trigger_items": {
+			Section:     "pipeline",
+			Label:       "Compaction Trigger Items",
+			Description: "High-water mark: once the live conversation exceeds this many items, the overflow above Max History Items is summarized and evicted. Must be greater than Max History Items; defaults to twice it. The gap controls how often summarization runs.",
+			Component:   "number",
+			Order:       81,
+		},
+		"pipeline.compaction.summary_model": {
+			Section:     "pipeline",
+			Label:       "Compaction Summary Model",
+			Description: "Optional smaller/cheaper model used to produce the rolling summary. Empty reuses the pipeline's own LLM. On CPU, a tiny model here keeps compaction from competing with the conversation LLM.",
+			Component:   "input",
+			Advanced:    true,
+			Order:       82,
+		},
+		"pipeline.compaction.max_summary_tokens": {
+			Section:     "pipeline",
+			Label:       "Compaction Max Summary Tokens",
+			Description: "Advisory cap on the rolling summary length (fed to the summarizer prompt). Defaults to 512.",
+			Component:   "number",
+			Advanced:    true,
+			Order:       83,
+		},

 		// --- Functions ---
 		"function.grammar.parallel_calls": {
--- a/core/config/model_config.go
+++ b/core/config/model_config.go
@@ -641,11 +641,32 @@ type Pipeline struct {
 	// context fills.
 	MaxHistoryItems *int `yaml:"max_history_items,omitempty" json:"max_history_items,omitempty"`

+	// Compaction folds conversation items that age out of the live window
+	// (max_history_items) into a rolling summary instead of dropping them, so
+	// long realtime sessions stay cheap without losing earlier context. Nil
+	// (block absent) means disabled, preserving existing behavior.
+	Compaction *PipelineCompaction `yaml:"compaction,omitempty" json:"compaction,omitempty"`
+
 	// VoiceRecognition gates the pipeline behind speaker verification. Nil
 	// (block absent) means no gate, preserving existing behavior.
 	VoiceRecognition *PipelineVoiceRecognition `yaml:"voice_recognition,omitempty" json:"voice_recognition,omitempty"`
 }

+// PipelineCompaction configures summarize-then-drop for a realtime pipeline.
+// Every field is optional: the realtime endpoint's resolveCompaction applies
+// the defaults described on each field when it reads this block.
+type PipelineCompaction struct {
+	// Enabled turns summarize-then-drop on. Default false.
+	Enabled bool `yaml:"enabled,omitempty" json:"enabled,omitempty"`
+	// TriggerItems is the high-water mark: once live items exceed it, overflow
+	// above max_history_items is summarized and evicted. Must exceed
+	// max_history_items; clamped up (to max_history_items + 1) if not.
+	// Default: 2x max_history_items; zero or a negative value selects it.
+	TriggerItems int `yaml:"trigger_items,omitempty" json:"trigger_items,omitempty"`
+	// SummaryModel optionally names a smaller/cheaper model for the summary
+	// call. Empty uses the pipeline's own LLM.
+	SummaryModel string `yaml:"summary_model,omitempty" json:"summary_model,omitempty"`
+	// MaxSummaryTokens advises the summary length (fed to the prompt). Default
+	// 512; zero or a negative value selects the default.
+	MaxSummaryTokens int `yaml:"max_summary_tokens,omitempty" json:"max_summary_tokens,omitempty"`
+}
+
 // ApplyReasoningEffort resolves the effective reasoning effort — a per-request
 // value (requestEffort) overrides the config's own ReasoningEffort default —
 // stores it on the config so gRPCPredictOpts forwards it to the backend as the
--- a/core/http/endpoints/localai/nodes.go
+++ b/core/http/endpoints/localai/nodes.go
@@ -385,6 +385,23 @@ func GetNodeModelsEndpoint(registry *nodes.NodeRegistry) echo.HandlerFunc {
 	}
 }

+// ListAllNodeModelsEndpoint returns all loaded models across all healthy nodes.
+// @Summary List all loaded models cluster-wide
+// @Tags Nodes
+// @Success 200 {array} nodes.NodeModel
+// @Router /api/nodes/models [get]
+func ListAllNodeModelsEndpoint(registry *nodes.NodeRegistry) echo.HandlerFunc {
+	return func(c echo.Context) error {
+		loaded, listErr := registry.ListAllLoadedModels(c.Request().Context())
+		if listErr == nil {
+			return c.JSON(http.StatusOK, loaded)
+		}
+		// Keep the underlying cause in the server log; the client only
+		// receives a generic 500 payload.
+		xlog.Error("Failed to list all node models", "error", listErr)
+		const status = http.StatusInternalServerError
+		return c.JSON(status, nodeError(status, "failed to list node models"))
+	}
+}
+
 // DrainNodeEndpoint sets a node to draining status (no new requests).
 func DrainNodeEndpoint(registry *nodes.NodeRegistry) echo.HandlerFunc {
 	return func(c echo.Context) error {
--- a/core/http/endpoints/localai/nodes_test.go
+++ b/core/http/endpoints/localai/nodes_test.go
@@ -407,4 +407,44 @@ var _ = Describe("Node HTTP handlers", func() {
 			Expect(names).To(ConsistOf("alpha", "beta"))
 		})
 	})
+
+	// Exercises ListAllNodeModelsEndpoint directly against the suite's shared
+	// registry, bypassing the router: each spec builds a bare echo context,
+	// invokes the handler, and decodes the JSON body.
+	Describe("ListAllNodeModelsEndpoint", func() {
+		It("returns an empty list when no models are loaded", func() {
+			e := echo.New()
+			req := httptest.NewRequest(http.MethodGet, "/", nil)
+			rec := httptest.NewRecorder()
+			c := e.NewContext(req, rec)
+
+			handler := ListAllNodeModelsEndpoint(registry)
+			Expect(handler(c)).To(Succeed())
+			Expect(rec.Code).To(Equal(http.StatusOK))
+
+			// BeEmpty also accepts a nil slice, so this passes whether the
+			// body is `[]` or `null`.
+			var list []nodes.NodeModel
+			Expect(json.Unmarshal(rec.Body.Bytes(), &list)).To(Succeed())
+			Expect(list).To(BeEmpty())
+		})
+
+		It("returns loaded models across healthy nodes", func() {
+			// Arrange: one healthy node carrying a single model in the
+			// "loaded" state.
+			ctx := context.Background()
+			Expect(registry.Register(ctx, &nodes.BackendNode{
+				ID: "n1", Name: "alpha", Address: "10.0.0.1:50051", Status: nodes.StatusHealthy,
+			}, true)).To(Succeed())
+			Expect(registry.SetNodeModel(ctx, "n1", "llama-3.3", 0, "loaded", "10.0.0.1:50051", 0)).To(Succeed())
+
+			e := echo.New()
+			req := httptest.NewRequest(http.MethodGet, "/", nil)
+			rec := httptest.NewRecorder()
+			c := e.NewContext(req, rec)
+
+			handler := ListAllNodeModelsEndpoint(registry)
+			Expect(handler(c)).To(Succeed())
+			Expect(rec.Code).To(Equal(http.StatusOK))
+
+			// Assert: exactly that model comes back, attributed to its node.
+			var list []nodes.NodeModel
+			Expect(json.Unmarshal(rec.Body.Bytes(), &list)).To(Succeed())
+			Expect(list).To(HaveLen(1))
+			Expect(list[0].ModelName).To(Equal("llama-3.3"))
+			Expect(list[0].NodeID).To(Equal("n1"))
+		})
+	})
 })
--- a/core/http/endpoints/openai/realtime.go
+++ b/core/http/endpoints/openai/realtime.go
@@ -12,6 +12,7 @@ import (
 	"os"
 	"strconv"
 	"sync"
+	"sync/atomic"
 	"time"

 	"net/http"
@@ -134,6 +135,18 @@ type Session struct {
 	// pairs are kept together so we never feed an orphaned tool result.
 	MaxHistoryItems int

+	// Compaction settings resolved from pipeline.compaction (see resolveCompaction).
+	CompactionEnabled bool
+	CompactionTrigger int
+	SummaryModel      string
+	MaxSummaryTokens  int
+
+	// summarizerFactory lazily builds the model used for compaction summaries
+	// when summary_model is configured; nil means reuse the pipeline LLM.
+	summarizerFactory func() (Model, error)
+	summarizerOnce    sync.Once
+	summarizerCached  Model
+
 	// AssistantExecutor is non-nil when the session opted into the in-process
 	// LocalAI Assistant tool surface. Tool calls whose name matches this
 	// executor's catalog are run inproc and their output is fed back to the
@@ -241,6 +254,12 @@ type Conversation struct {
 	ID    string
 	Items []*types.MessageItemUnion
 	Lock  sync.Mutex
+	// Memory is the rolling summary of items already evicted by compaction. It
+	// is kept out of Items (so trimRealtimeItems never drops it) and rendered
+	// as a system message right after the session instructions.
+	Memory string
+	// compacting ensures at most one background compaction runs per conversation.
+	compacting atomic.Bool
 }

 func (c *Conversation) ToServer() types.Conversation {
@@ -540,13 +559,12 @@ func runRealtimeSession(application *application.Application, t Transport, model
 		SoundDetectionWindowMs:  cfg.Pipeline.SoundDetectionWindowMs,
 		SoundDetectionHopMs:     cfg.Pipeline.SoundDetectionHopMs,
 	}
+	session.CompactionEnabled, session.CompactionTrigger, session.MaxSummaryTokens, session.SummaryModel = resolveCompaction(cfg, session.MaxHistoryItems)

 	// Create a default conversation
 	conversationID := generateConversationID()
 	conversation := &Conversation{
-		ID: conversationID,
-		// TODO: We need to truncate the conversation items when a new item is added and we have run out of space. There are multiple places where items
-		//       can be added so we could use a datastructure here that enforces truncation upon addition
+		ID:    conversationID,
 		Items: []*types.MessageItemUnion{},
 	}
 	session.Conversations[conversationID] = conversation
@@ -577,6 +595,18 @@ func runRealtimeSession(application *application.Application, t Transport, model
 	}
 	session.ModelInterface = m

+	if session.SummaryModel != "" {
+		summaryModelName := session.SummaryModel
+		sid := sessionID
+		session.summarizerFactory = func() (Model, error) {
+			summaryCfg, lerr := application.ModelConfigLoader().LoadModelConfigFileByNameDefaultOptions(summaryModelName, application.ApplicationConfig())
+			if lerr != nil {
+				return nil, fmt.Errorf("load summary model config %q: %w", summaryModelName, lerr)
+			}
+			return newModel(&summaryCfg.Pipeline, application.ModelConfigLoader(), application.ModelLoader(), application.ApplicationConfig(), evaluator, buildRealtimeRoutingContext(application, sid))
+		}
+	}
+
 	if cfg.Pipeline.VoiceGateEnabled() {
 		gate, gerr := newVoiceGate(
 			*cfg.Pipeline.VoiceRecognition,
@@ -807,6 +837,15 @@ func runRealtimeSession(application *application.Application, t Transport, model
 				commitUtterance(respCtx, allAudio, session, conversation, t)
 			}()

+		case types.InputAudioBufferClearEvent:
+			xlog.Debug("recv", "message", string(msg))
+			// Discard a partially-captured utterance so the client can restart
+			// input cleanly without the stale buffer leaking into the next commit.
+			clearInputAudio(session)
+			sendEvent(t, types.InputAudioBufferClearedEvent{
+				ServerEventBase: types.ServerEventBase{EventID: e.EventID},
+			})
+
 		case types.ConversationItemCreateEvent:
 			xlog.Debug("recv", "message", string(msg))
 			// Add the item to the conversation
@@ -841,7 +880,39 @@ func runRealtimeSession(application *application.Application, t Transport, model
 			})

 		case types.ConversationItemDeleteEvent:
-			sendError(t, "not_implemented", "Deleting items not implemented", "", "event_TODO")
+			xlog.Debug("recv", "message", string(msg))
+			if e.ItemID == "" {
+				sendError(t, "invalid_item_id", "Need item_id, but none specified", "", "event_TODO")
+				continue
+			}
+			conversation.Lock.Lock()
+			updated, ok := deleteItem(conversation.Items, e.ItemID)
+			conversation.Items = updated
+			conversation.Lock.Unlock()
+			if !ok {
+				sendError(t, "invalid_item_id", "Item to delete not found", "", "event_TODO")
+				continue
+			}
+			sendEvent(t, types.ConversationItemDeletedEvent{
+				ServerEventBase: types.ServerEventBase{EventID: e.EventID},
+				ItemID:          e.ItemID,
+			})
+
+		case types.ConversationItemTruncateEvent:
+			xlog.Debug("recv", "message", string(msg))
+			conversation.Lock.Lock()
+			ok := truncateAssistantText(conversation.Items, e.ItemID, e.ContentIndex)
+			conversation.Lock.Unlock()
+			if !ok {
+				sendError(t, "invalid_item_id", "Item to truncate not found", "", "event_TODO")
+				continue
+			}
+			sendEvent(t, types.ConversationItemTruncatedEvent{
+				ServerEventBase: types.ServerEventBase{EventID: e.EventID},
+				ItemID:          e.ItemID,
+				ContentIndex:    e.ContentIndex,
+				AudioEndMs:      e.AudioEndMs,
+			})

 		case types.ConversationItemRetrieveEvent:
 			xlog.Debug("recv", "message", string(msg))
@@ -854,21 +925,7 @@ func runRealtimeSession(application *application.Application, t Transport, model
 			conversation.Lock.Lock()
 			var retrievedItem types.MessageItemUnion
 			for _, item := range conversation.Items {
-				// We need to check ID in the union
-				var id string
-				if item.System != nil {
-					id = item.System.ID
-				} else if item.User != nil {
-					id = item.User.ID
-				} else if item.Assistant != nil {
-					id = item.Assistant.ID
-				} else if item.FunctionCall != nil {
-					id = item.FunctionCall.ID
-				} else if item.FunctionCallOutput != nil {
-					id = item.FunctionCallOutput.ID
-				}
-
-				if id == e.ItemID {
+				if itemID(item) == e.ItemID {
 					retrievedItem = *item
 					break
 				}
@@ -1666,6 +1723,9 @@ const maxAssistantToolTurns = 10

 func triggerResponse(ctx context.Context, session *Session, conv *Conversation, t Transport, overrides *types.ResponseCreateParams) {
 	triggerResponseAtTurn(ctx, session, conv, t, overrides, 0)
+	// Fold aged-out turns into the rolling memory off the critical path; the
+	// next turn reaps the smaller buffer.
+	session.maybeCompact(conv)
 }

 func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversation, t Transport, overrides *types.ResponseCreateParams, toolTurn int) {
@@ -1721,6 +1781,7 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
 	var lastUserSpeaker *types.Speaker
 	personalize := session.voiceGate != nil && session.voiceGate.cfg.PersonalizeEnabled()
 	conv.Lock.Lock()
+	conversationHistory = withMemory(conversationHistory, conv.Memory)
 	items := trimRealtimeItems(conv.Items, session.MaxHistoryItems)
 	for _, item := range items {
 		if item.User != nil {
--- a/core/http/endpoints/openai/realtime_compaction.go
+++ b/core/http/endpoints/openai/realtime_compaction.go
@@ -0,0 +1,326 @@
+package openai
+
+import (
+	"context"
+	"fmt"
+	"strings"
+	"time"
+
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
+	"github.com/mudler/LocalAI/core/schema"
+	"github.com/mudler/LocalAI/pkg/reasoning"
+	"github.com/mudler/xlog"
+)
+
+const (
+	// defaultMaxSummaryTokens is the advisory summary length used by
+	// resolveCompaction when max_summary_tokens is unset or non-positive.
+	defaultMaxSummaryTokens = 512
+	// memoryPrefix is prepended by withMemory to the rolling summary so the
+	// resulting system message is labelled as earlier context.
+	memoryPrefix            = "Summary of earlier conversation:\n"
+	// compactionTimeout bounds the summarizer call so a stuck model can't pin the
+	// compacting flag (and thus block all further compaction) forever.
+	compactionTimeout = 60 * time.Second
+)
+
+// withMemory appends the rolling summary to history as a system message, so it
+// lands right after whatever (instructions) entries history already holds. An
+// empty memory returns history untouched.
+func withMemory(history schema.Messages, memory string) schema.Messages {
+	if len(memory) == 0 {
+		return history
+	}
+	// The same text is stored in both content fields of the message.
+	summaryMsg := schema.Message{Role: string(types.MessageRoleSystem)}
+	summaryMsg.StringContent = memoryPrefix + memory
+	summaryMsg.Content = summaryMsg.StringContent
+	return append(history, summaryMsg)
+}
+
+// renderItemsTranscript renders conversation items as a plain "role: text"
+// transcript for summarization. Non-text items (bare tool calls) are labelled
+// so the summarizer keeps track of actions taken.
+//
+// Each user/assistant item becomes one line. Its non-empty content fragments
+// (both .Text and .Transcript of every content part) are joined with a single
+// space so words from adjacent parts never run together. Nil items and item
+// kinds not handled below (e.g. system items) are skipped. The result has
+// leading and trailing whitespace trimmed.
+func renderItemsTranscript(items []*types.MessageItemUnion) string {
+	var b strings.Builder
+	// writeTurn emits one "role: fragment fragment ...\n" line.
+	writeTurn := func(role string, fragments []string) {
+		b.WriteString(role)
+		b.WriteString(": ")
+		b.WriteString(strings.Join(fragments, " "))
+		b.WriteString("\n")
+	}
+	// keepNonEmpty appends only the non-empty fields, preserving their order.
+	keepNonEmpty := func(fragments []string, fields ...string) []string {
+		for _, f := range fields {
+			if f != "" {
+				fragments = append(fragments, f)
+			}
+		}
+		return fragments
+	}
+	for _, item := range items {
+		switch {
+		case item == nil:
+			// Tolerate holes in the slice, matching itemID's nil handling.
+		case item.User != nil:
+			var fragments []string
+			for _, c := range item.User.Content {
+				fragments = keepNonEmpty(fragments, c.Text, c.Transcript)
+			}
+			writeTurn("user", fragments)
+		case item.Assistant != nil:
+			// Realtime assistant *audio* turns store the spoken words in
+			// .Transcript (not .Text), so emit both or spoken turns are dropped.
+			var fragments []string
+			for _, c := range item.Assistant.Content {
+				fragments = keepNonEmpty(fragments, c.Text, c.Transcript)
+			}
+			writeTurn("assistant", fragments)
+		case item.FunctionCall != nil:
+			fmt.Fprintf(&b, "assistant called tool %s(%s)\n", item.FunctionCall.Name, item.FunctionCall.Arguments)
+		case item.FunctionCallOutput != nil:
+			fmt.Fprintf(&b, "tool result: %s\n", item.FunctionCallOutput.Output)
+		}
+	}
+	return strings.TrimSpace(b.String())
+}
+
+// buildSummaryMessages assembles the two-message prompt for the summarizer
+// LLM: a system instruction, then a user message holding the prior memory (when
+// there is one) followed by the new transcript to merge in. maxTokens is only
+// interpolated into the instruction as a length hint; nothing enforces it.
+func buildSummaryMessages(priorMemory, transcript string, maxTokens int) schema.Messages {
+	const instructionFormat = "You maintain a running memory of a live voice conversation. " +
+		"Merge the prior memory with the new exchanges into an updated memory. " +
+		"Keep names, decisions, facts, preferences, and open threads. Be concise " +
+		"(under ~%d tokens). Output only the updated memory, with no reasoning or tags."
+	instruction := fmt.Sprintf(instructionFormat, maxTokens)
+
+	request := "New exchanges to fold in:\n" + transcript
+	if priorMemory != "" {
+		request = "Prior memory:\n" + priorMemory + "\n\n" + request
+	}
+
+	systemMsg := schema.Message{Role: string(types.MessageRoleSystem), StringContent: instruction, Content: instruction}
+	userMsg := schema.Message{Role: string(types.MessageRoleUser), StringContent: request, Content: request}
+	return schema.Messages{systemMsg, userMsg}
+}
+
+// clearInputAudio drops everything buffered for the utterance in progress:
+// first the raw PCM input buffer, then the buffered Opus frames. It backs the
+// input_audio_buffer.clear realtime event, letting a client abandon a
+// partially-captured utterance. Each buffer is reset under its own lock and
+// the two locks are never held at the same time.
+func clearInputAudio(s *Session) {
+	func() {
+		s.AudioBufferLock.Lock()
+		defer s.AudioBufferLock.Unlock()
+		s.InputAudioBuffer = nil
+	}()
+	func() {
+		s.OpusFramesLock.Lock()
+		defer s.OpusFramesLock.Unlock()
+		s.OpusFrames = nil
+	}()
+}
+
+// itemID returns the id of whichever variant of the union is populated. The
+// variants are probed in a fixed order (system, user, assistant, function
+// call, function call output); a nil item or an empty union yields "".
+func itemID(item *types.MessageItemUnion) string {
+	if item == nil {
+		return ""
+	}
+	if sys := item.System; sys != nil {
+		return sys.ID
+	}
+	if usr := item.User; usr != nil {
+		return usr.ID
+	}
+	if asst := item.Assistant; asst != nil {
+		return asst.ID
+	}
+	if call := item.FunctionCall; call != nil {
+		return call.ID
+	}
+	if out := item.FunctionCallOutput; out != nil {
+		return out.ID
+	}
+	return ""
+}
+
+// deleteItem drops the first entry whose id equals id and reports whether one
+// was found. On a miss the input slice is returned as-is together with false.
+// On a hit the elements of the input slice are left untouched: the
+// capacity-limited prefix forces append to allocate instead of shifting the
+// tail in place.
+func deleteItem(items []*types.MessageItemUnion, id string) ([]*types.MessageItemUnion, bool) {
+	pos := -1
+	for idx := range items {
+		if itemID(items[idx]) == id {
+			pos = idx
+			break
+		}
+	}
+	if pos < 0 {
+		return items, false
+	}
+	head, tail := items[:pos:pos], items[pos+1:]
+	return append(head, tail...), true
+}
+
+// truncateAssistantText blanks the content part at contentIndex of the
+// assistant item identified by id. It is a minimal truncate, meant for
+// discarding the tail of an interrupted/barge-in response. Both .Text and
+// .Transcript are wiped, since realtime audio turns keep the spoken words in
+// .Transcript and clearing .Text alone would do nothing.
+//
+// It returns true as soon as a matching assistant item is found, even when
+// contentIndex is out of range (in which case nothing is cleared), and false
+// when no assistant item carries that id.
+func truncateAssistantText(items []*types.MessageItemUnion, id string, contentIndex int) bool {
+	for _, entry := range items {
+		if itemID(entry) == id && entry.Assistant != nil {
+			parts := entry.Assistant.Content
+			if inRange := contentIndex >= 0 && contentIndex < len(parts); inRange {
+				parts[contentIndex].Text = ""
+				parts[contentIndex].Transcript = ""
+			}
+			return true
+		}
+	}
+	return false
+}
+
+// compactionCut picks the boundary between the overflow to summarize and evict
+// (items[:cut]) and the live tail to keep (items[cut:]), aiming to keep the
+// last `keep` items. Like trimRealtimeItems it is pair-safe: while the first
+// kept item is a function_call_output the boundary moves left, so a
+// function_call and its output always end up together in the kept tail.
+// A result of 0 means there is nothing to cut.
+func compactionCut(items []*types.MessageItemUnion, keep int) int {
+	// A non-positive keep is the "unlimited history" sentinel (as in
+	// trimRealtimeItems), so nothing is ever evicted. Bailing out here also
+	// keeps the loop below from indexing items[len(items)]. A buffer that
+	// already fits in the window has no overflow either.
+	if keep <= 0 || len(items) <= keep {
+		return 0
+	}
+	boundary := len(items) - keep
+	for ; boundary > 0; boundary-- {
+		first := items[boundary]
+		if first == nil || first.FunctionCallOutput == nil {
+			break
+		}
+	}
+	return boundary
+}
+
+// resolveCompaction reads the pipeline.compaction block, applying defaults and
+// the trigger>max_history invariant. maxHistory is the already-resolved live
+// window size. Returns enabled=false (and zero values) when compaction is off.
+//
+// Compaction is also reported as off when maxHistory <= 0. That value is the
+// "unlimited history" sentinel, for which compactionCut never evicts anything;
+// leaving compaction enabled would schedule a no-op background pass after every
+// response and lazily load summary_model for nothing.
+//
+// Defaults: trigger is 2x maxHistory when trigger_items is unset or
+// non-positive, and is raised to maxHistory+1 if it does not exceed maxHistory;
+// maxSummaryTokens falls back to defaultMaxSummaryTokens when unset or
+// non-positive. summaryModel is passed through unchanged ("" means reuse the
+// pipeline LLM).
+func resolveCompaction(cfg *config.ModelConfig, maxHistory int) (enabled bool, trigger, maxSummaryTokens int, summaryModel string) {
+	if cfg == nil || cfg.Pipeline.Compaction == nil || !cfg.Pipeline.Compaction.Enabled {
+		return false, 0, 0, ""
+	}
+	if maxHistory <= 0 {
+		// No live-window cap means there is never any overflow to summarize.
+		return false, 0, 0, ""
+	}
+	c := cfg.Pipeline.Compaction
+	trigger = c.TriggerItems
+	if trigger <= 0 {
+		trigger = maxHistory * 2
+	}
+	if trigger <= maxHistory {
+		trigger = maxHistory + 1
+	}
+	maxSummaryTokens = c.MaxSummaryTokens
+	if maxSummaryTokens <= 0 {
+		maxSummaryTokens = defaultMaxSummaryTokens
+	}
+	return true, trigger, maxSummaryTokens, c.SummaryModel
+}
+
+// prefixMatches tells whether the first len(snapshot) entries of items carry
+// the same ids, in the same order, as snapshot. compact uses it to confirm
+// that the overflow it summarized still sits at the head of the conversation,
+// i.e. no concurrent client delete reshuffled it in the meantime.
+func prefixMatches(items, snapshot []*types.MessageItemUnion) bool {
+	if len(snapshot) > len(items) {
+		return false
+	}
+	for idx, expected := range snapshot {
+		if itemID(expected) != itemID(items[idx]) {
+			return false
+		}
+	}
+	return true
+}
+
+// compact folds overflow items into conv.Memory and evicts them. It never holds
+// conv.Lock across the summarizer call: snapshot under lock, summarize unlocked,
+// commit under lock (re-validating the head is unchanged). On any error it
+// leaves the conversation untouched — items are never dropped without a summary.
+//
+// model is the summarizer to call; a nil model makes this a no-op. The call is
+// synchronous (maybeCompact runs it on a background goroutine) and is bounded
+// by compactionTimeout.
+func (s *Session) compact(conv *Conversation, model Model) {
+	if model == nil {
+		return
+	}
+	// Snapshot.
+	conv.Lock.Lock()
+	// Re-check the trigger under the lock: the buffer may have shrunk since
+	// the caller decided to compact.
+	if len(conv.Items) <= s.CompactionTrigger {
+		conv.Lock.Unlock()
+		return
+	}
+	cut := compactionCut(conv.Items, s.MaxHistoryItems)
+	if cut <= 0 {
+		conv.Lock.Unlock()
+		return
+	}
+	// Copy the overflow so later changes to conv.Items cannot alter what is
+	// being summarized or what the commit step compares against.
+	overflow := append([]*types.MessageItemUnion(nil), conv.Items[:cut]...)
+	prior := conv.Memory
+	conv.Lock.Unlock()
+
+	// Summarize (unlocked).
+	msgs := buildSummaryMessages(prior, renderItemsTranscript(overflow), s.MaxSummaryTokens)
+	// The context is derived from Background, not from a request context, and
+	// is limited only by compactionTimeout.
+	ctx, cancel := context.WithTimeout(context.Background(), compactionTimeout)
+	defer cancel()
+	predFunc, err := model.Predict(ctx, msgs, nil, nil, nil, nil, nil, nil, nil, nil, nil)
+	if err != nil {
+		xlog.Warn("realtime compaction: summarizer predict failed", "error", err)
+		return
+	}
+	pred, err := predFunc()
+	if err != nil {
+		xlog.Warn("realtime compaction: summarizer inference failed", "error", err)
+		return
+	}
+	// Strip any leaked reasoning/thinking spans using the same extractor the
+	// rest of the realtime path uses, rather than a bespoke regex.
+	rcfg := reasoning.Config{}
+	if mc := model.PredictConfig(); mc != nil {
+		rcfg = spokenReasoningConfig(mc.ReasoningConfig)
+	}
+	_, summary := reasoning.ExtractReasoningComplete(pred.Response, "", rcfg)
+	summary = strings.TrimSpace(summary)
+	// An empty summary would lose the evicted turns entirely, so keep the
+	// items instead.
+	if summary == "" {
+		xlog.Warn("realtime compaction: empty summary, skipping eviction")
+		return
+	}
+
+	// Commit.
+	conv.Lock.Lock()
+	defer conv.Lock.Unlock()
+	if !prefixMatches(conv.Items, overflow) {
+		xlog.Debug("realtime compaction: head changed during summary, skipping")
+		return
+	}
+	// The prior memory was part of the summarizer prompt, so the new summary
+	// replaces it rather than being appended to it.
+	conv.Memory = summary
+	conv.Items = conv.Items[len(overflow):]
+	xlog.Debug("realtime compaction: evicted items into memory", "evicted", len(overflow), "remaining", len(conv.Items))
+}
+
+// summarizerModel picks the model that writes compaction summaries. With no
+// summary_model configured (or no factory installed) that is the pipeline LLM.
+// Otherwise the factory runs at most once per session and its result is
+// cached; if the load fails, the pipeline LLM is cached as the fallback.
+func (s *Session) summarizerModel() Model {
+	if s.SummaryModel == "" || s.summarizerFactory == nil {
+		return s.ModelInterface
+	}
+	s.summarizerOnce.Do(func() {
+		built, buildErr := s.summarizerFactory()
+		if buildErr == nil {
+			s.summarizerCached = built
+			return
+		}
+		xlog.Warn("realtime compaction: summary_model load failed, falling back to pipeline LLM", "model", s.SummaryModel, "error", buildErr)
+		s.summarizerCached = s.ModelInterface
+	})
+	return s.summarizerCached
+}
+
+// maybeCompact kicks off a background compaction when compaction is enabled,
+// the live buffer has grown past the trigger, and no other compaction is in
+// flight for this conversation. It never waits for the compaction to finish.
+func (s *Session) maybeCompact(conv *Conversation) {
+	if !s.CompactionEnabled {
+		return
+	}
+	conv.Lock.Lock()
+	buffered := len(conv.Items)
+	conv.Lock.Unlock()
+	if buffered <= s.CompactionTrigger {
+		return
+	}
+	// Claim the single per-conversation compaction slot; losing the race means
+	// a compaction is already running.
+	if claimed := conv.compacting.CompareAndSwap(false, true); !claimed {
+		return
+	}
+	go func() {
+		defer conv.compacting.Store(false)
+		// The summarizer is resolved here, on the background goroutine, so a
+		// lazily-loaded summary_model never delays a user turn.
+		if summarizer := s.summarizerModel(); summarizer != nil {
+			s.compact(conv, summarizer)
+		}
+	}()
+}
--- a/core/http/endpoints/openai/realtime_compaction_test.go
+++ b/core/http/endpoints/openai/realtime_compaction_test.go
@@ -0,0 +1,308 @@
+package openai
+
+import (
+	"errors"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	"github.com/mudler/LocalAI/core/backend"
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
+	"github.com/mudler/LocalAI/core/schema"
+)
+
+var _ = Describe("resolveCompaction", func() {
+	withBlock := func(block config.PipelineCompaction) *config.ModelConfig { // config holding only this compaction block
+		return &config.ModelConfig{Pipeline: config.Pipeline{Compaction: &block}}
+	}
+	It("disables when the block is absent", func() {
+		on, _, _, _ := resolveCompaction(&config.ModelConfig{}, 6)
+		Expect(on).To(BeFalse())
+	})
+
+	It("defaults trigger to 2x max history and tokens to 512", func() {
+		on, trigger, summaryTokens, _ := resolveCompaction(withBlock(config.PipelineCompaction{Enabled: true}), 6)
+		Expect(on).To(BeTrue())
+		Expect(trigger).To(Equal(12))
+		Expect(summaryTokens).To(Equal(512))
+	})
+
+	It("clamps trigger to max history + 1 when misconfigured", func() {
+		_, trigger, _, _ := resolveCompaction(withBlock(config.PipelineCompaction{Enabled: true, TriggerItems: 4}), 6)
+		Expect(trigger).To(Equal(7))
+	})
+
+	It("honors explicit values", func() {
+		explicit := config.PipelineCompaction{Enabled: true, TriggerItems: 20, MaxSummaryTokens: 256, SummaryModel: "tiny"}
+		on, trigger, summaryTokens, name := resolveCompaction(withBlock(explicit), 6)
+		Expect(on).To(BeTrue())
+		Expect(trigger).To(Equal(20))
+		Expect(summaryTokens).To(Equal(256))
+		Expect(name).To(Equal("tiny"))
+	})
+})
+
+var _ = Describe("deleteItem", func() {
+	mk := func(ids ...string) []*types.MessageItemUnion { // one user item per id, in order
+		out := make([]*types.MessageItemUnion, len(ids))
+		for i, id := range ids {
+			out[i] = &types.MessageItemUnion{User: &types.MessageItemUser{ID: id}}
+		}
+		return out
+	}
+
+	It("removes the item with the given id", func() {
+		items, ok := deleteItem(mk("a", "b", "c"), "b")
+		Expect(ok).To(BeTrue())
+		Expect(items).To(HaveLen(2)) // HaveLen prints the offending slice on failure
+		Expect(itemID(items[0])).To(Equal("a"))
+		Expect(itemID(items[1])).To(Equal("c"))
+	})
+
+	It("reports not found for an unknown id", func() {
+		_, ok := deleteItem(mk("a"), "zzz")
+		Expect(ok).To(BeFalse())
+	})
+})
+
+var _ = Describe("clearInputAudio", func() {
+	It("resets the pending PCM and buffered Opus frames", func() {
+		sess := &Session{InputAudioBuffer: []byte{1, 2, 3}, OpusFrames: [][]byte{{9}}} // both buffers start non-empty
+		clearInputAudio(sess)
+		Expect(sess.InputAudioBuffer).To(BeNil())
+		Expect(sess.OpusFrames).To(BeNil())
+	})
+})
+
+var _ = Describe("truncateAssistantText", func() {
+	It("clears the text of the assistant content part at the index", func() {
+		// A single assistant item whose only content part is plain text.
+		history := []*types.MessageItemUnion{{Assistant: &types.MessageItemAssistant{
+			ID:      "a1",
+			Content: []types.MessageContentOutput{{Type: types.MessageContentTypeText, Text: "hello world"}},
+		}}}
+		Expect(truncateAssistantText(history, "a1", 0)).To(BeTrue())
+		Expect(history[0].Assistant.Content[0].Text).To(BeEmpty())
+	})
+
+	// An assistant *audio* turn keeps its spoken words in .Transcript rather than
+	// .Text, so truncating on barge-in has to wipe .Transcript as well.
+	It("clears the transcript of an assistant audio content part", func() {
+		// Same shape as above, but the part is audio and carries a transcript.
+		history := []*types.MessageItemUnion{{Assistant: &types.MessageItemAssistant{
+			ID:      "a1",
+			Content: []types.MessageContentOutput{{Type: types.MessageContentTypeAudio, Transcript: "hello world"}},
+		}}}
+		Expect(truncateAssistantText(history, "a1", 0)).To(BeTrue())
+		Expect(history[0].Assistant.Content[0].Transcript).To(BeEmpty())
+	})
+
+	It("returns false for an unknown id", func() {
+		Expect(truncateAssistantText(nil, "nope", 0)).To(BeFalse())
+	})
+})
+
+var _ = Describe("compactionCut", func() {
+	user := func(id string) *types.MessageItemUnion { // user item with the given id
+		return &types.MessageItemUnion{User: &types.MessageItemUser{ID: id}}
+	}
+	call := func(id string) *types.MessageItemUnion { // function-call item with the given id
+		return &types.MessageItemUnion{FunctionCall: &types.MessageItemFunctionCall{ID: id}}
+	}
+	out := func(id string) *types.MessageItemUnion { // function-call-output item with the given id
+		return &types.MessageItemUnion{FunctionCallOutput: &types.MessageItemFunctionCallOutput{ID: id}}
+	}
+
+	It("cuts exactly len-keep when no pairs straddle the boundary", func() {
+		items := []*types.MessageItemUnion{user("1"), user("2"), user("3"), user("4")}
+		Expect(compactionCut(items, 2)).To(Equal(2))
+	})
+
+	It("returns 0 when nothing to cut", func() {
+		Expect(compactionCut([]*types.MessageItemUnion{user("1")}, 2)).To(Equal(0))
+	})
+
+	It("returns 0 (cuts nothing) when keep is 0 — the unlimited-window sentinel", func() {
+		items := []*types.MessageItemUnion{user("1"), user("2"), user("3")}
+		Expect(compactionCut(items, 0)).To(Equal(0))
+	})
+
+	It("moves the boundary so a call/output pair is not split", func() {
+		// keep=2 -> naive cut=2, but items[2] is the output of items[1]'s call;
+		// pull the cut left (2 -> 1) so the whole pair stays in the kept tail.
+		items := []*types.MessageItemUnion{user("1"), call("c"), out("c"), user("4")}
+		Expect(compactionCut(items, 2)).To(Equal(1))
+	})
+})
+
+var _ = Describe("withMemory", func() {
+	It("inserts a memory system message when memory is non-empty", func() {
+		base := schema.Messages{{Role: "system", StringContent: "instructions"}}
+		out := withMemory(base, "user is Bob; wants pizza")
+		Expect(out).To(HaveLen(2)) // same matcher as the no-op spec below; prints the messages on failure
+		Expect(out[1].Role).To(Equal("system"))
+		Expect(out[1].StringContent).To(ContainSubstring("user is Bob"))
+		Expect(out[1].StringContent).To(ContainSubstring("Summary of earlier conversation"))
+	})
+
+	It("is a no-op when memory is empty", func() {
+		base := schema.Messages{{Role: "system", StringContent: "instructions"}}
+		Expect(withMemory(base, "")).To(HaveLen(1))
+	})
+})
+
+var _ = Describe("renderItemsTranscript", func() {
+	It("renders user and assistant text turns", func() {
+		turns := []*types.MessageItemUnion{
+			{User: &types.MessageItemUser{Content: []types.MessageContentInput{{Type: types.MessageContentTypeInputText, Text: "hi"}}}},
+			{Assistant: &types.MessageItemAssistant{Content: []types.MessageContentOutput{{Type: types.MessageContentTypeText, Text: "hello"}}}},
+		}
+		Expect(renderItemsTranscript(turns)).To(And(
+			ContainSubstring("user: hi"),
+			ContainSubstring("assistant: hello"),
+		))
+	})
+
+	// Spoken assistant turns carry their words in .Transcript rather than .Text;
+	// the renderer has to emit those too or they never reach the summary.
+	It("renders an assistant audio turn from its transcript", func() {
+		spoken := []*types.MessageItemUnion{
+			{Assistant: &types.MessageItemAssistant{Content: []types.MessageContentOutput{{Type: types.MessageContentTypeAudio, Transcript: "spoken words"}}}},
+		}
+		Expect(renderItemsTranscript(spoken)).To(ContainSubstring("assistant: spoken words"))
+	})
+})
+
+var _ = Describe("buildSummaryMessages", func() {
+	It("includes prior memory and the new transcript", func() {
+		msgs := buildSummaryMessages("prior facts", "user: hi", 512)
+		Expect(msgs).To(HaveLen(2)) // HaveLen prints the messages on failure, unlike len()+Equal
+		Expect(msgs[0].Role).To(Equal("system"))
+		Expect(msgs[1].StringContent).To(ContainSubstring("prior facts"))
+		Expect(msgs[1].StringContent).To(ContainSubstring("user: hi"))
+	})
+})
+
+var _ = Describe("compact", func() {
+	user := func(id, text string) *types.MessageItemUnion { // user item with one input-text part
+		return &types.MessageItemUnion{User: &types.MessageItemUser{ID: id,
+			Content: []types.MessageContentInput{{Type: types.MessageContentTypeInputText, Text: text}}}}
+	}
+
+	It("summarizes overflow into Memory and evicts it, keeping the live tail", func() {
+		conv := &Conversation{Items: []*types.MessageItemUnion{
+			user("1", "a"), user("2", "b"), user("3", "c"), user("4", "d"),
+			user("5", "e"), user("6", "f"), user("7", "g"), user("8", "h"),
+		}}
+		s := &Session{CompactionEnabled: true, CompactionTrigger: 7, MaxHistoryItems: 4, MaxSummaryTokens: 512}
+		m := &fakeModel{predictResp: backend.LLMResponse{Response: "ROLLED UP"}}
+
+		s.compact(conv, m)
+
+		Expect(conv.Memory).To(Equal("ROLLED UP"))
+		Expect(conv.Items).To(HaveLen(4))
+		Expect(itemID(conv.Items[0])).To(Equal("5"))
+		// The summarizer saw the evicted turns. NOTE(review): "a" is a very loose probe — consider matching the rendered turn.
+		Expect(m.lastMessages[1].StringContent).To(ContainSubstring("a"))
+	})
+
+	It("leaves Items and Memory untouched when the summarizer errors", func() {
+		items := []*types.MessageItemUnion{user("1", "a"), user("2", "b"), user("3", "c")}
+		conv := &Conversation{Items: items}
+		s := &Session{CompactionEnabled: true, CompactionTrigger: 2, MaxHistoryItems: 1, MaxSummaryTokens: 512}
+		m := &fakeModel{predictErr: errors.New("boom")}
+
+		s.compact(conv, m)
+
+		Expect(conv.Memory).To(Equal(""))
+		Expect(conv.Items).To(HaveLen(3))
+	})
+
+	It("strips leaked reasoning tags from the summary via the shared extractor", func() {
+		conv := &Conversation{Items: []*types.MessageItemUnion{
+			user("1", "a"), user("2", "b"), user("3", "c"), user("4", "d"),
+			user("5", "e"), user("6", "f"), user("7", "g"), user("8", "h"),
+		}}
+		s := &Session{CompactionEnabled: true, CompactionTrigger: 7, MaxHistoryItems: 4, MaxSummaryTokens: 512}
+		m := &fakeModel{predictResp: backend.LLMResponse{Response: "<think>planning the summary</think>CLEAN SUMMARY"}}
+
+		s.compact(conv, m)
+
+		Expect(conv.Memory).To(Equal("CLEAN SUMMARY"))
+		Expect(conv.Memory).ToNot(ContainSubstring("planning"))
+	})
+
+	It("does nothing when items are at or below the trigger", func() {
+		conv := &Conversation{Items: []*types.MessageItemUnion{user("1", "a")}}
+		s := &Session{CompactionEnabled: true, CompactionTrigger: 7, MaxHistoryItems: 4}
+		s.compact(conv, &fakeModel{predictResp: backend.LLMResponse{Response: "x"}})
+		Expect(conv.Memory).To(Equal(""))
+		Expect(conv.Items).To(HaveLen(1))
+	})
+})
+
+var _ = Describe("prefixMatches", func() {
+	// seq builds a slice of user items carrying the given ids, in order.
+	seq := func(ids ...string) []*types.MessageItemUnion {
+		list := make([]*types.MessageItemUnion, 0, len(ids))
+		for _, id := range ids {
+			list = append(list, &types.MessageItemUnion{User: &types.MessageItemUser{ID: id}})
+		}
+		return list
+	}
+
+	It("matches when items begins with the snapshot ids in order", func() {
+		Expect(prefixMatches(seq("1", "2", "3"), seq("1", "2"))).To(BeTrue())
+	})
+
+	It("matches an empty snapshot", func() {
+		Expect(prefixMatches(seq("1"), nil)).To(BeTrue())
+	})
+
+	// Both failure cases model a concurrent delete that touched the head.
+	It("fails when items is shorter than the snapshot (a concurrent delete shrank the head)", func() {
+		Expect(prefixMatches(seq("1"), seq("1", "2"))).To(BeFalse())
+	})
+
+	It("fails when the head ids differ (a concurrent delete reordered the head)", func() {
+		Expect(prefixMatches(seq("2", "3"), seq("1", "2"))).To(BeFalse())
+	})
+})
+
+var _ = Describe("summarizerModel", func() {
+	It("returns the pipeline model when no summary_model is set", func() {
+		m := &fakeModel{}
+		s := &Session{ModelInterface: m}
+		Expect(s.summarizerModel()).To(BeIdenticalTo(m)) // pointer identity: Equal is DeepEqual, so any empty fakeModel would match
+	})
+
+	It("uses the factory (once) when summary_model is set", func() {
+		pipeline := &fakeModel{}
+		small := &fakeModel{} // deeply equal to pipeline, hence the identity matcher below
+		calls := 0
+		s := &Session{ModelInterface: pipeline, SummaryModel: "tiny",
+			summarizerFactory: func() (Model, error) { calls++; return small, nil }}
+		Expect(s.summarizerModel()).To(BeIdenticalTo(small))
+		Expect(s.summarizerModel()).To(BeIdenticalTo(small))
+		Expect(calls).To(Equal(1))
+	})
+
+	It("falls back to the pipeline model when the factory errors", func() {
+		pipeline := &fakeModel{}
+		s := &Session{ModelInterface: pipeline, SummaryModel: "tiny",
+			summarizerFactory: func() (Model, error) { return nil, errors.New("nope") }}
+		Expect(s.summarizerModel()).To(BeIdenticalTo(pipeline))
+	})
+})
+
+var _ = Describe("itemID", func() {
+	It("returns the id for each variant and empty for nil", func() {
+		Expect(itemID(nil)).To(BeEmpty()) // nil union has no id
+		Expect(itemID(&types.MessageItemUnion{User: &types.MessageItemUser{ID: "u1"}})).To(Equal("u1"))
+		Expect(itemID(&types.MessageItemUnion{Assistant: &types.MessageItemAssistant{ID: "a1"}})).To(Equal("a1"))
+		Expect(itemID(&types.MessageItemUnion{System: &types.MessageItemSystem{ID: "s1"}})).To(Equal("s1"))
+		Expect(itemID(&types.MessageItemUnion{FunctionCall: &types.MessageItemFunctionCall{ID: "f1"}})).To(Equal("f1"))
+		Expect(itemID(&types.MessageItemUnion{FunctionCallOutput: &types.MessageItemFunctionCallOutput{ID: "o1"}})).To(Equal("o1"))
+	})
+})
--- a/core/http/react-ui/e2e/model-config.spec.js
+++ b/core/http/react-ui/e2e/model-config.spec.js
@@ -288,6 +288,21 @@ test.describe('Model Editor - Interactive Tab', () => {
    await expect(page.locator('input[placeholder^="match,"]')).toBeVisible()
  })

+  test('pattern min_len clamps a directly-typed negative to 0', async ({ page }) => {
+    const searchInput = page.locator('input[placeholder="Search fields to add..."]')
+    await searchInput.fill('Custom Secret Patterns')
+    const dropdown = searchInput.locator('..').locator('..')
+    await dropdown.locator('div', { hasText: 'Custom Secret Patterns' }).first().click()
+
+    await page.locator('button', { hasText: 'Add pattern' }).click()
+    // The number input's min={0} only limits the spinner arrows, not keyboard
+    // entry; the editor must sanitise a typed negative so a meaningless
+    // negative length floor never reaches the saved config.
+    const minLen = page.locator('input[aria-label="Minimum length"]')
+    await minLen.fill('-5')
+    await expect(minLen).toHaveValue('0')
+  })
+
  // Regression: a map-typed field (entity_actions) present in the loaded YAML
  // must render WITH its values. flattenConfig used to recurse into the map,
  // scattering it across pii_detection.entity_actions.<GROUP> paths that match
@@ -329,4 +344,37 @@ test.describe('Model Editor - Interactive Tab', () => {
    await expect(page.getByText(/block —/i).first()).toBeVisible()
  })

+  // A map cannot hold two values for one key, so renaming a row to an existing
+  // group must collapse to a single row (Object.fromEntries, last write wins)
+  // rather than rendering two conflicting rows that silently lose one on save.
+  test('entity_actions collapses a duplicate group to a single row', async ({ page }) => {
+    await page.route('**/api/models/edit/ner-model', (route) => {
+      route.fulfill({
+        contentType: 'application/json',
+        body: JSON.stringify({
+          name: 'ner-model',
+          config: [
+            'name: ner-model',
+            'backend: llama-cpp',
+            'pii_detection:',
+            '    entity_actions:',
+            '        SSN: block',
+            '        EMAIL: mask',
+            '',
+          ].join('\n'),
+        }),
+      })
+    })
+
+    await page.goto('/app/model-editor/ner-model')
+
+    const groupInputs = page.locator('input[aria-label="Entity group"]')
+    await expect(groupInputs).toHaveCount(2)
+
+    // Rename the EMAIL row to duplicate SSN; the editor collapses to one SSN row.
+    await groupInputs.nth(1).fill('SSN')
+    await expect(groupInputs).toHaveCount(1)
+    await expect(groupInputs.nth(0)).toHaveValue('SSN')
+  })
+
 })
--- a/core/http/react-ui/e2e/nodes-detail.spec.js
+++ b/core/http/react-ui/e2e/nodes-detail.spec.js
@@ -0,0 +1,34 @@
+import { test, expect } from './coverage-fixtures.js'
+
+const ID = 'n1'
+async function mockNode(page) {
+  await page.route(`**/api/nodes/${ID}`, r => r.fulfill({ status: 200, contentType: 'application/json',
+    body: JSON.stringify({ id: ID, name: 'alpha', node_type: 'backend', address: '10.0.0.1:50051', status: 'healthy', total_vram: 24e9, available_vram: 12e9, max_replicas_per_model: 1, labels: { env: 'prod' } }) }))
+  await page.route(`**/api/nodes/${ID}/models`, r => r.fulfill({ status: 200, contentType: 'application/json',
+    body: JSON.stringify([{ node_id: ID, model_name: 'llama-3.3', state: 'loaded', in_flight: 0, replica_index: 0 }]) }))
+  await page.route(`**/api/nodes/${ID}/backends`, r => r.fulfill({ status: 200, contentType: 'application/json',
+    body: JSON.stringify([{ name: 'llama-cpp', is_system: true, installed_at: '2026-06-01T00:00:00Z' }]) }))
+}
+
+test.describe('Node detail page', () => {
+  test('renders sections for a node', async ({ page }) => {
+    await mockNode(page) // node, models and backends endpoints for ID
+    await page.goto(`/app/nodes/${ID}`)
+    await expect(page.locator('.page-title').first()).toBeVisible({ timeout: 15_000 })
+    await expect(page.getByText('alpha')).toBeVisible() // node name from the mocked node
+    await expect(page.getByText('llama-3.3')).toBeVisible() // model from the mocked /models
+    await expect(page.getByText('llama-cpp')).toBeVisible() // backend from the mocked /backends
+    await expect(page.getByText('env=prod')).toBeVisible() // label rendered as key=value
+  })
+
+  test('is reachable by clicking a roster panel', async ({ page }) => {
+    await page.route('**/api/nodes', r => r.fulfill({ status: 200, contentType: 'application/json',
+      body: JSON.stringify([{ id: ID, name: 'alpha', node_type: 'backend', address: '10.0.0.1:50051', status: 'healthy' }]) }))
+    await page.route('**/api/nodes/models', r => r.fulfill({ status: 200, contentType: 'application/json', body: '[]' }))
+    await page.route('**/api/nodes/scheduling', r => r.fulfill({ status: 200, contentType: 'application/json', body: '[]' }))
+    await mockNode(page) // detail endpoints, needed once the click navigates
+    await page.goto('/app/nodes')
+    await page.locator('.node-panel').filter({ hasText: 'alpha' }).getByText('alpha').click()
+    await expect(page).toHaveURL(new RegExp(`/app/nodes/${ID}$`)) // landed on the deep link for ID
+  })
+})
--- a/core/http/react-ui/e2e/nodes-per-node-backend-actions.spec.js
+++ b/core/http/react-ui/e2e/nodes-per-node-backend-actions.spec.js
@@ -12,28 +12,37 @@ const NODE_NAME = 'worker-test'
 const BACKEND_NAME = 'cuda12-vllm-development'

 async function mockDistributedNodes(page, { onDelete } = {}) {
+  const nodeRecord = {
+    id: NODE_ID,
+    name: NODE_NAME,
+    node_type: 'backend',
+    address: '10.0.0.1:50051',
+    http_address: '10.0.0.1:8090',
+    status: 'healthy',
+    total_vram: 0,
+    available_vram: 0,
+    total_ram: 8_000_000_000,
+    available_ram: 4_000_000_000,
+    gpu_vendor: '',
+    last_heartbeat: new Date().toISOString(),
+    created_at: new Date().toISOString(),
+    updated_at: new Date().toISOString(),
+  }
+
  await page.route('**/api/nodes', (route) => {
    route.fulfill({
      status: 200,
      contentType: 'application/json',
-      body: JSON.stringify([
-        {
-          id: NODE_ID,
-          name: NODE_NAME,
-          node_type: 'backend',
-          address: '10.0.0.1:50051',
-          http_address: '10.0.0.1:8090',
-          status: 'healthy',
-          total_vram: 0,
-          available_vram: 0,
-          total_ram: 8_000_000_000,
-          available_ram: 4_000_000_000,
-          gpu_vendor: '',
-          last_heartbeat: new Date().toISOString(),
-          created_at: new Date().toISOString(),
-          updated_at: new Date().toISOString(),
-        },
-      ]),
+      body: JSON.stringify([nodeRecord]),
+    })
+  })
+
+  // The detail page fetches the single node via nodesApi.get(id).
+  await page.route(`**/api/nodes/${NODE_ID}`, (route) => {
+    route.fulfill({
+      status: 200,
+      contentType: 'application/json',
+      body: JSON.stringify(nodeRecord),
    })
  })

@@ -80,24 +89,18 @@ async function mockDistributedNodes(page, { onDelete } = {}) {
  })
 }

-async function expandNodeAndWaitForBackends(page) {
-  await page.goto('/app/nodes')
-  // Click the row to expand it. The chevron toggle and the row both work,
-  // but clicking the name cell is the most user-like.
-  await page.getByText(NODE_NAME).first().click()
-  // Backends, Capacity and Labels live behind a "Manage" <details>
-  // disclosure (the drawer was distilled to keep at-a-glance content
-  // lean — see distill refactor in the multi-replica branch). Open it
-  // by clicking the summary inside the .node-manage scope so the
-  // per-node backend table is in the DOM before assertions run.
-  await page.locator('.node-manage > summary').first().click()
+async function openNodeDetail(page) {
+  // The per-node backend table now lives on the deep-linkable detail page
+  // at /app/nodes/:id (the old expand-row + "Manage" disclosure was removed
+  // when the roster was restructured). Navigate straight there.
+  await page.goto(`/app/nodes/${NODE_ID}`)
  await expect(page.getByRole('cell', { name: BACKEND_NAME, exact: true })).toBeVisible({ timeout: 10_000 })
 }

 test.describe('Nodes page — per-node backend actions', () => {
  test('upgrade affordance is self-explanatory (not "Reinstall backend" with a sync icon)', async ({ page }) => {
    await mockDistributedNodes(page)
-    await expandNodeAndWaitForBackends(page)
+    await openNodeDetail(page)

    // Negative: the old, ambiguous wording must not be used.
    await expect(page.locator('button[title="Reinstall backend"]')).toHaveCount(0)
@@ -114,7 +117,7 @@ test.describe('Nodes page — per-node backend actions', () => {

  test('per-node backend row shows a delete (trash) button next to upgrade', async ({ page }) => {
    await mockDistributedNodes(page)
-    await expandNodeAndWaitForBackends(page)
+    await openNodeDetail(page)

    const deleteBtn = page.locator('button[title="Delete backend from this node"]')
    await expect(deleteBtn).toBeVisible()
@@ -128,7 +131,7 @@ test.describe('Nodes page — per-node backend actions', () => {
        postedBody = route.request().postDataJSON()
      },
    })
-    await expandNodeAndWaitForBackends(page)
+    await openNodeDetail(page)

    await page.locator('button[title="Delete backend from this node"]').click()

@@ -150,7 +153,7 @@ test.describe('Nodes page — per-node backend actions', () => {
        deleteCalls += 1
      },
    })
-    await expandNodeAndWaitForBackends(page)
+    await openNodeDetail(page)

    await page.locator('button[title="Delete backend from this node"]').click()

--- a/core/http/react-ui/e2e/nodes-roster.spec.js
+++ b/core/http/react-ui/e2e/nodes-roster.spec.js
@@ -0,0 +1,47 @@
+import { test, expect } from './coverage-fixtures.js'
+
+async function mockCluster(page, nodes) { // stubs the three roster endpoints; only the node list is caller-supplied
+  await page.route('**/api/nodes', r => r.fulfill({ status: 200, contentType: 'application/json', body: JSON.stringify(nodes) }))
+  await page.route('**/api/nodes/models', r => r.fulfill({ status: 200, contentType: 'application/json', body: '[]' })) // no loaded models
+  await page.route('**/api/nodes/scheduling', r => r.fulfill({ status: 200, contentType: 'application/json', body: '[]' })) // no scheduling rules
+}
+
+test.describe('Nodes roster header', () => {
+  test('shows a cluster pulse line and no stat-card grid', async ({ page }) => {
+    await mockCluster(page, [
+      { id: 'n1', name: 'alpha', node_type: 'backend', address: '10.0.0.1:50051', status: 'healthy' },
+      { id: 'n2', name: 'beta', node_type: 'backend', address: '10.0.0.2:50051', status: 'draining' },
+    ])
+    await page.goto('/app/nodes')
+    await expect(page.locator('.cluster-pulse')).toBeVisible({ timeout: 15_000 })
+    await expect(page.locator('.cluster-pulse')).toContainText('2 nodes') // both mocked nodes are counted
+    await expect(page.locator('.stat-grid')).toHaveCount(0) // checked only after the header has rendered
+  })
+
+  test('shows an approval callout for pending nodes', async ({ page }) => {
+    await mockCluster(page, [{ id: 'n3', name: 'gamma', node_type: 'backend', address: '10.0.0.3:50051', status: 'pending' }])
+    await page.goto('/app/nodes')
+    await expect(page.locator('.attention-callout')).toContainText('approval', { timeout: 15_000 })
+  })
+})
+
+test.describe('Nodes roster panels', () => {
+  test('shows model chips without clicking and filters by type', async ({ page }) => {
+    // One backend node and one agent node, so the type filter has something to hide.
+    await page.route('**/api/nodes', r => r.fulfill({ status: 200, contentType: 'application/json', body: JSON.stringify([
+      { id: 'n1', name: 'alpha', node_type: 'backend', address: '10.0.0.1:50051', status: 'healthy' },
+      { id: 'a1', name: 'agent-1', node_type: 'agent', address: '10.0.0.9:50051', status: 'healthy' },
+    ]) }))
+    await page.route('**/api/nodes/models', r => r.fulfill({ status: 200, contentType: 'application/json', body: JSON.stringify([
+      { node_id: 'n1', model_name: 'llama-3.3', state: 'loaded', in_flight: 2, replica_index: 0 },
+    ]) }))
+    await page.route('**/api/nodes/scheduling', r => r.fulfill({ status: 200, contentType: 'application/json', body: '[]' }))
+
+    await page.goto('/app/nodes')
+    // model chip visible without any expand click
+    await expect(page.locator('.node-panel').filter({ hasText: 'alpha' }).getByText('llama-3.3')).toBeVisible({ timeout: 15_000 })
+    // segmented filter: Agent shows the agent node, hides the backend node
+    await page.getByRole('radio', { name: /Agent/ }).click()
+    await expect(page.getByText('agent-1')).toBeVisible()
+    await expect(page.getByText('alpha')).toHaveCount(0)
+  })
+})
--- a/core/http/react-ui/e2e/page-render-smoke.spec.js
+++ b/core/http/react-ui/e2e/page-render-smoke.spec.js
@@ -21,6 +21,7 @@ const PAGES = [
  ['/app/backends', 'Backends'],
  ['/app/settings', 'Settings'],
  ['/app/nodes', 'Nodes'],
+  ['/app/scheduling', 'Scheduling'],
  ['/app/face', 'Face recognition'],
  ['/app/voice', 'Voice recognition'],
  ['/app/fine-tune', 'Fine-tuning'],
--- a/core/http/react-ui/e2e/role-mode-adaptive.spec.js
+++ b/core/http/react-ui/e2e/role-mode-adaptive.spec.js
@@ -0,0 +1,100 @@
+import { test, expect } from './coverage-fixtures.js'
+
+// These specs stub /api/features and /api/auth/status per cell. The test server
+// disables auth (isAdmin=true) and reports its own features, so we intercept
+// before navigation to simulate each role x mode cell.
+
+function stubFeatures(page, features) {
+  return page.route('**/api/features', route =>
+    route.fulfill({ contentType: 'application/json', body: JSON.stringify(features) }))
+}
+
+function stubNoP2P(page) {
+  // P2P token endpoint returns empty -> p2pEnabled=false.
+  return page.route('**/api/p2p/token', route =>
+    route.fulfill({ contentType: 'text/plain', body: '' }))
+}
+
+test.describe('Adaptive landing (HomeRoute)', () => {
+  test('admin + distributed redirects /app to Nodes', async ({ page }) => {
+    await stubFeatures(page, { distributed: true })
+    await stubNoP2P(page)
+    await page.goto('/app')
+    await expect(page).toHaveURL(/\/app\/nodes$/) // the URL must end at the Nodes route
+    await expect(page.locator('.page-title').first()).toBeVisible({ timeout: 15_000 })
+  })
+
+  test('admin + single-node stays on Home', async ({ page }) => {
+    await stubFeatures(page, { distributed: false })
+    await stubNoP2P(page)
+    await page.goto('/app')
+    await expect(page).toHaveURL(/\/app$/) // no redirect: still exactly /app
+    await expect(page.locator('.home-greeting')).toBeVisible({ timeout: 15_000 })
+  })
+})
+
+test.describe('Adaptive sidebar', () => {
+  test('distributed pins the Cluster group with Nodes at the top', async ({ page }) => {
+    await stubFeatures(page, { distributed: true })
+    await stubNoP2P(page)
+    await page.goto('/app/chat') // any in-app page so the sidebar is mounted
+    const pinned = page.locator('.sidebar-nav .sidebar-section-items').first() // first section = the pinned group
+    await expect(pinned.getByText('Nodes', { exact: false })).toBeVisible({ timeout: 15_000 })
+  })
+
+  test('single-node does not pin a Cluster group', async ({ page }) => {
+    await stubFeatures(page, { distributed: false })
+    await stubNoP2P(page)
+    await page.goto('/app/chat')
+    // Nodes is reachable only via the Operate rail, not pinned at the top.
+    // The sidebar is awaited first so the zero-count check runs against a rendered nav.
+    await expect(page.locator('.sidebar-nav')).toBeVisible({ timeout: 15_000 })
+    await expect(page.locator('.sidebar-nav .sidebar-section-items').first()
+      .getByText('Nodes', { exact: false })).toHaveCount(0)
+  })
+})
+
+test.describe('Top navbar', () => {
+  test('admin sees the mode pill and settings cog', async ({ page }) => {
+    await stubFeatures(page, { distributed: true })
+    await stubNoP2P(page)
+    await page.goto('/app/chat')
+    await expect(page.locator('.top-navbar__mode')).toBeVisible({ timeout: 15_000 })
+    await expect(page.locator('.top-navbar__icon[aria-label]')).not.toHaveCount(0)
+  })
+
+  test('admin-via-chat jump shows when localai_assistant is enabled', async ({ page }) => {
+    await stubFeatures(page, { distributed: false, localai_assistant: true })
+    await stubNoP2P(page)
+    await page.goto('/app/chat')
+    await expect(page.locator('.top-navbar__assistant')).toBeVisible({ timeout: 15_000 })
+  })
+
+  test('admin-via-chat jump hidden when localai_assistant is off', async ({ page }) => {
+    await stubFeatures(page, { distributed: false, localai_assistant: false })
+    await stubNoP2P(page)
+    await page.goto('/app/chat')
+    await expect(page.locator('.top-navbar__assistant')).toHaveCount(0)
+  })
+})
+
+test.describe('Token usage meter', () => {
+  test('renders when admin usage has data', async ({ page }) => {
+    await stubFeatures(page, { distributed: false })
+    await stubNoP2P(page)
+    // One bucket with a non-zero token total is the "has data" case.
+    await page.route('**/api/auth/admin/usage**', route =>
+      route.fulfill({ contentType: 'application/json',
+        body: JSON.stringify({ buckets: [{ total_tokens: 1234 }] }) }))
+    await page.goto('/app/chat')
+    await expect(page.locator('.top-navbar__meter')).toBeVisible({ timeout: 15_000 })
+  })
+
+  test('hidden when admin usage is empty (graceful degrade)', async ({ page }) => {
+    await stubFeatures(page, { distributed: false })
+    await stubNoP2P(page)
+    await page.route('**/api/auth/admin/usage**', route =>
+      route.fulfill({ contentType: 'application/json', body: JSON.stringify({ buckets: [] }) }))
+    await page.goto('/app/chat')
+    await expect(page.locator('.top-navbar')).toBeVisible({ timeout: 15_000 }) // navbar rendered before the absence check
+    await expect(page.locator('.top-navbar__meter')).toHaveCount(0)
+  })
+})
--- a/core/http/react-ui/e2e/scheduling.spec.js
+++ b/core/http/react-ui/e2e/scheduling.spec.js
@@ -0,0 +1,16 @@
+import { test, expect } from './coverage-fixtures.js'
+
+test.describe('Scheduling page', () => {
+  test('renders at /app/scheduling with rules from the API', async ({ page }) => {
+    await page.route('**/api/nodes/scheduling', (route) => {
+      route.fulfill({
+        status: 200, contentType: 'application/json',
+        body: JSON.stringify([{ model_name: 'llama-3.3', spread_all: true, min_replicas: 0, max_replicas: 0 }]),
+      })
+    })
+    await page.goto('/app/scheduling')
+    await expect(page.locator('.page-title').first()).toBeVisible({ timeout: 15_000 })
+    await expect(page).toHaveURL(/\/app\/scheduling$/)
+    await expect(page.getByText('llama-3.3')).toBeVisible()
+  })
+})
--- a/core/http/react-ui/public/locales/de/admin.json
+++ b/core/http/react-ui/public/locales/de/admin.json
@@ -43,6 +43,10 @@
    "title": "Verteilte Knoten",
    "subtitle": "Backend- und Agenten-Worker-Knoten verwalten"
  },
+  "scheduling": {
+    "title": "Planung",
+    "subtitle": "Modellplatzierung und Replikat-Regeln im gesamten Cluster"
+  },
  "p2p": {
    "title": "Verteilte KI-Berechnung",
    "subtitle": "Skalieren Sie Ihre KI-Workloads über mehrere Geräte mit Peer-to-Peer-Verteilung"
--- a/core/http/react-ui/public/locales/de/nav.json
+++ b/core/http/react-ui/public/locales/de/nav.json
@@ -50,6 +50,7 @@
    "backends": "Backends",
    "traces": "Traces",
    "nodes": "Knoten",
+    "scheduling": "Planung",
    "swarm": "Swarm",
    "system": "System",
    "settings": "Einstellungen",
--- a/core/http/react-ui/public/locales/en/admin.json
+++ b/core/http/react-ui/public/locales/en/admin.json
@@ -43,6 +43,10 @@
    "title": "Distributed Nodes",
    "subtitle": "Manage backend and agent worker nodes"
  },
+  "scheduling": {
+    "title": "Scheduling",
+    "subtitle": "Model placement and replica rules across the cluster"
+  },
  "p2p": {
    "title": "Distributed AI Computing",
    "subtitle": "Scale your AI workloads across multiple devices with peer-to-peer distribution"
--- a/core/http/react-ui/public/locales/en/nav.json
+++ b/core/http/react-ui/public/locales/en/nav.json
@@ -12,6 +12,16 @@
  "accountSettings": "Account settings",
  "account": "Account",
  "accountFor": "Account: {{name}}",
+  "topbar": {
+    "label": "Top bar",
+    "modeDistributed": "Distributed",
+    "modeSwarm": "Swarm",
+    "modeSingle": "Single-node",
+    "pickModel": "Models",
+    "adminViaChat": "Admin via chat",
+    "tokensToday": "Tokens today",
+    "usageDetail": "View usage detail"
+  },
  "sections": {
    "create": "Create",
    "recognition": "Recognition",
@@ -51,6 +61,7 @@
    "backends": "Backends",
    "traces": "Traces",
    "nodes": "Nodes",
+    "scheduling": "Scheduling",
    "swarm": "Swarm",
    "system": "System",
    "settings": "Settings",
--- a/core/http/react-ui/public/locales/es/admin.json
+++ b/core/http/react-ui/public/locales/es/admin.json
@@ -43,6 +43,10 @@
    "title": "Nodos distribuidos",
    "subtitle": "Administra nodos worker de backends y agentes"
  },
+  "scheduling": {
+    "title": "Planificación",
+    "subtitle": "Reglas de ubicación de modelos y réplicas en el clúster"
+  },
  "p2p": {
    "title": "Computación de IA distribuida",
    "subtitle": "Escala tus cargas de trabajo de IA en múltiples dispositivos con distribución peer-to-peer"
--- a/core/http/react-ui/public/locales/es/nav.json
+++ b/core/http/react-ui/public/locales/es/nav.json
@@ -50,6 +50,7 @@
    "backends": "Backends",
    "traces": "Trazas",
    "nodes": "Nodos",
+    "scheduling": "Planificación",
    "swarm": "Swarm",
    "system": "Sistema",
    "settings": "Configuración",
--- a/core/http/react-ui/public/locales/id/admin.json
+++ b/core/http/react-ui/public/locales/id/admin.json
@@ -43,6 +43,10 @@
    "title": "Node Terdistribusi",
    "subtitle": "Kelola node backend dan node worker"
  },
+  "scheduling": {
+    "title": "Penjadwalan",
+    "subtitle": "Aturan penempatan model dan replika di seluruh klaster"
+  },
  "p2p": {
    "title": "Komputasi AI Terdistribusi",
    "subtitle": "Skalakan beban kerja AI Anda ke beberapa perangkat dengan distribusi peer-to-peer"
--- a/core/http/react-ui/public/locales/id/nav.json
+++ b/core/http/react-ui/public/locales/id/nav.json
@@ -51,6 +51,7 @@
    "backends": "Backend",
    "traces": "Trace",
    "nodes": "Node",
+    "scheduling": "Penjadwalan",
    "swarm": "Swarm",
    "system": "Sistem",
    "settings": "Pengaturan",
--- a/core/http/react-ui/public/locales/it/admin.json
+++ b/core/http/react-ui/public/locales/it/admin.json
@@ -43,6 +43,10 @@
    "title": "Nodi distribuiti",
    "subtitle": "Gestisci i nodi worker dei backend e degli agenti"
  },
+  "scheduling": {
+    "title": "Pianificazione",
+    "subtitle": "Regole di posizionamento dei modelli e delle repliche nel cluster"
+  },
  "p2p": {
    "title": "Calcolo AI distribuito",
    "subtitle": "Scala i tuoi carichi di lavoro AI su più dispositivi con la distribuzione peer-to-peer"
--- a/core/http/react-ui/public/locales/it/nav.json
+++ b/core/http/react-ui/public/locales/it/nav.json
@@ -50,6 +50,7 @@
    "backends": "Backend",
    "traces": "Tracce",
    "nodes": "Nodi",
+    "scheduling": "Pianificazione",
    "swarm": "Swarm",
    "system": "Sistema",
    "settings": "Impostazioni",
--- a/core/http/react-ui/public/locales/ko/admin.json
+++ b/core/http/react-ui/public/locales/ko/admin.json
@@ -43,6 +43,10 @@
    "title": "분산 노드",
    "subtitle": "백엔드 및 에이전트 워커 노드를 관리합니다"
  },
+  "scheduling": {
+    "title": "스케줄링",
+    "subtitle": "클러스터 전반의 모델 배치 및 복제본 규칙"
+  },
  "p2p": {
    "title": "분산 AI 컴퓨팅",
    "subtitle": "피어 투 피어 분산으로 여러 기기에 걸쳐 AI 워크로드를 확장합니다"
--- a/core/http/react-ui/public/locales/ko/nav.json
+++ b/core/http/react-ui/public/locales/ko/nav.json
@@ -51,6 +51,7 @@
    "backends": "백엔드",
    "traces": "트레이스",
    "nodes": "노드",
+    "scheduling": "스케줄링",
    "swarm": "Swarm",
    "system": "시스템",
    "settings": "설정",
--- a/core/http/react-ui/public/locales/zh-CN/admin.json
+++ b/core/http/react-ui/public/locales/zh-CN/admin.json
@@ -43,6 +43,10 @@
    "title": "分布式节点",
    "subtitle": "管理后端和智能体工作节点"
  },
+  "scheduling": {
+    "title": "调度",
+    "subtitle": "集群中的模型放置和副本规则"
+  },
  "p2p": {
    "title": "分布式 AI 计算",
    "subtitle": "通过点对点分发将您的 AI 工作负载扩展到多个设备"
--- a/core/http/react-ui/public/locales/zh-CN/nav.json
+++ b/core/http/react-ui/public/locales/zh-CN/nav.json
@@ -50,6 +50,7 @@
    "backends": "后端",
    "traces": "追踪",
    "nodes": "节点",
+    "scheduling": "调度",
    "swarm": "Swarm",
    "system": "系统",
    "settings": "设置",
--- a/core/http/react-ui/src/App.css
+++ b/core/http/react-ui/src/App.css
@@ -184,6 +184,50 @@
  font-size: 1.5rem;
 }

+/* Desktop top bar: deployment + admin affordances on wide screens. Hidden on
+   mobile, where .mobile-header carries the equivalent actions. */
+.top-navbar {
+  display: flex;
+  align-items: center;
+  justify-content: space-between;
+  gap: var(--spacing-md);
+  padding: var(--spacing-sm) var(--spacing-lg);
+  border-bottom: 1px solid var(--color-border-default);
+  background: var(--color-bg-secondary);
+}
+.top-navbar__right { display: flex; align-items: center; gap: var(--spacing-sm); }
+.top-navbar__mode {
+  font-size: 0.75rem;
+  padding: 2px 10px;
+  border-radius: 999px;
+  border: 1px solid var(--color-border-default);
+  color: var(--color-text-secondary);
+}
+.top-navbar__mode.is-active { color: var(--color-success); border-color: var(--color-success); }
+.top-navbar__btn {
+  display: inline-flex; align-items: center; gap: 6px;
+  font-size: 0.8125rem; padding: 5px 10px; border-radius: 8px;
+  border: 1px solid var(--color-border-default); background: var(--color-bg-tertiary);
+  color: var(--color-text-primary); cursor: pointer;
+}
+.top-navbar__icon {
+  width: 32px; height: 32px; display: inline-flex; align-items: center;
+  justify-content: center; border-radius: 8px; border: 1px solid var(--color-border-default);
+  background: var(--color-bg-tertiary); color: var(--color-text-secondary); cursor: pointer;
+}
+.top-navbar__avatar img { width: 100%; height: 100%; border-radius: 50%; object-fit: cover; }
+.top-navbar__meter {
+  display: inline-flex; flex-direction: column; gap: 3px; align-items: flex-start;
+  padding: 4px 10px; border-radius: 8px; border: 1px solid var(--color-border-default);
+  background: var(--color-bg-tertiary); cursor: pointer; min-width: 150px;
+}
+.top-navbar__meter-label { font-size: 0.6875rem; color: var(--color-text-secondary); }
+.top-navbar__meter-bar { width: 100%; height: 5px; border-radius: 3px; background: var(--color-bg-secondary); overflow: hidden; }
+.top-navbar__meter-bar i { display: block; height: 100%; background: var(--color-primary); }
+@media (max-width: 639px) {
+  .top-navbar { display: none; }
+}
+
 /* Sidebar */
 .sidebar {
  position: fixed;
@@ -8471,3 +8515,56 @@ select.input {
 .status-pill--error   .status-pill__dot { background: var(--color-error); }
 .status-pill--info    .status-pill__dot { background: var(--color-info); }
 .status-pill--muted   .status-pill__dot { background: var(--color-text-muted); }
+
+/* Nodes: cluster pulse + attention callout (replaces the stat-card strip) */
+.cluster-pulse {
+  font-size: var(--text-sm);
+  color: var(--color-text-muted);
+  margin: 0 0 var(--spacing-lg);
+}
+.cluster-pulse__strong { color: var(--color-text-primary); font-weight: 600; }
+
+.attention-callout {
+  display: flex;
+  align-items: center;
+  justify-content: space-between;
+  gap: var(--spacing-md);
+  padding: var(--spacing-sm) var(--spacing-md);
+  border-radius: var(--radius-md);
+  margin-bottom: var(--spacing-lg);
+  font-size: var(--text-sm);
+}
+.attention-callout--warn {
+  background: var(--color-warning-light);
+  border: 1px solid var(--color-warning-border);
+  color: var(--color-text-primary);
+}
+.attention-callout--error {
+  background: var(--color-error-light);
+  border: 1px solid var(--color-error-border);
+  color: var(--color-text-primary);
+}
+
+/* Node roster panels (Nodes page) */
+.node-roster { display: flex; flex-direction: column; gap: var(--spacing-sm); }
+.node-panel {
+  background: var(--color-bg-secondary);
+  border: 1px solid var(--color-border-subtle);
+  border-radius: var(--radius-lg);
+}
+.node-panel__main { padding: var(--spacing-md) var(--spacing-lg); cursor: pointer; }
+.node-panel:hover { border-color: var(--color-border, var(--color-border-default)); } /* falls back to the -default token used by sibling rules if --color-border is not defined */
+.node-panel__head { display: flex; align-items: flex-start; justify-content: space-between; gap: var(--spacing-md); }
+.node-panel__id { display: flex; align-items: center; gap: var(--spacing-sm); flex-wrap: wrap; }
+.node-panel__name { font-weight: 600; }
+.node-panel__meta { display: flex; gap: var(--spacing-lg); margin-top: var(--spacing-sm); color: var(--color-text-muted); font-size: var(--text-xs); }
+.node-panel__models { display: flex; flex-wrap: wrap; gap: 6px; margin-top: var(--spacing-sm); }
+.model-chip {
+  display: inline-flex; align-items: center; gap: 5px;
+  font-family: var(--font-mono); font-size: 0.6875rem;
+  padding: 2px 8px; border-radius: var(--radius-sm); border: 1px solid;
+}
+.model-chip__dot { width: 6px; height: 6px; border-radius: 50%; }
+.model-chip__state { opacity: 0.85; font-style: normal; }
+.node-filter { margin-bottom: var(--spacing-lg); }
+.node-detail__metrics { display: flex; gap: var(--spacing-xl); margin: var(--spacing-md) 0 var(--spacing-lg); flex-wrap: wrap; }
--- a/core/http/react-ui/src/App.jsx
+++ b/core/http/react-ui/src/App.jsx
@@ -3,6 +3,7 @@ import { Outlet, useLocation, useNavigate } from 'react-router-dom'
 import { useTranslation } from 'react-i18next'
 import Sidebar from './components/Sidebar'
 import OperationsBar from './components/OperationsBar'
+import TopNavbar from './components/TopNavbar'
 import { ToastContainer, useToast } from './components/Toast'
 import { systemApi } from './utils/api'
 import { useTheme } from './contexts/ThemeContext'
@@ -98,6 +99,7 @@ export default function App() {
      <Sidebar isOpen={sidebarOpen} onClose={() => setSidebarOpen(false)} />
      <main className="main-content" {...(sidebarOpen ? { 'aria-hidden': 'true', inert: '' } : {})}>
        <OperationsBar />
+        <TopNavbar />
        {/* Mobile header — primary actions reachable without opening the
            drawer. Hamburger is the only way to expand the nav on phones;
            theme toggle and account avatar are mirrored from the sidebar
--- a/core/http/react-ui/src/components/HomeRoute.jsx
+++ b/core/http/react-ui/src/components/HomeRoute.jsx
@@ -0,0 +1,28 @@
+import { lazy, Suspense } from 'react'
+import { Navigate } from 'react-router-dom'
+import { useAuth } from '../context/AuthContext'
+import { useDeployment } from '../contexts/DeploymentContext'
+import { resolveHome } from '../utils/resolveHome'
+import RouteFallback from './RouteFallback'
+
+const Home = lazy(() => import('../pages/Home'))
+
+// Index-route element. Waits for auth + deployment signals to load (so we never
+// flash the wrong landing), then either renders Home or redirects to the cell's
+// landing page. Redirecting (rather than rendering Nodes/Chat inline at /app)
+// keeps each target's own route guard, active-nav state, and deep-linkability.
+export default function HomeRoute() {
+  const auth = useAuth()
+  const deployment = useDeployment()
+  // Hold the fallback until both the auth and deployment signals have loaded.
+  if (auth.loading || deployment.loading) return <RouteFallback />
+
+  const { distributed, p2pEnabled } = deployment
+  const landing = resolveHome({ isAdmin: auth.isAdmin, distributed, p2pEnabled })
+  // A truthy landing path redirects; otherwise render the lazily loaded Home.
+  return landing ? (
+    <Navigate to={landing} replace />
+  ) : (
+    <Suspense fallback={<RouteFallback />}><Home /></Suspense>
+  )
+}
--- a/core/http/react-ui/src/components/PatternListEditor.jsx
+++ b/core/http/react-ui/src/components/PatternListEditor.jsx
@@ -74,7 +74,18 @@ export default function PatternListEditor({ value, onChange }) {
            min={0}
            value={r.min_len || 0}
            title="Minimum match length (0 = no floor)"
-            onChange={e => update(i, { min_len: parseInt(e.target.value, 10) || 0 })}
+            // min={0} only constrains the spinner, not keyboard entry. Clamp a
+            // typed negative to 0 (a negative floor is meaningless and would
+            // disable the length filter). When we clamp, force the DOM value
+            // too: the resulting 0->0 state change is a no-op, so React's
+            // controlled input would otherwise keep displaying the rejected
+            // "-5" even though the saved value is 0.
+            onChange={e => {
+              const parsed = parseInt(e.target.value, 10)
+              const n = Math.max(0, parsed || 0)
+              if (parsed < 0) e.target.value = String(n)
+              update(i, { min_len: n })
+            }}
            style={{ width: 80, fontSize: '0.8125rem' }}
            aria-label="Minimum length"
          />
--- a/core/http/react-ui/src/components/Sidebar.jsx
+++ b/core/http/react-ui/src/components/Sidebar.jsx
@@ -5,9 +5,11 @@ import ThemeToggle from './ThemeToggle'
 import LanguageSwitcher from './LanguageSwitcher'
 import { useAuth } from '../context/AuthContext'
 import { useBranding } from '../contexts/BrandingContext'
+import { useDeployment } from '../contexts/DeploymentContext'
 import { apiUrl } from '../utils/basePath'
 import { preloadRoute } from '../router'
 import { consoles, firstVisiblePath, consolePaths } from './console/consoleConfig'
+import { clusterPinItems, shouldCollapseCreate } from '../utils/sidebarPolicy'

 const COLLAPSED_KEY = 'localai_sidebar_collapsed'
 const SECTIONS_KEY = 'localai_sidebar_sections'
@@ -58,11 +60,13 @@ function NavItem({ item, onClose, collapsed }) {
  )
 }

-function loadSectionState() {
-  // Tiers render expanded by default (the redesign favours showing the few
-  // intent groups up front); users can still collapse any tier and the choice
-  // is persisted. Stored values override the defaults so a saved collapse wins.
+function loadSectionState(collapseCreate = false) {
+  // Tiers render expanded by default; users can collapse any tier and the
+  // choice persists (stored values override defaults). In cluster cells we
+  // start Create collapsed so the pinned cluster group leads - but only when
+  // the user has not already expressed a preference.
  const defaults = Object.fromEntries(sections.map(s => [s.id, true]))
+  if (collapseCreate) defaults.create = false
  try {
    const stored = localStorage.getItem(SECTIONS_KEY)
    return stored ? { ...defaults, ...JSON.parse(stored) } : defaults
@@ -77,20 +81,34 @@ function saveSectionState(state) {

 export default function Sidebar({ isOpen, onClose }) {
  const { t } = useTranslation('nav')
-  const [features, setFeatures] = useState({})
+  const { isAdmin, authEnabled, user, logout, hasFeature } = useAuth()
+  // Deployment shape (server features + p2p) drives the adaptive sidebar; the
+  // shared context replaces the sidebar's own /api/features fetch so the
+  // landing resolver, navbar, and this policy agree on one snapshot.
+  const deployment = useDeployment()
+  const features = deployment.features
+  // Shared shape for the console gating helpers (consoleConfig.js); in scope for
+  // both the pinned cluster group and the console-tier rendering below.
+  const auth = { isAdmin, authEnabled, hasFeature, features }
+  const collapseCreate = shouldCollapseCreate(auth, deployment)
  const [collapsed, setCollapsed] = useState(() => {
    try { return localStorage.getItem(COLLAPSED_KEY) === 'true' } catch (_) { return false }
  })
  const [openSections, setOpenSections] = useState(loadSectionState)
-  const { isAdmin, authEnabled, user, logout, hasFeature } = useAuth()
  const branding = useBranding()
  const navigate = useNavigate()
  const location = useLocation()
  const closeBtnRef = useRef(null)

+  // Apply the cluster-cell Create-collapse default once, only when the user has
+  // no stored section preference (so we never override an explicit choice).
  useEffect(() => {
-    fetch(apiUrl('/api/features')).then(r => r.json()).then(setFeatures).catch(() => {})
-  }, [])
+    if (deployment.loading) return
+    let hasStored = false
+    try { hasStored = !!localStorage.getItem(SECTIONS_KEY) } catch { hasStored = false }
+    if (hasStored || !collapseCreate) return
+    setOpenSections(prev => (prev.create === false ? prev : { ...prev, create: false }))
+  }, [deployment.loading, collapseCreate])

  // Stay in sync with external collapse dispatches (e.g. the chat
  // page's focus mode). The collapse-toggle button still owns the
@@ -157,8 +175,6 @@ export default function Sidebar({ isOpen, onClose }) {
  }

  const visibleTopItems = topItems.filter(filterItem)
-  // Shared shape for the console gating helpers (consoleConfig.js).
-  const auth = { isAdmin, authEnabled, hasFeature, features }

  // Inline sections (Create) carry no gating; a plain filterItem pass suffices.
  const getVisibleSectionItems = (section) => section.items.filter(filterItem)
@@ -199,6 +215,28 @@ export default function Sidebar({ isOpen, onClose }) {
            ))}
          </div>

+          {/* Pinned Cluster quick-access (admin + distributed/p2p). Same gate
+              as the Operate rail; surfaced at the top for cluster operators. */}
+          {(() => {
+            const pinned = clusterPinItems(auth, deployment)
+            if (pinned.length === 0) return null
+            return (
+              <div className="sidebar-section">
+                <div className="sidebar-section-title">{t('operate.cluster')}</div>
+                <div className="sidebar-section-items">
+                  {pinned.map(item => (
+                    <NavItem
+                      key={item.path}
+                      item={{ path: item.path, icon: item.icon, labelKey: item.labelKey }}
+                      onClose={onClose}
+                      collapsed={collapsed}
+                    />
+                  ))}
+                </div>
+              </div>
+            )
+          })()}
+
          {/* Collapsible sections */}
          {sections.map(section => {
            const visibleItems = getVisibleSectionItems(section)
--- a/core/http/react-ui/src/components/TopNavbar.jsx
+++ b/core/http/react-ui/src/components/TopNavbar.jsx
@@ -0,0 +1,96 @@
+import { useNavigate } from 'react-router-dom'
+import { useTranslation } from 'react-i18next'
+import { useAuth } from '../context/AuthContext'
+import { useDeployment } from '../contexts/DeploymentContext'
+import { useTheme } from '../contexts/ThemeContext'
+import { launchAssistantChat } from '../utils/launchAssistantChat'
+import TokenUsageMeter from './navbar/TokenUsageMeter'
+
+// Desktop top bar. Complementary to the mobile-only header in App.jsx: this is
+// hidden on small screens (see .top-navbar CSS) and shows deployment/admin
+// affordances on wide screens where the sidebar footer is far from the content.
+export default function TopNavbar() {
+  const { t } = useTranslation('nav')
+  const navigate = useNavigate()
+  const { isAdmin, authEnabled, user } = useAuth()
+  const { features, distributed, p2pEnabled } = useDeployment()
+  const { theme, toggleTheme } = useTheme()
+  // Mode pill text: distributed wins over p2p (swarm); neither means single-node.
+  const modeLabel = distributed
+    ? t('topbar.modeDistributed')
+    : p2pEnabled
+      ? t('topbar.modeSwarm')
+      : t('topbar.modeSingle')
+  // Optional chaining: a missing features object just hides the assistant jump.
+  const showAssistantJump = isAdmin && !!features?.localai_assistant
+  const showAvatar = authEnabled && user
+  const themeLabel = theme === 'dark' ? t('switchToLightMode') : t('switchToDarkMode')
+
+  return (
+    <div className="top-navbar" role="navigation" aria-label={t('topbar.label')}>
+      <div className="top-navbar__left">
+        {isAdmin && (
+          <span className={`top-navbar__mode ${distributed || p2pEnabled ? 'is-active' : ''}`}>
+            <i className="fas fa-circle-nodes" aria-hidden="true" /> {modeLabel}
+          </span>
+        )}
+      </div>
+      <div className="top-navbar__right">
+        {!isAdmin && (
+          <button
+            type="button"
+            className="top-navbar__btn"
+            onClick={() => navigate('/app/chat')}
+            title={t('topbar.pickModel')}
+          >
+            <i className="fas fa-cube" aria-hidden="true" /> {t('topbar.pickModel')}
+          </button>
+        )}
+        {showAssistantJump && (
+          <button
+            type="button"
+            className="top-navbar__btn top-navbar__assistant"
+            onClick={() => launchAssistantChat(navigate)}
+            title={t('topbar.adminViaChat')}
+          >
+            <i className="fas fa-user-shield" aria-hidden="true" /> {t('topbar.adminViaChat')}
+          </button>
+        )}
+        {isAdmin && <TokenUsageMeter />}
+        {isAdmin && (
+          <button
+            type="button"
+            className="top-navbar__icon"
+            onClick={() => navigate('/app/settings')}
+            aria-label={t('items.settings')}
+            title={t('items.settings')}
+          >
+            <i className="fas fa-cog" aria-hidden="true" />
+          </button>
+        )}
+        <button
+          type="button"
+          className="top-navbar__icon"
+          onClick={toggleTheme}
+          aria-label={themeLabel}
+          title={themeLabel}
+        >
+          <i className={`fas ${theme === 'dark' ? 'fa-sun' : 'fa-moon'}`} aria-hidden="true" />
+        </button>
+        {showAvatar && (
+          <button
+            type="button"
+            className="top-navbar__icon top-navbar__avatar"
+            onClick={() => navigate('/app/account')}
+            aria-label={user.name || user.email || t('account')}
+            title={user.name || user.email || t('account')}
+          >
+            {user.avatarUrl
+              ? <img src={user.avatarUrl} alt="" />
+              : <i className="fas fa-user-circle" aria-hidden="true" />}
+          </button>
+        )}
+      </div>
+    </div>
+  )
+}
--- a/core/http/react-ui/src/components/console/consoleConfig.js
+++ b/core/http/react-ui/src/components/console/consoleConfig.js
@@ -59,6 +59,7 @@ export const operateConsole = {
      titleKey: 'operate.cluster',
      items: [
        { path: '/app/nodes', icon: 'fas fa-network-wired', labelKey: 'items.nodes', adminOnly: true, feature: 'distributed' },
+        { path: '/app/scheduling', icon: 'fas fa-calendar-alt', labelKey: 'items.scheduling', adminOnly: true, feature: 'distributed' },
        { path: '/app/p2p', icon: 'fas fa-circle-nodes', labelKey: 'items.swarm', adminOnly: true },
      ],
    },
--- a/core/http/react-ui/src/components/navbar/TokenUsageMeter.jsx
+++ b/core/http/react-ui/src/components/navbar/TokenUsageMeter.jsx
@@ -0,0 +1,52 @@
+import { useState, useEffect } from 'react'
+import { useNavigate } from 'react-router-dom'
+import { useTranslation } from 'react-i18next'
+import { usageApi } from '../../utils/api'
+
+// Compact admin-only usage glance: today's total tokens, optionally against a
+// quota cap, linking to the full /app/usage page. Self-contained data fetch so
+// a usage-API failure cannot break the navbar - it just renders nothing.
+function sumTotalTokens(res) {
+  const rows = res?.buckets || res?.usage || (Array.isArray(res) ? res : []) // a bare array is accepted too
+  const usable = Array.isArray(rows) && rows.length > 0 // a null result hides the meter
+  return usable ? rows.reduce((sum, row) => sum + (row.total_tokens || 0), 0) : null
+}
+
+export default function TokenUsageMeter() {
+  const { t } = useTranslation('nav')
+  const navigate = useNavigate()
+  const [tokens, setTokens] = useState(null)
+  const [cap, setCap] = useState(null)
+
+  // On mount, load today's usage and the quota cap as two independent requests.
+  // Handlers become no-ops once the effect is cleaned up; a failure stores null.
+  useEffect(() => {
+    let active = true
+    const whenActive = (fn) => (arg) => { if (active) fn(arg) }
+    usageApi.getAdminUsage('day')
+      .then(whenActive(res => setTokens(sumTotalTokens(res))))
+      .catch(whenActive(() => setTokens(null)))
+    usageApi.getMyQuotas()
+      .then(whenActive(q => setCap(q?.token_limit || q?.tokens?.limit || null)))
+      .catch(whenActive(() => setCap(null)))
+    return () => { active = false }
+  }, [])
+
+  // No usage figure (not loaded, failed, or empty) -> render nothing.
+  if (tokens === null) return null
+
+  const fmt = (n) => Intl.NumberFormat().format(n)
+  const pct = cap ? Math.min(100, Math.round((tokens / cap) * 100)) : null
+  const capSuffix = cap ? ` / ${fmt(cap)}` : ''
+  return (
+    <button type="button" className="top-navbar__meter" title={t('topbar.usageDetail')}
+      onClick={() => navigate('/app/usage')}>
+      <span className="top-navbar__meter-label">
+        {t('topbar.tokensToday')}: {fmt(tokens)}{capSuffix}
+      </span>
+      {pct === null ? null : (
+        <span className="top-navbar__meter-bar"><i style={{ width: `${pct}%` }} /></span>
+      )}
+    </button>
+  )
+}
--- a/core/http/react-ui/src/components/nodes/AttentionCallout.jsx
+++ b/core/http/react-ui/src/components/nodes/AttentionCallout.jsx
@@ -0,0 +1,31 @@
+// Nodes-page banner: pending approvals (with an approve button) or, failing that, unhealthy/offline nodes.
+export default function AttentionCallout({ nodes = [], onApprove }) {
+  const pending = nodes.filter(n => n.status === 'pending')
+  const unhealthy = nodes.filter(n => n.status === 'unhealthy' || n.status === 'offline')
+  if (pending.length === 0 && unhealthy.length === 0) return null
+  // Pending nodes take precedence: the unhealthy banner only renders when nothing awaits approval.
+  if (pending.length > 0) {
+    const [first, ...rest] = pending
+    return (
+      <div className="attention-callout attention-callout--warn">
+        <span>
+          <i className="fas fa-exclamation-circle" aria-hidden="true" />{' '}
+          <strong>{pending.length} node{pending.length > 1 ? 's' : ''} awaiting approval</strong>
+          {' - '}{first.name}{rest.length > 0 ? ` +${rest.length} more` : ''}
+        </span>
+        <button type="button" className="btn btn-primary btn-sm" onClick={() => onApprove(first.id)}>
+          <i className="fas fa-check" aria-hidden="true" /> Approve {first.name}
+        </button>
+      </div>
+    )
+  }
+  return (
+    <div className="attention-callout attention-callout--error">
+      <span>
+        <i className="fas fa-exclamation-triangle" aria-hidden="true" />{' '}
+        <strong>{unhealthy.length} node{unhealthy.length > 1 ? 's' : ''} unhealthy</strong>
+        {' - '}{unhealthy.slice(0, 3).map(n => n.name).join(', ')}{unhealthy.length > 3 ? ` +${unhealthy.length - 3} more` : ''}
+      </span>
+    </div>
+  )
+}
--- a/core/http/react-ui/src/components/nodes/CapacityEditor.jsx
+++ b/core/http/react-ui/src/components/nodes/CapacityEditor.jsx
@@ -0,0 +1,196 @@
+import { useState, useEffect, useCallback } from 'react'
+import { nodesApi } from '../../utils/api'
+import LoadingSpinner from '../LoadingSpinner'
+
+/**
+ * Inline editor for a node's per-model replica capacity.
+ *
+ * UX intent: discoverable affordance (pencil icon) that opens an inline
+ * input - never a modal for a single field. Source-of-truth note is shown
+ * inline so operators understand a worker re-registration will overwrite
+ * their override; surfacing this in a tooltip would hide too important a
+ * caveat.
+ *
+ * `confirmShrink` is a hook the parent provides so the page can render its
+ * own confirm dialog (it has access to all nodes and can phrase the message
+ * with full context).
+ */
+export default function CapacityEditor({ node, loadedModelCounts, onUpdate, confirmShrink, addToast }) {
+  const current = node.max_replicas_per_model || 1
+  const isOverride = !!node.max_replicas_per_model_manually_set
+  const [editing, setEditing] = useState(false)
+  const [draft, setDraft] = useState(String(current))
+  const [saving, setSaving] = useState(false)
+  const [resetting, setResetting] = useState(false)
+
+  // Reset draft when current value changes (server response, etc.)
+  useEffect(() => {
+    if (!editing) setDraft(String(current))
+  }, [current, editing])
+
+  const cancel = useCallback(() => {
+    setEditing(false)
+    setDraft(String(current))
+  }, [current])
+
+  const save = useCallback(async () => {
+    const value = parseInt(draft, 10)
+    if (!Number.isFinite(value) || value < 1) {
+      addToast('Replica capacity must be 1 or higher', 'error')
+      return
+    }
+    if (value === current) {
+      setEditing(false)
+      return
+    }
+    // Reducing the cap below current loaded replicas: confirm so the operator
+    // sees the consequence (running replicas keep going until idle eviction).
+    const maxLoadedAcrossModels = Math.max(0, ...Object.values(loadedModelCounts || {}))
+    if (value < maxLoadedAcrossModels) {
+      const proceed = await confirmShrink({ node, newValue: value, currentLoaded: maxLoadedAcrossModels })
+      if (!proceed) return
+    }
+    setSaving(true)
+    try {
+      await nodesApi.updateMaxReplicasPerModel(node.id, value)
+      addToast(`Replica capacity set to ${value} on ${node.name}`, 'success')
+      setEditing(false)
+      onUpdate?.(value)
+    } catch (err) {
+      addToast(`Could not change replica capacity: ${err.message || err}`, 'error')
+    } finally {
+      setSaving(false)
+    }
+  }, [draft, current, node, loadedModelCounts, confirmShrink, onUpdate, addToast])
+
+  const onKeyDown = (e) => {
+    if (e.key === 'Enter') { e.preventDefault(); save() }
+    else if (e.key === 'Escape') { e.preventDefault(); cancel() }
+  }
+
+  const reset = useCallback(async () => {
+    setResetting(true)
+    try {
+      await nodesApi.resetMaxReplicasPerModel(node.id)
+      addToast(`Override cleared on ${node.name}; worker flag will apply on next re-registration`, 'success')
+      onUpdate?.(null)
+    } catch (err) {
+      addToast(`Could not reset override: ${err.message || err}`, 'error')
+    } finally {
+      setResetting(false)
+    }
+  }, [node, onUpdate, addToast])
+
+  return (
+    <div style={{
+      display: 'flex', alignItems: 'flex-start', gap: 'var(--spacing-md)',
+    }}>
+      <i className="fas fa-layer-group" style={{ color: 'var(--color-text-muted)', marginTop: 3 }} aria-hidden="true" />
+      <div style={{ flex: 1, minWidth: 0 }}>
+        <div style={{ display: 'flex', alignItems: 'center', gap: 'var(--spacing-sm)', flexWrap: 'wrap' }}>
+          <label
+            htmlFor={`capacity-${node.id}`}
+            style={{ fontSize: '0.8125rem', fontWeight: 600, color: 'var(--color-text-primary)' }}
+          >
+            Max replicas per model
+          </label>
+          {editing ? (
+            <>
+              <input
+                id={`capacity-${node.id}`}
+                type="number"
+                min={1}
+                value={draft}
+                disabled={saving}
+                onChange={(e) => setDraft(e.target.value)}
+                onKeyDown={onKeyDown}
+                autoFocus
+                aria-describedby={`capacity-hint-${node.id}`}
+                style={{
+                  width: 72, padding: '4px 8px', borderRadius: 'var(--radius-sm)',
+                  border: '1px solid var(--color-border)', background: 'var(--color-bg-primary)',
+                  fontFamily: 'var(--font-mono)', fontSize: '0.8125rem',
+                  color: 'var(--color-text-primary)',
+                }}
+              />
+              <button
+                className="btn btn-primary btn-sm"
+                onClick={save}
+                disabled={saving}
+                style={{ minHeight: 32 }}
+                aria-label="Save replica capacity"
+              >
+                {saving ? <LoadingSpinner size="xs" /> : <><i className="fas fa-check" /> Save</>}
+              </button>
+              <button
+                className="btn btn-secondary btn-sm"
+                onClick={cancel}
+                disabled={saving}
+                style={{ minHeight: 32 }}
+                aria-label="Cancel"
+              >
+                Cancel
+              </button>
+            </>
+          ) : (
+            <>
+              <span
+                className="cell-mono"
+                style={{ fontSize: '0.8125rem', color: 'var(--color-text-secondary)' }}
+              >
+                {current}
+              </span>
+              {isOverride && (
+                <span
+                  title="This value was set from the UI. It will persist across worker restarts until you click Reset."
+                  style={{
+                    display: 'inline-block', fontSize: '0.6875rem', padding: '1px 6px',
+                    borderRadius: 'var(--radius-sm)', fontWeight: 500,
+                    background: 'var(--color-bg-primary)',
+                    border: '1px solid var(--color-warning, #d97706)',
+                    color: 'var(--color-warning, #d97706)',
+                  }}
+                >
+                  override
+                </span>
+              )}
+              <button
+                onClick={() => setEditing(true)}
+                aria-label={`Edit replica capacity (currently ${current})`}
+                title="Change replica capacity for this node"
+                style={{
+                  display: 'inline-flex', alignItems: 'center', justifyContent: 'center',
+                  minWidth: 32, minHeight: 32, padding: 4, borderRadius: 'var(--radius-sm)',
+                  border: '1px solid var(--color-border-subtle)',
+                  background: 'transparent', color: 'var(--color-text-muted)', cursor: 'pointer',
+                }}
+              >
+                <i className="fas fa-pencil-alt" />
+              </button>
+              {isOverride && (
+                <button
+                  onClick={reset}
+                  disabled={resetting}
+                  aria-label="Clear admin override and let the worker flag apply"
+                  title="Clear override; the worker's --max-replicas-per-model flag will apply on the next re-registration"
+                  className="btn btn-secondary btn-sm"
+                  style={{ minHeight: 32 }}
+                >
+                  {resetting ? <LoadingSpinner size="xs" /> : <><i className="fas fa-undo" /> Reset</>}
+                </button>
+              )}
+            </>
+          )}
+        </div>
+        <div
+          id={`capacity-hint-${node.id}`}
+          style={{ fontSize: '0.75rem', color: 'var(--color-text-muted)', marginTop: 4, lineHeight: 1.4 }}
+        >
+          {isOverride
+            ? <>Set from here. <strong>Reset</strong> to use the worker's default.</>
+            : <>Saved values stick across worker restarts.</>}
+        </div>
+      </div>
+    </div>
+  )
+}
--- a/Show More
+++ b/Show More