docs(readme): announce native voice-detect + face-detect backends in Latest News

Add a Latest News entry for the new from-scratch C++/ggml biometric backends (voice-detect.cpp + face-detect.cpp) that replace the Python insightface and speaker-recognition backends: no Python/onnxruntime at inference, self-contained GGUF, bit-exact parity, GPU cuDNN parity. Mirrors the parakeet.cpp / locate-anything.cpp native-backend news entries.

Refs PR #10441.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]
chore(recon): bump voice-detect pin to ERes2Net blocked-default (30beecd)
2026-06-25 00:59:28 -04:00 · 2026-06-24 22:14:41 +00:00 · 2026-06-24 19:51:03 +00:00 · 2026-06-24 15:54:12 +00:00 · 2026-06-24 15:39:42 +00:00 · 2026-06-24 13:03:48 +00:00
194 changed files with 4963 additions and 5833 deletions
--- a/.github/backend-matrix.yml
+++ b/.github/backend-matrix.yml
@@ -3723,6 +3723,302 @@ include:
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  # voice-detect
  - build-type: 'cublas'
    cuda-major-version: "12"
    cuda-minor-version: "8"
    platforms: 'linux/amd64'
    tag-latest: 'auto'
    tag-suffix: '-gpu-nvidia-cuda-12-voice-detect'
    runs-on: 'ubuntu-latest'
    base-image: "ubuntu:24.04"
    skip-drivers: 'false'
    backend: "voice-detect"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  - build-type: 'cublas'
    cuda-major-version: "13"
    cuda-minor-version: "0"
    platforms: 'linux/amd64'
    tag-latest: 'auto'
    tag-suffix: '-gpu-nvidia-cuda-13-voice-detect'
    runs-on: 'ubuntu-latest'
    base-image: "ubuntu:24.04"
    skip-drivers: 'false'
    backend: "voice-detect"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  - build-type: 'cublas'
    cuda-major-version: "13"
    cuda-minor-version: "0"
    platforms: 'linux/arm64'
    skip-drivers: 'false'
    tag-latest: 'auto'
    tag-suffix: '-nvidia-l4t-cuda-13-arm64-voice-detect'
    base-image: "ubuntu:24.04"
    ubuntu-version: '2404'
    runs-on: 'ubuntu-24.04-arm'
    backend: "voice-detect"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
  - build-type: ''
    cuda-major-version: ""
    cuda-minor-version: ""
    platforms: 'linux/amd64'
    platform-tag: 'amd64'
    tag-latest: 'auto'
    tag-suffix: '-cpu-voice-detect'
    runs-on: 'ubuntu-latest'
    base-image: "ubuntu:24.04"
    skip-drivers: 'false'
    backend: "voice-detect"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  - build-type: ''
    cuda-major-version: ""
    cuda-minor-version: ""
    platforms: 'linux/arm64'
    platform-tag: 'arm64'
    tag-latest: 'auto'
    tag-suffix: '-cpu-voice-detect'
    runs-on: 'ubuntu-24.04-arm'
    base-image: "ubuntu:24.04"
    skip-drivers: 'false'
    backend: "voice-detect"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  - build-type: 'sycl_f32'
    cuda-major-version: ""
    cuda-minor-version: ""
    platforms: 'linux/amd64'
    tag-latest: 'auto'
    tag-suffix: '-gpu-intel-sycl-f32-voice-detect'
    runs-on: 'ubuntu-latest'
    base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
    skip-drivers: 'false'
    backend: "voice-detect"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  - build-type: 'sycl_f16'
    cuda-major-version: ""
    cuda-minor-version: ""
    platforms: 'linux/amd64'
    tag-latest: 'auto'
    tag-suffix: '-gpu-intel-sycl-f16-voice-detect'
    runs-on: 'ubuntu-latest'
    base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
    skip-drivers: 'false'
    backend: "voice-detect"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  - build-type: 'vulkan'
    cuda-major-version: ""
    cuda-minor-version: ""
    platforms: 'linux/amd64'
    platform-tag: 'amd64'
    tag-latest: 'auto'
    tag-suffix: '-gpu-vulkan-voice-detect'
    runs-on: 'ubuntu-latest'
    base-image: "ubuntu:24.04"
    skip-drivers: 'false'
    backend: "voice-detect"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  - build-type: 'vulkan'
    cuda-major-version: ""
    cuda-minor-version: ""
    platforms: 'linux/arm64'
    platform-tag: 'arm64'
    tag-latest: 'auto'
    tag-suffix: '-gpu-vulkan-voice-detect'
    runs-on: 'ubuntu-24.04-arm'
    base-image: "ubuntu:24.04"
    skip-drivers: 'false'
    backend: "voice-detect"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  - build-type: 'cublas'
    cuda-major-version: "12"
    cuda-minor-version: "0"
    platforms: 'linux/arm64'
    skip-drivers: 'false'
    tag-latest: 'auto'
    tag-suffix: '-nvidia-l4t-arm64-voice-detect'
    base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
    runs-on: 'ubuntu-24.04-arm'
    backend: "voice-detect"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2204'
  - build-type: 'hipblas'
    cuda-major-version: ""
    cuda-minor-version: ""
    platforms: 'linux/amd64'
    tag-latest: 'auto'
    tag-suffix: '-gpu-rocm-hipblas-voice-detect'
    base-image: "rocm/dev-ubuntu-24.04:7.2.1"
    runs-on: 'ubuntu-latest'
    skip-drivers: 'false'
    backend: "voice-detect"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  # face-detect
  - build-type: 'cublas'
    cuda-major-version: "12"
    cuda-minor-version: "8"
    platforms: 'linux/amd64'
    tag-latest: 'auto'
    tag-suffix: '-gpu-nvidia-cuda-12-face-detect'
    runs-on: 'ubuntu-latest'
    base-image: "ubuntu:24.04"
    skip-drivers: 'false'
    backend: "face-detect"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  - build-type: 'cublas'
    cuda-major-version: "13"
    cuda-minor-version: "0"
    platforms: 'linux/amd64'
    tag-latest: 'auto'
    tag-suffix: '-gpu-nvidia-cuda-13-face-detect'
    runs-on: 'ubuntu-latest'
    base-image: "ubuntu:24.04"
    skip-drivers: 'false'
    backend: "face-detect"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  - build-type: 'cublas'
    cuda-major-version: "13"
    cuda-minor-version: "0"
    platforms: 'linux/arm64'
    skip-drivers: 'false'
    tag-latest: 'auto'
    tag-suffix: '-nvidia-l4t-cuda-13-arm64-face-detect'
    base-image: "ubuntu:24.04"
    ubuntu-version: '2404'
    runs-on: 'ubuntu-24.04-arm'
    backend: "face-detect"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
  - build-type: ''
    cuda-major-version: ""
    cuda-minor-version: ""
    platforms: 'linux/amd64'
    platform-tag: 'amd64'
    tag-latest: 'auto'
    tag-suffix: '-cpu-face-detect'
    runs-on: 'ubuntu-latest'
    base-image: "ubuntu:24.04"
    skip-drivers: 'false'
    backend: "face-detect"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  - build-type: ''
    cuda-major-version: ""
    cuda-minor-version: ""
    platforms: 'linux/arm64'
    platform-tag: 'arm64'
    tag-latest: 'auto'
    tag-suffix: '-cpu-face-detect'
    runs-on: 'ubuntu-24.04-arm'
    base-image: "ubuntu:24.04"
    skip-drivers: 'false'
    backend: "face-detect"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  - build-type: 'sycl_f32'
    cuda-major-version: ""
    cuda-minor-version: ""
    platforms: 'linux/amd64'
    tag-latest: 'auto'
    tag-suffix: '-gpu-intel-sycl-f32-face-detect'
    runs-on: 'ubuntu-latest'
    base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
    skip-drivers: 'false'
    backend: "face-detect"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  - build-type: 'sycl_f16'
    cuda-major-version: ""
    cuda-minor-version: ""
    platforms: 'linux/amd64'
    tag-latest: 'auto'
    tag-suffix: '-gpu-intel-sycl-f16-face-detect'
    runs-on: 'ubuntu-latest'
    base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
    skip-drivers: 'false'
    backend: "face-detect"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  - build-type: 'vulkan'
    cuda-major-version: ""
    cuda-minor-version: ""
    platforms: 'linux/amd64'
    platform-tag: 'amd64'
    tag-latest: 'auto'
    tag-suffix: '-gpu-vulkan-face-detect'
    runs-on: 'ubuntu-latest'
    base-image: "ubuntu:24.04"
    skip-drivers: 'false'
    backend: "face-detect"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  - build-type: 'vulkan'
    cuda-major-version: ""
    cuda-minor-version: ""
    platforms: 'linux/arm64'
    platform-tag: 'arm64'
    tag-latest: 'auto'
    tag-suffix: '-gpu-vulkan-face-detect'
    runs-on: 'ubuntu-24.04-arm'
    base-image: "ubuntu:24.04"
    skip-drivers: 'false'
    backend: "face-detect"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  - build-type: 'cublas'
    cuda-major-version: "12"
    cuda-minor-version: "0"
    platforms: 'linux/arm64'
    skip-drivers: 'false'
    tag-latest: 'auto'
    tag-suffix: '-nvidia-l4t-arm64-face-detect'
    base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
    runs-on: 'ubuntu-24.04-arm'
    backend: "face-detect"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2204'
  - build-type: 'hipblas'
    cuda-major-version: ""
    cuda-minor-version: ""
    platforms: 'linux/amd64'
    tag-latest: 'auto'
    tag-suffix: '-gpu-rocm-hipblas-face-detect'
    base-image: "rocm/dev-ubuntu-24.04:7.2.1"
    runs-on: 'ubuntu-latest'
    skip-drivers: 'false'
    backend: "face-detect"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  # acestep-cpp
  - build-type: ''
    cuda-major-version: ""
@@ -4906,6 +5202,14 @@ includeDarwin:
    tag-suffix: "-metal-darwin-arm64-ced"
    build-type: "metal"
    lang: "go"
  - backend: "voice-detect"
    tag-suffix: "-metal-darwin-arm64-voice-detect"
    build-type: "metal"
    lang: "go"
  - backend: "face-detect"
    tag-suffix: "-metal-darwin-arm64-face-detect"
    build-type: "metal"
    lang: "go"
  - backend: "acestep-cpp"
    tag-suffix: "-metal-darwin-arm64-acestep-cpp"
    build-type: "metal"
@@ -4974,9 +5278,6 @@ includeDarwin:
  - backend: "kitten-tts"
    tag-suffix: "-metal-darwin-arm64-kitten-tts"
    build-type: "mps"
  - backend: "liquid-audio"
    tag-suffix: "-metal-darwin-arm64-liquid-audio"
    build-type: "mps"
  - backend: "piper"
    tag-suffix: "-metal-darwin-arm64-piper"
    build-type: "metal"
@@ -4993,10 +5294,6 @@ includeDarwin:
    tag-suffix: "-metal-darwin-arm64-sherpa-onnx"
    build-type: "metal"
    lang: "go"
  - backend: "supertonic"
    tag-suffix: "-metal-darwin-arm64-supertonic"
    build-type: "metal"
    lang: "go"
  - backend: "local-store"
    tag-suffix: "-metal-darwin-arm64-local-store"
    build-type: "metal"
--- a/.github/workflows/backend.yml
+++ b/.github/workflows/backend.yml
@@ -44,7 +44,7 @@ jobs:
      has-merges-singlearch: ${{ steps.set-matrix.outputs['has-merges-singlearch'] }}
    steps:
      - name: Checkout repository
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
      - name: Setup Bun
        uses: oven-sh/setup-bun@v2
--- a/.github/workflows/backend_build.yml
+++ b/.github/workflows/backend_build.yml
@@ -101,7 +101,7 @@ jobs:
    steps:
      - name: Checkout
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        with:
          submodules: true
--- a/.github/workflows/backend_build_darwin.yml
+++ b/.github/workflows/backend_build_darwin.yml
@@ -57,7 +57,7 @@ jobs:
      HOMEBREW_NO_ANALYTICS: '1'
    steps:
      - name: Clone
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        with:
          submodules: true
--- a/.github/workflows/backend_merge.yml
+++ b/.github/workflows/backend_merge.yml
@@ -49,7 +49,7 @@ jobs:
      # Sparse checkout: the merge job needs `.github/scripts/` (for the
      # keepalive cleanup script) but none of the source tree.
      - name: Checkout (.github/scripts only)
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        with:
          sparse-checkout: |
            .github/scripts
--- a/.github/workflows/backend_pr.yml
+++ b/.github/workflows/backend_pr.yml
@@ -23,7 +23,7 @@ jobs:
      has-merges-singlearch: ${{ steps.set-matrix.outputs['has-merges-singlearch'] }}
    steps:
      - name: Checkout repository
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
      - name: Setup Bun
        uses: oven-sh/setup-bun@v2
--- a/.github/workflows/base-images.yml
+++ b/.github/workflows/base-images.yml
@@ -127,7 +127,7 @@ jobs:
            # the original l4t matrix entry which set skip-drivers: 'true'.
            skip-drivers: 'true'
    steps:
-      - uses: actions/checkout@v7
+      - uses: actions/checkout@v6
        with:
          submodules: false
      - name: Free disk space
--- a/.github/workflows/build-test.yaml
+++ b/.github/workflows/build-test.yaml
@@ -11,7 +11,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        with:
          fetch-depth: 0
      - name: Set up Go
@@ -25,7 +25,7 @@ jobs:
    runs-on: macos-latest
    steps:
      - name: Checkout
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        with:
          fetch-depth: 0
      - name: Set up Go
@@ -47,7 +47,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        with:
          fetch-depth: 0
      - name: Configure apt mirror on runner
--- a/.github/workflows/bump-inference-defaults.yml
+++ b/.github/workflows/bump-inference-defaults.yml
@@ -14,7 +14,7 @@ jobs:
  bump:
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v7
+      - uses: actions/checkout@v6
      - uses: actions/setup-go@v5
        with:
--- a/.github/workflows/bump_deps.yaml
+++ b/.github/workflows/bump_deps.yaml
@@ -46,6 +46,14 @@ jobs:
            variable: "CED_VERSION"
            branch: "master"
            file: "backend/go/ced/Makefile"
          - repository: "mudler/voice-detect.cpp"
            variable: "VOICEDETECT_VERSION"
            branch: "master"
            file: "backend/go/voice-detect/Makefile"
          - repository: "mudler/face-detect.cpp"
            variable: "FACEDETECT_VERSION"
            branch: "master"
            file: "backend/go/face-detect/Makefile"
          - repository: "mudler/depth-anything.cpp"
            variable: "DEPTHANYTHING_VERSION"
            branch: "master"
@@ -92,7 +100,7 @@ jobs:
            file: "backend/go/vibevoice-cpp/Makefile"
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v7
+      - uses: actions/checkout@v6
      - name: Bump dependencies 🔧
        id: bump
        run: |
@@ -128,7 +136,7 @@ jobs:
    if: github.repository == 'mudler/LocalAI'
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v7
+      - uses: actions/checkout@v6
      - name: Bump vLLM cu130 wheel pin 🔧
        id: bump
        run: |
--- a/.github/workflows/bump_docs.yaml
+++ b/.github/workflows/bump_docs.yaml
@@ -13,7 +13,7 @@ jobs:
          - repository: "mudler/LocalAI"
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v7
+      - uses: actions/checkout@v6
      - name: Bump dependencies 🔧
        run: |
          bash .github/bump_docs.sh ${{ matrix.repository }}
--- a/.github/workflows/checksum_checker.yaml
+++ b/.github/workflows/checksum_checker.yaml
@@ -8,7 +8,7 @@ jobs:
    if: github.repository == 'mudler/LocalAI'
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v7
+      - uses: actions/checkout@v6
      - name: Configure apt mirror on runner
        uses: ./.github/actions/configure-apt-mirror
      - name: Install dependencies
--- a/.github/workflows/deploy-explorer.yaml
+++ b/.github/workflows/deploy-explorer.yaml
@@ -16,7 +16,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        with:
          submodules: true
      - uses: actions/setup-go@v5
--- a/.github/workflows/gallery-agent.yaml
+++ b/.github/workflows/gallery-agent.yaml
@@ -31,7 +31,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        with:
          token: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/generate_intel_image.yaml
+++ b/.github/workflows/generate_intel_image.yaml
@@ -44,7 +44,7 @@ jobs:
        uses: docker/setup-buildx-action@master
      - name: Checkout
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
      - name: Cache Intel images
        uses: docker/build-push-action@v7
--- a/.github/workflows/gh-pages.yml
+++ b/.github/workflows/gh-pages.yml
@@ -28,7 +28,7 @@ jobs:
      HUGO_VERSION: "0.146.3"
    steps:
      - name: Checkout
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        with:
          fetch-depth: 0  # needed for enableGitInfo
          submodules: true
--- a/.github/workflows/image_build.yml
+++ b/.github/workflows/image_build.yml
@@ -80,7 +80,7 @@ jobs:
    steps:
      - name: Checkout
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
      - name: Configure apt mirror on runner
        id: apt_mirror
--- a/.github/workflows/image_merge.yml
+++ b/.github/workflows/image_merge.yml
@@ -36,7 +36,7 @@ jobs:
      # Sparse checkout: needed for .github/scripts/ (the keepalive cleanup
      # script). Skips the rest of the source tree.
      - name: Checkout (.github/scripts only)
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        with:
          sparse-checkout: |
            .github/scripts
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -20,7 +20,7 @@ jobs:
  golangci-lint:
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v7
+      - uses: actions/checkout@v6
        with:
          # Full history so golangci-lint's new-from-merge-base can reach
          # origin/master and compute the diff against it.
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -10,7 +10,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        with:
          fetch-depth: 0
      - name: Set up Go
@@ -28,7 +28,7 @@ jobs:
    runs-on: macos-latest
    steps:
      - name: Checkout
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        with:
          fetch-depth: 0
      - name: Set up Go
@@ -46,7 +46,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        with:
          fetch-depth: 0
      - name: Configure apt mirror on runner
--- a/.github/workflows/secscan.yaml
+++ b/.github/workflows/secscan.yaml
@@ -14,7 +14,7 @@ jobs:
      GO111MODULE: on
    steps:
      - name: Checkout Source
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        if: ${{ github.actor != 'dependabot[bot]' }}
      - name: Run Gosec Security Scanner
        if: ${{ github.actor != 'dependabot[bot]' }}
--- a/.github/workflows/test-extra.yml
+++ b/.github/workflows/test-extra.yml
@@ -50,7 +50,7 @@ jobs:
      parakeet-cpp: ${{ steps.detect.outputs.parakeet-cpp }}
    steps:
      - name: Checkout repository
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
      - name: Setup Bun
        uses: oven-sh/setup-bun@v2
      - name: Install dependencies
@@ -67,7 +67,7 @@ jobs:
  #   runs-on: ubuntu-latest
  #   steps:
  #     - name: Clone
-  #       uses: actions/checkout@v7
+  #       uses: actions/checkout@v6
  #       with:
  #         submodules: true
  #     - name: Dependencies
@@ -90,7 +90,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        with:
          submodules: true
      - name: Dependencies
@@ -113,7 +113,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        with:
          submodules: true
      - name: Dependencies
@@ -137,7 +137,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        with:
          submodules: true
      - name: Dependencies
@@ -158,7 +158,7 @@ jobs:
  #  runs-on: ubuntu-latest
  #  steps:
  #    - name: Clone
-  #      uses: actions/checkout@v7
+  #      uses: actions/checkout@v6
  #      with:
  #        submodules: true
  #    - name: Dependencies
@@ -178,7 +178,7 @@ jobs:
  #   runs-on: ubuntu-latest
  #   steps:
  #     - name: Clone
-  #       uses: actions/checkout@v7
+  #       uses: actions/checkout@v6
  #       with:
  #         submodules: true
  #     - name: Dependencies
@@ -240,7 +240,7 @@ jobs:
  #           sudo rm -rf "$AGENT_TOOLSDIRECTORY" || true
  #           df -h
  #     - name: Clone
-  #       uses: actions/checkout@v7
+  #       uses: actions/checkout@v6
  #       with:
  #         submodules: true
  #     - name: Dependencies
@@ -265,7 +265,7 @@ jobs:
  #   runs-on: ubuntu-latest
  #   steps:
  #     - name: Clone
-  #       uses: actions/checkout@v7
+  #       uses: actions/checkout@v6
  #       with:
  #         submodules: true
  #     - name: Dependencies
@@ -288,7 +288,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        with:
          submodules: true
      - name: Dependencies
@@ -309,7 +309,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        with:
          submodules: true
      - name: Dependencies
@@ -330,7 +330,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        with:
          submodules: true
      - name: Dependencies
@@ -351,7 +351,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        with:
          submodules: true
      - name: Dependencies
@@ -373,7 +373,7 @@ jobs:
  #   timeout-minutes: 45
  #   steps:
  #     - name: Clone
-  #       uses: actions/checkout@v7
+  #       uses: actions/checkout@v6
  #       with:
  #         submodules: true
  #     - name: Dependencies
@@ -394,7 +394,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        with:
          submodules: true
      - name: Dependencies
@@ -415,7 +415,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        with:
          submodules: true
      - name: Dependencies
@@ -436,7 +436,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        with:
          submodules: true
      - name: Dependencies
@@ -462,7 +462,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        with:
          submodules: true
      - name: Dependencies
@@ -484,7 +484,7 @@ jobs:
    timeout-minutes: 30
    steps:
      - name: Clone
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        with:
          submodules: true
      - name: Dependencies
@@ -513,7 +513,7 @@ jobs:
    timeout-minutes: 90
    steps:
      - name: Clone
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        with:
          submodules: true
      - name: Setup Go
@@ -530,7 +530,7 @@ jobs:
    timeout-minutes: 90
    steps:
      - name: Clone
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        with:
          submodules: true
      - name: Setup Go
@@ -552,7 +552,7 @@ jobs:
    timeout-minutes: 20
    steps:
      - name: Clone
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        with:
          submodules: true
      - name: Setup Go
@@ -579,7 +579,7 @@ jobs:
    timeout-minutes: 90
    steps:
      - name: Clone
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        with:
          submodules: true
      - name: Setup Go
@@ -604,7 +604,7 @@ jobs:
    timeout-minutes: 90
    steps:
      - name: Clone
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        with:
          submodules: true
      - name: Setup Go
@@ -625,7 +625,7 @@ jobs:
    timeout-minutes: 90
    steps:
      - name: Clone
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        with:
          submodules: true
      - name: Setup Go
@@ -645,7 +645,7 @@ jobs:
    timeout-minutes: 90
    steps:
      - name: Clone
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        with:
          submodules: true
      - name: Setup Go
@@ -664,7 +664,7 @@ jobs:
    timeout-minutes: 90
    steps:
      - name: Clone
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        with:
          submodules: true
      - name: Setup Go
@@ -681,7 +681,7 @@ jobs:
    timeout-minutes: 90
    steps:
      - name: Clone
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        with:
          submodules: true
      - name: Setup Go
@@ -698,7 +698,7 @@ jobs:
    timeout-minutes: 90
    steps:
      - name: Clone
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        with:
          submodules: true
      - name: Setup Go
@@ -741,7 +741,7 @@ jobs:
  #   timeout-minutes: 90
  #   steps:
  #     - name: Clone
-  #       uses: actions/checkout@v7
+  #       uses: actions/checkout@v6
  #       with:
  #         submodules: true
  #     - name: Dependencies
@@ -783,7 +783,7 @@ jobs:
  #   timeout-minutes: 90
  #   steps:
  #     - name: Clone
-  #       uses: actions/checkout@v7
+  #       uses: actions/checkout@v6
  #       with:
  #         submodules: true
  #     - name: Dependencies
@@ -808,7 +808,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        with:
          submodules: true
      - name: Dependencies
@@ -840,7 +840,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        with:
          submodules: true
      - name: Dependencies
@@ -876,7 +876,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        with:
          submodules: true
      - name: Dependencies
@@ -915,7 +915,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        with:
          submodules: true
      - name: Dependencies
@@ -952,7 +952,7 @@ jobs:
    timeout-minutes: 90
    steps:
      - name: Clone
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        with:
          submodules: true
      - name: Dependencies
@@ -987,7 +987,7 @@ jobs:
    timeout-minutes: 90
    steps:
      - name: Clone
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        with:
          submodules: true
      - name: Setup Go
@@ -1013,7 +1013,7 @@ jobs:
    timeout-minutes: 150
    steps:
      - name: Clone
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        with:
          submodules: true
      - name: Dependencies
@@ -1042,7 +1042,7 @@ jobs:
    timeout-minutes: 60
    steps:
      - name: Clone
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        with:
          submodules: true
      - name: Setup Go
@@ -1058,7 +1058,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        with:
          submodules: true
      - name: Dependencies
@@ -1091,7 +1091,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        with:
          submodules: true
      - name: Dependencies
@@ -1114,7 +1114,7 @@ jobs:
    timeout-minutes: 90
    steps:
      - name: Clone
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        with:
          submodules: true
      - name: Dependencies
@@ -1140,7 +1140,7 @@ jobs:
    timeout-minutes: 90
    steps:
      - name: Clone
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        with:
          submodules: true
      - name: Dependencies
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -21,7 +21,7 @@ jobs:
        go-version: ['1.26.x']
    steps:
      - name: Clone
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        with:
          submodules: true
      - name: Free disk space
@@ -84,7 +84,7 @@ jobs:
        go-version: ['1.26.x']
    steps:
      - name: Clone
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        with:
          submodules: true
      - name: Setup Go ${{ matrix.go-version }}
--- a/.github/workflows/tests-aio.yml
+++ b/.github/workflows/tests-aio.yml
@@ -62,7 +62,7 @@ jobs:
          sudo rm -rfv build || true
          df -h
      - name: Clone
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        with:
          submodules: true
      - name: Dependencies
--- a/.github/workflows/tests-e2e.yml
+++ b/.github/workflows/tests-e2e.yml
@@ -21,7 +21,7 @@ jobs:
        go-version: ['1.25.x']
    steps:
      - name: Clone
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        with:
          submodules: true
      - name: Configure apt mirror on runner
--- a/.github/workflows/tests-pii-ner-e2e.yml
+++ b/.github/workflows/tests-pii-ner-e2e.yml
@@ -1,97 +0,0 @@
 ---
 name: 'PII NER tier E2E (live GGUF, CPU)'
 # Runs the real privacy-filter GGUF NER tier end-to-end on CPU — the gap the
 # hermetic tests/e2e suite cannot cover (it only exercises the in-process
 # pattern tier). Heavy (builds the C++ backend image + downloads a ~2.7 GB
 # GGUF), so it is path-filtered on PRs and otherwise runs nightly / on demand.
 #
 # This drives the container-level harness (tests/e2e-backends) via
 # `make test-extra-backend-privacy-filter`: it builds the privacy-filter image,
 # downloads the model, loads it on CPU, and asserts byte-correct, UTF-8-aligned
 # TokenClassify spans. The complementary HTTP-path specs in tests/e2e
 # (e2e_pii_ner_test.go) Skip unless PII_NER_MODEL_GGUF is wired.
 on:
  workflow_dispatch:
  schedule:
    - cron: '0 3 * * *'
  push:
    branches:
      - master
    paths:
      - 'backend/cpp/privacy-filter/**'
      - 'backend/Dockerfile.privacy-filter'
      - 'core/services/routing/pii/**'
      - 'core/services/routing/piidetector/**'
      - 'core/backend/token_classify.go'
      - 'core/http/endpoints/localai/pii.go'
      - 'core/schema/pii.go'
      - 'tests/e2e-backends/**'
      - 'tests/e2e/e2e_pii_ner_test.go'
      - 'tests/e2e/e2e_suite_test.go'
      - '.github/workflows/tests-pii-ner-e2e.yml'
  pull_request:
    paths:
      - 'backend/cpp/privacy-filter/**'
      - 'backend/Dockerfile.privacy-filter'
      - 'core/services/routing/pii/**'
      - 'core/services/routing/piidetector/**'
      - 'core/backend/token_classify.go'
      - 'core/http/endpoints/localai/pii.go'
      - 'core/schema/pii.go'
      - 'tests/e2e-backends/**'
      - 'tests/e2e/e2e_pii_ner_test.go'
      - 'tests/e2e/e2e_suite_test.go'
      - '.github/workflows/tests-pii-ner-e2e.yml'
 concurrency:
  group: ci-tests-pii-ner-e2e-${{ github.event.pull_request.number || github.sha }}-${{ github.repository }}
  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
 jobs:
  tests-pii-ner-e2e:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        go-version: ['1.25.x']
    steps:
      - name: Clone
        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Free disk space
        run: |
          sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache/CodeQL || true
          sudo docker image prune --all --force || true
          df -h
      - name: Configure apt mirror on runner
        uses: ./.github/actions/configure-apt-mirror
      - name: Setup Go ${{ matrix.go-version }}
        uses: actions/setup-go@v5
        with:
          go-version: ${{ matrix.go-version }}
          cache: false
      - name: Proto Dependencies
        run: |
          curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v26.1/protoc-26.1-linux-x86_64.zip -o protoc.zip && \
          unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
          rm protoc.zip
          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
          PATH="$PATH:$HOME/go/bin" make protogen-go
      - name: Dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y build-essential
      # Builds local-ai-backend:privacy-filter, downloads the GGUF, loads it on
      # CPU and runs the token_classify capability spec (byte-offset contract).
      - name: Run live PII NER backend E2E
        run: PATH="$PATH:$HOME/go/bin" make test-extra-backend-privacy-filter
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
        uses: mxschmitt/action-tmate@v3.23
        with:
          detached: true
          connect-timeout-seconds: 180
          limit-access-to-actor: true
--- a/.github/workflows/tests-ui-e2e.yml
+++ b/.github/workflows/tests-ui-e2e.yml
@@ -23,7 +23,7 @@ jobs:
        go-version: ['1.26.x']
    steps:
      - name: Clone
-        uses: actions/checkout@v7
+        uses: actions/checkout@v6
        with:
          submodules: true
      - name: Configure apt mirror on runner
--- a/.github/workflows/update_swagger.yaml
+++ b/.github/workflows/update_swagger.yaml
@@ -10,7 +10,7 @@ jobs:
      fail-fast: false
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v7
+      - uses: actions/checkout@v6
      - name: Configure apt mirror on runner
        uses: ./.github/actions/configure-apt-mirror
      - uses: actions/setup-go@v5
--- a/.gitignore
+++ b/.gitignore
@@ -91,6 +91,3 @@ core/http/react-ui/test-results/
 # Local worktrees
 .worktrees/
 # SDD / brainstorm scratch (agent-driven development)
 .superpowers/
--- a/10
+++ b/10
@@ -690,16 +690,6 @@ test-extra-backend-llama-cpp-transcription: docker-build-llama-cpp
 	BACKEND_TEST_CTX_SIZE=2048 \
 	$(MAKE) test-extra-backend
 ## privacy-filter: the PII/NER token-classification backend. Exercises the
 ## TokenClassify RPC and asserts byte-correct, UTF-8-aligned span offsets
 ## against the openai-privacy-filter multilingual GGUF (CPU-runnable, ~50M
 ## active params). This is the live-backend coverage for the PII NER tier.
 test-extra-backend-privacy-filter: docker-build-privacy-filter
 	BACKEND_IMAGE=local-ai-backend:privacy-filter \
 	BACKEND_TEST_MODEL_URL=https://huggingface.co/LocalAI-io/privacy-filter-multilingual-GGUF/resolve/main/privacy-filter-multilingual-f16.gguf \
 	BACKEND_TEST_CAPS=health,load,token_classify \
 	$(MAKE) test-extra-backend
 ## vllm is resolved from a HuggingFace model id (no file download) and
 ## exercises Predict + streaming + tool-call extraction via the hermes parser.
 ## Requires a host CPU with the SIMD instructions the prebuilt vllm CPU
--- a/README.md
+++ b/README.md
@@ -177,6 +177,7 @@ For more details, see the [Getting Started guide](https://localai.io/basics/gett
 ## Latest News
 - **June 2026**: New native biometric backends from the LocalAI team: [voice-detect.cpp](https://github.com/mudler/voice-detect.cpp) for speaker recognition and voice analysis (ECAPA-TDNN, WeSpeaker, ERes2Net, CAM++, wav2vec2 age/gender/emotion) and [face-detect.cpp](https://github.com/mudler/face-detect.cpp) for face detection, recognition, demographics and anti-spoofing (SCRFD/ArcFace, YuNet/SFace). Both are from-scratch C++/ggml engines with no Python or onnxruntime at inference, self-contained GGUF weights, bit-exact parity with the reference, and GPU cuDNN parity, replacing the heavier Python `insightface` and `speaker-recognition` backends ([PR #10441](https://github.com/mudler/LocalAI/pull/10441)).
 - **June 2026**: New [realtime voice assistant demo](https://github.com/localai-org/localai-realtime-demo) (a tiny Go client for the Realtime API with a full talk-back voice loop and tool calling), plus [streaming of the realtime LLM / TTS / transcription pipeline stages](https://github.com/mudler/LocalAI/pull/10176) and [configurable WebRTC ICE candidates](https://github.com/mudler/LocalAI/pull/10231).
 - **June 2026**: Big speech push: the [parakeet.cpp](https://github.com/mudler/parakeet.cpp) ASR engine gains [NeMo-faithful segment timestamps](https://github.com/mudler/LocalAI/pull/10207), a [multilingual streaming Nemotron-3.5 model](https://github.com/mudler/LocalAI/pull/10199), [dynamic batching for concurrent transcription](https://github.com/mudler/LocalAI/pull/10112) and [CUDA graphs](https://github.com/mudler/LocalAI/pull/10273); the new [CrispASR backend](https://github.com/mudler/LocalAI/pull/10099) adds multi-architecture ASR + TTS, and [60 Piper TTS voices across 42 languages](https://github.com/mudler/LocalAI/pull/10296) land in the gallery (plus [per-request TTS instructions and params](https://github.com/mudler/LocalAI/pull/10172)).
 - **June 2026**: New backends and models: [locate-anything.cpp](https://github.com/mudler/LocalAI/pull/10264) for open-vocabulary object detection via ggml, [Ideogram4 image generation](https://github.com/mudler/LocalAI/pull/10201) in stablediffusion-ggml, [llama.cpp video input](https://github.com/mudler/LocalAI/pull/10216), and the [Gemma 4 QAT family with MTP speculative-decoding pairs](https://github.com/mudler/LocalAI/pull/10215). Plus an [interactive CLI chat mode](https://github.com/mudler/LocalAI/pull/10226) and [RAG source citations in agent responses](https://github.com/mudler/LocalAI/pull/10228).
--- a/backend/Dockerfile.golang
+++ b/backend/Dockerfile.golang
@@ -137,7 +137,7 @@ RUN <<EOT bash
            libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
        if [ "${CUDA_MAJOR_VERSION}" = "13" ] && [ "arm64" = "$TARGETARCH" ]; then
            apt-get install -y --no-install-recommends \
-            libcufile-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcudnn9-cuda-${CUDA_MAJOR_VERSION} cuda-cupti-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libnvjitlink-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
+            libcufile-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcudnn9-cuda-${CUDA_MAJOR_VERSION} libcudnn9-dev-cuda-${CUDA_MAJOR_VERSION} cuda-cupti-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libnvjitlink-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
        fi
        apt-get clean && \
        rm -rf /var/lib/apt/lists/*
--- a/backend/cpp/ik-llama-cpp/Makefile
+++ b/backend/cpp/ik-llama-cpp/Makefile
@@ -1,5 +1,5 @@
-IK_LLAMA_VERSION?=d5507e33ae7ee2b7b41475f08044d3bde3b839ee
+IK_LLAMA_VERSION?=6c00e87ac84404af588ad2e65935bd6f079c696f
 LLAMA_REPO?=https://github.com/ikawrakow/ik_llama.cpp
 CMAKE_ARGS?=
--- a/backend/cpp/llama-cpp/Makefile
+++ b/backend/cpp/llama-cpp/Makefile
@@ -1,5 +1,5 @@
-LLAMA_VERSION?=be4a6a63eb2b848e19c277bdcf2bd399e8af76d9
+LLAMA_VERSION?=e475fa2b5f9fb50c3d6fc3e7c6fdf1e004465b62
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
 CMAKE_ARGS?=
--- a/backend/go/crispasr/Makefile
+++ b/backend/go/crispasr/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
 # CrispASR version (release tag)
 CRISPASR_REPO?=https://github.com/CrispStrobe/CrispASR
-CRISPASR_VERSION?=96b2a6ee31d30389fed8a7ef1a54239b75231ddc
+CRISPASR_VERSION?=d745bda4386ae0f9d1d2f23fff8ec95d76428221
 SO_TARGET?=libgocrispasr.so
 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
--- a/backend/go/face-detect/.gitignore
+++ b/backend/go/face-detect/.gitignore
@@ -0,0 +1,18 @@
 # Fetched upstream sources
 sources/
 # CMake build directories
 build*/
 # build artifacts staged in-tree by the Makefile (cp from sources/) or
 # symlinked for local dev; the real sources live in face-detect.cpp upstream.
 *.so
 *.so.*
 facedetect_capi.h
 compile_commands.json
 # Compiled backend binary
 face-detect-grpc
 # Packaging output
 package/
--- a/backend/go/face-detect/Makefile
+++ b/backend/go/face-detect/Makefile
@@ -0,0 +1,110 @@
 # face-detect backend Makefile.
 #
 # Upstream pin lives below as FACEDETECT_VERSION?=6107a24... (.github/bump_deps.sh
 # can find and update it - matches the voice-detect / parakeet.cpp / whisper.cpp
 # convention).
 #
 # Local dev shortcut: if you already have an out-of-tree face-detect.cpp build,
 # symlink the .so + header into this directory and skip the clone/cmake steps:
 #
 #   ln -sf /path/to/face-detect.cpp/build-shared/libfacedetect.so .
 #   ln -sf /path/to/face-detect.cpp/include/facedetect_capi.h .
 #   go build -o face-detect-grpc .
 #
 # The default target below does the proper clone-at-pin + cmake build so CI does
 # not need a side-checkout.
 FACEDETECT_VERSION?=6107a2414fdaccc9ce8650b762f9436d20541cbe
 FACEDETECT_REPO?=https://github.com/mudler/face-detect.cpp
 GOCMD?=go
 GO_TAGS?=
 JOBS?=$(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)
 BUILD_TYPE?=
 NATIVE?=false
 # Resolve the target arch. The backend matrix / Docker build pass TARGETARCH
 # (amd64|arm64); fall back to uname -m (aarch64|x86_64) for a local build.
 RECON_ARCH?=$(or $(TARGETARCH),$(shell uname -m))
 # Build ggml + the vendored libjpeg-turbo statically into libfacedetect.so (PIC)
 # so the shared lib is self-contained: dlopen needs no libggml*.so alongside it,
 # only system libs (libstdc++/libgomp/libc) the runtime image already provides.
 # The vendored jpeg symbols are hidden via -Wl,--exclude-libs,ALL on the C++
 # side, so only the facedetect_capi_* surface is exported.
 CMAKE_ARGS?=-DCMAKE_BUILD_TYPE=Release -DFACEDETECT_SHARED=ON -DFACEDETECT_BUILD_CLI=OFF -DFACEDETECT_BUILD_TESTS=OFF -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON
 ifeq ($(NATIVE),false)
 	CMAKE_ARGS+=-DGGML_NATIVE=OFF
 endif
 # face-detect.cpp gates its GGML backends behind FACEDETECT_GGML_* options and
 # does set(GGML_CUDA ${FACEDETECT_GGML_CUDA} CACHE BOOL "" FORCE), so a bare
 # -DGGML_CUDA=ON is overwritten back to OFF. Forward the FACEDETECT_GGML_*
 # options instead. (openblas is not gated, so -DGGML_BLAS passes through.)
 ifeq ($(BUILD_TYPE),cublas)
 	CMAKE_ARGS+=-DFACEDETECT_GGML_CUDA=ON
 	# Opt-in cuDNN implicit-GEMM conv path (kills im2col on GPU, SCRFD 2.3x
 	# vs torch-cuDNN parity). Only the arm64 + CUDA 13 image (GB10/Jetson/L4T)
 	# ships libcudnn9 + the -dev headers, so gate cuDNN to that variant.
 	# x86 CUDA images carry no cuDNN -> enabling it there is a link failure.
 	ifeq ($(CUDA_MAJOR_VERSION),13)
 	ifneq (,$(filter arm64 aarch64,$(RECON_ARCH)))
 		CMAKE_ARGS+=-DFACEDETECT_GGML_CUDNN=ON
 	endif
 	endif
 else ifeq ($(BUILD_TYPE),openblas)
 	CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
 else ifeq ($(BUILD_TYPE),hipblas)
 	CMAKE_ARGS+=-DFACEDETECT_GGML_HIP=ON
 else ifeq ($(BUILD_TYPE),vulkan)
 	CMAKE_ARGS+=-DFACEDETECT_GGML_VULKAN=ON
 else ifeq ($(BUILD_TYPE),metal)
 	CMAKE_ARGS+=-DFACEDETECT_GGML_METAL=ON
 endif
 .PHONY: face-detect-grpc package build clean purge test all
 all: face-detect-grpc
 # Clone the upstream face-detect.cpp source at the pinned commit. Directory acts
 # as the target so make only re-clones when missing. After a FACEDETECT_VERSION
 # bump, run 'make purge && make' to refetch.
 sources/face-detect.cpp:
 	mkdir -p sources/face-detect.cpp
 	cd sources/face-detect.cpp && \
 	git init -q && \
 	git remote add origin $(FACEDETECT_REPO) && \
 	git fetch --depth 1 origin $(FACEDETECT_VERSION) && \
 	git checkout FETCH_HEAD && \
 	git submodule update --init --recursive --depth 1 --single-branch
 # Build the shared lib + header out-of-tree, then stage them next to the Go
 # sources so purego.Dlopen("libfacedetect.so") and the cgo-less build both pick
 # them up.
 libfacedetect.so: sources/face-detect.cpp
 	cmake -B sources/face-detect.cpp/build-shared -S sources/face-detect.cpp $(CMAKE_ARGS)
 	cmake --build sources/face-detect.cpp/build-shared --config Release -j$(JOBS) --target facedetect
 	cp -fv sources/face-detect.cpp/build-shared/libfacedetect.so* ./ 2>/dev/null || true
 	cp -fv sources/face-detect.cpp/include/facedetect_capi.h ./
 face-detect-grpc: libfacedetect.so main.go gofacedetect.go options.go
 	CGO_ENABLED=0 $(GOCMD) build -tags "$(GO_TAGS)" -o face-detect-grpc .
 package: face-detect-grpc
 	bash package.sh
 build: package
 # Test target. The embed/detect/verify/analyze smoke specs are gated on
 # FACEDETECT_BACKEND_TEST_MODEL + FACEDETECT_BACKEND_TEST_IMAGE; without them the
 # heavy specs auto-skip and only the pure-Go parsing specs run.
 test:
 	LD_LIBRARY_PATH=$(CURDIR):$$LD_LIBRARY_PATH $(GOCMD) test ./... -count=1
 clean: purge
 	rm -rf libfacedetect.so* facedetect_capi.h package face-detect-grpc
 purge:
 	rm -rf sources/face-detect.cpp
--- a/backend/go/face-detect/gofacedetect.go
+++ b/backend/go/face-detect/gofacedetect.go
@@ -0,0 +1,431 @@
 package main
 import (
 	"encoding/base64"
 	"encoding/json"
 	"errors"
 	"fmt"
 	"math"
 	"os"
 	"path/filepath"
 	"strconv"
 	"strings"
 	"time"
 	"unsafe"
 	"github.com/mudler/LocalAI/pkg/grpc/base"
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 	"github.com/mudler/xlog"
 )
 // purego-bound entry points from libfacedetect.so. Names match
 // facedetect_capi.h exactly so a `nm libfacedetect.so | grep facedetect_capi`
 // is enough to spot drift.
 //
 // The opaque ctx and the malloc'd char*/float* return values are declared as
 // uintptr so we get the raw pointer back and can release it via the matching
 // capi free function. purego's native string/[]float32 returns would copy and
 // forget the original pointer, leaking the C-owned buffer on every call.
 //
 // Conventions, as the Go callers in this file treat them:
 //   - CppLoad yields 0 when the model could not be loaded.
 //   - The int32-returning calls are treated as successful only when they
 //     return 0.
 //   - CppDetectJSON / CppAnalyzeJSON yield 0 on failure, otherwise a
 //     NUL-terminated JSON document that is released with CppFreeString.
 //   - CppEmbedPath reports the vector pointer and its element count through
 //     outVec / outDim; the vector is released with CppFreeVec.
 var (
 	CppAbiVersion  func() int32
 	CppLoad        func(ggufPath string) uintptr
 	CppFree        func(ctx uintptr)
 	CppLastError   func(ctx uintptr) string
 	CppFreeString  func(s uintptr)
 	CppFreeVec     func(v uintptr)
 	CppEmbedPath   func(ctx uintptr, imagePath string, outVec, outDim unsafe.Pointer) int32
 	CppEmbedRGB    func(ctx uintptr, rgb []byte, width, height int32, outVec, outDim unsafe.Pointer) int32
 	CppDetectJSON  func(ctx uintptr, imagePath string) uintptr
 	CppVerifyPaths func(ctx uintptr, a, b string, threshold float32, antiSpoof int32, outDistance, outVerified unsafe.Pointer) int32
 	CppAnalyzeJSON func(ctx uintptr, imagePath string) uintptr
 )
 // FaceDetect implements the face-recognition (biometric) subset of the Backend
 // gRPC service over libfacedetect.so. The C side keeps a single loaded model
 // pack plus a per-ctx last-error buffer and is not reentrant, so
 // base.SingleThread serializes every call.
 type FaceDetect struct {
 	base.SingleThread
 	// opts holds the options parsed from ModelOptions.Options during Load.
 	opts   loadOptions
 	// ctxPtr is the opaque handle returned by CppLoad; every RPC method treats
 	// a zero value as "model not loaded".
 	ctxPtr uintptr
 }
 // Load resolves the model path, parses the backend options, applies the thread
 // budget and opens the model through the C-API.
 //
 // Path resolution: ModelFile is used when set and is joined onto ModelPath when
 // it is relative. When ModelFile is empty, ModelPath itself is taken as the
 // model location and is never joined onto itself. An error is returned when
 // both are empty, when FACEDETECT_THREADS cannot be exported, or when
 // facedetect_capi_load hands back a null context.
 //
 // A context left over from an earlier successful Load is released before the
 // new model is opened, so loading again does not leak the previous model.
 func (f *FaceDetect) Load(opts *pb.ModelOptions) error {
 	model := opts.ModelFile
 	if model == "" {
 		// Fall back to ModelPath as-is: joining it with itself would turn a
 		// relative "models" into "models/models".
 		model = opts.ModelPath
 	} else if !filepath.IsAbs(model) && opts.ModelPath != "" {
 		model = filepath.Join(opts.ModelPath, model)
 	}
 	if model == "" {
 		return errors.New("face-detect: ModelFile is required")
 	}
 	f.opts = parseOptions(opts.Options)
 	if f.opts.modelName == "" {
 		f.opts.modelName = filepath.Base(model)
 	}
 	// Propagate LocalAI's per-model thread budget to the engine. LocalAI spawns
 	// one backend process per model and serves requests concurrently, so the
 	// engine's own min(hardware_concurrency, 8) default can oversubscribe cores.
 	// FACEDETECT_THREADS is read by the engine at backend construction, so it
 	// must be set before the capi load. A non-positive Threads means "unset":
 	// leave the env alone so the engine keeps its sane default.
 	threads := opts.Threads
 	if threads > 0 {
 		if err := os.Setenv("FACEDETECT_THREADS", strconv.Itoa(int(threads))); err != nil {
 			return fmt.Errorf("face-detect: set FACEDETECT_THREADS: %w", err)
 		}
 		xlog.Info("face-detect: applying LocalAI thread budget", "threads", threads)
 	}
 	xlog.Info("face-detect: loading model", "model", model,
 		"verify_threshold", f.opts.verifyThreshold, "abi", CppAbiVersion())
 	// Drop any previously loaded context first; overwriting ctxPtr without
 	// freeing it would leak the whole model held by the C side.
 	if f.ctxPtr != 0 {
 		CppFree(f.ctxPtr)
 		f.ctxPtr = 0
 	}
 	ctx := CppLoad(model)
 	if ctx == 0 {
 		// The last-error buffer lives on the ctx that was never returned, so
 		// surface the path the operator tried to load instead.
 		return fmt.Errorf("face-detect: facedetect_capi_load failed for %q", model)
 	}
 	f.ctxPtr = ctx
 	return nil
 }
 // Embeddings computes the embedding of the primary face found in Images[0].
 // Like the Python face backend, the request carries the image as a base64
 // payload; materializeImage turns it into a file on disk so the path-based
 // C-API can do its own decoding (cv2.imread parity). The slice returned here is
 // wrapped into an EmbeddingResult by the gRPC server.
 func (f *FaceDetect) Embeddings(req *pb.PredictOptions) ([]float32, error) {
 	if f.ctxPtr == 0 {
 		return nil, errors.New("face-detect: model not loaded")
 	}
 	hasImage := len(req.Images) > 0 && req.Images[0] != ""
 	if !hasImage {
 		return nil, errors.New("face-detect: Embedding requires Images[0] to be a base64 image")
 	}
 	imgPath, release, err := materializeImage(req.Images[0])
 	if err != nil {
 		return nil, err
 	}
 	// Remove the temp file (if one was created) once the embedding is computed.
 	defer release()
 	return f.embedPath(imgPath)
 }
 // embedPath runs the C-API embedder on the image at path and returns a Go-owned
 // copy of the resulting vector. It fails with the C-API's last error when the
 // call reports a non-zero status, a null vector or a non-positive dimension.
 func (f *FaceDetect) embedPath(path string) ([]float32, error) {
 	var vec uintptr
 	var dim int32
 	rc := CppEmbedPath(f.ctxPtr, path, unsafe.Pointer(&vec), unsafe.Pointer(&dim))
 	// A successful call that handed back a buffer transfers ownership to us,
 	// so schedule the free before validating dim: otherwise a buffer paired
 	// with a non-positive dim would leak on the error return below.
 	if rc == 0 && vec != 0 {
 		defer CppFreeVec(vec)
 	}
 	if rc != 0 || vec == 0 || dim <= 0 {
 		return nil, f.lastErr("embed", path)
 	}
 	// Copy out of the C-owned malloc'd buffer before freeing it. The
 	// uintptr->Pointer conversion trips vet's unsafeptr check, which can't tell
 	// a C heap pointer from Go-managed memory; safe here, the GC neither tracks
 	// nor moves this buffer and we copy immediately.
 	src := unsafe.Slice((*float32)(unsafe.Pointer(vec)), int(dim)) //nolint:govet // C-owned malloc'd vector, copied out before free
 	out := make([]float32, int(dim))
 	copy(out, src)
 	return out, nil
 }
 // Detect runs face detection on req.Src and reports one Detection per face.
 // The engine describes a box by its corners, [x1,y1,x2,y2] in pixels, while the
 // proto wants x/y plus width/height, so each box is converted. The engine's 5
 // facial landmarks are discarded because the Detection message has nowhere to
 // put them. A positive req.Threshold filters out faces scoring below it.
 func (f *FaceDetect) Detect(req *pb.DetectOptions) (pb.DetectResponse, error) {
 	if f.ctxPtr == 0 {
 		return pb.DetectResponse{}, errors.New("face-detect: model not loaded")
 	}
 	if req.Src == "" {
 		return pb.DetectResponse{}, errors.New("face-detect: src image is required")
 	}
 	imgPath, release, err := materializeImage(req.Src)
 	if err != nil {
 		return pb.DetectResponse{}, err
 	}
 	defer release()
 	found, err := f.detectFaces(imgPath)
 	if err != nil {
 		return pb.DetectResponse{}, err
 	}
 	kept := make([]*pb.Detection, 0, len(found))
 	for i := range found {
 		face := found[i]
 		// Only a positive threshold filters; zero or negative keeps every face.
 		belowCutoff := req.Threshold > 0 && face.Score < req.Threshold
 		if belowCutoff {
 			continue
 		}
 		left, top, width, height := face.xywh()
 		det := &pb.Detection{
 			X:          left,
 			Y:          top,
 			Width:      width,
 			Height:     height,
 			Confidence: face.Score,
 			ClassName:  "face",
 		}
 		kept = append(kept, det)
 	}
 	return pb.DetectResponse{Detections: kept}, nil
 }
 // FaceVerify compares the primary faces of two images and reports whether they
 // belong to the same identity, judged by cosine distance against a threshold.
 // When the request threshold is <= 0 the model-configured default is used (the
 // verify_threshold option, 0.35 if unset). With anti_spoofing set, the C-API
 // applies a MiniFASNet veto internally and forces verified to false on a spoof;
 // the verify entry point does not expose per-image liveness scores, so
 // img*_is_real / img*_antispoof_score are left at their zero values.
 func (f *FaceDetect) FaceVerify(req *pb.FaceVerifyRequest) (pb.FaceVerifyResponse, error) {
 	if f.ctxPtr == 0 {
 		return pb.FaceVerifyResponse{}, errors.New("face-detect: model not loaded")
 	}
 	if req.Img1 == "" || req.Img2 == "" {
 		return pb.FaceVerifyResponse{}, errors.New("face-detect: img1 and img2 are required")
 	}
 	firstPath, releaseFirst, err := materializeImage(req.Img1)
 	if err != nil {
 		return pb.FaceVerifyResponse{}, err
 	}
 	defer releaseFirst()
 	secondPath, releaseSecond, err := materializeImage(req.Img2)
 	if err != nil {
 		return pb.FaceVerifyResponse{}, err
 	}
 	defer releaseSecond()
 	cutoff := req.Threshold
 	if cutoff <= 0 {
 		cutoff = f.opts.verifyThreshold
 	}
 	var spoofFlag int32
 	if req.AntiSpoofing {
 		spoofFlag = 1
 	}
 	begin := time.Now()
 	var (
 		dist    float32
 		matched int32
 	)
 	if rc := CppVerifyPaths(f.ctxPtr, firstPath, secondPath, cutoff, spoofFlag,
 		unsafe.Pointer(&dist), unsafe.Pointer(&matched)); rc != 0 {
 		// Identify the request by a short prefix of img1 rather than echoing a
 		// whole base64 payload into the error.
 		subject := req.Img1[:min(8, len(req.Img1))] + "..."
 		return pb.FaceVerifyResponse{}, f.lastErr("verify", subject)
 	}
 	// The reported time covers the verify call only, not the detections below.
 	elapsedMs := float32(time.Since(begin).Seconds() * 1000.0)
 	// Confidence falls linearly from 100 at distance 0 down to 0 at the
 	// threshold, matching the Python face backend's reporting.
 	var confidence float32
 	if cutoff > 0 {
 		pct := (1.0 - float64(dist)/float64(cutoff)) * 100.0
 		confidence = float32(math.Max(0, math.Min(100, pct)))
 	}
 	area1 := f.bestArea(firstPath)
 	area2 := f.bestArea(secondPath)
 	return pb.FaceVerifyResponse{
 		Verified:         matched != 0,
 		Distance:         dist,
 		Threshold:        cutoff,
 		Confidence:       confidence,
 		Model:            f.opts.modelName,
 		Img1Area:         area1,
 		Img2Area:         area2,
 		ProcessingTimeMs: elapsedMs,
 	}, nil
 }
 // FaceAnalyze applies the genderage head to every face detected in req.Img.
 // The C-API answers with "M"/"F" gender labels and a rounded age; the labels
 // are rewritten to the "Man"/"Woman" values the proto documents.
 func (f *FaceDetect) FaceAnalyze(req *pb.FaceAnalyzeRequest) (pb.FaceAnalyzeResponse, error) {
 	if f.ctxPtr == 0 {
 		return pb.FaceAnalyzeResponse{}, errors.New("face-detect: model not loaded")
 	}
 	if req.Img == "" {
 		return pb.FaceAnalyzeResponse{}, errors.New("face-detect: img is required")
 	}
 	imgPath, release, err := materializeImage(req.Img)
 	if err != nil {
 		return pb.FaceAnalyzeResponse{}, err
 	}
 	defer release()
 	rawDoc := CppAnalyzeJSON(f.ctxPtr, imgPath)
 	if rawDoc == 0 {
 		return pb.FaceAnalyzeResponse{}, f.lastErr("analyze", imgPath)
 	}
 	// The JSON buffer is C-owned; it is copied into a Go string before parsing
 	// and released when this function returns.
 	defer CppFreeString(rawDoc)
 	document := goStringFromCPtr(rawDoc)
 	analyses, err := parseAnalyzeJSON(document)
 	if err != nil {
 		return pb.FaceAnalyzeResponse{}, fmt.Errorf("face-detect: analyze JSON: %w", err)
 	}
 	return pb.FaceAnalyzeResponse{Faces: analyses}, nil
 }
 // faceBox models a single face entry in the detect/analyze JSON documents
 // produced by the engine.
 type faceBox struct {
 	Score  float32   `json:"score"`
 	Box    []float32 `json:"box"`
 	Age    float32   `json:"age"`
 	Gender string    `json:"gender"`
 }
 // xywh turns the engine's corner box [x1,y1,x2,y2] into the origin plus
 // width/height form the proto carries. All four results are zero when the box
 // is missing or has fewer than four values.
 func (b faceBox) xywh() (x, y, w, h float32) {
 	if len(b.Box) >= 4 {
 		left, top, right, bottom := b.Box[0], b.Box[1], b.Box[2], b.Box[3]
 		x, y = left, top
 		w, h = right-left, bottom-top
 	}
 	return
 }
 // facesJSON is the top-level envelope of the engine's detect and analyze JSON
 // documents: a single "faces" array of faceBox entries.
 type facesJSON struct {
 	Faces []faceBox `json:"faces"`
 }
 // detectFaces asks the C-API for the detect JSON of the image at path and
 // decodes it into faceBox entries. A null result from the C-API is reported
 // with its last error; malformed JSON is reported as a decode error.
 func (f *FaceDetect) detectFaces(path string) ([]faceBox, error) {
 	raw := CppDetectJSON(f.ctxPtr, path)
 	if raw == 0 {
 		return nil, f.lastErr("detect", path)
 	}
 	// The buffer is C-owned: copy it into Go memory, then release it on return.
 	defer CppFreeString(raw)
 	payload := []byte(goStringFromCPtr(raw))
 	parsed := facesJSON{}
 	if err := json.Unmarshal(payload, &parsed); err != nil {
 		return nil, fmt.Errorf("face-detect: detect JSON: %w", err)
 	}
 	return parsed.Faces, nil
 }
 // bestArea reports the FacialArea of the top-scoring face in the image at path.
 // It is deliberately best-effort: by the time it runs, verify has already
 // succeeded, so a failed or empty detection yields an empty area instead of
 // turning a valid match into an error.
 func (f *FaceDetect) bestArea(path string) *pb.FacialArea {
 	candidates, err := f.detectFaces(path)
 	if err != nil || len(candidates) == 0 {
 		return &pb.FacialArea{}
 	}
 	// Track the index of the best score; ties keep the earliest face.
 	top := 0
 	for i := 1; i < len(candidates); i++ {
 		if candidates[i].Score > candidates[top].Score {
 			top = i
 		}
 	}
 	left, upper, width, height := candidates[top].xywh()
 	return &pb.FacialArea{X: left, Y: upper, W: width, H: height}
 }
 // parseAnalyzeJSON decodes the engine's analyze document into FaceAnalysis
 // entries, one per face. The engine encodes gender as "M"/"F"; the dominant
 // label and the score map are both populated with the "Man"/"Woman" form the
 // proto documents. A face with an empty gender gets neither field set.
 func parseAnalyzeJSON(doc string) ([]*pb.FaceAnalysis, error) {
 	var envelope facesJSON
 	if err := json.Unmarshal([]byte(doc), &envelope); err != nil {
 		return nil, err
 	}
 	results := make([]*pb.FaceAnalysis, 0, len(envelope.Faces))
 	for i := range envelope.Faces {
 		face := envelope.Faces[i]
 		left, top, width, height := face.xywh()
 		entry := &pb.FaceAnalysis{
 			Region:         &pb.FacialArea{X: left, Y: top, W: width, H: height},
 			FaceConfidence: face.Score,
 			Age:            face.Age,
 		}
 		label := normalizeGender(face.Gender)
 		if label != "" {
 			entry.DominantGender = label
 			// The engine gives a single label, so it is reported with score 1.0.
 			entry.Gender = map[string]float32{label: 1.0}
 		}
 		results = append(results, entry)
 	}
 	return results, nil
 }
 // normalizeGender translates the engine's "M"/"F" code into the "Man"/"Woman"
 // labels the proto documents. Matching ignores case and surrounding
 // whitespace; an empty or whitespace-only code yields "". Any other code is
 // returned exactly as it was passed in.
 func normalizeGender(g string) string {
 	known := map[string]string{
 		"M": "Man",
 		"F": "Woman",
 		"":  "",
 	}
 	key := strings.ToUpper(strings.TrimSpace(g))
 	if label, ok := known[key]; ok {
 		return label
 	}
 	return g
 }
 // materializeImage decodes a base64 image payload into a temp file and returns
 // its path plus a cleanup func. As a convenience for callers that already pass a
 // filesystem path (e.g. a test fixture), an existing path is used as-is with a
 // no-op cleanup. data: URI prefixes are stripped before decoding.
 func materializeImage(src string) (path string, cleanup func(), err error) {
 	noop := func() {}
 	if src == "" {
 		return "", noop, errors.New("face-detect: empty image input")
 	}
 	if _, statErr := os.Stat(src); statErr == nil {
 		return src, noop, nil
 	}
 	payload := src
 	if i := strings.Index(payload, ","); strings.HasPrefix(payload, "data:") && i >= 0 {
 		payload = payload[i+1:]
 	}
 	data, decErr := base64.StdEncoding.DecodeString(strings.TrimSpace(payload))
 	if decErr != nil || len(data) == 0 {
 		return "", noop, errors.New("face-detect: image is neither an existing path nor valid base64")
 	}
 	tmp, createErr := os.CreateTemp("", "face-detect-*.img")
 	if createErr != nil {
 		return "", noop, fmt.Errorf("face-detect: create temp image: %w", createErr)
 	}
 	cleanup = func() { _ = os.Remove(tmp.Name()) }
 	if _, wErr := tmp.Write(data); wErr != nil {
 		_ = tmp.Close()
 		cleanup()
 		return "", noop, fmt.Errorf("face-detect: write temp image: %w", wErr)
 	}
 	if cErr := tmp.Close(); cErr != nil {
 		cleanup()
 		return "", noop, fmt.Errorf("face-detect: close temp image: %w", cErr)
 	}
 	return tmp.Name(), cleanup, nil
 }
 // lastErr turns the C-API's per-ctx last-error buffer into a Go error naming
 // the failed operation and its subject. A blank buffer is reported as
 // "no error detail".
 func (f *FaceDetect) lastErr(op, subject string) error {
 	detail := strings.TrimSpace(CppLastError(f.ctxPtr))
 	if len(detail) == 0 {
 		detail = "no error detail"
 	}
 	return fmt.Errorf("face-detect: %s failed for %q: %s", op, subject, detail)
 }
 // goStringFromCPtr copies a NUL-terminated C string into Go memory. cptr is a
 // malloc'd buffer the caller owns; release it via CppFreeString after the copy.
 //
 // The uintptr->Pointer conversion trips vet's unsafeptr check, which can't tell
 // a C heap pointer from Go-managed memory. Safe here: the GC neither tracks nor
 // moves the buffer and we dereference it immediately to copy the bytes out.
 func goStringFromCPtr(cptr uintptr) string {
 	if cptr == 0 {
 		return ""
 	}
 	p := unsafe.Pointer(cptr) //nolint:govet // C-owned malloc'd buffer, not Go-GC memory (see doc above)
 	n := 0
 	for *(*byte)(unsafe.Add(p, n)) != 0 {
 		n++
 	}
 	return string(unsafe.Slice((*byte)(p), n))
 }
--- a/backend/go/face-detect/gofacedetect_test.go
+++ b/backend/go/face-detect/gofacedetect_test.go
@@ -0,0 +1,230 @@
 package main
 import (
 	"encoding/base64"
 	"os"
 	"sync"
 	"testing"
 	"github.com/ebitengine/purego"
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 )
 // TestFaceDetect is the `go test` entry point for this package: it installs
 // Ginkgo's Fail as the Gomega failure handler and runs the spec suite.
 func TestFaceDetect(t *testing.T) {
 	RegisterFailHandler(Fail)
 	RunSpecs(t, "face-detect Backend Suite")
 }
 var (
 	libLoadOnce sync.Once
 	libLoadErr  error
 )
 // ensureLibLoaded performs the same dlopen + symbol registration as main.go so
 // the specs can call the C-API bridge directly, without a gRPC server. The work
 // runs at most once; the dlopen error (nil on success) is cached in libLoadErr
 // and returned on every call so the smoke specs can skip themselves. The
 // library name is taken from FACEDETECT_LIBRARY and defaults to
 // libfacedetect.so (found via LD_LIBRARY_PATH or a symlink in ./).
 func ensureLibLoaded() error {
 	libLoadOnce.Do(func() {
 		name := "libfacedetect.so"
 		if override := os.Getenv("FACEDETECT_LIBRARY"); override != "" {
 			name = override
 		}
 		handle, openErr := purego.Dlopen(name, purego.RTLD_NOW|purego.RTLD_GLOBAL)
 		if openErr != nil {
 			libLoadErr = openErr
 			return
 		}
 		// Go function variable -> exported C symbol, in registration order.
 		bindings := []struct {
 			target any
 			symbol string
 		}{
 			{&CppAbiVersion, "facedetect_capi_abi_version"},
 			{&CppLoad, "facedetect_capi_load"},
 			{&CppFree, "facedetect_capi_free"},
 			{&CppLastError, "facedetect_capi_last_error"},
 			{&CppFreeString, "facedetect_capi_free_string"},
 			{&CppFreeVec, "facedetect_capi_free_vec"},
 			{&CppEmbedPath, "facedetect_capi_embed_path"},
 			{&CppEmbedRGB, "facedetect_capi_embed_rgb"},
 			{&CppDetectJSON, "facedetect_capi_detect_path_json"},
 			{&CppVerifyPaths, "facedetect_capi_verify_paths"},
 			{&CppAnalyzeJSON, "facedetect_capi_analyze_path_json"},
 		}
 		for _, b := range bindings {
 			purego.RegisterLibFunc(b.target, handle, b.symbol)
 		}
 	})
 	return libLoadErr
 }
 // parseOptions specs: the 0.35 default, parsing of the recognized keys
 // (including the "threshold" alias, with unknown keys ignored), and rejection
 // of zero/negative thresholds.
 var _ = Describe("parseOptions", func() {
 	It("defaults verify_threshold to 0.35", func() {
 		o := parseOptions(nil)
 		Expect(o.verifyThreshold).To(Equal(float32(0.35)))
 		Expect(o.modelName).To(Equal(""))
 	})
 	It("parses verify_threshold, threshold alias and model_name", func() {
 		o := parseOptions([]string{"verify_threshold:0.4", "model_name:buffalo_l", "unknown:x"})
 		Expect(o.verifyThreshold).To(Equal(float32(0.4)))
 		Expect(o.modelName).To(Equal("buffalo_l"))
 		o2 := parseOptions([]string{"threshold:0.3"})
 		Expect(o2.verifyThreshold).To(Equal(float32(0.3)))
 	})
 	It("ignores non-positive thresholds and keeps the default", func() {
 		o := parseOptions([]string{"verify_threshold:0", "threshold:-1"})
 		Expect(o.verifyThreshold).To(Equal(float32(0.35)))
 	})
 })
 // normalizeGender specs: "M"/"F" codes map to "Man"/"Woman" regardless of case
 // or surrounding whitespace; empty and unrecognized inputs are returned as-is.
 var _ = Describe("normalizeGender", func() {
 	It("maps M/F codes to Man/Woman", func() {
 		Expect(normalizeGender("M")).To(Equal("Man"))
 		Expect(normalizeGender("f")).To(Equal("Woman"))
 		Expect(normalizeGender(" m ")).To(Equal("Man"))
 	})
 	It("passes empty and unknown codes through", func() {
 		Expect(normalizeGender("")).To(Equal(""))
 		Expect(normalizeGender("nonbinary")).To(Equal("nonbinary"))
 	})
 })
 // faceBox.xywh specs: a 4-element [x1,y1,x2,y2] box yields x, y, x2-x1, y2-y1;
 // a box with fewer elements yields all zeros.
 var _ = Describe("faceBox.xywh", func() {
 	It("converts an [x1,y1,x2,y2] box to x/y/width/height", func() {
 		b := faceBox{Box: []float32{10, 20, 50, 80}}
 		x, y, w, h := b.xywh()
 		Expect(x).To(Equal(float32(10)))
 		Expect(y).To(Equal(float32(20)))
 		Expect(w).To(Equal(float32(40)))
 		Expect(h).To(Equal(float32(60)))
 	})
 	It("returns zeros for a short box", func() {
 		x, y, w, h := faceBox{Box: []float32{1, 2}}.xywh()
 		Expect([]float32{x, y, w, h}).To(Equal([]float32{0, 0, 0, 0}))
 	})
 })
 // parseAnalyzeJSON specs: per-face mapping of score, box, age and gender from
 // the engine's {"faces":[...]} document, tolerance of a missing gender field,
 // the empty-list case, and the error path for malformed JSON.
 var _ = Describe("parseAnalyzeJSON", func() {
 	It("maps region, age and gender for each face", func() {
 		doc := `{"faces":[
 			{"score":0.997,"box":[10,20,50,80],"age":31,"gender":"M"},
 			{"score":0.81,"box":[0,0,40,40],"age":24,"gender":"F"}]}`
 		faces, err := parseAnalyzeJSON(doc)
 		Expect(err).ToNot(HaveOccurred())
 		Expect(faces).To(HaveLen(2))
 		Expect(faces[0].FaceConfidence).To(BeNumerically("~", 0.997, 1e-4))
 		Expect(faces[0].Age).To(BeNumerically("~", 31, 1e-4))
 		Expect(faces[0].DominantGender).To(Equal("Man"))
 		Expect(faces[0].Gender).To(HaveKeyWithValue("Man", float32(1.0)))
 		Expect(faces[0].Region.W).To(Equal(float32(40)))
 		Expect(faces[0].Region.H).To(Equal(float32(60)))
 		Expect(faces[1].DominantGender).To(Equal("Woman"))
 	})
 	It("tolerates a missing gender field", func() {
 		faces, err := parseAnalyzeJSON(`{"faces":[{"score":0.5,"box":[0,0,10,10],"age":40}]}`)
 		Expect(err).ToNot(HaveOccurred())
 		Expect(faces).To(HaveLen(1))
 		Expect(faces[0].DominantGender).To(Equal(""))
 		Expect(faces[0].Gender).To(BeEmpty())
 	})
 	It("returns no faces for an empty document", func() {
 		faces, err := parseAnalyzeJSON(`{"faces":[]}`)
 		Expect(err).ToNot(HaveOccurred())
 		Expect(faces).To(BeEmpty())
 	})
 	It("returns an error on malformed JSON", func() {
 		_, err := parseAnalyzeJSON(`{not-json`)
 		Expect(err).To(HaveOccurred())
 	})
 })
 // materializeImage specs: a bare base64 payload and a data: URI are both
 // decoded into a temp file whose contents match the original bytes, an existing
 // filesystem path is returned unchanged, and anything else is an error. Every
 // successful call defers the returned cleanup func.
 var _ = Describe("materializeImage", func() {
 	It("decodes a base64 payload to a temp file", func() {
 		payload := base64.StdEncoding.EncodeToString([]byte("\xff\xd8\xff\xe0fake-jpeg"))
 		path, cleanup, err := materializeImage(payload)
 		Expect(err).ToNot(HaveOccurred())
 		defer cleanup()
 		data, rerr := os.ReadFile(path)
 		Expect(rerr).ToNot(HaveOccurred())
 		Expect(data).To(Equal([]byte("\xff\xd8\xff\xe0fake-jpeg")))
 	})
 	It("strips a data: URI prefix before decoding", func() {
 		payload := "data:image/png;base64," + base64.StdEncoding.EncodeToString([]byte("hello"))
 		path, cleanup, err := materializeImage(payload)
 		Expect(err).ToNot(HaveOccurred())
 		defer cleanup()
 		data, rerr := os.ReadFile(path)
 		Expect(rerr).ToNot(HaveOccurred())
 		Expect(data).To(Equal([]byte("hello")))
 	})
 	It("uses an existing path as-is", func() {
 		tmp, err := os.CreateTemp("", "face-detect-fixture-*.bin")
 		Expect(err).ToNot(HaveOccurred())
 		defer func() { _ = os.Remove(tmp.Name()) }()
 		Expect(tmp.Close()).To(Succeed())
 		path, cleanup, err := materializeImage(tmp.Name())
 		Expect(err).ToNot(HaveOccurred())
 		defer cleanup()
 		Expect(path).To(Equal(tmp.Name()))
 	})
 	It("errors on input that is neither a path nor base64", func() {
 		_, _, err := materializeImage("not base64!!!")
 		Expect(err).To(HaveOccurred())
 	})
 })
 // The specs below exercise the real C-API end to end. They run only when both a
 // model GGUF and a test image are provided, and skip cleanly otherwise so the
 // suite stays green without large assets.
 //
 // The container is Ordered and loads the model once in BeforeAll; every spec
 // then reuses the same *FaceDetect. BeforeAll also skips when the shared
 // library cannot be dlopen'ed.
 var _ = Describe("FaceDetect end-to-end", Ordered, func() {
 	var (
 		f         *FaceDetect
 		modelPath = os.Getenv("FACEDETECT_BACKEND_TEST_MODEL")
 		imagePath = os.Getenv("FACEDETECT_BACKEND_TEST_IMAGE")
 	)
 	BeforeAll(func() {
 		if modelPath == "" || imagePath == "" {
 			Skip("set FACEDETECT_BACKEND_TEST_MODEL and FACEDETECT_BACKEND_TEST_IMAGE to run the e2e specs")
 		}
 		if err := ensureLibLoaded(); err != nil {
 			Skip("libfacedetect.so not loadable: " + err.Error())
 		}
 		f = &FaceDetect{}
 		Expect(f.Load(&pb.ModelOptions{ModelFile: modelPath})).To(Succeed())
 	})
 	It("embeds the primary face in an image", func() {
 		emb, err := f.Embeddings(&pb.PredictOptions{Images: []string{imagePath}})
 		Expect(err).ToNot(HaveOccurred())
 		Expect(emb).ToNot(BeEmpty())
 	})
 	It("detects at least one face", func() {
 		resp, err := f.Detect(&pb.DetectOptions{Src: imagePath})
 		Expect(err).ToNot(HaveOccurred())
 		Expect(resp.Detections).ToNot(BeEmpty())
 		Expect(resp.Detections[0].ClassName).To(Equal("face"))
 	})
 	// Same image on both sides: the distance must not exceed the threshold
 	// reported in the response.
 	It("verifies an image against itself as the same identity", func() {
 		resp, err := f.FaceVerify(&pb.FaceVerifyRequest{Img1: imagePath, Img2: imagePath})
 		Expect(err).ToNot(HaveOccurred())
 		Expect(resp.Verified).To(BeTrue())
 		Expect(resp.Distance).To(BeNumerically("<=", resp.Threshold))
 	})
 	It("analyzes age/gender for each face", func() {
 		resp, err := f.FaceAnalyze(&pb.FaceAnalyzeRequest{Img: imagePath})
 		Expect(err).ToNot(HaveOccurred())
 		Expect(resp.Faces).ToNot(BeEmpty())
 	})
 })
--- a/backend/go/face-detect/main.go
+++ b/backend/go/face-detect/main.go
@@ -0,0 +1,65 @@
 package main
 // Started internally by LocalAI - one gRPC server per loaded model.
 //
 // Loads libfacedetect.so via purego and registers the flat C-API entry points
 // declared in facedetect_capi.h. The library name can be overridden with
 // FACEDETECT_LIBRARY (mirrors the VOICEDETECT_LIBRARY / PARAKEET_LIBRARY
 // convention in the sibling backends); the default looks for the .so next to
 // this binary (resolved via LD_LIBRARY_PATH by run.sh).
 import (
 	"flag"
 	"fmt"
 	"os"
 	"github.com/ebitengine/purego"
 	grpc "github.com/mudler/LocalAI/pkg/grpc"
 )
 var (
 	addr = flag.String("addr", "localhost:50051", "the address to connect to")
 )
 // LibFuncs pairs a pointer to one of the package-level Cpp* function variables
 // (FuncPtr) with the exported C symbol name it is bound to (Name). main passes
 // each pair to purego.RegisterLibFunc.
 type LibFuncs struct {
 	FuncPtr any
 	Name    string
 }
 // main dlopens the face-detect shared library (FACEDETECT_LIBRARY, default
 // libfacedetect.so), binds every C-API symbol to its Go function variable,
 // prints the library's ABI version to stderr, parses the flags and serves the
 // FaceDetect backend over gRPC on -addr. A dlopen or server error panics.
 func main() {
 	library := "libfacedetect.so"
 	if fromEnv := os.Getenv("FACEDETECT_LIBRARY"); fromEnv != "" {
 		library = fromEnv
 	}
 	handle, openErr := purego.Dlopen(library, purego.RTLD_NOW|purego.RTLD_GLOBAL)
 	if openErr != nil {
 		panic(fmt.Errorf("face-detect: dlopen %q: %w", library, openErr))
 	}
 	// One entry per function in facedetect_capi.h. Functions returning char* or
 	// float* are declared with a uintptr result so the raw pointer survives and
 	// can be handed back to the matching capi free function.
 	bindings := []LibFuncs{
 		{FuncPtr: &CppAbiVersion, Name: "facedetect_capi_abi_version"},
 		{FuncPtr: &CppLoad, Name: "facedetect_capi_load"},
 		{FuncPtr: &CppFree, Name: "facedetect_capi_free"},
 		{FuncPtr: &CppLastError, Name: "facedetect_capi_last_error"},
 		{FuncPtr: &CppFreeString, Name: "facedetect_capi_free_string"},
 		{FuncPtr: &CppFreeVec, Name: "facedetect_capi_free_vec"},
 		{FuncPtr: &CppEmbedPath, Name: "facedetect_capi_embed_path"},
 		{FuncPtr: &CppEmbedRGB, Name: "facedetect_capi_embed_rgb"},
 		{FuncPtr: &CppDetectJSON, Name: "facedetect_capi_detect_path_json"},
 		{FuncPtr: &CppVerifyPaths, Name: "facedetect_capi_verify_paths"},
 		{FuncPtr: &CppAnalyzeJSON, Name: "facedetect_capi_analyze_path_json"},
 	}
 	for i := range bindings {
 		purego.RegisterLibFunc(bindings[i].FuncPtr, handle, bindings[i].Name)
 	}
 	fmt.Fprintf(os.Stderr, "[face-detect] ABI=%d\n", CppAbiVersion())
 	flag.Parse()
 	if serveErr := grpc.StartServer(*addr, &FaceDetect{}); serveErr != nil {
 		panic(serveErr)
 	}
 }
--- a/backend/go/face-detect/options.go
+++ b/backend/go/face-detect/options.go
@@ -0,0 +1,47 @@
 package main
 import (
 	"math"
 	"strconv"
 	"strings"
 )
 // defaultVerifyThreshold is the cosine-distance cutoff used when a request does
 // not set one. Matches the insightface buffalo_l ArcFace R50 default the Python
 // face backend ships with so the two implementations agree on verdicts out of
 // the box.
 const defaultVerifyThreshold float32 = 0.35
 // loadOptions holds the parsed model-level options for face-detect.
 type loadOptions struct {
 	// verifyThreshold is the cutoff set by verify_threshold / threshold;
 	// defaultVerifyThreshold when neither supplies a usable value.
 	verifyThreshold float32
 	// modelName is the value of the model_name option; "" when not given.
 	modelName string
 }
 // splitOption splits a "key:value" option at the first ':' and trims
 // surrounding whitespace from both halves. ok is false (and key/value empty)
 // when o contains no ':'.
 func splitOption(o string) (key, value string, ok bool) {
 	i := strings.Index(o, ":")
 	if i < 0 {
 		return "", "", false
 	}
 	return strings.TrimSpace(o[:i]), strings.TrimSpace(o[i+1:]), true
 }
 // parseOptions reads the backend "key:value" option slice and returns the
 // resulting loadOptions. Entries without a ':' and unknown keys are ignored.
 //
 // Recognized keys:
 //   - verify_threshold / threshold: accepted only when the value parses as a
 //     finite number greater than zero; anything else keeps the previous value
 //     (initially defaultVerifyThreshold).
 //   - model_name: stored verbatim; left empty when absent.
 func parseOptions(opts []string) loadOptions {
 	o := loadOptions{verifyThreshold: defaultVerifyThreshold}
 	for _, oo := range opts {
 		key, value, ok := splitOption(oo)
 		if !ok {
 			continue
 		}
 		switch key {
 		case "verify_threshold", "threshold":
 			f, err := strconv.ParseFloat(value, 32)
 			// ParseFloat accepts "inf"/"infinity" with a nil error. An infinite
 			// cutoff would make every comparison verify, so treat it like any
 			// other unusable value. NaN already fails the f > 0 test.
 			if err == nil && f > 0 && !math.IsInf(f, 1) {
 				o.verifyThreshold = float32(f)
 			}
 		case "model_name":
 			o.modelName = value
 		}
 	}
 	return o
 }
--- a/backend/go/face-detect/package.sh
+++ b/backend/go/face-detect/package.sh
@@ -0,0 +1,68 @@
 #!/bin/bash
 #
 # Assemble a self-contained package/ directory for the face-detect backend:
 #   package/face-detect-grpc, package/run.sh
 #   package/lib/libfacedetect.so*  - the purego-loaded engine
 #   package/lib/ld.so + core libs  - libc, libgcc_s, libstdc++, libm, libgomp,
 #                                    libdl, librt, libpthread
 #   GPU runtime for BUILD_TYPE     - when the shared packaging script exists
 # run.sh starts the (CGO_ENABLED=0) binary through lib/ld.so so the packaged
 # libc is used instead of the host's. Mirrors
 # backend/go/voice-detect/package.sh.
 set -e
 CURDIR=$(dirname "$(realpath "$0")")
 REPO_ROOT="${CURDIR}/../../.."
 FACEDETECT_PKG_DIR="$CURDIR/package"
 FACEDETECT_PKG_LIB_DIR="$FACEDETECT_PKG_DIR/lib"
 # Core runtime libraries libfacedetect.so links against, in copy order.
 FACEDETECT_CORE_LIBS=(libc.so.6 libgcc_s.so.1 libstdc++.so.6 libm.so.6 libgomp.so.1 libdl.so.2 librt.so.1 libpthread.so.0)
 # facedetect_copy_runtime LOADER SYSLIBDIR
 # Copy LOADER to lib/ld.so, then each core library from SYSLIBDIR into lib/,
 # dereferencing symlinks (-L). Any failed copy aborts the script via set -e.
 facedetect_copy_runtime() {
    local loader="$1" syslibdir="$2" name
    cp -arfLv "$loader" "$FACEDETECT_PKG_LIB_DIR/ld.so"
    for name in "${FACEDETECT_CORE_LIBS[@]}"; do
       cp -arfLv "$syslibdir/$name" "$FACEDETECT_PKG_LIB_DIR/$name"
    done
 }
 mkdir -p "$FACEDETECT_PKG_LIB_DIR"
 cp -avf "$CURDIR/face-detect-grpc" "$FACEDETECT_PKG_DIR/"
 cp -avf "$CURDIR/run.sh" "$FACEDETECT_PKG_DIR/"
 # libfacedetect.so + any soname symlinks. purego.Dlopen resolves it via
 # LD_LIBRARY_PATH, which run.sh points at lib/.
 cp -avf "$CURDIR"/libfacedetect.so* "$FACEDETECT_PKG_LIB_DIR/" 2>/dev/null || {
 	echo "ERROR: libfacedetect.so not found in $CURDIR, run 'make' first" >&2
 	exit 1
 }
 # Pick the architecture by which dynamic loader exists on the build host.
 if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
    echo "Detected x86_64 architecture, copying x86_64 libraries..."
    facedetect_copy_runtime /lib64/ld-linux-x86-64.so.2 /lib/x86_64-linux-gnu
 elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
    echo "Detected ARM64 architecture, copying ARM64 libraries..."
    facedetect_copy_runtime /lib/ld-linux-aarch64.so.1 /lib/aarch64-linux-gnu
 elif [ "$(uname -s)" = "Darwin" ]; then
    # Nothing is bundled on Darwin.
    echo "Detected Darwin"
 else
    echo "Error: Could not detect architecture"
    exit 1
 fi
 # Package GPU libraries (CUDA/ROCm/Intel/Vulkan loader + ICDs + drivers) based on
 # BUILD_TYPE so the backend can reach the GPU without the runtime base image
 # shipping those drivers. Skipped silently when the shared script is absent.
 GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
 if [ -f "$GPU_LIB_SCRIPT" ]; then
    echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
    source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib"
    package_gpu_libs
 fi
 echo "Packaging completed successfully"
 ls -liah "$FACEDETECT_PKG_DIR/" "$FACEDETECT_PKG_LIB_DIR/"
--- a/backend/go/face-detect/run.sh
+++ b/backend/go/face-detect/run.sh
@@ -0,0 +1,16 @@
 #!/bin/bash
 # Launcher for the packaged face-detect gRPC backend: points the dynamic
 # loader at the bundled libraries and execs the binary, forwarding all
 # arguments.
 set -e
 CURDIR=$(dirname "$(realpath "$0")")
 # Search the bundled lib/ first, then the script directory, then whatever the
 # caller already had. The inherited value is appended only when it is
 # non-empty: a trailing ':' leaves an empty entry, which the dynamic loader
 # treats as the current working directory.
 export LD_LIBRARY_PATH="$CURDIR/lib:$CURDIR${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
 # If a self-contained ld.so was packaged, route through it so the packaged
 # libc / libstdc++ are used instead of the host's (matches the voice-detect /
 # whisper / parakeet backends' runtime layout).
 if [ -f "$CURDIR/lib/ld.so" ]; then
 	echo "Using lib/ld.so"
 	exec "$CURDIR/lib/ld.so" "$CURDIR/face-detect-grpc" "$@"
 fi
 # No bundled loader: run the binary directly with the host loader.
 exec "$CURDIR/face-detect-grpc" "$@"
--- a/backend/go/face-detect/test.sh
+++ b/backend/go/face-detect/test.sh
@@ -0,0 +1,15 @@
 #!/bin/bash
 # Runs the face-detect Go test suite from the backend directory, with that
 # directory on the library search path so a locally built libfacedetect.so can
 # be dlopen'ed by the specs.
 set -e
 SCRIPT_DIR=$(dirname "$(realpath "$0")")
 cd "$SCRIPT_DIR"
 echo "Running face-detect backend tests..."
 # The pure-Go parsing specs always run. The embed/detect/verify/analyze smoke
 # specs need FACEDETECT_BACKEND_TEST_MODEL and FACEDETECT_BACKEND_TEST_IMAGE
 # and auto-skip when either is missing.
 env LD_LIBRARY_PATH="$SCRIPT_DIR:${LD_LIBRARY_PATH:-}" go test -v -timeout 1200s .
 echo "face-detect tests completed."
--- a/backend/go/omnivoice-cpp/Makefile
+++ b/backend/go/omnivoice-cpp/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
 # omnivoice.cpp version
 OMNIVOICE_REPO?=https://github.com/ServeurpersoCom/omnivoice.cpp
-OMNIVOICE_VERSION?=0f37401bebe9b20c0160a888e592108fc1d17607
+OMNIVOICE_VERSION?=96d30169afd5e6bb3fd6a0e9be0eb505bfe81fcd
 SO_TARGET?=libgomnivoicecpp.so
 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
--- a/backend/go/parakeet-cpp/Makefile
+++ b/backend/go/parakeet-cpp/Makefile
@@ -1,6 +1,6 @@
 # parakeet-cpp backend Makefile.
 #
-# Upstream pin lives below as PARAKEET_VERSION?=89f5e2977b4d8bccd45e7bcc6f2ef7c4ed49e89a
+# Upstream pin lives below as PARAKEET_VERSION?=db755a78d39f789bb7d4e3935158a9e8105dbe36
 # (.github/bump_deps.sh) can find and update it - matches the
 # whisper.cpp / ds4 / vibevoice-cpp convention.
 #
@@ -15,7 +15,7 @@
 # That's what the L0 smoke test uses. The default target below does the
 # proper clone-at-pin + cmake build so CI doesn't need a side-checkout.
-PARAKEET_VERSION?=89f5e2977b4d8bccd45e7bcc6f2ef7c4ed49e89a
+PARAKEET_VERSION?=db755a78d39f789bb7d4e3935158a9e8105dbe36
 PARAKEET_REPO?=https://github.com/mudler/parakeet.cpp
 GOCMD?=go
--- a/backend/go/stablediffusion-ggml/Makefile
+++ b/backend/go/stablediffusion-ggml/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
 # stablediffusion.cpp (ggml)
 STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
-STABLEDIFFUSION_GGML_VERSION?=f440ad9c29dd8bc34e5d1f4b863832b96d6ea05f
+STABLEDIFFUSION_GGML_VERSION?=b12098f5d09fc83da36e65c784f7bdb16a5a5ebf
 CMAKE_ARGS+=-DGGML_MAX_NAME=128
--- a/backend/go/supertonic/helper.go
+++ b/backend/go/supertonic/helper.go
@@ -16,7 +16,6 @@ import (
 	"os"
 	"path/filepath"
 	"regexp"
 	"runtime"
 	"strings"
 	"time"
 	"unicode"
@@ -944,15 +943,9 @@ func InitializeONNXRuntime() error {
 			}
 		}
 		if libPath == "" {
 			// LocalAI: default to the platform-native shared library
 			// extension when nothing else is found (dyld vs ld.so).
 			if runtime.GOOS == "darwin" {
 				libPath = "/usr/local/lib/libonnxruntime.dylib"
 			} else {
 			libPath = "/usr/local/lib/libonnxruntime.so"
 		}
 	}
 	}
 	ort.SetSharedLibraryPath(libPath)
 	if err := ort.InitializeEnvironment(); err != nil {
--- a/backend/go/supertonic/package.sh
+++ b/backend/go/supertonic/package.sh
@@ -32,10 +32,6 @@ elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
    cp -arfLv /lib/aarch64-linux-gnu/libdl.so.2 $CURDIR/package/lib/libdl.so.2
    cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 $CURDIR/package/lib/librt.so.1
    cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 $CURDIR/package/lib/libpthread.so.0
 elif [ $(uname -s) = "Darwin" ]; then
    # macOS: dyld resolves the bundled .dylib via DYLD_LIBRARY_PATH (set in
    # run.sh); there is no ld.so loader nor glibc to bundle.
    echo "Detected Darwin"
 else
    echo "Error: Could not detect architecture"
    exit 1
--- a/backend/go/supertonic/run.sh
+++ b/backend/go/supertonic/run.sh
@@ -3,12 +3,6 @@ set -ex
 CURDIR=$(dirname "$(realpath $0)")
 if [ "$(uname)" = "Darwin" ]; then
 	# macOS uses dyld: there is no ld.so loader, and the search path env
 	# var is DYLD_LIBRARY_PATH. ONNX Runtime ships as a .dylib here.
 	export DYLD_LIBRARY_PATH=$CURDIR/lib:$DYLD_LIBRARY_PATH
 	export ONNXRUNTIME_LIB_PATH=$CURDIR/lib/libonnxruntime.dylib
 else
 export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
 export ONNXRUNTIME_LIB_PATH=$CURDIR/lib/libonnxruntime.so
@@ -16,6 +10,5 @@ else
 	echo "Using lib/ld.so"
 	exec $CURDIR/lib/ld.so $CURDIR/supertonic "$@"
 fi
 fi
 exec $CURDIR/supertonic "$@"
--- a/backend/go/voice-detect/.gitignore
+++ b/backend/go/voice-detect/.gitignore
@@ -0,0 +1,18 @@
 # Fetched upstream sources
 sources/
 # CMake build directories
 build*/
 # build artifacts staged in-tree by the Makefile (cp from sources/) or
 # symlinked for local dev; the real sources live in voice-detect.cpp upstream.
 *.so
 *.so.*
 voicedetect_capi.h
 compile_commands.json
 # Compiled backend binary
 voice-detect-grpc
 # Packaging output
 package/
--- a/backend/go/voice-detect/Makefile
+++ b/backend/go/voice-detect/Makefile
@@ -0,0 +1,107 @@
 # voice-detect backend Makefile.
 #
 # Upstream pin lives below as VOICEDETECT_VERSION?=30beecd... (.github/bump_deps.sh
 # can find and update it - matches the parakeet.cpp / whisper.cpp / ds4 convention).
 #
 # Local dev shortcut: if you already have an out-of-tree voice-detect.cpp build,
 # symlink the .so + header into this directory and skip the clone/cmake steps:
 #
 #   ln -sf /path/to/voice-detect.cpp/build-shared/libvoicedetect.so .
 #   ln -sf /path/to/voice-detect.cpp/include/voicedetect_capi.h .
 #   go build -o voice-detect-grpc .
 #
 # The default target below does the proper clone-at-pin + cmake build so CI does
 # not need a side-checkout.
 VOICEDETECT_VERSION?=30beecdbe9662fb27e826ae4ec949d3fa02ff366
 VOICEDETECT_REPO?=https://github.com/mudler/voice-detect.cpp
 GOCMD?=go
 GO_TAGS?=
 JOBS?=$(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)
 BUILD_TYPE?=
 NATIVE?=false
 # Resolve the target arch. The backend matrix / Docker build pass TARGETARCH
 # (amd64|arm64); fall back to uname -m (aarch64|x86_64) for a local build.
 RECON_ARCH?=$(or $(TARGETARCH),$(shell uname -m))
 # Build ggml statically into libvoicedetect.so (PIC) so the shared lib is
 # self-contained: dlopen needs no libggml*.so alongside it, only system libs
 # (libstdc++/libgomp/libc) that the runtime image already provides.
 CMAKE_ARGS?=-DCMAKE_BUILD_TYPE=Release -DVOICEDETECT_SHARED=ON -DVOICEDETECT_BUILD_CLI=OFF -DVOICEDETECT_BUILD_TESTS=OFF -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON
 ifeq ($(NATIVE),false)
 	CMAKE_ARGS+=-DGGML_NATIVE=OFF
 endif
 # voice-detect.cpp gates its GGML backends behind VOICEDETECT_GGML_* options and
 # does set(GGML_CUDA ${VOICEDETECT_GGML_CUDA} CACHE BOOL "" FORCE), so a bare
 # -DGGML_CUDA=ON is overwritten back to OFF. Forward the VOICEDETECT_GGML_*
 # options instead. (openblas is not gated, so -DGGML_BLAS passes through.)
 ifeq ($(BUILD_TYPE),cublas)
 	CMAKE_ARGS+=-DVOICEDETECT_GGML_CUDA=ON
 	# Opt-in cuDNN implicit-GEMM conv path (kills im2col on GPU, reaches
 	# torch-cuDNN parity). Only the arm64 + CUDA 13 image (GB10/Jetson/L4T)
 	# ships libcudnn9 + the -dev headers, so gate cuDNN to that variant.
 	# x86 CUDA images carry no cuDNN -> enabling it there is a link failure.
 	ifeq ($(CUDA_MAJOR_VERSION),13)
 	ifneq (,$(filter arm64 aarch64,$(RECON_ARCH)))
 		CMAKE_ARGS+=-DVOICEDETECT_GGML_CUDNN=ON
 	endif
 	endif
 else ifeq ($(BUILD_TYPE),openblas)
 	CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
 else ifeq ($(BUILD_TYPE),hipblas)
 	CMAKE_ARGS+=-DVOICEDETECT_GGML_HIP=ON
 else ifeq ($(BUILD_TYPE),vulkan)
 	CMAKE_ARGS+=-DVOICEDETECT_GGML_VULKAN=ON
 else ifeq ($(BUILD_TYPE),metal)
 	CMAKE_ARGS+=-DVOICEDETECT_GGML_METAL=ON
 endif
 .PHONY: voice-detect-grpc package build clean purge test all
 all: voice-detect-grpc
 # Clone the upstream voice-detect.cpp source at the pinned commit. Directory acts
 # as the target so make only re-clones when missing. After a VOICEDETECT_VERSION
 # bump, run 'make purge && make' to refetch.
 sources/voice-detect.cpp:
 	mkdir -p sources/voice-detect.cpp
 	cd sources/voice-detect.cpp && \
 	git init -q && \
 	git remote add origin $(VOICEDETECT_REPO) && \
 	git fetch --depth 1 origin $(VOICEDETECT_VERSION) && \
 	git checkout FETCH_HEAD && \
 	git submodule update --init --recursive --depth 1 --single-branch
 # Build the shared lib + header out-of-tree, then stage them next to the Go
 # sources so purego.Dlopen("libvoicedetect.so") and the cgo-less build both pick
 # them up.
 #
 # The .so copy is best-effort: its errors are discarded and `|| true` keeps the
 # recipe going, so a missing library does not fail this target. The header copy
 # is not guarded and does fail the target when the file is absent.
 libvoicedetect.so: sources/voice-detect.cpp
 	cmake -B sources/voice-detect.cpp/build-shared -S sources/voice-detect.cpp $(CMAKE_ARGS)
 	cmake --build sources/voice-detect.cpp/build-shared --config Release -j$(JOBS) --target voicedetect
 	cp -fv sources/voice-detect.cpp/build-shared/libvoicedetect.so* ./ 2>/dev/null || true
 	cp -fv sources/voice-detect.cpp/include/voicedetect_capi.h ./
 voice-detect-grpc: libvoicedetect.so main.go govoicedetect.go options.go
 	CGO_ENABLED=0 $(GOCMD) build -tags "$(GO_TAGS)" -o voice-detect-grpc .
 package: voice-detect-grpc
 	bash package.sh
 build: package
 # Test target. The embed/verify/analyze smoke specs are gated on
 # VOICEDETECT_BACKEND_TEST_MODEL + VOICEDETECT_BACKEND_TEST_WAV; without them the
 # heavy specs auto-skip and only the pure-Go parsing specs run.
 test:
 	LD_LIBRARY_PATH=$(CURDIR):$$LD_LIBRARY_PATH $(GOCMD) test ./... -count=1
 # clean depends on purge, so it removes the fetched upstream checkout as well as
 # the staged library/header, the package/ output and the built binary.
 clean: purge
 	rm -rf libvoicedetect.so* voicedetect_capi.h package voice-detect-grpc
 # purge removes only the upstream checkout, which makes the
 # sources/voice-detect.cpp rule re-clone on the next build.
 purge:
 	rm -rf sources/voice-detect.cpp
--- a/backend/go/voice-detect/govoicedetect.go
+++ b/backend/go/voice-detect/govoicedetect.go
@@ -0,0 +1,273 @@
 package main
 import (
 	"encoding/json"
 	"errors"
 	"fmt"
 	"math"
 	"os"
 	"path/filepath"
 	"strconv"
 	"strings"
 	"time"
 	"unsafe"
 	"github.com/mudler/LocalAI/pkg/grpc/base"
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 	"github.com/mudler/xlog"
 )
 // purego-bound entry points from libvoicedetect.so. Names match
 // voicedetect_capi.h exactly so a `nm libvoicedetect.so | grep voicedetect_capi`
 // is enough to spot drift.
 //
 // The opaque ctx and the malloc'd char*/float* return values are declared as
 // uintptr so we get the raw pointer back and can release it via the matching
 // capi free function. purego's native string/[]float32 returns would copy and
 // forget the original pointer, leaking the C-owned buffer on every call.
 var (
 	CppAbiVersion  func() int32
 	CppLoad        func(ggufPath string) uintptr
 	CppFree        func(ctx uintptr)
 	CppLastError   func(ctx uintptr) string
 	CppFreeString  func(s uintptr)
 	CppFreeVec     func(v uintptr)
 	CppEmbedPath   func(ctx uintptr, wavPath string, outVec, outDim unsafe.Pointer) int32
 	CppEmbedPCM    func(ctx uintptr, pcm []float32, nSamples, sampleRate int32, outVec, outDim unsafe.Pointer) int32
 	CppVerifyPaths func(ctx uintptr, a, b string, threshold float32, outDistance, outVerified unsafe.Pointer) int32
 	CppAnalyzeJSON func(ctx uintptr, wavPath string) uintptr
 )
 // VoiceDetect implements the speaker-recognition voice subset of the Backend
 // gRPC service over libvoicedetect.so. The C side keeps a single loaded model
 // plus a per-ctx last-error buffer and is not reentrant, so base.SingleThread
 // serializes every call.
 type VoiceDetect struct {
 	base.SingleThread
 	// opts holds the options parsed from ModelOptions.Options during Load.
 	opts   loadOptions
 	// ctxPtr is the handle returned by CppLoad; 0 means no model is loaded.
 	ctxPtr uintptr
 }
 // Load resolves the model path, parses the backend options, optionally exports
 // the thread budget and loads the model through voicedetect_capi_load.
 //
 // Path resolution: a non-empty ModelFile is used, joined onto ModelPath when it
 // is relative and ModelPath is set; an empty ModelFile falls back to ModelPath
 // verbatim. When model_name is not given it defaults to the base name of the
 // resolved path.
 //
 // Returns an error when no path is available, when VOICEDETECT_THREADS cannot
 // be set, or when the C-API returns a null context.
 func (v *VoiceDetect) Load(opts *pb.ModelOptions) error {
 	model := opts.ModelFile
 	switch {
 	case model == "":
 		// Use ModelPath as-is. It must not reach the join below: a relative
 		// ModelPath would otherwise be joined onto itself.
 		model = opts.ModelPath
 	case !filepath.IsAbs(model) && opts.ModelPath != "":
 		model = filepath.Join(opts.ModelPath, model)
 	}
 	if model == "" {
 		return errors.New("voice-detect: ModelFile is required")
 	}
 	v.opts = parseOptions(opts.Options)
 	if v.opts.modelName == "" {
 		v.opts.modelName = filepath.Base(model)
 	}
 	// Propagate LocalAI's per-model thread budget to the engine. LocalAI spawns
 	// one backend process per model and serves requests concurrently, so the
 	// engine's own min(hardware_concurrency, 8) default can oversubscribe cores.
 	// VOICEDETECT_THREADS is read by the engine at backend construction, so it
 	// must be set before the capi load. A non-positive Threads means "unset":
 	// leave the env alone so the engine keeps its sane default.
 	threads := opts.Threads
 	if threads > 0 {
 		if err := os.Setenv("VOICEDETECT_THREADS", strconv.Itoa(int(threads))); err != nil {
 			return fmt.Errorf("voice-detect: set VOICEDETECT_THREADS: %w", err)
 		}
 		xlog.Info("voice-detect: applying LocalAI thread budget", "threads", threads)
 	}
 	xlog.Info("voice-detect: loading model", "model", model,
 		"verify_threshold", v.opts.verifyThreshold, "abi", CppAbiVersion())
 	ctx := CppLoad(model)
 	if ctx == 0 {
 		// The last-error buffer lives on the ctx that was never returned, so
 		// surface the path the operator tried to load instead.
 		return fmt.Errorf("voice-detect: voicedetect_capi_load failed for %q", model)
 	}
 	v.ctxPtr = ctx
 	return nil
 }
 // VoiceEmbed computes the speaker embedding for the clip at req.Audio and
 // returns it together with the configured model name. req.Audio is passed to
 // the engine as a filesystem path; the HTTP layer is responsible for turning
 // base64/URL/data-URI inputs into a temp file before the gRPC call. Fails when
 // no model is loaded, when the path is empty, or when the engine reports an
 // error.
 func (v *VoiceDetect) VoiceEmbed(req *pb.VoiceEmbedRequest) (pb.VoiceEmbedResponse, error) {
 	switch {
 	case v.ctxPtr == 0:
 		return pb.VoiceEmbedResponse{}, errors.New("voice-detect: model not loaded")
 	case req.Audio == "":
 		return pb.VoiceEmbedResponse{}, errors.New("voice-detect: audio path is required")
 	}
 	embedding, embedErr := v.embedPath(req.Audio)
 	if embedErr != nil {
 		return pb.VoiceEmbedResponse{}, embedErr
 	}
 	return pb.VoiceEmbedResponse{
 		Embedding: embedding,
 		Model:     v.opts.modelName,
 	}, nil
 }
 // embedPath calls CppEmbedPath for path and returns a Go-owned copy of the
 // resulting vector. The C side reports the vector pointer and its length
 // through the two out-parameters. A non-zero return code, a null vector or a
 // non-positive length is turned into an error via lastErr; in that case the
 // vector is not freed here.
 func (v *VoiceDetect) embedPath(path string) ([]float32, error) {
 	var vec uintptr
 	var dim int32
 	rc := CppEmbedPath(v.ctxPtr, path, unsafe.Pointer(&vec), unsafe.Pointer(&dim))
 	if rc != 0 || vec == 0 || dim <= 0 {
 		return nil, v.lastErr("embed", path)
 	}
 	// Release the C buffer on the way out, after the copy below has run.
 	defer CppFreeVec(vec)
 	// Copy out of the C-owned malloc'd buffer before freeing it. The
 	// uintptr->Pointer conversion trips vet's unsafeptr check, which can't tell
 	// a C heap pointer from Go-managed memory; safe here, the GC neither tracks
 	// nor moves this buffer and we copy immediately.
 	src := unsafe.Slice((*float32)(unsafe.Pointer(vec)), int(dim)) //nolint:govet // C-owned malloc'd vector, copied out before free
 	out := make([]float32, int(dim))
 	copy(out, src)
 	return out, nil
 }
 // VoiceVerify asks the engine whether the clips at req.Audio1 and req.Audio2
 // come from the same speaker, comparing their cosine distance against a
 // threshold. A request threshold <= 0 selects the model-level verify_threshold
 // option (0.25 if unset). The response carries the verdict, the distance, the
 // threshold actually used, a 0-100 confidence and the wall-clock time of the
 // engine call in milliseconds.
 func (v *VoiceDetect) VoiceVerify(req *pb.VoiceVerifyRequest) (pb.VoiceVerifyResponse, error) {
 	switch {
 	case v.ctxPtr == 0:
 		return pb.VoiceVerifyResponse{}, errors.New("voice-detect: model not loaded")
 	case req.Audio1 == "" || req.Audio2 == "":
 		return pb.VoiceVerifyResponse{}, errors.New("voice-detect: audio1 and audio2 are required")
 	}
 	cutoff := req.Threshold
 	if cutoff <= 0 {
 		cutoff = v.opts.verifyThreshold
 	}
 	var (
 		dist  float32
 		match int32
 	)
 	begin := time.Now()
 	status := CppVerifyPaths(v.ctxPtr, req.Audio1, req.Audio2, cutoff,
 		unsafe.Pointer(&dist), unsafe.Pointer(&match))
 	if status != 0 {
 		return pb.VoiceVerifyResponse{}, v.lastErr("verify", req.Audio1+","+req.Audio2)
 	}
 	tookMs := float32(time.Since(begin).Seconds() * 1000.0)
 	// Confidence decays linearly from 100 at distance 0 to 0 at the threshold,
 	// clamped to [0, 100], matching the Python speaker-recognition backend's
 	// reporting.
 	var confidence float32
 	if cutoff > 0 {
 		ratio := float64(dist) / float64(cutoff)
 		percent := (1.0 - ratio) * 100.0
 		confidence = float32(math.Max(0, math.Min(100, percent)))
 	}
 	return pb.VoiceVerifyResponse{
 		Verified:         match != 0,
 		Distance:         dist,
 		Threshold:        cutoff,
 		Confidence:       confidence,
 		Model:            v.opts.modelName,
 		ProcessingTimeMs: tookMs,
 	}, nil
 }
 // VoiceAnalyze runs the age/gender/emotion analysis on the clip at req.Audio.
 // The C-API always evaluates every supported head, so the request's actions
 // filter is advisory; the result comes back as exactly one segment because the
 // engine does not produce time-bounded segments. Fails when no model is
 // loaded, when the path is empty, when the engine returns no document, or when
 // the document is not valid JSON.
 func (v *VoiceDetect) VoiceAnalyze(req *pb.VoiceAnalyzeRequest) (pb.VoiceAnalyzeResponse, error) {
 	switch {
 	case v.ctxPtr == 0:
 		return pb.VoiceAnalyzeResponse{}, errors.New("voice-detect: model not loaded")
 	case req.Audio == "":
 		return pb.VoiceAnalyzeResponse{}, errors.New("voice-detect: audio path is required")
 	}
 	docPtr := CppAnalyzeJSON(v.ctxPtr, req.Audio)
 	if docPtr == 0 {
 		return pb.VoiceAnalyzeResponse{}, v.lastErr("analyze", req.Audio)
 	}
 	// The JSON buffer is C-owned; free it once it has been copied and parsed.
 	defer CppFreeString(docPtr)
 	doc := goStringFromCPtr(docPtr)
 	analysis, parseErr := parseAnalyzeJSON(doc)
 	if parseErr != nil {
 		return pb.VoiceAnalyzeResponse{}, fmt.Errorf("voice-detect: analyze JSON for %q: %w", req.Audio, parseErr)
 	}
 	segments := []*pb.VoiceAnalysis{analysis}
 	return pb.VoiceAnalyzeResponse{Segments: segments}, nil
 }
 // analyzeJSON is the decode target for the document returned by
 // voicedetect_capi_analyze_path_json:
 //
 //	{"age":42.0,
 //	 "gender":{"label":"female","female":0.88,"male":0.12},
 //	 "emotion":{"label":"neutral","scores":{"neutral":0.7, ...}}}
 //
 // The gender object mixes a string ("label") with per-class float scores, so
 // its members are kept as raw JSON and sorted out in parseAnalyzeJSON.
 type analyzeJSON struct {
 	Age     float32                    `json:"age"`
 	Gender  map[string]json.RawMessage `json:"gender"`
 	Emotion struct {
 		Label  string             `json:"label"`
 		Scores map[string]float32 `json:"scores"`
 	} `json:"emotion"`
 }
 // parseAnalyzeJSON decodes the engine's analyze document into a VoiceAnalysis.
 // Age and the emotion label/scores are copied straight across. Within gender,
 // "label" becomes DominantGender and every other member that decodes as a
 // number becomes a score; members that decode as neither are dropped. The
 // segment's start/end are left at 0 because the model reports one
 // whole-utterance result. Only a malformed document produces an error.
 func parseAnalyzeJSON(doc string) (*pb.VoiceAnalysis, error) {
 	var parsed analyzeJSON
 	if err := json.Unmarshal([]byte(doc), &parsed); err != nil {
 		return nil, err
 	}
 	result := &pb.VoiceAnalysis{
 		Age:             parsed.Age,
 		DominantEmotion: parsed.Emotion.Label,
 		Emotion:         parsed.Emotion.Scores,
 	}
 	if len(parsed.Gender) == 0 {
 		return result, nil
 	}
 	scores := make(map[string]float32, len(parsed.Gender))
 	for name, raw := range parsed.Gender {
 		if name != "label" {
 			var value float32
 			if json.Unmarshal(raw, &value) == nil {
 				scores[name] = value
 			}
 			continue
 		}
 		// Best effort: a non-string label leaves DominantGender empty.
 		_ = json.Unmarshal(raw, &result.DominantGender)
 	}
 	result.Gender = scores
 	return result, nil
 }
 // lastErr builds a Go error for a failed C-API call. op names the operation and
 // subject what it was applied to; the detail text is whatever CppLastError
 // reports for this context, whitespace-trimmed, or a fixed placeholder when the
 // report is empty.
 func (v *VoiceDetect) lastErr(op, subject string) error {
 	detail := "no error detail"
 	if reported := strings.TrimSpace(CppLastError(v.ctxPtr)); reported != "" {
 		detail = reported
 	}
 	return fmt.Errorf("voice-detect: %s failed for %q: %s", op, subject, detail)
 }
 // goStringFromCPtr copies a NUL-terminated C string into Go memory. cptr is a
 // malloc'd buffer the caller owns; release it via CppFreeString after the copy.
 //
 // The uintptr->Pointer conversion trips vet's unsafeptr check, which can't tell
 // a C heap pointer from Go-managed memory. Safe here: the GC neither tracks nor
 // moves the buffer and we dereference it immediately to copy the bytes out.
 func goStringFromCPtr(cptr uintptr) string {
 	if cptr == 0 {
 		return ""
 	}
 	p := unsafe.Pointer(cptr) //nolint:govet // C-owned malloc'd buffer, not Go-GC memory (see doc above)
 	n := 0
 	for *(*byte)(unsafe.Add(p, n)) != 0 {
 		n++
 	}
 	return string(unsafe.Slice((*byte)(p), n))
 }
--- a/backend/go/voice-detect/govoicedetect_test.go
+++ b/backend/go/voice-detect/govoicedetect_test.go
@@ -0,0 +1,144 @@
 package main
 import (
 	"os"
 	"sync"
 	"testing"
 	"github.com/ebitengine/purego"
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 )
 // TestVoiceDetect is the `go test` entry point for this package: it installs
 // Ginkgo's Fail as the Gomega failure handler and runs every registered spec
 // under the "voice-detect Backend Suite" name.
 func TestVoiceDetect(t *testing.T) {
 	RegisterFailHandler(Fail)
 	RunSpecs(t, "voice-detect Backend Suite")
 }
 var (
 	libLoadOnce sync.Once
 	libLoadErr  error
 )
 // ensureLibLoaded opens the voice-detect shared library and binds the C-API
 // entry points to the package-level Cpp* function variables, so a Go test can
 // drive the bridge without starting the gRPC server.
 //
 // The library name comes from VOICEDETECT_LIBRARY, falling back to
 // "libvoicedetect.so" (resolved by the dynamic loader, e.g. through
 // LD_LIBRARY_PATH). The work runs at most once per process; a dlopen failure is
 // recorded and returned on this and every later call so callers can skip the
 // specs that need the library.
 func ensureLibLoaded() error {
 	libLoadOnce.Do(func() {
 		path := "libvoicedetect.so"
 		if override := os.Getenv("VOICEDETECT_LIBRARY"); override != "" {
 			path = override
 		}
 		handle, err := purego.Dlopen(path, purego.RTLD_NOW|purego.RTLD_GLOBAL)
 		if err != nil {
 			libLoadErr = err
 			return
 		}
 		// Go function variable -> exported C symbol, bound in declaration order.
 		bindings := []struct {
 			target any
 			symbol string
 		}{
 			{&CppAbiVersion, "voicedetect_capi_abi_version"},
 			{&CppLoad, "voicedetect_capi_load"},
 			{&CppFree, "voicedetect_capi_free"},
 			{&CppLastError, "voicedetect_capi_last_error"},
 			{&CppFreeString, "voicedetect_capi_free_string"},
 			{&CppFreeVec, "voicedetect_capi_free_vec"},
 			{&CppEmbedPath, "voicedetect_capi_embed_path"},
 			{&CppEmbedPCM, "voicedetect_capi_embed_pcm"},
 			{&CppVerifyPaths, "voicedetect_capi_verify_paths"},
 			{&CppAnalyzeJSON, "voicedetect_capi_analyze_path_json"},
 		}
 		for _, b := range bindings {
 			purego.RegisterLibFunc(b.target, handle, b.symbol)
 		}
 	})
 	return libLoadErr
 }
 // Pure-Go specs for parseOptions: they need neither the shared library nor a
 // model, so they always run.
 var _ = Describe("parseOptions", func() {
 	// A nil option slice yields the built-in defaults.
 	It("defaults verify_threshold to 0.25", func() {
 		o := parseOptions(nil)
 		Expect(o.verifyThreshold).To(Equal(float32(0.25)))
 		Expect(o.modelName).To(Equal(""))
 	})
 	// Both spellings of the threshold key are accepted; unknown keys are ignored.
 	It("parses verify_threshold, threshold alias and model_name", func() {
 		o := parseOptions([]string{"verify_threshold:0.4", "model_name:ecapa", "unknown:x"})
 		Expect(o.verifyThreshold).To(Equal(float32(0.4)))
 		Expect(o.modelName).To(Equal("ecapa"))
 		o2 := parseOptions([]string{"threshold:0.3"})
 		Expect(o2.verifyThreshold).To(Equal(float32(0.3)))
 	})
 	// Zero and negative values must not override the default cutoff.
 	It("ignores non-positive thresholds and keeps the default", func() {
 		o := parseOptions([]string{"verify_threshold:0", "threshold:-1"})
 		Expect(o.verifyThreshold).To(Equal(float32(0.25)))
 	})
 })
 // Pure-Go specs for parseAnalyzeJSON: they feed hand-written analyze documents
 // through the parser and check the resulting VoiceAnalysis fields.
 var _ = Describe("parseAnalyzeJSON", func() {
 	// Full document: every field of the result is populated from the JSON.
 	It("maps age, gender label+scores and emotion label+scores", func() {
 		doc := `{"age":42.0,
 			"gender":{"label":"female","female":0.88,"male":0.12},
 			"emotion":{"label":"neutral","scores":{"neutral":0.7,"happy":0.2,"sad":0.1}}}`
 		seg, err := parseAnalyzeJSON(doc)
 		Expect(err).ToNot(HaveOccurred())
 		Expect(seg.Age).To(BeNumerically("~", 42.0, 1e-4))
 		// The parser never sets segment bounds, so both stay at zero.
 		Expect(seg.Start).To(Equal(float32(0)))
 		Expect(seg.End).To(Equal(float32(0)))
 		Expect(seg.DominantGender).To(Equal("female"))
 		Expect(seg.Gender).To(HaveKeyWithValue("female", BeNumerically("~", 0.88, 1e-4)))
 		Expect(seg.Gender).To(HaveKeyWithValue("male", BeNumerically("~", 0.12, 1e-4)))
 		// The "label" entry is consumed into DominantGender, not the score map.
 		Expect(seg.Gender).ToNot(HaveKey("label"))
 		Expect(seg.DominantEmotion).To(Equal("neutral"))
 		Expect(seg.Emotion).To(HaveKeyWithValue("neutral", BeNumerically("~", 0.7, 1e-4)))
 		Expect(seg.Emotion).To(HaveKeyWithValue("happy", BeNumerically("~", 0.2, 1e-4)))
 	})
 	// A document without "gender" is valid; the gender fields stay empty.
 	It("tolerates a missing gender block", func() {
 		seg, err := parseAnalyzeJSON(`{"age":30.0,"emotion":{"label":"happy","scores":{"happy":1.0}}}`)
 		Expect(err).ToNot(HaveOccurred())
 		Expect(seg.DominantGender).To(Equal(""))
 		Expect(seg.DominantEmotion).To(Equal("happy"))
 	})
 	// Syntactically invalid input surfaces the decoder error to the caller.
 	It("returns an error on malformed JSON", func() {
 		_, err := parseAnalyzeJSON(`{not-json`)
 		Expect(err).To(HaveOccurred())
 	})
 })
 // The specs below exercise the real C-API end to end. They run only when both a
 // model GGUF and a test WAV are provided, and skip cleanly otherwise so the
 // suite stays green without large assets.
 //
 // The container is Ordered so the model is loaded once in BeforeAll and the
 // same VoiceDetect instance is shared by every spec.
 var _ = Describe("VoiceDetect end-to-end", Ordered, func() {
 	var (
 		v         *VoiceDetect
 		modelPath = os.Getenv("VOICEDETECT_BACKEND_TEST_MODEL")
 		wavPath   = os.Getenv("VOICEDETECT_BACKEND_TEST_WAV")
 	)
 	BeforeAll(func() {
 		// Skip (not fail) when the assets are not configured ...
 		if modelPath == "" || wavPath == "" {
 			Skip("set VOICEDETECT_BACKEND_TEST_MODEL and VOICEDETECT_BACKEND_TEST_WAV to run the e2e specs")
 		}
 		// ... or when the shared library cannot be opened.
 		if err := ensureLibLoaded(); err != nil {
 			Skip("libvoicedetect.so not loadable: " + err.Error())
 		}
 		v = &VoiceDetect{}
 		Expect(v.Load(&pb.ModelOptions{ModelFile: modelPath})).To(Succeed())
 	})
 	// Embedding the test clip must return a non-empty vector and a model name.
 	It("embeds an audio clip", func() {
 		resp, err := v.VoiceEmbed(&pb.VoiceEmbedRequest{Audio: wavPath})
 		Expect(err).ToNot(HaveOccurred())
 		Expect(resp.Embedding).ToNot(BeEmpty())
 		Expect(resp.Model).ToNot(BeEmpty())
 	})
 	// Comparing a clip with itself must verify, with distance within threshold.
 	It("verifies a clip against itself as the same speaker", func() {
 		resp, err := v.VoiceVerify(&pb.VoiceVerifyRequest{Audio1: wavPath, Audio2: wavPath})
 		Expect(err).ToNot(HaveOccurred())
 		Expect(resp.Verified).To(BeTrue())
 		Expect(resp.Distance).To(BeNumerically("<=", resp.Threshold))
 	})
 })
--- a/backend/go/voice-detect/main.go
+++ b/backend/go/voice-detect/main.go
@@ -0,0 +1,64 @@
 package main
 // Started internally by LocalAI - one gRPC server per loaded model.
 //
 // Loads libvoicedetect.so via purego and registers the flat C-API entry points
 // declared in voicedetect_capi.h. The library name can be overridden with
 // VOICEDETECT_LIBRARY (mirrors the PARAKEET_LIBRARY / OMNIVOICE_LIBRARY
 // convention in the sibling backends); the default looks for the .so next to
 // this binary (resolved via LD_LIBRARY_PATH by run.sh).
 import (
 	"flag"
 	"fmt"
 	"os"
 	"github.com/ebitengine/purego"
 	grpc "github.com/mudler/LocalAI/pkg/grpc"
 )
 var (
 	addr = flag.String("addr", "localhost:50051", "the address to connect to")
 )
 // LibFuncs pairs a Go function variable with the C symbol bound to it.
 // FuncPtr is the pointer to the function variable that is handed to
 // purego.RegisterLibFunc; Name is the exported symbol to look up in the
 // shared library.
 type LibFuncs struct {
 	FuncPtr any
 	Name    string
 }
 // main opens the voice-detect shared library, binds the C-API symbols to the
 // package-level Cpp* function variables, logs the library's ABI version to
 // stderr and then serves the VoiceDetect backend over gRPC on -addr.
 //
 // The library name is taken from VOICEDETECT_LIBRARY and defaults to
 // "libvoicedetect.so". A dlopen failure or a server error is fatal (panic).
 func main() {
 	libPath := "libvoicedetect.so"
 	if fromEnv := os.Getenv("VOICEDETECT_LIBRARY"); fromEnv != "" {
 		libPath = fromEnv
 	}
 	handle, err := purego.Dlopen(libPath, purego.RTLD_NOW|purego.RTLD_GLOBAL)
 	if err != nil {
 		panic(fmt.Errorf("voice-detect: dlopen %q: %w", libPath, err))
 	}
 	// Bound 1:1 to voicedetect_capi.h. char*/float* returns are registered as
 	// uintptr so the raw pointer can be freed via the matching capi free fn.
 	for _, binding := range []LibFuncs{
 		{&CppAbiVersion, "voicedetect_capi_abi_version"},
 		{&CppLoad, "voicedetect_capi_load"},
 		{&CppFree, "voicedetect_capi_free"},
 		{&CppLastError, "voicedetect_capi_last_error"},
 		{&CppFreeString, "voicedetect_capi_free_string"},
 		{&CppFreeVec, "voicedetect_capi_free_vec"},
 		{&CppEmbedPath, "voicedetect_capi_embed_path"},
 		{&CppEmbedPCM, "voicedetect_capi_embed_pcm"},
 		{&CppVerifyPaths, "voicedetect_capi_verify_paths"},
 		{&CppAnalyzeJSON, "voicedetect_capi_analyze_path_json"},
 	} {
 		purego.RegisterLibFunc(binding.FuncPtr, handle, binding.Name)
 	}
 	fmt.Fprintf(os.Stderr, "[voice-detect] ABI=%d\n", CppAbiVersion())
 	// Flags are parsed only after the library has been bound, as before.
 	flag.Parse()
 	if serveErr := grpc.StartServer(*addr, &VoiceDetect{}); serveErr != nil {
 		panic(serveErr)
 	}
 }
--- a/backend/go/voice-detect/options.go
+++ b/backend/go/voice-detect/options.go
@@ -0,0 +1,46 @@
 package main
 import (
 	"math"
 	"strconv"
 	"strings"
 )
 // defaultVerifyThreshold is the cosine-distance cutoff used when a request does
 // not set one. Matches the Python speaker-recognition backend's default so the
 // two implementations agree on verdicts out of the box.
 const defaultVerifyThreshold float32 = 0.25
 // loadOptions holds the parsed model-level options for voice-detect.
 type loadOptions struct {
 	verifyThreshold float32
 	modelName       string
 }
 // splitOption splits a "key:value" option at the first ':' and trims
 // surrounding whitespace from both halves. Any further ':' characters stay in
 // the value. ok is false (with empty key and value) when o has no ':'.
 func splitOption(o string) (key, value string, ok bool) {
 	rawKey, rawValue, found := strings.Cut(o, ":")
 	if !found {
 		return "", "", false
 	}
 	return strings.TrimSpace(rawKey), strings.TrimSpace(rawValue), true
 }
 // parseOptions reads the backend "key:value" option slice and returns the
 // parsed model-level options.
 //
 // Recognised keys:
 //   - verify_threshold / threshold: verification cutoff. Only finite values
 //     greater than zero are accepted; anything else (unparsable text, zero,
 //     negatives, NaN, +Inf) is ignored and the previous value is kept.
 //   - model_name: stored verbatim (already trimmed by splitOption).
 //
 // Entries without a ':' and unknown keys are ignored. When a key appears more
 // than once, the last accepted value wins. Defaults: verify_threshold 0.25,
 // model_name empty (the caller derives it from the file).
 func parseOptions(opts []string) loadOptions {
 	o := loadOptions{verifyThreshold: defaultVerifyThreshold}
 	for _, oo := range opts {
 		key, value, ok := splitOption(oo)
 		if !ok {
 			continue
 		}
 		switch key {
 		case "verify_threshold", "threshold":
 			f, err := strconv.ParseFloat(value, 32)
 			// ParseFloat accepts "Inf"/"Infinity" without error; an infinite
 			// cutoff would defeat the threshold check, so reject it explicitly.
 			// NaN is already excluded because NaN > 0 is false.
 			if err == nil && f > 0 && !math.IsInf(f, 1) {
 				o.verifyThreshold = float32(f)
 			}
 		case "model_name":
 			o.modelName = value
 		}
 	}
 	return o
 }
--- a/backend/go/voice-detect/package.sh
+++ b/backend/go/voice-detect/package.sh
@@ -0,0 +1,68 @@
 #!/bin/bash
 #
 # Build a self-contained voice-detect package under ./package:
 #   package/voice-detect-grpc  the backend binary
 #   package/run.sh             the launcher
 #   package/lib/               libvoicedetect.so, the core runtime libs
 #                              (libc/libstdc++/libgomp/...), the dynamic loader
 #                              as ld.so, and the GPU runtime for BUILD_TYPE
 # run.sh starts the binary through lib/ld.so when it is present, so the
 # packaged libc is used instead of the host's. Layout mirrors
 # backend/go/parakeet-cpp/package.sh.
 set -e
 CURDIR=$(dirname "$(realpath "$0")")
 REPO_ROOT="${CURDIR}/../../.."
 # Copy the dynamic loader ($1) as lib/ld.so plus the core runtime libraries
 # found in directory $2, dereferencing symlinks so real files are shipped.
 voicedetect_copy_core_libs() {
 	local loader="$1" srcdir="$2" soname
 	cp -arfLv "$loader" "$CURDIR/package/lib/ld.so"
 	for soname in libc.so.6 libgcc_s.so.1 libstdc++.so.6 libm.so.6 \
 		libgomp.so.1 libdl.so.2 librt.so.1 libpthread.so.0; do
 		cp -arfLv "$srcdir/$soname" "$CURDIR/package/lib/$soname"
 	done
 }
 mkdir -p "$CURDIR/package/lib"
 cp -avf "$CURDIR/voice-detect-grpc" "$CURDIR/package/"
 cp -avf "$CURDIR/run.sh" "$CURDIR/package/"
 # libvoicedetect.so + any soname symlinks. purego.Dlopen resolves it via
 # LD_LIBRARY_PATH, which run.sh points at lib/.
 if ! cp -avf "$CURDIR"/libvoicedetect.so* "$CURDIR/package/lib/" 2>/dev/null; then
 	echo "ERROR: libvoicedetect.so not found in $CURDIR, run 'make' first" >&2
 	exit 1
 fi
 # Pick the runtime libs by probing for the platform's dynamic loader.
 if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
 	echo "Detected x86_64 architecture, copying x86_64 libraries..."
 	voicedetect_copy_core_libs /lib64/ld-linux-x86-64.so.2 /lib/x86_64-linux-gnu
 elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
 	echo "Detected ARM64 architecture, copying ARM64 libraries..."
 	voicedetect_copy_core_libs /lib/ld-linux-aarch64.so.1 /lib/aarch64-linux-gnu
 elif [ "$(uname -s)" = "Darwin" ]; then
 	# Nothing is bundled on macOS; the system runtime is used.
 	echo "Detected Darwin"
 else
 	echo "Error: Could not detect architecture"
 	exit 1
 fi
 # Package GPU libraries (CUDA/ROCm/Intel/Vulkan loader + ICDs + drivers) based on
 # BUILD_TYPE so the backend can reach the GPU without the runtime base image
 # shipping those drivers. Skipped when the helper script is absent.
 GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
 if [ -f "$GPU_LIB_SCRIPT" ]; then
 	echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
 	source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib"
 	package_gpu_libs
 fi
 echo "Packaging completed successfully"
 ls -liah "$CURDIR/package/" "$CURDIR/package/lib/"
--- a/backend/go/voice-detect/run.sh
+++ b/backend/go/voice-detect/run.sh
@@ -0,0 +1,16 @@
 #!/bin/bash
 # Launcher for the packaged voice-detect backend: puts lib/ (and this
 # directory) at the front of LD_LIBRARY_PATH and execs voice-detect-grpc with
 # the arguments it was given.
 set -e
 CURDIR=$(dirname "$(realpath "$0")")
 export LD_LIBRARY_PATH="$CURDIR/lib:$CURDIR:${LD_LIBRARY_PATH:-}"
 cmd=("$CURDIR/voice-detect-grpc" "$@")
 # If a self-contained ld.so was packaged, route through it so the packaged
 # libc / libstdc++ are used instead of the host's (matches the whisper /
 # parakeet backends' runtime layout).
 bundled_loader="$CURDIR/lib/ld.so"
 if [ -f "$bundled_loader" ]; then
 	echo "Using lib/ld.so"
 	cmd=("$bundled_loader" "${cmd[@]}")
 fi
 exec "${cmd[@]}"
--- a/backend/go/voice-detect/test.sh
+++ b/backend/go/voice-detect/test.sh
@@ -0,0 +1,14 @@
 #!/bin/bash
 # Runs the voice-detect Go test suite from the backend directory, with that
 # directory prepended to LD_LIBRARY_PATH so a locally built libvoicedetect.so
 # can be found.
 set -e
 SCRIPT_DIR=$(dirname "$(realpath "$0")")
 cd "$SCRIPT_DIR"
 printf '%s\n' "Running voice-detect backend tests..."
 # The pure-Go parsing specs always run. The embed/verify smoke specs run only
 # when a model + WAV are provided via VOICEDETECT_BACKEND_TEST_MODEL and
 # VOICEDETECT_BACKEND_TEST_WAV; otherwise they auto-skip.
 env LD_LIBRARY_PATH="$SCRIPT_DIR:${LD_LIBRARY_PATH:-}" go test -v -timeout 1200s .
 printf '%s\n' "voice-detect tests completed."
--- a/backend/go/whisper/Makefile
+++ b/backend/go/whisper/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp
-WHISPER_CPP_VERSION?=43d78af5be58f41d6ffbc227d608f104577741ea
+WHISPER_CPP_VERSION?=5ed76e9a079962f1c85cfce44edd325c27ef1f97
 SO_TARGET?=libgowhisper.so
 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
--- a/backend/index.yaml
+++ b/backend/index.yaml
@@ -209,6 +209,78 @@
    nvidia-cuda-12: "cuda12-ced"
    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-ced"
    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-ced"
 - &voicedetect
  name: "voice-detect"
  alias: "voice-detect"
  license: mit
  icon: https://avatars.githubusercontent.com/u/95302084
  description: |
    voice-detect speaker recognition and voice analysis.
    voice-detect.cpp is a C++/ggml engine that produces L2-normalised
    speaker embeddings (ECAPA-TDNN, WeSpeaker ResNet34, 3D-Speaker
    ERes2Net, CAM++) for voice verification and 1:N identification, plus
    a wav2vec2 age / gender / emotion analysis head. It replaces the
    Python speaker-recognition backend and is exposed through the Voice*
    gRPC rpcs and the /v1/voice/* REST endpoints. It runs on CPU, NVIDIA
    CUDA, AMD ROCm/HIP, Intel SYCL, Vulkan and NVIDIA Jetson (L4T) targets.
  urls:
    - https://github.com/mudler/voice-detect.cpp
  tags:
    - voice-recognition
    - speaker-verification
    - speaker-embedding
    - CPU
    - GPU
    - CUDA
    - HIP
  capabilities:
    default: "cpu-voice-detect"
    nvidia: "cuda12-voice-detect"
    intel: "intel-sycl-f16-voice-detect"
    metal: "metal-voice-detect"
    amd: "rocm-voice-detect"
    vulkan: "vulkan-voice-detect"
    nvidia-l4t: "nvidia-l4t-arm64-voice-detect"
    nvidia-cuda-13: "cuda13-voice-detect"
    nvidia-cuda-12: "cuda12-voice-detect"
    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-voice-detect"
    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-voice-detect"
 - &facedetect
  name: "face-detect"
  alias: "face-detect"
  license: mit
  icon: https://avatars.githubusercontent.com/u/95302084
  description: |
    face-detect face detection, embedding, verification and analysis.
    face-detect.cpp is a C++/ggml engine that runs SCRFD / YuNet face
    detection and ArcFace / SFace 512-d (or 128-d) L2-normalised face
    embeddings for verification and 1:N identification, plus a landmark /
    age / gender analysis head. It replaces the Python insightface backend
    and is exposed through the Embedding, Detect and Face* gRPC rpcs and
    the /v1/face/* REST endpoints. It runs on CPU, NVIDIA CUDA, AMD
    ROCm/HIP, Intel SYCL, Vulkan and NVIDIA Jetson (L4T) targets.
  urls:
    - https://github.com/mudler/face-detect.cpp
  tags:
    - face-recognition
    - face-verification
    - face-embedding
    - CPU
    - GPU
    - CUDA
    - HIP
  capabilities:
    default: "cpu-face-detect"
    nvidia: "cuda12-face-detect"
    intel: "intel-sycl-f16-face-detect"
    metal: "metal-face-detect"
    amd: "rocm-face-detect"
    vulkan: "vulkan-face-detect"
    nvidia-l4t: "nvidia-l4t-arm64-face-detect"
    nvidia-cuda-13: "cuda13-face-detect"
    nvidia-cuda-12: "cuda12-face-detect"
    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-face-detect"
    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-face-detect"
 - &voxtral
  name: "voxtral"
  alias: "voxtral"
@@ -1284,7 +1356,6 @@
    nvidia-cuda-13: "cuda13-liquid-audio"
    nvidia-cuda-12: "cuda12-liquid-audio"
    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-liquid-audio"
    metal: "metal-liquid-audio"
  icon: https://cdn-avatars.huggingface.co/v1/production/uploads/61b8e2ba285851687028d395/7_6D7rWrLxp2hb6OHSV1p.png
 - &qwen-tts
  urls:
@@ -1570,7 +1641,6 @@
    - TTS
  capabilities:
    default: "cpu-supertonic"
    metal: "metal-supertonic"
 - !!merge <<: *neutts
  name: "neutts-development"
  capabilities:
@@ -2798,6 +2868,236 @@
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-ced"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-13-ced
 ## voice-detect
 - !!merge <<: *voicedetect
  name: "voice-detect-development"
  capabilities:
    default: "cpu-voice-detect-development"
    nvidia: "cuda12-voice-detect-development"
    intel: "intel-sycl-f16-voice-detect-development"
    metal: "metal-voice-detect-development"
    amd: "rocm-voice-detect-development"
    vulkan: "vulkan-voice-detect-development"
    nvidia-l4t: "nvidia-l4t-arm64-voice-detect-development"
    nvidia-cuda-13: "cuda13-voice-detect-development"
    nvidia-cuda-12: "cuda12-voice-detect-development"
    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-voice-detect-development"
    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-voice-detect-development"
 - !!merge <<: *voicedetect
  name: "nvidia-l4t-arm64-voice-detect"
  uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-arm64-voice-detect"
  mirrors:
    - localai/localai-backends:latest-nvidia-l4t-arm64-voice-detect
 - !!merge <<: *voicedetect
  name: "nvidia-l4t-arm64-voice-detect-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-arm64-voice-detect"
  mirrors:
    - localai/localai-backends:master-nvidia-l4t-arm64-voice-detect
 - !!merge <<: *voicedetect
  name: "cuda13-nvidia-l4t-arm64-voice-detect"
  uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-voice-detect"
  mirrors:
    - localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-voice-detect
 - !!merge <<: *voicedetect
  name: "cuda13-nvidia-l4t-arm64-voice-detect-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-voice-detect"
  mirrors:
    - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-voice-detect
 - !!merge <<: *voicedetect
  name: "cpu-voice-detect"
  uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-voice-detect"
  mirrors:
    - localai/localai-backends:latest-cpu-voice-detect
 - !!merge <<: *voicedetect
  name: "cpu-voice-detect-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-cpu-voice-detect"
  mirrors:
    - localai/localai-backends:master-cpu-voice-detect
 - !!merge <<: *voicedetect
  name: "metal-voice-detect"
  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-voice-detect"
  mirrors:
    - localai/localai-backends:latest-metal-darwin-arm64-voice-detect
 - !!merge <<: *voicedetect
  name: "metal-voice-detect-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-voice-detect"
  mirrors:
    - localai/localai-backends:master-metal-darwin-arm64-voice-detect
 - !!merge <<: *voicedetect
  name: "cuda12-voice-detect"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-voice-detect"
  mirrors:
    - localai/localai-backends:latest-gpu-nvidia-cuda-12-voice-detect
 - !!merge <<: *voicedetect
  name: "cuda12-voice-detect-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-voice-detect"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-12-voice-detect
 - !!merge <<: *voicedetect
  name: "rocm-voice-detect"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-voice-detect"
  mirrors:
    - localai/localai-backends:latest-gpu-rocm-hipblas-voice-detect
 - !!merge <<: *voicedetect
  name: "rocm-voice-detect-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-voice-detect"
  mirrors:
    - localai/localai-backends:master-gpu-rocm-hipblas-voice-detect
 - !!merge <<: *voicedetect
  name: "intel-sycl-f32-voice-detect"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-voice-detect"
  mirrors:
    - localai/localai-backends:latest-gpu-intel-sycl-f32-voice-detect
 - !!merge <<: *voicedetect
  name: "intel-sycl-f32-voice-detect-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f32-voice-detect"
  mirrors:
    - localai/localai-backends:master-gpu-intel-sycl-f32-voice-detect
 - !!merge <<: *voicedetect
  name: "intel-sycl-f16-voice-detect"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-voice-detect"
  mirrors:
    - localai/localai-backends:latest-gpu-intel-sycl-f16-voice-detect
 - !!merge <<: *voicedetect
  name: "intel-sycl-f16-voice-detect-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f16-voice-detect"
  mirrors:
    - localai/localai-backends:master-gpu-intel-sycl-f16-voice-detect
 - !!merge <<: *voicedetect
  name: "vulkan-voice-detect"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-vulkan-voice-detect"
  mirrors:
    - localai/localai-backends:latest-gpu-vulkan-voice-detect
 - !!merge <<: *voicedetect
  name: "vulkan-voice-detect-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-vulkan-voice-detect"
  mirrors:
    - localai/localai-backends:master-gpu-vulkan-voice-detect
 - !!merge <<: *voicedetect
  name: "cuda13-voice-detect"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-voice-detect"
  mirrors:
    - localai/localai-backends:latest-gpu-nvidia-cuda-13-voice-detect
 - !!merge <<: *voicedetect
  name: "cuda13-voice-detect-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-voice-detect"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-13-voice-detect
 ## face-detect
 - !!merge <<: *facedetect
  name: "face-detect-development"
  capabilities:
    default: "cpu-face-detect-development"
    nvidia: "cuda12-face-detect-development"
    intel: "intel-sycl-f16-face-detect-development"
    metal: "metal-face-detect-development"
    amd: "rocm-face-detect-development"
    vulkan: "vulkan-face-detect-development"
    nvidia-l4t: "nvidia-l4t-arm64-face-detect-development"
    nvidia-cuda-13: "cuda13-face-detect-development"
    nvidia-cuda-12: "cuda12-face-detect-development"
    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-face-detect-development"
    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-face-detect-development"
 - !!merge <<: *facedetect
  name: "nvidia-l4t-arm64-face-detect"
  uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-arm64-face-detect"
  mirrors:
    - localai/localai-backends:latest-nvidia-l4t-arm64-face-detect
 - !!merge <<: *facedetect
  name: "nvidia-l4t-arm64-face-detect-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-arm64-face-detect"
  mirrors:
    - localai/localai-backends:master-nvidia-l4t-arm64-face-detect
 - !!merge <<: *facedetect
  name: "cuda13-nvidia-l4t-arm64-face-detect"
  uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-face-detect"
  mirrors:
    - localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-face-detect
 - !!merge <<: *facedetect
  name: "cuda13-nvidia-l4t-arm64-face-detect-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-face-detect"
  mirrors:
    - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-face-detect
 - !!merge <<: *facedetect
  name: "cpu-face-detect"
  uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-face-detect"
  mirrors:
    - localai/localai-backends:latest-cpu-face-detect
 - !!merge <<: *facedetect
  name: "cpu-face-detect-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-cpu-face-detect"
  mirrors:
    - localai/localai-backends:master-cpu-face-detect
 - !!merge <<: *facedetect
  name: "metal-face-detect"
  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-face-detect"
  mirrors:
    - localai/localai-backends:latest-metal-darwin-arm64-face-detect
 - !!merge <<: *facedetect
  name: "metal-face-detect-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-face-detect"
  mirrors:
    - localai/localai-backends:master-metal-darwin-arm64-face-detect
 - !!merge <<: *facedetect
  name: "cuda12-face-detect"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-face-detect"
  mirrors:
    - localai/localai-backends:latest-gpu-nvidia-cuda-12-face-detect
 - !!merge <<: *facedetect
  name: "cuda12-face-detect-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-face-detect"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-12-face-detect
 - !!merge <<: *facedetect
  name: "rocm-face-detect"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-face-detect"
  mirrors:
    - localai/localai-backends:latest-gpu-rocm-hipblas-face-detect
 - !!merge <<: *facedetect
  name: "rocm-face-detect-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-face-detect"
  mirrors:
    - localai/localai-backends:master-gpu-rocm-hipblas-face-detect
 - !!merge <<: *facedetect
  name: "intel-sycl-f32-face-detect"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-face-detect"
  mirrors:
    - localai/localai-backends:latest-gpu-intel-sycl-f32-face-detect
 - !!merge <<: *facedetect
  name: "intel-sycl-f32-face-detect-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f32-face-detect"
  mirrors:
    - localai/localai-backends:master-gpu-intel-sycl-f32-face-detect
 - !!merge <<: *facedetect
  name: "intel-sycl-f16-face-detect"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-face-detect"
  mirrors:
    - localai/localai-backends:latest-gpu-intel-sycl-f16-face-detect
 - !!merge <<: *facedetect
  name: "intel-sycl-f16-face-detect-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f16-face-detect"
  mirrors:
    - localai/localai-backends:master-gpu-intel-sycl-f16-face-detect
 - !!merge <<: *facedetect
  name: "vulkan-face-detect"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-vulkan-face-detect"
  mirrors:
    - localai/localai-backends:latest-gpu-vulkan-face-detect
 - !!merge <<: *facedetect
  name: "vulkan-face-detect-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-vulkan-face-detect"
  mirrors:
    - localai/localai-backends:master-gpu-vulkan-face-detect
 - !!merge <<: *facedetect
  name: "cuda13-face-detect"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-face-detect"
  mirrors:
    - localai/localai-backends:latest-gpu-nvidia-cuda-13-face-detect
 - !!merge <<: *facedetect
  name: "cuda13-face-detect-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-face-detect"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-13-face-detect
 ## stablediffusion-ggml
 - !!merge <<: *stablediffusionggml
  name: "cpu-stablediffusion-ggml"
@@ -4614,7 +4914,6 @@
    nvidia-cuda-13: "cuda13-liquid-audio-development"
    nvidia-cuda-12: "cuda12-liquid-audio-development"
    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-liquid-audio-development"
    metal: "metal-liquid-audio-development"
 - !!merge <<: *liquid-audio
  name: "cpu-liquid-audio"
  uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-liquid-audio"
@@ -4625,16 +4924,6 @@
  uri: "quay.io/go-skynet/local-ai-backends:master-cpu-liquid-audio"
  mirrors:
    - localai/localai-backends:master-cpu-liquid-audio
 - !!merge <<: *liquid-audio
  name: "metal-liquid-audio"
  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-liquid-audio"
  mirrors:
    - localai/localai-backends:latest-metal-darwin-arm64-liquid-audio
 - !!merge <<: *liquid-audio
  name: "metal-liquid-audio-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-liquid-audio"
  mirrors:
    - localai/localai-backends:master-metal-darwin-arm64-liquid-audio
 - !!merge <<: *liquid-audio
  name: "cuda12-liquid-audio"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-liquid-audio"
@@ -5497,7 +5786,6 @@
  name: "supertonic-development"
  capabilities:
    default: "cpu-supertonic-development"
    metal: "metal-supertonic-development"
 - !!merge <<: *supertonic
  name: "cpu-supertonic"
  uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-supertonic"
@@ -5508,13 +5796,3 @@
  uri: "quay.io/go-skynet/local-ai-backends:master-cpu-supertonic"
  mirrors:
    - localai/localai-backends:master-cpu-supertonic
 - !!merge <<: *supertonic
  name: "metal-supertonic"
  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-supertonic"
  mirrors:
    - localai/localai-backends:latest-metal-darwin-arm64-supertonic
 - !!merge <<: *supertonic
  name: "metal-supertonic-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-supertonic"
  mirrors:
    - localai/localai-backends:master-metal-darwin-arm64-supertonic
--- a/backend/python/diffusers/requirements-cpu.txt
+++ b/backend/python/diffusers/requirements-cpu.txt
@@ -1,7 +1,7 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
-diffusers==0.38.0
+git+https://github.com/huggingface/diffusers
 opencv-python
-transformers==4.57.6
+transformers
 torchvision==0.22.1
 accelerate
 git+https://github.com/xhinker/sd_embed
@@ -10,15 +10,9 @@ sentencepiece
 torch==2.7.1
 optimum-quanto
 ftfy
-# diffusers and transformers are pinned together on purpose. transformers v5
+# TODO: re-add compel once it supports transformers >= 5.
-# restructured CLIPTextModel and dropped the `.text_model` attribute, which
+# Tracking: https://github.com/damian0815/compel/pull/129
-# breaks single-file Stable Diffusion loading on every released diffusers
+#           https://github.com/damian0815/compel/issues/128
-# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
+# compel currently pins transformers~=4.25, which forced pip into multi-hour
-# main via git froze whichever broken pair existed at image-build time. Pin the
+# resolver backtracking storms in CI. backend.py imports it lazily and gates
-# last known-good released pair so builds are reproducible and can't drift into
+# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
 # the broken window. See https://github.com/mudler/LocalAI/issues/9979
 #
 # compel is intentionally omitted: it pins transformers~=4.25, which conflicts
 # with this pin and previously forced pip into multi-hour resolver backtracking
 # storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
 # the import succeeding, so dropping it here is safe.
--- a/backend/python/diffusers/requirements-cublas12.txt
+++ b/backend/python/diffusers/requirements-cublas12.txt
@@ -1,7 +1,7 @@
 --extra-index-url https://download.pytorch.org/whl/cu121
-diffusers==0.38.0
+git+https://github.com/huggingface/diffusers
 opencv-python
-transformers==4.57.6
+transformers
 torchvision
 accelerate
 git+https://github.com/xhinker/sd_embed
@@ -10,15 +10,9 @@ sentencepiece
 torch
 ftfy
 optimum-quanto
-# diffusers and transformers are pinned together on purpose. transformers v5
+# TODO: re-add compel once it supports transformers >= 5.
-# restructured CLIPTextModel and dropped the `.text_model` attribute, which
+# Tracking: https://github.com/damian0815/compel/pull/129
-# breaks single-file Stable Diffusion loading on every released diffusers
+#           https://github.com/damian0815/compel/issues/128
-# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
+# compel currently pins transformers~=4.25, which forced pip into multi-hour
-# main via git froze whichever broken pair existed at image-build time. Pin the
+# resolver backtracking storms in CI. backend.py imports it lazily and gates
-# last known-good released pair so builds are reproducible and can't drift into
+# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
 # the broken window. See https://github.com/mudler/LocalAI/issues/9979
 #
 # compel is intentionally omitted: it pins transformers~=4.25, which conflicts
 # with this pin and previously forced pip into multi-hour resolver backtracking
 # storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
 # the import succeeding, so dropping it here is safe.
--- a/backend/python/diffusers/requirements-cublas13.txt
+++ b/backend/python/diffusers/requirements-cublas13.txt
@@ -1,7 +1,7 @@
 --extra-index-url https://download.pytorch.org/whl/cu130
-diffusers==0.38.0
+git+https://github.com/huggingface/diffusers
 opencv-python
-transformers==4.57.6
+transformers
 torchvision
 accelerate
 git+https://github.com/xhinker/sd_embed
@@ -10,15 +10,9 @@ sentencepiece
 torch
 ftfy
 optimum-quanto
-# diffusers and transformers are pinned together on purpose. transformers v5
+# TODO: re-add compel once it supports transformers >= 5.
-# restructured CLIPTextModel and dropped the `.text_model` attribute, which
+# Tracking: https://github.com/damian0815/compel/pull/129
-# breaks single-file Stable Diffusion loading on every released diffusers
+#           https://github.com/damian0815/compel/issues/128
-# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
+# compel currently pins transformers~=4.25, which forced pip into multi-hour
-# main via git froze whichever broken pair existed at image-build time. Pin the
+# resolver backtracking storms in CI. backend.py imports it lazily and gates
-# last known-good released pair so builds are reproducible and can't drift into
+# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
 # the broken window. See https://github.com/mudler/LocalAI/issues/9979
 #
 # compel is intentionally omitted: it pins transformers~=4.25, which conflicts
 # with this pin and previously forced pip into multi-hour resolver backtracking
 # storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
 # the import succeeding, so dropping it here is safe.
--- a/backend/python/diffusers/requirements-hipblas.txt
+++ b/backend/python/diffusers/requirements-hipblas.txt
@@ -1,23 +1,17 @@
 --extra-index-url https://download.pytorch.org/whl/rocm7.0
 torch==2.10.0+rocm7.0
 torchvision==0.25.0+rocm7.0
-diffusers==0.38.0
+git+https://github.com/huggingface/diffusers
 opencv-python
-transformers==4.57.6
+transformers
 accelerate
 peft
 sentencepiece
 optimum-quanto
 ftfy
-# diffusers and transformers are pinned together on purpose. transformers v5
+# TODO: re-add compel once it supports transformers >= 5.
-# restructured CLIPTextModel and dropped the `.text_model` attribute, which
+# Tracking: https://github.com/damian0815/compel/pull/129
-# breaks single-file Stable Diffusion loading on every released diffusers
+#           https://github.com/damian0815/compel/issues/128
-# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
+# compel currently pins transformers~=4.25, which forced pip into multi-hour
-# main via git froze whichever broken pair existed at image-build time. Pin the
+# resolver backtracking storms in CI. backend.py imports it lazily and gates
-# last known-good released pair so builds are reproducible and can't drift into
+# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
 # the broken window. See https://github.com/mudler/LocalAI/issues/9979
 #
 # compel is intentionally omitted: it pins transformers~=4.25, which conflicts
 # with this pin and previously forced pip into multi-hour resolver backtracking
 # storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
 # the import succeeding, so dropping it here is safe.
--- a/backend/python/diffusers/requirements-intel.txt
+++ b/backend/python/diffusers/requirements-intel.txt
@@ -3,24 +3,18 @@ torch
 torchvision
 optimum[openvino]
 setuptools
-diffusers==0.38.0
+git+https://github.com/huggingface/diffusers
 opencv-python
-transformers==4.57.6
+transformers
 accelerate
 git+https://github.com/xhinker/sd_embed
 peft
 sentencepiece
 optimum-quanto
 ftfy
-# diffusers and transformers are pinned together on purpose. transformers v5
+# TODO: re-add compel once it supports transformers >= 5.
-# restructured CLIPTextModel and dropped the `.text_model` attribute, which
+# Tracking: https://github.com/damian0815/compel/pull/129
-# breaks single-file Stable Diffusion loading on every released diffusers
+#           https://github.com/damian0815/compel/issues/128
-# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
+# compel currently pins transformers~=4.25, which forced pip into multi-hour
-# main via git froze whichever broken pair existed at image-build time. Pin the
+# resolver backtracking storms in CI. backend.py imports it lazily and gates
-# last known-good released pair so builds are reproducible and can't drift into
+# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
 # the broken window. See https://github.com/mudler/LocalAI/issues/9979
 #
 # compel is intentionally omitted: it pins transformers~=4.25, which conflicts
 # with this pin and previously forced pip into multi-hour resolver backtracking
 # storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
 # the import succeeding, so dropping it here is safe.
--- a/backend/python/diffusers/requirements-l4t12.txt
+++ b/backend/python/diffusers/requirements-l4t12.txt
@@ -1,7 +1,7 @@
 --extra-index-url https://pypi.jetson-ai-lab.io/jp6/cu129/
 torch
-diffusers==0.38.0
+git+https://github.com/huggingface/diffusers
-transformers==4.57.6
+transformers
 accelerate
 peft
 optimum-quanto
@@ -9,15 +9,9 @@ numpy<2
 sentencepiece
 torchvision
 ftfy
-# diffusers and transformers are pinned together on purpose. transformers v5
+# TODO: re-add compel once it supports transformers >= 5.
-# restructured CLIPTextModel and dropped the `.text_model` attribute, which
+# Tracking: https://github.com/damian0815/compel/pull/129
-# breaks single-file Stable Diffusion loading on every released diffusers
+#           https://github.com/damian0815/compel/issues/128
-# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
+# compel currently pins transformers~=4.25, which forced pip into multi-hour
-# main via git froze whichever broken pair existed at image-build time. Pin the
+# resolver backtracking storms in CI. backend.py imports it lazily and gates
-# last known-good released pair so builds are reproducible and can't drift into
+# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
 # the broken window. See https://github.com/mudler/LocalAI/issues/9979
 #
 # compel is intentionally omitted: it pins transformers~=4.25, which conflicts
 # with this pin and previously forced pip into multi-hour resolver backtracking
 # storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
 # the import succeeding, so dropping it here is safe.
--- a/backend/python/diffusers/requirements-l4t13.txt
+++ b/backend/python/diffusers/requirements-l4t13.txt
@@ -1,7 +1,7 @@
 --extra-index-url https://download.pytorch.org/whl/cu130
 torch
-diffusers==0.38.0
+git+https://github.com/huggingface/diffusers
-transformers==4.57.6
+transformers
 accelerate
 peft
 optimum-quanto
@@ -10,15 +10,9 @@ sentencepiece
 torchvision
 ftfy
 chardet
-# diffusers and transformers are pinned together on purpose. transformers v5
+# TODO: re-add compel once it supports transformers >= 5.
-# restructured CLIPTextModel and dropped the `.text_model` attribute, which
+# Tracking: https://github.com/damian0815/compel/pull/129
-# breaks single-file Stable Diffusion loading on every released diffusers
+#           https://github.com/damian0815/compel/issues/128
-# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
+# compel currently pins transformers~=4.25, which forced pip into multi-hour
-# main via git froze whichever broken pair existed at image-build time. Pin the
+# resolver backtracking storms in CI. backend.py imports it lazily and gates
-# last known-good released pair so builds are reproducible and can't drift into
+# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
 # the broken window. See https://github.com/mudler/LocalAI/issues/9979
 #
 # compel is intentionally omitted: it pins transformers~=4.25, which conflicts
 # with this pin and previously forced pip into multi-hour resolver backtracking
 # storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
 # the import succeeding, so dropping it here is safe.
--- a/backend/python/diffusers/requirements-mps.txt
+++ b/backend/python/diffusers/requirements-mps.txt
@@ -1,22 +1,16 @@
 torch==2.7.1
 torchvision==0.22.1
-diffusers==0.38.0
+git+https://github.com/huggingface/diffusers
 opencv-python
-transformers==4.57.6
+transformers
 accelerate
 peft
 sentencepiece
 optimum-quanto
 ftfy
-# diffusers and transformers are pinned together on purpose. transformers v5
+# TODO: re-add compel once it supports transformers >= 5.
-# restructured CLIPTextModel and dropped the `.text_model` attribute, which
+# Tracking: https://github.com/damian0815/compel/pull/129
-# breaks single-file Stable Diffusion loading on every released diffusers
+#           https://github.com/damian0815/compel/issues/128
-# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
+# compel currently pins transformers~=4.25, which forced pip into multi-hour
-# main via git froze whichever broken pair existed at image-build time. Pin the
+# resolver backtracking storms in CI. backend.py imports it lazily and gates
-# last known-good released pair so builds are reproducible and can't drift into
+# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
 # the broken window. See https://github.com/mudler/LocalAI/issues/9979
 #
 # compel is intentionally omitted: it pins transformers~=4.25, which conflicts
 # with this pin and previously forced pip into multi-hour resolver backtracking
 # storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
 # the import succeeding, so dropping it here is safe.
--- a/backend/python/liquid-audio/install.sh
+++ b/backend/python/liquid-audio/install.sh
@@ -14,11 +14,5 @@ else
 fi
 # liquid-audio's torch wheels are large; allow upgrades to satisfy transitive pins
-EXTRA_PIP_INSTALL_FLAGS+=" --upgrade"
+EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
 # --index-strategy is a uv-only flag. The darwin/MPS build installs with pip
 # (USE_PIP=true in scripts/build/python-darwin.sh), which rejects it. Only add
 # it on the uv path; Linux/CUDA resolution is unchanged.
 if [ "x${USE_PIP:-}" != "xtrue" ]; then
    EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-first-match"
 fi
 installRequirements
--- a/backend/python/liquid-audio/requirements-mps.txt
+++ b/backend/python/liquid-audio/requirements-mps.txt
@@ -1,4 +1,3 @@
 # MPS (Apple Silicon / Metal) build profile - installed by the darwin CI job.
 torch>=2.8.0
 torchaudio>=2.8.0
 torchcodec>=0.9.1
--- a/core/application/application.go
+++ b/core/application/application.go
@@ -341,9 +341,11 @@ func (a *Application) ResolvePIIPolicy(cfg *config.ModelConfig) (enabled bool, d
 	}
 	appCfg := a.ApplicationConfig()
-	// PIIIsEnabled already encodes "explicit pii.enabled wins, else backend
+	if cfg.PII.Enabled != nil {
-	// default (cloud-proxy)" — the single source of that rule.
+		enabled = *cfg.PII.Enabled
-	enabled = cfg.PIIIsEnabled()
+	} else {
 		enabled = cfg.PIIIsEnabled() // backend default (cloud-proxy)
 	}
 	if !enabled {
 		return false, nil
 	}
@@ -352,7 +354,7 @@ func (a *Application) ResolvePIIPolicy(cfg *config.ModelConfig) (enabled bool, d
 	if len(detectors) == 0 {
 		detectors = append([]string(nil), appCfg.PIIDefaultDetectors...)
 	}
-	return true, detectors // enabled is necessarily true past the !enabled guard
+	return enabled, detectors
 }
 // PIIPolicyResolver adapts ResolvePIIPolicy to pii.PolicyResolver for
--- a/core/application/config_file_watcher.go
+++ b/core/application/config_file_watcher.go
@@ -215,7 +215,6 @@ func readRuntimeSettingsJson(startupAppConfig config.ApplicationConfig) fileHand
 		envBackendGalleries := slices.Equal(appConfig.BackendGalleries, startupAppConfig.BackendGalleries)
 		envAutoloadGalleries := appConfig.AutoloadGalleries == startupAppConfig.AutoloadGalleries
 		envAutoloadBackendGalleries := appConfig.AutoloadBackendGalleries == startupAppConfig.AutoloadBackendGalleries
 		envPIIDefaultDetectors := slices.Equal(appConfig.PIIDefaultDetectors, startupAppConfig.PIIDefaultDetectors)
 		envAgentJobRetentionDays := appConfig.AgentJobRetentionDays == startupAppConfig.AgentJobRetentionDays
 		envForceEvictionWhenBusy := appConfig.ForceEvictionWhenBusy == startupAppConfig.ForceEvictionWhenBusy
 		envLRUEvictionMaxRetries := appConfig.LRUEvictionMaxRetries == startupAppConfig.LRUEvictionMaxRetries
@@ -336,15 +335,6 @@ func readRuntimeSettingsJson(startupAppConfig config.ApplicationConfig) fileHand
 			if settings.AutoloadBackendGalleries != nil && !envAutoloadBackendGalleries {
 				appConfig.AutoloadBackendGalleries = *settings.AutoloadBackendGalleries
 			}
 			if settings.PIIDefaultDetectors != nil && !envPIIDefaultDetectors {
 				// Request-side default redaction reads this live via
 				// ResolvePIIPolicy, so a file edit takes effect on the next chat
 				// request. The MITM listener resolves its per-host detector map
 				// once at start, so a raw file edit reaches cloud-proxy traffic
 				// only after a restart or a POST /api/settings (which rebuilds
 				// the listener) — the admin UI uses the latter.
 				appConfig.PIIDefaultDetectors = append([]string(nil), (*settings.PIIDefaultDetectors)...)
 			}
 			if settings.AutoUpgradeBackends != nil {
 				appConfig.AutoUpgradeBackends = *settings.AutoUpgradeBackends
 			}
--- a/core/application/distributed.go
+++ b/core/application/distributed.go
@@ -357,15 +357,6 @@ func initDistributed(cfg *config.ApplicationConfig, authDB *gorm.DB, configLoade
 		Pressure:         pressure,
 	})
 	// Wire staging-progress broadcasting so file-staging shows up on every
 	// replica, not just the one performing the transfer. Without this, a
 	// /api/operations poll that round-robins onto a peer sees no staging row and
 	// the progress flickers. The origin publishes; peers mirror via the wildcard.
 	router.StagingTracker().SetPublisher(natsClient)
 	if _, err := router.StagingTracker().SubscribeBroadcasts(natsClient); err != nil {
 		xlog.Warn("Failed to subscribe to staging progress broadcasts", "error", err)
 	}
 	// Create ReplicaReconciler for auto-scaling model replicas. Adapter +
 	// RegistrationToken feed the state-reconciliation passes: pending op
 	// drain uses the adapter, and model health probes use the token to auth
--- a/core/application/runtime_settings_branding_test.go
+++ b/core/application/runtime_settings_branding_test.go
@@ -109,52 +109,6 @@ var _ = Describe("loadRuntimeSettingsFromFile", func() {
 		})
 	})
 	// Instance-wide default PII detectors. The file is the only source (no
 	// env var), and the loader runs immediately before startMITMIfConfigured,
 	// so a regression here means the cloud-proxy MITM listener resolves an
 	// empty detector set at boot and forwards intercepted traffic unredacted —
 	// even though pii_default_detectors is on disk and the MITM model has PII
 	// enabled. It also breaks request-side default redaction the same way.
 	// Specs for how loadRuntimeSettingsFromFile populates
 	// ApplicationConfig.PIIDefaultDetectors from the persisted settings file.
 	Describe("PII default detectors", func() {
 		It("loads pii_default_detectors from the file", func() {
 			// PIIDefaultDetectors starts empty, so the file value is expected
 			// to be applied verbatim and in order.
 			// NOTE(review): seedSettings is defined outside this view; it
 			// presumably writes the JSON to a fresh settings dir and returns
 			// the dir path - confirm.
 			cfg := &config.ApplicationConfig{DynamicConfigsDir: seedSettings(`{"pii_default_detectors": ["privacy-filter-nemotron", "secret-filter"]}`)}
 			loadRuntimeSettingsFromFile(cfg)
 			Expect(cfg.PIIDefaultDetectors).To(Equal([]string{"privacy-filter-nemotron", "secret-filter"}))
 		})
 		It("does not override an env/CLI-set value (LOCALAI_PII_DEFAULT_DETECTORS)", func() {
 			// A non-empty PIIDefaultDetectors stands in for a value that came
 			// from the env/CLI; the loader only fills the field when it is
 			// empty, so the file value must be ignored here.
 			cfg := &config.ApplicationConfig{
 				DynamicConfigsDir:   seedSettings(`{"pii_default_detectors": ["from-file"]}`),
 				PIIDefaultDetectors: []string{"from-env"}, // simulate WithPIIDefaultDetectors(env)
 			}
 			loadRuntimeSettingsFromFile(cfg)
 			Expect(cfg.PIIDefaultDetectors).To(Equal([]string{"from-env"}), "env var must win over the persisted file value")
 		})
 	})
 	// The live file watcher applies pii_default_detectors on a runtime change
 	// the same way it handles galleries/threads/etc.: env-set values (current
 	// == startup snapshot) are left alone, otherwise the file value is applied
 	// to the live config so request-side default redaction picks it up without
 	// a restart.
 	// Specs for the runtime-settings file handler returned by
 	// readRuntimeSettingsJson. The handler is built from a startup snapshot and
 	// is invoked with the raw JSON bytes plus the live config to mutate.
 	Describe("file watcher: pii_default_detectors", func() {
 		It("applies a changed file value to the live config", func() {
 			// The live value ("old") differs from the startup snapshot (empty),
 			// so the field is not treated as env-controlled and the file wins.
 			startup := config.ApplicationConfig{} // no env baseline
 			live := &config.ApplicationConfig{PIIDefaultDetectors: []string{"old"}}
 			handler := readRuntimeSettingsJson(startup)
 			Expect(handler([]byte(`{"pii_default_detectors":["new-a","new-b"]}`), live)).To(Succeed())
 			Expect(live.PIIDefaultDetectors).To(Equal([]string{"new-a", "new-b"}))
 		})
 		It("leaves an env-controlled value untouched", func() {
 			// Live value equal to the startup snapshot is how the handler
 			// recognises an env-controlled field; the file value is skipped.
 			startup := config.ApplicationConfig{PIIDefaultDetectors: []string{"from-env"}}
 			live := &config.ApplicationConfig{PIIDefaultDetectors: []string{"from-env"}}
 			handler := readRuntimeSettingsJson(startup)
 			Expect(handler([]byte(`{"pii_default_detectors":["from-file"]}`), live)).To(Succeed())
 			Expect(live.PIIDefaultDetectors).To(Equal([]string{"from-env"}), "env-controlled detectors must not be overwritten by the file")
 		})
 	})
 	// The Agent Pool block has a mix of zero and non-zero defaults
 	// (Enabled=true, EmbeddingModel="granite-...", MaxChunkingSize=400,
 	// VectorEngine="chromem", AgentHubURL="https://agenthub.localai.io").
--- a/core/application/startup.go
+++ b/core/application/startup.go
@@ -750,20 +750,6 @@ func loadRuntimeSettingsFromFile(options *config.ApplicationConfig) {
 		options.MITMListen = *settings.MITMListen
 	}
 	// Instance-wide default PII detectors. LOCALAI_PII_DEFAULT_DETECTORS (via
 	// WithPIIDefaultDetectors) wins when set; otherwise the file is the source
 	// — apply it only when the env/CLI left the value empty, mirroring the
 	// "env > file" precedence used for the other fields. This must land before
 	// startMITMIfConfigured (called right after this loader): the cloud-proxy
 	// listener resolves each intercept host's detectors once at start via
 	// ResolvePIIPolicy, and a MITM model that names no detectors of its own
 	// falls back to these defaults. Without it the listener (and request-side
 	// default redaction) starts with an empty detector set and forwards
 	// traffic unredacted even though pii_default_detectors is on disk.
 	if settings.PIIDefaultDetectors != nil && len(options.PIIDefaultDetectors) == 0 {
 		options.PIIDefaultDetectors = append([]string(nil), (*settings.PIIDefaultDetectors)...)
 	}
 	// Backend upgrade flags
 	if settings.AutoUpgradeBackends != nil {
 		if !options.AutoUpgradeBackends {
--- a/core/cli/run.go
+++ b/core/cli/run.go
@@ -181,8 +181,6 @@ type RunCMD struct {
 	// Cloud-proxy MITM listener (off by default).
 	MITMListen string `env:"LOCALAI_MITM_LISTEN" help:"Address (host:port) for the cloudproxy MITM listener. Empty = disabled. Clients set HTTPS_PROXY=http://<this>:<port>. Intercept hosts are declared per-model via the model YAML mitm.hosts: block; create one from the Add Model UI." group:"middleware"`
 	MITMCADir  string `env:"LOCALAI_MITM_CA_DIR" type:"path" help:"Directory holding the MITM proxy CA cert + key. Defaults to <data-path>/mitm-ca." group:"middleware"`
 	PIIDefaultDetectors []string `env:"LOCALAI_PII_DEFAULT_DETECTORS" help:"Instance-wide default PII/secret detector model names applied to any PII-enabled model (chiefly cloud-proxy / MITM models) that names no pii.detectors of its own. Comma-separated, e.g. privacy-filter-nemotron,secret-filter. Takes precedence over the value persisted via the Middleware UI." group:"middleware"`
 }
 func (r *RunCMD) Run(ctx *cliContext.Context) error {
@@ -245,7 +243,6 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
 		config.WithAPIAddress(r.Address),
 		config.WithMITMListen(r.MITMListen),
 		config.WithMITMCADir(r.MITMCADir),
 		config.WithPIIDefaultDetectors(r.PIIDefaultDetectors),
 		config.WithAgentJobRetentionDays(r.AgentJobRetentionDays),
 		config.WithLlamaCPPTunnelCallback(func(tunnels []string) {
 			tunnelEnvVar := strings.Join(tunnels, ",")
--- a/core/config/application_config.go
+++ b/core/config/application_config.go
@@ -712,18 +712,6 @@ func WithMITMCADir(dir string) AppOption {
 	}
 }
 // WithPIIDefaultDetectors returns an AppOption that records detectors as the
 // instance-wide default PII/secret detector model names. Those defaults are
 // applied to any PII-enabled model (chiefly cloud-proxy / MITM models) that
 // names no pii.detectors of its own. The value is fed from the CLI/env setting
 // LOCALAI_PII_DEFAULT_DETECTORS: an empty value leaves the decision to
 // runtime_settings.json / the Middleware UI, while a non-empty value takes
 // precedence over the file (env > file).
 func WithPIIDefaultDetectors(detectors []string) AppOption {
 	// The slice is stored exactly as passed in; no defensive copy is made.
 	var apply AppOption = func(cfg *ApplicationConfig) {
 		cfg.PIIDefaultDetectors = detectors
 	}
 	return apply
 }
 func WithDynamicConfigDir(dynamicConfigsDir string) AppOption {
 	return func(o *ApplicationConfig) {
 		o.DynamicConfigsDir = dynamicConfigsDir
--- a/core/config/backend_capabilities.go
+++ b/core/config/backend_capabilities.go
@@ -542,6 +542,19 @@ var BackendCapabilities = map[string]BackendCapability{
 		DefaultUsecases:  []string{UsecaseSpeakerRecognition},
 		Description:      "Speaker recognition — voice identity verification and analysis",
 	},
 	"voice-detect": {
 		GRPCMethods:      []GRPCMethod{MethodVoiceVerify, MethodVoiceEmbed, MethodVoiceAnalyze},
 		PossibleUsecases: []string{UsecaseSpeakerRecognition},
 		DefaultUsecases:  []string{UsecaseSpeakerRecognition},
 		Description:      "voice-detect.cpp: C++/ggml speaker embedding, verification and voice analysis (age/gender/emotion)",
 	},
 	"face-detect": {
 		GRPCMethods:      []GRPCMethod{MethodEmbedding, MethodDetect, MethodFaceVerify, MethodFaceAnalyze},
 		PossibleUsecases: []string{UsecaseEmbeddings, UsecaseDetection, UsecaseFaceRecognition},
 		DefaultUsecases:  []string{UsecaseFaceRecognition},
 		AcceptsImages:    true,
 		Description:      "face-detect.cpp: C++/ggml face detection, embedding, verification and attribute analysis",
 	},
 	"silero-vad": {
 		GRPCMethods:      []GRPCMethod{MethodVAD},
 		PossibleUsecases: []string{UsecaseVAD},
--- a/core/config/hardware_defaults.go
+++ b/core/config/hardware_defaults.go
@@ -54,35 +54,8 @@ func (g GPU) IsNVIDIABlackwell() bool {
 	return maj >= 12
 }
 // Compute-buffer headroom guard for the raised physical batch.
 //
 // Raising n_ubatch grows the CUDA *compute buffer* (the scratch for the forward
 // graph), which is allocated PER DEVICE — it does not benefit from a second GPU
 // the way weights or KV (which are split across devices) do. The buffer scales
 // ~linearly with n_ubatch * n_ctx, so a large context turns the GB10-tuned
 // ub2048 into multi-GiB of extra scratch that must fit on a SINGLE card. On a
 // 16 GiB consumer Blackwell with a 200k context that overflows (issue #10485),
 // even though the GB10 it was measured on (128 GiB unified memory) had room.
 //
 // These constants size a conservative guard: only raise the batch when the
 // extra scratch fits the per-device VRAM ceiling.
 const (
 	// computeBufferBytesPerCell approximates the CUDA compute-buffer cost of one
 	// (n_ubatch * n_ctx) cell. Derived from an observed allocation (ub2048 *
 	// ctx204800 ~= 4.5 GiB => ~11 B/cell) and rounded up to 16 for margin, since
 	// the real cost also grows with model width (heads / embedding dim) which we
 	// don't know at config time.
 	//
 	// PhysicalBatchForContext multiplies it by the context size and by
 	// (BlackwellPhysicalBatch - DefaultPhysicalBatch) to estimate the extra
 	// scratch, in bytes, that raising the batch would cost.
 	computeBufferBytesPerCell = 16
 	// blackwellBatchHeadroomDivisor caps the extra compute buffer from raising the
 	// physical batch at VRAM/divisor. /4 keeps the bulk of a device for weights +
 	// KV, which already dominate VRAM use.
 	//
 	// PhysicalBatchForContext keeps the raised batch only when the estimated
 	// extra scratch is <= VRAM/blackwellBatchHeadroomDivisor.
 	blackwellBatchHeadroomDivisor = 4
 )
 // PhysicalBatch returns the canonical physical batch (n_batch/n_ubatch) for the
-// given hardware class, ignoring context/VRAM headroom. Use
+// given hardware, used when the model config leaves batch unset.
 // PhysicalBatchForContext when a model context and per-device VRAM are known
 // (the load paths) so the raised batch can't overflow a single device.
 func PhysicalBatch(g GPU) int {
 	if g.IsNVIDIABlackwell() {
 		return BlackwellPhysicalBatch
@@ -90,32 +63,6 @@ func PhysicalBatch(g GPU) int {
 	return DefaultPhysicalBatch
 }
 // PhysicalBatchForContext chooses the physical batch for GPU g when a model
 // context size is known. It is PhysicalBatch with a per-device VRAM headroom
 // gate: the batch is only raised above DefaultPhysicalBatch when the extra
 // compute buffer (allocated on a single device, growing with n_ubatch * n_ctx)
 // fits within VRAM/blackwellBatchHeadroomDivisor.
 //
 // g.VRAM must be the PER-DEVICE ceiling (the smallest device on a multi-GPU
 // host), not the summed total, because the compute buffer can't be split.
 // A ctx <= 0 is replaced by DefaultContextSize.
 //
 // VRAM 0 (unknown) stays conservative rather than risk a per-device OOM; the
 // GB10 / unified-memory path reports system RAM, so it still clears the guard.
 func PhysicalBatchForContext(g GPU, ctx int) int {
 	// Non-Blackwell hardware and unknown VRAM both keep the default batch.
 	if !g.IsNVIDIABlackwell() || g.VRAM == 0 {
 		return DefaultPhysicalBatch
 	}
 	effectiveCtx := ctx
 	if effectiveCtx <= 0 {
 		effectiveCtx = DefaultContextSize
 	}
 	// Extra scratch, in bytes, that the raised batch would add at this context.
 	batchDelta := uint64(BlackwellPhysicalBatch - DefaultPhysicalBatch)
 	extraScratch := uint64(effectiveCtx) * batchDelta * computeBufferBytesPerCell
 	budget := g.VRAM / blackwellBatchHeadroomDivisor
 	if extraScratch > budget {
 		return DefaultPhysicalBatch
 	}
 	return BlackwellPhysicalBatch
 }
 // IsManagedPhysicalBatch reports whether n is a value PhysicalBatch assigns.
 // Callers that re-tune a value chosen by an upstream host (the distributed
 // router correcting the frontend's guess) use this to avoid clobbering an
@@ -175,12 +122,7 @@ func hasParallelOption(opts []string) bool {
 // deterministic device — detection does a live nvidia-smi call.
 var localGPU = func() GPU {
 	vendor, _ := xsysinfo.DetectGPUVendor()
-	// Use the SMALLEST device's VRAM, not the summed total: the parallel-slot
+	vram, _ := xsysinfo.TotalAvailableVRAM()
 	// tier and the batch headroom guard both reason about what fits on a single
 	// card, and per-device compute buffers can't be split across GPUs. Summing
 	// two 16 GiB cards into "32 GiB" is what over-provisioned multi-GPU hosts
 	// into OOM (issue #10485).
 	vram, _ := xsysinfo.MinPerGPUVRAM()
 	return GPU{
 		Vendor:            vendor,
 		ComputeCapability: xsysinfo.NVIDIAComputeCapability(),
@@ -195,20 +137,10 @@ func ApplyHardwareDefaults(cfg *ModelConfig, gpu GPU) {
 	if cfg == nil {
 		return
 	}
-	// Raise the physical batch on Blackwell only when the resulting compute
+	if cfg.Batch == 0 && gpu.IsNVIDIABlackwell() {
 	// buffer fits the per-device VRAM at THIS model's context. Leaving Batch at 0
 	// (rather than writing the default 512) preserves the downstream single-pass
 	// sizing in core/backend.EffectiveBatchSize for embedding/score/rerank.
 	if cfg.Batch == 0 {
 		ctx := DefaultContextSize
 		if cfg.ContextSize != nil {
 			ctx = *cfg.ContextSize
 		}
 		if PhysicalBatchForContext(gpu, ctx) == BlackwellPhysicalBatch {
 		cfg.Batch = BlackwellPhysicalBatch
 		xlog.Debug("[hardware_defaults] Blackwell GPU: defaulting physical batch",
-				"batch", cfg.Batch, "compute_cap", gpu.ComputeCapability, "context", ctx, "vram_gib", gpu.VRAM>>30)
+			"batch", cfg.Batch, "compute_cap", gpu.ComputeCapability)
 		}
 	}
 	// Enable concurrent serving by default on a capable GPU: without this the
--- a/core/config/hardware_defaults_internal_test.go
+++ b/core/config/hardware_defaults_internal_test.go
@@ -9,37 +9,26 @@ import (
 // GPU. The detection seam (localGPU) is injected so the path is deterministic
 // without a real GPU.
 var _ = Describe("SetDefaults hardware defaults (single-instance)", func() {
 	const gib = uint64(1) << 30
 	var orig func() GPU
 	BeforeEach(func() { orig = localGPU })
 	AfterEach(func() { localGPU = orig })
-	It("sets the physical batch on a local Blackwell GPU with headroom", func() {
+	It("sets the physical batch on a local Blackwell GPU", func() {
-		localGPU = func() GPU { return GPU{ComputeCapability: "12.1", VRAM: 119 * gib} }
+		localGPU = func() GPU { return GPU{ComputeCapability: "12.1"} }
 		cfg := &ModelConfig{}
 		cfg.SetDefaults()
 		Expect(cfg.Batch).To(Equal(BlackwellPhysicalBatch))
 	})
 	It("leaves batch unset when a large context would overflow the device", func() {
 		// Regression guard for issue #10485: 16 GiB consumer Blackwell + ~200k ctx.
 		localGPU = func() GPU { return GPU{ComputeCapability: "12.0", VRAM: 16 * gib} }
 		ctx := 204800
 		cfg := &ModelConfig{LLMConfig: LLMConfig{ContextSize: &ctx}}
 		cfg.SetDefaults()
 		Expect(cfg.Batch).To(Equal(0))
 	})
 	It("leaves batch unset on a non-Blackwell local GPU", func() {
-		localGPU = func() GPU { return GPU{ComputeCapability: "8.9", VRAM: 119 * gib} }
+		localGPU = func() GPU { return GPU{ComputeCapability: "8.9"} }
 		cfg := &ModelConfig{}
 		cfg.SetDefaults()
 		Expect(cfg.Batch).To(Equal(0))
 	})
 	It("never overrides an explicit batch", func() {
-		localGPU = func() GPU { return GPU{ComputeCapability: "12.1", VRAM: 119 * gib} }
+		localGPU = func() GPU { return GPU{ComputeCapability: "12.1"} }
 		cfg := &ModelConfig{}
 		cfg.Batch = 1024
 		cfg.SetDefaults()
--- a/core/config/hardware_defaults_test.go
+++ b/core/config/hardware_defaults_test.go
@@ -7,8 +7,6 @@ import (
 )
 var _ = Describe("Hardware-driven config defaults", func() {
 	const gib = uint64(1) << 30
 	DescribeTable("GPU.IsNVIDIABlackwell (sm_12x consumer family)",
 		func(cc string, want bool) {
 			Expect(GPU{ComputeCapability: cc}.IsNVIDIABlackwell()).To(Equal(want))
@@ -37,54 +35,21 @@ var _ = Describe("Hardware-driven config defaults", func() {
 		})
 	})
 	Describe("PhysicalBatchForContext (per-device VRAM headroom)", func() {
 		It("raises the batch when the compute buffer fits the device", func() {
 			// 16 GiB Blackwell with a small context: the extra scratch is tiny.
 			Expect(PhysicalBatchForContext(GPU{ComputeCapability: "12.0", VRAM: 16 * gib}, 8192)).
 				To(Equal(BlackwellPhysicalBatch))
 		})
 		It("keeps the default batch when a large context would overflow one device", func() {
 			// The issue #10485 case: 16 GiB consumer Blackwell, ~200k context.
 			Expect(PhysicalBatchForContext(GPU{ComputeCapability: "12.0", VRAM: 16 * gib}, 204800)).
 				To(Equal(DefaultPhysicalBatch))
 		})
 		It("still raises the batch on a large unified-memory device (GB10)", func() {
 			// GB10 reports system RAM (~119 GiB) as its single device's VRAM.
 			Expect(PhysicalBatchForContext(GPU{ComputeCapability: "12.1", VRAM: 119 * gib}, 204800)).
 				To(Equal(BlackwellPhysicalBatch))
 		})
 		It("stays conservative when VRAM is unknown", func() {
 			Expect(PhysicalBatchForContext(GPU{ComputeCapability: "12.1"}, 8192)).
 				To(Equal(DefaultPhysicalBatch))
 		})
 		It("never raises the batch on non-Blackwell", func() {
 			Expect(PhysicalBatchForContext(GPU{ComputeCapability: "9.0", VRAM: 80 * gib}, 8192)).
 				To(Equal(DefaultPhysicalBatch))
 		})
 	})
 	Describe("ApplyHardwareDefaults", func() {
-		It("raises an unset batch to 2048 on Blackwell with headroom", func() {
+		It("raises an unset batch to 2048 on Blackwell", func() {
 			cfg := &ModelConfig{}
-			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1", VRAM: 119 * gib})
+			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1"})
 			Expect(cfg.Batch).To(Equal(BlackwellPhysicalBatch))
 		})
 		It("leaves batch unset when a large context would overflow one device", func() {
 			// Regression guard for issue #10485: 16 GiB card + ~200k context.
 			ctx := 204800
 			cfg := &ModelConfig{LLMConfig: LLMConfig{ContextSize: &ctx}}
 			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.0", VRAM: 16 * gib})
 			Expect(cfg.Batch).To(Equal(0))
 		})
 		It("leaves batch unset on non-Blackwell", func() {
 			cfg := &ModelConfig{}
-			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "9.0", VRAM: 119 * gib})
+			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "9.0"})
 			Expect(cfg.Batch).To(Equal(0))
 		})
 		It("never overrides an explicit batch", func() {
 			cfg := &ModelConfig{}
 			cfg.Batch = 1024
-			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1", VRAM: 119 * gib})
+			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1"})
 			Expect(cfg.Batch).To(Equal(1024))
 		})
 		It("no-ops on nil", func() {
@@ -92,6 +57,8 @@ var _ = Describe("Hardware-driven config defaults", func() {
 		})
 	})
 	const gib = uint64(1) << 30
 	DescribeTable("DefaultParallelSlots (by VRAM)",
 		func(vramGiB uint64, want int) {
 			Expect(DefaultParallelSlots(GPU{VRAM: vramGiB * gib})).To(Equal(want))
--- a/core/config/meta/registry.go
+++ b/core/config/meta/registry.go
@@ -537,36 +537,6 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Component:   "number",
 			Order:       79,
 		},
 		"pipeline.compaction.enabled": {
 			Section:     "pipeline",
 			Label:       "Compaction Enabled",
 			Description: "Fold conversation items that age out of the live window (Max History Items) into a rolling summary instead of dropping them, so long realtime sessions stay cheap without losing earlier context. Off by default.",
 			Component:   "toggle",
 			Order:       80,
 		},
 		"pipeline.compaction.trigger_items": {
 			Section:     "pipeline",
 			Label:       "Compaction Trigger Items",
 			Description: "High-water mark: once the live conversation exceeds this many items, the overflow above Max History Items is summarized and evicted. Must be greater than Max History Items; defaults to twice it. The gap controls how often summarization runs.",
 			Component:   "number",
 			Order:       81,
 		},
 		"pipeline.compaction.summary_model": {
 			Section:     "pipeline",
 			Label:       "Compaction Summary Model",
 			Description: "Optional smaller/cheaper model used to produce the rolling summary. Empty reuses the pipeline's own LLM. On CPU, a tiny model here keeps compaction from competing with the conversation LLM.",
 			Component:   "input",
 			Advanced:    true,
 			Order:       82,
 		},
 		"pipeline.compaction.max_summary_tokens": {
 			Section:     "pipeline",
 			Label:       "Compaction Max Summary Tokens",
 			Description: "Advisory cap on the rolling summary length (fed to the summarizer prompt). Defaults to 512.",
 			Component:   "number",
 			Advanced:    true,
 			Order:       83,
 		},
 		// --- Functions ---
 		"function.grammar.parallel_calls": {
--- a/core/config/model_config.go
+++ b/core/config/model_config.go
@@ -641,32 +641,11 @@ type Pipeline struct {
 	// context fills.
 	MaxHistoryItems *int `yaml:"max_history_items,omitempty" json:"max_history_items,omitempty"`
 	// Compaction folds conversation items that age out of the live window
 	// (max_history_items) into a rolling summary instead of dropping them, so
 	// long realtime sessions stay cheap without losing earlier context. Nil
 	// (block absent) means disabled, preserving existing behavior.
 	Compaction *PipelineCompaction `yaml:"compaction,omitempty" json:"compaction,omitempty"`
 	// VoiceRecognition gates the pipeline behind speaker verification. Nil
 	// (block absent) means no gate, preserving existing behavior.
 	VoiceRecognition *PipelineVoiceRecognition `yaml:"voice_recognition,omitempty" json:"voice_recognition,omitempty"`
 }
 // PipelineCompaction configures summarize-then-drop for a realtime pipeline.
 type PipelineCompaction struct {
 	// Enabled turns summarize-then-drop on. Default false.
 	Enabled bool `yaml:"enabled,omitempty" json:"enabled,omitempty"`
 	// TriggerItems is the high-water mark: once live items exceed it, overflow
 	// above max_history_items is summarized and evicted. Must exceed
 	// max_history_items; clamped up if not. Default: 2x max_history_items.
 	TriggerItems int `yaml:"trigger_items,omitempty" json:"trigger_items,omitempty"`
 	// SummaryModel optionally names a smaller/cheaper model for the summary
 	// call. Empty uses the pipeline's own LLM.
 	SummaryModel string `yaml:"summary_model,omitempty" json:"summary_model,omitempty"`
 	// MaxSummaryTokens advises the summary length (fed to the prompt). Default 512.
 	MaxSummaryTokens int `yaml:"max_summary_tokens,omitempty" json:"max_summary_tokens,omitempty"`
 }
 // ApplyReasoningEffort resolves the effective reasoning effort — a per-request
 // value (requestEffort) overrides the config's own ReasoningEffort default —
 // stores it on the config so gRPCPredictOpts forwards it to the backend as the
@@ -1204,6 +1183,11 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
 	// This ensures gallery-installed and runtime-loaded models get optimal parameters.
 	ApplyInferenceDefaults(cfg, cfg.Name, cfg.Model)
 	// Apply hardware-driven defaults (e.g. a larger physical batch on Blackwell).
 	// Uses the local GPU here; in distributed mode the router re-applies the same
 	// heuristics for the selected node's GPU before loading. Explicit config wins.
 	ApplyHardwareDefaults(cfg, localGPU())
 	// Apply serving-policy defaults (device-independent): cross-request prefix
 	// caching. Propagates to distributed nodes via the model options.
 	ApplyServingDefaults(cfg)
@@ -1242,16 +1226,6 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
 		cfg.ContextSize = &ctx
 	}
 	runBackendHooks(cfg, lo.modelPath)
 	// Apply hardware-driven defaults (e.g. a larger physical batch on Blackwell)
 	// LAST, after the context size is fully resolved (explicit config, LoadOptions,
 	// then the GGUF guess inside runBackendHooks): the Blackwell batch guard sizes
 	// the per-device compute buffer against this model's context, so it must see
 	// the final value, not a pre-guess nil. Uses the local GPU here; in distributed
 	// mode the router re-applies the same heuristics for the selected node's GPU
 	// before loading. Explicit config always wins.
 	ApplyHardwareDefaults(cfg, localGPU())
 	cfg.syncKnownUsecasesFromString()
 }
--- a/core/config/runtime_settings_persist.go
+++ b/core/config/runtime_settings_persist.go
@@ -5,7 +5,6 @@ import (
 	"errors"
 	"os"
 	"path/filepath"
 	"reflect"
 )
 // runtimeSettingsFile is the on-disk filename inside DynamicConfigsDir.
@@ -34,35 +33,6 @@ func (o *ApplicationConfig) ReadPersistedSettings() (RuntimeSettings, error) {
 	return settings, nil
 }
 // MergeNonNil overlays every set (non-nil) field of overlay onto the
 // receiver, leaving the receiver's value untouched wherever overlay left a
 // field unset. Every RuntimeSettings field is a pointer precisely so "set"
 // can be told apart from "absent" (see the type doc), which makes this a
 // faithful partial update: a caller that submits only the field it owns
 // changes exactly that field and never clobbers unrelated settings.
 //
 // This is the read-modify-write contract the persistence helpers exist for.
 // UpdateSettingsEndpoint reads the on-disk settings, merges the request body
 // on top, and writes the result — so a focused admin page that POSTs only its
 // own field (the Middleware page sends only mitm_listen; the detector table
 // only pii_default_detectors) no longer nulls every other setting.
 //
 // Reflection keeps the merge total over the struct: a field added to
 // RuntimeSettings later is merged automatically, so the persistence path can
 // never silently drop a new setting the way a hand-maintained field list
 // would. Non-pointer fields (none today) are skipped — they cannot express
 // "absent", so the receiver wins.
 func (s *RuntimeSettings) MergeNonNil(overlay RuntimeSettings) {
 	dst := reflect.ValueOf(s).Elem()
 	src := reflect.ValueOf(overlay)
 	for i := 0; i < src.NumField(); i++ {
 		f := src.Field(i)
 		if f.Kind() == reflect.Pointer && !f.IsNil() {
 			dst.Field(i).Set(f)
 		}
 	}
 }
 // WritePersistedSettings serialises the given RuntimeSettings to
 // runtime_settings.json with restricted permissions (it may carry API
 // keys and P2P tokens).
--- a/core/config/runtime_settings_persist_test.go
+++ b/core/config/runtime_settings_persist_test.go
@@ -12,7 +12,6 @@ import (
 )
 func strPtr(s string) *string { return &s }
 func boolPtr(b bool) *bool     { return &b }
 var _ = Describe("RuntimeSettings persistence helpers", func() {
 	var (
@@ -52,47 +51,6 @@ var _ = Describe("RuntimeSettings persistence helpers", func() {
 		})
 	})
 	// MergeNonNil is the partial-update primitive UpdateSettingsEndpoint
 	// relies on: a focused admin page POSTs only the field it owns, and the
 	// handler reads the on-disk settings and overlays the request on top.
 	// Without it, the body would be written verbatim and every field the
 	// caller omitted would be nulled (the reported regression: changing
 	// mitm_listen wiped the galleries, api keys, watchdog config, etc.).
 	Describe("MergeNonNil partial update", func() {
 		It("overlays set fields and preserves unset ones", func() {
 			base := config.RuntimeSettings{
 				MITMListen:          strPtr(":9000"),
 				Galleries:           &[]config.Gallery{{Name: "g1", URL: "http://example/g1"}},
 				WatchdogIdleEnabled: boolPtr(true),
 				ApiKeys:             &[]string{"persisted-key"},
 				PIIDefaultDetectors: &[]string{"det-a"},
 			}
 			// Simulate the Middleware proxy tab: only mitm_listen is sent.
 			overlay := config.RuntimeSettings{MITMListen: strPtr(":8443")}
 			base.MergeNonNil(overlay)
 			Expect(base.MITMListen).ToNot(BeNil())
 			Expect(*base.MITMListen).To(Equal(":8443"), "set field should be overlaid")
 			// Everything the overlay left unset must survive untouched.
 			Expect(base.Galleries).ToNot(BeNil(), "galleries were clobbered")
 			Expect(*base.Galleries).To(HaveLen(1))
 			Expect(base.WatchdogIdleEnabled).ToNot(BeNil())
 			Expect(*base.WatchdogIdleEnabled).To(BeTrue())
 			Expect(base.ApiKeys).ToNot(BeNil(), "api_keys were clobbered")
 			Expect(*base.ApiKeys).To(Equal([]string{"persisted-key"}))
 			Expect(base.PIIDefaultDetectors).ToNot(BeNil(), "pii_default_detectors were clobbered")
 			Expect(*base.PIIDefaultDetectors).To(Equal([]string{"det-a"}))
 		})
 		It("lets an explicit empty slice clear a field", func() {
 			base := config.RuntimeSettings{PIIDefaultDetectors: &[]string{"det-a"}}
 			base.MergeNonNil(config.RuntimeSettings{PIIDefaultDetectors: &[]string{}})
 			Expect(base.PIIDefaultDetectors).ToNot(BeNil())
 			Expect(*base.PIIDefaultDetectors).To(BeEmpty(), "an explicit empty slice should clear, not preserve")
 		})
 	})
 	// MITM round trip pins the contract that loadRuntimeSettingsFromFile
 	// MITM listener address must survive a write/read round trip so the
 	// next process restart can bring the listener back up. (Intercept
--- a/core/http/endpoints/localai/agent_collections.go
+++ b/core/http/endpoints/localai/agent_collections.go
@@ -70,7 +70,7 @@ func UploadToCollectionEndpoint(app *application.Application) echo.HandlerFunc {
 	return func(c echo.Context) error {
 		svc := app.AgentPoolService()
 		userID := effectiveUserID(c)
-		name := decodedParam(c, "name")
+		name := c.Param("name")
 		file, err := c.FormFile("file")
 		if err != nil {
 			return c.JSON(http.StatusBadRequest, map[string]string{"error": "file required"})
@@ -116,7 +116,7 @@ func ListCollectionEntriesEndpoint(app *application.Application) echo.HandlerFun
 	return func(c echo.Context) error {
 		svc := app.AgentPoolService()
 		userID := effectiveUserID(c)
-		entries, err := svc.ListCollectionEntriesForUser(userID, decodedParam(c, "name"))
+		entries, err := svc.ListCollectionEntriesForUser(userID, c.Param("name"))
 		if err != nil {
 			if strings.Contains(err.Error(), "not found") {
 				return c.JSON(http.StatusNotFound, map[string]string{"error": err.Error()})
@@ -139,7 +139,7 @@ func GetCollectionEntryContentEndpoint(app *application.Application) echo.Handle
 		if err != nil {
 			entry = entryParam
 		}
-		content, chunkCount, err := svc.GetCollectionEntryContentForUser(userID, decodedParam(c, "name"), entry)
+		content, chunkCount, err := svc.GetCollectionEntryContentForUser(userID, c.Param("name"), entry)
 		if err != nil {
 			if strings.Contains(err.Error(), "not found") {
 				return c.JSON(http.StatusNotFound, map[string]string{"error": err.Error()})
@@ -164,7 +164,7 @@ func SearchCollectionEndpoint(app *application.Application) echo.HandlerFunc {
 		if err := c.Bind(&payload); err != nil {
 			return c.JSON(http.StatusBadRequest, map[string]string{"error": err.Error()})
 		}
-		results, err := svc.SearchCollectionForUser(userID, decodedParam(c, "name"), payload.Query, payload.MaxResults)
+		results, err := svc.SearchCollectionForUser(userID, c.Param("name"), payload.Query, payload.MaxResults)
 		if err != nil {
 			if strings.Contains(err.Error(), "not found") {
 				return c.JSON(http.StatusNotFound, map[string]string{"error": err.Error()})
@@ -182,7 +182,7 @@ func ResetCollectionEndpoint(app *application.Application) echo.HandlerFunc {
 	return func(c echo.Context) error {
 		svc := app.AgentPoolService()
 		userID := effectiveUserID(c)
-		if err := svc.ResetCollectionForUser(userID, decodedParam(c, "name")); err != nil {
+		if err := svc.ResetCollectionForUser(userID, c.Param("name")); err != nil {
 			if strings.Contains(err.Error(), "not found") {
 				return c.JSON(http.StatusNotFound, map[string]string{"error": err.Error()})
 			}
@@ -202,7 +202,7 @@ func DeleteCollectionEntryEndpoint(app *application.Application) echo.HandlerFun
 		if err := c.Bind(&payload); err != nil {
 			return c.JSON(http.StatusBadRequest, map[string]string{"error": err.Error()})
 		}
-		remaining, err := svc.DeleteCollectionEntryForUser(userID, decodedParam(c, "name"), payload.Entry)
+		remaining, err := svc.DeleteCollectionEntryForUser(userID, c.Param("name"), payload.Entry)
 		if err != nil {
 			if strings.Contains(err.Error(), "not found") {
 				return c.JSON(http.StatusNotFound, map[string]string{"error": err.Error()})
@@ -230,7 +230,7 @@ func AddCollectionSourceEndpoint(app *application.Application) echo.HandlerFunc
 		if payload.UpdateInterval < 1 {
 			payload.UpdateInterval = 60
 		}
-		if err := svc.AddCollectionSourceForUser(userID, decodedParam(c, "name"), payload.URL, payload.UpdateInterval); err != nil {
+		if err := svc.AddCollectionSourceForUser(userID, c.Param("name"), payload.URL, payload.UpdateInterval); err != nil {
 			if strings.Contains(err.Error(), "not found") {
 				return c.JSON(http.StatusNotFound, map[string]string{"error": err.Error()})
 			}
@@ -250,7 +250,7 @@ func RemoveCollectionSourceEndpoint(app *application.Application) echo.HandlerFu
 		if err := c.Bind(&payload); err != nil {
 			return c.JSON(http.StatusBadRequest, map[string]string{"error": err.Error()})
 		}
-		if err := svc.RemoveCollectionSourceForUser(userID, decodedParam(c, "name"), payload.URL); err != nil {
+		if err := svc.RemoveCollectionSourceForUser(userID, c.Param("name"), payload.URL); err != nil {
 			return c.JSON(http.StatusInternalServerError, map[string]string{"error": err.Error()})
 		}
 		return c.JSON(http.StatusOK, map[string]string{"status": "ok"})
@@ -267,7 +267,7 @@ func GetCollectionEntryRawFileEndpoint(app *application.Application) echo.Handle
 		if err != nil {
 			entry = entryParam
 		}
-		fpath, err := svc.GetCollectionEntryFilePathForUser(userID, decodedParam(c, "name"), entry)
+		fpath, err := svc.GetCollectionEntryFilePathForUser(userID, c.Param("name"), entry)
 		if err != nil {
 			if strings.Contains(err.Error(), "not found") {
 				return c.JSON(http.StatusNotFound, map[string]string{"error": err.Error()})
@@ -282,7 +282,7 @@ func ListCollectionSourcesEndpoint(app *application.Application) echo.HandlerFun
 	return func(c echo.Context) error {
 		svc := app.AgentPoolService()
 		userID := effectiveUserID(c)
-		sources, err := svc.ListCollectionSourcesForUser(userID, decodedParam(c, "name"))
+		sources, err := svc.ListCollectionSourcesForUser(userID, c.Param("name"))
 		if err != nil {
 			if strings.Contains(err.Error(), "not found") {
 				return c.JSON(http.StatusNotFound, map[string]string{"error": err.Error()})
--- a/core/http/endpoints/localai/agent_collections_param_test.go
+++ b/core/http/endpoints/localai/agent_collections_param_test.go
@@ -1,49 +0,0 @@
 package localai
 import (
 	"net/http"
 	"net/http/httptest"
 	"github.com/labstack/echo/v4"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 )
 // Regression for #10443: agent/collection names carry a "legacy-api-key:"
 // prefix, so the ':' is percent-encoded as %3A in the request path. Echo routes
 // such paths via URL.RawPath and stores the path-param value still escaped, so
 // handlers must URL-decode it before looking the collection up in the store -
 // otherwise the lookup sees "legacy-api-key%3ALiteraryResearch" and 404s.
 var _ = Describe("decodedParam", func() {
 	var e *echo.Echo
 	BeforeEach(func() {
 		e = echo.New()
 	})
 	// route runs a request through Echo's real router so the path param is
 	// populated exactly as it would be in production, then returns the decoded
 	// value the handler would observe.
 	route := func(rawPath string) string {
 		var got string
 		e.GET("/api/agents/collections/:name/upload", func(c echo.Context) error {
 			got = decodedParam(c, "name")
 			return c.NoContent(http.StatusOK)
 		})
 		req := httptest.NewRequest(http.MethodGet, rawPath, nil)
 		rec := httptest.NewRecorder()
 		e.ServeHTTP(rec, req)
 		Expect(rec.Code).To(Equal(http.StatusOK))
 		return got
 	}
 	It("decodes a percent-encoded colon in the collection name", func() {
 		got := route("/api/agents/collections/legacy-api-key%3ALiteraryResearch/upload")
 		Expect(got).To(Equal("legacy-api-key:LiteraryResearch"))
 	})
 	It("leaves an unencoded name untouched", func() {
 		got := route("/api/agents/collections/PlainCollection/upload")
 		Expect(got).To(Equal("PlainCollection"))
 	})
 })
--- a/core/http/endpoints/localai/agents.go
+++ b/core/http/endpoints/localai/agents.go
@@ -6,7 +6,6 @@ import (
 	"io"
 	"maps"
 	"net/http"
 	"net/url"
 	"os"
 	"path/filepath"
 	"slices"
@@ -34,22 +33,6 @@ func getUserID(c echo.Context) string {
 	return user.ID
 }
 // decodedParam returns the named path parameter, URL-decoding it.
 //
 // Echo routes a request via URL.RawPath whenever the path contains
 // percent-encoded characters (e.g. %3A for ':'), and in that case stores the
 // matched path-param value raw/escaped. Agent and collection names carry a
 // "legacy-api-key:" prefix, so the ':' arrives as %3A and the raw param no
 // longer matches the stored name. Callers must unescape before lookups.
 // Falls back to the raw value if it isn't valid percent-encoding.
 func decodedParam(c echo.Context, name string) string {
 	raw := c.Param(name)
 	if decoded, err := url.PathUnescape(raw); err == nil {
 		return decoded
 	}
 	return raw
 }
 // isAdminUser returns true if the authenticated user has admin role.
 func isAdminUser(c echo.Context) bool {
 	user := auth.GetUser(c)
@@ -144,7 +127,7 @@ func GetAgentEndpoint(app *application.Application) echo.HandlerFunc {
 	return func(c echo.Context) error {
 		svc := app.AgentPoolService()
 		userID := effectiveUserID(c)
-		name := decodedParam(c, "name")
+		name := c.Param("name")
 		statuses := svc.ListAgentsForUser(userID)
 		active, exists := statuses[name]
@@ -159,7 +142,7 @@ func UpdateAgentEndpoint(app *application.Application) echo.HandlerFunc {
 	return func(c echo.Context) error {
 		svc := app.AgentPoolService()
 		userID := effectiveUserID(c)
-		name := decodedParam(c, "name")
+		name := c.Param("name")
 		var cfg state.AgentConfig
 		if err := c.Bind(&cfg); err != nil {
 			return c.JSON(http.StatusBadRequest, map[string]string{"error": err.Error()})
@@ -178,7 +161,7 @@ func DeleteAgentEndpoint(app *application.Application) echo.HandlerFunc {
 	return func(c echo.Context) error {
 		svc := app.AgentPoolService()
 		userID := effectiveUserID(c)
-		name := decodedParam(c, "name")
+		name := c.Param("name")
 		if err := svc.DeleteAgentForUser(userID, name); err != nil {
 			return c.JSON(http.StatusInternalServerError, map[string]string{"error": err.Error()})
 		}
@@ -190,7 +173,7 @@ func GetAgentConfigEndpoint(app *application.Application) echo.HandlerFunc {
 	return func(c echo.Context) error {
 		svc := app.AgentPoolService()
 		userID := effectiveUserID(c)
-		name := decodedParam(c, "name")
+		name := c.Param("name")
 		cfg := svc.GetAgentConfigForUser(userID, name)
 		if cfg == nil {
 			return c.JSON(http.StatusNotFound, map[string]string{"error": "Agent not found"})
@@ -203,7 +186,7 @@ func PauseAgentEndpoint(app *application.Application) echo.HandlerFunc {
 	return func(c echo.Context) error {
 		svc := app.AgentPoolService()
 		userID := effectiveUserID(c)
-		if err := svc.PauseAgentForUser(userID, decodedParam(c, "name")); err != nil {
+		if err := svc.PauseAgentForUser(userID, c.Param("name")); err != nil {
 			return c.JSON(http.StatusNotFound, map[string]string{"error": err.Error()})
 		}
 		return c.JSON(http.StatusOK, map[string]string{"status": "ok"})
@@ -214,7 +197,7 @@ func ResumeAgentEndpoint(app *application.Application) echo.HandlerFunc {
 	return func(c echo.Context) error {
 		svc := app.AgentPoolService()
 		userID := effectiveUserID(c)
-		if err := svc.ResumeAgentForUser(userID, decodedParam(c, "name")); err != nil {
+		if err := svc.ResumeAgentForUser(userID, c.Param("name")); err != nil {
 			return c.JSON(http.StatusNotFound, map[string]string{"error": err.Error()})
 		}
 		return c.JSON(http.StatusOK, map[string]string{"status": "ok"})
@@ -225,7 +208,7 @@ func GetAgentStatusEndpoint(app *application.Application) echo.HandlerFunc {
 	return func(c echo.Context) error {
 		svc := app.AgentPoolService()
 		userID := effectiveUserID(c)
-		name := decodedParam(c, "name")
+		name := c.Param("name")
 		history := svc.GetAgentStatusForUser(userID, name)
 		if history == nil {
@@ -258,7 +241,7 @@ func GetAgentObservablesEndpoint(app *application.Application) echo.HandlerFunc
 	return func(c echo.Context) error {
 		svc := app.AgentPoolService()
 		userID := effectiveUserID(c)
-		name := decodedParam(c, "name")
+		name := c.Param("name")
 		history, err := svc.GetAgentObservablesForUser(userID, name)
 		if err != nil {
@@ -278,7 +261,7 @@ func ClearAgentObservablesEndpoint(app *application.Application) echo.HandlerFun
 	return func(c echo.Context) error {
 		svc := app.AgentPoolService()
 		userID := effectiveUserID(c)
-		name := decodedParam(c, "name")
+		name := c.Param("name")
 		if err := svc.ClearAgentObservablesForUser(userID, name); err != nil {
 			return c.JSON(http.StatusNotFound, map[string]string{"error": err.Error()})
 		}
@@ -290,7 +273,7 @@ func ChatWithAgentEndpoint(app *application.Application) echo.HandlerFunc {
 	return func(c echo.Context) error {
 		svc := app.AgentPoolService()
 		userID := effectiveUserID(c)
-		name := decodedParam(c, "name")
+		name := c.Param("name")
 		var payload struct {
 			Message string `json:"message"`
 		}
@@ -319,7 +302,7 @@ func AgentSSEEndpoint(app *application.Application) echo.HandlerFunc {
 	return func(c echo.Context) error {
 		svc := app.AgentPoolService()
 		userID := effectiveUserID(c)
-		name := decodedParam(c, "name")
+		name := c.Param("name")
 		// Try local SSE manager first
 		manager := svc.GetSSEManagerForUser(userID, name)
@@ -351,7 +334,7 @@ func ExportAgentEndpoint(app *application.Application) echo.HandlerFunc {
 	return func(c echo.Context) error {
 		svc := app.AgentPoolService()
 		userID := effectiveUserID(c)
-		name := decodedParam(c, "name")
+		name := c.Param("name")
 		data, err := svc.ExportAgentForUser(userID, name)
 		if err != nil {
 			return c.JSON(http.StatusNotFound, map[string]string{"error": err.Error()})
--- a/core/http/endpoints/localai/nodes.go
+++ b/core/http/endpoints/localai/nodes.go
@@ -385,23 +385,6 @@ func GetNodeModelsEndpoint(registry *nodes.NodeRegistry) echo.HandlerFunc {
 	}
 }
 // ListAllNodeModelsEndpoint returns all loaded models across all healthy nodes.
 // @Summary List all loaded models cluster-wide
 // @Tags Nodes
 // @Success 200 {array} nodes.NodeModel
 // @Router /api/nodes/models [get]
 func ListAllNodeModelsEndpoint(registry *nodes.NodeRegistry) echo.HandlerFunc {
 	return func(c echo.Context) error {
 		ctx := c.Request().Context()
 		models, err := registry.ListAllLoadedModels(ctx)
 		if err != nil {
 			xlog.Error("Failed to list all node models", "error", err)
 			return c.JSON(http.StatusInternalServerError, nodeError(http.StatusInternalServerError, "failed to list node models"))
 		}
 		return c.JSON(http.StatusOK, models)
 	}
 }
 // DrainNodeEndpoint sets a node to draining status (no new requests).
 func DrainNodeEndpoint(registry *nodes.NodeRegistry) echo.HandlerFunc {
 	return func(c echo.Context) error {
--- a/core/http/endpoints/localai/nodes_test.go
+++ b/core/http/endpoints/localai/nodes_test.go
@@ -407,44 +407,4 @@ var _ = Describe("Node HTTP handlers", func() {
 			Expect(names).To(ConsistOf("alpha", "beta"))
 		})
 	})
 	Describe("ListAllNodeModelsEndpoint", func() {
 		It("returns an empty list when no models are loaded", func() {
 			e := echo.New()
 			req := httptest.NewRequest(http.MethodGet, "/", nil)
 			rec := httptest.NewRecorder()
 			c := e.NewContext(req, rec)
 			handler := ListAllNodeModelsEndpoint(registry)
 			Expect(handler(c)).To(Succeed())
 			Expect(rec.Code).To(Equal(http.StatusOK))
 			var list []nodes.NodeModel
 			Expect(json.Unmarshal(rec.Body.Bytes(), &list)).To(Succeed())
 			Expect(list).To(BeEmpty())
 		})
 		It("returns loaded models across healthy nodes", func() {
 			ctx := context.Background()
 			Expect(registry.Register(ctx, &nodes.BackendNode{
 				ID: "n1", Name: "alpha", Address: "10.0.0.1:50051", Status: nodes.StatusHealthy,
 			}, true)).To(Succeed())
 			Expect(registry.SetNodeModel(ctx, "n1", "llama-3.3", 0, "loaded", "10.0.0.1:50051", 0)).To(Succeed())
 			e := echo.New()
 			req := httptest.NewRequest(http.MethodGet, "/", nil)
 			rec := httptest.NewRecorder()
 			c := e.NewContext(req, rec)
 			handler := ListAllNodeModelsEndpoint(registry)
 			Expect(handler(c)).To(Succeed())
 			Expect(rec.Code).To(Equal(http.StatusOK))
 			var list []nodes.NodeModel
 			Expect(json.Unmarshal(rec.Body.Bytes(), &list)).To(Succeed())
 			Expect(list).To(HaveLen(1))
 			Expect(list[0].ModelName).To(Equal("llama-3.3"))
 			Expect(list[0].NodeID).To(Equal("n1"))
 		})
 	})
 })
--- a/core/http/endpoints/localai/settings.go
+++ b/core/http/endpoints/localai/settings.go
@@ -4,6 +4,8 @@ import (
 	"encoding/json"
 	"io"
 	"net/http"
 	"os"
 	"path/filepath"
 	"time"
 	"github.com/labstack/echo/v4"
@@ -108,18 +110,6 @@ func UpdateSettingsEndpoint(app *application.Application) echo.HandlerFunc {
 			})
 		}
 		// Read whatever is already persisted: it is both the source of truth
 		// for branding asset filenames (below) and the base we merge this
 		// request onto before writing. A read failure must not let a Save
 		// silently discard the existing settings — surface it instead.
 		persisted, err := appConfig.ReadPersistedSettings()
 		if err != nil {
 			return c.JSON(http.StatusInternalServerError, schema.SettingsResponse{
 				Success: false,
 				Error:   "Failed to read existing settings: " + err.Error(),
 			})
 		}
 		// Branding asset filenames are owned exclusively by
 		// /api/branding/asset/{kind} (upload/delete). The Settings page also
 		// round-trips them via GET /api/settings, but its local state is stale
@@ -128,9 +118,11 @@ func UpdateSettingsEndpoint(app *application.Application) echo.HandlerFunc {
 		// at page open. Replace whatever the body sent for these three fields
 		// with the values currently on disk so /api/settings can never
 		// regress them.
-		settings.LogoFile = persisted.LogoFile
+		if existing, err := appConfig.ReadPersistedSettings(); err == nil {
-		settings.LogoHorizontalFile = persisted.LogoHorizontalFile
+			settings.LogoFile = existing.LogoFile
-		settings.FaviconFile = persisted.FaviconFile
+			settings.LogoHorizontalFile = existing.LogoHorizontalFile
 			settings.FaviconFile = existing.FaviconFile
 		}
 		// The UI reads ApiKeys from GET /api/settings, which already returns the
 		// merged env+runtime list. When the user clicks Save, the same merged
@@ -153,17 +145,16 @@ func UpdateSettingsEndpoint(app *application.Application) echo.HandlerFunc {
 			settings.ApiKeys = &runtimeOnly
 		}
-		// Persist as a partial update: overlay only the fields this request set
+		settingsFile := filepath.Join(appConfig.DynamicConfigsDir, "runtime_settings.json")
-		// onto the settings already on disk. Focused admin pages POST just the
+		settingsJSON, err := json.MarshalIndent(settings, "", "  ")
-		// keys they own (the Middleware proxy tab sends only mitm_listen; the
+		if err != nil {
-		// detector table only pii_default_detectors), so writing the request
+			return c.JSON(http.StatusInternalServerError, schema.SettingsResponse{
-		// body verbatim would null every unrelated setting (the no-omitempty
+				Success: false,
-		// api_keys / pii_default_detectors fields even round-trip as JSON
+				Error:   "Failed to marshal settings: " + err.Error(),
-		// null). The full Settings page still round-trips every field, so its
+			})
-		// Save is unchanged.
+		}
-		toPersist := persisted
+
-		toPersist.MergeNonNil(settings)
+		if err := os.WriteFile(settingsFile, settingsJSON, 0600); err != nil {
 		if err := appConfig.WritePersistedSettings(toPersist); err != nil {
 			return c.JSON(http.StatusInternalServerError, schema.SettingsResponse{
 				Success: false,
 				Error:   "Failed to write settings file: " + err.Error(),
@@ -271,14 +262,7 @@ func UpdateSettingsEndpoint(app *application.Application) echo.HandlerFunc {
 			}
 		}
-		// Rebuild the MITM listener when its address OR the instance-wide
+		if settings.MITMListen != nil {
 		// default detectors change. The per-host detector map is resolved once
 		// at listener start (startMITMLocked → ResolvePIIPolicy), so a
 		// default-detector change is otherwise invisible to cloud-proxy traffic
 		// until the next restart — an admin toggling a default detector would
 		// see no redaction. RestartMITM is a no-op when the listener is
 		// disabled (empty address).
 		if settings.MITMListen != nil || settings.PIIDefaultDetectors != nil {
 			if err := app.RestartMITM(); err != nil {
 				xlog.Error("Failed to restart MITM proxy", "error", err)
 				return c.JSON(http.StatusInternalServerError, schema.SettingsResponse{
--- a/core/http/endpoints/localai/settings_test.go
+++ b/core/http/endpoints/localai/settings_test.go
@@ -52,10 +52,6 @@ var _ = Describe("Settings endpoints", func() {
 		// Settings are persisted here; set after construction since there's no
 		// dedicated AppOption for it.
 		app.ApplicationConfig().DynamicConfigsDir = tmp
 		// Contain the MITM CA inside tmp too. The partial-save spec flips
 		// mitm_listen, which starts the listener and writes a CA; without this
 		// it defaults to ./mitm-ca and litters the package source tree.
 		app.ApplicationConfig().MITMCADir = filepath.Join(tmp, "mitm-ca")
 		e = echo.New()
 		e.GET("/api/settings", GetSettingsEndpoint(app))
@@ -113,57 +109,6 @@ var _ = Describe("Settings endpoints", func() {
 		Expect(err).ToNot(HaveOccurred())
 	})
 	// Regression: a focused admin page (the Middleware proxy tab) POSTs only
 	// the one field it owns — mitm_listen. The old handler wrote the request
 	// body verbatim, so every other persisted setting was dropped (and
 	// api_keys / pii_default_detectors, which lack omitempty, were written as
 	// null). A partial POST must now merge onto what is already on disk.
 	It("preserves unrelated persisted settings when a partial POST sets only mitm_listen", func() {
 		// First save establishes a fuller settings file (as the full Settings
 		// page would): galleries, an API key, and the MITM listener. The
 		// listener restart binds a real socket, so use 127.0.0.1:0 for an
 		// ephemeral free port rather than a fixed one that may be in use.
 		rec := post(`{"mitm_listen":"127.0.0.1:0","galleries":[{"name":"g1","url":"http://example/g1"}],"api_keys":["k1"],"pii_default_detectors":["det-a"]}`)
 		Expect(rec.Code).To(Equal(http.StatusOK), rec.Body.String())
 		// The Middleware proxy tab then changes only the listen address — the
 		// exact partial body that nulled everything else before the fix.
 		rec = post(`{"mitm_listen":"127.0.0.1:0"}`)
 		Expect(rec.Code).To(Equal(http.StatusOK), rec.Body.String())
 		raw, err := os.ReadFile(filepath.Join(tmp, "runtime_settings.json"))
 		Expect(err).ToNot(HaveOccurred())
 		var ondisk config.RuntimeSettings
 		Expect(json.Unmarshal(raw, &ondisk)).To(Succeed())
 		Expect(ondisk.MITMListen).ToNot(BeNil())
 		Expect(*ondisk.MITMListen).To(Equal("127.0.0.1:0"), "the changed field should be saved")
 		Expect(ondisk.Galleries).ToNot(BeNil(), "galleries were clobbered by the partial save")
 		Expect(*ondisk.Galleries).To(HaveLen(1))
 		Expect(ondisk.ApiKeys).ToNot(BeNil(), "api_keys were nulled by the partial save")
 		Expect(*ondisk.ApiKeys).To(Equal([]string{"k1"}))
 		Expect(ondisk.PIIDefaultDetectors).ToNot(BeNil(), "pii_default_detectors were nulled by the partial save")
 		Expect(*ondisk.PIIDefaultDetectors).To(Equal([]string{"det-a"}))
 	})
 	// The MITM listener resolves its per-host PII detectors once at start
 	// (startMITMLocked → ResolvePIIPolicy), and the handler used to restart it
 	// only when mitm_listen changed. So an admin toggling a default detector
 	// (the Middleware detector table POSTs only pii_default_detectors) left
 	// cloud-proxy traffic unredacted until the next reboot. A
 	// pii_default_detectors change must now rebuild the listener.
 	It("rebuilds the MITM listener when only pii_default_detectors changes", func() {
 		rec := post(`{"mitm_listen":"127.0.0.1:0"}`)
 		Expect(rec.Code).To(Equal(http.StatusOK), rec.Body.String())
 		srv1 := app.MITMServer()
 		Expect(srv1).ToNot(BeNil(), "listener should be running after mitm_listen is set")
 		rec = post(`{"pii_default_detectors":["det-a"]}`)
 		Expect(rec.Code).To(Equal(http.StatusOK), rec.Body.String())
 		Expect(app.MITMServer()).ToNot(BeIdenticalTo(srv1),
 			"a default-detector change must restart the listener so it picks up the new detectors")
 	})
 	// Residual #9125: enabling the watchdog from a cold (off) state via the
 	// React master toggle must start the live watchdog immediately, without a
 	// restart. The toggle posts watchdog_idle_enabled/busy_enabled=true while
--- a/core/http/endpoints/openai/realtime.go
+++ b/core/http/endpoints/openai/realtime.go
@@ -12,7 +12,6 @@ import (
 	"os"
 	"strconv"
 	"sync"
 	"sync/atomic"
 	"time"
 	"net/http"
@@ -135,18 +134,6 @@ type Session struct {
 	// pairs are kept together so we never feed an orphaned tool result.
 	MaxHistoryItems int
 	// Compaction settings resolved from pipeline.compaction (see resolveCompaction).
 	CompactionEnabled bool
 	CompactionTrigger int
 	SummaryModel      string
 	MaxSummaryTokens  int
 	// summarizerFactory lazily builds the model used for compaction summaries
 	// when summary_model is configured; nil means reuse the pipeline LLM.
 	summarizerFactory func() (Model, error)
 	summarizerOnce    sync.Once
 	summarizerCached  Model
 	// AssistantExecutor is non-nil when the session opted into the in-process
 	// LocalAI Assistant tool surface. Tool calls whose name matches this
 	// executor's catalog are run inproc and their output is fed back to the
@@ -254,12 +241,6 @@ type Conversation struct {
 	ID    string
 	Items []*types.MessageItemUnion
 	Lock  sync.Mutex
 	// Memory is the rolling summary of items already evicted by compaction. It
 	// is kept out of Items (so trimRealtimeItems never drops it) and rendered
 	// as a system message right after the session instructions.
 	Memory string
 	// compacting ensures at most one background compaction runs per conversation.
 	compacting atomic.Bool
 }
 func (c *Conversation) ToServer() types.Conversation {
@@ -559,12 +540,13 @@ func runRealtimeSession(application *application.Application, t Transport, model
 		SoundDetectionWindowMs:  cfg.Pipeline.SoundDetectionWindowMs,
 		SoundDetectionHopMs:     cfg.Pipeline.SoundDetectionHopMs,
 	}
 	session.CompactionEnabled, session.CompactionTrigger, session.MaxSummaryTokens, session.SummaryModel = resolveCompaction(cfg, session.MaxHistoryItems)
 	// Create a default conversation
 	conversationID := generateConversationID()
 	conversation := &Conversation{
 		ID: conversationID,
 		// TODO: We need to truncate the conversation items when a new item is added and we have run out of space. There are multiple places where items
 		//       can be added so we could use a datastructure here that enforces truncation upon addition
 		Items: []*types.MessageItemUnion{},
 	}
 	session.Conversations[conversationID] = conversation
@@ -595,18 +577,6 @@ func runRealtimeSession(application *application.Application, t Transport, model
 	}
 	session.ModelInterface = m
 	if session.SummaryModel != "" {
 		summaryModelName := session.SummaryModel
 		sid := sessionID
 		session.summarizerFactory = func() (Model, error) {
 			summaryCfg, lerr := application.ModelConfigLoader().LoadModelConfigFileByNameDefaultOptions(summaryModelName, application.ApplicationConfig())
 			if lerr != nil {
 				return nil, fmt.Errorf("load summary model config %q: %w", summaryModelName, lerr)
 			}
 			return newModel(&summaryCfg.Pipeline, application.ModelConfigLoader(), application.ModelLoader(), application.ApplicationConfig(), evaluator, buildRealtimeRoutingContext(application, sid))
 		}
 	}
 	if cfg.Pipeline.VoiceGateEnabled() {
 		gate, gerr := newVoiceGate(
 			*cfg.Pipeline.VoiceRecognition,
@@ -837,15 +807,6 @@ func runRealtimeSession(application *application.Application, t Transport, model
 				commitUtterance(respCtx, allAudio, session, conversation, t)
 			}()
 		case types.InputAudioBufferClearEvent:
 			xlog.Debug("recv", "message", string(msg))
 			// Discard a partially-captured utterance so the client can restart
 			// input cleanly without the stale buffer leaking into the next commit.
 			clearInputAudio(session)
 			sendEvent(t, types.InputAudioBufferClearedEvent{
 				ServerEventBase: types.ServerEventBase{EventID: e.EventID},
 			})
 		case types.ConversationItemCreateEvent:
 			xlog.Debug("recv", "message", string(msg))
 			// Add the item to the conversation
@@ -880,39 +841,7 @@ func runRealtimeSession(application *application.Application, t Transport, model
 			})
 		case types.ConversationItemDeleteEvent:
-			xlog.Debug("recv", "message", string(msg))
+			sendError(t, "not_implemented", "Deleting items not implemented", "", "event_TODO")
 			if e.ItemID == "" {
 				sendError(t, "invalid_item_id", "Need item_id, but none specified", "", "event_TODO")
 				continue
 			}
 			conversation.Lock.Lock()
 			updated, ok := deleteItem(conversation.Items, e.ItemID)
 			conversation.Items = updated
 			conversation.Lock.Unlock()
 			if !ok {
 				sendError(t, "invalid_item_id", "Item to delete not found", "", "event_TODO")
 				continue
 			}
 			sendEvent(t, types.ConversationItemDeletedEvent{
 				ServerEventBase: types.ServerEventBase{EventID: e.EventID},
 				ItemID:          e.ItemID,
 			})
 		case types.ConversationItemTruncateEvent:
 			xlog.Debug("recv", "message", string(msg))
 			conversation.Lock.Lock()
 			ok := truncateAssistantText(conversation.Items, e.ItemID, e.ContentIndex)
 			conversation.Lock.Unlock()
 			if !ok {
 				sendError(t, "invalid_item_id", "Item to truncate not found", "", "event_TODO")
 				continue
 			}
 			sendEvent(t, types.ConversationItemTruncatedEvent{
 				ServerEventBase: types.ServerEventBase{EventID: e.EventID},
 				ItemID:          e.ItemID,
 				ContentIndex:    e.ContentIndex,
 				AudioEndMs:      e.AudioEndMs,
 			})
 		case types.ConversationItemRetrieveEvent:
 			xlog.Debug("recv", "message", string(msg))
@@ -925,7 +854,21 @@ func runRealtimeSession(application *application.Application, t Transport, model
 			conversation.Lock.Lock()
 			var retrievedItem types.MessageItemUnion
 			for _, item := range conversation.Items {
-				if itemID(item) == e.ItemID {
+				// We need to check ID in the union
 				var id string
 				if item.System != nil {
 					id = item.System.ID
 				} else if item.User != nil {
 					id = item.User.ID
 				} else if item.Assistant != nil {
 					id = item.Assistant.ID
 				} else if item.FunctionCall != nil {
 					id = item.FunctionCall.ID
 				} else if item.FunctionCallOutput != nil {
 					id = item.FunctionCallOutput.ID
 				}
 				if id == e.ItemID {
 					retrievedItem = *item
 					break
 				}
@@ -1723,9 +1666,6 @@ const maxAssistantToolTurns = 10
 func triggerResponse(ctx context.Context, session *Session, conv *Conversation, t Transport, overrides *types.ResponseCreateParams) {
 	triggerResponseAtTurn(ctx, session, conv, t, overrides, 0)
 	// Fold aged-out turns into the rolling memory off the critical path; the
 	// next turn reaps the smaller buffer.
 	session.maybeCompact(conv)
 }
 func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversation, t Transport, overrides *types.ResponseCreateParams, toolTurn int) {
@@ -1781,7 +1721,6 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
 	var lastUserSpeaker *types.Speaker
 	personalize := session.voiceGate != nil && session.voiceGate.cfg.PersonalizeEnabled()
 	conv.Lock.Lock()
 	conversationHistory = withMemory(conversationHistory, conv.Memory)
 	items := trimRealtimeItems(conv.Items, session.MaxHistoryItems)
 	for _, item := range items {
 		if item.User != nil {
--- a/core/http/endpoints/openai/realtime_compaction.go
+++ b/core/http/endpoints/openai/realtime_compaction.go
@@ -1,326 +0,0 @@
 package openai
 import (
 	"context"
 	"fmt"
 	"strings"
 	"time"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/pkg/reasoning"
 	"github.com/mudler/xlog"
 )
 const (
 	// defaultMaxSummaryTokens is the summary budget resolveCompaction falls
 	// back to when the configured MaxSummaryTokens is <= 0.
 	defaultMaxSummaryTokens = 512
 	// memoryPrefix is prepended to the rolling summary when withMemory renders
 	// it as a system message.
 	memoryPrefix            = "Summary of earlier conversation:\n"
 	// compactionTimeout bounds the summarizer call so a stuck model can't pin the
 	// compacting flag (and thus block all further compaction) forever.
 	compactionTimeout = 60 * time.Second
 )
 // withMemory returns history with the rolling conversation summary appended as
 // a system message whose text is memoryPrefix followed by memory. An empty
 // memory leaves history unchanged.
 func withMemory(history schema.Messages, memory string) schema.Messages {
 	if memory != "" {
 		text := memoryPrefix + memory
 		summaryMsg := schema.Message{
 			Role:          string(types.MessageRoleSystem),
 			StringContent: text,
 			Content:       text,
 		}
 		history = append(history, summaryMsg)
 	}
 	return history
 }
 // renderItemsTranscript renders conversation items as a plain-text transcript
 // for the summarizer, one line per item:
 //   - user / assistant items become "user: ..." / "assistant: ..." with every
 //     content part's Text and Transcript concatenated;
 //   - function calls become "assistant called tool NAME(ARGS)";
 //   - function call outputs become "tool result: OUTPUT".
 //
 // Nil entries and items of any other variant are skipped. Leading and trailing
 // whitespace is trimmed from the result.
 func renderItemsTranscript(items []*types.MessageItemUnion) string {
 	var b strings.Builder
 	for _, item := range items {
 		// Other helpers in this file (itemID, compactionCut) tolerate nil
 		// entries; skip them here too instead of panicking on a nil dereference.
 		if item == nil {
 			continue
 		}
 		switch {
 		case item.User != nil:
 			b.WriteString("user: ")
 			for _, c := range item.User.Content {
 				if c.Text != "" {
 					b.WriteString(c.Text)
 				}
 				if c.Transcript != "" {
 					b.WriteString(c.Transcript)
 				}
 			}
 			b.WriteString("\n")
 		case item.Assistant != nil:
 			b.WriteString("assistant: ")
 			// Realtime assistant *audio* turns store the spoken words in
 			// .Transcript (not .Text), so emit both or spoken turns are dropped.
 			for _, c := range item.Assistant.Content {
 				if c.Text != "" {
 					b.WriteString(c.Text)
 				}
 				if c.Transcript != "" {
 					b.WriteString(c.Transcript)
 				}
 			}
 			b.WriteString("\n")
 		case item.FunctionCall != nil:
 			// Format straight into the builder rather than via an intermediate
 			// Sprintf string.
 			fmt.Fprintf(&b, "assistant called tool %s(%s)\n", item.FunctionCall.Name, item.FunctionCall.Arguments)
 		case item.FunctionCallOutput != nil:
 			fmt.Fprintf(&b, "tool result: %s\n", item.FunctionCallOutput.Output)
 		}
 	}
 	return strings.TrimSpace(b.String())
 }
 // buildSummaryMessages assembles the two-message prompt for the summarizer LLM:
 // a system instruction, then a user message carrying the prior memory (when
 // non-empty) followed by the new transcript to fold in. maxTokens is only
 // interpolated into the instruction text; it is advisory, not enforced here.
 func buildSummaryMessages(priorMemory, transcript string, maxTokens int) schema.Messages {
 	const instructionFormat = "You maintain a running memory of a live voice conversation. " +
 		"Merge the prior memory with the new exchanges into an updated memory. " +
 		"Keep names, decisions, facts, preferences, and open threads. Be concise " +
 		"(under ~%d tokens). Output only the updated memory, with no reasoning or tags."
 	instruction := fmt.Sprintf(instructionFormat, maxTokens)
 	userText := "New exchanges to fold in:\n" + transcript
 	if priorMemory != "" {
 		userText = "Prior memory:\n" + priorMemory + "\n\n" + userText
 	}
 	systemMsg := schema.Message{Role: string(types.MessageRoleSystem), StringContent: instruction, Content: instruction}
 	userMsg := schema.Message{Role: string(types.MessageRoleUser), StringContent: userText, Content: userText}
 	return schema.Messages{systemMsg, userMsg}
 }
 // clearInputAudio drops the session's pending input audio: InputAudioBuffer and
 // OpusFrames are each set to nil while holding their respective lock. It backs
 // the input_audio_buffer.clear realtime event, letting a client discard a
 // partially-captured utterance.
 func clearInputAudio(s *Session) {
 	func() {
 		s.AudioBufferLock.Lock()
 		defer s.AudioBufferLock.Unlock()
 		s.InputAudioBuffer = nil
 	}()
 	func() {
 		s.OpusFramesLock.Lock()
 		defer s.OpusFramesLock.Unlock()
 		s.OpusFrames = nil
 	}()
 }
 // itemID returns the ID of whichever variant of the MessageItemUnion is set,
 // checking System, User, Assistant, FunctionCall and FunctionCallOutput in that
 // order. A nil item, or one with no variant set, yields "".
 func itemID(item *types.MessageItemUnion) string {
 	if item == nil {
 		return ""
 	}
 	if sys := item.System; sys != nil {
 		return sys.ID
 	}
 	if usr := item.User; usr != nil {
 		return usr.ID
 	}
 	if asst := item.Assistant; asst != nil {
 		return asst.ID
 	}
 	if call := item.FunctionCall; call != nil {
 		return call.ID
 	}
 	if result := item.FunctionCallOutput; result != nil {
 		return result.ID
 	}
 	return ""
 }
 // deleteItem drops the first item whose id equals id. It returns the resulting
 // slice and true on a match, or the input slice unchanged and false otherwise.
 func deleteItem(items []*types.MessageItemUnion, id string) ([]*types.MessageItemUnion, bool) {
 	for idx := 0; idx < len(items); idx++ {
 		if itemID(items[idx]) != id {
 			continue
 		}
 		// Capping the head's capacity at idx keeps append from writing the tail
 		// over the caller's backing array.
 		head := items[:idx:idx]
 		tail := items[idx+1:]
 		return append(head, tail...), true
 	}
 	return items, false
 }
 // truncateAssistantText blanks the content part at contentIndex of the
 // assistant item identified by id. Both Text and Transcript are emptied, since
 // realtime audio turns keep the spoken words in Transcript. This is a minimal
 // truncate, used to discard the tail of an interrupted (barge-in) response.
 //
 // It returns true as soon as a matching assistant item is found, even when
 // contentIndex is out of range (in which case nothing is cleared), and false
 // when no assistant item has that id.
 func truncateAssistantText(items []*types.MessageItemUnion, id string, contentIndex int) bool {
 	for _, entry := range items {
 		if itemID(entry) == id && entry.Assistant != nil {
 			parts := entry.Assistant.Content
 			if contentIndex >= 0 && contentIndex < len(parts) {
 				part := &parts[contentIndex]
 				part.Text = ""
 				part.Transcript = ""
 			}
 			return true
 		}
 	}
 	return false
 }
 // compactionCut picks the split point for compaction: items[:cut] is the
 // overflow to summarize and evict, items[cut:] is the live tail that stays.
 // The tail starts out as the last `keep` items; the boundary then moves left
 // while it sits on a function_call_output, so the output is not evicted away
 // from the items preceding it (mirroring trimRealtimeItems' pair-safety).
 //
 // Returns 0 when there is nothing to cut: keep <= 0 (the "unlimited history"
 // sentinel) or len(items) <= keep.
 func compactionCut(items []*types.MessageItemUnion, keep int) int {
 	// The keep <= 0 guard also ensures items[len(items)] is never indexed below.
 	if keep <= 0 || len(items) <= keep {
 		return 0
 	}
 	boundary := len(items) - keep
 	for ; boundary > 0; boundary-- {
 		head := items[boundary]
 		if head == nil || head.FunctionCallOutput == nil {
 			break
 		}
 	}
 	return boundary
 }
 // resolveCompaction derives the effective compaction settings from the
 // pipeline.compaction block. maxHistory is the already-resolved live window
 // size.
 //
 // When cfg is nil, the block is missing, or it is not enabled, compaction is
 // off and all results are zero values. Otherwise:
 //   - trigger is TriggerItems, defaulting to 2*maxHistory when <= 0, and raised
 //     to maxHistory+1 if it would not exceed maxHistory;
 //   - maxSummaryTokens is MaxSummaryTokens, defaulting to
 //     defaultMaxSummaryTokens when <= 0;
 //   - summaryModel is passed through unchanged.
 func resolveCompaction(cfg *config.ModelConfig, maxHistory int) (enabled bool, trigger, maxSummaryTokens int, summaryModel string) {
 	if cfg == nil {
 		return false, 0, 0, ""
 	}
 	block := cfg.Pipeline.Compaction
 	if block == nil || !block.Enabled {
 		return false, 0, 0, ""
 	}
 	trigger = block.TriggerItems
 	if trigger <= 0 {
 		trigger = 2 * maxHistory
 	}
 	// Invariant: the trigger must sit strictly above the live window.
 	if trigger <= maxHistory {
 		trigger = maxHistory + 1
 	}
 	maxSummaryTokens = block.MaxSummaryTokens
 	if maxSummaryTokens <= 0 {
 		maxSummaryTokens = defaultMaxSummaryTokens
 	}
 	enabled = true
 	summaryModel = block.SummaryModel
 	return enabled, trigger, maxSummaryTokens, summaryModel
 }
 // prefixMatches reports whether the first len(snapshot) entries of items carry
 // the same ids, in the same order, as snapshot. compact uses it to confirm the
 // overflow it summarized is still at the head of the conversation. An empty
 // snapshot always matches; items shorter than snapshot never does.
 func prefixMatches(items, snapshot []*types.MessageItemUnion) bool {
 	if len(snapshot) > len(items) {
 		return false
 	}
 	for idx, want := range snapshot {
 		if itemID(want) != itemID(items[idx]) {
 			return false
 		}
 	}
 	return true
 }
 // compact folds overflow items into conv.Memory and evicts them. It never holds
 // conv.Lock across the summarizer call: snapshot under lock, summarize unlocked,
 // commit under lock (re-validating the head is unchanged). On any error it
 // leaves the conversation untouched — items are never dropped without a summary.
 //
 // model is the summarizer to call; a nil model makes this a no-op. The call is
 // bounded by compactionTimeout.
 func (s *Session) compact(conv *Conversation, model Model) {
 	if model == nil {
 		return
 	}
 	// Snapshot.
 	conv.Lock.Lock()
 	// Check the trigger under the lock: nothing to do unless the buffer is
 	// strictly above it.
 	if len(conv.Items) <= s.CompactionTrigger {
 		conv.Lock.Unlock()
 		return
 	}
 	cut := compactionCut(conv.Items, s.MaxHistoryItems)
 	if cut <= 0 {
 		conv.Lock.Unlock()
 		return
 	}
 	// Copy the overflow pointers into a fresh slice so the snapshot does not
 	// alias conv.Items' backing array once the lock is released.
 	overflow := append([]*types.MessageItemUnion(nil), conv.Items[:cut]...)
 	prior := conv.Memory
 	conv.Lock.Unlock()
 	// Summarize (unlocked).
 	msgs := buildSummaryMessages(prior, renderItemsTranscript(overflow), s.MaxSummaryTokens)
 	ctx, cancel := context.WithTimeout(context.Background(), compactionTimeout)
 	defer cancel()
 	predFunc, err := model.Predict(ctx, msgs, nil, nil, nil, nil, nil, nil, nil, nil, nil)
 	if err != nil {
 		xlog.Warn("realtime compaction: summarizer predict failed", "error", err)
 		return
 	}
 	pred, err := predFunc()
 	if err != nil {
 		xlog.Warn("realtime compaction: summarizer inference failed", "error", err)
 		return
 	}
 	// Strip any leaked reasoning/thinking spans using the same extractor the
 	// rest of the realtime path uses, rather than a bespoke regex.
 	rcfg := reasoning.Config{}
 	if mc := model.PredictConfig(); mc != nil {
 		rcfg = spokenReasoningConfig(mc.ReasoningConfig)
 	}
 	_, summary := reasoning.ExtractReasoningComplete(pred.Response, "", rcfg)
 	summary = strings.TrimSpace(summary)
 	// An empty summary would evict items with nothing to replace them, so bail.
 	if summary == "" {
 		xlog.Warn("realtime compaction: empty summary, skipping eviction")
 		return
 	}
 	// Commit.
 	conv.Lock.Lock()
 	defer conv.Lock.Unlock()
 	// The lock was released during summarization; only evict if the summarized
 	// items are still the head of the conversation.
 	if !prefixMatches(conv.Items, overflow) {
 		xlog.Debug("realtime compaction: head changed during summary, skipping")
 		return
 	}
 	// The summarizer was given the prior memory, so the new summary replaces
 	// it rather than being appended.
 	conv.Memory = summary
 	conv.Items = conv.Items[len(overflow):]
 	xlog.Debug("realtime compaction: evicted items into memory", "evicted", len(overflow), "remaining", len(conv.Items))
 }
 // summarizerModel returns the model that produces compaction summaries. With
 // no summary_model configured (or no factory installed) it is the pipeline LLM.
 // Otherwise the factory is invoked at most once and its result cached; if the
 // factory fails, the pipeline LLM is cached as the fallback instead.
 func (s *Session) summarizerModel() Model {
 	if s.SummaryModel != "" && s.summarizerFactory != nil {
 		s.summarizerOnce.Do(func() {
 			loaded, loadErr := s.summarizerFactory()
 			if loadErr != nil {
 				xlog.Warn("realtime compaction: summary_model load failed, falling back to pipeline LLM", "model", s.SummaryModel, "error", loadErr)
 				loaded = s.ModelInterface
 			}
 			s.summarizerCached = loaded
 		})
 		return s.summarizerCached
 	}
 	return s.ModelInterface
 }
 // maybeCompact kicks off a background compaction of conv when compaction is
 // enabled, the item count exceeds the trigger, and no compaction is already in
 // flight for this conversation. It never waits for the compaction itself.
 func (s *Session) maybeCompact(conv *Conversation) {
 	if !s.CompactionEnabled {
 		return
 	}
 	conv.Lock.Lock()
 	exceeded := len(conv.Items) > s.CompactionTrigger
 	conv.Lock.Unlock()
 	if !exceeded {
 		return
 	}
 	// Claim the per-conversation slot; losing the swap means a run is active.
 	if claimed := conv.compacting.CompareAndSwap(false, true); !claimed {
 		return
 	}
 	go func() {
 		defer conv.compacting.Store(false)
 		// The summarizer is resolved (and a configured summary_model lazily
 		// loaded) here, off the response path, so a model load never blocks a
 		// user turn.
 		if summarizer := s.summarizerModel(); summarizer != nil {
 			s.compact(conv, summarizer)
 		}
 	}()
 }
--- a/core/http/endpoints/openai/realtime_compaction_test.go
+++ b/core/http/endpoints/openai/realtime_compaction_test.go
@@ -1,308 +0,0 @@
 package openai
 import (
 	"errors"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 	"github.com/mudler/LocalAI/core/backend"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
 	"github.com/mudler/LocalAI/core/schema"
 )
 // Specs for resolveCompaction: the disabled case, the defaults applied when
 // only Enabled is set, the trigger clamp, and explicit values passing through.
 var _ = Describe("resolveCompaction", func() {
 	It("disables when the block is absent", func() {
 		enabled, _, _, _ := resolveCompaction(&config.ModelConfig{}, 6)
 		Expect(enabled).To(BeFalse())
 	})
 	It("defaults trigger to 2x max history and tokens to 512", func() {
 		cfg := &config.ModelConfig{Pipeline: config.Pipeline{Compaction: &config.PipelineCompaction{Enabled: true}}}
 		enabled, trigger, maxTok, _ := resolveCompaction(cfg, 6)
 		Expect(enabled).To(BeTrue())
 		Expect(trigger).To(Equal(12))
 		Expect(maxTok).To(Equal(512))
 	})
 	It("clamps trigger to max history + 1 when misconfigured", func() {
 		// TriggerItems (4) is below maxHistory (6), so it is raised to 7.
 		cfg := &config.ModelConfig{Pipeline: config.Pipeline{Compaction: &config.PipelineCompaction{Enabled: true, TriggerItems: 4}}}
 		_, trigger, _, _ := resolveCompaction(cfg, 6)
 		Expect(trigger).To(Equal(7))
 	})
 	It("honors explicit values", func() {
 		cfg := &config.ModelConfig{Pipeline: config.Pipeline{Compaction: &config.PipelineCompaction{
 			Enabled: true, TriggerItems: 20, MaxSummaryTokens: 256, SummaryModel: "tiny"}}}
 		enabled, trigger, maxTok, model := resolveCompaction(cfg, 6)
 		Expect(enabled).To(BeTrue())
 		Expect(trigger).To(Equal(20))
 		Expect(maxTok).To(Equal(256))
 		Expect(model).To(Equal("tiny"))
 	})
 })
 // Specs for deleteItem: removal by id and the not-found result.
 var _ = Describe("deleteItem", func() {
 	// mk builds one user item per id, in order.
 	mk := func(ids ...string) []*types.MessageItemUnion {
 		out := make([]*types.MessageItemUnion, len(ids))
 		for i, id := range ids {
 			out[i] = &types.MessageItemUnion{User: &types.MessageItemUser{ID: id}}
 		}
 		return out
 	}
 	It("removes the item with the given id", func() {
 		items, ok := deleteItem(mk("a", "b", "c"), "b")
 		Expect(ok).To(BeTrue())
 		Expect(len(items)).To(Equal(2))
 		Expect(itemID(items[0])).To(Equal("a"))
 		Expect(itemID(items[1])).To(Equal("c"))
 	})
 	It("reports not found for an unknown id", func() {
 		_, ok := deleteItem(mk("a"), "zzz")
 		Expect(ok).To(BeFalse())
 	})
 })
 // Spec for clearInputAudio: both pending-audio buffers end up nil.
 var _ = Describe("clearInputAudio", func() {
 	It("resets the pending PCM and buffered Opus frames", func() {
 		s := &Session{InputAudioBuffer: []byte{1, 2, 3}, OpusFrames: [][]byte{{9}}}
 		clearInputAudio(s)
 		Expect(s.InputAudioBuffer).To(BeNil())
 		Expect(s.OpusFrames).To(BeNil())
 	})
 })
 // Specs for truncateAssistantText: clearing Text, clearing Transcript, and the
 // not-found result.
 var _ = Describe("truncateAssistantText", func() {
 	It("clears the text of the assistant content part at the index", func() {
 		items := []*types.MessageItemUnion{{Assistant: &types.MessageItemAssistant{
 			ID:      "a1",
 			Content: []types.MessageContentOutput{{Type: types.MessageContentTypeText, Text: "hello world"}},
 		}}}
 		ok := truncateAssistantText(items, "a1", 0)
 		Expect(ok).To(BeTrue())
 		Expect(items[0].Assistant.Content[0].Text).To(Equal(""))
 	})
 	// Realtime assistant *audio* turns store the spoken words in .Transcript, not
 	// .Text, so a barge-in truncate must clear .Transcript too or it would no-op.
 	It("clears the transcript of an assistant audio content part", func() {
 		items := []*types.MessageItemUnion{{Assistant: &types.MessageItemAssistant{
 			ID:      "a1",
 			Content: []types.MessageContentOutput{{Type: types.MessageContentTypeAudio, Transcript: "hello world"}},
 		}}}
 		ok := truncateAssistantText(items, "a1", 0)
 		Expect(ok).To(BeTrue())
 		Expect(items[0].Assistant.Content[0].Transcript).To(Equal(""))
 	})
 	It("returns false for an unknown id", func() {
 		Expect(truncateAssistantText(nil, "nope", 0)).To(BeFalse())
 	})
 })
 // Specs for compactionCut: the plain len-keep cut, the nothing-to-cut cases,
 // and the pair-safety adjustment around a function_call_output.
 var _ = Describe("compactionCut", func() {
 	// Builders for the three item variants these specs need.
 	user := func(id string) *types.MessageItemUnion {
 		return &types.MessageItemUnion{User: &types.MessageItemUser{ID: id}}
 	}
 	call := func(id string) *types.MessageItemUnion {
 		return &types.MessageItemUnion{FunctionCall: &types.MessageItemFunctionCall{ID: id}}
 	}
 	out := func(id string) *types.MessageItemUnion {
 		return &types.MessageItemUnion{FunctionCallOutput: &types.MessageItemFunctionCallOutput{ID: id}}
 	}
 	It("cuts exactly len-keep when no pairs straddle the boundary", func() {
 		items := []*types.MessageItemUnion{user("1"), user("2"), user("3"), user("4")}
 		Expect(compactionCut(items, 2)).To(Equal(2))
 	})
 	It("returns 0 when nothing to cut", func() {
 		Expect(compactionCut([]*types.MessageItemUnion{user("1")}, 2)).To(Equal(0))
 	})
 	It("returns 0 (cuts nothing) when keep is 0 — the unlimited-window sentinel", func() {
 		items := []*types.MessageItemUnion{user("1"), user("2"), user("3")}
 		Expect(compactionCut(items, 0)).To(Equal(0))
 	})
 	It("moves the boundary so a call/output pair is not split", func() {
 		// keep=2 -> naive cut=2, but items[2] is the output of items[1]'s call;
 		// pull the cut left (to 1) so the whole pair stays in the kept tail.
 		items := []*types.MessageItemUnion{user("1"), call("c"), out("c"), user("4")}
 		Expect(compactionCut(items, 2)).To(Equal(1))
 	})
 })
 // Specs for withMemory: a summary message is appended for non-empty memory and
 // nothing is added for empty memory.
 var _ = Describe("withMemory", func() {
 	It("inserts a memory system message when memory is non-empty", func() {
 		base := schema.Messages{{Role: "system", StringContent: "instructions"}}
 		out := withMemory(base, "user is Bob; wants pizza")
 		Expect(len(out)).To(Equal(2))
 		Expect(out[1].Role).To(Equal("system"))
 		Expect(out[1].StringContent).To(ContainSubstring("user is Bob"))
 		Expect(out[1].StringContent).To(ContainSubstring("Summary of earlier conversation"))
 	})
 	It("is a no-op when memory is empty", func() {
 		base := schema.Messages{{Role: "system", StringContent: "instructions"}}
 		Expect(withMemory(base, "")).To(HaveLen(1))
 	})
 })
 // Specs for renderItemsTranscript: text turns and transcript-only audio turns
 // both appear in the rendered output.
 var _ = Describe("renderItemsTranscript", func() {
 	It("renders user and assistant text turns", func() {
 		items := []*types.MessageItemUnion{
 			{User: &types.MessageItemUser{Content: []types.MessageContentInput{{Type: types.MessageContentTypeInputText, Text: "hi"}}}},
 			{Assistant: &types.MessageItemAssistant{Content: []types.MessageContentOutput{{Type: types.MessageContentTypeText, Text: "hello"}}}},
 		}
 		out := renderItemsTranscript(items)
 		Expect(out).To(ContainSubstring("user: hi"))
 		Expect(out).To(ContainSubstring("assistant: hello"))
 	})
 	// Realtime assistant *audio* turns store the spoken words in .Transcript, not
 	// .Text, so the transcript builder must emit .Transcript too or spoken turns
 	// would be dropped from the summary.
 	It("renders an assistant audio turn from its transcript", func() {
 		items := []*types.MessageItemUnion{
 			{Assistant: &types.MessageItemAssistant{Content: []types.MessageContentOutput{{Type: types.MessageContentTypeAudio, Transcript: "spoken words"}}}},
 		}
 		Expect(renderItemsTranscript(items)).To(ContainSubstring("assistant: spoken words"))
 	})
 })
 // Spec for buildSummaryMessages: a system message followed by a user message
 // that carries both the prior memory and the new transcript.
 var _ = Describe("buildSummaryMessages", func() {
 	It("includes prior memory and the new transcript", func() {
 		msgs := buildSummaryMessages("prior facts", "user: hi", 512)
 		Expect(len(msgs)).To(Equal(2))
 		Expect(msgs[0].Role).To(Equal("system"))
 		Expect(msgs[1].StringContent).To(ContainSubstring("prior facts"))
 		Expect(msgs[1].StringContent).To(ContainSubstring("user: hi"))
 	})
 })
 // Specs for Session.compact: successful eviction into Memory, no change on a
 // summarizer error, reasoning-tag stripping, and the below-trigger no-op.
 // fakeModel is a test double defined outside this file section; it presumably
 // returns predictResp / predictErr and records the last messages it was given
 // — confirm against its definition.
 var _ = Describe("compact", func() {
 	// user builds a user item with the given id and a single input-text part.
 	user := func(id, text string) *types.MessageItemUnion {
 		return &types.MessageItemUnion{User: &types.MessageItemUser{ID: id,
 			Content: []types.MessageContentInput{{Type: types.MessageContentTypeInputText, Text: text}}}}
 	}
 	It("summarizes overflow into Memory and evicts it, keeping the live tail", func() {
 		conv := &Conversation{Items: []*types.MessageItemUnion{
 			user("1", "a"), user("2", "b"), user("3", "c"), user("4", "d"),
 			user("5", "e"), user("6", "f"), user("7", "g"), user("8", "h"),
 		}}
 		// 8 items > trigger 7, keep 4 -> items 1-4 are evicted, 5-8 remain.
 		s := &Session{CompactionEnabled: true, CompactionTrigger: 7, MaxHistoryItems: 4, MaxSummaryTokens: 512}
 		m := &fakeModel{predictResp: backend.LLMResponse{Response: "ROLLED UP"}}
 		s.compact(conv, m)
 		Expect(conv.Memory).To(Equal("ROLLED UP"))
 		Expect(len(conv.Items)).To(Equal(4))
 		Expect(itemID(conv.Items[0])).To(Equal("5"))
 		// The summarizer saw the evicted turns.
 		Expect(m.lastMessages[1].StringContent).To(ContainSubstring("a"))
 	})
 	It("leaves Items and Memory untouched when the summarizer errors", func() {
 		items := []*types.MessageItemUnion{user("1", "a"), user("2", "b"), user("3", "c")}
 		conv := &Conversation{Items: items}
 		s := &Session{CompactionEnabled: true, CompactionTrigger: 2, MaxHistoryItems: 1, MaxSummaryTokens: 512}
 		m := &fakeModel{predictErr: errors.New("boom")}
 		s.compact(conv, m)
 		Expect(conv.Memory).To(Equal(""))
 		Expect(len(conv.Items)).To(Equal(3))
 	})
 	It("strips leaked reasoning tags from the summary via the shared extractor", func() {
 		conv := &Conversation{Items: []*types.MessageItemUnion{
 			user("1", "a"), user("2", "b"), user("3", "c"), user("4", "d"),
 			user("5", "e"), user("6", "f"), user("7", "g"), user("8", "h"),
 		}}
 		s := &Session{CompactionEnabled: true, CompactionTrigger: 7, MaxHistoryItems: 4, MaxSummaryTokens: 512}
 		m := &fakeModel{predictResp: backend.LLMResponse{Response: "<think>planning the summary</think>CLEAN SUMMARY"}}
 		s.compact(conv, m)
 		Expect(conv.Memory).To(Equal("CLEAN SUMMARY"))
 		Expect(conv.Memory).ToNot(ContainSubstring("planning"))
 	})
 	It("does nothing when items are at or below the trigger", func() {
 		conv := &Conversation{Items: []*types.MessageItemUnion{user("1", "a")}}
 		s := &Session{CompactionEnabled: true, CompactionTrigger: 7, MaxHistoryItems: 4}
 		s.compact(conv, &fakeModel{predictResp: backend.LLMResponse{Response: "x"}})
 		Expect(conv.Memory).To(Equal(""))
 		Expect(len(conv.Items)).To(Equal(1))
 	})
 })
 // Specs for prefixMatches: the snapshot must appear, id for id and in order,
 // at the head of the current items.
 var _ = Describe("prefixMatches", func() {
 	// withIDs builds one user item per id, preserving the given order.
 	withIDs := func(ids ...string) []*types.MessageItemUnion {
 		built := make([]*types.MessageItemUnion, 0, len(ids))
 		for _, id := range ids {
 			built = append(built, &types.MessageItemUnion{User: &types.MessageItemUser{ID: id}})
 		}
 		return built
 	}
 	It("matches when items begins with the snapshot ids in order", func() {
 		current, snapshot := withIDs("1", "2", "3"), withIDs("1", "2")
 		Expect(prefixMatches(current, snapshot)).To(BeTrue())
 	})
 	It("matches an empty snapshot", func() {
 		Expect(prefixMatches(withIDs("1"), nil)).To(BeTrue())
 	})
 	It("fails when items is shorter than the snapshot (a concurrent delete shrank the head)", func() {
 		current, snapshot := withIDs("1"), withIDs("1", "2")
 		Expect(prefixMatches(current, snapshot)).To(BeFalse())
 	})
 	It("fails when the head ids differ (a concurrent delete reordered the head)", func() {
 		current, snapshot := withIDs("2", "3"), withIDs("1", "2")
 		Expect(prefixMatches(current, snapshot)).To(BeFalse())
 	})
 })
 // Specs for Session.summarizerModel: which model is handed back depending on
 // whether SummaryModel is set and whether the factory succeeds.
 //
 // The assertions use BeIdenticalTo (==) rather than Equal. Equal relies on
 // reflect.DeepEqual, and two freshly built &fakeModel{} pointers are deeply
 // equal, so Equal cannot tell the factory's model apart from the pipeline one.
 var _ = Describe("summarizerModel", func() {
 	It("returns the pipeline model when no summary_model is set", func() {
 		m := &fakeModel{}
 		s := &Session{ModelInterface: m}
 		Expect(s.summarizerModel()).To(BeIdenticalTo(m))
 	})
 	It("uses the factory (once) when summary_model is set", func() {
 		pipeline := &fakeModel{}
 		small := &fakeModel{}
 		calls := 0
 		s := &Session{ModelInterface: pipeline, SummaryModel: "tiny",
 			summarizerFactory: func() (Model, error) { calls++; return small, nil }}
 		// Two lookups must both hand back the factory's instance while the
 		// factory itself runs only once.
 		Expect(s.summarizerModel()).To(BeIdenticalTo(small))
 		Expect(s.summarizerModel()).To(BeIdenticalTo(small))
 		Expect(calls).To(Equal(1))
 	})
 	It("falls back to the pipeline model when the factory errors", func() {
 		pipeline := &fakeModel{}
 		s := &Session{ModelInterface: pipeline, SummaryModel: "tiny",
 			summarizerFactory: func() (Model, error) { return nil, errors.New("nope") }}
 		Expect(s.summarizerModel()).To(BeIdenticalTo(pipeline))
 	})
 })
 // Specs for itemID: every union variant exposes its own ID, and a nil item
 // maps to the empty string.
 var _ = Describe("itemID", func() {
 	It("returns the id for each variant and empty for nil", func() {
 		// The nil case stays a literal call so the argument remains an
 		// untyped nil, exactly as before.
 		Expect(itemID(nil)).To(Equal(""))

 		variants := []struct {
 			item *types.MessageItemUnion
 			want string
 		}{
 			{&types.MessageItemUnion{User: &types.MessageItemUser{ID: "u1"}}, "u1"},
 			{&types.MessageItemUnion{Assistant: &types.MessageItemAssistant{ID: "a1"}}, "a1"},
 			{&types.MessageItemUnion{System: &types.MessageItemSystem{ID: "s1"}}, "s1"},
 			{&types.MessageItemUnion{FunctionCall: &types.MessageItemFunctionCall{ID: "f1"}}, "f1"},
 			{&types.MessageItemUnion{FunctionCallOutput: &types.MessageItemFunctionCallOutput{ID: "o1"}}, "o1"},
 		}
 		for _, v := range variants {
 			Expect(itemID(v.item)).To(Equal(v.want))
 		}
 	})
 })
--- a/core/http/endpoints/openai/realtime_model.go
+++ b/core/http/endpoints/openai/realtime_model.go
@@ -432,7 +432,7 @@ func loadSoundDetectionConfig(pipeline *config.Pipeline, cl *config.ModelConfigL
 	if pipeline.SoundDetection == "" {
 		return nil, nil
 	}
-	cfg, err := loadPipelineSubModel(cl, pipeline.SoundDetection, ml.ModelPath)
+	cfg, err := cl.LoadModelConfigFileByName(pipeline.SoundDetection, ml.ModelPath)
 	if err != nil {
 		return nil, fmt.Errorf("failed to load sound detection config: %w", err)
 	}
@@ -443,7 +443,7 @@ func loadSoundDetectionConfig(pipeline *config.Pipeline, cl *config.ModelConfigL
 }
 func newTranscriptionOnlyModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) (Model, *config.ModelConfig, error) {
-	cfgVAD, err := loadPipelineSubModel(cl, pipeline.VAD, ml.ModelPath)
+	cfgVAD, err := cl.LoadModelConfigFileByName(pipeline.VAD, ml.ModelPath)
 	if err != nil {
 		return nil, nil, fmt.Errorf("failed to load backend config: %w", err)
@@ -453,7 +453,7 @@ func newTranscriptionOnlyModel(pipeline *config.Pipeline, cl *config.ModelConfig
 		return nil, nil, fmt.Errorf("failed to validate config: %w", err)
 	}
-	cfgSST, err := loadPipelineSubModel(cl, pipeline.Transcription, ml.ModelPath)
+	cfgSST, err := cl.LoadModelConfigFileByName(pipeline.Transcription, ml.ModelPath)
 	if err != nil {
 		return nil, nil, fmt.Errorf("failed to load backend config: %w", err)
@@ -542,30 +542,11 @@ func buildRealtimeRoutingContext(a *application.Application, sessionID string) *
 	}
 }
 // loadPipelineSubModel looks up the named pipeline sub-model config and then
 // resolves it through the loader's alias resolution, following one alias hop.
 // A pipeline entry that points at an alias (e.g. `llm: default`) therefore
 // yields the alias target's full config (Backend, Model, ...) instead of the
 // alias stub with an empty Backend. Left unresolved, the alias reaches model
 // loading and fails downstream — notably in distributed mode with
 // "backend name is empty". This mirrors the top-level alias resolution in
 // core/http/middleware/request.go.
 //
 // Any error from the lookup or from the alias resolution is returned as-is.
 func loadPipelineSubModel(cl *config.ModelConfigLoader, name, modelPath string) (*config.ModelConfig, error) {
 	stub, loadErr := cl.LoadModelConfigFileByName(name, modelPath)
 	if loadErr != nil {
 		return nil, loadErr
 	}

 	// Only the resolved config is needed; the second return value is ignored.
 	target, _, aliasErr := cl.ResolveAlias(stub)
 	if aliasErr != nil {
 		return nil, aliasErr
 	}

 	return target, nil
 }
 // newModel returns and loads either a wrapped model or a model that supports audio-to-audio
 func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig, evaluator *templates.Evaluator, routing *RealtimeRoutingContext) (Model, error) {
 	xlog.Debug("Creating new model pipeline model", "pipeline", pipeline)
-	cfgVAD, err := loadPipelineSubModel(cl, pipeline.VAD, ml.ModelPath)
+	cfgVAD, err := cl.LoadModelConfigFileByName(pipeline.VAD, ml.ModelPath)
 	if err != nil {
 		return nil, fmt.Errorf("failed to load backend config: %w", err)
@@ -576,7 +557,7 @@ func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model
 	}
 	// TODO: Do we always need a transcription model? It can be disabled. Note that any-to-any instruction following models don't transcribe as such, so if transcription is required it is a separate process
-	cfgSST, err := loadPipelineSubModel(cl, pipeline.Transcription, ml.ModelPath)
+	cfgSST, err := cl.LoadModelConfigFileByName(pipeline.Transcription, ml.ModelPath)
 	if err != nil {
 		return nil, fmt.Errorf("failed to load backend config: %w", err)
@@ -608,7 +589,7 @@ func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model
 	xlog.Debug("Loading a wrapped model")
 	// Otherwise we want to return a wrapped model, which is a "virtual" model that re-uses other models to perform operations
-	cfgLLM, err := loadPipelineSubModel(cl, pipeline.LLM, ml.ModelPath)
+	cfgLLM, err := cl.LoadModelConfigFileByName(pipeline.LLM, ml.ModelPath)
 	if err != nil {
 		return nil, fmt.Errorf("failed to load backend config: %w", err)
@@ -623,7 +604,7 @@ func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model
 	applyPipelineReasoning(cfgLLM, *pipeline)
 	applyPipelineThinking(cfgLLM, *pipeline)
-	cfgTTS, err := loadPipelineSubModel(cl, pipeline.TTS, ml.ModelPath)
+	cfgTTS, err := cl.LoadModelConfigFileByName(pipeline.TTS, ml.ModelPath)
 	if err != nil {
 		return nil, fmt.Errorf("failed to load backend config: %w", err)
--- a/core/http/endpoints/openai/realtime_model_alias_test.go
+++ b/core/http/endpoints/openai/realtime_model_alias_test.go
@@ -1,52 +0,0 @@
 package openai
 import (
 	"os"
 	"path/filepath"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 	"github.com/mudler/LocalAI/core/config"
 )
 // loadPipelineSubModel must resolve a pipeline sub-model that references an
 // alias (e.g. `llm: default`) one hop to the alias target's full config — so
 // the effective backend is the target's backend, not the empty backend of the
 // alias stub. This mirrors the top-level alias resolution done in
 // core/http/middleware/request.go, which the realtime pipeline previously
 // skipped (failing in distributed mode with "backend name is empty").
 var _ = Describe("loadPipelineSubModel", func() {
 	It("resolves a sub-model alias one hop to the target's config", func() {
 		// Scratch directory that holds the two YAML configs written below.
 		tmpDir := GinkgoT().TempDir()
 		// A real model config with a concrete backend.
 		realLLM := `name: real-llm
 backend: llama-cpp
 parameters:
  model: real-llm.gguf
 `
 		Expect(os.WriteFile(filepath.Join(tmpDir, "real-llm.yaml"), []byte(realLLM), 0644)).To(Succeed())
 		// An alias pointing at the real model.
 		aliasCfg := `name: default
 alias: real-llm
 `
 		Expect(os.WriteFile(filepath.Join(tmpDir, "default.yaml"), []byte(aliasCfg), 0644)).To(Succeed())
 		// Build a loader for tmpDir and load the configs written above into it.
 		cl := config.NewModelConfigLoader(tmpDir)
 		Expect(cl.LoadModelConfigsFromPath(tmpDir)).To(Succeed())
 		// Resolving the alias must follow the hop to the target's full config.
 		resolved, err := loadPipelineSubModel(cl, "default", tmpDir)
 		Expect(err).NotTo(HaveOccurred())
 		// The result must be the target itself: no longer an alias, and
 		// carrying the target's backend.
 		Expect(resolved.IsAlias()).To(BeFalse())
 		Expect(resolved.Backend).To(Equal("llama-cpp"))
 		// A non-alias name must load unchanged.
 		direct, err := loadPipelineSubModel(cl, "real-llm", tmpDir)
 		Expect(err).NotTo(HaveOccurred())
 		Expect(direct.Backend).To(Equal("llama-cpp"))
 		Expect(direct.Name).To(Equal("real-llm"))
 	})
 })
--- a/core/http/react-ui/e2e/model-config.spec.js
+++ b/core/http/react-ui/e2e/model-config.spec.js
@@ -288,21 +288,6 @@ test.describe('Model Editor - Interactive Tab', () => {
    await expect(page.locator('input[placeholder^="match,"]')).toBeVisible()
  })
  // Adds the "Custom Secret Patterns" field, appends a pattern row, types a
  // negative minimum length and expects the input to end up holding '0'.
  test('pattern min_len clamps a directly-typed negative to 0', async ({ page }) => {
    const fieldSearch = page.locator('input[placeholder="Search fields to add..."]')
    await fieldSearch.fill('Custom Secret Patterns')

    // The results list is reached from the search input's grandparent element.
    const resultsContainer = fieldSearch.locator('..').locator('..')
    const firstMatch = resultsContainer.locator('div', { hasText: 'Custom Secret Patterns' }).first()
    await firstMatch.click()

    const addPatternButton = page.locator('button', { hasText: 'Add pattern' })
    await addPatternButton.click()

    // min={0} on a number input constrains only the spinner arrows; keyboard
    // entry bypasses it, so the editor itself has to sanitise a typed negative
    // before a meaningless negative length floor can reach the saved config.
    const minLengthInput = page.locator('input[aria-label="Minimum length"]')
    await minLengthInput.fill('-5')
    await expect(minLengthInput).toHaveValue('0')
  })
  // Regression: a map-typed field (entity_actions) present in the loaded YAML
  // must render WITH its values. flattenConfig used to recurse into the map,
  // scattering it across pii_detection.entity_actions.<GROUP> paths that match
@@ -344,37 +329,4 @@ test.describe('Model Editor - Interactive Tab', () => {
    await expect(page.getByText(/block —/i).first()).toBeVisible()
  })
  // One key can carry only one value in a map. Renaming a row to a group that
  // already exists must therefore leave a single row (Object.fromEntries, last
  // write wins) instead of two conflicting rows, one of which would silently
  // be lost on save.
  test('entity_actions collapses a duplicate group to a single row', async ({ page }) => {
    // YAML served for ner-model: two entity_actions entries, SSN and EMAIL.
    const yamlLines = [
      'name: ner-model',
      'backend: llama-cpp',
      'pii_detection:',
      '    entity_actions:',
      '        SSN: block',
      '        EMAIL: mask',
      '',
    ]
    const editPayload = { name: 'ner-model', config: yamlLines.join('\n') }

    await page.route('**/api/models/edit/ner-model', (route) => {
      route.fulfill({ contentType: 'application/json', body: JSON.stringify(editPayload) })
    })
    await page.goto('/app/model-editor/ner-model')

    const entityGroups = page.locator('input[aria-label="Entity group"]')
    await expect(entityGroups).toHaveCount(2)

    // Rename the EMAIL row to duplicate SSN; the editor collapses to one SSN row.
    const emailRow = entityGroups.nth(1)
    await emailRow.fill('SSN')
    await expect(entityGroups).toHaveCount(1)
    await expect(entityGroups.nth(0)).toHaveValue('SSN')
  })
 })
--- a/core/http/react-ui/e2e/nodes-detail.spec.js
+++ b/core/http/react-ui/e2e/nodes-detail.spec.js
@@ -1,34 +0,0 @@
 import { test, expect } from './coverage-fixtures.js'
 const ID = 'n1'
 // mockNode stubs the three per-node endpoints for node ID on the given page:
 // the node record itself, its models list and its backends list. Each one is
 // answered with HTTP 200 and a fixed JSON body.
 async function mockNode(page) {
  // Build a route handler that replies 200 with `payload` serialised as JSON.
  const replyJson = (payload) => (route) => route.fulfill({
    status: 200,
    contentType: 'application/json',
    body: JSON.stringify(payload),
  })

  const nodeRecord = { id: ID, name: 'alpha', node_type: 'backend', address: '10.0.0.1:50051', status: 'healthy', total_vram: 24e9, available_vram: 12e9, max_replicas_per_model: 1, labels: { env: 'prod' } }
  const nodeModels = [{ node_id: ID, model_name: 'llama-3.3', state: 'loaded', in_flight: 0, replica_index: 0 }]
  const nodeBackends = [{ name: 'llama-cpp', is_system: true, installed_at: '2026-06-01T00:00:00Z' }]

  await page.route(`**/api/nodes/${ID}`, replyJson(nodeRecord))
  await page.route(`**/api/nodes/${ID}/models`, replyJson(nodeModels))
  await page.route(`**/api/nodes/${ID}/backends`, replyJson(nodeBackends))
 }
 // Node detail page: checks that the mocked node's data is rendered and that
 // the page can be reached by clicking the node in the roster.
 test.describe('Node detail page', () => {
  test('renders sections for a node', async ({ page }) => {
    await mockNode(page)
    await page.goto(`/app/nodes/${ID}`)

    const pageTitle = page.locator('.page-title').first()
    await expect(pageTitle).toBeVisible({ timeout: 15_000 })

    // Node name, loaded model, installed backend and label, in that order.
    for (const visibleText of ['alpha', 'llama-3.3', 'llama-cpp', 'env=prod']) {
      await expect(page.getByText(visibleText)).toBeVisible()
    }
  })
  test('is reachable by clicking a roster panel', async ({ page }) => {
    // Build a route handler that replies 200 with an already-serialised JSON body.
    const replyWith = (body) => (route) => route.fulfill({ status: 200, contentType: 'application/json', body })
    const roster = [{ id: ID, name: 'alpha', node_type: 'backend', address: '10.0.0.1:50051', status: 'healthy' }]

    await page.route('**/api/nodes', replyWith(JSON.stringify(roster)))
    await page.route('**/api/nodes/models', replyWith('[]'))
    await page.route('**/api/nodes/scheduling', replyWith('[]'))
    await mockNode(page)

    await page.goto('/app/nodes')
    const alphaPanel = page.locator('.node-panel').filter({ hasText: 'alpha' })
    await alphaPanel.getByText('alpha').click()
    await expect(page).toHaveURL(new RegExp(`/app/nodes/${ID}$`))
  })
 })
--- a/Show More
+++ b/Show More