chore(model gallery): 🤖 add 1 new model via gallery agent (#10464)

chore(model gallery): 🤖 add new models via gallery agent

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
chore: ⬆️ Update leejet/stable-diffusion.cpp to f440ad9c29dd8bc34e5d1f4b863832b96d6ea05f (#10457)
2026-06-23 16:19:07 -04:00 · 2026-06-23 15:43:21 +02:00 · 2026-06-23 13:29:07 +02:00 · 2026-06-23 13:28:49 +02:00 · 2026-06-23 13:28:09 +02:00 · 2026-06-23 13:27:51 +02:00
272 changed files with 12367 additions and 2239 deletions
--- a/.agents/adding-backends.md
+++ b/.agents/adding-backends.md
@@ -198,6 +198,27 @@ docker-build-backends: ... docker-build-<backend-name>
 - If the backend is in `backend/python/<backend-name>/` but uses `.` as context in the workflow file, use `.` context
 - Check similar backends to determine the correct context
 ## Documenting the backend (README + docs)
 A backend is not "added" until it is discoverable. Update the user-facing docs:
 - **`docs/content/features/backends.md`** - add the backend to the right
  category in the "LocalAI supports various types of backends" list (and add a
  new category if it introduces a new modality, e.g. sound classification).
 - If the backend introduces a **new API surface** (a new endpoint or a realtime
  capability), document it under `docs/content/` where its area lives (audio,
  vision, etc.) and follow the api-endpoints checklist in
  [api-endpoints-and-auth.md](api-endpoints-and-auth.md).
 **If the backend is a native C/C++/GGML engine created and maintained by the
 LocalAI team** (a from-scratch port like `parakeet.cpp`, `ced.cpp`,
 `vibevoice.cpp`, `rf-detr.cpp`, not a wrapper around a third-party runtime), it
 ALSO belongs in the top-level **`README.md`** table under "native C/C++/GGML
 engines ... developed and maintained by the LocalAI project itself". Add a row
 linking the upstream engine repo with a one-line description. This is the
 project's showcase of its own engines; a new in-house backend that is missing
 from it is a documentation bug.
 ## 5. Verification Checklist
 After adding a new backend, verify:
@@ -211,6 +232,8 @@ After adding a new backend, verify:
 - [ ] No YAML syntax errors (check with linter)
 - [ ] No Makefile syntax errors (check with linter)
 - [ ] Follows the same pattern as similar backends (e.g., if it's a transcription backend, follow `faster-whisper` pattern)
 - [ ] Documented: added to the category list in `docs/content/features/backends.md` (and any new endpoint/realtime capability documented under `docs/content/`)
 - [ ] If it is an in-house native C/C++/GGML engine, added to the maintained-engines table in the top-level `README.md`
 ## Bundling runtime shared libraries (`package.sh`)
--- a/.docker/install-base-deps.sh
+++ b/.docker/install-base-deps.sh
@@ -70,6 +70,12 @@ if [ "${BUILD_TYPE:-}" = "vulkan" ] && [ "${SKIP_DRIVERS:-false}" = "false" ]; t
        git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \
        ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \
        clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils
    # Mesa Vulkan ICD drivers (ANV/RADV/lavapipe + Arm SoC) and their ICD
    # manifests. The LunarG SDK below only provides the loader and shader
    # tooling, not hardware drivers — without Mesa the packaged Vulkan backend
    # would ship a loader that finds no GPU. package-gpu-libs.sh bundles these
    # .so files plus their deps into the backend so it stays self-contained.
    apt-get install -y mesa-vulkan-drivers libdrm2
    if [ "amd64" = "${TARGETARCH:-}" ]; then
        wget "https://sdk.lunarg.com/sdk/download/1.4.335.0/linux/vulkansdk-linux-x86_64-1.4.335.0.tar.xz"
        tar -xf vulkansdk-linux-x86_64-1.4.335.0.tar.xz
--- a/.github/backend-matrix.yml
+++ b/.github/backend-matrix.yml
@@ -3575,6 +3575,154 @@ include:
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  # ced
  - build-type: 'cublas'
    cuda-major-version: "12"
    cuda-minor-version: "8"
    platforms: 'linux/amd64'
    tag-latest: 'auto'
    tag-suffix: '-gpu-nvidia-cuda-12-ced'
    runs-on: 'ubuntu-latest'
    base-image: "ubuntu:24.04"
    skip-drivers: 'false'
    backend: "ced"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  - build-type: 'cublas'
    cuda-major-version: "13"
    cuda-minor-version: "0"
    platforms: 'linux/amd64'
    tag-latest: 'auto'
    tag-suffix: '-gpu-nvidia-cuda-13-ced'
    runs-on: 'ubuntu-latest'
    base-image: "ubuntu:24.04"
    skip-drivers: 'false'
    backend: "ced"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  - build-type: 'cublas'
    cuda-major-version: "13"
    cuda-minor-version: "0"
    platforms: 'linux/arm64'
    skip-drivers: 'false'
    tag-latest: 'auto'
    tag-suffix: '-nvidia-l4t-cuda-13-arm64-ced'
    base-image: "ubuntu:24.04"
    ubuntu-version: '2404'
    runs-on: 'ubuntu-24.04-arm'
    backend: "ced"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
  - build-type: ''
    cuda-major-version: ""
    cuda-minor-version: ""
    platforms: 'linux/amd64'
    platform-tag: 'amd64'
    tag-latest: 'auto'
    tag-suffix: '-cpu-ced'
    runs-on: 'ubuntu-latest'
    base-image: "ubuntu:24.04"
    skip-drivers: 'false'
    backend: "ced"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  - build-type: ''
    cuda-major-version: ""
    cuda-minor-version: ""
    platforms: 'linux/arm64'
    platform-tag: 'arm64'
    tag-latest: 'auto'
    tag-suffix: '-cpu-ced'
    runs-on: 'ubuntu-24.04-arm'
    base-image: "ubuntu:24.04"
    skip-drivers: 'false'
    backend: "ced"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  - build-type: 'sycl_f32'
    cuda-major-version: ""
    cuda-minor-version: ""
    platforms: 'linux/amd64'
    tag-latest: 'auto'
    tag-suffix: '-gpu-intel-sycl-f32-ced'
    runs-on: 'ubuntu-latest'
    base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
    skip-drivers: 'false'
    backend: "ced"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  - build-type: 'sycl_f16'
    cuda-major-version: ""
    cuda-minor-version: ""
    platforms: 'linux/amd64'
    tag-latest: 'auto'
    tag-suffix: '-gpu-intel-sycl-f16-ced'
    runs-on: 'ubuntu-latest'
    base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
    skip-drivers: 'false'
    backend: "ced"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  - build-type: 'vulkan'
    cuda-major-version: ""
    cuda-minor-version: ""
    platforms: 'linux/amd64'
    platform-tag: 'amd64'
    tag-latest: 'auto'
    tag-suffix: '-gpu-vulkan-ced'
    runs-on: 'ubuntu-latest'
    base-image: "ubuntu:24.04"
    skip-drivers: 'false'
    backend: "ced"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  - build-type: 'vulkan'
    cuda-major-version: ""
    cuda-minor-version: ""
    platforms: 'linux/arm64'
    platform-tag: 'arm64'
    tag-latest: 'auto'
    tag-suffix: '-gpu-vulkan-ced'
    runs-on: 'ubuntu-24.04-arm'
    base-image: "ubuntu:24.04"
    skip-drivers: 'false'
    backend: "ced"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  - build-type: 'cublas'
    cuda-major-version: "12"
    cuda-minor-version: "0"
    platforms: 'linux/arm64'
    skip-drivers: 'false'
    tag-latest: 'auto'
    tag-suffix: '-nvidia-l4t-arm64-ced'
    base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
    runs-on: 'ubuntu-24.04-arm'
    backend: "ced"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2204'
  - build-type: 'hipblas'
    cuda-major-version: ""
    cuda-minor-version: ""
    platforms: 'linux/amd64'
    tag-latest: 'auto'
    tag-suffix: '-gpu-rocm-hipblas-ced'
    base-image: "rocm/dev-ubuntu-24.04:7.2.1"
    runs-on: 'ubuntu-latest'
    skip-drivers: 'false'
    backend: "ced"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  # acestep-cpp
  - build-type: ''
    cuda-major-version: ""
@@ -4754,6 +4902,10 @@ includeDarwin:
    tag-suffix: "-metal-darwin-arm64-parakeet-cpp"
    build-type: "metal"
    lang: "go"
  - backend: "ced"
    tag-suffix: "-metal-darwin-arm64-ced"
    build-type: "metal"
    lang: "go"
  - backend: "acestep-cpp"
    tag-suffix: "-metal-darwin-arm64-acestep-cpp"
    build-type: "metal"
--- a/.github/workflows/backend.yml
+++ b/.github/workflows/backend.yml
@@ -44,7 +44,7 @@ jobs:
      has-merges-singlearch: ${{ steps.set-matrix.outputs['has-merges-singlearch'] }}
    steps:
      - name: Checkout repository
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
      - name: Setup Bun
        uses: oven-sh/setup-bun@v2
--- a/.github/workflows/backend_build.yml
+++ b/.github/workflows/backend_build.yml
@@ -101,7 +101,7 @@ jobs:
    steps:
      - name: Checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
--- a/.github/workflows/backend_build_darwin.yml
+++ b/.github/workflows/backend_build_darwin.yml
@@ -57,7 +57,7 @@ jobs:
      HOMEBREW_NO_ANALYTICS: '1'
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
--- a/.github/workflows/backend_merge.yml
+++ b/.github/workflows/backend_merge.yml
@@ -49,7 +49,7 @@ jobs:
      # Sparse checkout: the merge job needs `.github/scripts/` (for the
      # keepalive cleanup script) but none of the source tree.
      - name: Checkout (.github/scripts only)
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          sparse-checkout: |
            .github/scripts
--- a/.github/workflows/backend_pr.yml
+++ b/.github/workflows/backend_pr.yml
@@ -23,7 +23,7 @@ jobs:
      has-merges-singlearch: ${{ steps.set-matrix.outputs['has-merges-singlearch'] }}
    steps:
      - name: Checkout repository
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
      - name: Setup Bun
        uses: oven-sh/setup-bun@v2
--- a/.github/workflows/base-images.yml
+++ b/.github/workflows/base-images.yml
@@ -127,7 +127,7 @@ jobs:
            # the original l4t matrix entry which set skip-drivers: 'true'.
            skip-drivers: 'true'
    steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v7
        with:
          submodules: false
      - name: Free disk space
--- a/.github/workflows/build-test.yaml
+++ b/.github/workflows/build-test.yaml
@@ -11,7 +11,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          fetch-depth: 0
      - name: Set up Go
@@ -25,7 +25,7 @@ jobs:
    runs-on: macos-latest
    steps:
      - name: Checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          fetch-depth: 0
      - name: Set up Go
@@ -47,7 +47,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          fetch-depth: 0
      - name: Configure apt mirror on runner
--- a/.github/workflows/bump-inference-defaults.yml
+++ b/.github/workflows/bump-inference-defaults.yml
@@ -14,7 +14,7 @@ jobs:
  bump:
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v7
      - uses: actions/setup-go@v5
        with:
--- a/.github/workflows/bump_deps.yaml
+++ b/.github/workflows/bump_deps.yaml
@@ -42,6 +42,10 @@ jobs:
            variable: "PARAKEET_VERSION"
            branch: "master"
            file: "backend/go/parakeet-cpp/Makefile"
          - repository: "mudler/ced.cpp"
            variable: "CED_VERSION"
            branch: "master"
            file: "backend/go/ced/Makefile"
          - repository: "mudler/depth-anything.cpp"
            variable: "DEPTHANYTHING_VERSION"
            branch: "master"
@@ -88,7 +92,7 @@ jobs:
            file: "backend/go/vibevoice-cpp/Makefile"
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v7
      - name: Bump dependencies 🔧
        id: bump
        run: |
@@ -124,7 +128,7 @@ jobs:
    if: github.repository == 'mudler/LocalAI'
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v7
      - name: Bump vLLM cu130 wheel pin 🔧
        id: bump
        run: |
--- a/.github/workflows/bump_docs.yaml
+++ b/.github/workflows/bump_docs.yaml
@@ -13,7 +13,7 @@ jobs:
          - repository: "mudler/LocalAI"
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v7
      - name: Bump dependencies 🔧
        run: |
          bash .github/bump_docs.sh ${{ matrix.repository }}
--- a/.github/workflows/checksum_checker.yaml
+++ b/.github/workflows/checksum_checker.yaml
@@ -8,7 +8,7 @@ jobs:
    if: github.repository == 'mudler/LocalAI'
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v7
      - name: Configure apt mirror on runner
        uses: ./.github/actions/configure-apt-mirror
      - name: Install dependencies
--- a/.github/workflows/deploy-explorer.yaml
+++ b/.github/workflows/deploy-explorer.yaml
@@ -16,7 +16,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - uses: actions/setup-go@v5
--- a/.github/workflows/gallery-agent.yaml
+++ b/.github/workflows/gallery-agent.yaml
@@ -31,7 +31,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          token: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/generate_intel_image.yaml
+++ b/.github/workflows/generate_intel_image.yaml
@@ -44,7 +44,7 @@ jobs:
        uses: docker/setup-buildx-action@master
      - name: Checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
      - name: Cache Intel images
        uses: docker/build-push-action@v7
--- a/.github/workflows/gh-pages.yml
+++ b/.github/workflows/gh-pages.yml
@@ -28,7 +28,7 @@ jobs:
      HUGO_VERSION: "0.146.3"
    steps:
      - name: Checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          fetch-depth: 0  # needed for enableGitInfo
          submodules: true
--- a/.github/workflows/image_build.yml
+++ b/.github/workflows/image_build.yml
@@ -80,7 +80,7 @@ jobs:
    steps:
      - name: Checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
      - name: Configure apt mirror on runner
        id: apt_mirror
--- a/.github/workflows/image_merge.yml
+++ b/.github/workflows/image_merge.yml
@@ -36,7 +36,7 @@ jobs:
      # Sparse checkout: needed for .github/scripts/ (the keepalive cleanup
      # script). Skips the rest of the source tree.
      - name: Checkout (.github/scripts only)
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          sparse-checkout: |
            .github/scripts
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -20,7 +20,7 @@ jobs:
  golangci-lint:
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v7
        with:
          # Full history so golangci-lint's new-from-merge-base can reach
          # origin/master and compute the diff against it.
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -10,7 +10,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          fetch-depth: 0
      - name: Set up Go
@@ -28,7 +28,7 @@ jobs:
    runs-on: macos-latest
    steps:
      - name: Checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          fetch-depth: 0
      - name: Set up Go
@@ -46,7 +46,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          fetch-depth: 0
      - name: Configure apt mirror on runner
--- a/.github/workflows/secscan.yaml
+++ b/.github/workflows/secscan.yaml
@@ -14,7 +14,7 @@ jobs:
      GO111MODULE: on
    steps:
      - name: Checkout Source
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        if: ${{ github.actor != 'dependabot[bot]' }}
      - name: Run Gosec Security Scanner
        if: ${{ github.actor != 'dependabot[bot]' }}
--- a/.github/workflows/test-extra.yml
+++ b/.github/workflows/test-extra.yml
@@ -50,7 +50,7 @@ jobs:
      parakeet-cpp: ${{ steps.detect.outputs.parakeet-cpp }}
    steps:
      - name: Checkout repository
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
      - name: Setup Bun
        uses: oven-sh/setup-bun@v2
      - name: Install dependencies
@@ -67,7 +67,7 @@ jobs:
  #   runs-on: ubuntu-latest
  #   steps:
  #     - name: Clone
-  #       uses: actions/checkout@v6
+  #       uses: actions/checkout@v7
  #       with:
  #         submodules: true
  #     - name: Dependencies
@@ -90,7 +90,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -113,7 +113,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -137,7 +137,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -158,7 +158,7 @@ jobs:
  #  runs-on: ubuntu-latest
  #  steps:
  #    - name: Clone
-  #      uses: actions/checkout@v6
+  #      uses: actions/checkout@v7
  #      with:
  #        submodules: true
  #    - name: Dependencies
@@ -178,7 +178,7 @@ jobs:
  #   runs-on: ubuntu-latest
  #   steps:
  #     - name: Clone
-  #       uses: actions/checkout@v6
+  #       uses: actions/checkout@v7
  #       with:
  #         submodules: true
  #     - name: Dependencies
@@ -240,7 +240,7 @@ jobs:
  #           sudo rm -rf "$AGENT_TOOLSDIRECTORY" || true
  #           df -h
  #     - name: Clone
-  #       uses: actions/checkout@v6
+  #       uses: actions/checkout@v7
  #       with:
  #         submodules: true
  #     - name: Dependencies
@@ -265,7 +265,7 @@ jobs:
  #   runs-on: ubuntu-latest
  #   steps:
  #     - name: Clone
-  #       uses: actions/checkout@v6
+  #       uses: actions/checkout@v7
  #       with:
  #         submodules: true
  #     - name: Dependencies
@@ -288,7 +288,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -309,7 +309,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -330,7 +330,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -351,7 +351,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -373,7 +373,7 @@ jobs:
  #   timeout-minutes: 45
  #   steps:
  #     - name: Clone
-  #       uses: actions/checkout@v6
+  #       uses: actions/checkout@v7
  #       with:
  #         submodules: true
  #     - name: Dependencies
@@ -394,7 +394,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -415,7 +415,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -436,7 +436,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -462,7 +462,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -484,7 +484,7 @@ jobs:
    timeout-minutes: 30
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -513,7 +513,7 @@ jobs:
    timeout-minutes: 90
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Setup Go
@@ -530,7 +530,7 @@ jobs:
    timeout-minutes: 90
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Setup Go
@@ -552,7 +552,7 @@ jobs:
    timeout-minutes: 20
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Setup Go
@@ -579,7 +579,7 @@ jobs:
    timeout-minutes: 90
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Setup Go
@@ -604,7 +604,7 @@ jobs:
    timeout-minutes: 90
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Setup Go
@@ -625,7 +625,7 @@ jobs:
    timeout-minutes: 90
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Setup Go
@@ -645,7 +645,7 @@ jobs:
    timeout-minutes: 90
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Setup Go
@@ -664,7 +664,7 @@ jobs:
    timeout-minutes: 90
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Setup Go
@@ -681,7 +681,7 @@ jobs:
    timeout-minutes: 90
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Setup Go
@@ -698,7 +698,7 @@ jobs:
    timeout-minutes: 90
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Setup Go
@@ -741,7 +741,7 @@ jobs:
  #   timeout-minutes: 90
  #   steps:
  #     - name: Clone
-  #       uses: actions/checkout@v6
+  #       uses: actions/checkout@v7
  #       with:
  #         submodules: true
  #     - name: Dependencies
@@ -783,7 +783,7 @@ jobs:
  #   timeout-minutes: 90
  #   steps:
  #     - name: Clone
-  #       uses: actions/checkout@v6
+  #       uses: actions/checkout@v7
  #       with:
  #         submodules: true
  #     - name: Dependencies
@@ -808,7 +808,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -840,7 +840,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -876,7 +876,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -915,7 +915,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -952,7 +952,7 @@ jobs:
    timeout-minutes: 90
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -987,7 +987,7 @@ jobs:
    timeout-minutes: 90
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Setup Go
@@ -1013,7 +1013,7 @@ jobs:
    timeout-minutes: 150
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -1042,7 +1042,7 @@ jobs:
    timeout-minutes: 60
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Setup Go
@@ -1058,7 +1058,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -1091,7 +1091,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -1114,7 +1114,7 @@ jobs:
    timeout-minutes: 90
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -1140,7 +1140,7 @@ jobs:
    timeout-minutes: 90
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -21,7 +21,7 @@ jobs:
        go-version: ['1.26.x']
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Free disk space
@@ -84,7 +84,7 @@ jobs:
        go-version: ['1.26.x']
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Setup Go ${{ matrix.go-version }}
--- a/.github/workflows/tests-aio.yml
+++ b/.github/workflows/tests-aio.yml
@@ -62,7 +62,7 @@ jobs:
          sudo rm -rfv build || true
          df -h
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
--- a/.github/workflows/tests-e2e.yml
+++ b/.github/workflows/tests-e2e.yml
@@ -21,7 +21,7 @@ jobs:
        go-version: ['1.25.x']
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Configure apt mirror on runner
--- a/.github/workflows/tests-pii-ner-e2e.yml
+++ b/.github/workflows/tests-pii-ner-e2e.yml
@@ -0,0 +1,97 @@
 ---
 name: 'PII NER tier E2E (live GGUF, CPU)'
 # Runs the real privacy-filter GGUF NER tier end-to-end on CPU — the gap the
 # hermetic tests/e2e suite cannot cover (it only exercises the in-process
 # pattern tier). Heavy (builds the C++ backend image + downloads a ~2.7 GB
 # GGUF), so it is path-filtered on PRs and otherwise runs nightly / on demand.
 #
 # This drives the container-level harness (tests/e2e-backends) via
 # `make test-extra-backend-privacy-filter`: it builds the privacy-filter image,
 # downloads the model, loads it on CPU, and asserts byte-correct, UTF-8-aligned
 # TokenClassify spans. The complementary HTTP-path specs in tests/e2e
 # (e2e_pii_ner_test.go) Skip unless PII_NER_MODEL_GGUF is wired.
 on:
  workflow_dispatch:
  schedule:
    - cron: '0 3 * * *'
  push:
    branches:
      - master
    paths:
      - 'backend/cpp/privacy-filter/**'
      - 'backend/Dockerfile.privacy-filter'
      - 'core/services/routing/pii/**'
      - 'core/services/routing/piidetector/**'
      - 'core/backend/token_classify.go'
      - 'core/http/endpoints/localai/pii.go'
      - 'core/schema/pii.go'
      - 'tests/e2e-backends/**'
      - 'tests/e2e/e2e_pii_ner_test.go'
      - 'tests/e2e/e2e_suite_test.go'
      - '.github/workflows/tests-pii-ner-e2e.yml'
  pull_request:
    paths:
      - 'backend/cpp/privacy-filter/**'
      - 'backend/Dockerfile.privacy-filter'
      - 'core/services/routing/pii/**'
      - 'core/services/routing/piidetector/**'
      - 'core/backend/token_classify.go'
      - 'core/http/endpoints/localai/pii.go'
      - 'core/schema/pii.go'
      - 'tests/e2e-backends/**'
      - 'tests/e2e/e2e_pii_ner_test.go'
      - 'tests/e2e/e2e_suite_test.go'
      - '.github/workflows/tests-pii-ner-e2e.yml'
 concurrency:
  group: ci-tests-pii-ner-e2e-${{ github.event.pull_request.number || github.sha }}-${{ github.repository }}
  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
 jobs:
  tests-pii-ner-e2e:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        go-version: ['1.25.x']
    steps:
      - name: Clone
        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Free disk space
        run: |
          sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache/CodeQL || true
          sudo docker image prune --all --force || true
          df -h
      - name: Configure apt mirror on runner
        uses: ./.github/actions/configure-apt-mirror
      - name: Setup Go ${{ matrix.go-version }}
        uses: actions/setup-go@v5
        with:
          go-version: ${{ matrix.go-version }}
          cache: false
      - name: Proto Dependencies
        run: |
          curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v26.1/protoc-26.1-linux-x86_64.zip -o protoc.zip && \
          unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
          rm protoc.zip
          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
          PATH="$PATH:$HOME/go/bin" make protogen-go
      - name: Dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y build-essential
      # Builds local-ai-backend:privacy-filter, downloads the GGUF, loads it on
      # CPU and runs the token_classify capability spec (byte-offset contract).
      - name: Run live PII NER backend E2E
        run: PATH="$PATH:$HOME/go/bin" make test-extra-backend-privacy-filter
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
        uses: mxschmitt/action-tmate@v3.23
        with:
          detached: true
          connect-timeout-seconds: 180
          limit-access-to-actor: true
--- a/.github/workflows/tests-ui-e2e.yml
+++ b/.github/workflows/tests-ui-e2e.yml
@@ -23,7 +23,7 @@ jobs:
        go-version: ['1.26.x']
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Configure apt mirror on runner
--- a/.github/workflows/update_swagger.yaml
+++ b/.github/workflows/update_swagger.yaml
@@ -10,7 +10,7 @@ jobs:
      fail-fast: false
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v7
      - name: Configure apt mirror on runner
        uses: ./.github/actions/configure-apt-mirror
      - uses: actions/setup-go@v5
--- a/.gitignore
+++ b/.gitignore
@@ -91,3 +91,6 @@ core/http/react-ui/test-results/
 # Local worktrees
 .worktrees/
 # SDD / brainstorm scratch (agent-driven development)
 .superpowers/
--- a/10
+++ b/10
@@ -690,6 +690,16 @@ test-extra-backend-llama-cpp-transcription: docker-build-llama-cpp
 	BACKEND_TEST_CTX_SIZE=2048 \
 	$(MAKE) test-extra-backend
 ## privacy-filter: the PII/NER token-classification backend. Exercises the
 ## TokenClassify RPC and asserts byte-correct, UTF-8-aligned span offsets
 ## against the openai-privacy-filter multilingual GGUF (CPU-runnable, ~50M
 ## active params). This is the live-backend coverage for the PII NER tier.
 test-extra-backend-privacy-filter: docker-build-privacy-filter
 	BACKEND_IMAGE=local-ai-backend:privacy-filter \
 	BACKEND_TEST_MODEL_URL=https://huggingface.co/LocalAI-io/privacy-filter-multilingual-GGUF/resolve/main/privacy-filter-multilingual-f16.gguf \
 	BACKEND_TEST_CAPS=health,load,token_classify \
 	$(MAKE) test-extra-backend
 ## vllm is resolved from a HuggingFace model id (no file download) and
 ## exercises Predict + streaming + tool-call extraction via the hermes parser.
 ## Requires a host CPU with the SIMD instructions the prebuilt vllm CPU
--- a/README.md
+++ b/README.md
@@ -231,6 +231,7 @@ Most backends wrap a best-in-class upstream engine. A handful of them are native
 | Backend | What it does |
 |---------|-------------|
 | [parakeet.cpp](https://github.com/mudler/parakeet.cpp) | C++/GGML port of NVIDIA NeMo Parakeet ASR (tdt/ctc/rnnt/hybrid), with cache-aware streaming transcription |
 | [ced.cpp](https://github.com/mudler/ced.cpp) | C++/GGML port of the CED audio-tagging models: sound-event classification (527-class AudioSet) over REST and the realtime API for live recognition |
 | [voxtral.c](https://github.com/mudler/voxtral.c) | Voxtral Realtime 4B speech-to-text in pure C |
 | [vibevoice.cpp](https://github.com/mudler/vibevoice.cpp) | Native port of Microsoft VibeVoice for TTS (voice cloning) and long-form ASR with speaker diarization |
 | [rf-detr.cpp](https://github.com/mudler/rf-detr.cpp) | Native RF-DETR object detection and instance segmentation |
@@ -240,6 +241,8 @@ Most backends wrap a best-in-class upstream engine. A handful of them are native
 | [LocalVQE](https://github.com/localai-org/LocalVQE) | Joint acoustic echo cancellation, noise suppression, and dereverberation |
 | [local-store](https://github.com/mudler/LocalAI) | Local-first vector database for embeddings (shipped in-tree) |
 We also maintain [apex-quant](https://github.com/localai-org/apex-quant), a per-tensor, per-layer quantization recipe for Mixture-of-Experts models that exploits their structural sparsity to produce GGUFs matching or beating Q8_0 quality - and they run out of the box on stock llama.cpp.
 ## Resources
 - [Documentation](https://localai.io/)
--- a/backend/Dockerfile.golang
+++ b/backend/Dockerfile.golang
@@ -65,7 +65,12 @@ RUN <<EOT bash
            libwayland-dev libxrandr-dev libxcb-randr0-dev libxcb-ewmh-dev \
            git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \
            ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \
-            clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils
+            clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils && \
        apt-get install -y mesa-vulkan-drivers libdrm2
        # Mesa Vulkan ICD drivers (ANV/RADV/lavapipe) + their manifests. The
        # LunarG SDK below only provides the loader and shader tooling, not
        # hardware drivers — without Mesa, package-gpu-libs.sh has no ICD to
        # bundle and the packaged backend finds no GPU at runtime.
        if [ "amd64" = "$TARGETARCH" ]; then
            wget "https://sdk.lunarg.com/sdk/download/1.4.335.0/linux/vulkansdk-linux-x86_64-1.4.335.0.tar.xz" && \
            tar -xf vulkansdk-linux-x86_64-1.4.335.0.tar.xz && \
--- a/backend/Dockerfile.python
+++ b/backend/Dockerfile.python
@@ -66,7 +66,12 @@ RUN <<EOT bash
            libwayland-dev libxrandr-dev libxcb-randr0-dev libxcb-ewmh-dev \
            git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \
            ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \
-            clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils
+            clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils && \
        apt-get install -y mesa-vulkan-drivers libdrm2
        # Mesa Vulkan ICD drivers (ANV/RADV/lavapipe) + their manifests. The
        # LunarG SDK below only provides the loader and shader tooling, not
        # hardware drivers — without Mesa, package-gpu-libs.sh has no ICD to
        # bundle and the packaged backend finds no GPU at runtime.
        if [ "amd64" = "$TARGETARCH" ]; then
            wget "https://sdk.lunarg.com/sdk/download/1.4.335.0/linux/vulkansdk-linux-x86_64-1.4.335.0.tar.xz" && \
            tar -xf vulkansdk-linux-x86_64-1.4.335.0.tar.xz && \
--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -24,6 +24,9 @@ service Backend {
  rpc TokenizeString(PredictOptions) returns (TokenizationResponse) {}
  rpc Status(HealthMessage) returns (StatusResponse) {}
  rpc Detect(DetectOptions) returns (DetectResponse) {}
  // SoundDetection runs an audio-tagging / sound-event-classification model
  // (e.g. CED over the AudioSet ontology) on a clip and returns scored labels.
  rpc SoundDetection(SoundDetectionRequest) returns (SoundDetectionResponse) {}
  rpc Depth(DepthRequest) returns (DepthResponse) {}
  rpc FaceVerify(FaceVerifyRequest) returns (FaceVerifyResponse) {}
  rpc FaceAnalyze(FaceAnalyzeRequest) returns (FaceAnalyzeResponse) {}
@@ -671,6 +674,24 @@ message DetectResponse {
  repeated Detection Detections = 1;
 }
 // --- Sound-event classification / audio tagging messages (CED) ---
 message SoundDetectionRequest {
  string src = 1;       // audio file path (LocalAI writes the upload to disk)
  int32 top_k = 2;      // number of top tags to return (0 = all classes)
  float threshold = 3;  // optional: drop tags scoring below this
 }
 message SoundClass {
  string label = 1;     // AudioSet class name, e.g. "Baby cry, infant cry"
  float score = 2;      // per-class probability (multi-label, independent)
  int32 index = 3;      // class index in the model ontology
 }
 message SoundDetectionResponse {
  repeated SoundClass detections = 1;  // score-descending
 }
 // --- Depth estimation messages (Depth Anything 3) ---
 message DepthRequest {
--- a/backend/cpp/ik-llama-cpp/Makefile
+++ b/backend/cpp/ik-llama-cpp/Makefile
@@ -1,5 +1,5 @@
-IK_LLAMA_VERSION?=b3dfb7858cfcb9166e92f366e5af87f19ebc94be
+IK_LLAMA_VERSION?=6c00e87ac84404af588ad2e65935bd6f079c696f
 LLAMA_REPO?=https://github.com/ikawrakow/ik_llama.cpp
 CMAKE_ARGS?=
--- a/backend/cpp/llama-cpp/Makefile
+++ b/backend/cpp/llama-cpp/Makefile
@@ -1,5 +1,5 @@
-LLAMA_VERSION?=f3e182816421c648188b5eab269853bf1531d950
+LLAMA_VERSION?=73618f27a801c0b8614ceaf3547d3c2a99baae14
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
 CMAKE_ARGS?=
--- a/backend/cpp/llama-cpp/grpc-server.cpp
+++ b/backend/cpp/llama-cpp/grpc-server.cpp
@@ -18,6 +18,18 @@
 #if __has_include("server-chat.cpp")
 #include "server-chat.cpp"
 #endif
 // server-schema.cpp exists only in llama.cpp after the upstream refactor that
 // extracted the JSON request-schema evaluation (previously the static
 // server_task::params_from_json_cmpl) into server_schema::eval_llama_cmpl_schema.
 // server-context.cpp and grpc-server.cpp both call into it, so its definitions
 // must be part of this translation unit or the link fails. __has_include keeps
 // the source compatible with older pins/forks (e.g. llama-cpp-turboquant) that
 // predate the split and still expose params_from_json_cmpl (see the guarded
 // call sites below).
 #if __has_include("server-schema.cpp")
 #define LOCALAI_HAS_SERVER_SCHEMA 1
 #include "server-schema.cpp"
 #endif
 #include "server-context.cpp"
 // LocalAI
@@ -2102,7 +2114,11 @@ public:
                task.index = i;
                task.tokens    = std::move(inputs[i]);
 #ifdef LOCALAI_HAS_SERVER_SCHEMA
                task.params           = server_schema::eval_llama_cmpl_schema(
 #else
                task.params           = server_task::params_from_json_cmpl(
 #endif
                        ctx_server.impl->vocab,
                        params_base,
                        ctx_server.get_meta().slot_n_ctx,
@@ -2116,7 +2132,7 @@ public:
                // cannot detect tool calls or separate reasoning from content.
                task.params.res_type                 = TASK_RESPONSE_TYPE_OAI_CHAT;
                task.params.oaicompat_cmpl_id         = completion_id;
-                // oaicompat_model is already populated by params_from_json_cmpl
+                // oaicompat_model is already populated by eval_llama_cmpl_schema
                tasks.push_back(std::move(task));
            }
@@ -2940,7 +2956,11 @@ public:
                task.index = i;
                task.tokens    = std::move(inputs[i]);
 #ifdef LOCALAI_HAS_SERVER_SCHEMA
                task.params           = server_schema::eval_llama_cmpl_schema(
 #else
                task.params           = server_task::params_from_json_cmpl(
 #endif
                        ctx_server.impl->vocab,
                        params_base,
                        ctx_server.get_meta().slot_n_ctx,
@@ -2952,7 +2972,7 @@ public:
                // reasoning, tool calls, and content are classified into ChatDeltas.
                task.params.res_type                 = TASK_RESPONSE_TYPE_OAI_CHAT;
                task.params.oaicompat_cmpl_id         = completion_id;
-                // oaicompat_model is already populated by params_from_json_cmpl
+                // oaicompat_model is already populated by eval_llama_cmpl_schema
                tasks.push_back(std::move(task));
            }
--- a/backend/cpp/privacy-filter/Makefile
+++ b/backend/cpp/privacy-filter/Makefile
@@ -8,7 +8,7 @@
 # Local development: point at a working checkout instead of cloning, e.g.
 #   make PRIVACY_FILTER_SRC=$HOME/c/privacy-filter.cpp grpc-server
-PRIVACY_FILTER_VERSION?=646342f7a59c6b7d195185eac60bad762e572f1d
+PRIVACY_FILTER_VERSION?=98f52c5ef2250f207cc6b9a6aef05393a120cb7c
 PRIVACY_FILTER_REPO?=https://github.com/localai-org/privacy-filter.cpp
 PRIVACY_FILTER_SRC?=
--- a/backend/go/ced/.gitignore
+++ b/backend/go/ced/.gitignore
@@ -0,0 +1,11 @@
 .cache/
 sources/
 build/
 package/
 ced-grpc
 # build artifacts staged in-tree by the Makefile (cp from sources/) or
 # symlinked for local dev; the real sources live in ced.cpp upstream.
 *.so
 *.so.*
 ced_capi.h
 compile_commands.json
--- a/backend/go/ced/Makefile
+++ b/backend/go/ced/Makefile
@@ -0,0 +1,77 @@
 # ced sound-classification backend Makefile.
 #
 # Upstream pin lives below as CED_VERSION?=<sha> so .github/bump_deps.sh can find
 # and update it (matches the parakeet-cpp / whisper.cpp convention).
 #
 # Local dev shortcut: symlink an out-of-tree ced.cpp shared build + header and
 # skip the clone/cmake steps entirely:
 #   ln -sf /path/to/ced.cpp/build-shared/libced.so .
 #   ln -sf /path/to/ced.cpp/include/ced_capi.h .
 #   go build -o ced-grpc .
 CED_VERSION?=c04ac14b7992d00584d9e812c9bb6268598a6ce7
 CED_REPO?=https://github.com/mudler/ced.cpp
 GOCMD?=go
 GO_TAGS?=
 JOBS?=$(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)
 BUILD_TYPE?=
 NATIVE?=false
 # Static-link ggml into libced.so (PIC) so the shared lib is self-contained:
 # dlopen needs no libggml*.so alongside it, only system libs the runtime image
 # already provides.
 CMAKE_ARGS?=-DCMAKE_BUILD_TYPE=Release -DCED_SHARED=ON -DCED_BUILD_CLI=OFF -DCED_BUILD_TESTS=OFF -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON
 ifeq ($(NATIVE),false)
 	CMAKE_ARGS+=-DGGML_NATIVE=OFF
 endif
 # ced.cpp gates its ggml backends behind CED_GGML_* options (set(... CACHE BOOL
 # "" FORCE)), so forward those instead of a bare -DGGML_CUDA=ON.
 ifeq ($(BUILD_TYPE),cublas)
 	CMAKE_ARGS+=-DCED_GGML_CUDA=ON -DGGML_CUDA_GRAPHS=ON
 else ifeq ($(BUILD_TYPE),openblas)
 	CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
 else ifeq ($(BUILD_TYPE),hipblas)
 	CMAKE_ARGS+=-DCED_GGML_HIP=ON
 else ifeq ($(BUILD_TYPE),vulkan)
 	CMAKE_ARGS+=-DCED_GGML_VULKAN=ON
 endif
 .PHONY: ced-grpc package build clean purge test all
 all: ced-grpc
 # Fetch the upstream sources at exactly $(CED_VERSION): initialise an empty
 # repository, shallow-fetch (depth 1) only the pinned commit from $(CED_REPO),
 # check it out, then pull its submodules recursively (also at depth 1).
 sources/ced.cpp:
 	mkdir -p sources/ced.cpp
 	cd sources/ced.cpp && \
 	git init -q && \
 	git remote add origin $(CED_REPO) && \
 	git fetch --depth 1 origin $(CED_VERSION) && \
 	git checkout FETCH_HEAD && \
 	git submodule update --init --recursive --depth 1 --single-branch
 # Configure and build the shared library out-of-tree in
 # sources/ced.cpp/build-shared using $(CMAKE_ARGS), then stage the resulting
 # libced.so* files and the ced_capi.h header next to the Go sources.
 # NOTE(review): the library copy ends in `|| true` with stderr discarded, so a
 # missing libced.so* does not fail this target; package.sh is what reports it.
 libced.so: sources/ced.cpp
 	cmake -B sources/ced.cpp/build-shared -S sources/ced.cpp $(CMAKE_ARGS)
 	cmake --build sources/ced.cpp/build-shared --config Release -j$(JOBS)
 	cp -fv sources/ced.cpp/build-shared/libced.so* ./ 2>/dev/null || true
 	cp -fv sources/ced.cpp/include/ced_capi.h ./
 ced-grpc: libced.so main.go goced.go
 	CGO_ENABLED=0 $(GOCMD) build -tags "$(GO_TAGS)" -o ced-grpc .
 package: ced-grpc
 	bash package.sh
 build: package
 test:
 	LD_LIBRARY_PATH=$(CURDIR):$$LD_LIBRARY_PATH $(GOCMD) test ./... -count=1
 clean: purge
 	rm -rf libced.so* ced_capi.h package ced-grpc
 purge:
 	rm -rf sources/ced.cpp
--- a/backend/go/ced/goced.go
+++ b/backend/go/ced/goced.go
@@ -0,0 +1,130 @@
 package main
 // Go side of the ced backend: purego bindings over ced_capi.h plus the gRPC
 // SoundDetection implementation.
 //
 // SKETCH: the pb.SoundDetection* types come from backend.proto (regenerate with
 // `make protogen-go`). The C side is single-threaded per ctx, so we guard the
 // engine with engineMu; LocalAI also serializes via base.SingleThread.
 import (
 	"context"
 	"encoding/json"
 	"errors"
 	"fmt"
 	"sort"
 	"sync"
 	"unsafe"
 	"github.com/mudler/LocalAI/pkg/grpc/base"
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 )
 // purego-bound entry points from libced.so. Each variable is nil until main
 // binds it to the matching ced_capi_* symbol with purego.RegisterLibFunc, so
 // none of them may be called before that registration has run.
 //
 // The functions declared as returning uintptr (the two Classify*JSON calls)
 // hand back a C-owned string address; convert it with cstr, which also
 // releases it through CppFreeString.
 var (
 	CppAbiVersion       func() int32
 	CppLoad             func(ggufPath string) uintptr
 	CppFree             func(ctx uintptr)
 	CppLastError        func(ctx uintptr) string
 	CppNumClasses       func(ctx uintptr) int32
 	CppSampleRate       func(ctx uintptr) int32
 	CppClassifyPathJSON func(ctx uintptr, wavPath string, topK int32) uintptr
 	CppClassifyPcmJSON  func(ctx uintptr, pcm []float32, nSamples int32, sampleRate int32, topK int32) uintptr
 	CppFreeString       func(s uintptr)
 )
 // cstr takes ownership of a NUL-terminated C string handed back by libced as a
 // raw address: it copies the bytes into Go memory and releases the C
 // allocation through CppFreeString (ced_capi_free_string). A zero address
 // yields "".
 func cstr(p uintptr) string {
 	if p == 0 {
 		return ""
 	}
 	defer CppFreeString(p)
 	// First pass: find the terminating NUL to learn the length.
 	n := 0
 	for *(*byte)(unsafe.Pointer(p + uintptr(n))) != 0 { //nolint:govet // #nosec G103 -- C-owned NUL-terminated string from libced (not Go-GC memory)
 		n++
 	}
 	// Second pass: copy the bytes out before the deferred free runs.
 	out := make([]byte, n)
 	for i := range out {
 		out[i] = *(*byte)(unsafe.Pointer(p + uintptr(i))) //nolint:govet // #nosec G103 -- same C-owned buffer as above
 	}
 	return string(out)
 }
 // Ced is the gRPC backend. One loaded CED model per instance.
 //
 // ctxPtr is the opaque engine handle returned by CppLoad; zero means no model
 // is loaded. engineMu is taken around calls into the engine and around Free,
 // which resets ctxPtr to zero.
 type Ced struct {
 	base.Base
 	ctxPtr   uintptr
 	engineMu sync.Mutex
 }
 // Load opens the model named by opts.ModelFile through the C API and stores
 // the resulting engine handle on the backend. It returns an error when no
 // model file is given or when the engine reports a zero handle, in which case
 // the engine's last error text is appended.
 func (c *Ced) Load(opts *pb.ModelOptions) error {
 	modelPath := opts.ModelFile
 	if modelPath == "" {
 		return errors.New("ced: ModelFile is required")
 	}
 	if handle := CppLoad(modelPath); handle != 0 {
 		c.ctxPtr = handle
 		return nil
 	}
 	// No context exists on failure, so the error is queried with a zero handle.
 	return fmt.Errorf("ced: ced_capi_load failed for %q: %s", modelPath, CppLastError(0))
 }
 // jsonTag mirrors the ced_capi JSON tag objects. SoundDetection decodes the
 // classifier output as a JSON array of these and copies Index, Score and Label
 // into pb.SoundClass values.
 type jsonTag struct {
 	Index int     `json:"index"`
 	Score float32 `json:"score"`
 	Label string  `json:"label"`
 }
 // SoundDetection classifies the audio file at req.Src with the loaded CED
 // model and returns its tags ordered by descending score.
 //
 // A non-positive req.TopK is replaced by 10 before calling the engine. Tags
 // scoring below req.Threshold are dropped from the response.
 //
 // It returns an error when no model is loaded, when req.Src is empty, when the
 // engine produces no output (the engine's last error text is included), or
 // when the engine output cannot be decoded as a JSON array of tags.
 func (c *Ced) SoundDetection(ctx context.Context, req *pb.SoundDetectionRequest) (*pb.SoundDetectionResponse, error) {
 	// Hold engineMu for the whole call. Free zeroes ctxPtr under this same
 	// lock, so reading the handle outside it could pass a context that is
 	// being (or has been) freed to the C side.
 	c.engineMu.Lock()
 	defer c.engineMu.Unlock()
 	if c.ctxPtr == 0 {
 		return nil, errors.New("ced: model not loaded")
 	}
 	src := req.GetSrc()
 	if src == "" {
 		return nil, errors.New("ced: SoundDetectionRequest.src (audio path) is required")
 	}
 	topK := req.GetTopK()
 	if topK <= 0 {
 		topK = 10 // sensible default for a tagging response
 	}
 	// cstr copies the returned C string and frees the original. The last
 	// error is read immediately afterwards so it belongs to this call.
 	out := cstr(CppClassifyPathJSON(c.ctxPtr, src, topK))
 	lastErr := CppLastError(c.ctxPtr)
 	if out == "" {
 		return nil, fmt.Errorf("ced: classification failed: %s", lastErr)
 	}
 	var tags []jsonTag
 	if err := json.Unmarshal([]byte(out), &tags); err != nil {
 		return nil, fmt.Errorf("ced: bad classifier JSON: %w", err)
 	}
 	thr := req.GetThreshold()
 	resp := &pb.SoundDetectionResponse{}
 	if len(tags) > 0 {
 		resp.Detections = make([]*pb.SoundClass, 0, len(tags))
 	}
 	for _, t := range tags {
 		if t.Score < thr {
 			continue
 		}
 		resp.Detections = append(resp.Detections, &pb.SoundClass{
 			Label: t.Label, Score: t.Score, Index: int32(t.Index),
 		})
 	}
 	// Stable sort: tags with equal scores keep the order the engine emitted
 	// them in, so the response is deterministic.
 	sort.SliceStable(resp.Detections, func(i, j int) bool {
 		return resp.Detections[i].Score > resp.Detections[j].Score
 	})
 	return resp, nil
 }
 // Free releases the engine context, if one is loaded, and resets the handle to
 // zero so a repeated call is a no-op. It always returns nil.
 func (c *Ced) Free() error {
 	c.engineMu.Lock()
 	defer c.engineMu.Unlock()
 	if c.ctxPtr == 0 {
 		return nil
 	}
 	CppFree(c.ctxPtr)
 	c.ctxPtr = 0
 	return nil
 }
--- a/backend/go/ced/main.go
+++ b/backend/go/ced/main.go
@@ -0,0 +1,59 @@
 package main
 // ced sound-classification backend. Started internally by LocalAI: one gRPC
 // server per loaded model. Loads libced.so via purego and registers the flat
 // C-API declared in ced_capi.h. The library name can be overridden with
 // CED_LIBRARY (mirrors PARAKEET_LIBRARY / WHISPER_LIBRARY); the default looks
 // for the .so next to this binary.
 //
 // SKETCH: requires `make protogen-go` after the backend.proto SoundDetection
 // addition, and a built libced.so (see Makefile). See DESIGN.md.
 import (
 	"flag"
 	"fmt"
 	"os"
 	"github.com/ebitengine/purego"
 	grpc "github.com/mudler/LocalAI/pkg/grpc"
 )
 // addr is the -addr command-line flag; main passes its value to
 // grpc.StartServer.
 var addr = flag.String("addr", "localhost:50051", "the address to connect to")
 // libFunc pairs a pointer to a Go function variable (ptr) with the name of the
 // C symbol (name) that purego.RegisterLibFunc should bind it to.
 type libFunc struct {
 	ptr  any
 	name string
 }
 // main loads the ced shared library (CED_LIBRARY, or "libced.so" when unset),
 // binds every C-API entry point, logs the library's ABI version to stderr,
 // parses the command-line flags and starts the gRPC server on *addr. Any
 // failure to load the library or start the server panics.
 func main() {
 	libName := "libced.so"
 	if override := os.Getenv("CED_LIBRARY"); override != "" {
 		libName = override
 	}
 	lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
 	if err != nil {
 		panic(fmt.Errorf("ced: dlopen %q: %w", libName, err))
 	}
 	// Bound 1:1 to ced_capi.h. char*-returning functions are declared uintptr
 	// so we can free the same pointer with ced_capi_free_string after copying
 	// (purego's string return would copy and leak the original).
 	bindings := []libFunc{
 		{&CppAbiVersion, "ced_capi_abi_version"},
 		{&CppLoad, "ced_capi_load"},
 		{&CppFree, "ced_capi_free"},
 		{&CppLastError, "ced_capi_last_error"},
 		{&CppNumClasses, "ced_capi_num_classes"},
 		{&CppSampleRate, "ced_capi_sample_rate"},
 		{&CppClassifyPathJSON, "ced_capi_classify_path_json"},
 		{&CppClassifyPcmJSON, "ced_capi_classify_pcm_json"},
 		{&CppFreeString, "ced_capi_free_string"},
 	}
 	for i := range bindings {
 		purego.RegisterLibFunc(bindings[i].ptr, lib, bindings[i].name)
 	}
 	fmt.Fprintf(os.Stderr, "[ced] ABI=%d\n", CppAbiVersion())
 	flag.Parse()
 	err = grpc.StartServer(*addr, &Ced{})
 	if err != nil {
 		panic(err)
 	}
 }
--- a/backend/go/ced/package.sh
+++ b/backend/go/ced/package.sh
@@ -0,0 +1,60 @@
 #!/bin/bash
 #
 # Assemble a self-contained package/ directory for the ced backend: the
 # ced-grpc binary, run.sh, libced.so, the core runtime libraries (ld.so, libc,
 # libstdc++, libgomp, ...) for the detected architecture, and the GPU runtime
 # for the active BUILD_TYPE. Mirrors backend/go/parakeet-cpp/package.sh; run.sh
 # routes the (CGO_ENABLED=0) binary through lib/ld.so so the packaged libc is
 # used.
 set -e
 CURDIR=$(dirname "$(realpath "$0")")
 REPO_ROOT="${CURDIR}/../../.."

 # ced_copy_core_runtime LOADER LIBDIR
 # Copies the dynamic loader LOADER to package/lib/ld.so and the fixed set of
 # core runtime libraries from LIBDIR into package/lib, dereferencing symlinks.
 ced_copy_core_runtime() {
 	local loader="$1" libdir="$2" lib
 	cp -arfLv "$loader" "$CURDIR/package/lib/ld.so"
 	for lib in libc.so.6 libgcc_s.so.1 libstdc++.so.6 libm.so.6 libgomp.so.1 libdl.so.2 librt.so.1 libpthread.so.0; do
 		cp -arfLv "$libdir/$lib" "$CURDIR/package/lib/$lib"
 	done
 }

 mkdir -p "$CURDIR/package/lib"
 cp -avf "$CURDIR/ced-grpc" "$CURDIR/package/"
 cp -avf "$CURDIR/run.sh" "$CURDIR/package/"
 if ! cp -avf "$CURDIR"/libced.so* "$CURDIR/package/lib/" 2>/dev/null; then
 	echo "ERROR: libced.so not found in $CURDIR, run 'make' first" >&2
 	exit 1
 fi

 # The architecture is inferred from which dynamic loader exists on the build
 # host; Darwin bundles no core runtime libraries.
 if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
 	echo "Detected x86_64 architecture, copying x86_64 libraries..."
 	ced_copy_core_runtime /lib64/ld-linux-x86-64.so.2 /lib/x86_64-linux-gnu
 elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
 	echo "Detected ARM64 architecture, copying ARM64 libraries..."
 	ced_copy_core_runtime /lib/ld-linux-aarch64.so.1 /lib/aarch64-linux-gnu
 elif [ "$(uname -s)" = "Darwin" ]; then
 	echo "Detected Darwin"
 else
 	echo "Error: Could not detect architecture"
 	exit 1
 fi

 # Bundle the GPU runtime when the shared helper script is present.
 GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
 if [ -f "$GPU_LIB_SCRIPT" ]; then
 	echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
 	source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib"
 	package_gpu_libs
 fi
 echo "Packaging completed successfully"
 ls -liah "$CURDIR/package/" "$CURDIR/package/lib/"
--- a/backend/go/ced/run.sh
+++ b/backend/go/ced/run.sh
@@ -0,0 +1,15 @@
 #!/bin/bash
 # Launcher for the packaged ced backend: puts the bundled lib/ directory (and
 # the package directory itself) first on LD_LIBRARY_PATH, then replaces this
 # shell with ced-grpc, forwarding all arguments.
 set -e
 CURDIR=$(dirname "$(realpath "$0")")
 export LD_LIBRARY_PATH="$CURDIR/lib:$CURDIR:${LD_LIBRARY_PATH:-}"
 # Without a packaged loader, run the binary directly against the host's
 # dynamic loader.
 if [ ! -f "$CURDIR/lib/ld.so" ]; then
 	exec "$CURDIR/ced-grpc" "$@"
 fi
 # A self-contained ld.so was packaged: route through it so the packaged
 # libc / libstdc++ are used instead of the host's (matches the sibling backends).
 echo "Using lib/ld.so"
 exec "$CURDIR/lib/ld.so" "$CURDIR/ced-grpc" "$@"
--- a/backend/go/crispasr/Makefile
+++ b/backend/go/crispasr/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
 # CrispASR version (release tag)
 CRISPASR_REPO?=https://github.com/CrispStrobe/CrispASR
-CRISPASR_VERSION?=d745bda4386ae0f9d1d2f23fff8ec95d76428221
+CRISPASR_VERSION?=63b57289255267edf66e43e33bc3911e04a2e92d
 SO_TARGET?=libgocrispasr.so
 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
@@ -67,7 +67,7 @@ sources/CrispASR:
 	# it, so ${CMAKE_SOURCE_DIR} is THIS backend dir and the talk-llama sources
 	# aren't found. Rewrite to ${PROJECT_SOURCE_DIR} (the crispasr project root),
 	# which is correct both standalone and as a subproject. Idempotent.
-	sed -i 's#\$${CMAKE_SOURCE_DIR}/examples/talk-llama#\$${PROJECT_SOURCE_DIR}/examples/talk-llama#' sources/CrispASR/src/CMakeLists.txt
+	sed -i.bak 's#\$${CMAKE_SOURCE_DIR}/examples/talk-llama#\$${PROJECT_SOURCE_DIR}/examples/talk-llama#' sources/CrispASR/src/CMakeLists.txt && rm -f sources/CrispASR/src/CMakeLists.txt.bak
 # Detect OS
 UNAME_S := $(shell uname -s)
--- a/backend/go/crispasr/cpp/crispasr_shim.cpp
+++ b/backend/go/crispasr/cpp/crispasr_shim.cpp
@@ -47,6 +47,74 @@ extern "C" void set_abort(int v) {
  g_abort.store(v, std::memory_order_relaxed);
 }
 // --- word-level timestamp accessors ---
 extern "C" {
 int crispasr_session_result_n_words(crispasr_session_result *r, int seg_i);
 const char *crispasr_session_result_word_text(crispasr_session_result *r,
                                               int seg_i, int word_i);
 int64_t crispasr_session_result_word_t0(crispasr_session_result *r, int seg_i,
                                         int word_i);
 int64_t crispasr_session_result_word_t1(crispasr_session_result *r, int seg_i,
                                         int word_i);
 // Parakeet-specific word accessors
 int crispasr_parakeet_result_n_words(void *r);
 const char *crispasr_parakeet_result_word_text(void *r, int word_i);
 int64_t crispasr_parakeet_result_word_t0(void *r, int word_i);
 int64_t crispasr_parakeet_result_word_t1(void *r, int word_i);
 }
 // Returns the current global result handle (may be null).
 void *get_result(void) { return g_result; }
 // Session-based word accessors. Each one forwards to the matching
 // crispasr_session_result_word_* function on the global result and falls back
 // to a neutral value (0 or "") when there is no result.
 int get_word_count(int seg_i) {
   return g_result ? crispasr_session_result_n_words(g_result, seg_i) : 0;
 }
 const char *get_word_text(int seg_i, int word_i) {
   return g_result ? crispasr_session_result_word_text(g_result, seg_i, word_i)
                   : "";
 }
 int64_t get_word_t0(int seg_i, int word_i) {
   return g_result ? crispasr_session_result_word_t0(g_result, seg_i, word_i)
                   : 0;
 }
 int64_t get_word_t1(int seg_i, int word_i) {
   return g_result ? crispasr_session_result_word_t1(g_result, seg_i, word_i)
                   : 0;
 }
 // Parakeet-specific word accessors. These take no segment index; each one
 // forwards to the matching crispasr_parakeet_result_word_* function on the
 // global result and falls back to a neutral value (0 or "") when there is no
 // result.
 int get_parakeet_word_count(void) {
   return g_result ? crispasr_parakeet_result_n_words(g_result) : 0;
 }
 const char *get_parakeet_word_text(int word_i) {
   return g_result ? crispasr_parakeet_result_word_text(g_result, word_i) : "";
 }
 int64_t get_parakeet_word_t0(int word_i) {
   return g_result ? crispasr_parakeet_result_word_t0(g_result, word_i) : 0;
 }
 int64_t get_parakeet_word_t1(int word_i) {
   return g_result ? crispasr_parakeet_result_word_t1(g_result, word_i) : 0;
 }
 static void ggml_log_cb(enum ggml_log_level level, const char *log,
                        void *data) {
  const char *level_str;
--- a/backend/go/crispasr/cpp/crispasr_shim.h
+++ b/backend/go/crispasr/cpp/crispasr_shim.h
@@ -20,4 +20,18 @@ float *tts_synthesize(const char *text, int *out_n_samples); // 24kHz mono float
 void tts_free(float *pcm);
 int tts_set_voice(const char *name); // best-effort speaker selection; 0 ok
 int tts_set_voice_file(const char *path, const char *ref_text); // load voice pack (.gguf) or zero-shot clone (.wav + ref_text)
 // --- word-level timestamp accessors ---
 // Session-based (works for whisper-like backends)
 void *get_result(void);
 int get_word_count(int seg_i);
 const char *get_word_text(int seg_i, int word_i);
 int64_t get_word_t0(int seg_i, int word_i);
 int64_t get_word_t1(int seg_i, int word_i);
 // Parakeet-specific (global word list, no segment index)
 int get_parakeet_word_count(void);
 const char *get_parakeet_word_text(int word_i);
 int64_t get_parakeet_word_t0(int word_i);
 int64_t get_parakeet_word_t1(int word_i);
 }
--- a/backend/go/crispasr/gocrispasr.go
+++ b/backend/go/crispasr/gocrispasr.go
@@ -34,6 +34,18 @@ var (
 	CppTTSFree         func(ptr uintptr)
 	CppTTSSetVoice     func(name string) int
 	CppTTSSetVoiceFile func(path string, refText string) int
 	// Word-level timestamp accessors (session-based, per-segment)
 	CppGetWordCount func(segI int) int
 	CppGetWordText  func(segI int, wordI int) string
 	CppGetWordT0    func(segI int, wordI int) int64
 	CppGetWordT1    func(segI int, wordI int) int64
 	// Parakeet-specific word accessors (global, no segment index)
 	CppGetParakeetWordCount func() int
 	CppGetParakeetWordText  func(wordI int) string
 	CppGetParakeetWordT0    func(wordI int) int64
 	CppGetParakeetWordT1    func(wordI int) int64
 )
 type CrispASR struct {
@@ -212,6 +224,28 @@ func (w *CrispASR) VAD(req *pb.VADRequest) (pb.VADResponse, error) {
 	}, nil
 }
 // isValidWord reports whether a TranscriptWord carries usable speech content.
 // The parakeet-specific word accessors can return stale initialisation data
 // (model name, binary blobs) when a segment has no real speech, so a word is
 // rejected when any of the following holds:
 //   - its text is empty after trimming whitespace,
 //   - its text contains a U+FFFD replacement character (scrubbed binary data),
 //   - either timestamp is negative,
 //   - its duration is not positive (end <= start).
 func isValidWord(w *pb.TranscriptWord) bool {
 	text := strings.TrimSpace(w.Text)
 	switch {
 	case text == "":
 		return false
 	case strings.ContainsRune(text, '\uFFFD'):
 		return false
 	case w.Start < 0, w.End < 0, w.End <= w.Start:
 		return false
 	}
 	return true
 }
 func (w *CrispASR) AudioTranscription(ctx context.Context, opts *pb.TranscriptRequest) (pb.TranscriptResult, error) {
 	if err := ctx.Err(); err != nil {
 		return pb.TranscriptResult{}, status.Error(codes.Canceled, "transcription cancelled")
@@ -290,15 +324,54 @@ func (w *CrispASR) AudioTranscription(ctx context.Context, opts *pb.TranscriptRe
 		// IDs, so Tokens is left empty.
 		txt := strings.ToValidUTF8(strings.Clone(CppGetSegmentText(i)), "<22>")
 		// Collect word-level timestamps for this segment. The session-based
 		// accessors (indexed per segment) are tried first; when they report no
 		// words for the first segment, fall back to the parakeet-specific
 		// accessors, which expose one global word list with no segment index
 		// and are therefore consulted for segment 0 only to avoid duplication.
 		words := []*pb.TranscriptWord{}
 		// keepWord builds a word from raw engine values (timestamps scaled by
 		// 10000000, text cloned and scrubbed to valid UTF-8) and keeps it only
 		// when isValidWord accepts it.
 		keepWord := func(t0, t1 int64, raw string) {
 			word := &pb.TranscriptWord{
 				Start: t0 * (10000000),
 				End:   t1 * (10000000),
 				Text:  strings.ToValidUTF8(strings.Clone(raw), "<22>"),
 			}
 			if isValidWord(word) {
 				words = append(words, word)
 			}
 		}
 		wordCount := CppGetWordCount(i)
 		if wordCount == 0 && i == 0 {
 			wordCount = CppGetParakeetWordCount()
 			for j := 0; j < wordCount; j++ {
 				keepWord(CppGetParakeetWordT0(j), CppGetParakeetWordT1(j), CppGetParakeetWordText(j))
 			}
 		} else {
 			for j := 0; j < wordCount; j++ {
 				keepWord(CppGetWordT0(i, j), CppGetWordT1(i, j), CppGetWordText(i, j))
 			}
 		}
 		// Skip empty segments with no recognisable content (e.g. trailing
 		// silence segments that parakeet emits with stale init data).
 		trimmed := strings.TrimSpace(txt)
 		if trimmed == "" && len(words) == 0 {
 			continue
 		}
 		segment := &pb.TranscriptSegment{
 			Id:    int32(i),
 			Text:  txt,
 			Start: s, End: t,
 			Words: words,
 		}
 		segments = append(segments, segment)
-		text += " " + strings.TrimSpace(txt)
+		text += " " + trimmed
 	}
 	return pb.TranscriptResult{
@@ -390,13 +463,20 @@ func (w *CrispASR) AudioTranscriptionStream(ctx context.Context, opts *pb.Transc
 		s := CppGetSegmentStart(i) * 10000000
 		t := CppGetSegmentEnd(i) * 10000000
 		txt := strings.ToValidUTF8(strings.Clone(CppGetSegmentText(i)), "<22>")
 		// Skip empty segments (e.g. trailing silence that parakeet emits
 		// with stale init data): no text and a zero-length time span.
 		trimmed := strings.TrimSpace(txt)
 		if trimmed == "" && s == t {
 			continue
 		}
 		segments = append(segments, &pb.TranscriptSegment{
 			Id:    int32(i),
 			Text:  txt,
 			Start: s, End: t,
 		})
 		// Reuse trimmed from the guard above; declaring it a second time with
 		// := in this scope is a compile error. A whitespace-only segment that
 		// spans real time is recorded above, but the rest of this iteration is
 		// skipped for it.
 		if trimmed == "" {
 			continue
 		}
--- a/backend/go/crispasr/main.go
+++ b/backend/go/crispasr/main.go
@@ -44,6 +44,14 @@ func main() {
 		{&CppTTSFree, "tts_free"},
 		{&CppTTSSetVoice, "tts_set_voice"},
 		{&CppTTSSetVoiceFile, "tts_set_voice_file"},
 		{&CppGetWordCount, "get_word_count"},
 		{&CppGetWordText, "get_word_text"},
 		{&CppGetWordT0, "get_word_t0"},
 		{&CppGetWordT1, "get_word_t1"},
 		{&CppGetParakeetWordCount, "get_parakeet_word_count"},
 		{&CppGetParakeetWordText, "get_parakeet_word_text"},
 		{&CppGetParakeetWordT0, "get_parakeet_word_t0"},
 		{&CppGetParakeetWordT1, "get_parakeet_word_t1"},
 	}
 	for _, lf := range libFuncs {
--- a/backend/go/depth-anything-cpp/Makefile
+++ b/backend/go/depth-anything-cpp/Makefile
@@ -8,11 +8,13 @@ JOBS?=$(shell nproc --ignore=1)
 # depth-anything.cpp. Pin to a specific commit for a stable build; a squash
 # merge upstream can orphan a branch, so the native version is pinned by SHA.
-# This SHA adds the nested two-file metric C-API (abi_version 4,
+# This SHA adds the Depth Anything V2 engine + C-API routing (depth-only,
-# da_capi_load_nested) required by the depth-anything-3-nested gallery model;
+# relative + metric) on top of the nested two-file metric C-API (abi_version 4,
-# tag it (e.g. v0.1.3) upstream to keep the SHA alive.
+# da_capi_load_nested) required by the depth-anything-3-nested gallery model.
 # It is kept alive by the upstream tag da2-support (survives a squash-merge);
 # repoint to the master merge commit once mudler/depth-anything.cpp PR #1 lands.
 DEPTHANYTHING_REPO?=https://github.com/mudler/depth-anything.cpp.git
-DEPTHANYTHING_VERSION?=cce5edc395fd1843806093d7ccc0c8b0d0b97b72
+DEPTHANYTHING_VERSION?=f4e17dea695dd12ae76bea98ba58030996b98118
 ifeq ($(NATIVE),false)
 	CMAKE_ARGS+=-DGGML_NATIVE=OFF
--- a/backend/go/omnivoice-cpp/Makefile
+++ b/backend/go/omnivoice-cpp/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
 # omnivoice.cpp version
 OMNIVOICE_REPO?=https://github.com/ServeurpersoCom/omnivoice.cpp
-OMNIVOICE_VERSION?=2603355a5dfacae5cfc33531d5d0933221843509
+OMNIVOICE_VERSION?=96d30169afd5e6bb3fd6a0e9be0eb505bfe81fcd
 SO_TARGET?=libgomnivoicecpp.so
 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
--- a/backend/go/parakeet-cpp/Makefile
+++ b/backend/go/parakeet-cpp/Makefile
@@ -1,6 +1,6 @@
 # parakeet-cpp backend Makefile.
 #
-# Upstream pin lives below as PARAKEET_VERSION?=92a5f0306be354c109150fe58ae4cc4f8a21ca45
+# Upstream pin lives below as PARAKEET_VERSION?=db755a78d39f789bb7d4e3935158a9e8105dbe36
 # (.github/bump_deps.sh) can find and update it - matches the
 # whisper.cpp / ds4 / vibevoice-cpp convention.
 #
@@ -15,7 +15,7 @@
 # That's what the L0 smoke test uses. The default target below does the
 # proper clone-at-pin + cmake build so CI doesn't need a side-checkout.
-PARAKEET_VERSION?=92a5f0306be354c109150fe58ae4cc4f8a21ca45
+PARAKEET_VERSION?=db755a78d39f789bb7d4e3935158a9e8105dbe36
 PARAKEET_REPO?=https://github.com/mudler/parakeet.cpp
 GOCMD?=go
--- a/backend/go/parakeet-cpp/package.sh
+++ b/backend/go/parakeet-cpp/package.sh
@@ -1,23 +1,68 @@
 #!/bin/bash
 #
-# L0 packaging stub: copy the binary, run.sh and libparakeet.so* into
+# Bundle the parakeet-cpp-grpc binary, libparakeet.so, the core runtime
-# package/. The full ldd walk (libc, libstdc++, libgomp, GPU runtimes,
+# libs (libc/libstdc++/libgomp + ld.so) and the GPU runtime for the active
-# arch detection) lands in L3, mirroring backend/go/whisper/package.sh.
+# BUILD_TYPE so the package is self-contained. Mirrors
 # backend/go/whisper/package.sh; run.sh routes the (CGO_ENABLED=0) binary
 # through lib/ld.so so the packaged libc is used instead of the host's.
 set -e
 CURDIR=$(dirname "$(realpath "$0")")
 REPO_ROOT="${CURDIR}/../../.."
 mkdir -p "$CURDIR/package/lib"
 cp -avf "$CURDIR/parakeet-cpp-grpc" "$CURDIR/package/"
 cp -avf "$CURDIR/run.sh" "$CURDIR/package/"
-# libparakeet.so + any soname symlinks (libparakeet.so.X, libparakeet.so.X.Y).
+# libparakeet.so + any soname symlinks (libparakeet.so.X[.Y]). purego.Dlopen
 # resolves it via LD_LIBRARY_PATH, which run.sh points at lib/.
 cp -avf "$CURDIR"/libparakeet.so* "$CURDIR/package/lib/" 2>/dev/null || {
 	echo "ERROR: libparakeet.so not found in $CURDIR, run 'make' first" >&2
 	exit 1
 }
-echo "L0 package layout (full ldd walk lands in L3):"
+# Detect architecture and copy the core runtime libs libparakeet.so links
 # against, plus the matching dynamic loader as lib/ld.so.
 if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
    echo "Detected x86_64 architecture, copying x86_64 libraries..."
    cp -arfLv /lib64/ld-linux-x86-64.so.2 "$CURDIR/package/lib/ld.so"
    cp -arfLv /lib/x86_64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6"
    cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1"
    cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6"
    cp -arfLv /lib/x86_64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6"
    cp -arfLv /lib/x86_64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1"
    cp -arfLv /lib/x86_64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2"
    cp -arfLv /lib/x86_64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1"
    cp -arfLv /lib/x86_64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0"
 elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
    echo "Detected ARM64 architecture, copying ARM64 libraries..."
    cp -arfLv /lib/ld-linux-aarch64.so.1 "$CURDIR/package/lib/ld.so"
    cp -arfLv /lib/aarch64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6"
    cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1"
    cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6"
    cp -arfLv /lib/aarch64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6"
    cp -arfLv /lib/aarch64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1"
    cp -arfLv /lib/aarch64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2"
    cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1"
    cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0"
 elif [ "$(uname -s)" = "Darwin" ]; then
    echo "Detected Darwin"
 else
    echo "Error: Could not detect architecture"
    exit 1
 fi
 # Package GPU libraries (CUDA/ROCm/Intel/Vulkan loader + ICDs + drivers)
 # based on BUILD_TYPE so the backend can reach the GPU without the runtime
 # base image shipping those drivers.
 GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
 if [ -f "$GPU_LIB_SCRIPT" ]; then
    echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
    source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib"
    package_gpu_libs
 fi
 echo "Packaging completed successfully"
 ls -liah "$CURDIR/package/" "$CURDIR/package/lib/"
--- a/backend/go/qwen3-tts-cpp/Makefile
+++ b/backend/go/qwen3-tts-cpp/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
 # qwentts.cpp version
 QWEN3TTS_REPO?=https://github.com/ServeurpersoCom/qwentts.cpp
-QWEN3TTS_CPP_VERSION?=0bf4a18b22e8bb8718d95294e9f7f45c0d4270a4
+QWEN3TTS_CPP_VERSION?=4536dcdce27c3764a93a06d6bf64026b124962f5
 SO_TARGET?=libgoqwen3ttscpp.so
 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
--- a/backend/go/stablediffusion-ggml/Makefile
+++ b/backend/go/stablediffusion-ggml/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
 # stablediffusion.cpp (ggml)
 STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
-STABLEDIFFUSION_GGML_VERSION?=7f0e728b7d42f2490dfa5dd9539082d904f2f6b2
+STABLEDIFFUSION_GGML_VERSION?=f440ad9c29dd8bc34e5d1f4b863832b96d6ea05f
 CMAKE_ARGS+=-DGGML_MAX_NAME=128
--- a/backend/go/whisper/Makefile
+++ b/backend/go/whisper/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp
-WHISPER_CPP_VERSION?=86c40c3bd6fc86f1187fb751d111b49e0fc18e84
+WHISPER_CPP_VERSION?=bae6bc02b1940bbfb87b6a0299c565e563b916d1
 SO_TARGET?=libgowhisper.so
 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
--- a/backend/index.yaml
+++ b/backend/index.yaml
@@ -178,6 +178,37 @@
    nvidia-cuda-12: "cuda12-parakeet-cpp"
    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-parakeet-cpp"
    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-parakeet-cpp"
 - &ced
  name: "ced"
  alias: "ced"
  license: mit
  icon: https://avatars.githubusercontent.com/u/95302084
  description: |
    CED sound-event classification / audio tagging (527-class AudioSet).
    ced.cpp is a C++/ggml port that performs audio tagging over the AudioSet
    taxonomy, exposed through the SoundDetection gRPC rpc and the
    /v1/audio/classification REST endpoint. It runs on CPU, NVIDIA CUDA,
    AMD ROCm/HIP, Intel SYCL, Vulkan and NVIDIA Jetson (L4T) targets.
  urls:
    - https://github.com/mudler/ced.cpp
  tags:
    - audio-classification
    - CPU
    - GPU
    - CUDA
    - HIP
  capabilities:
    default: "cpu-ced"
    nvidia: "cuda12-ced"
    intel: "intel-sycl-f16-ced"
    metal: "metal-ced"
    amd: "rocm-ced"
    vulkan: "vulkan-ced"
    nvidia-l4t: "nvidia-l4t-arm64-ced"
    nvidia-cuda-13: "cuda13-ced"
    nvidia-cuda-12: "cuda12-ced"
    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-ced"
    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-ced"
 - &voxtral
  name: "voxtral"
  alias: "voxtral"
@@ -2650,6 +2681,121 @@
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-parakeet-cpp"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-13-parakeet-cpp
 ## ced
 - !!merge <<: *ced
  name: "ced-development"
  capabilities:
    default: "cpu-ced-development"
    nvidia: "cuda12-ced-development"
    intel: "intel-sycl-f16-ced-development"
    metal: "metal-ced-development"
    amd: "rocm-ced-development"
    vulkan: "vulkan-ced-development"
    nvidia-l4t: "nvidia-l4t-arm64-ced-development"
    nvidia-cuda-13: "cuda13-ced-development"
    nvidia-cuda-12: "cuda12-ced-development"
    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-ced-development"
    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-ced-development"
 - !!merge <<: *ced
  name: "nvidia-l4t-arm64-ced"
  uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-arm64-ced"
  mirrors:
    - localai/localai-backends:latest-nvidia-l4t-arm64-ced
 - !!merge <<: *ced
  name: "nvidia-l4t-arm64-ced-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-arm64-ced"
  mirrors:
    - localai/localai-backends:master-nvidia-l4t-arm64-ced
 - !!merge <<: *ced
  name: "cuda13-nvidia-l4t-arm64-ced"
  uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-ced"
  mirrors:
    - localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-ced
 - !!merge <<: *ced
  name: "cuda13-nvidia-l4t-arm64-ced-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-ced"
  mirrors:
    - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-ced
 - !!merge <<: *ced
  name: "cpu-ced"
  uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-ced"
  mirrors:
    - localai/localai-backends:latest-cpu-ced
 - !!merge <<: *ced
  name: "cpu-ced-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-cpu-ced"
  mirrors:
    - localai/localai-backends:master-cpu-ced
 - !!merge <<: *ced
  name: "metal-ced"
  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-ced"
  mirrors:
    - localai/localai-backends:latest-metal-darwin-arm64-ced
 - !!merge <<: *ced
  name: "metal-ced-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-ced"
  mirrors:
    - localai/localai-backends:master-metal-darwin-arm64-ced
 - !!merge <<: *ced
  name: "cuda12-ced"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-ced"
  mirrors:
    - localai/localai-backends:latest-gpu-nvidia-cuda-12-ced
 - !!merge <<: *ced
  name: "cuda12-ced-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-ced"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-12-ced
 - !!merge <<: *ced
  name: "rocm-ced"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-ced"
  mirrors:
    - localai/localai-backends:latest-gpu-rocm-hipblas-ced
 - !!merge <<: *ced
  name: "rocm-ced-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-ced"
  mirrors:
    - localai/localai-backends:master-gpu-rocm-hipblas-ced
 - !!merge <<: *ced
  name: "intel-sycl-f32-ced"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-ced"
  mirrors:
    - localai/localai-backends:latest-gpu-intel-sycl-f32-ced
 - !!merge <<: *ced
  name: "intel-sycl-f32-ced-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f32-ced"
  mirrors:
    - localai/localai-backends:master-gpu-intel-sycl-f32-ced
 - !!merge <<: *ced
  name: "intel-sycl-f16-ced"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-ced"
  mirrors:
    - localai/localai-backends:latest-gpu-intel-sycl-f16-ced
 - !!merge <<: *ced
  name: "intel-sycl-f16-ced-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f16-ced"
  mirrors:
    - localai/localai-backends:master-gpu-intel-sycl-f16-ced
 - !!merge <<: *ced
  name: "vulkan-ced"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-vulkan-ced"
  mirrors:
    - localai/localai-backends:latest-gpu-vulkan-ced
 - !!merge <<: *ced
  name: "vulkan-ced-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-vulkan-ced"
  mirrors:
    - localai/localai-backends:master-gpu-vulkan-ced
 - !!merge <<: *ced
  name: "cuda13-ced"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-ced"
  mirrors:
    - localai/localai-backends:latest-gpu-nvidia-cuda-13-ced
 - !!merge <<: *ced
  name: "cuda13-ced-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-ced"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-13-ced
 ## stablediffusion-ggml
 - !!merge <<: *stablediffusionggml
  name: "cpu-stablediffusion-ggml"
--- a/backend/python/diffusers/requirements-cpu.txt
+++ b/backend/python/diffusers/requirements-cpu.txt
@@ -1,7 +1,7 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
-git+https://github.com/huggingface/diffusers
+diffusers==0.38.0
 opencv-python
-transformers
+transformers==4.57.6
 torchvision==0.22.1
 accelerate
 git+https://github.com/xhinker/sd_embed
@@ -10,9 +10,15 @@ sentencepiece
 torch==2.7.1
 optimum-quanto
 ftfy
-# TODO: re-add compel once it supports transformers >= 5.
+# diffusers and transformers are pinned together on purpose. transformers v5
-# Tracking: https://github.com/damian0815/compel/pull/129
+# restructured CLIPTextModel and dropped the `.text_model` attribute, which
-#           https://github.com/damian0815/compel/issues/128
+# breaks single-file Stable Diffusion loading on every released diffusers
-# compel currently pins transformers~=4.25, which forced pip into multi-hour
+# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
-# resolver backtracking storms in CI. backend.py imports it lazily and gates
+# main via git froze whichever broken pair existed at image-build time. Pin the
-# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
+# last known-good released pair so builds are reproducible and can't drift into
 # the broken window. See https://github.com/mudler/LocalAI/issues/9979
 #
 # compel is intentionally omitted: it pins transformers~=4.25, which conflicts
 # with this pin and previously forced pip into multi-hour resolver backtracking
 # storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
 # the import succeeding, so dropping it here is safe.
--- a/backend/python/diffusers/requirements-cublas12.txt
+++ b/backend/python/diffusers/requirements-cublas12.txt
@@ -1,7 +1,7 @@
 --extra-index-url https://download.pytorch.org/whl/cu121
-git+https://github.com/huggingface/diffusers
+diffusers==0.38.0
 opencv-python
-transformers
+transformers==4.57.6
 torchvision
 accelerate
 git+https://github.com/xhinker/sd_embed
@@ -10,9 +10,15 @@ sentencepiece
 torch
 ftfy
 optimum-quanto
-# TODO: re-add compel once it supports transformers >= 5.
+# diffusers and transformers are pinned together on purpose. transformers v5
-# Tracking: https://github.com/damian0815/compel/pull/129
+# restructured CLIPTextModel and dropped the `.text_model` attribute, which
-#           https://github.com/damian0815/compel/issues/128
+# breaks single-file Stable Diffusion loading on every released diffusers
-# compel currently pins transformers~=4.25, which forced pip into multi-hour
+# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
-# resolver backtracking storms in CI. backend.py imports it lazily and gates
+# main via git froze whichever broken pair existed at image-build time. Pin the
-# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
+# last known-good released pair so builds are reproducible and can't drift into
 # the broken window. See https://github.com/mudler/LocalAI/issues/9979
 #
 # compel is intentionally omitted: it pins transformers~=4.25, which conflicts
 # with this pin and previously forced pip into multi-hour resolver backtracking
 # storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
 # the import succeeding, so dropping it here is safe.
--- a/backend/python/diffusers/requirements-cublas13.txt
+++ b/backend/python/diffusers/requirements-cublas13.txt
@@ -1,7 +1,7 @@
 --extra-index-url https://download.pytorch.org/whl/cu130
-git+https://github.com/huggingface/diffusers
+diffusers==0.38.0
 opencv-python
-transformers
+transformers==4.57.6
 torchvision
 accelerate
 git+https://github.com/xhinker/sd_embed
@@ -10,9 +10,15 @@ sentencepiece
 torch
 ftfy
 optimum-quanto
-# TODO: re-add compel once it supports transformers >= 5.
+# diffusers and transformers are pinned together on purpose. transformers v5
-# Tracking: https://github.com/damian0815/compel/pull/129
+# restructured CLIPTextModel and dropped the `.text_model` attribute, which
-#           https://github.com/damian0815/compel/issues/128
+# breaks single-file Stable Diffusion loading on every released diffusers
-# compel currently pins transformers~=4.25, which forced pip into multi-hour
+# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
-# resolver backtracking storms in CI. backend.py imports it lazily and gates
+# main via git froze whichever broken pair existed at image-build time. Pin the
-# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
+# last known-good released pair so builds are reproducible and can't drift into
 # the broken window. See https://github.com/mudler/LocalAI/issues/9979
 #
 # compel is intentionally omitted: it pins transformers~=4.25, which conflicts
 # with this pin and previously forced pip into multi-hour resolver backtracking
 # storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
 # the import succeeding, so dropping it here is safe.
--- a/backend/python/diffusers/requirements-hipblas.txt
+++ b/backend/python/diffusers/requirements-hipblas.txt
@@ -1,17 +1,23 @@
 --extra-index-url https://download.pytorch.org/whl/rocm7.0
 torch==2.10.0+rocm7.0
 torchvision==0.25.0+rocm7.0
-git+https://github.com/huggingface/diffusers
+diffusers==0.38.0
 opencv-python
-transformers
+transformers==4.57.6
 accelerate
 peft
 sentencepiece
 optimum-quanto
 ftfy
-# TODO: re-add compel once it supports transformers >= 5.
+# diffusers and transformers are pinned together on purpose. transformers v5
-# Tracking: https://github.com/damian0815/compel/pull/129
+# restructured CLIPTextModel and dropped the `.text_model` attribute, which
-#           https://github.com/damian0815/compel/issues/128
+# breaks single-file Stable Diffusion loading on every released diffusers
-# compel currently pins transformers~=4.25, which forced pip into multi-hour
+# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
-# resolver backtracking storms in CI. backend.py imports it lazily and gates
+# main via git froze whichever broken pair existed at image-build time. Pin the
-# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
+# last known-good released pair so builds are reproducible and can't drift into
 # the broken window. See https://github.com/mudler/LocalAI/issues/9979
 #
 # compel is intentionally omitted: it pins transformers~=4.25, which conflicts
 # with this pin and previously forced pip into multi-hour resolver backtracking
 # storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
 # the import succeeding, so dropping it here is safe.
--- a/backend/python/diffusers/requirements-intel.txt
+++ b/backend/python/diffusers/requirements-intel.txt
@@ -3,18 +3,24 @@ torch
 torchvision
 optimum[openvino]
 setuptools
-git+https://github.com/huggingface/diffusers
+diffusers==0.38.0
 opencv-python
-transformers
+transformers==4.57.6
 accelerate
 git+https://github.com/xhinker/sd_embed
 peft
 sentencepiece
 optimum-quanto
 ftfy
-# TODO: re-add compel once it supports transformers >= 5.
+# diffusers and transformers are pinned together on purpose. transformers v5
-# Tracking: https://github.com/damian0815/compel/pull/129
+# restructured CLIPTextModel and dropped the `.text_model` attribute, which
-#           https://github.com/damian0815/compel/issues/128
+# breaks single-file Stable Diffusion loading on every released diffusers
-# compel currently pins transformers~=4.25, which forced pip into multi-hour
+# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
-# resolver backtracking storms in CI. backend.py imports it lazily and gates
+# main via git froze whichever broken pair existed at image-build time. Pin the
-# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
+# last known-good released pair so builds are reproducible and can't drift into
 # the broken window. See https://github.com/mudler/LocalAI/issues/9979
 #
 # compel is intentionally omitted: it pins transformers~=4.25, which conflicts
 # with this pin and previously forced pip into multi-hour resolver backtracking
 # storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
 # the import succeeding, so dropping it here is safe.
--- a/backend/python/diffusers/requirements-l4t12.txt
+++ b/backend/python/diffusers/requirements-l4t12.txt
@@ -1,7 +1,7 @@
 --extra-index-url https://pypi.jetson-ai-lab.io/jp6/cu129/
 torch
-git+https://github.com/huggingface/diffusers
+diffusers==0.38.0
-transformers
+transformers==4.57.6
 accelerate
 peft
 optimum-quanto
@@ -9,9 +9,15 @@ numpy<2
 sentencepiece
 torchvision
 ftfy
-# TODO: re-add compel once it supports transformers >= 5.
+# diffusers and transformers are pinned together on purpose. transformers v5
-# Tracking: https://github.com/damian0815/compel/pull/129
+# restructured CLIPTextModel and dropped the `.text_model` attribute, which
-#           https://github.com/damian0815/compel/issues/128
+# breaks single-file Stable Diffusion loading on every released diffusers
-# compel currently pins transformers~=4.25, which forced pip into multi-hour
+# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
-# resolver backtracking storms in CI. backend.py imports it lazily and gates
+# main via git froze whichever broken pair existed at image-build time. Pin the
-# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
+# last known-good released pair so builds are reproducible and can't drift into
 # the broken window. See https://github.com/mudler/LocalAI/issues/9979
 #
 # compel is intentionally omitted: it pins transformers~=4.25, which conflicts
 # with this pin and previously forced pip into multi-hour resolver backtracking
 # storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
 # the import succeeding, so dropping it here is safe.
--- a/backend/python/diffusers/requirements-l4t13.txt
+++ b/backend/python/diffusers/requirements-l4t13.txt
@@ -1,7 +1,7 @@
 --extra-index-url https://download.pytorch.org/whl/cu130
 torch
-git+https://github.com/huggingface/diffusers
+diffusers==0.38.0
-transformers
+transformers==4.57.6
 accelerate
 peft
 optimum-quanto
@@ -10,9 +10,15 @@ sentencepiece
 torchvision
 ftfy
 chardet
-# TODO: re-add compel once it supports transformers >= 5.
+# diffusers and transformers are pinned together on purpose. transformers v5
-# Tracking: https://github.com/damian0815/compel/pull/129
+# restructured CLIPTextModel and dropped the `.text_model` attribute, which
-#           https://github.com/damian0815/compel/issues/128
+# breaks single-file Stable Diffusion loading on every released diffusers
-# compel currently pins transformers~=4.25, which forced pip into multi-hour
+# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
-# resolver backtracking storms in CI. backend.py imports it lazily and gates
+# main via git froze whichever broken pair existed at image-build time. Pin the
-# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
+# last known-good released pair so builds are reproducible and can't drift into
 # the broken window. See https://github.com/mudler/LocalAI/issues/9979
 #
 # compel is intentionally omitted: it pins transformers~=4.25, which conflicts
 # with this pin and previously forced pip into multi-hour resolver backtracking
 # storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
 # the import succeeding, so dropping it here is safe.
--- a/backend/python/diffusers/requirements-mps.txt
+++ b/backend/python/diffusers/requirements-mps.txt
@@ -1,16 +1,22 @@
 torch==2.7.1
 torchvision==0.22.1
-git+https://github.com/huggingface/diffusers
+diffusers==0.38.0
 opencv-python
-transformers
+transformers==4.57.6
 accelerate
 peft
 sentencepiece
 optimum-quanto
 ftfy
-# TODO: re-add compel once it supports transformers >= 5.
+# diffusers and transformers are pinned together on purpose. transformers v5
-# Tracking: https://github.com/damian0815/compel/pull/129
+# restructured CLIPTextModel and dropped the `.text_model` attribute, which
-#           https://github.com/damian0815/compel/issues/128
+# breaks single-file Stable Diffusion loading on every released diffusers
-# compel currently pins transformers~=4.25, which forced pip into multi-hour
+# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
-# resolver backtracking storms in CI. backend.py imports it lazily and gates
+# main via git froze whichever broken pair existed at image-build time. Pin the
-# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
+# last known-good released pair so builds are reproducible and can't drift into
 # the broken window. See https://github.com/mudler/LocalAI/issues/9979
 #
 # compel is intentionally omitted: it pins transformers~=4.25, which conflicts
 # with this pin and previously forced pip into multi-hour resolver backtracking
 # storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
 # the import succeeding, so dropping it here is safe.
--- a/backend/python/nemo/backend.py
+++ b/backend/python/nemo/backend.py
@@ -84,6 +84,135 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        return backend_pb2.Result(message="Model loaded successfully", success=True)
    def _get_stride_seconds(self):
        """Compute the seconds-per-frame stride for the loaded model.
        stride = preprocessor_window_stride * encoder_subsampling_factor
        """
        try:
            preprocessor = self.model.preprocessor
            window_stride = preprocessor._cfg.get('window_stride', 0.01)
            subsampling_factor = getattr(self.model.encoder, 'subsampling_factor', 8)
            return window_stride * subsampling_factor
        except (AttributeError, KeyError, TypeError) as err:
            print(
                f"Warning: could not compute stride from model config ({err}), "
                f"falling back to 0.08s/frame",
                file=sys.stderr,
            )
            return 0.08
    def _build_segments_with_words(self, hypothesis, stride, timestamp_granularities=None):
        """Build TranscriptSegment list from a NeMo Hypothesis with timestamps.
        Supports two granularity modes:
          - "word": one TranscriptSegment per word, each with a single TranscriptWord entry
          - "segment" (default): merge consecutive words into sentence-level segments,
            splitting at word-level time gaps that exceed a dynamic threshold.
        """
        if not hypothesis or not isinstance(hypothesis.timestamp, dict):
            return []
        word_offsets = hypothesis.timestamp.get('word', [])
        if not word_offsets:
            return []
        granularities = list(timestamp_granularities) if timestamp_granularities else []
        granularity = "word" if "word" in granularities else "segment"
        # Build a flat list of (text, start_ns, end_ns) from NeMo word offsets
        transcript_words = []
        for wo in word_offsets:
            word_text = wo.get('word', '')
            if not word_text:
                continue
            start_offset = wo.get('start_offset', 0)
            end_offset = wo.get('end_offset', start_offset)
            start_ns = int(start_offset * stride * 1_000_000_000)
            end_ns = int(end_offset * stride * 1_000_000_000)
            transcript_words.append({
                'text': word_text,
                'start': start_ns,
                'end': end_ns,
            })
        if not transcript_words:
            return []
        if granularity == "word":
            # One segment per word
            result = []
            for idx, tw in enumerate(transcript_words):
                word = backend_pb2.TranscriptWord(
                    start=tw['start'], end=tw['end'], text=tw['text']
                )
                result.append(backend_pb2.TranscriptSegment(
                    id=idx,
                    start=tw['start'],
                    end=tw['end'],
                    text=tw['text'],
                    words=[word],
                ))
            return result
        # segment mode — merge at word-level time-gap boundaries
        # Compute gap threshold: median inter-word gap * 3, clamped to [0.3, 2.0]s
        gaps = []
        for i in range(1, len(transcript_words)):
            gap = (transcript_words[i]['start'] - transcript_words[i - 1]['end']) / 1_000_000_000
            if gap > 0:
                gaps.append(gap)
        if gaps:
            gaps.sort()
            median_gap = gaps[len(gaps) // 2]
            threshold_ns = int(max(0.3, min(median_gap * 3, 2.0)) * 1_000_000_000)
        else:
            threshold_ns = int(0.5 * 1_000_000_000)
        result = []
        buf_words = []  # list of TranscriptWord protobuf
        buf_start = None
        buf_end = 0
        buf_text = []
        prev_end = None
        for tw in transcript_words:
            # Detect word-level time gap
            if prev_end is not None and (tw['start'] - prev_end) >= threshold_ns and buf_text:
                seg_text = ' '.join(buf_text)
                result.append(backend_pb2.TranscriptSegment(
                    id=len(result),
                    start=buf_start,
                    end=buf_end,
                    text=seg_text,
                    words=list(buf_words),
                ))
                buf_words = []
                buf_text = []
                buf_start = None
            if buf_start is None:
                buf_start = tw['start']
            buf_end = tw['end']
            buf_text.append(tw['text'])
            buf_words.append(backend_pb2.TranscriptWord(
                start=tw['start'], end=tw['end'], text=tw['text']
            ))
            prev_end = tw['end']
        # flush remaining
        if buf_text and buf_start is not None:
            seg_text = ' '.join(buf_text)
            result.append(backend_pb2.TranscriptSegment(
                id=len(result),
                start=buf_start,
                end=buf_end,
                text=seg_text,
                words=list(buf_words),
            ))
        return result
    def AudioTranscription(self, request, context):
        result_segments = []
        text = ""
@@ -93,26 +222,67 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                print(f"Error: Audio file not found: {audio_path}", file=sys.stderr)
                return backend_pb2.TranscriptResult(segments=[], text="")
-            # NEMO's transcribe method accepts a list of audio paths and returns a list of transcripts
+            # Determine requested timestamp granularity
-            results = self.model.transcribe([audio_path])
+            timestamp_granularities = list(request.timestamp_granularities) if request.timestamp_granularities else []
            want_timestamps = bool(timestamp_granularities)
-            if not results or len(results) == 0:
+            if want_timestamps:
-                return backend_pb2.TranscriptResult(segments=[], text="")
+                # Request timestamps from NeMo.
                # timestamps=True forces NeMo to return Hypothesis objects with
                # the timestamp dict populated, so we omit return_hypotheses to
                # let NeMo choose the correct return type.
                results = self.model.transcribe([audio_path], timestamps=True)
-            # Get the transcript text from the first result.
+                if results and len(results) > 0:
-            # CTC models return List[str], TDT/RNNT models return List[Hypothesis]
+                    hypotheses = results[0] if isinstance(results[0], list) else results
-            # where the actual text lives in Hypothesis.text.
+                    if hypotheses and len(hypotheses) > 0:
-            result = results[0]
+                        hypothesis = hypotheses[0]
-            if isinstance(result, str):
+
-                text = result
+                        # Hypothesis object should have .timestamp populated
                        if not hasattr(hypothesis, 'timestamp') or not isinstance(hypothesis.timestamp, dict):
                            print(
                                "Warning: timestamps were requested but NeMo did not return "
                                "Hypothesis objects; falling back to untimestamped output",
                                file=sys.stderr,
                            )
                        # Extract text
                        if hasattr(hypothesis, 'text'):
                            text = hypothesis.text or ""
                        elif isinstance(hypothesis, str):
                            text = hypothesis
                        # Build segments with word-level timestamps
                        stride = self._get_stride_seconds()
                        result_segments = self._build_segments_with_words(
                            hypothesis, stride, timestamp_granularities
                        )
                        # If no word offsets but we have text, fall back to single segment
                        if not result_segments and text:
                            result_segments.append(backend_pb2.TranscriptSegment(
                                id=0, start=0, end=0, text=text
                            ))
            else:
-                text = getattr(result, 'text', None) or ""
+                # Simple transcription without timestamps
                # NEMO's transcribe method accepts a list of audio paths and returns a list of transcripts
                results = self.model.transcribe([audio_path])
-            if text:
+                if results and len(results) > 0:
-                # Create a single segment with the full transcription
+                    # Get the transcript text from the first result.
-                result_segments.append(backend_pb2.TranscriptSegment(
+                    # CTC models return List[str], TDT/RNNT models return List[Hypothesis]
-                    id=0, start=0, end=0, text=text
+                    # where the actual text lives in Hypothesis.text.
-                ))
+                    result = results[0]
                    if isinstance(result, str):
                        text = result
                    else:
                        text = getattr(result, 'text', None) or ""
                    if text:
                        # Create a single segment with the full transcription
                        result_segments.append(backend_pb2.TranscriptSegment(
                            id=0, start=0, end=0, text=text
                        ))
        except Exception as err:
            print(f"Error in AudioTranscription: {err}", file=sys.stderr)
--- a/backend/python/trl/backend.py
+++ b/backend/python/trl/backend.py
@@ -309,6 +309,10 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        dataset_split = request.dataset_split or "train"
        if os.path.exists(request.dataset_source):
            _allowed_dir = os.path.realpath(os.path.abspath(os.environ.get("LOCALAI_DATASET_DIR", os.getcwd())))
            _real_path = os.path.realpath(os.path.abspath(request.dataset_source))
            if not (_real_path == _allowed_dir or _real_path.startswith(_allowed_dir + os.sep)):
                raise ValueError("Dataset source path is outside the allowed directory")
            if request.dataset_source.endswith('.json') or request.dataset_source.endswith('.jsonl'):
                dataset = load_dataset("json", data_files=request.dataset_source, split=dataset_split)
            elif request.dataset_source.endswith('.csv'):
@@ -687,6 +691,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
    def ExportModel(self, request, context):
        export_format = request.export_format or "lora"
        output_path = request.output_path
        _allowed_output_dir = os.path.realpath(os.path.abspath(os.environ.get("LOCALAI_OUTPUT_DIR", os.getcwd())))
        _real_output_path = os.path.realpath(os.path.abspath(output_path))
        if not (_real_output_path == _allowed_output_dir or _real_output_path.startswith(_allowed_output_dir + os.sep)):
            raise ValueError("Output path is outside the allowed directory")
        output_path = _real_output_path
        checkpoint_path = request.checkpoint_path
        # Extract HF token for gated model access
@@ -807,7 +816,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                env = os.environ.copy()
                env["NO_LOCAL_GGUF"] = "1"
                cmd = [sys.executable, convert_script, merge_dir, "--outtype", outtype, "--outfile", gguf_path]
-                conv_result = subprocess.run(cmd, capture_output=True, text=True, timeout=3600, env=env)
+                conv_result = subprocess.run(cmd, capture_output=True, text=True, timeout=3600, env=env, shell=False)  # nosemgrep: python.django.security.injection.command.subprocess-injection.subprocess-injection
                if conv_result.returncode != 0:
                    diag = f"stdout: {conv_result.stdout[-300:]}\nstderr: {conv_result.stderr[-500:]}"
                    return backend_pb2.Result(success=False,
--- a/backend/python/vllm/backend.py
+++ b/backend/python/vllm/backend.py
@@ -48,8 +48,10 @@ try:
 except ImportError:
    HAS_REASONING_PARSERS = False
 # vLLM >= 0.23 renamed GuidedDecodingParams -> StructuredOutputsParams and the
 # SamplingParams field guided_decoding -> structured_outputs.
 try:
-    from vllm.sampling_params import GuidedDecodingParams
+    from vllm.sampling_params import StructuredOutputsParams
    HAS_GUIDED_DECODING = True
 except ImportError:
    HAS_GUIDED_DECODING = False
@@ -536,13 +538,13 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                if value not in (None, 0, [], False):
                    setattr(sampling_params, param_field, value)
-        # Guided decoding: use Grammar field to pass JSON schema or BNF
+        # Structured-output decoding: use Grammar field to pass JSON schema or BNF
        if HAS_GUIDED_DECODING and request.Grammar:
            try:
                json.loads(request.Grammar)  # valid JSON = JSON schema
-                sampling_params.guided_decoding = GuidedDecodingParams(json=request.Grammar)
+                sampling_params.structured_outputs = StructuredOutputsParams(json=request.Grammar)
            except json.JSONDecodeError:
-                sampling_params.guided_decoding = GuidedDecodingParams(grammar=request.Grammar)
+                sampling_params.structured_outputs = StructuredOutputsParams(grammar=request.Grammar)
        # Extract image paths and process images
        prompt = request.Prompt
@@ -596,23 +598,124 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        # Stream the results
        generated_text = ""
        generated_token_ids: list[int] = []
        last_output = None
        # Tool-parsing strategy decision (made once, before the loop):
        #
        # When a tool parser is active, the model's raw tool-call markup
        # (e.g. <tool_call>...) must not be streamed verbatim as delta.content
        # — clients would see the unparsed syntax. Two paths:
        #
        # (A) native streaming via parser.extract_tool_calls_streaming. All
        #     concrete tool parsers shipped with vLLM 0.23+ implement this
        #     (Granite4, Qwen3Coder, DeepSeekV31, Jamba, Ernie45, Hermes,
        #     llama3_json, mistral, …). The parser decides per-delta whether
        #     to emit content or suppress tool-call markup, and emits a
        #     structured DeltaMessage(tool_calls=[...]) when a call is ready.
        # (B) buffer fallback — used only when the parser surprisingly lacks
        #     the streaming method or it raises mid-stream. The post-loop
        #     extract_tool_calls assembles the final chat_delta. Same correctness
        #     guarantee as a non-streaming response, at the cost of a delayed
        #     final chunk.
        has_tool_parser = bool(self.tool_parser_cls and request.Tools)
        tp_instance = None
        tp_request = None
        native_streaming = False
        native_streaming_error = False
        if has_tool_parser:
            try:
                tools_for_parser = json.loads(request.Tools)
            except json.JSONDecodeError:
                tools_for_parser = []
            try:
                tp_instance = self.tool_parser_cls(self.tokenizer, tools=tools_for_parser)
            except TypeError:
                tp_instance = self.tool_parser_cls(self.tokenizer)
            # Build a minimal ChatCompletionRequest so the streaming method
            # sees the tools list. We do not need any other request fields —
            # parsers only read .tools (and sometimes .tool_choice, which we
            # leave at default).
            try:
                from vllm.entrypoints.openai.chat_completion.protocol import (
                    ChatCompletionRequest as _CCR,
                )
                tp_request = _CCR(
                    model="local",
                    messages=[{"role": "user", "content": ""}],
                    tools=tools_for_parser or None,
                )
            except Exception as e:
                print(f"Could not build ChatCompletionRequest for streaming parser: {e}",
                      file=sys.stderr)
                tp_request = None
            native_streaming = (
                tp_request is not None
                and hasattr(tp_instance, "extract_tool_calls_streaming")
            )
        try:
            async for request_output in outputs:
                iteration_text = request_output.outputs[0].text
                last_output = request_output
                if streaming:
                    # Remove text already sent as vllm concatenates the text from previous yields
                    delta_iteration_text = iteration_text.removeprefix(generated_text)
-                    # Send the partial result
+                    new_token_ids = list(request_output.outputs[0].token_ids)
-                    yield backend_pb2.Reply(
+                    delta_token_ids = new_token_ids[len(generated_token_ids):]
                        message=bytes(delta_iteration_text, encoding='utf-8'),
                        chat_deltas=[backend_pb2.ChatDelta(content=delta_iteration_text)],
                    )
-                # Keep track of text generated
+                    if not has_tool_parser:
                        # Plain streaming — unchanged from pre-tool-parser path.
                        yield backend_pb2.Reply(
                            message=bytes(delta_iteration_text, encoding='utf-8'),
                            chat_deltas=[backend_pb2.ChatDelta(content=delta_iteration_text)],
                        )
                    elif native_streaming and not native_streaming_error:
                        # (A) Native vLLM extract_tool_calls_streaming.
                        try:
                            msg = tp_instance.extract_tool_calls_streaming(
                                previous_text=generated_text,
                                current_text=iteration_text,
                                delta_text=delta_iteration_text,
                                previous_token_ids=generated_token_ids,
                                current_token_ids=new_token_ids,
                                delta_token_ids=delta_token_ids,
                                request=tp_request,
                            )
                        except Exception as e:
                            print(f"Streaming tool parser error (falling back to "
                                  f"buffer for the rest of the stream): {e}",
                                  file=sys.stderr)
                            native_streaming_error = True
                            msg = None
                        if msg is not None:
                            tc_protos = []
                            for tc in (msg.tool_calls or []):
                                fn = tc.function or None
                                tc_protos.append(backend_pb2.ToolCallDelta(
                                    index=tc.index,
                                    id=tc.id or "",
                                    name=(fn.name if fn and fn.name else "") or "",
                                    arguments=(fn.arguments if fn and fn.arguments else "") or "",
                                ))
                            cd_kwargs = {}
                            if msg.content:
                                cd_kwargs["content"] = msg.content
                            if msg.reasoning:
                                cd_kwargs["reasoning_content"] = msg.reasoning
                            if tc_protos:
                                cd_kwargs["tool_calls"] = tc_protos
                            if cd_kwargs:
                                yield backend_pb2.Reply(
                                    message=bytes(msg.content or "", encoding='utf-8'),
                                    chat_deltas=[backend_pb2.ChatDelta(**cd_kwargs)],
                                )
                    # (B) buffer fallback — emit nothing during the stream.
                    # The post-loop extract_tool_calls block builds the final chunk.
                # Keep track of text + token_ids generated
                generated_text = iteration_text
                generated_token_ids = list(request_output.outputs[0].token_ids)
        finally:
            await outputs.aclose()
@@ -637,16 +740,19 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
            except Exception as e:
                print(f"Reasoning parser error: {e}", file=sys.stderr)
-        if self.tool_parser_cls and request.Tools:
+        # When (A) native streaming ran cleanly, per-delta yields above already
        # delivered everything — do NOT extract again on the full text or we'd
        # duplicate content/tool_calls into the final chunk.
        if has_tool_parser and not (native_streaming and not native_streaming_error):
            try:
-                tools = json.loads(request.Tools)
+                tp = tp_instance
-                # Some concrete parsers only accept the tokenizer; only the
+                if tp is None:
-                # abstract base declares the tools kwarg. Try with tools first,
+                    # Defensive: tp_instance build failed earlier; reconstruct.
-                # fall back to tokenizer-only.
+                    tools = json.loads(request.Tools)
-                try:
+                    try:
-                    tp = self.tool_parser_cls(self.tokenizer, tools=tools)
+                        tp = self.tool_parser_cls(self.tokenizer, tools=tools)
-                except TypeError:
+                    except TypeError:
-                    tp = self.tool_parser_cls(self.tokenizer)
+                        tp = self.tool_parser_cls(self.tokenizer)
                info = tp.extract_tool_calls(content, request=None)
                if info.tools_called:
                    content = info.content or ""
@@ -659,6 +765,10 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                        ))
            except Exception as e:
                print(f"Tool parser error: {e}", file=sys.stderr)
        elif native_streaming and not native_streaming_error:
            # Per-delta path already emitted content + tool_calls; the final
            # chat_delta should carry only metadata (token counts, logprobs).
            content = ""
        # Extract token counts
        prompt_tokens = 0
@@ -698,7 +808,26 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        )
        if streaming:
-            # Final chunk with structured data
+            # Final chunk with structured data.
            #
            # If we used the buffer fallback (has_tool_parser=True AND native
            # streaming did NOT run cleanly) and the parser found no tool call,
            # flush the buffered content as ONE content delta — and clear the
            # final chat_delta's content so the metadata chunk does not repeat
            # what we just sent. This is the plain-text-with-tool-parser path.
            buffered_fallback = (
                has_tool_parser
                and not (native_streaming and not native_streaming_error)
            )
            if buffered_fallback and not tool_calls_proto and content:
                yield backend_pb2.Reply(
                    message=bytes(content, encoding='utf-8'),
                    chat_deltas=[backend_pb2.ChatDelta(content=content)],
                )
                chat_delta = backend_pb2.ChatDelta(
                    reasoning_content=reasoning_content,
                    tool_calls=tool_calls_proto,
                )
            yield backend_pb2.Reply(
                message=b"",
                prompt_tokens=prompt_tokens,
--- a/backend/python/vllm/test.py
+++ b/backend/python/vllm/test.py
@@ -278,4 +278,261 @@ class TestBackendServicer(unittest.TestCase):
            print(err)
            self.fail("Embedding service failed")
        finally:
-            self.tearDown()
+            self.tearDown()
 class TestStreamingToolParser(unittest.TestCase):
    """
    Server-less unit tests for the streaming + tool-parser machinery in
    BackendServicer._predict. These tests instantiate BackendServicer
    directly and mock the vLLM engine + tool parser, so they do not need
    a GPU, a model, or a running gRPC server. Kept in a separate class to
    avoid the parent setUp() which spawns a subprocess.
    Covers #582 (follow-up to #10346):
      1. Markup-leak prevention with a non-streaming parser (buffer fallback)
      2. No content duplication on the plain-text path with the buffer fallback
      3. Native streaming progressive plain-text emission
      4. Native streaming structured tool_call, no markup leak
      5. Parser exception → graceful fallback to buffer, still no markup
      6. No-tool-parser regression: unchanged per-delta content stream
    """
    @staticmethod
    def _make_generate(chunks):
        """Build a fake vLLM engine.generate that yields cumulative chunks."""
        from types import SimpleNamespace
        async def gen(*a, **k):
            for i, t in enumerate(chunks):
                yield SimpleNamespace(
                    outputs=[SimpleNamespace(
                        text=t,
                        token_ids=list(range(i + 1)),
                        logprobs=None,
                    )],
                    prompt_token_ids=[0],
                )
        return lambda *a, **k: gen()
    @staticmethod
    def _collect(servicer, req):
        import asyncio
        async def run():
            return [r async for r in servicer._predict(req, None, streaming=True)]
        return asyncio.run(run())
    def _new_servicer(self):
        """Create a BackendServicer with no parsers and no tokenizer.

        The directory containing this test file is placed first on
        ``sys.path`` so that ``from backend import BackendServicer`` picks up
        the sibling module. The returned servicer has ``reasoning_parser_cls``,
        ``tool_parser_cls`` and ``tokenizer`` all set to ``None``; individual
        tests override the attributes they need.
        """
        import os
        import sys

        here = os.path.dirname(os.path.abspath(__file__))
        # Keep this directory at the front of sys.path (so the sibling
        # backend module wins the import) without stacking one more duplicate
        # entry every time a test asks for a servicer.
        if sys.path[:1] != [here]:
            sys.path.insert(0, here)
        from backend import BackendServicer

        s = BackendServicer()
        s.reasoning_parser_cls = None
        s.tool_parser_cls = None
        s.tokenizer = None
        return s
    # ── Case 1+2: parser without streaming method → buffer fallback ──
    def test_buffer_path_no_markup_no_duplication(self):
        """Cases 1 and 2: a parser that has no streaming method.

        Part one feeds tool-call markup through the servicer and checks that
        no ``chat_deltas`` content contains ``<tool_call`` while a ``calc``
        tool call is still reported. Part two feeds plain text with tools
        configured and checks that the full sentence occurs exactly once in
        the concatenated content.
        """
        from types import SimpleNamespace

        def buffer_only_parser(tools_called, parsed_content, parsed_calls):
            class BufferOnlyParser:
                # Deliberately defines no extract_tool_calls_streaming, so the
                # servicer cannot take the native streaming route.
                def __init__(self, tokenizer, tools=None):
                    pass

                def extract_tool_calls(self, c, request=None):
                    outcome = SimpleNamespace()
                    outcome.tools_called = tools_called
                    outcome.content = parsed_content
                    outcome.tool_calls = parsed_calls
                    return outcome

            return BufferOnlyParser

        tools_json = '[{"type":"function","function":{"name":"calc","parameters":{}}}]'

        # Part one: the model emits tool-call markup.
        tool_servicer = self._new_servicer()
        markup_chunks = [
            '<tool_call>\n{"name": "calc"',
            '<tool_call>\n{"name": "calc", "arguments": {"x": 1}}\n</tool_call>',
        ]
        tool_servicer.llm = SimpleNamespace(generate=self._make_generate(markup_chunks))
        calc_fn = SimpleNamespace(name="calc", arguments='{"x": 1}')
        calc_call = SimpleNamespace(id="call_1", function=calc_fn)
        tool_servicer.tool_parser_cls = buffer_only_parser(True, "", [calc_call])
        tool_request = backend_pb2.PredictOptions(Prompt="x", Tools=tools_json)
        replies = self._collect(tool_servicer, tool_request)

        contents = []
        names = []
        for reply in replies:
            for delta in reply.chat_deltas:
                if delta.content:
                    contents.append(delta.content)
        self.assertFalse(
            any("<tool_call" in c for c in contents),
            f"markup leaked: {contents!r}",
        )
        for reply in replies:
            for delta in reply.chat_deltas:
                for tool_call in delta.tool_calls:
                    names.append(tool_call.name)
        self.assertIn("calc", names, "tool_call missing from final chunk")

        # Part two: plain text while a tool parser is configured.
        plain_servicer = self._new_servicer()
        plain_chunks = [
            "The capital ",
            "The capital of France is Paris.",
        ]
        plain_servicer.llm = SimpleNamespace(generate=self._make_generate(plain_chunks))
        plain_servicer.tool_parser_cls = buffer_only_parser(False, "", [])
        plain_request = backend_pb2.PredictOptions(Prompt="x", Tools=tools_json)
        pieces = []
        for reply in self._collect(plain_servicer, plain_request):
            for delta in reply.chat_deltas:
                if delta.content:
                    pieces.append(delta.content)
        joined = "".join(pieces)
        self.assertEqual(
            joined.count("The capital of France is Paris."), 1,
            f"buffered content duplicated: {joined!r}",
        )
    # ── Case 3: native streaming, progressive plain text ──
    def test_native_streaming_progressive_plain_text(self):
        """Case 3: a streaming-capable parser that echoes each text delta.

        Checks that at least one reply before the last one carries content,
        and that concatenating all content yields the complete sentence. The
        parser's non-streaming ``extract_tool_calls`` raises, so the test
        errors out if the servicer calls it on this path.
        """
        from types import SimpleNamespace

        def delta_msg(content=None, reasoning=None, tool_calls=None):
            # Attribute bag with the three fields this test's parser fills in.
            return SimpleNamespace(
                content=content,
                reasoning=reasoning,
                tool_calls=tool_calls or [],
            )

        class StreamingParser:
            def __init__(self, tokenizer, tools=None):
                pass

            def extract_tool_calls(self, c, request=None):
                # Must stay unused while native streaming succeeds.
                raise AssertionError("extract_tool_calls invoked on native-streaming path")

            def extract_tool_calls_streaming(
                self, previous_text, current_text, delta_text,
                previous_token_ids, current_token_ids, delta_token_ids, request,
            ):
                if delta_text:
                    return delta_msg(content=delta_text)
                return None

        servicer = self._new_servicer()
        cumulative = [
            "Paris ",
            "Paris is ",
            "Paris is the capital of France.",
        ]
        servicer.llm = SimpleNamespace(generate=self._make_generate(cumulative))
        servicer.tool_parser_cls = StreamingParser
        request = backend_pb2.PredictOptions(
            Prompt="x",
            Tools='[{"type":"function","function":{"name":"calc","parameters":{}}}]',
        )
        replies = self._collect(servicer, request)

        intermediate_content = []
        for reply in replies[:-1]:
            for delta in reply.chat_deltas:
                if delta.content:
                    intermediate_content.append(delta.content)
        self.assertTrue(
            len(intermediate_content) > 0,
            "Plain-text response not streamed progressively (native streaming inactive?)",
        )

        all_content = []
        for reply in replies:
            for delta in reply.chat_deltas:
                if delta.content:
                    all_content.append(delta.content)
        assembled = "".join(all_content)
        self.assertEqual(
            assembled, "Paris is the capital of France.",
            f"Assembled content wrong: {assembled!r}",
        )
    # ── Case 4: native streaming, structured tool_call, no markup ──
    def test_native_streaming_tool_call_no_markup_leak(self):
        """Case 4: a streaming parser that emits one structured tool call.

        The fake parser returns ``None`` until the cumulative text contains
        ``</tool_call>``, then returns a single tool-call delta once. The test
        checks that no content delta contains tool-call markup and that the
        ``calc`` name and its JSON arguments both appear in the replies.
        """
        from types import SimpleNamespace

        def delta_msg(content=None, reasoning=None, tool_calls=None):
            # Attribute bag with the three fields this test's parser fills in.
            return SimpleNamespace(
                content=content,
                reasoning=reasoning,
                tool_calls=tool_calls or [],
            )

        class _ToolCallStreamer:
            def __init__(self, tokenizer, tools=None):
                self._emitted = False

            def extract_tool_calls(self, c, request=None):
                raise AssertionError("extract_tool_calls invoked on native-streaming path")

            def extract_tool_calls_streaming(
                self, previous_text, current_text, delta_text,
                previous_token_ids, current_token_ids, delta_token_ids, request,
            ):
                if self._emitted or "</tool_call>" not in current_text:
                    return None
                self._emitted = True
                function = SimpleNamespace(name="calc", arguments='{"x": 1}')
                tool_call = SimpleNamespace(
                    id="call_1", type="function", index=0, function=function,
                )
                return delta_msg(tool_calls=[tool_call])

        servicer = self._new_servicer()
        cumulative = [
            '<tool_call>\n',
            '<tool_call>\n{"name": "calc"',
            '<tool_call>\n{"name": "calc", "arguments": {"x": 1}}\n</tool_call>',
        ]
        servicer.llm = SimpleNamespace(generate=self._make_generate(cumulative))
        servicer.tool_parser_cls = _ToolCallStreamer
        request = backend_pb2.PredictOptions(
            Prompt="x",
            Tools='[{"type":"function","function":{"name":"calc","parameters":{}}}]',
        )
        replies = self._collect(servicer, request)

        contents = []
        names = []
        args = []
        for reply in replies:
            for delta in reply.chat_deltas:
                if delta.content:
                    contents.append(delta.content)
                for tool_call in delta.tool_calls:
                    if tool_call.name:
                        names.append(tool_call.name)
                    if tool_call.arguments:
                        args.append(tool_call.arguments)

        self.assertFalse(
            any("<tool_call" in c or "</tool_call>" in c for c in contents),
            f"markup leaked as content: {contents!r}",
        )
        self.assertIn("calc", names, f"tool_call name missing; got {names!r}")
        self.assertIn('{"x": 1}', args, f"tool_call args missing; got {args!r}")
    # ── Case 5: parser exception → fallback to buffer, no leak ──
    def test_native_streaming_parser_exception_falls_back_to_buffer(self):
        """Case 5: the streaming method raises on every call.

        The fake parser's ``extract_tool_calls_streaming`` always raises
        ``RuntimeError`` while its ``extract_tool_calls`` reports one ``calc``
        call. The test checks that no content delta contains ``<tool_call``
        and that the ``calc`` tool call is still present in the replies.
        """
        from types import SimpleNamespace

        calc_fn = SimpleNamespace(name="calc", arguments='{"x": 1}')
        call = SimpleNamespace(id="call_1", function=calc_fn)

        class _BrokenStreamer:
            def __init__(self, tokenizer, tools=None):
                pass

            def extract_tool_calls_streaming(self, *a, **kw):
                raise RuntimeError("simulated parser bug")

            def extract_tool_calls(self, c, request=None):
                return SimpleNamespace(tools_called=True, content="", tool_calls=[call])

        servicer = self._new_servicer()
        cumulative = [
            '<tool_call>\n{"name": "calc"',
            '<tool_call>\n{"name": "calc", "arguments": {"x": 1}}\n</tool_call>',
        ]
        servicer.llm = SimpleNamespace(generate=self._make_generate(cumulative))
        servicer.tool_parser_cls = _BrokenStreamer
        request = backend_pb2.PredictOptions(
            Prompt="x",
            Tools='[{"type":"function","function":{"name":"calc","parameters":{}}}]',
        )
        replies = self._collect(servicer, request)

        contents = []
        names = []
        for reply in replies:
            for delta in reply.chat_deltas:
                if delta.content:
                    contents.append(delta.content)
                for tool_call in delta.tool_calls:
                    names.append(tool_call.name)

        self.assertFalse(
            any("<tool_call" in c for c in contents),
            f"markup leaked after parser exception: {contents!r}",
        )
        self.assertIn("calc", names, "tool_call missing from final chunk after fallback")
    # ── Case 6: no tool parser → unchanged per-delta content stream ──
    def test_no_tool_parser_unchanged_per_delta_stream(self):
        """Case 6: with no tool parser, each text delta is streamed as content.

        Three cumulative chunks are fed through a servicer whose
        ``tool_parser_cls`` is ``None`` and whose request has an empty
        ``Tools`` field. The content of every reply except the last must be
        exactly the three incremental pieces, in order.
        """
        from types import SimpleNamespace

        # _new_servicer already leaves tool_parser_cls as None.
        servicer = self._new_servicer()
        cumulative = ["Hello ", "Hello world", "Hello world!"]
        servicer.llm = SimpleNamespace(generate=self._make_generate(cumulative))
        request = backend_pb2.PredictOptions(Prompt="x", Tools="")
        replies = self._collect(servicer, request)

        intermediate = []
        for reply in replies[:-1]:
            for delta in reply.chat_deltas:
                if delta.content:
                    intermediate.append(delta.content)

        self.assertEqual(
            intermediate, ["Hello ", "world", "!"],
            f"plain streaming changed; got {intermediate!r}",
        )
--- a/core/application/application.go
+++ b/core/application/application.go
@@ -341,11 +341,9 @@ func (a *Application) ResolvePIIPolicy(cfg *config.ModelConfig) (enabled bool, d
 	}
 	appCfg := a.ApplicationConfig()
-	if cfg.PII.Enabled != nil {
+	// PIIIsEnabled already encodes "explicit pii.enabled wins, else backend
-		enabled = *cfg.PII.Enabled
+	// default (cloud-proxy)" — the single source of that rule.
-	} else {
+	enabled = cfg.PIIIsEnabled()
 		enabled = cfg.PIIIsEnabled() // backend default (cloud-proxy)
 	}
 	if !enabled {
 		return false, nil
 	}
@@ -354,7 +352,7 @@ func (a *Application) ResolvePIIPolicy(cfg *config.ModelConfig) (enabled bool, d
 	if len(detectors) == 0 {
 		detectors = append([]string(nil), appCfg.PIIDefaultDetectors...)
 	}
-	return enabled, detectors
+	return true, detectors // enabled is necessarily true past the !enabled guard
 }
 // PIIPolicyResolver adapts ResolvePIIPolicy to pii.PolicyResolver for
--- a/core/application/distributed.go
+++ b/core/application/distributed.go
@@ -357,6 +357,15 @@ func initDistributed(cfg *config.ApplicationConfig, authDB *gorm.DB, configLoade
 		Pressure:         pressure,
 	})
 	// Wire staging-progress broadcasting so file-staging shows up on every
 	// replica, not just the one performing the transfer. Without this, a
 	// /api/operations poll that round-robins onto a peer sees no staging row and
 	// the progress flickers. The origin publishes; peers mirror via the wildcard.
 	router.StagingTracker().SetPublisher(natsClient)
 	if _, err := router.StagingTracker().SubscribeBroadcasts(natsClient); err != nil {
 		xlog.Warn("Failed to subscribe to staging progress broadcasts", "error", err)
 	}
 	// Create ReplicaReconciler for auto-scaling model replicas. Adapter +
 	// RegistrationToken feed the state-reconciliation passes: pending op
 	// drain uses the adapter, and model health probes use the token to auth
--- a/core/application/startup.go
+++ b/core/application/startup.go
@@ -25,6 +25,7 @@ import (
 	"github.com/mudler/LocalAI/core/services/storage"
 	coreStartup "github.com/mudler/LocalAI/core/startup"
 	"github.com/mudler/LocalAI/internal"
 	"github.com/mudler/LocalAI/pkg/downloader"
 	"github.com/mudler/LocalAI/pkg/signals"
 	"github.com/mudler/LocalAI/pkg/vram"
@@ -71,6 +72,16 @@ func New(opts ...config.AppOption) (*Application, error) {
 	if err != nil {
 		return nil, fmt.Errorf("unable to create ModelPath: %q", err)
 	}
 	// Reap *.partial downloads abandoned by a previous run (killed mid-transfer
 	// by an OOM/restart, or stalled before cleanup could run). The 24h window
 	// is well beyond any legitimate in-flight download, so this never trims an
 	// active transfer; it just stops dead partials accumulating on the volume.
 	if removed, cErr := downloader.CleanupStalePartialFiles(options.SystemState.Model.ModelsPath, 24*time.Hour); cErr != nil {
 		xlog.Warn("Failed to reap stale partial downloads", "error", cErr)
 	} else if removed > 0 {
 		xlog.Info("Reaped stale partial downloads", "count", removed)
 	}
 	if options.GeneratedContentDir != "" {
 		err := os.MkdirAll(options.GeneratedContentDir, 0o750)
 		if err != nil {
@@ -633,6 +644,12 @@ func loadRuntimeSettingsFromFile(options *config.ApplicationConfig) {
 			options.ForceEvictionWhenBusy = *settings.ForceEvictionWhenBusy
 		}
 	}
 	if settings.SizeAwareEviction != nil {
 		// Only apply if current value is default (false), suggesting it wasn't set from env var
 		if !options.SizeAwareEviction {
 			options.SizeAwareEviction = *settings.SizeAwareEviction
 		}
 	}
 	if settings.LRUEvictionMaxRetries != nil {
 		// Only apply if current value is default (30), suggesting it wasn't set from env var
 		if options.LRUEvictionMaxRetries == 0 {
@@ -836,6 +853,7 @@ func initializeWatchdog(application *Application, options *config.ApplicationCon
 			model.WithLRULimit(lruLimit),
 			model.WithMemoryReclaimer(options.MemoryReclaimerEnabled, options.MemoryReclaimerThreshold),
 			model.WithForceEvictionWhenBusy(options.ForceEvictionWhenBusy),
 			model.WithSizeAwareEviction(options.SizeAwareEviction),
 		)
 		application.ModelLoader().SetWatchDog(wd)
--- a/core/application/watchdog.go
+++ b/core/application/watchdog.go
@@ -90,6 +90,7 @@ func (a *Application) startWatchdog() error {
 			model.WithLRULimit(lruLimit),
 			model.WithMemoryReclaimer(appConfig.MemoryReclaimerEnabled, appConfig.MemoryReclaimerThreshold),
 			model.WithForceEvictionWhenBusy(appConfig.ForceEvictionWhenBusy),
 			model.WithSizeAwareEviction(appConfig.SizeAwareEviction),
 		)
 		// Create new stop channel BEFORE setting up any goroutines
--- a/core/backend/options.go
+++ b/core/backend/options.go
@@ -1,6 +1,7 @@
 package backend
 import (
 	"context"
 	"encoding/json"
 	"fmt"
 	"math/rand/v2"
@@ -12,7 +13,9 @@ import (
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/trace"
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 	"github.com/mudler/LocalAI/pkg/downloader"
 	"github.com/mudler/LocalAI/pkg/model"
 	"github.com/mudler/LocalAI/pkg/vram"
 	"github.com/mudler/xlog"
 )
@@ -33,6 +36,67 @@ func recordModelLoadFailure(appConfig *config.ApplicationConfig, modelName, back
 	})
 }
 // estimateModelSizeBytes reports the total weight-file size, in bytes, that
 // vram.EstimateModelMultiContext computes for a model config. It returns 0
 // when there is nothing to estimate, when the estimate fails, or when the
 // estimator reports a zero size.
 //
 // Weight-file candidates are gathered from DownloadFiles, Model and MMProj.
 // The first URI (DownloadFiles first, then Model) from which a HuggingFace
 // repo ID can be extracted is passed along as HFRepo, so the estimator can
 // fall back to the HF API when local file metadata is unavailable (e.g.
 // not-yet-downloaded models).
 func estimateModelSizeBytes(c config.ModelConfig, modelsPath string) int64 {
 	var estimateInput vram.ModelEstimateInput
 	visited := map[string]bool{}
 	// registerWeight queues a weight file exactly once. References without a
 	// scheme are treated as paths relative to modelsPath.
 	registerWeight := func(ref string) {
 		if !vram.IsWeightFile(ref) {
 			return
 		}
 		target := ref
 		if !strings.Contains(ref, "://") {
 			target = "file://" + filepath.Join(modelsPath, ref)
 		}
 		if visited[target] {
 			return
 		}
 		visited[target] = true
 		estimateInput.Files = append(estimateInput.Files, vram.FileInput{URI: target})
 	}
 	// captureHFRepo resolves the reference (huggingface://, hf://, ...) to a
 	// URL and keeps the first org/model repo ID that can be extracted from it.
 	captureHFRepo := func(ref string) {
 		if estimateInput.HFRepo != "" {
 			return
 		}
 		if id, found := vram.ExtractHFRepoID(downloader.URI(ref).ResolveURL()); found {
 			estimateInput.HFRepo = id
 		}
 	}
 	for i := range c.DownloadFiles {
 		ref := string(c.DownloadFiles[i].URI)
 		registerWeight(ref)
 		captureHFRepo(ref)
 	}
 	registerWeight(c.Model)
 	captureHFRepo(c.Model)
 	if c.MMProj != "" {
 		registerWeight(c.MMProj)
 	}
 	nothingToEstimate := len(estimateInput.Files) == 0 && estimateInput.HFRepo == ""
 	if nothingToEstimate {
 		return 0
 	}
 	// Bound the estimate so a slow lookup cannot stall the caller for long.
 	estimateCtx, stop := context.WithTimeout(context.Background(), 10*time.Second)
 	defer stop()
 	estimate, err := vram.EstimateModelMultiContext(estimateCtx, estimateInput, nil)
 	if err != nil {
 		return 0
 	}
 	if estimate.SizeBytes == 0 {
 		return 0
 	}
 	return int64(estimate.SizeBytes)
 }
 func ModelOptions(c config.ModelConfig, so *config.ApplicationConfig, opts ...model.Option) []model.Option {
 	defOpts := []model.Option{
 		model.WithBackendString(c.Backend),
@@ -70,6 +134,10 @@ func ModelOptions(c config.ModelConfig, so *config.ApplicationConfig, opts ...mo
 		defOpts = append(defOpts, model.WithExternalBackend(k, v))
 	}
 	if sizeBytes := estimateModelSizeBytes(c, so.SystemState.Model.ModelsPath); sizeBytes > 0 {
 		defOpts = append(defOpts, model.WithModelSizeBytes(sizeBytes))
 	}
 	return append(defOpts, opts...)
 }
@@ -90,10 +158,11 @@ func getSeed(c config.ModelConfig) int32 {
 // DefaultContextSize and DefaultBatchSize are the backend's fallbacks when a
 // model config leaves them unset. Exported so callers that must respect the
 // effective decode window — notably the router's prompt trimmer — resolve the
-// same numbers grpcModelOpts does instead of guessing.
+// same numbers grpcModelOpts does instead of guessing. The values are owned by
 // core/config (single source of truth shared with the config default tiers).
 const (
-	DefaultContextSize = 4096
+	DefaultContextSize = config.DefaultContextSize
-	DefaultBatchSize   = 512
+	DefaultBatchSize   = config.DefaultPhysicalBatch
 )
 // EffectiveContextSize is the context window the backend will run with: the
@@ -129,7 +198,7 @@ func grpcModelOpts(c config.ModelConfig, modelPath string) *pb.ModelOptions {
 	ctxSize := EffectiveContextSize(c)
 	b := EffectiveBatchSize(c)
-	flashAttention := "auto"
+	flashAttention := config.DefaultFlashAttention
 	if c.FlashAttention != nil {
 		flashAttention = *c.FlashAttention
@@ -175,7 +244,7 @@ func grpcModelOpts(c config.ModelConfig, modelPath string) *pb.ModelOptions {
 		mmlock = *c.MMlock
 	}
-	nGPULayers := 9999999
+	nGPULayers := config.DefaultNGPULayers
 	if c.NGPULayers != nil {
 		nGPULayers = *c.NGPULayers
 	}
--- a/core/backend/sound_classification.go
+++ b/core/backend/sound_classification.go
@@ -0,0 +1,88 @@
 package backend
 import (
 	"context"
 	"fmt"
 	"sort"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/schema"
 	grpcPkg "github.com/mudler/LocalAI/pkg/grpc"
 	"github.com/mudler/LocalAI/pkg/grpc/proto"
 	"github.com/mudler/LocalAI/pkg/model"
 )
 // SoundDetectionRequest groups the parameters the HTTP layer gathers for an
 // audio-tagging / sound-event-classification call.
 type SoundDetectionRequest struct {
 	// Audio is the path to the uploaded clip on disk.
 	Audio string
 	// TopK is optional (0 = backend default).
 	TopK int32
 	// Threshold is optional (0 = backend default).
 	Threshold float32
 }
 // toProto builds the gRPC request message: Audio is sent as Src, while TopK
 // and Threshold are copied through unchanged.
 func (r *SoundDetectionRequest) toProto() *proto.SoundDetectionRequest {
 	wire := new(proto.SoundDetectionRequest)
 	wire.Src = r.Audio
 	wire.TopK = r.TopK
 	wire.Threshold = r.Threshold
 	return wire
 }
 // loadSoundDetectionModel loads the backend for a sound-classification model.
 // It fails fast when the model config names no backend, records a
 // model-load failure when the loader returns an error, and treats a nil
 // backend without an error as a failure too.
 func loadSoundDetectionModel(ml *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig) (grpcPkg.Backend, error) {
 	if modelConfig.Backend == "" {
 		return nil, fmt.Errorf("sound classification: model %q has no backend set; supported backends include ced", modelConfig.Name)
 	}
 	loaded, loadErr := ml.Load(ModelOptions(modelConfig, appConfig)...)
 	switch {
 	case loadErr != nil:
 		recordModelLoadFailure(appConfig, modelConfig.Name, modelConfig.Backend, loadErr, nil)
 		return nil, loadErr
 	case loaded == nil:
 		return nil, fmt.Errorf("could not load sound classification model")
 	}
 	return loaded, nil
 }
 // ModelSoundDetection loads the configured backend, issues the SoundDetection
 // RPC with the given request, and converts the reply into a normalized
 // schema.SoundClassificationResult. Load and RPC errors are returned as-is.
 func ModelSoundDetection(ctx context.Context, req SoundDetectionRequest, ml *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig) (*schema.SoundClassificationResult, error) {
 	backendClient, loadErr := loadSoundDetectionModel(ml, modelConfig, appConfig)
 	if loadErr != nil {
 		return nil, loadErr
 	}
 	wireReq := req.toProto()
 	reply, rpcErr := backendClient.SoundDetection(ctx, wireReq)
 	if rpcErr != nil {
 		return nil, rpcErr
 	}
 	result := soundClassificationResultFromProto(modelConfig.Name, reply)
 	return result, nil
 }
 // soundClassificationResultFromProto converts the backend detections into the
 // HTTP-facing schema. Nil entries are dropped and the result is stably sorted
 // by score, highest first, so equal scores keep the order the backend sent.
 // A nil response yields a result with an empty (non-nil) Detections slice.
 func soundClassificationResultFromProto(modelName string, r *proto.SoundDetectionResponse) *schema.SoundClassificationResult {
 	// Start from an empty, non-nil slice rather than a nil one.
 	detections := []schema.SoundClassification{}
 	if r != nil {
 		for _, det := range r.Detections {
 			if det == nil {
 				continue
 			}
 			entry := schema.SoundClassification{
 				Index: int(det.Index),
 				Label: det.Label,
 				Score: det.Score,
 			}
 			detections = append(detections, entry)
 		}
 		byScoreDesc := func(a, b int) bool {
 			return detections[a].Score > detections[b].Score
 		}
 		sort.SliceStable(detections, byScoreDesc)
 	}
 	return &schema.SoundClassificationResult{
 		Model:      modelName,
 		Detections: detections,
 	}
 }
--- a/core/cli/run.go
+++ b/core/cli/run.go
@@ -93,6 +93,7 @@ type RunCMD struct {
 	EnableMemoryReclaimer              bool     `env:"LOCALAI_MEMORY_RECLAIMER,MEMORY_RECLAIMER,LOCALAI_GPU_RECLAIMER,GPU_RECLAIMER" default:"false" help:"Enable memory threshold monitoring to auto-evict backends when memory usage exceeds threshold (uses GPU VRAM if available, otherwise RAM)" group:"backends"`
 	MemoryReclaimerThreshold           float64  `env:"LOCALAI_MEMORY_RECLAIMER_THRESHOLD,MEMORY_RECLAIMER_THRESHOLD,LOCALAI_GPU_RECLAIMER_THRESHOLD,GPU_RECLAIMER_THRESHOLD" default:"0.95" help:"Memory usage threshold (0.0-1.0) that triggers backend eviction (default 0.95 = 95%%)" group:"backends"`
 	ForceEvictionWhenBusy              bool     `env:"LOCALAI_FORCE_EVICTION_WHEN_BUSY,FORCE_EVICTION_WHEN_BUSY" default:"false" help:"Force eviction even when models have active API calls (default: false for safety)" group:"backends"`
 	SizeAwareEviction                  bool     `env:"LOCALAI_SIZE_AWARE_EVICTION,SIZE_AWARE_EVICTION" default:"false" help:"Evict the largest loaded model first rather than the least-recently-used one, keeping small utility models resident and maximizing freed memory per eviction" group:"backends"`
 	LRUEvictionMaxRetries              int      `env:"LOCALAI_LRU_EVICTION_MAX_RETRIES,LRU_EVICTION_MAX_RETRIES" default:"30" help:"Maximum number of retries when waiting for busy models to become idle before eviction (default: 30)" group:"backends"`
 	LRUEvictionRetryInterval           string   `env:"LOCALAI_LRU_EVICTION_RETRY_INTERVAL,LRU_EVICTION_RETRY_INTERVAL" default:"1s" help:"Interval between retries when waiting for busy models to become idle (e.g., 1s, 2s) (default: 1s)" group:"backends"`
 	Federated                          bool     `env:"LOCALAI_FEDERATED,FEDERATED" help:"Enable federated instance" group:"federated"`
@@ -564,6 +565,9 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
 	if r.ForceEvictionWhenBusy {
 		opts = append(opts, config.WithForceEvictionWhenBusy(true))
 	}
 	if r.SizeAwareEviction {
 		opts = append(opts, config.WithSizeAwareEviction(true))
 	}
 	if r.LRUEvictionMaxRetries > 0 {
 		opts = append(opts, config.WithLRUEvictionMaxRetries(r.LRUEvictionMaxRetries))
 	}
--- a/core/config/application_config.go
+++ b/core/config/application_config.go
@@ -119,6 +119,7 @@ type ApplicationConfig struct {
 	// Eviction settings
 	ForceEvictionWhenBusy    bool          // Force eviction even when models have active API calls (default: false for safety)
 	SizeAwareEviction        bool          // Evict largest models first rather than least-recently-used (default: false)
 	LRUEvictionMaxRetries    int           // Maximum number of retries when waiting for busy models to become idle (default: 30)
 	LRUEvictionRetryInterval time.Duration // Interval between retries when waiting for busy models (default: 1s)
@@ -488,6 +489,16 @@ func WithForceEvictionWhenBusy(enabled bool) AppOption {
 	}
 }
 // WithSizeAwareEviction returns an AppOption that sets the size-aware eviction
 // flag to the given value.
 // With the flag on, the watchdog evicts the largest loaded model first instead
 // of the least-recently-used one, which keeps small utility models resident
 // and maximizes the memory freed per eviction.
 func WithSizeAwareEviction(enabled bool) AppOption {
 	apply := func(cfg *ApplicationConfig) {
 		cfg.SizeAwareEviction = enabled
 	}
 	return apply
 }
 // WithLRUEvictionMaxRetries sets the maximum number of retries when waiting for busy models to become idle
 func WithLRUEvictionMaxRetries(maxRetries int) AppOption {
 	return func(o *ApplicationConfig) {
@@ -1028,6 +1039,7 @@ func (o *ApplicationConfig) ToRuntimeSettings() RuntimeSettings {
 	memoryReclaimerEnabled := o.MemoryReclaimerEnabled
 	memoryReclaimerThreshold := o.MemoryReclaimerThreshold
 	forceEvictionWhenBusy := o.ForceEvictionWhenBusy
 	sizeAwareEviction := o.SizeAwareEviction
 	lruEvictionMaxRetries := o.LRUEvictionMaxRetries
 	threads := o.Threads
 	contextSize := o.ContextSize
@@ -1120,6 +1132,7 @@ func (o *ApplicationConfig) ToRuntimeSettings() RuntimeSettings {
 		MemoryReclaimerEnabled:    &memoryReclaimerEnabled,
 		MemoryReclaimerThreshold:  &memoryReclaimerThreshold,
 		ForceEvictionWhenBusy:     &forceEvictionWhenBusy,
 		SizeAwareEviction:         &sizeAwareEviction,
 		LRUEvictionMaxRetries:     &lruEvictionMaxRetries,
 		LRUEvictionRetryInterval:  &lruEvictionRetryInterval,
 		Threads:                   &threads,
@@ -1244,6 +1257,10 @@ func (o *ApplicationConfig) ApplyRuntimeSettings(settings *RuntimeSettings) (req
 		o.ForceEvictionWhenBusy = *settings.ForceEvictionWhenBusy
 		// This setting doesn't require restart, can be updated dynamically
 	}
 	if settings.SizeAwareEviction != nil {
 		o.SizeAwareEviction = *settings.SizeAwareEviction
 		// This setting doesn't require restart, can be updated dynamically
 	}
 	if settings.LRUEvictionMaxRetries != nil {
 		o.LRUEvictionMaxRetries = *settings.LRUEvictionMaxRetries
 		// This setting doesn't require restart, can be updated dynamically
--- a/core/config/backend_capabilities.go
+++ b/core/config/backend_capabilities.go
@@ -8,27 +8,28 @@ import (
 // Usecase name constants — the canonical string values used in gallery entries,
 // model configs (known_usecases), and UsecaseInfoMap keys.
 const (
-	UsecaseChat               = "chat"
+	UsecaseChat                = "chat"
-	UsecaseCompletion         = "completion"
+	UsecaseCompletion          = "completion"
-	UsecaseEdit               = "edit"
+	UsecaseEdit                = "edit"
-	UsecaseVision             = "vision"
+	UsecaseVision              = "vision"
-	UsecaseEmbeddings         = "embeddings"
+	UsecaseEmbeddings          = "embeddings"
-	UsecaseTokenize           = "tokenize"
+	UsecaseTokenize            = "tokenize"
-	UsecaseImage              = "image"
+	UsecaseImage               = "image"
-	UsecaseVideo              = "video"
+	UsecaseVideo               = "video"
-	UsecaseTranscript         = "transcript"
+	UsecaseTranscript          = "transcript"
-	UsecaseTTS                = "tts"
+	UsecaseTTS                 = "tts"
-	UsecaseSoundGeneration    = "sound_generation"
+	UsecaseSoundGeneration     = "sound_generation"
-	UsecaseRerank             = "rerank"
+	UsecaseRerank              = "rerank"
-	UsecaseDetection          = "detection"
+	UsecaseDetection           = "detection"
-	UsecaseDepth              = "depth"
+	UsecaseDepth               = "depth"
-	UsecaseVAD                = "vad"
+	UsecaseVAD                 = "vad"
-	UsecaseAudioTransform     = "audio_transform"
+	UsecaseAudioTransform      = "audio_transform"
-	UsecaseDiarization        = "diarization"
+	UsecaseDiarization         = "diarization"
-	UsecaseRealtimeAudio      = "realtime_audio"
+	UsecaseSoundClassification = "sound_classification"
-	UsecaseFaceRecognition    = "face_recognition"
+	UsecaseRealtimeAudio       = "realtime_audio"
-	UsecaseSpeakerRecognition = "speaker_recognition"
+	UsecaseFaceRecognition     = "face_recognition"
-	UsecaseTokenClassify      = "token_classify"
+	UsecaseSpeakerRecognition  = "speaker_recognition"
 	UsecaseTokenClassify       = "token_classify"
 )
 // GRPCMethod identifies a Backend service RPC from backend.proto.
@@ -51,6 +52,7 @@ const (
 	MethodVAD                GRPCMethod = "VAD"
 	MethodAudioTransform     GRPCMethod = "AudioTransform"
 	MethodDiarize            GRPCMethod = "Diarize"
 	MethodSoundDetection     GRPCMethod = "SoundDetection"
 	MethodAudioToAudioStream GRPCMethod = "AudioToAudioStream"
 	MethodFaceVerify         GRPCMethod = "FaceVerify"
 	MethodFaceAnalyze        GRPCMethod = "FaceAnalyze"
@@ -165,6 +167,11 @@ var UsecaseInfoMap = map[string]UsecaseInfo{
 		GRPCMethod:  MethodDiarize,
 		Description: "Speaker diarization (who-spoke-when, per-speaker segments) via the Diarize RPC.",
 	},
 	UsecaseSoundClassification: {
 		Flag:        FLAG_SOUND_CLASSIFICATION,
 		GRPCMethod:  MethodSoundDetection,
 		Description: "Sound-event classification / audio tagging (scored AudioSet labels like baby cry, glass breaking, alarms) via the SoundDetection RPC.",
 	},
 	UsecaseRealtimeAudio: {
 		Flag:        FLAG_REALTIME_AUDIO,
 		GRPCMethod:  MethodAudioToAudioStream,
--- a/core/config/defaults.go
+++ b/core/config/defaults.go
@@ -0,0 +1,30 @@
 package config
 // Canonical default values.
 //
 // These are owned here so the two layers that need them share a single source
 // of truth: the config tiers (ApplyInference/Hardware/Serving/Generic — which
 // *decide* defaults) and core/backend/options.go (which *translates* a
 // ModelConfig to the backend wire format and supplies the same fallbacks
 // defensively). Previously these were duplicated as literals across both
 // packages and had drifted (e.g. n_gpu_layers 9999999 vs 99999999, two batch
 // constants of 512). core/backend imports core/config, so backend references
 // these; config never imports backend.
 const (
 	// DefaultContextSize is the fallback context window when none is configured
 	// or estimable from the model.
 	DefaultContextSize = 4096
 	// GGUFFallbackContextSize is the context window for a GGUF model whose
 	// metadata yields no usable estimate (see guessGGUFFromFile). Deliberately
 	// smaller than DefaultContextSize to stay conservative on memory there.
 	GGUFFallbackContextSize = 1024
 	// DefaultNGPULayers means "offload all layers"; the backend (fit_params)
 	// clamps to what actually fits in device memory.
 	DefaultNGPULayers = 99999999
 	// DefaultFlashAttention is the flash-attention mode default; "auto" lets the
 	// backend enable it when the model + backend support it.
 	DefaultFlashAttention = "auto"
 )
--- a/core/config/generic_defaults.go
+++ b/core/config/generic_defaults.go
@@ -0,0 +1,115 @@
 package config
 import "os"
 // ApplyGenericDefaults fills the generic fallback values applied after the
 // higher-priority tiers (ApplyInferenceDefaults for the model family,
 // ApplyHardwareDefaults for the device, ApplyServingDefaults for serving
 // policy): sampling parameters and a few runtime flags. Like the other tiers it
 // only fills values still left unset, so model-family / explicit config wins.
 // A nil cfg is a no-op.
 //
 // Every defaulted field receives its own allocation: no two fields of cfg end
 // up pointing at the same variable, so writing through one pointer later can
 // never change another field.
 func ApplyGenericDefaults(cfg *ModelConfig) {
 	if cfg == nil {
 		return
 	}
 	// boolPtr returns a pointer to a fresh copy of v. Sharing one local
 	// true/false variable between several *bool fields would alias them.
 	boolPtr := func(v bool) *bool { return &v }
 	// https://github.com/ggerganov/llama.cpp/blob/75cd4c77292034ecec587ecb401366f57338f7c0/common/sampling.h#L22
 	defaultTopP := 0.95
 	defaultTopK := 40
 	defaultMinP := 0.0
 	defaultTemp := 0.9
 	// https://github.com/mudler/LocalAI/issues/2780
 	defaultMirostat := 0
 	defaultMirostatTAU := 5.0
 	defaultMirostatETA := 0.1
 	defaultTypicalP := 1.0
 	defaultTFZ := 1.0
 	defaultZero := 0
 	if cfg.Seed == nil {
 		//  random number generator seed
 		defaultSeed := RAND_SEED
 		cfg.Seed = &defaultSeed
 	}
 	// top_k=40 is llama.cpp's sampling default and is wrong for backends whose
 	// native default differs (issue #6632). Only inject it for the llama.cpp
 	// family and the empty/auto backend; leave TopK nil for known non-llama
 	// backends (e.g. mlx, whose intended default is top_k=0) so the wire value
 	// is 0 rather than a silently-changed 40.
 	if cfg.TopK == nil && UsesLlamaSamplerDefaults(cfg.Backend) {
 		cfg.TopK = &defaultTopK
 	}
 	if cfg.MinP == nil {
 		cfg.MinP = &defaultMinP
 	}
 	if cfg.TypicalP == nil {
 		cfg.TypicalP = &defaultTypicalP
 	}
 	if cfg.TFZ == nil {
 		cfg.TFZ = &defaultTFZ
 	}
 	if cfg.MMap == nil {
 		// MMap is enabled by default
 		// Only exception is for Intel GPUs (XPU env var set)
 		cfg.MMap = boolPtr(os.Getenv("XPU") == "")
 	}
 	if cfg.MMlock == nil {
 		// MMlock is disabled by default
 		cfg.MMlock = boolPtr(false)
 	}
 	if cfg.TopP == nil {
 		cfg.TopP = &defaultTopP
 	}
 	if cfg.Temperature == nil {
 		cfg.Temperature = &defaultTemp
 	}
 	if cfg.Maxtokens == nil {
 		cfg.Maxtokens = &defaultZero
 	}
 	if cfg.Mirostat == nil {
 		cfg.Mirostat = &defaultMirostat
 	}
 	if cfg.MirostatETA == nil {
 		cfg.MirostatETA = &defaultMirostatETA
 	}
 	if cfg.MirostatTAU == nil {
 		cfg.MirostatTAU = &defaultMirostatTAU
 	}
 	if cfg.LowVRAM == nil {
 		cfg.LowVRAM = boolPtr(false)
 	}
 	if cfg.Embeddings == nil {
 		cfg.Embeddings = boolPtr(false)
 	}
 	if cfg.Reranking == nil {
 		cfg.Reranking = boolPtr(false)
 	}
 	if cfg.PromptCacheAll == nil {
 		// Match upstream llama.cpp's default (common/common.h: cache_prompt = true)
 		// and let cache_idle_slots / kv_unified actually do useful work; users can
 		// opt out with an explicit `prompt_cache_all: false` in the model YAML.
 		cfg.PromptCacheAll = boolPtr(true)
 	}
 }
--- a/core/config/generic_defaults_test.go
+++ b/core/config/generic_defaults_test.go
@@ -0,0 +1,36 @@
 package config_test
 import (
 	. "github.com/mudler/LocalAI/core/config"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 )
 // Specs for ApplyGenericDefaults: unset fields receive the generic fallbacks,
 // explicitly set fields are preserved, and a nil config is tolerated.
 var _ = Describe("ApplyGenericDefaults (generic fallback tier)", func() {
 	It("fills sampling + runtime fallbacks when unset", func() {
 		cfg := &ModelConfig{} // empty backend uses the llama sampler defaults
 		ApplyGenericDefaults(cfg)
 		// TopP is nil-checked before the pointers below are dereferenced.
 		Expect(cfg.TopP).ToNot(BeNil())
 		Expect(*cfg.TopP).To(Equal(0.95))
 		Expect(*cfg.TopK).To(Equal(40))
 		Expect(*cfg.Temperature).To(Equal(0.9))
 		Expect(*cfg.MMap).To(BeTrue())
 		Expect(*cfg.MMlock).To(BeFalse())
 		Expect(*cfg.PromptCacheAll).To(BeTrue())
 	})
 	It("never overrides explicit values", func() {
 		// Values chosen to differ from the defaults asserted above (40 / 0.95).
 		tk := 7
 		tp := 0.5
 		cfg := &ModelConfig{}
 		cfg.TopK = &tk
 		cfg.TopP = &tp
 		ApplyGenericDefaults(cfg)
 		Expect(*cfg.TopK).To(Equal(7))
 		Expect(*cfg.TopP).To(Equal(0.5))
 	})
 	It("no-ops on nil", func() {
 		// A nil config must return without panicking.
 		Expect(func() { ApplyGenericDefaults(nil) }).ToNot(Panic())
 	})
 })
--- a/core/config/gguf.go
+++ b/core/config/gguf.go
@@ -14,11 +14,6 @@ import (
 	"github.com/gpustack/gguf-parser-go/util/ptr"
 )
 const (
 	defaultContextSize = 1024
 	defaultNGPULayers  = 99999999
 )
 // reservedNonChatModel reports whether the operator reserved this model for an
 // internal primitive — the router score classifier or the PII NER
 // token_classify tier. Such a model has no chat template and must not be
@@ -38,7 +33,7 @@ func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) {
 			cSize := int(ctxSize)
 			cfg.ContextSize = &cSize
 		} else {
-			defaultCtx = defaultContextSize
+			defaultCtx = GGUFFallbackContextSize
 			cfg.ContextSize = &defaultCtx
 		}
 	}
@@ -52,7 +47,7 @@ func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) {
 	if cfg.NGPULayers == nil {
 		// we assume we want to offload all layers
-		defaultHigh := defaultNGPULayers
+		defaultHigh := DefaultNGPULayers
 		cfg.NGPULayers = &defaultHigh
 	}
--- a/core/config/hardware_defaults.go
+++ b/core/config/hardware_defaults.go
@@ -0,0 +1,180 @@
 package config
 import (
 	"fmt"
 	"strconv"
 	"strings"
 	"github.com/mudler/LocalAI/pkg/xsysinfo"
 	"github.com/mudler/xlog"
 )
 // Hardware-driven model-config defaults.
 //
 // This sits alongside the other config overriders (ApplyInferenceDefaults for
 // model families, guessDefaultsFromFile for GGUF/NGPULayers): they all
 // heuristically fill ModelConfig values the user left unset. Hardware tuning is
 // the same domain — "adjust the config from the device that will run it" — so
 // it lives here rather than scattered into the backend or a separate package.
 //
 // The heuristics are parameterized on a GPU descriptor (not on direct
 // detection) so they apply in both deployment shapes: SetDefaults passes the
 // LocalGPU on a single host, and the distributed router passes the *selected
 // node's* reported GPU before loading there (the frontend that loaded the
 // config may have no GPU at all).
 // GPU describes the device that will run a model.
 type GPU struct {
 	// Vendor is "nvidia", "amd", … (matches xsysinfo vendor constants).
 	Vendor string
 	// ComputeCapability is the NVIDIA compute capability as "major.minor"
 	// (e.g. "12.1" for GB10 / DGX Spark). Empty for non-NVIDIA / unknown.
 	ComputeCapability string
 	// VRAM is total device memory in bytes (0 = unknown).
 	VRAM uint64
 }
 // Physical batch (n_batch / n_ubatch) defaults.
 const (
 	// DefaultPhysicalBatch is the conservative default when no hardware-specific
 	// tuning applies. core/backend.DefaultBatchSize references this (single source).
 	DefaultPhysicalBatch = 512
 	// BlackwellPhysicalBatch is the default on NVIDIA Blackwell consumer GPUs
 	// (sm_12x: sm_120 RTX 50-series, sm_121 GB10 / DGX Spark). A larger physical
 	// batch materially lifts MoE prefill there (per-expert GEMM tiles fill
 	// better); measured on a GB10 with Qwen3-30B-A3B to saturate around 2048.
 	BlackwellPhysicalBatch = 2048
 )
 // IsNVIDIABlackwell reports whether the GPU belongs to the NVIDIA Blackwell
 // consumer family (sm_12x). The decision rests solely on the major compute
 // capability being 12 or higher; Vendor is not consulted. Datacenter Blackwell
 // (B100/B200/GB200, sm_100 / cc 10.0) reports a different compute capability
 // and is intentionally not matched. An empty or unparsable capability yields
 // false.
 func (g GPU) IsNVIDIABlackwell() bool {
 	const blackwellMinMajor = 12
 	major, _ := parseComputeCapability(g.ComputeCapability)
 	return major >= blackwellMinMajor
 }
 // PhysicalBatch picks the canonical physical batch (n_batch/n_ubatch) for the
 // given hardware, for use when the model config leaves batch unset:
 // BlackwellPhysicalBatch on a Blackwell GPU, DefaultPhysicalBatch otherwise.
 func PhysicalBatch(g GPU) int {
 	batch := DefaultPhysicalBatch
 	if g.IsNVIDIABlackwell() {
 		batch = BlackwellPhysicalBatch
 	}
 	return batch
 }
 // IsManagedPhysicalBatch reports whether n is one of the values PhysicalBatch
 // can assign (DefaultPhysicalBatch or BlackwellPhysicalBatch). Callers that
 // re-tune a value chosen by an upstream host (the distributed router
 // correcting the frontend's guess) use it to avoid clobbering an explicit user
 // batch such as 1024.
 func IsManagedPhysicalBatch(n int) bool {
 	switch n {
 	case DefaultPhysicalBatch, BlackwellPhysicalBatch:
 		return true
 	}
 	return false
 }
 // Parallel-slot (n_parallel) VRAM tiers. llama.cpp serializes requests at
 // n_parallel=1 (the backend default) and only auto-enables continuous batching
 // when n_parallel > 1 — so a single-slot default makes concurrent requests
 // queue. We default a slot count by GPU size so multi-user serving works out of
 // the box. With the backend's unified KV cache the slots SHARE the context
 // budget, so more slots add concurrency without multiplying KV memory.
 const (
 	parallelSlotsVRAMHigh = uint64(32) << 30 // >=32 GiB -> 8 slots
 	parallelSlotsVRAMMid  = uint64(8) << 30  // >=8 GiB  -> 4 slots
 	parallelSlotsVRAMLow  = uint64(4) << 30  // >=4 GiB  -> 2 slots
 )
 // DefaultParallelSlots maps the GPU's total VRAM to an n_parallel default:
 // 8 slots from the high tier up, 4 from the mid tier, 2 from the low tier.
 // Below the low tier, which includes unknown VRAM (0), it returns 1 (no
 // concurrency), so behavior never changes on CPU-only / tiny devices.
 func DefaultParallelSlots(g GPU) int {
 	vram := g.VRAM
 	if vram >= parallelSlotsVRAMHigh {
 		return 8
 	}
 	if vram >= parallelSlotsVRAMMid {
 		return 4
 	}
 	if vram >= parallelSlotsVRAMLow {
 		return 2
 	}
 	return 1
 }
 // EnsureParallelOption returns opts, extended with a VRAM-scaled "parallel:N"
 // backend option when the GPU warrants more than one slot and the model does
 // not already set parallel/n_parallel. Otherwise opts is returned unchanged.
 // Shared by the single-host config path (ApplyHardwareDefaults) and the
 // distributed router (per selected node).
 func EnsureParallelOption(opts []string, gpu GPU) []string {
 	slots := DefaultParallelSlots(gpu)
 	// Nothing to add for a single slot, and an explicit setting always wins.
 	if slots <= 1 || hasParallelOption(opts) {
 		return opts
 	}
 	parallelOpt := fmt.Sprintf("parallel:%d", slots)
 	return append(opts, parallelOpt)
 }
 // hasParallelOption reports whether opts already carries a "parallel" or
 // "n_parallel" backend option, as decided by backendOptionSet. Callers use it
 // so an explicit value is never overridden (helper shared with
 // serving_defaults.go).
 func hasParallelOption(opts []string) bool {
 	return backendOptionSet(opts, "parallel", "n_parallel")
 }
 // localGPU assembles a GPU descriptor from local detection. SetDefaults uses
 // it on a single host; the distributed router builds the descriptor from the
 // selected node's reported info instead. It is a package var so tests can
 // inject a deterministic device — detection does a live nvidia-smi call.
 var localGPU = func() GPU {
 	var dev GPU
 	// Detection errors are discarded: whatever value the detector returned
 	// alongside the error is used as-is.
 	dev.Vendor, _ = xsysinfo.DetectGPUVendor()
 	dev.VRAM, _ = xsysinfo.TotalAvailableVRAM()
 	dev.ComputeCapability = xsysinfo.NVIDIAComputeCapability()
 	return dev
 }
 // ApplyHardwareDefaults fills ModelConfig values that depend on the target GPU
 // and were left unset by the user. It applies two defaults:
 //   - a larger physical batch (BlackwellPhysicalBatch) on Blackwell when
 //     cfg.Batch is still zero;
 //   - a VRAM-scaled "parallel:N" backend option when the options do not
 //     already set parallel/n_parallel and the GPU warrants more than one slot.
 //
 // Explicit config always wins. A nil cfg is a no-op.
 func ApplyHardwareDefaults(cfg *ModelConfig, gpu GPU) {
 	if cfg == nil {
 		return
 	}
 	if cfg.Batch == 0 && gpu.IsNVIDIABlackwell() {
 		cfg.Batch = BlackwellPhysicalBatch
 		xlog.Debug("[hardware_defaults] Blackwell GPU: defaulting physical batch",
 			"batch", cfg.Batch, "compute_cap", gpu.ComputeCapability)
 	}
 	// Enable concurrent serving by default on a capable GPU: without this the
 	// llama.cpp backend runs n_parallel=1 and serializes multi-user requests
 	// (continuous batching stays off). Unified KV means the slots share the
 	// context budget, so this is concurrency without extra KV memory. Explicit
 	// parallel/n_parallel in the model options always wins.
 	before := len(cfg.Options)
 	cfg.Options = EnsureParallelOption(cfg.Options, gpu)
 	// The slice only grows when a default was appended; log that one entry.
 	if len(cfg.Options) > before {
 		xlog.Debug("[hardware_defaults] defaulting parallel slots for concurrent serving",
 			"option", cfg.Options[len(cfg.Options)-1], "vram_gib", gpu.VRAM>>30)
 	}
 }
 // parseComputeCapability breaks a "major.minor" compute-capability string into
 // its integer parts, ignoring surrounding whitespace around the whole string
 // and around each part.
 // It returns (-1, -1) when the input is blank or the major part is not an
 // integer. A missing or non-integer minor part is reported as 0.
 func parseComputeCapability(cc string) (int, int) {
 	trimmed := strings.TrimSpace(cc)
 	if trimmed == "" {
 		return -1, -1
 	}
 	// Split on the first dot only; with no dot the minor part is empty and
 	// falls back to 0 below.
 	majorPart, minorPart, _ := strings.Cut(trimmed, ".")
 	major, err := strconv.Atoi(strings.TrimSpace(majorPart))
 	if err != nil {
 		return -1, -1
 	}
 	minor, err := strconv.Atoi(strings.TrimSpace(minorPart))
 	if err != nil {
 		return major, 0
 	}
 	return major, minor
 }
--- a/core/config/hardware_defaults_internal_test.go
+++ b/core/config/hardware_defaults_internal_test.go
@@ -0,0 +1,37 @@
 package config
 import (
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 )
 // Single-instance path: SetDefaults derives hardware defaults from the local
 // GPU. The package-level detection seam (localGPU) is replaced per spec and
 // restored afterwards, so the outcome is deterministic without a real GPU.
 var _ = Describe("SetDefaults hardware defaults (single-instance)", func() {
 	var savedDetector func() GPU
 	// stubLocalGPU makes localGPU report a GPU with the given compute capability.
 	stubLocalGPU := func(computeCap string) {
 		localGPU = func() GPU { return GPU{ComputeCapability: computeCap} }
 	}
 	BeforeEach(func() {
 		savedDetector = localGPU
 	})
 	AfterEach(func() {
 		localGPU = savedDetector
 	})
 	It("sets the physical batch on a local Blackwell GPU", func() {
 		stubLocalGPU("12.1")
 		modelCfg := &ModelConfig{}
 		modelCfg.SetDefaults()
 		Expect(modelCfg.Batch).To(Equal(BlackwellPhysicalBatch))
 	})
 	It("leaves batch unset on a non-Blackwell local GPU", func() {
 		stubLocalGPU("8.9")
 		modelCfg := &ModelConfig{}
 		modelCfg.SetDefaults()
 		Expect(modelCfg.Batch).To(Equal(0))
 	})
 	It("never overrides an explicit batch", func() {
 		stubLocalGPU("12.1")
 		modelCfg := &ModelConfig{}
 		modelCfg.Batch = 1024
 		modelCfg.SetDefaults()
 		Expect(modelCfg.Batch).To(Equal(1024))
 	})
 })
--- a/core/config/hardware_defaults_test.go
+++ b/core/config/hardware_defaults_test.go
@@ -0,0 +1,97 @@
 package config_test
 import (
 	. "github.com/mudler/LocalAI/core/config"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 )
 // Specs for the exported hardware-default helpers of the config package:
 // Blackwell detection, physical-batch selection, VRAM-scaled parallel slots,
 // and ApplyHardwareDefaults, which combines them on a ModelConfig.
 var _ = Describe("Hardware-driven config defaults", func() {
 	// Detection is driven purely by the compute-capability string: the 12.x
 	// entries and 13.0 are expected to match, while 10.0 (datacenter
 	// Blackwell), 9.0, 8.9 and an empty string are not.
 	DescribeTable("GPU.IsNVIDIABlackwell (sm_12x consumer family)",
 		func(cc string, want bool) {
 			Expect(GPU{ComputeCapability: cc}.IsNVIDIABlackwell()).To(Equal(want))
 		},
 		Entry("GB10 12.1", "12.1", true),
 		Entry("RTX 50 12.0", "12.0", true),
 		Entry("future 13.0", "13.0", true),
 		Entry("Hopper 9.0", "9.0", false),
 		Entry("Ada 8.9", "8.9", false),
 		Entry("datacenter Blackwell sm_100 10.0", "10.0", false),
 		Entry("unknown", "", false),
 	)
 	Describe("PhysicalBatch / IsManagedPhysicalBatch", func() {
 		It("returns the Blackwell batch on Blackwell", func() {
 			Expect(PhysicalBatch(GPU{ComputeCapability: "12.1"})).To(Equal(BlackwellPhysicalBatch))
 		})
 		It("returns the default batch otherwise", func() {
 			Expect(PhysicalBatch(GPU{ComputeCapability: "9.0"})).To(Equal(DefaultPhysicalBatch))
 			// A zero-value GPU (nothing detected) also gets the default batch.
 			Expect(PhysicalBatch(GPU{})).To(Equal(DefaultPhysicalBatch))
 		})
 		It("recognizes managed defaults but not explicit values", func() {
 			Expect(IsManagedPhysicalBatch(DefaultPhysicalBatch)).To(BeTrue())
 			Expect(IsManagedPhysicalBatch(BlackwellPhysicalBatch)).To(BeTrue())
 			// 1024 stands in for a user-chosen batch that is neither managed default.
 			Expect(IsManagedPhysicalBatch(1024)).To(BeFalse())
 		})
 	})
 	// Batch handling: only a zero (unset) Batch may be changed, and only on Blackwell.
 	Describe("ApplyHardwareDefaults", func() {
 		It("raises an unset batch to 2048 on Blackwell", func() {
 			cfg := &ModelConfig{}
 			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1"})
 			Expect(cfg.Batch).To(Equal(BlackwellPhysicalBatch))
 		})
 		It("leaves batch unset on non-Blackwell", func() {
 			cfg := &ModelConfig{}
 			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "9.0"})
 			Expect(cfg.Batch).To(Equal(0))
 		})
 		It("never overrides an explicit batch", func() {
 			cfg := &ModelConfig{}
 			cfg.Batch = 1024
 			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1"})
 			Expect(cfg.Batch).To(Equal(1024))
 		})
 		It("no-ops on nil", func() {
 			Expect(func() { ApplyHardwareDefaults(nil, GPU{ComputeCapability: "12.1"}) }).ToNot(Panic())
 		})
 	})
 	// gib is the number of bytes in one GiB; the specs below multiply by it
 	// to build GPU.VRAM values from GiB counts.
 	const gib = uint64(1) << 30
 	// Expected slot tiers pinned by the entries: 48 GiB and up -> 8,
 	// 8-24 GiB -> 4, 6 GiB -> 2, 2 GiB or unknown (0) -> 1.
 	DescribeTable("DefaultParallelSlots (by VRAM)",
 		func(vramGiB uint64, want int) {
 			Expect(DefaultParallelSlots(GPU{VRAM: vramGiB * gib})).To(Equal(want))
 		},
 		Entry("GB10 119 GiB", uint64(119), 8),
 		Entry("48 GiB", uint64(48), 8),
 		Entry("24 GiB", uint64(24), 4),
 		Entry("8 GiB", uint64(8), 4),
 		Entry("6 GiB", uint64(6), 2),
 		Entry("2 GiB", uint64(2), 1),
 		Entry("unknown 0", uint64(0), 1),
 	)
 	// The parallel default is expressed as a "parallel:<n>" entry in cfg.Options.
 	Describe("ApplyHardwareDefaults parallel slots", func() {
 		It("adds a VRAM-scaled parallel option on a capable GPU", func() {
 			cfg := &ModelConfig{}
 			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1", VRAM: 119 * gib})
 			Expect(cfg.Options).To(ContainElement("parallel:8"))
 		})
 		It("scales the slot count down with VRAM", func() {
 			cfg := &ModelConfig{}
 			ApplyHardwareDefaults(cfg, GPU{VRAM: 24 * gib})
 			Expect(cfg.Options).To(ContainElement("parallel:4"))
 		})
 		It("adds no parallel option on small/unknown VRAM", func() {
 			cfg := &ModelConfig{}
 			ApplyHardwareDefaults(cfg, GPU{VRAM: 2 * gib})
 			Expect(cfg.Options).ToNot(ContainElement(ContainSubstring("parallel")))
 		})
 		It("never overrides an explicit parallel option", func() {
 			cfg := &ModelConfig{Options: []string{"parallel:2"}}
 			ApplyHardwareDefaults(cfg, GPU{VRAM: 119 * gib})
 			// The options slice must come back exactly as the user wrote it.
 			Expect(cfg.Options).To(Equal([]string{"parallel:2"}))
 		})
 	})
 })
--- a/core/config/hooks_llamacpp.go
+++ b/core/config/hooks_llamacpp.go
@@ -34,7 +34,7 @@ func llamaCppDefaults(cfg *ModelConfig, modelPath string) {
 	// Default context size if not set, regardless of whether GGUF parsing succeeds
 	defer func() {
 		if cfg.ContextSize == nil {
-			ctx := defaultContextSize
+			ctx := GGUFFallbackContextSize
 			cfg.ContextSize = &ctx
 		}
 	}()
--- a/core/config/meta/constants.go
+++ b/core/config/meta/constants.go
@@ -68,6 +68,7 @@ var UsecaseOptions = []FieldOption{
 	{Value: "face_recognition", Label: "Face Recognition"},
 	{Value: "transcript", Label: "Transcript"},
 	{Value: "diarization", Label: "Diarization"},
 	{Value: "sound_classification", Label: "Sound Classification"},
 	{Value: "speaker_recognition", Label: "Speaker Recognition"},
 	{Value: "tts", Label: "TTS"},
 	{Value: "sound_generation", Label: "Sound Generation"},
--- a/core/config/meta/registry.go
+++ b/core/config/meta/registry.go
@@ -286,6 +286,15 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Order:       45,
 		},
 		// --- Alias ---
 		"alias": {
 			Section:     "alias",
 			Label:       "Alias target",
 			Description: "Redirect all traffic for this model to another configured model. When set, every other field on this config is ignored and requests are served by the target model.",
 			Component:   "model-select",
 			Order:       0,
 		},
 		// --- Pipeline ---
 		"pipeline.llm": {
 			Section:              "pipeline",
@@ -319,6 +328,30 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			AutocompleteProvider: ProviderModelsVAD,
 			Order:                63,
 		},
 		"pipeline.sound_detection": {
 			Section:              "pipeline",
 			Label:                "Sound Detection Model",
 			Description:          "Model to use for sound-event classification (audio tagging, e.g. ced) in the pipeline. When set, committed realtime audio is also classified and the scored AudioSet tags are emitted as a conversation.item.sound_detection event.",
 			Component:            "model-select",
 			AutocompleteProvider: ProviderModels,
 			Order:                64,
 		},
 		"pipeline.sound_detection_window_ms": {
 			Section:     "pipeline",
 			Label:       "Sound Detection Window (ms)",
 			Description: "Server-side windowing for a sound-only realtime session: length in ms of the audio window classified each hop. 0 = client-driven (the client commits windows).",
 			Component:   "number",
 			Min:         f64(0),
 			Order:       65,
 		},
 		"pipeline.sound_detection_hop_ms": {
 			Section:     "pipeline",
 			Label:       "Sound Detection Hop (ms)",
 			Description: "Server-side windowing hop in ms: how often the server classifies the last window. 0 = client-driven.",
 			Component:   "number",
 			Min:         f64(0),
 			Order:       66,
 		},
 		"pipeline.reasoning_effort": {
 			Section:     "pipeline",
 			Label:       "Reasoning Effort",
@@ -448,6 +481,55 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Component:   "json-editor",
 			Order:       78,
 		},
 		"pipeline.voice_recognition.enforce": {
 			Section:     "pipeline",
 			Label:       "Voice Gate Enforce",
 			Description: "Whether the gate rejects unauthorized speakers. Enabled (default) drops unauthorized utterances before the LLM. Disabled still resolves and surfaces the speaker (for the conversation.item.speaker event and personalization) but never drops a turn.",
 			Component:   "toggle",
 			Order:       80,
 		},
 		"pipeline.voice_recognition.identity.announce": {
 			Section:     "pipeline",
 			Label:       "Speaker Identity Announce",
 			Description: "Emit a conversation.item.speaker event to the client naming the recognized speaker. When set, identity is resolved on every turn even if 'when' is 'first'.",
 			Component:   "toggle",
 			Order:       81,
 		},
 		"pipeline.voice_recognition.identity.announce_unknown": {
 			Section:     "pipeline",
 			Label:       "Speaker Identity Announce Unknown",
 			Description: "Also emit the conversation.item.speaker event (with matched=false) when no confident match is found. Default only announces on a match.",
 			Component:   "toggle",
 			Order:       82,
 		},
 		"pipeline.voice_recognition.identity.personalize": {
 			Section:     "pipeline",
 			Label:       "Speaker Identity Personalize",
 			Description: "Inform the LLM who is speaking so it can tailor replies. Enables the name and system-note injection below.",
 			Component:   "toggle",
 			Order:       83,
 		},
 		"pipeline.voice_recognition.identity.inject_name": {
 			Section:     "pipeline",
 			Label:       "Speaker Identity Inject Name",
 			Description: "Personalization: set the per-message OpenAI 'name' field on each user turn to the recognized speaker.",
 			Component:   "toggle",
 			Order:       84,
 		},
 		"pipeline.voice_recognition.identity.inject_system_note": {
 			Section:     "pipeline",
 			Label:       "Speaker Identity Inject System Note",
 			Description: "Personalization: append a 'The current speaker is <name>.' note to the system message reflecting the latest speaker.",
 			Component:   "toggle",
 			Order:       85,
 		},
 		"pipeline.voice_recognition.identity.note_unknown": {
 			Section:     "pipeline",
 			Label:       "Speaker Identity Note Unknown",
 			Description: "Personalization: when the speaker is unidentified, append 'The current speaker is unknown.' to the system message so the model can ask who it is talking to.",
 			Component:   "toggle",
 			Order:       86,
 		},
 		"pipeline.max_history_items": {
 			Section:     "pipeline",
 			Label:       "Max History Items",
@@ -455,6 +537,36 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Component:   "number",
 			Order:       79,
 		},
 		"pipeline.compaction.enabled": {
 			Section:     "pipeline",
 			Label:       "Compaction Enabled",
 			Description: "Fold conversation items that age out of the live window (Max History Items) into a rolling summary instead of dropping them, so long realtime sessions stay cheap without losing earlier context. Off by default.",
 			Component:   "toggle",
 			Order:       80,
 		},
 		"pipeline.compaction.trigger_items": {
 			Section:     "pipeline",
 			Label:       "Compaction Trigger Items",
 			Description: "High-water mark: once the live conversation exceeds this many items, the overflow above Max History Items is summarized and evicted. Must be greater than Max History Items; defaults to twice it. The gap controls how often summarization runs.",
 			Component:   "number",
 			Order:       81,
 		},
 		"pipeline.compaction.summary_model": {
 			Section:     "pipeline",
 			Label:       "Compaction Summary Model",
 			Description: "Optional smaller/cheaper model used to produce the rolling summary. Empty reuses the pipeline's own LLM. On CPU, a tiny model here keeps compaction from competing with the conversation LLM.",
 			Component:   "input",
 			Advanced:    true,
 			Order:       82,
 		},
 		"pipeline.compaction.max_summary_tokens": {
 			Section:     "pipeline",
 			Label:       "Compaction Max Summary Tokens",
 			Description: "Advisory cap on the rolling summary length (fed to the summarizer prompt). Defaults to 512.",
 			Component:   "number",
 			Advanced:    true,
 			Order:       83,
 		},
 		// --- Functions ---
 		"function.grammar.parallel_calls": {
--- a/core/config/meta/registry_test.go
+++ b/core/config/meta/registry_test.go
@@ -0,0 +1,28 @@
 package meta_test
 import (
 	"github.com/mudler/LocalAI/core/config/meta"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 )
 // Specs pinning the editor metadata for the alias field: it needs a registry
 // override rendered as a model picker, and a dedicated "alias" section.
 var _ = Describe("alias field metadata", func() {
 	It("registers the alias field as a model-select in the alias section", func() {
 		override, registered := meta.DefaultRegistry()["alias"]
 		Expect(registered).To(BeTrue(), "alias field should have a registry override")
 		Expect(override.Section).To(Equal("alias"))
 		Expect(override.Component).To(Equal("model-select"))
 	})
 	It("defines an alias section", func() {
 		hasAliasSection := false
 		for _, section := range meta.DefaultSections() {
 			if section.ID == "alias" {
 				hasAliasSection = true
 				break
 			}
 		}
 		Expect(hasAliasSection).To(BeTrue(), "DefaultSections should include an alias section")
 	})
 })
--- a/core/config/meta/types.go
+++ b/core/config/meta/types.go
@@ -69,6 +69,7 @@ type FieldMetaOverride struct {
 func DefaultSections() []Section {
 	return []Section{
 		{ID: "general", Label: "General", Icon: "settings", Order: 0},
 		{ID: "alias", Label: "Alias", Icon: "git-merge", Order: 5},
 		{ID: "llm", Label: "LLM", Icon: "cpu", Order: 10},
 		{ID: "parameters", Label: "Parameters", Icon: "sliders", Order: 20},
 		{ID: "templates", Label: "Templates", Icon: "file-text", Order: 30},
--- a/core/config/model_config.go
+++ b/core/config/model_config.go
@@ -37,6 +37,12 @@ type ModelConfig struct {
 	schema.PredictionOptions `yaml:"parameters,omitempty" json:"parameters,omitempty"`
 	Name                     string `yaml:"name,omitempty" json:"name,omitempty"`
 	// Alias, when set, makes this config a pure redirect: every request for
 	// Name is served by the model named here. All other fields are ignored.
 	// The target must be an existing, non-alias model (enforced at load and
 	// at create/swap time). See docs/content for Model Aliases.
 	Alias string `yaml:"alias,omitempty" json:"alias,omitempty"`
 	F16                 *bool               `yaml:"f16,omitempty" json:"f16,omitempty"`
 	Threads             *int                `yaml:"threads,omitempty" json:"threads,omitempty"`
 	Debug               *bool               `yaml:"debug,omitempty" json:"debug,omitempty"`
@@ -391,6 +397,10 @@ func (c *ModelConfig) HasRouter() bool {
 	return len(c.Router.Candidates) > 0
 }
 // IsAlias reports whether this config merely redirects to another model,
 // i.e. whether its Alias field is non-empty. It uses a value receiver so it
 // can also be called on config values that are not addressable.
 func (c ModelConfig) IsAlias() bool {
 	return len(c.Alias) > 0
 }
 // @Description PII filtering configuration. PII redaction is per-model so
 // that local models don't pay the latency or behaviour change of regex
 // scanning, while cloud-bound traffic (cloud-proxy backend) can default to
@@ -594,6 +604,20 @@ type Pipeline struct {
 	LLM           string `yaml:"llm,omitempty" json:"llm,omitempty"`
 	Transcription string `yaml:"transcription,omitempty" json:"transcription,omitempty"`
 	VAD           string `yaml:"vad,omitempty" json:"vad,omitempty"`
 	// SoundDetection names a sound-event-classification model (e.g. ced). When
 	// set, each VAD-committed realtime utterance is also run through it and the
 	// scored AudioSet tags are emitted as a conversation.item.sound_detection
 	// server event, alongside (and independent of) transcription.
 	SoundDetection string `yaml:"sound_detection,omitempty" json:"sound_detection,omitempty"`
 	// SoundDetectionWindowMs / SoundDetectionHopMs enable server-side windowing
 	// for a sound-detection-only realtime session: instead of the client
 	// committing audio buffers, the server classifies the last WindowMs of
 	// streamed audio every HopMs and emits a sound_detection event per hop. Both
 	// must be > 0 to activate; otherwise the session stays client-driven (the
 	// client commits windows via input_audio_buffer.commit).
 	SoundDetectionWindowMs int `yaml:"sound_detection_window_ms,omitempty" json:"sound_detection_window_ms,omitempty"`
 	SoundDetectionHopMs    int `yaml:"sound_detection_hop_ms,omitempty" json:"sound_detection_hop_ms,omitempty"`
 	// ReasoningEffort sets the reasoning effort (none|minimal|low|medium|high) for
 	// the pipeline's LLM without editing the LLM model config. Overrides the LLM's
@@ -617,11 +641,32 @@ type Pipeline struct {
 	// context fills.
 	MaxHistoryItems *int `yaml:"max_history_items,omitempty" json:"max_history_items,omitempty"`
 	// Compaction folds conversation items that age out of the live window
 	// (max_history_items) into a rolling summary instead of dropping them, so
 	// long realtime sessions stay cheap without losing earlier context. Nil
 	// (block absent) means disabled, preserving existing behavior.
 	Compaction *PipelineCompaction `yaml:"compaction,omitempty" json:"compaction,omitempty"`
 	// VoiceRecognition gates the pipeline behind speaker verification. Nil
 	// (block absent) means no gate, preserving existing behavior.
 	VoiceRecognition *PipelineVoiceRecognition `yaml:"voice_recognition,omitempty" json:"voice_recognition,omitempty"`
 }
 // PipelineCompaction configures summarize-then-drop for a realtime pipeline.
 // It is referenced through Pipeline.Compaction, where a nil pointer (block
 // absent from the config) means compaction is disabled. Every field is tagged
 // omitempty, so zero values are left out when the config is serialized.
 // NOTE(review): the defaults and clamping described on the fields are applied
 // by the code that consumes this struct, which is not visible here — confirm
 // them against that code.
 type PipelineCompaction struct {
 	// Enabled turns summarize-then-drop on. Default false.
 	Enabled bool `yaml:"enabled,omitempty" json:"enabled,omitempty"`
 	// TriggerItems is the high-water mark: once live items exceed it, overflow
 	// above max_history_items is summarized and evicted. Must exceed
 	// max_history_items; clamped up if not. Default: 2x max_history_items.
 	TriggerItems int `yaml:"trigger_items,omitempty" json:"trigger_items,omitempty"`
 	// SummaryModel optionally names a smaller/cheaper model for the summary
 	// call. Empty uses the pipeline's own LLM.
 	SummaryModel string `yaml:"summary_model,omitempty" json:"summary_model,omitempty"`
 	// MaxSummaryTokens advises the summary length (fed to the prompt). Default 512.
 	// It is advisory only: the value is passed to the summarizer prompt.
 	MaxSummaryTokens int `yaml:"max_summary_tokens,omitempty" json:"max_summary_tokens,omitempty"`
 }
 // ApplyReasoningEffort resolves the effective reasoning effort — a per-request
 // value (requestEffort) overrides the config's own ReasoningEffort default —
 // stores it on the config so gRPCPredictOpts forwards it to the backend as the
@@ -759,6 +804,13 @@ type PipelineVoiceRecognition struct {
 	Allow VoiceRecognitionAllow `yaml:"allow,omitempty" json:"allow,omitempty"`
 	// References are the authorized reference speakers (verify mode).
 	References []VoiceReference `yaml:"references,omitempty" json:"references,omitempty"`
 	// Enforce controls the authorization gate. A nil value or true rejects
 	// unauthorized speakers (the historical behavior). false resolves the
 	// speaker's identity for surfacing/personalization but never drops a turn.
 	Enforce *bool `yaml:"enforce,omitempty" json:"enforce,omitempty"`
 	// Identity surfaces the recognized speaker to the client and the LLM. It is
 	// independent of Enforce: identity can be surfaced without gating.
 	Identity *VoiceIdentityConfig `yaml:"identity,omitempty" json:"identity,omitempty"`
 }
 // @Description VoiceRecognitionAllow filters authorized registry identities.
@@ -775,6 +827,25 @@ type VoiceReference struct {
 	Audio string `yaml:"audio,omitempty" json:"audio,omitempty"`
 }
 // @Description VoiceIdentityConfig surfaces the recognized speaker to the realtime
 // client and the LLM. When set, identity is resolved on every turn even if the
 // gate's When is "first" (the gate still authorizes only once).
 // Note that PipelineVoiceRecognition.IdentityEnabled only reports true when
 // Announce or Personalize is set; a block with neither is treated as inactive
 // by that helper.
 type VoiceIdentityConfig struct {
 	// Announce emits a conversation.item.speaker event to the client.
 	Announce bool `yaml:"announce,omitempty" json:"announce,omitempty"`
 	// AnnounceUnknown also emits the event when there is no confident match.
 	// NOTE(review): presumably only meaningful together with Announce — confirm
 	// against the code that emits the event.
 	AnnounceUnknown bool `yaml:"announce_unknown,omitempty" json:"announce_unknown,omitempty"`
 	// Personalize informs the LLM who is speaking.
 	Personalize bool `yaml:"personalize,omitempty" json:"personalize,omitempty"`
 	// InjectName sets the per-message name field on each user turn.
 	// NOTE(review): presumably gated by Personalize — confirm with the consumer.
 	InjectName bool `yaml:"inject_name,omitempty" json:"inject_name,omitempty"`
 	// InjectSystemNote maintains a "current speaker" note in the system message.
 	// NOTE(review): presumably gated by Personalize — confirm with the consumer.
 	InjectSystemNote bool `yaml:"inject_system_note,omitempty" json:"inject_system_note,omitempty"`
 	// NoteUnknown adds a "the current speaker is unknown" note (enables the model
 	// to ask who it is talking to).
 	NoteUnknown bool `yaml:"note_unknown,omitempty" json:"note_unknown,omitempty"`
 }
 // VoiceGateEnabled reports whether a voice-recognition gate is configured. The
 // mere presence of the block is the intent signal: a present-but-incomplete
 // block (e.g. missing model) must fail closed at construction, not be silently
@@ -783,6 +854,28 @@ func (p Pipeline) VoiceGateEnabled() bool {
 	return p.VoiceRecognition != nil
 }
 // EnforceGate reports whether unauthorized speakers are rejected by the gate.
 // An unset (nil) Enforce counts as true, so configs written before the field
 // existed keep gating; only an explicit false turns enforcement off.
 func (p PipelineVoiceRecognition) EnforceGate() bool {
 	if p.Enforce == nil {
 		return true
 	}
 	return *p.Enforce
 }
 // IdentityEnabled reports whether the speaker's identity has to be resolved,
 // which is the case when an Identity block is present and asks for either
 // announcing the speaker or personalizing for them.
 func (p PipelineVoiceRecognition) IdentityEnabled() bool {
 	identity := p.Identity
 	if identity == nil {
 		return false
 	}
 	return identity.Announce || identity.Personalize
 }
 // AnnounceEnabled reports whether the conversation.item.speaker event should
 // be emitted: an Identity block must be present with Announce set.
 func (p PipelineVoiceRecognition) AnnounceEnabled() bool {
 	if p.Identity == nil {
 		return false
 	}
 	return p.Identity.Announce
 }
 // PersonalizeEnabled reports whether the LLM should be told who is speaking:
 // an Identity block must be present with Personalize set.
 func (p PipelineVoiceRecognition) PersonalizeEnabled() bool {
 	if p.Identity == nil {
 		return false
 	}
 	return p.Identity.Personalize
 }
 // Normalize fills in defaults in place for omitted fields.
 func (v *PipelineVoiceRecognition) Normalize() {
 	if v.Mode == "" {
@@ -1111,107 +1204,22 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
 	// This ensures gallery-installed and runtime-loaded models get optimal parameters.
 	ApplyInferenceDefaults(cfg, cfg.Name, cfg.Model)
-	// https://github.com/ggerganov/llama.cpp/blob/75cd4c77292034ecec587ecb401366f57338f7c0/common/sampling.h#L22
+	// Apply hardware-driven defaults (e.g. a larger physical batch on Blackwell).
-	defaultTopP := 0.95
+	// Uses the local GPU here; in distributed mode the router re-applies the same
-	defaultTopK := 40
+	// heuristics for the selected node's GPU before loading. Explicit config wins.
-	defaultMinP := 0.0
+	ApplyHardwareDefaults(cfg, localGPU())
-	defaultTemp := 0.9
+
-	// https://github.com/mudler/LocalAI/issues/2780
+	// Apply serving-policy defaults (device-independent): cross-request prefix
-	defaultMirostat := 0
+	// caching. Propagates to distributed nodes via the model options.
-	defaultMirostatTAU := 5.0
+	ApplyServingDefaults(cfg)
-	defaultMirostatETA := 0.1
+
-	defaultTypicalP := 1.0
+	// Generic fallback defaults (sampling params + runtime flags), applied after
-	defaultTFZ := 1.0
+	// the model-family / hardware / serving tiers above. Only fills unset values.
-	defaultZero := 0
+	ApplyGenericDefaults(cfg)
 	trueV := true
 	falseV := false
 	if cfg.Seed == nil {
 		//  random number generator seed
 		defaultSeed := RAND_SEED
 		cfg.Seed = &defaultSeed
 	}
 	// top_k=40 is llama.cpp's sampling default and is wrong for backends whose
 	// native default differs (issue #6632). Only inject it for the llama.cpp
 	// family and the empty/auto backend; leave TopK nil for known non-llama
 	// backends (e.g. mlx, whose intended default is top_k=0) so the wire value
 	// is 0 rather than a silently-changed 40.
 	if cfg.TopK == nil && UsesLlamaSamplerDefaults(cfg.Backend) {
 		cfg.TopK = &defaultTopK
 	}
 	if cfg.MinP == nil {
 		cfg.MinP = &defaultMinP
 	}
 	if cfg.TypicalP == nil {
 		cfg.TypicalP = &defaultTypicalP
 	}
 	if cfg.TFZ == nil {
 		cfg.TFZ = &defaultTFZ
 	}
 	if cfg.MMap == nil {
 		// MMap is enabled by default
 		// Only exception is for Intel GPUs
 		if os.Getenv("XPU") != "" {
 			cfg.MMap = &falseV
 		} else {
 			cfg.MMap = &trueV
 		}
 	}
 	if cfg.MMlock == nil {
 		// MMlock is disabled by default
 		cfg.MMlock = &falseV
 	}
 	if cfg.TopP == nil {
 		cfg.TopP = &defaultTopP
 	}
 	if cfg.Temperature == nil {
 		cfg.Temperature = &defaultTemp
 	}
 	if cfg.Maxtokens == nil {
 		cfg.Maxtokens = &defaultZero
 	}
 	if cfg.Mirostat == nil {
 		cfg.Mirostat = &defaultMirostat
 	}
 	if cfg.MirostatETA == nil {
 		cfg.MirostatETA = &defaultMirostatETA
 	}
 	if cfg.MirostatTAU == nil {
 		cfg.MirostatTAU = &defaultMirostatTAU
 	}
 	if cfg.LowVRAM == nil {
 		cfg.LowVRAM = &falseV
 	}
 	if cfg.Embeddings == nil {
 		cfg.Embeddings = &falseV
 	}
 	if cfg.Reranking == nil {
 		cfg.Reranking = &falseV
 	}
 	if cfg.PromptCacheAll == nil {
 		// Match upstream llama.cpp's default (common/common.h: cache_prompt = true)
 		// and let cache_idle_slots / kv_unified actually do useful work; users can
 		// opt out with an explicit `prompt_cache_all: false` in the model YAML.
 		cfg.PromptCacheAll = &trueV
 	}
 	if threads == 0 {
 		// Threads can't be 0
 		threads = 4
@@ -1243,6 +1251,22 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
 }
 func (c *ModelConfig) Validate() (bool, error) {
 	// An alias is a pure redirect: validate only its own shape here. Target
 	// existence and the no-chain rule need the full config set, so the loader
 	// (load-time) and the create/swap endpoints enforce those.
 	if c.IsAlias() {
 		if c.Name == "" {
 			return false, fmt.Errorf("alias config requires a name")
 		}
 		if c.Alias == c.Name {
 			return false, fmt.Errorf("alias %q cannot point to itself", c.Name)
 		}
 		if c.Backend != "" || c.Model != "" {
 			return false, fmt.Errorf("alias config %q must not set backend or parameters.model: an alias is a pure redirect", c.Name)
 		}
 		return true, nil
 	}
 	downloadedFileNames := []string{}
 	for _, f := range c.DownloadFiles {
 		downloadedFileNames = append(downloadedFileNames, f.Filename)
@@ -1463,6 +1487,11 @@ const (
 	// so it may combine freely with other usecases.
 	FLAG_TOKEN_CLASSIFY ModelConfigUsecase = 0b1000000000000000000000
 	// Marks a model as wired for the SoundDetection gRPC primitive
 	// (audio tagging / sound-event classification — scored AudioSet
 	// labels via the SoundDetection RPC, e.g. ced).
 	FLAG_SOUND_CLASSIFICATION ModelConfigUsecase = 0b10000000000000000000000
 	// Common Subsets
 	FLAG_LLM ModelConfigUsecase = FLAG_CHAT | FLAG_COMPLETION | FLAG_EDIT
 )
@@ -1471,12 +1500,12 @@ const (
 // Flags within the same group are NOT orthogonal (e.g., chat and completion are
 // both text/language). A model is multimodal when its usecases span 2+ groups.
 var ModalityGroups = []ModelConfigUsecase{
-	FLAG_CHAT | FLAG_COMPLETION | FLAG_EDIT,                // text/language
+	FLAG_CHAT | FLAG_COMPLETION | FLAG_EDIT,                           // text/language
-	FLAG_VISION | FLAG_DETECTION,                           // visual understanding
+	FLAG_VISION | FLAG_DETECTION,                                      // visual understanding
-	FLAG_TRANSCRIPT | FLAG_REALTIME_AUDIO,                  // speech input — realtime_audio is any-to-any, so it counts here too
+	FLAG_TRANSCRIPT | FLAG_REALTIME_AUDIO | FLAG_SOUND_CLASSIFICATION, // audio input — realtime_audio is any-to-any, so it counts here too
-	FLAG_TTS | FLAG_SOUND_GENERATION | FLAG_REALTIME_AUDIO, // audio output — and here, so a lone realtime_audio flag still reads as multimodal
+	FLAG_TTS | FLAG_SOUND_GENERATION | FLAG_REALTIME_AUDIO,            // audio output — and here, so a lone realtime_audio flag still reads as multimodal
-	FLAG_AUDIO_TRANSFORM,                                   // audio in/out transforms
+	FLAG_AUDIO_TRANSFORM,                                              // audio in/out transforms
-	FLAG_IMAGE | FLAG_VIDEO,                                // visual generation
+	FLAG_IMAGE | FLAG_VIDEO,                                           // visual generation
 }
 // IsMultimodal returns true if the given usecases span two or more orthogonal
@@ -1499,29 +1528,30 @@ func GetAllModelConfigUsecases() map[string]ModelConfigUsecase {
 	return map[string]ModelConfigUsecase{
 		// Note: FLAG_ANY is intentionally excluded from this map
 		// because it's 0 and would always match in HasUsecases checks
-		"FLAG_CHAT":                FLAG_CHAT,
+		"FLAG_CHAT":                 FLAG_CHAT,
-		"FLAG_COMPLETION":          FLAG_COMPLETION,
+		"FLAG_COMPLETION":           FLAG_COMPLETION,
-		"FLAG_EDIT":                FLAG_EDIT,
+		"FLAG_EDIT":                 FLAG_EDIT,
-		"FLAG_EMBEDDINGS":          FLAG_EMBEDDINGS,
+		"FLAG_EMBEDDINGS":           FLAG_EMBEDDINGS,
-		"FLAG_RERANK":              FLAG_RERANK,
+		"FLAG_RERANK":               FLAG_RERANK,
-		"FLAG_IMAGE":               FLAG_IMAGE,
+		"FLAG_IMAGE":                FLAG_IMAGE,
-		"FLAG_TRANSCRIPT":          FLAG_TRANSCRIPT,
+		"FLAG_TRANSCRIPT":           FLAG_TRANSCRIPT,
-		"FLAG_TTS":                 FLAG_TTS,
+		"FLAG_TTS":                  FLAG_TTS,
-		"FLAG_SOUND_GENERATION":    FLAG_SOUND_GENERATION,
+		"FLAG_SOUND_GENERATION":     FLAG_SOUND_GENERATION,
-		"FLAG_TOKENIZE":            FLAG_TOKENIZE,
+		"FLAG_TOKENIZE":             FLAG_TOKENIZE,
-		"FLAG_VAD":                 FLAG_VAD,
+		"FLAG_VAD":                  FLAG_VAD,
-		"FLAG_LLM":                 FLAG_LLM,
+		"FLAG_LLM":                  FLAG_LLM,
-		"FLAG_VIDEO":               FLAG_VIDEO,
+		"FLAG_VIDEO":                FLAG_VIDEO,
-		"FLAG_DETECTION":           FLAG_DETECTION,
+		"FLAG_DETECTION":            FLAG_DETECTION,
-		"FLAG_VISION":              FLAG_VISION,
+		"FLAG_VISION":               FLAG_VISION,
-		"FLAG_FACE_RECOGNITION":    FLAG_FACE_RECOGNITION,
+		"FLAG_FACE_RECOGNITION":     FLAG_FACE_RECOGNITION,
-		"FLAG_SPEAKER_RECOGNITION": FLAG_SPEAKER_RECOGNITION,
+		"FLAG_SPEAKER_RECOGNITION":  FLAG_SPEAKER_RECOGNITION,
-		"FLAG_AUDIO_TRANSFORM":     FLAG_AUDIO_TRANSFORM,
+		"FLAG_AUDIO_TRANSFORM":      FLAG_AUDIO_TRANSFORM,
-		"FLAG_DIARIZATION":         FLAG_DIARIZATION,
+		"FLAG_DIARIZATION":          FLAG_DIARIZATION,
-		"FLAG_REALTIME_AUDIO":      FLAG_REALTIME_AUDIO,
+		"FLAG_SOUND_CLASSIFICATION": FLAG_SOUND_CLASSIFICATION,
-		"FLAG_SCORE":               FLAG_SCORE,
+		"FLAG_REALTIME_AUDIO":       FLAG_REALTIME_AUDIO,
-		"FLAG_DEPTH":               FLAG_DEPTH,
+		"FLAG_SCORE":                FLAG_SCORE,
-		"FLAG_TOKEN_CLASSIFY":      FLAG_TOKEN_CLASSIFY,
+		"FLAG_DEPTH":                FLAG_DEPTH,
 		"FLAG_TOKEN_CLASSIFY":       FLAG_TOKEN_CLASSIFY,
 	}
 }
@@ -1724,6 +1754,16 @@ func (c *ModelConfig) GuessUsecases(u ModelConfigUsecase) bool {
 		}
 	}
 	if (u & FLAG_SOUND_CLASSIFICATION) == FLAG_SOUND_CLASSIFICATION {
 		// ced is a sound-event tagger (AudioSet labels) surfaced via the
 		// SoundDetection gRPC. Models without an explicit known_usecases
 		// still surface when they run on one of these backends.
 		soundClassificationBackends := []string{"ced"}
 		if !slices.Contains(soundClassificationBackends, c.Backend) {
 			return false
 		}
 	}
 	if (u & FLAG_REALTIME_AUDIO) == FLAG_REALTIME_AUDIO {
 		// Backends that own a single any-to-any loop and implement
 		// AudioToAudioStream — listed here so models without an explicit
--- a/core/config/model_config_loader.go
+++ b/core/config/model_config_loader.go
@@ -294,6 +294,44 @@ func (bcl *ModelConfigLoader) UpdateModelConfig(m string, updater func(*ModelCon
 	}
 }
 // ResolveAlias dereferences an alias config by exactly one hop.
 //
 // It returns (resolved, wasAlias, err):
 //   - a nil cfg or a cfg that is not an alias is handed back as-is with
 //     wasAlias == false and a nil error;
 //   - otherwise cfg.Alias is looked up through GetModelConfig. A missing
 //     target, or a target that is itself an alias, yields a nil config,
 //     wasAlias == true and a descriptive error (chains are rejected);
 //   - on success the result points at the value returned by GetModelConfig.
 func (bcl *ModelConfigLoader) ResolveAlias(cfg *ModelConfig) (*ModelConfig, bool, error) {
 	if cfg == nil || !cfg.IsAlias() {
 		return cfg, false, nil
 	}
 	resolved, found := bcl.GetModelConfig(cfg.Alias)
 	switch {
 	case !found:
 		return nil, true, fmt.Errorf("alias %q points to unknown model %q", cfg.Name, cfg.Alias)
 	case resolved.IsAlias():
 		return nil, true, fmt.Errorf("alias %q points to another alias %q (chains are not allowed)", cfg.Name, cfg.Alias)
 	}
 	return &resolved, true, nil
 }
 // ValidateAliasTarget vets the target of an alias config, intended for use
 // at create/swap time. A nil or non-alias cfg is always accepted (nil error).
 // For an alias, an error is returned when the target named by cfg.Alias
 // cannot be found via GetModelConfig, is itself an alias, or is disabled.
 func (bcl *ModelConfigLoader) ValidateAliasTarget(cfg *ModelConfig) error {
 	if cfg == nil || !cfg.IsAlias() {
 		return nil
 	}
 	resolved, found := bcl.GetModelConfig(cfg.Alias)
 	switch {
 	case !found:
 		return fmt.Errorf("alias target %q does not exist", cfg.Alias)
 	case resolved.IsAlias():
 		return fmt.Errorf("alias target %q is itself an alias (chains are not allowed)", cfg.Alias)
 	case resolved.IsDisabled():
 		return fmt.Errorf("alias target %q is disabled", cfg.Alias)
 	}
 	return nil
 }
 // Preload prepares models that are not local files but are referenced by URL or Hugging Face repository.
 func (bcl *ModelConfigLoader) Preload(modelPath string) error {
 	bcl.Lock()
@@ -475,5 +513,21 @@ func (bcl *ModelConfigLoader) LoadModelConfigsFromPath(path string, opts ...Conf
 		}
 	}
 	// Surface aliases whose targets are missing or themselves aliases. These
 	// resolve to a clear request-time error; warning here gives operators
 	// visibility without failing startup.
 	for name, c := range bcl.configs {
 		if !c.IsAlias() {
 			continue
 		}
 		target, ok := bcl.configs[c.Alias]
 		switch {
 		case !ok:
 			xlog.Warn("alias points to unknown model", "alias", name, "target", c.Alias)
 		case target.IsAlias():
 			xlog.Warn("alias points to another alias (chains are not allowed)", "alias", name, "target", c.Alias)
 		}
 	}
 	return nil
 }
--- a/core/config/model_config_loader_test.go
+++ b/core/config/model_config_loader_test.go
@@ -61,3 +61,51 @@ var _ = Describe("ModelConfigLoader.GetModelsConflictingWith", func() {
 		Expect(bcl.GetModelsConflictingWith("a")).To(ConsistOf("b"))
 	})
 })
 // Specs for ModelConfigLoader.ResolveAlias and ValidateAliasTarget, run
 // against a small in-memory set of configs rebuilt before every spec.
 var _ = Describe("ModelConfigLoader alias resolution", func() {
 	var loader *ModelConfigLoader
 	BeforeEach(func() {
 		loader = NewModelConfigLoader("")
 		// "real" is a concrete model; "gpt-4" aliases it; "chain" aliases an
 		// alias; "dangling" aliases a name that is never registered.
 		loader.configs["real"] = ModelConfig{Name: "real", Backend: "llama-cpp"}
 		loader.configs["gpt-4"] = ModelConfig{Name: "gpt-4", Alias: "real"}
 		loader.configs["chain"] = ModelConfig{Name: "chain", Alias: "gpt-4"}
 		loader.configs["dangling"] = ModelConfig{Name: "dangling", Alias: "nope"}
 	})
 	// A config without an alias is passed through and reported as not-an-alias.
 	It("returns non-alias configs unchanged", func() {
 		cfg := loader.configs["real"]
 		got, was, err := loader.ResolveAlias(&cfg)
 		Expect(err).ToNot(HaveOccurred())
 		Expect(was).To(BeFalse())
 		Expect(got.Name).To(Equal("real"))
 	})
 	// One hop: "gpt-4" -> "real".
 	It("resolves an alias to its target", func() {
 		cfg := loader.configs["gpt-4"]
 		got, was, err := loader.ResolveAlias(&cfg)
 		Expect(err).ToNot(HaveOccurred())
 		Expect(was).To(BeTrue())
 		Expect(got.Name).To(Equal("real"))
 	})
 	// "chain" -> "gpt-4" -> "real" needs two hops and must be refused.
 	It("rejects an alias chain", func() {
 		cfg := loader.configs["chain"]
 		_, was, err := loader.ResolveAlias(&cfg)
 		Expect(was).To(BeTrue())
 		Expect(err).To(MatchError(ContainSubstring("chains are not allowed")))
 	})
 	// The target "nope" is not in loader.configs.
 	It("rejects a dangling alias", func() {
 		cfg := loader.configs["dangling"]
 		_, _, err := loader.ResolveAlias(&cfg)
 		Expect(err).To(MatchError(ContainSubstring("unknown model")))
 	})
 	It("ValidateAliasTarget passes for a real target and fails for a chain", func() {
 		good := loader.configs["gpt-4"]
 		Expect(loader.ValidateAliasTarget(&good)).ToNot(HaveOccurred())
 		bad := loader.configs["chain"]
 		Expect(loader.ValidateAliasTarget(&bad)).To(MatchError(ContainSubstring("itself an alias")))
 	})
 })
--- a/core/config/model_config_test.go
+++ b/core/config/model_config_test.go
@@ -787,3 +787,32 @@ var _ = Describe("pattern detector config", func() {
 		Expect(err).To(MatchError(ContainSubstring("pattern \"EMAILish\"")))
 	})
 })
 // Specs for the alias-related behavior of ModelConfig itself: IsAlias and
 // the alias checks performed by Validate.
 var _ = Describe("ModelConfig alias", func() {
 	It("reports IsAlias when alias is set", func() {
 		c := ModelConfig{Name: "gpt-4", Alias: "my-llama-3"}
 		Expect(c.IsAlias()).To(BeTrue())
 		Expect(ModelConfig{Name: "real"}.IsAlias()).To(BeFalse())
 	})
 	// Name + Alias alone is expected to be a valid config.
 	It("validates a minimal alias config", func() {
 		c := ModelConfig{Name: "gpt-4", Alias: "my-llama-3"}
 		ok, err := c.Validate()
 		Expect(err).ToNot(HaveOccurred())
 		Expect(ok).To(BeTrue())
 	})
 	// Alias equal to the config's own Name must fail validation.
 	It("rejects an alias pointing to itself", func() {
 		c := ModelConfig{Name: "loop", Alias: "loop"}
 		ok, err := c.Validate()
 		Expect(ok).To(BeFalse())
 		Expect(err).To(MatchError(ContainSubstring("itself")))
 	})
 	// Setting Backend alongside Alias must fail validation; the error is
 	// expected to mention "pure redirect".
 	It("rejects an alias that also sets a backend", func() {
 		c := ModelConfig{Name: "gpt-4", Alias: "my-llama-3", Backend: "llama-cpp"}
 		ok, err := c.Validate()
 		Expect(ok).To(BeFalse())
 		Expect(err).To(MatchError(ContainSubstring("pure redirect")))
 	})
 })
--- a/core/config/runtime_settings.go
+++ b/core/config/runtime_settings.go
@@ -28,6 +28,7 @@ type RuntimeSettings struct {
 	// Eviction settings
 	ForceEvictionWhenBusy    *bool   `json:"force_eviction_when_busy,omitempty"`    // Force eviction even when models have active API calls (default: false for safety)
 	SizeAwareEviction        *bool   `json:"size_aware_eviction,omitempty"`          // Evict largest models first rather than least-recently-used (default: false)
 	LRUEvictionMaxRetries    *int    `json:"lru_eviction_max_retries,omitempty"`    // Maximum number of retries when waiting for busy models to become idle (default: 30)
 	LRUEvictionRetryInterval *string `json:"lru_eviction_retry_interval,omitempty"` // Interval between retries when waiting for busy models (e.g., 1s, 2s) (default: 1s)
--- a/core/config/runtime_settings_persist.go
+++ b/core/config/runtime_settings_persist.go
@@ -5,6 +5,7 @@ import (
 	"errors"
 	"os"
 	"path/filepath"
 	"reflect"
 )
 // runtimeSettingsFile is the on-disk filename inside DynamicConfigsDir.
@@ -33,6 +34,35 @@ func (o *ApplicationConfig) ReadPersistedSettings() (RuntimeSettings, error) {
 	return settings, nil
 }
 // MergeNonNil overlays every set (non-nil) pointer field of overlay onto the
 // receiver, leaving the receiver's value untouched wherever overlay left a
 // field unset. Every RuntimeSettings field is a pointer precisely so "set"
 // can be told apart from "absent" (see the type doc), which makes this a
 // faithful partial update: a caller that submits only the field it owns
 // changes exactly that field and never clobbers unrelated settings.
 //
 // This is the read-modify-write contract the persistence helpers exist for.
 // UpdateSettingsEndpoint reads the on-disk settings, merges the request body
 // on top, and writes the result — so a focused admin page that POSTs only its
 // own field (the Middleware page sends only mitm_listen; the detector table
 // only pii_default_detectors) no longer nulls every other setting.
 //
 // Reflection keeps the merge total over the struct: an exported pointer field
 // added to RuntimeSettings later is merged automatically, so the persistence
 // path cannot silently drop a new setting the way a hand-maintained field
 // list would. Fields that cannot take part in the merge are skipped and the
 // receiver wins:
 //   - non-pointer fields (none today) cannot express "absent";
 //   - unexported fields cannot be assigned through reflection.
 //
 // The copied pointers are shared: after the call the receiver and overlay
 // point at the same underlying values for every merged field. A nil receiver
 // is a no-op.
 func (s *RuntimeSettings) MergeNonNil(overlay RuntimeSettings) {
 	if s == nil {
 		return
 	}
 	dst := reflect.ValueOf(s).Elem()
 	src := reflect.ValueOf(overlay)
 	for i := 0; i < src.NumField(); i++ {
 		from := src.Field(i)
 		if from.Kind() != reflect.Pointer || from.IsNil() {
 			continue
 		}
 		// Value.Set panics on unexported fields; CanSet is false for them,
 		// so guard instead of crashing the settings update path.
 		if to := dst.Field(i); to.CanSet() {
 			to.Set(from)
 		}
 	}
 }
 // WritePersistedSettings serialises the given RuntimeSettings to
 // runtime_settings.json with restricted permissions (it may carry API
 // keys and P2P tokens).
--- a/core/config/runtime_settings_persist_test.go
+++ b/core/config/runtime_settings_persist_test.go
@@ -12,6 +12,7 @@ import (
 )
 // strPtr returns a pointer to a copy of s, for filling pointer-typed fields.
 func strPtr(s string) *string { return &s }
 // boolPtr returns a pointer to a copy of b, for filling pointer-typed fields.
 func boolPtr(b bool) *bool     { return &b }
 var _ = Describe("RuntimeSettings persistence helpers", func() {
 	var (
@@ -51,6 +52,47 @@ var _ = Describe("RuntimeSettings persistence helpers", func() {
 		})
 	})
 	// MergeNonNil is the partial-update primitive UpdateSettingsEndpoint
 	// relies on: a focused admin page POSTs only the field it owns, and the
 	// handler reads the on-disk settings and overlays the request on top.
 	// Without it, the body would be written verbatim and every field the
 	// caller omitted would be nulled (the reported regression: changing
 	// mitm_listen wiped the galleries, api keys, watchdog config, etc.).
 	Describe("MergeNonNil partial update", func() {
 		It("overlays set fields and preserves unset ones", func() {
 			// base stands in for the previously persisted settings.
 			base := config.RuntimeSettings{
 				MITMListen:          strPtr(":9000"),
 				Galleries:           &[]config.Gallery{{Name: "g1", URL: "http://example/g1"}},
 				WatchdogIdleEnabled: boolPtr(true),
 				ApiKeys:             &[]string{"persisted-key"},
 				PIIDefaultDetectors: &[]string{"det-a"},
 			}
 			// Simulate the Middleware proxy tab: only mitm_listen is sent.
 			overlay := config.RuntimeSettings{MITMListen: strPtr(":8443")}
 			base.MergeNonNil(overlay)
 			Expect(base.MITMListen).ToNot(BeNil())
 			Expect(*base.MITMListen).To(Equal(":8443"), "set field should be overlaid")
 			// Everything the overlay left unset must survive untouched.
 			Expect(base.Galleries).ToNot(BeNil(), "galleries were clobbered")
 			Expect(*base.Galleries).To(HaveLen(1))
 			Expect(base.WatchdogIdleEnabled).ToNot(BeNil())
 			Expect(*base.WatchdogIdleEnabled).To(BeTrue())
 			Expect(base.ApiKeys).ToNot(BeNil(), "api_keys were clobbered")
 			Expect(*base.ApiKeys).To(Equal([]string{"persisted-key"}))
 			Expect(base.PIIDefaultDetectors).ToNot(BeNil(), "pii_default_detectors were clobbered")
 			Expect(*base.PIIDefaultDetectors).To(Equal([]string{"det-a"}))
 		})
 		// A non-nil pointer to an empty slice counts as "set", so it replaces
 		// the existing value; only a nil pointer means "leave as is".
 		It("lets an explicit empty slice clear a field", func() {
 			base := config.RuntimeSettings{PIIDefaultDetectors: &[]string{"det-a"}}
 			base.MergeNonNil(config.RuntimeSettings{PIIDefaultDetectors: &[]string{}})
 			Expect(base.PIIDefaultDetectors).ToNot(BeNil())
 			Expect(*base.PIIDefaultDetectors).To(BeEmpty(), "an explicit empty slice should clear, not preserve")
 		})
 	})
 	// MITM round trip pins the contract that loadRuntimeSettingsFromFile
 	// MITM listener address must survive a write/read round trip so the
 	// next process restart can bring the listener back up. (Intercept
--- a/core/config/serving_defaults.go
+++ b/core/config/serving_defaults.go
@@ -0,0 +1,56 @@
 package config
 import (
 	"fmt"
 	"strings"
 	"github.com/mudler/xlog"
 )
 // Serving-policy model-config defaults.
 //
 // Sibling to hardware_defaults.go: those fill values driven by the target
 // *device* (Blackwell batch, VRAM-scaled parallel slots); these fill values
 // that improve multi-request / multi-user *serving* regardless of the GPU. They
 // run together from SetDefaults and only ever fill values the user left unset.
 // DefaultCacheReuse is the minimum shared-prefix chunk (in tokens) the backend
 // reuses across requests via KV-cache shifting. The llama.cpp backend ships this
 // disabled (n_cache_reuse = 0); we enable it so repeated prefixes (system
 // prompts, RAG context, agent scaffolds, multi-turn chat) are not recomputed.
 // This is the universally-useful part of "paged attention" (cross-request prefix
 // sharing) and needs none of the block-KV machinery.
 //
 // ApplyServingDefaults emits it as the "cache_reuse:<n>" backend option when
 // the model options do not already set cache_reuse or n_cache_reuse.
 const DefaultCacheReuse = 256
 // ApplyServingDefaults populates serving-policy options on cfg that the user
 // did not set. Today that is a single option: cross-request prefix caching,
 // added as "cache_reuse:<DefaultCacheReuse>". It does nothing when cfg is nil
 // or when cfg.Options already names cache_reuse or n_cache_reuse, so an
 // explicit user value is never overridden.
 func ApplyServingDefaults(cfg *ModelConfig) {
 	if cfg == nil || backendOptionSet(cfg.Options, "cache_reuse", "n_cache_reuse") {
 		return
 	}
 	reuseOption := fmt.Sprintf("cache_reuse:%d", DefaultCacheReuse)
 	cfg.Options = append(cfg.Options, reuseOption)
 	xlog.Debug("[serving_defaults] enabling cross-request prefix cache",
 		"cache_reuse", DefaultCacheReuse)
 }
 // backendOptionSet reports whether opts already contains an entry for any of
 // names. Each option is either "name:value" or a bare "name"; the part before
 // the first ':' is lower-cased and trimmed before being compared, while names
 // are compared exactly as given (callers pass them lower-case). It exists so
 // defaults never override an explicit value. Shared with hardware_defaults.go.
 func backendOptionSet(opts []string, names ...string) bool {
 	for _, opt := range opts {
 		// Cut yields the whole string as the key when there is no ':'.
 		key, _, _ := strings.Cut(opt, ":")
 		key = strings.TrimSpace(strings.ToLower(key))
 		for _, candidate := range names {
 			if candidate == key {
 				return true
 			}
 		}
 	}
 	return false
 }
--- a/Show More
+++ b/Show More