fix(llama-cpp-darwin): distribute ggml backends by suffix (.so root, .dylib lib)

ggml emits its loadable backends (per-microarch CPU variants, metal, blas) with a .so suffix even on darwin, while the core libraries (ggml-base/ggml/llama/ llama-common/mtmd) use .dylib. Split the distribution by suffix: .so DL backends go in the package root for ggml's executable-directory scan, .dylib core libs go in lib/ for DYLD_LIBRARY_PATH. The previous .dylib name-pattern matched none of the variants. Verified on an M4: ggml loads the apple_m4 CPU variant (SME=1) and Metal, model loads and generates correct tokens. Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:claude-opus-4-8 [Claude Code]
feat(llama-cpp,turboquant): arm64 gcc-14 for SME variants + darwin cpu-all packaging
2026-06-25 00:59:28 -04:00 · 2026-06-24 21:59:29 +00:00 · 2026-06-24 21:50:29 +00:00 · 2026-06-24 21:33:32 +00:00 · 2026-06-24 21:21:03 +00:00 · 2026-06-24 21:50:44 +02:00
293 changed files with 12906 additions and 2427 deletions
--- a/.agents/adding-backends.md
+++ b/.agents/adding-backends.md
@@ -198,6 +198,27 @@ docker-build-backends: ... docker-build-<backend-name>
 - If the backend is in `backend/python/<backend-name>/` but uses `.` as context in the workflow file, use `.` context
 - Check similar backends to determine the correct context

+## Documenting the backend (README + docs)
+
+A backend is not "added" until it is discoverable. Update the user-facing docs:
+
+- **`docs/content/features/backends.md`** - add the backend to the right
+  category in the "LocalAI supports various types of backends" list (and add a
+  new category if it introduces a new modality, e.g. sound classification).
+- If the backend introduces a **new API surface** (a new endpoint or a realtime
+  capability), document it under `docs/content/` where its area lives (audio,
+  vision, etc.) and follow the api-endpoints checklist in
+  [api-endpoints-and-auth.md](api-endpoints-and-auth.md).
+
+**If the backend is a native C/C++/GGML engine created and maintained by the
+LocalAI team** (a from-scratch port like `parakeet.cpp`, `ced.cpp`,
+`vibevoice.cpp`, `rf-detr.cpp`, not a wrapper around a third-party runtime), it
+ALSO belongs in the top-level **`README.md`** table under "native C/C++/GGML
+engines ... developed and maintained by the LocalAI project itself". Add a row
+linking the upstream engine repo with a one-line description. This is the
+project's showcase of its own engines; a new in-house backend that is missing
+from it is a documentation bug.
+
 ## 5. Verification Checklist

 After adding a new backend, verify:
@@ -211,6 +232,8 @@ After adding a new backend, verify:
 - [ ] No YAML syntax errors (check with linter)
 - [ ] No Makefile syntax errors (check with linter)
 - [ ] Follows the same pattern as similar backends (e.g., if it's a transcription backend, follow `faster-whisper` pattern)
+- [ ] Documented: added to the category list in `docs/content/features/backends.md` (and any new endpoint/realtime capability documented under `docs/content/`)
+- [ ] If it is an in-house native C/C++/GGML engine, added to the maintained-engines table in the top-level `README.md`

 ## Bundling runtime shared libraries (`package.sh`)

--- a/.docker/install-base-deps.sh
+++ b/.docker/install-base-deps.sh
@@ -70,6 +70,12 @@ if [ "${BUILD_TYPE:-}" = "vulkan" ] && [ "${SKIP_DRIVERS:-false}" = "false" ]; t
        git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \
        ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \
        clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils
+    # Mesa Vulkan ICD drivers (ANV/RADV/lavapipe + Arm SoC) and their ICD
+    # manifests. The LunarG SDK below only provides the loader and shader
+    # tooling, not hardware drivers — without Mesa the packaged Vulkan backend
+    # would ship a loader that finds no GPU. package-gpu-libs.sh bundles these
+    # .so files plus their deps into the backend so it stays self-contained.
+    apt-get install -y mesa-vulkan-drivers libdrm2
    if [ "amd64" = "${TARGETARCH:-}" ]; then
        wget "https://sdk.lunarg.com/sdk/download/1.4.335.0/linux/vulkansdk-linux-x86_64-1.4.335.0.tar.xz"
        tar -xf vulkansdk-linux-x86_64-1.4.335.0.tar.xz
--- a/.docker/llama-cpp-compile.sh
+++ b/.docker/llama-cpp-compile.sh
@@ -17,19 +17,25 @@ if [[ -n "${CUDA_DOCKER_ARCH:-}" ]]; then
  rm -rf /LocalAI/backend/cpp/llama-cpp-*-build
 fi

-if [ "${TARGETARCH}" = "arm64" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then
-  cd /LocalAI/backend/cpp/llama-cpp
+cd /LocalAI/backend/cpp/llama-cpp
+if [ "${BUILD_TYPE}" = "hipblas" ]; then
+  # ROCm: the GPU does the compute, so a single fallback CPU build is enough.
  make llama-cpp-fallback
-  make llama-cpp-grpc
-  make llama-cpp-rpc-server
 else
-  cd /LocalAI/backend/cpp/llama-cpp
-  make llama-cpp-avx
-  make llama-cpp-avx2
-  make llama-cpp-avx512
-  make llama-cpp-fallback
-  make llama-cpp-grpc
-  make llama-cpp-rpc-server
+  # arm64: ggml's CPU_ALL_VARIANTS table includes armv9.2 SME variants whose
+  # -march=...+sme is rejected by the Ubuntu 24.04 default gcc-13. gcc-14 accepts it, so
+  # build the arm64 variants with gcc-14 (the host never *selects* SME unless it has it,
+  # but every variant must still compile).
+  if [ "${TARGETARCH}" = "arm64" ]; then
+    apt-get update -qq && apt-get install -y -qq gcc-14 g++-14
+    export CC=gcc-14 CXX=g++-14
+  fi
+  # x86 and arm64: one build with ggml CPU_ALL_VARIANTS replaces the per-microarch
+  # binaries (x86: avx/avx2/avx512/fallback; arm64: armv8.x/armv9.x). ggml dlopens the
+  # best libggml-cpu-*.so at runtime by probing host CPU features.
+  make llama-cpp-cpu-all
 fi
+make llama-cpp-grpc
+make llama-cpp-rpc-server

 ccache -s || true
--- a/.docker/turboquant-compile.sh
+++ b/.docker/turboquant-compile.sh
@@ -19,17 +19,19 @@ fi

 cd /LocalAI/backend/cpp/turboquant

-if [ "${TARGETARCH}" = "arm64" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then
+if [ "${BUILD_TYPE}" = "hipblas" ]; then
+  # ROCm: single fallback CPU build (GPU does the compute).
  make turboquant-fallback
-  make turboquant-grpc
-  make turboquant-rpc-server
 else
-  make turboquant-avx
-  make turboquant-avx2
-  make turboquant-avx512
-  make turboquant-fallback
-  make turboquant-grpc
-  make turboquant-rpc-server
+  # arm64: the CPU_ALL_VARIANTS armv9.2 SME variants need gcc-14 (gcc-13 rejects +sme).
+  if [ "${TARGETARCH}" = "arm64" ]; then
+    apt-get update -qq && apt-get install -y -qq gcc-14 g++-14
+    export CC=gcc-14 CXX=g++-14
+  fi
+  # x86 and arm64: one ggml CPU_ALL_VARIANTS build replaces the per-microarch binaries.
+  make turboquant-cpu-all
 fi
+make turboquant-grpc
+make turboquant-rpc-server

 ccache -s || true
--- a/.github/backend-matrix.yml
+++ b/.github/backend-matrix.yml
@@ -3575,6 +3575,154 @@ include:
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
+  # ced
+  - build-type: 'cublas'
+    cuda-major-version: "12"
+    cuda-minor-version: "8"
+    platforms: 'linux/amd64'
+    tag-latest: 'auto'
+    tag-suffix: '-gpu-nvidia-cuda-12-ced'
+    runs-on: 'ubuntu-latest'
+    base-image: "ubuntu:24.04"
+    skip-drivers: 'false'
+    backend: "ced"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
+  - build-type: 'cublas'
+    cuda-major-version: "13"
+    cuda-minor-version: "0"
+    platforms: 'linux/amd64'
+    tag-latest: 'auto'
+    tag-suffix: '-gpu-nvidia-cuda-13-ced'
+    runs-on: 'ubuntu-latest'
+    base-image: "ubuntu:24.04"
+    skip-drivers: 'false'
+    backend: "ced"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
+  - build-type: 'cublas'
+    cuda-major-version: "13"
+    cuda-minor-version: "0"
+    platforms: 'linux/arm64'
+    skip-drivers: 'false'
+    tag-latest: 'auto'
+    tag-suffix: '-nvidia-l4t-cuda-13-arm64-ced'
+    base-image: "ubuntu:24.04"
+    ubuntu-version: '2404'
+    runs-on: 'ubuntu-24.04-arm'
+    backend: "ced"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+  - build-type: ''
+    cuda-major-version: ""
+    cuda-minor-version: ""
+    platforms: 'linux/amd64'
+    platform-tag: 'amd64'
+    tag-latest: 'auto'
+    tag-suffix: '-cpu-ced'
+    runs-on: 'ubuntu-latest'
+    base-image: "ubuntu:24.04"
+    skip-drivers: 'false'
+    backend: "ced"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
+  - build-type: ''
+    cuda-major-version: ""
+    cuda-minor-version: ""
+    platforms: 'linux/arm64'
+    platform-tag: 'arm64'
+    tag-latest: 'auto'
+    tag-suffix: '-cpu-ced'
+    runs-on: 'ubuntu-24.04-arm'
+    base-image: "ubuntu:24.04"
+    skip-drivers: 'false'
+    backend: "ced"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
+  - build-type: 'sycl_f32'
+    cuda-major-version: ""
+    cuda-minor-version: ""
+    platforms: 'linux/amd64'
+    tag-latest: 'auto'
+    tag-suffix: '-gpu-intel-sycl-f32-ced'
+    runs-on: 'ubuntu-latest'
+    base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
+    skip-drivers: 'false'
+    backend: "ced"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
+  - build-type: 'sycl_f16'
+    cuda-major-version: ""
+    cuda-minor-version: ""
+    platforms: 'linux/amd64'
+    tag-latest: 'auto'
+    tag-suffix: '-gpu-intel-sycl-f16-ced'
+    runs-on: 'ubuntu-latest'
+    base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
+    skip-drivers: 'false'
+    backend: "ced"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
+  - build-type: 'vulkan'
+    cuda-major-version: ""
+    cuda-minor-version: ""
+    platforms: 'linux/amd64'
+    platform-tag: 'amd64'
+    tag-latest: 'auto'
+    tag-suffix: '-gpu-vulkan-ced'
+    runs-on: 'ubuntu-latest'
+    base-image: "ubuntu:24.04"
+    skip-drivers: 'false'
+    backend: "ced"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
+  - build-type: 'vulkan'
+    cuda-major-version: ""
+    cuda-minor-version: ""
+    platforms: 'linux/arm64'
+    platform-tag: 'arm64'
+    tag-latest: 'auto'
+    tag-suffix: '-gpu-vulkan-ced'
+    runs-on: 'ubuntu-24.04-arm'
+    base-image: "ubuntu:24.04"
+    skip-drivers: 'false'
+    backend: "ced"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
+  - build-type: 'cublas'
+    cuda-major-version: "12"
+    cuda-minor-version: "0"
+    platforms: 'linux/arm64'
+    skip-drivers: 'false'
+    tag-latest: 'auto'
+    tag-suffix: '-nvidia-l4t-arm64-ced'
+    base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
+    runs-on: 'ubuntu-24.04-arm'
+    backend: "ced"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2204'
+  - build-type: 'hipblas'
+    cuda-major-version: ""
+    cuda-minor-version: ""
+    platforms: 'linux/amd64'
+    tag-latest: 'auto'
+    tag-suffix: '-gpu-rocm-hipblas-ced'
+    base-image: "rocm/dev-ubuntu-24.04:7.2.1"
+    runs-on: 'ubuntu-latest'
+    skip-drivers: 'false'
+    backend: "ced"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
  # acestep-cpp
  - build-type: ''
    cuda-major-version: ""
@@ -4754,6 +4902,10 @@ includeDarwin:
    tag-suffix: "-metal-darwin-arm64-parakeet-cpp"
    build-type: "metal"
    lang: "go"
+  - backend: "ced"
+    tag-suffix: "-metal-darwin-arm64-ced"
+    build-type: "metal"
+    lang: "go"
  - backend: "acestep-cpp"
    tag-suffix: "-metal-darwin-arm64-acestep-cpp"
    build-type: "metal"
--- a/.github/workflows/backend.yml
+++ b/.github/workflows/backend.yml
@@ -44,7 +44,7 @@ jobs:
      has-merges-singlearch: ${{ steps.set-matrix.outputs['has-merges-singlearch'] }}
    steps:
      - name: Checkout repository
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7

      - name: Setup Bun
        uses: oven-sh/setup-bun@v2
--- a/.github/workflows/backend_build.yml
+++ b/.github/workflows/backend_build.yml
@@ -101,7 +101,7 @@ jobs:
    steps:

      - name: Checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true

--- a/.github/workflows/backend_build_darwin.yml
+++ b/.github/workflows/backend_build_darwin.yml
@@ -57,7 +57,7 @@ jobs:
      HOMEBREW_NO_ANALYTICS: '1'
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true

--- a/.github/workflows/backend_merge.yml
+++ b/.github/workflows/backend_merge.yml
@@ -49,7 +49,7 @@ jobs:
      # Sparse checkout: the merge job needs `.github/scripts/` (for the
      # keepalive cleanup script) but none of the source tree.
      - name: Checkout (.github/scripts only)
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          sparse-checkout: |
            .github/scripts
--- a/.github/workflows/backend_pr.yml
+++ b/.github/workflows/backend_pr.yml
@@ -23,7 +23,7 @@ jobs:
      has-merges-singlearch: ${{ steps.set-matrix.outputs['has-merges-singlearch'] }}
    steps:
      - name: Checkout repository
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7

      - name: Setup Bun
        uses: oven-sh/setup-bun@v2
--- a/.github/workflows/base-images.yml
+++ b/.github/workflows/base-images.yml
@@ -127,7 +127,7 @@ jobs:
            # the original l4t matrix entry which set skip-drivers: 'true'.
            skip-drivers: 'true'
    steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v7
        with:
          submodules: false
      - name: Free disk space
--- a/.github/workflows/build-test.yaml
+++ b/.github/workflows/build-test.yaml
@@ -11,7 +11,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          fetch-depth: 0
      - name: Set up Go
@@ -25,7 +25,7 @@ jobs:
    runs-on: macos-latest
    steps:
      - name: Checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          fetch-depth: 0
      - name: Set up Go
@@ -47,7 +47,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          fetch-depth: 0
      - name: Configure apt mirror on runner
--- a/.github/workflows/bump-inference-defaults.yml
+++ b/.github/workflows/bump-inference-defaults.yml
@@ -14,7 +14,7 @@ jobs:
  bump:
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v7

      - uses: actions/setup-go@v5
        with:
--- a/.github/workflows/bump_deps.yaml
+++ b/.github/workflows/bump_deps.yaml
@@ -42,6 +42,10 @@ jobs:
            variable: "PARAKEET_VERSION"
            branch: "master"
            file: "backend/go/parakeet-cpp/Makefile"
+          - repository: "mudler/ced.cpp"
+            variable: "CED_VERSION"
+            branch: "master"
+            file: "backend/go/ced/Makefile"
          - repository: "mudler/depth-anything.cpp"
            variable: "DEPTHANYTHING_VERSION"
            branch: "master"
@@ -88,7 +92,7 @@ jobs:
            file: "backend/go/vibevoice-cpp/Makefile"
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v7
      - name: Bump dependencies 🔧
        id: bump
        run: |
@@ -124,7 +128,7 @@ jobs:
    if: github.repository == 'mudler/LocalAI'
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v7
      - name: Bump vLLM cu130 wheel pin 🔧
        id: bump
        run: |
--- a/.github/workflows/bump_docs.yaml
+++ b/.github/workflows/bump_docs.yaml
@@ -13,7 +13,7 @@ jobs:
          - repository: "mudler/LocalAI"
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v7
      - name: Bump dependencies 🔧
        run: |
          bash .github/bump_docs.sh ${{ matrix.repository }}
--- a/.github/workflows/checksum_checker.yaml
+++ b/.github/workflows/checksum_checker.yaml
@@ -8,7 +8,7 @@ jobs:
    if: github.repository == 'mudler/LocalAI'
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v7
      - name: Configure apt mirror on runner
        uses: ./.github/actions/configure-apt-mirror
      - name: Install dependencies
--- a/.github/workflows/deploy-explorer.yaml
+++ b/.github/workflows/deploy-explorer.yaml
@@ -16,7 +16,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - uses: actions/setup-go@v5
--- a/.github/workflows/gallery-agent.yaml
+++ b/.github/workflows/gallery-agent.yaml
@@ -31,7 +31,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          token: ${{ secrets.GITHUB_TOKEN }}

--- a/.github/workflows/generate_intel_image.yaml
+++ b/.github/workflows/generate_intel_image.yaml
@@ -44,7 +44,7 @@ jobs:
        uses: docker/setup-buildx-action@master

      - name: Checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7

      - name: Cache Intel images
        uses: docker/build-push-action@v7
--- a/.github/workflows/gh-pages.yml
+++ b/.github/workflows/gh-pages.yml
@@ -28,7 +28,7 @@ jobs:
      HUGO_VERSION: "0.146.3"
    steps:
      - name: Checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          fetch-depth: 0  # needed for enableGitInfo
          submodules: true
--- a/.github/workflows/image_build.yml
+++ b/.github/workflows/image_build.yml
@@ -80,7 +80,7 @@ jobs:
    steps:

      - name: Checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7

      - name: Configure apt mirror on runner
        id: apt_mirror
--- a/.github/workflows/image_merge.yml
+++ b/.github/workflows/image_merge.yml
@@ -36,7 +36,7 @@ jobs:
      # Sparse checkout: needed for .github/scripts/ (the keepalive cleanup
      # script). Skips the rest of the source tree.
      - name: Checkout (.github/scripts only)
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          sparse-checkout: |
            .github/scripts
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -20,7 +20,7 @@ jobs:
  golangci-lint:
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v7
        with:
          # Full history so golangci-lint's new-from-merge-base can reach
          # origin/master and compute the diff against it.
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -10,7 +10,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          fetch-depth: 0
      - name: Set up Go
@@ -28,7 +28,7 @@ jobs:
    runs-on: macos-latest
    steps:
      - name: Checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          fetch-depth: 0
      - name: Set up Go
@@ -46,7 +46,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          fetch-depth: 0
      - name: Configure apt mirror on runner
--- a/.github/workflows/secscan.yaml
+++ b/.github/workflows/secscan.yaml
@@ -14,7 +14,7 @@ jobs:
      GO111MODULE: on
    steps:
      - name: Checkout Source
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        if: ${{ github.actor != 'dependabot[bot]' }}
      - name: Run Gosec Security Scanner
        if: ${{ github.actor != 'dependabot[bot]' }}
--- a/.github/workflows/test-extra.yml
+++ b/.github/workflows/test-extra.yml
@@ -50,7 +50,7 @@ jobs:
      parakeet-cpp: ${{ steps.detect.outputs.parakeet-cpp }}
    steps:
      - name: Checkout repository
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
      - name: Setup Bun
        uses: oven-sh/setup-bun@v2
      - name: Install dependencies
@@ -67,7 +67,7 @@ jobs:
  #   runs-on: ubuntu-latest
  #   steps:
  #     - name: Clone
-  #       uses: actions/checkout@v6
+  #       uses: actions/checkout@v7
  #       with:
  #         submodules: true
  #     - name: Dependencies
@@ -90,7 +90,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -113,7 +113,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -137,7 +137,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -158,7 +158,7 @@ jobs:
  #  runs-on: ubuntu-latest
  #  steps:
  #    - name: Clone
-  #      uses: actions/checkout@v6
+  #      uses: actions/checkout@v7
  #      with:
  #        submodules: true
  #    - name: Dependencies
@@ -178,7 +178,7 @@ jobs:
  #   runs-on: ubuntu-latest
  #   steps:
  #     - name: Clone
-  #       uses: actions/checkout@v6
+  #       uses: actions/checkout@v7
  #       with:
  #         submodules: true
  #     - name: Dependencies
@@ -240,7 +240,7 @@ jobs:
  #           sudo rm -rf "$AGENT_TOOLSDIRECTORY" || true
  #           df -h
  #     - name: Clone
-  #       uses: actions/checkout@v6
+  #       uses: actions/checkout@v7
  #       with:
  #         submodules: true
  #     - name: Dependencies
@@ -265,7 +265,7 @@ jobs:
  #   runs-on: ubuntu-latest
  #   steps:
  #     - name: Clone
-  #       uses: actions/checkout@v6
+  #       uses: actions/checkout@v7
  #       with:
  #         submodules: true
  #     - name: Dependencies
@@ -288,7 +288,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -309,7 +309,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -330,7 +330,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -351,7 +351,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -373,7 +373,7 @@ jobs:
  #   timeout-minutes: 45
  #   steps:
  #     - name: Clone
-  #       uses: actions/checkout@v6
+  #       uses: actions/checkout@v7
  #       with:
  #         submodules: true
  #     - name: Dependencies
@@ -394,7 +394,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -415,7 +415,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -436,7 +436,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -462,7 +462,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -484,7 +484,7 @@ jobs:
    timeout-minutes: 30
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -513,7 +513,7 @@ jobs:
    timeout-minutes: 90
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Setup Go
@@ -530,7 +530,7 @@ jobs:
    timeout-minutes: 90
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Setup Go
@@ -552,7 +552,7 @@ jobs:
    timeout-minutes: 20
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Setup Go
@@ -579,7 +579,7 @@ jobs:
    timeout-minutes: 90
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Setup Go
@@ -604,7 +604,7 @@ jobs:
    timeout-minutes: 90
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Setup Go
@@ -625,7 +625,7 @@ jobs:
    timeout-minutes: 90
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Setup Go
@@ -645,7 +645,7 @@ jobs:
    timeout-minutes: 90
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Setup Go
@@ -664,7 +664,7 @@ jobs:
    timeout-minutes: 90
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Setup Go
@@ -681,7 +681,7 @@ jobs:
    timeout-minutes: 90
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Setup Go
@@ -698,7 +698,7 @@ jobs:
    timeout-minutes: 90
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Setup Go
@@ -741,7 +741,7 @@ jobs:
  #   timeout-minutes: 90
  #   steps:
  #     - name: Clone
-  #       uses: actions/checkout@v6
+  #       uses: actions/checkout@v7
  #       with:
  #         submodules: true
  #     - name: Dependencies
@@ -783,7 +783,7 @@ jobs:
  #   timeout-minutes: 90
  #   steps:
  #     - name: Clone
-  #       uses: actions/checkout@v6
+  #       uses: actions/checkout@v7
  #       with:
  #         submodules: true
  #     - name: Dependencies
@@ -808,7 +808,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -840,7 +840,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -876,7 +876,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -915,7 +915,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -952,7 +952,7 @@ jobs:
    timeout-minutes: 90
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -987,7 +987,7 @@ jobs:
    timeout-minutes: 90
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Setup Go
@@ -1013,7 +1013,7 @@ jobs:
    timeout-minutes: 150
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -1042,7 +1042,7 @@ jobs:
    timeout-minutes: 60
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Setup Go
@@ -1058,7 +1058,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -1091,7 +1091,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -1114,7 +1114,7 @@ jobs:
    timeout-minutes: 90
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
@@ -1140,7 +1140,7 @@ jobs:
    timeout-minutes: 90
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -21,7 +21,7 @@ jobs:
        go-version: ['1.26.x']
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Free disk space
@@ -84,7 +84,7 @@ jobs:
        go-version: ['1.26.x']
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Setup Go ${{ matrix.go-version }}
--- a/.github/workflows/tests-aio.yml
+++ b/.github/workflows/tests-aio.yml
@@ -62,7 +62,7 @@ jobs:
          sudo rm -rfv build || true
          df -h
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Dependencies
--- a/.github/workflows/tests-e2e.yml
+++ b/.github/workflows/tests-e2e.yml
@@ -21,7 +21,7 @@ jobs:
        go-version: ['1.25.x']
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Configure apt mirror on runner
--- a/.github/workflows/tests-pii-ner-e2e.yml
+++ b/.github/workflows/tests-pii-ner-e2e.yml
@@ -0,0 +1,97 @@
+---
+name: 'PII NER tier E2E (live GGUF, CPU)'
+
+# Runs the real privacy-filter GGUF NER tier end-to-end on CPU — the gap the
+# hermetic tests/e2e suite cannot cover (it only exercises the in-process
+# pattern tier). Heavy (builds the C++ backend image + downloads a ~2.7 GB
+# GGUF), so it is path-filtered on PRs and otherwise runs nightly / on demand.
+#
+# This drives the container-level harness (tests/e2e-backends) via
+# `make test-extra-backend-privacy-filter`: it builds the privacy-filter image,
+# downloads the model, loads it on CPU, and asserts byte-correct, UTF-8-aligned
+# TokenClassify spans. The complementary HTTP-path specs in tests/e2e
+# (e2e_pii_ner_test.go) Skip unless PII_NER_MODEL_GGUF is wired.
+
+on:
+  workflow_dispatch:
+  schedule:
+    - cron: '0 3 * * *'
+  push:
+    branches:
+      - master
+    paths:
+      - 'backend/cpp/privacy-filter/**'
+      - 'backend/Dockerfile.privacy-filter'
+      - 'core/services/routing/pii/**'
+      - 'core/services/routing/piidetector/**'
+      - 'core/backend/token_classify.go'
+      - 'core/http/endpoints/localai/pii.go'
+      - 'core/schema/pii.go'
+      - 'tests/e2e-backends/**'
+      - 'tests/e2e/e2e_pii_ner_test.go'
+      - 'tests/e2e/e2e_suite_test.go'
+      - '.github/workflows/tests-pii-ner-e2e.yml'
+  pull_request:
+    paths:
+      - 'backend/cpp/privacy-filter/**'
+      - 'backend/Dockerfile.privacy-filter'
+      - 'core/services/routing/pii/**'
+      - 'core/services/routing/piidetector/**'
+      - 'core/backend/token_classify.go'
+      - 'core/http/endpoints/localai/pii.go'
+      - 'core/schema/pii.go'
+      - 'tests/e2e-backends/**'
+      - 'tests/e2e/e2e_pii_ner_test.go'
+      - 'tests/e2e/e2e_suite_test.go'
+      - '.github/workflows/tests-pii-ner-e2e.yml'
+
+concurrency:
+  group: ci-tests-pii-ner-e2e-${{ github.event.pull_request.number || github.sha }}-${{ github.repository }}
+  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
+
+jobs:
+  tests-pii-ner-e2e:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        go-version: ['1.25.x']
+    steps:
+      - name: Clone
+        uses: actions/checkout@v7
+        with:
+          submodules: true
+      - name: Free disk space
+        run: |
+          sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache/CodeQL || true
+          sudo docker image prune --all --force || true
+          df -h
+      - name: Configure apt mirror on runner
+        uses: ./.github/actions/configure-apt-mirror
+      - name: Setup Go ${{ matrix.go-version }}
+        uses: actions/setup-go@v5
+        with:
+          go-version: ${{ matrix.go-version }}
+          cache: false
+      - name: Proto Dependencies
+        run: |
+          curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v26.1/protoc-26.1-linux-x86_64.zip -o protoc.zip && \
+          unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
+          rm protoc.zip
+          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
+          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
+          PATH="$PATH:$HOME/go/bin" make protogen-go
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y build-essential
+      # Builds local-ai-backend:privacy-filter, downloads the GGUF, loads it on
+      # CPU and runs the token_classify capability spec (byte-offset contract).
+      - name: Run live PII NER backend E2E
+        run: PATH="$PATH:$HOME/go/bin" make test-extra-backend-privacy-filter
+      - name: Setup tmate session if tests fail
+        if: ${{ failure() }}
+        uses: mxschmitt/action-tmate@v3.23
+        with:
+          detached: true
+          connect-timeout-seconds: 180
+          limit-access-to-actor: true
--- a/.github/workflows/tests-ui-e2e.yml
+++ b/.github/workflows/tests-ui-e2e.yml
@@ -23,7 +23,7 @@ jobs:
        go-version: ['1.26.x']
    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v7
        with:
          submodules: true
      - name: Configure apt mirror on runner
--- a/.github/workflows/update_swagger.yaml
+++ b/.github/workflows/update_swagger.yaml
@@ -10,7 +10,7 @@ jobs:
      fail-fast: false
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v7
      - name: Configure apt mirror on runner
        uses: ./.github/actions/configure-apt-mirror
      - uses: actions/setup-go@v5
--- a/.gitignore
+++ b/.gitignore
@@ -91,3 +91,6 @@ core/http/react-ui/test-results/

 # Local worktrees
 .worktrees/
+
+# SDD / brainstorm scratch (agent-driven development)
+.superpowers/
--- a/10
+++ b/10
@@ -690,6 +690,16 @@ test-extra-backend-llama-cpp-transcription: docker-build-llama-cpp
 	BACKEND_TEST_CTX_SIZE=2048 \
 	$(MAKE) test-extra-backend

+## privacy-filter: the PII/NER token-classification backend. Exercises the
+## TokenClassify RPC and asserts byte-correct, UTF-8-aligned span offsets
+## against the openai-privacy-filter multilingual GGUF (CPU-runnable, ~50M
+## active params). This is the live-backend coverage for the PII NER tier.
+test-extra-backend-privacy-filter: docker-build-privacy-filter
+	BACKEND_IMAGE=local-ai-backend:privacy-filter \
+	BACKEND_TEST_MODEL_URL=https://huggingface.co/LocalAI-io/privacy-filter-multilingual-GGUF/resolve/main/privacy-filter-multilingual-f16.gguf \
+	BACKEND_TEST_CAPS=health,load,token_classify \
+	$(MAKE) test-extra-backend
+
 ## vllm is resolved from a HuggingFace model id (no file download) and
 ## exercises Predict + streaming + tool-call extraction via the hermes parser.
 ## Requires a host CPU with the SIMD instructions the prebuilt vllm CPU
--- a/README.md
+++ b/README.md
@@ -231,6 +231,7 @@ Most backends wrap a best-in-class upstream engine. A handful of them are native
 | Backend | What it does |
 |---------|-------------|
 | [parakeet.cpp](https://github.com/mudler/parakeet.cpp) | C++/GGML port of NVIDIA NeMo Parakeet ASR (tdt/ctc/rnnt/hybrid), with cache-aware streaming transcription |
+| [ced.cpp](https://github.com/mudler/ced.cpp) | C++/GGML port of the CED audio-tagging models: sound-event classification (527-class AudioSet) over REST and the realtime API for live recognition |
 | [voxtral.c](https://github.com/mudler/voxtral.c) | Voxtral Realtime 4B speech-to-text in pure C |
 | [vibevoice.cpp](https://github.com/mudler/vibevoice.cpp) | Native port of Microsoft VibeVoice for TTS (voice cloning) and long-form ASR with speaker diarization |
 | [rf-detr.cpp](https://github.com/mudler/rf-detr.cpp) | Native RF-DETR object detection and instance segmentation |
@@ -240,6 +241,8 @@ Most backends wrap a best-in-class upstream engine. A handful of them are native
 | [LocalVQE](https://github.com/localai-org/LocalVQE) | Joint acoustic echo cancellation, noise suppression, and dereverberation |
 | [local-store](https://github.com/mudler/LocalAI) | Local-first vector database for embeddings (shipped in-tree) |

+We also maintain [apex-quant](https://github.com/localai-org/apex-quant), a per-tensor, per-layer quantization recipe for Mixture-of-Experts models that exploits their structural sparsity to produce GGUFs matching or beating Q8_0 quality - and they run out of the box on stock llama.cpp.
+
 ## Resources

 - [Documentation](https://localai.io/)
--- a/backend/Dockerfile.golang
+++ b/backend/Dockerfile.golang
@@ -65,7 +65,12 @@ RUN <<EOT bash
            libwayland-dev libxrandr-dev libxcb-randr0-dev libxcb-ewmh-dev \
            git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \
            ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \
-            clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils
+            clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils && \
+        apt-get install -y mesa-vulkan-drivers libdrm2
+        # Mesa Vulkan ICD drivers (ANV/RADV/lavapipe) + their manifests. The
+        # LunarG SDK below only provides the loader and shader tooling, not
+        # hardware drivers — without Mesa, package-gpu-libs.sh has no ICD to
+        # bundle and the packaged backend finds no GPU at runtime.
        if [ "amd64" = "$TARGETARCH" ]; then
            wget "https://sdk.lunarg.com/sdk/download/1.4.335.0/linux/vulkansdk-linux-x86_64-1.4.335.0.tar.xz" && \
            tar -xf vulkansdk-linux-x86_64-1.4.335.0.tar.xz && \
--- a/backend/Dockerfile.python
+++ b/backend/Dockerfile.python
@@ -66,7 +66,12 @@ RUN <<EOT bash
            libwayland-dev libxrandr-dev libxcb-randr0-dev libxcb-ewmh-dev \
            git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \
            ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \
-            clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils
+            clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils && \
+        apt-get install -y mesa-vulkan-drivers libdrm2
+        # Mesa Vulkan ICD drivers (ANV/RADV/lavapipe) + their manifests. The
+        # LunarG SDK below only provides the loader and shader tooling, not
+        # hardware drivers — without Mesa, package-gpu-libs.sh has no ICD to
+        # bundle and the packaged backend finds no GPU at runtime.
        if [ "amd64" = "$TARGETARCH" ]; then
            wget "https://sdk.lunarg.com/sdk/download/1.4.335.0/linux/vulkansdk-linux-x86_64-1.4.335.0.tar.xz" && \
            tar -xf vulkansdk-linux-x86_64-1.4.335.0.tar.xz && \
--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -24,6 +24,9 @@ service Backend {
  rpc TokenizeString(PredictOptions) returns (TokenizationResponse) {}
  rpc Status(HealthMessage) returns (StatusResponse) {}
  rpc Detect(DetectOptions) returns (DetectResponse) {}
+  // SoundDetection runs an audio-tagging / sound-event-classification model
+  // (e.g. CED over the AudioSet ontology) on a clip and returns scored labels.
+  rpc SoundDetection(SoundDetectionRequest) returns (SoundDetectionResponse) {}
  rpc Depth(DepthRequest) returns (DepthResponse) {}
  rpc FaceVerify(FaceVerifyRequest) returns (FaceVerifyResponse) {}
  rpc FaceAnalyze(FaceAnalyzeRequest) returns (FaceAnalyzeResponse) {}
@@ -671,6 +674,24 @@ message DetectResponse {
  repeated Detection Detections = 1;
 }

+// --- Sound-event classification / audio tagging messages (CED) ---
+
+message SoundDetectionRequest {
+  string src = 1;       // audio file path (LocalAI writes the upload to disk)
+  int32 top_k = 2;      // number of top tags to return (0 = all classes)
+  float threshold = 3;  // optional: drop tags scoring below this
+}
+
+message SoundClass {
+  string label = 1;     // AudioSet class name, e.g. "Baby cry, infant cry"
+  float score = 2;      // per-class probability (multi-label, independent)
+  int32 index = 3;      // class index in the model ontology
+}
+
+message SoundDetectionResponse {
+  repeated SoundClass detections = 1;  // score-descending
+}
+
 // --- Depth estimation messages (Depth Anything 3) ---

 message DepthRequest {
--- a/backend/cpp/ik-llama-cpp/Makefile
+++ b/backend/cpp/ik-llama-cpp/Makefile
@@ -1,5 +1,5 @@

-IK_LLAMA_VERSION?=b3dfb7858cfcb9166e92f366e5af87f19ebc94be
+IK_LLAMA_VERSION?=7ccf1d209588962b96eacca325b37e9b3e8faf5e
 LLAMA_REPO?=https://github.com/ikawrakow/ik_llama.cpp

 CMAKE_ARGS?=
--- a/backend/cpp/llama-cpp/CMakeLists.txt
+++ b/backend/cpp/llama-cpp/CMakeLists.txt
@@ -50,8 +50,13 @@ add_custom_command(
        "${hw_proto}"
      DEPENDS "${hw_proto}")

-# hw_grpc_proto
-add_library(hw_grpc_proto
+# hw_grpc_proto: force STATIC. Under the CPU_ALL_VARIANTS build BUILD_SHARED_LIBS=ON
+# (ggml/llama become shared), which would otherwise make this glue library a DSO. As a
+# DSO it references the hidden-visibility symbols in the static libprotobuf.a, which the
+# linker cannot satisfy ("hidden symbol ... in libprotobuf.a is referenced by DSO").
+# Keeping it STATIC links protobuf/gRPC directly into the grpc-server executable while
+# only ggml/llama stay shared. No effect on the static variants (already BUILD_SHARED_LIBS=OFF).
+add_library(hw_grpc_proto STATIC
  ${hw_grpc_srcs}
  ${hw_grpc_hdrs}
  ${hw_proto_srcs}
--- a/backend/cpp/llama-cpp/Makefile
+++ b/backend/cpp/llama-cpp/Makefile
@@ -1,5 +1,5 @@

-LLAMA_VERSION?=f3e182816421c648188b5eab269853bf1531d950
+LLAMA_VERSION?=be4a6a63eb2b848e19c277bdcf2bd399e8af76d9
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp

 CMAKE_ARGS?=
@@ -10,8 +10,16 @@ TARGET?=--target grpc-server
 JOBS?=$(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 1)
 ARCH?=$(shell uname -m)

-# Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
-CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF -DLLAMA_CURL=OFF
+# Shared libs default to OFF: we link static gRPC and the avx/avx2/avx512/fallback
+# variants are fully static. The CPU_ALL_VARIANTS build flips SHARED_LIBS=ON (ggml/llama
+# become shared so the dynamic CPU backends work; gRPC stays static via its imported
+# targets). SHARED_LIBS is a make variable, not an appended -D, so it survives the
+# recursive sub-make into the VARIANT build dir (which re-parses this Makefile) instead
+# of being re-clobbered by a second -DBUILD_SHARED_LIBS=OFF. EXTRA_CMAKE_ARGS is the hook
+# the CPU_ALL_VARIANTS target uses to inject -DGGML_BACKEND_DL/-DGGML_CPU_ALL_VARIANTS.
+SHARED_LIBS?=OFF
+EXTRA_CMAKE_ARGS?=
+CMAKE_ARGS+=-DBUILD_SHARED_LIBS=$(SHARED_LIBS) -DLLAMA_CURL=OFF $(EXTRA_CMAKE_ARGS)

 CURRENT_MAKEFILE_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
 ifeq ($(NATIVE),false)
@@ -120,6 +128,30 @@ llama-cpp-fallback: llama.cpp
 	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) VARIANT="llama-cpp-fallback-build" build-llama-cpp-grpc-server
 	cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-fallback-build/grpc-server llama-cpp-fallback

+# Single-build CPU backend using ggml's CPU_ALL_VARIANTS. Produces ONE grpc-server
+# plus a set of dlopen-able libggml-cpu-*.so (sandybridge/haswell/skylakex/...) that
+# ggml's backend registry selects from at runtime by probing host CPU features.
+# Replaces the avx/avx2/avx512/fallback multi-binary build on x86.
+#
+# CPU_ALL_VARIANTS requires GGML_BACKEND_DL, which requires BUILD_SHARED_LIBS=ON, so we
+# pass SHARED_LIBS=ON and the DL flags as make variables (NOT pre-expanded into the
+# CMAKE_ARGS env string): command-line make variables propagate through every recursive
+# sub-make, so the deepest VARIANT-dir build computes BUILD_SHARED_LIBS=ON consistently.
+# Only ggml/llama go shared - gRPC is found via its static imported targets, so the
+# grpc-server binary keeps static gRPC and only dynamically links ggml.
+#
+# TARGET adds "ggml": the per-microarch backends are runtime-dlopened, not link deps of
+# grpc-server, so they only build because each is an add_dependencies() of the ggml target.
+llama-cpp-cpu-all: llama.cpp
+	cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-cpu-all-build
+	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-cpu-all-build purge
+	$(info ${GREEN}I llama-cpp build info:cpu-all-variants${RESET})
+	$(MAKE) SHARED_LIBS=ON EXTRA_CMAKE_ARGS="-DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON" TARGET="--target grpc-server --target ggml" VARIANT="llama-cpp-cpu-all-build" build-llama-cpp-grpc-server
+	cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-cpu-all-build/grpc-server llama-cpp-cpu-all
+	rm -rf ggml-shared-libs && mkdir -p ggml-shared-libs
+	find $(CURRENT_MAKEFILE_DIR)/../llama-cpp-cpu-all-build/llama.cpp/build \( -name '*.so*' -o -name '*.dylib' \) -exec cp -av {} ggml-shared-libs/ \;
+	@echo "Collected ggml shared backends:" && ls -la ggml-shared-libs/
+
 llama-cpp-grpc: llama.cpp
 	cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build
 	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build purge
--- a/backend/cpp/llama-cpp/grpc-server.cpp
+++ b/backend/cpp/llama-cpp/grpc-server.cpp
@@ -18,6 +18,18 @@
 #if __has_include("server-chat.cpp")
 #include "server-chat.cpp"
 #endif
+// server-schema.cpp exists only in llama.cpp after the upstream refactor that
+// extracted the JSON request-schema evaluation (previously the static
+// server_task::params_from_json_cmpl) into server_schema::eval_llama_cmpl_schema.
+// server-context.cpp and grpc-server.cpp both call into it, so its definitions
+// must be part of this translation unit or the link fails. __has_include keeps
+// the source compatible with older pins/forks (e.g. llama-cpp-turboquant) that
+// predate the split and still expose params_from_json_cmpl (see the guarded
+// call sites below).
+#if __has_include("server-schema.cpp")
+#define LOCALAI_HAS_SERVER_SCHEMA 1
+#include "server-schema.cpp"
+#endif
 #include "server-context.cpp"

 // LocalAI
@@ -2102,7 +2114,11 @@ public:
                task.index = i;

                task.tokens    = std::move(inputs[i]);
+#ifdef LOCALAI_HAS_SERVER_SCHEMA
+                task.params           = server_schema::eval_llama_cmpl_schema(
+#else
                task.params           = server_task::params_from_json_cmpl(
+#endif
                        ctx_server.impl->vocab,
                        params_base,
                        ctx_server.get_meta().slot_n_ctx,
@@ -2116,7 +2132,7 @@ public:
                // cannot detect tool calls or separate reasoning from content.
                task.params.res_type                 = TASK_RESPONSE_TYPE_OAI_CHAT;
                task.params.oaicompat_cmpl_id         = completion_id;
-                // oaicompat_model is already populated by params_from_json_cmpl
+                // oaicompat_model is already populated by eval_llama_cmpl_schema

                tasks.push_back(std::move(task));
            }
@@ -2940,7 +2956,11 @@ public:
                task.index = i;

                task.tokens    = std::move(inputs[i]);
+#ifdef LOCALAI_HAS_SERVER_SCHEMA
+                task.params           = server_schema::eval_llama_cmpl_schema(
+#else
                task.params           = server_task::params_from_json_cmpl(
+#endif
                        ctx_server.impl->vocab,
                        params_base,
                        ctx_server.get_meta().slot_n_ctx,
@@ -2952,7 +2972,7 @@ public:
                // reasoning, tool calls, and content are classified into ChatDeltas.
                task.params.res_type                 = TASK_RESPONSE_TYPE_OAI_CHAT;
                task.params.oaicompat_cmpl_id         = completion_id;
-                // oaicompat_model is already populated by params_from_json_cmpl
+                // oaicompat_model is already populated by eval_llama_cmpl_schema

                tasks.push_back(std::move(task));
            }
--- a/backend/cpp/llama-cpp/package.sh
+++ b/backend/cpp/llama-cpp/package.sh
@@ -14,6 +14,22 @@ mkdir -p $CURDIR/package/lib
 cp -avrf $CURDIR/llama-cpp-* $CURDIR/package/
 cp -rfv $CURDIR/run.sh $CURDIR/package/

+# Bundle the ggml shared backends produced by the CPU_ALL_VARIANTS build (libggml-base.so,
+# libggml.so, libllama.so and the per-microarch libggml-cpu-*.so), all into package/lib.
+#
+# Two distinct resolution mechanisms both land here:
+#   - NEEDED deps (libggml-base/libggml/libllama): resolved by the dynamic linker via the
+#     LD_LIBRARY_PATH=$CURDIR/lib that run.sh exports.
+#   - The per-microarch libggml-cpu-*.so are NOT linked; ggml *discovers* them at runtime by
+#     scanning the executable's own directory (readlink /proc/self/exe). run.sh launches via
+#     the bundled $CURDIR/lib/ld.so, so /proc/self/exe -> .../lib/ld.so and ggml scans lib/.
+#     That is why the variants must sit in lib/ (next to ld.so), not just on the link path.
+# No-op on builds (arm64/darwin) that don't produce the all-variants set.
+if [ -d "$CURDIR/ggml-shared-libs" ]; then
+    echo "Bundling ggml shared backends (CPU_ALL_VARIANTS)..."
+    cp -avf $CURDIR/ggml-shared-libs/*.so* $CURDIR/package/lib/
+fi
+
 # Detect architecture and copy appropriate libraries
 if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
    # x86_64 architecture
--- a/backend/cpp/llama-cpp/run.sh
+++ b/backend/cpp/llama-cpp/run.sh
@@ -12,26 +12,11 @@ grep -e "flags" /proc/cpuinfo | head -1

 BINARY=llama-cpp-fallback

-if grep -q -e "\savx\s" /proc/cpuinfo ; then
-	echo "CPU:    AVX    found OK"
-	if [ -e $CURDIR/llama-cpp-avx ]; then
-		BINARY=llama-cpp-avx
-	fi
-fi
-
-if grep -q -e "\savx2\s" /proc/cpuinfo ; then
-	echo "CPU:    AVX2   found OK"
-	if [ -e $CURDIR/llama-cpp-avx2 ]; then
-		BINARY=llama-cpp-avx2
-	fi
-fi
-
-# Check avx 512
-if grep -q -e "\savx512f\s" /proc/cpuinfo ; then
-	echo "CPU:    AVX512F found OK"
-	if [ -e $CURDIR/llama-cpp-avx512 ]; then
-		BINARY=llama-cpp-avx512
-	fi
+# x86 ships a single llama-cpp-cpu-all built with ggml CPU_ALL_VARIANTS: ggml's backend
+# registry dlopens the best libggml-cpu-*.so for this host, so no shell-side AVX probing.
+# arm64/darwin builds ship only llama-cpp-fallback, so fall back to it when cpu-all absent.
+if [ -e $CURDIR/llama-cpp-cpu-all ]; then
+	BINARY=llama-cpp-cpu-all
 fi

 if [ -n "$LLAMACPP_GRPC_SERVERS" ]; then
--- a/backend/cpp/privacy-filter/Makefile
+++ b/backend/cpp/privacy-filter/Makefile
@@ -8,7 +8,7 @@
 # Local development: point at a working checkout instead of cloning, e.g.
 #   make PRIVACY_FILTER_SRC=$HOME/c/privacy-filter.cpp grpc-server

-PRIVACY_FILTER_VERSION?=646342f7a59c6b7d195185eac60bad762e572f1d
+PRIVACY_FILTER_VERSION?=98f52c5ef2250f207cc6b9a6aef05393a120cb7c
 PRIVACY_FILTER_REPO?=https://github.com/localai-org/privacy-filter.cpp
 PRIVACY_FILTER_SRC?=

--- a/backend/cpp/turboquant/Makefile
+++ b/backend/cpp/turboquant/Makefile
@@ -65,6 +65,29 @@ turboquant-avx:
 turboquant-fallback:
 	$(call turboquant-build,fallback,-DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off,--target grpc-server)

+# Single-build CPU backend via ggml CPU_ALL_VARIANTS (mirrors llama-cpp-cpu-all).
+# turboquant reuses backend/cpp/llama-cpp's CMakeLists.txt (hw_grpc_proto STATIC) and
+# Makefile (SHARED_LIBS make-var + EXTRA_CMAKE_ARGS), so this passes the same overrides
+# through to the copied build: SHARED_LIBS=ON, the DL flags, and --target ggml (which
+# pulls in the per-microarch libggml-cpu-*.so via ggml's add_dependencies). The .so set
+# is collected for package.sh to bundle into package/lib.
+turboquant-cpu-all:
+	rm -rf $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build
+	cp -rf $(LLAMA_CPP_DIR) $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build
+	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build purge
+	bash $(CURRENT_MAKEFILE_DIR)/patch-grpc-server.sh $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build/grpc-server.cpp
+	$(info $(GREEN)I turboquant build info:cpu-all-variants$(RESET))
+	LLAMA_REPO=$(LLAMA_REPO) LLAMA_VERSION=$(TURBOQUANT_VERSION) \
+	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build llama.cpp
+	bash $(CURRENT_MAKEFILE_DIR)/apply-patches.sh $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build/llama.cpp $(PATCHES_DIR)
+	SHARED_LIBS=ON EXTRA_CMAKE_ARGS="-DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON" TARGET="--target grpc-server --target ggml" \
+	LLAMA_REPO=$(LLAMA_REPO) LLAMA_VERSION=$(TURBOQUANT_VERSION) \
+	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build grpc-server
+	cp -rfv $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build/grpc-server turboquant-cpu-all
+	rm -rf ggml-shared-libs && mkdir -p ggml-shared-libs
+	find $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build/llama.cpp/build \( -name '*.so*' -o -name '*.dylib' \) -exec cp -av {} ggml-shared-libs/ \;
+	@echo "Collected ggml shared backends:" && ls -la ggml-shared-libs/
+
 turboquant-grpc:
 	$(call turboquant-build,grpc,-DGGML_RPC=ON -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off,--target grpc-server --target rpc-server)

--- a/backend/cpp/turboquant/package.sh
+++ b/backend/cpp/turboquant/package.sh
@@ -14,6 +14,15 @@ mkdir -p $CURDIR/package/lib
 cp -avrf $CURDIR/turboquant-* $CURDIR/package/
 cp -rfv $CURDIR/run.sh $CURDIR/package/

+# Bundle the ggml shared backends from the CPU_ALL_VARIANTS build into package/lib. ggml
+# discovers the per-microarch libggml-cpu-*.so by scanning the executable directory, which
+# (via the bundled lib/ld.so that run.sh launches through) resolves to lib/. See the
+# matching comment in backend/cpp/llama-cpp/package.sh. No-op on the fallback/ROCm builds.
+if [ -d "$CURDIR/ggml-shared-libs" ]; then
+    echo "Bundling ggml shared backends (CPU_ALL_VARIANTS)..."
+    cp -avf $CURDIR/ggml-shared-libs/*.so* $CURDIR/package/lib/
+fi
+
 # Detect architecture and copy appropriate libraries
 if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
    # x86_64 architecture
--- a/backend/cpp/turboquant/run.sh
+++ b/backend/cpp/turboquant/run.sh
@@ -12,26 +12,11 @@ grep -e "flags" /proc/cpuinfo | head -1

 BINARY=turboquant-fallback

-if grep -q -e "\savx\s" /proc/cpuinfo ; then
-	echo "CPU:    AVX    found OK"
-	if [ -e $CURDIR/turboquant-avx ]; then
-		BINARY=turboquant-avx
-	fi
-fi
-
-if grep -q -e "\savx2\s" /proc/cpuinfo ; then
-	echo "CPU:    AVX2   found OK"
-	if [ -e $CURDIR/turboquant-avx2 ]; then
-		BINARY=turboquant-avx2
-	fi
-fi
-
-# Check avx 512
-if grep -q -e "\savx512f\s" /proc/cpuinfo ; then
-	echo "CPU:    AVX512F found OK"
-	if [ -e $CURDIR/turboquant-avx512 ]; then
-		BINARY=turboquant-avx512
-	fi
+# x86/arm64 ship a single turboquant-cpu-all built with ggml CPU_ALL_VARIANTS: ggml's
+# backend registry dlopens the best libggml-cpu-*.so for this host, so no shell-side
+# probing. ROCm ships only turboquant-fallback, so fall back to it when cpu-all is absent.
+if [ -e $CURDIR/turboquant-cpu-all ]; then
+	BINARY=turboquant-cpu-all
 fi

 if [ -n "$LLAMACPP_GRPC_SERVERS" ]; then
--- a/backend/go/ced/.gitignore
+++ b/backend/go/ced/.gitignore
@@ -0,0 +1,11 @@
+.cache/
+sources/
+build/
+package/
+ced-grpc
+# build artifacts staged in-tree by the Makefile (cp from sources/) or
+# symlinked for local dev; the real sources live in ced.cpp upstream.
+*.so
+*.so.*
+ced_capi.h
+compile_commands.json
--- a/backend/go/ced/Makefile
+++ b/backend/go/ced/Makefile
@@ -0,0 +1,77 @@
+# ced sound-classification backend Makefile.
+#
+# Upstream pin lives below as CED_VERSION?=<sha> so .github/bump_deps.sh can find
+# and update it (matches the parakeet-cpp / whisper.cpp convention).
+#
+# Local dev shortcut: symlink an out-of-tree ced.cpp shared build + header and
+# skip the clone/cmake steps entirely:
+#   ln -sf /path/to/ced.cpp/build-shared/libced.so .
+#   ln -sf /path/to/ced.cpp/include/ced_capi.h .
+#   go build -o ced-grpc .
+
+CED_VERSION?=c04ac14b7992d00584d9e812c9bb6268598a6ce7
+CED_REPO?=https://github.com/mudler/ced.cpp
+
+GOCMD?=go
+GO_TAGS?=
+JOBS?=$(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)
+
+BUILD_TYPE?=
+NATIVE?=false
+
+# Static-link ggml into libced.so (PIC) so the shared lib is self-contained:
+# dlopen needs no libggml*.so alongside it, only system libs the runtime image
+# already provides.
+CMAKE_ARGS?=-DCMAKE_BUILD_TYPE=Release -DCED_SHARED=ON -DCED_BUILD_CLI=OFF -DCED_BUILD_TESTS=OFF -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+
+ifeq ($(NATIVE),false)
+	CMAKE_ARGS+=-DGGML_NATIVE=OFF
+endif
+
+# ced.cpp gates its ggml backends behind CED_GGML_* options (set(... CACHE BOOL
+# "" FORCE)), so forward those instead of a bare -DGGML_CUDA=ON.
+ifeq ($(BUILD_TYPE),cublas)
+	CMAKE_ARGS+=-DCED_GGML_CUDA=ON -DGGML_CUDA_GRAPHS=ON
+else ifeq ($(BUILD_TYPE),openblas)
+	CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
+else ifeq ($(BUILD_TYPE),hipblas)
+	CMAKE_ARGS+=-DCED_GGML_HIP=ON
+else ifeq ($(BUILD_TYPE),vulkan)
+	CMAKE_ARGS+=-DCED_GGML_VULKAN=ON
+endif
+
+.PHONY: ced-grpc package build clean purge test all
+
+all: ced-grpc
+
+sources/ced.cpp:
+	mkdir -p sources/ced.cpp
+	cd sources/ced.cpp && \
+	git init -q && \
+	git remote add origin $(CED_REPO) && \
+	git fetch --depth 1 origin $(CED_VERSION) && \
+	git checkout FETCH_HEAD && \
+	git submodule update --init --recursive --depth 1 --single-branch
+
+libced.so: sources/ced.cpp
+	cmake -B sources/ced.cpp/build-shared -S sources/ced.cpp $(CMAKE_ARGS)
+	cmake --build sources/ced.cpp/build-shared --config Release -j$(JOBS)
+	cp -fv sources/ced.cpp/build-shared/libced.so* ./ 2>/dev/null || true
+	cp -fv sources/ced.cpp/include/ced_capi.h ./
+
+ced-grpc: libced.so main.go goced.go
+	CGO_ENABLED=0 $(GOCMD) build -tags "$(GO_TAGS)" -o ced-grpc .
+
+package: ced-grpc
+	bash package.sh
+
+build: package
+
+test:
+	LD_LIBRARY_PATH=$(CURDIR):$$LD_LIBRARY_PATH $(GOCMD) test ./... -count=1
+
+clean: purge
+	rm -rf libced.so* ced_capi.h package ced-grpc
+
+purge:
+	rm -rf sources/ced.cpp
--- a/backend/go/ced/goced.go
+++ b/backend/go/ced/goced.go
@@ -0,0 +1,130 @@
+package main
+
+// Go side of the ced backend: purego bindings over ced_capi.h plus the gRPC
+// SoundDetection implementation.
+//
+// SKETCH: the pb.SoundDetection* types come from backend.proto (regenerate with
+// `make protogen-go`). The C side is single-threaded per ctx, so we guard the
+// engine with engineMu; LocalAI also serializes via base.SingleThread.
+import (
+	"context"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"sort"
+	"sync"
+	"unsafe"
+
+	"github.com/mudler/LocalAI/pkg/grpc/base"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+)
+
+// purego-bound entry points from libced.so. Names match ced_capi.h exactly.
+var (
+	CppAbiVersion       func() int32
+	CppLoad             func(ggufPath string) uintptr
+	CppFree             func(ctx uintptr)
+	CppLastError        func(ctx uintptr) string
+	CppNumClasses       func(ctx uintptr) int32
+	CppSampleRate       func(ctx uintptr) int32
+	CppClassifyPathJSON func(ctx uintptr, wavPath string, topK int32) uintptr
+	CppClassifyPcmJSON  func(ctx uintptr, pcm []float32, nSamples int32, sampleRate int32, topK int32) uintptr
+	CppFreeString       func(s uintptr)
+)
+
+// cstr copies a malloc'd C string (returned as uintptr) into a Go string and
+// frees the original via ced_capi_free_string. Empty/0 -> "".
+func cstr(p uintptr) string {
+	if p == 0 {
+		return ""
+	}
+	defer CppFreeString(p)
+	var b []byte
+	for i := 0; ; i++ {
+		ch := *(*byte)(unsafe.Pointer(p + uintptr(i))) //nolint:govet // #nosec G103 -- C-owned NUL-terminated string from libced (not Go-GC memory)
+		if ch == 0 {
+			break
+		}
+		b = append(b, ch)
+	}
+	return string(b)
+}
+
+// Ced is the gRPC backend. One loaded CED model per instance.
+type Ced struct {
+	base.Base
+	ctxPtr   uintptr
+	engineMu sync.Mutex
+}
+
+// Load resolves the GGUF and opens the C-API context.
+func (c *Ced) Load(opts *pb.ModelOptions) error {
+	if opts.ModelFile == "" {
+		return errors.New("ced: ModelFile is required")
+	}
+	ctx := CppLoad(opts.ModelFile)
+	if ctx == 0 {
+		return fmt.Errorf("ced: ced_capi_load failed for %q: %s", opts.ModelFile, CppLastError(0))
+	}
+	c.ctxPtr = ctx
+	return nil
+}
+
+// jsonTag mirrors the ced_capi JSON tag objects.
+type jsonTag struct {
+	Index int     `json:"index"`
+	Score float32 `json:"score"`
+	Label string  `json:"label"`
+}
+
+// SoundDetection classifies the clip at req.Src and returns scored AudioSet tags.
+func (c *Ced) SoundDetection(ctx context.Context, req *pb.SoundDetectionRequest) (*pb.SoundDetectionResponse, error) {
+	if c.ctxPtr == 0 {
+		return nil, errors.New("ced: model not loaded")
+	}
+	if req.GetSrc() == "" {
+		return nil, errors.New("ced: SoundDetectionRequest.src (audio path) is required")
+	}
+	topK := req.GetTopK()
+	if topK <= 0 {
+		topK = 10 // sensible default for a tagging response
+	}
+
+	c.engineMu.Lock()
+	out := cstr(CppClassifyPathJSON(c.ctxPtr, req.GetSrc(), topK))
+	lastErr := CppLastError(c.ctxPtr)
+	c.engineMu.Unlock()
+
+	if out == "" {
+		return nil, fmt.Errorf("ced: classification failed: %s", lastErr)
+	}
+	var tags []jsonTag
+	if err := json.Unmarshal([]byte(out), &tags); err != nil {
+		return nil, fmt.Errorf("ced: bad classifier JSON: %w", err)
+	}
+
+	thr := req.GetThreshold()
+	resp := &pb.SoundDetectionResponse{}
+	for _, t := range tags {
+		if t.Score < thr {
+			continue
+		}
+		resp.Detections = append(resp.Detections, &pb.SoundClass{
+			Label: t.Label, Score: t.Score, Index: int32(t.Index),
+		})
+	}
+	sort.Slice(resp.Detections, func(i, j int) bool {
+		return resp.Detections[i].Score > resp.Detections[j].Score
+	})
+	return resp, nil
+}
+
+func (c *Ced) Free() error {
+	c.engineMu.Lock()
+	defer c.engineMu.Unlock()
+	if c.ctxPtr != 0 {
+		CppFree(c.ctxPtr)
+		c.ctxPtr = 0
+	}
+	return nil
+}
--- a/backend/go/ced/main.go
+++ b/backend/go/ced/main.go
@@ -0,0 +1,59 @@
+package main
+
+// ced sound-classification backend. Started internally by LocalAI: one gRPC
+// server per loaded model. Loads libced.so via purego and registers the flat
+// C-API declared in ced_capi.h. The library name can be overridden with
+// CED_LIBRARY (mirrors PARAKEET_LIBRARY / WHISPER_LIBRARY); the default looks
+// for the .so next to this binary.
+//
+// SKETCH: requires `make protogen-go` after the backend.proto SoundDetection
+// addition, and a built libced.so (see Makefile). See DESIGN.md.
+import (
+	"flag"
+	"fmt"
+	"os"
+
+	"github.com/ebitengine/purego"
+	grpc "github.com/mudler/LocalAI/pkg/grpc"
+)
+
+var addr = flag.String("addr", "localhost:50051", "the address to connect to")
+
+type libFunc struct {
+	ptr  any
+	name string
+}
+
+func main() {
+	libName := os.Getenv("CED_LIBRARY")
+	if libName == "" {
+		libName = "libced.so"
+	}
+	lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
+	if err != nil {
+		panic(fmt.Errorf("ced: dlopen %q: %w", libName, err))
+	}
+
+	// Bound 1:1 to ced_capi.h. char*-returning functions are declared uintptr
+	// so we can free the same pointer with ced_capi_free_string after copying
+	// (purego's string return would copy and leak the original).
+	for _, lf := range []libFunc{
+		{&CppAbiVersion, "ced_capi_abi_version"},
+		{&CppLoad, "ced_capi_load"},
+		{&CppFree, "ced_capi_free"},
+		{&CppLastError, "ced_capi_last_error"},
+		{&CppNumClasses, "ced_capi_num_classes"},
+		{&CppSampleRate, "ced_capi_sample_rate"},
+		{&CppClassifyPathJSON, "ced_capi_classify_path_json"},
+		{&CppClassifyPcmJSON, "ced_capi_classify_pcm_json"},
+		{&CppFreeString, "ced_capi_free_string"},
+	} {
+		purego.RegisterLibFunc(lf.ptr, lib, lf.name)
+	}
+
+	fmt.Fprintf(os.Stderr, "[ced] ABI=%d\n", CppAbiVersion())
+	flag.Parse()
+	if err := grpc.StartServer(*addr, &Ced{}); err != nil {
+		panic(err)
+	}
+}
--- a/backend/go/ced/package.sh
+++ b/backend/go/ced/package.sh
@@ -0,0 +1,60 @@
+#!/bin/bash
+#
+# Bundle the ced-grpc binary, libced.so, the core runtime libs (libc/libstdc++/
+# libgomp + ld.so) and the GPU runtime for the active BUILD_TYPE so the package
+# is self-contained. Mirrors backend/go/parakeet-cpp/package.sh; run.sh routes
+# the (CGO_ENABLED=0) binary through lib/ld.so so the packaged libc is used.
+
+set -e
+
+CURDIR=$(dirname "$(realpath "$0")")
+REPO_ROOT="${CURDIR}/../../.."
+
+mkdir -p "$CURDIR/package/lib"
+
+cp -avf "$CURDIR/ced-grpc" "$CURDIR/package/"
+cp -avf "$CURDIR/run.sh" "$CURDIR/package/"
+
+cp -avf "$CURDIR"/libced.so* "$CURDIR/package/lib/" 2>/dev/null || {
+	echo "ERROR: libced.so not found in $CURDIR, run 'make' first" >&2
+	exit 1
+}
+
+if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
+    echo "Detected x86_64 architecture, copying x86_64 libraries..."
+    cp -arfLv /lib64/ld-linux-x86-64.so.2 "$CURDIR/package/lib/ld.so"
+    cp -arfLv /lib/x86_64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6"
+    cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1"
+    cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6"
+    cp -arfLv /lib/x86_64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6"
+    cp -arfLv /lib/x86_64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1"
+    cp -arfLv /lib/x86_64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2"
+    cp -arfLv /lib/x86_64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1"
+    cp -arfLv /lib/x86_64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0"
+elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
+    echo "Detected ARM64 architecture, copying ARM64 libraries..."
+    cp -arfLv /lib/ld-linux-aarch64.so.1 "$CURDIR/package/lib/ld.so"
+    cp -arfLv /lib/aarch64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6"
+    cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1"
+    cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6"
+    cp -arfLv /lib/aarch64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6"
+    cp -arfLv /lib/aarch64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1"
+    cp -arfLv /lib/aarch64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2"
+    cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1"
+    cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0"
+elif [ "$(uname -s)" = "Darwin" ]; then
+    echo "Detected Darwin"
+else
+    echo "Error: Could not detect architecture"
+    exit 1
+fi
+
+GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
+if [ -f "$GPU_LIB_SCRIPT" ]; then
+    echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
+    source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib"
+    package_gpu_libs
+fi
+
+echo "Packaging completed successfully"
+ls -liah "$CURDIR/package/" "$CURDIR/package/lib/"
--- a/backend/go/ced/run.sh
+++ b/backend/go/ced/run.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+set -e
+
+CURDIR=$(dirname "$(realpath "$0")")
+
+export LD_LIBRARY_PATH="$CURDIR/lib:$CURDIR:${LD_LIBRARY_PATH:-}"
+
+# If a self-contained ld.so was packaged, route through it so the packaged
+# libc / libstdc++ are used instead of the host's (matches the sibling backends).
+if [ -f "$CURDIR/lib/ld.so" ]; then
+	echo "Using lib/ld.so"
+	exec "$CURDIR/lib/ld.so" "$CURDIR/ced-grpc" "$@"
+fi
+
+exec "$CURDIR/ced-grpc" "$@"
--- a/backend/go/crispasr/Makefile
+++ b/backend/go/crispasr/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # CrispASR version (release tag)
 CRISPASR_REPO?=https://github.com/CrispStrobe/CrispASR
-CRISPASR_VERSION?=d745bda4386ae0f9d1d2f23fff8ec95d76428221
+CRISPASR_VERSION?=96b2a6ee31d30389fed8a7ef1a54239b75231ddc
 SO_TARGET?=libgocrispasr.so

 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
@@ -67,7 +67,7 @@ sources/CrispASR:
 	# it, so ${CMAKE_SOURCE_DIR} is THIS backend dir and the talk-llama sources
 	# aren't found. Rewrite to ${PROJECT_SOURCE_DIR} (the crispasr project root),
 	# which is correct both standalone and as a subproject. Idempotent.
-	sed -i 's#\$${CMAKE_SOURCE_DIR}/examples/talk-llama#\$${PROJECT_SOURCE_DIR}/examples/talk-llama#' sources/CrispASR/src/CMakeLists.txt
+	sed -i.bak 's#\$${CMAKE_SOURCE_DIR}/examples/talk-llama#\$${PROJECT_SOURCE_DIR}/examples/talk-llama#' sources/CrispASR/src/CMakeLists.txt && rm -f sources/CrispASR/src/CMakeLists.txt.bak

 # Detect OS
 UNAME_S := $(shell uname -s)
--- a/backend/go/crispasr/cpp/crispasr_shim.cpp
+++ b/backend/go/crispasr/cpp/crispasr_shim.cpp
@@ -47,6 +47,74 @@ extern "C" void set_abort(int v) {
  g_abort.store(v, std::memory_order_relaxed);
 }

+// --- word-level timestamp accessors ---
+extern "C" {
+int crispasr_session_result_n_words(crispasr_session_result *r, int seg_i);
+const char *crispasr_session_result_word_text(crispasr_session_result *r,
+                                               int seg_i, int word_i);
+int64_t crispasr_session_result_word_t0(crispasr_session_result *r, int seg_i,
+                                         int word_i);
+int64_t crispasr_session_result_word_t1(crispasr_session_result *r, int seg_i,
+                                         int word_i);
+
+// Parakeet-specific word accessors
+int crispasr_parakeet_result_n_words(void *r);
+const char *crispasr_parakeet_result_word_text(void *r, int word_i);
+int64_t crispasr_parakeet_result_word_t0(void *r, int word_i);
+int64_t crispasr_parakeet_result_word_t1(void *r, int word_i);
+}
+
+void *get_result(void) { return g_result; }
+
+int get_word_count(int seg_i) {
+  if (!g_result)
+    return 0;
+  return crispasr_session_result_n_words(g_result, seg_i);
+}
+
+const char *get_word_text(int seg_i, int word_i) {
+  if (!g_result)
+    return "";
+  return crispasr_session_result_word_text(g_result, seg_i, word_i);
+}
+
+int64_t get_word_t0(int seg_i, int word_i) {
+  if (!g_result)
+    return 0;
+  return crispasr_session_result_word_t0(g_result, seg_i, word_i);
+}
+
+int64_t get_word_t1(int seg_i, int word_i) {
+  if (!g_result)
+    return 0;
+  return crispasr_session_result_word_t1(g_result, seg_i, word_i);
+}
+
+// Parakeet-specific word accessors
+int get_parakeet_word_count(void) {
+  if (!g_result)
+    return 0;
+  return crispasr_parakeet_result_n_words(g_result);
+}
+
+const char *get_parakeet_word_text(int word_i) {
+  if (!g_result)
+    return "";
+  return crispasr_parakeet_result_word_text(g_result, word_i);
+}
+
+int64_t get_parakeet_word_t0(int word_i) {
+  if (!g_result)
+    return 0;
+  return crispasr_parakeet_result_word_t0(g_result, word_i);
+}
+
+int64_t get_parakeet_word_t1(int word_i) {
+  if (!g_result)
+    return 0;
+  return crispasr_parakeet_result_word_t1(g_result, word_i);
+}
+
 static void ggml_log_cb(enum ggml_log_level level, const char *log,
                        void *data) {
  const char *level_str;
--- a/backend/go/crispasr/cpp/crispasr_shim.h
+++ b/backend/go/crispasr/cpp/crispasr_shim.h
@@ -20,4 +20,18 @@ float *tts_synthesize(const char *text, int *out_n_samples); // 24kHz mono float
 void tts_free(float *pcm);
 int tts_set_voice(const char *name); // best-effort speaker selection; 0 ok
 int tts_set_voice_file(const char *path, const char *ref_text); // load voice pack (.gguf) or zero-shot clone (.wav + ref_text)
+
+// --- word-level timestamp accessors ---
+// Session-based (works for whisper-like backends)
+void *get_result(void);
+int get_word_count(int seg_i);
+const char *get_word_text(int seg_i, int word_i);
+int64_t get_word_t0(int seg_i, int word_i);
+int64_t get_word_t1(int seg_i, int word_i);
+
+// Parakeet-specific (global word list, no segment index)
+int get_parakeet_word_count(void);
+const char *get_parakeet_word_text(int word_i);
+int64_t get_parakeet_word_t0(int word_i);
+int64_t get_parakeet_word_t1(int word_i);
 }
--- a/backend/go/crispasr/gocrispasr.go
+++ b/backend/go/crispasr/gocrispasr.go
@@ -34,6 +34,18 @@ var (
 	CppTTSFree         func(ptr uintptr)
 	CppTTSSetVoice     func(name string) int
 	CppTTSSetVoiceFile func(path string, refText string) int
+
+	// Word-level timestamp accessors (session-based, per-segment)
+	CppGetWordCount func(segI int) int
+	CppGetWordText  func(segI int, wordI int) string
+	CppGetWordT0    func(segI int, wordI int) int64
+	CppGetWordT1    func(segI int, wordI int) int64
+
+	// Parakeet-specific word accessors (global, no segment index)
+	CppGetParakeetWordCount func() int
+	CppGetParakeetWordText  func(wordI int) string
+	CppGetParakeetWordT0    func(wordI int) int64
+	CppGetParakeetWordT1    func(wordI int) int64
 )

 type CrispASR struct {
@@ -212,6 +224,28 @@ func (w *CrispASR) VAD(req *pb.VADRequest) (pb.VADResponse, error) {
 	}, nil
 }

+// isValidWord reports whether a TranscriptWord contains recognisable speech
+// content. The parakeet-specific word accessors can return stale initialisation
+// data (model name, binary blobs) when a segment has no real speech. A word is
+// considered valid only when:
+//   - the text is non-empty after trimming,
+//   - it contains no U+FFFD replacement characters (from binary data scrubbing),
+//   - both timestamps are non-negative,
+//   - the word has positive duration (end > start).
+func isValidWord(w *pb.TranscriptWord) bool {
+	txt := strings.TrimSpace(w.Text)
+	if txt == "" {
+		return false
+	}
+	if strings.ContainsRune(txt, '\uFFFD') {
+		return false
+	}
+	if w.Start < 0 || w.End < 0 || w.End <= w.Start {
+		return false
+	}
+	return true
+}
+
 func (w *CrispASR) AudioTranscription(ctx context.Context, opts *pb.TranscriptRequest) (pb.TranscriptResult, error) {
 	if err := ctx.Err(); err != nil {
 		return pb.TranscriptResult{}, status.Error(codes.Canceled, "transcription cancelled")
@@ -290,15 +324,54 @@ func (w *CrispASR) AudioTranscription(ctx context.Context, opts *pb.TranscriptRe
 		// IDs, so Tokens is left empty.
 		txt := strings.ToValidUTF8(strings.Clone(CppGetSegmentText(i)), "<22>")

+		// Populate word-level timestamps. Try session-based functions first
+		// (per-segment); fall back to parakeet-specific functions (global word
+		// list with no segment index — only populated on the first segment to
+		// avoid duplication).
+		words := []*pb.TranscriptWord{}
+		wordCount := CppGetWordCount(i)
+		if wordCount == 0 && i == 0 {
+			wordCount = CppGetParakeetWordCount()
+			for j := 0; j < wordCount; j++ {
+				w := &pb.TranscriptWord{
+					Start: CppGetParakeetWordT0(j) * (10000000),
+					End:   CppGetParakeetWordT1(j) * (10000000),
+					Text:  strings.ToValidUTF8(strings.Clone(CppGetParakeetWordText(j)), "<22>"),
+				}
+				if isValidWord(w) {
+					words = append(words, w)
+				}
+			}
+		} else {
+			for j := 0; j < wordCount; j++ {
+				w := &pb.TranscriptWord{
+					Start: CppGetWordT0(i, j) * (10000000),
+					End:   CppGetWordT1(i, j) * (10000000),
+					Text:  strings.ToValidUTF8(strings.Clone(CppGetWordText(i, j)), "<22>"),
+				}
+				if isValidWord(w) {
+					words = append(words, w)
+				}
+			}
+		}
+
+		// Skip empty segments with no recognisable content (e.g. trailing
+		// silence segments that parakeet emits with stale init data).
+		trimmed := strings.TrimSpace(txt)
+		if trimmed == "" && len(words) == 0 {
+			continue
+		}
+
 		segment := &pb.TranscriptSegment{
 			Id:    int32(i),
 			Text:  txt,
 			Start: s, End: t,
+			Words: words,
 		}

 		segments = append(segments, segment)

-		text += " " + strings.TrimSpace(txt)
+		text += " " + trimmed
 	}

 	return pb.TranscriptResult{
@@ -390,13 +463,20 @@ func (w *CrispASR) AudioTranscriptionStream(ctx context.Context, opts *pb.Transc
 		s := CppGetSegmentStart(i) * 10000000
 		t := CppGetSegmentEnd(i) * 10000000
 		txt := strings.ToValidUTF8(strings.Clone(CppGetSegmentText(i)), "<22>")
+
+		// Skip empty segments (e.g. trailing silence that parakeet emits
+		// with stale init data).
+		trimmed := strings.TrimSpace(txt)
+		if trimmed == "" && s == t {
+			continue
+		}
+
 		segments = append(segments, &pb.TranscriptSegment{
 			Id:    int32(i),
 			Text:  txt,
 			Start: s, End: t,
 		})

-		trimmed := strings.TrimSpace(txt)
 		if trimmed == "" {
 			continue
 		}
--- a/backend/go/crispasr/main.go
+++ b/backend/go/crispasr/main.go
@@ -44,6 +44,14 @@ func main() {
 		{&CppTTSFree, "tts_free"},
 		{&CppTTSSetVoice, "tts_set_voice"},
 		{&CppTTSSetVoiceFile, "tts_set_voice_file"},
+		{&CppGetWordCount, "get_word_count"},
+		{&CppGetWordText, "get_word_text"},
+		{&CppGetWordT0, "get_word_t0"},
+		{&CppGetWordT1, "get_word_t1"},
+		{&CppGetParakeetWordCount, "get_parakeet_word_count"},
+		{&CppGetParakeetWordText, "get_parakeet_word_text"},
+		{&CppGetParakeetWordT0, "get_parakeet_word_t0"},
+		{&CppGetParakeetWordT1, "get_parakeet_word_t1"},
 	}

 	for _, lf := range libFuncs {
--- a/backend/go/depth-anything-cpp/Makefile
+++ b/backend/go/depth-anything-cpp/Makefile
@@ -8,11 +8,13 @@ JOBS?=$(shell nproc --ignore=1)

 # depth-anything.cpp. Pin to a specific commit for a stable build; a squash
 # merge upstream can orphan a branch, so the native version is pinned by SHA.
-# This SHA adds the nested two-file metric C-API (abi_version 4,
-# da_capi_load_nested) required by the depth-anything-3-nested gallery model;
-# tag it (e.g. v0.1.3) upstream to keep the SHA alive.
+# This SHA adds the Depth Anything V2 engine + C-API routing (depth-only,
+# relative + metric) on top of the nested two-file metric C-API (abi_version 4,
+# da_capi_load_nested) required by the depth-anything-3-nested gallery model.
+# It is kept alive by the upstream tag da2-support (survives a squash-merge);
+# repoint to the master merge commit once mudler/depth-anything.cpp PR #1 lands.
 DEPTHANYTHING_REPO?=https://github.com/mudler/depth-anything.cpp.git
-DEPTHANYTHING_VERSION?=cce5edc395fd1843806093d7ccc0c8b0d0b97b72
+DEPTHANYTHING_VERSION?=f4e17dea695dd12ae76bea98ba58030996b98118

 ifeq ($(NATIVE),false)
 	CMAKE_ARGS+=-DGGML_NATIVE=OFF
--- a/backend/go/omnivoice-cpp/Makefile
+++ b/backend/go/omnivoice-cpp/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # omnivoice.cpp version
 OMNIVOICE_REPO?=https://github.com/ServeurpersoCom/omnivoice.cpp
-OMNIVOICE_VERSION?=2603355a5dfacae5cfc33531d5d0933221843509
+OMNIVOICE_VERSION?=96d30169afd5e6bb3fd6a0e9be0eb505bfe81fcd
 SO_TARGET?=libgomnivoicecpp.so

 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
--- a/backend/go/parakeet-cpp/Makefile
+++ b/backend/go/parakeet-cpp/Makefile
@@ -1,6 +1,6 @@
 # parakeet-cpp backend Makefile.
 #
-# Upstream pin lives below as PARAKEET_VERSION?=92a5f0306be354c109150fe58ae4cc4f8a21ca45
+# Upstream pin lives below as PARAKEET_VERSION?=89f5e2977b4d8bccd45e7bcc6f2ef7c4ed49e89a
 # (.github/bump_deps.sh) can find and update it - matches the
 # whisper.cpp / ds4 / vibevoice-cpp convention.
 #
@@ -15,7 +15,7 @@
 # That's what the L0 smoke test uses. The default target below does the
 # proper clone-at-pin + cmake build so CI doesn't need a side-checkout.

-PARAKEET_VERSION?=92a5f0306be354c109150fe58ae4cc4f8a21ca45
+PARAKEET_VERSION?=89f5e2977b4d8bccd45e7bcc6f2ef7c4ed49e89a
 PARAKEET_REPO?=https://github.com/mudler/parakeet.cpp

 GOCMD?=go
--- a/backend/go/parakeet-cpp/package.sh
+++ b/backend/go/parakeet-cpp/package.sh
@@ -1,23 +1,68 @@
 #!/bin/bash
 #
-# L0 packaging stub: copy the binary, run.sh and libparakeet.so* into
-# package/. The full ldd walk (libc, libstdc++, libgomp, GPU runtimes,
-# arch detection) lands in L3, mirroring backend/go/whisper/package.sh.
+# Bundle the parakeet-cpp-grpc binary, libparakeet.so, the core runtime
+# libs (libc/libstdc++/libgomp + ld.so) and the GPU runtime for the active
+# BUILD_TYPE so the package is self-contained. Mirrors
+# backend/go/whisper/package.sh; run.sh routes the (CGO_ENABLED=0) binary
+# through lib/ld.so so the packaged libc is used instead of the host's.

 set -e

 CURDIR=$(dirname "$(realpath "$0")")
+REPO_ROOT="${CURDIR}/../../.."

 mkdir -p "$CURDIR/package/lib"

 cp -avf "$CURDIR/parakeet-cpp-grpc" "$CURDIR/package/"
 cp -avf "$CURDIR/run.sh" "$CURDIR/package/"

-# libparakeet.so + any soname symlinks (libparakeet.so.X, libparakeet.so.X.Y).
+# libparakeet.so + any soname symlinks (libparakeet.so.X[.Y]). purego.Dlopen
+# resolves it via LD_LIBRARY_PATH, which run.sh points at lib/.
 cp -avf "$CURDIR"/libparakeet.so* "$CURDIR/package/lib/" 2>/dev/null || {
 	echo "ERROR: libparakeet.so not found in $CURDIR, run 'make' first" >&2
 	exit 1
 }

-echo "L0 package layout (full ldd walk lands in L3):"
+# Detect architecture and copy the core runtime libs libparakeet.so links
+# against, plus the matching dynamic loader as lib/ld.so.
+if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
+    echo "Detected x86_64 architecture, copying x86_64 libraries..."
+    cp -arfLv /lib64/ld-linux-x86-64.so.2 "$CURDIR/package/lib/ld.so"
+    cp -arfLv /lib/x86_64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6"
+    cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1"
+    cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6"
+    cp -arfLv /lib/x86_64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6"
+    cp -arfLv /lib/x86_64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1"
+    cp -arfLv /lib/x86_64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2"
+    cp -arfLv /lib/x86_64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1"
+    cp -arfLv /lib/x86_64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0"
+elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
+    echo "Detected ARM64 architecture, copying ARM64 libraries..."
+    cp -arfLv /lib/ld-linux-aarch64.so.1 "$CURDIR/package/lib/ld.so"
+    cp -arfLv /lib/aarch64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6"
+    cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1"
+    cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6"
+    cp -arfLv /lib/aarch64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6"
+    cp -arfLv /lib/aarch64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1"
+    cp -arfLv /lib/aarch64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2"
+    cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1"
+    cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0"
+elif [ "$(uname -s)" = "Darwin" ]; then
+    echo "Detected Darwin"
+else
+    echo "Error: Could not detect architecture"
+    exit 1
+fi
+
+# Package GPU libraries (CUDA/ROCm/Intel/Vulkan loader + ICDs + drivers)
+# based on BUILD_TYPE so the backend can reach the GPU without the runtime
+# base image shipping those drivers.
+GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
+if [ -f "$GPU_LIB_SCRIPT" ]; then
+    echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
+    source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib"
+    package_gpu_libs
+fi
+
+echo "Packaging completed successfully"
 ls -liah "$CURDIR/package/" "$CURDIR/package/lib/"
--- a/backend/go/qwen3-tts-cpp/Makefile
+++ b/backend/go/qwen3-tts-cpp/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # qwentts.cpp version
 QWEN3TTS_REPO?=https://github.com/ServeurpersoCom/qwentts.cpp
-QWEN3TTS_CPP_VERSION?=0bf4a18b22e8bb8718d95294e9f7f45c0d4270a4
+QWEN3TTS_CPP_VERSION?=4536dcdce27c3764a93a06d6bf64026b124962f5
 SO_TARGET?=libgoqwen3ttscpp.so

 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
--- a/backend/go/stablediffusion-ggml/Makefile
+++ b/backend/go/stablediffusion-ggml/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # stablediffusion.cpp (ggml)
 STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
-STABLEDIFFUSION_GGML_VERSION?=7f0e728b7d42f2490dfa5dd9539082d904f2f6b2
+STABLEDIFFUSION_GGML_VERSION?=f440ad9c29dd8bc34e5d1f4b863832b96d6ea05f

 CMAKE_ARGS+=-DGGML_MAX_NAME=128

--- a/backend/go/whisper/Makefile
+++ b/backend/go/whisper/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp
-WHISPER_CPP_VERSION?=86c40c3bd6fc86f1187fb751d111b49e0fc18e84
+WHISPER_CPP_VERSION?=43d78af5be58f41d6ffbc227d608f104577741ea
 SO_TARGET?=libgowhisper.so

 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
--- a/backend/index.yaml
+++ b/backend/index.yaml
@@ -178,6 +178,37 @@
    nvidia-cuda-12: "cuda12-parakeet-cpp"
    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-parakeet-cpp"
    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-parakeet-cpp"
+- &ced
+  name: "ced"
+  alias: "ced"
+  license: mit
+  icon: https://avatars.githubusercontent.com/u/95302084
+  description: |
+    CED sound-event classification / audio tagging (527-class AudioSet).
+    ced.cpp is a C++/ggml port that performs audio tagging over the AudioSet
+    taxonomy, exposed through the SoundDetection gRPC rpc and the
+    /v1/audio/classification REST endpoint. It runs on CPU, NVIDIA CUDA,
+    AMD ROCm/HIP, Intel SYCL, Vulkan and NVIDIA Jetson (L4T) targets.
+  urls:
+    - https://github.com/mudler/ced.cpp
+  tags:
+    - audio-classification
+    - CPU
+    - GPU
+    - CUDA
+    - HIP
+  capabilities:
+    default: "cpu-ced"
+    nvidia: "cuda12-ced"
+    intel: "intel-sycl-f16-ced"
+    metal: "metal-ced"
+    amd: "rocm-ced"
+    vulkan: "vulkan-ced"
+    nvidia-l4t: "nvidia-l4t-arm64-ced"
+    nvidia-cuda-13: "cuda13-ced"
+    nvidia-cuda-12: "cuda12-ced"
+    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-ced"
+    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-ced"
 - &voxtral
  name: "voxtral"
  alias: "voxtral"
@@ -2650,6 +2681,121 @@
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-parakeet-cpp"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-13-parakeet-cpp
+## ced
+- !!merge <<: *ced
+  name: "ced-development"
+  capabilities:
+    default: "cpu-ced-development"
+    nvidia: "cuda12-ced-development"
+    intel: "intel-sycl-f16-ced-development"
+    metal: "metal-ced-development"
+    amd: "rocm-ced-development"
+    vulkan: "vulkan-ced-development"
+    nvidia-l4t: "nvidia-l4t-arm64-ced-development"
+    nvidia-cuda-13: "cuda13-ced-development"
+    nvidia-cuda-12: "cuda12-ced-development"
+    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-ced-development"
+    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-ced-development"
+- !!merge <<: *ced
+  name: "nvidia-l4t-arm64-ced"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-arm64-ced"
+  mirrors:
+    - localai/localai-backends:latest-nvidia-l4t-arm64-ced
+- !!merge <<: *ced
+  name: "nvidia-l4t-arm64-ced-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-arm64-ced"
+  mirrors:
+    - localai/localai-backends:master-nvidia-l4t-arm64-ced
+- !!merge <<: *ced
+  name: "cuda13-nvidia-l4t-arm64-ced"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-ced"
+  mirrors:
+    - localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-ced
+- !!merge <<: *ced
+  name: "cuda13-nvidia-l4t-arm64-ced-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-ced"
+  mirrors:
+    - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-ced
+- !!merge <<: *ced
+  name: "cpu-ced"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-ced"
+  mirrors:
+    - localai/localai-backends:latest-cpu-ced
+- !!merge <<: *ced
+  name: "cpu-ced-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-cpu-ced"
+  mirrors:
+    - localai/localai-backends:master-cpu-ced
+- !!merge <<: *ced
+  name: "metal-ced"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-ced"
+  mirrors:
+    - localai/localai-backends:latest-metal-darwin-arm64-ced
+- !!merge <<: *ced
+  name: "metal-ced-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-ced"
+  mirrors:
+    - localai/localai-backends:master-metal-darwin-arm64-ced
+- !!merge <<: *ced
+  name: "cuda12-ced"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-ced"
+  mirrors:
+    - localai/localai-backends:latest-gpu-nvidia-cuda-12-ced
+- !!merge <<: *ced
+  name: "cuda12-ced-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-ced"
+  mirrors:
+    - localai/localai-backends:master-gpu-nvidia-cuda-12-ced
+- !!merge <<: *ced
+  name: "rocm-ced"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-ced"
+  mirrors:
+    - localai/localai-backends:latest-gpu-rocm-hipblas-ced
+- !!merge <<: *ced
+  name: "rocm-ced-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-ced"
+  mirrors:
+    - localai/localai-backends:master-gpu-rocm-hipblas-ced
+- !!merge <<: *ced
+  name: "intel-sycl-f32-ced"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-ced"
+  mirrors:
+    - localai/localai-backends:latest-gpu-intel-sycl-f32-ced
+- !!merge <<: *ced
+  name: "intel-sycl-f32-ced-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f32-ced"
+  mirrors:
+    - localai/localai-backends:master-gpu-intel-sycl-f32-ced
+- !!merge <<: *ced
+  name: "intel-sycl-f16-ced"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-ced"
+  mirrors:
+    - localai/localai-backends:latest-gpu-intel-sycl-f16-ced
+- !!merge <<: *ced
+  name: "intel-sycl-f16-ced-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f16-ced"
+  mirrors:
+    - localai/localai-backends:master-gpu-intel-sycl-f16-ced
+- !!merge <<: *ced
+  name: "vulkan-ced"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-vulkan-ced"
+  mirrors:
+    - localai/localai-backends:latest-gpu-vulkan-ced
+- !!merge <<: *ced
+  name: "vulkan-ced-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-vulkan-ced"
+  mirrors:
+    - localai/localai-backends:master-gpu-vulkan-ced
+- !!merge <<: *ced
+  name: "cuda13-ced"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-ced"
+  mirrors:
+    - localai/localai-backends:latest-gpu-nvidia-cuda-13-ced
+- !!merge <<: *ced
+  name: "cuda13-ced-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-ced"
+  mirrors:
+    - localai/localai-backends:master-gpu-nvidia-cuda-13-ced
 ## stablediffusion-ggml
 - !!merge <<: *stablediffusionggml
  name: "cpu-stablediffusion-ggml"
--- a/backend/python/diffusers/requirements-cpu.txt
+++ b/backend/python/diffusers/requirements-cpu.txt
@@ -1,7 +1,7 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
-git+https://github.com/huggingface/diffusers
+diffusers==0.38.0
 opencv-python
-transformers
+transformers==4.57.6
 torchvision==0.22.1
 accelerate
 git+https://github.com/xhinker/sd_embed
@@ -10,9 +10,15 @@ sentencepiece
 torch==2.7.1
 optimum-quanto
 ftfy
-# TODO: re-add compel once it supports transformers >= 5.
-# Tracking: https://github.com/damian0815/compel/pull/129
-#           https://github.com/damian0815/compel/issues/128
-# compel currently pins transformers~=4.25, which forced pip into multi-hour
-# resolver backtracking storms in CI. backend.py imports it lazily and gates
-# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
+# diffusers and transformers are pinned together on purpose. transformers v5
+# restructured CLIPTextModel and dropped the `.text_model` attribute, which
+# breaks single-file Stable Diffusion loading on every released diffusers
+# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
+# main via git froze whichever broken pair existed at image-build time. Pin the
+# last known-good released pair so builds are reproducible and can't drift into
+# the broken window. See https://github.com/mudler/LocalAI/issues/9979
+#
+# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
+# with this pin and previously forced pip into multi-hour resolver backtracking
+# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
+# the import succeeding, so dropping it here is safe.
--- a/backend/python/diffusers/requirements-cublas12.txt
+++ b/backend/python/diffusers/requirements-cublas12.txt
@@ -1,7 +1,7 @@
 --extra-index-url https://download.pytorch.org/whl/cu121
-git+https://github.com/huggingface/diffusers
+diffusers==0.38.0
 opencv-python
-transformers
+transformers==4.57.6
 torchvision
 accelerate
 git+https://github.com/xhinker/sd_embed
@@ -10,9 +10,15 @@ sentencepiece
 torch
 ftfy
 optimum-quanto
-# TODO: re-add compel once it supports transformers >= 5.
-# Tracking: https://github.com/damian0815/compel/pull/129
-#           https://github.com/damian0815/compel/issues/128
-# compel currently pins transformers~=4.25, which forced pip into multi-hour
-# resolver backtracking storms in CI. backend.py imports it lazily and gates
-# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
+# diffusers and transformers are pinned together on purpose. transformers v5
+# restructured CLIPTextModel and dropped the `.text_model` attribute, which
+# breaks single-file Stable Diffusion loading on every released diffusers
+# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
+# main via git froze whichever broken pair existed at image-build time. Pin the
+# last known-good released pair so builds are reproducible and can't drift into
+# the broken window. See https://github.com/mudler/LocalAI/issues/9979
+#
+# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
+# with this pin and previously forced pip into multi-hour resolver backtracking
+# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
+# the import succeeding, so dropping it here is safe.
--- a/backend/python/diffusers/requirements-cublas13.txt
+++ b/backend/python/diffusers/requirements-cublas13.txt
@@ -1,7 +1,7 @@
 --extra-index-url https://download.pytorch.org/whl/cu130
-git+https://github.com/huggingface/diffusers
+diffusers==0.38.0
 opencv-python
-transformers
+transformers==4.57.6
 torchvision
 accelerate
 git+https://github.com/xhinker/sd_embed
@@ -10,9 +10,15 @@ sentencepiece
 torch
 ftfy
 optimum-quanto
-# TODO: re-add compel once it supports transformers >= 5.
-# Tracking: https://github.com/damian0815/compel/pull/129
-#           https://github.com/damian0815/compel/issues/128
-# compel currently pins transformers~=4.25, which forced pip into multi-hour
-# resolver backtracking storms in CI. backend.py imports it lazily and gates
-# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
+# diffusers and transformers are pinned together on purpose. transformers v5
+# restructured CLIPTextModel and dropped the `.text_model` attribute, which
+# breaks single-file Stable Diffusion loading on every released diffusers
+# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
+# main via git froze whichever broken pair existed at image-build time. Pin the
+# last known-good released pair so builds are reproducible and can't drift into
+# the broken window. See https://github.com/mudler/LocalAI/issues/9979
+#
+# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
+# with this pin and previously forced pip into multi-hour resolver backtracking
+# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
+# the import succeeding, so dropping it here is safe.
--- a/backend/python/diffusers/requirements-hipblas.txt
+++ b/backend/python/diffusers/requirements-hipblas.txt
@@ -1,17 +1,23 @@
 --extra-index-url https://download.pytorch.org/whl/rocm7.0
 torch==2.10.0+rocm7.0
 torchvision==0.25.0+rocm7.0
-git+https://github.com/huggingface/diffusers
+diffusers==0.38.0
 opencv-python
-transformers
+transformers==4.57.6
 accelerate
 peft
 sentencepiece
 optimum-quanto
 ftfy
-# TODO: re-add compel once it supports transformers >= 5.
-# Tracking: https://github.com/damian0815/compel/pull/129
-#           https://github.com/damian0815/compel/issues/128
-# compel currently pins transformers~=4.25, which forced pip into multi-hour
-# resolver backtracking storms in CI. backend.py imports it lazily and gates
-# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
+# diffusers and transformers are pinned together on purpose. transformers v5
+# restructured CLIPTextModel and dropped the `.text_model` attribute, which
+# breaks single-file Stable Diffusion loading on every released diffusers
+# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
+# main via git froze whichever broken pair existed at image-build time. Pin the
+# last known-good released pair so builds are reproducible and can't drift into
+# the broken window. See https://github.com/mudler/LocalAI/issues/9979
+#
+# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
+# with this pin and previously forced pip into multi-hour resolver backtracking
+# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
+# the import succeeding, so dropping it here is safe.
--- a/backend/python/diffusers/requirements-intel.txt
+++ b/backend/python/diffusers/requirements-intel.txt
@@ -3,18 +3,24 @@ torch
 torchvision
 optimum[openvino]
 setuptools
-git+https://github.com/huggingface/diffusers
+diffusers==0.38.0
 opencv-python
-transformers
+transformers==4.57.6
 accelerate
 git+https://github.com/xhinker/sd_embed
 peft
 sentencepiece
 optimum-quanto
 ftfy
-# TODO: re-add compel once it supports transformers >= 5.
-# Tracking: https://github.com/damian0815/compel/pull/129
-#           https://github.com/damian0815/compel/issues/128
-# compel currently pins transformers~=4.25, which forced pip into multi-hour
-# resolver backtracking storms in CI. backend.py imports it lazily and gates
-# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
+# diffusers and transformers are pinned together on purpose. transformers v5
+# restructured CLIPTextModel and dropped the `.text_model` attribute, which
+# breaks single-file Stable Diffusion loading on every released diffusers
+# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
+# main via git froze whichever broken pair existed at image-build time. Pin the
+# last known-good released pair so builds are reproducible and can't drift into
+# the broken window. See https://github.com/mudler/LocalAI/issues/9979
+#
+# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
+# with this pin and previously forced pip into multi-hour resolver backtracking
+# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
+# the import succeeding, so dropping it here is safe.
--- a/backend/python/diffusers/requirements-l4t12.txt
+++ b/backend/python/diffusers/requirements-l4t12.txt
@@ -1,7 +1,7 @@
 --extra-index-url https://pypi.jetson-ai-lab.io/jp6/cu129/
 torch
-git+https://github.com/huggingface/diffusers
-transformers
+diffusers==0.38.0
+transformers==4.57.6
 accelerate
 peft
 optimum-quanto
@@ -9,9 +9,15 @@ numpy<2
 sentencepiece
 torchvision
 ftfy
-# TODO: re-add compel once it supports transformers >= 5.
-# Tracking: https://github.com/damian0815/compel/pull/129
-#           https://github.com/damian0815/compel/issues/128
-# compel currently pins transformers~=4.25, which forced pip into multi-hour
-# resolver backtracking storms in CI. backend.py imports it lazily and gates
-# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
+# diffusers and transformers are pinned together on purpose. transformers v5
+# restructured CLIPTextModel and dropped the `.text_model` attribute, which
+# breaks single-file Stable Diffusion loading on every released diffusers
+# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
+# main via git froze whichever broken pair existed at image-build time. Pin the
+# last known-good released pair so builds are reproducible and can't drift into
+# the broken window. See https://github.com/mudler/LocalAI/issues/9979
+#
+# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
+# with this pin and previously forced pip into multi-hour resolver backtracking
+# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
+# the import succeeding, so dropping it here is safe.
--- a/backend/python/diffusers/requirements-l4t13.txt
+++ b/backend/python/diffusers/requirements-l4t13.txt
@@ -1,7 +1,7 @@
 --extra-index-url https://download.pytorch.org/whl/cu130
 torch
-git+https://github.com/huggingface/diffusers
-transformers
+diffusers==0.38.0
+transformers==4.57.6
 accelerate
 peft
 optimum-quanto
@@ -10,9 +10,15 @@ sentencepiece
 torchvision
 ftfy
 chardet
-# TODO: re-add compel once it supports transformers >= 5.
-# Tracking: https://github.com/damian0815/compel/pull/129
-#           https://github.com/damian0815/compel/issues/128
-# compel currently pins transformers~=4.25, which forced pip into multi-hour
-# resolver backtracking storms in CI. backend.py imports it lazily and gates
-# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
+# diffusers and transformers are pinned together on purpose. transformers v5
+# restructured CLIPTextModel and dropped the `.text_model` attribute, which
+# breaks single-file Stable Diffusion loading on every released diffusers
+# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
+# main via git froze whichever broken pair existed at image-build time. Pin the
+# last known-good released pair so builds are reproducible and can't drift into
+# the broken window. See https://github.com/mudler/LocalAI/issues/9979
+#
+# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
+# with this pin and previously forced pip into multi-hour resolver backtracking
+# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
+# the import succeeding, so dropping it here is safe.
--- a/backend/python/diffusers/requirements-mps.txt
+++ b/backend/python/diffusers/requirements-mps.txt
@@ -1,16 +1,22 @@
 torch==2.7.1
 torchvision==0.22.1
-git+https://github.com/huggingface/diffusers
+diffusers==0.38.0
 opencv-python
-transformers
+transformers==4.57.6
 accelerate
 peft
 sentencepiece
 optimum-quanto
 ftfy
-# TODO: re-add compel once it supports transformers >= 5.
-# Tracking: https://github.com/damian0815/compel/pull/129
-#           https://github.com/damian0815/compel/issues/128
-# compel currently pins transformers~=4.25, which forced pip into multi-hour
-# resolver backtracking storms in CI. backend.py imports it lazily and gates
-# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
+# diffusers and transformers are pinned together on purpose. transformers v5
+# restructured CLIPTextModel and dropped the `.text_model` attribute, which
+# breaks single-file Stable Diffusion loading on every released diffusers
+# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
+# main via git froze whichever broken pair existed at image-build time. Pin the
+# last known-good released pair so builds are reproducible and can't drift into
+# the broken window. See https://github.com/mudler/LocalAI/issues/9979
+#
+# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
+# with this pin and previously forced pip into multi-hour resolver backtracking
+# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
+# the import succeeding, so dropping it here is safe.
--- a/backend/python/nemo/backend.py
+++ b/backend/python/nemo/backend.py
@@ -84,6 +84,135 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):

        return backend_pb2.Result(message="Model loaded successfully", success=True)

+    def _get_stride_seconds(self):
+        """Compute the seconds-per-frame stride for the loaded model.
+
+        stride = preprocessor_window_stride * encoder_subsampling_factor
+        """
+        try:
+            preprocessor = self.model.preprocessor
+            window_stride = preprocessor._cfg.get('window_stride', 0.01)
+            subsampling_factor = getattr(self.model.encoder, 'subsampling_factor', 8)
+            return window_stride * subsampling_factor
+        except (AttributeError, KeyError, TypeError) as err:
+            print(
+                f"Warning: could not compute stride from model config ({err}), "
+                f"falling back to 0.08s/frame",
+                file=sys.stderr,
+            )
+            return 0.08
+
+    def _build_segments_with_words(self, hypothesis, stride, timestamp_granularities=None):
+        """Build TranscriptSegment list from a NeMo Hypothesis with timestamps.
+
+        Supports two granularity modes:
+          - "word": one TranscriptSegment per word, each with a single TranscriptWord entry
+          - "segment" (default): merge consecutive words into sentence-level segments,
+            splitting at word-level time gaps that exceed a dynamic threshold.
+        """
+        if not hypothesis or not isinstance(hypothesis.timestamp, dict):
+            return []
+
+        word_offsets = hypothesis.timestamp.get('word', [])
+        if not word_offsets:
+            return []
+
+        granularities = list(timestamp_granularities) if timestamp_granularities else []
+        granularity = "word" if "word" in granularities else "segment"
+
+        # Build a flat list of (text, start_ns, end_ns) from NeMo word offsets
+        transcript_words = []
+        for wo in word_offsets:
+            word_text = wo.get('word', '')
+            if not word_text:
+                continue
+            start_offset = wo.get('start_offset', 0)
+            end_offset = wo.get('end_offset', start_offset)
+            start_ns = int(start_offset * stride * 1_000_000_000)
+            end_ns = int(end_offset * stride * 1_000_000_000)
+            transcript_words.append({
+                'text': word_text,
+                'start': start_ns,
+                'end': end_ns,
+            })
+
+        if not transcript_words:
+            return []
+
+        if granularity == "word":
+            # One segment per word
+            result = []
+            for idx, tw in enumerate(transcript_words):
+                word = backend_pb2.TranscriptWord(
+                    start=tw['start'], end=tw['end'], text=tw['text']
+                )
+                result.append(backend_pb2.TranscriptSegment(
+                    id=idx,
+                    start=tw['start'],
+                    end=tw['end'],
+                    text=tw['text'],
+                    words=[word],
+                ))
+            return result
+
+        # segment mode — merge at word-level time-gap boundaries
+        # Compute gap threshold: median inter-word gap * 3, clamped to [0.3, 2.0]s
+        gaps = []
+        for i in range(1, len(transcript_words)):
+            gap = (transcript_words[i]['start'] - transcript_words[i - 1]['end']) / 1_000_000_000
+            if gap > 0:
+                gaps.append(gap)
+        if gaps:
+            gaps.sort()
+            median_gap = gaps[len(gaps) // 2]
+            threshold_ns = int(max(0.3, min(median_gap * 3, 2.0)) * 1_000_000_000)
+        else:
+            threshold_ns = int(0.5 * 1_000_000_000)
+
+        result = []
+        buf_words = []  # list of TranscriptWord protobuf
+        buf_start = None
+        buf_end = 0
+        buf_text = []
+        prev_end = None
+
+        for tw in transcript_words:
+            # Detect word-level time gap
+            if prev_end is not None and (tw['start'] - prev_end) >= threshold_ns and buf_text:
+                seg_text = ' '.join(buf_text)
+                result.append(backend_pb2.TranscriptSegment(
+                    id=len(result),
+                    start=buf_start,
+                    end=buf_end,
+                    text=seg_text,
+                    words=list(buf_words),
+                ))
+                buf_words = []
+                buf_text = []
+                buf_start = None
+
+            if buf_start is None:
+                buf_start = tw['start']
+            buf_end = tw['end']
+            buf_text.append(tw['text'])
+            buf_words.append(backend_pb2.TranscriptWord(
+                start=tw['start'], end=tw['end'], text=tw['text']
+            ))
+            prev_end = tw['end']
+
+        # flush remaining
+        if buf_text and buf_start is not None:
+            seg_text = ' '.join(buf_text)
+            result.append(backend_pb2.TranscriptSegment(
+                id=len(result),
+                start=buf_start,
+                end=buf_end,
+                text=seg_text,
+                words=list(buf_words),
+            ))
+
+        return result
+
    def AudioTranscription(self, request, context):
        result_segments = []
        text = ""
@@ -93,26 +222,67 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                print(f"Error: Audio file not found: {audio_path}", file=sys.stderr)
                return backend_pb2.TranscriptResult(segments=[], text="")

-            # NEMO's transcribe method accepts a list of audio paths and returns a list of transcripts
-            results = self.model.transcribe([audio_path])
+            # Determine requested timestamp granularity
+            timestamp_granularities = list(request.timestamp_granularities) if request.timestamp_granularities else []
+            want_timestamps = bool(timestamp_granularities)

-            if not results or len(results) == 0:
-                return backend_pb2.TranscriptResult(segments=[], text="")
+            if want_timestamps:
+                # Request timestamps from NeMo.
+                # timestamps=True forces NeMo to return Hypothesis objects with
+                # the timestamp dict populated, so we omit return_hypotheses to
+                # let NeMo choose the correct return type.
+                results = self.model.transcribe([audio_path], timestamps=True)

-            # Get the transcript text from the first result.
-            # CTC models return List[str], TDT/RNNT models return List[Hypothesis]
-            # where the actual text lives in Hypothesis.text.
-            result = results[0]
-            if isinstance(result, str):
-                text = result
+                if results and len(results) > 0:
+                    hypotheses = results[0] if isinstance(results[0], list) else results
+                    if hypotheses and len(hypotheses) > 0:
+                        hypothesis = hypotheses[0]
+
+                        # Hypothesis object should have .timestamp populated
+                        if not hasattr(hypothesis, 'timestamp') or not isinstance(hypothesis.timestamp, dict):
+                            print(
+                                "Warning: timestamps were requested but NeMo did not return "
+                                "Hypothesis objects; falling back to untimestamped output",
+                                file=sys.stderr,
+                            )
+
+                        # Extract text
+                        if hasattr(hypothesis, 'text'):
+                            text = hypothesis.text or ""
+                        elif isinstance(hypothesis, str):
+                            text = hypothesis
+
+                        # Build segments with word-level timestamps
+                        stride = self._get_stride_seconds()
+                        result_segments = self._build_segments_with_words(
+                            hypothesis, stride, timestamp_granularities
+                        )
+
+                        # If no word offsets but we have text, fall back to single segment
+                        if not result_segments and text:
+                            result_segments.append(backend_pb2.TranscriptSegment(
+                                id=0, start=0, end=0, text=text
+                            ))
            else:
-                text = getattr(result, 'text', None) or ""
+                # Simple transcription without timestamps
+                # NEMO's transcribe method accepts a list of audio paths and returns a list of transcripts
+                results = self.model.transcribe([audio_path])

-            if text:
-                # Create a single segment with the full transcription
-                result_segments.append(backend_pb2.TranscriptSegment(
-                    id=0, start=0, end=0, text=text
-                ))
+                if results and len(results) > 0:
+                    # Get the transcript text from the first result.
+                    # CTC models return List[str], TDT/RNNT models return List[Hypothesis]
+                    # where the actual text lives in Hypothesis.text.
+                    result = results[0]
+                    if isinstance(result, str):
+                        text = result
+                    else:
+                        text = getattr(result, 'text', None) or ""
+
+                    if text:
+                        # Create a single segment with the full transcription
+                        result_segments.append(backend_pb2.TranscriptSegment(
+                            id=0, start=0, end=0, text=text
+                        ))

        except Exception as err:
            print(f"Error in AudioTranscription: {err}", file=sys.stderr)
--- a/backend/python/trl/backend.py
+++ b/backend/python/trl/backend.py
@@ -309,6 +309,10 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):

        dataset_split = request.dataset_split or "train"
        if os.path.exists(request.dataset_source):
+            _allowed_dir = os.path.realpath(os.path.abspath(os.environ.get("LOCALAI_DATASET_DIR", os.getcwd())))
+            _real_path = os.path.realpath(os.path.abspath(request.dataset_source))
+            if not (_real_path == _allowed_dir or _real_path.startswith(_allowed_dir + os.sep)):
+                raise ValueError("Dataset source path is outside the allowed directory")
            if request.dataset_source.endswith('.json') or request.dataset_source.endswith('.jsonl'):
                dataset = load_dataset("json", data_files=request.dataset_source, split=dataset_split)
            elif request.dataset_source.endswith('.csv'):
@@ -687,6 +691,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
    def ExportModel(self, request, context):
        export_format = request.export_format or "lora"
        output_path = request.output_path
+        _allowed_output_dir = os.path.realpath(os.path.abspath(os.environ.get("LOCALAI_OUTPUT_DIR", os.getcwd())))
+        _real_output_path = os.path.realpath(os.path.abspath(output_path))
+        if not (_real_output_path == _allowed_output_dir or _real_output_path.startswith(_allowed_output_dir + os.sep)):
+            raise ValueError("Output path is outside the allowed directory")
+        output_path = _real_output_path
        checkpoint_path = request.checkpoint_path

        # Extract HF token for gated model access
@@ -807,7 +816,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                env = os.environ.copy()
                env["NO_LOCAL_GGUF"] = "1"
                cmd = [sys.executable, convert_script, merge_dir, "--outtype", outtype, "--outfile", gguf_path]
-                conv_result = subprocess.run(cmd, capture_output=True, text=True, timeout=3600, env=env)
+                conv_result = subprocess.run(cmd, capture_output=True, text=True, timeout=3600, env=env, shell=False)  # nosemgrep: python.django.security.injection.command.subprocess-injection.subprocess-injection
                if conv_result.returncode != 0:
                    diag = f"stdout: {conv_result.stdout[-300:]}\nstderr: {conv_result.stderr[-500:]}"
                    return backend_pb2.Result(success=False,
--- a/backend/python/vllm/backend.py
+++ b/backend/python/vllm/backend.py
@@ -48,8 +48,10 @@ try:
 except ImportError:
    HAS_REASONING_PARSERS = False

+# vLLM >= 0.23 renamed GuidedDecodingParams -> StructuredOutputsParams and the
+# SamplingParams field guided_decoding -> structured_outputs.
 try:
-    from vllm.sampling_params import GuidedDecodingParams
+    from vllm.sampling_params import StructuredOutputsParams
    HAS_GUIDED_DECODING = True
 except ImportError:
    HAS_GUIDED_DECODING = False
@@ -536,13 +538,13 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                if value not in (None, 0, [], False):
                    setattr(sampling_params, param_field, value)

-        # Guided decoding: use Grammar field to pass JSON schema or BNF
+        # Structured-output decoding: use Grammar field to pass JSON schema or BNF
        if HAS_GUIDED_DECODING and request.Grammar:
            try:
                json.loads(request.Grammar)  # valid JSON = JSON schema
-                sampling_params.guided_decoding = GuidedDecodingParams(json=request.Grammar)
+                sampling_params.structured_outputs = StructuredOutputsParams(json=request.Grammar)
            except json.JSONDecodeError:
-                sampling_params.guided_decoding = GuidedDecodingParams(grammar=request.Grammar)
+                sampling_params.structured_outputs = StructuredOutputsParams(grammar=request.Grammar)

        # Extract image paths and process images
        prompt = request.Prompt
@@ -596,23 +598,124 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):

        # Stream the results
        generated_text = ""
+        generated_token_ids: list[int] = []
        last_output = None
+
+        # Tool-parsing strategy decision (made once, before the loop):
+        #
+        # When a tool parser is active, the model's raw tool-call markup
+        # (e.g. <tool_call>...) must not be streamed verbatim as delta.content
+        # — clients would see the unparsed syntax. Two paths:
+        #
+        # (A) native streaming via parser.extract_tool_calls_streaming. All
+        #     concrete tool parsers shipped with vLLM 0.23+ implement this
+        #     (Granite4, Qwen3Coder, DeepSeekV31, Jamba, Ernie45, Hermes,
+        #     llama3_json, mistral, …). The parser decides per-delta whether
+        #     to emit content or suppress tool-call markup, and emits a
+        #     structured DeltaMessage(tool_calls=[...]) when a call is ready.
+        # (B) buffer fallback — used only when the parser surprisingly lacks
+        #     the streaming method or it raises mid-stream. The post-loop
+        #     extract_tool_calls assembles the final chat_delta. Same correctness
+        #     guarantee as a non-streaming response, at the cost of a delayed
+        #     final chunk.
+        has_tool_parser = bool(self.tool_parser_cls and request.Tools)
+        tp_instance = None
+        tp_request = None
+        native_streaming = False
+        native_streaming_error = False
+        if has_tool_parser:
+            try:
+                tools_for_parser = json.loads(request.Tools)
+            except json.JSONDecodeError:
+                tools_for_parser = []
+            try:
+                tp_instance = self.tool_parser_cls(self.tokenizer, tools=tools_for_parser)
+            except TypeError:
+                tp_instance = self.tool_parser_cls(self.tokenizer)
+            # Build a minimal ChatCompletionRequest so the streaming method
+            # sees the tools list. We do not need any other request fields —
+            # parsers only read .tools (and sometimes .tool_choice, which we
+            # leave at default).
+            try:
+                from vllm.entrypoints.openai.chat_completion.protocol import (
+                    ChatCompletionRequest as _CCR,
+                )
+                tp_request = _CCR(
+                    model="local",
+                    messages=[{"role": "user", "content": ""}],
+                    tools=tools_for_parser or None,
+                )
+            except Exception as e:
+                print(f"Could not build ChatCompletionRequest for streaming parser: {e}",
+                      file=sys.stderr)
+                tp_request = None
+            native_streaming = (
+                tp_request is not None
+                and hasattr(tp_instance, "extract_tool_calls_streaming")
+            )
+
        try:
            async for request_output in outputs:
                iteration_text = request_output.outputs[0].text
                last_output = request_output

                if streaming:
-                    # Remove text already sent as vllm concatenates the text from previous yields
                    delta_iteration_text = iteration_text.removeprefix(generated_text)
-                    # Send the partial result
-                    yield backend_pb2.Reply(
-                        message=bytes(delta_iteration_text, encoding='utf-8'),
-                        chat_deltas=[backend_pb2.ChatDelta(content=delta_iteration_text)],
-                    )
+                    new_token_ids = list(request_output.outputs[0].token_ids)
+                    delta_token_ids = new_token_ids[len(generated_token_ids):]

-                # Keep track of text generated
+                    if not has_tool_parser:
+                        # Plain streaming — unchanged from pre-tool-parser path.
+                        yield backend_pb2.Reply(
+                            message=bytes(delta_iteration_text, encoding='utf-8'),
+                            chat_deltas=[backend_pb2.ChatDelta(content=delta_iteration_text)],
+                        )
+                    elif native_streaming and not native_streaming_error:
+                        # (A) Native vLLM extract_tool_calls_streaming.
+                        try:
+                            msg = tp_instance.extract_tool_calls_streaming(
+                                previous_text=generated_text,
+                                current_text=iteration_text,
+                                delta_text=delta_iteration_text,
+                                previous_token_ids=generated_token_ids,
+                                current_token_ids=new_token_ids,
+                                delta_token_ids=delta_token_ids,
+                                request=tp_request,
+                            )
+                        except Exception as e:
+                            print(f"Streaming tool parser error (falling back to "
+                                  f"buffer for the rest of the stream): {e}",
+                                  file=sys.stderr)
+                            native_streaming_error = True
+                            msg = None
+                        if msg is not None:
+                            tc_protos = []
+                            for tc in (msg.tool_calls or []):
+                                fn = tc.function or None
+                                tc_protos.append(backend_pb2.ToolCallDelta(
+                                    index=tc.index,
+                                    id=tc.id or "",
+                                    name=(fn.name if fn and fn.name else "") or "",
+                                    arguments=(fn.arguments if fn and fn.arguments else "") or "",
+                                ))
+                            cd_kwargs = {}
+                            if msg.content:
+                                cd_kwargs["content"] = msg.content
+                            if msg.reasoning:
+                                cd_kwargs["reasoning_content"] = msg.reasoning
+                            if tc_protos:
+                                cd_kwargs["tool_calls"] = tc_protos
+                            if cd_kwargs:
+                                yield backend_pb2.Reply(
+                                    message=bytes(msg.content or "", encoding='utf-8'),
+                                    chat_deltas=[backend_pb2.ChatDelta(**cd_kwargs)],
+                                )
+                    # (B) buffer fallback — emit nothing during the stream.
+                    # The post-loop extract_tool_calls block builds the final chunk.
+
+                # Keep track of text + token_ids generated
                generated_text = iteration_text
+                generated_token_ids = list(request_output.outputs[0].token_ids)
        finally:
            await outputs.aclose()

@@ -637,16 +740,19 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
            except Exception as e:
                print(f"Reasoning parser error: {e}", file=sys.stderr)

-        if self.tool_parser_cls and request.Tools:
+        # When (A) native streaming ran cleanly, per-delta yields above already
+        # delivered everything — do NOT extract again on the full text or we'd
+        # duplicate content/tool_calls into the final chunk.
+        if has_tool_parser and not (native_streaming and not native_streaming_error):
            try:
-                tools = json.loads(request.Tools)
-                # Some concrete parsers only accept the tokenizer; only the
-                # abstract base declares the tools kwarg. Try with tools first,
-                # fall back to tokenizer-only.
-                try:
-                    tp = self.tool_parser_cls(self.tokenizer, tools=tools)
-                except TypeError:
-                    tp = self.tool_parser_cls(self.tokenizer)
+                tp = tp_instance
+                if tp is None:
+                    # Defensive: tp_instance build failed earlier; reconstruct.
+                    tools = json.loads(request.Tools)
+                    try:
+                        tp = self.tool_parser_cls(self.tokenizer, tools=tools)
+                    except TypeError:
+                        tp = self.tool_parser_cls(self.tokenizer)
                info = tp.extract_tool_calls(content, request=None)
                if info.tools_called:
                    content = info.content or ""
@@ -659,6 +765,10 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                        ))
            except Exception as e:
                print(f"Tool parser error: {e}", file=sys.stderr)
+        elif native_streaming and not native_streaming_error:
+            # Per-delta path already emitted content + tool_calls; the final
+            # chat_delta should carry only metadata (token counts, logprobs).
+            content = ""

        # Extract token counts
        prompt_tokens = 0
@@ -698,7 +808,26 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        )

        if streaming:
-            # Final chunk with structured data
+            # Final chunk with structured data.
+            #
+            # If we used the buffer fallback (has_tool_parser=True AND native
+            # streaming did NOT run cleanly) and the parser found no tool call,
+            # flush the buffered content as ONE content delta — and clear the
+            # final chat_delta's content so the metadata chunk does not repeat
+            # what we just sent. This is the plain-text-with-tool-parser path.
+            buffered_fallback = (
+                has_tool_parser
+                and not (native_streaming and not native_streaming_error)
+            )
+            if buffered_fallback and not tool_calls_proto and content:
+                yield backend_pb2.Reply(
+                    message=bytes(content, encoding='utf-8'),
+                    chat_deltas=[backend_pb2.ChatDelta(content=content)],
+                )
+                chat_delta = backend_pb2.ChatDelta(
+                    reasoning_content=reasoning_content,
+                    tool_calls=tool_calls_proto,
+                )
            yield backend_pb2.Reply(
                message=b"",
                prompt_tokens=prompt_tokens,
--- a/backend/python/vllm/test.py
+++ b/backend/python/vllm/test.py
@@ -278,4 +278,261 @@ class TestBackendServicer(unittest.TestCase):
            print(err)
            self.fail("Embedding service failed")
        finally:
-            self.tearDown()
+            self.tearDown()
+
+
+class TestStreamingToolParser(unittest.TestCase):
+    """
+    Server-less unit tests for the streaming + tool-parser machinery in
+    BackendServicer._predict. These tests instantiate BackendServicer
+    directly and mock the vLLM engine + tool parser, so they do not need
+    a GPU, a model, or a running gRPC server. Kept in a separate class to
+    avoid the parent setUp() which spawns a subprocess.
+
+    Covers #582 (follow-up to #10346):
+      1. Markup-leak prevention with a non-streaming parser (buffer fallback)
+      2. No content duplication on the plain-text path with the buffer fallback
+      3. Native streaming progressive plain-text emission
+      4. Native streaming structured tool_call, no markup leak
+      5. Parser exception → graceful fallback to buffer, still no markup
+      6. No-tool-parser regression: unchanged per-delta content stream
+    """
+
+    @staticmethod
+    def _make_generate(chunks):
+        """Build a fake vLLM engine.generate that yields cumulative chunks."""
+        from types import SimpleNamespace
+        async def gen(*a, **k):
+            for i, t in enumerate(chunks):
+                yield SimpleNamespace(
+                    outputs=[SimpleNamespace(
+                        text=t,
+                        token_ids=list(range(i + 1)),
+                        logprobs=None,
+                    )],
+                    prompt_token_ids=[0],
+                )
+        return lambda *a, **k: gen()
+
+    @staticmethod
+    def _collect(servicer, req):
+        import asyncio
+        async def run():
+            return [r async for r in servicer._predict(req, None, streaming=True)]
+        return asyncio.run(run())
+
+    def _new_servicer(self):
+        import sys, os
+        sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+        from backend import BackendServicer
+        s = BackendServicer()
+        s.reasoning_parser_cls = None
+        s.tool_parser_cls = None
+        s.tokenizer = None
+        return s
+
+    # ── Case 1+2: parser without streaming method → buffer fallback ──
+    def test_buffer_path_no_markup_no_duplication(self):
+        from types import SimpleNamespace
+
+        def parser_cls(called, content_text, calls):
+            class _P:
+                def __init__(self, tokenizer, tools=None):
+                    pass
+                # NOTE: NO extract_tool_calls_streaming → takes the buffer path
+                def extract_tool_calls(self, c, request=None):
+                    return SimpleNamespace(
+                        tools_called=called, content=content_text, tool_calls=calls,
+                    )
+            return _P
+
+        tools_json = '[{"type":"function","function":{"name":"calc","parameters":{}}}]'
+
+        # Tool-call case: no raw markup in any delta.content
+        s = self._new_servicer()
+        s.llm = SimpleNamespace(generate=self._make_generate([
+            '<tool_call>\n{"name": "calc"',
+            '<tool_call>\n{"name": "calc", "arguments": {"x": 1}}\n</tool_call>',
+        ]))
+        call = SimpleNamespace(id="call_1",
+                               function=SimpleNamespace(name="calc", arguments='{"x": 1}'))
+        s.tool_parser_cls = parser_cls(True, "", [call])
+        req = backend_pb2.PredictOptions(Prompt="x", Tools=tools_json)
+        replies = self._collect(s, req)
+        contents = [cd.content for r in replies for cd in r.chat_deltas if cd.content]
+        self.assertFalse(
+            any("<tool_call" in c for c in contents),
+            f"markup leaked: {contents!r}",
+        )
+        names = [tc.name for r in replies for cd in r.chat_deltas for tc in cd.tool_calls]
+        self.assertIn("calc", names, "tool_call missing from final chunk")
+
+        # Plain-text-with-tools case: full content delivered exactly once
+        s2 = self._new_servicer()
+        s2.llm = SimpleNamespace(generate=self._make_generate([
+            "The capital ",
+            "The capital of France is Paris.",
+        ]))
+        s2.tool_parser_cls = parser_cls(False, "", [])
+        req2 = backend_pb2.PredictOptions(Prompt="x", Tools=tools_json)
+        joined = "".join(
+            cd.content for r in self._collect(s2, req2)
+            for cd in r.chat_deltas if cd.content
+        )
+        self.assertEqual(
+            joined.count("The capital of France is Paris."), 1,
+            f"buffered content duplicated: {joined!r}",
+        )
+
+    # ── Case 3: native streaming, progressive plain text ──
+    def test_native_streaming_progressive_plain_text(self):
+        from types import SimpleNamespace
+
+        class _DeltaMsg:
+            def __init__(self, content=None, reasoning=None, tool_calls=None):
+                self.content = content
+                self.reasoning = reasoning
+                self.tool_calls = tool_calls or []
+
+        class StreamingParser:
+            def __init__(self, tokenizer, tools=None):
+                pass
+            def extract_tool_calls(self, c, request=None):
+                # Should NOT be called when native streaming runs successfully.
+                raise AssertionError("extract_tool_calls invoked on native-streaming path")
+            def extract_tool_calls_streaming(
+                self, previous_text, current_text, delta_text,
+                previous_token_ids, current_token_ids, delta_token_ids, request,
+            ):
+                if not delta_text:
+                    return None
+                return _DeltaMsg(content=delta_text)
+
+        s = self._new_servicer()
+        s.llm = SimpleNamespace(generate=self._make_generate([
+            "Paris ",
+            "Paris is ",
+            "Paris is the capital of France.",
+        ]))
+        s.tool_parser_cls = StreamingParser
+        req = backend_pb2.PredictOptions(
+            Prompt="x",
+            Tools='[{"type":"function","function":{"name":"calc","parameters":{}}}]',
+        )
+        replies = self._collect(s, req)
+
+        intermediate_content = [
+            cd.content for r in replies[:-1] for cd in r.chat_deltas if cd.content
+        ]
+        self.assertTrue(
+            len(intermediate_content) > 0,
+            "Plain-text response not streamed progressively (native streaming inactive?)",
+        )
+        assembled = "".join(
+            cd.content for r in replies for cd in r.chat_deltas if cd.content
+        )
+        self.assertEqual(
+            assembled, "Paris is the capital of France.",
+            f"Assembled content wrong: {assembled!r}",
+        )
+
+    # ── Case 4: native streaming, structured tool_call, no markup ──
+    def test_native_streaming_tool_call_no_markup_leak(self):
+        from types import SimpleNamespace
+
+        class _DeltaMsg:
+            def __init__(self, content=None, reasoning=None, tool_calls=None):
+                self.content = content
+                self.reasoning = reasoning
+                self.tool_calls = tool_calls or []
+
+        class _ToolCallStreamer:
+            def __init__(self, tokenizer, tools=None):
+                self._emitted = False
+            def extract_tool_calls(self, c, request=None):
+                raise AssertionError("extract_tool_calls invoked on native-streaming path")
+            def extract_tool_calls_streaming(
+                self, previous_text, current_text, delta_text,
+                previous_token_ids, current_token_ids, delta_token_ids, request,
+            ):
+                if "</tool_call>" in current_text and not self._emitted:
+                    self._emitted = True
+                    fn = SimpleNamespace(name="calc", arguments='{"x": 1}')
+                    tc = SimpleNamespace(id="call_1", type="function", index=0, function=fn)
+                    return _DeltaMsg(tool_calls=[tc])
+                return None
+
+        s = self._new_servicer()
+        s.llm = SimpleNamespace(generate=self._make_generate([
+            '<tool_call>\n',
+            '<tool_call>\n{"name": "calc"',
+            '<tool_call>\n{"name": "calc", "arguments": {"x": 1}}\n</tool_call>',
+        ]))
+        s.tool_parser_cls = _ToolCallStreamer
+        req = backend_pb2.PredictOptions(
+            Prompt="x",
+            Tools='[{"type":"function","function":{"name":"calc","parameters":{}}}]',
+        )
+        replies = self._collect(s, req)
+
+        contents = [cd.content for r in replies for cd in r.chat_deltas if cd.content]
+        self.assertFalse(
+            any("<tool_call" in c or "</tool_call>" in c for c in contents),
+            f"markup leaked as content: {contents!r}",
+        )
+        names = [tc.name for r in replies for cd in r.chat_deltas for tc in cd.tool_calls if tc.name]
+        args  = [tc.arguments for r in replies for cd in r.chat_deltas for tc in cd.tool_calls if tc.arguments]
+        self.assertIn("calc", names, f"tool_call name missing; got {names!r}")
+        self.assertIn('{"x": 1}', args, f"tool_call args missing; got {args!r}")
+
+    # ── Case 5: parser exception → fallback to buffer, no leak ──
+    def test_native_streaming_parser_exception_falls_back_to_buffer(self):
+        from types import SimpleNamespace
+        call = SimpleNamespace(id="call_1",
+                               function=SimpleNamespace(name="calc", arguments='{"x": 1}'))
+
+        class _BrokenStreamer:
+            def __init__(self, tokenizer, tools=None):
+                pass
+            def extract_tool_calls(self, c, request=None):
+                return SimpleNamespace(tools_called=True, content="", tool_calls=[call])
+            def extract_tool_calls_streaming(self, *a, **kw):
+                raise RuntimeError("simulated parser bug")
+
+        s = self._new_servicer()
+        s.llm = SimpleNamespace(generate=self._make_generate([
+            '<tool_call>\n{"name": "calc"',
+            '<tool_call>\n{"name": "calc", "arguments": {"x": 1}}\n</tool_call>',
+        ]))
+        s.tool_parser_cls = _BrokenStreamer
+        req = backend_pb2.PredictOptions(
+            Prompt="x",
+            Tools='[{"type":"function","function":{"name":"calc","parameters":{}}}]',
+        )
+        replies = self._collect(s, req)
+
+        contents = [cd.content for r in replies for cd in r.chat_deltas if cd.content]
+        self.assertFalse(
+            any("<tool_call" in c for c in contents),
+            f"markup leaked after parser exception: {contents!r}",
+        )
+        names = [tc.name for r in replies for cd in r.chat_deltas for tc in cd.tool_calls]
+        self.assertIn("calc", names, "tool_call missing from final chunk after fallback")
+
+    # ── Case 6: no tool parser → unchanged per-delta content stream ──
+    def test_no_tool_parser_unchanged_per_delta_stream(self):
+        from types import SimpleNamespace
+        s = self._new_servicer()  # tool_parser_cls already None
+        s.llm = SimpleNamespace(generate=self._make_generate([
+            "Hello ", "Hello world", "Hello world!",
+        ]))
+        req = backend_pb2.PredictOptions(Prompt="x", Tools="")
+        replies = self._collect(s, req)
+
+        intermediate = [
+            cd.content for r in replies[:-1] for cd in r.chat_deltas if cd.content
+        ]
+        self.assertEqual(
+            intermediate, ["Hello ", "world", "!"],
+            f"plain streaming changed; got {intermediate!r}",
+        )
--- a/core/application/application.go
+++ b/core/application/application.go
@@ -341,11 +341,9 @@ func (a *Application) ResolvePIIPolicy(cfg *config.ModelConfig) (enabled bool, d
 	}
 	appCfg := a.ApplicationConfig()

-	if cfg.PII.Enabled != nil {
-		enabled = *cfg.PII.Enabled
-	} else {
-		enabled = cfg.PIIIsEnabled() // backend default (cloud-proxy)
-	}
+	// PIIIsEnabled already encodes "explicit pii.enabled wins, else backend
+	// default (cloud-proxy)" — the single source of that rule.
+	enabled = cfg.PIIIsEnabled()
 	if !enabled {
 		return false, nil
 	}
@@ -354,7 +352,7 @@ func (a *Application) ResolvePIIPolicy(cfg *config.ModelConfig) (enabled bool, d
 	if len(detectors) == 0 {
 		detectors = append([]string(nil), appCfg.PIIDefaultDetectors...)
 	}
-	return enabled, detectors
+	return true, detectors // enabled is necessarily true past the !enabled guard
 }

 // PIIPolicyResolver adapts ResolvePIIPolicy to pii.PolicyResolver for
--- a/core/application/config_file_watcher.go
+++ b/core/application/config_file_watcher.go
@@ -215,6 +215,7 @@ func readRuntimeSettingsJson(startupAppConfig config.ApplicationConfig) fileHand
 		envBackendGalleries := slices.Equal(appConfig.BackendGalleries, startupAppConfig.BackendGalleries)
 		envAutoloadGalleries := appConfig.AutoloadGalleries == startupAppConfig.AutoloadGalleries
 		envAutoloadBackendGalleries := appConfig.AutoloadBackendGalleries == startupAppConfig.AutoloadBackendGalleries
+		envPIIDefaultDetectors := slices.Equal(appConfig.PIIDefaultDetectors, startupAppConfig.PIIDefaultDetectors)
 		envAgentJobRetentionDays := appConfig.AgentJobRetentionDays == startupAppConfig.AgentJobRetentionDays
 		envForceEvictionWhenBusy := appConfig.ForceEvictionWhenBusy == startupAppConfig.ForceEvictionWhenBusy
 		envLRUEvictionMaxRetries := appConfig.LRUEvictionMaxRetries == startupAppConfig.LRUEvictionMaxRetries
@@ -335,6 +336,15 @@ func readRuntimeSettingsJson(startupAppConfig config.ApplicationConfig) fileHand
 			if settings.AutoloadBackendGalleries != nil && !envAutoloadBackendGalleries {
 				appConfig.AutoloadBackendGalleries = *settings.AutoloadBackendGalleries
 			}
+			if settings.PIIDefaultDetectors != nil && !envPIIDefaultDetectors {
+				// Request-side default redaction reads this live via
+				// ResolvePIIPolicy, so a file edit takes effect on the next chat
+				// request. The MITM listener resolves its per-host detector map
+				// once at start, so a raw file edit reaches cloud-proxy traffic
+				// only after a restart or a POST /api/settings (which rebuilds
+				// the listener) — the admin UI uses the latter.
+				appConfig.PIIDefaultDetectors = append([]string(nil), (*settings.PIIDefaultDetectors)...)
+			}
 			if settings.AutoUpgradeBackends != nil {
 				appConfig.AutoUpgradeBackends = *settings.AutoUpgradeBackends
 			}
--- a/core/application/distributed.go
+++ b/core/application/distributed.go
@@ -357,6 +357,15 @@ func initDistributed(cfg *config.ApplicationConfig, authDB *gorm.DB, configLoade
 		Pressure:         pressure,
 	})

+	// Wire staging-progress broadcasting so file-staging shows up on every
+	// replica, not just the one performing the transfer. Without this, a
+	// /api/operations poll that round-robins onto a peer sees no staging row and
+	// the progress flickers. The origin publishes; peers mirror via the wildcard.
+	router.StagingTracker().SetPublisher(natsClient)
+	if _, err := router.StagingTracker().SubscribeBroadcasts(natsClient); err != nil {
+		xlog.Warn("Failed to subscribe to staging progress broadcasts", "error", err)
+	}
+
 	// Create ReplicaReconciler for auto-scaling model replicas. Adapter +
 	// RegistrationToken feed the state-reconciliation passes: pending op
 	// drain uses the adapter, and model health probes use the token to auth
--- a/core/application/runtime_settings_branding_test.go
+++ b/core/application/runtime_settings_branding_test.go
@@ -109,6 +109,52 @@ var _ = Describe("loadRuntimeSettingsFromFile", func() {
 		})
 	})

+	// Instance-wide default PII detectors. The file is the only source (no
+	// env var), and the loader runs immediately before startMITMIfConfigured,
+	// so a regression here means the cloud-proxy MITM listener resolves an
+	// empty detector set at boot and forwards intercepted traffic unredacted —
+	// even though pii_default_detectors is on disk and the MITM model has PII
+	// enabled. It also breaks request-side default redaction the same way.
+	Describe("PII default detectors", func() {
+		It("loads pii_default_detectors from the file", func() {
+			cfg := &config.ApplicationConfig{DynamicConfigsDir: seedSettings(`{"pii_default_detectors": ["privacy-filter-nemotron", "secret-filter"]}`)}
+			loadRuntimeSettingsFromFile(cfg)
+			Expect(cfg.PIIDefaultDetectors).To(Equal([]string{"privacy-filter-nemotron", "secret-filter"}))
+		})
+
+		It("does not override an env/CLI-set value (LOCALAI_PII_DEFAULT_DETECTORS)", func() {
+			cfg := &config.ApplicationConfig{
+				DynamicConfigsDir:   seedSettings(`{"pii_default_detectors": ["from-file"]}`),
+				PIIDefaultDetectors: []string{"from-env"}, // simulate WithPIIDefaultDetectors(env)
+			}
+			loadRuntimeSettingsFromFile(cfg)
+			Expect(cfg.PIIDefaultDetectors).To(Equal([]string{"from-env"}), "env var must win over the persisted file value")
+		})
+	})
+
+	// The live file watcher applies pii_default_detectors on a runtime change
+	// the same way it handles galleries/threads/etc.: env-set values (current
+	// == startup snapshot) are left alone, otherwise the file value is applied
+	// to the live config so request-side default redaction picks it up without
+	// a restart.
+	Describe("file watcher: pii_default_detectors", func() {
+		It("applies a changed file value to the live config", func() {
+			startup := config.ApplicationConfig{} // no env baseline
+			live := &config.ApplicationConfig{PIIDefaultDetectors: []string{"old"}}
+			handler := readRuntimeSettingsJson(startup)
+			Expect(handler([]byte(`{"pii_default_detectors":["new-a","new-b"]}`), live)).To(Succeed())
+			Expect(live.PIIDefaultDetectors).To(Equal([]string{"new-a", "new-b"}))
+		})
+
+		It("leaves an env-controlled value untouched", func() {
+			startup := config.ApplicationConfig{PIIDefaultDetectors: []string{"from-env"}}
+			live := &config.ApplicationConfig{PIIDefaultDetectors: []string{"from-env"}}
+			handler := readRuntimeSettingsJson(startup)
+			Expect(handler([]byte(`{"pii_default_detectors":["from-file"]}`), live)).To(Succeed())
+			Expect(live.PIIDefaultDetectors).To(Equal([]string{"from-env"}), "env-controlled detectors must not be overwritten by the file")
+		})
+	})
+
 	// The Agent Pool block has a mix of zero and non-zero defaults
 	// (Enabled=true, EmbeddingModel="granite-...", MaxChunkingSize=400,
 	// VectorEngine="chromem", AgentHubURL="https://agenthub.localai.io").
--- a/core/application/startup.go
+++ b/core/application/startup.go
@@ -25,6 +25,7 @@ import (
 	"github.com/mudler/LocalAI/core/services/storage"
 	coreStartup "github.com/mudler/LocalAI/core/startup"
 	"github.com/mudler/LocalAI/internal"
+	"github.com/mudler/LocalAI/pkg/downloader"
 	"github.com/mudler/LocalAI/pkg/signals"
 	"github.com/mudler/LocalAI/pkg/vram"

@@ -71,6 +72,16 @@ func New(opts ...config.AppOption) (*Application, error) {
 	if err != nil {
 		return nil, fmt.Errorf("unable to create ModelPath: %q", err)
 	}
+
+	// Reap *.partial downloads abandoned by a previous run (killed mid-transfer
+	// by an OOM/restart, or stalled before cleanup could run). The 24h window
+	// is well beyond any legitimate in-flight download, so this never trims an
+	// active transfer; it just stops dead partials accumulating on the volume.
+	if removed, cErr := downloader.CleanupStalePartialFiles(options.SystemState.Model.ModelsPath, 24*time.Hour); cErr != nil {
+		xlog.Warn("Failed to reap stale partial downloads", "error", cErr)
+	} else if removed > 0 {
+		xlog.Info("Reaped stale partial downloads", "count", removed)
+	}
 	if options.GeneratedContentDir != "" {
 		err := os.MkdirAll(options.GeneratedContentDir, 0o750)
 		if err != nil {
@@ -633,6 +644,12 @@ func loadRuntimeSettingsFromFile(options *config.ApplicationConfig) {
 			options.ForceEvictionWhenBusy = *settings.ForceEvictionWhenBusy
 		}
 	}
+	if settings.SizeAwareEviction != nil {
+		// Only apply if current value is default (false), suggesting it wasn't set from env var
+		if !options.SizeAwareEviction {
+			options.SizeAwareEviction = *settings.SizeAwareEviction
+		}
+	}
 	if settings.LRUEvictionMaxRetries != nil {
 		// Only apply if current value is default (30), suggesting it wasn't set from env var
 		if options.LRUEvictionMaxRetries == 0 {
@@ -733,6 +750,20 @@ func loadRuntimeSettingsFromFile(options *config.ApplicationConfig) {
 		options.MITMListen = *settings.MITMListen
 	}

+	// Instance-wide default PII detectors. LOCALAI_PII_DEFAULT_DETECTORS (via
+	// WithPIIDefaultDetectors) wins when set; otherwise the file is the source
+	// — apply it only when the env/CLI left the value empty, mirroring the
+	// "env > file" precedence used for the other fields. This must land before
+	// startMITMIfConfigured (called right after this loader): the cloud-proxy
+	// listener resolves each intercept host's detectors once at start via
+	// ResolvePIIPolicy, and a MITM model that names no detectors of its own
+	// falls back to these defaults. Without it the listener (and request-side
+	// default redaction) starts with an empty detector set and forwards
+	// traffic unredacted even though pii_default_detectors is on disk.
+	if settings.PIIDefaultDetectors != nil && len(options.PIIDefaultDetectors) == 0 {
+		options.PIIDefaultDetectors = append([]string(nil), (*settings.PIIDefaultDetectors)...)
+	}
+
 	// Backend upgrade flags
 	if settings.AutoUpgradeBackends != nil {
 		if !options.AutoUpgradeBackends {
@@ -836,6 +867,7 @@ func initializeWatchdog(application *Application, options *config.ApplicationCon
 			model.WithLRULimit(lruLimit),
 			model.WithMemoryReclaimer(options.MemoryReclaimerEnabled, options.MemoryReclaimerThreshold),
 			model.WithForceEvictionWhenBusy(options.ForceEvictionWhenBusy),
+			model.WithSizeAwareEviction(options.SizeAwareEviction),
 		)
 		application.ModelLoader().SetWatchDog(wd)

--- a/core/application/watchdog.go
+++ b/core/application/watchdog.go
@@ -90,6 +90,7 @@ func (a *Application) startWatchdog() error {
 			model.WithLRULimit(lruLimit),
 			model.WithMemoryReclaimer(appConfig.MemoryReclaimerEnabled, appConfig.MemoryReclaimerThreshold),
 			model.WithForceEvictionWhenBusy(appConfig.ForceEvictionWhenBusy),
+			model.WithSizeAwareEviction(appConfig.SizeAwareEviction),
 		)

 		// Create new stop channel BEFORE setting up any goroutines
--- a/core/backend/options.go
+++ b/core/backend/options.go
@@ -1,6 +1,7 @@
 package backend

 import (
+	"context"
 	"encoding/json"
 	"fmt"
 	"math/rand/v2"
@@ -12,7 +13,9 @@ import (
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/trace"
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	"github.com/mudler/LocalAI/pkg/downloader"
 	"github.com/mudler/LocalAI/pkg/model"
+	"github.com/mudler/LocalAI/pkg/vram"
 	"github.com/mudler/xlog"
 )

@@ -33,6 +36,67 @@ func recordModelLoadFailure(appConfig *config.ApplicationConfig, modelName, back
 	})
 }

+// estimateModelSizeBytes uses the unified EstimateModel entry point to compute
+// the total weight-file size for a model config.  It collects all weight files
+// from DownloadFiles, Model, and MMProj, and also extracts the HuggingFace
+// repo ID so EstimateModel can fall back to the HF API when local file
+// metadata is unavailable (e.g. not-yet-downloaded models).
+func estimateModelSizeBytes(c config.ModelConfig, modelsPath string) int64 {
+	seen := make(map[string]bool)
+	input := vram.ModelEstimateInput{}
+
+	addFile := func(uri string) {
+		if !vram.IsWeightFile(uri) {
+			return
+		}
+		resolved := uri
+		if !strings.Contains(uri, "://") {
+			resolved = "file://" + filepath.Join(modelsPath, uri)
+		}
+		if seen[resolved] {
+			return
+		}
+		seen[resolved] = true
+		input.Files = append(input.Files, vram.FileInput{URI: resolved})
+	}
+
+	// tryHFRepo resolves any huggingface:// or hf:// URI to an HTTPS URL and
+	// then extracts the org/model repo ID for use as the HF fallback path.
+	tryHFRepo := func(uri string) {
+		if input.HFRepo != "" {
+			return
+		}
+		resolved := downloader.URI(uri).ResolveURL()
+		if repoID, ok := vram.ExtractHFRepoID(resolved); ok {
+			input.HFRepo = repoID
+		}
+	}
+
+	for _, f := range c.DownloadFiles {
+		uriStr := string(f.URI)
+		addFile(uriStr)
+		tryHFRepo(uriStr)
+	}
+	addFile(c.Model)
+	tryHFRepo(c.Model)
+	if c.MMProj != "" {
+		addFile(c.MMProj)
+	}
+
+	if len(input.Files) == 0 && input.HFRepo == "" {
+		return 0
+	}
+
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer cancel()
+
+	result, err := vram.EstimateModelMultiContext(ctx, input, nil)
+	if err != nil || result.SizeBytes == 0 {
+		return 0
+	}
+	return int64(result.SizeBytes)
+}
+
 func ModelOptions(c config.ModelConfig, so *config.ApplicationConfig, opts ...model.Option) []model.Option {
 	defOpts := []model.Option{
 		model.WithBackendString(c.Backend),
@@ -70,6 +134,10 @@ func ModelOptions(c config.ModelConfig, so *config.ApplicationConfig, opts ...mo
 		defOpts = append(defOpts, model.WithExternalBackend(k, v))
 	}

+	if sizeBytes := estimateModelSizeBytes(c, so.SystemState.Model.ModelsPath); sizeBytes > 0 {
+		defOpts = append(defOpts, model.WithModelSizeBytes(sizeBytes))
+	}
+
 	return append(defOpts, opts...)
 }

@@ -90,10 +158,11 @@ func getSeed(c config.ModelConfig) int32 {
 // DefaultContextSize and DefaultBatchSize are the backend's fallbacks when a
 // model config leaves them unset. Exported so callers that must respect the
 // effective decode window — notably the router's prompt trimmer — resolve the
-// same numbers grpcModelOpts does instead of guessing.
+// same numbers grpcModelOpts does instead of guessing. The values are owned by
+// core/config (single source of truth shared with the config default tiers).
 const (
-	DefaultContextSize = 4096
-	DefaultBatchSize   = 512
+	DefaultContextSize = config.DefaultContextSize
+	DefaultBatchSize   = config.DefaultPhysicalBatch
 )

 // EffectiveContextSize is the context window the backend will run with: the
@@ -129,7 +198,7 @@ func grpcModelOpts(c config.ModelConfig, modelPath string) *pb.ModelOptions {
 	ctxSize := EffectiveContextSize(c)
 	b := EffectiveBatchSize(c)

-	flashAttention := "auto"
+	flashAttention := config.DefaultFlashAttention

 	if c.FlashAttention != nil {
 		flashAttention = *c.FlashAttention
@@ -175,7 +244,7 @@ func grpcModelOpts(c config.ModelConfig, modelPath string) *pb.ModelOptions {
 		mmlock = *c.MMlock
 	}

-	nGPULayers := 9999999
+	nGPULayers := config.DefaultNGPULayers
 	if c.NGPULayers != nil {
 		nGPULayers = *c.NGPULayers
 	}
--- a/core/backend/sound_classification.go
+++ b/core/backend/sound_classification.go
@@ -0,0 +1,88 @@
+package backend
+
+import (
+	"context"
+	"fmt"
+	"sort"
+
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/schema"
+
+	grpcPkg "github.com/mudler/LocalAI/pkg/grpc"
+	"github.com/mudler/LocalAI/pkg/grpc/proto"
+	"github.com/mudler/LocalAI/pkg/model"
+)
+
+// SoundDetectionRequest carries the knobs the HTTP layer collects for an
+// audio-tagging / sound-event-classification call. Audio is the path to the
+// uploaded clip on disk; TopK and Threshold are optional (0 = backend default).
+type SoundDetectionRequest struct {
+	Audio     string
+	TopK      int32
+	Threshold float32
+}
+
+func (r *SoundDetectionRequest) toProto() *proto.SoundDetectionRequest {
+	return &proto.SoundDetectionRequest{
+		Src:       r.Audio,
+		TopK:      r.TopK,
+		Threshold: r.Threshold,
+	}
+}
+
+func loadSoundDetectionModel(ml *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig) (grpcPkg.Backend, error) {
+	if modelConfig.Backend == "" {
+		return nil, fmt.Errorf("sound classification: model %q has no backend set; supported backends include ced", modelConfig.Name)
+	}
+	opts := ModelOptions(modelConfig, appConfig)
+	m, err := ml.Load(opts...)
+	if err != nil {
+		recordModelLoadFailure(appConfig, modelConfig.Name, modelConfig.Backend, err, nil)
+		return nil, err
+	}
+	if m == nil {
+		return nil, fmt.Errorf("could not load sound classification model")
+	}
+	return m, nil
+}
+
+// ModelSoundDetection runs the SoundDetection RPC against the configured
+// backend and returns a normalized schema.SoundClassificationResult.
+func ModelSoundDetection(ctx context.Context, req SoundDetectionRequest, ml *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig) (*schema.SoundClassificationResult, error) {
+	m, err := loadSoundDetectionModel(ml, modelConfig, appConfig)
+	if err != nil {
+		return nil, err
+	}
+
+	r, err := m.SoundDetection(ctx, req.toProto())
+	if err != nil {
+		return nil, err
+	}
+	return soundClassificationResultFromProto(modelConfig.Name, r), nil
+}
+
+// soundClassificationResultFromProto maps the backend detections to the
+// HTTP-facing schema, keeping the backend's score-descending order.
+func soundClassificationResultFromProto(modelName string, r *proto.SoundDetectionResponse) *schema.SoundClassificationResult {
+	out := &schema.SoundClassificationResult{
+		Model:      modelName,
+		Detections: []schema.SoundClassification{},
+	}
+	if r == nil {
+		return out
+	}
+	for _, d := range r.Detections {
+		if d == nil {
+			continue
+		}
+		out.Detections = append(out.Detections, schema.SoundClassification{
+			Index: int(d.Index),
+			Label: d.Label,
+			Score: d.Score,
+		})
+	}
+	sort.SliceStable(out.Detections, func(i, j int) bool {
+		return out.Detections[i].Score > out.Detections[j].Score
+	})
+	return out
+}
--- a/core/cli/run.go
+++ b/core/cli/run.go
@@ -93,6 +93,7 @@ type RunCMD struct {
 	EnableMemoryReclaimer              bool     `env:"LOCALAI_MEMORY_RECLAIMER,MEMORY_RECLAIMER,LOCALAI_GPU_RECLAIMER,GPU_RECLAIMER" default:"false" help:"Enable memory threshold monitoring to auto-evict backends when memory usage exceeds threshold (uses GPU VRAM if available, otherwise RAM)" group:"backends"`
 	MemoryReclaimerThreshold           float64  `env:"LOCALAI_MEMORY_RECLAIMER_THRESHOLD,MEMORY_RECLAIMER_THRESHOLD,LOCALAI_GPU_RECLAIMER_THRESHOLD,GPU_RECLAIMER_THRESHOLD" default:"0.95" help:"Memory usage threshold (0.0-1.0) that triggers backend eviction (default 0.95 = 95%%)" group:"backends"`
 	ForceEvictionWhenBusy              bool     `env:"LOCALAI_FORCE_EVICTION_WHEN_BUSY,FORCE_EVICTION_WHEN_BUSY" default:"false" help:"Force eviction even when models have active API calls (default: false for safety)" group:"backends"`
+	SizeAwareEviction                  bool     `env:"LOCALAI_SIZE_AWARE_EVICTION,SIZE_AWARE_EVICTION" default:"false" help:"Evict the largest loaded model first rather than the least-recently-used one, keeping small utility models resident and maximizing freed memory per eviction" group:"backends"`
 	LRUEvictionMaxRetries              int      `env:"LOCALAI_LRU_EVICTION_MAX_RETRIES,LRU_EVICTION_MAX_RETRIES" default:"30" help:"Maximum number of retries when waiting for busy models to become idle before eviction (default: 30)" group:"backends"`
 	LRUEvictionRetryInterval           string   `env:"LOCALAI_LRU_EVICTION_RETRY_INTERVAL,LRU_EVICTION_RETRY_INTERVAL" default:"1s" help:"Interval between retries when waiting for busy models to become idle (e.g., 1s, 2s) (default: 1s)" group:"backends"`
 	Federated                          bool     `env:"LOCALAI_FEDERATED,FEDERATED" help:"Enable federated instance" group:"federated"`
@@ -180,6 +181,8 @@ type RunCMD struct {
 	// Cloud-proxy MITM listener (off by default).
 	MITMListen string `env:"LOCALAI_MITM_LISTEN" help:"Address (host:port) for the cloudproxy MITM listener. Empty = disabled. Clients set HTTPS_PROXY=http://<this>:<port>. Intercept hosts are declared per-model via the model YAML mitm.hosts: block; create one from the Add Model UI." group:"middleware"`
 	MITMCADir  string `env:"LOCALAI_MITM_CA_DIR" type:"path" help:"Directory holding the MITM proxy CA cert + key. Defaults to <data-path>/mitm-ca." group:"middleware"`
+
+	PIIDefaultDetectors []string `env:"LOCALAI_PII_DEFAULT_DETECTORS" help:"Instance-wide default PII/secret detector model names applied to any PII-enabled model (chiefly cloud-proxy / MITM models) that names no pii.detectors of its own. Comma-separated, e.g. privacy-filter-nemotron,secret-filter. Takes precedence over the value persisted via the Middleware UI." group:"middleware"`
 }

 func (r *RunCMD) Run(ctx *cliContext.Context) error {
@@ -242,6 +245,7 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
 		config.WithAPIAddress(r.Address),
 		config.WithMITMListen(r.MITMListen),
 		config.WithMITMCADir(r.MITMCADir),
+		config.WithPIIDefaultDetectors(r.PIIDefaultDetectors),
 		config.WithAgentJobRetentionDays(r.AgentJobRetentionDays),
 		config.WithLlamaCPPTunnelCallback(func(tunnels []string) {
 			tunnelEnvVar := strings.Join(tunnels, ",")
@@ -564,6 +568,9 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
 	if r.ForceEvictionWhenBusy {
 		opts = append(opts, config.WithForceEvictionWhenBusy(true))
 	}
+	if r.SizeAwareEviction {
+		opts = append(opts, config.WithSizeAwareEviction(true))
+	}
 	if r.LRUEvictionMaxRetries > 0 {
 		opts = append(opts, config.WithLRUEvictionMaxRetries(r.LRUEvictionMaxRetries))
 	}
--- a/core/config/application_config.go
+++ b/core/config/application_config.go
@@ -119,6 +119,7 @@ type ApplicationConfig struct {

 	// Eviction settings
 	ForceEvictionWhenBusy    bool          // Force eviction even when models have active API calls (default: false for safety)
+	SizeAwareEviction        bool          // Evict largest models first rather than least-recently-used (default: false)
 	LRUEvictionMaxRetries    int           // Maximum number of retries when waiting for busy models to become idle (default: 30)
 	LRUEvictionRetryInterval time.Duration // Interval between retries when waiting for busy models (default: 1s)

@@ -488,6 +489,16 @@ func WithForceEvictionWhenBusy(enabled bool) AppOption {
 	}
 }

+// WithSizeAwareEviction enables size-aware eviction ordering.
+// When true, the watchdog evicts the largest loaded model first rather than the
+// least-recently-used one, keeping small utility models resident and maximizing
+// memory freed per eviction.
+func WithSizeAwareEviction(enabled bool) AppOption {
+	return func(o *ApplicationConfig) {
+		o.SizeAwareEviction = enabled
+	}
+}
+
 // WithLRUEvictionMaxRetries sets the maximum number of retries when waiting for busy models to become idle
 func WithLRUEvictionMaxRetries(maxRetries int) AppOption {
 	return func(o *ApplicationConfig) {
@@ -701,6 +712,18 @@ func WithMITMCADir(dir string) AppOption {
 	}
 }

+// WithPIIDefaultDetectors sets the instance-wide default PII/secret detector
+// model names applied to any PII-enabled model (chiefly cloud-proxy / MITM
+// models) that names no pii.detectors of its own. CLI/env:
+// LOCALAI_PII_DEFAULT_DETECTORS. Empty leaves the value to
+// runtime_settings.json / the Middleware UI; a non-empty value takes
+// precedence over the file (env > file).
+func WithPIIDefaultDetectors(detectors []string) AppOption {
+	return func(o *ApplicationConfig) {
+		o.PIIDefaultDetectors = detectors
+	}
+}
+
 func WithDynamicConfigDir(dynamicConfigsDir string) AppOption {
 	return func(o *ApplicationConfig) {
 		o.DynamicConfigsDir = dynamicConfigsDir
@@ -1028,6 +1051,7 @@ func (o *ApplicationConfig) ToRuntimeSettings() RuntimeSettings {
 	memoryReclaimerEnabled := o.MemoryReclaimerEnabled
 	memoryReclaimerThreshold := o.MemoryReclaimerThreshold
 	forceEvictionWhenBusy := o.ForceEvictionWhenBusy
+	sizeAwareEviction := o.SizeAwareEviction
 	lruEvictionMaxRetries := o.LRUEvictionMaxRetries
 	threads := o.Threads
 	contextSize := o.ContextSize
@@ -1120,6 +1144,7 @@ func (o *ApplicationConfig) ToRuntimeSettings() RuntimeSettings {
 		MemoryReclaimerEnabled:    &memoryReclaimerEnabled,
 		MemoryReclaimerThreshold:  &memoryReclaimerThreshold,
 		ForceEvictionWhenBusy:     &forceEvictionWhenBusy,
+		SizeAwareEviction:         &sizeAwareEviction,
 		LRUEvictionMaxRetries:     &lruEvictionMaxRetries,
 		LRUEvictionRetryInterval:  &lruEvictionRetryInterval,
 		Threads:                   &threads,
@@ -1244,6 +1269,10 @@ func (o *ApplicationConfig) ApplyRuntimeSettings(settings *RuntimeSettings) (req
 		o.ForceEvictionWhenBusy = *settings.ForceEvictionWhenBusy
 		// This setting doesn't require restart, can be updated dynamically
 	}
+	if settings.SizeAwareEviction != nil {
+		o.SizeAwareEviction = *settings.SizeAwareEviction
+		// This setting doesn't require restart, can be updated dynamically
+	}
 	if settings.LRUEvictionMaxRetries != nil {
 		o.LRUEvictionMaxRetries = *settings.LRUEvictionMaxRetries
 		// This setting doesn't require restart, can be updated dynamically
--- a/core/config/backend_capabilities.go
+++ b/core/config/backend_capabilities.go
@@ -8,27 +8,28 @@ import (
 // Usecase name constants — the canonical string values used in gallery entries,
 // model configs (known_usecases), and UsecaseInfoMap keys.
 const (
-	UsecaseChat               = "chat"
-	UsecaseCompletion         = "completion"
-	UsecaseEdit               = "edit"
-	UsecaseVision             = "vision"
-	UsecaseEmbeddings         = "embeddings"
-	UsecaseTokenize           = "tokenize"
-	UsecaseImage              = "image"
-	UsecaseVideo              = "video"
-	UsecaseTranscript         = "transcript"
-	UsecaseTTS                = "tts"
-	UsecaseSoundGeneration    = "sound_generation"
-	UsecaseRerank             = "rerank"
-	UsecaseDetection          = "detection"
-	UsecaseDepth              = "depth"
-	UsecaseVAD                = "vad"
-	UsecaseAudioTransform     = "audio_transform"
-	UsecaseDiarization        = "diarization"
-	UsecaseRealtimeAudio      = "realtime_audio"
-	UsecaseFaceRecognition    = "face_recognition"
-	UsecaseSpeakerRecognition = "speaker_recognition"
-	UsecaseTokenClassify      = "token_classify"
+	UsecaseChat                = "chat"
+	UsecaseCompletion          = "completion"
+	UsecaseEdit                = "edit"
+	UsecaseVision              = "vision"
+	UsecaseEmbeddings          = "embeddings"
+	UsecaseTokenize            = "tokenize"
+	UsecaseImage               = "image"
+	UsecaseVideo               = "video"
+	UsecaseTranscript          = "transcript"
+	UsecaseTTS                 = "tts"
+	UsecaseSoundGeneration     = "sound_generation"
+	UsecaseRerank              = "rerank"
+	UsecaseDetection           = "detection"
+	UsecaseDepth               = "depth"
+	UsecaseVAD                 = "vad"
+	UsecaseAudioTransform      = "audio_transform"
+	UsecaseDiarization         = "diarization"
+	UsecaseSoundClassification = "sound_classification"
+	UsecaseRealtimeAudio       = "realtime_audio"
+	UsecaseFaceRecognition     = "face_recognition"
+	UsecaseSpeakerRecognition  = "speaker_recognition"
+	UsecaseTokenClassify       = "token_classify"
 )

 // GRPCMethod identifies a Backend service RPC from backend.proto.
@@ -51,6 +52,7 @@ const (
 	MethodVAD                GRPCMethod = "VAD"
 	MethodAudioTransform     GRPCMethod = "AudioTransform"
 	MethodDiarize            GRPCMethod = "Diarize"
+	MethodSoundDetection     GRPCMethod = "SoundDetection"
 	MethodAudioToAudioStream GRPCMethod = "AudioToAudioStream"
 	MethodFaceVerify         GRPCMethod = "FaceVerify"
 	MethodFaceAnalyze        GRPCMethod = "FaceAnalyze"
@@ -165,6 +167,11 @@ var UsecaseInfoMap = map[string]UsecaseInfo{
 		GRPCMethod:  MethodDiarize,
 		Description: "Speaker diarization (who-spoke-when, per-speaker segments) via the Diarize RPC.",
 	},
+	UsecaseSoundClassification: {
+		Flag:        FLAG_SOUND_CLASSIFICATION,
+		GRPCMethod:  MethodSoundDetection,
+		Description: "Sound-event classification / audio tagging (scored AudioSet labels like baby cry, glass breaking, alarms) via the SoundDetection RPC.",
+	},
 	UsecaseRealtimeAudio: {
 		Flag:        FLAG_REALTIME_AUDIO,
 		GRPCMethod:  MethodAudioToAudioStream,
--- a/core/config/defaults.go
+++ b/core/config/defaults.go
@@ -0,0 +1,30 @@
+package config
+
+// Canonical default values.
+//
+// These are owned here so the two layers that need them share a single source
+// of truth: the config tiers (ApplyInference/Hardware/Serving/Generic — which
+// *decide* defaults) and core/backend/options.go (which *translates* a
+// ModelConfig to the backend wire format and supplies the same fallbacks
+// defensively). Previously these were duplicated as literals across both
+// packages and had drifted (e.g. n_gpu_layers 9999999 vs 99999999, two batch
+// constants of 512). core/backend imports core/config, so backend references
+// these; config never imports backend.
+const (
+	// DefaultContextSize is the fallback context window when none is configured
+	// or estimable from the model.
+	DefaultContextSize = 4096
+
+	// GGUFFallbackContextSize is the context window for a GGUF model whose
+	// metadata yields no usable estimate (see guessGGUFFromFile). Deliberately
+	// smaller than DefaultContextSize to stay conservative on memory there.
+	GGUFFallbackContextSize = 1024
+
+	// DefaultNGPULayers means "offload all layers"; the backend (fit_params)
+	// clamps to what actually fits in device memory.
+	DefaultNGPULayers = 99999999
+
+	// DefaultFlashAttention is the flash-attention mode default; "auto" lets the
+	// backend enable it when the model + backend support it.
+	DefaultFlashAttention = "auto"
+)
--- a/core/config/generic_defaults.go
+++ b/core/config/generic_defaults.go
@@ -0,0 +1,115 @@
+package config
+
+import "os"
+
+// ApplyGenericDefaults fills the generic fallback values applied after the
+// higher-priority tiers (ApplyInferenceDefaults for the model family,
+// ApplyHardwareDefaults for the device, ApplyServingDefaults for serving
+// policy): sampling parameters and a few runtime flags. Like the other tiers it
+// only fills values still left unset, so model-family / explicit config wins.
+func ApplyGenericDefaults(cfg *ModelConfig) {
+	if cfg == nil {
+		return
+	}
+
+	// https://github.com/ggerganov/llama.cpp/blob/75cd4c77292034ecec587ecb401366f57338f7c0/common/sampling.h#L22
+	defaultTopP := 0.95
+	defaultTopK := 40
+	defaultMinP := 0.0
+	defaultTemp := 0.9
+	// https://github.com/mudler/LocalAI/issues/2780
+	defaultMirostat := 0
+	defaultMirostatTAU := 5.0
+	defaultMirostatETA := 0.1
+	defaultTypicalP := 1.0
+	defaultTFZ := 1.0
+	defaultZero := 0
+
+	trueV := true
+	falseV := false
+
+	if cfg.Seed == nil {
+		//  random number generator seed
+		defaultSeed := RAND_SEED
+		cfg.Seed = &defaultSeed
+	}
+
+	// top_k=40 is llama.cpp's sampling default and is wrong for backends whose
+	// native default differs (issue #6632). Only inject it for the llama.cpp
+	// family and the empty/auto backend; leave TopK nil for known non-llama
+	// backends (e.g. mlx, whose intended default is top_k=0) so the wire value
+	// is 0 rather than a silently-changed 40.
+	if cfg.TopK == nil && UsesLlamaSamplerDefaults(cfg.Backend) {
+		cfg.TopK = &defaultTopK
+	}
+
+	if cfg.MinP == nil {
+		cfg.MinP = &defaultMinP
+	}
+
+	if cfg.TypicalP == nil {
+		cfg.TypicalP = &defaultTypicalP
+	}
+
+	if cfg.TFZ == nil {
+		cfg.TFZ = &defaultTFZ
+	}
+
+	if cfg.MMap == nil {
+		// MMap is enabled by default
+
+		// Only exception is for Intel GPUs
+		if os.Getenv("XPU") != "" {
+			cfg.MMap = &falseV
+		} else {
+			cfg.MMap = &trueV
+		}
+	}
+
+	if cfg.MMlock == nil {
+		// MMlock is disabled by default
+		cfg.MMlock = &falseV
+	}
+
+	if cfg.TopP == nil {
+		cfg.TopP = &defaultTopP
+	}
+	if cfg.Temperature == nil {
+		cfg.Temperature = &defaultTemp
+	}
+
+	if cfg.Maxtokens == nil {
+		cfg.Maxtokens = &defaultZero
+	}
+
+	if cfg.Mirostat == nil {
+		cfg.Mirostat = &defaultMirostat
+	}
+
+	if cfg.MirostatETA == nil {
+		cfg.MirostatETA = &defaultMirostatETA
+	}
+
+	if cfg.MirostatTAU == nil {
+		cfg.MirostatTAU = &defaultMirostatTAU
+	}
+
+	if cfg.LowVRAM == nil {
+		cfg.LowVRAM = &falseV
+	}
+
+	if cfg.Embeddings == nil {
+		cfg.Embeddings = &falseV
+	}
+
+	if cfg.Reranking == nil {
+		cfg.Reranking = &falseV
+	}
+
+	if cfg.PromptCacheAll == nil {
+		// Match upstream llama.cpp's default (common/common.h: cache_prompt = true)
+		// and let cache_idle_slots / kv_unified actually do useful work; users can
+		// opt out with an explicit `prompt_cache_all: false` in the model YAML.
+		cfg.PromptCacheAll = &trueV
+	}
+}
--- a/core/config/generic_defaults_test.go
+++ b/core/config/generic_defaults_test.go
@@ -0,0 +1,36 @@
+package config_test
+
+import (
+	. "github.com/mudler/LocalAI/core/config"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("ApplyGenericDefaults (generic fallback tier)", func() {
+	It("fills sampling + runtime fallbacks when unset", func() {
+		cfg := &ModelConfig{} // empty backend uses the llama sampler defaults
+		ApplyGenericDefaults(cfg)
+		Expect(cfg.TopP).ToNot(BeNil())
+		Expect(*cfg.TopP).To(Equal(0.95))
+		Expect(*cfg.TopK).To(Equal(40))
+		Expect(*cfg.Temperature).To(Equal(0.9))
+		Expect(*cfg.MMap).To(BeTrue())
+		Expect(*cfg.MMlock).To(BeFalse())
+		Expect(*cfg.PromptCacheAll).To(BeTrue())
+	})
+
+	It("never overrides explicit values", func() {
+		tk := 7
+		tp := 0.5
+		cfg := &ModelConfig{}
+		cfg.TopK = &tk
+		cfg.TopP = &tp
+		ApplyGenericDefaults(cfg)
+		Expect(*cfg.TopK).To(Equal(7))
+		Expect(*cfg.TopP).To(Equal(0.5))
+	})
+
+	It("no-ops on nil", func() {
+		Expect(func() { ApplyGenericDefaults(nil) }).ToNot(Panic())
+	})
+})
--- a/core/config/gguf.go
+++ b/core/config/gguf.go
@@ -14,11 +14,6 @@ import (
 	"github.com/gpustack/gguf-parser-go/util/ptr"
 )

-const (
-	defaultContextSize = 1024
-	defaultNGPULayers  = 99999999
-)
-
 // reservedNonChatModel reports whether the operator reserved this model for an
 // internal primitive — the router score classifier or the PII NER
 // token_classify tier. Such a model has no chat template and must not be
@@ -38,7 +33,7 @@ func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) {
 			cSize := int(ctxSize)
 			cfg.ContextSize = &cSize
 		} else {
-			defaultCtx = defaultContextSize
+			defaultCtx = GGUFFallbackContextSize
 			cfg.ContextSize = &defaultCtx
 		}
 	}
@@ -52,7 +47,7 @@ func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) {

 	if cfg.NGPULayers == nil {
 		// we assume we want to offload all layers
-		defaultHigh := defaultNGPULayers
+		defaultHigh := DefaultNGPULayers
 		cfg.NGPULayers = &defaultHigh
 	}

--- a/core/config/hardware_defaults.go
+++ b/core/config/hardware_defaults.go
@@ -0,0 +1,180 @@
+package config
+
+import (
+	"fmt"
+	"strconv"
+	"strings"
+
+	"github.com/mudler/LocalAI/pkg/xsysinfo"
+	"github.com/mudler/xlog"
+)
+
+// Hardware-driven model-config defaults.
+//
+// This sits alongside the other config overriders (ApplyInferenceDefaults for
+// model families, guessDefaultsFromFile for GGUF/NGPULayers): they all
+// heuristically fill ModelConfig values the user left unset. Hardware tuning is
+// the same domain — "adjust the config from the device that will run it" — so
+// it lives here rather than scattered into the backend or a separate package.
+//
+// The heuristics are parameterized on a GPU descriptor (not on direct
+// detection) so they apply in both deployment shapes: SetDefaults passes the
+// LocalGPU on a single host, and the distributed router passes the *selected
+// node's* reported GPU before loading there (the frontend that loaded the
+// config may have no GPU at all).
+
+// GPU describes the device that will run a model.
+type GPU struct {
+	// Vendor is "nvidia", "amd", … (matches xsysinfo vendor constants).
+	Vendor string
+	// ComputeCapability is the NVIDIA compute capability as "major.minor"
+	// (e.g. "12.1" for GB10 / DGX Spark). Empty for non-NVIDIA / unknown.
+	ComputeCapability string
+	// VRAM is total device memory in bytes (0 = unknown).
+	VRAM uint64
+}
+
+// Physical batch (n_batch / n_ubatch) defaults.
+const (
+	// DefaultPhysicalBatch is the conservative default when no hardware-specific
+	// tuning applies. core/backend.DefaultBatchSize references this (single source).
+	DefaultPhysicalBatch = 512
+	// BlackwellPhysicalBatch is the default on NVIDIA Blackwell consumer GPUs
+	// (sm_12x: sm_120 RTX 50-series, sm_121 GB10 / DGX Spark). A larger physical
+	// batch materially lifts MoE prefill there (per-expert GEMM tiles fill
+	// better); measured on a GB10 with Qwen3-30B-A3B to saturate around 2048.
+	BlackwellPhysicalBatch = 2048
+)
+
+// IsNVIDIABlackwell reports whether the GPU is in the NVIDIA Blackwell consumer
+// family (sm_12x). Datacenter Blackwell (B100/B200/GB200, sm_100 / cc 10.0)
+// reports a different compute capability and is intentionally not matched.
+func (g GPU) IsNVIDIABlackwell() bool {
+	maj, _ := parseComputeCapability(g.ComputeCapability)
+	return maj >= 12
+}
+
+// PhysicalBatch returns the canonical physical batch (n_batch/n_ubatch) for the
+// given hardware, used when the model config leaves batch unset.
+func PhysicalBatch(g GPU) int {
+	if g.IsNVIDIABlackwell() {
+		return BlackwellPhysicalBatch
+	}
+	return DefaultPhysicalBatch
+}
+
+// IsManagedPhysicalBatch reports whether n is a value PhysicalBatch assigns.
+// Callers that re-tune a value chosen by an upstream host (the distributed
+// router correcting the frontend's guess) use this to avoid clobbering an
+// explicit user batch such as 1024.
+func IsManagedPhysicalBatch(n int) bool {
+	return n == DefaultPhysicalBatch || n == BlackwellPhysicalBatch
+}
+
+// Parallel-slot (n_parallel) VRAM tiers. llama.cpp serializes requests at
+// n_parallel=1 (the backend default) and only auto-enables continuous batching
+// when n_parallel > 1 — so a single-slot default makes concurrent requests
+// queue. We default a slot count by GPU size so multi-user serving works out of
+// the box. With the backend's unified KV cache the slots SHARE the context
+// budget, so more slots add concurrency without multiplying KV memory.
+const (
+	parallelSlotsVRAMHigh = uint64(32) << 30 // >=32 GiB -> 8 slots
+	parallelSlotsVRAMMid  = uint64(8) << 30  // >=8 GiB  -> 4 slots
+	parallelSlotsVRAMLow  = uint64(4) << 30  // >=4 GiB  -> 2 slots
+)
+
+// DefaultParallelSlots returns the n_parallel default for the given GPU. Returns
+// 1 (no concurrency) when VRAM is unknown or too small, so we never change
+// behavior on CPU-only / tiny devices.
+func DefaultParallelSlots(g GPU) int {
+	switch {
+	case g.VRAM >= parallelSlotsVRAMHigh:
+		return 8
+	case g.VRAM >= parallelSlotsVRAMMid:
+		return 4
+	case g.VRAM >= parallelSlotsVRAMLow:
+		return 2
+	default:
+		return 1
+	}
+}
+
+// EnsureParallelOption appends a VRAM-scaled "parallel:N" backend option when the
+// model doesn't already set one (and the GPU warrants concurrency). Returns the
+// possibly-extended options. Shared by the single-host config path
+// (ApplyHardwareDefaults) and the distributed router (per selected node).
+func EnsureParallelOption(opts []string, gpu GPU) []string {
+	if slots := DefaultParallelSlots(gpu); slots > 1 && !hasParallelOption(opts) {
+		return append(opts, fmt.Sprintf("parallel:%d", slots))
+	}
+	return opts
+}
+
+// hasParallelOption reports whether the model already sets parallel/n_parallel
+// so we never override an explicit value (helper shared with serving_defaults.go).
+func hasParallelOption(opts []string) bool {
+	return backendOptionSet(opts, "parallel", "n_parallel")
+}
+
+// localGPU builds a GPU descriptor from local detection, used by SetDefaults on
+// a single host (the distributed router builds it from the selected node's
+// reported info instead). It is a package var so tests can inject a
+// deterministic device — detection does a live nvidia-smi call.
+var localGPU = func() GPU {
+	vendor, _ := xsysinfo.DetectGPUVendor()
+	vram, _ := xsysinfo.TotalAvailableVRAM()
+	return GPU{
+		Vendor:            vendor,
+		ComputeCapability: xsysinfo.NVIDIAComputeCapability(),
+		VRAM:              vram,
+	}
+}
+
+// ApplyHardwareDefaults fills ModelConfig values that depend on the target GPU
+// and were left unset by the user. Currently: a larger physical batch on
+// Blackwell. Explicit config always wins (we only touch zero values).
+func ApplyHardwareDefaults(cfg *ModelConfig, gpu GPU) {
+	if cfg == nil {
+		return
+	}
+	if cfg.Batch == 0 && gpu.IsNVIDIABlackwell() {
+		cfg.Batch = BlackwellPhysicalBatch
+		xlog.Debug("[hardware_defaults] Blackwell GPU: defaulting physical batch",
+			"batch", cfg.Batch, "compute_cap", gpu.ComputeCapability)
+	}
+
+	// Enable concurrent serving by default on a capable GPU: without this the
+	// llama.cpp backend runs n_parallel=1 and serializes multi-user requests
+	// (continuous batching stays off). Unified KV means the slots share the
+	// context budget, so this is concurrency without extra KV memory. Explicit
+	// parallel/n_parallel in the model options always wins.
+	if before := len(cfg.Options); true {
+		cfg.Options = EnsureParallelOption(cfg.Options, gpu)
+		if len(cfg.Options) > before {
+			xlog.Debug("[hardware_defaults] defaulting parallel slots for concurrent serving",
+				"option", cfg.Options[len(cfg.Options)-1], "vram_gib", gpu.VRAM>>30)
+		}
+	}
+}
+
+// parseComputeCapability splits a "major.minor" string into integer parts.
+// Returns (-1, -1) when it can't be parsed.
+func parseComputeCapability(cc string) (int, int) {
+	cc = strings.TrimSpace(cc)
+	if cc == "" {
+		return -1, -1
+	}
+	majStr, minStr := cc, "0"
+	if dot := strings.IndexByte(cc, '.'); dot >= 0 {
+		majStr, minStr = cc[:dot], cc[dot+1:]
+	}
+	maj, err := strconv.Atoi(strings.TrimSpace(majStr))
+	if err != nil {
+		return -1, -1
+	}
+	min, err := strconv.Atoi(strings.TrimSpace(minStr))
+	if err != nil {
+		min = 0
+	}
+	return maj, min
+}
--- a/core/config/hardware_defaults_internal_test.go
+++ b/core/config/hardware_defaults_internal_test.go
@@ -0,0 +1,37 @@
+package config
+
+import (
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// Single-instance path: SetDefaults applies hardware defaults from the local
+// GPU. The detection seam (localGPU) is injected so the path is deterministic
+// without a real GPU.
+var _ = Describe("SetDefaults hardware defaults (single-instance)", func() {
+	var orig func() GPU
+	BeforeEach(func() { orig = localGPU })
+	AfterEach(func() { localGPU = orig })
+
+	It("sets the physical batch on a local Blackwell GPU", func() {
+		localGPU = func() GPU { return GPU{ComputeCapability: "12.1"} }
+		cfg := &ModelConfig{}
+		cfg.SetDefaults()
+		Expect(cfg.Batch).To(Equal(BlackwellPhysicalBatch))
+	})
+
+	It("leaves batch unset on a non-Blackwell local GPU", func() {
+		localGPU = func() GPU { return GPU{ComputeCapability: "8.9"} }
+		cfg := &ModelConfig{}
+		cfg.SetDefaults()
+		Expect(cfg.Batch).To(Equal(0))
+	})
+
+	It("never overrides an explicit batch", func() {
+		localGPU = func() GPU { return GPU{ComputeCapability: "12.1"} }
+		cfg := &ModelConfig{}
+		cfg.Batch = 1024
+		cfg.SetDefaults()
+		Expect(cfg.Batch).To(Equal(1024))
+	})
+})
--- a/core/config/hardware_defaults_test.go
+++ b/core/config/hardware_defaults_test.go
@@ -0,0 +1,97 @@
+package config_test
+
+import (
+	. "github.com/mudler/LocalAI/core/config"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("Hardware-driven config defaults", func() {
+	DescribeTable("GPU.IsNVIDIABlackwell (sm_12x consumer family)",
+		func(cc string, want bool) {
+			Expect(GPU{ComputeCapability: cc}.IsNVIDIABlackwell()).To(Equal(want))
+		},
+		Entry("GB10 12.1", "12.1", true),
+		Entry("RTX 50 12.0", "12.0", true),
+		Entry("future 13.0", "13.0", true),
+		Entry("Hopper 9.0", "9.0", false),
+		Entry("Ada 8.9", "8.9", false),
+		Entry("datacenter Blackwell sm_100 10.0", "10.0", false),
+		Entry("unknown", "", false),
+	)
+
+	Describe("PhysicalBatch / IsManagedPhysicalBatch", func() {
+		It("returns the Blackwell batch on Blackwell", func() {
+			Expect(PhysicalBatch(GPU{ComputeCapability: "12.1"})).To(Equal(BlackwellPhysicalBatch))
+		})
+		It("returns the default batch otherwise", func() {
+			Expect(PhysicalBatch(GPU{ComputeCapability: "9.0"})).To(Equal(DefaultPhysicalBatch))
+			Expect(PhysicalBatch(GPU{})).To(Equal(DefaultPhysicalBatch))
+		})
+		It("recognizes managed defaults but not explicit values", func() {
+			Expect(IsManagedPhysicalBatch(DefaultPhysicalBatch)).To(BeTrue())
+			Expect(IsManagedPhysicalBatch(BlackwellPhysicalBatch)).To(BeTrue())
+			Expect(IsManagedPhysicalBatch(1024)).To(BeFalse())
+		})
+	})
+
+	Describe("ApplyHardwareDefaults", func() {
+		It("raises an unset batch to 2048 on Blackwell", func() {
+			cfg := &ModelConfig{}
+			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1"})
+			Expect(cfg.Batch).To(Equal(BlackwellPhysicalBatch))
+		})
+		It("leaves batch unset on non-Blackwell", func() {
+			cfg := &ModelConfig{}
+			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "9.0"})
+			Expect(cfg.Batch).To(Equal(0))
+		})
+		It("never overrides an explicit batch", func() {
+			cfg := &ModelConfig{}
+			cfg.Batch = 1024
+			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1"})
+			Expect(cfg.Batch).To(Equal(1024))
+		})
+		It("no-ops on nil", func() {
+			Expect(func() { ApplyHardwareDefaults(nil, GPU{ComputeCapability: "12.1"}) }).ToNot(Panic())
+		})
+	})
+
+	const gib = uint64(1) << 30
+
+	DescribeTable("DefaultParallelSlots (by VRAM)",
+		func(vramGiB uint64, want int) {
+			Expect(DefaultParallelSlots(GPU{VRAM: vramGiB * gib})).To(Equal(want))
+		},
+		Entry("GB10 119 GiB", uint64(119), 8),
+		Entry("48 GiB", uint64(48), 8),
+		Entry("24 GiB", uint64(24), 4),
+		Entry("8 GiB", uint64(8), 4),
+		Entry("6 GiB", uint64(6), 2),
+		Entry("2 GiB", uint64(2), 1),
+		Entry("unknown 0", uint64(0), 1),
+	)
+
+	Describe("ApplyHardwareDefaults parallel slots", func() {
+		It("adds a VRAM-scaled parallel option on a capable GPU", func() {
+			cfg := &ModelConfig{}
+			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1", VRAM: 119 * gib})
+			Expect(cfg.Options).To(ContainElement("parallel:8"))
+		})
+		It("scales the slot count down with VRAM", func() {
+			cfg := &ModelConfig{}
+			ApplyHardwareDefaults(cfg, GPU{VRAM: 24 * gib})
+			Expect(cfg.Options).To(ContainElement("parallel:4"))
+		})
+		It("adds no parallel option on small/unknown VRAM", func() {
+			cfg := &ModelConfig{}
+			ApplyHardwareDefaults(cfg, GPU{VRAM: 2 * gib})
+			Expect(cfg.Options).ToNot(ContainElement(ContainSubstring("parallel")))
+		})
+		It("never overrides an explicit parallel option", func() {
+			cfg := &ModelConfig{Options: []string{"parallel:2"}}
+			ApplyHardwareDefaults(cfg, GPU{VRAM: 119 * gib})
+			Expect(cfg.Options).To(Equal([]string{"parallel:2"}))
+		})
+	})
+})
--- a/core/config/hooks_llamacpp.go
+++ b/core/config/hooks_llamacpp.go
@@ -34,7 +34,7 @@ func llamaCppDefaults(cfg *ModelConfig, modelPath string) {
 	// Default context size if not set, regardless of whether GGUF parsing succeeds
 	defer func() {
 		if cfg.ContextSize == nil {
-			ctx := defaultContextSize
+			ctx := GGUFFallbackContextSize
 			cfg.ContextSize = &ctx
 		}
 	}()
--- a/core/config/meta/constants.go
+++ b/core/config/meta/constants.go
@@ -68,6 +68,7 @@ var UsecaseOptions = []FieldOption{
 	{Value: "face_recognition", Label: "Face Recognition"},
 	{Value: "transcript", Label: "Transcript"},
 	{Value: "diarization", Label: "Diarization"},
+	{Value: "sound_classification", Label: "Sound Classification"},
 	{Value: "speaker_recognition", Label: "Speaker Recognition"},
 	{Value: "tts", Label: "TTS"},
 	{Value: "sound_generation", Label: "Sound Generation"},
--- a/core/config/meta/registry.go
+++ b/core/config/meta/registry.go
@@ -286,6 +286,15 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Order:       45,
 		},

+		// --- Alias ---
+		"alias": {
+			Section:     "alias",
+			Label:       "Alias target",
+			Description: "Redirect all traffic for this model to another configured model. When set, every other field on this config is ignored and requests are served by the target model.",
+			Component:   "model-select",
+			Order:       0,
+		},
+
 		// --- Pipeline ---
 		"pipeline.llm": {
 			Section:              "pipeline",
@@ -319,6 +328,30 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			AutocompleteProvider: ProviderModelsVAD,
 			Order:                63,
 		},
+		"pipeline.sound_detection": {
+			Section:              "pipeline",
+			Label:                "Sound Detection Model",
+			Description:          "Model to use for sound-event classification (audio tagging, e.g. ced) in the pipeline. When set, committed realtime audio is also classified and the scored AudioSet tags are emitted as a conversation.item.sound_detection event.",
+			Component:            "model-select",
+			AutocompleteProvider: ProviderModels,
+			Order:                64,
+		},
+		"pipeline.sound_detection_window_ms": {
+			Section:     "pipeline",
+			Label:       "Sound Detection Window (ms)",
+			Description: "Server-side windowing for a sound-only realtime session: length in ms of the audio window classified each hop. 0 = client-driven (the client commits windows).",
+			Component:   "number",
+			Min:         f64(0),
+			Order:       65,
+		},
+		"pipeline.sound_detection_hop_ms": {
+			Section:     "pipeline",
+			Label:       "Sound Detection Hop (ms)",
+			Description: "Server-side windowing hop in ms: how often the server classifies the last window. 0 = client-driven.",
+			Component:   "number",
+			Min:         f64(0),
+			Order:       66,
+		},
 		"pipeline.reasoning_effort": {
 			Section:     "pipeline",
 			Label:       "Reasoning Effort",
@@ -448,6 +481,55 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Component:   "json-editor",
 			Order:       78,
 		},
+		"pipeline.voice_recognition.enforce": {
+			Section:     "pipeline",
+			Label:       "Voice Gate Enforce",
+			Description: "Whether the gate rejects unauthorized speakers. Enabled (default) drops unauthorized utterances before the LLM. Disabled still resolves and surfaces the speaker (for the conversation.item.speaker event and personalization) but never drops a turn.",
+			Component:   "toggle",
+			Order:       80,
+		},
+		"pipeline.voice_recognition.identity.announce": {
+			Section:     "pipeline",
+			Label:       "Speaker Identity Announce",
+			Description: "Emit a conversation.item.speaker event to the client naming the recognized speaker. When set, identity is resolved on every turn even if 'when' is 'first'.",
+			Component:   "toggle",
+			Order:       81,
+		},
+		"pipeline.voice_recognition.identity.announce_unknown": {
+			Section:     "pipeline",
+			Label:       "Speaker Identity Announce Unknown",
+			Description: "Also emit the conversation.item.speaker event (with matched=false) when no confident match is found. Default only announces on a match.",
+			Component:   "toggle",
+			Order:       82,
+		},
+		"pipeline.voice_recognition.identity.personalize": {
+			Section:     "pipeline",
+			Label:       "Speaker Identity Personalize",
+			Description: "Inform the LLM who is speaking so it can tailor replies. Enables the name and system-note injection below.",
+			Component:   "toggle",
+			Order:       83,
+		},
+		"pipeline.voice_recognition.identity.inject_name": {
+			Section:     "pipeline",
+			Label:       "Speaker Identity Inject Name",
+			Description: "Personalization: set the per-message OpenAI 'name' field on each user turn to the recognized speaker.",
+			Component:   "toggle",
+			Order:       84,
+		},
+		"pipeline.voice_recognition.identity.inject_system_note": {
+			Section:     "pipeline",
+			Label:       "Speaker Identity Inject System Note",
+			Description: "Personalization: append a 'The current speaker is <name>.' note to the system message reflecting the latest speaker.",
+			Component:   "toggle",
+			Order:       85,
+		},
+		"pipeline.voice_recognition.identity.note_unknown": {
+			Section:     "pipeline",
+			Label:       "Speaker Identity Note Unknown",
+			Description: "Personalization: when the speaker is unidentified, append 'The current speaker is unknown.' to the system message so the model can ask who it is talking to.",
+			Component:   "toggle",
+			Order:       86,
+		},
 		"pipeline.max_history_items": {
 			Section:     "pipeline",
 			Label:       "Max History Items",
@@ -455,6 +537,36 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Component:   "number",
 			Order:       79,
 		},
+		"pipeline.compaction.enabled": {
+			Section:     "pipeline",
+			Label:       "Compaction Enabled",
+			Description: "Fold conversation items that age out of the live window (Max History Items) into a rolling summary instead of dropping them, so long realtime sessions stay cheap without losing earlier context. Off by default.",
+			Component:   "toggle",
+			Order:       80,
+		},
+		"pipeline.compaction.trigger_items": {
+			Section:     "pipeline",
+			Label:       "Compaction Trigger Items",
+			Description: "High-water mark: once the live conversation exceeds this many items, the overflow above Max History Items is summarized and evicted. Must be greater than Max History Items; defaults to twice it. The gap controls how often summarization runs.",
+			Component:   "number",
+			Order:       81,
+		},
+		"pipeline.compaction.summary_model": {
+			Section:     "pipeline",
+			Label:       "Compaction Summary Model",
+			Description: "Optional smaller/cheaper model used to produce the rolling summary. Empty reuses the pipeline's own LLM. On CPU, a tiny model here keeps compaction from competing with the conversation LLM.",
+			Component:   "input",
+			Advanced:    true,
+			Order:       82,
+		},
+		"pipeline.compaction.max_summary_tokens": {
+			Section:     "pipeline",
+			Label:       "Compaction Max Summary Tokens",
+			Description: "Advisory cap on the rolling summary length (fed to the summarizer prompt). Defaults to 512.",
+			Component:   "number",
+			Advanced:    true,
+			Order:       83,
+		},

 		// --- Functions ---
 		"function.grammar.parallel_calls": {
--- a/Show More
+++ b/Show More