chore(recon): bump voice-detect pin to ERes2Net blocked-default (30beecd)

Defaults VD_ERES2NET_BLOCKED ON: routes the ERes2Net Res2Net body through the blocked nChw16c AVX-512 directconv island instead of the 1x1 mul_mat fast path (CONT-transpose + skinny low-K GEMM). On the shipped GGML_NATIVE=OFF build (ggml mul_mat is AVX2-only) this wins ~2x at every thread count (2.07x@1t, 2.2x@4t, 2.05x@8t); pure-AVX2 fallback still 1.3-1.62x. Parity exact (cosine=1.000000 vs golden), so registered voices + verify/identify thresholds are unaffected. The prior default-OFF rested on a stale comment whose 23pct regression only held on the non-shipping GGML_NATIVE=ON build. Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:claude-opus-4-8 [Claude Code]
feat(recon): enable cuDNN conv path on arm64+CUDA13 recon backends
2026-06-24 16:49:06 -04:00 · 2026-06-24 19:51:03 +00:00 · 2026-06-24 15:54:12 +00:00 · 2026-06-24 15:39:42 +00:00 · 2026-06-24 13:03:48 +00:00 · 2026-06-23 23:44:04 +00:00
191 changed files with 10992 additions and 554 deletions
--- a/.agents/adding-backends.md
+++ b/.agents/adding-backends.md
@@ -198,6 +198,27 @@ docker-build-backends: ... docker-build-<backend-name>
 - If the backend is in `backend/python/<backend-name>/` but uses `.` as context in the workflow file, use `.` context
 - Check similar backends to determine the correct context
 ## Documenting the backend (README + docs)
 A backend is not "added" until it is discoverable. Update the user-facing docs:
 - **`docs/content/features/backends.md`** - add the backend to the right
  category in the "LocalAI supports various types of backends" list (and add a
  new category if it introduces a new modality, e.g. sound classification).
 - If the backend introduces a **new API surface** (a new endpoint or a realtime
  capability), document it under `docs/content/` where its area lives (audio,
  vision, etc.) and follow the api-endpoints checklist in
  [api-endpoints-and-auth.md](api-endpoints-and-auth.md).
 **If the backend is a native C/C++/GGML engine created and maintained by the
 LocalAI team** (a from-scratch port like `parakeet.cpp`, `ced.cpp`,
 `vibevoice.cpp`, `rf-detr.cpp`, not a wrapper around a third-party runtime), it
 ALSO belongs in the top-level **`README.md`** table under "native C/C++/GGML
 engines ... developed and maintained by the LocalAI project itself". Add a row
 linking the upstream engine repo with a one-line description. This is the
 project's showcase of its own engines; a new in-house backend that is missing
 from it is a documentation bug.
 ## 5. Verification Checklist
 After adding a new backend, verify:
@@ -211,6 +232,8 @@ After adding a new backend, verify:
 - [ ] No YAML syntax errors (check with linter)
 - [ ] No Makefile syntax errors (check with linter)
 - [ ] Follows the same pattern as similar backends (e.g., if it's a transcription backend, follow `faster-whisper` pattern)
 - [ ] Documented: added to the category list in `docs/content/features/backends.md` (and any new endpoint/realtime capability documented under `docs/content/`)
 - [ ] If it is an in-house native C/C++/GGML engine, added to the maintained-engines table in the top-level `README.md`
 ## Bundling runtime shared libraries (`package.sh`)
--- a/.docker/install-base-deps.sh
+++ b/.docker/install-base-deps.sh
@@ -70,6 +70,12 @@ if [ "${BUILD_TYPE:-}" = "vulkan" ] && [ "${SKIP_DRIVERS:-false}" = "false" ]; t
        git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \
        ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \
        clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils
    # Mesa Vulkan ICD drivers (ANV/RADV/lavapipe + Arm SoC) and their ICD
    # manifests. The LunarG SDK below only provides the loader and shader
    # tooling, not hardware drivers — without Mesa the packaged Vulkan backend
    # would ship a loader that finds no GPU. package-gpu-libs.sh bundles these
    # .so files plus their deps into the backend so it stays self-contained.
    apt-get install -y mesa-vulkan-drivers libdrm2
    if [ "amd64" = "${TARGETARCH:-}" ]; then
        wget "https://sdk.lunarg.com/sdk/download/1.4.335.0/linux/vulkansdk-linux-x86_64-1.4.335.0.tar.xz"
        tar -xf vulkansdk-linux-x86_64-1.4.335.0.tar.xz
--- a/.github/backend-matrix.yml
+++ b/.github/backend-matrix.yml
@@ -3575,6 +3575,450 @@ include:
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  # ced
  - build-type: 'cublas'
    cuda-major-version: "12"
    cuda-minor-version: "8"
    platforms: 'linux/amd64'
    tag-latest: 'auto'
    tag-suffix: '-gpu-nvidia-cuda-12-ced'
    runs-on: 'ubuntu-latest'
    base-image: "ubuntu:24.04"
    skip-drivers: 'false'
    backend: "ced"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  - build-type: 'cublas'
    cuda-major-version: "13"
    cuda-minor-version: "0"
    platforms: 'linux/amd64'
    tag-latest: 'auto'
    tag-suffix: '-gpu-nvidia-cuda-13-ced'
    runs-on: 'ubuntu-latest'
    base-image: "ubuntu:24.04"
    skip-drivers: 'false'
    backend: "ced"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  - build-type: 'cublas'
    cuda-major-version: "13"
    cuda-minor-version: "0"
    platforms: 'linux/arm64'
    skip-drivers: 'false'
    tag-latest: 'auto'
    tag-suffix: '-nvidia-l4t-cuda-13-arm64-ced'
    base-image: "ubuntu:24.04"
    ubuntu-version: '2404'
    runs-on: 'ubuntu-24.04-arm'
    backend: "ced"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
  - build-type: ''
    cuda-major-version: ""
    cuda-minor-version: ""
    platforms: 'linux/amd64'
    platform-tag: 'amd64'
    tag-latest: 'auto'
    tag-suffix: '-cpu-ced'
    runs-on: 'ubuntu-latest'
    base-image: "ubuntu:24.04"
    skip-drivers: 'false'
    backend: "ced"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  - build-type: ''
    cuda-major-version: ""
    cuda-minor-version: ""
    platforms: 'linux/arm64'
    platform-tag: 'arm64'
    tag-latest: 'auto'
    tag-suffix: '-cpu-ced'
    runs-on: 'ubuntu-24.04-arm'
    base-image: "ubuntu:24.04"
    skip-drivers: 'false'
    backend: "ced"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  - build-type: 'sycl_f32'
    cuda-major-version: ""
    cuda-minor-version: ""
    platforms: 'linux/amd64'
    tag-latest: 'auto'
    tag-suffix: '-gpu-intel-sycl-f32-ced'
    runs-on: 'ubuntu-latest'
    base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
    skip-drivers: 'false'
    backend: "ced"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  - build-type: 'sycl_f16'
    cuda-major-version: ""
    cuda-minor-version: ""
    platforms: 'linux/amd64'
    tag-latest: 'auto'
    tag-suffix: '-gpu-intel-sycl-f16-ced'
    runs-on: 'ubuntu-latest'
    base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
    skip-drivers: 'false'
    backend: "ced"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  - build-type: 'vulkan'
    cuda-major-version: ""
    cuda-minor-version: ""
    platforms: 'linux/amd64'
    platform-tag: 'amd64'
    tag-latest: 'auto'
    tag-suffix: '-gpu-vulkan-ced'
    runs-on: 'ubuntu-latest'
    base-image: "ubuntu:24.04"
    skip-drivers: 'false'
    backend: "ced"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  - build-type: 'vulkan'
    cuda-major-version: ""
    cuda-minor-version: ""
    platforms: 'linux/arm64'
    platform-tag: 'arm64'
    tag-latest: 'auto'
    tag-suffix: '-gpu-vulkan-ced'
    runs-on: 'ubuntu-24.04-arm'
    base-image: "ubuntu:24.04"
    skip-drivers: 'false'
    backend: "ced"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  - build-type: 'cublas'
    cuda-major-version: "12"
    cuda-minor-version: "0"
    platforms: 'linux/arm64'
    skip-drivers: 'false'
    tag-latest: 'auto'
    tag-suffix: '-nvidia-l4t-arm64-ced'
    base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
    runs-on: 'ubuntu-24.04-arm'
    backend: "ced"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2204'
  - build-type: 'hipblas'
    cuda-major-version: ""
    cuda-minor-version: ""
    platforms: 'linux/amd64'
    tag-latest: 'auto'
    tag-suffix: '-gpu-rocm-hipblas-ced'
    base-image: "rocm/dev-ubuntu-24.04:7.2.1"
    runs-on: 'ubuntu-latest'
    skip-drivers: 'false'
    backend: "ced"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  # voice-detect
  - build-type: 'cublas'
    cuda-major-version: "12"
    cuda-minor-version: "8"
    platforms: 'linux/amd64'
    tag-latest: 'auto'
    tag-suffix: '-gpu-nvidia-cuda-12-voice-detect'
    runs-on: 'ubuntu-latest'
    base-image: "ubuntu:24.04"
    skip-drivers: 'false'
    backend: "voice-detect"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  - build-type: 'cublas'
    cuda-major-version: "13"
    cuda-minor-version: "0"
    platforms: 'linux/amd64'
    tag-latest: 'auto'
    tag-suffix: '-gpu-nvidia-cuda-13-voice-detect'
    runs-on: 'ubuntu-latest'
    base-image: "ubuntu:24.04"
    skip-drivers: 'false'
    backend: "voice-detect"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  - build-type: 'cublas'
    cuda-major-version: "13"
    cuda-minor-version: "0"
    platforms: 'linux/arm64'
    skip-drivers: 'false'
    tag-latest: 'auto'
    tag-suffix: '-nvidia-l4t-cuda-13-arm64-voice-detect'
    base-image: "ubuntu:24.04"
    ubuntu-version: '2404'
    runs-on: 'ubuntu-24.04-arm'
    backend: "voice-detect"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
  - build-type: ''
    cuda-major-version: ""
    cuda-minor-version: ""
    platforms: 'linux/amd64'
    platform-tag: 'amd64'
    tag-latest: 'auto'
    tag-suffix: '-cpu-voice-detect'
    runs-on: 'ubuntu-latest'
    base-image: "ubuntu:24.04"
    skip-drivers: 'false'
    backend: "voice-detect"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  - build-type: ''
    cuda-major-version: ""
    cuda-minor-version: ""
    platforms: 'linux/arm64'
    platform-tag: 'arm64'
    tag-latest: 'auto'
    tag-suffix: '-cpu-voice-detect'
    runs-on: 'ubuntu-24.04-arm'
    base-image: "ubuntu:24.04"
    skip-drivers: 'false'
    backend: "voice-detect"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  - build-type: 'sycl_f32'
    cuda-major-version: ""
    cuda-minor-version: ""
    platforms: 'linux/amd64'
    tag-latest: 'auto'
    tag-suffix: '-gpu-intel-sycl-f32-voice-detect'
    runs-on: 'ubuntu-latest'
    base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
    skip-drivers: 'false'
    backend: "voice-detect"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  - build-type: 'sycl_f16'
    cuda-major-version: ""
    cuda-minor-version: ""
    platforms: 'linux/amd64'
    tag-latest: 'auto'
    tag-suffix: '-gpu-intel-sycl-f16-voice-detect'
    runs-on: 'ubuntu-latest'
    base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
    skip-drivers: 'false'
    backend: "voice-detect"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  - build-type: 'vulkan'
    cuda-major-version: ""
    cuda-minor-version: ""
    platforms: 'linux/amd64'
    platform-tag: 'amd64'
    tag-latest: 'auto'
    tag-suffix: '-gpu-vulkan-voice-detect'
    runs-on: 'ubuntu-latest'
    base-image: "ubuntu:24.04"
    skip-drivers: 'false'
    backend: "voice-detect"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  - build-type: 'vulkan'
    cuda-major-version: ""
    cuda-minor-version: ""
    platforms: 'linux/arm64'
    platform-tag: 'arm64'
    tag-latest: 'auto'
    tag-suffix: '-gpu-vulkan-voice-detect'
    runs-on: 'ubuntu-24.04-arm'
    base-image: "ubuntu:24.04"
    skip-drivers: 'false'
    backend: "voice-detect"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  - build-type: 'cublas'
    cuda-major-version: "12"
    cuda-minor-version: "0"
    platforms: 'linux/arm64'
    skip-drivers: 'false'
    tag-latest: 'auto'
    tag-suffix: '-nvidia-l4t-arm64-voice-detect'
    base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
    runs-on: 'ubuntu-24.04-arm'
    backend: "voice-detect"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2204'
  - build-type: 'hipblas'
    cuda-major-version: ""
    cuda-minor-version: ""
    platforms: 'linux/amd64'
    tag-latest: 'auto'
    tag-suffix: '-gpu-rocm-hipblas-voice-detect'
    base-image: "rocm/dev-ubuntu-24.04:7.2.1"
    runs-on: 'ubuntu-latest'
    skip-drivers: 'false'
    backend: "voice-detect"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  # face-detect
  - build-type: 'cublas'
    cuda-major-version: "12"
    cuda-minor-version: "8"
    platforms: 'linux/amd64'
    tag-latest: 'auto'
    tag-suffix: '-gpu-nvidia-cuda-12-face-detect'
    runs-on: 'ubuntu-latest'
    base-image: "ubuntu:24.04"
    skip-drivers: 'false'
    backend: "face-detect"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  - build-type: 'cublas'
    cuda-major-version: "13"
    cuda-minor-version: "0"
    platforms: 'linux/amd64'
    tag-latest: 'auto'
    tag-suffix: '-gpu-nvidia-cuda-13-face-detect'
    runs-on: 'ubuntu-latest'
    base-image: "ubuntu:24.04"
    skip-drivers: 'false'
    backend: "face-detect"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  - build-type: 'cublas'
    cuda-major-version: "13"
    cuda-minor-version: "0"
    platforms: 'linux/arm64'
    skip-drivers: 'false'
    tag-latest: 'auto'
    tag-suffix: '-nvidia-l4t-cuda-13-arm64-face-detect'
    base-image: "ubuntu:24.04"
    ubuntu-version: '2404'
    runs-on: 'ubuntu-24.04-arm'
    backend: "face-detect"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
  - build-type: ''
    cuda-major-version: ""
    cuda-minor-version: ""
    platforms: 'linux/amd64'
    platform-tag: 'amd64'
    tag-latest: 'auto'
    tag-suffix: '-cpu-face-detect'
    runs-on: 'ubuntu-latest'
    base-image: "ubuntu:24.04"
    skip-drivers: 'false'
    backend: "face-detect"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  - build-type: ''
    cuda-major-version: ""
    cuda-minor-version: ""
    platforms: 'linux/arm64'
    platform-tag: 'arm64'
    tag-latest: 'auto'
    tag-suffix: '-cpu-face-detect'
    runs-on: 'ubuntu-24.04-arm'
    base-image: "ubuntu:24.04"
    skip-drivers: 'false'
    backend: "face-detect"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  - build-type: 'sycl_f32'
    cuda-major-version: ""
    cuda-minor-version: ""
    platforms: 'linux/amd64'
    tag-latest: 'auto'
    tag-suffix: '-gpu-intel-sycl-f32-face-detect'
    runs-on: 'ubuntu-latest'
    base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
    skip-drivers: 'false'
    backend: "face-detect"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  - build-type: 'sycl_f16'
    cuda-major-version: ""
    cuda-minor-version: ""
    platforms: 'linux/amd64'
    tag-latest: 'auto'
    tag-suffix: '-gpu-intel-sycl-f16-face-detect'
    runs-on: 'ubuntu-latest'
    base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
    skip-drivers: 'false'
    backend: "face-detect"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  - build-type: 'vulkan'
    cuda-major-version: ""
    cuda-minor-version: ""
    platforms: 'linux/amd64'
    platform-tag: 'amd64'
    tag-latest: 'auto'
    tag-suffix: '-gpu-vulkan-face-detect'
    runs-on: 'ubuntu-latest'
    base-image: "ubuntu:24.04"
    skip-drivers: 'false'
    backend: "face-detect"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  - build-type: 'vulkan'
    cuda-major-version: ""
    cuda-minor-version: ""
    platforms: 'linux/arm64'
    platform-tag: 'arm64'
    tag-latest: 'auto'
    tag-suffix: '-gpu-vulkan-face-detect'
    runs-on: 'ubuntu-24.04-arm'
    base-image: "ubuntu:24.04"
    skip-drivers: 'false'
    backend: "face-detect"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  - build-type: 'cublas'
    cuda-major-version: "12"
    cuda-minor-version: "0"
    platforms: 'linux/arm64'
    skip-drivers: 'false'
    tag-latest: 'auto'
    tag-suffix: '-nvidia-l4t-arm64-face-detect'
    base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
    runs-on: 'ubuntu-24.04-arm'
    backend: "face-detect"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2204'
  - build-type: 'hipblas'
    cuda-major-version: ""
    cuda-minor-version: ""
    platforms: 'linux/amd64'
    tag-latest: 'auto'
    tag-suffix: '-gpu-rocm-hipblas-face-detect'
    base-image: "rocm/dev-ubuntu-24.04:7.2.1"
    runs-on: 'ubuntu-latest'
    skip-drivers: 'false'
    backend: "face-detect"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
  # acestep-cpp
  - build-type: ''
    cuda-major-version: ""
@@ -4754,6 +5198,18 @@ includeDarwin:
    tag-suffix: "-metal-darwin-arm64-parakeet-cpp"
    build-type: "metal"
    lang: "go"
  - backend: "ced"
    tag-suffix: "-metal-darwin-arm64-ced"
    build-type: "metal"
    lang: "go"
  - backend: "voice-detect"
    tag-suffix: "-metal-darwin-arm64-voice-detect"
    build-type: "metal"
    lang: "go"
  - backend: "face-detect"
    tag-suffix: "-metal-darwin-arm64-face-detect"
    build-type: "metal"
    lang: "go"
  - backend: "acestep-cpp"
    tag-suffix: "-metal-darwin-arm64-acestep-cpp"
    build-type: "metal"
--- a/.github/workflows/bump_deps.yaml
+++ b/.github/workflows/bump_deps.yaml
@@ -42,6 +42,18 @@ jobs:
            variable: "PARAKEET_VERSION"
            branch: "master"
            file: "backend/go/parakeet-cpp/Makefile"
          - repository: "mudler/ced.cpp"
            variable: "CED_VERSION"
            branch: "master"
            file: "backend/go/ced/Makefile"
          - repository: "mudler/voice-detect.cpp"
            variable: "VOICEDETECT_VERSION"
            branch: "master"
            file: "backend/go/voice-detect/Makefile"
          - repository: "mudler/face-detect.cpp"
            variable: "FACEDETECT_VERSION"
            branch: "master"
            file: "backend/go/face-detect/Makefile"
          - repository: "mudler/depth-anything.cpp"
            variable: "DEPTHANYTHING_VERSION"
            branch: "master"
--- a/README.md
+++ b/README.md
@@ -231,6 +231,7 @@ Most backends wrap a best-in-class upstream engine. A handful of them are native
 | Backend | What it does |
 |---------|-------------|
 | [parakeet.cpp](https://github.com/mudler/parakeet.cpp) | C++/GGML port of NVIDIA NeMo Parakeet ASR (tdt/ctc/rnnt/hybrid), with cache-aware streaming transcription |
 | [ced.cpp](https://github.com/mudler/ced.cpp) | C++/GGML port of the CED audio-tagging models: sound-event classification (527-class AudioSet) over REST and the realtime API for live recognition |
 | [voxtral.c](https://github.com/mudler/voxtral.c) | Voxtral Realtime 4B speech-to-text in pure C |
 | [vibevoice.cpp](https://github.com/mudler/vibevoice.cpp) | Native port of Microsoft VibeVoice for TTS (voice cloning) and long-form ASR with speaker diarization |
 | [rf-detr.cpp](https://github.com/mudler/rf-detr.cpp) | Native RF-DETR object detection and instance segmentation |
@@ -240,6 +241,8 @@ Most backends wrap a best-in-class upstream engine. A handful of them are native
 | [LocalVQE](https://github.com/localai-org/LocalVQE) | Joint acoustic echo cancellation, noise suppression, and dereverberation |
 | [local-store](https://github.com/mudler/LocalAI) | Local-first vector database for embeddings (shipped in-tree) |
 We also maintain [apex-quant](https://github.com/localai-org/apex-quant), a per-tensor, per-layer quantization recipe for Mixture-of-Experts models that exploits their structural sparsity to produce GGUFs matching or beating Q8_0 quality - and they run out of the box on stock llama.cpp.
 ## Resources
 - [Documentation](https://localai.io/)
--- a/backend/Dockerfile.golang
+++ b/backend/Dockerfile.golang
@@ -65,7 +65,12 @@ RUN <<EOT bash
            libwayland-dev libxrandr-dev libxcb-randr0-dev libxcb-ewmh-dev \
            git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \
            ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \
-            clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils
+            clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils && \
        apt-get install -y mesa-vulkan-drivers libdrm2
        # Mesa Vulkan ICD drivers (ANV/RADV/lavapipe) + their manifests. The
        # LunarG SDK below only provides the loader and shader tooling, not
        # hardware drivers — without Mesa, package-gpu-libs.sh has no ICD to
        # bundle and the packaged backend finds no GPU at runtime.
        if [ "amd64" = "$TARGETARCH" ]; then
            wget "https://sdk.lunarg.com/sdk/download/1.4.335.0/linux/vulkansdk-linux-x86_64-1.4.335.0.tar.xz" && \
            tar -xf vulkansdk-linux-x86_64-1.4.335.0.tar.xz && \
@@ -132,7 +137,7 @@ RUN <<EOT bash
            libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
        if [ "${CUDA_MAJOR_VERSION}" = "13" ] && [ "arm64" = "$TARGETARCH" ]; then
            apt-get install -y --no-install-recommends \
-            libcufile-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcudnn9-cuda-${CUDA_MAJOR_VERSION} cuda-cupti-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libnvjitlink-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
+            libcufile-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcudnn9-cuda-${CUDA_MAJOR_VERSION} libcudnn9-dev-cuda-${CUDA_MAJOR_VERSION} cuda-cupti-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libnvjitlink-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
        fi
        apt-get clean && \
        rm -rf /var/lib/apt/lists/*
--- a/backend/Dockerfile.python
+++ b/backend/Dockerfile.python
@@ -66,7 +66,12 @@ RUN <<EOT bash
            libwayland-dev libxrandr-dev libxcb-randr0-dev libxcb-ewmh-dev \
            git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \
            ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \
-            clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils
+            clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils && \
        apt-get install -y mesa-vulkan-drivers libdrm2
        # Mesa Vulkan ICD drivers (ANV/RADV/lavapipe) + their manifests. The
        # LunarG SDK below only provides the loader and shader tooling, not
        # hardware drivers — without Mesa, package-gpu-libs.sh has no ICD to
        # bundle and the packaged backend finds no GPU at runtime.
        if [ "amd64" = "$TARGETARCH" ]; then
            wget "https://sdk.lunarg.com/sdk/download/1.4.335.0/linux/vulkansdk-linux-x86_64-1.4.335.0.tar.xz" && \
            tar -xf vulkansdk-linux-x86_64-1.4.335.0.tar.xz && \
--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -24,6 +24,9 @@ service Backend {
  rpc TokenizeString(PredictOptions) returns (TokenizationResponse) {}
  rpc Status(HealthMessage) returns (StatusResponse) {}
  rpc Detect(DetectOptions) returns (DetectResponse) {}
  // SoundDetection runs an audio-tagging / sound-event-classification model
  // (e.g. CED over the AudioSet ontology) on a clip and returns scored labels.
  rpc SoundDetection(SoundDetectionRequest) returns (SoundDetectionResponse) {}
  rpc Depth(DepthRequest) returns (DepthResponse) {}
  rpc FaceVerify(FaceVerifyRequest) returns (FaceVerifyResponse) {}
  rpc FaceAnalyze(FaceAnalyzeRequest) returns (FaceAnalyzeResponse) {}
@@ -671,6 +674,24 @@ message DetectResponse {
  repeated Detection Detections = 1;
 }
 // --- Sound-event classification / audio tagging messages (CED) ---
 message SoundDetectionRequest {
  string src = 1;       // audio file path (LocalAI writes the upload to disk)
  int32 top_k = 2;      // number of top tags to return (0 = all classes)
  float threshold = 3;  // optional: drop tags scoring below this
 }
 message SoundClass {
  string label = 1;     // AudioSet class name, e.g. "Baby cry, infant cry"
  float score = 2;      // per-class probability (multi-label, independent)
  int32 index = 3;      // class index in the model ontology
 }
 message SoundDetectionResponse {
  repeated SoundClass detections = 1;  // score-descending
 }
 // --- Depth estimation messages (Depth Anything 3) ---
 message DepthRequest {
--- a/backend/cpp/ik-llama-cpp/Makefile
+++ b/backend/cpp/ik-llama-cpp/Makefile
@@ -1,5 +1,5 @@
-IK_LLAMA_VERSION?=b3dfb7858cfcb9166e92f366e5af87f19ebc94be
+IK_LLAMA_VERSION?=6c00e87ac84404af588ad2e65935bd6f079c696f
 LLAMA_REPO?=https://github.com/ikawrakow/ik_llama.cpp
 CMAKE_ARGS?=
--- a/backend/cpp/llama-cpp/Makefile
+++ b/backend/cpp/llama-cpp/Makefile
@@ -1,5 +1,5 @@
-LLAMA_VERSION?=f3e182816421c648188b5eab269853bf1531d950
+LLAMA_VERSION?=e475fa2b5f9fb50c3d6fc3e7c6fdf1e004465b62
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
 CMAKE_ARGS?=
--- a/backend/cpp/llama-cpp/grpc-server.cpp
+++ b/backend/cpp/llama-cpp/grpc-server.cpp
@@ -18,6 +18,18 @@
 #if __has_include("server-chat.cpp")
 #include "server-chat.cpp"
 #endif
 // server-schema.cpp exists only in llama.cpp after the upstream refactor that
 // extracted the JSON request-schema evaluation (previously the static
 // server_task::params_from_json_cmpl) into server_schema::eval_llama_cmpl_schema.
 // server-context.cpp and grpc-server.cpp both call into it, so its definitions
 // must be part of this translation unit or the link fails. __has_include keeps
 // the source compatible with older pins/forks (e.g. llama-cpp-turboquant) that
 // predate the split and still expose params_from_json_cmpl (see the guarded
 // call sites below).
 #if __has_include("server-schema.cpp")
 #define LOCALAI_HAS_SERVER_SCHEMA 1
 #include "server-schema.cpp"
 #endif
 #include "server-context.cpp"
 // LocalAI
@@ -2102,7 +2114,11 @@ public:
                task.index = i;
                task.tokens    = std::move(inputs[i]);
 #ifdef LOCALAI_HAS_SERVER_SCHEMA
                task.params           = server_schema::eval_llama_cmpl_schema(
 #else
                task.params           = server_task::params_from_json_cmpl(
 #endif
                        ctx_server.impl->vocab,
                        params_base,
                        ctx_server.get_meta().slot_n_ctx,
@@ -2116,7 +2132,7 @@ public:
                // cannot detect tool calls or separate reasoning from content.
                task.params.res_type                 = TASK_RESPONSE_TYPE_OAI_CHAT;
                task.params.oaicompat_cmpl_id         = completion_id;
-                // oaicompat_model is already populated by params_from_json_cmpl
+                // oaicompat_model is already populated by eval_llama_cmpl_schema (or by params_from_json_cmpl on older pins without server-schema.cpp)
                tasks.push_back(std::move(task));
            }
@@ -2940,7 +2956,11 @@ public:
                task.index = i;
                task.tokens    = std::move(inputs[i]);
 #ifdef LOCALAI_HAS_SERVER_SCHEMA
                task.params           = server_schema::eval_llama_cmpl_schema(
 #else
                task.params           = server_task::params_from_json_cmpl(
 #endif
                        ctx_server.impl->vocab,
                        params_base,
                        ctx_server.get_meta().slot_n_ctx,
@@ -2952,7 +2972,7 @@ public:
                // reasoning, tool calls, and content are classified into ChatDeltas.
                task.params.res_type                 = TASK_RESPONSE_TYPE_OAI_CHAT;
                task.params.oaicompat_cmpl_id         = completion_id;
-                // oaicompat_model is already populated by params_from_json_cmpl
+                // oaicompat_model is already populated by eval_llama_cmpl_schema (or by params_from_json_cmpl on older pins without server-schema.cpp)
                tasks.push_back(std::move(task));
            }
--- a/backend/cpp/privacy-filter/Makefile
+++ b/backend/cpp/privacy-filter/Makefile
@@ -8,7 +8,7 @@
 # Local development: point at a working checkout instead of cloning, e.g.
 #   make PRIVACY_FILTER_SRC=$HOME/c/privacy-filter.cpp grpc-server
-PRIVACY_FILTER_VERSION?=646342f7a59c6b7d195185eac60bad762e572f1d
+PRIVACY_FILTER_VERSION?=98f52c5ef2250f207cc6b9a6aef05393a120cb7c
 PRIVACY_FILTER_REPO?=https://github.com/localai-org/privacy-filter.cpp
 PRIVACY_FILTER_SRC?=
--- a/backend/go/ced/.gitignore
+++ b/backend/go/ced/.gitignore
@@ -0,0 +1,11 @@
 .cache/
 sources/
 build/
 package/
 ced-grpc
 # build artifacts staged in-tree by the Makefile (cp from sources/) or
 # symlinked for local dev; the real sources live in ced.cpp upstream.
 *.so
 *.so.*
 ced_capi.h
 compile_commands.json
--- a/backend/go/ced/Makefile
+++ b/backend/go/ced/Makefile
@@ -0,0 +1,77 @@
 # ced sound-classification backend Makefile.
 #
 # Upstream pin lives below as CED_VERSION?=<sha> so .github/bump_deps.sh can find
 # and update it (matches the parakeet-cpp / whisper.cpp convention).
 #
 # Local dev shortcut: symlink an out-of-tree ced.cpp shared build + header and
 # skip the clone/cmake steps entirely:
 #   ln -sf /path/to/ced.cpp/build-shared/libced.so .
 #   ln -sf /path/to/ced.cpp/include/ced_capi.h .
 #   go build -o ced-grpc .
 CED_VERSION?=c04ac14b7992d00584d9e812c9bb6268598a6ce7
 CED_REPO?=https://github.com/mudler/ced.cpp
 GOCMD?=go
 GO_TAGS?=
 JOBS?=$(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)
 BUILD_TYPE?=
 NATIVE?=false
 # Static-link ggml into libced.so (PIC) so the shared lib is self-contained:
 # dlopen needs no libggml*.so alongside it, only system libs the runtime image
 # already provides.
 CMAKE_ARGS?=-DCMAKE_BUILD_TYPE=Release -DCED_SHARED=ON -DCED_BUILD_CLI=OFF -DCED_BUILD_TESTS=OFF -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON
 ifeq ($(NATIVE),false)
 	CMAKE_ARGS+=-DGGML_NATIVE=OFF
 endif
 # ced.cpp gates its ggml backends behind CED_GGML_* options (set(... CACHE BOOL
 # "" FORCE)), so forward those instead of a bare -DGGML_CUDA=ON.
 ifeq ($(BUILD_TYPE),cublas)
 	CMAKE_ARGS+=-DCED_GGML_CUDA=ON -DGGML_CUDA_GRAPHS=ON
 else ifeq ($(BUILD_TYPE),openblas)
 	CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
 else ifeq ($(BUILD_TYPE),hipblas)
 	CMAKE_ARGS+=-DCED_GGML_HIP=ON
 else ifeq ($(BUILD_TYPE),vulkan)
 	CMAKE_ARGS+=-DCED_GGML_VULKAN=ON
 endif
 .PHONY: ced-grpc package build clean purge test all
 all: ced-grpc
 sources/ced.cpp:
 	mkdir -p sources/ced.cpp
 	cd sources/ced.cpp && \
 	git init -q && \
 	git remote add origin $(CED_REPO) && \
 	git fetch --depth 1 origin $(CED_VERSION) && \
 	git checkout FETCH_HEAD && \
 	git submodule update --init --recursive --depth 1 --single-branch
 libced.so: sources/ced.cpp
 	cmake -B sources/ced.cpp/build-shared -S sources/ced.cpp $(CMAKE_ARGS)
 	cmake --build sources/ced.cpp/build-shared --config Release -j$(JOBS)
 	cp -fv sources/ced.cpp/build-shared/libced.so* ./ 2>/dev/null || true
 	cp -fv sources/ced.cpp/include/ced_capi.h ./
 ced-grpc: libced.so main.go goced.go
 	CGO_ENABLED=0 $(GOCMD) build -tags "$(GO_TAGS)" -o ced-grpc .
 package: ced-grpc
 	bash package.sh
 build: package
 test:
 	LD_LIBRARY_PATH=$(CURDIR):$$LD_LIBRARY_PATH $(GOCMD) test ./... -count=1
 clean: purge
 	rm -rf libced.so* ced_capi.h package ced-grpc
 purge:
 	rm -rf sources/ced.cpp
--- a/backend/go/ced/goced.go
+++ b/backend/go/ced/goced.go
@@ -0,0 +1,130 @@
 package main
 // Go side of the ced backend: purego bindings over ced_capi.h plus the gRPC
 // SoundDetection implementation.
 //
 // SKETCH: the pb.SoundDetection* types come from backend.proto (regenerate with
 // `make protogen-go`). The C side is single-threaded per ctx, so we guard the
 // engine with engineMu; LocalAI also serializes via base.SingleThread.
 import (
 	"context"
 	"encoding/json"
 	"errors"
 	"fmt"
 	"sort"
 	"sync"
 	"unsafe"
 	"github.com/mudler/LocalAI/pkg/grpc/base"
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 )
 // purego-bound entry points from libced.so. Names match ced_capi.h exactly.
 var (
 	CppAbiVersion       func() int32
 	CppLoad             func(ggufPath string) uintptr
 	CppFree             func(ctx uintptr)
 	CppLastError        func(ctx uintptr) string
 	CppNumClasses       func(ctx uintptr) int32
 	CppSampleRate       func(ctx uintptr) int32
 	CppClassifyPathJSON func(ctx uintptr, wavPath string, topK int32) uintptr
 	CppClassifyPcmJSON  func(ctx uintptr, pcm []float32, nSamples int32, sampleRate int32, topK int32) uintptr
 	CppFreeString       func(s uintptr)
 )
 // cstr turns a NUL-terminated C string owned by libced (handed over as a raw
 // uintptr) into a Go string and then releases the C buffer through
 // ced_capi_free_string. A zero pointer yields "" and frees nothing.
 func cstr(p uintptr) string {
 	if p == 0 {
 		return ""
 	}
 	defer CppFreeString(p)
 	// Measure the string up to, but not including, the terminator.
 	n := 0
 	for *(*byte)(unsafe.Pointer(p + uintptr(n))) != 0 { //nolint:govet // #nosec G103 -- C-owned NUL-terminated string from libced (not Go-GC memory)
 		n++
 	}
 	// The string conversion copies the bytes into Go memory before the
 	// deferred free runs.
 	return string(unsafe.Slice((*byte)(unsafe.Pointer(p)), n)) //nolint:govet // #nosec G103 -- C-owned NUL-terminated string from libced (not Go-GC memory)
 }
 // Ced is the gRPC backend. One loaded CED model per instance.
 type Ced struct {
 	base.Base
 	// ctxPtr is the opaque context handle returned by ced_capi_load; 0 means
 	// no model is loaded.
 	ctxPtr   uintptr
 	// engineMu is held around the classify and free calls into the C library.
 	engineMu sync.Mutex
 }
 // Load opens a ced C-API context for the GGUF named by opts.ModelFile and
 // stores the handle on the receiver. It returns an error when no model file is
 // given or when ced_capi_load hands back a null context.
 func (c *Ced) Load(opts *pb.ModelOptions) error {
 	path := opts.ModelFile
 	if path == "" {
 		return errors.New("ced: ModelFile is required")
 	}
 	handle := CppLoad(path)
 	if handle == 0 {
 		// No context was created, so the error text is queried with a null ctx.
 		return fmt.Errorf("ced: ced_capi_load failed for %q: %s", path, CppLastError(0))
 	}
 	c.ctxPtr = handle
 	return nil
 }
 // jsonTag mirrors the ced_capi JSON tag objects. SoundDetection decodes the
 // classifier output into a slice of these and copies each field into a
 // pb.SoundClass.
 type jsonTag struct {
 	Index int     `json:"index"`
 	Score float32 `json:"score"`
 	Label string  `json:"label"`
 }
 // SoundDetection classifies the clip at req.Src and returns the tags whose
 // score is not below req.Threshold, ordered by descending score. Tags with
 // equal scores keep the order the classifier emitted them in.
 //
 // A non-positive TopK falls back to 10. It returns an error when no model is
 // loaded, when src is empty, when the classifier yields no output, or when its
 // JSON cannot be decoded. ctx is accepted for the gRPC signature but is not
 // consulted.
 func (c *Ced) SoundDetection(ctx context.Context, req *pb.SoundDetectionRequest) (*pb.SoundDetectionResponse, error) {
 	topK := req.GetTopK()
 	if topK <= 0 {
 		topK = 10 // sensible default for a tagging response
 	}
 	out, err := c.classifyPath(req.GetSrc(), topK)
 	if err != nil {
 		return nil, err
 	}
 	var tags []jsonTag
 	if err := json.Unmarshal([]byte(out), &tags); err != nil {
 		return nil, fmt.Errorf("ced: bad classifier JSON: %w", err)
 	}
 	thr := req.GetThreshold()
 	resp := &pb.SoundDetectionResponse{}
 	for _, t := range tags {
 		if t.Score < thr {
 			continue
 		}
 		resp.Detections = append(resp.Detections, &pb.SoundClass{
 			Label: t.Label, Score: t.Score, Index: int32(t.Index),
 		})
 	}
 	// Stable sort: ties stay in classifier order, so responses are deterministic.
 	sort.SliceStable(resp.Detections, func(i, j int) bool {
 		return resp.Detections[i].Score > resp.Detections[j].Score
 	})
 	return resp, nil
 }
 // classifyPath runs the path-based classifier under engineMu and returns its
 // raw JSON output. The loaded-model check happens while the lock is held so a
 // concurrent Free cannot zero ctxPtr between the check and the C call.
 func (c *Ced) classifyPath(src string, topK int32) (string, error) {
 	c.engineMu.Lock()
 	defer c.engineMu.Unlock()
 	if c.ctxPtr == 0 {
 		return "", errors.New("ced: model not loaded")
 	}
 	if src == "" {
 		return "", errors.New("ced: SoundDetectionRequest.src (audio path) is required")
 	}
 	out := cstr(CppClassifyPathJSON(c.ctxPtr, src, topK))
 	if out == "" {
 		// Only query the error buffer when the call actually failed.
 		return "", fmt.Errorf("ced: classification failed: %s", CppLastError(c.ctxPtr))
 	}
 	return out, nil
 }
 // Free releases the C-API context if one is loaded and clears the handle, so a
 // repeated call does nothing. It always returns nil.
 func (c *Ced) Free() error {
 	c.engineMu.Lock()
 	defer c.engineMu.Unlock()
 	if c.ctxPtr == 0 {
 		return nil
 	}
 	CppFree(c.ctxPtr)
 	c.ctxPtr = 0
 	return nil
 }
--- a/backend/go/ced/main.go
+++ b/backend/go/ced/main.go
@@ -0,0 +1,59 @@
 package main
 // ced sound-classification backend. Started internally by LocalAI: one gRPC
 // server per loaded model. Loads libced.so via purego and registers the flat
 // C-API declared in ced_capi.h. The library name can be overridden with
 // CED_LIBRARY (mirrors PARAKEET_LIBRARY / WHISPER_LIBRARY); the default looks
 // for the .so next to this binary.
 //
 // SKETCH: requires `make protogen-go` after the backend.proto SoundDetection
 // addition, and a built libced.so (see Makefile). See DESIGN.md.
 import (
 	"flag"
 	"fmt"
 	"os"
 	"github.com/ebitengine/purego"
 	grpc "github.com/mudler/LocalAI/pkg/grpc"
 )
 var addr = flag.String("addr", "localhost:50051", "the address to connect to")
 // libFunc pairs a pointer to a Go function variable (ptr) with the exported C
 // symbol name (name) that purego.RegisterLibFunc binds to it in main.
 type libFunc struct {
 	ptr  any
 	name string
 }
 // main loads libced.so (or the library named by CED_LIBRARY), binds every
 // ced_capi_* entry point to its Go function variable, logs the reported ABI
 // version to stderr and then serves the Ced backend over gRPC on -addr. A
 // dlopen or server start error panics.
 func main() {
 	libName := "libced.so"
 	if override := os.Getenv("CED_LIBRARY"); override != "" {
 		libName = override
 	}
 	lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
 	if err != nil {
 		panic(fmt.Errorf("ced: dlopen %q: %w", libName, err))
 	}
 	// Bound 1:1 to ced_capi.h. Functions returning char* are declared uintptr
 	// on the Go side so the very same pointer can be passed to
 	// ced_capi_free_string once its contents were copied (a purego string
 	// return would copy and leak the original).
 	bindings := []libFunc{
 		{ptr: &CppAbiVersion, name: "ced_capi_abi_version"},
 		{ptr: &CppLoad, name: "ced_capi_load"},
 		{ptr: &CppFree, name: "ced_capi_free"},
 		{ptr: &CppLastError, name: "ced_capi_last_error"},
 		{ptr: &CppNumClasses, name: "ced_capi_num_classes"},
 		{ptr: &CppSampleRate, name: "ced_capi_sample_rate"},
 		{ptr: &CppClassifyPathJSON, name: "ced_capi_classify_path_json"},
 		{ptr: &CppClassifyPcmJSON, name: "ced_capi_classify_pcm_json"},
 		{ptr: &CppFreeString, name: "ced_capi_free_string"},
 	}
 	for i := range bindings {
 		purego.RegisterLibFunc(bindings[i].ptr, lib, bindings[i].name)
 	}
 	fmt.Fprintf(os.Stderr, "[ced] ABI=%d\n", CppAbiVersion())
 	flag.Parse()
 	err = grpc.StartServer(*addr, &Ced{})
 	if err != nil {
 		panic(err)
 	}
 }
--- a/backend/go/ced/package.sh
+++ b/backend/go/ced/package.sh
@@ -0,0 +1,60 @@
 #!/bin/bash
 #
 # Assemble the self-contained package/ tree for the ced backend: the ced-grpc
 # binary, run.sh, libced.so, the build host's core runtime libs (ld.so, libc,
 # libstdc++, libgomp, ...) and, when the shared GPU packaging script exists,
 # the GPU runtime for the active BUILD_TYPE. Mirrors
 # backend/go/parakeet-cpp/package.sh; run.sh routes the (CGO_ENABLED=0) binary
 # through lib/ld.so so the packaged libc is used.
 set -e
 CURDIR=$(dirname "$(realpath "$0")")
 REPO_ROOT="${CURDIR}/../../.."
 # copy_core_libs LOADER TRIPLET
 # Stage the dynamic loader as lib/ld.so, then each core runtime library from
 # /lib/TRIPLET, dereferencing symlinks (-L). Only locals are used so the GPU
 # script sourced further down sees no extra globals.
 copy_core_libs() {
 	local loader="$1" triplet="$2" lib
 	cp -arfLv "$loader" "$CURDIR/package/lib/ld.so"
 	for lib in libc.so.6 libgcc_s.so.1 libstdc++.so.6 libm.so.6 libgomp.so.1 libdl.so.2 librt.so.1 libpthread.so.0; do
 		cp -arfLv "/lib/${triplet}/${lib}" "$CURDIR/package/lib/${lib}"
 	done
 }
 mkdir -p "$CURDIR/package/lib"
 cp -avf "$CURDIR/ced-grpc" "$CURDIR/package/"
 cp -avf "$CURDIR/run.sh" "$CURDIR/package/"
 cp -avf "$CURDIR"/libced.so* "$CURDIR/package/lib/" 2>/dev/null || {
 	echo "ERROR: libced.so not found in $CURDIR, run 'make' first" >&2
 	exit 1
 }
 # The architecture is inferred from which dynamic loader exists on the host.
 if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
 	echo "Detected x86_64 architecture, copying x86_64 libraries..."
 	copy_core_libs /lib64/ld-linux-x86-64.so.2 x86_64-linux-gnu
 elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
 	echo "Detected ARM64 architecture, copying ARM64 libraries..."
 	copy_core_libs /lib/ld-linux-aarch64.so.1 aarch64-linux-gnu
 elif [ "$(uname -s)" = "Darwin" ]; then
 	echo "Detected Darwin"
 else
 	echo "Error: Could not detect architecture"
 	exit 1
 fi
 # GPU runtime libraries are delegated to the repository-wide helper, if present.
 GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
 if [ -f "$GPU_LIB_SCRIPT" ]; then
 	echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
 	source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib"
 	package_gpu_libs
 fi
 echo "Packaging completed successfully"
 ls -liah "$CURDIR/package/" "$CURDIR/package/lib/"
--- a/backend/go/ced/run.sh
+++ b/backend/go/ced/run.sh
@@ -0,0 +1,15 @@
 #!/bin/bash
 # Launcher for the packaged ced backend: puts the bundled lib/ directory and
 # the package directory at the front of LD_LIBRARY_PATH, then replaces this
 # shell with ced-grpc, forwarding all arguments.
 set -e
 CURDIR=$(dirname "$(realpath "$0")")
 export LD_LIBRARY_PATH="$CURDIR/lib:$CURDIR:${LD_LIBRARY_PATH:-}"
 # No packaged loader: start the binary directly.
 if [ ! -f "$CURDIR/lib/ld.so" ]; then
 	exec "$CURDIR/ced-grpc" "$@"
 fi
 # A self-contained ld.so was packaged, so route through it: the packaged
 # libc / libstdc++ are then used instead of the host's (matches the sibling
 # backends).
 echo "Using lib/ld.so"
 exec "$CURDIR/lib/ld.so" "$CURDIR/ced-grpc" "$@"
--- a/backend/go/crispasr/Makefile
+++ b/backend/go/crispasr/Makefile
@@ -67,7 +67,7 @@ sources/CrispASR:
 	# it, so ${CMAKE_SOURCE_DIR} is THIS backend dir and the talk-llama sources
 	# aren't found. Rewrite to ${PROJECT_SOURCE_DIR} (the crispasr project root),
 	# which is correct both standalone and as a subproject. Idempotent.
-	sed -i 's#\$${CMAKE_SOURCE_DIR}/examples/talk-llama#\$${PROJECT_SOURCE_DIR}/examples/talk-llama#' sources/CrispASR/src/CMakeLists.txt
+	sed -i.bak 's#\$${CMAKE_SOURCE_DIR}/examples/talk-llama#\$${PROJECT_SOURCE_DIR}/examples/talk-llama#' sources/CrispASR/src/CMakeLists.txt && rm -f sources/CrispASR/src/CMakeLists.txt.bak
 # Detect OS
 UNAME_S := $(shell uname -s)
--- a/backend/go/crispasr/cpp/crispasr_shim.cpp
+++ b/backend/go/crispasr/cpp/crispasr_shim.cpp
@@ -47,6 +47,74 @@ extern "C" void set_abort(int v) {
  g_abort.store(v, std::memory_order_relaxed);
 }
 // --- word-level timestamp accessors ---
 extern "C" {
 int crispasr_session_result_n_words(crispasr_session_result *r, int seg_i);
 const char *crispasr_session_result_word_text(crispasr_session_result *r,
                                               int seg_i, int word_i);
 int64_t crispasr_session_result_word_t0(crispasr_session_result *r, int seg_i,
                                         int word_i);
 int64_t crispasr_session_result_word_t1(crispasr_session_result *r, int seg_i,
                                         int word_i);
 // Parakeet-specific word accessors
 int crispasr_parakeet_result_n_words(void *r);
 const char *crispasr_parakeet_result_word_text(void *r, int word_i);
 int64_t crispasr_parakeet_result_word_t0(void *r, int word_i);
 int64_t crispasr_parakeet_result_word_t1(void *r, int word_i);
 }
 // Returns the current g_result pointer as an opaque handle (may be null).
 void *get_result(void) { return g_result; }
 // Number of words in segment seg_i of the current result; 0 when g_result is
 // null. Otherwise forwards to crispasr_session_result_n_words.
 int get_word_count(int seg_i) {
  if (!g_result)
    return 0;
  return crispasr_session_result_n_words(g_result, seg_i);
 }
 // Text of word word_i in segment seg_i; "" when g_result is null. Index
 // validation is left to crispasr_session_result_word_text.
 const char *get_word_text(int seg_i, int word_i) {
  if (!g_result)
    return "";
  return crispasr_session_result_word_text(g_result, seg_i, word_i);
 }
 // Start timestamp (t0) of word word_i in segment seg_i as reported by the
 // session API; 0 when g_result is null.
 int64_t get_word_t0(int seg_i, int word_i) {
  if (!g_result)
    return 0;
  return crispasr_session_result_word_t0(g_result, seg_i, word_i);
 }
 // End timestamp (t1) of word word_i in segment seg_i as reported by the
 // session API; 0 when g_result is null.
 int64_t get_word_t1(int seg_i, int word_i) {
  if (!g_result)
    return 0;
  return crispasr_session_result_word_t1(g_result, seg_i, word_i);
 }
 // Parakeet-specific word accessors
 // These take no segment index and pass g_result to the
 // crispasr_parakeet_result_* functions, which are declared with a void*
 // parameter. NOTE(review): presumably g_result holds a parakeet result when
 // these are meaningful - confirm against the upstream API.
 // Word count from the parakeet accessor; 0 when g_result is null.
 int get_parakeet_word_count(void) {
  if (!g_result)
    return 0;
  return crispasr_parakeet_result_n_words(g_result);
 }
 // Text of word word_i from the parakeet accessor; "" when g_result is null.
 const char *get_parakeet_word_text(int word_i) {
  if (!g_result)
    return "";
  return crispasr_parakeet_result_word_text(g_result, word_i);
 }
 // Start timestamp (t0) of word word_i from the parakeet accessor; 0 when
 // g_result is null.
 int64_t get_parakeet_word_t0(int word_i) {
  if (!g_result)
    return 0;
  return crispasr_parakeet_result_word_t0(g_result, word_i);
 }
 // End timestamp (t1) of word word_i from the parakeet accessor; 0 when
 // g_result is null.
 int64_t get_parakeet_word_t1(int word_i) {
  if (!g_result)
    return 0;
  return crispasr_parakeet_result_word_t1(g_result, word_i);
 }
 static void ggml_log_cb(enum ggml_log_level level, const char *log,
                        void *data) {
  const char *level_str;
--- a/backend/go/crispasr/cpp/crispasr_shim.h
+++ b/backend/go/crispasr/cpp/crispasr_shim.h
@@ -20,4 +20,18 @@ float *tts_synthesize(const char *text, int *out_n_samples); // 24kHz mono float
 void tts_free(float *pcm);
 int tts_set_voice(const char *name); // best-effort speaker selection; 0 ok
 int tts_set_voice_file(const char *path, const char *ref_text); // load voice pack (.gguf) or zero-shot clone (.wav + ref_text)
 // --- word-level timestamp accessors ---
 // Session-based (works for whisper-like backends)
 void *get_result(void);
 int get_word_count(int seg_i);
 const char *get_word_text(int seg_i, int word_i);
 int64_t get_word_t0(int seg_i, int word_i);
 int64_t get_word_t1(int seg_i, int word_i);
 // Parakeet-specific (global word list, no segment index)
 int get_parakeet_word_count(void);
 const char *get_parakeet_word_text(int word_i);
 int64_t get_parakeet_word_t0(int word_i);
 int64_t get_parakeet_word_t1(int word_i);
 }
--- a/backend/go/crispasr/gocrispasr.go
+++ b/backend/go/crispasr/gocrispasr.go
@@ -34,6 +34,18 @@ var (
 	CppTTSFree         func(ptr uintptr)
 	CppTTSSetVoice     func(name string) int
 	CppTTSSetVoiceFile func(path string, refText string) int
 	// Word-level timestamp accessors (session-based, per-segment)
 	CppGetWordCount func(segI int) int
 	CppGetWordText  func(segI int, wordI int) string
 	CppGetWordT0    func(segI int, wordI int) int64
 	CppGetWordT1    func(segI int, wordI int) int64
 	// Parakeet-specific word accessors (global, no segment index)
 	CppGetParakeetWordCount func() int
 	CppGetParakeetWordText  func(wordI int) string
 	CppGetParakeetWordT0    func(wordI int) int64
 	CppGetParakeetWordT1    func(wordI int) int64
 )
 type CrispASR struct {
@@ -212,6 +224,28 @@ func (w *CrispASR) VAD(req *pb.VADRequest) (pb.VADResponse, error) {
 	}, nil
 }
 // isValidWord reports whether a TranscriptWord carries recognisable speech.
 // The parakeet-specific word accessors can hand back stale initialisation
 // data (model name, binary blobs) for a segment without real speech, so a
 // word is accepted only if all of the following hold:
 //   - its text is non-empty once surrounding whitespace is trimmed,
 //   - the text has no U+FFFD replacement character (left by binary scrubbing),
 //   - neither timestamp is negative,
 //   - its duration is positive (End > Start).
 func isValidWord(w *pb.TranscriptWord) bool {
 	text := strings.TrimSpace(w.Text)
 	if text == "" || strings.ContainsRune(text, '\uFFFD') {
 		return false
 	}
 	return w.Start >= 0 && w.End >= 0 && w.End > w.Start
 }
 func (w *CrispASR) AudioTranscription(ctx context.Context, opts *pb.TranscriptRequest) (pb.TranscriptResult, error) {
 	if err := ctx.Err(); err != nil {
 		return pb.TranscriptResult{}, status.Error(codes.Canceled, "transcription cancelled")
@@ -290,15 +324,54 @@ func (w *CrispASR) AudioTranscription(ctx context.Context, opts *pb.TranscriptRe
 		// IDs, so Tokens is left empty.
 		txt := strings.ToValidUTF8(strings.Clone(CppGetSegmentText(i)), "<22>")
 		// Populate word-level timestamps. Try session-based functions first
 		// (per-segment); fall back to parakeet-specific functions (global word
 		// list with no segment index — only populated on the first segment to
 		// avoid duplication).
 		words := []*pb.TranscriptWord{}
 		wordCount := CppGetWordCount(i)
 		if wordCount == 0 && i == 0 {
 			wordCount = CppGetParakeetWordCount()
 			for j := 0; j < wordCount; j++ {
 				w := &pb.TranscriptWord{
 					Start: CppGetParakeetWordT0(j) * (10000000),
 					End:   CppGetParakeetWordT1(j) * (10000000),
 					Text:  strings.ToValidUTF8(strings.Clone(CppGetParakeetWordText(j)), "<22>"),
 				}
 				if isValidWord(w) {
 					words = append(words, w)
 				}
 			}
 		} else {
 			for j := 0; j < wordCount; j++ {
 				w := &pb.TranscriptWord{
 					Start: CppGetWordT0(i, j) * (10000000),
 					End:   CppGetWordT1(i, j) * (10000000),
 					Text:  strings.ToValidUTF8(strings.Clone(CppGetWordText(i, j)), "<22>"),
 				}
 				if isValidWord(w) {
 					words = append(words, w)
 				}
 			}
 		}
 		// Skip empty segments with no recognisable content (e.g. trailing
 		// silence segments that parakeet emits with stale init data).
 		trimmed := strings.TrimSpace(txt)
 		if trimmed == "" && len(words) == 0 {
 			continue
 		}
 		segment := &pb.TranscriptSegment{
 			Id:    int32(i),
 			Text:  txt,
 			Start: s, End: t,
 			Words: words,
 		}
 		segments = append(segments, segment)
-		text += " " + strings.TrimSpace(txt)
+		text += " " + trimmed
 	}
 	return pb.TranscriptResult{
@@ -390,13 +463,20 @@ func (w *CrispASR) AudioTranscriptionStream(ctx context.Context, opts *pb.Transc
 		s := CppGetSegmentStart(i) * 10000000
 		t := CppGetSegmentEnd(i) * 10000000
 		txt := strings.ToValidUTF8(strings.Clone(CppGetSegmentText(i)), "<22>")
 		// Skip empty segments (e.g. trailing silence that parakeet emits
 		// with stale init data).
 		trimmed := strings.TrimSpace(txt)
 		if trimmed == "" && s == t {
 			continue
 		}
 		segments = append(segments, &pb.TranscriptSegment{
 			Id:    int32(i),
 			Text:  txt,
 			Start: s, End: t,
 		})
 		trimmed := strings.TrimSpace(txt)
 		if trimmed == "" {
 			continue
 		}
--- a/backend/go/crispasr/main.go
+++ b/backend/go/crispasr/main.go
@@ -44,6 +44,14 @@ func main() {
 		{&CppTTSFree, "tts_free"},
 		{&CppTTSSetVoice, "tts_set_voice"},
 		{&CppTTSSetVoiceFile, "tts_set_voice_file"},
 		{&CppGetWordCount, "get_word_count"},
 		{&CppGetWordText, "get_word_text"},
 		{&CppGetWordT0, "get_word_t0"},
 		{&CppGetWordT1, "get_word_t1"},
 		{&CppGetParakeetWordCount, "get_parakeet_word_count"},
 		{&CppGetParakeetWordText, "get_parakeet_word_text"},
 		{&CppGetParakeetWordT0, "get_parakeet_word_t0"},
 		{&CppGetParakeetWordT1, "get_parakeet_word_t1"},
 	}
 	for _, lf := range libFuncs {
--- a/backend/go/depth-anything-cpp/Makefile
+++ b/backend/go/depth-anything-cpp/Makefile
@@ -8,11 +8,13 @@ JOBS?=$(shell nproc --ignore=1)
 # depth-anything.cpp. Pin to a specific commit for a stable build; a squash
 # merge upstream can orphan a branch, so the native version is pinned by SHA.
-# This SHA adds the nested two-file metric C-API (abi_version 4,
+# This SHA adds the Depth Anything V2 engine + C-API routing (depth-only,
-# da_capi_load_nested) required by the depth-anything-3-nested gallery model;
+# relative + metric) on top of the nested two-file metric C-API (abi_version 4,
-# tag it (e.g. v0.1.3) upstream to keep the SHA alive.
+# da_capi_load_nested) required by the depth-anything-3-nested gallery model.
 # It is kept alive by the upstream tag da2-support (survives a squash-merge);
 # repoint to the master merge commit once mudler/depth-anything.cpp PR #1 lands.
 DEPTHANYTHING_REPO?=https://github.com/mudler/depth-anything.cpp.git
-DEPTHANYTHING_VERSION?=cce5edc395fd1843806093d7ccc0c8b0d0b97b72
+DEPTHANYTHING_VERSION?=f4e17dea695dd12ae76bea98ba58030996b98118
 ifeq ($(NATIVE),false)
 	CMAKE_ARGS+=-DGGML_NATIVE=OFF
--- a/backend/go/face-detect/.gitignore
+++ b/backend/go/face-detect/.gitignore
@@ -0,0 +1,18 @@
 # Fetched upstream sources
 sources/
 # CMake build directories
 build*/
 # build artifacts staged in-tree by the Makefile (cp from sources/) or
 # symlinked for local dev; the real sources live in face-detect.cpp upstream.
 *.so
 *.so.*
 facedetect_capi.h
 compile_commands.json
 # Compiled backend binary
 face-detect-grpc
 # Packaging output
 package/
--- a/backend/go/face-detect/Makefile
+++ b/backend/go/face-detect/Makefile
@@ -0,0 +1,110 @@
 # face-detect backend Makefile.
 #
 # Upstream pin lives below as FACEDETECT_VERSION?=6107a24... (.github/bump_deps.sh
 # can find and update it - matches the voice-detect / parakeet.cpp / whisper.cpp
 # convention).
 #
 # Local dev shortcut: if you already have an out-of-tree face-detect.cpp build,
 # symlink the .so + header into this directory and skip the clone/cmake steps:
 #
 #   ln -sf /path/to/face-detect.cpp/build-shared/libfacedetect.so .
 #   ln -sf /path/to/face-detect.cpp/include/facedetect_capi.h .
 #   go build -o face-detect-grpc .
 #
 # The default target below does the proper clone-at-pin + cmake build so CI does
 # not need a side-checkout.
 FACEDETECT_VERSION?=6107a2414fdaccc9ce8650b762f9436d20541cbe
 FACEDETECT_REPO?=https://github.com/mudler/face-detect.cpp
 GOCMD?=go
 GO_TAGS?=
 JOBS?=$(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)
 BUILD_TYPE?=
 NATIVE?=false
 # Resolve the target arch. The backend matrix / Docker build pass TARGETARCH
 # (amd64|arm64); fall back to uname -m (aarch64|x86_64) for a local build.
 RECON_ARCH?=$(or $(TARGETARCH),$(shell uname -m))
 # Build ggml + the vendored libjpeg-turbo statically into libfacedetect.so (PIC)
 # so the shared lib is self-contained: dlopen needs no libggml*.so alongside it,
 # only system libs (libstdc++/libgomp/libc) the runtime image already provides.
 # The vendored jpeg symbols are hidden via -Wl,--exclude-libs,ALL on the C++
 # side, so only the facedetect_capi_* surface is exported.
 CMAKE_ARGS?=-DCMAKE_BUILD_TYPE=Release -DFACEDETECT_SHARED=ON -DFACEDETECT_BUILD_CLI=OFF -DFACEDETECT_BUILD_TESTS=OFF -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON
 ifeq ($(NATIVE),false)
 	CMAKE_ARGS+=-DGGML_NATIVE=OFF
 endif
 # face-detect.cpp gates its GGML backends behind FACEDETECT_GGML_* options and
 # does set(GGML_CUDA ${FACEDETECT_GGML_CUDA} CACHE BOOL "" FORCE), so a bare
 # -DGGML_CUDA=ON is overwritten back to OFF. Forward the FACEDETECT_GGML_*
 # options instead. (openblas is not gated, so -DGGML_BLAS passes through.)
 ifeq ($(BUILD_TYPE),cublas)
 	CMAKE_ARGS+=-DFACEDETECT_GGML_CUDA=ON
 	# Opt-in cuDNN implicit-GEMM conv path (kills im2col on GPU, SCRFD 2.3x
 	# vs torch-cuDNN parity). Only the arm64 + CUDA 13 image (GB10/Jetson/L4T)
 	# ships libcudnn9 + the -dev headers, so gate cuDNN to that variant.
 	# x86 CUDA images carry no cuDNN -> enabling it there is a link failure.
 	ifeq ($(CUDA_MAJOR_VERSION),13)
 	ifneq (,$(filter arm64 aarch64,$(RECON_ARCH)))
 		CMAKE_ARGS+=-DFACEDETECT_GGML_CUDNN=ON
 	endif
 	endif
 else ifeq ($(BUILD_TYPE),openblas)
 	CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
 else ifeq ($(BUILD_TYPE),hipblas)
 	CMAKE_ARGS+=-DFACEDETECT_GGML_HIP=ON
 else ifeq ($(BUILD_TYPE),vulkan)
 	CMAKE_ARGS+=-DFACEDETECT_GGML_VULKAN=ON
 else ifeq ($(BUILD_TYPE),metal)
 	CMAKE_ARGS+=-DFACEDETECT_GGML_METAL=ON
 endif
 .PHONY: face-detect-grpc package build clean purge test all
 all: face-detect-grpc
 # Clone the upstream face-detect.cpp source at the pinned commit. Directory acts
 # as the target so make only re-clones when missing. After a FACEDETECT_VERSION
 # bump, run 'make purge && make' to refetch.
 sources/face-detect.cpp:
 	mkdir -p sources/face-detect.cpp
 	cd sources/face-detect.cpp && \
 	git init -q && \
 	git remote add origin $(FACEDETECT_REPO) && \
 	git fetch --depth 1 origin $(FACEDETECT_VERSION) && \
 	git checkout FETCH_HEAD && \
 	git submodule update --init --recursive --depth 1 --single-branch
 # Build the shared lib + header out-of-tree, then stage them next to the Go
 # sources so purego.Dlopen("libfacedetect.so") and the cgo-less build both pick
 # them up.
 libfacedetect.so: sources/face-detect.cpp
 	cmake -B sources/face-detect.cpp/build-shared -S sources/face-detect.cpp $(CMAKE_ARGS)
 	cmake --build sources/face-detect.cpp/build-shared --config Release -j$(JOBS) --target facedetect
 	cp -fv sources/face-detect.cpp/build-shared/libfacedetect.so* ./ 2>/dev/null || true
 	cp -fv sources/face-detect.cpp/include/facedetect_capi.h ./
 face-detect-grpc: libfacedetect.so main.go gofacedetect.go options.go
 	CGO_ENABLED=0 $(GOCMD) build -tags "$(GO_TAGS)" -o face-detect-grpc .
 package: face-detect-grpc
 	bash package.sh
 build: package
 # Test target. The embed/detect/verify/analyze smoke specs are gated on
 # FACEDETECT_BACKEND_TEST_MODEL + FACEDETECT_BACKEND_TEST_IMAGE; without them the
 # heavy specs auto-skip and only the pure-Go parsing specs run.
 test:
 	LD_LIBRARY_PATH=$(CURDIR):$$LD_LIBRARY_PATH $(GOCMD) test ./... -count=1
 clean: purge
 	rm -rf libfacedetect.so* facedetect_capi.h package face-detect-grpc
 purge:
 	rm -rf sources/face-detect.cpp
--- a/backend/go/face-detect/gofacedetect.go
+++ b/backend/go/face-detect/gofacedetect.go
@@ -0,0 +1,431 @@
 package main
 import (
 	"encoding/base64"
 	"encoding/json"
 	"errors"
 	"fmt"
 	"math"
 	"os"
 	"path/filepath"
 	"strconv"
 	"strings"
 	"time"
 	"unsafe"
 	"github.com/mudler/LocalAI/pkg/grpc/base"
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 	"github.com/mudler/xlog"
 )
 // purego-bound entry points from libfacedetect.so. Names match
 // facedetect_capi.h exactly so a `nm libfacedetect.so | grep facedetect_capi`
 // is enough to spot drift.
 //
 // The opaque ctx and the malloc'd char*/float* return values are declared as
 // uintptr so we get the raw pointer back and can release it via the matching
 // capi free function. purego's native string/[]float32 returns would copy and
 // forget the original pointer, leaking the C-owned buffer on every call.
 var (
 	CppAbiVersion  func() int32
 	CppLoad        func(ggufPath string) uintptr
 	CppFree        func(ctx uintptr)
 	CppLastError   func(ctx uintptr) string
 	CppFreeString  func(s uintptr)
 	CppFreeVec     func(v uintptr)
 	CppEmbedPath   func(ctx uintptr, imagePath string, outVec, outDim unsafe.Pointer) int32
 	CppEmbedRGB    func(ctx uintptr, rgb []byte, width, height int32, outVec, outDim unsafe.Pointer) int32
 	CppDetectJSON  func(ctx uintptr, imagePath string) uintptr
 	CppVerifyPaths func(ctx uintptr, a, b string, threshold float32, antiSpoof int32, outDistance, outVerified unsafe.Pointer) int32
 	CppAnalyzeJSON func(ctx uintptr, imagePath string) uintptr
 )
 // FaceDetect implements the face-recognition (biometric) subset of the Backend
 // gRPC service over libfacedetect.so. The C side keeps a single loaded model
 // pack plus a per-ctx last-error buffer and is not reentrant, so
 // base.SingleThread serializes every call.
 type FaceDetect struct {
 	base.SingleThread
 	opts   loadOptions
 	ctxPtr uintptr
 }
 // Load resolves the model path, parses the backend options, forwards LocalAI's
 // thread budget to the engine and opens the C-API context.
 //
 // Path resolution: a relative ModelFile is joined onto ModelPath (when set);
 // an absolute ModelFile is used as-is; an empty ModelFile falls back to
 // ModelPath unchanged. It returns an error when neither is set, when
 // FACEDETECT_THREADS cannot be exported, or when facedetect_capi_load returns
 // a null context.
 func (f *FaceDetect) Load(opts *pb.ModelOptions) error {
 	model := opts.ModelFile
 	switch {
 	case model == "":
 		// Fallback value is already the full location: joining it onto
 		// ModelPath again would double a relative path.
 		model = opts.ModelPath
 	case !filepath.IsAbs(model) && opts.ModelPath != "":
 		model = filepath.Join(opts.ModelPath, model)
 	}
 	if model == "" {
 		return errors.New("face-detect: ModelFile is required")
 	}
 	f.opts = parseOptions(opts.Options)
 	if f.opts.modelName == "" {
 		f.opts.modelName = filepath.Base(model)
 	}
 	// Propagate LocalAI's per-model thread budget to the engine. LocalAI spawns
 	// one backend process per model and serves requests concurrently, so the
 	// engine's own min(hardware_concurrency, 8) default can oversubscribe cores.
 	// FACEDETECT_THREADS is read by the engine at backend construction, so it
 	// must be set before the capi load. A non-positive Threads means "unset":
 	// leave the env alone so the engine keeps its sane default.
 	threads := opts.Threads
 	if threads > 0 {
 		if err := os.Setenv("FACEDETECT_THREADS", strconv.Itoa(int(threads))); err != nil {
 			return fmt.Errorf("face-detect: set FACEDETECT_THREADS: %w", err)
 		}
 		xlog.Info("face-detect: applying LocalAI thread budget", "threads", threads)
 	}
 	xlog.Info("face-detect: loading model", "model", model,
 		"verify_threshold", f.opts.verifyThreshold, "abi", CppAbiVersion())
 	ctx := CppLoad(model)
 	if ctx == 0 {
 		// The last-error buffer lives on the ctx that was never returned, so
 		// surface the path the operator tried to load instead.
 		return fmt.Errorf("face-detect: facedetect_capi_load failed for %q", model)
 	}
 	f.ctxPtr = ctx
 	return nil
 }
 // Embeddings computes the embedding of the primary face in the request image.
 // Like the Python face backend, the image arrives in Images[0] as a base64
 // payload; materializeImage turns it into a temporary file so the path-based
 // C-API can do its own decoding (cv2.imread parity). The temporary file is
 // cleaned up before returning, and the gRPC server wraps the returned slice in
 // an EmbeddingResult.
 func (f *FaceDetect) Embeddings(req *pb.PredictOptions) ([]float32, error) {
 	switch {
 	case f.ctxPtr == 0:
 		return nil, errors.New("face-detect: model not loaded")
 	case len(req.Images) == 0 || req.Images[0] == "":
 		return nil, errors.New("face-detect: Embedding requires Images[0] to be a base64 image")
 	}
 	imgPath, release, err := materializeImage(req.Images[0])
 	if err != nil {
 		return nil, err
 	}
 	defer release()
 	return f.embedPath(imgPath)
 }
 // embedPath runs the path-based embed entry point on the image at path and
 // returns a Go-owned copy of the resulting vector. It returns the engine's
 // last error when the call fails or yields no usable vector.
 func (f *FaceDetect) embedPath(path string) ([]float32, error) {
 	var vec uintptr
 	var dim int32
 	rc := CppEmbedPath(f.ctxPtr, path, unsafe.Pointer(&vec), unsafe.Pointer(&dim))
 	if rc != 0 {
 		return nil, f.lastErr("embed", path)
 	}
 	// On success the vector belongs to us. Release it on every path from here
 	// on, including a non-positive dim, which previously leaked the buffer.
 	if vec != 0 {
 		defer CppFreeVec(vec)
 	}
 	if vec == 0 || dim <= 0 {
 		return nil, f.lastErr("embed", path)
 	}
 	// Copy out of the C-owned malloc'd buffer before freeing it. The
 	// uintptr->Pointer conversion trips vet's unsafeptr check, which can't tell
 	// a C heap pointer from Go-managed memory; safe here, the GC neither tracks
 	// nor moves this buffer and we copy immediately.
 	src := unsafe.Slice((*float32)(unsafe.Pointer(vec)), int(dim)) //nolint:govet // C-owned malloc'd vector, copied out before free
 	out := make([]float32, int(dim))
 	copy(out, src)
 	return out, nil
 }
 // Detect runs face detection on the request image and returns one Detection
 // per face, each labelled "face". The C-API reports a box as [x1,y1,x2,y2] in
 // pixels while the proto carries x/y plus width/height, so the corners are
 // converted via xywh. The 5 facial landmarks the engine also returns are
 // dropped because the Detection message has no field for them. With a positive
 // req.Threshold, faces scoring below it are left out.
 func (f *FaceDetect) Detect(req *pb.DetectOptions) (pb.DetectResponse, error) {
 	switch {
 	case f.ctxPtr == 0:
 		return pb.DetectResponse{}, errors.New("face-detect: model not loaded")
 	case req.Src == "":
 		return pb.DetectResponse{}, errors.New("face-detect: src image is required")
 	}
 	imgPath, release, err := materializeImage(req.Src)
 	if err != nil {
 		return pb.DetectResponse{}, err
 	}
 	defer release()
 	faces, err := f.detectFaces(imgPath)
 	if err != nil {
 		return pb.DetectResponse{}, err
 	}
 	filter := req.Threshold > 0
 	found := make([]*pb.Detection, 0, len(faces))
 	for _, face := range faces {
 		if filter && face.Score < req.Threshold {
 			continue
 		}
 		left, top, width, height := face.xywh()
 		det := &pb.Detection{
 			X:          left,
 			Y:          top,
 			Width:      width,
 			Height:     height,
 			Confidence: face.Score,
 			ClassName:  "face",
 		}
 		found = append(found, det)
 	}
 	return pb.DetectResponse{Detections: found}, nil
 }
 // FaceVerify compares the primary face of two images and reports whether they
 // show the same identity, judged by cosine distance against a threshold. A
 // request threshold <= 0 falls back to the model-configured default
 // (verify_threshold option, 0.35 if unset). With anti_spoofing set, the C-API
 // applies a MiniFASNet veto internally (verified forced false on a spoof); the
 // verify entry point does not expose per-image liveness scores, so
 // img*_is_real / img*_antispoof_score keep their zero values.
 // ProcessingTimeMs covers only the C verify call.
 func (f *FaceDetect) FaceVerify(req *pb.FaceVerifyRequest) (pb.FaceVerifyResponse, error) {
 	switch {
 	case f.ctxPtr == 0:
 		return pb.FaceVerifyResponse{}, errors.New("face-detect: model not loaded")
 	case req.Img1 == "" || req.Img2 == "":
 		return pb.FaceVerifyResponse{}, errors.New("face-detect: img1 and img2 are required")
 	}
 	imgA, releaseA, err := materializeImage(req.Img1)
 	if err != nil {
 		return pb.FaceVerifyResponse{}, err
 	}
 	defer releaseA()
 	imgB, releaseB, err := materializeImage(req.Img2)
 	if err != nil {
 		return pb.FaceVerifyResponse{}, err
 	}
 	defer releaseB()
 	limit := req.Threshold
 	if limit <= 0 {
 		limit = f.opts.verifyThreshold
 	}
 	var spoofFlag int32
 	if req.AntiSpoofing {
 		spoofFlag = 1
 	}
 	var (
 		dist float32
 		same int32
 	)
 	begin := time.Now()
 	rc := CppVerifyPaths(f.ctxPtr, imgA, imgB, limit, spoofFlag,
 		unsafe.Pointer(&dist), unsafe.Pointer(&same))
 	if rc != 0 {
 		// Only a short prefix of the (possibly base64) input goes into the error.
 		return pb.FaceVerifyResponse{}, f.lastErr("verify", req.Img1[:min(8, len(req.Img1))]+"...")
 	}
 	tookMs := float32(time.Since(begin).Seconds() * 1000.0)
 	// Confidence falls linearly from 100 at distance 0 to 0 at the threshold,
 	// matching the Python face backend's reporting.
 	var confidence float32
 	if limit > 0 {
 		pct := (1.0 - float64(dist)/float64(limit)) * 100.0
 		confidence = float32(math.Max(0, math.Min(100, pct)))
 	}
 	areaA := f.bestArea(imgA)
 	areaB := f.bestArea(imgB)
 	return pb.FaceVerifyResponse{
 		Verified:         same != 0,
 		Distance:         dist,
 		Threshold:        limit,
 		Confidence:       confidence,
 		Model:            f.opts.modelName,
 		Img1Area:         areaA,
 		Img2Area:         areaB,
 		ProcessingTimeMs: tookMs,
 	}, nil
 }
 // FaceAnalyze runs the genderage head over every face detected in req.Img.
 //
 // The image is materialized to a path, passed to the C-API analyze entry point,
 // and the JSON document it returns is mapped onto FaceAnalysis entries. The
 // C-API reports "M"/"F" gender labels and a rounded age; the labels are
 // normalized to the "Man"/"Woman" values the proto documents.
 //
 // Errors are returned when no model is loaded, when img is empty or cannot be
 // materialized, when the C-API returns a null document, or when that document
 // is not valid JSON.
 func (f *FaceDetect) FaceAnalyze(req *pb.FaceAnalyzeRequest) (pb.FaceAnalyzeResponse, error) {
 	switch {
 	case f.ctxPtr == 0:
 		return pb.FaceAnalyzeResponse{}, errors.New("face-detect: model not loaded")
 	case req.Img == "":
 		return pb.FaceAnalyzeResponse{}, errors.New("face-detect: img is required")
 	}
 	imagePath, release, err := materializeImage(req.Img)
 	if err != nil {
 		return pb.FaceAnalyzeResponse{}, err
 	}
 	defer release()
 	raw := CppAnalyzeJSON(f.ctxPtr, imagePath)
 	if raw == 0 {
 		return pb.FaceAnalyzeResponse{}, f.lastErr("analyze", imagePath)
 	}
 	// The returned string is owned by this caller and released once parsed.
 	defer CppFreeString(raw)
 	analyses, parseErr := parseAnalyzeJSON(goStringFromCPtr(raw))
 	if parseErr != nil {
 		return pb.FaceAnalyzeResponse{}, fmt.Errorf("face-detect: analyze JSON: %w", parseErr)
 	}
 	return pb.FaceAnalyzeResponse{Faces: analyses}, nil
 }
 // faceBox mirrors a single face entry in the JSON documents produced by the
 // engine's detect and analyze calls. Age and Gender are only populated by
 // documents that carry those keys.
 type faceBox struct {
 	Score  float32   `json:"score"`
 	Box    []float32 `json:"box"`
 	Age    float32   `json:"age"`
 	Gender string    `json:"gender"`
 }
 // xywh turns the engine's corner-form box [x1,y1,x2,y2] into the origin plus
 // width/height form the proto carries. All four results are zero when the box
 // has fewer than four values.
 func (b faceBox) xywh() (x, y, w, h float32) {
 	if len(b.Box) >= 4 {
 		x, y = b.Box[0], b.Box[1]
 		w, h = b.Box[2]-x, b.Box[3]-y
 	}
 	return x, y, w, h
 }
 // facesJSON is the top-level shape decoded from the engine's JSON documents: a
 // single "faces" array of faceBox entries.
 type facesJSON struct {
 	Faces []faceBox `json:"faces"`
 }
 // detectFaces asks the C-API for the detect JSON document of the image at path
 // and decodes its "faces" array. A null document is reported through the
 // engine's last-error buffer; a document that fails to decode is returned as a
 // wrapped JSON error.
 func (f *FaceDetect) detectFaces(path string) ([]faceBox, error) {
 	raw := CppDetectJSON(f.ctxPtr, path)
 	if raw == 0 {
 		return nil, f.lastErr("detect", path)
 	}
 	// Release the C string once its contents have been copied and decoded.
 	defer CppFreeString(raw)
 	var parsed facesJSON
 	decodeErr := json.Unmarshal([]byte(goStringFromCPtr(raw)), &parsed)
 	if decodeErr != nil {
 		return nil, fmt.Errorf("face-detect: detect JSON: %w", decodeErr)
 	}
 	return parsed.Faces, nil
 }
 // bestArea reports the region of the highest-scoring face found in the image at
 // path. It is deliberately best-effort: by the time it runs verify has already
 // succeeded, so a failed or empty detection yields an empty FacialArea rather
 // than an error. When several faces tie on score the earliest one wins.
 func (f *FaceDetect) bestArea(path string) *pb.FacialArea {
 	faces, err := f.detectFaces(path)
 	if err != nil || len(faces) == 0 {
 		return &pb.FacialArea{}
 	}
 	top := 0
 	for i := 1; i < len(faces); i++ {
 		if faces[i].Score > faces[top].Score {
 			top = i
 		}
 	}
 	x, y, w, h := faces[top].xywh()
 	return &pb.FacialArea{X: x, Y: y, W: w, H: h}
 }
 // parseAnalyzeJSON decodes the engine's analyze document and builds one
 // FaceAnalysis per entry of its "faces" array: region, face confidence and age
 // are copied over, and the engine's "M"/"F" gender code is rewritten to the
 // "Man"/"Woman" form the proto documents. That label fills both DominantGender
 // and a single-entry Gender score map; a face with no gender leaves both unset.
 // The decode error is returned unwrapped when doc is not valid JSON.
 func parseAnalyzeJSON(doc string) ([]*pb.FaceAnalysis, error) {
 	var payload facesJSON
 	if err := json.Unmarshal([]byte(doc), &payload); err != nil {
 		return nil, err
 	}
 	analyses := make([]*pb.FaceAnalysis, 0, len(payload.Faces))
 	for i := range payload.Faces {
 		face := &payload.Faces[i]
 		x, y, w, h := face.xywh()
 		entry := &pb.FaceAnalysis{
 			Region:         &pb.FacialArea{X: x, Y: y, W: w, H: h},
 			FaceConfidence: face.Score,
 			Age:            face.Age,
 		}
 		label := normalizeGender(face.Gender)
 		if label != "" {
 			entry.DominantGender = label
 			entry.Gender = map[string]float32{label: 1.0}
 		}
 		analyses = append(analyses, entry)
 	}
 	return analyses, nil
 }
 // normalizeGender translates the engine's gender code into the label the proto
 // documents: "M" becomes "Man" and "F" becomes "Woman", ignoring case and
 // surrounding whitespace. Blank input yields "". Any other value is returned
 // exactly as received, untrimmed.
 func normalizeGender(g string) string {
 	code := strings.TrimSpace(g)
 	if code == "" {
 		return ""
 	}
 	switch code {
 	case "M", "m":
 		return "Man"
 	case "F", "f":
 		return "Woman"
 	}
 	return g
 }
 // materializeImage resolves an image input to a filesystem path and returns
 // that path together with a cleanup func the caller must invoke when done.
 //
 // As a convenience for callers that already pass a filesystem path (e.g. a test
 // fixture), an input naming an existing non-directory path is used as-is with a
 // no-op cleanup. Anything else is treated as a base64 payload (standard padded
 // alphabet); a leading data: URI prefix is stripped up to the first comma. The
 // decoded bytes are written to a temp file that cleanup removes.
 //
 // On error the returned path is empty and cleanup is a safe no-op. Errors are
 // returned for empty input, for input that is neither a usable path nor valid
 // non-empty base64, and for temp-file create/write/close failures.
 //
 // NOTE(review): the path shortcut lets the caller name any readable local file;
 // confirm that is acceptable if src can originate from untrusted API clients.
 func materializeImage(src string) (path string, cleanup func(), err error) {
 	noop := func() {}
 	if src == "" {
 		return "", noop, errors.New("face-detect: empty image input")
 	}
 	// A directory can never be an image. Without the IsDir check a directory
 	// path, or a payload whose text happens to name one, would be handed to the
 	// engine as if it were an image file.
 	if info, statErr := os.Stat(src); statErr == nil && !info.IsDir() {
 		return src, noop, nil
 	}
 	payload := src
 	if strings.HasPrefix(payload, "data:") {
 		if _, body, found := strings.Cut(payload, ","); found {
 			payload = body
 		}
 	}
 	data, decErr := base64.StdEncoding.DecodeString(strings.TrimSpace(payload))
 	if decErr != nil || len(data) == 0 {
 		return "", noop, errors.New("face-detect: image is neither an existing path nor valid base64")
 	}
 	tmp, createErr := os.CreateTemp("", "face-detect-*.img")
 	if createErr != nil {
 		return "", noop, fmt.Errorf("face-detect: create temp image: %w", createErr)
 	}
 	tmpPath := tmp.Name()
 	cleanup = func() { _ = os.Remove(tmpPath) } // best-effort: a leftover temp file is not worth failing a request
 	if _, wErr := tmp.Write(data); wErr != nil {
 		_ = tmp.Close() // the write error is the one worth reporting
 		cleanup()
 		return "", noop, fmt.Errorf("face-detect: write temp image: %w", wErr)
 	}
 	if cErr := tmp.Close(); cErr != nil {
 		cleanup()
 		return "", noop, fmt.Errorf("face-detect: close temp image: %w", cErr)
 	}
 	return tmpPath, cleanup, nil
 }
 // lastErr builds a Go error from the C-API's per-ctx last-error buffer, naming
 // the operation that failed and the subject it was applied to. A blank buffer
 // is reported as "no error detail".
 func (f *FaceDetect) lastErr(op, subject string) error {
 	detail := strings.TrimSpace(CppLastError(f.ctxPtr))
 	if len(detail) == 0 {
 		detail = "no error detail"
 	}
 	return fmt.Errorf("face-detect: %s failed for %q: %s", op, subject, detail)
 }
 // goStringFromCPtr copies a NUL-terminated C string into Go memory. cptr is a
 // malloc'd buffer the caller owns; release it via CppFreeString after the copy.
 //
 // The uintptr->Pointer conversion trips vet's unsafeptr check, which can't tell
 // a C heap pointer from Go-managed memory. Safe here: the GC neither tracks nor
 // moves the buffer and we dereference it immediately to copy the bytes out.
 func goStringFromCPtr(cptr uintptr) string {
 	if cptr == 0 {
 		return ""
 	}
 	p := unsafe.Pointer(cptr) //nolint:govet // C-owned malloc'd buffer, not Go-GC memory (see doc above)
 	n := 0
 	for *(*byte)(unsafe.Add(p, n)) != 0 {
 		n++
 	}
 	return string(unsafe.Slice((*byte)(p), n))
 }
--- a/backend/go/face-detect/gofacedetect_test.go
+++ b/backend/go/face-detect/gofacedetect_test.go
@@ -0,0 +1,230 @@
 package main
 import (
 	"encoding/base64"
 	"os"
 	"sync"
 	"testing"
 	"github.com/ebitengine/purego"
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 )
 // TestFaceDetect is the go test entry point: it routes Gomega assertion
 // failures to Ginkgo's Fail and then runs every spec registered in this package.
 func TestFaceDetect(t *testing.T) {
 	RegisterFailHandler(Fail)
 	RunSpecs(t, "face-detect Backend Suite")
 }
 // State shared by ensureLibLoaded: libLoadOnce guards the single load attempt
 // and libLoadErr keeps the dlopen error from that attempt, if any.
 var (
 	libLoadOnce sync.Once
 	libLoadErr  error
 )
 // ensureLibLoaded performs the same bootstrap as main.go so a Go test can drive
 // the C-API bridge without starting the gRPC server. The library is opened at
 // most once; FACEDETECT_LIBRARY overrides the default libfacedetect.so name.
 // When the library cannot be opened (it is resolved via LD_LIBRARY_PATH or a
 // symlink in ./) the dlopen error is recorded and returned on every call, and
 // the smoke specs skip themselves.
 func ensureLibLoaded() error {
 	libLoadOnce.Do(func() {
 		libName := "libfacedetect.so"
 		if override := os.Getenv("FACEDETECT_LIBRARY"); override != "" {
 			libName = override
 		}
 		handle, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
 		if err != nil {
 			libLoadErr = err
 			return
 		}
 		// Same symbols, in the same order, as the table main.go registers.
 		bindings := []struct {
 			target any
 			symbol string
 		}{
 			{&CppAbiVersion, "facedetect_capi_abi_version"},
 			{&CppLoad, "facedetect_capi_load"},
 			{&CppFree, "facedetect_capi_free"},
 			{&CppLastError, "facedetect_capi_last_error"},
 			{&CppFreeString, "facedetect_capi_free_string"},
 			{&CppFreeVec, "facedetect_capi_free_vec"},
 			{&CppEmbedPath, "facedetect_capi_embed_path"},
 			{&CppEmbedRGB, "facedetect_capi_embed_rgb"},
 			{&CppDetectJSON, "facedetect_capi_detect_path_json"},
 			{&CppVerifyPaths, "facedetect_capi_verify_paths"},
 			{&CppAnalyzeJSON, "facedetect_capi_analyze_path_json"},
 		}
 		for _, b := range bindings {
 			purego.RegisterLibFunc(b.target, handle, b.symbol)
 		}
 	})
 	return libLoadErr
 }
 // parseOptions specs: the built-in default, the accepted keys (including the
 // "threshold" alias), and rejection of non-positive thresholds.
 var _ = Describe("parseOptions", func() {
 	It("defaults verify_threshold to 0.35", func() {
 		o := parseOptions(nil)
 		Expect(o.verifyThreshold).To(Equal(float32(0.35)))
 		Expect(o.modelName).To(Equal(""))
 	})
 	It("parses verify_threshold, threshold alias and model_name", func() {
 		// "unknown:x" checks that unrecognized keys are ignored rather than rejected.
 		o := parseOptions([]string{"verify_threshold:0.4", "model_name:buffalo_l", "unknown:x"})
 		Expect(o.verifyThreshold).To(Equal(float32(0.4)))
 		Expect(o.modelName).To(Equal("buffalo_l"))
 		o2 := parseOptions([]string{"threshold:0.3"})
 		Expect(o2.verifyThreshold).To(Equal(float32(0.3)))
 	})
 	It("ignores non-positive thresholds and keeps the default", func() {
 		o := parseOptions([]string{"verify_threshold:0", "threshold:-1"})
 		Expect(o.verifyThreshold).To(Equal(float32(0.35)))
 	})
 })
 // normalizeGender specs: case- and whitespace-insensitive M/F mapping, and
 // pass-through of empty or unrecognized codes.
 var _ = Describe("normalizeGender", func() {
 	It("maps M/F codes to Man/Woman", func() {
 		Expect(normalizeGender("M")).To(Equal("Man"))
 		Expect(normalizeGender("f")).To(Equal("Woman"))
 		Expect(normalizeGender(" m ")).To(Equal("Man"))
 	})
 	It("passes empty and unknown codes through", func() {
 		Expect(normalizeGender("")).To(Equal(""))
 		Expect(normalizeGender("nonbinary")).To(Equal("nonbinary"))
 	})
 })
 // faceBox.xywh specs: corner-form to x/y/width/height conversion, and the
 // all-zero result for a box with fewer than four values.
 var _ = Describe("faceBox.xywh", func() {
 	It("converts an [x1,y1,x2,y2] box to x/y/width/height", func() {
 		b := faceBox{Box: []float32{10, 20, 50, 80}}
 		x, y, w, h := b.xywh()
 		Expect(x).To(Equal(float32(10)))
 		Expect(y).To(Equal(float32(20)))
 		Expect(w).To(Equal(float32(40)))
 		Expect(h).To(Equal(float32(60)))
 	})
 	It("returns zeros for a short box", func() {
 		x, y, w, h := faceBox{Box: []float32{1, 2}}.xywh()
 		Expect([]float32{x, y, w, h}).To(Equal([]float32{0, 0, 0, 0}))
 	})
 })
 // parseAnalyzeJSON specs: field mapping for a two-face document, a face with no
 // gender key, an empty faces array, and malformed JSON.
 var _ = Describe("parseAnalyzeJSON", func() {
 	It("maps region, age and gender for each face", func() {
 		doc := `{"faces":[
 			{"score":0.997,"box":[10,20,50,80],"age":31,"gender":"M"},
 			{"score":0.81,"box":[0,0,40,40],"age":24,"gender":"F"}]}`
 		faces, err := parseAnalyzeJSON(doc)
 		Expect(err).ToNot(HaveOccurred())
 		Expect(faces).To(HaveLen(2))
 		Expect(faces[0].FaceConfidence).To(BeNumerically("~", 0.997, 1e-4))
 		Expect(faces[0].Age).To(BeNumerically("~", 31, 1e-4))
 		Expect(faces[0].DominantGender).To(Equal("Man"))
 		Expect(faces[0].Gender).To(HaveKeyWithValue("Man", float32(1.0)))
 		// Width/height are x2-x1 and y2-y1 of the first face's box.
 		Expect(faces[0].Region.W).To(Equal(float32(40)))
 		Expect(faces[0].Region.H).To(Equal(float32(60)))
 		Expect(faces[1].DominantGender).To(Equal("Woman"))
 	})
 	It("tolerates a missing gender field", func() {
 		faces, err := parseAnalyzeJSON(`{"faces":[{"score":0.5,"box":[0,0,10,10],"age":40}]}`)
 		Expect(err).ToNot(HaveOccurred())
 		Expect(faces).To(HaveLen(1))
 		Expect(faces[0].DominantGender).To(Equal(""))
 		Expect(faces[0].Gender).To(BeEmpty())
 	})
 	It("returns no faces for an empty document", func() {
 		faces, err := parseAnalyzeJSON(`{"faces":[]}`)
 		Expect(err).ToNot(HaveOccurred())
 		Expect(faces).To(BeEmpty())
 	})
 	It("returns an error on malformed JSON", func() {
 		_, err := parseAnalyzeJSON(`{not-json`)
 		Expect(err).To(HaveOccurred())
 	})
 })
 // materializeImage specs: plain base64, data: URI base64, an existing file used
 // as-is, and input that is neither a path nor base64.
 var _ = Describe("materializeImage", func() {
 	It("decodes a base64 payload to a temp file", func() {
 		payload := base64.StdEncoding.EncodeToString([]byte("\xff\xd8\xff\xe0fake-jpeg"))
 		path, cleanup, err := materializeImage(payload)
 		Expect(err).ToNot(HaveOccurred())
 		defer cleanup()
 		data, rerr := os.ReadFile(path)
 		Expect(rerr).ToNot(HaveOccurred())
 		Expect(data).To(Equal([]byte("\xff\xd8\xff\xe0fake-jpeg")))
 	})
 	It("strips a data: URI prefix before decoding", func() {
 		payload := "data:image/png;base64," + base64.StdEncoding.EncodeToString([]byte("hello"))
 		path, cleanup, err := materializeImage(payload)
 		Expect(err).ToNot(HaveOccurred())
 		defer cleanup()
 		data, rerr := os.ReadFile(path)
 		Expect(rerr).ToNot(HaveOccurred())
 		Expect(data).To(Equal([]byte("hello")))
 	})
 	It("uses an existing path as-is", func() {
 		// The fixture only needs to exist; its contents are never read here.
 		tmp, err := os.CreateTemp("", "face-detect-fixture-*.bin")
 		Expect(err).ToNot(HaveOccurred())
 		defer func() { _ = os.Remove(tmp.Name()) }()
 		Expect(tmp.Close()).To(Succeed())
 		path, cleanup, err := materializeImage(tmp.Name())
 		Expect(err).ToNot(HaveOccurred())
 		defer cleanup()
 		Expect(path).To(Equal(tmp.Name()))
 	})
 	It("errors on input that is neither a path nor base64", func() {
 		_, _, err := materializeImage("not base64!!!")
 		Expect(err).To(HaveOccurred())
 	})
 })
 // The specs below exercise the real C-API end to end. They run only when both a
 // model GGUF and a test image are provided, and skip cleanly otherwise so the
 // suite stays green without large assets.
 //
 // The container is Ordered: BeforeAll loads the model once into f and every
 // spec below reuses that instance.
 var _ = Describe("FaceDetect end-to-end", Ordered, func() {
 	var (
 		f         *FaceDetect
 		modelPath = os.Getenv("FACEDETECT_BACKEND_TEST_MODEL")
 		imagePath = os.Getenv("FACEDETECT_BACKEND_TEST_IMAGE")
 	)
 	BeforeAll(func() {
 		// Skip, rather than fail, when assets or the shared library are missing.
 		if modelPath == "" || imagePath == "" {
 			Skip("set FACEDETECT_BACKEND_TEST_MODEL and FACEDETECT_BACKEND_TEST_IMAGE to run the e2e specs")
 		}
 		if err := ensureLibLoaded(); err != nil {
 			Skip("libfacedetect.so not loadable: " + err.Error())
 		}
 		f = &FaceDetect{}
 		Expect(f.Load(&pb.ModelOptions{ModelFile: modelPath})).To(Succeed())
 	})
 	It("embeds the primary face in an image", func() {
 		emb, err := f.Embeddings(&pb.PredictOptions{Images: []string{imagePath}})
 		Expect(err).ToNot(HaveOccurred())
 		Expect(emb).ToNot(BeEmpty())
 	})
 	It("detects at least one face", func() {
 		resp, err := f.Detect(&pb.DetectOptions{Src: imagePath})
 		Expect(err).ToNot(HaveOccurred())
 		Expect(resp.Detections).ToNot(BeEmpty())
 		Expect(resp.Detections[0].ClassName).To(Equal("face"))
 	})
 	It("verifies an image against itself as the same identity", func() {
 		// Same image on both sides, so the pair must verify within the threshold.
 		resp, err := f.FaceVerify(&pb.FaceVerifyRequest{Img1: imagePath, Img2: imagePath})
 		Expect(err).ToNot(HaveOccurred())
 		Expect(resp.Verified).To(BeTrue())
 		Expect(resp.Distance).To(BeNumerically("<=", resp.Threshold))
 	})
 	It("analyzes age/gender for each face", func() {
 		resp, err := f.FaceAnalyze(&pb.FaceAnalyzeRequest{Img: imagePath})
 		Expect(err).ToNot(HaveOccurred())
 		Expect(resp.Faces).ToNot(BeEmpty())
 	})
 })
--- a/backend/go/face-detect/main.go
+++ b/backend/go/face-detect/main.go
@@ -0,0 +1,65 @@
 package main
 // Started internally by LocalAI - one gRPC server per loaded model.
 //
 // Loads libfacedetect.so via purego and registers the flat C-API entry points
 // declared in facedetect_capi.h. The library name can be overridden with
 // FACEDETECT_LIBRARY (mirrors the VOICEDETECT_LIBRARY / PARAKEET_LIBRARY
 // convention in the sibling backends); the default looks for the .so next to
 // this binary (resolved via LD_LIBRARY_PATH by run.sh).
 import (
 	"flag"
 	"fmt"
 	"os"
 	"github.com/ebitengine/purego"
 	grpc "github.com/mudler/LocalAI/pkg/grpc"
 )
 // addr is the -addr flag; main passes its parsed value to grpc.StartServer.
 var (
 	addr = flag.String("addr", "localhost:50051", "the address to connect to")
 )
 // LibFuncs pairs a pointer to a Go function variable with the name of the C
 // symbol that main binds to it through purego.RegisterLibFunc.
 type LibFuncs struct {
 	// FuncPtr is the address of the Go func variable to populate.
 	FuncPtr any
 	// Name is the exported symbol looked up in the shared library.
 	Name    string
 }
 // main opens the face-detect shared library (FACEDETECT_LIBRARY, defaulting to
 // libfacedetect.so), binds every C-API entry point to its Go function variable,
 // logs the library's ABI version to stderr, then parses flags and serves the
 // backend over gRPC on -addr. A dlopen or server-start failure panics.
 func main() {
 	libName := "libfacedetect.so"
 	if override := os.Getenv("FACEDETECT_LIBRARY"); override != "" {
 		libName = override
 	}
 	lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
 	if err != nil {
 		panic(fmt.Errorf("face-detect: dlopen %q: %w", libName, err))
 	}
 	// Bound 1:1 to facedetect_capi.h. char*/float* returns are registered as
 	// uintptr so the raw pointer can be freed via the matching capi free fn.
 	for _, binding := range []LibFuncs{
 		{FuncPtr: &CppAbiVersion, Name: "facedetect_capi_abi_version"},
 		{FuncPtr: &CppLoad, Name: "facedetect_capi_load"},
 		{FuncPtr: &CppFree, Name: "facedetect_capi_free"},
 		{FuncPtr: &CppLastError, Name: "facedetect_capi_last_error"},
 		{FuncPtr: &CppFreeString, Name: "facedetect_capi_free_string"},
 		{FuncPtr: &CppFreeVec, Name: "facedetect_capi_free_vec"},
 		{FuncPtr: &CppEmbedPath, Name: "facedetect_capi_embed_path"},
 		{FuncPtr: &CppEmbedRGB, Name: "facedetect_capi_embed_rgb"},
 		{FuncPtr: &CppDetectJSON, Name: "facedetect_capi_detect_path_json"},
 		{FuncPtr: &CppVerifyPaths, Name: "facedetect_capi_verify_paths"},
 		{FuncPtr: &CppAnalyzeJSON, Name: "facedetect_capi_analyze_path_json"},
 	} {
 		purego.RegisterLibFunc(binding.FuncPtr, lib, binding.Name)
 	}
 	fmt.Fprintf(os.Stderr, "[face-detect] ABI=%d\n", CppAbiVersion())
 	flag.Parse()
 	serveErr := grpc.StartServer(*addr, &FaceDetect{})
 	if serveErr != nil {
 		panic(serveErr)
 	}
 }
--- a/backend/go/face-detect/options.go
+++ b/backend/go/face-detect/options.go
@@ -0,0 +1,47 @@
 package main
 import (
 	"strconv"
 	"strings"
 )
 // defaultVerifyThreshold is the cosine-distance cutoff used when a request does
 // not set one. Matches the insightface buffalo_l ArcFace R50 default the Python
 // face backend ships with so the two implementations agree on verdicts out of
 // the box.
 const defaultVerifyThreshold float32 = 0.35
 // loadOptions holds the parsed model-level options for face-detect.
 type loadOptions struct {
 	// verifyThreshold is the cutoff FaceVerify falls back to when a request
 	// threshold is <= 0.
 	verifyThreshold float32
 	// modelName is the value of the model_name option; parseOptions leaves it
 	// empty when the option is absent.
 	modelName       string
 }
 // splitOption breaks a "key:value" option at its first colon and trims
 // surrounding whitespace from both halves. ok is false, with empty key and
 // value, when o contains no colon.
 func splitOption(o string) (key, value string, ok bool) {
 	rawKey, rawValue, found := strings.Cut(o, ":")
 	if !found {
 		return "", "", false
 	}
 	return strings.TrimSpace(rawKey), strings.TrimSpace(rawValue), true
 }
 // parseOptions builds loadOptions from the backend's "key:value" option slice.
 // Entries without a colon and unknown keys are skipped. Recognized keys:
 //
 //   - verify_threshold / threshold: applied only when the value parses as a
 //     float greater than zero; otherwise the current value is kept.
 //   - model_name: stored verbatim (after trimming).
 //
 // The threshold starts at defaultVerifyThreshold (0.35); model_name is left
 // empty here when the option is absent.
 func parseOptions(opts []string) loadOptions {
 	parsed := loadOptions{verifyThreshold: defaultVerifyThreshold}
 	for _, raw := range opts {
 		key, value, ok := splitOption(raw)
 		if !ok {
 			continue
 		}
 		if key == "model_name" {
 			parsed.modelName = value
 			continue
 		}
 		if key != "verify_threshold" && key != "threshold" {
 			continue
 		}
 		threshold, err := strconv.ParseFloat(value, 32)
 		if err == nil && threshold > 0 {
 			parsed.verifyThreshold = float32(threshold)
 		}
 	}
 	return parsed
 }
--- a/backend/go/face-detect/package.sh
+++ b/backend/go/face-detect/package.sh
@@ -0,0 +1,68 @@
 #!/bin/bash
 #
 # Assemble a self-contained package/ directory for the face-detect backend: the
 # face-detect-grpc binary, run.sh, libfacedetect.so, the core runtime libs
 # (libc/libstdc++/libgomp + ld.so) and the GPU runtime for the active
 # BUILD_TYPE. Mirrors backend/go/voice-detect/package.sh; run.sh routes the
 # (CGO_ENABLED=0) binary through lib/ld.so so the packaged libc is used instead
 # of the host's.
 set -e
 CURDIR=$(dirname "$(realpath "$0")")
 REPO_ROOT="${CURDIR}/../../.."
 PKG_LIB="$CURDIR/package/lib"
 # copy_core_libs LOADER LIBDIR: install the dynamic loader as lib/ld.so and copy
 # the core runtime libs libfacedetect.so links against out of LIBDIR.
 copy_core_libs() {
 	local loader="$1" libdir="$2" lib
 	cp -arfLv "$loader" "$PKG_LIB/ld.so"
 	for lib in libc.so.6 libgcc_s.so.1 libstdc++.so.6 libm.so.6 libgomp.so.1 libdl.so.2 librt.so.1 libpthread.so.0; do
 		cp -arfLv "$libdir/$lib" "$PKG_LIB/$lib"
 	done
 }
 mkdir -p "$PKG_LIB"
 cp -avf "$CURDIR/face-detect-grpc" "$CURDIR/package/"
 cp -avf "$CURDIR/run.sh" "$CURDIR/package/"
 # libfacedetect.so + any soname symlinks. purego.Dlopen resolves it via
 # LD_LIBRARY_PATH, which run.sh points at lib/.
 cp -avf "$CURDIR"/libfacedetect.so* "$PKG_LIB/" 2>/dev/null || {
 	echo "ERROR: libfacedetect.so not found in $CURDIR, run 'make' first" >&2
 	exit 1
 }
 # Pick the architecture from whichever dynamic loader is present on the build
 # host. Darwin has no loader/libc to bundle, so nothing is copied there.
 if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
    echo "Detected x86_64 architecture, copying x86_64 libraries..."
    copy_core_libs /lib64/ld-linux-x86-64.so.2 /lib/x86_64-linux-gnu
 elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
    echo "Detected ARM64 architecture, copying ARM64 libraries..."
    copy_core_libs /lib/ld-linux-aarch64.so.1 /lib/aarch64-linux-gnu
 elif [ "$(uname -s)" = "Darwin" ]; then
    echo "Detected Darwin"
 else
    echo "Error: Could not detect architecture"
    exit 1
 fi
 # Package GPU libraries (CUDA/ROCm/Intel/Vulkan loader + ICDs + drivers) based on
 # BUILD_TYPE so the backend can reach the GPU without the runtime base image
 # shipping those drivers. Skipped when the helper script is not present.
 GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
 if [ -f "$GPU_LIB_SCRIPT" ]; then
    echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
    source "$GPU_LIB_SCRIPT" "$PKG_LIB"
    package_gpu_libs
 fi
 echo "Packaging completed successfully"
 ls -liah "$CURDIR/package/" "$PKG_LIB/"
--- a/backend/go/face-detect/run.sh
+++ b/backend/go/face-detect/run.sh
@@ -0,0 +1,16 @@
 #!/bin/bash
 #
 # Launcher for the packaged face-detect backend: points the dynamic loader at
 # the bundled libraries and execs face-detect-grpc with the caller's arguments.
 set -e
 CURDIR=$(dirname "$(realpath "$0")")
 # Append the inherited LD_LIBRARY_PATH only when it is non-empty. A trailing ":"
 # leaves an empty entry, which the dynamic loader treats as the current working
 # directory, so libraries could be picked up from wherever the backend happens
 # to be started.
 export LD_LIBRARY_PATH="$CURDIR/lib:$CURDIR${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
 # If a self-contained ld.so was packaged, route through it so the packaged
 # libc / libstdc++ are used instead of the host's (matches the voice-detect /
 # whisper / parakeet backends' runtime layout).
 if [ -f "$CURDIR/lib/ld.so" ]; then
 	echo "Using lib/ld.so"
 	exec "$CURDIR/lib/ld.so" "$CURDIR/face-detect-grpc" "$@"
 fi
 exec "$CURDIR/face-detect-grpc" "$@"
--- a/backend/go/face-detect/test.sh
+++ b/backend/go/face-detect/test.sh
@@ -0,0 +1,15 @@
 #!/bin/bash
 #
 # Run the face-detect backend's Go test suite from the backend directory.
 set -e
 CURDIR=$(dirname "$(realpath "$0")")
 cd "$CURDIR"
 echo "Running face-detect backend tests..."
 # The pure-Go parsing specs always run. The embed/detect/verify/analyze smoke
 # specs run only when a model + image are provided via
 # FACEDETECT_BACKEND_TEST_MODEL and FACEDETECT_BACKEND_TEST_IMAGE; otherwise they
 # auto-skip.
 # Put this directory first on the library path so a locally built or symlinked
 # libfacedetect.so is found.
 export LD_LIBRARY_PATH="$CURDIR:${LD_LIBRARY_PATH:-}"
 go test -v -timeout 1200s .
 echo "face-detect tests completed."
--- a/backend/go/omnivoice-cpp/Makefile
+++ b/backend/go/omnivoice-cpp/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
 # omnivoice.cpp version
 OMNIVOICE_REPO?=https://github.com/ServeurpersoCom/omnivoice.cpp
-OMNIVOICE_VERSION?=2603355a5dfacae5cfc33531d5d0933221843509
+OMNIVOICE_VERSION?=96d30169afd5e6bb3fd6a0e9be0eb505bfe81fcd
 SO_TARGET?=libgomnivoicecpp.so
 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
--- a/backend/go/parakeet-cpp/Makefile
+++ b/backend/go/parakeet-cpp/Makefile
@@ -1,6 +1,6 @@
 # parakeet-cpp backend Makefile.
 #
-# Upstream pin lives below as PARAKEET_VERSION?=92a5f0306be354c109150fe58ae4cc4f8a21ca45
+# Upstream pin lives below as PARAKEET_VERSION?=db755a78d39f789bb7d4e3935158a9e8105dbe36
 # (.github/bump_deps.sh) can find and update it - matches the
 # whisper.cpp / ds4 / vibevoice-cpp convention.
 #
@@ -15,7 +15,7 @@
 # That's what the L0 smoke test uses. The default target below does the
 # proper clone-at-pin + cmake build so CI doesn't need a side-checkout.
-PARAKEET_VERSION?=92a5f0306be354c109150fe58ae4cc4f8a21ca45
+PARAKEET_VERSION?=db755a78d39f789bb7d4e3935158a9e8105dbe36
 PARAKEET_REPO?=https://github.com/mudler/parakeet.cpp
 GOCMD?=go
--- a/backend/go/parakeet-cpp/package.sh
+++ b/backend/go/parakeet-cpp/package.sh
@@ -1,23 +1,68 @@
 #!/bin/bash
 #
-# L0 packaging stub: copy the binary, run.sh and libparakeet.so* into
+# Bundle the parakeet-cpp-grpc binary, libparakeet.so, the core runtime
-# package/. The full ldd walk (libc, libstdc++, libgomp, GPU runtimes,
+# libs (libc/libstdc++/libgomp + ld.so) and the GPU runtime for the active
-# arch detection) lands in L3, mirroring backend/go/whisper/package.sh.
+# BUILD_TYPE so the package is self-contained. Mirrors
 # backend/go/whisper/package.sh; run.sh routes the (CGO_ENABLED=0) binary
 # through lib/ld.so so the packaged libc is used instead of the host's.
 set -e
 CURDIR=$(dirname "$(realpath "$0")")
 REPO_ROOT="${CURDIR}/../../.."
 mkdir -p "$CURDIR/package/lib"
 cp -avf "$CURDIR/parakeet-cpp-grpc" "$CURDIR/package/"
 cp -avf "$CURDIR/run.sh" "$CURDIR/package/"
-# libparakeet.so + any soname symlinks (libparakeet.so.X, libparakeet.so.X.Y).
+# libparakeet.so + any soname symlinks (libparakeet.so.X[.Y]). purego.Dlopen
 # resolves it via LD_LIBRARY_PATH, which run.sh points at lib/.
 cp -avf "$CURDIR"/libparakeet.so* "$CURDIR/package/lib/" 2>/dev/null || {
 	echo "ERROR: libparakeet.so not found in $CURDIR, run 'make' first" >&2
 	exit 1
 }
-echo "L0 package layout (full ldd walk lands in L3):"
+# Detect architecture and copy the core runtime libs libparakeet.so links
 # against, plus the matching dynamic loader as lib/ld.so.
 if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
    echo "Detected x86_64 architecture, copying x86_64 libraries..."
    cp -arfLv /lib64/ld-linux-x86-64.so.2 "$CURDIR/package/lib/ld.so"
    cp -arfLv /lib/x86_64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6"
    cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1"
    cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6"
    cp -arfLv /lib/x86_64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6"
    cp -arfLv /lib/x86_64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1"
    cp -arfLv /lib/x86_64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2"
    cp -arfLv /lib/x86_64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1"
    cp -arfLv /lib/x86_64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0"
 elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
    echo "Detected ARM64 architecture, copying ARM64 libraries..."
    cp -arfLv /lib/ld-linux-aarch64.so.1 "$CURDIR/package/lib/ld.so"
    cp -arfLv /lib/aarch64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6"
    cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1"
    cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6"
    cp -arfLv /lib/aarch64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6"
    cp -arfLv /lib/aarch64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1"
    cp -arfLv /lib/aarch64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2"
    cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1"
    cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0"
 elif [ "$(uname -s)" = "Darwin" ]; then
    echo "Detected Darwin"
 else
    echo "Error: Could not detect architecture"
    exit 1
 fi
 # Package GPU libraries (CUDA/ROCm/Intel/Vulkan loader + ICDs + drivers)
 # based on BUILD_TYPE so the backend can reach the GPU without the runtime
 # base image shipping those drivers.
 GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
 if [ -f "$GPU_LIB_SCRIPT" ]; then
    echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
    source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib"
    package_gpu_libs
 fi
 echo "Packaging completed successfully"
 ls -liah "$CURDIR/package/" "$CURDIR/package/lib/"
--- a/backend/go/qwen3-tts-cpp/Makefile
+++ b/backend/go/qwen3-tts-cpp/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
 # qwentts.cpp version
 QWEN3TTS_REPO?=https://github.com/ServeurpersoCom/qwentts.cpp
-QWEN3TTS_CPP_VERSION?=0bf4a18b22e8bb8718d95294e9f7f45c0d4270a4
+QWEN3TTS_CPP_VERSION?=4536dcdce27c3764a93a06d6bf64026b124962f5
 SO_TARGET?=libgoqwen3ttscpp.so
 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
--- a/backend/go/stablediffusion-ggml/Makefile
+++ b/backend/go/stablediffusion-ggml/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
 # stablediffusion.cpp (ggml)
 STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
-STABLEDIFFUSION_GGML_VERSION?=7f0e728b7d42f2490dfa5dd9539082d904f2f6b2
+STABLEDIFFUSION_GGML_VERSION?=b12098f5d09fc83da36e65c784f7bdb16a5a5ebf
 CMAKE_ARGS+=-DGGML_MAX_NAME=128
--- a/backend/go/voice-detect/.gitignore
+++ b/backend/go/voice-detect/.gitignore
@@ -0,0 +1,18 @@
 # Fetched upstream sources
 sources/
 # CMake build directories
 build*/
 # build artifacts staged in-tree by the Makefile (cp from sources/) or
 # symlinked for local dev; the real sources live in voice-detect.cpp upstream.
 *.so
 *.so.*
 voicedetect_capi.h
 compile_commands.json
 # Compiled backend binary
 voice-detect-grpc
 # Packaging output
 package/
--- a/backend/go/voice-detect/Makefile
+++ b/backend/go/voice-detect/Makefile
@@ -0,0 +1,107 @@
 # voice-detect backend Makefile.
 #
 # Upstream pin lives below as VOICEDETECT_VERSION?=30beecd... (.github/bump_deps.sh
 # can find and update it - matches the parakeet.cpp / whisper.cpp / ds4 convention).
 #
 # Local dev shortcut: if you already have an out-of-tree voice-detect.cpp build,
 # symlink the .so + header into this directory and skip the clone/cmake steps:
 #
 #   ln -sf /path/to/voice-detect.cpp/build-shared/libvoicedetect.so .
 #   ln -sf /path/to/voice-detect.cpp/include/voicedetect_capi.h .
 #   go build -o voice-detect-grpc .
 #
 # The default target below does the proper clone-at-pin + cmake build so CI does
 # not need a side-checkout.
 VOICEDETECT_VERSION?=30beecdbe9662fb27e826ae4ec949d3fa02ff366
 VOICEDETECT_REPO?=https://github.com/mudler/voice-detect.cpp
 GOCMD?=go
 GO_TAGS?=
 JOBS?=$(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)
 BUILD_TYPE?=
 NATIVE?=false
 # Resolve the target arch. The backend matrix / Docker build pass TARGETARCH
 # (amd64|arm64); fall back to uname -m (aarch64|x86_64) for a local build.
 RECON_ARCH?=$(or $(TARGETARCH),$(shell uname -m))
 # Build ggml statically into libvoicedetect.so (PIC) so the shared lib is
 # self-contained: dlopen needs no libggml*.so alongside it, only system libs
 # (libstdc++/libgomp/libc) that the runtime image already provides.
 CMAKE_ARGS?=-DCMAKE_BUILD_TYPE=Release -DVOICEDETECT_SHARED=ON -DVOICEDETECT_BUILD_CLI=OFF -DVOICEDETECT_BUILD_TESTS=OFF -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON
 ifeq ($(NATIVE),false)
 	CMAKE_ARGS+=-DGGML_NATIVE=OFF
 endif
 # voice-detect.cpp gates its GGML backends behind VOICEDETECT_GGML_* options and
 # does set(GGML_CUDA ${VOICEDETECT_GGML_CUDA} CACHE BOOL "" FORCE), so a bare
 # -DGGML_CUDA=ON is overwritten back to OFF. Forward the VOICEDETECT_GGML_*
 # options instead. (openblas is not gated, so -DGGML_BLAS passes through.)
 ifeq ($(BUILD_TYPE),cublas)
 	CMAKE_ARGS+=-DVOICEDETECT_GGML_CUDA=ON
 	# Opt-in cuDNN implicit-GEMM conv path (kills im2col on GPU, reaches
 	# torch-cuDNN parity). Only the arm64 + CUDA 13 image (GB10/Jetson/L4T)
 	# ships libcudnn9 + the -dev headers, so gate cuDNN to that variant.
 	# x86 CUDA images carry no cuDNN -> enabling it there is a link failure.
 	ifeq ($(CUDA_MAJOR_VERSION),13)
 	ifneq (,$(filter arm64 aarch64,$(RECON_ARCH)))
 		CMAKE_ARGS+=-DVOICEDETECT_GGML_CUDNN=ON
 	endif
 	endif
 else ifeq ($(BUILD_TYPE),openblas)
 	CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
 else ifeq ($(BUILD_TYPE),hipblas)
 	CMAKE_ARGS+=-DVOICEDETECT_GGML_HIP=ON
 else ifeq ($(BUILD_TYPE),vulkan)
 	CMAKE_ARGS+=-DVOICEDETECT_GGML_VULKAN=ON
 else ifeq ($(BUILD_TYPE),metal)
 	CMAKE_ARGS+=-DVOICEDETECT_GGML_METAL=ON
 endif
 .PHONY: voice-detect-grpc package build clean purge test all
 all: voice-detect-grpc
 # Clone the upstream voice-detect.cpp source at the pinned commit. Directory acts
 # as the target so make only re-clones when missing. After a VOICEDETECT_VERSION
 # bump, run 'make purge && make' to refetch.
 #
 # If any git step fails, the half-populated directory is removed again:
 # otherwise its mere existence would satisfy this target on the next run and
 # cmake would be pointed at an empty or partial checkout.
 sources/voice-detect.cpp:
 	mkdir -p sources/voice-detect.cpp
 	cd sources/voice-detect.cpp && \
 	git init -q && \
 	git remote add origin $(VOICEDETECT_REPO) && \
 	git fetch --depth 1 origin $(VOICEDETECT_VERSION) && \
 	git checkout FETCH_HEAD && \
 	git submodule update --init --recursive --depth 1 --single-branch || \
 	{ cd "$(CURDIR)" && rm -rf sources/voice-detect.cpp; exit 1; }
 # Build the shared lib + header out-of-tree, then stage them next to the Go
 # sources so purego.Dlopen("libvoicedetect.so") and the cgo-less build both pick
 # them up.
 libvoicedetect.so: sources/voice-detect.cpp
 	cmake -B sources/voice-detect.cpp/build-shared -S sources/voice-detect.cpp $(CMAKE_ARGS)
 	cmake --build sources/voice-detect.cpp/build-shared --config Release -j$(JOBS) --target voicedetect
 	cp -fv sources/voice-detect.cpp/build-shared/libvoicedetect.so* ./ 2>/dev/null || true
 	cp -fv sources/voice-detect.cpp/include/voicedetect_capi.h ./
 voice-detect-grpc: libvoicedetect.so main.go govoicedetect.go options.go
 	CGO_ENABLED=0 $(GOCMD) build -tags "$(GO_TAGS)" -o voice-detect-grpc .
 package: voice-detect-grpc
 	bash package.sh
 build: package
 # Test target. The embed/verify end-to-end specs are gated on
 # VOICEDETECT_BACKEND_TEST_MODEL + VOICEDETECT_BACKEND_TEST_WAV; without them the
 # heavy specs auto-skip and only the pure-Go parsing specs run.
 test:
 	LD_LIBRARY_PATH=$(CURDIR):$$LD_LIBRARY_PATH $(GOCMD) test ./... -count=1
 clean: purge
 	rm -rf libvoicedetect.so* voicedetect_capi.h package voice-detect-grpc
 purge:
 	rm -rf sources/voice-detect.cpp
--- a/backend/go/voice-detect/govoicedetect.go
+++ b/backend/go/voice-detect/govoicedetect.go
@@ -0,0 +1,273 @@
 package main
 import (
 	"encoding/json"
 	"errors"
 	"fmt"
 	"math"
 	"os"
 	"path/filepath"
 	"strconv"
 	"strings"
 	"time"
 	"unsafe"
 	"github.com/mudler/LocalAI/pkg/grpc/base"
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 	"github.com/mudler/xlog"
 )
 // purego-bound entry points from libvoicedetect.so. Names match
 // voicedetect_capi.h exactly so a `nm libvoicedetect.so | grep voicedetect_capi`
 // is enough to spot drift.
 //
 // The opaque ctx and the malloc'd char*/float* return values are declared as
 // uintptr so we get the raw pointer back and can release it via the matching
 // capi free function. purego's native string/[]float32 returns would copy and
 // forget the original pointer, leaking the C-owned buffer on every call.
 //
 // How this file uses the bindings:
 //   - CppLoad returns the ctx (0 on failure). CppFree is bound as its release
 //     function but is not called anywhere in this file.
 //   - CppEmbedPath writes the vector pointer and its length through outVec /
 //     outDim and returns 0 on success; the vector is released via CppFreeVec.
 //     CppEmbedPCM has the same out-parameter shape and is bound but unused here.
 //   - CppVerifyPaths writes its results through outDistance / outVerified and
 //     returns 0 on success.
 //   - CppAnalyzeJSON returns a C string (0 on failure) released via
 //     CppFreeString.
 //   - CppLastError is the only string-typed return. NOTE(review): this presumes
 //     the message buffer stays owned by the ctx and must not be freed by the
 //     caller - confirm against voicedetect_capi.h.
 var (
 	CppAbiVersion  func() int32
 	CppLoad        func(ggufPath string) uintptr
 	CppFree        func(ctx uintptr)
 	CppLastError   func(ctx uintptr) string
 	CppFreeString  func(s uintptr)
 	CppFreeVec     func(v uintptr)
 	CppEmbedPath   func(ctx uintptr, wavPath string, outVec, outDim unsafe.Pointer) int32
 	CppEmbedPCM    func(ctx uintptr, pcm []float32, nSamples, sampleRate int32, outVec, outDim unsafe.Pointer) int32
 	CppVerifyPaths func(ctx uintptr, a, b string, threshold float32, outDistance, outVerified unsafe.Pointer) int32
 	CppAnalyzeJSON func(ctx uintptr, wavPath string) uintptr
 )
 // VoiceDetect implements the speaker-recognition voice subset of the Backend
 // gRPC service over libvoicedetect.so. The C side keeps a single loaded model
 // plus a per-ctx last-error buffer and is not reentrant, so base.SingleThread
 // serializes every call.
 type VoiceDetect struct {
 	base.SingleThread
 	opts   loadOptions // model-level options parsed by Load
 	ctxPtr uintptr     // opaque engine context returned by CppLoad; 0 until Load succeeds
 }
 // resolveModelPath picks the GGUF path handed to the engine. A non-empty
 // modelFile wins and is joined onto modelPath only when it is relative; an empty
 // modelFile falls back to modelPath as-is, so a relative modelPath is never
 // joined onto itself.
 func resolveModelPath(modelFile, modelPath string) string {
 	switch {
 	case modelFile == "":
 		return modelPath
 	case !filepath.IsAbs(modelFile) && modelPath != "":
 		return filepath.Join(modelPath, modelFile)
 	default:
 		return modelFile
 	}
 }
 // Load resolves the model path from opts, parses the backend options, forwards
 // the thread budget to the engine and loads the model through the C-API.
 //
 // It returns an error when no model path can be determined, when
 // VOICEDETECT_THREADS cannot be exported, or when the engine fails to load the
 // file. On success the engine context is stored on v for the Voice* calls.
 func (v *VoiceDetect) Load(opts *pb.ModelOptions) error {
 	model := resolveModelPath(opts.ModelFile, opts.ModelPath)
 	if model == "" {
 		return errors.New("voice-detect: ModelFile is required")
 	}
 	v.opts = parseOptions(opts.Options)
 	if v.opts.modelName == "" {
 		// No explicit model_name option: report the file name in responses.
 		v.opts.modelName = filepath.Base(model)
 	}
 	// Propagate LocalAI's per-model thread budget to the engine. LocalAI spawns
 	// one backend process per model and serves requests concurrently, so the
 	// engine's own min(hardware_concurrency, 8) default can oversubscribe cores.
 	// VOICEDETECT_THREADS is read by the engine at backend construction, so it
 	// must be set before the capi load. A non-positive Threads means "unset":
 	// leave the env alone so the engine keeps its sane default.
 	threads := opts.Threads
 	if threads > 0 {
 		if err := os.Setenv("VOICEDETECT_THREADS", strconv.Itoa(int(threads))); err != nil {
 			return fmt.Errorf("voice-detect: set VOICEDETECT_THREADS: %w", err)
 		}
 		xlog.Info("voice-detect: applying LocalAI thread budget", "threads", threads)
 	}
 	xlog.Info("voice-detect: loading model", "model", model,
 		"verify_threshold", v.opts.verifyThreshold, "abi", CppAbiVersion())
 	ctx := CppLoad(model)
 	if ctx == 0 {
 		// The last-error buffer lives on the ctx that was never returned, so
 		// surface the path the operator tried to load instead.
 		return fmt.Errorf("voice-detect: voicedetect_capi_load failed for %q", model)
 	}
 	v.ctxPtr = ctx
 	return nil
 }
 // VoiceEmbed computes the speaker embedding for the clip at req.Audio and
 // reports it together with the configured model name. req.Audio is a filesystem
 // PATH: the HTTP layer materializes base64/URL/data-URI inputs to a temp file
 // before the gRPC call. It fails when no model is loaded, when the path is
 // empty, or when the engine cannot embed the clip.
 func (v *VoiceDetect) VoiceEmbed(req *pb.VoiceEmbedRequest) (pb.VoiceEmbedResponse, error) {
 	// Guard clauses first; each return builds a fresh literal so the protobuf
 	// message is never copied from a variable.
 	switch {
 	case v.ctxPtr == 0:
 		return pb.VoiceEmbedResponse{}, errors.New("voice-detect: model not loaded")
 	case req.Audio == "":
 		return pb.VoiceEmbedResponse{}, errors.New("voice-detect: audio path is required")
 	}
 	vector, embedErr := v.embedPath(req.Audio)
 	if embedErr != nil {
 		return pb.VoiceEmbedResponse{}, embedErr
 	}
 	return pb.VoiceEmbedResponse{Embedding: vector, Model: v.opts.modelName}, nil
 }
 // embedPath asks the engine to embed the audio file at path and returns a
 // Go-owned copy of the resulting vector. The engine reports the vector pointer
 // and its length through out-parameters; any non-zero return code, null vector
 // or non-positive length is turned into an error built from the engine's
 // last-error message.
 func (v *VoiceDetect) embedPath(path string) ([]float32, error) {
 	var vec uintptr
 	var dim int32
 	rc := CppEmbedPath(v.ctxPtr, path, unsafe.Pointer(&vec), unsafe.Pointer(&dim))
 	// A successful call hands ownership of vec to us, so schedule the release
 	// before validating the result: a "success" with an unusable dim must not
 	// leak the buffer on the error return below.
 	if rc == 0 && vec != 0 {
 		defer CppFreeVec(vec)
 	}
 	if rc != 0 || vec == 0 || dim <= 0 {
 		return nil, v.lastErr("embed", path)
 	}
 	// Copy out of the C-owned malloc'd buffer before freeing it. The
 	// uintptr->Pointer conversion trips vet's unsafeptr check, which can't tell
 	// a C heap pointer from Go-managed memory; safe here, the GC neither tracks
 	// nor moves this buffer and we copy immediately.
 	src := unsafe.Slice((*float32)(unsafe.Pointer(vec)), int(dim)) //nolint:govet // C-owned malloc'd vector, copied out before free
 	out := make([]float32, int(dim))
 	copy(out, src)
 	return out, nil
 }
 // VoiceVerify asks the engine whether the clips at req.Audio1 and req.Audio2
 // come from the same speaker, judged by cosine distance against a threshold.
 // A request threshold <= 0 falls back to the model-configured default
 // (verify_threshold option, 0.25 if unset). The response carries the verdict,
 // the measured distance, the threshold actually used, a 0-100 confidence and
 // the wall-clock time spent in the engine call.
 func (v *VoiceDetect) VoiceVerify(req *pb.VoiceVerifyRequest) (pb.VoiceVerifyResponse, error) {
 	switch {
 	case v.ctxPtr == 0:
 		return pb.VoiceVerifyResponse{}, errors.New("voice-detect: model not loaded")
 	case req.Audio1 == "" || req.Audio2 == "":
 		return pb.VoiceVerifyResponse{}, errors.New("voice-detect: audio1 and audio2 are required")
 	}
 	cutoff := req.Threshold
 	if cutoff <= 0 {
 		cutoff = v.opts.verifyThreshold
 	}
 	// The engine writes both results through these out-parameters.
 	var (
 		dist float32
 		same int32
 	)
 	began := time.Now()
 	status := CppVerifyPaths(v.ctxPtr, req.Audio1, req.Audio2, cutoff,
 		unsafe.Pointer(&dist), unsafe.Pointer(&same))
 	if status != 0 {
 		return pb.VoiceVerifyResponse{}, v.lastErr("verify", req.Audio1+","+req.Audio2)
 	}
 	tookMs := float32(time.Since(began).Seconds() * 1000.0)
 	// Confidence decays linearly from 100 at distance 0 to 0 at the threshold,
 	// matching the Python speaker-recognition backend's reporting. It stays 0
 	// when the threshold is not positive, which also avoids dividing by zero.
 	var confidence float32
 	if cutoff > 0 {
 		pct := (1.0 - float64(dist)/float64(cutoff)) * 100.0
 		confidence = float32(math.Max(0, math.Min(100, pct)))
 	}
 	return pb.VoiceVerifyResponse{
 		Verified:         same != 0,
 		Distance:         dist,
 		Threshold:        cutoff,
 		Confidence:       confidence,
 		Model:            v.opts.modelName,
 		ProcessingTimeMs: tookMs,
 	}, nil
 }
 // VoiceAnalyze runs the age/gender/emotion heads on the clip at req.Audio. The
 // C-API always evaluates every supported head, so the request's actions filter
 // is advisory; the full analysis comes back as exactly one segment because the
 // engine does not produce time-bounded segments. It fails when no model is
 // loaded, when the path is empty, when the engine returns no document, or when
 // the returned JSON cannot be decoded.
 func (v *VoiceDetect) VoiceAnalyze(req *pb.VoiceAnalyzeRequest) (pb.VoiceAnalyzeResponse, error) {
 	switch {
 	case v.ctxPtr == 0:
 		return pb.VoiceAnalyzeResponse{}, errors.New("voice-detect: model not loaded")
 	case req.Audio == "":
 		return pb.VoiceAnalyzeResponse{}, errors.New("voice-detect: audio path is required")
 	}
 	docPtr := CppAnalyzeJSON(v.ctxPtr, req.Audio)
 	if docPtr == 0 {
 		return pb.VoiceAnalyzeResponse{}, v.lastErr("analyze", req.Audio)
 	}
 	// The document is a C-owned string: copy it into Go memory below and hand
 	// the original back to the engine on the way out.
 	defer CppFreeString(docPtr)
 	doc := goStringFromCPtr(docPtr)
 	analysis, parseErr := parseAnalyzeJSON(doc)
 	if parseErr != nil {
 		return pb.VoiceAnalyzeResponse{}, fmt.Errorf("voice-detect: analyze JSON for %q: %w", req.Audio, parseErr)
 	}
 	return pb.VoiceAnalyzeResponse{Segments: []*pb.VoiceAnalysis{analysis}}, nil
 }
 // analyzeJSON mirrors the document returned by voicedetect_capi_analyze_path_json:
 //
 //	{"age":42.0,
 //	 "gender":{"label":"female","female":0.88,"male":0.12},
 //	 "emotion":{"label":"neutral","scores":{"neutral":0.7, ...}}}
 //
 // gender is a mixed object (a "label" string plus per-class float scores), so
 // it is decoded into raw messages and split in parseAnalyzeJSON. emotion keeps
 // its label and its scores in separate keys and therefore decodes directly.
 // Keys absent from the document leave the corresponding field at its zero
 // value (for example a nil Gender map when there is no gender block).
 type analyzeJSON struct {
 	Age     float32                    `json:"age"`
 	Gender  map[string]json.RawMessage `json:"gender"`
 	Emotion struct {
 		Label  string             `json:"label"`
 		Scores map[string]float32 `json:"scores"`
 	} `json:"emotion"`
 }
 // parseAnalyzeJSON decodes the engine's analyze document into a VoiceAnalysis.
 // Start/End are left at 0 because the model emits one whole-utterance result
 // rather than time-bounded segments. The gender object is split into its
 // "label" (DominantGender) and its numeric per-class scores (Gender); entries
 // that do not decode as the expected type are dropped. Only a document that is
 // not valid JSON for the analyzeJSON shape yields an error.
 func parseAnalyzeJSON(doc string) (*pb.VoiceAnalysis, error) {
 	var parsed analyzeJSON
 	if err := json.Unmarshal([]byte(doc), &parsed); err != nil {
 		return nil, err
 	}
 	result := &pb.VoiceAnalysis{
 		Age:             parsed.Age,
 		DominantEmotion: parsed.Emotion.Label,
 		Emotion:         parsed.Emotion.Scores,
 	}
 	if len(parsed.Gender) == 0 {
 		// No gender block: leave DominantGender empty and Gender nil.
 		return result, nil
 	}
 	scores := make(map[string]float32, len(parsed.Gender))
 	for name, raw := range parsed.Gender {
 		switch name {
 		case "label":
 			// Best effort: a non-string label leaves DominantGender empty.
 			_ = json.Unmarshal(raw, &result.DominantGender)
 		default:
 			var score float32
 			if json.Unmarshal(raw, &score) == nil {
 				scores[name] = score
 			}
 		}
 	}
 	result.Gender = scores
 	return result, nil
 }
 // lastErr builds a Go error for a failed engine operation. op names the
 // operation, subject is the input it was applied to, and the detail is the
 // whitespace-trimmed message from the C-API's per-ctx last-error buffer, with a
 // fixed placeholder when the engine reported nothing.
 func (v *VoiceDetect) lastErr(op, subject string) error {
 	detail := "no error detail"
 	if reported := strings.TrimSpace(CppLastError(v.ctxPtr)); reported != "" {
 		detail = reported
 	}
 	return fmt.Errorf("voice-detect: %s failed for %q: %s", op, subject, detail)
 }
 // goStringFromCPtr copies a NUL-terminated C string into Go memory. cptr is a
 // malloc'd buffer the caller owns; release it via CppFreeString after the copy.
 //
 // The uintptr->Pointer conversion trips vet's unsafeptr check, which can't tell
 // a C heap pointer from Go-managed memory. Safe here: the GC neither tracks nor
 // moves the buffer and we dereference it immediately to copy the bytes out.
 func goStringFromCPtr(cptr uintptr) string {
 	if cptr == 0 {
 		return ""
 	}
 	p := unsafe.Pointer(cptr) //nolint:govet // C-owned malloc'd buffer, not Go-GC memory (see doc above)
 	n := 0
 	for *(*byte)(unsafe.Add(p, n)) != 0 {
 		n++
 	}
 	return string(unsafe.Slice((*byte)(p), n))
 }
--- a/backend/go/voice-detect/govoicedetect_test.go
+++ b/backend/go/voice-detect/govoicedetect_test.go
@@ -0,0 +1,144 @@
 package main
 import (
 	"os"
 	"sync"
 	"testing"
 	"github.com/ebitengine/purego"
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 )
 // TestVoiceDetect is the `go test` entry point: it routes Gomega assertion
 // failures to Ginkgo and runs every spec registered in this package.
 func TestVoiceDetect(t *testing.T) {
 	RegisterFailHandler(Fail)
 	RunSpecs(t, "voice-detect Backend Suite")
 }
 var (
 	libLoadOnce sync.Once // guards the one-time dlopen + symbol registration in ensureLibLoaded
 	libLoadErr  error     // dlopen failure recorded by ensureLibLoaded; nil when the library loaded
 )
 // ensureLibLoaded performs the same bootstrap as main.go so a Go test can drive
 // the C-API bridge without starting the gRPC server: it opens the library
 // (VOICEDETECT_LIBRARY, defaulting to libvoicedetect.so resolved through
 // LD_LIBRARY_PATH or a symlink in ./) and binds every Cpp* entry point. The work
 // runs at most once; a dlopen failure is remembered and returned on every call
 // so the e2e specs can skip themselves.
 func ensureLibLoaded() error {
 	libLoadOnce.Do(func() {
 		path := "libvoicedetect.so"
 		if override := os.Getenv("VOICEDETECT_LIBRARY"); override != "" {
 			path = override
 		}
 		handle, err := purego.Dlopen(path, purego.RTLD_NOW|purego.RTLD_GLOBAL)
 		if err != nil {
 			libLoadErr = err
 			return
 		}
 		// Same symbol list, in the same order, as the table in main.go.
 		bindings := []struct {
 			target any
 			symbol string
 		}{
 			{&CppAbiVersion, "voicedetect_capi_abi_version"},
 			{&CppLoad, "voicedetect_capi_load"},
 			{&CppFree, "voicedetect_capi_free"},
 			{&CppLastError, "voicedetect_capi_last_error"},
 			{&CppFreeString, "voicedetect_capi_free_string"},
 			{&CppFreeVec, "voicedetect_capi_free_vec"},
 			{&CppEmbedPath, "voicedetect_capi_embed_path"},
 			{&CppEmbedPCM, "voicedetect_capi_embed_pcm"},
 			{&CppVerifyPaths, "voicedetect_capi_verify_paths"},
 			{&CppAnalyzeJSON, "voicedetect_capi_analyze_path_json"},
 		}
 		for _, b := range bindings {
 			purego.RegisterLibFunc(b.target, handle, b.symbol)
 		}
 	})
 	return libLoadErr
 }
 // Pure-Go specs for parseOptions: defaults, the accepted keys (including the
 // "threshold" alias and an ignored unknown key) and rejection of non-positive
 // thresholds. They need neither the shared library nor a model.
 var _ = Describe("parseOptions", func() {
 	It("defaults verify_threshold to 0.25", func() {
 		o := parseOptions(nil)
 		Expect(o.verifyThreshold).To(Equal(float32(0.25)))
 		Expect(o.modelName).To(Equal(""))
 	})
 	It("parses verify_threshold, threshold alias and model_name", func() {
 		o := parseOptions([]string{"verify_threshold:0.4", "model_name:ecapa", "unknown:x"})
 		Expect(o.verifyThreshold).To(Equal(float32(0.4)))
 		Expect(o.modelName).To(Equal("ecapa"))
 		o2 := parseOptions([]string{"threshold:0.3"})
 		Expect(o2.verifyThreshold).To(Equal(float32(0.3)))
 	})
 	It("ignores non-positive thresholds and keeps the default", func() {
 		o := parseOptions([]string{"verify_threshold:0", "threshold:-1"})
 		Expect(o.verifyThreshold).To(Equal(float32(0.25)))
 	})
 })
 // Pure-Go specs for parseAnalyzeJSON: a full document, a document without the
 // gender block, and malformed input. They need neither the shared library nor
 // a model.
 var _ = Describe("parseAnalyzeJSON", func() {
 	It("maps age, gender label+scores and emotion label+scores", func() {
 		doc := `{"age":42.0,
 			"gender":{"label":"female","female":0.88,"male":0.12},
 			"emotion":{"label":"neutral","scores":{"neutral":0.7,"happy":0.2,"sad":0.1}}}`
 		seg, err := parseAnalyzeJSON(doc)
 		Expect(err).ToNot(HaveOccurred())
 		Expect(seg.Age).To(BeNumerically("~", 42.0, 1e-4))
 		// Start/End stay at zero: the result covers the whole utterance.
 		Expect(seg.Start).To(Equal(float32(0)))
 		Expect(seg.End).To(Equal(float32(0)))
 		Expect(seg.DominantGender).To(Equal("female"))
 		Expect(seg.Gender).To(HaveKeyWithValue("female", BeNumerically("~", 0.88, 1e-4)))
 		Expect(seg.Gender).To(HaveKeyWithValue("male", BeNumerically("~", 0.12, 1e-4)))
 		// The "label" entry is consumed into DominantGender, not the score map.
 		Expect(seg.Gender).ToNot(HaveKey("label"))
 		Expect(seg.DominantEmotion).To(Equal("neutral"))
 		Expect(seg.Emotion).To(HaveKeyWithValue("neutral", BeNumerically("~", 0.7, 1e-4)))
 		Expect(seg.Emotion).To(HaveKeyWithValue("happy", BeNumerically("~", 0.2, 1e-4)))
 	})
 	It("tolerates a missing gender block", func() {
 		seg, err := parseAnalyzeJSON(`{"age":30.0,"emotion":{"label":"happy","scores":{"happy":1.0}}}`)
 		Expect(err).ToNot(HaveOccurred())
 		Expect(seg.DominantGender).To(Equal(""))
 		Expect(seg.DominantEmotion).To(Equal("happy"))
 	})
 	It("returns an error on malformed JSON", func() {
 		_, err := parseAnalyzeJSON(`{not-json`)
 		Expect(err).To(HaveOccurred())
 	})
 })
 // The specs below exercise the real C-API end to end. They run only when both a
 // model GGUF and a test WAV are provided, and skip cleanly otherwise so the
 // suite stays green without large assets. The container is Ordered so the
 // model is loaded once in BeforeAll and shared by the embed and verify specs.
 var _ = Describe("VoiceDetect end-to-end", Ordered, func() {
 	var (
 		v         *VoiceDetect
 		modelPath = os.Getenv("VOICEDETECT_BACKEND_TEST_MODEL")
 		wavPath   = os.Getenv("VOICEDETECT_BACKEND_TEST_WAV")
 	)
 	BeforeAll(func() {
 		// Skip (rather than fail) when the assets or the shared library are missing.
 		if modelPath == "" || wavPath == "" {
 			Skip("set VOICEDETECT_BACKEND_TEST_MODEL and VOICEDETECT_BACKEND_TEST_WAV to run the e2e specs")
 		}
 		if err := ensureLibLoaded(); err != nil {
 			Skip("libvoicedetect.so not loadable: " + err.Error())
 		}
 		v = &VoiceDetect{}
 		Expect(v.Load(&pb.ModelOptions{ModelFile: modelPath})).To(Succeed())
 	})
 	It("embeds an audio clip", func() {
 		resp, err := v.VoiceEmbed(&pb.VoiceEmbedRequest{Audio: wavPath})
 		Expect(err).ToNot(HaveOccurred())
 		Expect(resp.Embedding).ToNot(BeEmpty())
 		Expect(resp.Model).ToNot(BeEmpty())
 	})
 	It("verifies a clip against itself as the same speaker", func() {
 		// No request threshold is set, so the model-configured default applies.
 		resp, err := v.VoiceVerify(&pb.VoiceVerifyRequest{Audio1: wavPath, Audio2: wavPath})
 		Expect(err).ToNot(HaveOccurred())
 		Expect(resp.Verified).To(BeTrue())
 		Expect(resp.Distance).To(BeNumerically("<=", resp.Threshold))
 	})
 })
--- a/backend/go/voice-detect/main.go
+++ b/backend/go/voice-detect/main.go
@@ -0,0 +1,64 @@
 package main
 // Started internally by LocalAI - one gRPC server per loaded model.
 //
 // Loads libvoicedetect.so via purego and registers the flat C-API entry points
 // declared in voicedetect_capi.h. The library name can be overridden with
 // VOICEDETECT_LIBRARY (mirrors the PARAKEET_LIBRARY / OMNIVOICE_LIBRARY
 // convention in the sibling backends); the default looks for the .so next to
 // this binary (resolved via LD_LIBRARY_PATH by run.sh).
 import (
 	"flag"
 	"fmt"
 	"os"
 	"github.com/ebitengine/purego"
 	grpc "github.com/mudler/LocalAI/pkg/grpc"
 )
 var (
 	addr = flag.String("addr", "localhost:50051", "the address to connect to")
 )
 // LibFuncs pairs a pointer to one of the package-level Cpp* function variables
 // with the exported C symbol that main binds to it via purego.RegisterLibFunc.
 type LibFuncs struct {
 	FuncPtr any
 	Name    string
 }
 // main opens the voice-detect shared library (VOICEDETECT_LIBRARY, defaulting
 // to libvoicedetect.so), binds every C-API entry point to its Cpp* variable,
 // prints the library's ABI version to stderr and then serves the backend over
 // gRPC on the -addr address. A dlopen failure or a server error panics.
 func main() {
 	libPath := "libvoicedetect.so"
 	if override := os.Getenv("VOICEDETECT_LIBRARY"); override != "" {
 		libPath = override
 	}
 	handle, err := purego.Dlopen(libPath, purego.RTLD_NOW|purego.RTLD_GLOBAL)
 	if err != nil {
 		panic(fmt.Errorf("voice-detect: dlopen %q: %w", libPath, err))
 	}
 	// Bound 1:1 to voicedetect_capi.h. char*/float* returns are registered as
 	// uintptr so the raw pointer can be freed via the matching capi free fn.
 	bindings := []LibFuncs{
 		{FuncPtr: &CppAbiVersion, Name: "voicedetect_capi_abi_version"},
 		{FuncPtr: &CppLoad, Name: "voicedetect_capi_load"},
 		{FuncPtr: &CppFree, Name: "voicedetect_capi_free"},
 		{FuncPtr: &CppLastError, Name: "voicedetect_capi_last_error"},
 		{FuncPtr: &CppFreeString, Name: "voicedetect_capi_free_string"},
 		{FuncPtr: &CppFreeVec, Name: "voicedetect_capi_free_vec"},
 		{FuncPtr: &CppEmbedPath, Name: "voicedetect_capi_embed_path"},
 		{FuncPtr: &CppEmbedPCM, Name: "voicedetect_capi_embed_pcm"},
 		{FuncPtr: &CppVerifyPaths, Name: "voicedetect_capi_verify_paths"},
 		{FuncPtr: &CppAnalyzeJSON, Name: "voicedetect_capi_analyze_path_json"},
 	}
 	for i := range bindings {
 		purego.RegisterLibFunc(bindings[i].FuncPtr, handle, bindings[i].Name)
 	}
 	fmt.Fprintf(os.Stderr, "[voice-detect] ABI=%d\n", CppAbiVersion())
 	flag.Parse()
 	if serveErr := grpc.StartServer(*addr, &VoiceDetect{}); serveErr != nil {
 		panic(serveErr)
 	}
 }
--- a/backend/go/voice-detect/options.go
+++ b/backend/go/voice-detect/options.go
@@ -0,0 +1,46 @@
 package main
 import (
 	"strconv"
 	"strings"
 )
 // defaultVerifyThreshold is the cosine-distance cutoff used when a request does
 // not set one. Matches the Python speaker-recognition backend's default so the
 // two implementations agree on verdicts out of the box.
 const defaultVerifyThreshold float32 = 0.25
 // loadOptions holds the parsed model-level options for voice-detect.
 type loadOptions struct {
 	verifyThreshold float32 // cosine-distance cutoff used when a verify request sets none
 	modelName       string  // model name reported in responses; empty unless model_name was given
 }
 // splitOption splits a "key:value" option at its first colon and trims
 // surrounding whitespace from both halves. ok is false, with empty key and
 // value, when o contains no colon; anything after the first colon, including
 // further colons, belongs to the value.
 func splitOption(o string) (key, value string, ok bool) {
 	rawKey, rawValue, found := strings.Cut(o, ":")
 	if !found {
 		return "", "", false
 	}
 	return strings.TrimSpace(rawKey), strings.TrimSpace(rawValue), true
 }
 // parseOptions turns the backend's "key:value" option slice into loadOptions.
 // Recognized keys are verify_threshold (alias: threshold) and model_name;
 // entries without a colon and unknown keys are ignored. A threshold that does
 // not parse as a float or is not strictly positive leaves the current value in
 // place, so the result starts from, and may keep, the 0.25 default. modelName
 // stays empty unless model_name is given.
 func parseOptions(opts []string) loadOptions {
 	parsed := loadOptions{verifyThreshold: defaultVerifyThreshold}
 	for _, entry := range opts {
 		key, value, ok := splitOption(entry)
 		if !ok {
 			continue
 		}
 		if key == "model_name" {
 			parsed.modelName = value
 			continue
 		}
 		if key != "verify_threshold" && key != "threshold" {
 			continue
 		}
 		// Keep the positive test as "> 0" so NaN is rejected too.
 		if cutoff, err := strconv.ParseFloat(value, 32); err == nil && cutoff > 0 {
 			parsed.verifyThreshold = float32(cutoff)
 		}
 	}
 	return parsed
 }
--- a/backend/go/voice-detect/package.sh
+++ b/backend/go/voice-detect/package.sh
@@ -0,0 +1,68 @@
 #!/bin/bash
 #
 # Assemble a self-contained package/ directory for the voice-detect backend:
 # the voice-detect-grpc binary, run.sh, libvoicedetect.so, the core runtime libs
 # (libc/libstdc++/libgomp + ld.so) and the GPU runtime for the active BUILD_TYPE.
 # Mirrors backend/go/parakeet-cpp/package.sh; run.sh routes the (CGO_ENABLED=0)
 # binary through lib/ld.so so the packaged libc is used instead of the host's.
 set -e
 CURDIR=$(dirname "$(realpath "$0")")
 REPO_ROOT="${CURDIR}/../../.."
 # vd_copy_core_libs LOADER LIBDIR
 # Copy the dynamic loader LOADER to lib/ld.so and the core runtime libs that
 # libvoicedetect.so links against from LIBDIR into lib/. Symlinks are
 # dereferenced (-L) so the real files end up in the package.
 vd_copy_core_libs() {
    local loader="$1" libdir="$2" lib
    cp -arfLv "$loader" "$CURDIR/package/lib/ld.so"
    for lib in libc.so.6 libgcc_s.so.1 libstdc++.so.6 libm.so.6 libgomp.so.1 libdl.so.2 librt.so.1 libpthread.so.0; do
       cp -arfLv "$libdir/$lib" "$CURDIR/package/lib/$lib"
    done
 }
 mkdir -p "$CURDIR/package/lib"
 cp -avf "$CURDIR/voice-detect-grpc" "$CURDIR/package/"
 cp -avf "$CURDIR/run.sh" "$CURDIR/package/"
 # libvoicedetect.so + any soname symlinks. purego.Dlopen resolves it via
 # LD_LIBRARY_PATH, which run.sh points at lib/.
 if ! cp -avf "$CURDIR"/libvoicedetect.so* "$CURDIR/package/lib/" 2>/dev/null; then
    echo "ERROR: libvoicedetect.so not found in $CURDIR, run 'make' first" >&2
    exit 1
 fi
 # Pick the architecture from whichever dynamic loader exists on the build host.
 if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
    echo "Detected x86_64 architecture, copying x86_64 libraries..."
    vd_copy_core_libs /lib64/ld-linux-x86-64.so.2 /lib/x86_64-linux-gnu
 elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
    echo "Detected ARM64 architecture, copying ARM64 libraries..."
    vd_copy_core_libs /lib/ld-linux-aarch64.so.1 /lib/aarch64-linux-gnu
 elif [ "$(uname -s)" = "Darwin" ]; then
    # Nothing to bundle on Darwin: no loader or core libs are copied.
    echo "Detected Darwin"
 else
    echo "Error: Could not detect architecture"
    exit 1
 fi
 # Package GPU libraries (CUDA/ROCm/Intel/Vulkan loader + ICDs + drivers) based on
 # BUILD_TYPE so the backend can reach the GPU without the runtime base image
 # shipping those drivers. The helper script is optional and skipped if absent.
 GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
 if [ -f "$GPU_LIB_SCRIPT" ]; then
    echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
    source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib"
    package_gpu_libs
 fi
 echo "Packaging completed successfully"
 ls -liah "$CURDIR/package/" "$CURDIR/package/lib/"
--- a/backend/go/voice-detect/run.sh
+++ b/backend/go/voice-detect/run.sh
@@ -0,0 +1,16 @@
 #!/bin/bash
 # Launcher for the packaged voice-detect backend. Puts lib/ and the package
 # directory itself on LD_LIBRARY_PATH, then replaces this shell with the
 # voice-detect-grpc binary, forwarding all arguments.
 set -e
 CURDIR=$(dirname "$(realpath "$0")")
 export LD_LIBRARY_PATH="$CURDIR/lib:$CURDIR:${LD_LIBRARY_PATH:-}"
 # Without a packaged loader, run the binary directly against the host's libc.
 if [ ! -f "$CURDIR/lib/ld.so" ]; then
 	exec "$CURDIR/voice-detect-grpc" "$@"
 fi
 # A self-contained ld.so was packaged: route through it so the packaged
 # libc / libstdc++ are used instead of the host's (matches the whisper /
 # parakeet backends' runtime layout).
 echo "Using lib/ld.so"
 exec "$CURDIR/lib/ld.so" "$CURDIR/voice-detect-grpc" "$@"
--- a/backend/go/voice-detect/test.sh
+++ b/backend/go/voice-detect/test.sh
@@ -0,0 +1,14 @@
 #!/bin/bash
 # Run the voice-detect backend's Go test suite from the backend directory.
 set -e
 CURDIR=$(dirname "$(realpath "$0")")
 cd "$CURDIR"
 echo "Running voice-detect backend tests..."
 # The pure-Go parsing specs always run. The embed/verify end-to-end specs run
 # only when a model + WAV are provided via VOICEDETECT_BACKEND_TEST_MODEL and
 # VOICEDETECT_BACKEND_TEST_WAV; otherwise they auto-skip. The backend directory
 # goes on LD_LIBRARY_PATH so a locally built libvoicedetect.so can be found.
 export LD_LIBRARY_PATH="$CURDIR:${LD_LIBRARY_PATH:-}"
 go test -v -timeout 1200s .
 echo "voice-detect tests completed."
--- a/backend/go/whisper/Makefile
+++ b/backend/go/whisper/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp
-WHISPER_CPP_VERSION?=86c40c3bd6fc86f1187fb751d111b49e0fc18e84
+WHISPER_CPP_VERSION?=5ed76e9a079962f1c85cfce44edd325c27ef1f97
 SO_TARGET?=libgowhisper.so
 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
--- a/backend/index.yaml
+++ b/backend/index.yaml
@@ -178,6 +178,109 @@
    nvidia-cuda-12: "cuda12-parakeet-cpp"
    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-parakeet-cpp"
    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-parakeet-cpp"
 - &ced
  name: "ced"
  alias: "ced"
  license: mit
  icon: https://avatars.githubusercontent.com/u/95302084
  description: |
    CED sound-event classification / audio tagging (527-class AudioSet).
    ced.cpp is a C++/ggml port that performs audio tagging over the AudioSet
    taxonomy, exposed through the SoundDetection gRPC rpc and the
    /v1/audio/classification REST endpoint. It runs on CPU, NVIDIA CUDA,
    AMD ROCm/HIP, Intel SYCL, Vulkan and NVIDIA Jetson (L4T) targets.
  urls:
    - https://github.com/mudler/ced.cpp
  tags:
    - audio-classification
    - CPU
    - GPU
    - CUDA
    - HIP
  capabilities:
    default: "cpu-ced"
    nvidia: "cuda12-ced"
    intel: "intel-sycl-f16-ced"
    metal: "metal-ced"
    amd: "rocm-ced"
    vulkan: "vulkan-ced"
    nvidia-l4t: "nvidia-l4t-arm64-ced"
    nvidia-cuda-13: "cuda13-ced"
    nvidia-cuda-12: "cuda12-ced"
    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-ced"
    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-ced"
 - &voicedetect
  name: "voice-detect"
  alias: "voice-detect"
  license: mit
  icon: https://avatars.githubusercontent.com/u/95302084
  description: |
    voice-detect speaker recognition and voice analysis.
    voice-detect.cpp is a C++/ggml engine that produces L2-normalised
    speaker embeddings (ECAPA-TDNN, WeSpeaker ResNet34, 3D-Speaker
    ERes2Net, CAM++) for voice verification and 1:N identification, plus
    a wav2vec2 age / gender / emotion analysis head. It replaces the
    Python speaker-recognition backend and is exposed through the Voice*
    gRPC rpcs and the /v1/voice/* REST endpoints. It runs on CPU, NVIDIA
    CUDA, AMD ROCm/HIP, Intel SYCL, Vulkan and NVIDIA Jetson (L4T) targets.
  urls:
    - https://github.com/mudler/voice-detect.cpp
  tags:
    - voice-recognition
    - speaker-verification
    - speaker-embedding
    - CPU
    - GPU
    - CUDA
    - HIP
  capabilities:
    default: "cpu-voice-detect"
    nvidia: "cuda12-voice-detect"
    intel: "intel-sycl-f16-voice-detect"
    metal: "metal-voice-detect"
    amd: "rocm-voice-detect"
    vulkan: "vulkan-voice-detect"
    nvidia-l4t: "nvidia-l4t-arm64-voice-detect"
    nvidia-cuda-13: "cuda13-voice-detect"
    nvidia-cuda-12: "cuda12-voice-detect"
    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-voice-detect"
    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-voice-detect"
 - &facedetect
  name: "face-detect"
  alias: "face-detect"
  license: mit
  icon: https://avatars.githubusercontent.com/u/95302084
  description: |
    face-detect face detection, embedding, verification and analysis.
    face-detect.cpp is a C++/ggml engine that runs SCRFD / YuNet face
    detection and ArcFace / SFace 512-d (or 128-d) L2-normalised face
    embeddings for verification and 1:N identification, plus a landmark /
    age / gender analysis head. It replaces the Python insightface backend
    and is exposed through the Embedding, Detect and Face* gRPC rpcs and
    the /v1/face/* REST endpoints. It runs on CPU, NVIDIA CUDA, AMD
    ROCm/HIP, Intel SYCL, Vulkan and NVIDIA Jetson (L4T) targets.
  urls:
    - https://github.com/mudler/face-detect.cpp
  tags:
    - face-recognition
    - face-verification
    - face-embedding
    - CPU
    - GPU
    - CUDA
    - HIP
  capabilities:
    default: "cpu-face-detect"
    nvidia: "cuda12-face-detect"
    intel: "intel-sycl-f16-face-detect"
    metal: "metal-face-detect"
    amd: "rocm-face-detect"
    vulkan: "vulkan-face-detect"
    nvidia-l4t: "nvidia-l4t-arm64-face-detect"
    nvidia-cuda-13: "cuda13-face-detect"
    nvidia-cuda-12: "cuda12-face-detect"
    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-face-detect"
    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-face-detect"
 - &voxtral
  name: "voxtral"
  alias: "voxtral"
@@ -2650,6 +2753,351 @@
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-parakeet-cpp"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-13-parakeet-cpp
 ## ced
 # Development meta entry for ced: merges the `ced` anchor and overrides the
 # name and the capabilities map so every key names a "-development" entry.
 - !!merge <<: *ced
  name: "ced-development"
  capabilities:
    default: "cpu-ced-development"
    nvidia: "cuda12-ced-development"
    intel: "intel-sycl-f16-ced-development"
    metal: "metal-ced-development"
    amd: "rocm-ced-development"
    vulkan: "vulkan-ced-development"
    nvidia-l4t: "nvidia-l4t-arm64-ced-development"
    nvidia-cuda-13: "cuda13-ced-development"
    nvidia-cuda-12: "cuda12-ced-development"
    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-ced-development"
    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-ced-development"
 - !!merge <<: *ced
  name: "nvidia-l4t-arm64-ced"
  uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-arm64-ced"
  mirrors:
    - localai/localai-backends:latest-nvidia-l4t-arm64-ced
 - !!merge <<: *ced
  name: "nvidia-l4t-arm64-ced-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-arm64-ced"
  mirrors:
    - localai/localai-backends:master-nvidia-l4t-arm64-ced
 - !!merge <<: *ced
  name: "cuda13-nvidia-l4t-arm64-ced"
  uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-ced"
  mirrors:
    - localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-ced
 - !!merge <<: *ced
  name: "cuda13-nvidia-l4t-arm64-ced-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-ced"
  mirrors:
    - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-ced
 - !!merge <<: *ced
  name: "cpu-ced"
  uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-ced"
  mirrors:
    - localai/localai-backends:latest-cpu-ced
 - !!merge <<: *ced
  name: "cpu-ced-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-cpu-ced"
  mirrors:
    - localai/localai-backends:master-cpu-ced
 - !!merge <<: *ced
  name: "metal-ced"
  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-ced"
  mirrors:
    - localai/localai-backends:latest-metal-darwin-arm64-ced
 - !!merge <<: *ced
  name: "metal-ced-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-ced"
  mirrors:
    - localai/localai-backends:master-metal-darwin-arm64-ced
 - !!merge <<: *ced
  name: "cuda12-ced"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-ced"
  mirrors:
    - localai/localai-backends:latest-gpu-nvidia-cuda-12-ced
 - !!merge <<: *ced
  name: "cuda12-ced-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-ced"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-12-ced
 - !!merge <<: *ced
  name: "rocm-ced"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-ced"
  mirrors:
    - localai/localai-backends:latest-gpu-rocm-hipblas-ced
 - !!merge <<: *ced
  name: "rocm-ced-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-ced"
  mirrors:
    - localai/localai-backends:master-gpu-rocm-hipblas-ced
 - !!merge <<: *ced
  name: "intel-sycl-f32-ced"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-ced"
  mirrors:
    - localai/localai-backends:latest-gpu-intel-sycl-f32-ced
 - !!merge <<: *ced
  name: "intel-sycl-f32-ced-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f32-ced"
  mirrors:
    - localai/localai-backends:master-gpu-intel-sycl-f32-ced
 - !!merge <<: *ced
  name: "intel-sycl-f16-ced"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-ced"
  mirrors:
    - localai/localai-backends:latest-gpu-intel-sycl-f16-ced
 - !!merge <<: *ced
  name: "intel-sycl-f16-ced-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f16-ced"
  mirrors:
    - localai/localai-backends:master-gpu-intel-sycl-f16-ced
 - !!merge <<: *ced
  name: "vulkan-ced"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-vulkan-ced"
  mirrors:
    - localai/localai-backends:latest-gpu-vulkan-ced
 - !!merge <<: *ced
  name: "vulkan-ced-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-vulkan-ced"
  mirrors:
    - localai/localai-backends:master-gpu-vulkan-ced
 - !!merge <<: *ced
  name: "cuda13-ced"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-ced"
  mirrors:
    - localai/localai-backends:latest-gpu-nvidia-cuda-13-ced
 - !!merge <<: *ced
  name: "cuda13-ced-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-ced"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-13-ced
 ## voice-detect
 # Development meta entry for voice-detect: merges the `voicedetect` anchor and
 # overrides the name and the capabilities map so every key names a
 # "-development" entry.
 - !!merge <<: *voicedetect
  name: "voice-detect-development"
  capabilities:
    default: "cpu-voice-detect-development"
    nvidia: "cuda12-voice-detect-development"
    intel: "intel-sycl-f16-voice-detect-development"
    metal: "metal-voice-detect-development"
    amd: "rocm-voice-detect-development"
    vulkan: "vulkan-voice-detect-development"
    nvidia-l4t: "nvidia-l4t-arm64-voice-detect-development"
    nvidia-cuda-13: "cuda13-voice-detect-development"
    nvidia-cuda-12: "cuda12-voice-detect-development"
    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-voice-detect-development"
    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-voice-detect-development"
 - !!merge <<: *voicedetect
  name: "nvidia-l4t-arm64-voice-detect"
  uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-arm64-voice-detect"
  mirrors:
    - localai/localai-backends:latest-nvidia-l4t-arm64-voice-detect
 - !!merge <<: *voicedetect
  name: "nvidia-l4t-arm64-voice-detect-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-arm64-voice-detect"
  mirrors:
    - localai/localai-backends:master-nvidia-l4t-arm64-voice-detect
 - !!merge <<: *voicedetect
  name: "cuda13-nvidia-l4t-arm64-voice-detect"
  uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-voice-detect"
  mirrors:
    - localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-voice-detect
 - !!merge <<: *voicedetect
  name: "cuda13-nvidia-l4t-arm64-voice-detect-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-voice-detect"
  mirrors:
    - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-voice-detect
 - !!merge <<: *voicedetect
  name: "cpu-voice-detect"
  uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-voice-detect"
  mirrors:
    - localai/localai-backends:latest-cpu-voice-detect
 - !!merge <<: *voicedetect
  name: "cpu-voice-detect-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-cpu-voice-detect"
  mirrors:
    - localai/localai-backends:master-cpu-voice-detect
 - !!merge <<: *voicedetect
  name: "metal-voice-detect"
  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-voice-detect"
  mirrors:
    - localai/localai-backends:latest-metal-darwin-arm64-voice-detect
 - !!merge <<: *voicedetect
  name: "metal-voice-detect-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-voice-detect"
  mirrors:
    - localai/localai-backends:master-metal-darwin-arm64-voice-detect
 - !!merge <<: *voicedetect
  name: "cuda12-voice-detect"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-voice-detect"
  mirrors:
    - localai/localai-backends:latest-gpu-nvidia-cuda-12-voice-detect
 - !!merge <<: *voicedetect
  name: "cuda12-voice-detect-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-voice-detect"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-12-voice-detect
 - !!merge <<: *voicedetect
  name: "rocm-voice-detect"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-voice-detect"
  mirrors:
    - localai/localai-backends:latest-gpu-rocm-hipblas-voice-detect
 - !!merge <<: *voicedetect
  name: "rocm-voice-detect-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-voice-detect"
  mirrors:
    - localai/localai-backends:master-gpu-rocm-hipblas-voice-detect
 - !!merge <<: *voicedetect
  name: "intel-sycl-f32-voice-detect"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-voice-detect"
  mirrors:
    - localai/localai-backends:latest-gpu-intel-sycl-f32-voice-detect
 - !!merge <<: *voicedetect
  name: "intel-sycl-f32-voice-detect-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f32-voice-detect"
  mirrors:
    - localai/localai-backends:master-gpu-intel-sycl-f32-voice-detect
 - !!merge <<: *voicedetect
  name: "intel-sycl-f16-voice-detect"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-voice-detect"
  mirrors:
    - localai/localai-backends:latest-gpu-intel-sycl-f16-voice-detect
 - !!merge <<: *voicedetect
  name: "intel-sycl-f16-voice-detect-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f16-voice-detect"
  mirrors:
    - localai/localai-backends:master-gpu-intel-sycl-f16-voice-detect
 - !!merge <<: *voicedetect
  name: "vulkan-voice-detect"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-vulkan-voice-detect"
  mirrors:
    - localai/localai-backends:latest-gpu-vulkan-voice-detect
 - !!merge <<: *voicedetect
  name: "vulkan-voice-detect-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-vulkan-voice-detect"
  mirrors:
    - localai/localai-backends:master-gpu-vulkan-voice-detect
 - !!merge <<: *voicedetect
  name: "cuda13-voice-detect"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-voice-detect"
  mirrors:
    - localai/localai-backends:latest-gpu-nvidia-cuda-13-voice-detect
 - !!merge <<: *voicedetect
  name: "cuda13-voice-detect-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-voice-detect"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-13-voice-detect
 ## face-detect
 # Development meta entry for face-detect: merges the `facedetect` anchor and
 # overrides the name and the capabilities map so every key names a
 # "-development" entry.
 - !!merge <<: *facedetect
  name: "face-detect-development"
  capabilities:
    default: "cpu-face-detect-development"
    nvidia: "cuda12-face-detect-development"
    intel: "intel-sycl-f16-face-detect-development"
    metal: "metal-face-detect-development"
    amd: "rocm-face-detect-development"
    vulkan: "vulkan-face-detect-development"
    nvidia-l4t: "nvidia-l4t-arm64-face-detect-development"
    nvidia-cuda-13: "cuda13-face-detect-development"
    nvidia-cuda-12: "cuda12-face-detect-development"
    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-face-detect-development"
    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-face-detect-development"
 - !!merge <<: *facedetect
  name: "nvidia-l4t-arm64-face-detect"
  uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-arm64-face-detect"
  mirrors:
    - localai/localai-backends:latest-nvidia-l4t-arm64-face-detect
 - !!merge <<: *facedetect
  name: "nvidia-l4t-arm64-face-detect-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-arm64-face-detect"
  mirrors:
    - localai/localai-backends:master-nvidia-l4t-arm64-face-detect
 - !!merge <<: *facedetect
  name: "cuda13-nvidia-l4t-arm64-face-detect"
  uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-face-detect"
  mirrors:
    - localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-face-detect
 - !!merge <<: *facedetect
  name: "cuda13-nvidia-l4t-arm64-face-detect-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-face-detect"
  mirrors:
    - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-face-detect
 - !!merge <<: *facedetect
  name: "cpu-face-detect"
  uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-face-detect"
  mirrors:
    - localai/localai-backends:latest-cpu-face-detect
 - !!merge <<: *facedetect
  name: "cpu-face-detect-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-cpu-face-detect"
  mirrors:
    - localai/localai-backends:master-cpu-face-detect
 - !!merge <<: *facedetect
  name: "metal-face-detect"
  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-face-detect"
  mirrors:
    - localai/localai-backends:latest-metal-darwin-arm64-face-detect
 - !!merge <<: *facedetect
  name: "metal-face-detect-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-face-detect"
  mirrors:
    - localai/localai-backends:master-metal-darwin-arm64-face-detect
 - !!merge <<: *facedetect
  name: "cuda12-face-detect"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-face-detect"
  mirrors:
    - localai/localai-backends:latest-gpu-nvidia-cuda-12-face-detect
 - !!merge <<: *facedetect
  name: "cuda12-face-detect-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-face-detect"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-12-face-detect
 - !!merge <<: *facedetect
  name: "rocm-face-detect"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-face-detect"
  mirrors:
    - localai/localai-backends:latest-gpu-rocm-hipblas-face-detect
 - !!merge <<: *facedetect
  name: "rocm-face-detect-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-face-detect"
  mirrors:
    - localai/localai-backends:master-gpu-rocm-hipblas-face-detect
 - !!merge <<: *facedetect
  name: "intel-sycl-f32-face-detect"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-face-detect"
  mirrors:
    - localai/localai-backends:latest-gpu-intel-sycl-f32-face-detect
 - !!merge <<: *facedetect
  name: "intel-sycl-f32-face-detect-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f32-face-detect"
  mirrors:
    - localai/localai-backends:master-gpu-intel-sycl-f32-face-detect
 - !!merge <<: *facedetect
  name: "intel-sycl-f16-face-detect"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-face-detect"
  mirrors:
    - localai/localai-backends:latest-gpu-intel-sycl-f16-face-detect
 - !!merge <<: *facedetect
  name: "intel-sycl-f16-face-detect-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f16-face-detect"
  mirrors:
    - localai/localai-backends:master-gpu-intel-sycl-f16-face-detect
 - !!merge <<: *facedetect
  name: "vulkan-face-detect"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-vulkan-face-detect"
  mirrors:
    - localai/localai-backends:latest-gpu-vulkan-face-detect
 - !!merge <<: *facedetect
  name: "vulkan-face-detect-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-vulkan-face-detect"
  mirrors:
    - localai/localai-backends:master-gpu-vulkan-face-detect
 - !!merge <<: *facedetect
  name: "cuda13-face-detect"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-face-detect"
  mirrors:
    - localai/localai-backends:latest-gpu-nvidia-cuda-13-face-detect
 - !!merge <<: *facedetect
  name: "cuda13-face-detect-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-face-detect"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-13-face-detect
 ## stablediffusion-ggml
 - !!merge <<: *stablediffusionggml
  name: "cpu-stablediffusion-ggml"
--- a/backend/python/nemo/backend.py
+++ b/backend/python/nemo/backend.py
@@ -84,6 +84,135 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        return backend_pb2.Result(message="Model loaded successfully", success=True)
    def _get_stride_seconds(self):
        """Compute the seconds-per-frame stride for the loaded model.
        stride = preprocessor_window_stride * encoder_subsampling_factor
        """
        try:
            preprocessor = self.model.preprocessor
            window_stride = preprocessor._cfg.get('window_stride', 0.01)
            subsampling_factor = getattr(self.model.encoder, 'subsampling_factor', 8)
            return window_stride * subsampling_factor
        except (AttributeError, KeyError, TypeError) as err:
            print(
                f"Warning: could not compute stride from model config ({err}), "
                f"falling back to 0.08s/frame",
                file=sys.stderr,
            )
            return 0.08
    def _build_segments_with_words(self, hypothesis, stride, timestamp_granularities=None):
        """Build TranscriptSegment list from a NeMo Hypothesis with timestamps.
        Supports two granularity modes:
          - "word": one TranscriptSegment per word, each with a single TranscriptWord entry
          - "segment" (default): merge consecutive words into sentence-level segments,
            splitting at word-level time gaps that exceed a dynamic threshold.
        """
        if not hypothesis or not isinstance(hypothesis.timestamp, dict):
            return []
        word_offsets = hypothesis.timestamp.get('word', [])
        if not word_offsets:
            return []
        granularities = list(timestamp_granularities) if timestamp_granularities else []
        granularity = "word" if "word" in granularities else "segment"
        # Build a flat list of (text, start_ns, end_ns) from NeMo word offsets
        transcript_words = []
        for wo in word_offsets:
            word_text = wo.get('word', '')
            if not word_text:
                continue
            start_offset = wo.get('start_offset', 0)
            end_offset = wo.get('end_offset', start_offset)
            start_ns = int(start_offset * stride * 1_000_000_000)
            end_ns = int(end_offset * stride * 1_000_000_000)
            transcript_words.append({
                'text': word_text,
                'start': start_ns,
                'end': end_ns,
            })
        if not transcript_words:
            return []
        if granularity == "word":
            # One segment per word
            result = []
            for idx, tw in enumerate(transcript_words):
                word = backend_pb2.TranscriptWord(
                    start=tw['start'], end=tw['end'], text=tw['text']
                )
                result.append(backend_pb2.TranscriptSegment(
                    id=idx,
                    start=tw['start'],
                    end=tw['end'],
                    text=tw['text'],
                    words=[word],
                ))
            return result
        # segment mode — merge at word-level time-gap boundaries
        # Compute gap threshold: median inter-word gap * 3, clamped to [0.3, 2.0]s
        gaps = []
        for i in range(1, len(transcript_words)):
            gap = (transcript_words[i]['start'] - transcript_words[i - 1]['end']) / 1_000_000_000
            if gap > 0:
                gaps.append(gap)
        if gaps:
            gaps.sort()
            median_gap = gaps[len(gaps) // 2]
            threshold_ns = int(max(0.3, min(median_gap * 3, 2.0)) * 1_000_000_000)
        else:
            threshold_ns = int(0.5 * 1_000_000_000)
        result = []
        buf_words = []  # list of TranscriptWord protobuf
        buf_start = None
        buf_end = 0
        buf_text = []
        prev_end = None
        for tw in transcript_words:
            # Detect word-level time gap
            if prev_end is not None and (tw['start'] - prev_end) >= threshold_ns and buf_text:
                seg_text = ' '.join(buf_text)
                result.append(backend_pb2.TranscriptSegment(
                    id=len(result),
                    start=buf_start,
                    end=buf_end,
                    text=seg_text,
                    words=list(buf_words),
                ))
                buf_words = []
                buf_text = []
                buf_start = None
            if buf_start is None:
                buf_start = tw['start']
            buf_end = tw['end']
            buf_text.append(tw['text'])
            buf_words.append(backend_pb2.TranscriptWord(
                start=tw['start'], end=tw['end'], text=tw['text']
            ))
            prev_end = tw['end']
        # flush remaining
        if buf_text and buf_start is not None:
            seg_text = ' '.join(buf_text)
            result.append(backend_pb2.TranscriptSegment(
                id=len(result),
                start=buf_start,
                end=buf_end,
                text=seg_text,
                words=list(buf_words),
            ))
        return result
    def AudioTranscription(self, request, context):
        result_segments = []
        text = ""
@@ -93,26 +222,67 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                print(f"Error: Audio file not found: {audio_path}", file=sys.stderr)
                return backend_pb2.TranscriptResult(segments=[], text="")
-            # NEMO's transcribe method accepts a list of audio paths and returns a list of transcripts
+            # Determine requested timestamp granularity
-            results = self.model.transcribe([audio_path])
+            timestamp_granularities = list(request.timestamp_granularities) if request.timestamp_granularities else []
            want_timestamps = bool(timestamp_granularities)
-            if not results or len(results) == 0:
+            if want_timestamps:
-                return backend_pb2.TranscriptResult(segments=[], text="")
+                # Request timestamps from NeMo.
                # timestamps=True forces NeMo to return Hypothesis objects with
                # the timestamp dict populated, so we omit return_hypotheses to
                # let NeMo choose the correct return type.
                results = self.model.transcribe([audio_path], timestamps=True)
-            # Get the transcript text from the first result.
+                if results and len(results) > 0:
-            # CTC models return List[str], TDT/RNNT models return List[Hypothesis]
+                    hypotheses = results[0] if isinstance(results[0], list) else results
-            # where the actual text lives in Hypothesis.text.
+                    if hypotheses and len(hypotheses) > 0:
-            result = results[0]
+                        hypothesis = hypotheses[0]
-            if isinstance(result, str):
+
-                text = result
+                        # Hypothesis object should have .timestamp populated
                        if not hasattr(hypothesis, 'timestamp') or not isinstance(hypothesis.timestamp, dict):
                            print(
                                "Warning: timestamps were requested but NeMo did not return "
                                "Hypothesis objects; falling back to untimestamped output",
                                file=sys.stderr,
                            )
                        # Extract text
                        if hasattr(hypothesis, 'text'):
                            text = hypothesis.text or ""
                        elif isinstance(hypothesis, str):
                            text = hypothesis
                        # Build segments with word-level timestamps
                        stride = self._get_stride_seconds()
                        result_segments = self._build_segments_with_words(
                            hypothesis, stride, timestamp_granularities
                        )
                        # If no word offsets but we have text, fall back to single segment
                        if not result_segments and text:
                            result_segments.append(backend_pb2.TranscriptSegment(
                                id=0, start=0, end=0, text=text
                            ))
            else:
-                text = getattr(result, 'text', None) or ""
+                # Simple transcription without timestamps
                # NEMO's transcribe method accepts a list of audio paths and returns a list of transcripts
                results = self.model.transcribe([audio_path])
-            if text:
+                if results and len(results) > 0:
-                # Create a single segment with the full transcription
+                    # Get the transcript text from the first result.
-                result_segments.append(backend_pb2.TranscriptSegment(
+                    # CTC models return List[str], TDT/RNNT models return List[Hypothesis]
-                    id=0, start=0, end=0, text=text
+                    # where the actual text lives in Hypothesis.text.
-                ))
+                    result = results[0]
                    if isinstance(result, str):
                        text = result
                    else:
                        text = getattr(result, 'text', None) or ""
                    if text:
                        # Create a single segment with the full transcription
                        result_segments.append(backend_pb2.TranscriptSegment(
                            id=0, start=0, end=0, text=text
                        ))
        except Exception as err:
            print(f"Error in AudioTranscription: {err}", file=sys.stderr)
--- a/backend/python/trl/backend.py
+++ b/backend/python/trl/backend.py
@@ -309,6 +309,10 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        dataset_split = request.dataset_split or "train"
        if os.path.exists(request.dataset_source):
            _allowed_dir = os.path.realpath(os.path.abspath(os.environ.get("LOCALAI_DATASET_DIR", os.getcwd())))
            _real_path = os.path.realpath(os.path.abspath(request.dataset_source))
            if not (_real_path == _allowed_dir or _real_path.startswith(_allowed_dir + os.sep)):
                raise ValueError("Dataset source path is outside the allowed directory")
            if request.dataset_source.endswith('.json') or request.dataset_source.endswith('.jsonl'):
                dataset = load_dataset("json", data_files=request.dataset_source, split=dataset_split)
            elif request.dataset_source.endswith('.csv'):
@@ -687,6 +691,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
    def ExportModel(self, request, context):
        export_format = request.export_format or "lora"
        output_path = request.output_path
        _allowed_output_dir = os.path.realpath(os.path.abspath(os.environ.get("LOCALAI_OUTPUT_DIR", os.getcwd())))
        _real_output_path = os.path.realpath(os.path.abspath(output_path))
        if not (_real_output_path == _allowed_output_dir or _real_output_path.startswith(_allowed_output_dir + os.sep)):
            raise ValueError("Output path is outside the allowed directory")
        output_path = _real_output_path
        checkpoint_path = request.checkpoint_path
        # Extract HF token for gated model access
@@ -807,7 +816,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                env = os.environ.copy()
                env["NO_LOCAL_GGUF"] = "1"
                cmd = [sys.executable, convert_script, merge_dir, "--outtype", outtype, "--outfile", gguf_path]
-                conv_result = subprocess.run(cmd, capture_output=True, text=True, timeout=3600, env=env)
+                conv_result = subprocess.run(cmd, capture_output=True, text=True, timeout=3600, env=env, shell=False)  # nosemgrep: python.django.security.injection.command.subprocess-injection.subprocess-injection
                if conv_result.returncode != 0:
                    diag = f"stdout: {conv_result.stdout[-300:]}\nstderr: {conv_result.stderr[-500:]}"
                    return backend_pb2.Result(success=False,
--- a/backend/python/vllm/backend.py
+++ b/backend/python/vllm/backend.py
@@ -48,8 +48,10 @@ try:
 except ImportError:
    HAS_REASONING_PARSERS = False
 # vLLM >= 0.23 renamed GuidedDecodingParams -> StructuredOutputsParams and the
 # SamplingParams field guided_decoding -> structured_outputs.
 try:
-    from vllm.sampling_params import GuidedDecodingParams
+    from vllm.sampling_params import StructuredOutputsParams
    HAS_GUIDED_DECODING = True
 except ImportError:
    HAS_GUIDED_DECODING = False
@@ -536,13 +538,13 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                if value not in (None, 0, [], False):
                    setattr(sampling_params, param_field, value)
-        # Guided decoding: use Grammar field to pass JSON schema or BNF
+        # Structured-output decoding: use Grammar field to pass JSON schema or BNF
        if HAS_GUIDED_DECODING and request.Grammar:
            try:
                json.loads(request.Grammar)  # valid JSON = JSON schema
-                sampling_params.guided_decoding = GuidedDecodingParams(json=request.Grammar)
+                sampling_params.structured_outputs = StructuredOutputsParams(json=request.Grammar)
            except json.JSONDecodeError:
-                sampling_params.guided_decoding = GuidedDecodingParams(grammar=request.Grammar)
+                sampling_params.structured_outputs = StructuredOutputsParams(grammar=request.Grammar)
        # Extract image paths and process images
        prompt = request.Prompt
@@ -596,23 +598,124 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        # Stream the results
        generated_text = ""
        generated_token_ids: list[int] = []
        last_output = None
        # Tool-parsing strategy decision (made once, before the loop):
        #
        # When a tool parser is active, the model's raw tool-call markup
        # (e.g. <tool_call>...) must not be streamed verbatim as delta.content
        # — clients would see the unparsed syntax. Two paths:
        #
        # (A) native streaming via parser.extract_tool_calls_streaming. All
        #     concrete tool parsers shipped with vLLM 0.23+ implement this
        #     (Granite4, Qwen3Coder, DeepSeekV31, Jamba, Ernie45, Hermes,
        #     llama3_json, mistral, …). The parser decides per-delta whether
        #     to emit content or suppress tool-call markup, and emits a
        #     structured DeltaMessage(tool_calls=[...]) when a call is ready.
        # (B) buffer fallback — used only when the parser surprisingly lacks
        #     the streaming method or it raises mid-stream. The post-loop
        #     extract_tool_calls assembles the final chat_delta. Same correctness
        #     guarantee as a non-streaming response, at the cost of a delayed
        #     final chunk.
        has_tool_parser = bool(self.tool_parser_cls and request.Tools)
        tp_instance = None
        tp_request = None
        native_streaming = False
        native_streaming_error = False
        if has_tool_parser:
            try:
                tools_for_parser = json.loads(request.Tools)
            except json.JSONDecodeError:
                tools_for_parser = []
            try:
                tp_instance = self.tool_parser_cls(self.tokenizer, tools=tools_for_parser)
            except TypeError:
                tp_instance = self.tool_parser_cls(self.tokenizer)
            # Build a minimal ChatCompletionRequest so the streaming method
            # sees the tools list. We do not need any other request fields —
            # parsers only read .tools (and sometimes .tool_choice, which we
            # leave at default).
            try:
                from vllm.entrypoints.openai.chat_completion.protocol import (
                    ChatCompletionRequest as _CCR,
                )
                tp_request = _CCR(
                    model="local",
                    messages=[{"role": "user", "content": ""}],
                    tools=tools_for_parser or None,
                )
            except Exception as e:
                print(f"Could not build ChatCompletionRequest for streaming parser: {e}",
                      file=sys.stderr)
                tp_request = None
            native_streaming = (
                tp_request is not None
                and hasattr(tp_instance, "extract_tool_calls_streaming")
            )
        try:
            async for request_output in outputs:
                iteration_text = request_output.outputs[0].text
                last_output = request_output
                if streaming:
                    # Remove text already sent as vllm concatenates the text from previous yields
                    delta_iteration_text = iteration_text.removeprefix(generated_text)
-                    # Send the partial result
+                    new_token_ids = list(request_output.outputs[0].token_ids)
-                    yield backend_pb2.Reply(
+                    delta_token_ids = new_token_ids[len(generated_token_ids):]
                        message=bytes(delta_iteration_text, encoding='utf-8'),
                        chat_deltas=[backend_pb2.ChatDelta(content=delta_iteration_text)],
                    )
-                # Keep track of text generated
+                    if not has_tool_parser:
                        # Plain streaming — unchanged from pre-tool-parser path.
                        yield backend_pb2.Reply(
                            message=bytes(delta_iteration_text, encoding='utf-8'),
                            chat_deltas=[backend_pb2.ChatDelta(content=delta_iteration_text)],
                        )
                    elif native_streaming and not native_streaming_error:
                        # (A) Native vLLM extract_tool_calls_streaming.
                        try:
                            msg = tp_instance.extract_tool_calls_streaming(
                                previous_text=generated_text,
                                current_text=iteration_text,
                                delta_text=delta_iteration_text,
                                previous_token_ids=generated_token_ids,
                                current_token_ids=new_token_ids,
                                delta_token_ids=delta_token_ids,
                                request=tp_request,
                            )
                        except Exception as e:
                            print(f"Streaming tool parser error (falling back to "
                                  f"buffer for the rest of the stream): {e}",
                                  file=sys.stderr)
                            native_streaming_error = True
                            msg = None
                        if msg is not None:
                            tc_protos = []
                            for tc in (msg.tool_calls or []):
                                fn = tc.function or None
                                tc_protos.append(backend_pb2.ToolCallDelta(
                                    index=tc.index,
                                    id=tc.id or "",
                                    name=(fn.name if fn and fn.name else "") or "",
                                    arguments=(fn.arguments if fn and fn.arguments else "") or "",
                                ))
                            cd_kwargs = {}
                            if msg.content:
                                cd_kwargs["content"] = msg.content
                            if msg.reasoning:
                                cd_kwargs["reasoning_content"] = msg.reasoning
                            if tc_protos:
                                cd_kwargs["tool_calls"] = tc_protos
                            if cd_kwargs:
                                yield backend_pb2.Reply(
                                    message=bytes(msg.content or "", encoding='utf-8'),
                                    chat_deltas=[backend_pb2.ChatDelta(**cd_kwargs)],
                                )
                    # (B) buffer fallback — emit nothing during the stream.
                    # The post-loop extract_tool_calls block builds the final chunk.
                # Keep track of text + token_ids generated
                generated_text = iteration_text
                generated_token_ids = list(request_output.outputs[0].token_ids)
        finally:
            await outputs.aclose()
@@ -637,16 +740,19 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
            except Exception as e:
                print(f"Reasoning parser error: {e}", file=sys.stderr)
-        if self.tool_parser_cls and request.Tools:
+        # When (A) native streaming ran cleanly, per-delta yields above already
        # delivered everything — do NOT extract again on the full text or we'd
        # duplicate content/tool_calls into the final chunk.
        if has_tool_parser and not (native_streaming and not native_streaming_error):
            try:
-                tools = json.loads(request.Tools)
+                tp = tp_instance
-                # Some concrete parsers only accept the tokenizer; only the
+                if tp is None:
-                # abstract base declares the tools kwarg. Try with tools first,
+                    # Defensive: tp_instance build failed earlier; reconstruct.
-                # fall back to tokenizer-only.
+                    tools = json.loads(request.Tools)
-                try:
+                    try:
-                    tp = self.tool_parser_cls(self.tokenizer, tools=tools)
+                        tp = self.tool_parser_cls(self.tokenizer, tools=tools)
-                except TypeError:
+                    except TypeError:
-                    tp = self.tool_parser_cls(self.tokenizer)
+                        tp = self.tool_parser_cls(self.tokenizer)
                info = tp.extract_tool_calls(content, request=None)
                if info.tools_called:
                    content = info.content or ""
@@ -659,6 +765,10 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                        ))
            except Exception as e:
                print(f"Tool parser error: {e}", file=sys.stderr)
        elif native_streaming and not native_streaming_error:
            # Per-delta path already emitted content + tool_calls; the final
            # chat_delta should carry only metadata (token counts, logprobs).
            content = ""
        # Extract token counts
        prompt_tokens = 0
@@ -698,7 +808,26 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        )
        if streaming:
-            # Final chunk with structured data
+            # Final chunk with structured data.
            #
            # If we used the buffer fallback (has_tool_parser=True AND native
            # streaming did NOT run cleanly) and the parser found no tool call,
            # flush the buffered content as ONE content delta — and clear the
            # final chat_delta's content so the metadata chunk does not repeat
            # what we just sent. This is the plain-text-with-tool-parser path.
            buffered_fallback = (
                has_tool_parser
                and not (native_streaming and not native_streaming_error)
            )
            if buffered_fallback and not tool_calls_proto and content:
                yield backend_pb2.Reply(
                    message=bytes(content, encoding='utf-8'),
                    chat_deltas=[backend_pb2.ChatDelta(content=content)],
                )
                chat_delta = backend_pb2.ChatDelta(
                    reasoning_content=reasoning_content,
                    tool_calls=tool_calls_proto,
                )
            yield backend_pb2.Reply(
                message=b"",
                prompt_tokens=prompt_tokens,
--- a/backend/python/vllm/test.py
+++ b/backend/python/vllm/test.py
@@ -279,3 +279,260 @@ class TestBackendServicer(unittest.TestCase):
            self.fail("Embedding service failed")
        finally:
            self.tearDown()
 class TestStreamingToolParser(unittest.TestCase):
    """
    Server-less unit tests for the streaming + tool-parser machinery in
    BackendServicer._predict. These tests instantiate BackendServicer
    directly and mock the vLLM engine + tool parser, so they do not need
    a GPU, a model, or a running gRPC server. Kept in a separate class to
    avoid the parent setUp() which spawns a subprocess.
    Covers #582 (follow-up to #10346):
      1. Markup-leak prevention with a non-streaming parser (buffer fallback)
      2. No content duplication on the plain-text path with the buffer fallback
      3. Native streaming progressive plain-text emission
      4. Native streaming structured tool_call, no markup leak
      5. Parser exception → graceful fallback to buffer, still no markup
      6. No-tool-parser regression: unchanged per-delta content stream
    """
    @staticmethod
    def _make_generate(chunks):
        """Build a fake vLLM engine.generate that yields cumulative chunks."""
        from types import SimpleNamespace
        # Each yielded object carries a single entry in .outputs whose text is
        # the chunk as given, whose token_ids list grows by one id per chunk,
        # and whose logprobs is None; prompt_token_ids is always [0].
        async def gen(*a, **k):
            for i, t in enumerate(chunks):
                yield SimpleNamespace(
                    outputs=[SimpleNamespace(
                        text=t,
                        token_ids=list(range(i + 1)),
                        logprobs=None,
                    )],
                    prompt_token_ids=[0],
                )
        # Whatever arguments the caller passes to generate() are ignored.
        return lambda *a, **k: gen()
    @staticmethod
    def _collect(servicer, req):
        # Runs servicer._predict(req, None, streaming=True) to completion via
        # asyncio.run and returns every yielded reply as a list.
        import asyncio
        async def run():
            return [r async for r in servicer._predict(req, None, streaming=True)]
        return asyncio.run(run())
    def _new_servicer(self):
        # Puts this file's directory first on sys.path so that `backend` can be
        # imported, then builds a BackendServicer with both parser classes and
        # the tokenizer set to None. Tests overwrite these attributes as needed.
        import sys, os
        sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
        from backend import BackendServicer
        s = BackendServicer()
        s.reasoning_parser_cls = None
        s.tool_parser_cls = None
        s.tokenizer = None
        return s
    # ── Case 1+2: parser without streaming method → buffer fallback ──
    def test_buffer_path_no_markup_no_duplication(self):
        from types import SimpleNamespace
        # Factory returning a parser class whose extract_tool_calls always
        # answers with the canned (called, content_text, calls) triple.
        def parser_cls(called, content_text, calls):
            class _P:
                def __init__(self, tokenizer, tools=None):
                    pass
                # NOTE: NO extract_tool_calls_streaming → takes the buffer path
                def extract_tool_calls(self, c, request=None):
                    return SimpleNamespace(
                        tools_called=called, content=content_text, tool_calls=calls,
                    )
            return _P
        tools_json = '[{"type":"function","function":{"name":"calc","parameters":{}}}]'
        # Tool-call case: no raw markup in any delta.content
        s = self._new_servicer()
        s.llm = SimpleNamespace(generate=self._make_generate([
            '<tool_call>\n{"name": "calc"',
            '<tool_call>\n{"name": "calc", "arguments": {"x": 1}}\n</tool_call>',
        ]))
        call = SimpleNamespace(id="call_1",
                               function=SimpleNamespace(name="calc", arguments='{"x": 1}'))
        s.tool_parser_cls = parser_cls(True, "", [call])
        req = backend_pb2.PredictOptions(Prompt="x", Tools=tools_json)
        replies = self._collect(s, req)
        # Gather every non-empty content string across all chat deltas.
        contents = [cd.content for r in replies for cd in r.chat_deltas if cd.content]
        self.assertFalse(
            any("<tool_call" in c for c in contents),
            f"markup leaked: {contents!r}",
        )
        names = [tc.name for r in replies for cd in r.chat_deltas for tc in cd.tool_calls]
        self.assertIn("calc", names, "tool_call missing from final chunk")
        # Plain-text-with-tools case: full content delivered exactly once
        s2 = self._new_servicer()
        s2.llm = SimpleNamespace(generate=self._make_generate([
            "The capital ",
            "The capital of France is Paris.",
        ]))
        s2.tool_parser_cls = parser_cls(False, "", [])
        req2 = backend_pb2.PredictOptions(Prompt="x", Tools=tools_json)
        joined = "".join(
            cd.content for r in self._collect(s2, req2)
            for cd in r.chat_deltas if cd.content
        )
        self.assertEqual(
            joined.count("The capital of France is Paris."), 1,
            f"buffered content duplicated: {joined!r}",
        )
    # ── Case 3: native streaming, progressive plain text ──
    def test_native_streaming_progressive_plain_text(self):
        from types import SimpleNamespace
        # Minimal stand-in for a parser delta: exposes only the content,
        # reasoning and tool_calls attributes (tool_calls defaults to []).
        class _DeltaMsg:
            def __init__(self, content=None, reasoning=None, tool_calls=None):
                self.content = content
                self.reasoning = reasoning
                self.tool_calls = tool_calls or []
        class StreamingParser:
            def __init__(self, tokenizer, tools=None):
                pass
            def extract_tool_calls(self, c, request=None):
                # Should NOT be called when native streaming runs successfully.
                raise AssertionError("extract_tool_calls invoked on native-streaming path")
            def extract_tool_calls_streaming(
                self, previous_text, current_text, delta_text,
                previous_token_ids, current_token_ids, delta_token_ids, request,
            ):
                # Echo each non-empty text delta back as plain content.
                if not delta_text:
                    return None
                return _DeltaMsg(content=delta_text)
        s = self._new_servicer()
        s.llm = SimpleNamespace(generate=self._make_generate([
            "Paris ",
            "Paris is ",
            "Paris is the capital of France.",
        ]))
        s.tool_parser_cls = StreamingParser
        req = backend_pb2.PredictOptions(
            Prompt="x",
            Tools='[{"type":"function","function":{"name":"calc","parameters":{}}}]',
        )
        replies = self._collect(s, req)
        # Content seen in every reply except the last one.
        intermediate_content = [
            cd.content for r in replies[:-1] for cd in r.chat_deltas if cd.content
        ]
        self.assertTrue(
            len(intermediate_content) > 0,
            "Plain-text response not streamed progressively (native streaming inactive?)",
        )
        assembled = "".join(
            cd.content for r in replies for cd in r.chat_deltas if cd.content
        )
        self.assertEqual(
            assembled, "Paris is the capital of France.",
            f"Assembled content wrong: {assembled!r}",
        )
    # ── Case 4: native streaming, structured tool_call, no markup ──
    def test_native_streaming_tool_call_no_markup_leak(self):
        from types import SimpleNamespace
        class _DeltaMsg:
            def __init__(self, content=None, reasoning=None, tool_calls=None):
                self.content = content
                self.reasoning = reasoning
                self.tool_calls = tool_calls or []
        class _ToolCallStreamer:
            def __init__(self, tokenizer, tools=None):
                self._emitted = False
            def extract_tool_calls(self, c, request=None):
                raise AssertionError("extract_tool_calls invoked on native-streaming path")
            def extract_tool_calls_streaming(
                self, previous_text, current_text, delta_text,
                previous_token_ids, current_token_ids, delta_token_ids, request,
            ):
                # Returns None until the closing tag appears in current_text,
                # then returns one delta with a single structured tool call;
                # _emitted makes that happen at most once.
                if "</tool_call>" in current_text and not self._emitted:
                    self._emitted = True
                    fn = SimpleNamespace(name="calc", arguments='{"x": 1}')
                    tc = SimpleNamespace(id="call_1", type="function", index=0, function=fn)
                    return _DeltaMsg(tool_calls=[tc])
                return None
        s = self._new_servicer()
        s.llm = SimpleNamespace(generate=self._make_generate([
            '<tool_call>\n',
            '<tool_call>\n{"name": "calc"',
            '<tool_call>\n{"name": "calc", "arguments": {"x": 1}}\n</tool_call>',
        ]))
        s.tool_parser_cls = _ToolCallStreamer
        req = backend_pb2.PredictOptions(
            Prompt="x",
            Tools='[{"type":"function","function":{"name":"calc","parameters":{}}}]',
        )
        replies = self._collect(s, req)
        contents = [cd.content for r in replies for cd in r.chat_deltas if cd.content]
        self.assertFalse(
            any("<tool_call" in c or "</tool_call>" in c for c in contents),
            f"markup leaked as content: {contents!r}",
        )
        # Only non-empty names / arguments are collected from the tool-call deltas.
        names = [tc.name for r in replies for cd in r.chat_deltas for tc in cd.tool_calls if tc.name]
        args  = [tc.arguments for r in replies for cd in r.chat_deltas for tc in cd.tool_calls if tc.arguments]
        self.assertIn("calc", names, f"tool_call name missing; got {names!r}")
        self.assertIn('{"x": 1}', args, f"tool_call args missing; got {args!r}")
    # ── Case 5: parser exception → fallback to buffer, no leak ──
    def test_native_streaming_parser_exception_falls_back_to_buffer(self):
        from types import SimpleNamespace
        call = SimpleNamespace(id="call_1",
                               function=SimpleNamespace(name="calc", arguments='{"x": 1}'))
        # The streaming method always raises, while the non-streaming
        # extract_tool_calls reports one canned "calc" call.
        class _BrokenStreamer:
            def __init__(self, tokenizer, tools=None):
                pass
            def extract_tool_calls(self, c, request=None):
                return SimpleNamespace(tools_called=True, content="", tool_calls=[call])
            def extract_tool_calls_streaming(self, *a, **kw):
                raise RuntimeError("simulated parser bug")
        s = self._new_servicer()
        s.llm = SimpleNamespace(generate=self._make_generate([
            '<tool_call>\n{"name": "calc"',
            '<tool_call>\n{"name": "calc", "arguments": {"x": 1}}\n</tool_call>',
        ]))
        s.tool_parser_cls = _BrokenStreamer
        req = backend_pb2.PredictOptions(
            Prompt="x",
            Tools='[{"type":"function","function":{"name":"calc","parameters":{}}}]',
        )
        replies = self._collect(s, req)
        contents = [cd.content for r in replies for cd in r.chat_deltas if cd.content]
        self.assertFalse(
            any("<tool_call" in c for c in contents),
            f"markup leaked after parser exception: {contents!r}",
        )
        names = [tc.name for r in replies for cd in r.chat_deltas for tc in cd.tool_calls]
        self.assertIn("calc", names, "tool_call missing from final chunk after fallback")
    # ── Case 6: no tool parser → unchanged per-delta content stream ──
    def test_no_tool_parser_unchanged_per_delta_stream(self):
        from types import SimpleNamespace
        s = self._new_servicer()  # tool_parser_cls already None
        s.llm = SimpleNamespace(generate=self._make_generate([
            "Hello ", "Hello world", "Hello world!",
        ]))
        req = backend_pb2.PredictOptions(Prompt="x", Tools="")
        replies = self._collect(s, req)
        # The engine chunks are cumulative; the expected deltas below are the
        # increments between consecutive chunks.
        intermediate = [
            cd.content for r in replies[:-1] for cd in r.chat_deltas if cd.content
        ]
        self.assertEqual(
            intermediate, ["Hello ", "world", "!"],
            f"plain streaming changed; got {intermediate!r}",
        )
--- a/core/application/startup.go
+++ b/core/application/startup.go
@@ -25,6 +25,7 @@ import (
 	"github.com/mudler/LocalAI/core/services/storage"
 	coreStartup "github.com/mudler/LocalAI/core/startup"
 	"github.com/mudler/LocalAI/internal"
 	"github.com/mudler/LocalAI/pkg/downloader"
 	"github.com/mudler/LocalAI/pkg/signals"
 	"github.com/mudler/LocalAI/pkg/vram"
@@ -71,6 +72,16 @@ func New(opts ...config.AppOption) (*Application, error) {
 	if err != nil {
 		return nil, fmt.Errorf("unable to create ModelPath: %q", err)
 	}
 	// Reap *.partial downloads abandoned by a previous run (killed mid-transfer
 	// by an OOM/restart, or stalled before cleanup could run). The 24h window
 	// is well beyond any legitimate in-flight download, so this never trims an
 	// active transfer; it just stops dead partials accumulating on the volume.
 	if removed, cErr := downloader.CleanupStalePartialFiles(options.SystemState.Model.ModelsPath, 24*time.Hour); cErr != nil {
 		xlog.Warn("Failed to reap stale partial downloads", "error", cErr)
 	} else if removed > 0 {
 		xlog.Info("Reaped stale partial downloads", "count", removed)
 	}
 	if options.GeneratedContentDir != "" {
 		err := os.MkdirAll(options.GeneratedContentDir, 0o750)
 		if err != nil {
@@ -633,6 +644,12 @@ func loadRuntimeSettingsFromFile(options *config.ApplicationConfig) {
 			options.ForceEvictionWhenBusy = *settings.ForceEvictionWhenBusy
 		}
 	}
 	if settings.SizeAwareEviction != nil {
 		// Only apply if current value is default (false), suggesting it wasn't set from env var
 		if !options.SizeAwareEviction {
 			options.SizeAwareEviction = *settings.SizeAwareEviction
 		}
 	}
 	if settings.LRUEvictionMaxRetries != nil {
 		// Only apply if current value is default (30), suggesting it wasn't set from env var
 		if options.LRUEvictionMaxRetries == 0 {
@@ -836,6 +853,7 @@ func initializeWatchdog(application *Application, options *config.ApplicationCon
 			model.WithLRULimit(lruLimit),
 			model.WithMemoryReclaimer(options.MemoryReclaimerEnabled, options.MemoryReclaimerThreshold),
 			model.WithForceEvictionWhenBusy(options.ForceEvictionWhenBusy),
 			model.WithSizeAwareEviction(options.SizeAwareEviction),
 		)
 		application.ModelLoader().SetWatchDog(wd)
--- a/core/application/watchdog.go
+++ b/core/application/watchdog.go
@@ -90,6 +90,7 @@ func (a *Application) startWatchdog() error {
 			model.WithLRULimit(lruLimit),
 			model.WithMemoryReclaimer(appConfig.MemoryReclaimerEnabled, appConfig.MemoryReclaimerThreshold),
 			model.WithForceEvictionWhenBusy(appConfig.ForceEvictionWhenBusy),
 			model.WithSizeAwareEviction(appConfig.SizeAwareEviction),
 		)
 		// Create new stop channel BEFORE setting up any goroutines
--- a/core/backend/options.go
+++ b/core/backend/options.go
@@ -1,6 +1,7 @@
 package backend
 import (
 	"context"
 	"encoding/json"
 	"fmt"
 	"math/rand/v2"
@@ -12,7 +13,9 @@ import (
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/trace"
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 	"github.com/mudler/LocalAI/pkg/downloader"
 	"github.com/mudler/LocalAI/pkg/model"
 	"github.com/mudler/LocalAI/pkg/vram"
 	"github.com/mudler/xlog"
 )
@@ -33,6 +36,67 @@ func recordModelLoadFailure(appConfig *config.ApplicationConfig, modelName, back
 	})
 }
 // estimateModelSizeBytes returns the estimated weight size, in bytes, of the
 // model described by c, or 0 when nothing can be estimated.
 //
 // Candidate weight files are gathered from c.DownloadFiles, c.Model and
 // c.MMProj (in that order). Entries that vram.IsWeightFile rejects are
 // ignored, entries without a URI scheme are turned into file:// URIs rooted at
 // modelsPath, and duplicates are dropped. The first HuggingFace repo ID that
 // can be extracted from the download files or from c.Model is passed along as
 // well, so the estimator has a repo to consult when it cannot size the files
 // themselves. The estimation call is bounded by a 10 second timeout.
 func estimateModelSizeBytes(c config.ModelConfig, modelsPath string) int64 {
 	var estimateInput vram.ModelEstimateInput
 	known := map[string]bool{}

 	// registerWeights appends ref to the estimator input once, after
 	// normalising scheme-less paths to file:// URIs under modelsPath.
 	registerWeights := func(ref string) {
 		if !vram.IsWeightFile(ref) {
 			return
 		}
 		target := ref
 		if !strings.Contains(ref, "://") {
 			target = "file://" + filepath.Join(modelsPath, ref)
 		}
 		if !known[target] {
 			known[target] = true
 			estimateInput.Files = append(estimateInput.Files, vram.FileInput{URI: target})
 		}
 	}

 	// captureRepo records the HuggingFace repo ID derived from ref; only the
 	// first successful extraction is kept.
 	captureRepo := func(ref string) {
 		if estimateInput.HFRepo == "" {
 			if id, found := vram.ExtractHFRepoID(downloader.URI(ref).ResolveURL()); found {
 				estimateInput.HFRepo = id
 			}
 		}
 	}

 	for i := range c.DownloadFiles {
 		ref := string(c.DownloadFiles[i].URI)
 		registerWeights(ref)
 		captureRepo(ref)
 	}
 	registerWeights(c.Model)
 	captureRepo(c.Model)
 	if c.MMProj != "" {
 		registerWeights(c.MMProj)
 	}

 	// Nothing to estimate from: neither a weight file nor a repo was found.
 	if estimateInput.HFRepo == "" && len(estimateInput.Files) == 0 {
 		return 0
 	}

 	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
 	defer cancel()

 	estimate, err := vram.EstimateModelMultiContext(ctx, estimateInput, nil)
 	if err == nil && estimate.SizeBytes != 0 {
 		return int64(estimate.SizeBytes)
 	}
 	return 0
 }
 func ModelOptions(c config.ModelConfig, so *config.ApplicationConfig, opts ...model.Option) []model.Option {
 	defOpts := []model.Option{
 		model.WithBackendString(c.Backend),
@@ -70,6 +134,10 @@ func ModelOptions(c config.ModelConfig, so *config.ApplicationConfig, opts ...mo
 		defOpts = append(defOpts, model.WithExternalBackend(k, v))
 	}
 	if sizeBytes := estimateModelSizeBytes(c, so.SystemState.Model.ModelsPath); sizeBytes > 0 {
 		defOpts = append(defOpts, model.WithModelSizeBytes(sizeBytes))
 	}
 	return append(defOpts, opts...)
 }
@@ -90,10 +158,11 @@ func getSeed(c config.ModelConfig) int32 {
 // DefaultContextSize and DefaultBatchSize are the backend's fallbacks when a
 // model config leaves them unset. Exported so callers that must respect the
 // effective decode window — notably the router's prompt trimmer — resolve the
-// same numbers grpcModelOpts does instead of guessing.
+// same numbers grpcModelOpts does instead of guessing. The values are owned by
 // core/config (single source of truth shared with the config default tiers).
 const (
-	DefaultContextSize = 4096
+	DefaultContextSize = config.DefaultContextSize
-	DefaultBatchSize   = 512
+	DefaultBatchSize   = config.DefaultPhysicalBatch
 )
 // EffectiveContextSize is the context window the backend will run with: the
@@ -129,7 +198,7 @@ func grpcModelOpts(c config.ModelConfig, modelPath string) *pb.ModelOptions {
 	ctxSize := EffectiveContextSize(c)
 	b := EffectiveBatchSize(c)
-	flashAttention := "auto"
+	flashAttention := config.DefaultFlashAttention
 	if c.FlashAttention != nil {
 		flashAttention = *c.FlashAttention
@@ -175,7 +244,7 @@ func grpcModelOpts(c config.ModelConfig, modelPath string) *pb.ModelOptions {
 		mmlock = *c.MMlock
 	}
-	nGPULayers := 9999999
+	nGPULayers := config.DefaultNGPULayers
 	if c.NGPULayers != nil {
 		nGPULayers = *c.NGPULayers
 	}
--- a/core/backend/sound_classification.go
+++ b/core/backend/sound_classification.go
@@ -0,0 +1,88 @@
 package backend
 import (
 	"context"
 	"fmt"
 	"sort"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/schema"
 	grpcPkg "github.com/mudler/LocalAI/pkg/grpc"
 	"github.com/mudler/LocalAI/pkg/grpc/proto"
 	"github.com/mudler/LocalAI/pkg/model"
 )
 // SoundDetectionRequest carries the knobs the HTTP layer collects for an
 // audio-tagging / sound-event-classification call. Audio is the path to the
 // uploaded clip on disk; TopK and Threshold are optional (0 = backend default).
 type SoundDetectionRequest struct {
 	// Audio is forwarded to the backend as the proto request's Src field.
 	Audio     string
 	// TopK is forwarded unchanged to the proto request's TopK field.
 	TopK      int32
 	// Threshold is forwarded unchanged to the proto request's Threshold field.
 	Threshold float32
 }
 // toProto builds the gRPC request message for r: Audio becomes Src, while
 // TopK and Threshold are copied as-is.
 func (r *SoundDetectionRequest) toProto() *proto.SoundDetectionRequest {
 	msg := new(proto.SoundDetectionRequest)
 	msg.Src = r.Audio
 	msg.TopK = r.TopK
 	msg.Threshold = r.Threshold
 	return msg
 }
 // loadSoundDetectionModel loads the backend for modelConfig through ml.
 //
 // It fails fast when the model config names no backend. A loader error is
 // recorded via recordModelLoadFailure and returned unchanged; a nil backend
 // without an error is reported as a generic load failure.
 func loadSoundDetectionModel(ml *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig) (grpcPkg.Backend, error) {
 	if modelConfig.Backend == "" {
 		return nil, fmt.Errorf("sound classification: model %q has no backend set; supported backends include ced", modelConfig.Name)
 	}

 	backendClient, loadErr := ml.Load(ModelOptions(modelConfig, appConfig)...)
 	switch {
 	case loadErr != nil:
 		recordModelLoadFailure(appConfig, modelConfig.Name, modelConfig.Backend, loadErr, nil)
 		return nil, loadErr
 	case backendClient == nil:
 		return nil, fmt.Errorf("could not load sound classification model")
 	}
 	return backendClient, nil
 }
 // ModelSoundDetection loads the backend configured for modelConfig, issues the
 // SoundDetection RPC with req, and converts the response into a
 // schema.SoundClassificationResult tagged with the model's name. Load and RPC
 // errors are returned to the caller as-is.
 func ModelSoundDetection(ctx context.Context, req SoundDetectionRequest, ml *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig) (*schema.SoundClassificationResult, error) {
 	backendClient, err := loadSoundDetectionModel(ml, modelConfig, appConfig)
 	if err != nil {
 		return nil, err
 	}

 	resp, err := backendClient.SoundDetection(ctx, req.toProto())
 	if err != nil {
 		return nil, err
 	}

 	result := soundClassificationResultFromProto(modelConfig.Name, resp)
 	return result, nil
 }
 // soundClassificationResultFromProto converts a backend response into the
 // HTTP-facing schema. Nil detections are skipped and the remaining entries are
 // stable-sorted by descending score, so equal scores keep the order the
 // backend sent them in. A nil response yields a result with an empty,
 // non-nil Detections slice.
 func soundClassificationResultFromProto(modelName string, r *proto.SoundDetectionResponse) *schema.SoundClassificationResult {
 	result := &schema.SoundClassificationResult{
 		Model:      modelName,
 		Detections: []schema.SoundClassification{},
 	}
 	if r != nil {
 		for i := range r.Detections {
 			det := r.Detections[i]
 			if det != nil {
 				entry := schema.SoundClassification{
 					Index: int(det.Index),
 					Label: det.Label,
 					Score: det.Score,
 				}
 				result.Detections = append(result.Detections, entry)
 			}
 		}
 		sort.SliceStable(result.Detections, func(a, b int) bool {
 			return result.Detections[b].Score < result.Detections[a].Score
 		})
 	}
 	return result
 }
--- a/core/cli/run.go
+++ b/core/cli/run.go
@@ -93,6 +93,7 @@ type RunCMD struct {
 	EnableMemoryReclaimer              bool     `env:"LOCALAI_MEMORY_RECLAIMER,MEMORY_RECLAIMER,LOCALAI_GPU_RECLAIMER,GPU_RECLAIMER" default:"false" help:"Enable memory threshold monitoring to auto-evict backends when memory usage exceeds threshold (uses GPU VRAM if available, otherwise RAM)" group:"backends"`
 	MemoryReclaimerThreshold           float64  `env:"LOCALAI_MEMORY_RECLAIMER_THRESHOLD,MEMORY_RECLAIMER_THRESHOLD,LOCALAI_GPU_RECLAIMER_THRESHOLD,GPU_RECLAIMER_THRESHOLD" default:"0.95" help:"Memory usage threshold (0.0-1.0) that triggers backend eviction (default 0.95 = 95%%)" group:"backends"`
 	ForceEvictionWhenBusy              bool     `env:"LOCALAI_FORCE_EVICTION_WHEN_BUSY,FORCE_EVICTION_WHEN_BUSY" default:"false" help:"Force eviction even when models have active API calls (default: false for safety)" group:"backends"`
 	SizeAwareEviction                  bool     `env:"LOCALAI_SIZE_AWARE_EVICTION,SIZE_AWARE_EVICTION" default:"false" help:"Evict the largest loaded model first rather than the least-recently-used one, keeping small utility models resident and maximizing freed memory per eviction" group:"backends"`
 	LRUEvictionMaxRetries              int      `env:"LOCALAI_LRU_EVICTION_MAX_RETRIES,LRU_EVICTION_MAX_RETRIES" default:"30" help:"Maximum number of retries when waiting for busy models to become idle before eviction (default: 30)" group:"backends"`
 	LRUEvictionRetryInterval           string   `env:"LOCALAI_LRU_EVICTION_RETRY_INTERVAL,LRU_EVICTION_RETRY_INTERVAL" default:"1s" help:"Interval between retries when waiting for busy models to become idle (e.g., 1s, 2s) (default: 1s)" group:"backends"`
 	Federated                          bool     `env:"LOCALAI_FEDERATED,FEDERATED" help:"Enable federated instance" group:"federated"`
@@ -564,6 +565,9 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
 	if r.ForceEvictionWhenBusy {
 		opts = append(opts, config.WithForceEvictionWhenBusy(true))
 	}
 	if r.SizeAwareEviction {
 		opts = append(opts, config.WithSizeAwareEviction(true))
 	}
 	if r.LRUEvictionMaxRetries > 0 {
 		opts = append(opts, config.WithLRUEvictionMaxRetries(r.LRUEvictionMaxRetries))
 	}
--- a/core/config/application_config.go
+++ b/core/config/application_config.go
@@ -119,6 +119,7 @@ type ApplicationConfig struct {
 	// Eviction settings
 	ForceEvictionWhenBusy    bool          // Force eviction even when models have active API calls (default: false for safety)
 	SizeAwareEviction        bool          // Evict largest models first rather than least-recently-used (default: false)
 	LRUEvictionMaxRetries    int           // Maximum number of retries when waiting for busy models to become idle (default: 30)
 	LRUEvictionRetryInterval time.Duration // Interval between retries when waiting for busy models (default: 1s)
@@ -488,6 +489,16 @@ func WithForceEvictionWhenBusy(enabled bool) AppOption {
 	}
 }
 // WithSizeAwareEviction returns an AppOption that sets
 // ApplicationConfig.SizeAwareEviction to enabled.
 // With the flag on, the watchdog is meant to evict the largest loaded model
 // first instead of the least-recently-used one, which keeps small utility
 // models resident and frees the most memory per eviction.
 func WithSizeAwareEviction(enabled bool) AppOption {
 	return func(cfg *ApplicationConfig) {
 		cfg.SizeAwareEviction = enabled
 	}
 }
 // WithLRUEvictionMaxRetries sets the maximum number of retries when waiting for busy models to become idle
 func WithLRUEvictionMaxRetries(maxRetries int) AppOption {
 	return func(o *ApplicationConfig) {
@@ -1028,6 +1039,7 @@ func (o *ApplicationConfig) ToRuntimeSettings() RuntimeSettings {
 	memoryReclaimerEnabled := o.MemoryReclaimerEnabled
 	memoryReclaimerThreshold := o.MemoryReclaimerThreshold
 	forceEvictionWhenBusy := o.ForceEvictionWhenBusy
 	sizeAwareEviction := o.SizeAwareEviction
 	lruEvictionMaxRetries := o.LRUEvictionMaxRetries
 	threads := o.Threads
 	contextSize := o.ContextSize
@@ -1120,6 +1132,7 @@ func (o *ApplicationConfig) ToRuntimeSettings() RuntimeSettings {
 		MemoryReclaimerEnabled:    &memoryReclaimerEnabled,
 		MemoryReclaimerThreshold:  &memoryReclaimerThreshold,
 		ForceEvictionWhenBusy:     &forceEvictionWhenBusy,
 		SizeAwareEviction:         &sizeAwareEviction,
 		LRUEvictionMaxRetries:     &lruEvictionMaxRetries,
 		LRUEvictionRetryInterval:  &lruEvictionRetryInterval,
 		Threads:                   &threads,
@@ -1244,6 +1257,10 @@ func (o *ApplicationConfig) ApplyRuntimeSettings(settings *RuntimeSettings) (req
 		o.ForceEvictionWhenBusy = *settings.ForceEvictionWhenBusy
 		// This setting doesn't require restart, can be updated dynamically
 	}
 	if settings.SizeAwareEviction != nil {
 		o.SizeAwareEviction = *settings.SizeAwareEviction
 		// This setting doesn't require restart, can be updated dynamically
 	}
 	if settings.LRUEvictionMaxRetries != nil {
 		o.LRUEvictionMaxRetries = *settings.LRUEvictionMaxRetries
 		// This setting doesn't require restart, can be updated dynamically
--- a/core/config/backend_capabilities.go
+++ b/core/config/backend_capabilities.go
@@ -8,27 +8,28 @@ import (
 // Usecase name constants — the canonical string values used in gallery entries,
 // model configs (known_usecases), and UsecaseInfoMap keys.
 const (
-	UsecaseChat               = "chat"
+	UsecaseChat                = "chat"
-	UsecaseCompletion         = "completion"
+	UsecaseCompletion          = "completion"
-	UsecaseEdit               = "edit"
+	UsecaseEdit                = "edit"
-	UsecaseVision             = "vision"
+	UsecaseVision              = "vision"
-	UsecaseEmbeddings         = "embeddings"
+	UsecaseEmbeddings          = "embeddings"
-	UsecaseTokenize           = "tokenize"
+	UsecaseTokenize            = "tokenize"
-	UsecaseImage              = "image"
+	UsecaseImage               = "image"
-	UsecaseVideo              = "video"
+	UsecaseVideo               = "video"
-	UsecaseTranscript         = "transcript"
+	UsecaseTranscript          = "transcript"
-	UsecaseTTS                = "tts"
+	UsecaseTTS                 = "tts"
-	UsecaseSoundGeneration    = "sound_generation"
+	UsecaseSoundGeneration     = "sound_generation"
-	UsecaseRerank             = "rerank"
+	UsecaseRerank              = "rerank"
-	UsecaseDetection          = "detection"
+	UsecaseDetection           = "detection"
-	UsecaseDepth              = "depth"
+	UsecaseDepth               = "depth"
-	UsecaseVAD                = "vad"
+	UsecaseVAD                 = "vad"
-	UsecaseAudioTransform     = "audio_transform"
+	UsecaseAudioTransform      = "audio_transform"
-	UsecaseDiarization        = "diarization"
+	UsecaseDiarization         = "diarization"
-	UsecaseRealtimeAudio      = "realtime_audio"
+	UsecaseSoundClassification = "sound_classification"
-	UsecaseFaceRecognition    = "face_recognition"
+	UsecaseRealtimeAudio       = "realtime_audio"
-	UsecaseSpeakerRecognition = "speaker_recognition"
+	UsecaseFaceRecognition     = "face_recognition"
-	UsecaseTokenClassify      = "token_classify"
+	UsecaseSpeakerRecognition  = "speaker_recognition"
 	UsecaseTokenClassify       = "token_classify"
 )
 // GRPCMethod identifies a Backend service RPC from backend.proto.
@@ -51,6 +52,7 @@ const (
 	MethodVAD                GRPCMethod = "VAD"
 	MethodAudioTransform     GRPCMethod = "AudioTransform"
 	MethodDiarize            GRPCMethod = "Diarize"
 	MethodSoundDetection     GRPCMethod = "SoundDetection"
 	MethodAudioToAudioStream GRPCMethod = "AudioToAudioStream"
 	MethodFaceVerify         GRPCMethod = "FaceVerify"
 	MethodFaceAnalyze        GRPCMethod = "FaceAnalyze"
@@ -165,6 +167,11 @@ var UsecaseInfoMap = map[string]UsecaseInfo{
 		GRPCMethod:  MethodDiarize,
 		Description: "Speaker diarization (who-spoke-when, per-speaker segments) via the Diarize RPC.",
 	},
 	UsecaseSoundClassification: {
 		Flag:        FLAG_SOUND_CLASSIFICATION,
 		GRPCMethod:  MethodSoundDetection,
 		Description: "Sound-event classification / audio tagging (scored AudioSet labels like baby cry, glass breaking, alarms) via the SoundDetection RPC.",
 	},
 	UsecaseRealtimeAudio: {
 		Flag:        FLAG_REALTIME_AUDIO,
 		GRPCMethod:  MethodAudioToAudioStream,
@@ -535,6 +542,19 @@ var BackendCapabilities = map[string]BackendCapability{
 		DefaultUsecases:  []string{UsecaseSpeakerRecognition},
 		Description:      "Speaker recognition — voice identity verification and analysis",
 	},
 	"voice-detect": {
 		GRPCMethods:      []GRPCMethod{MethodVoiceVerify, MethodVoiceEmbed, MethodVoiceAnalyze},
 		PossibleUsecases: []string{UsecaseSpeakerRecognition},
 		DefaultUsecases:  []string{UsecaseSpeakerRecognition},
 		Description:      "voice-detect.cpp: C++/ggml speaker embedding, verification and voice analysis (age/gender/emotion)",
 	},
 	"face-detect": {
 		GRPCMethods:      []GRPCMethod{MethodEmbedding, MethodDetect, MethodFaceVerify, MethodFaceAnalyze},
 		PossibleUsecases: []string{UsecaseEmbeddings, UsecaseDetection, UsecaseFaceRecognition},
 		DefaultUsecases:  []string{UsecaseFaceRecognition},
 		AcceptsImages:    true,
 		Description:      "face-detect.cpp: C++/ggml face detection, embedding, verification and attribute analysis",
 	},
 	"silero-vad": {
 		GRPCMethods:      []GRPCMethod{MethodVAD},
 		PossibleUsecases: []string{UsecaseVAD},
--- a/core/config/defaults.go
+++ b/core/config/defaults.go
@@ -0,0 +1,30 @@
 package config
 // Canonical default values.
 //
 // These are owned here so the two layers that need them share a single source
 // of truth: the config tiers (ApplyInference/Hardware/Serving/Generic — which
 // *decide* defaults) and core/backend/options.go (which *translates* a
 // ModelConfig to the backend wire format and supplies the same fallbacks
 // defensively). Previously these were duplicated as literals across both
 // packages and had drifted (e.g. n_gpu_layers 9999999 vs 99999999, two batch
 // constants of 512). core/backend imports core/config, so backend references
 // these; config never imports backend.
 //
 // All four are untyped constants, so they take the type of whatever they are
 // assigned to at the use site.
 const (
 	// DefaultContextSize is the fallback context window when none is configured
 	// or estimable from the model.
 	DefaultContextSize = 4096
 	// GGUFFallbackContextSize is the context window for a GGUF model whose
 	// metadata yields no usable estimate (see guessGGUFFromFile). Deliberately
 	// smaller than DefaultContextSize to stay conservative on memory there.
 	// llamaCppDefaults also applies it as its deferred last-resort value when
 	// ContextSize is still nil.
 	GGUFFallbackContextSize = 1024
 	// DefaultNGPULayers means "offload all layers"; the backend (fit_params)
 	// clamps to what actually fits in device memory. guessGGUFFromFile assigns
 	// it when the config leaves NGPULayers nil.
 	DefaultNGPULayers = 99999999
 	// DefaultFlashAttention is the flash-attention mode default; "auto" lets the
 	// backend enable it when the model + backend support it.
 	DefaultFlashAttention = "auto"
 )
--- a/core/config/generic_defaults.go
+++ b/core/config/generic_defaults.go
@@ -0,0 +1,115 @@
 package config
 import "os"
 // ApplyGenericDefaults fills the generic fallback values applied after the
 // higher-priority tiers (ApplyInferenceDefaults for the model family,
 // ApplyHardwareDefaults for the device, ApplyServingDefaults for serving
 // policy): sampling parameters and a few runtime flags. Like the other tiers it
 // only fills values still left unset (nil pointers), so model-family / explicit
 // config wins. A nil cfg is a no-op.
 //
 // Every defaulted field receives a pointer to its own backing variable. Bool
 // defaults in particular are never shared between fields, so a later in-place
 // write such as *cfg.LowVRAM = true cannot leak into Embeddings, Reranking or
 // MMlock.
 func ApplyGenericDefaults(cfg *ModelConfig) {
 	if cfg == nil {
 		return
 	}
 	// boolPtr returns a pointer to a fresh copy of v on every call, which is
 	// what keeps the bool defaults below from aliasing one another.
 	boolPtr := func(v bool) *bool { return &v }
 	// https://github.com/ggerganov/llama.cpp/blob/75cd4c77292034ecec587ecb401366f57338f7c0/common/sampling.h#L22
 	defaultTopP := 0.95
 	defaultTopK := 40
 	defaultMinP := 0.0
 	defaultTemp := 0.9
 	// https://github.com/mudler/LocalAI/issues/2780
 	defaultMirostat := 0
 	defaultMirostatTAU := 5.0
 	defaultMirostatETA := 0.1
 	defaultTypicalP := 1.0
 	defaultTFZ := 1.0
 	defaultZero := 0
 	if cfg.Seed == nil {
 		//  random number generator seed
 		defaultSeed := RAND_SEED
 		cfg.Seed = &defaultSeed
 	}
 	// top_k=40 is llama.cpp's sampling default and is wrong for backends whose
 	// native default differs (issue #6632). Only inject it for the llama.cpp
 	// family and the empty/auto backend; leave TopK nil for known non-llama
 	// backends (e.g. mlx, whose intended default is top_k=0) so the wire value
 	// is 0 rather than a silently-changed 40.
 	if cfg.TopK == nil && UsesLlamaSamplerDefaults(cfg.Backend) {
 		cfg.TopK = &defaultTopK
 	}
 	if cfg.MinP == nil {
 		cfg.MinP = &defaultMinP
 	}
 	if cfg.TypicalP == nil {
 		cfg.TypicalP = &defaultTypicalP
 	}
 	if cfg.TFZ == nil {
 		cfg.TFZ = &defaultTFZ
 	}
 	if cfg.MMap == nil {
 		// MMap is enabled by default
 		// Only exception is for Intel GPUs (signalled by a non-empty XPU env var)
 		cfg.MMap = boolPtr(os.Getenv("XPU") == "")
 	}
 	if cfg.MMlock == nil {
 		// MMlock is disabled by default
 		cfg.MMlock = boolPtr(false)
 	}
 	if cfg.TopP == nil {
 		cfg.TopP = &defaultTopP
 	}
 	if cfg.Temperature == nil {
 		cfg.Temperature = &defaultTemp
 	}
 	if cfg.Maxtokens == nil {
 		cfg.Maxtokens = &defaultZero
 	}
 	if cfg.Mirostat == nil {
 		cfg.Mirostat = &defaultMirostat
 	}
 	if cfg.MirostatETA == nil {
 		cfg.MirostatETA = &defaultMirostatETA
 	}
 	if cfg.MirostatTAU == nil {
 		cfg.MirostatTAU = &defaultMirostatTAU
 	}
 	if cfg.LowVRAM == nil {
 		cfg.LowVRAM = boolPtr(false)
 	}
 	if cfg.Embeddings == nil {
 		cfg.Embeddings = boolPtr(false)
 	}
 	if cfg.Reranking == nil {
 		cfg.Reranking = boolPtr(false)
 	}
 	if cfg.PromptCacheAll == nil {
 		// Match upstream llama.cpp's default (common/common.h: cache_prompt = true)
 		// and let cache_idle_slots / kv_unified actually do useful work; users can
 		// opt out with an explicit `prompt_cache_all: false` in the model YAML.
 		cfg.PromptCacheAll = boolPtr(true)
 	}
 }
--- a/core/config/generic_defaults_test.go
+++ b/core/config/generic_defaults_test.go
@@ -0,0 +1,36 @@
 package config_test
 import (
 	. "github.com/mudler/LocalAI/core/config"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 )
 // Specs for the generic fallback tier: defaults are filled only where the
 // config left a value unset, explicit values survive, and a nil config is safe.
 var _ = Describe("ApplyGenericDefaults (generic fallback tier)", func() {
 	It("fills sampling + runtime fallbacks when unset", func() {
 		cfg := &ModelConfig{} // empty backend uses the llama sampler defaults
 		ApplyGenericDefaults(cfg)
 		// TopP is nil-checked first; the remaining pointers are dereferenced
 		// directly, so a missing default surfaces as a nil-dereference panic.
 		Expect(cfg.TopP).ToNot(BeNil())
 		Expect(*cfg.TopP).To(Equal(0.95))
 		Expect(*cfg.TopK).To(Equal(40))
 		Expect(*cfg.Temperature).To(Equal(0.9))
 		// NOTE(review): MMap defaults to false when the XPU environment variable
 		// is non-empty, so this expectation assumes XPU is unset where the
 		// suite runs.
 		Expect(*cfg.MMap).To(BeTrue())
 		Expect(*cfg.MMlock).To(BeFalse())
 		Expect(*cfg.PromptCacheAll).To(BeTrue())
 	})
 	It("never overrides explicit values", func() {
 		// Deliberately different from the defaults (40 / 0.95) so an overwrite
 		// would be detected.
 		tk := 7
 		tp := 0.5
 		cfg := &ModelConfig{}
 		cfg.TopK = &tk
 		cfg.TopP = &tp
 		ApplyGenericDefaults(cfg)
 		Expect(*cfg.TopK).To(Equal(7))
 		Expect(*cfg.TopP).To(Equal(0.5))
 	})
 	It("no-ops on nil", func() {
 		Expect(func() { ApplyGenericDefaults(nil) }).ToNot(Panic())
 	})
 })
--- a/core/config/gguf.go
+++ b/core/config/gguf.go
@@ -14,11 +14,6 @@ import (
 	"github.com/gpustack/gguf-parser-go/util/ptr"
 )
 const (
 	defaultContextSize = 1024
 	defaultNGPULayers  = 99999999
 )
 // reservedNonChatModel reports whether the operator reserved this model for an
 // internal primitive — the router score classifier or the PII NER
 // token_classify tier. Such a model has no chat template and must not be
@@ -38,7 +33,7 @@ func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) {
 			cSize := int(ctxSize)
 			cfg.ContextSize = &cSize
 		} else {
-			defaultCtx = defaultContextSize
+			defaultCtx = GGUFFallbackContextSize
 			cfg.ContextSize = &defaultCtx
 		}
 	}
@@ -52,7 +47,7 @@ func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) {
 	if cfg.NGPULayers == nil {
 		// we assume we want to offload all layers
-		defaultHigh := defaultNGPULayers
+		defaultHigh := DefaultNGPULayers
 		cfg.NGPULayers = &defaultHigh
 	}
--- a/core/config/hardware_defaults.go
+++ b/core/config/hardware_defaults.go
@@ -0,0 +1,180 @@
 package config
 import (
 	"fmt"
 	"strconv"
 	"strings"
 	"github.com/mudler/LocalAI/pkg/xsysinfo"
 	"github.com/mudler/xlog"
 )
 // Hardware-driven model-config defaults.
 //
 // This sits alongside the other config overriders (ApplyInferenceDefaults for
 // model families, guessDefaultsFromFile for GGUF/NGPULayers): they all
 // heuristically fill ModelConfig values the user left unset. Hardware tuning is
 // the same domain — "adjust the config from the device that will run it" — so
 // it lives here rather than scattered into the backend or a separate package.
 //
 // The heuristics are parameterized on a GPU descriptor (not on direct
 // detection) so they apply in both deployment shapes: SetDefaults passes the
 // LocalGPU on a single host, and the distributed router passes the *selected
 // node's* reported GPU before loading there (the frontend that loaded the
 // config may have no GPU at all).
 // GPU describes the device that will run a model. It is a plain value
 // descriptor: the heuristics in this file take it as an argument instead of
 // probing hardware themselves.
 type GPU struct {
 	// Vendor is "nvidia", "amd", … (matches xsysinfo vendor constants).
 	// IsNVIDIABlackwell does not consult it; that check relies on
 	// ComputeCapability alone.
 	Vendor string
 	// ComputeCapability is the NVIDIA compute capability as "major.minor"
 	// (e.g. "12.1" for GB10 / DGX Spark). Empty for non-NVIDIA / unknown.
 	ComputeCapability string
 	// VRAM is total device memory in bytes (0 = unknown). DefaultParallelSlots
 	// tiers on this value.
 	VRAM uint64
 }
 // Physical batch (n_batch / n_ubatch) defaults. These are the only two values
 // PhysicalBatch returns, and the set IsManagedPhysicalBatch recognizes.
 const (
 	// DefaultPhysicalBatch is the conservative default when no hardware-specific
 	// tuning applies. core/backend.DefaultBatchSize references this (single source).
 	DefaultPhysicalBatch = 512
 	// BlackwellPhysicalBatch is the default on NVIDIA Blackwell consumer GPUs
 	// (sm_12x: sm_120 RTX 50-series, sm_121 GB10 / DGX Spark). A larger physical
 	// batch materially lifts MoE prefill there (per-expert GEMM tiles fill
 	// better); measured on a GB10 with Qwen3-30B-A3B to saturate around 2048.
 	BlackwellPhysicalBatch = 2048
 )
 // IsNVIDIABlackwell reports whether the GPU belongs to the NVIDIA Blackwell
 // consumer family (sm_12x). The check is on the compute-capability major
 // version only: any major of 12 or above matches. Datacenter Blackwell
 // (B100/B200/GB200, sm_100 / cc 10.0) reports major 10 and is intentionally not
 // matched. An empty or unparseable capability yields major -1 and therefore
 // false.
 func (g GPU) IsNVIDIABlackwell() bool {
 	// Lowest compute-capability major treated as Blackwell-consumer.
 	const blackwellMajor = 12
 	major, _ := parseComputeCapability(g.ComputeCapability)
 	return major >= blackwellMajor
 }
 // PhysicalBatch picks the canonical physical batch (n_batch/n_ubatch) for the
 // given hardware; callers use it when the model config leaves batch unset. The
 // result is BlackwellPhysicalBatch on a Blackwell GPU and DefaultPhysicalBatch
 // on anything else.
 func PhysicalBatch(g GPU) int {
 	batch := DefaultPhysicalBatch
 	if g.IsNVIDIABlackwell() {
 		batch = BlackwellPhysicalBatch
 	}
 	return batch
 }
 // IsManagedPhysicalBatch reports whether n is one of the values PhysicalBatch
 // can assign (DefaultPhysicalBatch or BlackwellPhysicalBatch). Callers that
 // re-tune a value chosen by an upstream host (the distributed router correcting
 // the frontend's guess) use this to avoid clobbering an explicit user batch
 // such as 1024.
 func IsManagedPhysicalBatch(n int) bool {
 	if n == DefaultPhysicalBatch {
 		return true
 	}
 	return n == BlackwellPhysicalBatch
 }
 // Parallel-slot (n_parallel) VRAM tiers. llama.cpp serializes requests at
 // n_parallel=1 (the backend default) and only auto-enables continuous batching
 // when n_parallel > 1 — so a single-slot default makes concurrent requests
 // queue. We default a slot count by GPU size so multi-user serving works out of
 // the box. With the backend's unified KV cache the slots SHARE the context
 // budget, so more slots add concurrency without multiplying KV memory.
 //
 // Each threshold is an inclusive lower bound in bytes (N << 30 is N GiB),
 // compared against GPU.VRAM by DefaultParallelSlots.
 const (
 	parallelSlotsVRAMHigh = uint64(32) << 30 // >=32 GiB -> 8 slots
 	parallelSlotsVRAMMid  = uint64(8) << 30  // >=8 GiB  -> 4 slots
 	parallelSlotsVRAMLow  = uint64(4) << 30  // >=4 GiB  -> 2 slots
 )
 // DefaultParallelSlots maps the GPU's total VRAM to an n_parallel default:
 // 8 slots from 32 GiB, 4 from 8 GiB, 2 from 4 GiB. Anything below the lowest
 // tier, including unknown VRAM (0), gets 1 slot (no concurrency), so behavior
 // on CPU-only / tiny devices never changes.
 func DefaultParallelSlots(g GPU) int {
 	// Ordered from the largest threshold down; the first tier the device
 	// reaches decides the slot count.
 	tiers := [...]struct {
 		minVRAM uint64
 		slots   int
 	}{
 		{parallelSlotsVRAMHigh, 8},
 		{parallelSlotsVRAMMid, 4},
 		{parallelSlotsVRAMLow, 2},
 	}
 	for _, tier := range tiers {
 		if g.VRAM >= tier.minVRAM {
 			return tier.slots
 		}
 	}
 	return 1
 }
 // EnsureParallelOption returns opts extended with a VRAM-scaled "parallel:N"
 // backend option, or opts unchanged when the GPU does not warrant more than one
 // slot or the model already sets parallel/n_parallel. Shared by the single-host
 // config path (ApplyHardwareDefaults) and the distributed router (per selected
 // node).
 func EnsureParallelOption(opts []string, gpu GPU) []string {
 	slots := DefaultParallelSlots(gpu)
 	// Nothing to add for a single slot, and an explicit option always wins.
 	if slots <= 1 || hasParallelOption(opts) {
 		return opts
 	}
 	return append(opts, fmt.Sprintf("parallel:%d", slots))
 }
 // hasParallelOption reports whether opts already carries a "parallel" or
 // "n_parallel" backend option, so callers never override an explicit value
 // (helper shared with serving_defaults.go).
 func hasParallelOption(opts []string) bool {
 	explicit := backendOptionSet(opts, "parallel", "n_parallel")
 	return explicit
 }
 // localGPU builds a GPU descriptor from local detection, used by SetDefaults on
 // a single host (the distributed router builds it from the selected node's
 // reported info instead). It is a package var so tests can inject a
 // deterministic device — detection does a live nvidia-smi call.
 //
 // Detection is best-effort: the second return value of the vendor and VRAM
 // probes is discarded, so a failed probe simply leaves whatever value the
 // probe returned in the descriptor.
 var localGPU = func() GPU {
 	var gpu GPU
 	gpu.Vendor, _ = xsysinfo.DetectGPUVendor()
 	gpu.VRAM, _ = xsysinfo.TotalAvailableVRAM()
 	gpu.ComputeCapability = xsysinfo.NVIDIAComputeCapability()
 	return gpu
 }
 // ApplyHardwareDefaults fills ModelConfig values that depend on the target GPU
 // and were left unset by the user. It applies two defaults:
 //
 //   - the Blackwell physical batch, when cfg.Batch is still zero and the GPU is
 //     Blackwell;
 //   - a VRAM-scaled "parallel:N" backend option, when the GPU warrants more
 //     than one slot and the options do not already set parallel/n_parallel.
 //
 // Explicit config always wins, and a nil cfg is a no-op.
 func ApplyHardwareDefaults(cfg *ModelConfig, gpu GPU) {
 	if cfg == nil {
 		return
 	}
 	if cfg.Batch == 0 && gpu.IsNVIDIABlackwell() {
 		cfg.Batch = BlackwellPhysicalBatch
 		xlog.Debug(
 			"[hardware_defaults] Blackwell GPU: defaulting physical batch",
 			"batch", cfg.Batch,
 			"compute_cap", gpu.ComputeCapability,
 		)
 	}
 	// Enable concurrent serving by default on a capable GPU: without this the
 	// llama.cpp backend runs n_parallel=1 and serializes multi-user requests
 	// (continuous batching stays off). Unified KV means the slots share the
 	// context budget, so this is concurrency without extra KV memory. Explicit
 	// parallel/n_parallel in the model options always wins.
 	optionCount := len(cfg.Options)
 	cfg.Options = EnsureParallelOption(cfg.Options, gpu)
 	if len(cfg.Options) <= optionCount {
 		// Nothing was appended, so there is nothing to report.
 		return
 	}
 	added := cfg.Options[len(cfg.Options)-1]
 	xlog.Debug(
 		"[hardware_defaults] defaulting parallel slots for concurrent serving",
 		"option", added,
 		"vram_gib", gpu.VRAM>>30,
 	)
 }
 // parseComputeCapability splits a "major.minor" string into integer parts.
 // Surrounding whitespace is ignored, both around the whole string and around
 // each part. A value without a dot is treated as "major.0". It returns
 // (-1, -1) when the input is empty or the major part is not an integer; a
 // minor part that fails to parse is reported as 0 rather than as an error.
 func parseComputeCapability(cc string) (int, int) {
 	trimmed := strings.TrimSpace(cc)
 	if trimmed == "" {
 		return -1, -1
 	}
 	// Split on the first dot only; anything after it is the minor part.
 	majorPart, minorPart, hasDot := strings.Cut(trimmed, ".")
 	if !hasDot {
 		minorPart = "0"
 	}
 	major, err := strconv.Atoi(strings.TrimSpace(majorPart))
 	if err != nil {
 		return -1, -1
 	}
 	minor, err := strconv.Atoi(strings.TrimSpace(minorPart))
 	if err != nil {
 		// Atoi can return a non-zero value alongside a range error, so reset
 		// explicitly instead of trusting the returned number.
 		minor = 0
 	}
 	return major, minor
 }
--- a/core/config/hardware_defaults_internal_test.go
+++ b/core/config/hardware_defaults_internal_test.go
@@ -0,0 +1,37 @@
 package config
 import (
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 )
 // Single-instance path: SetDefaults applies hardware defaults from the local
 // GPU. The detection seam (localGPU) is injected so the path is deterministic
 // without a real GPU. This file is in package config (not config_test) because
 // localGPU is unexported.
 var _ = Describe("SetDefaults hardware defaults (single-instance)", func() {
 	// Save and restore the package-level detector around every spec so an
 	// injected fake never leaks into other specs.
 	var orig func() GPU
 	BeforeEach(func() { orig = localGPU })
 	AfterEach(func() { localGPU = orig })
 	It("sets the physical batch on a local Blackwell GPU", func() {
 		localGPU = func() GPU { return GPU{ComputeCapability: "12.1"} }
 		cfg := &ModelConfig{}
 		cfg.SetDefaults()
 		Expect(cfg.Batch).To(Equal(BlackwellPhysicalBatch))
 	})
 	It("leaves batch unset on a non-Blackwell local GPU", func() {
 		// Compute capability 8.9 has a major below 12, so no batch is assigned.
 		localGPU = func() GPU { return GPU{ComputeCapability: "8.9"} }
 		cfg := &ModelConfig{}
 		cfg.SetDefaults()
 		Expect(cfg.Batch).To(Equal(0))
 	})
 	It("never overrides an explicit batch", func() {
 		localGPU = func() GPU { return GPU{ComputeCapability: "12.1"} }
 		cfg := &ModelConfig{}
 		cfg.Batch = 1024
 		cfg.SetDefaults()
 		Expect(cfg.Batch).To(Equal(1024))
 	})
 })
--- a/core/config/hardware_defaults_test.go
+++ b/core/config/hardware_defaults_test.go
@@ -0,0 +1,97 @@
 package config_test
 import (
 	. "github.com/mudler/LocalAI/core/config"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 )
 // Specs for the exported hardware-default helpers. Every GPU here is a
 // hand-built descriptor, so nothing in this suite performs device detection.
 var _ = Describe("Hardware-driven config defaults", func() {
 	// The match is on the compute-capability major only (>= 12), which is why
 	// a hypothetical 13.0 is expected to match while 10.0 is not.
 	DescribeTable("GPU.IsNVIDIABlackwell (sm_12x consumer family)",
 		func(cc string, want bool) {
 			Expect(GPU{ComputeCapability: cc}.IsNVIDIABlackwell()).To(Equal(want))
 		},
 		Entry("GB10 12.1", "12.1", true),
 		Entry("RTX 50 12.0", "12.0", true),
 		Entry("future 13.0", "13.0", true),
 		Entry("Hopper 9.0", "9.0", false),
 		Entry("Ada 8.9", "8.9", false),
 		Entry("datacenter Blackwell sm_100 10.0", "10.0", false),
 		Entry("unknown", "", false),
 	)
 	Describe("PhysicalBatch / IsManagedPhysicalBatch", func() {
 		It("returns the Blackwell batch on Blackwell", func() {
 			Expect(PhysicalBatch(GPU{ComputeCapability: "12.1"})).To(Equal(BlackwellPhysicalBatch))
 		})
 		It("returns the default batch otherwise", func() {
 			Expect(PhysicalBatch(GPU{ComputeCapability: "9.0"})).To(Equal(DefaultPhysicalBatch))
 			// The zero-value GPU (no compute capability) also gets the default.
 			Expect(PhysicalBatch(GPU{})).To(Equal(DefaultPhysicalBatch))
 		})
 		It("recognizes managed defaults but not explicit values", func() {
 			Expect(IsManagedPhysicalBatch(DefaultPhysicalBatch)).To(BeTrue())
 			Expect(IsManagedPhysicalBatch(BlackwellPhysicalBatch)).To(BeTrue())
 			Expect(IsManagedPhysicalBatch(1024)).To(BeFalse())
 		})
 	})
 	Describe("ApplyHardwareDefaults", func() {
 		It("raises an unset batch to 2048 on Blackwell", func() {
 			cfg := &ModelConfig{}
 			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1"})
 			Expect(cfg.Batch).To(Equal(BlackwellPhysicalBatch))
 		})
 		It("leaves batch unset on non-Blackwell", func() {
 			cfg := &ModelConfig{}
 			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "9.0"})
 			Expect(cfg.Batch).To(Equal(0))
 		})
 		It("never overrides an explicit batch", func() {
 			cfg := &ModelConfig{}
 			cfg.Batch = 1024
 			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1"})
 			Expect(cfg.Batch).To(Equal(1024))
 		})
 		It("no-ops on nil", func() {
 			Expect(func() { ApplyHardwareDefaults(nil, GPU{ComputeCapability: "12.1"}) }).ToNot(Panic())
 		})
 	})
 	// gib converts the table's GiB figures to the byte counts GPU.VRAM holds.
 	const gib = uint64(1) << 30
 	// Covers each tier, both inclusive boundaries that have an entry (8 GiB)
 	// and the below-threshold / unknown cases that must stay at one slot.
 	DescribeTable("DefaultParallelSlots (by VRAM)",
 		func(vramGiB uint64, want int) {
 			Expect(DefaultParallelSlots(GPU{VRAM: vramGiB * gib})).To(Equal(want))
 		},
 		Entry("GB10 119 GiB", uint64(119), 8),
 		Entry("48 GiB", uint64(48), 8),
 		Entry("24 GiB", uint64(24), 4),
 		Entry("8 GiB", uint64(8), 4),
 		Entry("6 GiB", uint64(6), 2),
 		Entry("2 GiB", uint64(2), 1),
 		Entry("unknown 0", uint64(0), 1),
 	)
 	Describe("ApplyHardwareDefaults parallel slots", func() {
 		It("adds a VRAM-scaled parallel option on a capable GPU", func() {
 			cfg := &ModelConfig{}
 			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1", VRAM: 119 * gib})
 			Expect(cfg.Options).To(ContainElement("parallel:8"))
 		})
 		It("scales the slot count down with VRAM", func() {
 			cfg := &ModelConfig{}
 			ApplyHardwareDefaults(cfg, GPU{VRAM: 24 * gib})
 			Expect(cfg.Options).To(ContainElement("parallel:4"))
 		})
 		It("adds no parallel option on small/unknown VRAM", func() {
 			cfg := &ModelConfig{}
 			ApplyHardwareDefaults(cfg, GPU{VRAM: 2 * gib})
 			Expect(cfg.Options).ToNot(ContainElement(ContainSubstring("parallel")))
 		})
 		It("never overrides an explicit parallel option", func() {
 			// 119 GiB would default to parallel:8; the explicit value must be
 			// left as the only option.
 			cfg := &ModelConfig{Options: []string{"parallel:2"}}
 			ApplyHardwareDefaults(cfg, GPU{VRAM: 119 * gib})
 			Expect(cfg.Options).To(Equal([]string{"parallel:2"}))
 		})
 	})
 })
--- a/core/config/hooks_llamacpp.go
+++ b/core/config/hooks_llamacpp.go
@@ -34,7 +34,7 @@ func llamaCppDefaults(cfg *ModelConfig, modelPath string) {
 	// Default context size if not set, regardless of whether GGUF parsing succeeds
 	defer func() {
 		if cfg.ContextSize == nil {
-			ctx := defaultContextSize
+			ctx := GGUFFallbackContextSize
 			cfg.ContextSize = &ctx
 		}
 	}()
--- a/core/config/meta/constants.go
+++ b/core/config/meta/constants.go
@@ -68,6 +68,7 @@ var UsecaseOptions = []FieldOption{
 	{Value: "face_recognition", Label: "Face Recognition"},
 	{Value: "transcript", Label: "Transcript"},
 	{Value: "diarization", Label: "Diarization"},
 	{Value: "sound_classification", Label: "Sound Classification"},
 	{Value: "speaker_recognition", Label: "Speaker Recognition"},
 	{Value: "tts", Label: "TTS"},
 	{Value: "sound_generation", Label: "Sound Generation"},
--- a/core/config/meta/registry.go
+++ b/core/config/meta/registry.go
@@ -286,6 +286,15 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Order:       45,
 		},
 		// --- Alias ---
 		"alias": {
 			Section:     "alias",
 			Label:       "Alias target",
 			Description: "Redirect all traffic for this model to another configured model. When set, every other field on this config is ignored and requests are served by the target model.",
 			Component:   "model-select",
 			Order:       0,
 		},
 		// --- Pipeline ---
 		"pipeline.llm": {
 			Section:              "pipeline",
@@ -319,6 +328,30 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			AutocompleteProvider: ProviderModelsVAD,
 			Order:                63,
 		},
 		"pipeline.sound_detection": {
 			Section:              "pipeline",
 			Label:                "Sound Detection Model",
 			Description:          "Model to use for sound-event classification (audio tagging, e.g. ced) in the pipeline. When set, committed realtime audio is also classified and the scored AudioSet tags are emitted as a conversation.item.sound_detection event.",
 			Component:            "model-select",
 			AutocompleteProvider: ProviderModels,
 			Order:                64,
 		},
 		"pipeline.sound_detection_window_ms": {
 			Section:     "pipeline",
 			Label:       "Sound Detection Window (ms)",
 			Description: "Server-side windowing for a sound-only realtime session: length in ms of the audio window classified each hop. 0 = client-driven (the client commits windows).",
 			Component:   "number",
 			Min:         f64(0),
 			Order:       65,
 		},
 		"pipeline.sound_detection_hop_ms": {
 			Section:     "pipeline",
 			Label:       "Sound Detection Hop (ms)",
 			Description: "Server-side windowing hop in ms: how often the server classifies the last window. 0 = client-driven.",
 			Component:   "number",
 			Min:         f64(0),
 			Order:       66,
 		},
 		"pipeline.reasoning_effort": {
 			Section:     "pipeline",
 			Label:       "Reasoning Effort",
@@ -448,6 +481,55 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Component:   "json-editor",
 			Order:       78,
 		},
 		"pipeline.voice_recognition.enforce": {
 			Section:     "pipeline",
 			Label:       "Voice Gate Enforce",
 			Description: "Whether the gate rejects unauthorized speakers. Enabled (default) drops unauthorized utterances before the LLM. Disabled still resolves and surfaces the speaker (for the conversation.item.speaker event and personalization) but never drops a turn.",
 			Component:   "toggle",
 			Order:       80,
 		},
 		"pipeline.voice_recognition.identity.announce": {
 			Section:     "pipeline",
 			Label:       "Speaker Identity Announce",
 			Description: "Emit a conversation.item.speaker event to the client naming the recognized speaker. When set, identity is resolved on every turn even if 'when' is 'first'.",
 			Component:   "toggle",
 			Order:       81,
 		},
 		"pipeline.voice_recognition.identity.announce_unknown": {
 			Section:     "pipeline",
 			Label:       "Speaker Identity Announce Unknown",
 			Description: "Also emit the conversation.item.speaker event (with matched=false) when no confident match is found. Default only announces on a match.",
 			Component:   "toggle",
 			Order:       82,
 		},
 		"pipeline.voice_recognition.identity.personalize": {
 			Section:     "pipeline",
 			Label:       "Speaker Identity Personalize",
 			Description: "Inform the LLM who is speaking so it can tailor replies. Enables the name and system-note injection below.",
 			Component:   "toggle",
 			Order:       83,
 		},
 		"pipeline.voice_recognition.identity.inject_name": {
 			Section:     "pipeline",
 			Label:       "Speaker Identity Inject Name",
 			Description: "Personalization: set the per-message OpenAI 'name' field on each user turn to the recognized speaker.",
 			Component:   "toggle",
 			Order:       84,
 		},
 		"pipeline.voice_recognition.identity.inject_system_note": {
 			Section:     "pipeline",
 			Label:       "Speaker Identity Inject System Note",
 			Description: "Personalization: append a 'The current speaker is <name>.' note to the system message reflecting the latest speaker.",
 			Component:   "toggle",
 			Order:       85,
 		},
 		"pipeline.voice_recognition.identity.note_unknown": {
 			Section:     "pipeline",
 			Label:       "Speaker Identity Note Unknown",
 			Description: "Personalization: when the speaker is unidentified, append 'The current speaker is unknown.' to the system message so the model can ask who it is talking to.",
 			Component:   "toggle",
 			Order:       86,
 		},
 		"pipeline.max_history_items": {
 			Section:     "pipeline",
 			Label:       "Max History Items",
--- a/core/config/meta/registry_test.go
+++ b/core/config/meta/registry_test.go
@@ -0,0 +1,28 @@
 package meta_test
 import (
 	"github.com/mudler/LocalAI/core/config/meta"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 )
 // Specs guarding the metadata needed to render the alias field: a registry
 // override for the "alias" key and an "alias" entry among the default sections.
 var _ = Describe("alias field metadata", func() {
 	It("registers the alias field as a model-select in the alias section", func() {
 		reg := meta.DefaultRegistry()
 		f, ok := reg["alias"]
 		Expect(ok).To(BeTrue(), "alias field should have a registry override")
 		Expect(f.Section).To(Equal("alias"))
 		Expect(f.Component).To(Equal("model-select"))
 	})
 	It("defines an alias section", func() {
 		// Scan every section; found stays false unless one has ID "alias".
 		var found bool
 		for _, s := range meta.DefaultSections() {
 			if s.ID == "alias" {
 				found = true
 			}
 		}
 		Expect(found).To(BeTrue(), "DefaultSections should include an alias section")
 	})
 })
--- a/core/config/meta/types.go
+++ b/core/config/meta/types.go
@@ -69,6 +69,7 @@ type FieldMetaOverride struct {
 func DefaultSections() []Section {
 	return []Section{
 		{ID: "general", Label: "General", Icon: "settings", Order: 0},
 		{ID: "alias", Label: "Alias", Icon: "git-merge", Order: 5},
 		{ID: "llm", Label: "LLM", Icon: "cpu", Order: 10},
 		{ID: "parameters", Label: "Parameters", Icon: "sliders", Order: 20},
 		{ID: "templates", Label: "Templates", Icon: "file-text", Order: 30},
--- a/core/config/model_config.go
+++ b/core/config/model_config.go
@@ -37,6 +37,12 @@ type ModelConfig struct {
 	schema.PredictionOptions `yaml:"parameters,omitempty" json:"parameters,omitempty"`
 	Name                     string `yaml:"name,omitempty" json:"name,omitempty"`
 	// Alias, when set, makes this config a pure redirect: every request for
 	// Name is served by the model named here. All other fields are ignored.
 	// The target must be an existing, non-alias model (enforced at load and
 	// at create/swap time). See docs/content for Model Aliases.
 	Alias string `yaml:"alias,omitempty" json:"alias,omitempty"`
 	F16                 *bool               `yaml:"f16,omitempty" json:"f16,omitempty"`
 	Threads             *int                `yaml:"threads,omitempty" json:"threads,omitempty"`
 	Debug               *bool               `yaml:"debug,omitempty" json:"debug,omitempty"`
@@ -391,6 +397,10 @@ func (c *ModelConfig) HasRouter() bool {
 	return len(c.Router.Candidates) > 0
 }
 // IsAlias reports whether this config is a pure redirect to another model,
 // i.e. whether its Alias field is non-empty. It takes a value receiver so it
 // can also be called on non-addressable config values.
 func (c ModelConfig) IsAlias() bool {
 	return len(c.Alias) > 0
 }
 // @Description PII filtering configuration. PII redaction is per-model so
 // that local models don't pay the latency or behaviour change of regex
 // scanning, while cloud-bound traffic (cloud-proxy backend) can default to
@@ -594,6 +604,20 @@ type Pipeline struct {
 	LLM           string `yaml:"llm,omitempty" json:"llm,omitempty"`
 	Transcription string `yaml:"transcription,omitempty" json:"transcription,omitempty"`
 	VAD           string `yaml:"vad,omitempty" json:"vad,omitempty"`
 	// SoundDetection names a sound-event-classification model (e.g. ced). When
 	// set, each VAD-committed realtime utterance is also run through it and the
 	// scored AudioSet tags are emitted as a conversation.item.sound_detection
 	// server event, alongside (and independent of) transcription.
 	SoundDetection string `yaml:"sound_detection,omitempty" json:"sound_detection,omitempty"`
 	// SoundDetectionWindowMs / SoundDetectionHopMs enable server-side windowing
 	// for a sound-detection-only realtime session: instead of the client
 	// committing audio buffers, the server classifies the last WindowMs of
 	// streamed audio every HopMs and emits a sound_detection event per hop. Both
 	// must be > 0 to activate; otherwise the session stays client-driven (the
 	// client commits windows via input_audio_buffer.commit).
 	SoundDetectionWindowMs int `yaml:"sound_detection_window_ms,omitempty" json:"sound_detection_window_ms,omitempty"`
 	SoundDetectionHopMs    int `yaml:"sound_detection_hop_ms,omitempty" json:"sound_detection_hop_ms,omitempty"`
 	// ReasoningEffort sets the reasoning effort (none|minimal|low|medium|high) for
 	// the pipeline's LLM without editing the LLM model config. Overrides the LLM's
@@ -759,6 +783,13 @@ type PipelineVoiceRecognition struct {
 	Allow VoiceRecognitionAllow `yaml:"allow,omitempty" json:"allow,omitempty"`
 	// References are the authorized reference speakers (verify mode).
 	References []VoiceReference `yaml:"references,omitempty" json:"references,omitempty"`
 	// Enforce controls the authorization gate. A nil value or true rejects
 	// unauthorized speakers (the historical behavior). false resolves the
 	// speaker's identity for surfacing/personalization but never drops a turn.
 	Enforce *bool `yaml:"enforce,omitempty" json:"enforce,omitempty"`
 	// Identity surfaces the recognized speaker to the client and the LLM. It is
 	// independent of Enforce: identity can be surfaced without gating.
 	Identity *VoiceIdentityConfig `yaml:"identity,omitempty" json:"identity,omitempty"`
 }
 // @Description VoiceRecognitionAllow filters authorized registry identities.
@@ -775,6 +806,25 @@ type VoiceReference struct {
 	Audio string `yaml:"audio,omitempty" json:"audio,omitempty"`
 }
 // @Description VoiceIdentityConfig surfaces the recognized speaker to the realtime
 // client and the LLM. When set, identity is resolved on every turn even if the
 // gate's When is "first" (the gate still authorizes only once).
 //
 // PipelineVoiceRecognition.IdentityEnabled only reports identity as enabled
 // when Announce or Personalize is true; the other flags are not consulted by
 // that helper. Every field is a plain bool, so an omitted key means false.
 type VoiceIdentityConfig struct {
 	// Announce emits a conversation.item.speaker event to the client.
 	Announce bool `yaml:"announce,omitempty" json:"announce,omitempty"`
 	// AnnounceUnknown also emits the event when there is no confident match.
 	// NOTE(review): presumably only meaningful together with Announce — confirm
 	// against the realtime handler.
 	AnnounceUnknown bool `yaml:"announce_unknown,omitempty" json:"announce_unknown,omitempty"`
 	// Personalize informs the LLM who is speaking.
 	Personalize bool `yaml:"personalize,omitempty" json:"personalize,omitempty"`
 	// InjectName sets the per-message name field on each user turn.
 	// NOTE(review): looks like a refinement of Personalize; on its own it does
 	// not make IdentityEnabled return true.
 	InjectName bool `yaml:"inject_name,omitempty" json:"inject_name,omitempty"`
 	// InjectSystemNote maintains a "current speaker" note in the system message.
 	// NOTE(review): same caveat as InjectName — not consulted by IdentityEnabled.
 	InjectSystemNote bool `yaml:"inject_system_note,omitempty" json:"inject_system_note,omitempty"`
 	// NoteUnknown adds a "the current speaker is unknown" note (enables the model
 	// to ask who it is talking to).
 	NoteUnknown bool `yaml:"note_unknown,omitempty" json:"note_unknown,omitempty"`
 }
 // VoiceGateEnabled reports whether a voice-recognition gate is configured. The
 // mere presence of the block is the intent signal: a present-but-incomplete
 // block (e.g. missing model) must fail closed at construction, not be silently
@@ -783,6 +833,28 @@ func (p Pipeline) VoiceGateEnabled() bool {
 	return p.VoiceRecognition != nil
 }
 // EnforceGate tells the caller whether unauthorized speakers must be rejected.
 // Gating stays on unless Enforce is explicitly set: a nil Enforce counts as
 // "enforce", so existing configs keep gating.
 func (p PipelineVoiceRecognition) EnforceGate() bool {
 	if p.Enforce != nil {
 		// An explicit value always wins over the default.
 		return *p.Enforce
 	}
 	return true
 }
 // IdentityEnabled reports whether the speaker's identity has to be resolved,
 // i.e. an identity block is present and asks for announcing or personalizing.
 func (p PipelineVoiceRecognition) IdentityEnabled() bool {
 	id := p.Identity
 	if id == nil {
 		// No identity block: nothing to surface or personalize.
 		return false
 	}
 	return id.Announce || id.Personalize
 }
 // AnnounceEnabled reports whether the conversation.item.speaker event should
 // be emitted: an identity block must be present with Announce turned on.
 func (p PipelineVoiceRecognition) AnnounceEnabled() bool {
 	if id := p.Identity; id != nil {
 		return id.Announce
 	}
 	return false
 }
 // PersonalizeEnabled reports whether the LLM should be informed of the
 // speaker: an identity block must be present with Personalize turned on.
 func (p PipelineVoiceRecognition) PersonalizeEnabled() bool {
 	if id := p.Identity; id != nil {
 		return id.Personalize
 	}
 	return false
 }
 // Normalize fills in defaults in place for omitted fields.
 func (v *PipelineVoiceRecognition) Normalize() {
 	if v.Mode == "" {
@@ -1111,107 +1183,22 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
 	// This ensures gallery-installed and runtime-loaded models get optimal parameters.
 	ApplyInferenceDefaults(cfg, cfg.Name, cfg.Model)
-	// https://github.com/ggerganov/llama.cpp/blob/75cd4c77292034ecec587ecb401366f57338f7c0/common/sampling.h#L22
+	// Apply hardware-driven defaults (e.g. a larger physical batch on Blackwell).
-	defaultTopP := 0.95
+	// Uses the local GPU here; in distributed mode the router re-applies the same
-	defaultTopK := 40
+	// heuristics for the selected node's GPU before loading. Explicit config wins.
-	defaultMinP := 0.0
+	ApplyHardwareDefaults(cfg, localGPU())
-	defaultTemp := 0.9
+
-	// https://github.com/mudler/LocalAI/issues/2780
+	// Apply serving-policy defaults (device-independent): cross-request prefix
-	defaultMirostat := 0
+	// caching. Propagates to distributed nodes via the model options.
-	defaultMirostatTAU := 5.0
+	ApplyServingDefaults(cfg)
-	defaultMirostatETA := 0.1
+
-	defaultTypicalP := 1.0
+	// Generic fallback defaults (sampling params + runtime flags), applied after
-	defaultTFZ := 1.0
+	// the model-family / hardware / serving tiers above. Only fills unset values.
-	defaultZero := 0
+	ApplyGenericDefaults(cfg)
 	trueV := true
 	falseV := false
 	if cfg.Seed == nil {
 		//  random number generator seed
 		defaultSeed := RAND_SEED
 		cfg.Seed = &defaultSeed
 	}
 	// top_k=40 is llama.cpp's sampling default and is wrong for backends whose
 	// native default differs (issue #6632). Only inject it for the llama.cpp
 	// family and the empty/auto backend; leave TopK nil for known non-llama
 	// backends (e.g. mlx, whose intended default is top_k=0) so the wire value
 	// is 0 rather than a silently-changed 40.
 	if cfg.TopK == nil && UsesLlamaSamplerDefaults(cfg.Backend) {
 		cfg.TopK = &defaultTopK
 	}
 	if cfg.MinP == nil {
 		cfg.MinP = &defaultMinP
 	}
 	if cfg.TypicalP == nil {
 		cfg.TypicalP = &defaultTypicalP
 	}
 	if cfg.TFZ == nil {
 		cfg.TFZ = &defaultTFZ
 	}
 	if cfg.MMap == nil {
 		// MMap is enabled by default
 		// Only exception is for Intel GPUs
 		if os.Getenv("XPU") != "" {
 			cfg.MMap = &falseV
 		} else {
 			cfg.MMap = &trueV
 		}
 	}
 	if cfg.MMlock == nil {
 		// MMlock is disabled by default
 		cfg.MMlock = &falseV
 	}
 	if cfg.TopP == nil {
 		cfg.TopP = &defaultTopP
 	}
 	if cfg.Temperature == nil {
 		cfg.Temperature = &defaultTemp
 	}
 	if cfg.Maxtokens == nil {
 		cfg.Maxtokens = &defaultZero
 	}
 	if cfg.Mirostat == nil {
 		cfg.Mirostat = &defaultMirostat
 	}
 	if cfg.MirostatETA == nil {
 		cfg.MirostatETA = &defaultMirostatETA
 	}
 	if cfg.MirostatTAU == nil {
 		cfg.MirostatTAU = &defaultMirostatTAU
 	}
 	if cfg.LowVRAM == nil {
 		cfg.LowVRAM = &falseV
 	}
 	if cfg.Embeddings == nil {
 		cfg.Embeddings = &falseV
 	}
 	if cfg.Reranking == nil {
 		cfg.Reranking = &falseV
 	}
 	if cfg.PromptCacheAll == nil {
 		// Match upstream llama.cpp's default (common/common.h: cache_prompt = true)
 		// and let cache_idle_slots / kv_unified actually do useful work; users can
 		// opt out with an explicit `prompt_cache_all: false` in the model YAML.
 		cfg.PromptCacheAll = &trueV
 	}
 	if threads == 0 {
 		// Threads can't be 0
 		threads = 4
@@ -1243,6 +1230,22 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
 }
 func (c *ModelConfig) Validate() (bool, error) {
 	// An alias is a pure redirect: validate only its own shape here. Target
 	// existence and the no-chain rule need the full config set, so the loader
 	// (load-time) and the create/swap endpoints enforce those.
 	if c.IsAlias() {
 		if c.Name == "" {
 			return false, fmt.Errorf("alias config requires a name")
 		}
 		if c.Alias == c.Name {
 			return false, fmt.Errorf("alias %q cannot point to itself", c.Name)
 		}
 		if c.Backend != "" || c.Model != "" {
 			return false, fmt.Errorf("alias config %q must not set backend or parameters.model: an alias is a pure redirect", c.Name)
 		}
 		return true, nil
 	}
 	downloadedFileNames := []string{}
 	for _, f := range c.DownloadFiles {
 		downloadedFileNames = append(downloadedFileNames, f.Filename)
@@ -1463,6 +1466,11 @@ const (
 	// so it may combine freely with other usecases.
 	FLAG_TOKEN_CLASSIFY ModelConfigUsecase = 0b1000000000000000000000
 	// Marks a model as wired for the SoundDetection gRPC primitive
 	// (audio tagging / sound-event classification — scored AudioSet
 	// labels via the SoundDetection RPC, e.g. ced).
 	FLAG_SOUND_CLASSIFICATION ModelConfigUsecase = 0b10000000000000000000000
 	// Common Subsets
 	FLAG_LLM ModelConfigUsecase = FLAG_CHAT | FLAG_COMPLETION | FLAG_EDIT
 )
@@ -1471,12 +1479,12 @@ const (
 // Flags within the same group are NOT orthogonal (e.g., chat and completion are
 // both text/language). A model is multimodal when its usecases span 2+ groups.
 var ModalityGroups = []ModelConfigUsecase{
-	FLAG_CHAT | FLAG_COMPLETION | FLAG_EDIT,                // text/language
+	FLAG_CHAT | FLAG_COMPLETION | FLAG_EDIT,                           // text/language
-	FLAG_VISION | FLAG_DETECTION,                           // visual understanding
+	FLAG_VISION | FLAG_DETECTION,                                      // visual understanding
-	FLAG_TRANSCRIPT | FLAG_REALTIME_AUDIO,                  // speech input — realtime_audio is any-to-any, so it counts here too
+	FLAG_TRANSCRIPT | FLAG_REALTIME_AUDIO | FLAG_SOUND_CLASSIFICATION, // audio input — realtime_audio is any-to-any, so it counts here too
-	FLAG_TTS | FLAG_SOUND_GENERATION | FLAG_REALTIME_AUDIO, // audio output — and here, so a lone realtime_audio flag still reads as multimodal
+	FLAG_TTS | FLAG_SOUND_GENERATION | FLAG_REALTIME_AUDIO,            // audio output — and here, so a lone realtime_audio flag still reads as multimodal
-	FLAG_AUDIO_TRANSFORM,                                   // audio in/out transforms
+	FLAG_AUDIO_TRANSFORM,                                              // audio in/out transforms
-	FLAG_IMAGE | FLAG_VIDEO,                                // visual generation
+	FLAG_IMAGE | FLAG_VIDEO,                                           // visual generation
 }
 // IsMultimodal returns true if the given usecases span two or more orthogonal
@@ -1499,29 +1507,30 @@ func GetAllModelConfigUsecases() map[string]ModelConfigUsecase {
 	return map[string]ModelConfigUsecase{
 		// Note: FLAG_ANY is intentionally excluded from this map
 		// because it's 0 and would always match in HasUsecases checks
-		"FLAG_CHAT":                FLAG_CHAT,
+		"FLAG_CHAT":                 FLAG_CHAT,
-		"FLAG_COMPLETION":          FLAG_COMPLETION,
+		"FLAG_COMPLETION":           FLAG_COMPLETION,
-		"FLAG_EDIT":                FLAG_EDIT,
+		"FLAG_EDIT":                 FLAG_EDIT,
-		"FLAG_EMBEDDINGS":          FLAG_EMBEDDINGS,
+		"FLAG_EMBEDDINGS":           FLAG_EMBEDDINGS,
-		"FLAG_RERANK":              FLAG_RERANK,
+		"FLAG_RERANK":               FLAG_RERANK,
-		"FLAG_IMAGE":               FLAG_IMAGE,
+		"FLAG_IMAGE":                FLAG_IMAGE,
-		"FLAG_TRANSCRIPT":          FLAG_TRANSCRIPT,
+		"FLAG_TRANSCRIPT":           FLAG_TRANSCRIPT,
-		"FLAG_TTS":                 FLAG_TTS,
+		"FLAG_TTS":                  FLAG_TTS,
-		"FLAG_SOUND_GENERATION":    FLAG_SOUND_GENERATION,
+		"FLAG_SOUND_GENERATION":     FLAG_SOUND_GENERATION,
-		"FLAG_TOKENIZE":            FLAG_TOKENIZE,
+		"FLAG_TOKENIZE":             FLAG_TOKENIZE,
-		"FLAG_VAD":                 FLAG_VAD,
+		"FLAG_VAD":                  FLAG_VAD,
-		"FLAG_LLM":                 FLAG_LLM,
+		"FLAG_LLM":                  FLAG_LLM,
-		"FLAG_VIDEO":               FLAG_VIDEO,
+		"FLAG_VIDEO":                FLAG_VIDEO,
-		"FLAG_DETECTION":           FLAG_DETECTION,
+		"FLAG_DETECTION":            FLAG_DETECTION,
-		"FLAG_VISION":              FLAG_VISION,
+		"FLAG_VISION":               FLAG_VISION,
-		"FLAG_FACE_RECOGNITION":    FLAG_FACE_RECOGNITION,
+		"FLAG_FACE_RECOGNITION":     FLAG_FACE_RECOGNITION,
-		"FLAG_SPEAKER_RECOGNITION": FLAG_SPEAKER_RECOGNITION,
+		"FLAG_SPEAKER_RECOGNITION":  FLAG_SPEAKER_RECOGNITION,
-		"FLAG_AUDIO_TRANSFORM":     FLAG_AUDIO_TRANSFORM,
+		"FLAG_AUDIO_TRANSFORM":      FLAG_AUDIO_TRANSFORM,
-		"FLAG_DIARIZATION":         FLAG_DIARIZATION,
+		"FLAG_DIARIZATION":          FLAG_DIARIZATION,
-		"FLAG_REALTIME_AUDIO":      FLAG_REALTIME_AUDIO,
+		"FLAG_SOUND_CLASSIFICATION": FLAG_SOUND_CLASSIFICATION,
-		"FLAG_SCORE":               FLAG_SCORE,
+		"FLAG_REALTIME_AUDIO":       FLAG_REALTIME_AUDIO,
-		"FLAG_DEPTH":               FLAG_DEPTH,
+		"FLAG_SCORE":                FLAG_SCORE,
-		"FLAG_TOKEN_CLASSIFY":      FLAG_TOKEN_CLASSIFY,
+		"FLAG_DEPTH":                FLAG_DEPTH,
 		"FLAG_TOKEN_CLASSIFY":       FLAG_TOKEN_CLASSIFY,
 	}
 }
@@ -1724,6 +1733,16 @@ func (c *ModelConfig) GuessUsecases(u ModelConfigUsecase) bool {
 		}
 	}
 	if (u & FLAG_SOUND_CLASSIFICATION) == FLAG_SOUND_CLASSIFICATION {
 		// ced is a sound-event tagger (AudioSet labels) surfaced via the
 		// SoundDetection gRPC. Models without an explicit known_usecases
 		// still surface when they run on one of these backends.
 		soundClassificationBackends := []string{"ced"}
 		if !slices.Contains(soundClassificationBackends, c.Backend) {
 			return false
 		}
 	}
 	if (u & FLAG_REALTIME_AUDIO) == FLAG_REALTIME_AUDIO {
 		// Backends that own a single any-to-any loop and implement
 		// AudioToAudioStream — listed here so models without an explicit
--- a/core/config/model_config_loader.go
+++ b/core/config/model_config_loader.go
@@ -294,6 +294,44 @@ func (bcl *ModelConfigLoader) UpdateModelConfig(m string, updater func(*ModelCon
 	}
 }
 // ResolveAlias follows a single alias hop and returns (resolved, wasAlias, err).
 //
 // A nil or non-alias cfg is handed back untouched as (cfg, false, nil). For an
 // alias the lookup is strict: the target has to exist and must not be an alias
 // itself (chains are rejected); either failure yields (nil, true, err). On
 // success the result points at a copy of the target config.
 func (bcl *ModelConfigLoader) ResolveAlias(cfg *ModelConfig) (*ModelConfig, bool, error) {
 	if cfg == nil || !cfg.IsAlias() {
 		return cfg, false, nil
 	}
 	resolved, found := bcl.GetModelConfig(cfg.Alias)
 	switch {
 	case !found:
 		return nil, true, fmt.Errorf("alias %q points to unknown model %q", cfg.Name, cfg.Alias)
 	case resolved.IsAlias():
 		return nil, true, fmt.Errorf("alias %q points to another alias %q (chains are not allowed)", cfg.Name, cfg.Alias)
 	}
 	return &resolved, true, nil
 }
 // ValidateAliasTarget vets the target of an alias config at create/swap time.
 //
 // It returns nil for a nil or non-alias cfg. For an alias it returns an error
 // when the target does not exist, is itself an alias (chains are not allowed),
 // or is disabled; the checks run in that order and the first failure wins.
 func (bcl *ModelConfigLoader) ValidateAliasTarget(cfg *ModelConfig) error {
 	if cfg == nil || !cfg.IsAlias() {
 		return nil
 	}
 	dest, found := bcl.GetModelConfig(cfg.Alias)
 	switch {
 	case !found:
 		return fmt.Errorf("alias target %q does not exist", cfg.Alias)
 	case dest.IsAlias():
 		return fmt.Errorf("alias target %q is itself an alias (chains are not allowed)", cfg.Alias)
 	case dest.IsDisabled():
 		return fmt.Errorf("alias target %q is disabled", cfg.Alias)
 	}
 	return nil
 }
 // Preload prepares models that are not local files but are referenced by URL or Hugging Face repository.
 func (bcl *ModelConfigLoader) Preload(modelPath string) error {
 	bcl.Lock()
@@ -475,5 +513,21 @@ func (bcl *ModelConfigLoader) LoadModelConfigsFromPath(path string, opts ...Conf
 		}
 	}
 	// Surface aliases whose targets are missing or themselves aliases. These
 	// resolve to a clear request-time error; warning here gives operators
 	// visibility without failing startup.
 	for name, c := range bcl.configs {
 		if !c.IsAlias() {
 			continue
 		}
 		target, ok := bcl.configs[c.Alias]
 		switch {
 		case !ok:
 			xlog.Warn("alias points to unknown model", "alias", name, "target", c.Alias)
 		case target.IsAlias():
 			xlog.Warn("alias points to another alias (chains are not allowed)", "alias", name, "target", c.Alias)
 		}
 	}
 	return nil
 }
--- a/core/config/model_config_loader_test.go
+++ b/core/config/model_config_loader_test.go
@@ -61,3 +61,51 @@ var _ = Describe("ModelConfigLoader.GetModelsConflictingWith", func() {
 		Expect(bcl.GetModelsConflictingWith("a")).To(ConsistOf("b"))
 	})
 })
 // Specs for ModelConfigLoader.ResolveAlias and ValidateAliasTarget. The
 // fixtures are written straight into loader.configs: one real model, one alias
 // to it, one alias-to-alias ("chain") and one alias to a missing model
 // ("dangling").
 var _ = Describe("ModelConfigLoader alias resolution", func() {
 	var loader *ModelConfigLoader
 	BeforeEach(func() {
 		// Fresh loader per spec so no state leaks between cases.
 		loader = NewModelConfigLoader("")
 		loader.configs["real"] = ModelConfig{Name: "real", Backend: "llama-cpp"}
 		loader.configs["gpt-4"] = ModelConfig{Name: "gpt-4", Alias: "real"}
 		loader.configs["chain"] = ModelConfig{Name: "chain", Alias: "gpt-4"}
 		loader.configs["dangling"] = ModelConfig{Name: "dangling", Alias: "nope"}
 	})
 	// A config without an alias passes through with wasAlias=false.
 	It("returns non-alias configs unchanged", func() {
 		cfg := loader.configs["real"]
 		got, was, err := loader.ResolveAlias(&cfg)
 		Expect(err).ToNot(HaveOccurred())
 		Expect(was).To(BeFalse())
 		Expect(got.Name).To(Equal("real"))
 	})
 	// One hop: the alias resolves to the real target's config.
 	It("resolves an alias to its target", func() {
 		cfg := loader.configs["gpt-4"]
 		got, was, err := loader.ResolveAlias(&cfg)
 		Expect(err).ToNot(HaveOccurred())
 		Expect(was).To(BeTrue())
 		Expect(got.Name).To(Equal("real"))
 	})
 	// Alias -> alias is an error, but wasAlias is still reported as true.
 	It("rejects an alias chain", func() {
 		cfg := loader.configs["chain"]
 		_, was, err := loader.ResolveAlias(&cfg)
 		Expect(was).To(BeTrue())
 		Expect(err).To(MatchError(ContainSubstring("chains are not allowed")))
 	})
 	// Alias -> missing model is an error.
 	It("rejects a dangling alias", func() {
 		cfg := loader.configs["dangling"]
 		_, _, err := loader.ResolveAlias(&cfg)
 		Expect(err).To(MatchError(ContainSubstring("unknown model")))
 	})
 	// ValidateAliasTarget accepts a real target and rejects a chained one.
 	It("ValidateAliasTarget passes for a real target and fails for a chain", func() {
 		good := loader.configs["gpt-4"]
 		Expect(loader.ValidateAliasTarget(&good)).ToNot(HaveOccurred())
 		bad := loader.configs["chain"]
 		Expect(loader.ValidateAliasTarget(&bad)).To(MatchError(ContainSubstring("itself an alias")))
 	})
 })
--- a/core/config/model_config_test.go
+++ b/core/config/model_config_test.go
@@ -787,3 +787,32 @@ var _ = Describe("pattern detector config", func() {
 		Expect(err).To(MatchError(ContainSubstring("pattern \"EMAILish\"")))
 	})
 })
 // Specs for the alias-only shape checks on ModelConfig (IsAlias and Validate).
 // Nothing here needs a loader: target existence is not checked at this level.
 var _ = Describe("ModelConfig alias", func() {
 	// IsAlias is true when Alias is set and false for a config without one.
 	It("reports IsAlias when alias is set", func() {
 		c := ModelConfig{Name: "gpt-4", Alias: "my-llama-3"}
 		Expect(c.IsAlias()).To(BeTrue())
 		Expect(ModelConfig{Name: "real"}.IsAlias()).To(BeFalse())
 	})
 	// Name + Alias alone is a valid alias config.
 	It("validates a minimal alias config", func() {
 		c := ModelConfig{Name: "gpt-4", Alias: "my-llama-3"}
 		ok, err := c.Validate()
 		Expect(err).ToNot(HaveOccurred())
 		Expect(ok).To(BeTrue())
 	})
 	// Alias == Name is rejected.
 	It("rejects an alias pointing to itself", func() {
 		c := ModelConfig{Name: "loop", Alias: "loop"}
 		ok, err := c.Validate()
 		Expect(ok).To(BeFalse())
 		Expect(err).To(MatchError(ContainSubstring("itself")))
 	})
 	// An alias is a pure redirect, so setting a backend alongside it is rejected.
 	It("rejects an alias that also sets a backend", func() {
 		c := ModelConfig{Name: "gpt-4", Alias: "my-llama-3", Backend: "llama-cpp"}
 		ok, err := c.Validate()
 		Expect(ok).To(BeFalse())
 		Expect(err).To(MatchError(ContainSubstring("pure redirect")))
 	})
 })
--- a/core/config/runtime_settings.go
+++ b/core/config/runtime_settings.go
@@ -28,6 +28,7 @@ type RuntimeSettings struct {
 	// Eviction settings
 	ForceEvictionWhenBusy    *bool   `json:"force_eviction_when_busy,omitempty"`    // Force eviction even when models have active API calls (default: false for safety)
 	SizeAwareEviction        *bool   `json:"size_aware_eviction,omitempty"`          // Evict largest models first rather than least-recently-used (default: false)
 	LRUEvictionMaxRetries    *int    `json:"lru_eviction_max_retries,omitempty"`    // Maximum number of retries when waiting for busy models to become idle (default: 30)
 	LRUEvictionRetryInterval *string `json:"lru_eviction_retry_interval,omitempty"` // Interval between retries when waiting for busy models (e.g., 1s, 2s) (default: 1s)
--- a/core/config/serving_defaults.go
+++ b/core/config/serving_defaults.go
@@ -0,0 +1,56 @@
 package config
 import (
 	"fmt"
 	"strings"
 	"github.com/mudler/xlog"
 )
 // Serving-policy model-config defaults.
 //
 // Sibling to hardware_defaults.go: those fill values driven by the target
 // *device* (Blackwell batch, VRAM-scaled parallel slots); these fill values
 // that improve multi-request / multi-user *serving* regardless of the GPU. They
 // run together from SetDefaults and only ever fill values the user left unset.
 // DefaultCacheReuse is the minimum shared-prefix chunk (in tokens) the backend
 // reuses across requests via KV-cache shifting. The llama.cpp backend ships this
 // disabled (n_cache_reuse = 0); we enable it so repeated prefixes (system
 // prompts, RAG context, agent scaffolds, multi-turn chat) are not recomputed.
 // This is the universally-useful part of "paged attention" (cross-request prefix
 // sharing) and needs none of the block-KV machinery.
 //
 // ApplyServingDefaults appends it to the model options as "cache_reuse:<value>"
 // when neither cache_reuse nor n_cache_reuse is already set.
 const DefaultCacheReuse = 256
 // ApplyServingDefaults fills in the serving-policy values the user did not set
 // on cfg. Today that is a single knob: cross-request prefix caching, added as a
 // "cache_reuse:<DefaultCacheReuse>" backend option. An explicit cache_reuse or
 // n_cache_reuse entry in the model options always wins, and a nil cfg is a
 // no-op.
 func ApplyServingDefaults(cfg *ModelConfig) {
 	// Bail out early when there is nothing to fill: no config at all, or the
 	// user already chose a value under either spelling of the option.
 	if cfg == nil || backendOptionSet(cfg.Options, "cache_reuse", "n_cache_reuse") {
 		return
 	}
 	cfg.Options = append(cfg.Options, fmt.Sprintf("cache_reuse:%d", DefaultCacheReuse))
 	xlog.Debug("[serving_defaults] enabling cross-request prefix cache",
 		"cache_reuse", DefaultCacheReuse)
 }
 // backendOptionSet reports whether opts already carries any option listed in
 // names. Each option is either "name:value" or a bare "name"; only the part
 // before the first ':' is considered, with surrounding whitespace ignored.
 // Matching is case-insensitive on both sides, so a requested name that is not
 // lowercase (e.g. "Cache_Reuse") still finds "cache_reuse:0". It is used so
 // defaults never override an explicit value. Shared with hardware_defaults.go.
 //
 // Returns false when opts or names is empty.
 func backendOptionSet(opts []string, names ...string) bool {
 	for _, opt := range opts {
 		// strings.Cut yields the whole string when there is no ':' (bare option).
 		key, _, _ := strings.Cut(opt, ":")
 		key = strings.TrimSpace(strings.ToLower(key))
 		for _, want := range names {
 			// The exact comparison preserves the historical lowercase match;
 			// EqualFold additionally accepts names not passed in lowercase.
 			if key == want || strings.EqualFold(key, want) {
 				return true
 			}
 		}
 	}
 	return false
 }
--- a/core/config/serving_defaults_test.go
+++ b/core/config/serving_defaults_test.go
@@ -0,0 +1,30 @@
 package config_test
 import (
 	. "github.com/mudler/LocalAI/core/config"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 )
 // Specs for ApplyServingDefaults: the cache_reuse default is added only when
 // the user has not set it under either accepted option name.
 var _ = Describe("Serving-policy config defaults", func() {
 	Describe("ApplyServingDefaults (cross-request prefix cache)", func() {
 		// Empty options: the default "cache_reuse:256" is appended.
 		It("enables cache_reuse when unset", func() {
 			cfg := &ModelConfig{}
 			ApplyServingDefaults(cfg)
 			Expect(cfg.Options).To(ContainElement("cache_reuse:256"))
 		})
 		// An explicit value, even 0 (disabled), is left exactly as written.
 		It("never overrides an explicit cache_reuse", func() {
 			cfg := &ModelConfig{Options: []string{"cache_reuse:0"}}
 			ApplyServingDefaults(cfg)
 			Expect(cfg.Options).To(Equal([]string{"cache_reuse:0"}))
 		})
 		// n_cache_reuse counts as the same setting, so nothing is appended.
 		It("recognizes the n_cache_reuse alias", func() {
 			cfg := &ModelConfig{Options: []string{"n_cache_reuse:512"}}
 			ApplyServingDefaults(cfg)
 			Expect(cfg.Options).To(Equal([]string{"n_cache_reuse:512"}))
 		})
 		// A nil config must be tolerated without panicking.
 		It("no-ops on nil", func() {
 			Expect(func() { ApplyServingDefaults(nil) }).ToNot(Panic())
 		})
 	})
 })
--- a/core/config/voice_gate_test.go
+++ b/core/config/voice_gate_test.go
@@ -70,4 +70,32 @@ var _ = Describe("PipelineVoiceRecognition", func() {
 			Expect((Pipeline{VoiceRecognition: &PipelineVoiceRecognition{}}).VoiceGateEnabled()).To(BeTrue())
 		})
 	})
 	// Specs for the EnforceGate / IdentityEnabled / AnnounceEnabled /
 	// PersonalizeEnabled helpers on PipelineVoiceRecognition.
 	Describe("Enforce / Identity helpers", func() {
 		// Omitting enforce keeps the gate on.
 		It("treats a nil Enforce as enforcing (backward compatible)", func() {
 			v := PipelineVoiceRecognition{Model: "spk"}
 			Expect(v.EnforceGate()).To(BeTrue())
 		})
 		// Only an explicit false turns the gate off.
 		It("honors an explicit enforce:false", func() {
 			off := false
 			v := PipelineVoiceRecognition{Model: "spk", Enforce: &off}
 			Expect(v.EnforceGate()).To(BeFalse())
 		})
 		// Without an identity block every identity helper reports false.
 		It("reports identity disabled when no identity block is set", func() {
 			v := PipelineVoiceRecognition{Model: "spk"}
 			Expect(v.IdentityEnabled()).To(BeFalse())
 			Expect(v.AnnounceEnabled()).To(BeFalse())
 			Expect(v.PersonalizeEnabled()).To(BeFalse())
 		})
 		// Either flag enables identity; each helper reflects only its own flag.
 		It("reports identity enabled when announce or personalize is on", func() {
 			v := PipelineVoiceRecognition{Model: "spk", Identity: &VoiceIdentityConfig{Announce: true}}
 			Expect(v.IdentityEnabled()).To(BeTrue())
 			Expect(v.AnnounceEnabled()).To(BeTrue())
 			Expect(v.PersonalizeEnabled()).To(BeFalse())
 			v2 := PipelineVoiceRecognition{Model: "spk", Identity: &VoiceIdentityConfig{Personalize: true}}
 			Expect(v2.IdentityEnabled()).To(BeTrue())
 			Expect(v2.PersonalizeEnabled()).To(BeTrue())
 		})
 	})
 })
--- a/core/http/auth/features.go
+++ b/core/http/auth/features.go
@@ -48,6 +48,10 @@ var RouteFeatureRegistry = []RouteFeature{
 	{"POST", "/v1/audio/diarization", FeatureAudioDiarization},
 	{"POST", "/audio/diarization", FeatureAudioDiarization},
 	// Audio classification (sound-event tagging)
 	{"POST", "/v1/audio/classification", FeatureAudioClassification},
 	{"POST", "/audio/classification", FeatureAudioClassification},
 	// Audio speech / TTS
 	{"POST", "/v1/audio/speech", FeatureAudioSpeech},
 	{"POST", "/audio/speech", FeatureAudioSpeech},
@@ -172,6 +176,7 @@ func APIFeatureMetas() []FeatureMeta {
 		{FeatureAudioSpeech, "Audio Speech / TTS", true},
 		{FeatureAudioTranscription, "Audio Transcription", true},
 		{FeatureAudioDiarization, "Audio Diarization", true},
 		{FeatureAudioClassification, "Audio Classification", true},
 		{FeatureVAD, "Voice Activity Detection", true},
 		{FeatureDetection, "Detection", true},
 		{FeatureVideo, "Video Generation", true},
--- a/core/http/auth/permissions.go
+++ b/core/http/auth/permissions.go
@@ -38,24 +38,25 @@ const (
 	FeatureQuantization = "quantization"
 	// API features (default ON for new users)
-	FeatureChat               = "chat"
+	FeatureChat                = "chat"
-	FeatureImages             = "images"
+	FeatureImages              = "images"
-	FeatureAudioSpeech        = "audio_speech"
+	FeatureAudioSpeech         = "audio_speech"
-	FeatureAudioTranscription = "audio_transcription"
+	FeatureAudioTranscription  = "audio_transcription"
-	FeatureAudioDiarization   = "audio_diarization"
+	FeatureAudioDiarization    = "audio_diarization"
-	FeatureVAD                = "vad"
+	FeatureAudioClassification = "audio_classification"
-	FeatureDetection          = "detection"
+	FeatureVAD                 = "vad"
-	FeatureVideo              = "video"
+	FeatureDetection           = "detection"
-	FeatureEmbeddings         = "embeddings"
+	FeatureVideo               = "video"
-	FeatureSound              = "sound"
+	FeatureEmbeddings          = "embeddings"
-	FeatureRealtime           = "realtime"
+	FeatureSound               = "sound"
-	FeatureRerank             = "rerank"
+	FeatureRealtime            = "realtime"
-	FeatureTokenize           = "tokenize"
+	FeatureRerank              = "rerank"
-	FeatureMCP                = "mcp"
+	FeatureTokenize            = "tokenize"
-	FeatureStores             = "stores"
+	FeatureMCP                 = "mcp"
-	FeatureFaceRecognition    = "face_recognition"
+	FeatureStores              = "stores"
-	FeatureVoiceRecognition   = "voice_recognition"
+	FeatureFaceRecognition     = "face_recognition"
-	FeatureAudioTransform     = "audio_transform"
+	FeatureVoiceRecognition    = "voice_recognition"
 	FeatureAudioTransform      = "audio_transform"
 	// FeaturePIIFilter gates the synchronous PII analyze/redact service
 	// (POST /api/pii/{analyze,redact}). Default ON like the other API
 	// features; the admin-only events log is gated separately in-handler.
@@ -71,7 +72,7 @@ var GeneralFeatures = []string{FeatureFineTuning, FeatureQuantization}
 // APIFeatures lists API endpoint features (default ON).
 var APIFeatures = []string{
 	FeatureChat, FeatureImages, FeatureAudioSpeech, FeatureAudioTranscription,
-	FeatureAudioDiarization,
+	FeatureAudioDiarization, FeatureAudioClassification,
 	FeatureVAD, FeatureDetection, FeatureVideo, FeatureEmbeddings, FeatureSound,
 	FeatureRealtime, FeatureRerank, FeatureTokenize, FeatureMCP, FeatureStores,
 	FeatureFaceRecognition, FeatureVoiceRecognition, FeatureAudioTransform,
--- a/core/http/endpoints/localai/aliases.go
+++ b/core/http/endpoints/localai/aliases.go
@@ -0,0 +1,33 @@
 package localai
 import (
 	"net/http"
 	"github.com/labstack/echo/v4"
 	"github.com/mudler/LocalAI/core/config"
 )
 // AliasInfo is one alias -> target pair, as returned by ListAliasesEndpoint.
 type AliasInfo struct {
 	// Name is the alias config's own name (the name clients request).
 	Name   string `json:"name"`
 	// Target is the value of the config's alias field, i.e. the model it
 	// redirects to.
 	Target string `json:"target"`
 }
 // ListAliasesEndpoint builds the handler that reports every configured model
 // alias together with its target. Configs that are not aliases are left out.
 // The response is always a JSON array with status 200.
 //
 //	@Summary	List model aliases
 //	@Tags		models
 //	@Success	200	{array}	AliasInfo
 //	@Router		/api/aliases [get]
 func ListAliasesEndpoint(cl *config.ModelConfigLoader) echo.HandlerFunc {
 	return func(c echo.Context) error {
 		// Start from an empty, non-nil slice: with no aliases the body is []
 		// and not null.
 		aliases := make([]AliasInfo, 0)
 		for _, mc := range cl.GetAllModelsConfigs() {
 			if !mc.IsAlias() {
 				continue
 			}
 			aliases = append(aliases, AliasInfo{Name: mc.Name, Target: mc.Alias})
 		}
 		return c.JSON(http.StatusOK, aliases)
 	}
 }
--- a/core/http/endpoints/localai/aliases_test.go
+++ b/core/http/endpoints/localai/aliases_test.go
@@ -0,0 +1,57 @@
 package localai_test
 import (
 	"net/http"
 	"net/http/httptest"
 	"os"
 	"path/filepath"
 	"github.com/labstack/echo/v4"
 	"github.com/mudler/LocalAI/core/config"
 	. "github.com/mudler/LocalAI/core/http/endpoints/localai"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 )
 // End-to-end spec for ListAliasesEndpoint: model configs are written to a
 // temporary directory, loaded through a real ModelConfigLoader, and the
 // handler is exercised via an in-memory echo app.
 var _ = Describe("ListAliasesEndpoint", func() {
 	var tempDir string
 	BeforeEach(func() {
 		// Each spec gets its own scratch models directory.
 		var err error
 		tempDir, err = os.MkdirTemp("", "localai-aliases-test")
 		Expect(err).ToNot(HaveOccurred())
 	})
 	AfterEach(func() {
 		// Best-effort cleanup; a failure here must not fail the spec.
 		_ = os.RemoveAll(tempDir)
 	})
 	It("returns only alias configs as name/target pairs", func() {
 		// Seed one real model and one alias pointing at it.
 		Expect(os.WriteFile(
 			filepath.Join(tempDir, "real.yaml"),
 			[]byte("name: real\nbackend: llama-cpp\nmodel: foo\n"),
 			0644,
 		)).To(Succeed())
 		Expect(os.WriteFile(
 			filepath.Join(tempDir, "gpt-4.yaml"),
 			[]byte("name: gpt-4\nalias: real\n"),
 			0644,
 		)).To(Succeed())
 		loader := config.NewModelConfigLoader(tempDir)
 		Expect(loader.LoadModelConfigsFromPath(tempDir)).To(Succeed())
 		// Serve the handler without opening a socket.
 		app := echo.New()
 		app.GET("/api/aliases", ListAliasesEndpoint(loader))
 		req := httptest.NewRequest("GET", "/api/aliases", nil)
 		rec := httptest.NewRecorder()
 		app.ServeHTTP(rec, req)
 		Expect(rec.Code).To(Equal(http.StatusOK))
 		Expect(rec.Body.String()).To(ContainSubstring(`"name":"gpt-4"`))
 		Expect(rec.Body.String()).To(ContainSubstring(`"target":"real"`))
 		// The real model must not appear as an alias entry.
 		Expect(rec.Body.String()).ToNot(ContainSubstring(`"name":"real"`))
 	})
 })
--- a/core/http/endpoints/localai/api_instructions.go
+++ b/core/http/endpoints/localai/api_instructions.go
@@ -32,9 +32,9 @@ var instructionDefs = []instructionDef{
 	},
 	{
 		Name:        "audio",
-		Description: "Text-to-speech, voice activity detection, transcription, speaker diarization, and sound generation",
+		Description: "Text-to-speech, voice activity detection, transcription, speaker diarization, sound classification, and sound generation",
 		Tags:        []string{"audio"},
-		Intro:       "Diarization (/v1/audio/diarization) returns speaker-labelled time segments. Backends with native ASR-diarization (vibevoice-cpp) can also emit per-segment text via include_text=true; backends with a dedicated pipeline (sherpa-onnx + pyannote) emit segmentation only. Response formats: json (default), verbose_json (adds speakers summary + text), rttm (NIST format).",
+		Intro:       "Diarization (/v1/audio/diarization) returns speaker-labelled time segments. Backends with native ASR-diarization (vibevoice-cpp) can also emit per-segment text via include_text=true; backends with a dedicated pipeline (sherpa-onnx + pyannote) emit segmentation only. Response formats: json (default), verbose_json (adds speakers summary + text), rttm (NIST format). Sound classification (/v1/audio/classification) returns scored AudioSet sound-event tags (audio tagging via the ced backend); top_k and threshold control the returned set.",
 	},
 	{
 		Name:        "images",
--- a/core/http/endpoints/localai/import_model.go
+++ b/core/http/endpoints/localai/import_model.go
@@ -181,6 +181,12 @@ func ImportModelEndpoint(cl *config.ModelConfigLoader, appConfig *config.Applica
 			return c.JSON(http.StatusBadRequest, ModelResponse{Success: false, Error: msg})
 		}
 		// Reject aliases whose target is missing, chained, or disabled so a
 		// dangling alias can't be persisted and surface as a runtime error later.
 		if err := cl.ValidateAliasTarget(&modelConfig); err != nil {
 			return c.JSON(http.StatusBadRequest, ModelResponse{Success: false, Error: err.Error()})
 		}
 		// Create the configuration file
 		configPath := filepath.Join(appConfig.SystemState.Model.ModelsPath, modelConfig.Name+".yaml")
 		if err := utils.VerifyPath(modelConfig.Name+".yaml", appConfig.SystemState.Model.ModelsPath); err != nil {
--- a/core/http/endpoints/localai/nodes.go
+++ b/core/http/endpoints/localai/nodes.go
@@ -70,17 +70,20 @@ func GetNodeEndpoint(registry *nodes.NodeRegistry) echo.HandlerFunc {
 // RegisterNodeRequest is the request body for registering a new worker node.
 type RegisterNodeRequest struct {
-	Name          string            `json:"name"`
+	Name          string `json:"name"`
-	NodeType      string            `json:"node_type,omitempty"` // "backend" (default) or "agent"
+	NodeType      string `json:"node_type,omitempty"` // "backend" (default) or "agent"
-	Address       string            `json:"address"`
+	Address       string `json:"address"`
-	HTTPAddress   string            `json:"http_address,omitempty"`
+	HTTPAddress   string `json:"http_address,omitempty"`
-	Token         string            `json:"token,omitempty"`
+	Token         string `json:"token,omitempty"`
-	TotalVRAM     uint64            `json:"total_vram,omitempty"`
+	TotalVRAM     uint64 `json:"total_vram,omitempty"`
-	AvailableVRAM uint64            `json:"available_vram,omitempty"`
+	AvailableVRAM uint64 `json:"available_vram,omitempty"`
-	TotalRAM      uint64            `json:"total_ram,omitempty"`
+	TotalRAM      uint64 `json:"total_ram,omitempty"`
-	AvailableRAM  uint64            `json:"available_ram,omitempty"`
+	AvailableRAM  uint64 `json:"available_ram,omitempty"`
-	GPUVendor     string            `json:"gpu_vendor,omitempty"`
+	GPUVendor     string `json:"gpu_vendor,omitempty"`
-	Labels        map[string]string `json:"labels,omitempty"`
+	// GPUComputeCapability is the worker GPU's compute capability ("major.minor",
 	// e.g. "12.1" for GB10). Used by the router for per-arch option tuning.
 	GPUComputeCapability string            `json:"gpu_compute_capability,omitempty"`
 	Labels               map[string]string `json:"labels,omitempty"`
 	// MaxReplicasPerModel is the per-node cap on replicas of any single model.
 	// Workers older than this field omit it; we coerce 0 → 1 below to preserve
 	// historical single-replica behavior.
@@ -152,17 +155,18 @@ func RegisterNodeEndpoint(registry *nodes.NodeRegistry, expectedToken string, au
 		}
 		node := &nodes.BackendNode{
-			Name:                req.Name,
+			Name:                 req.Name,
-			NodeType:            nodeType,
+			NodeType:             nodeType,
-			Address:             req.Address,
+			Address:              req.Address,
-			HTTPAddress:         req.HTTPAddress,
+			HTTPAddress:          req.HTTPAddress,
-			TokenHash:           tokenHash,
+			TokenHash:            tokenHash,
-			TotalVRAM:           req.TotalVRAM,
+			TotalVRAM:            req.TotalVRAM,
-			AvailableVRAM:       req.AvailableVRAM,
+			AvailableVRAM:        req.AvailableVRAM,
-			TotalRAM:            req.TotalRAM,
+			TotalRAM:             req.TotalRAM,
-			AvailableRAM:        req.AvailableRAM,
+			AvailableRAM:         req.AvailableRAM,
-			GPUVendor:           req.GPUVendor,
+			GPUVendor:            req.GPUVendor,
-			MaxReplicasPerModel: maxReplicasPerModel,
+			GPUComputeCapability: req.GPUComputeCapability,
 			MaxReplicasPerModel:  maxReplicasPerModel,
 		}
 		ctx := c.Request().Context()
--- a/core/http/endpoints/mcp/localai_assistant_test.go
+++ b/core/http/endpoints/mcp/localai_assistant_test.go
@@ -51,6 +51,12 @@ func (stubClient) EditModelConfig(_ context.Context, _ string, _ map[string]any)
 	return nil
 }
 // ReloadModels is a no-op stub that always reports success.
 func (stubClient) ReloadModels(_ context.Context) error { return nil }
 // SetAlias is a no-op stub: both string arguments are ignored and nil is returned.
 func (stubClient) SetAlias(_ context.Context, _, _ string) error {
 	return nil
 }
 // ListAliases is a stub that returns no aliases and no error.
 func (stubClient) ListAliases(_ context.Context) ([]localaitools.AliasInfo, error) {
 	return nil, nil
 }
 // ListBackends returns one fixed backend named "stub-backend", marked installed.
 func (stubClient) ListBackends(_ context.Context) ([]localaitools.Backend, error) {
 	return []localaitools.Backend{{Name: "stub-backend", Installed: true}}, nil
 }
--- a/core/http/endpoints/openai/realtime.go
+++ b/core/http/endpoints/openai/realtime.go
@@ -93,16 +93,31 @@ type Session struct {
 	Voice                   string
 	TurnDetection           *types.TurnDetectionUnion // "server_vad", "semantic_vad" or "none"
 	InputAudioTranscription *types.AudioTranscription
-	Tools                   []types.ToolUnion
+
-	ToolChoice              *types.ToolChoiceUnion
+	// SoundDetectionEnabled is set when pipeline.sound_detection names a
-	Conversations           map[string]*Conversation
+	// sound-event-classification model. When true, each committed utterance is
-	InputAudioBuffer        []byte
+	// also run through ModelInterface.SoundDetection and the scored tags are
-	AudioBufferLock         sync.Mutex
+	// emitted as a conversation.item.sound_detection event. SoundDetectionTopK
-	OpusFrames              [][]byte
+	// and SoundDetectionThreshold are the knobs passed to that call (defaults:
-	OpusFramesLock          sync.Mutex
+	// top_k=5, threshold=0).
-	Instructions            string
+	SoundDetectionEnabled   bool
-	DefaultConversationID   string
+	SoundDetectionTopK      int
-	ModelInterface          Model
+	SoundDetectionThreshold float32
 	// SoundDetectionWindowMs / SoundDetectionHopMs, when both > 0, enable
 	// server-side windowing for a sound-only session: the server classifies the
 	// last WindowMs of streamed audio every HopMs (no client commits needed).
 	SoundDetectionWindowMs int
 	SoundDetectionHopMs    int
 	Tools                  []types.ToolUnion
 	ToolChoice             *types.ToolChoiceUnion
 	Conversations          map[string]*Conversation
 	InputAudioBuffer       []byte
 	AudioBufferLock        sync.Mutex
 	OpusFrames             [][]byte
 	OpusFramesLock         sync.Mutex
 	Instructions           string
 	DefaultConversationID  string
 	ModelInterface         Model
 	// The pipeline model config or the config for an any-to-any model
 	ModelConfig      *config.ModelConfig
 	InputSampleRate  int
@@ -250,6 +265,10 @@ type Model interface {
 	// TranscribeStream transcribes audio incrementally, invoking onDelta for each
 	// transcript text fragment and returning the final aggregated result.
 	TranscribeStream(ctx context.Context, audio, language string, translate, diarize bool, prompt string, onDelta func(text string)) (*schema.TranscriptionResult, error)
 	// SoundDetection classifies a committed audio window into scored AudioSet
 	// sound-event tags. topK caps the number of returned tags (0 = backend
 	// default), threshold drops tags below the given score (0 = keep all).
 	SoundDetection(ctx context.Context, audio string, topK int, threshold float32) (*schema.SoundClassificationResult, error)
 	PredictConfig() *config.ModelConfig
 }
@@ -399,7 +418,7 @@ func prepareRealtimeConfig(cfg *config.ModelConfig) (errCode, errMsg string, ok
 		return "", "", true
 	}
-	if cfg.Pipeline.VAD == "" && cfg.Pipeline.Transcription == "" && cfg.Pipeline.TTS == "" && cfg.Pipeline.LLM == "" {
+	if cfg.Pipeline.VAD == "" && cfg.Pipeline.Transcription == "" && cfg.Pipeline.TTS == "" && cfg.Pipeline.LLM == "" && cfg.Pipeline.SoundDetection == "" {
 		return "invalid_model", "Model is not a pipeline model", false
 	}
 	return "", "", true
@@ -469,6 +488,26 @@ func runRealtimeSession(application *application.Application, t Transport, model
 	sttModel := cfg.Pipeline.Transcription
 	// A sound-detection-only pipeline (sound_detection set, no transcription/LLM)
 	// activates on sounds, not speech, so it runs WITHOUT the voice VAD: the
 	// session defaults to turn_detection none. Windowing is driven by the client
 	// via input_audio_buffer.commit, or by the server when both the window and
 	// hop are configured. There is no transcription stage in that case.
 	soundOnly := cfg.Pipeline.SoundDetection != "" && cfg.Pipeline.Transcription == "" && cfg.Pipeline.LLM == ""
 	turnDetection := &types.TurnDetectionUnion{
 		ServerVad: &types.ServerVad{
 			Threshold:         0.5,
 			PrefixPaddingMs:   300,
 			SilenceDurationMs: 500,
 			CreateResponse:    true,
 		},
 	}
 	inputAudioTranscription := &types.AudioTranscription{Model: sttModel}
 	if soundOnly {
 		turnDetection = nil           // turn_detection none: no VAD
 		inputAudioTranscription = nil // no transcription stage
 	}
 	// Compose the system prompt: prepend the assistant prompt when we have
 	// one (it teaches the model the safety rules and tool recipes), then the
 	// session's default voice instructions. Order matches chat.go's
@@ -480,30 +519,26 @@ func runRealtimeSession(application *application.Application, t Transport, model
 	sessionID := generateSessionID()
 	session := &Session{
-		ID:                sessionID,
+		ID:                      sessionID,
-		TranscriptionOnly: false,
+		TranscriptionOnly:       false,
-		Model:             model,
+		Model:                   model,
-		Voice:             cfg.TTSConfig.Voice,
+		Voice:                   cfg.TTSConfig.Voice,
-		Instructions:      instructions,
+		Instructions:            instructions,
-		ModelConfig:       cfg,
+		ModelConfig:             cfg,
-		Tools:             assistantTools,
+		Tools:                   assistantTools,
-		AssistantTools:    assistantTools,
+		AssistantTools:          assistantTools,
-		AssistantExecutor: assistantExecutor,
+		AssistantExecutor:       assistantExecutor,
-		TurnDetection: &types.TurnDetectionUnion{
+		TurnDetection:           turnDetection,
-			ServerVad: &types.ServerVad{
+		InputAudioTranscription: inputAudioTranscription,
-				Threshold:         0.5,
+		Conversations:           make(map[string]*Conversation),
-				PrefixPaddingMs:   300,
+		InputSampleRate:         defaultRemoteSampleRate,
-				SilenceDurationMs: 500,
+		OutputSampleRate:        defaultRemoteSampleRate,
-				CreateResponse:    true,
+		MaxHistoryItems:         resolveMaxHistoryItems(cfg),
-			},
+		SoundDetectionEnabled:   cfg.Pipeline.SoundDetection != "",
-		},
+		SoundDetectionTopK:      defaultSoundDetectionTopK,
-		InputAudioTranscription: &types.AudioTranscription{
+		SoundDetectionThreshold: 0,
-			Model: sttModel,
+		SoundDetectionWindowMs:  cfg.Pipeline.SoundDetectionWindowMs,
-		},
+		SoundDetectionHopMs:     cfg.Pipeline.SoundDetectionHopMs,
 		Conversations:    make(map[string]*Conversation),
 		InputSampleRate:  defaultRemoteSampleRate,
 		OutputSampleRate: defaultRemoteSampleRate,
 		MaxHistoryItems:  resolveMaxHistoryItems(cfg),
 	}
 	// Create a default conversation
@@ -517,14 +552,24 @@ func runRealtimeSession(application *application.Application, t Transport, model
 	session.Conversations[conversationID] = conversation
 	session.DefaultConversationID = conversationID
-	m, err := newModel(
+	var m Model
-		&cfg.Pipeline,
+	if soundOnly {
-		application.ModelConfigLoader(),
+		m, err = newSoundDetectionOnlyModel(
-		application.ModelLoader(),
+			&cfg.Pipeline,
-		application.ApplicationConfig(),
+			application.ModelConfigLoader(),
-		evaluator,
+			application.ModelLoader(),
-		buildRealtimeRoutingContext(application, sessionID),
+			application.ApplicationConfig(),
-	)
+		)
 	} else {
 		m, err = newModel(
 			&cfg.Pipeline,
 			application.ModelConfigLoader(),
 			application.ModelLoader(),
 			application.ApplicationConfig(),
 			evaluator,
 			buildRealtimeRoutingContext(application, sessionID),
 		)
 	}
 	if err != nil {
 		xlog.Error("failed to load model", "error", err)
 		sendError(t, "model_load_error", "Failed to load model", "", "")
@@ -605,6 +650,20 @@ func runRealtimeSession(application *application.Application, t Transport, model
 	toggleVAD()
 	// Server-side sound-detection windowing (option B): for a sound-only session
 	// with window/hop configured, the server classifies the last window of
 	// streamed audio on a timer, so the client only has to stream (no commits).
 	// This runs independent of VAD (sound events are not speech).
 	var soundWindowDone chan struct{}
 	if soundOnly && session.SoundDetectionWindowMs > 0 && session.SoundDetectionHopMs > 0 {
 		soundWindowDone = make(chan struct{})
 		wg.Go(func() {
 			handleSoundWindow(session, t, soundWindowDone)
 		})
 		xlog.Debug("Starting server-side sound-detection windowing",
 			"window_ms", session.SoundDetectionWindowMs, "hop_ms", session.SoundDetectionHopMs)
 	}
 	for {
 		msg, err = t.ReadEvent()
 		if err != nil {
@@ -880,6 +939,10 @@ func runRealtimeSession(application *application.Application, t Transport, model
 	if vadServerStarted {
 		close(done)
 	}
 	// Stop the server-side sound-detection windowing goroutine (if running).
 	if soundWindowDone != nil {
 		close(soundWindowDone)
 	}
 	wg.Wait()
 	// Remove the session from the sessions map
@@ -971,6 +1034,10 @@ func updateTransSession(session *Session, update *types.SessionUnion, cl *config
 		session.ModelInterface = m
 		session.ModelConfig = cfg
 		session.SoundDetectionEnabled = cfg.Pipeline.SoundDetection != ""
 		if session.SoundDetectionTopK <= 0 {
 			session.SoundDetectionTopK = defaultSoundDetectionTopK
 		}
 	}
 	if trUpd != nil {
@@ -1311,35 +1378,40 @@ func commitUtterance(ctx context.Context, utt []byte, session *Session, conv *Co
 	// turn wastes only transcription compute, which has no side effects. The
 	// transcript is still emitted to the same peer that sent the audio, which
 	// reveals nothing new to them.
-	type gateOutcome struct {
+	// Resolve the speaker when the gate must authorize this turn, or when identity
-		allowed bool
+	// surfacing/personalization needs a fresh identity. Identity resolution
-		matched string
+	// ignores the when:first short-circuit (that only skips re-authorization).
-		reason  string
+	type resolveOutcome struct {
-		err     error
+		res resolution
 		err error
 	}
-	var gateCh chan gateOutcome
+	var resolveCh chan resolveOutcome
-	runGate := false
+	runResolve := false
 	if session.voiceGate != nil && session.InputAudioTranscription != nil {
-		skip := false
+		enforce := session.voiceGate.cfg.EnforceGate()
-		if session.voiceGate.cfg.When == config.VoiceGateWhenFirst {
+		gateNeedsAuth := enforce
 		if enforce && session.voiceGate.cfg.When == config.VoiceGateWhenFirst {
 			session.gateMu.Lock()
-			skip = session.voiceVerified
+			if session.voiceVerified {
 				gateNeedsAuth = false
 			}
 			session.gateMu.Unlock()
 		}
-		if !skip {
+		if gateNeedsAuth || session.voiceGate.cfg.IdentityEnabled() {
-			runGate = true
+			runResolve = true
-			gateCh = make(chan gateOutcome, 1)
+			resolveCh = make(chan resolveOutcome, 1)
 			wavPath := f.Name()
 			go func() {
-				allowed, matched, reason, gerr := session.voiceGate.Authorize(ctx, wavPath)
+				r, rerr := session.voiceGate.Resolve(ctx, wavPath)
-				gateCh <- gateOutcome{allowed: allowed, matched: matched, reason: reason, err: gerr}
+				resolveCh <- resolveOutcome{res: r, err: rerr}
 			}()
 		}
 	}
 	// TODO: If we have a real any-to-any model then transcription is optional
 	var transcript string
-	if session.InputAudioTranscription != nil {
+	switch {
 	case session.InputAudioTranscription != nil:
 		// emitTranscription streams transcript deltas when
 		// pipeline.streaming.transcription is set, otherwise emits a single
 		// completed event; either way it returns the final transcript text.
@@ -1348,55 +1420,169 @@ func commitUtterance(ctx context.Context, utt []byte, session *Session, conv *Co
 		if err != nil {
 			// Drain the gate goroutine before returning so its in-flight read of
 			// the temp WAV finishes before the deferred os.Remove fires.
-			if runGate {
+			if runResolve {
-				<-gateCh
+				<-resolveCh
 			}
 			sendError(t, "transcription_failed", err.Error(), "", "event_TODO")
 			return
 		}
-	} else {
+	case session.SoundDetectionEnabled:
 		// Sound-detection-only session: no transcription and no LLM. The
 		// sound-detection emit below carries the result; there is no any-to-any
 		// path to fall into. Windowing is client-driven (turn_detection none +
 		// input_audio_buffer.commit), so this is not voice-gated.
 	default:
 		// The voice gate runs only on the transcription path above; if an
 		// any-to-any model path is added here, join the gate before responding.
 		sendNotImplemented(t, "any-to-any models")
 		return
 	}
-	// Join on the gate before any side-effecting step.
+	// Sound-event detection is additive to transcription: classify the same
-	if runGate {
+	// committed window and emit its scored AudioSet tags as a separate event.
-		out := <-gateCh
+	// A failure here is logged but must never abort the turn.
-		allowed := out.allowed
+	if session.SoundDetectionEnabled {
-		reason := out.reason
+		if sderr := emitSoundDetection(ctx, t, session, generateItemID(), f.Name()); sderr != nil {
-		if out.err != nil {
+			xlog.Error("sound detection failed", "error", sderr)
 			// Fail closed: a gate that cannot decide must not let audio through.
 			xlog.Error("voice recognition gate error", "error", out.err)
 			allowed = false
 			reason = "verification error"
 		}
 		alreadyVerified := false
 		if session.voiceGate.cfg.When == config.VoiceGateWhenFirst {
 			session.gateMu.Lock()
 			alreadyVerified = session.voiceVerified
 			session.gateMu.Unlock()
 		}
 		proceed, markVerified := session.voiceGate.decide(alreadyVerified, allowed)
 		if !proceed {
 			xlog.Debug("voice recognition gate rejected utterance", "reason", reason)
 			if session.voiceGate.cfg.OnReject == config.VoiceGateRejectEvent {
 				sendError(t, "speaker_not_authorized", "speaker not authorized: "+reason, "", "event_TODO")
 			}
 			return
 		}
 		xlog.Debug("voice recognition gate authorized utterance", "speaker", out.matched)
 		if markVerified {
 			session.gateMu.Lock()
 			session.voiceVerified = true
 			session.gateMu.Unlock()
 		}
 	}
-	if !session.TranscriptionOnly {
+	// Join on the resolution before any side-effecting step.
-		generateResponse(ctx, session, utt, transcript, conv, t)
+	var speaker *types.Speaker
 	if runResolve {
 		out := <-resolveCh
 		enforce := session.voiceGate.cfg.EnforceGate()
 		if out.err != nil {
 			if enforce {
 				// Fail closed: a gate that cannot decide must not let audio through.
 				xlog.Error("voice recognition gate error", "error", out.err)
 				if session.voiceGate.cfg.OnReject == config.VoiceGateRejectEvent {
 					sendError(t, "speaker_not_authorized", "speaker not authorized: verification error", "", "event_TODO")
 				}
 				return
 			}
 			// Non-enforcing: degrade to an unknown speaker and continue.
 			xlog.Warn("voice identity resolve failed; continuing as unknown speaker", "error", out.err)
 		} else {
 			s := out.res.speaker
 			speaker = &s
 		}
 		if enforce {
 			alreadyVerified := false
 			if session.voiceGate.cfg.When == config.VoiceGateWhenFirst {
 				session.gateMu.Lock()
 				alreadyVerified = session.voiceVerified
 				session.gateMu.Unlock()
 			}
 			allowed, reason := false, "verification error"
 			if out.err == nil {
 				allowed, reason = session.voiceGate.authorize(out.res)
 			}
 			proceed, markVerified := session.voiceGate.decide(alreadyVerified, allowed)
 			if !proceed {
 				xlog.Debug("voice recognition gate rejected utterance", "reason", reason)
 				if session.voiceGate.cfg.OnReject == config.VoiceGateRejectEvent {
 					sendError(t, "speaker_not_authorized", "speaker not authorized: "+reason, "", "event_TODO")
 				}
 				return
 			}
 			if markVerified {
 				session.gateMu.Lock()
 				session.voiceVerified = true
 				session.gateMu.Unlock()
 			}
 			xlog.Debug("voice recognition gate authorized utterance", "speaker", out.res.speaker.Name)
 		}
 	}
 	// Generate an LLM response only when there is a transcript to feed it. A
 	// sound-detection-only session (no transcription) has no LLM stage, so it
 	// stops here after emitting the sound-detection event.
 	if session.InputAudioTranscription != nil && !session.TranscriptionOnly {
 		generateResponse(ctx, session, utt, transcript, speaker, conv, t)
 	}
 }
 // handleSoundWindow runs server-side windowed sound-event detection (option B):
 // every SoundDetectionHopMs it calls classifySoundWindow, which classifies the
 // most recent SoundDetectionWindowMs of streamed audio and emits a
 // sound_detection event, so a sound-only client only has to stream audio (no
 // input_audio_buffer.commit). The loop is independent of VAD (sound events are
 // not speech) and runs until done is closed.
 //
 // A non-positive hop makes the function return immediately: time.NewTicker
 // panics on a non-positive interval, and this loop is started on its own
 // goroutine, where an unrecovered panic would bring the process down.
 func handleSoundWindow(session *Session, t Transport, done chan struct{}) {
 	hop := time.Duration(session.SoundDetectionHopMs) * time.Millisecond
 	if hop <= 0 {
 		return
 	}
 	ticker := time.NewTicker(hop)
 	defer ticker.Stop()
 	for {
 		select {
 		case <-done:
 			return
 		case <-ticker.C:
 			classifySoundWindow(session, t)
 		}
 	}
 }
 // classifySoundWindow performs a single windowing tick. Under AudioBufferLock
 // it trims session.InputAudioBuffer down to the newest SoundDetectionWindowMs
 // of audio and copies what remains; once the lock is released it writes that
 // copy to a temp WAV, runs emitSoundDetection on it, and removes the file.
 // Ticks with less than ~10ms of buffered audio are skipped. It is kept separate
 // from handleSoundWindow so tests can drive a tick synchronously.
 func classifySoundWindow(session *Session, t Transport) {
 	const bytesPerSample = 2 // 16-bit mono PCM
 	rate := session.InputSampleRate
 	maxBytes := session.SoundDetectionWindowMs * rate / 1000 * bytesPerSample
 	// Roughly 10ms of audio must be buffered before a tick classifies anything.
 	floorBytes := rate / 100 * bytesPerSample
 	session.AudioBufferLock.Lock()
 	if buffered := len(session.InputAudioBuffer); maxBytes > 0 && buffered > maxBytes {
 		// Drop everything older than one window so a long stream stays bounded.
 		tail := make([]byte, maxBytes)
 		copy(tail, session.InputAudioBuffer[buffered-maxBytes:])
 		session.InputAudioBuffer = tail
 	}
 	// Take a private copy while the lock is held; the rest runs unlocked.
 	snapshot := append(make([]byte, 0, len(session.InputAudioBuffer)), session.InputAudioBuffer...)
 	session.AudioBufferLock.Unlock()
 	if len(snapshot) < floorBytes {
 		// Not enough audio buffered yet.
 		return
 	}
 	wavPath, writeErr := writeWindowWAV(snapshot, rate)
 	if writeErr != nil {
 		xlog.Error("sound window: failed to write wav", "error", writeErr)
 		return
 	}
 	// A failed classification is only logged; the temp file is removed either way.
 	detectErr := emitSoundDetection(context.Background(), t, session, generateItemID(), wavPath)
 	if detectErr != nil {
 		xlog.Error("sound window: detection failed", "error", detectErr)
 	}
 	removeErr := os.Remove(wavPath)
 	if removeErr != nil {
 		xlog.Debug("sound window: temp cleanup failed", "error", removeErr)
 	}
 }
 // writeWindowWAV writes mono 16-bit PCM to a temp WAV at the given sample rate
 // (the ced classifier reads the declared rate and resamples). It returns the
 // path of the closed file; the caller removes it. On any failure, including a
 // failed Close, the temp file is deleted and an empty path is returned, so a
 // caller never receives a path to a partially written file.
 func writeWindowWAV(pcm []byte, sampleRate int) (string, error) {
 	f, err := os.CreateTemp("", "realtime-sound-window-*.wav")
 	if err != nil {
 		return "", err
 	}
 	path := f.Name()
 	// discard closes the file before deleting it: some platforms (notably
 	// Windows) refuse to remove a file that is still open. Both steps are
 	// best-effort because the write error is the one worth reporting.
 	discard := func() {
 		_ = f.Close()
 		_ = os.Remove(path)
 	}
 	hdr := laudio.NewWAVHeaderWithRate(uint32(len(pcm)), uint32(sampleRate))
 	if err := hdr.Write(f); err != nil {
 		discard()
 		return "", err
 	}
 	if _, err := f.Write(pcm); err != nil {
 		discard()
 		return "", err
 	}
 	// Best-effort flush; a failure here is not treated as fatal.
 	_ = f.Sync()
 	// A Close error can mean the data never reached the file, so surface it.
 	if err := f.Close(); err != nil {
 		_ = os.Remove(path)
 		return "", err
 	}
 	return path, nil
 }
 func runVAD(ctx context.Context, session *Session, adata []int16) ([]schema.VADSegment, error) {
@@ -1419,15 +1605,28 @@ func runVAD(ctx context.Context, session *Session, adata []int16) ([]schema.VADS
 	return resp.Segments, nil
 }
 // speakerNote builds the system-prompt sentence describing who is speaking.
 // A matched speaker with a non-empty name yields "The current speaker is
 // <name>."; otherwise the "unknown" sentence is returned when noteUnknown is
 // set, and the empty string when it is not.
 func speakerNote(s *types.Speaker, noteUnknown bool) string {
 	switch {
 	case s != nil && s.Matched && s.Name != "":
 		return "The current speaker is " + s.Name + "."
 	case noteUnknown:
 		return "The current speaker is unknown."
 	default:
 		return ""
 	}
 }
 // Function to generate a response based on the conversation
-func generateResponse(ctx context.Context, session *Session, utt []byte, transcript string, conv *Conversation, t Transport) {
+func generateResponse(ctx context.Context, session *Session, utt []byte, transcript string, speaker *types.Speaker, conv *Conversation, t Transport) {
 	xlog.Debug("Generating realtime response...")
 	// Create user message item
 	item := types.MessageItemUnion{
 		User: &types.MessageItemUser{
-			ID:     generateItemID(),
+			ID:      generateItemID(),
-			Status: types.ItemStatusCompleted,
+			Status:  types.ItemStatusCompleted,
 			Speaker: speaker,
 			Content: []types.MessageContentInput{
 				{
 					Type:       types.MessageContentTypeInputAudio,
@@ -1445,6 +1644,17 @@ func generateResponse(ctx context.Context, session *Session, utt []byte, transcr
 		Item: item,
 	})
 	// Surface the recognized speaker to the client. Skip the event for an
 	// unidentified speaker unless announce_unknown is set.
 	if speaker != nil && session.voiceGate != nil && session.voiceGate.cfg.AnnounceEnabled() {
 		if speaker.Matched || session.voiceGate.cfg.Identity.AnnounceUnknown {
 			sendEvent(t, types.ConversationItemSpeakerEvent{
 				ItemID:  item.User.ID,
 				Speaker: *speaker,
 			})
 		}
 	}
 	triggerResponse(ctx, session, conv, t, nil)
 }
@@ -1508,6 +1718,8 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
 	})
 	imgIndex := 0
 	var lastUserSpeaker *types.Speaker
 	personalize := session.voiceGate != nil && session.voiceGate.cfg.PersonalizeEnabled()
 	conv.Lock.Lock()
 	items := trimRealtimeItems(conv.Items, session.MaxHistoryItems)
 	for _, item := range items {
@@ -1515,6 +1727,11 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
 			msg := schema.Message{
 				Role: string(types.MessageRoleUser),
 			}
 			lastUserSpeaker = item.User.Speaker
 			if personalize && session.voiceGate.cfg.Identity.InjectName &&
 				item.User.Speaker != nil && item.User.Speaker.Matched && item.User.Speaker.Name != "" {
 				msg.Name = item.User.Speaker.Name
 			}
 			textContent := ""
 			nrOfImgsInMessage := 0
 			for _, content := range item.User.Content {
@@ -1601,6 +1818,13 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
 	}
 	conv.Lock.Unlock()
 	if personalize && session.voiceGate.cfg.Identity.InjectSystemNote {
 		if note := speakerNote(lastUserSpeaker, session.voiceGate.cfg.Identity.NoteUnknown); note != "" {
 			conversationHistory[0].StringContent += "\n\n" + note
 			conversationHistory[0].Content = conversationHistory[0].StringContent
 		}
 	}
 	var images []string
 	for _, m := range conversationHistory {
 		images = append(images, m.StringImages...)
--- a/core/http/endpoints/openai/realtime_doubles_test.go
+++ b/core/http/endpoints/openai/realtime_doubles_test.go
@@ -75,6 +75,11 @@ type fakeModel struct {
 	transcribeDeltas []string
 	transcribeFinal  *schema.TranscriptionResult
 	// soundDetectionResult/soundDetectionErr drive the SoundDetection double so
 	// the sound-event path can be exercised deterministically.
 	soundDetectionResult *schema.SoundClassificationResult
 	soundDetectionErr    error
 	// Predict streaming: predictTokens are replayed through the token callback
 	// (simulating streamed LLM output); predictResp/predictErr are returned by
 	// the deferred predict function. predictChunkDeltas, when set, are delivered
@@ -83,6 +88,8 @@ type fakeModel struct {
 	predictChunkDeltas [][]*proto.ChatDelta
 	predictResp        backend.LLMResponse
 	predictErr         error
 	lastMessages schema.Messages
 }
 func (m *fakeModel) VAD(context.Context, *schema.VADRequest) (*schema.VADResponse, error) {
@@ -93,7 +100,15 @@ func (m *fakeModel) Transcribe(context.Context, string, string, bool, bool, stri
 	return m.transcribeFinal, nil
 }
-func (m *fakeModel) Predict(_ context.Context, _ schema.Messages, _, _, _ []string, cb func(string, backend.TokenUsage) bool, _ []types.ToolUnion, _ *types.ToolChoiceUnion, _, _ *int, _ map[string]float64) (func() (backend.LLMResponse, error), error) {
+func (m *fakeModel) SoundDetection(context.Context, string, int, float32) (*schema.SoundClassificationResult, error) {
 	if m.soundDetectionErr != nil {
 		return nil, m.soundDetectionErr
 	}
 	return m.soundDetectionResult, nil
 }
 func (m *fakeModel) Predict(_ context.Context, msgs schema.Messages, _, _, _ []string, cb func(string, backend.TokenUsage) bool, _ []types.ToolUnion, _ *types.ToolChoiceUnion, _, _ *int, _ map[string]float64) (func() (backend.LLMResponse, error), error) {
 	m.lastMessages = msgs
 	if m.predictErr != nil {
 		return nil, m.predictErr
 	}
--- a/core/http/endpoints/openai/realtime_model.go
+++ b/core/http/endpoints/openai/realtime_model.go
@@ -31,10 +31,11 @@ var (
 // This means that we will fake an Any-to-Any model by overriding some of the gRPC client methods
 // which are for Any-To-Any models, but instead we will call a pipeline (for e.g STT->LLM->TTS)
 type wrappedModel struct {
-	TTSConfig           *config.ModelConfig
+	TTSConfig            *config.ModelConfig
-	TranscriptionConfig *config.ModelConfig
+	TranscriptionConfig  *config.ModelConfig
-	LLMConfig           *config.ModelConfig
+	LLMConfig            *config.ModelConfig
-	VADConfig           *config.ModelConfig
+	VADConfig            *config.ModelConfig
 	SoundDetectionConfig *config.ModelConfig
 	appConfig   *config.ApplicationConfig
 	modelLoader *model.ModelLoader
@@ -64,8 +65,9 @@ type anyToAnyModel struct {
 }
 type transcriptOnlyModel struct {
-	TranscriptionConfig *config.ModelConfig
+	TranscriptionConfig  *config.ModelConfig
-	VADConfig           *config.ModelConfig
+	VADConfig            *config.ModelConfig
 	SoundDetectionConfig *config.ModelConfig
 	appConfig   *config.ApplicationConfig
 	modelLoader *model.ModelLoader
@@ -80,6 +82,10 @@ func (m *transcriptOnlyModel) Transcribe(ctx context.Context, audio, language st
 	return backend.ModelTranscription(ctx, audio, language, translate, diarize, prompt, m.modelLoader, *m.TranscriptionConfig, m.appConfig)
 }
 // SoundDetection classifies the audio at the given path by delegating to
 // modelSoundDetection with this model's SoundDetectionConfig; topK and
 // threshold are passed through unchanged.
 func (m *transcriptOnlyModel) SoundDetection(ctx context.Context, audio string, topK int, threshold float32) (*schema.SoundClassificationResult, error) {
 	return modelSoundDetection(ctx, m.modelLoader, m.appConfig, m.SoundDetectionConfig, audio, topK, threshold)
 }
 // Predict is unsupported on a transcript-only model: it ignores every argument
 // and always returns a nil function together with an error.
 func (m *transcriptOnlyModel) Predict(ctx context.Context, messages schema.Messages, images, videos, audios []string, tokenCallback func(string, backend.TokenUsage) bool, tools []types.ToolUnion, toolChoice *types.ToolChoiceUnion, logprobs *int, topLogprobs *int, logitBias map[string]float64) (func() (backend.LLMResponse, error), error) {
 	return nil, fmt.Errorf("predict operation not supported in transcript-only mode")
 }
@@ -108,6 +114,10 @@ func (m *wrappedModel) Transcribe(ctx context.Context, audio, language string, t
 	return backend.ModelTranscription(ctx, audio, language, translate, diarize, prompt, m.modelLoader, *m.TranscriptionConfig, m.appConfig)
 }
 // SoundDetection classifies the audio at the given path by delegating to
 // modelSoundDetection with this model's SoundDetectionConfig; topK and
 // threshold are passed through unchanged.
 func (m *wrappedModel) SoundDetection(ctx context.Context, audio string, topK int, threshold float32) (*schema.SoundClassificationResult, error) {
 	return modelSoundDetection(ctx, m.modelLoader, m.appConfig, m.SoundDetectionConfig, audio, topK, threshold)
 }
 func (m *wrappedModel) Predict(ctx context.Context, messages schema.Messages, images, videos, audios []string, tokenCallback func(string, backend.TokenUsage) bool, tools []types.ToolUnion, toolChoice *types.ToolChoiceUnion, logprobs *int, topLogprobs *int, logitBias map[string]float64) (func() (backend.LLMResponse, error), error) {
 	input := schema.OpenAIRequest{
 		Messages: messages,
@@ -399,6 +409,39 @@ func transcribeStream(ctx context.Context, ml *model.ModelLoader, transcriptionC
 	return final, nil
 }
 // modelSoundDetection dispatches a sound-event classification request to the
 // backend using the session's sound-classification model config, in the same
 // way Transcribe dispatches to the transcription backend. A nil soundConfig
 // means no sound-detection model is configured and yields an error.
 func modelSoundDetection(ctx context.Context, ml *model.ModelLoader, appConfig *config.ApplicationConfig, soundConfig *config.ModelConfig, audio string, topK int, threshold float32) (*schema.SoundClassificationResult, error) {
 	if soundConfig == nil {
 		return nil, fmt.Errorf("sound detection is not configured for this session")
 	}
 	// The backend request carries topK as an int32.
 	req := backend.SoundDetectionRequest{
 		Threshold: threshold,
 		TopK:      int32(topK),
 		Audio:     audio,
 	}
 	return backend.ModelSoundDetection(ctx, req, ml, *soundConfig, appConfig)
 }
 // loadSoundDetectionConfig looks up and validates the model config named by
 // pipeline.sound_detection. An empty name returns (nil, nil), which keeps sound
 // detection optional so it never blocks session setup. A load failure or a
 // config that does not validate is returned as an error.
 func loadSoundDetectionConfig(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model.ModelLoader) (*config.ModelConfig, error) {
 	name := pipeline.SoundDetection
 	if name == "" {
 		return nil, nil
 	}
 	soundCfg, loadErr := cl.LoadModelConfigFileByName(name, ml.ModelPath)
 	if loadErr != nil {
 		return nil, fmt.Errorf("failed to load sound detection config: %w", loadErr)
 	}
 	// Only the validity flag is consulted; Validate's second result is ignored.
 	ok, _ := soundCfg.Validate()
 	if !ok {
 		return nil, fmt.Errorf("failed to validate sound detection config %q", name)
 	}
 	return soundCfg, nil
 }
 func newTranscriptionOnlyModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) (Model, *config.ModelConfig, error) {
 	cfgVAD, err := cl.LoadModelConfigFileByName(pipeline.VAD, ml.ModelPath)
 	if err != nil {
@@ -420,9 +463,15 @@ func newTranscriptionOnlyModel(pipeline *config.Pipeline, cl *config.ModelConfig
 		return nil, nil, fmt.Errorf("failed to validate config: %w", err)
 	}
 	cfgSound, err := loadSoundDetectionConfig(pipeline, cl, ml)
 	if err != nil {
 		return nil, nil, err
 	}
 	return &transcriptOnlyModel{
-		TranscriptionConfig: cfgSST,
+		TranscriptionConfig:  cfgSST,
-		VADConfig:           cfgVAD,
+		VADConfig:            cfgVAD,
 		SoundDetectionConfig: cfgSound,
 		confLoader:  cl,
 		modelLoader: ml,
@@ -430,6 +479,27 @@ func newTranscriptionOnlyModel(pipeline *config.Pipeline, cl *config.ModelConfig
 	}, cfgSST, nil
 }
 // newSoundDetectionOnlyModel constructs a realtime model whose only stage is
 // sound-event classification: no VAD, transcription, LLM or TTS config is
 // loaded. It serves sound-detection-only realtime sessions, which react to
 // sounds rather than speech and are driven by client-side windowing
 // (turn_detection none + input_audio_buffer.commit) instead of the voice VAD
 // loop. It returns an error when the sound-detection config cannot be loaded
 // or when pipeline.sound_detection is not set at all.
 func newSoundDetectionOnlyModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) (Model, error) {
 	soundCfg, err := loadSoundDetectionConfig(pipeline, cl, ml)
 	switch {
 	case err != nil:
 		return nil, err
 	case soundCfg == nil:
 		// A nil config without an error means no model was named.
 		return nil, fmt.Errorf("a sound-only realtime session requires pipeline.sound_detection")
 	}
 	soundOnly := &transcriptOnlyModel{
 		SoundDetectionConfig: soundCfg,
 		confLoader:           cl,
 		modelLoader:          ml,
 		appConfig:            appConfig,
 	}
 	return soundOnly, nil
 }
 // RealtimeRoutingContext is the bundle of routing dependencies the
 // realtime pipeline needs to consult router.Resolve per turn. nil-safe:
 // passing nil skips routing entirely and preserves the historical "one
@@ -544,11 +614,17 @@ func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model
 		return nil, fmt.Errorf("failed to validate config: %w", err)
 	}
 	cfgSound, err := loadSoundDetectionConfig(pipeline, cl, ml)
 	if err != nil {
 		return nil, err
 	}
 	wm := &wrappedModel{
-		TTSConfig:           cfgTTS,
+		TTSConfig:            cfgTTS,
-		TranscriptionConfig: cfgSST,
+		TranscriptionConfig:  cfgSST,
-		LLMConfig:           cfgLLM,
+		LLMConfig:            cfgLLM,
-		VADConfig:           cfgVAD,
+		VADConfig:            cfgVAD,
 		SoundDetectionConfig: cfgSound,
 		confLoader:  cl,
 		modelLoader: ml,
--- a/core/http/endpoints/openai/realtime_sound_detection.go
+++ b/core/http/endpoints/openai/realtime_sound_detection.go
@@ -0,0 +1,48 @@
 package openai
 import (
 	"context"
 	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
 )
 // defaultSoundDetectionTopK is how many scored tags are requested per
 // committed utterance when the session leaves top_k unset (zero or negative).
 const defaultSoundDetectionTopK = 5
 // emitSoundDetection classifies a committed utterance into sound-event tags and
 // sends one conversation.item.sound_detection event for it over the transport.
 // It asks the session's classifier for the top-K tags (falling back to
 // defaultSoundDetectionTopK), copies each scored tag onto the server event, and
 // returns the transport's send result. A classifier error is returned as-is
 // and no event is sent; a nil or empty result still produces an event with an
 // empty detections list. Sound detection is additive to transcription, so the
 // caller is expected to log a failure here rather than abort the turn.
 func emitSoundDetection(ctx context.Context, t Transport, session *Session, itemID, audioPath string) error {
 	k := session.SoundDetectionTopK
 	if k <= 0 {
 		k = defaultSoundDetectionTopK
 	}
 	result, err := session.ModelInterface.SoundDetection(ctx, audioPath, k, session.SoundDetectionThreshold)
 	if err != nil {
 		return err
 	}
 	// Start from a non-nil empty slice so "no detections" stays an empty list.
 	tags := []types.SoundDetectionTag{}
 	if result != nil {
 		for i := range result.Detections {
 			tags = append(tags, types.SoundDetectionTag{
 				Label: result.Detections[i].Label,
 				Score: result.Detections[i].Score,
 				Index: result.Detections[i].Index,
 			})
 		}
 	}
 	event := types.ConversationItemSoundDetectionEvent{
 		ServerEventBase: types.ServerEventBase{EventID: "event_TODO"},
 		ItemID:          itemID,
 		ContentIndex:    0,
 		Detections:      tags,
 	}
 	return t.SendEvent(event)
 }
--- a/core/http/endpoints/openai/realtime_sound_detection_test.go
+++ b/core/http/endpoints/openai/realtime_sound_detection_test.go
@@ -0,0 +1,170 @@
 package openai
 import (
 	"context"
 	"encoding/binary"
 	"errors"
 	"os"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
 	"github.com/mudler/LocalAI/core/schema"
 )
 // emitSoundDetection turns one classifier call into a single
 // conversation.item.sound_detection event. These specs pin the tag mapping,
 // the empty-result case, and error propagation.
 var _ = Describe("emitSoundDetection", func() {
 	It("emits a sound_detection event with the classifier's scored tags", func() {
 		classifier := &fakeModel{
 			soundDetectionResult: &schema.SoundClassificationResult{
 				Model: "ced",
 				Detections: []schema.SoundClassification{
 					{Index: 3, Label: "Baby cry, infant cry", Score: 0.91},
 					{Index: 7, Label: "Speech", Score: 0.42},
 				},
 			},
 		}
 		sess := &Session{
 			SoundDetectionEnabled: true,
 			SoundDetectionTopK:    5,
 			ModelInterface:        classifier,
 		}
 		sink := &fakeTransport{}
 		Expect(emitSoundDetection(context.Background(), sink, sess, "item1", "/tmp/x.wav")).To(Succeed())
 		Expect(sink.countEvents(types.ServerEventTypeConversationItemSoundDetection)).To(Equal(1))
 		// The only captured event must be the sound-detection event itself.
 		event, isSoundEvent := sink.events[0].(types.ConversationItemSoundDetectionEvent)
 		Expect(isSoundEvent).To(BeTrue())
 		Expect(event.ItemID).To(Equal("item1"))
 		Expect(event.ContentIndex).To(Equal(0))
 		Expect(event.Detections).To(HaveLen(2))
 		first, second := event.Detections[0], event.Detections[1]
 		Expect(first.Label).To(Equal("Baby cry, infant cry"))
 		Expect(first.Score).To(BeNumerically("~", 0.91, 1e-6))
 		Expect(first.Index).To(Equal(3))
 		Expect(second.Label).To(Equal("Speech"))
 	})
 	It("emits an event with no detections when the classifier returns none", func() {
 		sess := &Session{
 			SoundDetectionEnabled: true,
 			ModelInterface: &fakeModel{
 				soundDetectionResult: &schema.SoundClassificationResult{Model: "ced"},
 			},
 		}
 		sink := &fakeTransport{}
 		Expect(emitSoundDetection(context.Background(), sink, sess, "item1", "/tmp/x.wav")).To(Succeed())
 		Expect(sink.countEvents(types.ServerEventTypeConversationItemSoundDetection)).To(Equal(1))
 		event, isSoundEvent := sink.events[0].(types.ConversationItemSoundDetectionEvent)
 		Expect(isSoundEvent).To(BeTrue())
 		Expect(event.Detections).To(BeEmpty())
 	})
 	It("propagates the classifier error and emits no event", func() {
 		failing := &fakeModel{soundDetectionErr: errors.New("boom")}
 		sess := &Session{
 			SoundDetectionEnabled: true,
 			ModelInterface:        failing,
 		}
 		sink := &fakeTransport{}
 		emitErr := emitSoundDetection(context.Background(), sink, sess, "item1", "/tmp/x.wav")
 		Expect(emitErr).To(HaveOccurred())
 		Expect(sink.countEvents(types.ServerEventTypeConversationItemSoundDetection)).To(Equal(0))
 	})
 })
 // A session with sound detection enabled but no transcription and no LLM goes
 // through commitUtterance without touching the voice/transcription path: the
 // sound_detection event is the only outcome expected here.
 var _ = Describe("commitUtterance (sound-detection-only session)", func() {
 	It("emits sound detection and neither transcribes nor generates a response", func() {
 		classifier := &fakeModel{
 			soundDetectionResult: &schema.SoundClassificationResult{
 				Model: "ced",
 				Detections: []schema.SoundClassification{
 					{Index: 23, Label: "Baby cry, infant cry", Score: 0.87},
 				},
 			},
 		}
 		sess := &Session{
 			SoundDetectionEnabled:   true,
 			SoundDetectionTopK:      5,
 			InputAudioTranscription: nil, // sound-only: no transcription stage
 			ModelConfig:             &config.ModelConfig{},
 			ModelInterface:          classifier,
 		}
 		sink := &fakeTransport{}
 		// A non-empty PCM payload is needed for commitUtterance to proceed.
 		pcm := make([]byte, 32)
 		commitUtterance(context.Background(), pcm, sess, &Conversation{}, sink)
 		// Exactly one sound-detection event was produced...
 		Expect(sink.countEvents(types.ServerEventTypeConversationItemSoundDetection)).To(Equal(1))
 		// ...the utterance was never transcribed...
 		Expect(sink.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionCompleted)).To(Equal(0))
 		// ...and no LLM response was generated (sound-only has no LLM stage).
 		Expect(sink.countEvents(types.ServerEventTypeResponseDone)).To(Equal(0))
 	})
 })
 // Server-side windowing (option B): on each tick a sound-only session
 // classifies the most recent WindowMs of streamed audio without waiting for a
 // client commit, and trims the input buffer down to a single window.
 var _ = Describe("classifySoundWindow (server-side windowing)", func() {
 	// soundSession builds a sound-only session that already holds the given
 	// number of buffered input bytes and is backed by a fake classifier.
 	soundSession := func(bufferedBytes int) *Session {
 		classifier := &fakeModel{
 			soundDetectionResult: &schema.SoundClassificationResult{
 				Model:      "ced",
 				Detections: []schema.SoundClassification{{Index: 23, Label: "Baby cry, infant cry", Score: 0.87}},
 			},
 		}
 		return &Session{
 			SoundDetectionEnabled:  true,
 			SoundDetectionTopK:     5,
 			SoundDetectionWindowMs: 200, // 200ms @ 16kHz mono16 = 6400 bytes
 			SoundDetectionHopMs:    20,
 			InputSampleRate:        16000,
 			ModelInterface:         classifier,
 			InputAudioBuffer:       make([]byte, bufferedBytes),
 		}
 	}
 	It("emits a sound_detection event and trims the buffer to one window", func() {
 		sess := soundSession(10000) // > 6400-byte window
 		sink := &fakeTransport{}
 		classifySoundWindow(sess, sink)
 		Expect(sink.countEvents(types.ServerEventTypeConversationItemSoundDetection)).To(Equal(1))
 		// Only one window (200ms @ 16kHz mono 16-bit) may remain buffered.
 		Expect(len(sess.InputAudioBuffer)).To(Equal(6400))
 	})
 	It("does nothing when too little audio is buffered", func() {
 		sess := soundSession(100) // < ~10ms (320 bytes)
 		sink := &fakeTransport{}
 		classifySoundWindow(sess, sink)
 		Expect(sink.countEvents(types.ServerEventTypeConversationItemSoundDetection)).To(Equal(0))
 	})
 })
 // writeWindowWAV must produce a WAV file whose header declares the sample rate
 // it was given and whose size covers the header plus the PCM payload.
 var _ = Describe("writeWindowWAV", func() {
 	It("writes a mono 16-bit WAV header declaring the given sample rate", func() {
 		samples := make([]byte, 640)
 		wavPath, writeErr := writeWindowWAV(samples, 24000)
 		Expect(writeErr).ToNot(HaveOccurred())
 		// Best-effort cleanup of the file the helper created.
 		defer func() { _ = os.Remove(wavPath) }()
 		contents, readErr := os.ReadFile(wavPath)
 		Expect(readErr).ToNot(HaveOccurred())
 		Expect(len(contents)).To(BeNumerically(">=", 44+len(samples)))
 		// SampleRate is a little-endian uint32 at byte offset 24 of a WAV header.
 		declaredRate := binary.LittleEndian.Uint32(contents[24:28])
 		Expect(declaredRate).To(Equal(uint32(24000)))
 	})
 })
--- a/core/http/endpoints/openai/realtime_speaker_event_test.go
+++ b/core/http/endpoints/openai/realtime_speaker_event_test.go
@@ -0,0 +1,54 @@
 package openai
 import (
 	"encoding/json"
 	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 )
 // These specs pin the JSON wire shape of ConversationItemSpeakerEvent: the
 // event type, the nested speaker object, and which speaker fields are omitted.
 var _ = Describe("ConversationItemSpeakerEvent", func() {
 	// roundTrip marshals the event and decodes the JSON into a generic map,
 	// failing the spec if either step errors.
 	roundTrip := func(ev types.ConversationItemSpeakerEvent) map[string]any {
 		raw, err := json.Marshal(ev)
 		Expect(err).ToNot(HaveOccurred())
 		var decoded map[string]any
 		Expect(json.Unmarshal(raw, &decoded)).To(Succeed())
 		return decoded
 	}
 	It("marshals with the conversation.item.speaker type and nested speaker", func() {
 		decoded := roundTrip(types.ConversationItemSpeakerEvent{
 			ItemID:  "item_123",
 			Speaker: types.Speaker{Name: "Jeremy", ID: "spk_1", Labels: map[string]string{"family": "yes"}, Confidence: 92, Distance: 0.1, Matched: true},
 		})
 		Expect(decoded["type"]).To(Equal("conversation.item.speaker"))
 		Expect(decoded["item_id"]).To(Equal("item_123"))
 		speaker := decoded["speaker"].(map[string]any)
 		Expect(speaker["name"]).To(Equal("Jeremy"))
 		Expect(speaker["id"]).To(Equal("spk_1"))
 		Expect(speaker["matched"]).To(Equal(true))
 		Expect(speaker["labels"]).To(HaveKeyWithValue("family", "yes"))
 	})
 	It("omits labels when the speaker has none", func() {
 		decoded := roundTrip(types.ConversationItemSpeakerEvent{ItemID: "i", Speaker: types.Speaker{Name: "Jeremy", Matched: true}})
 		speaker := decoded["speaker"].(map[string]any)
 		_, labelsPresent := speaker["labels"]
 		Expect(labelsPresent).To(BeFalse())
 	})
 	It("omits the name for an unknown speaker but keeps matched=false", func() {
 		decoded := roundTrip(types.ConversationItemSpeakerEvent{ItemID: "i", Speaker: types.Speaker{Matched: false}})
 		speaker := decoded["speaker"].(map[string]any)
 		_, namePresent := speaker["name"]
 		Expect(namePresent).To(BeFalse())
 		Expect(speaker["matched"]).To(Equal(false))
 	})
 })
--- a/core/http/endpoints/openai/realtime_transport_webrtc.go
+++ b/core/http/endpoints/openai/realtime_transport_webrtc.go
@@ -113,8 +113,13 @@ func (t *WebRTCTransport) sendLoop() {
 				return
 			}
 			if err := t.dc.SendText(string(data)); err != nil {
-				xlog.Error("data channel send failed", "error", err)
+				// Drop just this event and keep the loop alive: a single
-				return
+				// failed send (e.g. an event over the negotiated SCTP
 				// max-message-size) must not tear down the session and
 				// silently drop every subsequent event. A genuinely dead
 				// transport is handled by the <-t.closed case.
 				xlog.Error("data channel send failed, dropping event", "error", err)
 				continue
 			}
 		case <-t.closed:
 			// Drain any remaining queued events before exiting
@@ -122,7 +127,8 @@ func (t *WebRTCTransport) sendLoop() {
 				select {
 				case data := <-t.outEvents:
 					if err := t.dc.SendText(string(data)); err != nil {
-						return
+						xlog.Error("data channel send failed while draining, dropping event", "error", err)
 						continue
 					}
 				default:
 					return
--- a/core/http/endpoints/openai/realtime_voicegate.go
+++ b/core/http/endpoints/openai/realtime_voicegate.go
@@ -7,6 +7,7 @@ import (
 	"github.com/mudler/LocalAI/core/backend"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
 	"github.com/mudler/LocalAI/core/services/voicerecognition"
 	"github.com/mudler/LocalAI/pkg/model"
 )
@@ -29,6 +30,32 @@ type voiceGate struct {
 	verifyFn func(ctx context.Context, uttWav, refWav string) (bool, error)
 }
 // resolution is the outcome of resolving a committed utterance's speaker. It
 // carries the surfacing-facing Speaker plus the metadata the policy layer needs
 // (labels for the allow-list) and a human reason when no usable identity exists.
 //
 // The zero value means "nothing resolved": found is false and speaker.Matched
 // is false. The resolve helpers set reason on failure and no-match paths, and
 // set found only when a candidate identity was produced.
 type resolution struct {
 	speaker types.Speaker     // name/id/confidence/distance/matched
 	labels  map[string]string // identify-mode metadata labels, for the allow-list
 	found   bool              // a candidate identity existed at all
 	reason  string            // why-unknown / deny reason at the resolve level
 }
 // confidence converts a cosine distance into a 0..100 score relative to the
 // match threshold, mirroring the /v1/voice/identify endpoint: a distance of 0
 // scores 100, a distance at or beyond the threshold scores 0, and values in
 // between scale linearly. A non-positive threshold always yields 0.
 func confidence(distance, threshold float32) float32 {
 	if threshold <= 0 {
 		return 0
 	}
 	// Clamp the linear score into [0, 100].
 	switch score := (1 - distance/threshold) * 100; {
 	case score < 0:
 		return 0
 	case score > 100:
 		return 100
 	default:
 		return score
 	}
 }
 // newVoiceGate builds a gate from a pipeline's voice_recognition config. It
 // validates fail-fast (before loading the model), loads the recognition model
 // config, wires the real backend seams, and pre-embeds references for verify
@@ -89,91 +116,143 @@ func newVoiceGate(
 	return g, nil
 }
-// Authorize embeds the utterance and decides allow/deny.
+// Resolve embeds the utterance once and resolves the speaker's identity. It does
-//
+// NOT apply the authorization policy (see authorize). On a backend error it
-//	allowed: speaker is authorized.
+// returns the error and a resolution whose reason explains the failure.
-//	matched: matched person's name (informational), empty if none.
+func (g *voiceGate) Resolve(ctx context.Context, wavPath string) (resolution, error) {
 //	reason:  human-readable deny reason.
 //	err:     backend failure (caller should fail closed).
 func (g *voiceGate) Authorize(ctx context.Context, wavPath string) (allowed bool, matched string, reason string, err error) {
 	if g.cfg.Mode == config.VoiceGateModeVerify {
-		return g.authorizeVerify(ctx, wavPath)
+		return g.resolveVerify(ctx, wavPath)
 	}
-	return g.authorizeIdentify(ctx, wavPath)
+	return g.resolveIdentify(ctx, wavPath)
 }
-func (g *voiceGate) authorizeIdentify(ctx context.Context, wavPath string) (bool, string, string, error) {
+func (g *voiceGate) resolveIdentify(ctx context.Context, wavPath string) (resolution, error) {
 	emb, err := g.embedFn(ctx, wavPath)
 	if err != nil {
-		return false, "", "embed failed", err
+		return resolution{reason: "embed failed"}, err
 	}
 	if len(emb) == 0 {
-		return false, "", "no speech detected", nil
+		return resolution{reason: "no speech detected"}, nil
 	}
 	matches, err := g.registry.Identify(ctx, emb, 1)
 	if err != nil {
-		return false, "", "identify failed", err
+		return resolution{reason: "identify failed"}, err
 	}
 	if len(matches) == 0 {
-		return false, "", "unknown speaker", nil
+		return resolution{reason: "unknown speaker"}, nil
 	}
 	m := matches[0]
-	if m.Distance > g.cfg.Threshold {
+	matched := m.Distance <= g.cfg.Threshold
-		return false, m.Metadata.Name, "distance above threshold", nil
+	r := resolution{
 		speaker: types.Speaker{
 			Name:       m.Metadata.Name,
 			ID:         m.Metadata.ID,
 			Labels:     m.Metadata.Labels,
 			Distance:   m.Distance,
 			Confidence: confidence(m.Distance, g.cfg.Threshold),
 			Matched:    matched,
 		},
 		labels: m.Metadata.Labels,
 		found:  true,
 	}
-	if !g.allowMatch(m.Metadata) {
+	if !matched {
-		return false, m.Metadata.Name, "speaker not in allow list", nil
+		r.reason = "distance above threshold"
 	}
-	return true, m.Metadata.Name, "", nil
+	return r, nil
 }
 // resolveVerify resolves the speaker in verify mode by comparing the utterance
 // against the gate's references; the first reference that matches wins. With
 // anti-spoofing enabled every reference audio is checked through verifyFn and
 // a hit is reported with a fixed confidence of 100. Otherwise the utterance is
 // embedded once and compared to each reference embedding by cosine distance
 // against cfg.Threshold. A backend failure is returned as the error together
 // with a resolution carrying the failure reason; when nothing matches the
 // result is an unfound resolution with reason "no reference matched".
 func (g *voiceGate) resolveVerify(ctx context.Context, wavPath string) (resolution, error) {
 	unmatched := resolution{reason: "no reference matched"}
 	if !g.cfg.AntiSpoofing {
 		embedding, err := g.embedFn(ctx, wavPath)
 		if err != nil {
 			return resolution{reason: "embed failed"}, err
 		}
 		if len(embedding) == 0 {
 			return resolution{reason: "no speech detected"}, nil
 		}
 		for _, ref := range g.refEmbeds {
 			dist := cosineDistance(embedding, ref.emb)
 			if dist <= g.cfg.Threshold {
 				matched := types.Speaker{Name: ref.name, Distance: dist, Confidence: confidence(dist, g.cfg.Threshold), Matched: true}
 				return resolution{speaker: matched, found: true}, nil
 			}
 		}
 		return unmatched, nil
 	}
 	// Anti-spoofing path: verify the utterance against each reference audio.
 	for _, ref := range g.refAudios {
 		verified, err := g.verifyFn(ctx, wavPath, ref.Audio)
 		if err != nil {
 			return resolution{reason: "verify failed"}, err
 		}
 		if !verified {
 			continue
 		}
 		return resolution{
 			speaker: types.Speaker{Name: ref.Name, Confidence: 100, Matched: true},
 			found:   true,
 		}, nil
 	}
 	return unmatched, nil
 }
 // authorize applies the gate's policy to an identity that has already been
 // resolved; it performs no backend work itself. In verify mode a matched
 // speaker is allowed and anything else is denied with the resolution's reason
 // (or "no reference matched" when none was recorded). In identify mode the
 // speaker must have been found, must be within the threshold, and must pass
 // the allow list. reason is empty exactly when allowed is true.
 func (g *voiceGate) authorize(r resolution) (allowed bool, reason string) {
 	if g.cfg.Mode == config.VoiceGateModeVerify {
 		switch {
 		case r.speaker.Matched:
 			return true, ""
 		case r.reason == "":
 			return false, "no reference matched"
 		default:
 			return false, r.reason
 		}
 	}
 	// Identify mode: each case below is a reason to deny.
 	switch {
 	case !r.found:
 		return false, r.reason
 	case !r.speaker.Matched:
 		return false, "distance above threshold"
 	case !g.allowMatch(r.speaker.Name, r.labels):
 		return false, "speaker not in allow list"
 	}
 	return true, ""
 }
 // allowMatch reports whether a matched identity is authorized. An empty allow
 // (no names and no labels) authorizes any registered speaker.
-func (g *voiceGate) allowMatch(meta voicerecognition.Metadata) bool {
+func (g *voiceGate) allowMatch(name string, labels map[string]string) bool {
 	a := g.cfg.Allow
 	if len(a.Names) == 0 && len(a.Labels) == 0 {
 		return true
 	}
 	for _, n := range a.Names {
-		if n == meta.Name {
+		if n == name {
 			return true
 		}
 	}
 	for _, l := range a.Labels {
-		if _, ok := meta.Labels[l]; ok {
+		if _, ok := labels[l]; ok {
 			return true
 		}
 	}
 	return false
 }
-func (g *voiceGate) authorizeVerify(ctx context.Context, wavPath string) (bool, string, string, error) {
+// Authorize is the legacy convenience wrapper: resolve then apply policy.
-	if g.cfg.AntiSpoofing {
+//
-		for _, r := range g.refAudios {
+//	allowed: speaker is authorized.
-			ok, err := g.verifyFn(ctx, wavPath, r.Audio)
+//	matched: matched person's name (informational), empty if none.
-			if err != nil {
+//	reason:  human-readable deny reason.
-				return false, "", "verify failed", err
+//	err:     backend failure (caller should fail closed).
-			}
+func (g *voiceGate) Authorize(ctx context.Context, wavPath string) (allowed bool, matched string, reason string, err error) {
-			if ok {
+	r, rerr := g.Resolve(ctx, wavPath)
-				return true, r.Name, "", nil
+	if rerr != nil {
-			}
+		return false, "", r.reason, rerr
 		}
 		return false, "", "no reference matched", nil
 	}
-
+	allowed, reason = g.authorize(r)
-	emb, err := g.embedFn(ctx, wavPath)
+	return allowed, r.speaker.Name, reason, nil
 	if err != nil {
 		return false, "", "embed failed", err
 	}
 	if len(emb) == 0 {
 		return false, "", "no speech detected", nil
 	}
 	for _, r := range g.refEmbeds {
 		if cosineDistance(emb, r.emb) <= g.cfg.Threshold {
 			return true, r.name, "", nil
 		}
 	}
 	return false, "", "no reference matched", nil
 }
 // decide interprets an Authorize result against the gate's when-policy and the
--- a/core/http/endpoints/openai/realtime_voicegate_integration_test.go
+++ b/core/http/endpoints/openai/realtime_voicegate_integration_test.go
@@ -152,3 +152,252 @@ var _ = Describe("realtime voice gate integration (commitUtterance)", func() {
 		Expect(tr2.countEvents(types.ServerEventTypeResponseDone)).To(BeNumerically(">=", 1))
 	})
 })
 // These specs drive commitUtterance with a voice gate attached and check when
 // the conversation.item.speaker event is surfaced, and that a non-enforcing
 // gate never drops a turn.
 var _ = Describe("realtime speaker surfacing (commitUtterance)", func() {
 	// Small non-empty utterance buffer shared by every spec in this container.
 	utt := make([]byte, 32)
 	It("emits conversation.item.speaker for a confident match when announce is on", func() {
 		// NOTE(review): itGate's argument meaning is defined elsewhere in the
 		// test suite; presumably (registered name, allowed name, embedding, ...).
 		session, _ := itSession(itGate("alice", "alice", []float32{1, 0, 0}, nil,
 			config.VoiceGateWhenEvery, config.VoiceGateRejectEvent))
 		session.voiceGate.cfg.Identity = &config.VoiceIdentityConfig{Announce: true}
 		tr := &fakeTransport{}
 		commitUtterance(context.Background(), utt, session, &Conversation{}, tr)
 		Expect(tr.countEvents(types.ServerEventTypeConversationItemSpeaker)).To(Equal(1))
 	})
 	It("does not emit the speaker event for an unknown speaker unless announce_unknown is set", func() {
 		// match distance above threshold => not matched
 		gate := &voiceGate{
 			cfg: config.PipelineVoiceRecognition{
 				Mode: config.VoiceGateModeIdentify, Threshold: 0.25,
 				When: config.VoiceGateWhenEvery, OnReject: config.VoiceGateRejectEvent,
 				Enforce:  boolPtr(false),
 				Identity: &config.VoiceIdentityConfig{Announce: true},
 			},
 			registry: &fakeRegistry{matches: []voicerecognition.Match{
 				{Distance: 0.9, Metadata: voicerecognition.Metadata{Name: "alice"}},
 			}},
 			embedFn: func(context.Context, string) ([]float32, error) { return []float32{1, 0, 0}, nil },
 		}
 		session, _ := itSession(gate)
 		// First commit: AnnounceUnknown is still unset, so nothing is surfaced.
 		tr := &fakeTransport{}
 		commitUtterance(context.Background(), utt, session, &Conversation{}, tr)
 		Expect(tr.countEvents(types.ServerEventTypeConversationItemSpeaker)).To(Equal(0))
 		// Second commit: same gate, now with AnnounceUnknown switched on.
 		gate.cfg.Identity.AnnounceUnknown = true
 		tr2 := &fakeTransport{}
 		commitUtterance(context.Background(), utt, session, &Conversation{}, tr2)
 		Expect(tr2.countEvents(types.ServerEventTypeConversationItemSpeaker)).To(Equal(1))
 	})
 	It("never drops a turn when enforce is false even for a disallowed speaker", func() {
 		session, _ := itSession(itGate("bob", "alice", []float32{1, 0, 0}, nil,
 			config.VoiceGateWhenEvery, config.VoiceGateRejectEvent))
 		session.voiceGate.cfg.Enforce = boolPtr(false)
 		tr := &fakeTransport{}
 		commitUtterance(context.Background(), utt, session, &Conversation{}, tr)
 		// No rejection was reported and the response still ran to completion.
 		Expect(hasSpeakerNotAuthorized(tr)).To(BeFalse())
 		Expect(tr.countEvents(types.ServerEventTypeResponseDone)).To(BeNumerically(">=", 1))
 	})
 })
 // These specs check how a resolved speaker identity is injected into the
 // messages handed to the model: the per-message name on the user turn and the
 // "current speaker" note in the system message.
 var _ = Describe("realtime speaker personalization (triggerResponseAtTurn)", func() {
 	// Small non-empty utterance buffer shared by every spec in this container.
 	utt := make([]byte, 32)
 	// findRole returns a pointer to the first message with the given role, or
 	// nil when there is none; callers must check for nil before dereferencing.
 	findRole := func(msgs schema.Messages, role string) *schema.Message {
 		for i := range msgs {
 			if msgs[i].Role == role {
 				return &msgs[i]
 			}
 		}
 		return nil
 	}
 	It("sets the user message name and a current-speaker system note", func() {
 		session, m := itSession(itGate("alice", "alice", []float32{1, 0, 0}, nil,
 			config.VoiceGateWhenEvery, config.VoiceGateRejectEvent))
 		session.voiceGate.cfg.Identity = &config.VoiceIdentityConfig{
 			Personalize: true, InjectName: true, InjectSystemNote: true,
 		}
 		session.Instructions = "You are helpful."
 		tr := &fakeTransport{}
 		commitUtterance(context.Background(), utt, session, &Conversation{}, tr)
 		user := findRole(m.lastMessages, "user")
 		Expect(user).ToNot(BeNil())
 		Expect(user.Name).To(Equal("alice"))
 		sys := findRole(m.lastMessages, "system")
 		Expect(sys).ToNot(BeNil())
 		Expect(sys.StringContent).To(ContainSubstring("The current speaker is alice."))
 	})
 	It("omits the unknown note unless note_unknown is set", func() {
 		// base builds a fresh non-enforcing identify gate whose only candidate
 		// is above the threshold, so the speaker resolves as not matched.
 		base := func() (*Session, *fakeModel) {
 			gate := &voiceGate{
 				cfg: config.PipelineVoiceRecognition{
 					Mode: config.VoiceGateModeIdentify, Threshold: 0.25,
 					When: config.VoiceGateWhenEvery, OnReject: config.VoiceGateRejectEvent,
 					Enforce:  boolPtr(false),
 					Identity: &config.VoiceIdentityConfig{Personalize: true, InjectSystemNote: true},
 				},
 				registry: &fakeRegistry{matches: []voicerecognition.Match{
 					{Distance: 0.9, Metadata: voicerecognition.Metadata{Name: "alice"}},
 				}},
 				embedFn: func(context.Context, string) ([]float32, error) { return []float32{1, 0, 0}, nil },
 			}
 			s, m := itSession(gate)
 			s.Instructions = "You are helpful."
 			return s, m
 		}
 		s1, m1 := base()
 		commitUtterance(context.Background(), utt, s1, &Conversation{}, &fakeTransport{})
 		// Guard the lookup: a missing system message must fail the spec with a
 		// clear assertion rather than panic on a nil dereference.
 		sys1 := findRole(m1.lastMessages, "system")
 		Expect(sys1).ToNot(BeNil())
 		Expect(sys1.StringContent).ToNot(ContainSubstring("unknown"))
 		s2, m2 := base()
 		s2.voiceGate.cfg.Identity.NoteUnknown = true
 		commitUtterance(context.Background(), utt, s2, &Conversation{}, &fakeTransport{})
 		sys2 := findRole(m2.lastMessages, "system")
 		Expect(sys2).ToNot(BeNil())
 		Expect(sys2.StringContent).To(ContainSubstring("The current speaker is unknown."))
 	})
 })
 // These specs cover a when:first gate that also carries an Identity block:
 // each one commits two utterances on the same conversation and checks what the
 // second turn does after the first was authorized.
 var _ = Describe("realtime when:first with identity (commitUtterance)", func() {
 	// Small non-empty utterance buffer shared by every spec in this container.
 	utt := make([]byte, 32)
 	// statefulIdentityGate builds a when:first identify gate with an Identity
 	// block (so identity is resolved every turn) whose embedFn is driven by a
 	// per-turn counter: the failOnSecond flag makes the second and later embeds
 	// return an error, exercising the stricter fail-closed path on a re-resolve.
 	statefulIdentityGate := func(failOnSecond bool) *voiceGate {
 		// calls is captured by embedFn below and counts its invocations.
 		calls := 0
 		return &voiceGate{
 			cfg: config.PipelineVoiceRecognition{
 				Mode:      config.VoiceGateModeIdentify,
 				Threshold: 0.25,
 				When:      config.VoiceGateWhenFirst,
 				OnReject:  config.VoiceGateRejectEvent,
 				Allow:     config.VoiceRecognitionAllow{Names: []string{"alice"}},
 				Identity:  &config.VoiceIdentityConfig{Announce: true, Personalize: true, InjectName: true},
 			},
 			registry: &fakeRegistry{matches: []voicerecognition.Match{
 				{Distance: 0.1, Metadata: voicerecognition.Metadata{Name: "alice"}},
 			}},
 			embedFn: func(context.Context, string) ([]float32, error) {
 				calls++
 				if failOnSecond && calls > 1 {
 					return nil, errors.New("embed backend down")
 				}
 				return []float32{1, 0, 0}, nil
 			},
 		}
 	}
 	It("re-resolves identity every turn and fails closed when a later embed errors", func() {
 		gate := statefulIdentityGate(true)
 		session, _ := itSession(gate)
 		conv := &Conversation{} // shared so voiceVerified persists across turns
 		// Turn 1: authorized; identity resolved, speaker surfaced, response runs.
 		tr1 := &fakeTransport{}
 		commitUtterance(context.Background(), utt, session, conv, tr1)
 		Expect(hasSpeakerNotAuthorized(tr1)).To(BeFalse())
 		Expect(tr1.countEvents(types.ServerEventTypeConversationItemSpeaker)).To(Equal(1))
 		Expect(tr1.countEvents(types.ServerEventTypeResponseDone)).To(BeNumerically(">=", 1))
 		// Turn 2: when:first would skip re-authorization, but the Identity block
 		// forces a fresh resolve. That resolve now errors, and because the gate
 		// enforces, the turn is dropped fail-closed rather than riding on the
 		// cached first verification.
 		tr2 := &fakeTransport{}
 		commitUtterance(context.Background(), utt, session, conv, tr2)
 		Expect(hasSpeakerNotAuthorized(tr2)).To(BeTrue())
 		Expect(tr2.countEvents(types.ServerEventTypeResponseDone)).To(Equal(0))
 	})
 	It("re-resolves identity every turn so a later turn still surfaces and names the speaker", func() {
 		gate := statefulIdentityGate(false)
 		session, m := itSession(gate)
 		// Same conversation for both turns, as in the spec above.
 		conv := &Conversation{}
 		tr1 := &fakeTransport{}
 		commitUtterance(context.Background(), utt, session, conv, tr1)
 		Expect(hasSpeakerNotAuthorized(tr1)).To(BeFalse())
 		Expect(tr1.countEvents(types.ServerEventTypeResponseDone)).To(BeNumerically(">=", 1))
 		// Turn 2: authorization is skipped (when:first, already verified) but the
 		// speaker event still fires and the per-message name is set, proving the
 		// per-turn re-resolution (not the cached first verification) drove it.
 		tr2 := &fakeTransport{}
 		commitUtterance(context.Background(), utt, session, conv, tr2)
 		Expect(tr2.countEvents(types.ServerEventTypeConversationItemSpeaker)).To(Equal(1))
 		// Keep the LAST user message: the loop does not break on a match.
 		var lastUser *schema.Message
 		for i := range m.lastMessages {
 			if m.lastMessages[i].Role == "user" {
 				lastUser = &m.lastMessages[i]
 			}
 		}
 		Expect(lastUser).ToNot(BeNil())
 		Expect(lastUser.Name).To(Equal("alice"))
 	})
 })
 // This spec feeds triggerResponse a conversation that already holds user turns
 // from two different speakers and checks how each turn is attributed in the
 // messages handed to the model.
 var _ = Describe("realtime multi-speaker history attribution (triggerResponse)", func() {
 	// userAudioItem builds a completed user item with a matched speaker of the
 	// given name and a single input-audio content part carrying the transcript.
 	userAudioItem := func(name, transcript string) *types.MessageItemUnion {
 		return &types.MessageItemUnion{
 			User: &types.MessageItemUser{
 				ID:      generateItemID(),
 				Status:  types.ItemStatusCompleted,
 				Speaker: &types.Speaker{Name: name, Matched: true},
 				Content: []types.MessageContentInput{
 					{Type: types.MessageContentTypeInputAudio, Transcript: transcript},
 				},
 			},
 		}
 	}
 	It("attributes each user turn to its own speaker and notes the latest one", func() {
 		session, m := itSession(itGate("alice", "alice", []float32{1, 0, 0}, nil,
 			config.VoiceGateWhenEvery, config.VoiceGateRejectEvent))
 		session.Instructions = "You are helpful."
 		session.MaxHistoryItems = 10 // keep both items; 0 would mean "no trim" too
 		session.voiceGate.cfg.Identity = &config.VoiceIdentityConfig{
 			Personalize: true, InjectName: true, InjectSystemNote: true,
 		}
 		// History in order: alice spoke first, bob spoke last.
 		conv := &Conversation{Items: []*types.MessageItemUnion{
 			userAudioItem("alice", "hello there"),
 			userAudioItem("bob", "what is the weather"),
 		}}
 		tr := &fakeTransport{}
 		triggerResponse(context.Background(), session, conv, tr, nil)
 		// Collect every user message in order and the first system message.
 		var users []*schema.Message
 		var sys *schema.Message
 		for i := range m.lastMessages {
 			switch m.lastMessages[i].Role {
 			case "user":
 				users = append(users, &m.lastMessages[i])
 			case "system":
 				if sys == nil {
 					sys = &m.lastMessages[i]
 				}
 			}
 		}
 		Expect(users).To(HaveLen(2))
 		Expect(users[0].Name).To(Equal("alice"))
 		Expect(users[1].Name).To(Equal("bob"))
 		Expect(sys).ToNot(BeNil())
 		// The system note names only the most recent speaker.
 		Expect(sys.StringContent).To(ContainSubstring("The current speaker is bob."))
 		Expect(sys.StringContent).ToNot(ContainSubstring("alice"))
 	})
 })
 // boolPtr returns a pointer to a fresh bool holding b, for filling optional
 // *bool config fields in tests. Each call yields a distinct pointer.
 func boolPtr(b bool) *bool {
 	value := b
 	return &value
 }
--- a/core/http/endpoints/openai/realtime_voicegate_test.go
+++ b/core/http/endpoints/openai/realtime_voicegate_test.go
@@ -10,6 +10,82 @@ import (
 	. "github.com/onsi/gomega"
 )
 var _ = Describe("voiceGate.Resolve + authorize", func() {
 	// mkGate builds an identify-mode gate with a 0.25 threshold and the given
 	// allow-listed names. Its fake registry returns a single candidate
 	// ("alice", id spk_1, label family=yes) at distance 0.1, and the embed
 	// function always yields the same fixed vector.
 	mkGate := func(allow []string) *voiceGate {
 		return &voiceGate{
 			cfg: config.PipelineVoiceRecognition{
 				Mode:      config.VoiceGateModeIdentify,
 				Threshold: 0.25,
 				Allow:     config.VoiceRecognitionAllow{Names: allow},
 			},
 			registry: &fakeRegistry{matches: []voicerecognition.Match{
 				{Distance: 0.1, Metadata: voicerecognition.Metadata{ID: "spk_1", Name: "alice", Labels: map[string]string{"family": "yes"}}},
 			}},
 			embedFn: func(context.Context, string) ([]float32, error) { return []float32{1, 0, 0}, nil },
 		}
 	}
 	It("resolves a confident identity with name, id and a 0..100 confidence", func() {
 		r, err := mkGate(nil).Resolve(context.Background(), "x.wav")
 		Expect(err).ToNot(HaveOccurred())
 		Expect(r.found).To(BeTrue())
 		Expect(r.speaker.Name).To(Equal("alice"))
 		Expect(r.speaker.ID).To(Equal("spk_1"))
 		Expect(r.speaker.Matched).To(BeTrue())
 		Expect(r.speaker.Confidence).To(BeNumerically(">", 0))
 		Expect(r.speaker.Confidence).To(BeNumerically("<=", 100))
 		Expect(r.speaker.Labels).To(HaveKeyWithValue("family", "yes"))
 	})
 	It("marks a candidate above the threshold as not matched", func() {
 		g := mkGate(nil)
 		// Distance 0.9 is beyond the gate's 0.25 threshold.
 		g.registry = &fakeRegistry{matches: []voicerecognition.Match{
 			{Distance: 0.9, Metadata: voicerecognition.Metadata{Name: "alice"}},
 		}}
 		r, err := g.Resolve(context.Background(), "x.wav")
 		Expect(err).ToNot(HaveOccurred())
 		Expect(r.found).To(BeTrue())
 		Expect(r.speaker.Matched).To(BeFalse())
 		Expect(r.speaker.Name).To(Equal("alice")) // name still surfaced
 	})
 	It("authorize allows a confident match in the allow list", func() {
 		g := mkGate([]string{"alice"})
 		r, err := g.Resolve(context.Background(), "x.wav")
 		// Fail here, not in authorize, if resolution itself breaks.
 		Expect(err).ToNot(HaveOccurred())
 		allowed, reason := g.authorize(r)
 		Expect(allowed).To(BeTrue())
 		Expect(reason).To(BeEmpty())
 	})
 	It("authorize denies a confident match outside the allow list", func() {
 		g := mkGate([]string{"bob"})
 		r, err := g.Resolve(context.Background(), "x.wav")
 		Expect(err).ToNot(HaveOccurred())
 		allowed, reason := g.authorize(r)
 		Expect(allowed).To(BeFalse())
 		Expect(reason).To(Equal("speaker not in allow list"))
 	})
 	It("authorize allows by label when names do not match", func() {
 		g := mkGate(nil)
 		g.cfg.Allow = config.VoiceRecognitionAllow{Labels: []string{"family"}}
 		r, err := g.Resolve(context.Background(), "x.wav")
 		Expect(err).ToNot(HaveOccurred())
 		// The denial reason is irrelevant when the speaker is allowed.
 		allowed, _ := g.authorize(r)
 		Expect(allowed).To(BeTrue())
 	})
 })
 var _ = Describe("confidence", func() {
 	// tolerance is the absolute slack allowed when comparing scores.
 	const tolerance = 1e-4
 	It("is 100 at zero distance", func() {
 		score := confidence(0, 0.25)
 		Expect(score).To(BeNumerically("~", 100, tolerance))
 	})
 	It("clamps to 0 above the threshold", func() {
 		score := confidence(0.5, 0.25)
 		Expect(score).To(BeNumerically("~", 0, tolerance))
 	})
 	It("is 0 for a non-positive threshold", func() {
 		score := confidence(0.1, 0)
 		Expect(score).To(BeNumerically("~", 0, tolerance))
 	})
 })
 var _ = Describe("cosineDistance", func() {
 	It("is 0 for identical vectors", func() {
 		Expect(cosineDistance([]float32{1, 0, 0}, []float32{1, 0, 0})).To(BeNumerically("~", 0, 1e-6))
--- a/core/http/endpoints/openai/realtime_webrtc.go
+++ b/core/http/endpoints/openai/realtime_webrtc.go
@@ -128,10 +128,13 @@ func RealtimeCalls(application *application.Application) echo.HandlerFunc {
 			handleIncomingAudioTrack(track, transport)
 		})
-		// Set the remote SDP (client's offer)
+		// Set the remote SDP (client's offer). Raise the data-channel
 		// max-message-size the browser advertised so pion permits the larger
 		// realtime events some turns produce (e.g. tool calls), which would
 		// otherwise be dropped on send. See realtime_webrtc_sctp.go.
 		if err := pc.SetRemoteDescription(webrtc.SessionDescription{
 			Type: webrtc.SDPTypeOffer,
-			SDP:  req.SDP,
+			SDP:  raiseDataChannelMaxMessageSize(req.SDP),
 		}); err != nil {
 			transport.Close()
 			xlog.Error("failed to set remote description", "error", err)
--- a/core/http/endpoints/openai/realtime_webrtc_sctp.go
+++ b/core/http/endpoints/openai/realtime_webrtc_sctp.go
@@ -0,0 +1,29 @@
 package openai
 import (
 	"fmt"
 	"regexp"
 	"strconv"
 )
 // realtimeDataChannelMaxMessageSize is the SCTP max-message-size LocalAI honors
 // for the "oai-events" data channel, in bytes.
 //
 // Browsers advertise a conservative max-message-size in their SDP offer (Chrome
 // uses 262144 = 256 KiB). pion enforces the remote's advertised value on send,
 // so a single realtime event larger than it cannot be sent: the SendText fails,
 // the event is dropped, and the turn silently yields no response. Some turns
 // legitimately produce a single JSON event above 256 KiB (notably tool calls
 // with sizeable schemas or results). Browsers advertise this value
 // conservatively but their SCTP stacks reassemble much larger messages, so we
 // raise the value honored for our own server-generated events.
 const realtimeDataChannelMaxMessageSize = 16 * 1024 * 1024 // 16 MiB
 // maxMessageSizeAttrPrefix is the SDP attribute name, up to and including the
 // colon that precedes its decimal value.
 const maxMessageSizeAttrPrefix = "a=max-message-size:"
 // maxMessageSizeAttrRe matches the SCTP max-message-size attribute together
 // with its decimal value.
 var maxMessageSizeAttrRe = regexp.MustCompile(maxMessageSizeAttrPrefix + `\d+`)
 // raiseDataChannelMaxMessageSize rewrites the SCTP max-message-size attribute in
 // an SDP offer to realtimeDataChannelMaxMessageSize so pion permits larger
 // outbound realtime events.
 //
 // The value is only ever raised. An offer is returned unchanged when it does
 // not carry the attribute, when it already advertises at least
 // realtimeDataChannelMaxMessageSize, or when it advertises 0 (which RFC 8841
 // defines as "messages of any size"), so a peer that accepts more than our
 // target is never downgraded.
 func raiseDataChannelMaxMessageSize(sdp string) string {
 	raised := fmt.Sprintf("a=max-message-size:%d", realtimeDataChannelMaxMessageSize)
 	return maxMessageSizeAttrRe.ReplaceAllStringFunc(sdp, func(attr string) string {
 		advertised, err := strconv.ParseUint(attr[len(maxMessageSizeAttrPrefix):], 10, 64)
 		if err != nil {
 			// The regexp guarantees digits, so the only failure is a value too
 			// large for uint64, which is already far above our target.
 			return attr
 		}
 		if advertised == 0 || advertised >= realtimeDataChannelMaxMessageSize {
 			return attr
 		}
 		return raised
 	})
 }
--- a/Show More
+++ b/Show More