chore(deps): bump torch in /backend/python/vllm

Bumps torch from 2.9.1+cpu to 2.12.1+xpu. --- updated-dependencies: - dependency-name: torch dependency-version: 2.12.1+xpu dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] <support@github.com>
fix(pii): post-merge review fixes + live NER e2e for the privacy-filter tier (#10401)
2026-06-23 16:19:07 -04:00 · 2026-06-22 18:33:32 +00:00 · 2026-06-22 18:26:19 +02:00 · 2026-06-22 18:24:29 +02:00 · 2026-06-22 16:09:16 +02:00 · 2026-06-22 12:38:06 +02:00
238 changed files with 11097 additions and 2121 deletions
--- a/.agents/adding-backends.md
+++ b/.agents/adding-backends.md
@@ -198,6 +198,27 @@ docker-build-backends: ... docker-build-<backend-name>
 - If the backend is in `backend/python/<backend-name>/` but uses `.` as context in the workflow file, use `.` context
 - Check similar backends to determine the correct context

+## Documenting the backend (README + docs)
+
+A backend is not "added" until it is discoverable. Update the user-facing docs:
+
+- **`docs/content/features/backends.md`** - add the backend to the right
+  category in the "LocalAI supports various types of backends" list (and add a
+  new category if it introduces a new modality, e.g. sound classification).
+- If the backend introduces a **new API surface** (a new endpoint or a realtime
+  capability), document it under `docs/content/` where its area lives (audio,
+  vision, etc.) and follow the api-endpoints checklist in
+  [api-endpoints-and-auth.md](api-endpoints-and-auth.md).
+
+**If the backend is a native C/C++/GGML engine created and maintained by the
+LocalAI team** (a from-scratch port like `parakeet.cpp`, `ced.cpp`,
+`vibevoice.cpp`, `rf-detr.cpp`, not a wrapper around a third-party runtime), it
+ALSO belongs in the top-level **`README.md`** table under "native C/C++/GGML
+engines ... developed and maintained by the LocalAI project itself". Add a row
+linking the upstream engine repo with a one-line description. This is the
+project's showcase of its own engines; a new in-house backend that is missing
+from it is a documentation bug.
+
 ## 5. Verification Checklist

 After adding a new backend, verify:
@@ -211,6 +232,8 @@ After adding a new backend, verify:
 - [ ] No YAML syntax errors (check with linter)
 - [ ] No Makefile syntax errors (check with linter)
 - [ ] Follows the same pattern as similar backends (e.g., if it's a transcription backend, follow `faster-whisper` pattern)
+- [ ] Documented: added to the category list in `docs/content/features/backends.md` (and any new endpoint/realtime capability documented under `docs/content/`)
+- [ ] If it is an in-house native C/C++/GGML engine, added to the maintained-engines table in the top-level `README.md`

 ## Bundling runtime shared libraries (`package.sh`)

--- a/.docker/install-base-deps.sh
+++ b/.docker/install-base-deps.sh
@@ -70,6 +70,12 @@ if [ "${BUILD_TYPE:-}" = "vulkan" ] && [ "${SKIP_DRIVERS:-false}" = "false" ]; t
        git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \
        ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \
        clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils
+    # Mesa Vulkan ICD drivers (ANV/RADV/lavapipe + Arm SoC) and their ICD
+    # manifests. The LunarG SDK below only provides the loader and shader
+    # tooling, not hardware drivers — without Mesa the packaged Vulkan backend
+    # would ship a loader that finds no GPU. package-gpu-libs.sh bundles these
+    # .so files plus their deps into the backend so it stays self-contained.
+    apt-get install -y mesa-vulkan-drivers libdrm2
    if [ "amd64" = "${TARGETARCH:-}" ]; then
        wget "https://sdk.lunarg.com/sdk/download/1.4.335.0/linux/vulkansdk-linux-x86_64-1.4.335.0.tar.xz"
        tar -xf vulkansdk-linux-x86_64-1.4.335.0.tar.xz
--- a/.github/backend-matrix.yml
+++ b/.github/backend-matrix.yml
@@ -3575,6 +3575,154 @@ include:
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
+  # ced
+  - build-type: 'cublas'
+    cuda-major-version: "12"
+    cuda-minor-version: "8"
+    platforms: 'linux/amd64'
+    tag-latest: 'auto'
+    tag-suffix: '-gpu-nvidia-cuda-12-ced'
+    runs-on: 'ubuntu-latest'
+    base-image: "ubuntu:24.04"
+    skip-drivers: 'false'
+    backend: "ced"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
+  - build-type: 'cublas'
+    cuda-major-version: "13"
+    cuda-minor-version: "0"
+    platforms: 'linux/amd64'
+    tag-latest: 'auto'
+    tag-suffix: '-gpu-nvidia-cuda-13-ced'
+    runs-on: 'ubuntu-latest'
+    base-image: "ubuntu:24.04"
+    skip-drivers: 'false'
+    backend: "ced"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
+  - build-type: 'cublas'
+    cuda-major-version: "13"
+    cuda-minor-version: "0"
+    platforms: 'linux/arm64'
+    skip-drivers: 'false'
+    tag-latest: 'auto'
+    tag-suffix: '-nvidia-l4t-cuda-13-arm64-ced'
+    base-image: "ubuntu:24.04"
+    ubuntu-version: '2404'
+    runs-on: 'ubuntu-24.04-arm'
+    backend: "ced"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+  - build-type: ''
+    cuda-major-version: ""
+    cuda-minor-version: ""
+    platforms: 'linux/amd64'
+    platform-tag: 'amd64'
+    tag-latest: 'auto'
+    tag-suffix: '-cpu-ced'
+    runs-on: 'ubuntu-latest'
+    base-image: "ubuntu:24.04"
+    skip-drivers: 'false'
+    backend: "ced"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
+  - build-type: ''
+    cuda-major-version: ""
+    cuda-minor-version: ""
+    platforms: 'linux/arm64'
+    platform-tag: 'arm64'
+    tag-latest: 'auto'
+    tag-suffix: '-cpu-ced'
+    runs-on: 'ubuntu-24.04-arm'
+    base-image: "ubuntu:24.04"
+    skip-drivers: 'false'
+    backend: "ced"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
+  - build-type: 'sycl_f32'
+    cuda-major-version: ""
+    cuda-minor-version: ""
+    platforms: 'linux/amd64'
+    tag-latest: 'auto'
+    tag-suffix: '-gpu-intel-sycl-f32-ced'
+    runs-on: 'ubuntu-latest'
+    base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
+    skip-drivers: 'false'
+    backend: "ced"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
+  - build-type: 'sycl_f16'
+    cuda-major-version: ""
+    cuda-minor-version: ""
+    platforms: 'linux/amd64'
+    tag-latest: 'auto'
+    tag-suffix: '-gpu-intel-sycl-f16-ced'
+    runs-on: 'ubuntu-latest'
+    base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
+    skip-drivers: 'false'
+    backend: "ced"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
+  - build-type: 'vulkan'
+    cuda-major-version: ""
+    cuda-minor-version: ""
+    platforms: 'linux/amd64'
+    platform-tag: 'amd64'
+    tag-latest: 'auto'
+    tag-suffix: '-gpu-vulkan-ced'
+    runs-on: 'ubuntu-latest'
+    base-image: "ubuntu:24.04"
+    skip-drivers: 'false'
+    backend: "ced"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
+  - build-type: 'vulkan'
+    cuda-major-version: ""
+    cuda-minor-version: ""
+    platforms: 'linux/arm64'
+    platform-tag: 'arm64'
+    tag-latest: 'auto'
+    tag-suffix: '-gpu-vulkan-ced'
+    runs-on: 'ubuntu-24.04-arm'
+    base-image: "ubuntu:24.04"
+    skip-drivers: 'false'
+    backend: "ced"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
+  - build-type: 'cublas'
+    cuda-major-version: "12"
+    cuda-minor-version: "0"
+    platforms: 'linux/arm64'
+    skip-drivers: 'false'
+    tag-latest: 'auto'
+    tag-suffix: '-nvidia-l4t-arm64-ced'
+    base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
+    runs-on: 'ubuntu-24.04-arm'
+    backend: "ced"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2204'
+  - build-type: 'hipblas'
+    cuda-major-version: ""
+    cuda-minor-version: ""
+    platforms: 'linux/amd64'
+    tag-latest: 'auto'
+    tag-suffix: '-gpu-rocm-hipblas-ced'
+    base-image: "rocm/dev-ubuntu-24.04:7.2.1"
+    runs-on: 'ubuntu-latest'
+    skip-drivers: 'false'
+    backend: "ced"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
  # acestep-cpp
  - build-type: ''
    cuda-major-version: ""
@@ -4754,6 +4902,10 @@ includeDarwin:
    tag-suffix: "-metal-darwin-arm64-parakeet-cpp"
    build-type: "metal"
    lang: "go"
+  - backend: "ced"
+    tag-suffix: "-metal-darwin-arm64-ced"
+    build-type: "metal"
+    lang: "go"
  - backend: "acestep-cpp"
    tag-suffix: "-metal-darwin-arm64-acestep-cpp"
    build-type: "metal"
--- a/.github/workflows/bump_deps.yaml
+++ b/.github/workflows/bump_deps.yaml
@@ -42,6 +42,10 @@ jobs:
            variable: "PARAKEET_VERSION"
            branch: "master"
            file: "backend/go/parakeet-cpp/Makefile"
+          - repository: "mudler/ced.cpp"
+            variable: "CED_VERSION"
+            branch: "master"
+            file: "backend/go/ced/Makefile"
          - repository: "mudler/depth-anything.cpp"
            variable: "DEPTHANYTHING_VERSION"
            branch: "master"
--- a/.github/workflows/tests-pii-ner-e2e.yml
+++ b/.github/workflows/tests-pii-ner-e2e.yml
@@ -0,0 +1,97 @@
+---
+name: 'PII NER tier E2E (live GGUF, CPU)'
+
+# Runs the real privacy-filter GGUF NER tier end-to-end on CPU — the gap the
+# hermetic tests/e2e suite cannot cover (it only exercises the in-process
+# pattern tier). Heavy (builds the C++ backend image + downloads a ~2.7 GB
+# GGUF), so it is path-filtered on PRs and otherwise runs nightly / on demand.
+#
+# This drives the container-level harness (tests/e2e-backends) via
+# `make test-extra-backend-privacy-filter`: it builds the privacy-filter image,
+# downloads the model, loads it on CPU, and asserts byte-correct, UTF-8-aligned
+# TokenClassify spans. The complementary HTTP-path specs in tests/e2e
+# (e2e_pii_ner_test.go) Skip unless PII_NER_MODEL_GGUF is wired.
+
+on:
+  workflow_dispatch:
+  schedule:
+    - cron: '0 3 * * *'
+  push:
+    branches:
+      - master
+    paths:
+      - 'backend/cpp/privacy-filter/**'
+      - 'backend/Dockerfile.privacy-filter'
+      - 'core/services/routing/pii/**'
+      - 'core/services/routing/piidetector/**'
+      - 'core/backend/token_classify.go'
+      - 'core/http/endpoints/localai/pii.go'
+      - 'core/schema/pii.go'
+      - 'tests/e2e-backends/**'
+      - 'tests/e2e/e2e_pii_ner_test.go'
+      - 'tests/e2e/e2e_suite_test.go'
+      - '.github/workflows/tests-pii-ner-e2e.yml'
+  pull_request:
+    paths:
+      - 'backend/cpp/privacy-filter/**'
+      - 'backend/Dockerfile.privacy-filter'
+      - 'core/services/routing/pii/**'
+      - 'core/services/routing/piidetector/**'
+      - 'core/backend/token_classify.go'
+      - 'core/http/endpoints/localai/pii.go'
+      - 'core/schema/pii.go'
+      - 'tests/e2e-backends/**'
+      - 'tests/e2e/e2e_pii_ner_test.go'
+      - 'tests/e2e/e2e_suite_test.go'
+      - '.github/workflows/tests-pii-ner-e2e.yml'
+
+concurrency:
+  group: ci-tests-pii-ner-e2e-${{ github.event.pull_request.number || github.sha }}-${{ github.repository }}
+  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
+
+jobs:
+  tests-pii-ner-e2e:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        go-version: ['1.25.x']
+    steps:
+      - name: Clone
+        uses: actions/checkout@v6
+        with:
+          submodules: true
+      - name: Free disk space
+        run: |
+          sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache/CodeQL || true
+          sudo docker image prune --all --force || true
+          df -h
+      - name: Configure apt mirror on runner
+        uses: ./.github/actions/configure-apt-mirror
+      - name: Setup Go ${{ matrix.go-version }}
+        uses: actions/setup-go@v5
+        with:
+          go-version: ${{ matrix.go-version }}
+          cache: false
+      - name: Proto Dependencies
+        run: |
+          curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v26.1/protoc-26.1-linux-x86_64.zip -o protoc.zip && \
+          unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
+          rm protoc.zip
+          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
+          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
+          PATH="$PATH:$HOME/go/bin" make protogen-go
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y build-essential
+      # Builds local-ai-backend:privacy-filter, downloads the GGUF, loads it on
+      # CPU and runs the token_classify capability spec (byte-offset contract).
+      - name: Run live PII NER backend E2E
+        run: PATH="$PATH:$HOME/go/bin" make test-extra-backend-privacy-filter
+      - name: Setup tmate session if tests fail
+        if: ${{ failure() }}
+        uses: mxschmitt/action-tmate@v3.23
+        with:
+          detached: true
+          connect-timeout-seconds: 180
+          limit-access-to-actor: true
--- a/.gitignore
+++ b/.gitignore
@@ -91,3 +91,6 @@ core/http/react-ui/test-results/

 # Local worktrees
 .worktrees/
+
+# SDD / brainstorm scratch (agent-driven development)
+.superpowers/
--- a/Makefile
+++ b/Makefile
@@ -690,6 +690,16 @@ test-extra-backend-llama-cpp-transcription: docker-build-llama-cpp
 	BACKEND_TEST_CTX_SIZE=2048 \
 	$(MAKE) test-extra-backend

+## privacy-filter: the PII/NER token-classification backend. Exercises the
+## TokenClassify RPC and asserts byte-correct, UTF-8-aligned span offsets
+## against the openai-privacy-filter multilingual GGUF (CPU-runnable, ~50M
+## active params). This is the live-backend coverage for the PII NER tier.
+test-extra-backend-privacy-filter: docker-build-privacy-filter
+	BACKEND_IMAGE=local-ai-backend:privacy-filter \
+	BACKEND_TEST_MODEL_URL=https://huggingface.co/LocalAI-io/privacy-filter-multilingual-GGUF/resolve/main/privacy-filter-multilingual-f16.gguf \
+	BACKEND_TEST_CAPS=health,load,token_classify \
+	$(MAKE) test-extra-backend
+
 ## vllm is resolved from a HuggingFace model id (no file download) and
 ## exercises Predict + streaming + tool-call extraction via the hermes parser.
 ## Requires a host CPU with the SIMD instructions the prebuilt vllm CPU
--- a/README.md
+++ b/README.md
@@ -231,6 +231,7 @@ Most backends wrap a best-in-class upstream engine. A handful of them are native
 | Backend | What it does |
 |---------|-------------|
 | [parakeet.cpp](https://github.com/mudler/parakeet.cpp) | C++/GGML port of NVIDIA NeMo Parakeet ASR (tdt/ctc/rnnt/hybrid), with cache-aware streaming transcription |
+| [ced.cpp](https://github.com/mudler/ced.cpp) | C++/GGML port of the CED audio-tagging models: sound-event classification (527-class AudioSet) over REST and the realtime API for live recognition |
 | [voxtral.c](https://github.com/mudler/voxtral.c) | Voxtral Realtime 4B speech-to-text in pure C |
 | [vibevoice.cpp](https://github.com/mudler/vibevoice.cpp) | Native port of Microsoft VibeVoice for TTS (voice cloning) and long-form ASR with speaker diarization |
 | [rf-detr.cpp](https://github.com/mudler/rf-detr.cpp) | Native RF-DETR object detection and instance segmentation |
@@ -240,6 +241,8 @@ Most backends wrap a best-in-class upstream engine. A handful of them are native
 | [LocalVQE](https://github.com/localai-org/LocalVQE) | Joint acoustic echo cancellation, noise suppression, and dereverberation |
 | [local-store](https://github.com/mudler/LocalAI) | Local-first vector database for embeddings (shipped in-tree) |

+We also maintain [apex-quant](https://github.com/localai-org/apex-quant), a per-tensor, per-layer quantization recipe for Mixture-of-Experts models that exploits their structural sparsity to produce GGUFs matching or beating Q8_0 quality - and they run out of the box on stock llama.cpp.
+
 ## Resources

 - [Documentation](https://localai.io/)
--- a/backend/Dockerfile.golang
+++ b/backend/Dockerfile.golang
@@ -65,7 +65,12 @@ RUN <<EOT bash
            libwayland-dev libxrandr-dev libxcb-randr0-dev libxcb-ewmh-dev \
            git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \
            ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \
-            clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils
+            clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils && \
+        apt-get install -y mesa-vulkan-drivers libdrm2
+        # Mesa Vulkan ICD drivers (ANV/RADV/lavapipe) + their manifests. The
+        # LunarG SDK below only provides the loader and shader tooling, not
+        # hardware drivers — without Mesa, package-gpu-libs.sh has no ICD to
+        # bundle and the packaged backend finds no GPU at runtime.
        if [ "amd64" = "$TARGETARCH" ]; then
            wget "https://sdk.lunarg.com/sdk/download/1.4.335.0/linux/vulkansdk-linux-x86_64-1.4.335.0.tar.xz" && \
            tar -xf vulkansdk-linux-x86_64-1.4.335.0.tar.xz && \
--- a/backend/Dockerfile.python
+++ b/backend/Dockerfile.python
@@ -66,7 +66,12 @@ RUN <<EOT bash
            libwayland-dev libxrandr-dev libxcb-randr0-dev libxcb-ewmh-dev \
            git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \
            ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \
-            clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils
+            clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils && \
+        apt-get install -y mesa-vulkan-drivers libdrm2
+        # Mesa Vulkan ICD drivers (ANV/RADV/lavapipe) + their manifests. The
+        # LunarG SDK below only provides the loader and shader tooling, not
+        # hardware drivers — without Mesa, package-gpu-libs.sh has no ICD to
+        # bundle and the packaged backend finds no GPU at runtime.
        if [ "amd64" = "$TARGETARCH" ]; then
            wget "https://sdk.lunarg.com/sdk/download/1.4.335.0/linux/vulkansdk-linux-x86_64-1.4.335.0.tar.xz" && \
            tar -xf vulkansdk-linux-x86_64-1.4.335.0.tar.xz && \
--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -24,6 +24,9 @@ service Backend {
  rpc TokenizeString(PredictOptions) returns (TokenizationResponse) {}
  rpc Status(HealthMessage) returns (StatusResponse) {}
  rpc Detect(DetectOptions) returns (DetectResponse) {}
+  // SoundDetection runs an audio-tagging / sound-event-classification model
+  // (e.g. CED over the AudioSet ontology) on a clip and returns scored labels.
+  rpc SoundDetection(SoundDetectionRequest) returns (SoundDetectionResponse) {}
  rpc Depth(DepthRequest) returns (DepthResponse) {}
  rpc FaceVerify(FaceVerifyRequest) returns (FaceVerifyResponse) {}
  rpc FaceAnalyze(FaceAnalyzeRequest) returns (FaceAnalyzeResponse) {}
@@ -671,6 +674,24 @@ message DetectResponse {
  repeated Detection Detections = 1;
 }

+// --- Sound-event classification / audio tagging messages (CED) ---
+
+message SoundDetectionRequest {
+  string src = 1;       // audio file path (LocalAI writes the upload to disk)
+  int32 top_k = 2;      // number of top tags to return (0 = all classes)
+  float threshold = 3;  // optional: drop tags scoring below this
+}
+
+message SoundClass {
+  string label = 1;     // AudioSet class name, e.g. "Baby cry, infant cry"
+  float score = 2;      // per-class probability (multi-label, independent)
+  int32 index = 3;      // class index in the model ontology
+}
+
+message SoundDetectionResponse {
+  repeated SoundClass detections = 1;  // score-descending
+}
+
 // --- Depth estimation messages (Depth Anything 3) ---

 message DepthRequest {
--- a/backend/cpp/ik-llama-cpp/Makefile
+++ b/backend/cpp/ik-llama-cpp/Makefile
@@ -1,5 +1,5 @@

-IK_LLAMA_VERSION?=b3dfb7858cfcb9166e92f366e5af87f19ebc94be
+IK_LLAMA_VERSION?=6c00e87ac84404af588ad2e65935bd6f079c696f
 LLAMA_REPO?=https://github.com/ikawrakow/ik_llama.cpp

 CMAKE_ARGS?=
--- a/backend/cpp/llama-cpp/Makefile
+++ b/backend/cpp/llama-cpp/Makefile
@@ -1,5 +1,5 @@

-LLAMA_VERSION?=f3e182816421c648188b5eab269853bf1531d950
+LLAMA_VERSION?=7c082bc417bbe53210a83df4ba5b49e18ce6193c
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp

 CMAKE_ARGS?=
--- a/backend/cpp/llama-cpp/grpc-server.cpp
+++ b/backend/cpp/llama-cpp/grpc-server.cpp
@@ -18,6 +18,18 @@
 #if __has_include("server-chat.cpp")
 #include "server-chat.cpp"
 #endif
+// server-schema.cpp exists only in llama.cpp after the upstream refactor that
+// extracted the JSON request-schema evaluation (previously the static
+// server_task::params_from_json_cmpl) into server_schema::eval_llama_cmpl_schema.
+// server-context.cpp and grpc-server.cpp both call into it, so its definitions
+// must be part of this translation unit or the link fails. __has_include keeps
+// the source compatible with older pins/forks (e.g. llama-cpp-turboquant) that
+// predate the split and still expose params_from_json_cmpl (see the guarded
+// call sites below).
+#if __has_include("server-schema.cpp")
+#define LOCALAI_HAS_SERVER_SCHEMA 1
+#include "server-schema.cpp"
+#endif
 #include "server-context.cpp"

 // LocalAI
@@ -2102,7 +2114,11 @@ public:
                task.index = i;

                task.tokens    = std::move(inputs[i]);
+#ifdef LOCALAI_HAS_SERVER_SCHEMA
+                task.params           = server_schema::eval_llama_cmpl_schema(
+#else
                task.params           = server_task::params_from_json_cmpl(
+#endif
                        ctx_server.impl->vocab,
                        params_base,
                        ctx_server.get_meta().slot_n_ctx,
@@ -2116,7 +2132,7 @@ public:
                // cannot detect tool calls or separate reasoning from content.
                task.params.res_type                 = TASK_RESPONSE_TYPE_OAI_CHAT;
                task.params.oaicompat_cmpl_id         = completion_id;
-                // oaicompat_model is already populated by params_from_json_cmpl
+                // oaicompat_model is already populated by eval_llama_cmpl_schema

                tasks.push_back(std::move(task));
            }
@@ -2940,7 +2956,11 @@ public:
                task.index = i;

                task.tokens    = std::move(inputs[i]);
+#ifdef LOCALAI_HAS_SERVER_SCHEMA
+                task.params           = server_schema::eval_llama_cmpl_schema(
+#else
                task.params           = server_task::params_from_json_cmpl(
+#endif
                        ctx_server.impl->vocab,
                        params_base,
                        ctx_server.get_meta().slot_n_ctx,
@@ -2952,7 +2972,7 @@ public:
                // reasoning, tool calls, and content are classified into ChatDeltas.
                task.params.res_type                 = TASK_RESPONSE_TYPE_OAI_CHAT;
                task.params.oaicompat_cmpl_id         = completion_id;
-                // oaicompat_model is already populated by params_from_json_cmpl
+                // oaicompat_model is already populated by eval_llama_cmpl_schema

                tasks.push_back(std::move(task));
            }
--- a/backend/cpp/privacy-filter/Makefile
+++ b/backend/cpp/privacy-filter/Makefile
@@ -8,7 +8,7 @@
 # Local development: point at a working checkout instead of cloning, e.g.
 #   make PRIVACY_FILTER_SRC=$HOME/c/privacy-filter.cpp grpc-server

-PRIVACY_FILTER_VERSION?=646342f7a59c6b7d195185eac60bad762e572f1d
+PRIVACY_FILTER_VERSION?=98f52c5ef2250f207cc6b9a6aef05393a120cb7c
 PRIVACY_FILTER_REPO?=https://github.com/localai-org/privacy-filter.cpp
 PRIVACY_FILTER_SRC?=

--- a/backend/go/ced/.gitignore
+++ b/backend/go/ced/.gitignore
@@ -0,0 +1,11 @@
+.cache/
+sources/
+build/
+package/
+ced-grpc
+# build artifacts staged in-tree by the Makefile (cp from sources/) or
+# symlinked for local dev; the real sources live in ced.cpp upstream.
+*.so
+*.so.*
+ced_capi.h
+compile_commands.json
--- a/backend/go/ced/Makefile
+++ b/backend/go/ced/Makefile
@@ -0,0 +1,77 @@
+# ced sound-classification backend Makefile.
+#
+# Upstream pin lives below as CED_VERSION?=<sha> so .github/bump_deps.sh can find
+# and update it (matches the parakeet-cpp / whisper.cpp convention).
+#
+# Local dev shortcut: symlink an out-of-tree ced.cpp shared build + header and
+# skip the clone/cmake steps entirely:
+#   ln -sf /path/to/ced.cpp/build-shared/libced.so .
+#   ln -sf /path/to/ced.cpp/include/ced_capi.h .
+#   go build -o ced-grpc .
+
+CED_VERSION?=c04ac14b7992d00584d9e812c9bb6268598a6ce7
+CED_REPO?=https://github.com/mudler/ced.cpp
+
+GOCMD?=go
+GO_TAGS?=
+JOBS?=$(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)
+
+BUILD_TYPE?=
+NATIVE?=false
+
+# Static-link ggml into libced.so (PIC) so the shared lib is self-contained:
+# dlopen needs no libggml*.so alongside it, only system libs the runtime image
+# already provides.
+CMAKE_ARGS?=-DCMAKE_BUILD_TYPE=Release -DCED_SHARED=ON -DCED_BUILD_CLI=OFF -DCED_BUILD_TESTS=OFF -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+
+ifeq ($(NATIVE),false)
+	CMAKE_ARGS+=-DGGML_NATIVE=OFF
+endif
+
+# ced.cpp gates its ggml backends behind CED_GGML_* options (set(... CACHE BOOL
+# "" FORCE)), so forward those instead of a bare -DGGML_CUDA=ON.
+ifeq ($(BUILD_TYPE),cublas)
+	CMAKE_ARGS+=-DCED_GGML_CUDA=ON -DGGML_CUDA_GRAPHS=ON
+else ifeq ($(BUILD_TYPE),openblas)
+	CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
+else ifeq ($(BUILD_TYPE),hipblas)
+	CMAKE_ARGS+=-DCED_GGML_HIP=ON
+else ifeq ($(BUILD_TYPE),vulkan)
+	CMAKE_ARGS+=-DCED_GGML_VULKAN=ON
+endif
+
+.PHONY: ced-grpc package build clean purge test all
+
+all: ced-grpc
+
+sources/ced.cpp:
+	mkdir -p sources/ced.cpp
+	cd sources/ced.cpp && \
+	git init -q && \
+	git remote add origin $(CED_REPO) && \
+	git fetch --depth 1 origin $(CED_VERSION) && \
+	git checkout FETCH_HEAD && \
+	git submodule update --init --recursive --depth 1 --single-branch
+
+libced.so: sources/ced.cpp
+	cmake -B sources/ced.cpp/build-shared -S sources/ced.cpp $(CMAKE_ARGS)
+	cmake --build sources/ced.cpp/build-shared --config Release -j$(JOBS)
+	cp -fv sources/ced.cpp/build-shared/libced.so* ./ 2>/dev/null || true
+	cp -fv sources/ced.cpp/include/ced_capi.h ./
+
+ced-grpc: libced.so main.go goced.go
+	CGO_ENABLED=0 $(GOCMD) build -tags "$(GO_TAGS)" -o ced-grpc .
+
+package: ced-grpc
+	bash package.sh
+
+build: package
+
+test:
+	LD_LIBRARY_PATH=$(CURDIR):$$LD_LIBRARY_PATH $(GOCMD) test ./... -count=1
+
+clean: purge
+	rm -rf libced.so* ced_capi.h package ced-grpc
+
+purge:
+	rm -rf sources/ced.cpp
--- a/backend/go/ced/goced.go
+++ b/backend/go/ced/goced.go
@@ -0,0 +1,130 @@
+package main
+
+// Go side of the ced backend: purego bindings over ced_capi.h plus the gRPC
+// SoundDetection implementation.
+//
+// SKETCH: the pb.SoundDetection* types come from backend.proto (regenerate with
+// `make protogen-go`). The C side is single-threaded per ctx, so we guard the
+// engine with engineMu (note: Ced embeds base.Base, not base.SingleThread).
+import (
+	"context"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"sort"
+	"sync"
+	"unsafe"
+
+	"github.com/mudler/LocalAI/pkg/grpc/base"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+)
+
+// purego-bound entry points from libced.so. Names match ced_capi.h exactly.
+//
+// These variables are nil until main registers them with
+// purego.RegisterLibFunc, so nothing here may be called before that happens.
+// The two classify functions return their C string as a uintptr: the caller
+// copies it and hands the same pointer back to CppFreeString (see cstr).
+// CppLastError is bound with a Go string return instead, so its C buffer is
+// never freed from Go; presumably the library owns that buffer (TODO confirm
+// against ced_capi.h).
+var (
+	CppAbiVersion       func() int32
+	CppLoad             func(ggufPath string) uintptr
+	CppFree             func(ctx uintptr)
+	CppLastError        func(ctx uintptr) string
+	CppNumClasses       func(ctx uintptr) int32
+	CppSampleRate       func(ctx uintptr) int32
+	CppClassifyPathJSON func(ctx uintptr, wavPath string, topK int32) uintptr
+	CppClassifyPcmJSON  func(ctx uintptr, pcm []float32, nSamples int32, sampleRate int32, topK int32) uintptr
+	CppFreeString       func(s uintptr)
+)
+
+// cstr copies a malloc'd C string (returned as uintptr) into a Go string and
+// frees the original via ced_capi_free_string. Empty/0 -> "".
+func cstr(p uintptr) string {
+	if p == 0 {
+		return ""
+	}
+	defer CppFreeString(p)
+	var b []byte
+	for i := 0; ; i++ {
+		ch := *(*byte)(unsafe.Pointer(p + uintptr(i))) //nolint:govet // #nosec G103 -- C-owned NUL-terminated string from libced (not Go-GC memory)
+		if ch == 0 {
+			break
+		}
+		b = append(b, ch)
+	}
+	return string(b)
+}
+
+// Ced is the gRPC backend. One loaded CED model per instance.
+//
+// ctxPtr holds the handle returned by the library's load call; zero means no
+// model is loaded (Load sets it, Free resets it). engineMu is held around the
+// classify call and around Free so the two do not run at the same time.
+type Ced struct {
+	base.Base
+	ctxPtr   uintptr
+	engineMu sync.Mutex
+}
+
+// Load opens the C-API context for the GGUF named by opts.ModelFile and keeps
+// the resulting handle on the receiver.
+//
+// It returns an error when ModelFile is empty or when the library hands back a
+// zero handle; in the latter case the library's last-error text is appended.
+func (c *Ced) Load(opts *pb.ModelOptions) error {
+	modelPath := opts.ModelFile
+	if modelPath == "" {
+		return errors.New("ced: ModelFile is required")
+	}
+
+	handle := CppLoad(modelPath)
+	if handle == 0 {
+		// No context exists yet, so the error is queried with a zero handle.
+		reason := CppLastError(0)
+		return fmt.Errorf("ced: ced_capi_load failed for %q: %s", modelPath, reason)
+	}
+
+	c.ctxPtr = handle
+	return nil
+}
+
+// jsonTag mirrors the ced_capi JSON tag objects.
+//
+// SoundDetection decodes the classifier output into a slice of these and maps
+// each one onto a pb.SoundClass (Index -> index, Score -> score,
+// Label -> label).
+type jsonTag struct {
+	Index int     `json:"index"`
+	Score float32 `json:"score"`
+	Label string  `json:"label"`
+}
+
+// SoundDetection classifies the audio file at req.Src and returns the tags
+// reported by the engine, sorted by descending score.
+//
+// A non-positive top_k falls back to 10 tags. NOTE(review): the top_k comment
+// in backend.proto says 0 means "all classes"; confirm which contract is
+// intended. Tags scoring below req.Threshold are dropped; with the zero
+// default every non-negative score is kept.
+//
+// It returns an error when src is empty, no model is loaded, the engine
+// produces no output, or the engine's JSON cannot be decoded.
+func (c *Ced) SoundDetection(ctx context.Context, req *pb.SoundDetectionRequest) (*pb.SoundDetectionResponse, error) {
+	src := req.GetSrc()
+	if src == "" {
+		return nil, errors.New("ced: SoundDetectionRequest.src (audio path) is required")
+	}
+	topK := req.GetTopK()
+	if topK <= 0 {
+		topK = 10 // sensible default for a tagging response
+	}
+
+	out, err := c.classifyPath(src, topK)
+	if err != nil {
+		return nil, err
+	}
+
+	var tags []jsonTag
+	if err := json.Unmarshal([]byte(out), &tags); err != nil {
+		return nil, fmt.Errorf("ced: bad classifier JSON: %w", err)
+	}
+
+	thr := req.GetThreshold()
+	resp := &pb.SoundDetectionResponse{}
+	for _, t := range tags {
+		if t.Score < thr {
+			continue
+		}
+		resp.Detections = append(resp.Detections, &pb.SoundClass{
+			Label: t.Label, Score: t.Score, Index: int32(t.Index),
+		})
+	}
+	// Re-sort after filtering so the response order does not depend on the
+	// order the engine emitted the tags in.
+	sort.Slice(resp.Detections, func(i, j int) bool {
+		return resp.Detections[i].Score > resp.Detections[j].Score
+	})
+	return resp, nil
+}
+
+// classifyPath runs the engine on the file at path and returns its raw JSON.
+//
+// The loaded check, the classify call and the last-error lookup all happen
+// while engineMu is held, so a concurrent Free cannot release the context
+// between the check and its use.
+func (c *Ced) classifyPath(path string, topK int32) (string, error) {
+	c.engineMu.Lock()
+	defer c.engineMu.Unlock()
+
+	if c.ctxPtr == 0 {
+		return "", errors.New("ced: model not loaded")
+	}
+	out := cstr(CppClassifyPathJSON(c.ctxPtr, path, topK))
+	if out == "" {
+		// The last-error text is only needed, and only fetched, on failure.
+		return "", fmt.Errorf("ced: classification failed: %s", CppLastError(c.ctxPtr))
+	}
+	return out, nil
+}
+
+// Free releases the C-API context, if one is loaded, and resets the handle to
+// zero. It takes engineMu for the duration and always returns nil.
+func (c *Ced) Free() error {
+	c.engineMu.Lock()
+	defer c.engineMu.Unlock()
+
+	if c.ctxPtr == 0 {
+		return nil
+	}
+	CppFree(c.ctxPtr)
+	c.ctxPtr = 0
+	return nil
+}
--- a/backend/go/ced/main.go
+++ b/backend/go/ced/main.go
@@ -0,0 +1,59 @@
+package main
+
+// ced sound-classification backend. Started internally by LocalAI: one gRPC
+// server per loaded model. Loads libced.so via purego and registers the flat
+// C-API declared in ced_capi.h. The library name can be overridden with
+// CED_LIBRARY (mirrors PARAKEET_LIBRARY / WHISPER_LIBRARY); the default looks
+// for the .so next to this binary.
+//
+// SKETCH: requires `make protogen-go` after the backend.proto SoundDetection
+// addition, and a built libced.so (see Makefile). See DESIGN.md.
+import (
+	"flag"
+	"fmt"
+	"os"
+
+	"github.com/ebitengine/purego"
+	grpc "github.com/mudler/LocalAI/pkg/grpc"
+)
+
+var addr = flag.String("addr", "localhost:50051", "the address to connect to")
+
+// libFunc pairs a pointer to one of the Cpp* function variables (ptr) with the
+// exported C symbol name it is bound to (name); main passes both to
+// purego.RegisterLibFunc.
+type libFunc struct {
+	ptr  any
+	name string
+}
+
+// main opens the ced shared library, binds the C-API symbols to the Cpp*
+// function variables, logs the library ABI version to stderr, then parses the
+// command-line flags and serves the gRPC backend on -addr. Any failure to open
+// the library or start the server panics.
+func main() {
+	// CED_LIBRARY overrides the default library name when set and non-empty.
+	libName := "libced.so"
+	if override := os.Getenv("CED_LIBRARY"); override != "" {
+		libName = override
+	}
+
+	handle, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
+	if err != nil {
+		panic(fmt.Errorf("ced: dlopen %q: %w", libName, err))
+	}
+
+	// Bound 1:1 to ced_capi.h. char*-returning functions are declared uintptr
+	// so we can free the same pointer with ced_capi_free_string after copying
+	// (purego's string return would copy and leak the original).
+	bindings := []libFunc{
+		{&CppAbiVersion, "ced_capi_abi_version"},
+		{&CppLoad, "ced_capi_load"},
+		{&CppFree, "ced_capi_free"},
+		{&CppLastError, "ced_capi_last_error"},
+		{&CppNumClasses, "ced_capi_num_classes"},
+		{&CppSampleRate, "ced_capi_sample_rate"},
+		{&CppClassifyPathJSON, "ced_capi_classify_path_json"},
+		{&CppClassifyPcmJSON, "ced_capi_classify_pcm_json"},
+		{&CppFreeString, "ced_capi_free_string"},
+	}
+	for i := range bindings {
+		purego.RegisterLibFunc(bindings[i].ptr, handle, bindings[i].name)
+	}
+
+	fmt.Fprintf(os.Stderr, "[ced] ABI=%d\n", CppAbiVersion())
+
+	flag.Parse()
+	if err := grpc.StartServer(*addr, &Ced{}); err != nil {
+		panic(err)
+	}
+}
--- a/backend/go/ced/package.sh
+++ b/backend/go/ced/package.sh
@@ -0,0 +1,60 @@
+#!/bin/bash
+#
+# Bundle the ced-grpc binary, libced.so, the core runtime libs (libc/libstdc++/
+# libgomp + ld.so) and the GPU runtime for the active BUILD_TYPE so the package
+# is self-contained. Mirrors backend/go/parakeet-cpp/package.sh; run.sh routes
+# the (CGO_ENABLED=0) binary through lib/ld.so so the packaged libc is used.
+
+# Abort on the first failing command so a partial package is never reported
+# as successful.
+set -e
+
+# Directory containing this script, and the repository root three levels up.
+CURDIR=$(dirname "$(realpath "$0")")
+REPO_ROOT="${CURDIR}/../../.."
+
+mkdir -p "$CURDIR/package/lib"
+
+# The backend binary and its launcher go at the top of package/.
+cp -avf "$CURDIR/ced-grpc" "$CURDIR/package/"
+cp -avf "$CURDIR/run.sh" "$CURDIR/package/"
+
+# Every file matching libced.so* goes into package/lib/. cp's own error output
+# is silenced; a failure (e.g. nothing matched) prints the hint below and exits.
+cp -avf "$CURDIR"/libced.so* "$CURDIR/package/lib/" 2>/dev/null || {
+	echo "ERROR: libced.so not found in $CURDIR, run 'make' first" >&2
+	exit 1
+}
+
+# Choose the architecture by which dynamic loader exists on the build host,
+# then copy that loader as lib/ld.so together with the core runtime libraries.
+# -L dereferences symlinks so the real files are bundled.
+if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
+    echo "Detected x86_64 architecture, copying x86_64 libraries..."
+    cp -arfLv /lib64/ld-linux-x86-64.so.2 "$CURDIR/package/lib/ld.so"
+    cp -arfLv /lib/x86_64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6"
+    cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1"
+    cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6"
+    cp -arfLv /lib/x86_64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6"
+    cp -arfLv /lib/x86_64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1"
+    cp -arfLv /lib/x86_64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2"
+    cp -arfLv /lib/x86_64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1"
+    cp -arfLv /lib/x86_64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0"
+elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
+    echo "Detected ARM64 architecture, copying ARM64 libraries..."
+    cp -arfLv /lib/ld-linux-aarch64.so.1 "$CURDIR/package/lib/ld.so"
+    cp -arfLv /lib/aarch64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6"
+    cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1"
+    cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6"
+    cp -arfLv /lib/aarch64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6"
+    cp -arfLv /lib/aarch64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1"
+    cp -arfLv /lib/aarch64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2"
+    cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1"
+    cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0"
+elif [ "$(uname -s)" = "Darwin" ]; then
+    # Darwin: no loader or runtime libraries are copied in this step.
+    echo "Detected Darwin"
+else
+    echo "Error: Could not detect architecture"
+    exit 1
+fi
+
+# Optional GPU packaging: when the shared helper script exists, source it with
+# package/lib as its argument and invoke package_gpu_libs (expected to be
+# defined by that helper). Skipped silently when the helper is absent.
+GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
+if [ -f "$GPU_LIB_SCRIPT" ]; then
+    echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
+    source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib"
+    package_gpu_libs
+fi
+
+# Show the final package layout for the build log.
+echo "Packaging completed successfully"
+ls -liah "$CURDIR/package/" "$CURDIR/package/lib/"
--- a/backend/go/ced/run.sh
+++ b/backend/go/ced/run.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+# Launcher for the ced-grpc binary: sets up the library search path and
+# forwards every command-line argument to the binary unchanged.
+set -e
+
+# Directory containing this script (symlinks resolved by realpath).
+CURDIR=$(dirname "$(realpath "$0")")
+
+# Search lib/ and this directory before any pre-existing LD_LIBRARY_PATH
+# entries when resolving shared libraries.
+export LD_LIBRARY_PATH="$CURDIR/lib:$CURDIR:${LD_LIBRARY_PATH:-}"
+
+# If a self-contained ld.so was packaged, route through it so the packaged
+# libc / libstdc++ are used instead of the host's (matches the sibling backends).
+if [ -f "$CURDIR/lib/ld.so" ]; then
+	echo "Using lib/ld.so"
+	exec "$CURDIR/lib/ld.so" "$CURDIR/ced-grpc" "$@"
+fi
+
+# No packaged loader: run the binary directly with the host loader.
+exec "$CURDIR/ced-grpc" "$@"
--- a/backend/go/crispasr/Makefile
+++ b/backend/go/crispasr/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # CrispASR version (release tag)
 CRISPASR_REPO?=https://github.com/CrispStrobe/CrispASR
-CRISPASR_VERSION?=d745bda4386ae0f9d1d2f23fff8ec95d76428221
+CRISPASR_VERSION?=7a8cb80907341c0204bd0488c1244764f4163883
 SO_TARGET?=libgocrispasr.so

 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
@@ -67,7 +67,7 @@ sources/CrispASR:
 	# it, so ${CMAKE_SOURCE_DIR} is THIS backend dir and the talk-llama sources
 	# aren't found. Rewrite to ${PROJECT_SOURCE_DIR} (the crispasr project root),
 	# which is correct both standalone and as a subproject. Idempotent.
-	sed -i 's#\$${CMAKE_SOURCE_DIR}/examples/talk-llama#\$${PROJECT_SOURCE_DIR}/examples/talk-llama#' sources/CrispASR/src/CMakeLists.txt
+	sed -i.bak 's#\$${CMAKE_SOURCE_DIR}/examples/talk-llama#\$${PROJECT_SOURCE_DIR}/examples/talk-llama#' sources/CrispASR/src/CMakeLists.txt && rm -f sources/CrispASR/src/CMakeLists.txt.bak

 # Detect OS
 UNAME_S := $(shell uname -s)
--- a/backend/go/crispasr/cpp/crispasr_shim.cpp
+++ b/backend/go/crispasr/cpp/crispasr_shim.cpp
@@ -47,6 +47,74 @@ extern "C" void set_abort(int v) {
  g_abort.store(v, std::memory_order_relaxed);
 }

+// --- word-level timestamp accessors ---
+extern "C" {
+int crispasr_session_result_n_words(crispasr_session_result *r, int seg_i);
+const char *crispasr_session_result_word_text(crispasr_session_result *r,
+                                               int seg_i, int word_i);
+int64_t crispasr_session_result_word_t0(crispasr_session_result *r, int seg_i,
+                                         int word_i);
+int64_t crispasr_session_result_word_t1(crispasr_session_result *r, int seg_i,
+                                         int word_i);
+
+// Parakeet-specific word accessors
+int crispasr_parakeet_result_n_words(void *r);
+const char *crispasr_parakeet_result_word_text(void *r, int word_i);
+int64_t crispasr_parakeet_result_word_t0(void *r, int word_i);
+int64_t crispasr_parakeet_result_word_t1(void *r, int word_i);
+}
+
+// Returns the current g_result handle as an opaque pointer; the sibling
+// accessors treat a null g_result as "no result available".
+void *get_result(void) {
+  return g_result;
+}
+
+// Forwards to crispasr_session_result_n_words for segment seg_i of the
+// current result; yields 0 when no result is set.
+int get_word_count(int seg_i) {
+  return g_result ? crispasr_session_result_n_words(g_result, seg_i) : 0;
+}
+
+// Forwards to crispasr_session_result_word_text for word word_i of segment
+// seg_i; yields the empty string when no result is set.
+const char *get_word_text(int seg_i, int word_i) {
+  return g_result ? crispasr_session_result_word_text(g_result, seg_i, word_i)
+                  : "";
+}
+
+// Forwards to crispasr_session_result_word_t0 for word word_i of segment
+// seg_i; yields 0 when no result is set.
+int64_t get_word_t0(int seg_i, int word_i) {
+  return g_result ? crispasr_session_result_word_t0(g_result, seg_i, word_i)
+                  : 0;
+}
+
+// Forwards to crispasr_session_result_word_t1 for word word_i of segment
+// seg_i; yields 0 when no result is set.
+int64_t get_word_t1(int seg_i, int word_i) {
+  return g_result ? crispasr_session_result_word_t1(g_result, seg_i, word_i)
+                  : 0;
+}
+
+// Parakeet-specific word accessors
+// Forwards to crispasr_parakeet_result_n_words; yields 0 when no result is
+// set.
+// NOTE(review): g_result is the same handle given to the
+// crispasr_session_result_* accessors - confirm it is also a valid parakeet
+// result handle before relying on this value.
+int get_parakeet_word_count(void) {
+  return g_result ? crispasr_parakeet_result_n_words(g_result) : 0;
+}
+
+// Forwards to crispasr_parakeet_result_word_text for word word_i; yields the
+// empty string when no result is set.
+// NOTE(review): g_result is the same handle given to the
+// crispasr_session_result_* accessors - confirm it is also a valid parakeet
+// result handle before relying on this value.
+const char *get_parakeet_word_text(int word_i) {
+  return g_result ? crispasr_parakeet_result_word_text(g_result, word_i) : "";
+}
+
+// Forwards to crispasr_parakeet_result_word_t0 for word word_i; yields 0 when
+// no result is set.
+// NOTE(review): g_result is the same handle given to the
+// crispasr_session_result_* accessors - confirm it is also a valid parakeet
+// result handle before relying on this value.
+int64_t get_parakeet_word_t0(int word_i) {
+  return g_result ? crispasr_parakeet_result_word_t0(g_result, word_i) : 0;
+}
+
+// Forwards to crispasr_parakeet_result_word_t1 for word word_i; yields 0 when
+// no result is set.
+// NOTE(review): g_result is the same handle given to the
+// crispasr_session_result_* accessors - confirm it is also a valid parakeet
+// result handle before relying on this value.
+int64_t get_parakeet_word_t1(int word_i) {
+  return g_result ? crispasr_parakeet_result_word_t1(g_result, word_i) : 0;
+}
+
 static void ggml_log_cb(enum ggml_log_level level, const char *log,
                        void *data) {
  const char *level_str;
--- a/backend/go/crispasr/cpp/crispasr_shim.h
+++ b/backend/go/crispasr/cpp/crispasr_shim.h
@@ -20,4 +20,18 @@ float *tts_synthesize(const char *text, int *out_n_samples); // 24kHz mono float
 void tts_free(float *pcm);
 int tts_set_voice(const char *name); // best-effort speaker selection; 0 ok
 int tts_set_voice_file(const char *path, const char *ref_text); // load voice pack (.gguf) or zero-shot clone (.wav + ref_text)
+
+// --- word-level timestamp accessors ---
+// Session-based (works for whisper-like backends)
+void *get_result(void);
+int get_word_count(int seg_i);
+const char *get_word_text(int seg_i, int word_i);
+int64_t get_word_t0(int seg_i, int word_i);
+int64_t get_word_t1(int seg_i, int word_i);
+
+// Parakeet-specific (global word list, no segment index)
+int get_parakeet_word_count(void);
+const char *get_parakeet_word_text(int word_i);
+int64_t get_parakeet_word_t0(int word_i);
+int64_t get_parakeet_word_t1(int word_i);
 }
--- a/backend/go/crispasr/gocrispasr.go
+++ b/backend/go/crispasr/gocrispasr.go
@@ -34,6 +34,18 @@ var (
 	CppTTSFree         func(ptr uintptr)
 	CppTTSSetVoice     func(name string) int
 	CppTTSSetVoiceFile func(path string, refText string) int
+
+	// Word-level timestamp accessors (session-based, per-segment)
+	CppGetWordCount func(segI int) int
+	CppGetWordText  func(segI int, wordI int) string
+	CppGetWordT0    func(segI int, wordI int) int64
+	CppGetWordT1    func(segI int, wordI int) int64
+
+	// Parakeet-specific word accessors (global, no segment index)
+	CppGetParakeetWordCount func() int
+	CppGetParakeetWordText  func(wordI int) string
+	CppGetParakeetWordT0    func(wordI int) int64
+	CppGetParakeetWordT1    func(wordI int) int64
 )

 type CrispASR struct {
@@ -212,6 +224,28 @@ func (w *CrispASR) VAD(req *pb.VADRequest) (pb.VADResponse, error) {
 	}, nil
 }

+// isValidWord reports whether w carries usable speech content. The
+// parakeet-specific word accessors can hand back stale initialisation data
+// (model name, binary blobs) for segments without real speech, so a word is
+// accepted only if all of the following hold:
+//   - its text is non-empty once surrounding whitespace is trimmed,
+//   - the trimmed text has no U+FFFD replacement character,
+//   - Start and End are both non-negative,
+//   - End is strictly greater than Start (positive duration).
+func isValidWord(w *pb.TranscriptWord) bool {
+	text := strings.TrimSpace(w.Text)
+	switch {
+	case text == "":
+		return false
+	case strings.ContainsRune(text, '\uFFFD'):
+		return false
+	case w.Start < 0, w.End < 0, w.End <= w.Start:
+		return false
+	default:
+		return true
+	}
+}
+
 func (w *CrispASR) AudioTranscription(ctx context.Context, opts *pb.TranscriptRequest) (pb.TranscriptResult, error) {
 	if err := ctx.Err(); err != nil {
 		return pb.TranscriptResult{}, status.Error(codes.Canceled, "transcription cancelled")
@@ -290,15 +324,54 @@ func (w *CrispASR) AudioTranscription(ctx context.Context, opts *pb.TranscriptRe
 		// IDs, so Tokens is left empty.
 		txt := strings.ToValidUTF8(strings.Clone(CppGetSegmentText(i)), "<22>")

+		// Populate word-level timestamps. Try session-based functions first
+		// (per-segment); fall back to parakeet-specific functions (global word
+		// list with no segment index — only populated on the first segment to
+		// avoid duplication).
+		words := []*pb.TranscriptWord{}
+		wordCount := CppGetWordCount(i)
+		if wordCount == 0 && i == 0 {
+			wordCount = CppGetParakeetWordCount()
+			for j := 0; j < wordCount; j++ {
+				w := &pb.TranscriptWord{
+					Start: CppGetParakeetWordT0(j) * (10000000),
+					End:   CppGetParakeetWordT1(j) * (10000000),
+					Text:  strings.ToValidUTF8(strings.Clone(CppGetParakeetWordText(j)), "<22>"),
+				}
+				if isValidWord(w) {
+					words = append(words, w)
+				}
+			}
+		} else {
+			for j := 0; j < wordCount; j++ {
+				w := &pb.TranscriptWord{
+					Start: CppGetWordT0(i, j) * (10000000),
+					End:   CppGetWordT1(i, j) * (10000000),
+					Text:  strings.ToValidUTF8(strings.Clone(CppGetWordText(i, j)), "<22>"),
+				}
+				if isValidWord(w) {
+					words = append(words, w)
+				}
+			}
+		}
+
+		// Skip empty segments with no recognisable content (e.g. trailing
+		// silence segments that parakeet emits with stale init data).
+		trimmed := strings.TrimSpace(txt)
+		if trimmed == "" && len(words) == 0 {
+			continue
+		}
+
 		segment := &pb.TranscriptSegment{
 			Id:    int32(i),
 			Text:  txt,
 			Start: s, End: t,
+			Words: words,
 		}

 		segments = append(segments, segment)

-		text += " " + strings.TrimSpace(txt)
+		text += " " + trimmed
 	}

 	return pb.TranscriptResult{
@@ -390,13 +463,20 @@ func (w *CrispASR) AudioTranscriptionStream(ctx context.Context, opts *pb.Transc
 		s := CppGetSegmentStart(i) * 10000000
 		t := CppGetSegmentEnd(i) * 10000000
 		txt := strings.ToValidUTF8(strings.Clone(CppGetSegmentText(i)), "<22>")
+
+		// Skip empty segments (e.g. trailing silence that parakeet emits
+		// with stale init data).
+		trimmed := strings.TrimSpace(txt)
+		if trimmed == "" && s == t {
+			continue
+		}
+
 		segments = append(segments, &pb.TranscriptSegment{
 			Id:    int32(i),
 			Text:  txt,
 			Start: s, End: t,
 		})

-		trimmed := strings.TrimSpace(txt)
 		if trimmed == "" {
 			continue
 		}
--- a/backend/go/crispasr/main.go
+++ b/backend/go/crispasr/main.go
@@ -44,6 +44,14 @@ func main() {
 		{&CppTTSFree, "tts_free"},
 		{&CppTTSSetVoice, "tts_set_voice"},
 		{&CppTTSSetVoiceFile, "tts_set_voice_file"},
+		{&CppGetWordCount, "get_word_count"},
+		{&CppGetWordText, "get_word_text"},
+		{&CppGetWordT0, "get_word_t0"},
+		{&CppGetWordT1, "get_word_t1"},
+		{&CppGetParakeetWordCount, "get_parakeet_word_count"},
+		{&CppGetParakeetWordText, "get_parakeet_word_text"},
+		{&CppGetParakeetWordT0, "get_parakeet_word_t0"},
+		{&CppGetParakeetWordT1, "get_parakeet_word_t1"},
 	}

 	for _, lf := range libFuncs {
--- a/backend/go/depth-anything-cpp/Makefile
+++ b/backend/go/depth-anything-cpp/Makefile
@@ -8,11 +8,13 @@ JOBS?=$(shell nproc --ignore=1)

 # depth-anything.cpp. Pin to a specific commit for a stable build; a squash
 # merge upstream can orphan a branch, so the native version is pinned by SHA.
-# This SHA adds the nested two-file metric C-API (abi_version 4,
-# da_capi_load_nested) required by the depth-anything-3-nested gallery model;
-# tag it (e.g. v0.1.3) upstream to keep the SHA alive.
+# This SHA adds the Depth Anything V2 engine + C-API routing (depth-only,
+# relative + metric) on top of the nested two-file metric C-API (abi_version 4,
+# da_capi_load_nested) required by the depth-anything-3-nested gallery model.
+# It is kept alive by the upstream tag da2-support (survives a squash-merge);
+# repoint to the master merge commit once mudler/depth-anything.cpp PR #1 lands.
 DEPTHANYTHING_REPO?=https://github.com/mudler/depth-anything.cpp.git
-DEPTHANYTHING_VERSION?=cce5edc395fd1843806093d7ccc0c8b0d0b97b72
+DEPTHANYTHING_VERSION?=f4e17dea695dd12ae76bea98ba58030996b98118

 ifeq ($(NATIVE),false)
 	CMAKE_ARGS+=-DGGML_NATIVE=OFF
--- a/backend/go/omnivoice-cpp/Makefile
+++ b/backend/go/omnivoice-cpp/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # omnivoice.cpp version
 OMNIVOICE_REPO?=https://github.com/ServeurpersoCom/omnivoice.cpp
-OMNIVOICE_VERSION?=2603355a5dfacae5cfc33531d5d0933221843509
+OMNIVOICE_VERSION?=96d30169afd5e6bb3fd6a0e9be0eb505bfe81fcd
 SO_TARGET?=libgomnivoicecpp.so

 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
--- a/backend/go/parakeet-cpp/Makefile
+++ b/backend/go/parakeet-cpp/Makefile
@@ -1,6 +1,6 @@
 # parakeet-cpp backend Makefile.
 #
-# Upstream pin lives below as PARAKEET_VERSION?=92a5f0306be354c109150fe58ae4cc4f8a21ca45
+# Upstream pin lives below as PARAKEET_VERSION?=db755a78d39f789bb7d4e3935158a9e8105dbe36
 # (.github/bump_deps.sh) can find and update it - matches the
 # whisper.cpp / ds4 / vibevoice-cpp convention.
 #
@@ -15,7 +15,7 @@
 # That's what the L0 smoke test uses. The default target below does the
 # proper clone-at-pin + cmake build so CI doesn't need a side-checkout.

-PARAKEET_VERSION?=92a5f0306be354c109150fe58ae4cc4f8a21ca45
+PARAKEET_VERSION?=db755a78d39f789bb7d4e3935158a9e8105dbe36
 PARAKEET_REPO?=https://github.com/mudler/parakeet.cpp

 GOCMD?=go
--- a/backend/go/parakeet-cpp/package.sh
+++ b/backend/go/parakeet-cpp/package.sh
@@ -1,23 +1,68 @@
 #!/bin/bash
 #
-# L0 packaging stub: copy the binary, run.sh and libparakeet.so* into
-# package/. The full ldd walk (libc, libstdc++, libgomp, GPU runtimes,
-# arch detection) lands in L3, mirroring backend/go/whisper/package.sh.
+# Bundle the parakeet-cpp-grpc binary, libparakeet.so, the core runtime
+# libs (libc/libstdc++/libgomp + ld.so) and the GPU runtime for the active
+# BUILD_TYPE so the package is self-contained. Mirrors
+# backend/go/whisper/package.sh; run.sh routes the (CGO_ENABLED=0) binary
+# through lib/ld.so so the packaged libc is used instead of the host's.

 set -e

 CURDIR=$(dirname "$(realpath "$0")")
+REPO_ROOT="${CURDIR}/../../.."

 mkdir -p "$CURDIR/package/lib"

 cp -avf "$CURDIR/parakeet-cpp-grpc" "$CURDIR/package/"
 cp -avf "$CURDIR/run.sh" "$CURDIR/package/"

-# libparakeet.so + any soname symlinks (libparakeet.so.X, libparakeet.so.X.Y).
+# libparakeet.so + any soname symlinks (libparakeet.so.X[.Y]). purego.Dlopen
+# resolves it via LD_LIBRARY_PATH, which run.sh points at lib/.
 cp -avf "$CURDIR"/libparakeet.so* "$CURDIR/package/lib/" 2>/dev/null || {
 	echo "ERROR: libparakeet.so not found in $CURDIR, run 'make' first" >&2
 	exit 1
 }

-echo "L0 package layout (full ldd walk lands in L3):"
+# Detect architecture and copy the core runtime libs libparakeet.so links
+# against, plus the matching dynamic loader as lib/ld.so.
+if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
+    echo "Detected x86_64 architecture, copying x86_64 libraries..."
+    cp -arfLv /lib64/ld-linux-x86-64.so.2 "$CURDIR/package/lib/ld.so"
+    cp -arfLv /lib/x86_64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6"
+    cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1"
+    cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6"
+    cp -arfLv /lib/x86_64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6"
+    cp -arfLv /lib/x86_64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1"
+    cp -arfLv /lib/x86_64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2"
+    cp -arfLv /lib/x86_64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1"
+    cp -arfLv /lib/x86_64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0"
+elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
+    echo "Detected ARM64 architecture, copying ARM64 libraries..."
+    cp -arfLv /lib/ld-linux-aarch64.so.1 "$CURDIR/package/lib/ld.so"
+    cp -arfLv /lib/aarch64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6"
+    cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1"
+    cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6"
+    cp -arfLv /lib/aarch64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6"
+    cp -arfLv /lib/aarch64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1"
+    cp -arfLv /lib/aarch64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2"
+    cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1"
+    cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0"
+elif [ "$(uname -s)" = "Darwin" ]; then
+    echo "Detected Darwin"
+else
+    echo "Error: Could not detect architecture"
+    exit 1
+fi
+
+# Package GPU libraries (CUDA/ROCm/Intel/Vulkan loader + ICDs + drivers)
+# based on BUILD_TYPE so the backend can reach the GPU without the runtime
+# base image shipping those drivers.
+GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
+if [ -f "$GPU_LIB_SCRIPT" ]; then
+    echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
+    source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib"
+    package_gpu_libs
+fi
+
+echo "Packaging completed successfully"
 ls -liah "$CURDIR/package/" "$CURDIR/package/lib/"
--- a/backend/go/qwen3-tts-cpp/Makefile
+++ b/backend/go/qwen3-tts-cpp/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # qwentts.cpp version
 QWEN3TTS_REPO?=https://github.com/ServeurpersoCom/qwentts.cpp
-QWEN3TTS_CPP_VERSION?=0bf4a18b22e8bb8718d95294e9f7f45c0d4270a4
+QWEN3TTS_CPP_VERSION?=4536dcdce27c3764a93a06d6bf64026b124962f5
 SO_TARGET?=libgoqwen3ttscpp.so

 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
--- a/backend/go/stablediffusion-ggml/Makefile
+++ b/backend/go/stablediffusion-ggml/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # stablediffusion.cpp (ggml)
 STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
-STABLEDIFFUSION_GGML_VERSION?=7f0e728b7d42f2490dfa5dd9539082d904f2f6b2
+STABLEDIFFUSION_GGML_VERSION?=b12098f5d09fc83da36e65c784f7bdb16a5a5ebf

 CMAKE_ARGS+=-DGGML_MAX_NAME=128

--- a/backend/go/whisper/Makefile
+++ b/backend/go/whisper/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp
-WHISPER_CPP_VERSION?=86c40c3bd6fc86f1187fb751d111b49e0fc18e84
+WHISPER_CPP_VERSION?=5ed76e9a079962f1c85cfce44edd325c27ef1f97
 SO_TARGET?=libgowhisper.so

 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
--- a/backend/index.yaml
+++ b/backend/index.yaml
@@ -178,6 +178,37 @@
    nvidia-cuda-12: "cuda12-parakeet-cpp"
    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-parakeet-cpp"
    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-parakeet-cpp"
+- &ced
+  name: "ced"
+  alias: "ced"
+  license: mit
+  icon: https://avatars.githubusercontent.com/u/95302084
+  description: |
+    CED sound-event classification / audio tagging (527-class AudioSet).
+    ced.cpp is a C++/ggml port that performs audio tagging over the AudioSet
+    taxonomy, exposed through the SoundDetection gRPC rpc and the
+    /v1/audio/classification REST endpoint. It runs on CPU, NVIDIA CUDA,
+    AMD ROCm/HIP, Intel SYCL, Vulkan and NVIDIA Jetson (L4T) targets.
+  urls:
+    - https://github.com/mudler/ced.cpp
+  tags:
+    - audio-classification
+    - CPU
+    - GPU
+    - CUDA
+    - HIP
+  capabilities:
+    default: "cpu-ced"
+    nvidia: "cuda12-ced"
+    intel: "intel-sycl-f16-ced"
+    metal: "metal-ced"
+    amd: "rocm-ced"
+    vulkan: "vulkan-ced"
+    nvidia-l4t: "nvidia-l4t-arm64-ced"
+    nvidia-cuda-13: "cuda13-ced"
+    nvidia-cuda-12: "cuda12-ced"
+    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-ced"
+    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-ced"
 - &voxtral
  name: "voxtral"
  alias: "voxtral"
@@ -2650,6 +2681,121 @@
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-parakeet-cpp"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-13-parakeet-cpp
+## ced
+- !!merge <<: *ced
+  name: "ced-development"
+  capabilities:
+    default: "cpu-ced-development"
+    nvidia: "cuda12-ced-development"
+    intel: "intel-sycl-f16-ced-development"
+    metal: "metal-ced-development"
+    amd: "rocm-ced-development"
+    vulkan: "vulkan-ced-development"
+    nvidia-l4t: "nvidia-l4t-arm64-ced-development"
+    nvidia-cuda-13: "cuda13-ced-development"
+    nvidia-cuda-12: "cuda12-ced-development"
+    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-ced-development"
+    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-ced-development"
+- !!merge <<: *ced
+  name: "nvidia-l4t-arm64-ced"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-arm64-ced"
+  mirrors:
+    - localai/localai-backends:latest-nvidia-l4t-arm64-ced
+- !!merge <<: *ced
+  name: "nvidia-l4t-arm64-ced-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-arm64-ced"
+  mirrors:
+    - localai/localai-backends:master-nvidia-l4t-arm64-ced
+- !!merge <<: *ced
+  name: "cuda13-nvidia-l4t-arm64-ced"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-ced"
+  mirrors:
+    - localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-ced
+- !!merge <<: *ced
+  name: "cuda13-nvidia-l4t-arm64-ced-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-ced"
+  mirrors:
+    - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-ced
+- !!merge <<: *ced
+  name: "cpu-ced"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-ced"
+  mirrors:
+    - localai/localai-backends:latest-cpu-ced
+- !!merge <<: *ced
+  name: "cpu-ced-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-cpu-ced"
+  mirrors:
+    - localai/localai-backends:master-cpu-ced
+- !!merge <<: *ced
+  name: "metal-ced"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-ced"
+  mirrors:
+    - localai/localai-backends:latest-metal-darwin-arm64-ced
+- !!merge <<: *ced
+  name: "metal-ced-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-ced"
+  mirrors:
+    - localai/localai-backends:master-metal-darwin-arm64-ced
+- !!merge <<: *ced
+  name: "cuda12-ced"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-ced"
+  mirrors:
+    - localai/localai-backends:latest-gpu-nvidia-cuda-12-ced
+- !!merge <<: *ced
+  name: "cuda12-ced-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-ced"
+  mirrors:
+    - localai/localai-backends:master-gpu-nvidia-cuda-12-ced
+- !!merge <<: *ced
+  name: "rocm-ced"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-ced"
+  mirrors:
+    - localai/localai-backends:latest-gpu-rocm-hipblas-ced
+- !!merge <<: *ced
+  name: "rocm-ced-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-ced"
+  mirrors:
+    - localai/localai-backends:master-gpu-rocm-hipblas-ced
+- !!merge <<: *ced
+  name: "intel-sycl-f32-ced"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-ced"
+  mirrors:
+    - localai/localai-backends:latest-gpu-intel-sycl-f32-ced
+- !!merge <<: *ced
+  name: "intel-sycl-f32-ced-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f32-ced"
+  mirrors:
+    - localai/localai-backends:master-gpu-intel-sycl-f32-ced
+- !!merge <<: *ced
+  name: "intel-sycl-f16-ced"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-ced"
+  mirrors:
+    - localai/localai-backends:latest-gpu-intel-sycl-f16-ced
+- !!merge <<: *ced
+  name: "intel-sycl-f16-ced-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f16-ced"
+  mirrors:
+    - localai/localai-backends:master-gpu-intel-sycl-f16-ced
+- !!merge <<: *ced
+  name: "vulkan-ced"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-vulkan-ced"
+  mirrors:
+    - localai/localai-backends:latest-gpu-vulkan-ced
+- !!merge <<: *ced
+  name: "vulkan-ced-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-vulkan-ced"
+  mirrors:
+    - localai/localai-backends:master-gpu-vulkan-ced
+- !!merge <<: *ced
+  name: "cuda13-ced"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-ced"
+  mirrors:
+    - localai/localai-backends:latest-gpu-nvidia-cuda-13-ced
+- !!merge <<: *ced
+  name: "cuda13-ced-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-ced"
+  mirrors:
+    - localai/localai-backends:master-gpu-nvidia-cuda-13-ced
 ## stablediffusion-ggml
 - !!merge <<: *stablediffusionggml
  name: "cpu-stablediffusion-ggml"
--- a/backend/python/diffusers/requirements-cpu.txt
+++ b/backend/python/diffusers/requirements-cpu.txt
@@ -1,7 +1,7 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
-git+https://github.com/huggingface/diffusers
+diffusers==0.38.0
 opencv-python
-transformers
+transformers==4.57.6
 torchvision==0.22.1
 accelerate
 git+https://github.com/xhinker/sd_embed
@@ -10,9 +10,15 @@ sentencepiece
 torch==2.7.1
 optimum-quanto
 ftfy
-# TODO: re-add compel once it supports transformers >= 5.
-# Tracking: https://github.com/damian0815/compel/pull/129
-#           https://github.com/damian0815/compel/issues/128
-# compel currently pins transformers~=4.25, which forced pip into multi-hour
-# resolver backtracking storms in CI. backend.py imports it lazily and gates
-# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
+# diffusers and transformers are pinned together on purpose. transformers v5
+# restructured CLIPTextModel and dropped the `.text_model` attribute, which
+# breaks single-file Stable Diffusion loading on every released diffusers
+# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
+# main via git froze whichever broken pair existed at image-build time. Pin the
+# last known-good released pair so builds are reproducible and can't drift into
+# the broken window. See https://github.com/mudler/LocalAI/issues/9979
+#
+# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
+# with this pin and previously forced pip into multi-hour resolver backtracking
+# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
+# the import succeeding, so dropping it here is safe.
--- a/backend/python/diffusers/requirements-cublas12.txt
+++ b/backend/python/diffusers/requirements-cublas12.txt
@@ -1,7 +1,7 @@
 --extra-index-url https://download.pytorch.org/whl/cu121
-git+https://github.com/huggingface/diffusers
+diffusers==0.38.0
 opencv-python
-transformers
+transformers==4.57.6
 torchvision
 accelerate
 git+https://github.com/xhinker/sd_embed
@@ -10,9 +10,15 @@ sentencepiece
 torch
 ftfy
 optimum-quanto
-# TODO: re-add compel once it supports transformers >= 5.
-# Tracking: https://github.com/damian0815/compel/pull/129
-#           https://github.com/damian0815/compel/issues/128
-# compel currently pins transformers~=4.25, which forced pip into multi-hour
-# resolver backtracking storms in CI. backend.py imports it lazily and gates
-# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
+# diffusers and transformers are pinned together on purpose. transformers v5
+# restructured CLIPTextModel and dropped the `.text_model` attribute, which
+# breaks single-file Stable Diffusion loading on every released diffusers
+# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
+# main via git froze whichever broken pair existed at image-build time. Pin the
+# last known-good released pair so builds are reproducible and can't drift into
+# the broken window. See https://github.com/mudler/LocalAI/issues/9979
+#
+# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
+# with this pin and previously forced pip into multi-hour resolver backtracking
+# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
+# the import succeeding, so dropping it here is safe.
--- a/backend/python/diffusers/requirements-cublas13.txt
+++ b/backend/python/diffusers/requirements-cublas13.txt
@@ -1,7 +1,7 @@
 --extra-index-url https://download.pytorch.org/whl/cu130
-git+https://github.com/huggingface/diffusers
+diffusers==0.38.0
 opencv-python
-transformers
+transformers==4.57.6
 torchvision
 accelerate
 git+https://github.com/xhinker/sd_embed
@@ -10,9 +10,15 @@ sentencepiece
 torch
 ftfy
 optimum-quanto
-# TODO: re-add compel once it supports transformers >= 5.
-# Tracking: https://github.com/damian0815/compel/pull/129
-#           https://github.com/damian0815/compel/issues/128
-# compel currently pins transformers~=4.25, which forced pip into multi-hour
-# resolver backtracking storms in CI. backend.py imports it lazily and gates
-# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
+# diffusers and transformers are pinned together on purpose. transformers v5
+# restructured CLIPTextModel and dropped the `.text_model` attribute, which
+# breaks single-file Stable Diffusion loading on every released diffusers
+# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
+# main via git froze whichever broken pair existed at image-build time. Pin the
+# last known-good released pair so builds are reproducible and can't drift into
+# the broken window. See https://github.com/mudler/LocalAI/issues/9979
+#
+# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
+# with this pin and previously forced pip into multi-hour resolver backtracking
+# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
+# the import succeeding, so dropping it here is safe.
--- a/backend/python/diffusers/requirements-hipblas.txt
+++ b/backend/python/diffusers/requirements-hipblas.txt
@@ -1,17 +1,23 @@
 --extra-index-url https://download.pytorch.org/whl/rocm7.0
 torch==2.10.0+rocm7.0
 torchvision==0.25.0+rocm7.0
-git+https://github.com/huggingface/diffusers
+diffusers==0.38.0
 opencv-python
-transformers
+transformers==4.57.6
 accelerate
 peft
 sentencepiece
 optimum-quanto
 ftfy
-# TODO: re-add compel once it supports transformers >= 5.
-# Tracking: https://github.com/damian0815/compel/pull/129
-#           https://github.com/damian0815/compel/issues/128
-# compel currently pins transformers~=4.25, which forced pip into multi-hour
-# resolver backtracking storms in CI. backend.py imports it lazily and gates
-# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
+# diffusers and transformers are pinned together on purpose. transformers v5
+# restructured CLIPTextModel and dropped the `.text_model` attribute, which
+# breaks single-file Stable Diffusion loading on every released diffusers
+# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
+# main via git froze whichever broken pair existed at image-build time. Pin the
+# last known-good released pair so builds are reproducible and can't drift into
+# the broken window. See https://github.com/mudler/LocalAI/issues/9979
+#
+# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
+# with this pin and previously forced pip into multi-hour resolver backtracking
+# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
+# the import succeeding, so dropping it here is safe.
--- a/backend/python/diffusers/requirements-intel.txt
+++ b/backend/python/diffusers/requirements-intel.txt
@@ -3,18 +3,24 @@ torch
 torchvision
 optimum[openvino]
 setuptools
-git+https://github.com/huggingface/diffusers
+diffusers==0.38.0
 opencv-python
-transformers
+transformers==4.57.6
 accelerate
 git+https://github.com/xhinker/sd_embed
 peft
 sentencepiece
 optimum-quanto
 ftfy
-# TODO: re-add compel once it supports transformers >= 5.
-# Tracking: https://github.com/damian0815/compel/pull/129
-#           https://github.com/damian0815/compel/issues/128
-# compel currently pins transformers~=4.25, which forced pip into multi-hour
-# resolver backtracking storms in CI. backend.py imports it lazily and gates
-# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
+# diffusers and transformers are pinned together on purpose. transformers v5
+# restructured CLIPTextModel and dropped the `.text_model` attribute, which
+# breaks single-file Stable Diffusion loading on every released diffusers
+# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
+# main via git froze whichever broken pair existed at image-build time. Pin the
+# last known-good released pair so builds are reproducible and can't drift into
+# the broken window. See https://github.com/mudler/LocalAI/issues/9979
+#
+# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
+# with this pin and previously forced pip into multi-hour resolver backtracking
+# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
+# the import succeeding, so dropping it here is safe.
--- a/backend/python/diffusers/requirements-l4t12.txt
+++ b/backend/python/diffusers/requirements-l4t12.txt
@@ -1,7 +1,7 @@
 --extra-index-url https://pypi.jetson-ai-lab.io/jp6/cu129/
 torch
-git+https://github.com/huggingface/diffusers
-transformers
+diffusers==0.38.0
+transformers==4.57.6
 accelerate
 peft
 optimum-quanto
@@ -9,9 +9,15 @@ numpy<2
 sentencepiece
 torchvision
 ftfy
-# TODO: re-add compel once it supports transformers >= 5.
-# Tracking: https://github.com/damian0815/compel/pull/129
-#           https://github.com/damian0815/compel/issues/128
-# compel currently pins transformers~=4.25, which forced pip into multi-hour
-# resolver backtracking storms in CI. backend.py imports it lazily and gates
-# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
+# diffusers and transformers are pinned together on purpose. transformers v5
+# restructured CLIPTextModel and dropped the `.text_model` attribute, which
+# breaks single-file Stable Diffusion loading on every released diffusers
+# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
+# main via git froze whichever broken pair existed at image-build time. Pin the
+# last known-good released pair so builds are reproducible and can't drift into
+# the broken window. See https://github.com/mudler/LocalAI/issues/9979
+#
+# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
+# with this pin and previously forced pip into multi-hour resolver backtracking
+# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
+# the import succeeding, so dropping it here is safe.
--- a/backend/python/diffusers/requirements-l4t13.txt
+++ b/backend/python/diffusers/requirements-l4t13.txt
@@ -1,7 +1,7 @@
 --extra-index-url https://download.pytorch.org/whl/cu130
 torch
-git+https://github.com/huggingface/diffusers
-transformers
+diffusers==0.38.0
+transformers==4.57.6
 accelerate
 peft
 optimum-quanto
@@ -10,9 +10,15 @@ sentencepiece
 torchvision
 ftfy
 chardet
-# TODO: re-add compel once it supports transformers >= 5.
-# Tracking: https://github.com/damian0815/compel/pull/129
-#           https://github.com/damian0815/compel/issues/128
-# compel currently pins transformers~=4.25, which forced pip into multi-hour
-# resolver backtracking storms in CI. backend.py imports it lazily and gates
-# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
+# diffusers and transformers are pinned together on purpose. transformers v5
+# restructured CLIPTextModel and dropped the `.text_model` attribute, which
+# breaks single-file Stable Diffusion loading on every released diffusers
+# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
+# main via git froze whichever broken pair existed at image-build time. Pin the
+# last known-good released pair so builds are reproducible and can't drift into
+# the broken window. See https://github.com/mudler/LocalAI/issues/9979
+#
+# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
+# with this pin and previously forced pip into multi-hour resolver backtracking
+# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
+# the import succeeding, so dropping it here is safe.
--- a/backend/python/diffusers/requirements-mps.txt
+++ b/backend/python/diffusers/requirements-mps.txt
@@ -1,16 +1,22 @@
 torch==2.7.1
 torchvision==0.22.1
-git+https://github.com/huggingface/diffusers
+diffusers==0.38.0
 opencv-python
-transformers
+transformers==4.57.6
 accelerate
 peft
 sentencepiece
 optimum-quanto
 ftfy
-# TODO: re-add compel once it supports transformers >= 5.
-# Tracking: https://github.com/damian0815/compel/pull/129
-#           https://github.com/damian0815/compel/issues/128
-# compel currently pins transformers~=4.25, which forced pip into multi-hour
-# resolver backtracking storms in CI. backend.py imports it lazily and gates
-# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
+# diffusers and transformers are pinned together on purpose. transformers v5
+# restructured CLIPTextModel and dropped the `.text_model` attribute, which
+# breaks single-file Stable Diffusion loading on every released diffusers
+# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
+# main via git froze whichever broken pair existed at image-build time. Pin the
+# last known-good released pair so builds are reproducible and can't drift into
+# the broken window. See https://github.com/mudler/LocalAI/issues/9979
+#
+# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
+# with this pin and previously forced pip into multi-hour resolver backtracking
+# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
+# the import succeeding, so dropping it here is safe.
--- a/backend/python/nemo/backend.py
+++ b/backend/python/nemo/backend.py
@@ -84,6 +84,135 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):

        return backend_pb2.Result(message="Model loaded successfully", success=True)

+    def _get_stride_seconds(self):
+        """Return the seconds-per-frame stride of the loaded model (0.08 if it cannot be derived).
+
+        stride = preprocessor window_stride (default 0.01) * encoder subsampling_factor (default 8)
+        """
+        try:
+            preprocessor = self.model.preprocessor
+            window_stride = preprocessor._cfg.get('window_stride', 0.01)
+            subsampling_factor = getattr(self.model.encoder, 'subsampling_factor', 8)
+            return window_stride * subsampling_factor
+        except (AttributeError, KeyError, TypeError) as err:  # config not shaped as expected
+            print(
+                f"Warning: could not compute stride from model config ({err}), "
+                f"falling back to 0.08s/frame",
+                file=sys.stderr,
+            )
+            return 0.08  # same value as the defaults above: 0.01 * 8
+
+    def _build_segments_with_words(self, hypothesis, stride, timestamp_granularities=None):
+        """Build TranscriptSegments (times in ns = offset * stride * 1e9) from a NeMo hypothesis.
+
+        Returns [] if `hypothesis` is falsy, has no dict `.timestamp` (e.g. a str), or no words.
+          - "word" in timestamp_granularities: one segment per word, with a single TranscriptWord
+          - otherwise ("segment"): merge consecutive words, splitting where the inter-word gap
+            is >= 3x the median positive gap (clamped to 0.3-2.0s; 0.5s when no positive gaps).
+        """
+        if not hypothesis or not isinstance(getattr(hypothesis, 'timestamp', None), dict):  # str results have no .timestamp
+            return []
+
+        word_offsets = hypothesis.timestamp.get('word', [])
+        if not word_offsets:
+            return []
+
+        granularities = list(timestamp_granularities) if timestamp_granularities else []
+        granularity = "word" if "word" in granularities else "segment"
+
+        # Flatten NeMo word offsets (frame indices) into dicts with nanosecond start/end
+        transcript_words = []
+        for wo in word_offsets:
+            word_text = wo.get('word', '')
+            if not word_text:
+                continue
+            start_offset = wo.get('start_offset', 0)
+            end_offset = wo.get('end_offset', start_offset)
+            start_ns = int(start_offset * stride * 1_000_000_000)
+            end_ns = int(end_offset * stride * 1_000_000_000)
+            transcript_words.append({
+                'text': word_text,
+                'start': start_ns,
+                'end': end_ns,
+            })
+
+        if not transcript_words:
+            return []
+
+        if granularity == "word":
+            # One segment per word; segment id is the word's index
+            result = []
+            for idx, tw in enumerate(transcript_words):
+                word = backend_pb2.TranscriptWord(
+                    start=tw['start'], end=tw['end'], text=tw['text']
+                )
+                result.append(backend_pb2.TranscriptSegment(
+                    id=idx,
+                    start=tw['start'],
+                    end=tw['end'],
+                    text=tw['text'],
+                    words=[word],
+                ))
+            return result
+
+        # segment mode — merge at word-level time-gap boundaries
+        # Gap threshold: median positive inter-word gap * 3, clamped to [0.3, 2.0]s
+        gaps = []
+        for i in range(1, len(transcript_words)):
+            gap = (transcript_words[i]['start'] - transcript_words[i - 1]['end']) / 1_000_000_000
+            if gap > 0:
+                gaps.append(gap)
+        if gaps:
+            gaps.sort()
+            median_gap = gaps[len(gaps) // 2]
+            threshold_ns = int(max(0.3, min(median_gap * 3, 2.0)) * 1_000_000_000)
+        else:
+            threshold_ns = int(0.5 * 1_000_000_000)
+
+        result = []
+        buf_words = []  # list of TranscriptWord protobuf
+        buf_start = None
+        buf_end = 0
+        buf_text = []
+        prev_end = None
+
+        for tw in transcript_words:
+            # Gap since the previous word reached the threshold: close the current segment
+            if prev_end is not None and (tw['start'] - prev_end) >= threshold_ns and buf_text:
+                seg_text = ' '.join(buf_text)
+                result.append(backend_pb2.TranscriptSegment(
+                    id=len(result),
+                    start=buf_start,
+                    end=buf_end,
+                    text=seg_text,
+                    words=list(buf_words),
+                ))
+                buf_words = []
+                buf_text = []
+                buf_start = None
+
+            if buf_start is None:
+                buf_start = tw['start']
+            buf_end = tw['end']
+            buf_text.append(tw['text'])
+            buf_words.append(backend_pb2.TranscriptWord(
+                start=tw['start'], end=tw['end'], text=tw['text']
+            ))
+            prev_end = tw['end']
+
+        # Flush the words still buffered after the last word as the final segment
+        if buf_text and buf_start is not None:
+            seg_text = ' '.join(buf_text)
+            result.append(backend_pb2.TranscriptSegment(
+                id=len(result),
+                start=buf_start,
+                end=buf_end,
+                text=seg_text,
+                words=list(buf_words),
+            ))
+
+        return result
+
    def AudioTranscription(self, request, context):
        result_segments = []
        text = ""
@@ -93,26 +222,67 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                print(f"Error: Audio file not found: {audio_path}", file=sys.stderr)
                return backend_pb2.TranscriptResult(segments=[], text="")

-            # NEMO's transcribe method accepts a list of audio paths and returns a list of transcripts
-            results = self.model.transcribe([audio_path])
+            # Determine requested timestamp granularity
+            timestamp_granularities = list(request.timestamp_granularities) if request.timestamp_granularities else []
+            want_timestamps = bool(timestamp_granularities)

-            if not results or len(results) == 0:
-                return backend_pb2.TranscriptResult(segments=[], text="")
+            if want_timestamps:
+                # Request timestamps from NeMo.
+                # timestamps=True forces NeMo to return Hypothesis objects with
+                # the timestamp dict populated, so we omit return_hypotheses to
+                # let NeMo choose the correct return type.
+                results = self.model.transcribe([audio_path], timestamps=True)

-            # Get the transcript text from the first result.
-            # CTC models return List[str], TDT/RNNT models return List[Hypothesis]
-            # where the actual text lives in Hypothesis.text.
-            result = results[0]
-            if isinstance(result, str):
-                text = result
+                if results and len(results) > 0:
+                    hypotheses = results[0] if isinstance(results[0], list) else results
+                    if hypotheses and len(hypotheses) > 0:
+                        hypothesis = hypotheses[0]
+
+                        # Hypothesis object should have .timestamp populated
+                        if not hasattr(hypothesis, 'timestamp') or not isinstance(hypothesis.timestamp, dict):
+                            print(
+                                "Warning: timestamps were requested but NeMo did not return "
+                                "Hypothesis objects; falling back to untimestamped output",
+                                file=sys.stderr,
+                            )
+
+                        # Extract text
+                        if hasattr(hypothesis, 'text'):
+                            text = hypothesis.text or ""
+                        elif isinstance(hypothesis, str):
+                            text = hypothesis
+
+                        # Build segments with word-level timestamps
+                        stride = self._get_stride_seconds()
+                        result_segments = self._build_segments_with_words(
+                            hypothesis, stride, timestamp_granularities
+                        )
+
+                        # If no word offsets but we have text, fall back to single segment
+                        if not result_segments and text:
+                            result_segments.append(backend_pb2.TranscriptSegment(
+                                id=0, start=0, end=0, text=text
+                            ))
            else:
-                text = getattr(result, 'text', None) or ""
+                # Simple transcription without timestamps
+                # NEMO's transcribe method accepts a list of audio paths and returns a list of transcripts
+                results = self.model.transcribe([audio_path])

-            if text:
-                # Create a single segment with the full transcription
-                result_segments.append(backend_pb2.TranscriptSegment(
-                    id=0, start=0, end=0, text=text
-                ))
+                if results and len(results) > 0:
+                    # Get the transcript text from the first result.
+                    # CTC models return List[str], TDT/RNNT models return List[Hypothesis]
+                    # where the actual text lives in Hypothesis.text.
+                    result = results[0]
+                    if isinstance(result, str):
+                        text = result
+                    else:
+                        text = getattr(result, 'text', None) or ""
+
+                    if text:
+                        # Create a single segment with the full transcription
+                        result_segments.append(backend_pb2.TranscriptSegment(
+                            id=0, start=0, end=0, text=text
+                        ))

        except Exception as err:
            print(f"Error in AudioTranscription: {err}", file=sys.stderr)
--- a/backend/python/trl/backend.py
+++ b/backend/python/trl/backend.py
@@ -309,6 +309,10 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):

        dataset_split = request.dataset_split or "train"
        if os.path.exists(request.dataset_source):
+            _allowed_dir = os.path.realpath(os.path.abspath(os.environ.get("LOCALAI_DATASET_DIR", os.getcwd())))
+            _real_path = os.path.realpath(os.path.abspath(request.dataset_source))
+            if not (_real_path == _allowed_dir or _real_path.startswith(_allowed_dir + os.sep)):
+                raise ValueError("Dataset source path is outside the allowed directory")
            if request.dataset_source.endswith('.json') or request.dataset_source.endswith('.jsonl'):
                dataset = load_dataset("json", data_files=request.dataset_source, split=dataset_split)
            elif request.dataset_source.endswith('.csv'):
@@ -687,6 +691,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
    def ExportModel(self, request, context):
        export_format = request.export_format or "lora"
        output_path = request.output_path
+        _allowed_output_dir = os.path.realpath(os.path.abspath(os.environ.get("LOCALAI_OUTPUT_DIR", os.getcwd())))
+        _real_output_path = os.path.realpath(os.path.abspath(output_path))
+        if not (_real_output_path == _allowed_output_dir or _real_output_path.startswith(_allowed_output_dir + os.sep)):
+            raise ValueError("Output path is outside the allowed directory")
+        output_path = _real_output_path
        checkpoint_path = request.checkpoint_path

        # Extract HF token for gated model access
@@ -807,7 +816,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                env = os.environ.copy()
                env["NO_LOCAL_GGUF"] = "1"
                cmd = [sys.executable, convert_script, merge_dir, "--outtype", outtype, "--outfile", gguf_path]
-                conv_result = subprocess.run(cmd, capture_output=True, text=True, timeout=3600, env=env)
+                conv_result = subprocess.run(cmd, capture_output=True, text=True, timeout=3600, env=env, shell=False)  # nosemgrep: python.django.security.injection.command.subprocess-injection.subprocess-injection
                if conv_result.returncode != 0:
                    diag = f"stdout: {conv_result.stdout[-300:]}\nstderr: {conv_result.stderr[-500:]}"
                    return backend_pb2.Result(success=False,
--- a/backend/python/vllm/backend.py
+++ b/backend/python/vllm/backend.py
@@ -48,8 +48,10 @@ try:
 except ImportError:
    HAS_REASONING_PARSERS = False

+# vLLM >= 0.23 renamed GuidedDecodingParams -> StructuredOutputsParams and the
+# SamplingParams field guided_decoding -> structured_outputs.
 try:
-    from vllm.sampling_params import GuidedDecodingParams
+    from vllm.sampling_params import StructuredOutputsParams
    HAS_GUIDED_DECODING = True
 except ImportError:
    HAS_GUIDED_DECODING = False
@@ -536,13 +538,13 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                if value not in (None, 0, [], False):
                    setattr(sampling_params, param_field, value)

-        # Guided decoding: use Grammar field to pass JSON schema or BNF
+        # Structured-output decoding: use Grammar field to pass JSON schema or BNF
        if HAS_GUIDED_DECODING and request.Grammar:
            try:
                json.loads(request.Grammar)  # valid JSON = JSON schema
-                sampling_params.guided_decoding = GuidedDecodingParams(json=request.Grammar)
+                sampling_params.structured_outputs = StructuredOutputsParams(json=request.Grammar)
            except json.JSONDecodeError:
-                sampling_params.guided_decoding = GuidedDecodingParams(grammar=request.Grammar)
+                sampling_params.structured_outputs = StructuredOutputsParams(grammar=request.Grammar)

        # Extract image paths and process images
        prompt = request.Prompt
@@ -596,23 +598,124 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):

        # Stream the results
        generated_text = ""
+        generated_token_ids: list[int] = []
        last_output = None
+
+        # Tool-parsing strategy decision (made once, before the loop):
+        #
+        # When a tool parser is active, the model's raw tool-call markup
+        # (e.g. <tool_call>...) must not be streamed verbatim as delta.content
+        # — clients would see the unparsed syntax. Two paths:
+        #
+        # (A) native streaming via parser.extract_tool_calls_streaming. All
+        #     concrete tool parsers shipped with vLLM 0.23+ implement this
+        #     (Granite4, Qwen3Coder, DeepSeekV31, Jamba, Ernie45, Hermes,
+        #     llama3_json, mistral, …). The parser decides per-delta whether
+        #     to emit content or suppress tool-call markup, and emits a
+        #     structured DeltaMessage(tool_calls=[...]) when a call is ready.
+        # (B) buffer fallback — used only when the parser surprisingly lacks
+        #     the streaming method or it raises mid-stream. The post-loop
+        #     extract_tool_calls assembles the final chat_delta. Same correctness
+        #     guarantee as a non-streaming response, at the cost of a delayed
+        #     final chunk.
+        has_tool_parser = bool(self.tool_parser_cls and request.Tools)
+        tp_instance = None
+        tp_request = None
+        native_streaming = False
+        native_streaming_error = False
+        if has_tool_parser:
+            try:
+                tools_for_parser = json.loads(request.Tools)
+            except json.JSONDecodeError:
+                tools_for_parser = []
+            try:
+                tp_instance = self.tool_parser_cls(self.tokenizer, tools=tools_for_parser)
+            except TypeError:
+                tp_instance = self.tool_parser_cls(self.tokenizer)
+            # Build a minimal ChatCompletionRequest so the streaming method
+            # sees the tools list. We do not need any other request fields —
+            # parsers only read .tools (and sometimes .tool_choice, which we
+            # leave at default).
+            try:
+                from vllm.entrypoints.openai.chat_completion.protocol import (
+                    ChatCompletionRequest as _CCR,
+                )
+                tp_request = _CCR(
+                    model="local",
+                    messages=[{"role": "user", "content": ""}],
+                    tools=tools_for_parser or None,
+                )
+            except Exception as e:
+                print(f"Could not build ChatCompletionRequest for streaming parser: {e}",
+                      file=sys.stderr)
+                tp_request = None
+            native_streaming = (
+                tp_request is not None
+                and hasattr(tp_instance, "extract_tool_calls_streaming")
+            )
+
        try:
            async for request_output in outputs:
                iteration_text = request_output.outputs[0].text
                last_output = request_output

                if streaming:
-                    # Remove text already sent as vllm concatenates the text from previous yields
                    delta_iteration_text = iteration_text.removeprefix(generated_text)
-                    # Send the partial result
-                    yield backend_pb2.Reply(
-                        message=bytes(delta_iteration_text, encoding='utf-8'),
-                        chat_deltas=[backend_pb2.ChatDelta(content=delta_iteration_text)],
-                    )
+                    new_token_ids = list(request_output.outputs[0].token_ids)
+                    delta_token_ids = new_token_ids[len(generated_token_ids):]

-                # Keep track of text generated
+                    if not has_tool_parser:
+                        # Plain streaming — unchanged from pre-tool-parser path.
+                        yield backend_pb2.Reply(
+                            message=bytes(delta_iteration_text, encoding='utf-8'),
+                            chat_deltas=[backend_pb2.ChatDelta(content=delta_iteration_text)],
+                        )
+                    elif native_streaming and not native_streaming_error:
+                        # (A) Native vLLM extract_tool_calls_streaming.
+                        try:
+                            msg = tp_instance.extract_tool_calls_streaming(
+                                previous_text=generated_text,
+                                current_text=iteration_text,
+                                delta_text=delta_iteration_text,
+                                previous_token_ids=generated_token_ids,
+                                current_token_ids=new_token_ids,
+                                delta_token_ids=delta_token_ids,
+                                request=tp_request,
+                            )
+                        except Exception as e:
+                            print(f"Streaming tool parser error (falling back to "
+                                  f"buffer for the rest of the stream): {e}",
+                                  file=sys.stderr)
+                            native_streaming_error = True
+                            msg = None
+                        if msg is not None:
+                            tc_protos = []
+                            for tc in (msg.tool_calls or []):
+                                fn = tc.function or None
+                                tc_protos.append(backend_pb2.ToolCallDelta(
+                                    index=tc.index,
+                                    id=tc.id or "",
+                                    name=(fn.name if fn and fn.name else "") or "",
+                                    arguments=(fn.arguments if fn and fn.arguments else "") or "",
+                                ))
+                            cd_kwargs = {}
+                            if msg.content:
+                                cd_kwargs["content"] = msg.content
+                            if msg.reasoning:
+                                cd_kwargs["reasoning_content"] = msg.reasoning
+                            if tc_protos:
+                                cd_kwargs["tool_calls"] = tc_protos
+                            if cd_kwargs:
+                                yield backend_pb2.Reply(
+                                    message=bytes(msg.content or "", encoding='utf-8'),
+                                    chat_deltas=[backend_pb2.ChatDelta(**cd_kwargs)],
+                                )
+                    # (B) buffer fallback — emit nothing during the stream.
+                    # The post-loop extract_tool_calls block builds the final chunk.
+
+                # Keep track of text + token_ids generated
                generated_text = iteration_text
+                generated_token_ids = list(request_output.outputs[0].token_ids)
        finally:
            await outputs.aclose()

@@ -637,16 +740,19 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
            except Exception as e:
                print(f"Reasoning parser error: {e}", file=sys.stderr)

-        if self.tool_parser_cls and request.Tools:
+        # When (A) native streaming ran cleanly, per-delta yields above already
+        # delivered everything — do NOT extract again on the full text or we'd
+        # duplicate content/tool_calls into the final chunk.
+        if has_tool_parser and not (native_streaming and not native_streaming_error):
            try:
-                tools = json.loads(request.Tools)
-                # Some concrete parsers only accept the tokenizer; only the
-                # abstract base declares the tools kwarg. Try with tools first,
-                # fall back to tokenizer-only.
-                try:
-                    tp = self.tool_parser_cls(self.tokenizer, tools=tools)
-                except TypeError:
-                    tp = self.tool_parser_cls(self.tokenizer)
+                tp = tp_instance
+                if tp is None:
+                    # Defensive: tp_instance build failed earlier; reconstruct.
+                    tools = json.loads(request.Tools)
+                    try:
+                        tp = self.tool_parser_cls(self.tokenizer, tools=tools)
+                    except TypeError:
+                        tp = self.tool_parser_cls(self.tokenizer)
                info = tp.extract_tool_calls(content, request=None)
                if info.tools_called:
                    content = info.content or ""
@@ -659,6 +765,10 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                        ))
            except Exception as e:
                print(f"Tool parser error: {e}", file=sys.stderr)
+        elif native_streaming and not native_streaming_error:
+            # Per-delta path already emitted content + tool_calls; the final
+            # chat_delta should carry only metadata (token counts, logprobs).
+            content = ""

        # Extract token counts
        prompt_tokens = 0
@@ -698,7 +808,26 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        )

        if streaming:
-            # Final chunk with structured data
+            # Final chunk with structured data.
+            #
+            # If we used the buffer fallback (has_tool_parser=True AND native
+            # streaming did NOT run cleanly) and the parser found no tool call,
+            # flush the buffered content as ONE content delta — and clear the
+            # final chat_delta's content so the metadata chunk does not repeat
+            # what we just sent. This is the plain-text-with-tool-parser path.
+            buffered_fallback = (
+                has_tool_parser
+                and not (native_streaming and not native_streaming_error)
+            )
+            if buffered_fallback and not tool_calls_proto and content:
+                yield backend_pb2.Reply(
+                    message=bytes(content, encoding='utf-8'),
+                    chat_deltas=[backend_pb2.ChatDelta(content=content)],
+                )
+                chat_delta = backend_pb2.ChatDelta(
+                    reasoning_content=reasoning_content,
+                    tool_calls=tool_calls_proto,
+                )
            yield backend_pb2.Reply(
                message=b"",
                prompt_tokens=prompt_tokens,
--- a/backend/python/vllm/requirements-cpu.txt
+++ b/backend/python/vllm/requirements-cpu.txt
@@ -1,6 +1,6 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
 accelerate
-torch==2.9.1+cpu
+torch==2.12.1+cpu
 torchvision
 torchaudio
 transformers
--- a/backend/python/vllm/test.py
+++ b/backend/python/vllm/test.py
@@ -278,4 +278,261 @@ class TestBackendServicer(unittest.TestCase):
            print(err)
            self.fail("Embedding service failed")
        finally:
-            self.tearDown()
+            self.tearDown()
+
+
+class TestStreamingToolParser(unittest.TestCase):
+    """
+    Server-less unit tests for the streaming + tool-parser machinery in
+    BackendServicer._predict. These tests instantiate BackendServicer
+    directly and mock the vLLM engine + tool parser, so they do not need
+    a GPU, a model, or a running gRPC server. Kept in a separate class to
+    avoid the parent setUp() which spawns a subprocess.
+
+    Covers #582 (follow-up to #10346):
+      1. Markup-leak prevention with a non-streaming parser (buffer fallback)
+      2. No content duplication on the plain-text path with the buffer fallback
+      3. Native streaming progressive plain-text emission
+      4. Native streaming structured tool_call, no markup leak
+      5. Parser exception → graceful fallback to buffer, still no markup
+      6. No-tool-parser regression: unchanged per-delta content stream
+    """
+
+    @staticmethod
+    def _make_generate(chunks):
+        """Build a fake vLLM engine.generate that yields cumulative chunks."""
+        from types import SimpleNamespace
+        async def gen(*a, **k):
+            for i, t in enumerate(chunks):
+                yield SimpleNamespace(
+                    outputs=[SimpleNamespace(
+                        text=t,
+                        token_ids=list(range(i + 1)),
+                        logprobs=None,
+                    )],
+                    prompt_token_ids=[0],
+                )
+        return lambda *a, **k: gen()
+
+    @staticmethod
+    def _collect(servicer, req):
+        import asyncio
+        async def run():
+            return [r async for r in servicer._predict(req, None, streaming=True)]
+        return asyncio.run(run())
+
+    def _new_servicer(self):
+        import sys, os
+        sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+        from backend import BackendServicer
+        s = BackendServicer()
+        s.reasoning_parser_cls = None
+        s.tool_parser_cls = None
+        s.tokenizer = None
+        return s
+
+    # ── Case 1+2: parser without streaming method → buffer fallback ──
+    def test_buffer_path_no_markup_no_duplication(self):
+        from types import SimpleNamespace
+
+        def parser_cls(called, content_text, calls):
+            class _P:
+                def __init__(self, tokenizer, tools=None):
+                    pass
+                # NOTE: NO extract_tool_calls_streaming → takes the buffer path
+                def extract_tool_calls(self, c, request=None):
+                    return SimpleNamespace(
+                        tools_called=called, content=content_text, tool_calls=calls,
+                    )
+            return _P
+
+        tools_json = '[{"type":"function","function":{"name":"calc","parameters":{}}}]'
+
+        # Tool-call case: no raw markup in any delta.content
+        s = self._new_servicer()
+        s.llm = SimpleNamespace(generate=self._make_generate([
+            '<tool_call>\n{"name": "calc"',
+            '<tool_call>\n{"name": "calc", "arguments": {"x": 1}}\n</tool_call>',
+        ]))
+        call = SimpleNamespace(id="call_1",
+                               function=SimpleNamespace(name="calc", arguments='{"x": 1}'))
+        s.tool_parser_cls = parser_cls(True, "", [call])
+        req = backend_pb2.PredictOptions(Prompt="x", Tools=tools_json)
+        replies = self._collect(s, req)
+        contents = [cd.content for r in replies for cd in r.chat_deltas if cd.content]
+        self.assertFalse(
+            any("<tool_call" in c for c in contents),
+            f"markup leaked: {contents!r}",
+        )
+        names = [tc.name for r in replies for cd in r.chat_deltas for tc in cd.tool_calls]
+        self.assertIn("calc", names, "tool_call missing from final chunk")
+
+        # Plain-text-with-tools case: full content delivered exactly once
+        s2 = self._new_servicer()
+        s2.llm = SimpleNamespace(generate=self._make_generate([
+            "The capital ",
+            "The capital of France is Paris.",
+        ]))
+        s2.tool_parser_cls = parser_cls(False, "", [])
+        req2 = backend_pb2.PredictOptions(Prompt="x", Tools=tools_json)
+        joined = "".join(
+            cd.content for r in self._collect(s2, req2)
+            for cd in r.chat_deltas if cd.content
+        )
+        self.assertEqual(
+            joined.count("The capital of France is Paris."), 1,
+            f"buffered content duplicated: {joined!r}",
+        )
+
+    # ── Case 3: native streaming, progressive plain text ──
+    def test_native_streaming_progressive_plain_text(self):
+        from types import SimpleNamespace
+
+        class _DeltaMsg:
+            def __init__(self, content=None, reasoning=None, tool_calls=None):
+                self.content = content
+                self.reasoning = reasoning
+                self.tool_calls = tool_calls or []
+
+        class StreamingParser:
+            def __init__(self, tokenizer, tools=None):
+                pass
+            def extract_tool_calls(self, c, request=None):
+                # Should NOT be called when native streaming runs successfully.
+                raise AssertionError("extract_tool_calls invoked on native-streaming path")
+            def extract_tool_calls_streaming(
+                self, previous_text, current_text, delta_text,
+                previous_token_ids, current_token_ids, delta_token_ids, request,
+            ):
+                if not delta_text:
+                    return None
+                return _DeltaMsg(content=delta_text)
+
+        s = self._new_servicer()
+        s.llm = SimpleNamespace(generate=self._make_generate([
+            "Paris ",
+            "Paris is ",
+            "Paris is the capital of France.",
+        ]))
+        s.tool_parser_cls = StreamingParser
+        req = backend_pb2.PredictOptions(
+            Prompt="x",
+            Tools='[{"type":"function","function":{"name":"calc","parameters":{}}}]',
+        )
+        replies = self._collect(s, req)
+
+        intermediate_content = [
+            cd.content for r in replies[:-1] for cd in r.chat_deltas if cd.content
+        ]
+        self.assertTrue(
+            len(intermediate_content) > 0,
+            "Plain-text response not streamed progressively (native streaming inactive?)",
+        )
+        assembled = "".join(
+            cd.content for r in replies for cd in r.chat_deltas if cd.content
+        )
+        self.assertEqual(
+            assembled, "Paris is the capital of France.",
+            f"Assembled content wrong: {assembled!r}",
+        )
+
+    # ── Case 4: native streaming, structured tool_call, no markup ──
+    def test_native_streaming_tool_call_no_markup_leak(self):
+        from types import SimpleNamespace
+
+        class _DeltaMsg:
+            def __init__(self, content=None, reasoning=None, tool_calls=None):
+                self.content = content
+                self.reasoning = reasoning
+                self.tool_calls = tool_calls or []
+
+        class _ToolCallStreamer:
+            def __init__(self, tokenizer, tools=None):
+                self._emitted = False
+            def extract_tool_calls(self, c, request=None):
+                raise AssertionError("extract_tool_calls invoked on native-streaming path")
+            def extract_tool_calls_streaming(
+                self, previous_text, current_text, delta_text,
+                previous_token_ids, current_token_ids, delta_token_ids, request,
+            ):
+                if "</tool_call>" in current_text and not self._emitted:
+                    self._emitted = True
+                    fn = SimpleNamespace(name="calc", arguments='{"x": 1}')
+                    tc = SimpleNamespace(id="call_1", type="function", index=0, function=fn)
+                    return _DeltaMsg(tool_calls=[tc])
+                return None
+
+        s = self._new_servicer()
+        s.llm = SimpleNamespace(generate=self._make_generate([
+            '<tool_call>\n',
+            '<tool_call>\n{"name": "calc"',
+            '<tool_call>\n{"name": "calc", "arguments": {"x": 1}}\n</tool_call>',
+        ]))
+        s.tool_parser_cls = _ToolCallStreamer
+        req = backend_pb2.PredictOptions(
+            Prompt="x",
+            Tools='[{"type":"function","function":{"name":"calc","parameters":{}}}]',
+        )
+        replies = self._collect(s, req)
+
+        contents = [cd.content for r in replies for cd in r.chat_deltas if cd.content]
+        self.assertFalse(
+            any("<tool_call" in c or "</tool_call>" in c for c in contents),
+            f"markup leaked as content: {contents!r}",
+        )
+        names = [tc.name for r in replies for cd in r.chat_deltas for tc in cd.tool_calls if tc.name]
+        args  = [tc.arguments for r in replies for cd in r.chat_deltas for tc in cd.tool_calls if tc.arguments]
+        self.assertIn("calc", names, f"tool_call name missing; got {names!r}")
+        self.assertIn('{"x": 1}', args, f"tool_call args missing; got {args!r}")
+
+    # ── Case 5: parser exception → fallback to buffer, no leak ──
+    def test_native_streaming_parser_exception_falls_back_to_buffer(self):
+        from types import SimpleNamespace
+        call = SimpleNamespace(id="call_1",
+                               function=SimpleNamespace(name="calc", arguments='{"x": 1}'))
+
+        class _BrokenStreamer:
+            def __init__(self, tokenizer, tools=None):
+                pass
+            def extract_tool_calls(self, c, request=None):
+                return SimpleNamespace(tools_called=True, content="", tool_calls=[call])
+            def extract_tool_calls_streaming(self, *a, **kw):
+                raise RuntimeError("simulated parser bug")
+
+        s = self._new_servicer()
+        s.llm = SimpleNamespace(generate=self._make_generate([
+            '<tool_call>\n{"name": "calc"',
+            '<tool_call>\n{"name": "calc", "arguments": {"x": 1}}\n</tool_call>',
+        ]))
+        s.tool_parser_cls = _BrokenStreamer
+        req = backend_pb2.PredictOptions(
+            Prompt="x",
+            Tools='[{"type":"function","function":{"name":"calc","parameters":{}}}]',
+        )
+        replies = self._collect(s, req)
+
+        contents = [cd.content for r in replies for cd in r.chat_deltas if cd.content]
+        self.assertFalse(
+            any("<tool_call" in c for c in contents),
+            f"markup leaked after parser exception: {contents!r}",
+        )
+        names = [tc.name for r in replies for cd in r.chat_deltas for tc in cd.tool_calls]
+        self.assertIn("calc", names, "tool_call missing from final chunk after fallback")
+
+    # ── Case 6: no tool parser → unchanged per-delta content stream ──
+    def test_no_tool_parser_unchanged_per_delta_stream(self):
+        from types import SimpleNamespace
+        s = self._new_servicer()  # tool_parser_cls already None
+        s.llm = SimpleNamespace(generate=self._make_generate([
+            "Hello ", "Hello world", "Hello world!",
+        ]))
+        req = backend_pb2.PredictOptions(Prompt="x", Tools="")
+        replies = self._collect(s, req)
+
+        intermediate = [
+            cd.content for r in replies[:-1] for cd in r.chat_deltas if cd.content
+        ]
+        self.assertEqual(
+            intermediate, ["Hello ", "world", "!"],
+            f"plain streaming changed; got {intermediate!r}",
+        )
--- a/core/application/application.go
+++ b/core/application/application.go
@@ -341,11 +341,9 @@ func (a *Application) ResolvePIIPolicy(cfg *config.ModelConfig) (enabled bool, d
 	}
 	appCfg := a.ApplicationConfig()

-	if cfg.PII.Enabled != nil {
-		enabled = *cfg.PII.Enabled
-	} else {
-		enabled = cfg.PIIIsEnabled() // backend default (cloud-proxy)
-	}
+	// PIIIsEnabled already encodes "explicit pii.enabled wins, else backend
+	// default (cloud-proxy)" — the single source of that rule.
+	enabled = cfg.PIIIsEnabled()
 	if !enabled {
 		return false, nil
 	}
@@ -354,7 +352,7 @@ func (a *Application) ResolvePIIPolicy(cfg *config.ModelConfig) (enabled bool, d
 	if len(detectors) == 0 {
 		detectors = append([]string(nil), appCfg.PIIDefaultDetectors...)
 	}
-	return enabled, detectors
+	return true, detectors // enabled is necessarily true past the !enabled guard
 }

 // PIIPolicyResolver adapts ResolvePIIPolicy to pii.PolicyResolver for
--- a/core/application/distributed.go
+++ b/core/application/distributed.go
@@ -357,6 +357,15 @@ func initDistributed(cfg *config.ApplicationConfig, authDB *gorm.DB, configLoade
 		Pressure:         pressure,
 	})

+	// Wire staging-progress broadcasting so file-staging shows up on every
+	// replica, not just the one performing the transfer. Without this, a
+	// /api/operations poll that round-robins onto a peer sees no staging row and
+	// the progress flickers. The origin publishes; peers mirror via the wildcard.
+	router.StagingTracker().SetPublisher(natsClient)
+	if _, err := router.StagingTracker().SubscribeBroadcasts(natsClient); err != nil {
+		xlog.Warn("Failed to subscribe to staging progress broadcasts", "error", err)
+	}
+
 	// Create ReplicaReconciler for auto-scaling model replicas. Adapter +
 	// RegistrationToken feed the state-reconciliation passes: pending op
 	// drain uses the adapter, and model health probes use the token to auth
--- a/core/application/startup.go
+++ b/core/application/startup.go
@@ -25,6 +25,7 @@ import (
 	"github.com/mudler/LocalAI/core/services/storage"
 	coreStartup "github.com/mudler/LocalAI/core/startup"
 	"github.com/mudler/LocalAI/internal"
+	"github.com/mudler/LocalAI/pkg/downloader"
 	"github.com/mudler/LocalAI/pkg/signals"
 	"github.com/mudler/LocalAI/pkg/vram"

@@ -71,6 +72,16 @@ func New(opts ...config.AppOption) (*Application, error) {
 	if err != nil {
 		return nil, fmt.Errorf("unable to create ModelPath: %q", err)
 	}
+
+	// Reap *.partial downloads abandoned by a previous run (killed mid-transfer
+	// by an OOM/restart, or stalled before cleanup could run). The 24h window
+	// is well beyond any legitimate in-flight download, so this never trims an
+	// active transfer; it just stops dead partials accumulating on the volume.
+	if removed, cErr := downloader.CleanupStalePartialFiles(options.SystemState.Model.ModelsPath, 24*time.Hour); cErr != nil {
+		xlog.Warn("Failed to reap stale partial downloads", "error", cErr)
+	} else if removed > 0 {
+		xlog.Info("Reaped stale partial downloads", "count", removed)
+	}
 	if options.GeneratedContentDir != "" {
 		err := os.MkdirAll(options.GeneratedContentDir, 0o750)
 		if err != nil {
@@ -633,6 +644,12 @@ func loadRuntimeSettingsFromFile(options *config.ApplicationConfig) {
 			options.ForceEvictionWhenBusy = *settings.ForceEvictionWhenBusy
 		}
 	}
+	if settings.SizeAwareEviction != nil {
+		// Only apply if current value is default (false), suggesting it wasn't set from env var
+		if !options.SizeAwareEviction {
+			options.SizeAwareEviction = *settings.SizeAwareEviction
+		}
+	}
 	if settings.LRUEvictionMaxRetries != nil {
 		// Only apply if current value is default (30), suggesting it wasn't set from env var
 		if options.LRUEvictionMaxRetries == 0 {
@@ -836,6 +853,7 @@ func initializeWatchdog(application *Application, options *config.ApplicationCon
 			model.WithLRULimit(lruLimit),
 			model.WithMemoryReclaimer(options.MemoryReclaimerEnabled, options.MemoryReclaimerThreshold),
 			model.WithForceEvictionWhenBusy(options.ForceEvictionWhenBusy),
+			model.WithSizeAwareEviction(options.SizeAwareEviction),
 		)
 		application.ModelLoader().SetWatchDog(wd)

--- a/core/application/watchdog.go
+++ b/core/application/watchdog.go
@@ -90,6 +90,7 @@ func (a *Application) startWatchdog() error {
 			model.WithLRULimit(lruLimit),
 			model.WithMemoryReclaimer(appConfig.MemoryReclaimerEnabled, appConfig.MemoryReclaimerThreshold),
 			model.WithForceEvictionWhenBusy(appConfig.ForceEvictionWhenBusy),
+			model.WithSizeAwareEviction(appConfig.SizeAwareEviction),
 		)

 		// Create new stop channel BEFORE setting up any goroutines
--- a/core/backend/options.go
+++ b/core/backend/options.go
@@ -1,6 +1,7 @@
 package backend

 import (
+	"context"
 	"encoding/json"
 	"fmt"
 	"math/rand/v2"
@@ -12,7 +13,9 @@ import (
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/trace"
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	"github.com/mudler/LocalAI/pkg/downloader"
 	"github.com/mudler/LocalAI/pkg/model"
+	"github.com/mudler/LocalAI/pkg/vram"
 	"github.com/mudler/xlog"
 )

@@ -33,6 +36,67 @@ func recordModelLoadFailure(appConfig *config.ApplicationConfig, modelName, back
 	})
 }

+// estimateModelSizeBytes calls vram.EstimateModelMultiContext to compute the
+// total weight-file size for a model config. It collects weight files from
+// DownloadFiles, Model, and MMProj, and extracts a HuggingFace repo ID so the
+// estimator can fall back to the HF API when local file metadata is missing
+// (e.g. not-yet-downloaded models). It returns 0 when no estimate is available.
+func estimateModelSizeBytes(c config.ModelConfig, modelsPath string) int64 {
+	seen := make(map[string]bool)
+	input := vram.ModelEstimateInput{}
+
+	addFile := func(uri string) {
+		if !vram.IsWeightFile(uri) {
+			return
+		}
+		resolved := uri
+		if !strings.Contains(uri, "://") {
+			resolved = "file://" + filepath.Join(modelsPath, uri)
+		}
+		if seen[resolved] {
+			return
+		}
+		seen[resolved] = true
+		input.Files = append(input.Files, vram.FileInput{URI: resolved})
+	}
+
+	// tryHFRepo resolves any huggingface:// or hf:// URI to an HTTPS URL and
+	// then extracts the org/model repo ID for use as the HF fallback path.
+	tryHFRepo := func(uri string) {
+		if input.HFRepo != "" {
+			return
+		}
+		resolved := downloader.URI(uri).ResolveURL()
+		if repoID, ok := vram.ExtractHFRepoID(resolved); ok {
+			input.HFRepo = repoID
+		}
+	}
+
+	for _, f := range c.DownloadFiles {
+		uriStr := string(f.URI)
+		addFile(uriStr)
+		tryHFRepo(uriStr)
+	}
+	addFile(c.Model)
+	tryHFRepo(c.Model)
+	if c.MMProj != "" {
+		addFile(c.MMProj)
+	}
+
+	if len(input.Files) == 0 && input.HFRepo == "" {
+		return 0
+	}
+
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer cancel()
+
+	result, err := vram.EstimateModelMultiContext(ctx, input, nil)
+	if err != nil || result.SizeBytes == 0 {
+		return 0
+	}
+	return int64(result.SizeBytes)
+}
+
 func ModelOptions(c config.ModelConfig, so *config.ApplicationConfig, opts ...model.Option) []model.Option {
 	defOpts := []model.Option{
 		model.WithBackendString(c.Backend),
@@ -70,6 +134,10 @@ func ModelOptions(c config.ModelConfig, so *config.ApplicationConfig, opts ...mo
 		defOpts = append(defOpts, model.WithExternalBackend(k, v))
 	}

+	if sizeBytes := estimateModelSizeBytes(c, so.SystemState.Model.ModelsPath); sizeBytes > 0 {
+		defOpts = append(defOpts, model.WithModelSizeBytes(sizeBytes))
+	}
+
 	return append(defOpts, opts...)
 }

@@ -90,10 +158,11 @@ func getSeed(c config.ModelConfig) int32 {
 // DefaultContextSize and DefaultBatchSize are the backend's fallbacks when a
 // model config leaves them unset. Exported so callers that must respect the
 // effective decode window — notably the router's prompt trimmer — resolve the
-// same numbers grpcModelOpts does instead of guessing.
+// same numbers grpcModelOpts does instead of guessing. The values are owned by
+// core/config (single source of truth shared with the config default tiers).
 const (
-	DefaultContextSize = 4096
-	DefaultBatchSize   = 512
+	DefaultContextSize = config.DefaultContextSize
+	DefaultBatchSize   = config.DefaultPhysicalBatch
 )

 // EffectiveContextSize is the context window the backend will run with: the
@@ -129,7 +198,7 @@ func grpcModelOpts(c config.ModelConfig, modelPath string) *pb.ModelOptions {
 	ctxSize := EffectiveContextSize(c)
 	b := EffectiveBatchSize(c)

-	flashAttention := "auto"
+	flashAttention := config.DefaultFlashAttention

 	if c.FlashAttention != nil {
 		flashAttention = *c.FlashAttention
@@ -175,7 +244,7 @@ func grpcModelOpts(c config.ModelConfig, modelPath string) *pb.ModelOptions {
 		mmlock = *c.MMlock
 	}

-	nGPULayers := 9999999
+	nGPULayers := config.DefaultNGPULayers
 	if c.NGPULayers != nil {
 		nGPULayers = *c.NGPULayers
 	}
--- a/core/backend/sound_classification.go
+++ b/core/backend/sound_classification.go
@@ -0,0 +1,88 @@
+package backend
+
+import (
+	"context"
+	"fmt"
+	"sort"
+
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/schema"
+
+	grpcPkg "github.com/mudler/LocalAI/pkg/grpc"
+	"github.com/mudler/LocalAI/pkg/grpc/proto"
+	"github.com/mudler/LocalAI/pkg/model"
+)
+
+// SoundDetectionRequest carries the knobs the HTTP layer collects for an
+// audio-tagging / sound-event-classification call. Audio is the path to the
+// uploaded clip on disk; TopK and Threshold are optional (0 = backend default).
+type SoundDetectionRequest struct {
+	Audio     string
+	TopK      int32
+	Threshold float32
+}
+
+func (r *SoundDetectionRequest) toProto() *proto.SoundDetectionRequest {
+	return &proto.SoundDetectionRequest{
+		Src:       r.Audio,
+		TopK:      r.TopK,
+		Threshold: r.Threshold,
+	}
+}
+
+func loadSoundDetectionModel(ml *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig) (grpcPkg.Backend, error) {
+	if modelConfig.Backend == "" {
+		return nil, fmt.Errorf("sound classification: model %q has no backend set; supported backends include ced", modelConfig.Name)
+	}
+	opts := ModelOptions(modelConfig, appConfig)
+	m, err := ml.Load(opts...)
+	if err != nil {
+		recordModelLoadFailure(appConfig, modelConfig.Name, modelConfig.Backend, err, nil)
+		return nil, err
+	}
+	if m == nil {
+		return nil, fmt.Errorf("could not load sound classification model")
+	}
+	return m, nil
+}
+
+// ModelSoundDetection runs the SoundDetection RPC against the configured
+// backend and returns a normalized schema.SoundClassificationResult.
+func ModelSoundDetection(ctx context.Context, req SoundDetectionRequest, ml *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig) (*schema.SoundClassificationResult, error) {
+	m, err := loadSoundDetectionModel(ml, modelConfig, appConfig)
+	if err != nil {
+		return nil, err
+	}
+
+	r, err := m.SoundDetection(ctx, req.toProto())
+	if err != nil {
+		return nil, err
+	}
+	return soundClassificationResultFromProto(modelConfig.Name, r), nil
+}
+
+// soundClassificationResultFromProto maps the backend detections to the
+// HTTP-facing schema, skipping nil entries and sorting by descending score.
+func soundClassificationResultFromProto(modelName string, r *proto.SoundDetectionResponse) *schema.SoundClassificationResult {
+	out := &schema.SoundClassificationResult{
+		Model:      modelName,
+		Detections: []schema.SoundClassification{},
+	}
+	if r == nil {
+		return out
+	}
+	for _, d := range r.Detections {
+		if d == nil {
+			continue
+		}
+		out.Detections = append(out.Detections, schema.SoundClassification{
+			Index: int(d.Index),
+			Label: d.Label,
+			Score: d.Score,
+		})
+	}
+	sort.SliceStable(out.Detections, func(i, j int) bool {
+		return out.Detections[i].Score > out.Detections[j].Score
+	})
+	return out
+}
--- a/core/cli/run.go
+++ b/core/cli/run.go
@@ -93,6 +93,7 @@ type RunCMD struct {
 	EnableMemoryReclaimer              bool     `env:"LOCALAI_MEMORY_RECLAIMER,MEMORY_RECLAIMER,LOCALAI_GPU_RECLAIMER,GPU_RECLAIMER" default:"false" help:"Enable memory threshold monitoring to auto-evict backends when memory usage exceeds threshold (uses GPU VRAM if available, otherwise RAM)" group:"backends"`
 	MemoryReclaimerThreshold           float64  `env:"LOCALAI_MEMORY_RECLAIMER_THRESHOLD,MEMORY_RECLAIMER_THRESHOLD,LOCALAI_GPU_RECLAIMER_THRESHOLD,GPU_RECLAIMER_THRESHOLD" default:"0.95" help:"Memory usage threshold (0.0-1.0) that triggers backend eviction (default 0.95 = 95%%)" group:"backends"`
 	ForceEvictionWhenBusy              bool     `env:"LOCALAI_FORCE_EVICTION_WHEN_BUSY,FORCE_EVICTION_WHEN_BUSY" default:"false" help:"Force eviction even when models have active API calls (default: false for safety)" group:"backends"`
+	SizeAwareEviction                  bool     `env:"LOCALAI_SIZE_AWARE_EVICTION,SIZE_AWARE_EVICTION" default:"false" help:"Evict the largest loaded model first rather than the least-recently-used one, keeping small utility models resident and maximizing freed memory per eviction" group:"backends"`
 	LRUEvictionMaxRetries              int      `env:"LOCALAI_LRU_EVICTION_MAX_RETRIES,LRU_EVICTION_MAX_RETRIES" default:"30" help:"Maximum number of retries when waiting for busy models to become idle before eviction (default: 30)" group:"backends"`
 	LRUEvictionRetryInterval           string   `env:"LOCALAI_LRU_EVICTION_RETRY_INTERVAL,LRU_EVICTION_RETRY_INTERVAL" default:"1s" help:"Interval between retries when waiting for busy models to become idle (e.g., 1s, 2s) (default: 1s)" group:"backends"`
 	Federated                          bool     `env:"LOCALAI_FEDERATED,FEDERATED" help:"Enable federated instance" group:"federated"`
@@ -564,6 +565,9 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
 	if r.ForceEvictionWhenBusy {
 		opts = append(opts, config.WithForceEvictionWhenBusy(true))
 	}
+	if r.SizeAwareEviction {
+		opts = append(opts, config.WithSizeAwareEviction(true))
+	}
 	if r.LRUEvictionMaxRetries > 0 {
 		opts = append(opts, config.WithLRUEvictionMaxRetries(r.LRUEvictionMaxRetries))
 	}
--- a/core/config/application_config.go
+++ b/core/config/application_config.go
@@ -119,6 +119,7 @@ type ApplicationConfig struct {

 	// Eviction settings
 	ForceEvictionWhenBusy    bool          // Force eviction even when models have active API calls (default: false for safety)
+	SizeAwareEviction        bool          // Evict largest models first rather than least-recently-used (default: false)
 	LRUEvictionMaxRetries    int           // Maximum number of retries when waiting for busy models to become idle (default: 30)
 	LRUEvictionRetryInterval time.Duration // Interval between retries when waiting for busy models (default: 1s)

@@ -488,6 +489,16 @@ func WithForceEvictionWhenBusy(enabled bool) AppOption {
 	}
 }

+// WithSizeAwareEviction enables size-aware eviction ordering.
+// When true, the watchdog evicts the largest loaded model first rather than the
+// least-recently-used one, keeping small utility models resident and maximizing
+// memory freed per eviction.
+func WithSizeAwareEviction(enabled bool) AppOption {
+	return func(o *ApplicationConfig) {
+		o.SizeAwareEviction = enabled
+	}
+}
+
 // WithLRUEvictionMaxRetries sets the maximum number of retries when waiting for busy models to become idle
 func WithLRUEvictionMaxRetries(maxRetries int) AppOption {
 	return func(o *ApplicationConfig) {
@@ -1028,6 +1039,7 @@ func (o *ApplicationConfig) ToRuntimeSettings() RuntimeSettings {
 	memoryReclaimerEnabled := o.MemoryReclaimerEnabled
 	memoryReclaimerThreshold := o.MemoryReclaimerThreshold
 	forceEvictionWhenBusy := o.ForceEvictionWhenBusy
+	sizeAwareEviction := o.SizeAwareEviction
 	lruEvictionMaxRetries := o.LRUEvictionMaxRetries
 	threads := o.Threads
 	contextSize := o.ContextSize
@@ -1120,6 +1132,7 @@ func (o *ApplicationConfig) ToRuntimeSettings() RuntimeSettings {
 		MemoryReclaimerEnabled:    &memoryReclaimerEnabled,
 		MemoryReclaimerThreshold:  &memoryReclaimerThreshold,
 		ForceEvictionWhenBusy:     &forceEvictionWhenBusy,
+		SizeAwareEviction:         &sizeAwareEviction,
 		LRUEvictionMaxRetries:     &lruEvictionMaxRetries,
 		LRUEvictionRetryInterval:  &lruEvictionRetryInterval,
 		Threads:                   &threads,
@@ -1244,6 +1257,10 @@ func (o *ApplicationConfig) ApplyRuntimeSettings(settings *RuntimeSettings) (req
 		o.ForceEvictionWhenBusy = *settings.ForceEvictionWhenBusy
 		// This setting doesn't require restart, can be updated dynamically
 	}
+	if settings.SizeAwareEviction != nil {
+		o.SizeAwareEviction = *settings.SizeAwareEviction
+		// This setting doesn't require restart, can be updated dynamically
+	}
 	if settings.LRUEvictionMaxRetries != nil {
 		o.LRUEvictionMaxRetries = *settings.LRUEvictionMaxRetries
 		// This setting doesn't require restart, can be updated dynamically
--- a/core/config/backend_capabilities.go
+++ b/core/config/backend_capabilities.go
@@ -8,27 +8,28 @@ import (
 // Usecase name constants — the canonical string values used in gallery entries,
 // model configs (known_usecases), and UsecaseInfoMap keys.
 const (
-	UsecaseChat               = "chat"
-	UsecaseCompletion         = "completion"
-	UsecaseEdit               = "edit"
-	UsecaseVision             = "vision"
-	UsecaseEmbeddings         = "embeddings"
-	UsecaseTokenize           = "tokenize"
-	UsecaseImage              = "image"
-	UsecaseVideo              = "video"
-	UsecaseTranscript         = "transcript"
-	UsecaseTTS                = "tts"
-	UsecaseSoundGeneration    = "sound_generation"
-	UsecaseRerank             = "rerank"
-	UsecaseDetection          = "detection"
-	UsecaseDepth              = "depth"
-	UsecaseVAD                = "vad"
-	UsecaseAudioTransform     = "audio_transform"
-	UsecaseDiarization        = "diarization"
-	UsecaseRealtimeAudio      = "realtime_audio"
-	UsecaseFaceRecognition    = "face_recognition"
-	UsecaseSpeakerRecognition = "speaker_recognition"
-	UsecaseTokenClassify      = "token_classify"
+	UsecaseChat                = "chat"
+	UsecaseCompletion          = "completion"
+	UsecaseEdit                = "edit"
+	UsecaseVision              = "vision"
+	UsecaseEmbeddings          = "embeddings"
+	UsecaseTokenize            = "tokenize"
+	UsecaseImage               = "image"
+	UsecaseVideo               = "video"
+	UsecaseTranscript          = "transcript"
+	UsecaseTTS                 = "tts"
+	UsecaseSoundGeneration     = "sound_generation"
+	UsecaseRerank              = "rerank"
+	UsecaseDetection           = "detection"
+	UsecaseDepth               = "depth"
+	UsecaseVAD                 = "vad"
+	UsecaseAudioTransform      = "audio_transform"
+	UsecaseDiarization         = "diarization"
+	UsecaseSoundClassification = "sound_classification"
+	UsecaseRealtimeAudio       = "realtime_audio"
+	UsecaseFaceRecognition     = "face_recognition"
+	UsecaseSpeakerRecognition  = "speaker_recognition"
+	UsecaseTokenClassify       = "token_classify"
 )

 // GRPCMethod identifies a Backend service RPC from backend.proto.
@@ -51,6 +52,7 @@ const (
 	MethodVAD                GRPCMethod = "VAD"
 	MethodAudioTransform     GRPCMethod = "AudioTransform"
 	MethodDiarize            GRPCMethod = "Diarize"
+	MethodSoundDetection     GRPCMethod = "SoundDetection"
 	MethodAudioToAudioStream GRPCMethod = "AudioToAudioStream"
 	MethodFaceVerify         GRPCMethod = "FaceVerify"
 	MethodFaceAnalyze        GRPCMethod = "FaceAnalyze"
@@ -165,6 +167,11 @@ var UsecaseInfoMap = map[string]UsecaseInfo{
 		GRPCMethod:  MethodDiarize,
 		Description: "Speaker diarization (who-spoke-when, per-speaker segments) via the Diarize RPC.",
 	},
+	UsecaseSoundClassification: {
+		Flag:        FLAG_SOUND_CLASSIFICATION,
+		GRPCMethod:  MethodSoundDetection,
+		Description: "Sound-event classification / audio tagging (scored AudioSet labels like baby cry, glass breaking, alarms) via the SoundDetection RPC.",
+	},
 	UsecaseRealtimeAudio: {
 		Flag:        FLAG_REALTIME_AUDIO,
 		GRPCMethod:  MethodAudioToAudioStream,
--- a/core/config/defaults.go
+++ b/core/config/defaults.go
@@ -0,0 +1,30 @@
+package config
+
+// Canonical default values.
+//
+// These are owned here so the two layers that need them share a single source
+// of truth: the config tiers (ApplyInference/Hardware/Serving/Generic — which
+// *decide* defaults) and core/backend/options.go (which *translates* a
+// ModelConfig to the backend wire format and supplies the same fallbacks
+// defensively). Previously these were duplicated as literals across both
+// packages and had drifted (e.g. n_gpu_layers 9999999 vs 99999999, two batch
+// constants of 512). core/backend imports core/config, so backend references
+// these; config never imports backend.
+const (
+	// DefaultContextSize is the fallback context window when none is configured
+	// or estimable from the model.
+	DefaultContextSize = 4096
+
+	// GGUFFallbackContextSize is the GGUF context window when metadata yields no
+	// usable estimate (guessGGUFFromFile, llamaCppDefaults). Deliberately smaller
+	// than DefaultContextSize to stay conservative on memory there.
+	GGUFFallbackContextSize = 1024
+
+	// DefaultNGPULayers is a deliberately oversized count meaning "offload all
+	// layers"; the backend (fit_params) clamps to what fits in device memory.
+	DefaultNGPULayers = 99999999
+
+	// DefaultFlashAttention is the flash-attention mode default; "auto" lets the
+	// backend enable it when the model + backend support it.
+	DefaultFlashAttention = "auto"
+)
--- a/core/config/generic_defaults.go
+++ b/core/config/generic_defaults.go
@@ -0,0 +1,115 @@
+package config
+
+import "os"
+
+// ApplyGenericDefaults is the last, lowest-priority defaults tier. It is applied
+// after ApplyInferenceDefaults (model family), ApplyHardwareDefaults (device)
+// and ApplyServingDefaults (serving policy) and supplies generic sampling
+// parameters plus a few runtime flags. Only fields that are still nil are
+// written, so model-family and explicit configuration win. A nil cfg is a no-op.
+func ApplyGenericDefaults(cfg *ModelConfig) {
+	if cfg == nil {
+		return
+	}
+
+	// Sampling reference:
+	// https://github.com/ggerganov/llama.cpp/blob/75cd4c77292034ecec587ecb401366f57338f7c0/common/sampling.h#L22
+	// Mirostat reference: https://github.com/mudler/LocalAI/issues/2780
+	var (
+		topP        = 0.95
+		topK        = 40
+		minP        = 0.0
+		temperature = 0.9
+		mirostat    = 0
+		mirostatTAU = 5.0
+		mirostatETA = 0.1
+		typicalP    = 1.0
+		tfz         = 1.0
+		maxTokens   = 0
+		enabled     = true
+		disabled    = false
+	)
+
+	if cfg.Seed == nil {
+		// Seed for the random number generator.
+		seed := RAND_SEED
+		cfg.Seed = &seed
+	}
+
+	// top_k=40 is llama.cpp's sampling default and is wrong for backends whose
+	// native default differs (issue #6632). Only inject it for the llama.cpp
+	// family and the empty/auto backend; leave TopK nil for known non-llama
+	// backends (e.g. mlx, whose intended default is top_k=0) so the wire value
+	// is 0 rather than a silently-changed 40.
+	if cfg.TopK == nil && UsesLlamaSamplerDefaults(cfg.Backend) {
+		cfg.TopK = &topK
+	}
+
+	if cfg.TopP == nil {
+		cfg.TopP = &topP
+	}
+
+	if cfg.MinP == nil {
+		cfg.MinP = &minP
+	}
+
+	if cfg.TypicalP == nil {
+		cfg.TypicalP = &typicalP
+	}
+
+	if cfg.TFZ == nil {
+		cfg.TFZ = &tfz
+	}
+
+	if cfg.Temperature == nil {
+		cfg.Temperature = &temperature
+	}
+
+	if cfg.Maxtokens == nil {
+		cfg.Maxtokens = &maxTokens
+	}
+
+	// Mirostat sampler settings.
+	if cfg.Mirostat == nil {
+		cfg.Mirostat = &mirostat
+	}
+
+	if cfg.MirostatTAU == nil {
+		cfg.MirostatTAU = &mirostatTAU
+	}
+
+	if cfg.MirostatETA == nil {
+		cfg.MirostatETA = &mirostatETA
+	}
+
+	// MMap is enabled by default; the only exception is Intel GPUs, signalled
+	// here by a non-empty XPU environment variable.
+	if cfg.MMap == nil {
+		cfg.MMap = &enabled
+		if os.Getenv("XPU") != "" {
+			cfg.MMap = &disabled
+		}
+	}
+
+	// MMlock is disabled by default, as are the remaining runtime flags below.
+	if cfg.MMlock == nil {
+		cfg.MMlock = &disabled
+	}
+
+	if cfg.LowVRAM == nil {
+		cfg.LowVRAM = &disabled
+	}
+	if cfg.Embeddings == nil {
+		cfg.Embeddings = &disabled
+	}
+	if cfg.Reranking == nil {
+		cfg.Reranking = &disabled
+	}
+
+	// Match upstream llama.cpp's default (common/common.h: cache_prompt = true)
+	// and let cache_idle_slots / kv_unified actually do useful work; users can
+	// opt out with an explicit `prompt_cache_all: false` in the model YAML.
+	if cfg.PromptCacheAll == nil {
+		cfg.PromptCacheAll = &enabled
+	}
+}
--- a/core/config/generic_defaults_test.go
+++ b/core/config/generic_defaults_test.go
@@ -0,0 +1,36 @@
+package config_test
+
+import (
+	. "github.com/mudler/LocalAI/core/config"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("ApplyGenericDefaults (generic fallback tier)", func() {
+	It("fills sampling + runtime fallbacks when unset", func() {
+		cfg := &ModelConfig{} // empty backend uses the llama sampler defaults
+		ApplyGenericDefaults(cfg)
+		Expect(cfg.TopP).ToNot(BeNil())
+		Expect(*cfg.TopP).To(Equal(0.95))
+		Expect(*cfg.TopK).To(Equal(40))
+		Expect(*cfg.Temperature).To(Equal(0.9))
+		Expect(*cfg.MMap).To(BeTrue()) // NOTE(review): holds only while the XPU env var is unset/empty
+		Expect(*cfg.MMlock).To(BeFalse())
+		Expect(*cfg.PromptCacheAll).To(BeTrue())
+	})
+
+	It("never overrides explicit values", func() {
+		tk := 7
+		tp := 0.5
+		cfg := &ModelConfig{}
+		cfg.TopK = &tk
+		cfg.TopP = &tp
+		ApplyGenericDefaults(cfg)
+		Expect(*cfg.TopK).To(Equal(7))
+		Expect(*cfg.TopP).To(Equal(0.5))
+	})
+
+	It("no-ops on nil", func() {
+		Expect(func() { ApplyGenericDefaults(nil) }).ToNot(Panic())
+	})
+})
--- a/core/config/gguf.go
+++ b/core/config/gguf.go
@@ -14,11 +14,6 @@ import (
 	"github.com/gpustack/gguf-parser-go/util/ptr"
 )

-const (
-	defaultContextSize = 1024
-	defaultNGPULayers  = 99999999
-)
-
 // reservedNonChatModel reports whether the operator reserved this model for an
 // internal primitive — the router score classifier or the PII NER
 // token_classify tier. Such a model has no chat template and must not be
@@ -38,7 +33,7 @@ func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) {
 			cSize := int(ctxSize)
 			cfg.ContextSize = &cSize
 		} else {
-			defaultCtx = defaultContextSize
+			defaultCtx = GGUFFallbackContextSize
 			cfg.ContextSize = &defaultCtx
 		}
 	}
@@ -52,7 +47,7 @@ func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) {

 	if cfg.NGPULayers == nil {
 		// we assume we want to offload all layers
-		defaultHigh := defaultNGPULayers
+		defaultHigh := DefaultNGPULayers
 		cfg.NGPULayers = &defaultHigh
 	}

--- a/core/config/hardware_defaults.go
+++ b/core/config/hardware_defaults.go
@@ -0,0 +1,180 @@
+package config
+
+import (
+	"fmt"
+	"strconv"
+	"strings"
+
+	"github.com/mudler/LocalAI/pkg/xsysinfo"
+	"github.com/mudler/xlog"
+)
+
+// Hardware-driven model-config defaults.
+//
+// This sits alongside the other config overriders (ApplyInferenceDefaults for
+// model families, guessDefaultsFromFile for GGUF/NGPULayers): they all
+// heuristically fill ModelConfig values the user left unset. Hardware tuning is
+// the same domain — "adjust the config from the device that will run it" — so
+// it lives here rather than scattered into the backend or a separate package.
+//
+// The heuristics are parameterized on a GPU descriptor (not on direct
+// detection) so they apply in both deployment shapes: SetDefaults passes the
+// LocalGPU on a single host, and the distributed router passes the *selected
+// node's* reported GPU before loading there (the frontend that loaded the
+// config may have no GPU at all).
+
+// GPU describes the device that will run a model; the zero value is an unknown device.
+type GPU struct {
+	// Vendor is "nvidia", "amd", … (matches xsysinfo vendor constants).
+	Vendor string
+	// ComputeCapability is the NVIDIA compute capability as "major.minor"
+	// (e.g. "12.1" for GB10 / DGX Spark). Empty for non-NVIDIA / unknown.
+	ComputeCapability string
+	// VRAM is total device memory in bytes (0 = unknown).
+	VRAM uint64
+}
+
+// Physical batch (n_batch / n_ubatch) defaults; IsManagedPhysicalBatch recognizes exactly these two.
+const (
+	// DefaultPhysicalBatch is the conservative default when no hardware-specific
+	// tuning applies. core/backend.DefaultBatchSize references this (single source).
+	DefaultPhysicalBatch = 512
+	// BlackwellPhysicalBatch is the default on NVIDIA Blackwell consumer GPUs
+	// (sm_12x: sm_120 RTX 50-series, sm_121 GB10 / DGX Spark). A larger physical
+	// batch materially lifts MoE prefill there (per-expert GEMM tiles fill
+	// better); measured on a GB10 with Qwen3-30B-A3B to saturate around 2048.
+	BlackwellPhysicalBatch = 2048
+)
+
+// IsNVIDIABlackwell reports whether the compute capability has major version 12
+// or higher (consumer Blackwell, sm_12x, and anything newer). Datacenter
+// Blackwell (sm_100 / cc 10.0) and empty or unparseable values do not match.
+func (g GPU) IsNVIDIABlackwell() bool {
+	major, _ := parseComputeCapability(g.ComputeCapability)
+	return major >= 12
+}
+
+// PhysicalBatch picks the physical batch (n_batch/n_ubatch) applied when the
+// model config leaves batch unset: BlackwellPhysicalBatch on Blackwell, else the default.
+func PhysicalBatch(g GPU) int {
+	if !g.IsNVIDIABlackwell() {
+		return DefaultPhysicalBatch
+	}
+	return BlackwellPhysicalBatch
+}
+
+// IsManagedPhysicalBatch reports whether n is one of the values PhysicalBatch
+// can return. A caller re-tuning a batch chosen by an upstream host (the
+// distributed router correcting the frontend's guess) checks this first so an
+// explicit user batch such as 1024 is never clobbered.
+func IsManagedPhysicalBatch(n int) bool {
+	return n == BlackwellPhysicalBatch || n == DefaultPhysicalBatch
+}
+
+// Parallel-slot (n_parallel) VRAM tiers, read by DefaultParallelSlots. llama.cpp serializes requests at
+// n_parallel=1 (the backend default) and only auto-enables continuous batching
+// when n_parallel > 1 — so a single-slot default makes concurrent requests
+// queue. We default a slot count by GPU size so multi-user serving works out of
+// the box. With the backend's unified KV cache the slots SHARE the context
+// budget, so more slots add concurrency without multiplying KV memory.
+const (
+	parallelSlotsVRAMHigh = uint64(32) << 30 // >=32 GiB -> 8 slots
+	parallelSlotsVRAMMid  = uint64(8) << 30  // >=8 GiB  -> 4 slots
+	parallelSlotsVRAMLow  = uint64(4) << 30  // >=4 GiB  -> 2 slots
+)
+
+// DefaultParallelSlots maps the GPU's VRAM to an n_parallel default: 8, 4 or 2
+// slots at the high, mid and low tiers, and 1 (no concurrency) when VRAM is
+// unknown or below the low tier, so CPU-only / tiny devices keep their behavior.
+func DefaultParallelSlots(g GPU) int {
+	if g.VRAM >= parallelSlotsVRAMHigh {
+		return 8
+	}
+	if g.VRAM >= parallelSlotsVRAMMid {
+		return 4
+	}
+	if g.VRAM >= parallelSlotsVRAMLow {
+		return 2
+	}
+	return 1
+}
+
+// EnsureParallelOption appends a VRAM-scaled "parallel:N" backend option when
+// DefaultParallelSlots(gpu) > 1 and opts sets no parallel/n_parallel yet;
+// otherwise opts is returned as is. Shared with the distributed router.
+func EnsureParallelOption(opts []string, gpu GPU) []string {
+	slots := DefaultParallelSlots(gpu)
+	if slots <= 1 || hasParallelOption(opts) {
+		return opts
+	}
+	return append(opts, fmt.Sprintf("parallel:%d", slots))
+}
+
+// hasParallelOption reports whether opts already carries a "parallel" or "n_parallel" option,
+// so an explicit value is never overridden (backendOptionSet is shared with serving_defaults.go).
+func hasParallelOption(opts []string) bool {
+	return backendOptionSet(opts, "parallel", "n_parallel")
+}
+
+// localGPU builds a GPU descriptor from local detection, used by SetDefaults on
+// a single host (the distributed router builds it from the selected node's
+// reported info instead). It is a package var so tests can inject a
+// deterministic device — detection does a live nvidia-smi call. Detection
+// errors are deliberately dropped; NOTE(review): presumably the zero values
+// then read as "unknown" (empty vendor, 0 VRAM) — confirm against xsysinfo.
+var localGPU = func() GPU {
+	var gpu GPU
+	gpu.Vendor, _ = xsysinfo.DetectGPUVendor()
+	gpu.VRAM, _ = xsysinfo.TotalAvailableVRAM()
+	gpu.ComputeCapability = xsysinfo.NVIDIAComputeCapability()
+	return gpu
+}
+
+// ApplyHardwareDefaults fills ModelConfig values that depend on the target GPU
+// and were left unset by the user: a larger physical batch on Blackwell and a
+// VRAM-scaled "parallel:N" option. Explicit config always wins: a non-zero
+// Batch or an existing parallel/n_parallel option is left untouched.
+func ApplyHardwareDefaults(cfg *ModelConfig, gpu GPU) {
+	if cfg == nil {
+		return
+	}
+	if cfg.Batch == 0 && gpu.IsNVIDIABlackwell() {
+		cfg.Batch = BlackwellPhysicalBatch
+		xlog.Debug("[hardware_defaults] Blackwell GPU: defaulting physical batch",
+			"batch", cfg.Batch, "compute_cap", gpu.ComputeCapability)
+	}
+
+	// Enable concurrent serving by default on a capable GPU: without this the
+	// llama.cpp backend runs n_parallel=1 and serializes multi-user requests
+	// (continuous batching stays off). Unified KV means the slots share the
+	// context budget, so this is concurrency without extra KV memory. Explicit
+	// parallel/n_parallel in the model options always wins.
+	before := len(cfg.Options)
+	cfg.Options = EnsureParallelOption(cfg.Options, gpu)
+	if len(cfg.Options) > before {
+		xlog.Debug("[hardware_defaults] defaulting parallel slots for concurrent serving",
+			"option", cfg.Options[len(cfg.Options)-1], "vram_gib", gpu.VRAM>>30)
+	}
+}
+
+// parseComputeCapability splits a "major.minor" string into integer parts. It
+// returns (-1, -1) on an empty or non-numeric major; a bad minor becomes 0.
+func parseComputeCapability(cc string) (int, int) {
+	trimmed := strings.TrimSpace(cc)
+	if trimmed == "" {
+		return -1, -1
+	}
+	majorPart, minorPart := trimmed, "0"
+	if i := strings.IndexByte(trimmed, '.'); i >= 0 {
+		majorPart, minorPart = trimmed[:i], trimmed[i+1:]
+	}
+	major, err := strconv.Atoi(strings.TrimSpace(majorPart))
+	if err != nil {
+		return -1, -1
+	}
+	minor, err := strconv.Atoi(strings.TrimSpace(minorPart))
+	if err != nil {
+		minor = 0
+	}
+	return major, minor
+}
--- a/core/config/hardware_defaults_internal_test.go
+++ b/core/config/hardware_defaults_internal_test.go
@@ -0,0 +1,37 @@
+package config
+
+import (
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// Single-instance path: SetDefaults applies hardware defaults from the local
+// GPU. The detection seam (localGPU) is swapped for a fake in each spec so the
+// path is deterministic without a real GPU; the fakes leave VRAM at 0.
+var _ = Describe("SetDefaults hardware defaults (single-instance)", func() {
+	var orig func() GPU
+	BeforeEach(func() { orig = localGPU })
+	AfterEach(func() { localGPU = orig })
+
+	It("sets the physical batch on a local Blackwell GPU", func() {
+		localGPU = func() GPU { return GPU{ComputeCapability: "12.1"} }
+		cfg := &ModelConfig{}
+		cfg.SetDefaults()
+		Expect(cfg.Batch).To(Equal(BlackwellPhysicalBatch))
+	})
+
+	It("leaves batch unset on a non-Blackwell local GPU", func() {
+		localGPU = func() GPU { return GPU{ComputeCapability: "8.9"} }
+		cfg := &ModelConfig{}
+		cfg.SetDefaults()
+		Expect(cfg.Batch).To(Equal(0))
+	})
+
+	It("never overrides an explicit batch", func() {
+		localGPU = func() GPU { return GPU{ComputeCapability: "12.1"} }
+		cfg := &ModelConfig{}
+		cfg.Batch = 1024
+		cfg.SetDefaults()
+		Expect(cfg.Batch).To(Equal(1024))
+	})
+})
--- a/core/config/hardware_defaults_test.go
+++ b/core/config/hardware_defaults_test.go
@@ -0,0 +1,97 @@
+package config_test
+
+import (
+	. "github.com/mudler/LocalAI/core/config"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("Hardware-driven config defaults", func() {
+	DescribeTable("GPU.IsNVIDIABlackwell (sm_12x consumer family)",
+		func(cc string, want bool) {
+			Expect(GPU{ComputeCapability: cc}.IsNVIDIABlackwell()).To(Equal(want))
+		},
+		Entry("GB10 12.1", "12.1", true),
+		Entry("RTX 50 12.0", "12.0", true),
+		Entry("future 13.0", "13.0", true), // any major >= 12 matches, not only 12.x
+		Entry("Hopper 9.0", "9.0", false),
+		Entry("Ada 8.9", "8.9", false),
+		Entry("datacenter Blackwell sm_100 10.0", "10.0", false),
+		Entry("unknown", "", false),
+	)
+
+	Describe("PhysicalBatch / IsManagedPhysicalBatch", func() {
+		It("returns the Blackwell batch on Blackwell", func() {
+			Expect(PhysicalBatch(GPU{ComputeCapability: "12.1"})).To(Equal(BlackwellPhysicalBatch))
+		})
+		It("returns the default batch otherwise", func() {
+			Expect(PhysicalBatch(GPU{ComputeCapability: "9.0"})).To(Equal(DefaultPhysicalBatch))
+			Expect(PhysicalBatch(GPU{})).To(Equal(DefaultPhysicalBatch))
+		})
+		It("recognizes managed defaults but not explicit values", func() {
+			Expect(IsManagedPhysicalBatch(DefaultPhysicalBatch)).To(BeTrue())
+			Expect(IsManagedPhysicalBatch(BlackwellPhysicalBatch)).To(BeTrue())
+			Expect(IsManagedPhysicalBatch(1024)).To(BeFalse())
+		})
+	})
+
+	Describe("ApplyHardwareDefaults", func() {
+		It("raises an unset batch to 2048 on Blackwell", func() {
+			cfg := &ModelConfig{}
+			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1"})
+			Expect(cfg.Batch).To(Equal(BlackwellPhysicalBatch))
+		})
+		It("leaves batch unset on non-Blackwell", func() {
+			cfg := &ModelConfig{}
+			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "9.0"})
+			Expect(cfg.Batch).To(Equal(0))
+		})
+		It("never overrides an explicit batch", func() {
+			cfg := &ModelConfig{}
+			cfg.Batch = 1024
+			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1"})
+			Expect(cfg.Batch).To(Equal(1024))
+		})
+		It("no-ops on nil", func() {
+			Expect(func() { ApplyHardwareDefaults(nil, GPU{ComputeCapability: "12.1"}) }).ToNot(Panic())
+		})
+	})
+
+	const gib = uint64(1) << 30 // bytes per GiB, matching the unit of GPU.VRAM
+
+	DescribeTable("DefaultParallelSlots (by VRAM)",
+		func(vramGiB uint64, want int) {
+			Expect(DefaultParallelSlots(GPU{VRAM: vramGiB * gib})).To(Equal(want))
+		},
+		Entry("GB10 119 GiB", uint64(119), 8),
+		Entry("48 GiB", uint64(48), 8),
+		Entry("24 GiB", uint64(24), 4),
+		Entry("8 GiB", uint64(8), 4),
+		Entry("6 GiB", uint64(6), 2),
+		Entry("2 GiB", uint64(2), 1),
+		Entry("unknown 0", uint64(0), 1),
+	)
+
+	Describe("ApplyHardwareDefaults parallel slots", func() {
+		It("adds a VRAM-scaled parallel option on a capable GPU", func() {
+			cfg := &ModelConfig{}
+			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1", VRAM: 119 * gib})
+			Expect(cfg.Options).To(ContainElement("parallel:8"))
+		})
+		It("scales the slot count down with VRAM", func() {
+			cfg := &ModelConfig{}
+			ApplyHardwareDefaults(cfg, GPU{VRAM: 24 * gib})
+			Expect(cfg.Options).To(ContainElement("parallel:4"))
+		})
+		It("adds no parallel option on small/unknown VRAM", func() {
+			cfg := &ModelConfig{}
+			ApplyHardwareDefaults(cfg, GPU{VRAM: 2 * gib})
+			Expect(cfg.Options).ToNot(ContainElement(ContainSubstring("parallel")))
+		})
+		It("never overrides an explicit parallel option", func() {
+			cfg := &ModelConfig{Options: []string{"parallel:2"}}
+			ApplyHardwareDefaults(cfg, GPU{VRAM: 119 * gib})
+			Expect(cfg.Options).To(Equal([]string{"parallel:2"}))
+		})
+	})
+})
--- a/core/config/hooks_llamacpp.go
+++ b/core/config/hooks_llamacpp.go
@@ -34,7 +34,7 @@ func llamaCppDefaults(cfg *ModelConfig, modelPath string) {
 	// Default context size if not set, regardless of whether GGUF parsing succeeds
 	defer func() {
 		if cfg.ContextSize == nil {
-			ctx := defaultContextSize
+			ctx := GGUFFallbackContextSize
 			cfg.ContextSize = &ctx
 		}
 	}()
--- a/core/config/meta/constants.go
+++ b/core/config/meta/constants.go
@@ -68,6 +68,7 @@ var UsecaseOptions = []FieldOption{
 	{Value: "face_recognition", Label: "Face Recognition"},
 	{Value: "transcript", Label: "Transcript"},
 	{Value: "diarization", Label: "Diarization"},
+	{Value: "sound_classification", Label: "Sound Classification"},
 	{Value: "speaker_recognition", Label: "Speaker Recognition"},
 	{Value: "tts", Label: "TTS"},
 	{Value: "sound_generation", Label: "Sound Generation"},
--- a/core/config/meta/registry.go
+++ b/core/config/meta/registry.go
@@ -286,6 +286,15 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Order:       45,
 		},

+		// --- Alias ---
+		"alias": {
+			Section:     "alias",
+			Label:       "Alias target",
+			Description: "Redirect all traffic for this model to another configured model. When set, every other field on this config is ignored and requests are served by the target model.",
+			Component:   "model-select",
+			Order:       0,
+		},
+
 		// --- Pipeline ---
 		"pipeline.llm": {
 			Section:              "pipeline",
@@ -319,6 +328,30 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			AutocompleteProvider: ProviderModelsVAD,
 			Order:                63,
 		},
+		"pipeline.sound_detection": {
+			Section:              "pipeline",
+			Label:                "Sound Detection Model",
+			Description:          "Model to use for sound-event classification (audio tagging, e.g. ced) in the pipeline. When set, committed realtime audio is also classified and the scored AudioSet tags are emitted as a conversation.item.sound_detection event.",
+			Component:            "model-select",
+			AutocompleteProvider: ProviderModels,
+			Order:                64,
+		},
+		"pipeline.sound_detection_window_ms": {
+			Section:     "pipeline",
+			Label:       "Sound Detection Window (ms)",
+			Description: "Server-side windowing for a sound-only realtime session: length in ms of the audio window classified each hop. 0 = client-driven (the client commits windows).",
+			Component:   "number",
+			Min:         f64(0),
+			Order:       65,
+		},
+		"pipeline.sound_detection_hop_ms": {
+			Section:     "pipeline",
+			Label:       "Sound Detection Hop (ms)",
+			Description: "Server-side windowing hop in ms: how often the server classifies the last window. 0 = client-driven.",
+			Component:   "number",
+			Min:         f64(0),
+			Order:       66,
+		},
 		"pipeline.reasoning_effort": {
 			Section:     "pipeline",
 			Label:       "Reasoning Effort",
@@ -448,6 +481,55 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Component:   "json-editor",
 			Order:       78,
 		},
+		"pipeline.voice_recognition.enforce": {
+			Section:     "pipeline",
+			Label:       "Voice Gate Enforce",
+			Description: "Whether the gate rejects unauthorized speakers. Enabled (default) drops unauthorized utterances before the LLM. Disabled still resolves and surfaces the speaker (for the conversation.item.speaker event and personalization) but never drops a turn.",
+			Component:   "toggle",
+			Order:       80,
+		},
+		"pipeline.voice_recognition.identity.announce": {
+			Section:     "pipeline",
+			Label:       "Speaker Identity Announce",
+			Description: "Emit a conversation.item.speaker event to the client naming the recognized speaker. When set, identity is resolved on every turn even if 'when' is 'first'.",
+			Component:   "toggle",
+			Order:       81,
+		},
+		"pipeline.voice_recognition.identity.announce_unknown": {
+			Section:     "pipeline",
+			Label:       "Speaker Identity Announce Unknown",
+			Description: "Also emit the conversation.item.speaker event (with matched=false) when no confident match is found. Default only announces on a match.",
+			Component:   "toggle",
+			Order:       82,
+		},
+		"pipeline.voice_recognition.identity.personalize": {
+			Section:     "pipeline",
+			Label:       "Speaker Identity Personalize",
+			Description: "Inform the LLM who is speaking so it can tailor replies. Enables the name and system-note injection below.",
+			Component:   "toggle",
+			Order:       83,
+		},
+		"pipeline.voice_recognition.identity.inject_name": {
+			Section:     "pipeline",
+			Label:       "Speaker Identity Inject Name",
+			Description: "Personalization: set the per-message OpenAI 'name' field on each user turn to the recognized speaker.",
+			Component:   "toggle",
+			Order:       84,
+		},
+		"pipeline.voice_recognition.identity.inject_system_note": {
+			Section:     "pipeline",
+			Label:       "Speaker Identity Inject System Note",
+			Description: "Personalization: append a 'The current speaker is <name>.' note to the system message reflecting the latest speaker.",
+			Component:   "toggle",
+			Order:       85,
+		},
+		"pipeline.voice_recognition.identity.note_unknown": {
+			Section:     "pipeline",
+			Label:       "Speaker Identity Note Unknown",
+			Description: "Personalization: when the speaker is unidentified, append 'The current speaker is unknown.' to the system message so the model can ask who it is talking to.",
+			Component:   "toggle",
+			Order:       86,
+		},
 		"pipeline.max_history_items": {
 			Section:     "pipeline",
 			Label:       "Max History Items",
--- a/core/config/meta/registry_test.go
+++ b/core/config/meta/registry_test.go
@@ -0,0 +1,28 @@
+package meta_test
+
+import (
+	"github.com/mudler/LocalAI/core/config/meta"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("alias field metadata", func() {
+	It("registers the alias field as a model-select in the alias section", func() {
+		reg := meta.DefaultRegistry()
+		f, ok := reg["alias"] // presumably keyed by the field's YAML path — confirm against the registry builder
+		Expect(ok).To(BeTrue(), "alias field should have a registry override")
+		Expect(f.Section).To(Equal("alias"))
+		Expect(f.Component).To(Equal("model-select"))
+	})
+
+	It("defines an alias section", func() {
+		var found bool
+		for _, s := range meta.DefaultSections() {
+			if s.ID == "alias" {
+				found = true
+			}
+		}
+		Expect(found).To(BeTrue(), "DefaultSections should include an alias section")
+	})
+})
--- a/core/config/meta/types.go
+++ b/core/config/meta/types.go
@@ -69,6 +69,7 @@ type FieldMetaOverride struct {
 func DefaultSections() []Section {
 	return []Section{
 		{ID: "general", Label: "General", Icon: "settings", Order: 0},
+		{ID: "alias", Label: "Alias", Icon: "git-merge", Order: 5},
 		{ID: "llm", Label: "LLM", Icon: "cpu", Order: 10},
 		{ID: "parameters", Label: "Parameters", Icon: "sliders", Order: 20},
 		{ID: "templates", Label: "Templates", Icon: "file-text", Order: 30},
--- a/core/config/model_config.go
+++ b/core/config/model_config.go
@@ -37,6 +37,12 @@ type ModelConfig struct {
 	schema.PredictionOptions `yaml:"parameters,omitempty" json:"parameters,omitempty"`
 	Name                     string `yaml:"name,omitempty" json:"name,omitempty"`

+	// Alias, when set, makes this config a pure redirect: every request for
+	// Name is served by the model named here. All other fields are ignored.
+	// The target must be an existing, non-alias model (enforced at load and
+	// at create/swap time). See docs/content for Model Aliases.
+	Alias string `yaml:"alias,omitempty" json:"alias,omitempty"`
+
 	F16                 *bool               `yaml:"f16,omitempty" json:"f16,omitempty"`
 	Threads             *int                `yaml:"threads,omitempty" json:"threads,omitempty"`
 	Debug               *bool               `yaml:"debug,omitempty" json:"debug,omitempty"`
@@ -391,6 +397,10 @@ func (c *ModelConfig) HasRouter() bool {
 	return len(c.Router.Candidates) > 0
 }

+// IsAlias reports whether this config is a pure redirect, i.e. Alias is non-empty.
+// It has a value receiver so non-addressable config values can call it too.
+func (c ModelConfig) IsAlias() bool { return len(c.Alias) > 0 }
+
 // @Description PII filtering configuration. PII redaction is per-model so
 // that local models don't pay the latency or behaviour change of regex
 // scanning, while cloud-bound traffic (cloud-proxy backend) can default to
@@ -594,6 +604,20 @@ type Pipeline struct {
 	LLM           string `yaml:"llm,omitempty" json:"llm,omitempty"`
 	Transcription string `yaml:"transcription,omitempty" json:"transcription,omitempty"`
 	VAD           string `yaml:"vad,omitempty" json:"vad,omitempty"`
+	// SoundDetection names a sound-event-classification model (e.g. ced). When
+	// set, each VAD-committed realtime utterance is also run through it and the
+	// scored AudioSet tags are emitted as a conversation.item.sound_detection
+	// server event, alongside (and independent of) transcription.
+	SoundDetection string `yaml:"sound_detection,omitempty" json:"sound_detection,omitempty"`
+
+	// SoundDetectionWindowMs / SoundDetectionHopMs enable server-side windowing
+	// for a sound-detection-only realtime session: instead of the client
+	// committing audio buffers, the server classifies the last WindowMs of
+	// streamed audio every HopMs and emits a sound_detection event per hop. Both
+	// must be > 0 to activate; otherwise the session stays client-driven (the
+	// client commits windows via input_audio_buffer.commit).
+	SoundDetectionWindowMs int `yaml:"sound_detection_window_ms,omitempty" json:"sound_detection_window_ms,omitempty"`
+	SoundDetectionHopMs    int `yaml:"sound_detection_hop_ms,omitempty" json:"sound_detection_hop_ms,omitempty"`

 	// ReasoningEffort sets the reasoning effort (none|minimal|low|medium|high) for
 	// the pipeline's LLM without editing the LLM model config. Overrides the LLM's
@@ -759,6 +783,13 @@ type PipelineVoiceRecognition struct {
 	Allow VoiceRecognitionAllow `yaml:"allow,omitempty" json:"allow,omitempty"`
 	// References are the authorized reference speakers (verify mode).
 	References []VoiceReference `yaml:"references,omitempty" json:"references,omitempty"`
+	// Enforce controls the authorization gate. A nil value or true rejects
+	// unauthorized speakers (the historical behavior). false resolves the
+	// speaker's identity for surfacing/personalization but never drops a turn.
+	Enforce *bool `yaml:"enforce,omitempty" json:"enforce,omitempty"`
+	// Identity surfaces the recognized speaker to the client and the LLM. It is
+	// independent of Enforce: identity can be surfaced without gating.
+	Identity *VoiceIdentityConfig `yaml:"identity,omitempty" json:"identity,omitempty"`
 }

 // @Description VoiceRecognitionAllow filters authorized registry identities.
@@ -775,6 +806,25 @@ type VoiceReference struct {
 	Audio string `yaml:"audio,omitempty" json:"audio,omitempty"`
 }

+// @Description VoiceIdentityConfig surfaces the recognized speaker to the realtime
+// client and the LLM. When set, identity is resolved on every turn even if the
+// gate's When is "first" (the gate still authorizes only once).
+type VoiceIdentityConfig struct {
+	// Announce emits a conversation.item.speaker event to the client.
+	Announce bool `yaml:"announce,omitempty" json:"announce,omitempty"`
+	// AnnounceUnknown also emits the event when there is no confident match.
+	AnnounceUnknown bool `yaml:"announce_unknown,omitempty" json:"announce_unknown,omitempty"`
+	// Personalize informs the LLM who is speaking (the switch for the injection options below).
+	Personalize bool `yaml:"personalize,omitempty" json:"personalize,omitempty"`
+	// InjectName sets the per-message name field on each user turn.
+	InjectName bool `yaml:"inject_name,omitempty" json:"inject_name,omitempty"`
+	// InjectSystemNote maintains a "current speaker" note in the system message.
+	InjectSystemNote bool `yaml:"inject_system_note,omitempty" json:"inject_system_note,omitempty"`
+	// NoteUnknown adds a "the current speaker is unknown" note (enables the model
+	// to ask who it is talking to).
+	NoteUnknown bool `yaml:"note_unknown,omitempty" json:"note_unknown,omitempty"`
+}
+
 // VoiceGateEnabled reports whether a voice-recognition gate is configured. The
 // mere presence of the block is the intent signal: a present-but-incomplete
 // block (e.g. missing model) must fail closed at construction, not be silently
@@ -783,6 +833,28 @@ func (p Pipeline) VoiceGateEnabled() bool {
 	return p.VoiceRecognition != nil
 }

+// EnforceGate reports whether unauthorized speakers are rejected. An unset
+// (nil) Enforce counts as true, so existing configs keep gating.
+func (v PipelineVoiceRecognition) EnforceGate() bool {
+	return v.Enforce == nil || *v.Enforce
+}
+
+// IdentityEnabled reports whether the speaker's identity must be resolved,
+// i.e. whether announcing or personalization is switched on.
+func (v PipelineVoiceRecognition) IdentityEnabled() bool {
+	return v.AnnounceEnabled() || v.PersonalizeEnabled()
+}
+
+// AnnounceEnabled reports whether the conversation.item.speaker event is emitted (Identity set with Announce on).
+func (v PipelineVoiceRecognition) AnnounceEnabled() bool {
+	return v.Identity != nil && v.Identity.Announce
+}
+
+// PersonalizeEnabled reports whether the LLM is told who is speaking (Identity set with Personalize on).
+func (v PipelineVoiceRecognition) PersonalizeEnabled() bool {
+	return v.Identity != nil && v.Identity.Personalize
+}
+
 // Normalize fills in defaults in place for omitted fields.
 func (v *PipelineVoiceRecognition) Normalize() {
 	if v.Mode == "" {
@@ -1111,107 +1183,22 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
 	// This ensures gallery-installed and runtime-loaded models get optimal parameters.
 	ApplyInferenceDefaults(cfg, cfg.Name, cfg.Model)

-	// https://github.com/ggerganov/llama.cpp/blob/75cd4c77292034ecec587ecb401366f57338f7c0/common/sampling.h#L22
-	defaultTopP := 0.95
-	defaultTopK := 40
-	defaultMinP := 0.0
-	defaultTemp := 0.9
-	// https://github.com/mudler/LocalAI/issues/2780
-	defaultMirostat := 0
-	defaultMirostatTAU := 5.0
-	defaultMirostatETA := 0.1
-	defaultTypicalP := 1.0
-	defaultTFZ := 1.0
-	defaultZero := 0
+	// Apply hardware-driven defaults (e.g. a larger physical batch on Blackwell).
+	// Uses the local GPU here; in distributed mode the router re-applies the same
+	// heuristics for the selected node's GPU before loading. Explicit config wins.
+	ApplyHardwareDefaults(cfg, localGPU())
+
+	// Apply serving-policy defaults (device-independent): cross-request prefix
+	// caching. Propagates to distributed nodes via the model options.
+	ApplyServingDefaults(cfg)
+
+	// Generic fallback defaults (sampling params + runtime flags), applied after
+	// the model-family / hardware / serving tiers above. Only fills unset values.
+	ApplyGenericDefaults(cfg)

 	trueV := true
 	falseV := false

-	if cfg.Seed == nil {
-		//  random number generator seed
-		defaultSeed := RAND_SEED
-		cfg.Seed = &defaultSeed
-	}
-
-	// top_k=40 is llama.cpp's sampling default and is wrong for backends whose
-	// native default differs (issue #6632). Only inject it for the llama.cpp
-	// family and the empty/auto backend; leave TopK nil for known non-llama
-	// backends (e.g. mlx, whose intended default is top_k=0) so the wire value
-	// is 0 rather than a silently-changed 40.
-	if cfg.TopK == nil && UsesLlamaSamplerDefaults(cfg.Backend) {
-		cfg.TopK = &defaultTopK
-	}
-
-	if cfg.MinP == nil {
-		cfg.MinP = &defaultMinP
-	}
-
-	if cfg.TypicalP == nil {
-		cfg.TypicalP = &defaultTypicalP
-	}
-
-	if cfg.TFZ == nil {
-		cfg.TFZ = &defaultTFZ
-	}
-
-	if cfg.MMap == nil {
-		// MMap is enabled by default
-
-		// Only exception is for Intel GPUs
-		if os.Getenv("XPU") != "" {
-			cfg.MMap = &falseV
-		} else {
-			cfg.MMap = &trueV
-		}
-	}
-
-	if cfg.MMlock == nil {
-		// MMlock is disabled by default
-		cfg.MMlock = &falseV
-	}
-
-	if cfg.TopP == nil {
-		cfg.TopP = &defaultTopP
-	}
-	if cfg.Temperature == nil {
-		cfg.Temperature = &defaultTemp
-	}
-
-	if cfg.Maxtokens == nil {
-		cfg.Maxtokens = &defaultZero
-	}
-
-	if cfg.Mirostat == nil {
-		cfg.Mirostat = &defaultMirostat
-	}
-
-	if cfg.MirostatETA == nil {
-		cfg.MirostatETA = &defaultMirostatETA
-	}
-
-	if cfg.MirostatTAU == nil {
-		cfg.MirostatTAU = &defaultMirostatTAU
-	}
-
-	if cfg.LowVRAM == nil {
-		cfg.LowVRAM = &falseV
-	}
-
-	if cfg.Embeddings == nil {
-		cfg.Embeddings = &falseV
-	}
-
-	if cfg.Reranking == nil {
-		cfg.Reranking = &falseV
-	}
-
-	if cfg.PromptCacheAll == nil {
-		// Match upstream llama.cpp's default (common/common.h: cache_prompt = true)
-		// and let cache_idle_slots / kv_unified actually do useful work; users can
-		// opt out with an explicit `prompt_cache_all: false` in the model YAML.
-		cfg.PromptCacheAll = &trueV
-	}
-
 	if threads == 0 {
 		// Threads can't be 0
 		threads = 4
@@ -1243,6 +1230,22 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
 }

 func (c *ModelConfig) Validate() (bool, error) {
+	// An alias is a pure redirect: validate only its own shape here. Target
+	// existence and the no-chain rule need the full config set, so the loader
+	// (load-time) and the create/swap endpoints enforce those.
+	if c.IsAlias() {
+		if c.Name == "" {
+			return false, fmt.Errorf("alias config requires a name")
+		}
+		if c.Alias == c.Name {
+			return false, fmt.Errorf("alias %q cannot point to itself", c.Name)
+		}
+		if c.Backend != "" || c.Model != "" {
+			return false, fmt.Errorf("alias config %q must not set backend or parameters.model: an alias is a pure redirect", c.Name)
+		}
+		return true, nil
+	}
+
 	downloadedFileNames := []string{}
 	for _, f := range c.DownloadFiles {
 		downloadedFileNames = append(downloadedFileNames, f.Filename)
@@ -1463,6 +1466,11 @@ const (
 	// so it may combine freely with other usecases.
 	FLAG_TOKEN_CLASSIFY ModelConfigUsecase = 0b1000000000000000000000

+	// Marks a model as wired for the SoundDetection gRPC primitive
+	// (audio tagging / sound-event classification — scored AudioSet
+	// labels via the SoundDetection RPC, e.g. ced).
+	FLAG_SOUND_CLASSIFICATION ModelConfigUsecase = 0b10000000000000000000000
+
 	// Common Subsets
 	FLAG_LLM ModelConfigUsecase = FLAG_CHAT | FLAG_COMPLETION | FLAG_EDIT
 )
@@ -1471,12 +1479,12 @@ const (
 // Flags within the same group are NOT orthogonal (e.g., chat and completion are
 // both text/language). A model is multimodal when its usecases span 2+ groups.
 var ModalityGroups = []ModelConfigUsecase{
-	FLAG_CHAT | FLAG_COMPLETION | FLAG_EDIT,                // text/language
-	FLAG_VISION | FLAG_DETECTION,                           // visual understanding
-	FLAG_TRANSCRIPT | FLAG_REALTIME_AUDIO,                  // speech input — realtime_audio is any-to-any, so it counts here too
-	FLAG_TTS | FLAG_SOUND_GENERATION | FLAG_REALTIME_AUDIO, // audio output — and here, so a lone realtime_audio flag still reads as multimodal
-	FLAG_AUDIO_TRANSFORM,                                   // audio in/out transforms
-	FLAG_IMAGE | FLAG_VIDEO,                                // visual generation
+	FLAG_CHAT | FLAG_COMPLETION | FLAG_EDIT,                           // text/language
+	FLAG_VISION | FLAG_DETECTION,                                      // visual understanding
+	FLAG_TRANSCRIPT | FLAG_REALTIME_AUDIO | FLAG_SOUND_CLASSIFICATION, // audio input — realtime_audio is any-to-any, so it counts here too
+	FLAG_TTS | FLAG_SOUND_GENERATION | FLAG_REALTIME_AUDIO,            // audio output — and here, so a lone realtime_audio flag still reads as multimodal
+	FLAG_AUDIO_TRANSFORM,                                              // audio in/out transforms
+	FLAG_IMAGE | FLAG_VIDEO,                                           // visual generation
 }

 // IsMultimodal returns true if the given usecases span two or more orthogonal
@@ -1499,29 +1507,30 @@ func GetAllModelConfigUsecases() map[string]ModelConfigUsecase {
 	return map[string]ModelConfigUsecase{
 		// Note: FLAG_ANY is intentionally excluded from this map
 		// because it's 0 and would always match in HasUsecases checks
-		"FLAG_CHAT":                FLAG_CHAT,
-		"FLAG_COMPLETION":          FLAG_COMPLETION,
-		"FLAG_EDIT":                FLAG_EDIT,
-		"FLAG_EMBEDDINGS":          FLAG_EMBEDDINGS,
-		"FLAG_RERANK":              FLAG_RERANK,
-		"FLAG_IMAGE":               FLAG_IMAGE,
-		"FLAG_TRANSCRIPT":          FLAG_TRANSCRIPT,
-		"FLAG_TTS":                 FLAG_TTS,
-		"FLAG_SOUND_GENERATION":    FLAG_SOUND_GENERATION,
-		"FLAG_TOKENIZE":            FLAG_TOKENIZE,
-		"FLAG_VAD":                 FLAG_VAD,
-		"FLAG_LLM":                 FLAG_LLM,
-		"FLAG_VIDEO":               FLAG_VIDEO,
-		"FLAG_DETECTION":           FLAG_DETECTION,
-		"FLAG_VISION":              FLAG_VISION,
-		"FLAG_FACE_RECOGNITION":    FLAG_FACE_RECOGNITION,
-		"FLAG_SPEAKER_RECOGNITION": FLAG_SPEAKER_RECOGNITION,
-		"FLAG_AUDIO_TRANSFORM":     FLAG_AUDIO_TRANSFORM,
-		"FLAG_DIARIZATION":         FLAG_DIARIZATION,
-		"FLAG_REALTIME_AUDIO":      FLAG_REALTIME_AUDIO,
-		"FLAG_SCORE":               FLAG_SCORE,
-		"FLAG_DEPTH":               FLAG_DEPTH,
-		"FLAG_TOKEN_CLASSIFY":      FLAG_TOKEN_CLASSIFY,
+		"FLAG_CHAT":                 FLAG_CHAT,
+		"FLAG_COMPLETION":           FLAG_COMPLETION,
+		"FLAG_EDIT":                 FLAG_EDIT,
+		"FLAG_EMBEDDINGS":           FLAG_EMBEDDINGS,
+		"FLAG_RERANK":               FLAG_RERANK,
+		"FLAG_IMAGE":                FLAG_IMAGE,
+		"FLAG_TRANSCRIPT":           FLAG_TRANSCRIPT,
+		"FLAG_TTS":                  FLAG_TTS,
+		"FLAG_SOUND_GENERATION":     FLAG_SOUND_GENERATION,
+		"FLAG_TOKENIZE":             FLAG_TOKENIZE,
+		"FLAG_VAD":                  FLAG_VAD,
+		"FLAG_LLM":                  FLAG_LLM,
+		"FLAG_VIDEO":                FLAG_VIDEO,
+		"FLAG_DETECTION":            FLAG_DETECTION,
+		"FLAG_VISION":               FLAG_VISION,
+		"FLAG_FACE_RECOGNITION":     FLAG_FACE_RECOGNITION,
+		"FLAG_SPEAKER_RECOGNITION":  FLAG_SPEAKER_RECOGNITION,
+		"FLAG_AUDIO_TRANSFORM":      FLAG_AUDIO_TRANSFORM,
+		"FLAG_DIARIZATION":          FLAG_DIARIZATION,
+		"FLAG_SOUND_CLASSIFICATION": FLAG_SOUND_CLASSIFICATION,
+		"FLAG_REALTIME_AUDIO":       FLAG_REALTIME_AUDIO,
+		"FLAG_SCORE":                FLAG_SCORE,
+		"FLAG_DEPTH":                FLAG_DEPTH,
+		"FLAG_TOKEN_CLASSIFY":       FLAG_TOKEN_CLASSIFY,
 	}
 }

@@ -1724,6 +1733,16 @@ func (c *ModelConfig) GuessUsecases(u ModelConfigUsecase) bool {
 		}
 	}

+	if (u & FLAG_SOUND_CLASSIFICATION) == FLAG_SOUND_CLASSIFICATION {
+		// ced is a sound-event tagger (AudioSet labels) surfaced via the
+		// SoundDetection gRPC. Models without an explicit known_usecases
+		// still surface when they run on one of these backends.
+		soundClassificationBackends := []string{"ced"}
+		if !slices.Contains(soundClassificationBackends, c.Backend) {
+			return false
+		}
+	}
+
 	if (u & FLAG_REALTIME_AUDIO) == FLAG_REALTIME_AUDIO {
 		// Backends that own a single any-to-any loop and implement
 		// AudioToAudioStream — listed here so models without an explicit
--- a/core/config/model_config_loader.go
+++ b/core/config/model_config_loader.go
@@ -294,6 +294,44 @@ func (bcl *ModelConfigLoader) UpdateModelConfig(m string, updater func(*ModelCon
 	}
 }

+// ResolveAlias dereferences an alias config, following exactly one hop.
+// It returns (resolved, wasAlias, err): a nil or non-alias cfg is handed back
+// as-is with wasAlias=false. For an alias the target has to exist and must not
+// be an alias itself (no chains); the result points at a copy of the target.
+func (bcl *ModelConfigLoader) ResolveAlias(cfg *ModelConfig) (*ModelConfig, bool, error) {
+	if cfg == nil || !cfg.IsAlias() {
+		return cfg, false, nil
+	}
+	resolved, found := bcl.GetModelConfig(cfg.Alias)
+	switch {
+	case !found:
+		return nil, true, fmt.Errorf("alias %q points to unknown model %q", cfg.Name, cfg.Alias)
+	case resolved.IsAlias():
+		return nil, true, fmt.Errorf("alias %q points to another alias %q (chains are not allowed)", cfg.Name, cfg.Alias)
+	}
+	return &resolved, true, nil
+}
+
+// ValidateAliasTarget vets the target of an alias config at create/swap time.
+// It errors when the target is missing, is itself an alias, or is disabled,
+// and returns nil for a nil or non-alias cfg.
+func (bcl *ModelConfigLoader) ValidateAliasTarget(cfg *ModelConfig) error {
+	if cfg == nil || !cfg.IsAlias() {
+		return nil
+	}
+	target, found := bcl.GetModelConfig(cfg.Alias)
+	switch {
+	case !found:
+		return fmt.Errorf("alias target %q does not exist", cfg.Alias)
+	case target.IsAlias():
+		return fmt.Errorf("alias target %q is itself an alias (chains are not allowed)", cfg.Alias)
+	case target.IsDisabled():
+		return fmt.Errorf("alias target %q is disabled", cfg.Alias)
+	default:
+		return nil
+	}
+}
+
 // Preload prepare models if they are not local but url or huggingface repositories
 func (bcl *ModelConfigLoader) Preload(modelPath string) error {
 	bcl.Lock()
@@ -475,5 +513,21 @@ func (bcl *ModelConfigLoader) LoadModelConfigsFromPath(path string, opts ...Conf
 		}
 	}

+	// Surface aliases whose targets are missing or themselves aliases. These
+	// resolve to a clear request-time error; warning here gives operators
+	// visibility without failing startup.
+	for name, c := range bcl.configs {
+		if !c.IsAlias() {
+			continue
+		}
+		target, ok := bcl.configs[c.Alias]
+		switch {
+		case !ok:
+			xlog.Warn("alias points to unknown model", "alias", name, "target", c.Alias)
+		case target.IsAlias():
+			xlog.Warn("alias points to another alias (chains are not allowed)", "alias", name, "target", c.Alias)
+		}
+	}
+
 	return nil
 }
--- a/core/config/model_config_loader_test.go
+++ b/core/config/model_config_loader_test.go
@@ -61,3 +61,51 @@ var _ = Describe("ModelConfigLoader.GetModelsConflictingWith", func() {
 		Expect(bcl.GetModelsConflictingWith("a")).To(ConsistOf("b"))
 	})
 })
+
+var _ = Describe("ModelConfigLoader alias resolution", func() {
+	var loader *ModelConfigLoader
+
+	BeforeEach(func() {
+		loader = NewModelConfigLoader("")
+		loader.configs["real"] = ModelConfig{Name: "real", Backend: "llama-cpp"}
+		loader.configs["gpt-4"] = ModelConfig{Name: "gpt-4", Alias: "real"}
+		loader.configs["chain"] = ModelConfig{Name: "chain", Alias: "gpt-4"}
+		loader.configs["dangling"] = ModelConfig{Name: "dangling", Alias: "nope"}
+	})
+
+	It("returns non-alias configs unchanged", func() {
+		cfg := loader.configs["real"]
+		got, was, err := loader.ResolveAlias(&cfg)
+		Expect(err).ToNot(HaveOccurred())
+		Expect(was).To(BeFalse())
+		Expect(got.Name).To(Equal("real"))
+	})
+
+	It("resolves an alias to its target", func() {
+		cfg := loader.configs["gpt-4"]
+		got, was, err := loader.ResolveAlias(&cfg)
+		Expect(err).ToNot(HaveOccurred())
+		Expect(was).To(BeTrue())
+		Expect(got.Name).To(Equal("real"))
+	})
+
+	It("rejects an alias chain", func() {
+		cfg := loader.configs["chain"]
+		_, was, err := loader.ResolveAlias(&cfg)
+		Expect(was).To(BeTrue())
+		Expect(err).To(MatchError(ContainSubstring("chains are not allowed")))
+	})
+
+	It("rejects a dangling alias", func() {
+		cfg := loader.configs["dangling"]
+		_, _, err := loader.ResolveAlias(&cfg)
+		Expect(err).To(MatchError(ContainSubstring("unknown model")))
+	})
+
+	It("ValidateAliasTarget passes for a real target and fails for a chain", func() {
+		good := loader.configs["gpt-4"]
+		Expect(loader.ValidateAliasTarget(&good)).ToNot(HaveOccurred())
+		bad := loader.configs["chain"]
+		Expect(loader.ValidateAliasTarget(&bad)).To(MatchError(ContainSubstring("itself an alias")))
+	})
+})
--- a/core/config/model_config_test.go
+++ b/core/config/model_config_test.go
@@ -787,3 +787,32 @@ var _ = Describe("pattern detector config", func() {
 		Expect(err).To(MatchError(ContainSubstring("pattern \"EMAILish\"")))
 	})
 })
+
+var _ = Describe("ModelConfig alias", func() {
+	It("reports IsAlias when alias is set", func() {
+		c := ModelConfig{Name: "gpt-4", Alias: "my-llama-3"}
+		Expect(c.IsAlias()).To(BeTrue())
+		Expect(ModelConfig{Name: "real"}.IsAlias()).To(BeFalse())
+	})
+
+	It("validates a minimal alias config", func() {
+		c := ModelConfig{Name: "gpt-4", Alias: "my-llama-3"}
+		ok, err := c.Validate()
+		Expect(err).ToNot(HaveOccurred())
+		Expect(ok).To(BeTrue())
+	})
+
+	It("rejects an alias pointing to itself", func() {
+		c := ModelConfig{Name: "loop", Alias: "loop"}
+		ok, err := c.Validate()
+		Expect(ok).To(BeFalse())
+		Expect(err).To(MatchError(ContainSubstring("itself")))
+	})
+
+	It("rejects an alias that also sets a backend", func() {
+		c := ModelConfig{Name: "gpt-4", Alias: "my-llama-3", Backend: "llama-cpp"}
+		ok, err := c.Validate()
+		Expect(ok).To(BeFalse())
+		Expect(err).To(MatchError(ContainSubstring("pure redirect")))
+	})
+})
--- a/core/config/runtime_settings.go
+++ b/core/config/runtime_settings.go
@@ -28,6 +28,7 @@ type RuntimeSettings struct {

 	// Eviction settings
 	ForceEvictionWhenBusy    *bool   `json:"force_eviction_when_busy,omitempty"`    // Force eviction even when models have active API calls (default: false for safety)
+	SizeAwareEviction        *bool   `json:"size_aware_eviction,omitempty"`         // Evict largest models first rather than least-recently-used (default: false)
 	LRUEvictionMaxRetries    *int    `json:"lru_eviction_max_retries,omitempty"`    // Maximum number of retries when waiting for busy models to become idle (default: 30)
 	LRUEvictionRetryInterval *string `json:"lru_eviction_retry_interval,omitempty"` // Interval between retries when waiting for busy models (e.g., 1s, 2s) (default: 1s)

--- a/core/config/serving_defaults.go
+++ b/core/config/serving_defaults.go
@@ -0,0 +1,56 @@
+package config
+
+import (
+	"fmt"
+	"strings"
+
+	"github.com/mudler/xlog"
+)
+
+// Serving-policy model-config defaults.
+//
+// Sibling to hardware_defaults.go: those fill values driven by the target
+// *device* (Blackwell batch, VRAM-scaled parallel slots); these fill values
+// that improve multi-request / multi-user *serving* regardless of the GPU. They
+// run together from SetDefaults and only ever fill values the user left unset.
+
+// DefaultCacheReuse is the minimum shared-prefix chunk (in tokens) the backend
+// reuses across requests via KV-cache shifting. The llama.cpp backend ships this
+// disabled (n_cache_reuse = 0); we enable it so repeated prefixes (system
+// prompts, RAG context, agent scaffolds, multi-turn chat) are not recomputed.
+// This is the universally-useful part of "paged attention" (cross-request prefix
+// sharing) and needs none of the block-KV machinery.
+const DefaultCacheReuse = 256
+
+// ApplyServingDefaults fills in serving-policy ModelConfig values that the
+// user did not set. Today that is a single knob: cross-request prefix caching,
+// enabled by appending "cache_reuse:<DefaultCacheReuse>" to cfg.Options. A nil
+// cfg, or options that already carry cache_reuse / n_cache_reuse, are left
+// untouched so an explicit value always wins.
+func ApplyServingDefaults(cfg *ModelConfig) {
+	if cfg == nil || backendOptionSet(cfg.Options, "cache_reuse", "n_cache_reuse") {
+		return
+	}
+	cfg.Options = append(cfg.Options, fmt.Sprintf("cache_reuse:%d", DefaultCacheReuse))
+	xlog.Debug("[serving_defaults] enabling cross-request prefix cache",
+		"cache_reuse", DefaultCacheReuse)
+}
+
+// backendOptionSet reports whether opts already contains an entry for any of
+// names. Each option is either "name:value" or a bare "name"; only the part
+// before the first ':' is compared, after lower-casing and trimming spaces.
+// names are matched as given. Shared with hardware_defaults.go so that
+// defaults never override an explicit value.
+func backendOptionSet(opts []string, names ...string) bool {
+	for _, opt := range opts {
+		// A bare option has no ':'; Cut then yields the whole string as the key.
+		key, _, _ := strings.Cut(opt, ":")
+		key = strings.TrimSpace(strings.ToLower(key))
+		for _, want := range names {
+			if key == want {
+				return true
+			}
+		}
+	}
+	return false
+}
--- a/core/config/serving_defaults_test.go
+++ b/core/config/serving_defaults_test.go
@@ -0,0 +1,30 @@
+package config_test
+
+import (
+	. "github.com/mudler/LocalAI/core/config"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("Serving-policy config defaults", func() {
+	Describe("ApplyServingDefaults (cross-request prefix cache)", func() {
+		It("enables cache_reuse when unset", func() {
+			cfg := &ModelConfig{}
+			ApplyServingDefaults(cfg)
+			Expect(cfg.Options).To(ContainElement("cache_reuse:256"))
+		})
+		It("never overrides an explicit cache_reuse", func() {
+			cfg := &ModelConfig{Options: []string{"cache_reuse:0"}}
+			ApplyServingDefaults(cfg)
+			Expect(cfg.Options).To(Equal([]string{"cache_reuse:0"}))
+		})
+		It("recognizes the n_cache_reuse alias", func() {
+			cfg := &ModelConfig{Options: []string{"n_cache_reuse:512"}}
+			ApplyServingDefaults(cfg)
+			Expect(cfg.Options).To(Equal([]string{"n_cache_reuse:512"}))
+		})
+		It("no-ops on nil", func() {
+			Expect(func() { ApplyServingDefaults(nil) }).ToNot(Panic())
+		})
+	})
+})
--- a/core/config/voice_gate_test.go
+++ b/core/config/voice_gate_test.go
@@ -70,4 +70,32 @@ var _ = Describe("PipelineVoiceRecognition", func() {
 			Expect((Pipeline{VoiceRecognition: &PipelineVoiceRecognition{}}).VoiceGateEnabled()).To(BeTrue())
 		})
 	})
+
+	Describe("Enforce / Identity helpers", func() {
+		It("treats a nil Enforce as enforcing (backward compatible)", func() {
+			v := PipelineVoiceRecognition{Model: "spk"}
+			Expect(v.EnforceGate()).To(BeTrue())
+		})
+		It("honors an explicit enforce:false", func() {
+			off := false
+			v := PipelineVoiceRecognition{Model: "spk", Enforce: &off}
+			Expect(v.EnforceGate()).To(BeFalse())
+		})
+		It("reports identity disabled when no identity block is set", func() {
+			v := PipelineVoiceRecognition{Model: "spk"}
+			Expect(v.IdentityEnabled()).To(BeFalse())
+			Expect(v.AnnounceEnabled()).To(BeFalse())
+			Expect(v.PersonalizeEnabled()).To(BeFalse())
+		})
+		It("reports identity enabled when announce or personalize is on", func() {
+			v := PipelineVoiceRecognition{Model: "spk", Identity: &VoiceIdentityConfig{Announce: true}}
+			Expect(v.IdentityEnabled()).To(BeTrue())
+			Expect(v.AnnounceEnabled()).To(BeTrue())
+			Expect(v.PersonalizeEnabled()).To(BeFalse())
+
+			v2 := PipelineVoiceRecognition{Model: "spk", Identity: &VoiceIdentityConfig{Personalize: true}}
+			Expect(v2.IdentityEnabled()).To(BeTrue())
+			Expect(v2.PersonalizeEnabled()).To(BeTrue())
+		})
+	})
 })
--- a/core/http/auth/features.go
+++ b/core/http/auth/features.go
@@ -48,6 +48,10 @@ var RouteFeatureRegistry = []RouteFeature{
 	{"POST", "/v1/audio/diarization", FeatureAudioDiarization},
 	{"POST", "/audio/diarization", FeatureAudioDiarization},

+	// Audio classification (sound-event tagging)
+	{"POST", "/v1/audio/classification", FeatureAudioClassification},
+	{"POST", "/audio/classification", FeatureAudioClassification},
+
 	// Audio speech / TTS
 	{"POST", "/v1/audio/speech", FeatureAudioSpeech},
 	{"POST", "/audio/speech", FeatureAudioSpeech},
@@ -172,6 +176,7 @@ func APIFeatureMetas() []FeatureMeta {
 		{FeatureAudioSpeech, "Audio Speech / TTS", true},
 		{FeatureAudioTranscription, "Audio Transcription", true},
 		{FeatureAudioDiarization, "Audio Diarization", true},
+		{FeatureAudioClassification, "Audio Classification", true},
 		{FeatureVAD, "Voice Activity Detection", true},
 		{FeatureDetection, "Detection", true},
 		{FeatureVideo, "Video Generation", true},
--- a/core/http/auth/permissions.go
+++ b/core/http/auth/permissions.go
@@ -38,24 +38,25 @@ const (
 	FeatureQuantization = "quantization"

 	// API features (default ON for new users)
-	FeatureChat               = "chat"
-	FeatureImages             = "images"
-	FeatureAudioSpeech        = "audio_speech"
-	FeatureAudioTranscription = "audio_transcription"
-	FeatureAudioDiarization   = "audio_diarization"
-	FeatureVAD                = "vad"
-	FeatureDetection          = "detection"
-	FeatureVideo              = "video"
-	FeatureEmbeddings         = "embeddings"
-	FeatureSound              = "sound"
-	FeatureRealtime           = "realtime"
-	FeatureRerank             = "rerank"
-	FeatureTokenize           = "tokenize"
-	FeatureMCP                = "mcp"
-	FeatureStores             = "stores"
-	FeatureFaceRecognition    = "face_recognition"
-	FeatureVoiceRecognition   = "voice_recognition"
-	FeatureAudioTransform     = "audio_transform"
+	FeatureChat                = "chat"
+	FeatureImages              = "images"
+	FeatureAudioSpeech         = "audio_speech"
+	FeatureAudioTranscription  = "audio_transcription"
+	FeatureAudioDiarization    = "audio_diarization"
+	FeatureAudioClassification = "audio_classification"
+	FeatureVAD                 = "vad"
+	FeatureDetection           = "detection"
+	FeatureVideo               = "video"
+	FeatureEmbeddings          = "embeddings"
+	FeatureSound               = "sound"
+	FeatureRealtime            = "realtime"
+	FeatureRerank              = "rerank"
+	FeatureTokenize            = "tokenize"
+	FeatureMCP                 = "mcp"
+	FeatureStores              = "stores"
+	FeatureFaceRecognition     = "face_recognition"
+	FeatureVoiceRecognition    = "voice_recognition"
+	FeatureAudioTransform      = "audio_transform"
 	// FeaturePIIFilter gates the synchronous PII analyze/redact service
 	// (POST /api/pii/{analyze,redact}). Default ON like the other API
 	// features; the admin-only events log is gated separately in-handler.
@@ -71,7 +72,7 @@ var GeneralFeatures = []string{FeatureFineTuning, FeatureQuantization}
 // APIFeatures lists API endpoint features (default ON).
 var APIFeatures = []string{
 	FeatureChat, FeatureImages, FeatureAudioSpeech, FeatureAudioTranscription,
-	FeatureAudioDiarization,
+	FeatureAudioDiarization, FeatureAudioClassification,
 	FeatureVAD, FeatureDetection, FeatureVideo, FeatureEmbeddings, FeatureSound,
 	FeatureRealtime, FeatureRerank, FeatureTokenize, FeatureMCP, FeatureStores,
 	FeatureFaceRecognition, FeatureVoiceRecognition, FeatureAudioTransform,
--- a/core/http/endpoints/localai/aliases.go
+++ b/core/http/endpoints/localai/aliases.go
@@ -0,0 +1,33 @@
+package localai
+
+import (
+	"net/http"
+
+	"github.com/labstack/echo/v4"
+	"github.com/mudler/LocalAI/core/config"
+)
+
+// AliasInfo is one alias -> target pair, the element type of the /api/aliases response.
+type AliasInfo struct {
+	Name   string `json:"name"`
+	Target string `json:"target"`
+}
+
+// ListAliasesEndpoint serves the configured model aliases as name/target pairs.
+//
+//	@Summary	List model aliases
+//	@Tags		models
+//	@Success	200	{array}	AliasInfo
+//	@Router		/api/aliases [get]
+func ListAliasesEndpoint(cl *config.ModelConfigLoader) echo.HandlerFunc {
+	return func(c echo.Context) error {
+		aliases := make([]AliasInfo, 0) // non-nil: an empty result must encode as [], not null
+		for _, mc := range cl.GetAllModelsConfigs() {
+			if !mc.IsAlias() {
+				continue
+			}
+			aliases = append(aliases, AliasInfo{Name: mc.Name, Target: mc.Alias})
+		}
+		return c.JSON(http.StatusOK, aliases)
+	}
+}
--- a/core/http/endpoints/localai/aliases_test.go
+++ b/core/http/endpoints/localai/aliases_test.go
@@ -0,0 +1,57 @@
+package localai_test
+
+import (
+	"net/http"
+	"net/http/httptest"
+	"os"
+	"path/filepath"
+
+	"github.com/labstack/echo/v4"
+	"github.com/mudler/LocalAI/core/config"
+	. "github.com/mudler/LocalAI/core/http/endpoints/localai"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("ListAliasesEndpoint", func() {
+	var tempDir string
+
+	BeforeEach(func() {
+		var err error
+		tempDir, err = os.MkdirTemp("", "localai-aliases-test")
+		Expect(err).ToNot(HaveOccurred())
+	})
+	AfterEach(func() {
+		_ = os.RemoveAll(tempDir)
+	})
+
+	It("returns only alias configs as name/target pairs", func() {
+		// Seed one real model and one alias pointing at it.
+		Expect(os.WriteFile(
+			filepath.Join(tempDir, "real.yaml"),
+			[]byte("name: real\nbackend: llama-cpp\nmodel: foo\n"),
+			0644,
+		)).To(Succeed())
+		Expect(os.WriteFile(
+			filepath.Join(tempDir, "gpt-4.yaml"),
+			[]byte("name: gpt-4\nalias: real\n"),
+			0644,
+		)).To(Succeed())
+
+		loader := config.NewModelConfigLoader(tempDir)
+		Expect(loader.LoadModelConfigsFromPath(tempDir)).To(Succeed())
+
+		app := echo.New()
+		app.GET("/api/aliases", ListAliasesEndpoint(loader))
+
+		req := httptest.NewRequest("GET", "/api/aliases", nil)
+		rec := httptest.NewRecorder()
+		app.ServeHTTP(rec, req)
+
+		Expect(rec.Code).To(Equal(http.StatusOK))
+		Expect(rec.Body.String()).To(ContainSubstring(`"name":"gpt-4"`))
+		Expect(rec.Body.String()).To(ContainSubstring(`"target":"real"`))
+		// The real model must not appear as an alias entry.
+		Expect(rec.Body.String()).ToNot(ContainSubstring(`"name":"real"`))
+	})
+})
--- a/core/http/endpoints/localai/api_instructions.go
+++ b/core/http/endpoints/localai/api_instructions.go
@@ -32,9 +32,9 @@ var instructionDefs = []instructionDef{
 	},
 	{
 		Name:        "audio",
-		Description: "Text-to-speech, voice activity detection, transcription, speaker diarization, and sound generation",
+		Description: "Text-to-speech, voice activity detection, transcription, speaker diarization, sound classification, and sound generation",
 		Tags:        []string{"audio"},
-		Intro:       "Diarization (/v1/audio/diarization) returns speaker-labelled time segments. Backends with native ASR-diarization (vibevoice-cpp) can also emit per-segment text via include_text=true; backends with a dedicated pipeline (sherpa-onnx + pyannote) emit segmentation only. Response formats: json (default), verbose_json (adds speakers summary + text), rttm (NIST format).",
+		Intro:       "Diarization (/v1/audio/diarization) returns speaker-labelled time segments. Backends with native ASR-diarization (vibevoice-cpp) can also emit per-segment text via include_text=true; backends with a dedicated pipeline (sherpa-onnx + pyannote) emit segmentation only. Response formats: json (default), verbose_json (adds speakers summary + text), rttm (NIST format). Sound classification (/v1/audio/classification) returns scored AudioSet sound-event tags (audio tagging via the ced backend); top_k and threshold control the returned set.",
 	},
 	{
 		Name:        "images",
--- a/core/http/endpoints/localai/import_model.go
+++ b/core/http/endpoints/localai/import_model.go
@@ -181,6 +181,12 @@ func ImportModelEndpoint(cl *config.ModelConfigLoader, appConfig *config.Applica
 			return c.JSON(http.StatusBadRequest, ModelResponse{Success: false, Error: msg})
 		}

+		// Reject aliases whose target is missing, chained, or disabled so a
+		// dangling alias can't be persisted and surface as a runtime error later.
+		if err := cl.ValidateAliasTarget(&modelConfig); err != nil {
+			return c.JSON(http.StatusBadRequest, ModelResponse{Success: false, Error: err.Error()})
+		}
+
 		// Create the configuration file
 		configPath := filepath.Join(appConfig.SystemState.Model.ModelsPath, modelConfig.Name+".yaml")
 		if err := utils.VerifyPath(modelConfig.Name+".yaml", appConfig.SystemState.Model.ModelsPath); err != nil {
--- a/core/http/endpoints/localai/nodes.go
+++ b/core/http/endpoints/localai/nodes.go
@@ -70,17 +70,20 @@ func GetNodeEndpoint(registry *nodes.NodeRegistry) echo.HandlerFunc {

 // RegisterNodeRequest is the request body for registering a new worker node.
 type RegisterNodeRequest struct {
-	Name          string            `json:"name"`
-	NodeType      string            `json:"node_type,omitempty"` // "backend" (default) or "agent"
-	Address       string            `json:"address"`
-	HTTPAddress   string            `json:"http_address,omitempty"`
-	Token         string            `json:"token,omitempty"`
-	TotalVRAM     uint64            `json:"total_vram,omitempty"`
-	AvailableVRAM uint64            `json:"available_vram,omitempty"`
-	TotalRAM      uint64            `json:"total_ram,omitempty"`
-	AvailableRAM  uint64            `json:"available_ram,omitempty"`
-	GPUVendor     string            `json:"gpu_vendor,omitempty"`
-	Labels        map[string]string `json:"labels,omitempty"`
+	Name          string `json:"name"`
+	NodeType      string `json:"node_type,omitempty"` // "backend" (default) or "agent"
+	Address       string `json:"address"`
+	HTTPAddress   string `json:"http_address,omitempty"`
+	Token         string `json:"token,omitempty"`
+	TotalVRAM     uint64 `json:"total_vram,omitempty"`
+	AvailableVRAM uint64 `json:"available_vram,omitempty"`
+	TotalRAM      uint64 `json:"total_ram,omitempty"`
+	AvailableRAM  uint64 `json:"available_ram,omitempty"`
+	GPUVendor     string `json:"gpu_vendor,omitempty"`
+	// GPUComputeCapability is the worker GPU's compute capability ("major.minor",
+	// e.g. "12.1" for GB10). Used by the router for per-arch option tuning.
+	GPUComputeCapability string            `json:"gpu_compute_capability,omitempty"`
+	Labels               map[string]string `json:"labels,omitempty"`
 	// MaxReplicasPerModel is the per-node cap on replicas of any single model.
 	// Workers older than this field omit it; we coerce 0 → 1 below to preserve
 	// historical single-replica behavior.
@@ -152,17 +155,18 @@ func RegisterNodeEndpoint(registry *nodes.NodeRegistry, expectedToken string, au
 		}

 		node := &nodes.BackendNode{
-			Name:                req.Name,
-			NodeType:            nodeType,
-			Address:             req.Address,
-			HTTPAddress:         req.HTTPAddress,
-			TokenHash:           tokenHash,
-			TotalVRAM:           req.TotalVRAM,
-			AvailableVRAM:       req.AvailableVRAM,
-			TotalRAM:            req.TotalRAM,
-			AvailableRAM:        req.AvailableRAM,
-			GPUVendor:           req.GPUVendor,
-			MaxReplicasPerModel: maxReplicasPerModel,
+			Name:                 req.Name,
+			NodeType:             nodeType,
+			Address:              req.Address,
+			HTTPAddress:          req.HTTPAddress,
+			TokenHash:            tokenHash,
+			TotalVRAM:            req.TotalVRAM,
+			AvailableVRAM:        req.AvailableVRAM,
+			TotalRAM:             req.TotalRAM,
+			AvailableRAM:         req.AvailableRAM,
+			GPUVendor:            req.GPUVendor,
+			GPUComputeCapability: req.GPUComputeCapability,
+			MaxReplicasPerModel:  maxReplicasPerModel,
 		}

 		ctx := c.Request().Context()
@@ -381,6 +385,23 @@ func GetNodeModelsEndpoint(registry *nodes.NodeRegistry) echo.HandlerFunc {
 	}
 }

+// ListAllNodeModelsEndpoint serves the registry's cluster-wide list of loaded models as JSON; a registry failure is logged and answered with a generic 500.
+// @Summary List all loaded models cluster-wide
+// @Tags Nodes
+// @Success 200 {array} nodes.NodeModel
+// @Router /api/nodes/models [get]
+func ListAllNodeModelsEndpoint(registry *nodes.NodeRegistry) echo.HandlerFunc {
+	return func(c echo.Context) error {
+		// Hand the request's own context to the registry call.
+		loaded, err := registry.ListAllLoadedModels(c.Request().Context())
+		if err != nil {
+			xlog.Error("Failed to list all node models", "error", err)
+			return c.JSON(http.StatusInternalServerError, nodeError(http.StatusInternalServerError, "failed to list node models"))
+		}
+		return c.JSON(http.StatusOK, loaded)
+	}
+}
+
 // DrainNodeEndpoint sets a node to draining status (no new requests).
 func DrainNodeEndpoint(registry *nodes.NodeRegistry) echo.HandlerFunc {
 	return func(c echo.Context) error {
--- a/core/http/endpoints/localai/nodes_test.go
+++ b/core/http/endpoints/localai/nodes_test.go
@@ -407,4 +407,44 @@ var _ = Describe("Node HTTP handlers", func() {
 			Expect(names).To(ConsistOf("alpha", "beta"))
 		})
 	})
+
+	Describe("ListAllNodeModelsEndpoint", func() {
+		It("returns an empty list when no models are loaded", func() {
+			e := echo.New()
+			req := httptest.NewRequest(http.MethodGet, "/", nil)
+			rec := httptest.NewRecorder()
+			c := e.NewContext(req, rec)
+
+			handler := ListAllNodeModelsEndpoint(registry)
+			Expect(handler(c)).To(Succeed())
+			Expect(rec.Code).To(Equal(http.StatusOK))
+
+			var list []nodes.NodeModel
+			Expect(json.Unmarshal(rec.Body.Bytes(), &list)).To(Succeed())
+			Expect(list).To(BeEmpty())
+		})
+
+		It("returns loaded models across healthy nodes", func() {
+			ctx := context.Background()
+			Expect(registry.Register(ctx, &nodes.BackendNode{
+				ID: "n1", Name: "alpha", Address: "10.0.0.1:50051", Status: nodes.StatusHealthy,
+			}, true)).To(Succeed())
+			Expect(registry.SetNodeModel(ctx, "n1", "llama-3.3", 0, "loaded", "10.0.0.1:50051", 0)).To(Succeed())
+
+			e := echo.New()
+			req := httptest.NewRequest(http.MethodGet, "/", nil)
+			rec := httptest.NewRecorder()
+			c := e.NewContext(req, rec)
+
+			handler := ListAllNodeModelsEndpoint(registry)
+			Expect(handler(c)).To(Succeed())
+			Expect(rec.Code).To(Equal(http.StatusOK))
+
+			var list []nodes.NodeModel
+			Expect(json.Unmarshal(rec.Body.Bytes(), &list)).To(Succeed())
+			Expect(list).To(HaveLen(1))
+			Expect(list[0].ModelName).To(Equal("llama-3.3"))
+			Expect(list[0].NodeID).To(Equal("n1"))
+		})
+	})
 })
--- a/core/http/endpoints/mcp/localai_assistant_test.go
+++ b/core/http/endpoints/mcp/localai_assistant_test.go
@@ -51,6 +51,12 @@ func (stubClient) EditModelConfig(_ context.Context, _ string, _ map[string]any)
 	return nil
 }
 func (stubClient) ReloadModels(_ context.Context) error { return nil }
+// SetAlias is a no-op for the stub: both string arguments are ignored and the
+// call always succeeds.
+func (stubClient) SetAlias(_ context.Context, _, _ string) error { return nil }
+// ListAliases is a no-op for the stub: it reports no aliases and never returns
+// an error.
+func (stubClient) ListAliases(_ context.Context) ([]localaitools.AliasInfo, error) { return nil, nil }
 func (stubClient) ListBackends(_ context.Context) ([]localaitools.Backend, error) {
 	return []localaitools.Backend{{Name: "stub-backend", Installed: true}}, nil
 }
--- a/core/http/endpoints/openai/realtime.go
+++ b/core/http/endpoints/openai/realtime.go
@@ -93,16 +93,31 @@ type Session struct {
 	Voice                   string
 	TurnDetection           *types.TurnDetectionUnion // "server_vad", "semantic_vad" or "none"
 	InputAudioTranscription *types.AudioTranscription
-	Tools                   []types.ToolUnion
-	ToolChoice              *types.ToolChoiceUnion
-	Conversations           map[string]*Conversation
-	InputAudioBuffer        []byte
-	AudioBufferLock         sync.Mutex
-	OpusFrames              [][]byte
-	OpusFramesLock          sync.Mutex
-	Instructions            string
-	DefaultConversationID   string
-	ModelInterface          Model
+
+	// SoundDetectionEnabled is set when pipeline.sound_detection names a
+	// sound-event-classification model. When true, each committed utterance is
+	// also run through ModelInterface.SoundDetection and the scored tags are
+	// emitted as a conversation.item.sound_detection event. SoundDetectionTopK
+	// and SoundDetectionThreshold are the knobs passed to that call (defaults:
+	// top_k=5, threshold=0).
+	SoundDetectionEnabled   bool
+	SoundDetectionTopK      int
+	SoundDetectionThreshold float32
+	// SoundDetectionWindowMs / SoundDetectionHopMs, when both > 0, enable
+	// server-side windowing for a sound-only session: the server classifies the
+	// last WindowMs of streamed audio every HopMs (no client commits needed).
+	SoundDetectionWindowMs int
+	SoundDetectionHopMs    int
+	Tools                  []types.ToolUnion
+	ToolChoice             *types.ToolChoiceUnion
+	Conversations          map[string]*Conversation
+	InputAudioBuffer       []byte
+	AudioBufferLock        sync.Mutex
+	OpusFrames             [][]byte
+	OpusFramesLock         sync.Mutex
+	Instructions           string
+	DefaultConversationID  string
+	ModelInterface         Model
 	// The pipeline model config or the config for an any-to-any model
 	ModelConfig      *config.ModelConfig
 	InputSampleRate  int
@@ -250,6 +265,10 @@ type Model interface {
 	// TranscribeStream transcribes audio incrementally, invoking onDelta for each
 	// transcript text fragment and returning the final aggregated result.
 	TranscribeStream(ctx context.Context, audio, language string, translate, diarize bool, prompt string, onDelta func(text string)) (*schema.TranscriptionResult, error)
+	// SoundDetection classifies a committed audio window into scored AudioSet
+	// sound-event tags. topK caps the number of returned tags (0 = backend
+	// default), threshold drops tags below the given score (0 = keep all).
+	SoundDetection(ctx context.Context, audio string, topK int, threshold float32) (*schema.SoundClassificationResult, error)
 	PredictConfig() *config.ModelConfig
 }

@@ -399,7 +418,7 @@ func prepareRealtimeConfig(cfg *config.ModelConfig) (errCode, errMsg string, ok
 		return "", "", true
 	}

-	if cfg.Pipeline.VAD == "" && cfg.Pipeline.Transcription == "" && cfg.Pipeline.TTS == "" && cfg.Pipeline.LLM == "" {
+	if cfg.Pipeline.VAD == "" && cfg.Pipeline.Transcription == "" && cfg.Pipeline.TTS == "" && cfg.Pipeline.LLM == "" && cfg.Pipeline.SoundDetection == "" {
 		return "invalid_model", "Model is not a pipeline model", false
 	}
 	return "", "", true
@@ -469,6 +488,26 @@ func runRealtimeSession(application *application.Application, t Transport, model

 	sttModel := cfg.Pipeline.Transcription

+	// A sound-detection-only pipeline (sound_detection set, no transcription/LLM)
+	// activates on sounds, not speech, so it runs WITHOUT the voice VAD: the
+	// session defaults to turn_detection none and the client drives windowing via
+	// input_audio_buffer.commit. There is no transcription stage in that case.
+	soundOnly := cfg.Pipeline.SoundDetection != "" && cfg.Pipeline.Transcription == "" && cfg.Pipeline.LLM == ""
+
+	turnDetection := &types.TurnDetectionUnion{
+		ServerVad: &types.ServerVad{
+			Threshold:         0.5,
+			PrefixPaddingMs:   300,
+			SilenceDurationMs: 500,
+			CreateResponse:    true,
+		},
+	}
+	inputAudioTranscription := &types.AudioTranscription{Model: sttModel}
+	if soundOnly {
+		turnDetection = nil           // turn_detection none: no VAD
+		inputAudioTranscription = nil // no transcription stage
+	}
+
 	// Compose the system prompt: prepend the assistant prompt when we have
 	// one (it teaches the model the safety rules and tool recipes), then the
 	// session's default voice instructions. Order matches chat.go's
@@ -480,30 +519,26 @@ func runRealtimeSession(application *application.Application, t Transport, model

 	sessionID := generateSessionID()
 	session := &Session{
-		ID:                sessionID,
-		TranscriptionOnly: false,
-		Model:             model,
-		Voice:             cfg.TTSConfig.Voice,
-		Instructions:      instructions,
-		ModelConfig:       cfg,
-		Tools:             assistantTools,
-		AssistantTools:    assistantTools,
-		AssistantExecutor: assistantExecutor,
-		TurnDetection: &types.TurnDetectionUnion{
-			ServerVad: &types.ServerVad{
-				Threshold:         0.5,
-				PrefixPaddingMs:   300,
-				SilenceDurationMs: 500,
-				CreateResponse:    true,
-			},
-		},
-		InputAudioTranscription: &types.AudioTranscription{
-			Model: sttModel,
-		},
-		Conversations:    make(map[string]*Conversation),
-		InputSampleRate:  defaultRemoteSampleRate,
-		OutputSampleRate: defaultRemoteSampleRate,
-		MaxHistoryItems:  resolveMaxHistoryItems(cfg),
+		ID:                      sessionID,
+		TranscriptionOnly:       false,
+		Model:                   model,
+		Voice:                   cfg.TTSConfig.Voice,
+		Instructions:            instructions,
+		ModelConfig:             cfg,
+		Tools:                   assistantTools,
+		AssistantTools:          assistantTools,
+		AssistantExecutor:       assistantExecutor,
+		TurnDetection:           turnDetection,
+		InputAudioTranscription: inputAudioTranscription,
+		Conversations:           make(map[string]*Conversation),
+		InputSampleRate:         defaultRemoteSampleRate,
+		OutputSampleRate:        defaultRemoteSampleRate,
+		MaxHistoryItems:         resolveMaxHistoryItems(cfg),
+		SoundDetectionEnabled:   cfg.Pipeline.SoundDetection != "",
+		SoundDetectionTopK:      defaultSoundDetectionTopK,
+		SoundDetectionThreshold: 0,
+		SoundDetectionWindowMs:  cfg.Pipeline.SoundDetectionWindowMs,
+		SoundDetectionHopMs:     cfg.Pipeline.SoundDetectionHopMs,
 	}

 	// Create a default conversation
@@ -517,14 +552,24 @@ func runRealtimeSession(application *application.Application, t Transport, model
 	session.Conversations[conversationID] = conversation
 	session.DefaultConversationID = conversationID

-	m, err := newModel(
-		&cfg.Pipeline,
-		application.ModelConfigLoader(),
-		application.ModelLoader(),
-		application.ApplicationConfig(),
-		evaluator,
-		buildRealtimeRoutingContext(application, sessionID),
-	)
+	var m Model
+	if soundOnly {
+		m, err = newSoundDetectionOnlyModel(
+			&cfg.Pipeline,
+			application.ModelConfigLoader(),
+			application.ModelLoader(),
+			application.ApplicationConfig(),
+		)
+	} else {
+		m, err = newModel(
+			&cfg.Pipeline,
+			application.ModelConfigLoader(),
+			application.ModelLoader(),
+			application.ApplicationConfig(),
+			evaluator,
+			buildRealtimeRoutingContext(application, sessionID),
+		)
+	}
 	if err != nil {
 		xlog.Error("failed to load model", "error", err)
 		sendError(t, "model_load_error", "Failed to load model", "", "")
@@ -605,6 +650,20 @@ func runRealtimeSession(application *application.Application, t Transport, model

 	toggleVAD()

+	// Server-side sound-detection windowing (option B): for a sound-only session
+	// with window/hop configured, the server classifies the last window of
+	// streamed audio on a timer, so the client only has to stream (no commits).
+	// This runs independent of VAD (sound events are not speech).
+	var soundWindowDone chan struct{}
+	if soundOnly && session.SoundDetectionWindowMs > 0 && session.SoundDetectionHopMs > 0 {
+		soundWindowDone = make(chan struct{})
+		wg.Go(func() {
+			handleSoundWindow(session, t, soundWindowDone)
+		})
+		xlog.Debug("Starting server-side sound-detection windowing",
+			"window_ms", session.SoundDetectionWindowMs, "hop_ms", session.SoundDetectionHopMs)
+	}
+
 	for {
 		msg, err = t.ReadEvent()
 		if err != nil {
@@ -880,6 +939,10 @@ func runRealtimeSession(application *application.Application, t Transport, model
 	if vadServerStarted {
 		close(done)
 	}
+	// Stop the server-side sound-detection windowing goroutine (if running).
+	if soundWindowDone != nil {
+		close(soundWindowDone)
+	}
 	wg.Wait()

 	// Remove the session from the sessions map
@@ -971,6 +1034,10 @@ func updateTransSession(session *Session, update *types.SessionUnion, cl *config

 		session.ModelInterface = m
 		session.ModelConfig = cfg
+		session.SoundDetectionEnabled = cfg.Pipeline.SoundDetection != ""
+		if session.SoundDetectionTopK <= 0 {
+			session.SoundDetectionTopK = defaultSoundDetectionTopK
+		}
 	}

 	if trUpd != nil {
@@ -1311,35 +1378,40 @@ func commitUtterance(ctx context.Context, utt []byte, session *Session, conv *Co
 	// turn wastes only transcription compute, which has no side effects. The
 	// transcript is still emitted to the same peer that sent the audio, which
 	// reveals nothing new to them.
-	type gateOutcome struct {
-		allowed bool
-		matched string
-		reason  string
-		err     error
+	// Resolve the speaker when the gate must authorize this turn, or when identity
+	// surfacing/personalization needs a fresh identity. Identity resolution
+	// ignores the when:first short-circuit (that only skips re-authorization).
+	type resolveOutcome struct {
+		res resolution
+		err error
 	}
-	var gateCh chan gateOutcome
-	runGate := false
+	var resolveCh chan resolveOutcome
+	runResolve := false
 	if session.voiceGate != nil && session.InputAudioTranscription != nil {
-		skip := false
-		if session.voiceGate.cfg.When == config.VoiceGateWhenFirst {
+		enforce := session.voiceGate.cfg.EnforceGate()
+		gateNeedsAuth := enforce
+		if enforce && session.voiceGate.cfg.When == config.VoiceGateWhenFirst {
 			session.gateMu.Lock()
-			skip = session.voiceVerified
+			if session.voiceVerified {
+				gateNeedsAuth = false
+			}
 			session.gateMu.Unlock()
 		}
-		if !skip {
-			runGate = true
-			gateCh = make(chan gateOutcome, 1)
+		if gateNeedsAuth || session.voiceGate.cfg.IdentityEnabled() {
+			runResolve = true
+			resolveCh = make(chan resolveOutcome, 1)
 			wavPath := f.Name()
 			go func() {
-				allowed, matched, reason, gerr := session.voiceGate.Authorize(ctx, wavPath)
-				gateCh <- gateOutcome{allowed: allowed, matched: matched, reason: reason, err: gerr}
+				r, rerr := session.voiceGate.Resolve(ctx, wavPath)
+				resolveCh <- resolveOutcome{res: r, err: rerr}
 			}()
 		}
 	}

 	// TODO: If we have a real any-to-any model then transcription is optional
 	var transcript string
-	if session.InputAudioTranscription != nil {
+	switch {
+	case session.InputAudioTranscription != nil:
 		// emitTranscription streams transcript deltas when
 		// pipeline.streaming.transcription is set, otherwise emits a single
 		// completed event; either way it returns the final transcript text.
@@ -1348,55 +1420,169 @@ func commitUtterance(ctx context.Context, utt []byte, session *Session, conv *Co
 		if err != nil {
 			// Drain the gate goroutine before returning so its in-flight read of
 			// the temp WAV finishes before the deferred os.Remove fires.
-			if runGate {
-				<-gateCh
+			if runResolve {
+				<-resolveCh
 			}
 			sendError(t, "transcription_failed", err.Error(), "", "event_TODO")
 			return
 		}
-	} else {
+	case session.SoundDetectionEnabled:
+		// Sound-detection-only session: no transcription and no LLM. The
+		// sound-detection emit below carries the result; there is no any-to-any
+		// path to fall into. Windowing is client-driven (turn_detection none +
+		// input_audio_buffer.commit), so this is not voice-gated.
+	default:
 		// The voice gate runs only on the transcription path above; if an
 		// any-to-any model path is added here, join the gate before responding.
 		sendNotImplemented(t, "any-to-any models")
 		return
 	}

-	// Join on the gate before any side-effecting step.
-	if runGate {
-		out := <-gateCh
-		allowed := out.allowed
-		reason := out.reason
-		if out.err != nil {
-			// Fail closed: a gate that cannot decide must not let audio through.
-			xlog.Error("voice recognition gate error", "error", out.err)
-			allowed = false
-			reason = "verification error"
-		}
-		alreadyVerified := false
-		if session.voiceGate.cfg.When == config.VoiceGateWhenFirst {
-			session.gateMu.Lock()
-			alreadyVerified = session.voiceVerified
-			session.gateMu.Unlock()
-		}
-		proceed, markVerified := session.voiceGate.decide(alreadyVerified, allowed)
-		if !proceed {
-			xlog.Debug("voice recognition gate rejected utterance", "reason", reason)
-			if session.voiceGate.cfg.OnReject == config.VoiceGateRejectEvent {
-				sendError(t, "speaker_not_authorized", "speaker not authorized: "+reason, "", "event_TODO")
-			}
-			return
-		}
-		xlog.Debug("voice recognition gate authorized utterance", "speaker", out.matched)
-		if markVerified {
-			session.gateMu.Lock()
-			session.voiceVerified = true
-			session.gateMu.Unlock()
+	// Sound-event detection is additive to transcription: classify the same
+	// committed window and emit its scored AudioSet tags as a separate event.
+	// A failure here is logged but must never abort the turn.
+	if session.SoundDetectionEnabled {
+		if sderr := emitSoundDetection(ctx, t, session, generateItemID(), f.Name()); sderr != nil {
+			xlog.Error("sound detection failed", "error", sderr)
 		}
 	}

-	if !session.TranscriptionOnly {
-		generateResponse(ctx, session, utt, transcript, conv, t)
+	// Join on the resolution before any side-effecting step.
+	var speaker *types.Speaker
+	if runResolve {
+		out := <-resolveCh
+		enforce := session.voiceGate.cfg.EnforceGate()
+
+		if out.err != nil {
+			if enforce {
+				// Fail closed: a gate that cannot decide must not let audio through.
+				xlog.Error("voice recognition gate error", "error", out.err)
+				if session.voiceGate.cfg.OnReject == config.VoiceGateRejectEvent {
+					sendError(t, "speaker_not_authorized", "speaker not authorized: verification error", "", "event_TODO")
+				}
+				return
+			}
+			// Non-enforcing: degrade to an unknown speaker and continue.
+			xlog.Warn("voice identity resolve failed; continuing as unknown speaker", "error", out.err)
+		} else {
+			s := out.res.speaker
+			speaker = &s
+		}
+
+		if enforce {
+			alreadyVerified := false
+			if session.voiceGate.cfg.When == config.VoiceGateWhenFirst {
+				session.gateMu.Lock()
+				alreadyVerified = session.voiceVerified
+				session.gateMu.Unlock()
+			}
+			allowed, reason := false, "verification error"
+			if out.err == nil {
+				allowed, reason = session.voiceGate.authorize(out.res)
+			}
+			proceed, markVerified := session.voiceGate.decide(alreadyVerified, allowed)
+			if !proceed {
+				xlog.Debug("voice recognition gate rejected utterance", "reason", reason)
+				if session.voiceGate.cfg.OnReject == config.VoiceGateRejectEvent {
+					sendError(t, "speaker_not_authorized", "speaker not authorized: "+reason, "", "event_TODO")
+				}
+				return
+			}
+			if markVerified {
+				session.gateMu.Lock()
+				session.voiceVerified = true
+				session.gateMu.Unlock()
+			}
+			xlog.Debug("voice recognition gate authorized utterance", "speaker", out.res.speaker.Name)
+		}
 	}
+
+	// Generate an LLM response only when there is a transcript to feed it. A
+	// sound-detection-only session (no transcription) has no LLM stage, so it
+	// stops here after emitting the sound-detection event.
+	if session.InputAudioTranscription != nil && !session.TranscriptionOnly {
+		generateResponse(ctx, session, utt, transcript, speaker, conv, t)
+	}
+}
+
+// handleSoundWindow drives server-side windowed sound-event detection: every
+// SoundDetectionHopMs it calls classifySoundWindow, so a sound-only client only
+// has to stream audio (no input_audio_buffer.commit). Independent of VAD. It
+// returns when done is closed, or at once if the hop interval is not positive.
+func handleSoundWindow(session *Session, t Transport, done chan struct{}) {
+	if session.SoundDetectionHopMs <= 0 {
+		return // time.NewTicker panics on a non-positive interval
+	}
+	ticker := time.NewTicker(time.Duration(session.SoundDetectionHopMs) * time.Millisecond)
+	defer ticker.Stop()
+	for {
+		select {
+		case <-done:
+			return
+		case <-ticker.C:
+			classifySoundWindow(session, t)
+		}
+	}
+}
+
+// classifySoundWindow performs a single windowing tick. Under AudioBufferLock
+// it trims InputAudioBuffer to the newest SoundDetectionWindowMs of audio and
+// copies it; when at least ~10ms is buffered, the copy is written to a temp
+// WAV, classified via emitSoundDetection, and the temp file is removed. Split
+// out of handleSoundWindow so tests can drive it synchronously.
+func classifySoundWindow(session *Session, t Transport) {
+	const bytesPerSample = 2 // 16-bit mono PCM
+	rate := session.InputSampleRate
+	maxWindow := session.SoundDetectionWindowMs * rate / 1000 * bytesPerSample
+	minWindow := rate / 100 * bytesPerSample // roughly 10ms of audio
+
+	session.AudioBufferLock.Lock()
+	// Drop everything older than one window so a long stream stays bounded.
+	if excess := len(session.InputAudioBuffer) - maxWindow; maxWindow > 0 && excess > 0 {
+		tail := make([]byte, maxWindow)
+		copy(tail, session.InputAudioBuffer[excess:])
+		session.InputAudioBuffer = tail
+	}
+	snapshot := make([]byte, len(session.InputAudioBuffer))
+	copy(snapshot, session.InputAudioBuffer)
+	session.AudioBufferLock.Unlock()
+
+	if len(snapshot) < minWindow {
+		return // too little audio buffered to classify
+	}
+	wavPath, err := writeWindowWAV(snapshot, rate)
+	if err != nil {
+		xlog.Error("sound window: failed to write wav", "error", err)
+		return
+	}
+	if detectErr := emitSoundDetection(context.Background(), t, session, generateItemID(), wavPath); detectErr != nil {
+		xlog.Error("sound window: detection failed", "error", detectErr)
+	}
+	if rmErr := os.Remove(wavPath); rmErr != nil {
+		xlog.Debug("sound window: temp cleanup failed", "error", rmErr)
+	}
+}
+
+// writeWindowWAV dumps mono 16-bit PCM into a new temp WAV file whose header
+// declares sampleRate (the ced classifier reads the declared rate and
+// resamples). It returns the file's path; deleting it is the caller's job.
+func writeWindowWAV(pcm []byte, sampleRate int) (string, error) {
+	out, err := os.CreateTemp("", "realtime-sound-window-*.wav")
+	if err != nil {
+		return "", err
+	}
+	defer func() { _ = out.Close() }()
+	hdr := laudio.NewWAVHeaderWithRate(uint32(len(pcm)), uint32(sampleRate))
+	err = hdr.Write(out)
+	if err == nil {
+		_, err = out.Write(pcm)
+	}
+	if err != nil {
+		_ = os.Remove(out.Name()) // do not leave a partial file behind
+		return "", err
+	}
+	_ = out.Sync()
+	return out.Name(), nil
+}

 func runVAD(ctx context.Context, session *Session, adata []int16) ([]schema.VADSegment, error) {
@@ -1419,15 +1605,28 @@ func runVAD(ctx context.Context, session *Session, adata []int16) ([]schema.VADS
 	return resp.Segments, nil
 }

+// speakerNote builds the system-prompt sentence describing the current speaker.
+// It yields "" when the speaker is not a named match and noteUnknown is false.
+func speakerNote(s *types.Speaker, noteUnknown bool) string {
+	switch {
+	case s != nil && s.Matched && s.Name != "":
+		return "The current speaker is " + s.Name + "."
+	case noteUnknown:
+		return "The current speaker is unknown."
+	}
+	return ""
+}
+
 // Function to generate a response based on the conversation
-func generateResponse(ctx context.Context, session *Session, utt []byte, transcript string, conv *Conversation, t Transport) {
+func generateResponse(ctx context.Context, session *Session, utt []byte, transcript string, speaker *types.Speaker, conv *Conversation, t Transport) {
 	xlog.Debug("Generating realtime response...")

 	// Create user message item
 	item := types.MessageItemUnion{
 		User: &types.MessageItemUser{
-			ID:     generateItemID(),
-			Status: types.ItemStatusCompleted,
+			ID:      generateItemID(),
+			Status:  types.ItemStatusCompleted,
+			Speaker: speaker,
 			Content: []types.MessageContentInput{
 				{
 					Type:       types.MessageContentTypeInputAudio,
@@ -1445,6 +1644,17 @@ func generateResponse(ctx context.Context, session *Session, utt []byte, transcr
 		Item: item,
 	})

+	// Surface the recognized speaker to the client. Skip the event for an
+	// unidentified speaker unless announce_unknown is set.
+	if speaker != nil && session.voiceGate != nil && session.voiceGate.cfg.AnnounceEnabled() {
+		if speaker.Matched || session.voiceGate.cfg.Identity.AnnounceUnknown {
+			sendEvent(t, types.ConversationItemSpeakerEvent{
+				ItemID:  item.User.ID,
+				Speaker: *speaker,
+			})
+		}
+	}
+
 	triggerResponse(ctx, session, conv, t, nil)
 }

@@ -1508,6 +1718,8 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
 	})

 	imgIndex := 0
+	var lastUserSpeaker *types.Speaker
+	personalize := session.voiceGate != nil && session.voiceGate.cfg.PersonalizeEnabled()
 	conv.Lock.Lock()
 	items := trimRealtimeItems(conv.Items, session.MaxHistoryItems)
 	for _, item := range items {
@@ -1515,6 +1727,11 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
 			msg := schema.Message{
 				Role: string(types.MessageRoleUser),
 			}
+			lastUserSpeaker = item.User.Speaker
+			if personalize && session.voiceGate.cfg.Identity.InjectName &&
+				item.User.Speaker != nil && item.User.Speaker.Matched && item.User.Speaker.Name != "" {
+				msg.Name = item.User.Speaker.Name
+			}
 			textContent := ""
 			nrOfImgsInMessage := 0
 			for _, content := range item.User.Content {
@@ -1601,6 +1818,13 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
 	}
 	conv.Lock.Unlock()

+	if personalize && session.voiceGate.cfg.Identity.InjectSystemNote {
+		if note := speakerNote(lastUserSpeaker, session.voiceGate.cfg.Identity.NoteUnknown); note != "" {
+			conversationHistory[0].StringContent += "\n\n" + note
+			conversationHistory[0].Content = conversationHistory[0].StringContent
+		}
+	}
+
 	var images []string
 	for _, m := range conversationHistory {
 		images = append(images, m.StringImages...)
--- a/core/http/endpoints/openai/realtime_doubles_test.go
+++ b/core/http/endpoints/openai/realtime_doubles_test.go
@@ -75,6 +75,11 @@ type fakeModel struct {
 	transcribeDeltas []string
 	transcribeFinal  *schema.TranscriptionResult

+	// soundDetectionResult/soundDetectionErr drive the SoundDetection double so
+	// the sound-event path can be exercised deterministically.
+	soundDetectionResult *schema.SoundClassificationResult
+	soundDetectionErr    error
+
 	// Predict streaming: predictTokens are replayed through the token callback
 	// (simulating streamed LLM output); predictResp/predictErr are returned by
 	// the deferred predict function. predictChunkDeltas, when set, are delivered
@@ -83,6 +88,8 @@ type fakeModel struct {
 	predictChunkDeltas [][]*proto.ChatDelta
 	predictResp        backend.LLMResponse
 	predictErr         error
+
+	lastMessages schema.Messages
 }

 func (m *fakeModel) VAD(context.Context, *schema.VADRequest) (*schema.VADResponse, error) {
@@ -93,7 +100,15 @@ func (m *fakeModel) Transcribe(context.Context, string, string, bool, bool, stri
 	return m.transcribeFinal, nil
 }

-func (m *fakeModel) Predict(_ context.Context, _ schema.Messages, _, _, _ []string, cb func(string, backend.TokenUsage) bool, _ []types.ToolUnion, _ *types.ToolChoiceUnion, _, _ *int, _ map[string]float64) (func() (backend.LLMResponse, error), error) {
+func (m *fakeModel) SoundDetection(context.Context, string, int, float32) (*schema.SoundClassificationResult, error) {
+	if err := m.soundDetectionErr; err != nil {
+		return nil, err
+	}
+	return m.soundDetectionResult, nil
+}
+
+func (m *fakeModel) Predict(_ context.Context, msgs schema.Messages, _, _, _ []string, cb func(string, backend.TokenUsage) bool, _ []types.ToolUnion, _ *types.ToolChoiceUnion, _, _ *int, _ map[string]float64) (func() (backend.LLMResponse, error), error) {
+	m.lastMessages = msgs
 	if m.predictErr != nil {
 		return nil, m.predictErr
 	}
--- a/core/http/endpoints/openai/realtime_model.go
+++ b/core/http/endpoints/openai/realtime_model.go
@@ -31,10 +31,11 @@ var (
 // This means that we will fake an Any-to-Any model by overriding some of the gRPC client methods
 // which are for Any-To-Any models, but instead we will call a pipeline (for e.g STT->LLM->TTS)
 type wrappedModel struct {
-	TTSConfig           *config.ModelConfig
-	TranscriptionConfig *config.ModelConfig
-	LLMConfig           *config.ModelConfig
-	VADConfig           *config.ModelConfig
+	TTSConfig            *config.ModelConfig
+	TranscriptionConfig  *config.ModelConfig
+	LLMConfig            *config.ModelConfig
+	VADConfig            *config.ModelConfig
+	SoundDetectionConfig *config.ModelConfig

 	appConfig   *config.ApplicationConfig
 	modelLoader *model.ModelLoader
@@ -64,8 +65,9 @@ type anyToAnyModel struct {
 }

 type transcriptOnlyModel struct {
-	TranscriptionConfig *config.ModelConfig
-	VADConfig           *config.ModelConfig
+	TranscriptionConfig  *config.ModelConfig
+	VADConfig            *config.ModelConfig
+	SoundDetectionConfig *config.ModelConfig

 	appConfig   *config.ApplicationConfig
 	modelLoader *model.ModelLoader
@@ -80,6 +82,10 @@ func (m *transcriptOnlyModel) Transcribe(ctx context.Context, audio, language st
 	return backend.ModelTranscription(ctx, audio, language, translate, diarize, prompt, m.modelLoader, *m.TranscriptionConfig, m.appConfig)
 }

+func (m *transcriptOnlyModel) SoundDetection(ctx context.Context, audio string, topK int, threshold float32) (*schema.SoundClassificationResult, error) {
+	return modelSoundDetection(ctx, m.modelLoader, m.appConfig, m.SoundDetectionConfig, audio, topK, threshold)
+}
+
 func (m *transcriptOnlyModel) Predict(ctx context.Context, messages schema.Messages, images, videos, audios []string, tokenCallback func(string, backend.TokenUsage) bool, tools []types.ToolUnion, toolChoice *types.ToolChoiceUnion, logprobs *int, topLogprobs *int, logitBias map[string]float64) (func() (backend.LLMResponse, error), error) {
 	return nil, fmt.Errorf("predict operation not supported in transcript-only mode")
 }
@@ -108,6 +114,10 @@ func (m *wrappedModel) Transcribe(ctx context.Context, audio, language string, t
 	return backend.ModelTranscription(ctx, audio, language, translate, diarize, prompt, m.modelLoader, *m.TranscriptionConfig, m.appConfig)
 }

+func (m *wrappedModel) SoundDetection(ctx context.Context, audio string, topK int, threshold float32) (*schema.SoundClassificationResult, error) {
+	return modelSoundDetection(ctx, m.modelLoader, m.appConfig, m.SoundDetectionConfig, audio, topK, threshold)
+}
+
 func (m *wrappedModel) Predict(ctx context.Context, messages schema.Messages, images, videos, audios []string, tokenCallback func(string, backend.TokenUsage) bool, tools []types.ToolUnion, toolChoice *types.ToolChoiceUnion, logprobs *int, topLogprobs *int, logitBias map[string]float64) (func() (backend.LLMResponse, error), error) {
 	input := schema.OpenAIRequest{
 		Messages: messages,
@@ -399,6 +409,39 @@ func transcribeStream(ctx context.Context, ml *model.ModelLoader, transcriptionC
 	return final, nil
 }

+// modelSoundDetection runs sound-event classification on audio with the
+// session's sound-classification model config, passing topK and threshold to
+// the backend. It returns an error when soundConfig is nil (none configured).
+func modelSoundDetection(ctx context.Context, ml *model.ModelLoader, appConfig *config.ApplicationConfig, soundConfig *config.ModelConfig, audio string, topK int, threshold float32) (*schema.SoundClassificationResult, error) {
+	if soundConfig == nil {
+		return nil, fmt.Errorf("sound detection is not configured for this session")
+	}
+	req := backend.SoundDetectionRequest{
+		Audio:     audio,
+		TopK:      int32(topK),
+		Threshold: threshold,
+	}
+	return backend.ModelSoundDetection(ctx, req, ml, *soundConfig, appConfig)
+}
+
+// loadSoundDetectionConfig loads and validates the model config named by
+// pipeline.sound_detection. An empty name returns (nil, nil), so pipelines
+// without sound detection set up normally; load/validate failures are errors.
+func loadSoundDetectionConfig(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model.ModelLoader) (*config.ModelConfig, error) {
+	name := pipeline.SoundDetection
+	if name == "" {
+		return nil, nil
+	}
+	soundCfg, err := cl.LoadModelConfigFileByName(name, ml.ModelPath)
+	if err != nil {
+		return nil, fmt.Errorf("failed to load sound detection config: %w", err)
+	}
+	if ok, _ := soundCfg.Validate(); !ok {
+		return nil, fmt.Errorf("failed to validate sound detection config %q", name)
+	}
+	return soundCfg, nil
+}
+
 func newTranscriptionOnlyModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) (Model, *config.ModelConfig, error) {
 	cfgVAD, err := cl.LoadModelConfigFileByName(pipeline.VAD, ml.ModelPath)
 	if err != nil {
@@ -420,9 +463,15 @@ func newTranscriptionOnlyModel(pipeline *config.Pipeline, cl *config.ModelConfig
 		return nil, nil, fmt.Errorf("failed to validate config: %w", err)
 	}

+	cfgSound, err := loadSoundDetectionConfig(pipeline, cl, ml)
+	if err != nil {
+		return nil, nil, err
+	}
+
 	return &transcriptOnlyModel{
-		TranscriptionConfig: cfgSST,
-		VADConfig:           cfgVAD,
+		TranscriptionConfig:  cfgSST,
+		VADConfig:            cfgVAD,
+		SoundDetectionConfig: cfgSound,

 		confLoader:  cl,
 		modelLoader: ml,
@@ -430,6 +479,27 @@ func newTranscriptionOnlyModel(pipeline *config.Pipeline, cl *config.ModelConfig
 	}, cfgSST, nil
 }

+// newSoundDetectionOnlyModel builds a realtime Model backed solely by the
+// sound-classification config: the VAD, transcription, LLM and TTS configs are
+// left unset. Such a session activates on sounds (not speech) and is driven by
+// client-side windowing (turn_detection none + input_audio_buffer.commit)
+// rather than the voice VAD loop. pipeline.sound_detection is mandatory here.
+func newSoundDetectionOnlyModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) (Model, error) {
+	cfgSound, err := loadSoundDetectionConfig(pipeline, cl, ml)
+	switch {
+	case err != nil:
+		return nil, err
+	case cfgSound == nil:
+		return nil, fmt.Errorf("a sound-only realtime session requires pipeline.sound_detection")
+	}
+	return &transcriptOnlyModel{
+		SoundDetectionConfig: cfgSound,
+		confLoader:           cl,
+		modelLoader:          ml,
+		appConfig:            appConfig,
+	}, nil
+}
+
 // RealtimeRoutingContext is the bundle of routing dependencies the
 // realtime pipeline needs to consult router.Resolve per turn. nil-safe:
 // passing nil skips routing entirely and preserves the historical "one
@@ -544,11 +614,17 @@ func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model
 		return nil, fmt.Errorf("failed to validate config: %w", err)
 	}

+	cfgSound, err := loadSoundDetectionConfig(pipeline, cl, ml)
+	if err != nil {
+		return nil, err
+	}
+
 	wm := &wrappedModel{
-		TTSConfig:           cfgTTS,
-		TranscriptionConfig: cfgSST,
-		LLMConfig:           cfgLLM,
-		VADConfig:           cfgVAD,
+		TTSConfig:            cfgTTS,
+		TranscriptionConfig:  cfgSST,
+		LLMConfig:            cfgLLM,
+		VADConfig:            cfgVAD,
+		SoundDetectionConfig: cfgSound,

 		confLoader:  cl,
 		modelLoader: ml,
--- a/core/http/endpoints/openai/realtime_sound_detection.go
+++ b/core/http/endpoints/openai/realtime_sound_detection.go
@@ -0,0 +1,48 @@
+package openai
+
+import (
+	"context"
+
+	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
+)
+
+// defaultSoundDetectionTopK is the top_k passed to the sound classifier when
+// the session's SoundDetectionTopK is unset or otherwise not positive.
+const defaultSoundDetectionTopK = 5
+
+// emitSoundDetection runs the session's sound-event classifier on the audio at
+// audioPath and sends one conversation.item.sound_detection event for itemID
+// carrying the scored tags (possibly none). top_k comes from the session, or
+// defaultSoundDetectionTopK when the session value is not positive. A
+// classifier error is returned before anything is sent. Sound detection is
+// additive to transcription: the caller is expected to log a failure here
+// rather than abort the turn.
+func emitSoundDetection(ctx context.Context, t Transport, session *Session, itemID, audioPath string) error {
+	topK := session.SoundDetectionTopK
+	if topK <= 0 {
+		topK = defaultSoundDetectionTopK
+	}
+
+	result, err := session.ModelInterface.SoundDetection(ctx, audioPath, topK, session.SoundDetectionThreshold)
+	if err != nil {
+		return err
+	}
+
+	// tags is kept non-nil even when the classifier reports nothing (or returns
+	// a nil result), so the event always carries a slice value.
+	tags := []types.SoundDetectionTag{}
+	if result != nil {
+		for i := range result.Detections {
+			src := result.Detections[i]
+			tags = append(tags, types.SoundDetectionTag{Label: src.Label, Score: src.Score, Index: src.Index})
+		}
+	}
+
+	event := types.ConversationItemSoundDetectionEvent{
+		ServerEventBase: types.ServerEventBase{EventID: "event_TODO"},
+		ItemID:          itemID,
+		ContentIndex:    0,
+		Detections:      tags,
+	}
+	return t.SendEvent(event)
+}
--- a/core/http/endpoints/openai/realtime_sound_detection_test.go
+++ b/core/http/endpoints/openai/realtime_sound_detection_test.go
@@ -0,0 +1,170 @@
+package openai
+
+import (
+	"context"
+	"encoding/binary"
+	"errors"
+	"os"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
+	"github.com/mudler/LocalAI/core/schema"
+)
+
+// These specs pin emitSoundDetection's contract: exactly one
+// conversation.item.sound_detection event per call, none on classifier error.
+var _ = Describe("emitSoundDetection", func() {
+	It("emits a sound_detection event with the classifier's scored tags", func() {
+		session := &Session{
+			SoundDetectionEnabled: true,
+			SoundDetectionTopK:    5,
+			ModelInterface: &fakeModel{
+				soundDetectionResult: &schema.SoundClassificationResult{
+					Model: "ced",
+					Detections: []schema.SoundClassification{
+						{Index: 3, Label: "Baby cry, infant cry", Score: 0.91},
+						{Index: 7, Label: "Speech", Score: 0.42},
+					},
+				},
+			},
+		}
+		t := &fakeTransport{}
+
+		err := emitSoundDetection(context.Background(), t, session, "item1", "/tmp/x.wav")
+
+		Expect(err).ToNot(HaveOccurred())
+		Expect(t.countEvents(types.ServerEventTypeConversationItemSoundDetection)).To(Equal(1))
+
+		ev, ok := t.events[0].(types.ConversationItemSoundDetectionEvent) // the event is sent by value, not by pointer
+		Expect(ok).To(BeTrue())
+		Expect(ev.ItemID).To(Equal("item1"))
+		Expect(ev.ContentIndex).To(Equal(0))
+		Expect(ev.Detections).To(HaveLen(2))
+		Expect(ev.Detections[0].Label).To(Equal("Baby cry, infant cry"))
+		Expect(ev.Detections[0].Score).To(BeNumerically("~", 0.91, 1e-6))
+		Expect(ev.Detections[0].Index).To(Equal(3))
+		Expect(ev.Detections[1].Label).To(Equal("Speech"))
+	})
+
+	It("emits an event with no detections when the classifier returns none", func() {
+		session := &Session{
+			SoundDetectionEnabled: true, // SoundDetectionTopK stays zero: the default top_k applies
+			ModelInterface: &fakeModel{
+				soundDetectionResult: &schema.SoundClassificationResult{Model: "ced"},
+			},
+		}
+		t := &fakeTransport{}
+
+		err := emitSoundDetection(context.Background(), t, session, "item1", "/tmp/x.wav")
+
+		Expect(err).ToNot(HaveOccurred())
+		Expect(t.countEvents(types.ServerEventTypeConversationItemSoundDetection)).To(Equal(1))
+		ev, ok := t.events[0].(types.ConversationItemSoundDetectionEvent)
+		Expect(ok).To(BeTrue())
+		Expect(ev.Detections).To(BeEmpty())
+	})
+
+	It("propagates the classifier error and emits no event", func() {
+		session := &Session{
+			SoundDetectionEnabled: true,
+			ModelInterface:        &fakeModel{soundDetectionErr: errors.New("boom")},
+		}
+		t := &fakeTransport{}
+
+		err := emitSoundDetection(context.Background(), t, session, "item1", "/tmp/x.wav")
+
+		Expect(err).To(HaveOccurred())
+		Expect(t.countEvents(types.ServerEventTypeConversationItemSoundDetection)).To(Equal(0))
+	})
+})
+
+// A sound-detection-only session (InputAudioTranscription nil) must go through
+// commitUtterance without transcribing or generating: exactly one
+// sound_detection event, no transcription-completed and no response.done event.
+var _ = Describe("commitUtterance (sound-detection-only session)", func() {
+	It("emits sound detection and neither transcribes nor generates a response", func() {
+		session := &Session{
+			SoundDetectionEnabled:   true,
+			SoundDetectionTopK:      5,
+			InputAudioTranscription: nil, // sound-only: no transcription stage
+			ModelConfig:             &config.ModelConfig{},
+			ModelInterface: &fakeModel{
+				soundDetectionResult: &schema.SoundClassificationResult{
+					Model: "ced",
+					Detections: []schema.SoundClassification{
+						{Index: 23, Label: "Baby cry, infant cry", Score: 0.87},
+					},
+				},
+			},
+		}
+		tr := &fakeTransport{}
+		utt := make([]byte, 32) // non-empty PCM so commitUtterance proceeds
+
+		commitUtterance(context.Background(), utt, session, &Conversation{}, tr)
+
+		Expect(tr.countEvents(types.ServerEventTypeConversationItemSoundDetection)).To(Equal(1))
+		// No transcription happened.
+		Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionCompleted)).To(Equal(0))
+		// No LLM response was generated (sound-only has no LLM stage).
+		Expect(tr.countEvents(types.ServerEventTypeResponseDone)).To(Equal(0))
+	})
+})
+
+// Server-side windowing: a sound-only session classifies the most recent
+// SoundDetectionWindowMs of streamed audio per tick, with no client commit,
+// and keeps the input buffer trimmed to one window.
+var _ = Describe("classifySoundWindow (server-side windowing)", func() {
+	newSoundSession := func() (*Session, *fakeTransport) {
+		return &Session{
+			SoundDetectionEnabled:  true,
+			SoundDetectionTopK:     5,
+			SoundDetectionWindowMs: 200, // 200ms @ 16kHz mono16 = 6400 bytes
+			SoundDetectionHopMs:    20,
+			InputSampleRate:        16000,
+			ModelInterface: &fakeModel{
+				soundDetectionResult: &schema.SoundClassificationResult{
+					Model:      "ced",
+					Detections: []schema.SoundClassification{{Index: 23, Label: "Baby cry, infant cry", Score: 0.87}},
+				},
+			},
+		}, &fakeTransport{}
+	}
+
+	It("emits a sound_detection event and trims the buffer to one window", func() {
+		session, tr := newSoundSession()
+		session.InputAudioBuffer = make([]byte, 10000) // > 6400-byte window
+
+		classifySoundWindow(session, tr)
+
+		Expect(tr.countEvents(types.ServerEventTypeConversationItemSoundDetection)).To(Equal(1))
+		// buffer trimmed to exactly one window (200ms @ 16kHz mono 16-bit)
+		Expect(len(session.InputAudioBuffer)).To(Equal(6400))
+	})
+
+	It("does nothing when too little audio is buffered", func() {
+		session, tr := newSoundSession()
+		session.InputAudioBuffer = make([]byte, 100) // < ~10ms (320 bytes)
+
+		classifySoundWindow(session, tr)
+
+		Expect(tr.countEvents(types.ServerEventTypeConversationItemSoundDetection)).To(Equal(0))
+	})
+})
+
+var _ = Describe("writeWindowWAV", func() {
+	It("writes a mono 16-bit WAV header declaring the given sample rate", func() {
+		pcm := make([]byte, 640) // zeroed PCM payload; only size and header are checked
+		path, err := writeWindowWAV(pcm, 24000)
+		Expect(err).ToNot(HaveOccurred())
+		defer func() { _ = os.Remove(path) }()
+
+		data, err := os.ReadFile(path)
+		Expect(err).ToNot(HaveOccurred())
+		Expect(len(data)).To(BeNumerically(">=", 44+len(pcm))) // 44 = canonical PCM WAV header size
+		// SampleRate is a little-endian uint32 at byte offset 24 of a WAV header.
+		Expect(binary.LittleEndian.Uint32(data[24:28])).To(Equal(uint32(24000)))
+	})
+})
--- a/core/http/endpoints/openai/realtime_speaker_event_test.go
+++ b/core/http/endpoints/openai/realtime_speaker_event_test.go
@@ -0,0 +1,54 @@
+package openai
+
+import (
+	"encoding/json"
+
+	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("ConversationItemSpeakerEvent", func() {
+	It("marshals with the conversation.item.speaker type and nested speaker", func() {
+		ev := types.ConversationItemSpeakerEvent{
+			ItemID:  "item_123",
+			Speaker: types.Speaker{Name: "Jeremy", ID: "spk_1", Labels: map[string]string{"family": "yes"}, Confidence: 92, Distance: 0.1, Matched: true},
+		}
+		b, err := json.Marshal(ev)
+		Expect(err).ToNot(HaveOccurred())
+
+		var got map[string]any
+		Expect(json.Unmarshal(b, &got)).To(Succeed())
+		Expect(got["type"]).To(Equal("conversation.item.speaker")) // no type is set on ev above; marshaling must supply it
+		Expect(got["item_id"]).To(Equal("item_123"))
+
+		spk := got["speaker"].(map[string]any) // panics (failing the spec) if speaker is absent or not an object
+		Expect(spk["name"]).To(Equal("Jeremy"))
+		Expect(spk["id"]).To(Equal("spk_1"))
+		Expect(spk["matched"]).To(Equal(true))
+		Expect(spk["labels"]).To(HaveKeyWithValue("family", "yes"))
+	})
+
+	It("omits labels when the speaker has none", func() {
+		ev := types.ConversationItemSpeakerEvent{ItemID: "i", Speaker: types.Speaker{Name: "Jeremy", Matched: true}}
+		b, err := json.Marshal(ev)
+		Expect(err).ToNot(HaveOccurred())
+		var got map[string]any
+		Expect(json.Unmarshal(b, &got)).To(Succeed())
+		spk := got["speaker"].(map[string]any)
+		_, hasLabels := spk["labels"] // key must be absent, not merely null
+		Expect(hasLabels).To(BeFalse())
+	})
+
+	It("omits the name for an unknown speaker but keeps matched=false", func() {
+		ev := types.ConversationItemSpeakerEvent{ItemID: "i", Speaker: types.Speaker{Matched: false}}
+		b, err := json.Marshal(ev)
+		Expect(err).ToNot(HaveOccurred())
+		var got map[string]any
+		Expect(json.Unmarshal(b, &got)).To(Succeed())
+		spk := got["speaker"].(map[string]any)
+		_, hasName := spk["name"]
+		Expect(hasName).To(BeFalse())
+		Expect(spk["matched"]).To(Equal(false)) // matched is always serialized, even when false
+	})
+})
--- a/core/http/endpoints/openai/realtime_transport_webrtc.go
+++ b/core/http/endpoints/openai/realtime_transport_webrtc.go
@@ -113,8 +113,13 @@ func (t *WebRTCTransport) sendLoop() {
 				return
 			}
 			if err := t.dc.SendText(string(data)); err != nil {
-				xlog.Error("data channel send failed", "error", err)
-				return
+				// Drop just this event and keep the loop alive: a single
+				// failed send (e.g. an event over the negotiated SCTP
+				// max-message-size) must not tear down the session and
+				// silently drop every subsequent event. A genuinely dead
+				// transport is handled by the <-t.closed case.
+				xlog.Error("data channel send failed, dropping event", "error", err)
+				continue
 			}
 		case <-t.closed:
 			// Drain any remaining queued events before exiting
@@ -122,7 +127,8 @@ func (t *WebRTCTransport) sendLoop() {
 				select {
 				case data := <-t.outEvents:
 					if err := t.dc.SendText(string(data)); err != nil {
-						return
+						xlog.Error("data channel send failed while draining, dropping event", "error", err)
+						continue
 					}
 				default:
 					return
--- a/core/http/endpoints/openai/realtime_voicegate.go
+++ b/core/http/endpoints/openai/realtime_voicegate.go
@@ -7,6 +7,7 @@ import (

 	"github.com/mudler/LocalAI/core/backend"
 	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
 	"github.com/mudler/LocalAI/core/services/voicerecognition"
 	"github.com/mudler/LocalAI/pkg/model"
 )
@@ -29,6 +30,32 @@ type voiceGate struct {
 	verifyFn func(ctx context.Context, uttWav, refWav string) (bool, error)
 }

+// resolution is the outcome of resolving a committed utterance's speaker. It
+// pairs the client-facing Speaker with what the policy layer (authorize) needs:
+// the labels for the allow-list and a reason when no usable identity exists.
+type resolution struct {
+	speaker types.Speaker     // name/id/confidence/distance/matched
+	labels  map[string]string // identify-mode metadata labels, for the allow-list
+	found   bool              // a candidate existed (identify: even if unmatched)
+	reason  string            // why unknown / why denied, at the resolve level
+}
+
+// confidence converts a cosine distance into a 0..100 score: 100 at distance 0,
+// 0 at or beyond threshold (or when threshold <= 0), like /v1/voice/identify.
+func confidence(distance, threshold float32) float32 {
+	if threshold <= 0 {
+		return 0
+	}
+	switch score := (1 - distance/threshold) * 100; {
+	case score < 0:
+		return 0
+	case score > 100:
+		return 100
+	default:
+		return score
+	}
+}
+
 // newVoiceGate builds a gate from a pipeline's voice_recognition config. It
 // validates fail-fast (before loading the model), loads the recognition model
 // config, wires the real backend seams, and pre-embeds references for verify
@@ -89,91 +116,143 @@ func newVoiceGate(
 	return g, nil
 }

-// Authorize embeds the utterance and decides allow/deny.
-//
-//	allowed: speaker is authorized.
-//	matched: matched person's name (informational), empty if none.
-//	reason:  human-readable deny reason.
-//	err:     backend failure (caller should fail closed).
-func (g *voiceGate) Authorize(ctx context.Context, wavPath string) (allowed bool, matched string, reason string, err error) {
+// Resolve identifies the utterance's speaker using the gate's mode (verify, or
+// identify otherwise) without applying the authorization policy (see authorize).
+// On a backend error it returns the error and a resolution carrying only a reason.
+func (g *voiceGate) Resolve(ctx context.Context, wavPath string) (resolution, error) {
 	if g.cfg.Mode == config.VoiceGateModeVerify {
-		return g.authorizeVerify(ctx, wavPath)
+		return g.resolveVerify(ctx, wavPath)
 	}
-	return g.authorizeIdentify(ctx, wavPath)
+	return g.resolveIdentify(ctx, wavPath)
 }

-func (g *voiceGate) authorizeIdentify(ctx context.Context, wavPath string) (bool, string, string, error) {
+func (g *voiceGate) resolveIdentify(ctx context.Context, wavPath string) (resolution, error) {
 	emb, err := g.embedFn(ctx, wavPath)
 	if err != nil {
-		return false, "", "embed failed", err
+		return resolution{reason: "embed failed"}, err
 	}
 	if len(emb) == 0 {
-		return false, "", "no speech detected", nil
+		return resolution{reason: "no speech detected"}, nil
 	}
 	matches, err := g.registry.Identify(ctx, emb, 1)
 	if err != nil {
-		return false, "", "identify failed", err
+		return resolution{reason: "identify failed"}, err
 	}
 	if len(matches) == 0 {
-		return false, "", "unknown speaker", nil
+		return resolution{reason: "unknown speaker"}, nil
 	}
 	m := matches[0]
-	if m.Distance > g.cfg.Threshold {
-		return false, m.Metadata.Name, "distance above threshold", nil
+	matched := m.Distance <= g.cfg.Threshold // inclusive: distance == threshold still matches
+	r := resolution{
+		speaker: types.Speaker{
+			Name:       m.Metadata.Name,
+			ID:         m.Metadata.ID,
+			Labels:     m.Metadata.Labels,
+			Distance:   m.Distance,
+			Confidence: confidence(m.Distance, g.cfg.Threshold),
+			Matched:    matched,
+		},
+		labels: m.Metadata.Labels,
+		found:  true, // a candidate exists even when it is above the threshold
 	}
-	if !g.allowMatch(m.Metadata) {
-		return false, m.Metadata.Name, "speaker not in allow list", nil
+	if !matched {
+		r.reason = "distance above threshold" // the candidate's name stays on r.speaker
 	}
-	return true, m.Metadata.Name, "", nil
+	return r, nil
+}
+
+// resolveVerify checks the utterance against the gate's references and stops at
+// the first one that matches. With AntiSpoofing each reference audio goes through
+// verifyFn and a hit reports Confidence 100; otherwise the utterance is embedded
+// once and compared to each reference embedding by cosine distance.
+func (g *voiceGate) resolveVerify(ctx context.Context, wavPath string) (resolution, error) {
+	if g.cfg.AntiSpoofing {
+		for _, ref := range g.refAudios {
+			ok, err := g.verifyFn(ctx, wavPath, ref.Audio)
+			if err != nil {
+				return resolution{reason: "verify failed"}, err
+			}
+			if ok {
+				spk := types.Speaker{Name: ref.Name, Confidence: 100, Matched: true}
+				return resolution{speaker: spk, found: true}, nil
+			}
+		}
+		return resolution{reason: "no reference matched"}, nil
+	}
+
+	uttEmb, err := g.embedFn(ctx, wavPath)
+	if err != nil {
+		return resolution{reason: "embed failed"}, err
+	}
+	if len(uttEmb) == 0 {
+		return resolution{reason: "no speech detected"}, nil
+	}
+	for _, ref := range g.refEmbeds {
+		dist := cosineDistance(uttEmb, ref.emb)
+		if dist <= g.cfg.Threshold {
+			spk := types.Speaker{Name: ref.name, Distance: dist, Confidence: confidence(dist, g.cfg.Threshold), Matched: true}
+			return resolution{speaker: spk, found: true}, nil
+		}
+	}
+	return resolution{reason: "no reference matched"}, nil
+}
+
+// authorize turns an already-resolved identity into an allow/deny verdict and deny reason.
+func (g *voiceGate) authorize(r resolution) (allowed bool, reason string) {
+	if g.cfg.Mode == config.VoiceGateModeVerify {
+		switch {
+		case r.speaker.Matched:
+			return true, ""
+		case r.reason == "":
+			return false, "no reference matched"
+		default:
+			return false, r.reason
+		}
+	}
+	switch {
+	case !r.found:
+		return false, r.reason
+	case !r.speaker.Matched:
+		return false, "distance above threshold"
+	case !g.allowMatch(r.speaker.Name, r.labels):
+		return false, "speaker not in allow list"
+	}
+	return true, ""
+}

 // allowMatch reports whether a matched identity is authorized. An empty allow
 // (no names and no labels) authorizes any registered speaker.
-func (g *voiceGate) allowMatch(meta voicerecognition.Metadata) bool {
+func (g *voiceGate) allowMatch(name string, labels map[string]string) bool {
 	a := g.cfg.Allow
 	if len(a.Names) == 0 && len(a.Labels) == 0 {
 		return true
 	}
 	for _, n := range a.Names {
-		if n == meta.Name {
+		if n == name { // exact, case-sensitive comparison
 			return true
 		}
 	}
 	for _, l := range a.Labels {
-		if _, ok := meta.Labels[l]; ok {
+		if _, ok := labels[l]; ok { // the label key being present is enough; its value is ignored
 			return true
 		}
 	}
 	return false
 }

-func (g *voiceGate) authorizeVerify(ctx context.Context, wavPath string) (bool, string, string, error) {
-	if g.cfg.AntiSpoofing {
-		for _, r := range g.refAudios {
-			ok, err := g.verifyFn(ctx, wavPath, r.Audio)
-			if err != nil {
-				return false, "", "verify failed", err
-			}
-			if ok {
-				return true, r.Name, "", nil
-			}
-		}
-		return false, "", "no reference matched", nil
+// Authorize is the legacy convenience wrapper: Resolve, then apply authorize.
+//
+//	allowed: speaker is authorized.
+//	matched: resolved candidate's name (informational, set even on deny), empty if none.
+//	reason:  human-readable deny reason.
+//	err:     backend failure (caller should fail closed).
+func (g *voiceGate) Authorize(ctx context.Context, wavPath string) (allowed bool, matched string, reason string, err error) {
+	r, rerr := g.Resolve(ctx, wavPath)
+	if rerr != nil {
+		return false, "", r.reason, rerr
 	}
-
-	emb, err := g.embedFn(ctx, wavPath)
-	if err != nil {
-		return false, "", "embed failed", err
-	}
-	if len(emb) == 0 {
-		return false, "", "no speech detected", nil
-	}
-	for _, r := range g.refEmbeds {
-		if cosineDistance(emb, r.emb) <= g.cfg.Threshold {
-			return true, r.name, "", nil
-		}
-	}
-	return false, "", "no reference matched", nil
+	allowed, reason = g.authorize(r)
+	return allowed, r.speaker.Name, reason, nil
 }

 // decide interprets an Authorize result against the gate's when-policy and the
--- a/core/http/endpoints/openai/realtime_voicegate_integration_test.go
+++ b/core/http/endpoints/openai/realtime_voicegate_integration_test.go
@@ -152,3 +152,252 @@ var _ = Describe("realtime voice gate integration (commitUtterance)", func() {
 		Expect(tr2.countEvents(types.ServerEventTypeResponseDone)).To(BeNumerically(">=", 1))
 	})
 })
+
+var _ = Describe("realtime speaker surfacing (commitUtterance)", func() {
+	utt := make([]byte, 32) // non-empty PCM so commitUtterance proceeds
+
+	It("emits conversation.item.speaker for a confident match when announce is on", func() {
+		session, _ := itSession(itGate("alice", "alice", []float32{1, 0, 0}, nil,
+			config.VoiceGateWhenEvery, config.VoiceGateRejectEvent))
+		session.voiceGate.cfg.Identity = &config.VoiceIdentityConfig{Announce: true}
+		tr := &fakeTransport{}
+
+		commitUtterance(context.Background(), utt, session, &Conversation{}, tr)
+
+		Expect(tr.countEvents(types.ServerEventTypeConversationItemSpeaker)).To(Equal(1))
+	})
+
+	It("does not emit the speaker event for an unknown speaker unless announce_unknown is set", func() {
+		// match distance above threshold => not matched
+		gate := &voiceGate{
+			cfg: config.PipelineVoiceRecognition{
+				Mode: config.VoiceGateModeIdentify, Threshold: 0.25,
+				When: config.VoiceGateWhenEvery, OnReject: config.VoiceGateRejectEvent,
+				Enforce:  boolPtr(false),
+				Identity: &config.VoiceIdentityConfig{Announce: true},
+			},
+			registry: &fakeRegistry{matches: []voicerecognition.Match{
+				{Distance: 0.9, Metadata: voicerecognition.Metadata{Name: "alice"}},
+			}},
+			embedFn: func(context.Context, string) ([]float32, error) { return []float32{1, 0, 0}, nil },
+		}
+		session, _ := itSession(gate)
+		tr := &fakeTransport{}
+
+		commitUtterance(context.Background(), utt, session, &Conversation{}, tr)
+		Expect(tr.countEvents(types.ServerEventTypeConversationItemSpeaker)).To(Equal(0))
+
+		gate.cfg.Identity.AnnounceUnknown = true // same gate and session, fresh transport
+		tr2 := &fakeTransport{}
+		commitUtterance(context.Background(), utt, session, &Conversation{}, tr2)
+		Expect(tr2.countEvents(types.ServerEventTypeConversationItemSpeaker)).To(Equal(1))
+	})
+
+	It("never drops a turn when enforce is false even for a disallowed speaker", func() {
+		session, _ := itSession(itGate("bob", "alice", []float32{1, 0, 0}, nil,
+			config.VoiceGateWhenEvery, config.VoiceGateRejectEvent))
+		session.voiceGate.cfg.Enforce = boolPtr(false)
+		tr := &fakeTransport{}
+
+		commitUtterance(context.Background(), utt, session, &Conversation{}, tr)
+
+		Expect(hasSpeakerNotAuthorized(tr)).To(BeFalse())
+		Expect(tr.countEvents(types.ServerEventTypeResponseDone)).To(BeNumerically(">=", 1))
+	})
+})
+
+var _ = Describe("realtime speaker personalization (triggerResponseAtTurn)", func() {
+	utt := make([]byte, 32) // non-empty PCM so commitUtterance proceeds
+
+	findRole := func(msgs schema.Messages, role string) *schema.Message { // first message with the role, or nil
+		for i := range msgs {
+			if msgs[i].Role == role {
+				return &msgs[i]
+			}
+		}
+		return nil
+	}
+
+	It("sets the user message name and a current-speaker system note", func() {
+		session, m := itSession(itGate("alice", "alice", []float32{1, 0, 0}, nil,
+			config.VoiceGateWhenEvery, config.VoiceGateRejectEvent))
+		session.voiceGate.cfg.Identity = &config.VoiceIdentityConfig{
+			Personalize: true, InjectName: true, InjectSystemNote: true,
+		}
+		session.Instructions = "You are helpful."
+		tr := &fakeTransport{}
+
+		commitUtterance(context.Background(), utt, session, &Conversation{}, tr)
+
+		user := findRole(m.lastMessages, "user")
+		Expect(user).ToNot(BeNil())
+		Expect(user.Name).To(Equal("alice"))
+		sys := findRole(m.lastMessages, "system")
+		Expect(sys).ToNot(BeNil())
+		Expect(sys.StringContent).To(ContainSubstring("The current speaker is alice."))
+	})
+
+	It("omits the unknown note unless note_unknown is set", func() {
+		base := func() (*Session, *fakeModel) {
+			gate := &voiceGate{
+				cfg: config.PipelineVoiceRecognition{
+					Mode: config.VoiceGateModeIdentify, Threshold: 0.25,
+					When: config.VoiceGateWhenEvery, OnReject: config.VoiceGateRejectEvent,
+					Enforce:  boolPtr(false),
+					Identity: &config.VoiceIdentityConfig{Personalize: true, InjectSystemNote: true},
+				},
+				registry: &fakeRegistry{matches: []voicerecognition.Match{
+					{Distance: 0.9, Metadata: voicerecognition.Metadata{Name: "alice"}},
+				}},
+				embedFn: func(context.Context, string) ([]float32, error) { return []float32{1, 0, 0}, nil },
+			}
+			s, m := itSession(gate)
+			s.Instructions = "You are helpful."
+			return s, m
+		}
+
+		s1, m1 := base()
+		commitUtterance(context.Background(), utt, s1, &Conversation{}, &fakeTransport{})
+		Expect(findRole(m1.lastMessages, "system").StringContent).ToNot(ContainSubstring("unknown")) // panics if no system message exists
+
+		s2, m2 := base()
+		s2.voiceGate.cfg.Identity.NoteUnknown = true
+		commitUtterance(context.Background(), utt, s2, &Conversation{}, &fakeTransport{})
+		Expect(findRole(m2.lastMessages, "system").StringContent).To(ContainSubstring("The current speaker is unknown."))
+	})
+})
+
+var _ = Describe("realtime when:first with identity (commitUtterance)", func() {
+	utt := make([]byte, 32) // non-empty PCM so commitUtterance proceeds
+
+	// statefulIdentityGate builds a when:first identify gate with an Identity
+	// block (so identity is resolved every turn) whose embedFn is driven by a
+	// per-turn counter: the failOnSecond flag makes the second and later embeds
+	// return an error, exercising the stricter fail-closed path on a re-resolve.
+	statefulIdentityGate := func(failOnSecond bool) *voiceGate {
+		calls := 0 // embedFn invocations; captured by the closure below
+		return &voiceGate{
+			cfg: config.PipelineVoiceRecognition{
+				Mode:      config.VoiceGateModeIdentify,
+				Threshold: 0.25,
+				When:      config.VoiceGateWhenFirst,
+				OnReject:  config.VoiceGateRejectEvent,
+				Allow:     config.VoiceRecognitionAllow{Names: []string{"alice"}},
+				Identity:  &config.VoiceIdentityConfig{Announce: true, Personalize: true, InjectName: true},
+			},
+			registry: &fakeRegistry{matches: []voicerecognition.Match{
+				{Distance: 0.1, Metadata: voicerecognition.Metadata{Name: "alice"}},
+			}},
+			embedFn: func(context.Context, string) ([]float32, error) {
+				calls++
+				if failOnSecond && calls > 1 {
+					return nil, errors.New("embed backend down")
+				}
+				return []float32{1, 0, 0}, nil
+			},
+		}
+	}
+
+	It("re-resolves identity every turn and fails closed when a later embed errors", func() {
+		gate := statefulIdentityGate(true)
+		session, _ := itSession(gate)
+		conv := &Conversation{} // shared so voiceVerified persists across turns
+
+		// Turn 1: authorized; identity resolved, speaker surfaced, response runs.
+		tr1 := &fakeTransport{}
+		commitUtterance(context.Background(), utt, session, conv, tr1)
+		Expect(hasSpeakerNotAuthorized(tr1)).To(BeFalse())
+		Expect(tr1.countEvents(types.ServerEventTypeConversationItemSpeaker)).To(Equal(1))
+		Expect(tr1.countEvents(types.ServerEventTypeResponseDone)).To(BeNumerically(">=", 1))
+
+		// Turn 2: when:first would skip re-authorization, but the Identity block
+		// forces a fresh resolve. That resolve now errors, and because the gate
+		// enforces, the turn is dropped fail-closed rather than riding on the
+		// cached first verification.
+		tr2 := &fakeTransport{}
+		commitUtterance(context.Background(), utt, session, conv, tr2)
+		Expect(hasSpeakerNotAuthorized(tr2)).To(BeTrue())
+		Expect(tr2.countEvents(types.ServerEventTypeResponseDone)).To(Equal(0))
+	})
+
+	It("re-resolves identity every turn so a later turn still surfaces and names the speaker", func() {
+		gate := statefulIdentityGate(false)
+		session, m := itSession(gate)
+		conv := &Conversation{} // shared across both turns, as above
+
+		tr1 := &fakeTransport{}
+		commitUtterance(context.Background(), utt, session, conv, tr1)
+		Expect(hasSpeakerNotAuthorized(tr1)).To(BeFalse())
+		Expect(tr1.countEvents(types.ServerEventTypeResponseDone)).To(BeNumerically(">=", 1))
+
+		// Turn 2: authorization is skipped (when:first, already verified) but the
+		// speaker event still fires and the per-message name is set, proving the
+		// per-turn re-resolution (not the cached first verification) drove it.
+		tr2 := &fakeTransport{}
+		commitUtterance(context.Background(), utt, session, conv, tr2)
+		Expect(tr2.countEvents(types.ServerEventTypeConversationItemSpeaker)).To(Equal(1))
+		var lastUser *schema.Message
+		for i := range m.lastMessages {
+			if m.lastMessages[i].Role == "user" {
+				lastUser = &m.lastMessages[i] // no break: the final user message wins
+			}
+		}
+		Expect(lastUser).ToNot(BeNil())
+		Expect(lastUser.Name).To(Equal("alice"))
+	})
+})
+
+var _ = Describe("realtime multi-speaker history attribution (triggerResponse)", func() {
+	userAudioItem := func(name, transcript string) *types.MessageItemUnion { // completed user audio item tagged with a matched speaker
+		return &types.MessageItemUnion{
+			User: &types.MessageItemUser{
+				ID:      generateItemID(),
+				Status:  types.ItemStatusCompleted,
+				Speaker: &types.Speaker{Name: name, Matched: true},
+				Content: []types.MessageContentInput{
+					{Type: types.MessageContentTypeInputAudio, Transcript: transcript},
+				},
+			},
+		}
+	}
+
+	It("attributes each user turn to its own speaker and notes the latest one", func() {
+		session, m := itSession(itGate("alice", "alice", []float32{1, 0, 0}, nil,
+			config.VoiceGateWhenEvery, config.VoiceGateRejectEvent))
+		session.Instructions = "You are helpful."
+		session.MaxHistoryItems = 10 // keep both items; 0 would mean "no trim" too
+		session.voiceGate.cfg.Identity = &config.VoiceIdentityConfig{
+			Personalize: true, InjectName: true, InjectSystemNote: true,
+		}
+
+		conv := &Conversation{Items: []*types.MessageItemUnion{
+			userAudioItem("alice", "hello there"),
+			userAudioItem("bob", "what is the weather"),
+		}}
+		tr := &fakeTransport{}
+
+		triggerResponse(context.Background(), session, conv, tr, nil)
+
+		var users []*schema.Message
+		var sys *schema.Message
+		for i := range m.lastMessages {
+			switch m.lastMessages[i].Role {
+			case "user":
+				users = append(users, &m.lastMessages[i])
+			case "system":
+				if sys == nil { // keep only the first system message
+					sys = &m.lastMessages[i]
+				}
+			}
+		}
+		Expect(users).To(HaveLen(2))
+		Expect(users[0].Name).To(Equal("alice"))
+		Expect(users[1].Name).To(Equal("bob"))
+
+		Expect(sys).ToNot(BeNil())
+		Expect(sys.StringContent).To(ContainSubstring("The current speaker is bob."))
+		Expect(sys.StringContent).ToNot(ContainSubstring("alice"))
+	})
+})
+
+func boolPtr(b bool) *bool { return &b } // boolPtr returns a pointer to its own copy of b.
--- a/core/http/endpoints/openai/realtime_voicegate_test.go
+++ b/core/http/endpoints/openai/realtime_voicegate_test.go
@@ -10,6 +10,82 @@ import (
 	. "github.com/onsi/gomega"
 )

+// Exercises voiceGate.Resolve and voiceGate.authorize against a stub registry
+// and a stub embedding function, so no audio is actually processed.
+var _ = Describe("voiceGate.Resolve + authorize", func() {
+	const clip = "x.wav"
+
+	// newGate builds an identify-mode gate with a 0.25 threshold whose registry
+	// returns one candidate, "alice", at distance 0.1. allow is the name
+	// allow list.
+	newGate := func(allow []string) *voiceGate {
+		enrolled := voicerecognition.Match{
+			Distance: 0.1,
+			Metadata: voicerecognition.Metadata{
+				ID:     "spk_1",
+				Name:   "alice",
+				Labels: map[string]string{"family": "yes"},
+			},
+		}
+		embed := func(context.Context, string) ([]float32, error) {
+			return []float32{1, 0, 0}, nil
+		}
+		return &voiceGate{
+			cfg: config.PipelineVoiceRecognition{
+				Mode:      config.VoiceGateModeIdentify,
+				Threshold: 0.25,
+				Allow:     config.VoiceRecognitionAllow{Names: allow},
+			},
+			registry: &fakeRegistry{matches: []voicerecognition.Match{enrolled}},
+			embedFn:  embed,
+		}
+	}
+
+	It("resolves a confident identity with name, id and a 0..100 confidence", func() {
+		res, err := newGate(nil).Resolve(context.Background(), clip)
+		Expect(err).ToNot(HaveOccurred())
+		Expect(res.found).To(BeTrue())
+
+		who := res.speaker
+		Expect(who.Name).To(Equal("alice"))
+		Expect(who.ID).To(Equal("spk_1"))
+		Expect(who.Matched).To(BeTrue())
+		Expect(who.Confidence).To(And(
+			BeNumerically(">", 0),
+			BeNumerically("<=", 100),
+		))
+		Expect(who.Labels).To(HaveKeyWithValue("family", "yes"))
+	})
+
+	It("marks a candidate above the threshold as not matched", func() {
+		gate := newGate(nil)
+		distant := voicerecognition.Match{
+			Distance: 0.9,
+			Metadata: voicerecognition.Metadata{Name: "alice"},
+		}
+		gate.registry = &fakeRegistry{matches: []voicerecognition.Match{distant}}
+
+		res, err := gate.Resolve(context.Background(), clip)
+		Expect(err).ToNot(HaveOccurred())
+		Expect(res.found).To(BeTrue())
+		Expect(res.speaker.Matched).To(BeFalse())
+		Expect(res.speaker.Name).To(Equal("alice")) // name still surfaced
+	})
+
+	It("authorize allows a confident match in the allow list", func() {
+		gate := newGate([]string{"alice"})
+		res, _ := gate.Resolve(context.Background(), clip)
+
+		ok, why := gate.authorize(res)
+		Expect(ok).To(BeTrue())
+		Expect(why).To(BeEmpty())
+	})
+
+	It("authorize denies a confident match outside the allow list", func() {
+		gate := newGate([]string{"bob"})
+		res, _ := gate.Resolve(context.Background(), clip)
+
+		ok, why := gate.authorize(res)
+		Expect(ok).To(BeFalse())
+		Expect(why).To(Equal("speaker not in allow list"))
+	})
+
+	It("authorize allows by label when names do not match", func() {
+		gate := newGate(nil)
+		gate.cfg.Allow = config.VoiceRecognitionAllow{Labels: []string{"family"}}
+		res, _ := gate.Resolve(context.Background(), clip)
+
+		ok, _ := gate.authorize(res)
+		Expect(ok).To(BeTrue())
+	})
+})
+
+// Pins the boundary values of confidence(distance, threshold).
+var _ = Describe("confidence", func() {
+	// tolerance is the absolute error accepted by the "~" comparisons below.
+	const tolerance = 1e-4
+
+	It("is 100 at zero distance", func() {
+		score := confidence(0, 0.25)
+		Expect(score).To(BeNumerically("~", 100, tolerance))
+	})
+
+	It("clamps to 0 above the threshold", func() {
+		score := confidence(0.5, 0.25)
+		Expect(score).To(BeNumerically("~", 0, tolerance))
+	})
+
+	It("is 0 for a non-positive threshold", func() {
+		score := confidence(0.1, 0)
+		Expect(score).To(BeNumerically("~", 0, tolerance))
+	})
+})
+
 var _ = Describe("cosineDistance", func() {
 	It("is 0 for identical vectors", func() {
 		Expect(cosineDistance([]float32{1, 0, 0}, []float32{1, 0, 0})).To(BeNumerically("~", 0, 1e-6))
--- a/core/http/endpoints/openai/realtime_webrtc.go
+++ b/core/http/endpoints/openai/realtime_webrtc.go
@@ -128,10 +128,13 @@ func RealtimeCalls(application *application.Application) echo.HandlerFunc {
 			handleIncomingAudioTrack(track, transport)
 		})

-		// Set the remote SDP (client's offer)
+		// Set the remote SDP (client's offer). Raise the data-channel
+		// max-message-size the browser advertised so pion permits the larger
+		// realtime events some turns produce (e.g. tool calls), which would
+		// otherwise be dropped on send. See realtime_webrtc_sctp.go.
 		if err := pc.SetRemoteDescription(webrtc.SessionDescription{
 			Type: webrtc.SDPTypeOffer,
-			SDP:  req.SDP,
+			SDP:  raiseDataChannelMaxMessageSize(req.SDP),
 		}); err != nil {
 			transport.Close()
 			xlog.Error("failed to set remote description", "error", err)
--- a/core/http/endpoints/openai/realtime_webrtc_sctp.go
+++ b/core/http/endpoints/openai/realtime_webrtc_sctp.go
@@ -0,0 +1,29 @@
+package openai
+
+import (
+	"fmt"
+	"regexp"
+	"strconv"
+)
+
+// realtimeDataChannelMaxMessageSize is the SCTP max-message-size LocalAI honors
+// for the "oai-events" data channel, in bytes.
+//
+// Browsers advertise a conservative max-message-size in their SDP offer (Chrome
+// uses 262144 = 256 KiB). pion enforces the remote's advertised value on send,
+// so a single realtime event larger than it cannot be sent: the SendText fails,
+// the event is dropped, and the turn silently yields no response. Some turns
+// legitimately produce a single JSON event above 256 KiB (notably tool calls
+// with sizeable schemas or results). Browsers advertise this value
+// conservatively but their SCTP stacks reassemble much larger messages, so we
+// raise the value honored for our own server-generated events.
+const realtimeDataChannelMaxMessageSize = 16 * 1024 * 1024 // 16 MiB
+
+// maxMessageSizeAttrPrefix is the SDP attribute up to and including the colon
+// that precedes its numeric value.
+const maxMessageSizeAttrPrefix = "a=max-message-size:"
+
+// maxMessageSizeAttrRe matches one max-message-size attribute together with
+// its decimal value.
+var maxMessageSizeAttrRe = regexp.MustCompile(`a=max-message-size:\d+`)
+
+// raiseDataChannelMaxMessageSize rewrites the SCTP max-message-size attribute in
+// an SDP offer to realtimeDataChannelMaxMessageSize so pion permits larger
+// outbound realtime events.
+//
+// The rewrite only ever raises the limit. An attribute is left exactly as the
+// peer sent it when:
+//   - its value is 0, which the SDP SCTP spec defines as "no limit";
+//   - its value is already at or above realtimeDataChannelMaxMessageSize;
+//   - its value does not fit in a uint64 (it is then larger than our target).
+//
+// Every occurrence in the offer is examined independently. Offers that don't
+// carry the attribute are returned unchanged.
+func raiseDataChannelMaxMessageSize(sdp string) string {
+	raised := fmt.Sprintf("%s%d", maxMessageSizeAttrPrefix, realtimeDataChannelMaxMessageSize)
+	return maxMessageSizeAttrRe.ReplaceAllStringFunc(sdp, func(attr string) string {
+		// The regexp guarantees attr is the prefix followed by digits only,
+		// so the sole possible parse error is an out-of-range value.
+		advertised, err := strconv.ParseUint(attr[len(maxMessageSizeAttrPrefix):], 10, 64)
+		if err != nil || advertised == 0 || advertised >= realtimeDataChannelMaxMessageSize {
+			return attr
+		}
+		return raised
+	})
+}
--- a/core/http/endpoints/openai/realtime_webrtc_sctp_test.go
+++ b/core/http/endpoints/openai/realtime_webrtc_sctp_test.go
@@ -0,0 +1,33 @@
+package openai
+
+import (
+	"fmt"
+	"strings"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// Specs for the SDP rewrite applied to the client's offer before it is handed
+// to pion.
+var _ = Describe("raiseDataChannelMaxMessageSize", func() {
+	// raisedAttr is the attribute text a rewritten offer is expected to carry.
+	raisedAttr := fmt.Sprintf("a=max-message-size:%d", realtimeDataChannelMaxMessageSize)
+
+	It("raises a max-message-size the browser advertised", func() {
+		const browserOffer = "v=0\r\nm=application 9 UDP/DTLS/SCTP webrtc-datachannel\r\na=max-message-size:262144\r\n"
+		rewritten := raiseDataChannelMaxMessageSize(browserOffer)
+		Expect(rewritten).To(ContainSubstring(raisedAttr))
+		Expect(rewritten).NotTo(ContainSubstring("a=max-message-size:262144"))
+	})
+
+	It("leaves an offer without the attribute unchanged", func() {
+		const bareOffer = "v=0\r\nm=application 9 UDP/DTLS/SCTP webrtc-datachannel\r\n"
+		Expect(raiseDataChannelMaxMessageSize(bareOffer)).To(Equal(bareOffer))
+	})
+
+	It("rewrites every occurrence", func() {
+		const twice = "a=max-message-size:1024\r\na=max-message-size:262144\r\n"
+		rewritten := raiseDataChannelMaxMessageSize(twice)
+		Expect(strings.Count(rewritten, raisedAttr)).To(Equal(2))
+	})
+
+	It("raises above the 256 KiB browsers advertise", func() {
+		Expect(realtimeDataChannelMaxMessageSize).To(BeNumerically(">", 262144))
+	})
+})
--- a/core/http/endpoints/openai/sound_classification.go
+++ b/core/http/endpoints/openai/sound_classification.go
@@ -0,0 +1,91 @@
+package openai
+
+import (
+	"io"
+	"net/http"
+	"os"
+	"path"
+	"path/filepath"
+
+	"github.com/labstack/echo/v4"
+	"github.com/mudler/LocalAI/core/backend"
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/http/middleware"
+	"github.com/mudler/LocalAI/core/schema"
+	model "github.com/mudler/LocalAI/pkg/model"
+
+	"github.com/mudler/xlog"
+)
+
+// SoundClassificationEndpoint runs an audio-tagging / sound-event
+// classification model (e.g. ced) over an uploaded clip and returns the
+// scored AudioSet tags in score-descending order. It mirrors the
+// transcription path: multipart audio upload -> temp file -> backend call.
+//
+// The handler answers 400 Bad Request when the parsed request or model config
+// is absent from the echo context, when no model is named, or when the "file"
+// upload is missing or cannot be parsed. The upload is copied into a
+// per-request temporary directory that is removed when the handler returns.
+// Any other failure is returned as-is to echo's error handling.
+//
+// @Summary Classify sound events in audio (audio tagging).
+// @Tags audio
+// @accept multipart/form-data
+// @Param model formData string true "model"
+// @Param file formData file true "audio file"
+// @Param top_k formData int false "number of top tags to return (0 = backend default)"
+// @Param threshold formData number false "drop tags scoring below this value"
+// @Success 200 {object} schema.SoundClassificationResult
+// @Router /v1/audio/classification [post]
+func SoundClassificationEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) echo.HandlerFunc {
+	return func(c echo.Context) error {
+		input, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.OpenAIRequest)
+		if !ok || input.Model == "" {
+			return echo.ErrBadRequest
+		}
+
+		modelConfig, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.ModelConfig)
+		if !ok || modelConfig == nil {
+			return echo.ErrBadRequest
+		}
+
+		req := backend.SoundDetectionRequest{
+			TopK:      int32(parseFormInt(c, "top_k", 0)),
+			Threshold: float32(parseFormFloat(c, "threshold", 0)),
+		}
+
+		file, err := c.FormFile("file")
+		if err != nil {
+			// A missing or unparsable upload is the client's mistake, so
+			// report it as a bad request instead of a generic server error.
+			return echo.NewHTTPError(http.StatusBadRequest, err.Error())
+		}
+		f, err := file.Open()
+		if err != nil {
+			return err
+		}
+		defer func() { _ = f.Close() }()
+
+		dir, err := os.MkdirTemp("", "sound-classification")
+		if err != nil {
+			return err
+		}
+		defer func() { _ = os.RemoveAll(dir) }()
+
+		// Keep only the last element of the client-supplied name. A name that
+		// reduces to a directory reference would make os.Create target a
+		// directory, so it is swapped for a neutral placeholder.
+		name := path.Base(file.Filename)
+		if name == "." || name == ".." || name == "/" {
+			name = "audio"
+		}
+
+		dst := filepath.Join(dir, name)
+		dstFile, err := os.Create(dst) // #nosec G304 -- dst is a server-created temp dir joined with path.Base of the upload name (no traversal)
+		if err != nil {
+			return err
+		}
+
+		// Close unconditionally, then inspect both results: a write that
+		// failed late may only be reported by Close.
+		_, copyErr := io.Copy(dstFile, f)
+		closeErr := dstFile.Close()
+		if copyErr != nil {
+			xlog.Debug("Audio file copying error", "filename", file.Filename, "dst", dst, "error", copyErr)
+			return copyErr
+		}
+		if closeErr != nil {
+			xlog.Debug("Audio file close error", "filename", file.Filename, "dst", dst, "error", closeErr)
+			return closeErr
+		}
+		req.Audio = dst
+
+		result, err := backend.ModelSoundDetection(c.Request().Context(), req, ml, *modelConfig, appConfig)
+		if err != nil {
+			xlog.Error("Sound classification failed",
+				"model", modelConfig.Name,
+				"audio", dst,
+				"error", err)
+			return err
+		}
+
+		return c.JSON(http.StatusOK, result)
+	}
+}
--- a/core/http/endpoints/openai/types/message_item.go
+++ b/core/http/endpoints/openai/types/message_item.go
@@ -102,6 +102,10 @@ type MessageItemUser struct {

 	// The status of the item. Has no effect on the conversation.
 	Status ItemStatus `json:"status,omitempty"`
+
+	// Speaker is the recognized speaker for this audio turn (LocalAI extension).
+	// Used to attribute past turns when rebuilding the LLM message history.
+	Speaker *Speaker `json:"speaker,omitempty"`
 }

 func (m MessageItemUser) MessageItemType() MessageItemType {
--- a/Show More
+++ b/Show More