test(e2e): live-server voice-recognition gate test (#10324)

Add mock-backend VoiceEmbed/VoiceVerify (deterministic DC-offset speaker discrimination) and a verify-mode gated realtime pipeline, then drive the real HTTP/WS stack: an authorized speaker reaches response.done while an unauthorized one is dropped before the LLM with a speaker_not_authorized event. Assisted-by: Claude:opus-4.8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
chore: ⬆️ Update vllm-project/vllm cu130 wheel to 0.23.0 (#10314)
2026-06-14 11:49:33 -04:00 · 2026-06-13 23:54:27 +02:00 · 2026-06-13 23:39:10 +02:00 · 2026-06-13 23:38:08 +02:00 · 2026-06-13 23:09:59 +02:00 · 2026-06-13 20:10:22 +00:00
98 changed files with 8698 additions and 694 deletions
--- a/.github/backend-matrix.yml
+++ b/.github/backend-matrix.yml
@@ -781,6 +781,19 @@ include:
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
+  - build-type: 'cublas'
+    cuda-major-version: "12"
+    cuda-minor-version: "8"
+    platforms: 'linux/amd64'
+    tag-latest: 'auto'
+    tag-suffix: '-gpu-nvidia-cuda-12-omnivoice-cpp'
+    runs-on: 'ubuntu-latest'
+    base-image: "ubuntu:24.04"
+    skip-drivers: 'false'
+    backend: "omnivoice-cpp"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
  - build-type: 'cublas'
    cuda-major-version: "12"
    cuda-minor-version: "8"
@@ -1712,6 +1725,19 @@ include:
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
+  - build-type: 'cublas'
+    cuda-major-version: "13"
+    cuda-minor-version: "0"
+    platforms: 'linux/amd64'
+    tag-latest: 'auto'
+    tag-suffix: '-gpu-nvidia-cuda-13-omnivoice-cpp'
+    runs-on: 'ubuntu-latest'
+    base-image: "ubuntu:24.04"
+    skip-drivers: 'false'
+    backend: "omnivoice-cpp"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
  - build-type: 'cublas'
    cuda-major-version: "13"
    cuda-minor-version: "0"
@@ -1751,6 +1777,19 @@ include:
    backend: "qwen3-tts-cpp"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
+  - build-type: 'cublas'
+    cuda-major-version: "13"
+    cuda-minor-version: "0"
+    platforms: 'linux/arm64'
+    skip-drivers: 'false'
+    tag-latest: 'auto'
+    tag-suffix: '-nvidia-l4t-cuda-13-arm64-omnivoice-cpp'
+    base-image: "ubuntu:24.04"
+    ubuntu-version: '2404'
+    runs-on: 'ubuntu-24.04-arm'
+    backend: "omnivoice-cpp"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
  - build-type: 'cublas'
    cuda-major-version: "13"
    cuda-minor-version: "0"
@@ -3483,6 +3522,35 @@ include:
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
+  # omnivoice-cpp
+  - build-type: ''
+    cuda-major-version: ""
+    cuda-minor-version: ""
+    platforms: 'linux/amd64'
+    platform-tag: 'amd64'
+    tag-latest: 'auto'
+    tag-suffix: '-cpu-omnivoice-cpp'
+    runs-on: 'ubuntu-latest'
+    base-image: "ubuntu:24.04"
+    skip-drivers: 'false'
+    backend: "omnivoice-cpp"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
+  - build-type: ''
+    cuda-major-version: ""
+    cuda-minor-version: ""
+    platforms: 'linux/arm64'
+    platform-tag: 'arm64'
+    tag-latest: 'auto'
+    tag-suffix: '-cpu-omnivoice-cpp'
+    runs-on: 'ubuntu-24.04-arm'
+    base-image: "ubuntu:24.04"
+    skip-drivers: 'false'
+    backend: "omnivoice-cpp"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
  - build-type: 'sycl_f32'
    cuda-major-version: ""
    cuda-minor-version: ""
@@ -3496,6 +3564,19 @@ include:
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
+  - build-type: 'sycl_f32'
+    cuda-major-version: ""
+    cuda-minor-version: ""
+    platforms: 'linux/amd64'
+    tag-latest: 'auto'
+    tag-suffix: '-gpu-intel-sycl-f32-omnivoice-cpp'
+    runs-on: 'ubuntu-latest'
+    base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
+    skip-drivers: 'false'
+    backend: "omnivoice-cpp"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
  - build-type: 'sycl_f16'
    cuda-major-version: ""
    cuda-minor-version: ""
@@ -3509,6 +3590,19 @@ include:
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
+  - build-type: 'sycl_f16'
+    cuda-major-version: ""
+    cuda-minor-version: ""
+    platforms: 'linux/amd64'
+    tag-latest: 'auto'
+    tag-suffix: '-gpu-intel-sycl-f16-omnivoice-cpp'
+    runs-on: 'ubuntu-latest'
+    base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
+    skip-drivers: 'false'
+    backend: "omnivoice-cpp"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
  - build-type: 'vulkan'
    cuda-major-version: ""
    cuda-minor-version: ""
@@ -3523,6 +3617,20 @@ include:
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
+  - build-type: 'vulkan'
+    cuda-major-version: ""
+    cuda-minor-version: ""
+    platforms: 'linux/amd64'
+    platform-tag: 'amd64'
+    tag-latest: 'auto'
+    tag-suffix: '-gpu-vulkan-omnivoice-cpp'
+    runs-on: 'ubuntu-latest'
+    base-image: "ubuntu:24.04"
+    skip-drivers: 'false'
+    backend: "omnivoice-cpp"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
  - build-type: 'vulkan'
    cuda-major-version: ""
    cuda-minor-version: ""
@@ -3537,6 +3645,20 @@ include:
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
+  - build-type: 'vulkan'
+    cuda-major-version: ""
+    cuda-minor-version: ""
+    platforms: 'linux/arm64'
+    platform-tag: 'arm64'
+    tag-latest: 'auto'
+    tag-suffix: '-gpu-vulkan-omnivoice-cpp'
+    runs-on: 'ubuntu-24.04-arm'
+    base-image: "ubuntu:24.04"
+    skip-drivers: 'false'
+    backend: "omnivoice-cpp"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
  - build-type: 'cublas'
    cuda-major-version: "12"
    cuda-minor-version: "0"
@@ -3550,6 +3672,19 @@ include:
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2204'
+  - build-type: 'cublas'
+    cuda-major-version: "12"
+    cuda-minor-version: "0"
+    platforms: 'linux/arm64'
+    skip-drivers: 'false'
+    tag-latest: 'auto'
+    tag-suffix: '-nvidia-l4t-arm64-omnivoice-cpp'
+    base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
+    runs-on: 'ubuntu-24.04-arm'
+    backend: "omnivoice-cpp"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2204'
  - build-type: 'hipblas'
    cuda-major-version: ""
    cuda-minor-version: ""
@@ -3563,6 +3698,19 @@ include:
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
+  - build-type: 'hipblas'
+    cuda-major-version: ""
+    cuda-minor-version: ""
+    platforms: 'linux/amd64'
+    tag-latest: 'auto'
+    tag-suffix: '-gpu-rocm-hipblas-omnivoice-cpp'
+    base-image: "rocm/dev-ubuntu-24.04:6.4.4"
+    runs-on: 'ubuntu-latest'
+    skip-drivers: 'false'
+    backend: "omnivoice-cpp"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
  # vibevoice-cpp
  - build-type: ''
    cuda-major-version: ""
@@ -4393,6 +4541,10 @@ includeDarwin:
    tag-suffix: "-metal-darwin-arm64-qwen3-tts-cpp"
    build-type: "metal"
    lang: "go"
+  - backend: "omnivoice-cpp"
+    tag-suffix: "-metal-darwin-arm64-omnivoice-cpp"
+    build-type: "metal"
+    lang: "go"
  - backend: "vibevoice-cpp"
    tag-suffix: "-metal-darwin-arm64-vibevoice-cpp"
    build-type: "metal"
@@ -4475,3 +4627,6 @@ includeDarwin:
  - backend: "speaker-recognition"
    tag-suffix: "-metal-darwin-arm64-speaker-recognition"
    build-type: "mps"
+  - backend: "ds4"
+    tag-suffix: "-metal-darwin-arm64-ds4"
+    lang: "go"
--- a/.github/workflows/bump_deps.yaml
+++ b/.github/workflows/bump_deps.yaml
@@ -70,6 +70,10 @@ jobs:
            variable: "QWEN3TTS_CPP_VERSION"
            branch: "main"
            file: "backend/go/qwen3-tts-cpp/Makefile"
+          - repository: "ServeurpersoCom/omnivoice.cpp"
+            variable: "OMNIVOICE_VERSION"
+            branch: "master"
+            file: "backend/go/omnivoice-cpp/Makefile"
          - repository: "localai-org/vibevoice.cpp"
            variable: "VIBEVOICE_CPP_VERSION"
            branch: "master"
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
 # Disable parallel execution for backend builds
-.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/turboquant backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/crispasr backends/parakeet-cpp backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/rfdetr-cpp backends/insightface backends/speaker-recognition backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/sglang backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/acestep-cpp backends/fish-speech backends/voxtral backends/opus backends/trl backends/llama-cpp-quantization backends/kokoros backends/sam3-cpp backends/qwen3-tts-cpp backends/vibevoice-cpp backends/localvqe backends/tinygrad backends/sherpa-onnx backends/ds4 backends/ds4-darwin backends/liquid-audio
+.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/turboquant backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/crispasr backends/parakeet-cpp backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/rfdetr-cpp backends/insightface backends/speaker-recognition backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/sglang backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/acestep-cpp backends/fish-speech backends/voxtral backends/opus backends/trl backends/llama-cpp-quantization backends/kokoros backends/sam3-cpp backends/qwen3-tts-cpp backends/omnivoice-cpp backends/vibevoice-cpp backends/localvqe backends/tinygrad backends/sherpa-onnx backends/ds4 backends/ds4-darwin backends/liquid-audio

 GOCMD=go
 GOTEST=$(GOCMD) test
@@ -1176,6 +1176,7 @@ BACKEND_PARAKEET_CPP = parakeet-cpp|golang|.|false|true
 BACKEND_VOXTRAL = voxtral|golang|.|false|true
 BACKEND_ACESTEP_CPP = acestep-cpp|golang|.|false|true
 BACKEND_QWEN3_TTS_CPP = qwen3-tts-cpp|golang|.|false|true
+BACKEND_OMNIVOICE_CPP = omnivoice-cpp|golang|.|false|true
 BACKEND_VIBEVOICE_CPP = vibevoice-cpp|golang|.|false|true
 BACKEND_LOCALVQE = localvqe|golang|.|false|true
 BACKEND_OPUS = opus|golang|.|false|true
@@ -1294,6 +1295,7 @@ $(eval $(call generate-docker-build-target,$(BACKEND_WHISPERX)))
 $(eval $(call generate-docker-build-target,$(BACKEND_ACE_STEP)))
 $(eval $(call generate-docker-build-target,$(BACKEND_ACESTEP_CPP)))
 $(eval $(call generate-docker-build-target,$(BACKEND_QWEN3_TTS_CPP)))
+$(eval $(call generate-docker-build-target,$(BACKEND_OMNIVOICE_CPP)))
 $(eval $(call generate-docker-build-target,$(BACKEND_VIBEVOICE_CPP)))
 $(eval $(call generate-docker-build-target,$(BACKEND_LOCALVQE)))
 $(eval $(call generate-docker-build-target,$(BACKEND_MLX)))
@@ -1311,7 +1313,7 @@ $(eval $(call generate-docker-build-target,$(BACKEND_SHERPA_ONNX)))
 docker-save-%: backend-images
 	docker save local-ai-backend:$* -o backend-images/$*.tar

-docker-build-backends: docker-build-llama-cpp docker-build-ik-llama-cpp docker-build-turboquant docker-build-ds4 docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-sglang docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-crispasr docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-liquid-audio docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-fish-speech docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-acestep-cpp docker-build-voxtral docker-build-mlx-distributed docker-build-trl docker-build-llama-cpp-quantization docker-build-tinygrad docker-build-kokoros docker-build-sam3-cpp docker-build-rfdetr-cpp docker-build-qwen3-tts-cpp docker-build-vibevoice-cpp docker-build-localvqe docker-build-insightface docker-build-speaker-recognition docker-build-sherpa-onnx docker-build-cloud-proxy
+docker-build-backends: docker-build-llama-cpp docker-build-ik-llama-cpp docker-build-turboquant docker-build-ds4 docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-sglang docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-crispasr docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-liquid-audio docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-fish-speech docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-acestep-cpp docker-build-voxtral docker-build-mlx-distributed docker-build-trl docker-build-llama-cpp-quantization docker-build-tinygrad docker-build-kokoros docker-build-sam3-cpp docker-build-rfdetr-cpp docker-build-qwen3-tts-cpp docker-build-omnivoice-cpp docker-build-vibevoice-cpp docker-build-localvqe docker-build-insightface docker-build-speaker-recognition docker-build-sherpa-onnx docker-build-cloud-proxy

 ########################################################
 ### Mock Backend for E2E Tests
--- a/README.md
+++ b/README.md
@@ -165,6 +165,10 @@ For more details, see the [Getting Started guide](https://localai.io/basics/gett

 ## Latest News

+- **June 2026**: New [realtime voice assistant demo](https://github.com/localai-org/localai-realtime-demo) (a tiny Go client for the Realtime API with a full talk-back voice loop and tool calling), plus [streaming of the realtime LLM / TTS / transcription pipeline stages](https://github.com/mudler/LocalAI/pull/10176) and [configurable WebRTC ICE candidates](https://github.com/mudler/LocalAI/pull/10231).
+- **June 2026**: Big speech push: the [parakeet.cpp](https://github.com/mudler/parakeet.cpp) ASR engine gains [NeMo-faithful segment timestamps](https://github.com/mudler/LocalAI/pull/10207), a [multilingual streaming Nemotron-3.5 model](https://github.com/mudler/LocalAI/pull/10199), [dynamic batching for concurrent transcription](https://github.com/mudler/LocalAI/pull/10112) and [CUDA graphs](https://github.com/mudler/LocalAI/pull/10273); the new [CrispASR backend](https://github.com/mudler/LocalAI/pull/10099) adds multi-architecture ASR + TTS, and [60 Piper TTS voices across 42 languages](https://github.com/mudler/LocalAI/pull/10296) land in the gallery (plus [per-request TTS instructions and params](https://github.com/mudler/LocalAI/pull/10172)).
+- **June 2026**: New backends and models: [locate-anything.cpp](https://github.com/mudler/LocalAI/pull/10264) for open-vocabulary object detection via ggml, [Ideogram4 image generation](https://github.com/mudler/LocalAI/pull/10201) in stablediffusion-ggml, [llama.cpp video input](https://github.com/mudler/LocalAI/pull/10216), and the [Gemma 4 QAT family with MTP speculative-decoding pairs](https://github.com/mudler/LocalAI/pull/10215). Plus an [interactive CLI chat mode](https://github.com/mudler/LocalAI/pull/10226) and [RAG source citations in agent responses](https://github.com/mudler/LocalAI/pull/10228).
+- **June 2026**: Distributed mode hardening: [prefix-cache-aware routing](https://github.com/mudler/LocalAI/pull/10071), a [production-ready request router with auto-sized embedding/rerank batches](https://github.com/mudler/LocalAI/pull/10104), [ds4 layer-split distributed inference](https://github.com/mudler/LocalAI/pull/10098), [NATS JWT auth + TLS/mTLS](https://github.com/mudler/LocalAI/pull/10159), and [resumable file uploads](https://github.com/mudler/LocalAI/pull/10109).
 - **May 2026**: **LocalAI 4.3.0** - `llama.cpp` [prompt cache on by default](https://github.com/mudler/LocalAI/pull/9925) (repeated system prompts collapse from minutes to seconds), [keyless cosign signing of backend OCI images](https://github.com/mudler/LocalAI/pull/9823), [per-API-key + per-user usage attribution](https://github.com/mudler/LocalAI/pull/9920), Distributed v3 with [per-request replica routing](https://github.com/mudler/LocalAI/pull/9968). [Release notes](https://github.com/mudler/LocalAI/releases/tag/v4.3.0)
 - **May 2026**: **LocalAI 4.2.0** - LocalAI sees and hears: [voice recognition](https://github.com/mudler/LocalAI/pull/9500), [face recognition + antispoofing liveness](https://github.com/mudler/LocalAI/pull/9480), speaker diarization. Plus [drop-in Ollama API](https://github.com/mudler/LocalAI/pull/9284), [video generation](https://github.com/mudler/LocalAI/pull/9420), redesigned UI with i18n + admin-configurable branding, vLLM at feature parity with llama.cpp, and 11 new backends. [Release notes](https://github.com/mudler/LocalAI/releases/tag/v4.2.0)
 - **April 2026**: **LocalAI 4.1.0** - LocalAI becomes a control tower: distributed cluster mode with VRAM-aware smart routing + autoscaling, multi-user platform with OIDC and API keys, per-user quotas with predictive analytics, in-UI fine-tuning with TRL (auto-export to GGUF), on-the-fly quantization backend, visual pipeline editor. [Release notes](https://github.com/mudler/LocalAI/releases/tag/v4.1.0)
@@ -217,7 +221,7 @@ See the full [Backend & Model Compatibility Table](https://localai.io/model-comp
 - [Integrations & community projects](https://localai.io/docs/integrations/)
 - [Installation video walkthrough](https://www.youtube.com/watch?v=cMVNnlqwfw4)
 - [Media & blog posts](https://localai.io/basics/news/#media-blogs-social)
- [Examples](https://github.com/mudler/LocalAI-examples)
+- [Examples](https://github.com/mudler/LocalAI-examples) — including the [realtime voice assistant demo](https://github.com/localai-org/localai-realtime-demo) (Go client for the Realtime API with tool calling)

 ## Team

--- a/backend/go/locate-anything-cpp/Makefile
+++ b/backend/go/locate-anything-cpp/Makefile
@@ -10,7 +10,7 @@ JOBS?=$(shell nproc --ignore=1)
 # this on `master` always picks up the latest C-API surface (incl. the
 # per-detection accessor functions used by golocateanythingcpp.go).
 LOCATEANYTHING_REPO?=https://github.com/mudler/locate-anything.cpp.git
-LOCATEANYTHING_VERSION?=60e450945476d5e97e0754a8c0e71a9ea81690e0
+LOCATEANYTHING_VERSION?=92c1682da792c1e8a5dec91acc2be4b02c742ded

 ifeq ($(NATIVE),false)
 	CMAKE_ARGS+=-DGGML_NATIVE=OFF
--- a/backend/go/omnivoice-cpp/.gitignore
+++ b/backend/go/omnivoice-cpp/.gitignore
@@ -0,0 +1,17 @@
+# Fetched upstream sources
+sources/
+
+# CMake build directories
+build*/
+
+# Compiled shared libraries
+*.so
+
+# Compiled backend binary
+omnivoice-cpp
+
+# Packaging output
+package/
+
+# Downloaded e2e models
+omnivoice-models/
--- a/backend/go/omnivoice-cpp/CMakeLists.txt
+++ b/backend/go/omnivoice-cpp/CMakeLists.txt
@@ -0,0 +1,53 @@
+cmake_minimum_required(VERSION 3.14)
+project(gomnivoicecpp LANGUAGES C CXX)
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
+set(OMNIVOICE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/sources/omnivoice.cpp)
+
+# Override upstream's CMAKE_CUDA_ARCHITECTURES before add_subdirectory.
+if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+    set(CMAKE_CUDA_ARCHITECTURES "75-virtual;80-virtual;86-real;89-real")
+endif()
+
+# Add the upstream project. Its own CMakeLists adds ggml + builds
+# omnivoice-core (STATIC, contains src/omnivoice.cpp i.e. the ov_* impl).
+# EXCLUDE_FROM_ALL keeps its CLI tools/tests from building unless referenced.
+add_subdirectory(${OMNIVOICE_DIR} omnivoice EXCLUDE_FROM_ALL)
+
+# Upstream generates version.h into its own CMAKE_CURRENT_BINARY_DIR and adds
+# the top-level ${CMAKE_BINARY_DIR} to omnivoice-core's include path. When the
+# project is nested under add_subdirectory those two directories differ
+# (<build>/omnivoice vs <build>), so omnivoice.cpp cannot find version.h. Point
+# omnivoice-core at the subproject binary dir where version.h is actually
+# generated. (Fix lives here, never in the fetched upstream checkout.)
+target_include_directories(omnivoice-core PRIVATE ${CMAKE_BINARY_DIR}/omnivoice)
+
+add_library(gomnivoicecpp MODULE cpp/gomnivoicecpp.cpp)
+target_link_libraries(gomnivoicecpp PRIVATE omnivoice-core)
+
+target_include_directories(gomnivoicecpp PRIVATE ${OMNIVOICE_DIR}/src)
+target_include_directories(gomnivoicecpp SYSTEM PRIVATE ${OMNIVOICE_DIR}/ggml/include)
+
+# Link GPU backends if the upstream ggml created them.
+foreach(backend blas cuda metal vulkan sycl)
+    if(TARGET ggml-${backend})
+        target_link_libraries(gomnivoicecpp PRIVATE ggml-${backend})
+        if(backend STREQUAL "cuda")
+            find_package(CUDAToolkit QUIET)
+            if(CUDAToolkit_FOUND)
+                target_link_libraries(gomnivoicecpp PRIVATE CUDA::cudart)
+            endif()
+        endif()
+    endif()
+endforeach()
+
+if(MSVC)
+    target_compile_options(gomnivoicecpp PRIVATE /W4 /wd4100 /wd4505)
+else()
+    target_compile_options(gomnivoicecpp PRIVATE -Wall -Wextra
+                          -Wno-unused-parameter -Wno-unused-function)
+endif()
+
+set_property(TARGET gomnivoicecpp PROPERTY CXX_STANDARD 17)
+set_target_properties(gomnivoicecpp PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
--- a/backend/go/omnivoice-cpp/Makefile
+++ b/backend/go/omnivoice-cpp/Makefile
@@ -0,0 +1,122 @@
+CMAKE_ARGS?=
+BUILD_TYPE?=
+NATIVE?=false
+
+GOCMD?=go
+GO_TAGS?=
+JOBS?=$(shell nproc --ignore=1)
+
+# omnivoice.cpp version
+OMNIVOICE_REPO?=https://github.com/ServeurpersoCom/omnivoice.cpp
+OMNIVOICE_VERSION?=2603355a5dfacae5cfc33531d5d0933221843509
+SO_TARGET?=libgomnivoicecpp.so
+
+CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
+
+ifeq ($(NATIVE),false)
+	CMAKE_ARGS+=-DGGML_NATIVE=OFF
+endif
+
+ifeq ($(BUILD_TYPE),cublas)
+	CMAKE_ARGS+=-DGGML_CUDA=ON
+else ifeq ($(BUILD_TYPE),openblas)
+	CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
+else ifeq ($(BUILD_TYPE),clblas)
+	CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
+else ifeq ($(BUILD_TYPE),hipblas)
+	CMAKE_ARGS+=-DGGML_HIPBLAS=ON
+else ifeq ($(BUILD_TYPE),vulkan)
+	CMAKE_ARGS+=-DGGML_VULKAN=ON
+else ifeq ($(OS),Darwin)
+	ifneq ($(BUILD_TYPE),metal)
+		CMAKE_ARGS+=-DGGML_METAL=OFF
+	else
+		CMAKE_ARGS+=-DGGML_METAL=ON
+		CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
+	endif
+endif
+
+ifeq ($(BUILD_TYPE),sycl_f16)
+	CMAKE_ARGS+=-DGGML_SYCL=ON \
+		-DCMAKE_C_COMPILER=icx \
+		-DCMAKE_CXX_COMPILER=icpx \
+		-DGGML_SYCL_F16=ON
+endif
+
+ifeq ($(BUILD_TYPE),sycl_f32)
+	CMAKE_ARGS+=-DGGML_SYCL=ON \
+		-DCMAKE_C_COMPILER=icx \
+		-DCMAKE_CXX_COMPILER=icpx
+endif
+
+sources/omnivoice.cpp:
+	mkdir -p sources/omnivoice.cpp
+	cd sources/omnivoice.cpp && \
+	git init && \
+	git remote add origin $(OMNIVOICE_REPO) && \
+	git fetch origin && \
+	git checkout $(OMNIVOICE_VERSION) && \
+	git submodule update --init --recursive --depth 1 --single-branch
+
+# Detect OS
+UNAME_S := $(shell uname -s)
+
+# Only build CPU variants on Linux
+ifeq ($(UNAME_S),Linux)
+	VARIANT_TARGETS = libgomnivoicecpp-avx.so libgomnivoicecpp-avx2.so libgomnivoicecpp-avx512.so libgomnivoicecpp-fallback.so
+else
+	VARIANT_TARGETS = libgomnivoicecpp-fallback.so
+endif
+
+omnivoice-cpp: main.go gomnivoicecpp.go $(VARIANT_TARGETS)
+	CGO_ENABLED=0 $(GOCMD) build -tags "$(GO_TAGS)" -o omnivoice-cpp ./
+
+package: omnivoice-cpp
+	bash package.sh
+
+build: package
+
+clean: purge
+	rm -rf libgomnivoicecpp*.so package sources/omnivoice.cpp omnivoice-cpp
+
+purge:
+	rm -rf build*
+
+.NOTPARALLEL:
+
+ifeq ($(UNAME_S),Linux)
+libgomnivoicecpp-avx.so: sources/omnivoice.cpp
+	$(info ${GREEN}I omnivoice-cpp build info:avx${RESET})
+	SO_TARGET=libgomnivoicecpp-avx.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) libgomnivoicecpp-custom
+	rm -rf build-libgomnivoicecpp-avx.so
+
+libgomnivoicecpp-avx2.so: sources/omnivoice.cpp
+	$(info ${GREEN}I omnivoice-cpp build info:avx2${RESET})
+	SO_TARGET=libgomnivoicecpp-avx2.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on -DGGML_BMI2=on" $(MAKE) libgomnivoicecpp-custom
+	rm -rf build-libgomnivoicecpp-avx2.so
+
+libgomnivoicecpp-avx512.so: sources/omnivoice.cpp
+	$(info ${GREEN}I omnivoice-cpp build info:avx512${RESET})
+	SO_TARGET=libgomnivoicecpp-avx512.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on -DGGML_BMI2=on" $(MAKE) libgomnivoicecpp-custom
+	rm -rf build-libgomnivoicecpp-avx512.so
+endif
+
+libgomnivoicecpp-fallback.so: sources/omnivoice.cpp
+	$(info ${GREEN}I omnivoice-cpp build info:fallback${RESET})
+	SO_TARGET=libgomnivoicecpp-fallback.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) libgomnivoicecpp-custom
+	rm -rf build-libgomnivoicecpp-fallback.so
+
+libgomnivoicecpp-custom: CMakeLists.txt cpp/gomnivoicecpp.cpp cpp/gomnivoicecpp.h
+	mkdir -p build-$(SO_TARGET) && \
+	cd build-$(SO_TARGET) && \
+	cmake .. $(CMAKE_ARGS) && \
+	cmake --build . --config Release -j$(JOBS) --target gomnivoicecpp && \
+	cd .. && \
+	mv build-$(SO_TARGET)/libgomnivoicecpp.so ./$(SO_TARGET)
+
+test: omnivoice-cpp
+	@echo "Running omnivoice-cpp tests..."
+	bash test.sh
+	@echo "omnivoice-cpp tests completed."
+
+all: omnivoice-cpp package
--- a/backend/go/omnivoice-cpp/audio.go
+++ b/backend/go/omnivoice-cpp/audio.go
@@ -0,0 +1,129 @@
+package main
+
+import (
+	"bytes"
+	"encoding/binary"
+	"fmt"
+	"os"
+	"runtime"
+
+	"github.com/go-audio/audio"
+	"github.com/go-audio/wav"
+)
+
+const omnivoiceSampleRate = 24000
+
+// wavHeader24k builds the 44-byte RIFF/WAVE header for a streaming 24 kHz mono
+// 16-bit PCM stream. Both size fields hold the placeholder 0xFFFFFFFF because the
+// total length is unknown up front. Emitted as the first chunk of TTSStream so the
+// HTTP layer receives a self-describing WAV (the gRPC TTSStream path never sets
+// Message, so the backend owns the header - see core/backend/tts.go:ModelTTSStream).
+func wavHeader24k() []byte {
+	var buf bytes.Buffer
+	w := func(v any) { _ = binary.Write(&buf, binary.LittleEndian, v) } // error ignored: only fixed-size integers are written, into an in-memory buffer
+	buf.WriteString("RIFF")
+	w(uint32(0xFFFFFFFF)) // RIFF chunk size: placeholder, stream length unknown
+	buf.WriteString("WAVE")
+	buf.WriteString("fmt ")
+	w(uint32(16))                      // Subchunk1Size
+	w(uint16(1))                       // PCM
+	w(uint16(1))                       // mono
+	w(uint32(omnivoiceSampleRate))     // sample rate
+	w(uint32(omnivoiceSampleRate * 2)) // byte rate = SR * blockAlign
+	w(uint16(2))                       // block align (16-bit mono)
+	w(uint16(16))                      // bits per sample
+	buf.WriteString("data")
+	w(uint32(0xFFFFFFFF)) // data chunk size: placeholder, stream length unknown
+	return buf.Bytes()
+}
+
+// floatToPCM16LE clamps every sample to [-1,1], scales it by 32767 and returns
+// the result as little-endian signed 16-bit PCM (two bytes per input sample).
+func floatToPCM16LE(samples []float32) []byte {
+	out := make([]byte, len(samples)*2) // 2 bytes per 16-bit sample
+	for i, s := range samples {
+		if s > 1 {
+			s = 1
+		} else if s < -1 {
+			s = -1
+		}
+		v := int16(s * 32767) // float-to-int conversion truncates toward zero
+		out[i*2] = byte(v) // low byte first (little-endian)
+		out[i*2+1] = byte(v >> 8) // high byte
+	}
+	return out
+}
+
+// writeWAV24k creates (or truncates) dst and writes samples to it as a finalized 24 kHz mono 16-bit WAV.
+func writeWAV24k(dst string, samples []float32) error {
+	f, err := os.Create(dst)
+	if err != nil {
+		return fmt.Errorf("omnivoice: create %q: %w", dst, err)
+	}
+	enc := wav.NewEncoder(f, omnivoiceSampleRate, 16, 1, 1) // sample rate, 16-bit depth, 1 channel, audio format 1 (PCM)
+	ints := make([]int, len(samples)) // the encoder consumes an int buffer, one entry per sample
+	for i, s := range samples {
+		if s > 1 {
+			s = 1
+		} else if s < -1 {
+			s = -1
+		}
+		ints[i] = int(s * 32767) // same clamp-and-scale as floatToPCM16LE
+	}
+	b := &audio.IntBuffer{
+		Format:         &audio.Format{NumChannels: 1, SampleRate: omnivoiceSampleRate},
+		Data:           ints,
+		SourceBitDepth: 16,
+	}
+	if err := enc.Write(b); err != nil {
+		_ = enc.Close() // best-effort cleanup; the encode error is the one reported
+		_ = f.Close()
+		return fmt.Errorf("omnivoice: encode WAV: %w", err)
+	}
+	if err := enc.Close(); err != nil { // closing the encoder finalizes the WAV
+		_ = f.Close() // best-effort cleanup; the finalize error is the one reported
+		return fmt.Errorf("omnivoice: finalize WAV: %w", err)
+	}
+	return f.Close() // a close failure is surfaced to the caller unwrapped
+}
+
+// readWAVAsFloat decodes the WAV file at path into a mono float32 slice (nominally in
+// [-1,1]) for use as reference audio: channels are averaged and divided by 2^(bitDepth-1).
+// The sample rate is NOT converted; OmniVoice expects 24 kHz, so callers should supply 24 kHz clips.
+func readWAVAsFloat(path string) ([]float32, error) {
+	f, err := os.Open(path)
+	if err != nil {
+		return nil, fmt.Errorf("omnivoice: open ref %q: %w", path, err)
+	}
+	defer func() { _ = f.Close() }() // read-only handle: a close error is deliberately ignored
+
+	dec := wav.NewDecoder(f)
+	buf, err := dec.FullPCMBuffer() // loads the whole clip into memory
+	if err != nil {
+		return nil, fmt.Errorf("omnivoice: decode ref %q: %w", path, err)
+	}
+	ch := int(buf.Format.NumChannels)
+	if ch < 1 { // guard: a zero channel count would divide by zero below
+		ch = 1
+	}
+	bitDepth := int(buf.SourceBitDepth)
+	if bitDepth == 0 { // unknown depth: fall back to 16-bit
+		bitDepth = 16
+	}
+	scale := float32(int64(1) << uint(bitDepth-1)) // 2^(bitDepth-1), e.g. 32768 for 16-bit
+	n := len(buf.Data) / ch // number of frames; a trailing partial frame is dropped
+	out := make([]float32, n)
+	for i := 0; i < n; i++ {
+		// Downmix to mono by averaging channels.
+		var acc int
+		for c := 0; c < ch; c++ {
+			acc += buf.Data[i*ch+c] // samples are interleaved: frame i, channel c
+		}
+		out[i] = float32(acc) / float32(ch) / scale
+	}
+	return out, nil
+}
+
+// runtimeKeepAlive forwards v to runtime.KeepAlive. It is meant to stop the GC from
+// reclaiming the reference-audio slice while its backing pointer is in use across the C call.
+func runtimeKeepAlive(v any) { runtime.KeepAlive(v) }
--- a/backend/go/omnivoice-cpp/cpp/gomnivoicecpp.cpp
+++ b/backend/go/omnivoice-cpp/cpp/gomnivoicecpp.cpp
@@ -0,0 +1,166 @@
+#include "gomnivoicecpp.h"
+#include "ggml-backend.h"
+#include "omnivoice.h"
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+static ov_context *g_ctx = nullptr;
+
+// ggml log sink: prefixes each message with its level and writes it to stderr,
+// flushing immediately. Null messages are ignored; levels other than
+// DEBUG/INFO/WARN/ERROR are tagged "?????".
+static void ggml_log_cb(enum ggml_log_level level, const char *log,
+                        void * /*data*/) {
+    if (log == nullptr) {
+        return;
+    }
+
+    const char *tag;
+    switch (level) {
+    case GGML_LOG_LEVEL_DEBUG:
+        tag = "DEBUG";
+        break;
+    case GGML_LOG_LEVEL_INFO:
+        tag = "INFO";
+        break;
+    case GGML_LOG_LEVEL_WARN:
+        tag = "WARN";
+        break;
+    case GGML_LOG_LEVEL_ERROR:
+        tag = "ERROR";
+        break;
+    default:
+        tag = "?????";
+        break;
+    }
+
+    fprintf(stderr, "[%-5s] %s", tag, log);
+    fflush(stderr);
+}
+
+// Load the OmniVoice LM (model_path) and codec (codec_path) into the global
+// context g_ctx.
+//
+// use_fa / clamp_fp16 are treated as booleans (non-zero == true) and copied
+// into ov_init_params.
+//
+// Returns 0 on success, 1 if model_path is null/empty, 2 if codec_path is
+// null/empty, 3 if ov_init fails. Calling this again with valid paths releases
+// the previously loaded context first, so a reload does not leak it; if the
+// new load then fails, no model is loaded.
+int omni_load(const char *model_path, const char *codec_path, int use_fa,
+              int clamp_fp16) {
+    ggml_log_set(ggml_log_cb, nullptr);
+    ggml_backend_load_all();
+
+    if (!model_path || model_path[0] == '\0') {
+        fprintf(stderr, "[omnivoice-cpp] ERROR: model_path is required\n");
+        return 1;
+    }
+    if (!codec_path || codec_path[0] == '\0') {
+        fprintf(stderr, "[omnivoice-cpp] ERROR: codec_path is required\n");
+        return 2;
+    }
+
+    // Overwriting g_ctx below without this would leak the previous context.
+    if (g_ctx) {
+        ov_free(g_ctx);
+        g_ctx = nullptr;
+    }
+
+    ov_init_params p;
+    ov_init_default_params(&p);
+    p.model_path = model_path;
+    p.codec_path = codec_path;
+    p.use_fa = use_fa != 0;
+    p.clamp_fp16 = clamp_fp16 != 0;
+
+    fprintf(stderr, "[omnivoice-cpp] Loading model=%s codec=%s\n", model_path,
+            codec_path);
+
+    g_ctx = ov_init(&p);
+    if (!g_ctx) {
+        fprintf(stderr, "[omnivoice-cpp] FATAL: ov_init failed: %s\n",
+                ov_last_error());
+        return 3;
+    }
+    fprintf(stderr, "[omnivoice-cpp] Model loaded (%s)\n", ov_version());
+    return 0;
+}
+
+// Populate *tp from the flat wrapper arguments, starting from the library
+// defaults. Null text/lang become "". instruct is set only when non-empty.
+// The reference fields (audio, sample count, optional ref_text, denoise) are
+// set only when ref_samples is non-null and ref_n > 0. A negative seed leaves
+// the default seed in place.
+static void fill_params(ov_tts_params *tp, const char *text, const char *lang,
+                        const char *instruct, const float *ref_samples,
+                        int ref_n, const char *ref_text, long long seed,
+                        int denoise) {
+    const auto has_text = [](const char *s) {
+        return s != nullptr && s[0] != '\0';
+    };
+
+    ov_tts_default_params(tp);
+    tp->text = (text != nullptr) ? text : "";
+    tp->lang = (lang != nullptr) ? lang : "";
+
+    if (has_text(instruct)) {
+        tp->instruct = instruct;
+    }
+
+    const bool has_reference = (ref_samples != nullptr) && (ref_n > 0);
+    if (has_reference) {
+        tp->ref_audio_24k = ref_samples;
+        tp->ref_n_samples = ref_n;
+        if (has_text(ref_text)) {
+            tp->ref_text = ref_text;
+        }
+        tp->denoise = (denoise != 0);
+    }
+
+    if (seed >= 0) {
+        tp->mg_seed = (uint64_t)seed;
+    }
+}
+
+// One-shot synthesis. Returns a malloc'd copy of the generated float PCM and
+// writes its sample count to *out_n (when out_n is non-null); the caller must
+// release the buffer with omni_pcm_free. Returns nullptr, with *out_n left at
+// 0, when no model is loaded, text is null/empty, synthesis fails or yields no
+// samples, or the copy buffer cannot be allocated.
+float *omni_tts(const char *text, const char *lang, const char *instruct,
+                const float *ref_samples, int ref_n, const char *ref_text,
+                long long seed, int denoise, int *out_n) {
+    // Pre-clear the count so every failure path below reports 0 samples.
+    if (out_n)
+        *out_n = 0;
+    if (!g_ctx) {
+        fprintf(stderr, "[omnivoice-cpp] ERROR: model not loaded\n");
+        return nullptr;
+    }
+    if (!text || text[0] == '\0') {
+        fprintf(stderr, "[omnivoice-cpp] ERROR: text is required\n");
+        return nullptr; // *out_n was already cleared to 0 above
+    }
+    ov_tts_params tp;
+    fill_params(&tp, text, lang, instruct, ref_samples, ref_n, ref_text, seed,
+                denoise);
+
+    ov_audio out = {0};
+    enum ov_status rc = ov_synthesize(g_ctx, &tp, &out);
+    // An OK status with an empty result is treated as a failure too.
+    if (rc != OV_STATUS_OK || out.n_samples <= 0 || !out.samples) {
+        fprintf(stderr, "[omnivoice-cpp] ERROR: synthesize failed (rc=%d): %s\n",
+                (int)rc, ov_last_error());
+        ov_audio_free(&out);
+        return nullptr;
+    }
+
+    // Copy into a plain malloc buffer the Go side can free symmetrically via
+    // omni_pcm_free; then release the ov_audio-owned buffer.
+    size_t bytes = (size_t)out.n_samples * sizeof(float);
+    float *buf = (float *)malloc(bytes);
+    if (!buf) {
+        fprintf(stderr, "[omnivoice-cpp] ERROR: malloc(%zu) failed\n", bytes);
+        ov_audio_free(&out);
+        return nullptr;
+    }
+    memcpy(buf, out.samples, bytes);
+    // Publish the count before freeing `out`, which owns n_samples.
+    if (out_n)
+        *out_n = out.n_samples;
+    ov_audio_free(&out);
+    return buf;
+}
+
+// Pairs the caller's int-returning chunk callback with its user_data so both
+// can travel through the single user_data pointer of the library callback.
+struct omni_stream_bridge {
+    omni_pcm_chunk_cb cb;
+    void *user_data;
+};
+
+// Library-facing chunk callback: forwards to the wrapped omni_pcm_chunk_cb and
+// converts its int result to bool (any non-zero value continues).
+static bool omni_stream_trampoline(const float *samples, int n_samples,
+                                   void *user_data) {
+    const omni_stream_bridge *bridge =
+        static_cast<const omni_stream_bridge *>(user_data);
+    return bridge->cb(samples, n_samples, bridge->user_data) != 0;
+}
+
+// Streaming synthesis: cb is invoked with each PCM chunk while ov_synthesize
+// runs. Reference/instruct/seed/denoise handling is shared with omni_tts via
+// fill_params.
+//
+// Returns 0 on success (a cancelled synthesis also counts as success), 1 if no
+// model is loaded, 2 if cb is null, 4 if text is null/empty, 3 if synthesis
+// fails.
+int omni_tts_stream(const char *text, const char *lang, const char *instruct,
+                    const float *ref_samples, int ref_n, const char *ref_text,
+                    long long seed, int denoise, omni_pcm_chunk_cb cb,
+                    void *user_data) {
+    if (!g_ctx) {
+        fprintf(stderr, "[omnivoice-cpp] ERROR: model not loaded\n");
+        return 1;
+    }
+    if (!cb) {
+        fprintf(stderr, "[omnivoice-cpp] ERROR: stream callback is null\n");
+        return 2;
+    }
+    if (!text || text[0] == '\0') {
+        fprintf(stderr, "[omnivoice-cpp] ERROR: text is required\n");
+        return 4;
+    }
+    ov_tts_params tp;
+    fill_params(&tp, text, lang, instruct, ref_samples, ref_n, ref_text, seed,
+                denoise);
+
+    // Calling an int-returning function through a bool-returning pointer type
+    // is undefined behaviour and would misread results such as 256 as "stop".
+    // Route through a correctly typed trampoline instead of casting cb. The
+    // bridge lives on this stack frame, which outlives the synchronous
+    // ov_synthesize call below.
+    omni_stream_bridge bridge = {cb, user_data};
+    tp.on_chunk = omni_stream_trampoline;
+    tp.on_chunk_user_data = &bridge;
+
+    ov_audio out = {0}; // stays empty in streaming mode
+    enum ov_status rc = ov_synthesize(g_ctx, &tp, &out);
+    ov_audio_free(&out);
+    if (rc != OV_STATUS_OK && rc != OV_STATUS_CANCELLED) {
+        fprintf(stderr, "[omnivoice-cpp] ERROR: stream synth failed (rc=%d): %s\n",
+                (int)rc, ov_last_error());
+        return 3;
+    }
+    return 0;
+}
+
+// Release a PCM buffer returned by omni_tts (allocated there with malloc).
+void omni_pcm_free(float *p) {
+    free(p);
+}
+
+// Free the global OmniVoice context, if any, and reset it so later calls see
+// "model not loaded". Does nothing when no context is loaded.
+void omni_unload(void) {
+    if (g_ctx == nullptr) {
+        return;
+    }
+    ov_free(g_ctx);
+    g_ctx = nullptr;
+}
--- a/backend/go/omnivoice-cpp/cpp/gomnivoicecpp.h
+++ b/backend/go/omnivoice-cpp/cpp/gomnivoicecpp.h
@@ -0,0 +1,38 @@
+#pragma once
+
+#include <cstdint>
+
+// C ABI exported by the wrapper library; the Go side binds these symbols by
+// name.
+extern "C" {
+
+// Streaming PCM chunk callback. samples is mono float PCM at 24 kHz, valid
+// only for the duration of the call. Return non-zero to continue, 0 to abort.
+typedef int (*omni_pcm_chunk_cb)(const float *samples, int n_samples,
+                                 void *user_data);
+
+// Load the LM (model_path) + codec (codec_path) GGUFs. use_fa / clamp_fp16
+// map to ov_init_params. Returns 0 on success, non-zero on failure:
+// 1 = model_path missing, 2 = codec_path missing, 3 = ov_init failed.
+int omni_load(const char *model_path, const char *codec_path, int use_fa,
+              int clamp_fp16);
+
+// Synthesize to a malloc'd float PCM buffer (caller frees via omni_pcm_free).
+// ref_samples != null && ref_n > 0 => voice cloning (ref_text optional).
+// instruct != null && non-empty => voice design. seed < 0 keeps the default
+// MaskGIT seed. denoise toggles the <|denoise|> marker (only with a reference).
+// Writes the sample count to *out_n. Returns NULL on failure (out_n set to 0).
+float *omni_tts(const char *text, const char *lang, const char *instruct,
+                const float *ref_samples, int ref_n, const char *ref_text,
+                long long seed, int denoise, int *out_n);
+
+// Streaming synthesis: cb is invoked per PCM chunk as audio is produced.
+// Same reference/design/seed semantics as omni_tts. Returns 0 on success
+// (including a cancelled synthesis), otherwise: 1 = model not loaded,
+// 2 = cb is null, 3 = synthesis failed, 4 = text missing.
+int omni_tts_stream(const char *text, const char *lang, const char *instruct,
+                    const float *ref_samples, int ref_n, const char *ref_text,
+                    long long seed, int denoise, omni_pcm_chunk_cb cb,
+                    void *user_data);
+
+// Free a buffer returned by omni_tts.
+void omni_pcm_free(float *p);
+
+// Release the OmniVoice context.
+void omni_unload(void);
+}
--- a/backend/go/omnivoice-cpp/e2e_test.go
+++ b/backend/go/omnivoice-cpp/e2e_test.go
@@ -0,0 +1,74 @@
+package main
+
+import (
+	"os"
+	"strings"
+
+	"github.com/ebitengine/purego"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// ttsReq assembles a TTSRequest from the fields the e2e specs vary.
+func ttsReq(text, voice string, lang *string, dst string) *pb.TTSRequest {
+	req := &pb.TTSRequest{}
+	req.Text = text
+	req.Voice = voice
+	req.Language = lang
+	req.Dst = dst
+	return req
+}
+
+// End-to-end specs against a real model. They are skipped unless both
+// OMNIVOICE_MODEL and OMNIVOICE_CODEC are set. The shared library named by
+// OMNIVOICE_LIBRARY (default ./libgomnivoicecpp-fallback.so) is opened and the
+// model loaded once, on the first spec that runs.
+var _ = Describe("OmniVoice e2e", Label("e2e"), func() {
+	// loaded guards the one-time dlopen + model load across specs.
+	var loaded bool
+
+	BeforeEach(func() {
+		modelPath := os.Getenv("OMNIVOICE_MODEL")
+		codecPath := os.Getenv("OMNIVOICE_CODEC")
+		if modelPath == "" || codecPath == "" {
+			Skip("OMNIVOICE_MODEL / OMNIVOICE_CODEC not set; skipping e2e")
+		}
+		if !loaded {
+			lib := os.Getenv("OMNIVOICE_LIBRARY")
+			if lib == "" {
+				lib = "./libgomnivoicecpp-fallback.so"
+			}
+			h, err := purego.Dlopen(lib, purego.RTLD_NOW|purego.RTLD_GLOBAL)
+			Expect(err).ToNot(HaveOccurred())
+			// Bind the package-level function variables to the C symbols.
+			purego.RegisterLibFunc(&CppLoad, h, "omni_load")
+			purego.RegisterLibFunc(&CppTTS, h, "omni_tts")
+			purego.RegisterLibFunc(&CppTTSStream, h, "omni_tts_stream")
+			purego.RegisterLibFunc(&CppPCMFree, h, "omni_pcm_free")
+			purego.RegisterLibFunc(&CppUnload, h, "omni_unload")
+			Expect(CppLoad(modelPath, codecPath, 0, 0)).To(Equal(0))
+			loaded = true
+		}
+	})
+
+	It("synthesizes a WAV file via TTS", func() {
+		b := &OmnivoiceCpp{opts: loadOptions{seed: 42, denoise: true}}
+		dst := GinkgoT().TempDir() + "/out.wav"
+		lang := "en"
+		err := b.TTS(ttsReq("Hello world.", "", &lang, dst))
+		Expect(err).ToNot(HaveOccurred())
+		fi, err := os.Stat(dst)
+		Expect(err).ToNot(HaveOccurred())
+		// Larger than 44 bytes, the header size asserted in the wavHeader24k spec.
+		Expect(fi.Size()).To(BeNumerically(">", int64(44)))
+	})
+
+	It("streams audio chunks via TTSStream", func() {
+		b := &OmnivoiceCpp{opts: loadOptions{seed: 42, denoise: true}}
+		results := make(chan []byte, 1024)
+		lang := "en"
+		done := make(chan error, 1)
+		// TTSStream closes results when it returns, which ends the range below.
+		go func() { done <- b.TTSStream(ttsReq("Hello there, streaming test.", "", &lang, ""), results) }()
+
+		var chunks int
+		var first []byte
+		for c := range results {
+			if chunks == 0 {
+				first = c
+			}
+			chunks++
+		}
+		Expect(<-done).ToNot(HaveOccurred())
+		// At least the WAV header chunk plus one PCM chunk.
+		Expect(chunks).To(BeNumerically(">=", 2))
+		Expect(string(first[0:4])).To(Equal("RIFF"))
+		Expect(strings.HasPrefix(string(first[8:12]), "WAVE")).To(BeTrue())
+	})
+})
--- a/backend/go/omnivoice-cpp/gomnivoicecpp.go
+++ b/backend/go/omnivoice-cpp/gomnivoicecpp.go
@@ -0,0 +1,246 @@
+package main
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+	"sync"
+	"unsafe"
+
+	"github.com/ebitengine/purego"
+	"github.com/mudler/LocalAI/pkg/grpc/base"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+)
+
+// Function variables bound to the C wrapper's exported symbols with
+// purego.RegisterLibFunc at startup. They are nil until registered.
+var (
+	// omni_load(model_path, codec_path, use_fa, clamp_fp16) int
+	CppLoad func(modelPath, codecPath string, useFA, clampFP16 int) int
+	// omni_tts(text, lang, instruct, ref_samples, ref_n, ref_text, seed, denoise, out_n) -> float* (uintptr)
+	CppTTS func(text, lang, instruct string, refSamples unsafe.Pointer, refN int,
+		refText string, seed int64, denoise int, outN unsafe.Pointer) uintptr
+	// omni_tts_stream(text, lang, instruct, ref_samples, ref_n, ref_text, seed, denoise, cb, user) int
+	CppTTSStream func(text, lang, instruct string, refSamples unsafe.Pointer, refN int,
+		refText string, seed int64, denoise int, cb uintptr, user uintptr) int
+	// omni_pcm_free(p): releases a buffer returned by omni_tts.
+	CppPCMFree func(ptr uintptr)
+	// omni_unload(): releases the loaded OmniVoice context.
+	CppUnload  func()
+)
+
+// OmnivoiceCpp is the gRPC backend implementation handed to grpc.StartServer.
+// It embeds base.SingleThread and forwards Load/TTS/TTSStream to the C wrapper
+// through the Cpp* function variables.
+type OmnivoiceCpp struct {
+	base.SingleThread
+	// opts holds the options parsed from ModelOptions.Options during Load.
+	opts loadOptions
+	// audioPath is the model-config reference voice (tts.audio_path), used as
+	// the default voice-cloning reference when a request does not set Voice.
+	audioPath string
+}
+
+// Load resolves the model, codec and default reference-audio paths from opts,
+// parses the backend options, and loads the model through CppLoad.
+//
+// The model is opts.ModelFile (falling back to opts.ModelPath), joined onto
+// opts.ModelPath when relative. The codec comes from the tokenizer/codec
+// option (relative to the model directory) or, when unset, from
+// discoverTokenizer on the model directory. An error is returned when no codec
+// can be found or when CppLoad reports a non-zero status.
+func (o *OmnivoiceCpp) Load(opts *pb.ModelOptions) error {
+	modelFile := opts.ModelFile
+	if modelFile == "" {
+		modelFile = opts.ModelPath
+	}
+	if opts.ModelPath != "" && !filepath.IsAbs(modelFile) {
+		modelFile = filepath.Join(opts.ModelPath, modelFile)
+	}
+	modelDir := filepath.Dir(modelFile)
+
+	o.opts = parseOptions(opts.Options)
+
+	// Codec/tokenizer GGUF: an explicit option wins (anchored to the model
+	// directory when relative); otherwise look for a *tokenizer*.gguf sibling.
+	codecFile := o.opts.codecPath
+	switch {
+	case codecFile == "":
+		codecFile = discoverTokenizer(modelDir)
+	case !filepath.IsAbs(codecFile):
+		codecFile = filepath.Join(modelDir, codecFile)
+	}
+	if codecFile == "" {
+		return fmt.Errorf("omnivoice: no codec/tokenizer GGUF found; set option 'tokenizer:<file>'")
+	}
+	o.opts.codecPath = codecFile
+
+	// tts.audio_path (ModelOptions.AudioPath) is the default cloning reference
+	// for requests without a Voice; relative paths resolve like the codec.
+	o.audioPath = opts.AudioPath
+	if o.audioPath != "" && !filepath.IsAbs(o.audioPath) {
+		o.audioPath = filepath.Join(modelDir, o.audioPath)
+	}
+
+	flashAttn, clampFP16 := boolToInt(o.opts.useFA), boolToInt(o.opts.clampFP16)
+
+	fmt.Fprintf(os.Stderr, "[omnivoice-cpp] Load model=%s codec=%s use_fa=%d clamp_fp16=%d\n",
+		modelFile, codecFile, flashAttn, clampFP16)
+
+	rc := CppLoad(modelFile, codecFile, flashAttn, clampFP16)
+	if rc != 0 {
+		return fmt.Errorf("omnivoice: failed to load model (rc=%d)", rc)
+	}
+	return nil
+}
+
+// discoverTokenizer returns the path of the first entry in dir whose name
+// (case-insensitively) contains "tokenizer" and ends in ".gguf", or "" when
+// dir cannot be read or nothing matches. Directories are skipped so that a
+// folder that happens to match the pattern is never handed to the loader as a
+// codec file. os.ReadDir returns entries sorted by filename, so "first" is
+// deterministic.
+func discoverTokenizer(dir string) string {
+	entries, err := os.ReadDir(dir)
+	if err != nil {
+		// Best effort: an unreadable directory simply means "nothing found".
+		return ""
+	}
+	for _, e := range entries {
+		if e.IsDir() {
+			continue
+		}
+		name := strings.ToLower(e.Name())
+		if strings.Contains(name, "tokenizer") && strings.HasSuffix(name, ".gguf") {
+			return filepath.Join(dir, e.Name())
+		}
+	}
+	return ""
+}
+
+// boolToInt maps true to 1 and false to 0, the integer flag form the C
+// wrapper takes.
+func boolToInt(b bool) int {
+	flag := 0
+	if b {
+		flag = 1
+	}
+	return flag
+}
+
+// refAudio decodes the voice-cloning reference when voice names a file that
+// can be stat'ed. An empty voice, or one that is not a reachable path, yields
+// (nil, nil): no cloning, and voice design via Instructions still applies.
+// Decode errors from readWAVAsFloat are returned as-is.
+func (o *OmnivoiceCpp) refAudio(voice string) ([]float32, error) {
+	path := strings.TrimSpace(voice)
+	if path != "" {
+		if _, statErr := os.Stat(path); statErr == nil {
+			return readWAVAsFloat(path)
+		}
+	}
+	return nil, nil
+}
+
+// refAudioFor picks the cloning reference for req: a non-blank req.Voice wins,
+// otherwise the model-config audio path is used. A nil slice means no cloning
+// (voice design via Instructions still applies).
+func (o *OmnivoiceCpp) refAudioFor(req *pb.TTSRequest) ([]float32, error) {
+	if requested := strings.TrimSpace(req.Voice); requested != "" {
+		return o.refAudio(requested)
+	}
+	return o.refAudio(o.audioPath)
+}
+
+// reqParam returns req.Params[key], or "" when the key is absent or Params is
+// nil.
+func reqParam(req *pb.TTSRequest, key string) string {
+	if params := req.Params; params != nil {
+		return params[key]
+	}
+	return ""
+}
+
+// seedFor returns the per-request "seed" parameter when it is present and
+// scans as an integer; otherwise it falls back to the model-level seed.
+func (o *OmnivoiceCpp) seedFor(req *pb.TTSRequest) int64 {
+	raw := reqParam(req, "seed")
+	if raw == "" {
+		return o.opts.seed
+	}
+	var parsed int64
+	if _, err := fmt.Sscan(raw, &parsed); err != nil {
+		return o.opts.seed
+	}
+	return parsed
+}
+
+// optStr dereferences an optional string, treating nil as "".
+func optStr(p *string) string {
+	if p != nil {
+		return *p
+	}
+	return ""
+}
+
+// TTS synthesizes req.Text in one shot and writes the result to req.Dst via
+// writeWAV24k.
+//
+// Language, instructions, the "ref_text" parameter, the seed and the cloning
+// reference are resolved from the request (falling back to model-level
+// settings where the helpers do so). It returns an error when req.Dst is
+// empty, the reference audio cannot be decoded, the C call yields no audio, or
+// the WAV cannot be written.
+func (o *OmnivoiceCpp) TTS(req *pb.TTSRequest) error {
+	if req.Dst == "" {
+		return fmt.Errorf("omnivoice: TTS requires a destination path")
+	}
+	lang := normalizeLanguage(optStr(req.Language))
+	instruct := optStr(req.Instructions)
+	refText := reqParam(req, "ref_text")
+	seed := o.seedFor(req)
+
+	ref, err := o.refAudioFor(req)
+	if err != nil {
+		return err
+	}
+	var refPtr unsafe.Pointer
+	if len(ref) > 0 {
+		refPtr = unsafe.Pointer(&ref[0])
+	}
+
+	// n receives the sample count through the C int* out-parameter.
+	var n int32
+	ptr := CppTTS(req.Text, lang, instruct, refPtr, len(ref), refText, seed,
+		boolToInt(o.opts.denoise), unsafe.Pointer(&n))
+	// ref must stay reachable until the C call that reads refPtr has returned.
+	runtimeKeepAlive(ref)
+	if ptr == 0 {
+		return fmt.Errorf("omnivoice: synthesis failed")
+	}
+	// Register the free before inspecting n so a non-null buffer is released
+	// even when the reported sample count is unusable.
+	defer CppPCMFree(ptr)
+	if n <= 0 {
+		return fmt.Errorf("omnivoice: synthesis failed")
+	}
+	src := unsafe.Slice((*float32)(unsafe.Pointer(ptr)), int(n)) //nolint:govet // C-allocated PCM, copied out before free
+	out := make([]float32, int(n))
+	copy(out, src)
+	return writeWAV24k(req.Dst, out)
+}
+
+// streamState carries the active TTSStream channel to the single shared C
+// callback. base.SingleThread serializes TTS/TTSStream, so one global slot is
+// safe and avoids leaking a purego callback per request (purego callbacks
+// cannot be freed and are capped).
+var (
+	// streamMu is held by TTSStream for the whole C streaming call.
+	streamMu     sync.Mutex
+	// streamChan is the destination for the stream in flight; nil otherwise.
+	streamChan   chan []byte
+	// streamCbOnce guards the one-time creation of streamCbPtr.
+	streamCbOnce sync.Once
+	// streamCbPtr is the purego callback pointer for streamCallback.
+	streamCbPtr  uintptr
+)
+
+// streamCallback is registered once and forwards each PCM chunk to streamChan.
+// It always returns 1 (continue), including for empty chunks or when no stream
+// is active. The third argument (the C user_data) is unused. streamChan is read
+// without taking streamMu: TTSStream holds that lock for the duration of the C
+// call that invokes this callback.
+func streamCallback(samples *float32, nSamples int32, _ uintptr) uintptr {
+	if nSamples <= 0 || samples == nil || streamChan == nil {
+		return 1 // continue
+	}
+	src := unsafe.Slice(samples, int(nSamples))
+	cp := make([]float32, int(nSamples)) // copy out of C memory before returning
+	copy(cp, src)
+	// The send blocks until the consumer receives or the channel has buffer space.
+	streamChan <- floatToPCM16LE(cp)
+	return 1 // continue
+}
+
+// TTSStream synthesizes req.Text and pushes the audio to results: first the
+// bytes from wavHeader24k, then one floatToPCM16LE chunk per callback
+// invocation. results is always closed on return. It returns an error when
+// req.Text is empty, the reference audio cannot be decoded, or the C call
+// reports a non-zero status.
+func (o *OmnivoiceCpp) TTSStream(req *pb.TTSRequest, results chan []byte) error {
+	defer close(results)
+	if req.Text == "" {
+		return fmt.Errorf("omnivoice: TTSStream requires text")
+	}
+
+	// Create the shared C callback once and reuse it for every stream.
+	streamCbOnce.Do(func() {
+		streamCbPtr = purego.NewCallback(streamCallback)
+	})
+
+	lang := normalizeLanguage(optStr(req.Language))
+	instruct := optStr(req.Instructions)
+	refText := reqParam(req, "ref_text")
+	seed := o.seedFor(req)
+
+	ref, err := o.refAudioFor(req)
+	if err != nil {
+		return err
+	}
+	var refPtr unsafe.Pointer
+	if len(ref) > 0 {
+		refPtr = unsafe.Pointer(&ref[0])
+	}
+
+	// Emit the WAV header first so the HTTP layer gets a self-describing stream.
+	// It is sent before synthesis starts, so it goes out even if synthesis fails.
+	results <- wavHeader24k()
+
+	// Publish the destination channel for streamCallback only while the C call
+	// runs, then clear it again.
+	streamMu.Lock()
+	streamChan = results
+	rc := CppTTSStream(req.Text, lang, instruct, refPtr, len(ref), refText, seed,
+		boolToInt(o.opts.denoise), streamCbPtr, 0)
+	streamChan = nil
+	streamMu.Unlock()
+	// ref must stay reachable until the C call that reads refPtr has returned.
+	runtimeKeepAlive(ref)
+
+	if rc != 0 {
+		return fmt.Errorf("omnivoice: streaming synthesis failed (rc=%d)", rc)
+	}
+	return nil
+}
--- a/backend/go/omnivoice-cpp/gomnivoicecpp_test.go
+++ b/backend/go/omnivoice-cpp/gomnivoicecpp_test.go
@@ -0,0 +1,90 @@
+package main
+
+import (
+	"bytes"
+	"encoding/binary"
+	"testing"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// TestOmnivoiceCpp is the go test entry point: it wires Gomega failures into
+// Ginkgo and runs every spec registered in this package.
+func TestOmnivoiceCpp(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "omnivoice-cpp suite")
+}
+
+// Table-driven spec for normalizeLanguage: each entry is (input, expected code).
+var _ = Describe("normalizeLanguage", func() {
+	DescribeTable("maps caller language to OmniVoice codes",
+		func(in, want string) {
+			Expect(normalizeLanguage(in)).To(Equal(want))
+		},
+		Entry("empty stays empty", "", ""),
+		Entry("english full name", "English", "en"),
+		Entry("chinese full name", "Chinese", "zh"),
+		Entry("locale suffix stripped", "en-US", "en"),
+		Entry("underscore locale", "zh_CN", "zh"),
+		Entry("already a code", "en", "en"),
+		Entry("unknown passes through normalized", "xx", "xx"),
+	)
+})
+
+// Specs for parseOptions: recognised keys, the codec: alias, and the defaults
+// used when no options are given.
+var _ = Describe("parseOptions", func() {
+	It("extracts codec, use_fa, clamp_fp16, seed, denoise", func() {
+		o := parseOptions([]string{
+			"tokenizer:tok.gguf",
+			"use_fa:true",
+			"clamp_fp16:true",
+			"seed:7",
+			"denoise:false",
+			"unknown:ignored", // unknown keys must not affect the result
+		})
+		Expect(o.codecPath).To(Equal("tok.gguf"))
+		Expect(o.useFA).To(BeTrue())
+		Expect(o.clampFP16).To(BeTrue())
+		Expect(o.seed).To(Equal(int64(7)))
+		Expect(o.denoise).To(BeFalse())
+	})
+
+	It("accepts codec: as an alias for tokenizer:", func() {
+		o := parseOptions([]string{"codec:c.gguf"})
+		Expect(o.codecPath).To(Equal("c.gguf"))
+	})
+
+	It("defaults seed to -1 and denoise to true", func() {
+		o := parseOptions(nil)
+		Expect(o.seed).To(Equal(int64(-1)))
+		Expect(o.denoise).To(BeTrue())
+	})
+})
+
+// Spec for wavHeader24k: checks the header length, the RIFF/WAVE/fmt/data
+// chunk tags at their fixed offsets, and the sample-rate field.
+var _ = Describe("wavHeader24k", func() {
+	It("emits a 44-byte streaming WAV header at 24 kHz mono 16-bit", func() {
+		h := wavHeader24k()
+		Expect(h).To(HaveLen(44))
+		Expect(string(h[0:4])).To(Equal("RIFF"))
+		Expect(string(h[8:12])).To(Equal("WAVE"))
+		Expect(string(h[12:16])).To(Equal("fmt "))
+		Expect(string(h[36:40])).To(Equal("data"))
+		// Bytes 24..27 are read as the little-endian sample rate.
+		var sampleRate uint32
+		Expect(binary.Read(bytes.NewReader(h[24:28]), binary.LittleEndian, &sampleRate)).To(Succeed())
+		Expect(sampleRate).To(Equal(uint32(24000)))
+	})
+})
+
+// Spec for floatToPCM16LE: full-scale inputs map to +/-32767 and out-of-range
+// inputs are clamped to the same values.
+var _ = Describe("floatToPCM16LE", func() {
+	It("clamps and converts float PCM to little-endian int16 bytes", func() {
+		b := floatToPCM16LE([]float32{0, 1.0, -1.0, 2.0, -2.0})
+		Expect(b).To(HaveLen(10)) // 5 samples * 2 bytes
+		// read decodes the little-endian int16 at byte offset off.
+		read := func(off int) int16 {
+			var v int16
+			_ = binary.Read(bytes.NewReader(b[off:off+2]), binary.LittleEndian, &v)
+			return v
+		}
+		Expect(read(0)).To(Equal(int16(0)))
+		Expect(read(2)).To(Equal(int16(32767)))
+		Expect(read(4)).To(Equal(int16(-32767)))
+		Expect(read(6)).To(Equal(int16(32767)))  // clamped from 2.0
+		Expect(read(8)).To(Equal(int16(-32767))) // clamped from -2.0
+	})
+})
--- a/backend/go/omnivoice-cpp/main.go
+++ b/backend/go/omnivoice-cpp/main.go
@@ -0,0 +1,48 @@
+package main
+
+// Note: this is started internally by LocalAI and a server is allocated for each model
+import (
+	"flag"
+	"os"
+
+	"github.com/ebitengine/purego"
+	grpc "github.com/mudler/LocalAI/pkg/grpc"
+)
+
+var (
+	// addr is the -addr flag value handed to grpc.StartServer in main.
+	addr = flag.String("addr", "localhost:50051", "the address to connect to")
+)
+
+// LibFuncs pairs a Go function variable with the exported C symbol it is bound
+// to by purego.RegisterLibFunc.
+type LibFuncs struct {
+	FuncPtr any    // pointer to the Go function variable to populate
+	Name    string // symbol name in the shared library
+}
+
+// main opens the OmniVoice shared library (OMNIVOICE_LIBRARY, or the bundled
+// fallback build when unset), binds the exported C symbols to the package's
+// Cpp* function variables, and serves the backend over gRPC on -addr. Any
+// failure to open the library or start the server panics.
+func main() {
+	libPath := os.Getenv("OMNIVOICE_LIBRARY")
+	if libPath == "" {
+		libPath = "./libgomnivoicecpp-fallback.so"
+	}
+
+	handle, err := purego.Dlopen(libPath, purego.RTLD_NOW|purego.RTLD_GLOBAL)
+	if err != nil {
+		panic(err)
+	}
+
+	for _, sym := range []LibFuncs{
+		{&CppLoad, "omni_load"},
+		{&CppTTS, "omni_tts"},
+		{&CppTTSStream, "omni_tts_stream"},
+		{&CppPCMFree, "omni_pcm_free"},
+		{&CppUnload, "omni_unload"},
+	} {
+		purego.RegisterLibFunc(sym.FuncPtr, handle, sym.Name)
+	}
+
+	flag.Parse()
+
+	err = grpc.StartServer(*addr, &OmnivoiceCpp{})
+	if err != nil {
+		panic(err)
+	}
+}
--- a/backend/go/omnivoice-cpp/options.go
+++ b/backend/go/omnivoice-cpp/options.go
@@ -0,0 +1,74 @@
+package main
+
+import (
+	"strconv"
+	"strings"
+)
+
+// loadOptions holds the parsed model-level options for OmniVoice.
+type loadOptions struct {
+	codecPath string // "tokenizer:" / "codec:" option, as given
+	useFA     bool   // "use_fa:" option
+	clampFP16 bool   // "clamp_fp16:" option
+	seed      int64  // "seed:" option; -1 when unset
+	denoise   bool   // "denoise:" option; true when unset
+}
+
+// splitOption splits a "key:value" option at the first colon and trims
+// whitespace from both halves. ok is false when o contains no colon.
+func splitOption(o string) (key, value string, ok bool) {
+	k, v, found := strings.Cut(o, ":")
+	if !found {
+		return "", "", false
+	}
+	return strings.TrimSpace(k), strings.TrimSpace(v), true
+}
+
+// parseBoolOr parses value with strconv.ParseBool (1/t/T/true/TRUE/True and
+// the matching false forms) and returns fallback when it is not a boolean.
+func parseBoolOr(value string, fallback bool) bool {
+	if b, err := strconv.ParseBool(value); err == nil {
+		return b
+	}
+	return fallback
+}
+
+// parseOptions reads the backend "key:value" option slice. Unknown keys and
+// entries without a colon are ignored. Defaults: seed -1 (engine default),
+// denoise true.
+//
+// A value that cannot be parsed leaves the option at its current value, for
+// booleans as well as for seed. Without that, a misspelt or differently cased
+// boolean such as "denoise:True" would silently switch a default-true option
+// off.
+func parseOptions(opts []string) loadOptions {
+	o := loadOptions{seed: -1, denoise: true}
+	for _, oo := range opts {
+		key, value, ok := splitOption(oo)
+		if !ok {
+			continue
+		}
+		switch key {
+		case "tokenizer", "codec":
+			o.codecPath = value
+		case "use_fa":
+			o.useFA = parseBoolOr(value, o.useFA)
+		case "clamp_fp16":
+			o.clampFP16 = parseBoolOr(value, o.clampFP16)
+		case "seed":
+			if n, err := strconv.ParseInt(value, 10, 64); err == nil {
+				o.seed = n
+			}
+		case "denoise":
+			o.denoise = parseBoolOr(value, o.denoise)
+		}
+	}
+	return o
+}
+
+// languageNameAliases translates full language names into OmniVoice codes.
+// Per the upstream convention the lang hint accepts "" (auto), "en" and "zh";
+// any other code is passed through and the engine treats unknown hints as
+// auto.
+var languageNameAliases = map[string]string{
+	"english": "en",
+	"chinese": "zh",
+}
+
+// normalizeLanguage trims and lowercases lang, cuts it at the first '-', '_'
+// or '.' (dropping any region/locale suffix), and maps known full names to
+// their code. Blank input stays "" so the engine auto-detects.
+func normalizeLanguage(lang string) string {
+	code := strings.ToLower(strings.TrimSpace(lang))
+	if code == "" {
+		return ""
+	}
+	for i, r := range code {
+		if r == '-' || r == '_' || r == '.' {
+			code = code[:i]
+			break
+		}
+	}
+	if alias, known := languageNameAliases[code]; known {
+		return alias
+	}
+	return code
+}
--- a/backend/go/omnivoice-cpp/package.sh
+++ b/backend/go/omnivoice-cpp/package.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+
+# Assemble the backend package directory: the omnivoice-cpp binary, the
+# libgomnivoicecpp-*.so variants, run.sh, and (on Linux) the dynamic loader
+# plus the core system libraries for the detected architecture.
+# This script is used in the final stage of the Dockerfile.
+
+set -e
+
+# All path expansions are quoted so a checkout path with spaces still works.
+CURDIR=$(dirname "$(realpath "$0")")
+REPO_ROOT="${CURDIR}/../../.."
+
+# Create lib directory
+mkdir -p "$CURDIR/package/lib"
+
+cp -avf "$CURDIR/omnivoice-cpp" "$CURDIR/package/"
+cp -fv "$CURDIR"/libgomnivoicecpp-*.so "$CURDIR/package/"
+cp -fv "$CURDIR/run.sh" "$CURDIR/package/"
+
+# copy_system_libs LOADER LIBDIR
+# Copies the dynamic loader to package/lib/ld.so and each core system library
+# from LIBDIR into package/lib, dereferencing symlinks.
+copy_system_libs() {
+    local loader="$1" libdir="$2" lib
+    cp -arfLv "$loader" "$CURDIR/package/lib/ld.so"
+    for lib in libc.so.6 libgcc_s.so.1 libstdc++.so.6 libm.so.6 libgomp.so.1 \
+               libdl.so.2 librt.so.1 libpthread.so.0; do
+        cp -arfLv "$libdir/$lib" "$CURDIR/package/lib/$lib"
+    done
+}
+
+# Detect architecture and copy appropriate libraries
+if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
+    # x86_64 architecture
+    echo "Detected x86_64 architecture, copying x86_64 libraries..."
+    copy_system_libs /lib64/ld-linux-x86-64.so.2 /lib/x86_64-linux-gnu
+elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
+    # ARM64 architecture
+    echo "Detected ARM64 architecture, copying ARM64 libraries..."
+    copy_system_libs /lib/ld-linux-aarch64.so.1 /lib/aarch64-linux-gnu
+elif [ "$(uname -s)" = "Darwin" ]; then
+    echo "Detected Darwin"
+else
+    echo "Error: Could not detect architecture"
+    exit 1
+fi
+
+# Package GPU libraries based on BUILD_TYPE
+GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
+if [ -f "$GPU_LIB_SCRIPT" ]; then
+    echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
+    source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib"
+    package_gpu_libs
+fi
+
+echo "Packaging completed successfully"
+ls -liah "$CURDIR/package/"
+ls -liah "$CURDIR/package/lib/"
--- a/backend/go/omnivoice-cpp/run.sh
+++ b/backend/go/omnivoice-cpp/run.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+# Launcher: picks the most capable libgomnivoicecpp-*.so variant that the CPU
+# supports and that is present next to this script, exports it as
+# OMNIVOICE_LIBRARY, and execs the omnivoice-cpp binary (through the bundled
+# lib/ld.so when one was packaged).
+set -ex
+
+# Get the absolute current dir where the script is located
+CURDIR=$(dirname "$(realpath "$0")")
+
+cd /
+
+echo "CPU info:"
+if [ "$(uname)" != "Darwin" ]; then
+	grep -e "model\sname" /proc/cpuinfo | head -1
+	grep -e "flags" /proc/cpuinfo | head -1
+fi
+
+LIBRARY="$CURDIR/libgomnivoicecpp-fallback.so"
+
+# Later checks override earlier ones, so the widest supported ISA wins.
+if [ "$(uname)" != "Darwin" ]; then
+	if grep -q -e "\savx\s" /proc/cpuinfo ; then
+		echo "CPU:    AVX    found OK"
+		if [ -e "$CURDIR/libgomnivoicecpp-avx.so" ]; then
+			LIBRARY="$CURDIR/libgomnivoicecpp-avx.so"
+		fi
+	fi
+
+	if grep -q -e "\savx2\s" /proc/cpuinfo ; then
+		echo "CPU:    AVX2   found OK"
+		if [ -e "$CURDIR/libgomnivoicecpp-avx2.so" ]; then
+			LIBRARY="$CURDIR/libgomnivoicecpp-avx2.so"
+		fi
+	fi
+
+	# Check avx 512
+	if grep -q -e "\savx512f\s" /proc/cpuinfo ; then
+		echo "CPU:    AVX512F found OK"
+		if [ -e "$CURDIR/libgomnivoicecpp-avx512.so" ]; then
+			LIBRARY="$CURDIR/libgomnivoicecpp-avx512.so"
+		fi
+	fi
+fi
+
+# Append the previous value only when it is non-empty: a trailing ':' would add
+# the current directory (here '/') to the library search path.
+export LD_LIBRARY_PATH="$CURDIR/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
+export OMNIVOICE_LIBRARY="$LIBRARY"
+
+# If there is a lib/ld.so, use it
+if [ -f "$CURDIR/lib/ld.so" ]; then
+	echo "Using lib/ld.so"
+	echo "Using library: $LIBRARY"
+	exec "$CURDIR/lib/ld.so" "$CURDIR/omnivoice-cpp" "$@"
+fi
+
+echo "Using library: $LIBRARY"
+exec "$CURDIR/omnivoice-cpp" "$@"
--- a/backend/go/omnivoice-cpp/test.sh
+++ b/backend/go/omnivoice-cpp/test.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+# Runs the omnivoice-cpp Go test suite. When OMNIVOICE_MODEL is not set, the
+# Q4_K_M base model and tokenizer GGUFs are downloaded into ./omnivoice-models
+# (files already present are reused) and exported for the e2e specs.
+set -e
+
+CURDIR=$(dirname "$(realpath "$0")")
+cd "$CURDIR"
+
+echo "Running omnivoice-cpp backend tests..."
+
+if [ -z "$OMNIVOICE_MODEL" ]; then
+    MODEL_DIR="./omnivoice-models"
+    mkdir -p "$MODEL_DIR"
+    REPO_ID="Serveurperso/OmniVoice-GGUF"
+    BASE_URL="https://huggingface.co/${REPO_ID}/resolve/main"
+    FILES=( "omnivoice-base-Q4_K_M.gguf" "omnivoice-tokenizer-Q4_K_M.gguf" )
+    for file in "${FILES[@]}"; do
+        dest="${MODEL_DIR}/${file}"
+        if [ -f "${dest}" ]; then
+            echo "  [skip] ${file}"
+        else
+            echo "  [download] ${file}..."
+            # --fail turns HTTP errors into a non-zero exit instead of saving
+            # the error page, and the temporary name keeps a failed or partial
+            # download from being mistaken for a cached model on the next run.
+            if ! curl -L --fail -o "${dest}.part" "${BASE_URL}/${file}" --progress-bar; then
+                rm -f "${dest}.part"
+                echo "  [error] failed to download ${file}" >&2
+                exit 1
+            fi
+            mv "${dest}.part" "${dest}"
+        fi
+    done
+    export OMNIVOICE_MODEL="${MODEL_DIR}/omnivoice-base-Q4_K_M.gguf"
+    export OMNIVOICE_CODEC="${MODEL_DIR}/omnivoice-tokenizer-Q4_K_M.gguf"
+fi
+
+go test -v -timeout 1200s .
+
+echo "All omnivoice-cpp e2e tests passed."
--- a/backend/go/qwen3-tts-cpp/CMakeLists.txt
+++ b/backend/go/qwen3-tts-cpp/CMakeLists.txt
@@ -3,35 +3,36 @@ project(goqwen3ttscpp LANGUAGES C CXX)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

-set(QWEN3TTS_DIR ${CMAKE_CURRENT_SOURCE_DIR}/sources/qwen3-tts.cpp)
+set(QWENTTS_DIR ${CMAKE_CURRENT_SOURCE_DIR}/sources/qwentts.cpp)

 # Override upstream's CMAKE_CUDA_ARCHITECTURES before add_subdirectory.
 if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
    set(CMAKE_CUDA_ARCHITECTURES "75-virtual;80-virtual;86-real;89-real")
 endif()

-# Build ggml from the upstream's submodule FIRST, so that ggml/ggml-base/ggml-cpu
-# CMake targets exist when the upstream project references them by name.
-# The upstream CMakeLists.txt uses target_link_libraries(... ggml ggml-base ggml-cpu)
-# with target_link_directories pointing at a pre-built ggml/build/. By adding ggml
-# as a subdirectory here, CMake resolves those names as targets instead.
-add_subdirectory(${QWEN3TTS_DIR}/ggml ggml EXCLUDE_FROM_ALL)
+# Add the upstream project. Its own CMakeLists adds ggml + cpp-httplib + yyjson
+# and builds qwen-core (STATIC, the qt_* impl). EXCLUDE_FROM_ALL keeps its CLI
+# tools / tts-server / tests from building unless referenced.
+add_subdirectory(${QWENTTS_DIR} qwentts EXCLUDE_FROM_ALL)

-# Now add the upstream project
-add_subdirectory(${QWEN3TTS_DIR} qwen3tts EXCLUDE_FROM_ALL)
+# Upstream generates version.h into its own CMAKE_CURRENT_BINARY_DIR and adds
+# the top-level ${CMAKE_BINARY_DIR} to qwen-core's include path. Under
+# add_subdirectory those two dirs differ (<build>/qwentts vs <build>), so
+# qwen.cpp cannot find version.h. Point qwen-core at the subproject binary dir
+# where version.h is actually generated. (Fix lives here, never in the fetched
+# upstream checkout.)
+target_include_directories(qwen-core PRIVATE ${CMAKE_BINARY_DIR}/qwentts)

 add_library(goqwen3ttscpp MODULE cpp/goqwen3ttscpp.cpp)
-target_link_libraries(goqwen3ttscpp PRIVATE qwen3_tts)
+target_link_libraries(goqwen3ttscpp PRIVATE qwen-core)

-target_include_directories(goqwen3ttscpp PRIVATE ${QWEN3TTS_DIR}/src)
-target_include_directories(goqwen3ttscpp SYSTEM PRIVATE ${QWEN3TTS_DIR}/ggml/include)
+target_include_directories(goqwen3ttscpp PRIVATE ${QWENTTS_DIR}/src)
+target_include_directories(goqwen3ttscpp SYSTEM PRIVATE ${QWENTTS_DIR}/ggml/include)

-# Link GPU backends if available
-foreach(backend blas cuda metal vulkan)
+# Link GPU backends if the upstream ggml created them.
+foreach(backend blas cuda metal vulkan sycl)
    if(TARGET ggml-${backend})
        target_link_libraries(goqwen3ttscpp PRIVATE ggml-${backend})
-        string(TOUPPER ${backend} BACKEND_UPPER)
-        target_compile_definitions(goqwen3ttscpp PRIVATE QWEN3TTS_HAVE_${BACKEND_UPPER})
        if(backend STREQUAL "cuda")
            find_package(CUDAToolkit QUIET)
            if(CUDAToolkit_FOUND)
@@ -44,12 +45,8 @@ endforeach()
 if(MSVC)
    target_compile_options(goqwen3ttscpp PRIVATE /W4 /wd4100 /wd4505)
 else()
-    target_compile_options(goqwen3ttscpp PRIVATE -Wall -Wextra -Wshadow -Wconversion
-                          -Wno-unused-parameter -Wno-unused-function -Wno-sign-conversion)
-endif()
-
-if(CMAKE_CXX_COMPILER_ID MATCHES "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9.0)
-    target_link_libraries(goqwen3ttscpp PRIVATE stdc++fs)
+    target_compile_options(goqwen3ttscpp PRIVATE -Wall -Wextra
+                          -Wno-unused-parameter -Wno-unused-function)
 endif()

 set_property(TARGET goqwen3ttscpp PROPERTY CXX_STANDARD 17)
--- a/backend/go/qwen3-tts-cpp/Makefile
+++ b/backend/go/qwen3-tts-cpp/Makefile
@@ -6,9 +6,9 @@ GOCMD?=go
 GO_TAGS?=
 JOBS?=$(shell nproc --ignore=1)

-# qwen3-tts.cpp version
-QWEN3TTS_REPO?=https://github.com/predict-woo/qwen3-tts.cpp
-QWEN3TTS_CPP_VERSION?=136e5d36c17083da0321fd96512dc7b263f94a44
+# qwentts.cpp version
+QWEN3TTS_REPO?=https://github.com/ServeurpersoCom/qwentts.cpp
+QWEN3TTS_CPP_VERSION?=0bf4a18b22e8bb8718d95294e9f7f45c0d4270a4
 SO_TARGET?=libgoqwen3ttscpp.so

 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
@@ -49,9 +49,9 @@ ifeq ($(BUILD_TYPE),sycl_f32)
 		-DCMAKE_CXX_COMPILER=icpx
 endif

-sources/qwen3-tts.cpp:
-	mkdir -p sources/qwen3-tts.cpp
-	cd sources/qwen3-tts.cpp && \
+sources/qwentts.cpp:
+	mkdir -p sources/qwentts.cpp
+	cd sources/qwentts.cpp && \
 	git init && \
 	git remote add origin $(QWEN3TTS_REPO) && \
 	git fetch origin && \
@@ -78,7 +78,7 @@ package: qwen3-tts-cpp
 build: package

 clean: purge
-	rm -rf libgoqwen3ttscpp*.so package sources/qwen3-tts.cpp qwen3-tts-cpp
+	rm -rf libgoqwen3ttscpp*.so package sources/qwentts.cpp qwen3-tts-cpp

 purge:
 	rm -rf build*
@@ -88,24 +88,24 @@ purge:

 # Build all variants (Linux only)
 ifeq ($(UNAME_S),Linux)
-libgoqwen3ttscpp-avx.so: sources/qwen3-tts.cpp
+libgoqwen3ttscpp-avx.so: sources/qwentts.cpp
 	$(info ${GREEN}I qwen3-tts-cpp build info:avx${RESET})
 	SO_TARGET=libgoqwen3ttscpp-avx.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) libgoqwen3ttscpp-custom
 	rm -rf build-libgoqwen3ttscpp-avx.so

-libgoqwen3ttscpp-avx2.so: sources/qwen3-tts.cpp
+libgoqwen3ttscpp-avx2.so: sources/qwentts.cpp
 	$(info ${GREEN}I qwen3-tts-cpp build info:avx2${RESET})
 	SO_TARGET=libgoqwen3ttscpp-avx2.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on -DGGML_BMI2=on" $(MAKE) libgoqwen3ttscpp-custom
 	rm -rf build-libgoqwen3ttscpp-avx2.so

-libgoqwen3ttscpp-avx512.so: sources/qwen3-tts.cpp
+libgoqwen3ttscpp-avx512.so: sources/qwentts.cpp
 	$(info ${GREEN}I qwen3-tts-cpp build info:avx512${RESET})
 	SO_TARGET=libgoqwen3ttscpp-avx512.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on -DGGML_BMI2=on" $(MAKE) libgoqwen3ttscpp-custom
 	rm -rf build-libgoqwen3ttscpp-avx512.so
 endif

 # Build fallback variant (all platforms)
-libgoqwen3ttscpp-fallback.so: sources/qwen3-tts.cpp
+libgoqwen3ttscpp-fallback.so: sources/qwentts.cpp
 	$(info ${GREEN}I qwen3-tts-cpp build info:fallback${RESET})
 	SO_TARGET=libgoqwen3ttscpp-fallback.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) libgoqwen3ttscpp-custom
 	rm -rf build-libgoqwen3ttscpp-fallback.so
--- a/backend/go/qwen3-tts-cpp/audio.go
+++ b/backend/go/qwen3-tts-cpp/audio.go
@@ -0,0 +1,128 @@
+package main
+
+import (
+	"bytes"
+	"encoding/binary"
+	"fmt"
+	"os"
+	"runtime"
+
+	"github.com/go-audio/audio"
+	"github.com/go-audio/wav"
+)
+
+const qwen3ttsSampleRate = 24000
+
+// wavHeader24k builds the 44-byte RIFF/WAVE preamble for a 24 kHz, mono,
+// 16-bit PCM stream. Both size fields carry the 0xFFFFFFFF placeholder because
+// the total length is not known when streaming starts. TTSStream sends this as
+// its first chunk so the HTTP layer receives a self-describing WAV (the gRPC
+// TTSStream path never sets Message, so the backend owns the header - see
+// core/backend/tts.go:ModelTTSStream).
+func wavHeader24k() []byte {
+	// Canonical PCM WAV layout. binary.Write emits the fields packed and in
+	// declaration order: 4+4+4 + 4+4+2+2+4+4+2+2 + 4+4 = 44 bytes.
+	header := struct {
+		RiffID        [4]byte
+		RiffSize      uint32
+		WaveID        [4]byte
+		FmtID         [4]byte
+		FmtSize       uint32
+		AudioFormat   uint16
+		Channels      uint16
+		SampleRate    uint32
+		ByteRate      uint32
+		BlockAlign    uint16
+		BitsPerSample uint16
+		DataID        [4]byte
+		DataSize      uint32
+	}{
+		RiffID:        [4]byte{'R', 'I', 'F', 'F'},
+		RiffSize:      0xFFFFFFFF,
+		WaveID:        [4]byte{'W', 'A', 'V', 'E'},
+		FmtID:         [4]byte{'f', 'm', 't', ' '},
+		FmtSize:       16,
+		AudioFormat:   1,
+		Channels:      1,
+		SampleRate:    qwen3ttsSampleRate,
+		ByteRate:      qwen3ttsSampleRate * 2,
+		BlockAlign:    2,
+		BitsPerSample: 16,
+		DataID:        [4]byte{'d', 'a', 't', 'a'},
+		DataSize:      0xFFFFFFFF,
+	}
+
+	var out bytes.Buffer
+	// Writing fixed-size values into an in-memory buffer cannot fail.
+	_ = binary.Write(&out, binary.LittleEndian, header)
+	return out.Bytes()
+}
+
+// floatToPCM16LE encodes float samples as little-endian signed 16-bit PCM,
+// two bytes per input sample.
+//
+// Each sample is clamped to [-1,1] and scaled by 32767. NaN samples are
+// emitted as silence: NaN compares false against both clamp bounds, and Go
+// leaves the result of converting a non-representable float to an integer
+// implementation-dependent, so NaN must be handled before the conversion.
+func floatToPCM16LE(samples []float32) []byte {
+	out := make([]byte, len(samples)*2)
+	for i, s := range samples {
+		switch {
+		case s != s: // NaN is the only value not equal to itself
+			s = 0
+		case s > 1:
+			s = 1
+		case s < -1:
+			s = -1
+		}
+		binary.LittleEndian.PutUint16(out[i*2:], uint16(int16(s*32767)))
+	}
+	return out
+}
+
+// writeWAV24k creates (or truncates) dst and stores samples in it as a
+// finalized 24 kHz, mono, 16-bit PCM WAV. Samples are clamped to [-1,1] and
+// scaled by 32767 before encoding. On an encode or finalize error the encoder
+// and file are still closed; the partially written file is left in place.
+func writeWAV24k(dst string, samples []float32) error {
+	out, err := os.Create(dst)
+	if err != nil {
+		return fmt.Errorf("qwen3-tts: create %q: %w", dst, err)
+	}
+
+	pcm := make([]int, len(samples))
+	for i := range samples {
+		v := samples[i]
+		switch {
+		case v > 1:
+			v = 1
+		case v < -1:
+			v = -1
+		}
+		pcm[i] = int(v * 32767)
+	}
+
+	enc := wav.NewEncoder(out, qwen3ttsSampleRate, 16, 1, 1)
+	writeErr := enc.Write(&audio.IntBuffer{
+		Format:         &audio.Format{NumChannels: 1, SampleRate: qwen3ttsSampleRate},
+		Data:           pcm,
+		SourceBitDepth: 16,
+	})
+	if writeErr != nil {
+		// Best-effort cleanup; the encode error is the one worth reporting.
+		_ = enc.Close()
+		_ = out.Close()
+		return fmt.Errorf("qwen3-tts: encode WAV: %w", writeErr)
+	}
+
+	// Closing the encoder finalizes the WAV before the file itself is closed.
+	if closeErr := enc.Close(); closeErr != nil {
+		_ = out.Close()
+		return fmt.Errorf("qwen3-tts: finalize WAV: %w", closeErr)
+	}
+	return out.Close()
+}
+
+// readWAVAsFloat loads the WAV at path and returns it as mono float32 samples
+// for use as cloning reference audio. Multi-channel input is down-mixed by
+// averaging the channels of each frame, and integer samples are divided by
+// 2^(bitDepth-1) (16 bits assumed when the decoder reports none). No
+// resampling is performed: qwentts expects 24 kHz, so callers should supply
+// 24 kHz reference clips.
+func readWAVAsFloat(path string) ([]float32, error) {
+	src, err := os.Open(path)
+	if err != nil {
+		return nil, fmt.Errorf("qwen3-tts: open ref %q: %w", path, err)
+	}
+	defer func() { _ = src.Close() }()
+
+	pcm, err := wav.NewDecoder(src).FullPCMBuffer()
+	if err != nil {
+		return nil, fmt.Errorf("qwen3-tts: decode ref %q: %w", path, err)
+	}
+
+	channels := int(pcm.Format.NumChannels)
+	if channels < 1 {
+		channels = 1
+	}
+	depth := int(pcm.SourceBitDepth)
+	if depth == 0 {
+		depth = 16
+	}
+	fullScale := float32(int64(1) << uint(depth-1))
+
+	// Any trailing partial frame is dropped by the integer division.
+	mono := make([]float32, len(pcm.Data)/channels)
+	for i := range mono {
+		sum := 0
+		for _, v := range pcm.Data[i*channels : (i+1)*channels] {
+			sum += v
+		}
+		mono[i] = float32(sum) / float32(channels) / fullScale
+	}
+	return mono, nil
+}
+
+// runtimeKeepAlive forwards v to runtime.KeepAlive, marking v as reachable up
+// to the point of this call. It is intended to be called after a C call that
+// was handed a raw pointer into the reference-audio slice, so the slice is not
+// reclaimed while the C side may still be reading it.
+func runtimeKeepAlive(v any) { runtime.KeepAlive(v) }
--- a/backend/go/qwen3-tts-cpp/audiopath_test.go
+++ b/backend/go/qwen3-tts-cpp/audiopath_test.go
@@ -0,0 +1,54 @@
+package main
+
+import (
+	"path/filepath"
+
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// Specs for the voice-selection logic in resolveRequest, with emphasis on the
+// config-level audio_path (tts.audio_path -> ModelOptions.AudioPath) acting as
+// the default voice-cloning reference. No model or C library is required:
+// resolveRequest only reads the reference WAV through readWAVAsFloat (pure Go).
+var _ = Describe("resolveRequest voice/clone selection", func() {
+	// Half a second of audio at the backend sample rate.
+	const refSamples = qwen3ttsSampleRate / 2
+
+	var (
+		dir    string
+		refWav string
+	)
+
+	// resolve runs resolveRequest on a backend whose config audio_path is
+	// refWav and returns only the outputs these specs inspect.
+	resolve := func(voice string) (string, []float32, error) {
+		q := &Qwen3TtsCpp{audioPath: refWav}
+		_, _, speaker, _, ref, _, err := q.resolveRequest(&pb.TTSRequest{Text: "hi", Voice: voice})
+		return speaker, ref, err
+	}
+
+	BeforeEach(func() {
+		dir = GinkgoT().TempDir()
+		refWav = filepath.Join(dir, "ref.wav")
+		// Non-silent 24kHz mono audio to serve as the clone reference.
+		samples := make([]float32, refSamples)
+		for i := range samples {
+			samples[i] = 0.1
+		}
+		Expect(writeWAV24k(refWav, samples)).To(Succeed())
+	})
+
+	It("uses the config audio_path as the clone reference when Voice is empty", func() {
+		speaker, ref, err := resolve("")
+		Expect(err).ToNot(HaveOccurred())
+		Expect(speaker).To(BeEmpty())
+		Expect(ref).To(HaveLen(refSamples))
+	})
+
+	It("lets a per-request audio Voice override audio_path", func() {
+		other := filepath.Join(dir, "other.wav")
+		Expect(writeWAV24k(other, make([]float32, 100))).To(Succeed())
+		speaker, ref, err := resolve(other)
+		Expect(err).ToNot(HaveOccurred())
+		Expect(speaker).To(BeEmpty())
+		Expect(ref).To(HaveLen(100))
+	})
+
+	It("does not trigger audio_path cloning for a named-speaker Voice", func() {
+		speaker, ref, err := resolve("serena")
+		Expect(err).ToNot(HaveOccurred())
+		Expect(speaker).To(Equal("serena"))
+		Expect(ref).To(BeNil())
+	})
+})
--- a/backend/go/qwen3-tts-cpp/cpp/goqwen3ttscpp.cpp
+++ b/backend/go/qwen3-tts-cpp/cpp/goqwen3ttscpp.cpp
@@ -1,161 +1,191 @@
 #include "goqwen3ttscpp.h"
 #include "ggml-backend.h"
-#include "qwen3_tts.h"
+#include "qwen.h"

-#include <cmath>
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
-#include <string>

-using namespace qwen3_tts;
+static qt_context *g_ctx = nullptr;

-// Global engine (loaded once, reused across requests)
-static Qwen3TTS *g_engine = nullptr;
-static bool g_loaded = false;
-static int g_threads = 4;
-
-static void ggml_log_cb(enum ggml_log_level level, const char *log, void *data) {
-    const char *level_str;
+static void ggml_log_cb(enum ggml_log_level level, const char *log,
+                        void * /*data*/) {
    if (!log)
        return;
+    const char *lvl = "?????";
    switch (level) {
-    case GGML_LOG_LEVEL_DEBUG:
-        level_str = "DEBUG";
-        break;
-    case GGML_LOG_LEVEL_INFO:
-        level_str = "INFO";
-        break;
-    case GGML_LOG_LEVEL_WARN:
-        level_str = "WARN";
-        break;
-    case GGML_LOG_LEVEL_ERROR:
-        level_str = "ERROR";
-        break;
-    default:
-        level_str = "?????";
-        break;
+    case GGML_LOG_LEVEL_DEBUG: lvl = "DEBUG"; break;
+    case GGML_LOG_LEVEL_INFO:  lvl = "INFO";  break;
+    case GGML_LOG_LEVEL_WARN:  lvl = "WARN";  break;
+    case GGML_LOG_LEVEL_ERROR: lvl = "ERROR"; break;
+    default: break;
    }
-    fprintf(stderr, "[%-5s] ", level_str);
-    fputs(log, stderr);
+    fprintf(stderr, "[%-5s] %s", lvl, log);
    fflush(stderr);
 }

-// Map language string to language_id token used by the model
-static int language_to_id(const char *lang) {
-    if (!lang || lang[0] == '\0')
-        return 2050; // default: English
-    std::string l(lang);
-    if (l == "en")
-        return 2050;
-    if (l == "ru")
-        return 2069;
-    if (l == "zh")
-        return 2055;
-    if (l == "ja")
-        return 2058;
-    if (l == "ko")
-        return 2064;
-    if (l == "de")
-        return 2053;
-    if (l == "fr")
-        return 2061;
-    if (l == "es")
-        return 2054;
-    if (l == "it")
-        return 2056;
-    if (l == "pt")
-        return 2057;
-    fprintf(stderr, "[qwen3-tts-cpp] Unknown language '%s', defaulting to English\n",
-            lang);
-    return 2050;
-}
-
-int load_model(const char *model_dir, int n_threads) {
+int qt3_load(const char *talker_path, const char *codec_path, int use_fa,
+             int clamp_fp16) {
    ggml_log_set(ggml_log_cb, nullptr);
    ggml_backend_load_all();

-    if (n_threads <= 0)
-        n_threads = 4;
-    g_threads = n_threads;
-
-    fprintf(stderr, "[qwen3-tts-cpp] Loading models from %s (threads=%d)\n",
-            model_dir, n_threads);
-
-    g_engine = new Qwen3TTS();
-    if (!g_engine->load_models(model_dir)) {
-        fprintf(stderr, "[qwen3-tts-cpp] FATAL: failed to load models from %s\n",
-                model_dir);
-        delete g_engine;
-        g_engine = nullptr;
+    if (!talker_path || talker_path[0] == '\0') {
+        fprintf(stderr, "[qwen3-tts-cpp] ERROR: talker_path is required\n");
        return 1;
    }
-
-    g_loaded = true;
-    fprintf(stderr, "[qwen3-tts-cpp] Models loaded successfully\n");
-    return 0;
-}
-
-int synthesize(const char *text, const char *ref_audio_path, const char *dst,
-               const char *language, float temperature, float top_p,
-               int top_k, float repetition_penalty, int max_audio_tokens,
-               int n_threads) {
-    if (!g_loaded || !g_engine) {
-        fprintf(stderr, "[qwen3-tts-cpp] ERROR: models not loaded\n");
-        return 1;
-    }
-
-    if (!text || !dst) {
-        fprintf(stderr, "[qwen3-tts-cpp] ERROR: text and dst are required\n");
+    if (!codec_path || codec_path[0] == '\0') {
+        fprintf(stderr, "[qwen3-tts-cpp] ERROR: codec_path is required\n");
        return 2;
    }

-    tts_params params;
-    params.max_audio_tokens = max_audio_tokens > 0 ? max_audio_tokens : 4096;
-    params.temperature = temperature;
-    params.top_p = top_p;
-    params.top_k = top_k;
-    params.repetition_penalty = repetition_penalty;
-    params.n_threads = n_threads > 0 ? n_threads : g_threads;
-    params.language_id = language_to_id(language);
+    qt_init_params p;
+    qt_init_default_params(&p);
+    p.talker_path = talker_path;
+    p.codec_path = codec_path;
+    p.use_fa = use_fa != 0;
+    p.clamp_fp16 = clamp_fp16 != 0;

-    fprintf(stderr, "[qwen3-tts-cpp] Synthesizing: text='%.50s%s', lang_id=%d, "
-                    "temp=%.2f, threads=%d\n",
-            text, (strlen(text) > 50 ? "..." : ""), params.language_id,
-            temperature, params.n_threads);
+    fprintf(stderr, "[qwen3-tts-cpp] Loading talker=%s codec=%s\n", talker_path,
+            codec_path);

-    tts_result result;
-    bool has_ref = ref_audio_path && ref_audio_path[0] != '\0';
-
-    if (has_ref) {
-        fprintf(stderr, "[qwen3-tts-cpp] Voice cloning with ref: %s\n",
-                ref_audio_path);
-        result = g_engine->synthesize_with_voice(text, ref_audio_path, params);
-    } else {
-        result = g_engine->synthesize(text, params);
-    }
-
-    if (!result.success) {
-        fprintf(stderr, "[qwen3-tts-cpp] ERROR: synthesis failed: %s\n",
-                result.error_msg.c_str());
+    g_ctx = qt_init(&p);
+    if (!g_ctx) {
+        fprintf(stderr, "[qwen3-tts-cpp] FATAL: qt_init failed: %s\n",
+                qt_last_error());
        return 3;
    }
-
-    int n_samples = (int)result.audio.size();
-    if (n_samples == 0) {
-        fprintf(stderr, "[qwen3-tts-cpp] ERROR: synthesis produced no samples\n");
-        return 4;
-    }
-
-    fprintf(stderr,
-            "[qwen3-tts-cpp] Synthesis done: %d samples (%.2fs @ 24kHz)\n",
-            n_samples, (float)n_samples / 24000.0f);
-
-    if (!save_audio_file(dst, result.audio, result.sample_rate)) {
-        fprintf(stderr, "[qwen3-tts-cpp] ERROR: failed to write %s\n", dst);
-        return 5;
-    }
-
-    fprintf(stderr, "[qwen3-tts-cpp] Wrote %s\n", dst);
+    fprintf(stderr, "[qwen3-tts-cpp] Model loaded (%s)\n", qt_version());
    return 0;
 }
+
+// Populate *tp from the flat wrapper arguments. tp is first reset with
+// qt_tts_default_params(); a field is then overridden only when the matching
+// argument is "set": a non-empty string, a positive scalar, a non-negative
+// seed, or a non-null reference buffer with ref_n > 0. Everything else keeps
+// the qt default. ref_text is only consulted when reference audio is given.
+// The pointers are stored as-is, so they must outlive the use of *tp.
+static void fill_params(qt_tts_params *tp, const char *text, const char *lang,
+                        const char *instruct, const char *speaker,
+                        const float *ref_samples, int ref_n,
+                        const char *ref_text, long long seed, float temperature,
+                        int top_k, float top_p, float repetition_penalty,
+                        int max_new_tokens) {
+    // An empty C string counts as "not provided", same as a null pointer.
+    const auto given = [](const char *s) { return s != nullptr && *s != '\0'; };
+
+    qt_tts_default_params(tp);
+
+    // Text and the optional string selectors.
+    tp->text = text != nullptr ? text : "";
+    if (given(lang))
+        tp->lang = lang;
+    if (given(instruct))
+        tp->instruct = instruct;
+    if (given(speaker))
+        tp->speaker = speaker;
+
+    // Reference audio for cloning, plus its optional transcript.
+    const bool has_ref = ref_samples != nullptr && ref_n > 0;
+    if (has_ref) {
+        tp->ref_audio_24k = ref_samples;
+        tp->ref_n_samples = ref_n;
+        if (given(ref_text))
+            tp->ref_text = ref_text;
+    }
+
+    // Sampling controls: negative seed / non-positive scalars mean "default".
+    if (seed >= 0)
+        tp->seed = (int64_t)seed;
+    if (temperature > 0.0f)
+        tp->temperature = temperature;
+    if (top_k > 0)
+        tp->top_k = top_k;
+    if (top_p > 0.0f)
+        tp->top_p = top_p;
+    if (repetition_penalty > 0.0f)
+        tp->repetition_penalty = repetition_penalty;
+    if (max_new_tokens > 0)
+        tp->max_new_tokens = max_new_tokens;
+}
+
+// One-shot synthesis. Returns a malloc'd buffer of mono float PCM that the
+// caller must release with qt3_pcm_free, and stores the sample count in
+// *out_n when out_n is non-null. Returns nullptr (with *out_n == 0) when no
+// model is loaded, text is null/empty, qt_synthesize reports an error, the
+// synthesis yields no samples, or the copy buffer cannot be allocated.
+float *qt3_tts(const char *text, const char *lang, const char *instruct,
+               const char *speaker, const float *ref_samples, int ref_n,
+               const char *ref_text, long long seed, float temperature,
+               int top_k, float top_p, float repetition_penalty,
+               int max_new_tokens, int *out_n) {
+    if (out_n)
+        *out_n = 0;
+    if (!g_ctx) {
+        fprintf(stderr, "[qwen3-tts-cpp] ERROR: model not loaded\n");
+        return nullptr;
+    }
+    if (!text || text[0] == '\0') {
+        fprintf(stderr, "[qwen3-tts-cpp] ERROR: text is required\n");
+        return nullptr;
+    }
+    qt_tts_params tp;
+    fill_params(&tp, text, lang, instruct, speaker, ref_samples, ref_n,
+                ref_text, seed, temperature, top_k, top_p, repetition_penalty,
+                max_new_tokens);
+
+    qt_audio out = {0};
+    enum qt_status rc = qt_synthesize(g_ctx, &tp, &out);
+    if (rc != QT_STATUS_OK) {
+        fprintf(stderr, "[qwen3-tts-cpp] ERROR: synthesize failed (rc=%d): %s\n",
+                (int)rc, qt_last_error());
+        qt_audio_free(&out);
+        return nullptr;
+    }
+    if (out.n_samples <= 0 || !out.samples) {
+        // rc is OK on this path, so qt_last_error() may not describe this
+        // call; report the empty result on its own instead.
+        fprintf(stderr, "[qwen3-tts-cpp] ERROR: synthesis produced no samples\n");
+        qt_audio_free(&out);
+        return nullptr;
+    }
+
+    // Copy into a plain malloc buffer the Go side frees via qt3_pcm_free.
+    size_t bytes = (size_t)out.n_samples * sizeof(float);
+    float *buf = (float *)malloc(bytes);
+    if (!buf) {
+        fprintf(stderr, "[qwen3-tts-cpp] ERROR: malloc(%zu) failed\n", bytes);
+        qt_audio_free(&out);
+        return nullptr;
+    }
+    memcpy(buf, out.samples, bytes);
+    if (out_n)
+        *out_n = out.n_samples;
+    qt_audio_free(&out);
+    return buf;
+}
+
+// Pairs the caller's int-returning callback with its user_data so it can be
+// reached from a bool-returning qt callback.
+struct qt3_stream_thunk {
+    qt3_chunk_cb cb;
+    void *user_data;
+};
+
+// Adapter with the qt callback shape. Calling an int-returning function
+// through a bool-returning function pointer type is undefined behaviour, and a
+// caller expecting bool may inspect only part of the returned register, so the
+// conversion is done explicitly here: any non-zero int means "continue".
+// NOTE(review): assumes qt_audio_chunk_cb is
+// bool (*)(const float *, int, void *), as the original cast comment stated.
+static bool qt3_stream_trampoline(const float *samples, int n_samples,
+                                  void *user_data) {
+    const qt3_stream_thunk *thunk =
+        static_cast<const qt3_stream_thunk *>(user_data);
+    return thunk->cb(samples, n_samples, thunk->user_data) != 0;
+}
+
+// Streaming synthesis: cb is invoked with each PCM chunk while qt_synthesize
+// runs. Returns 0 on success or when synthesis ends with QT_STATUS_CANCELLED,
+// 1 when no model is loaded, 2 when cb is null, 4 when text is null/empty and
+// 3 for any other qt_synthesize status.
+int qt3_tts_stream(const char *text, const char *lang, const char *instruct,
+                   const char *speaker, const float *ref_samples, int ref_n,
+                   const char *ref_text, long long seed, float temperature,
+                   int top_k, float top_p, float repetition_penalty,
+                   int max_new_tokens, qt3_chunk_cb cb, void *user_data) {
+    if (!g_ctx) {
+        fprintf(stderr, "[qwen3-tts-cpp] ERROR: model not loaded\n");
+        return 1;
+    }
+    if (!cb) {
+        fprintf(stderr, "[qwen3-tts-cpp] ERROR: stream callback is null\n");
+        return 2;
+    }
+    if (!text || text[0] == '\0') {
+        fprintf(stderr, "[qwen3-tts-cpp] ERROR: text is required\n");
+        return 4;
+    }
+    qt_tts_params tp;
+    fill_params(&tp, text, lang, instruct, speaker, ref_samples, ref_n,
+                ref_text, seed, temperature, top_k, top_p, repetition_penalty,
+                max_new_tokens);
+
+    // The thunk lives on this stack frame, which outlasts the synchronous
+    // qt_synthesize call below.
+    qt3_stream_thunk thunk = {cb, user_data};
+    tp.on_chunk = qt3_stream_trampoline;
+    tp.on_chunk_user_data = &thunk;
+
+    qt_audio out = {0}; // stays empty in streaming mode
+    enum qt_status rc = qt_synthesize(g_ctx, &tp, &out);
+    qt_audio_free(&out);
+    if (rc != QT_STATUS_OK && rc != QT_STATUS_CANCELLED) {
+        fprintf(stderr, "[qwen3-tts-cpp] ERROR: stream synth failed (rc=%d): %s\n",
+                (int)rc, qt_last_error());
+        return 3;
+    }
+    return 0;
+}
+
+// Release a PCM buffer with free(); a null pointer is a no-op.
+void qt3_pcm_free(float *p) {
+    free(p);
+}
+
+// Free the global qt context, if any, and reset it so a later call (or a
+// synthesis attempt) sees "no model loaded". Safe to call repeatedly.
+void qt3_unload(void) {
+    if (g_ctx == nullptr)
+        return;
+    qt_free(g_ctx);
+    g_ctx = nullptr;
+}
+
+// Number of named speakers reported by qt for the loaded model; 0 when no
+// model is loaded.
+int qt3_n_speakers(void) {
+    if (!g_ctx)
+        return 0;
+    return qt_n_speakers(g_ctx);
+}
+
+// Name of speaker i as reported by qt_speaker_name; nullptr when no model is
+// loaded. The index is passed through to qt unchecked.
+const char *qt3_speaker_name(int i) {
+    if (!g_ctx)
+        return nullptr;
+    return qt_speaker_name(g_ctx, i);
+}
--- a/backend/go/qwen3-tts-cpp/cpp/goqwen3ttscpp.h
+++ b/backend/go/qwen3-tts-cpp/cpp/goqwen3ttscpp.h
@@ -1,12 +1,47 @@
 #pragma once

-#include <cstddef>
-#include <cstdint>
-
 extern "C" {
-int load_model(const char *model_dir, int n_threads);
-int synthesize(const char *text, const char *ref_audio_path, const char *dst,
-               const char *language, float temperature, float top_p,
-               int top_k, float repetition_penalty, int max_audio_tokens,
-               int n_threads);
+
+// Streaming PCM chunk callback. samples is mono float PCM at 24 kHz, valid
+// only for the duration of the call. Return non-zero to continue, 0 to abort.
+typedef int (*qt3_chunk_cb)(const float *samples, int n_samples,
+                            void *user_data);
+
+// Load the talker + codec/tokenizer GGUFs. use_fa / clamp_fp16 map to
+// qt_init_params (the qt ABI exposes no thread count; ggml uses its own
+// default). Returns 0 on success, non-zero on failure.
+int qt3_load(const char *talker_path, const char *codec_path, int use_fa,
+             int clamp_fp16);
+
+// Synthesize to a malloc'd float PCM buffer (caller frees via qt3_pcm_free).
+// The synthesis mode (base / custom_voice / voice_design) is auto-detected by
+// qt from the talker GGUF; speaker is honoured only for custom_voice, instruct
+// for voice_design / custom_voice, and ref_samples (+ optional ref_text) drive
+// base-mode cloning. qt enforces the rules and we surface qt_last_error() on
+// QT_STATUS_MODE_INVALID. Writes the sample count to *out_n. Returns NULL on
+// failure (out_n set to 0).
+float *qt3_tts(const char *text, const char *lang, const char *instruct,
+               const char *speaker, const float *ref_samples, int ref_n,
+               const char *ref_text, long long seed, float temperature,
+               int top_k, float top_p, float repetition_penalty,
+               int max_new_tokens, int *out_n);
+
+// Streaming synthesis: cb is invoked per PCM chunk as audio is produced. Same
+// param semantics as qt3_tts. Returns 0 on success.
+int qt3_tts_stream(const char *text, const char *lang, const char *instruct,
+                   const char *speaker, const float *ref_samples, int ref_n,
+                   const char *ref_text, long long seed, float temperature,
+                   int top_k, float top_p, float repetition_penalty,
+                   int max_new_tokens, qt3_chunk_cb cb, void *user_data);
+
+// Free a buffer returned by qt3_tts.
+void qt3_pcm_free(float *p);
+
+// Release the qt context.
+void qt3_unload(void);
+
+// Named-speaker introspection (custom_voice models). Returns 0 / NULL when no
+// model is loaded or the index is out of range.
+int qt3_n_speakers(void);
+const char *qt3_speaker_name(int i);
 }
--- a/backend/go/qwen3-tts-cpp/e2e_test.go
+++ b/backend/go/qwen3-tts-cpp/e2e_test.go
@@ -0,0 +1,95 @@
+package main
+
+import (
+	"math"
+	"os"
+	"strings"
+
+	"github.com/ebitengine/purego"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// ttsReq assembles a TTSRequest from the fields the e2e specs vary: the text
+// to speak, the Voice string, an optional language and the output path.
+func ttsReq(text, voice string, lang *string, dst string) *pb.TTSRequest {
+	req := &pb.TTSRequest{Text: text, Voice: voice}
+	req.Language = lang
+	req.Dst = dst
+	return req
+}
+
+// End-to-end specs that drive the real shared library and model. They are
+// labelled "e2e" and skip themselves unless QWEN3TTS_MODEL and QWEN3TTS_CODEC
+// are set in the environment.
+var _ = Describe("qwen3-tts-cpp e2e", Label("e2e"), func() {
+	// loaded records that the library was opened and the model loaded, so the
+	// setup below runs at most once across the specs in this container.
+	var loaded bool
+
+	BeforeEach(func() {
+		modelPath := os.Getenv("QWEN3TTS_MODEL")
+		codecPath := os.Getenv("QWEN3TTS_CODEC")
+		if modelPath == "" || codecPath == "" {
+			Skip("QWEN3TTS_MODEL / QWEN3TTS_CODEC not set; skipping e2e")
+		}
+		if !loaded {
+			// QWEN3TTS_LIBRARY overrides the shared object to open; the
+			// fallback build in the working directory is the default.
+			lib := os.Getenv("QWEN3TTS_LIBRARY")
+			if lib == "" {
+				lib = "./libgoqwen3ttscpp-fallback.so"
+			}
+			h, err := purego.Dlopen(lib, purego.RTLD_NOW|purego.RTLD_GLOBAL)
+			Expect(err).ToNot(HaveOccurred())
+			// Bind the package-level Cpp* function variables to the C exports.
+			purego.RegisterLibFunc(&CppLoad, h, "qt3_load")
+			purego.RegisterLibFunc(&CppTTS, h, "qt3_tts")
+			purego.RegisterLibFunc(&CppTTSStream, h, "qt3_tts_stream")
+			purego.RegisterLibFunc(&CppPCMFree, h, "qt3_pcm_free")
+			purego.RegisterLibFunc(&CppUnload, h, "qt3_unload")
+			// Arguments are (talker, codec, use_fa=1, clamp_fp16=0).
+			Expect(CppLoad(modelPath, codecPath, 1, 0)).To(Equal(0))
+			loaded = true
+		}
+	})
+
+	It("synthesizes a WAV file via TTS", func() {
+		b := &Qwen3TtsCpp{opts: loadOptions{seed: 42, useFA: true}}
+		dst := GinkgoT().TempDir() + "/out.wav"
+		lang := "english"
+		err := b.TTS(ttsReq("Hello world.", "", &lang, dst))
+		Expect(err).ToNot(HaveOccurred())
+		fi, err := os.Stat(dst)
+		Expect(err).ToNot(HaveOccurred())
+		// Larger than a bare 44-byte WAV header, i.e. some audio was written.
+		Expect(fi.Size()).To(BeNumerically(">", int64(44)))
+	})
+
+	It("streams audio chunks via TTSStream", func() {
+		b := &Qwen3TtsCpp{opts: loadOptions{seed: 42, useFA: true}}
+		results := make(chan []byte, 1024)
+		lang := "english"
+		done := make(chan error, 1)
+		// Run the stream in the background so this goroutine can drain results.
+		go func() { done <- b.TTSStream(ttsReq("Hello there, streaming test.", "", &lang, ""), results) }()
+
+		var chunks int
+		var first []byte
+		// The loop ends when results is closed.
+		// NOTE(review): presumably TTSStream closes results when it finishes; confirm.
+		for c := range results {
+			if chunks == 0 {
+				first = c
+			}
+			chunks++
+		}
+		Expect(<-done).ToNot(HaveOccurred())
+		// At least two chunks are expected, and the first must start with a
+		// RIFF/WAVE header.
+		Expect(chunks).To(BeNumerically(">=", 2))
+		Expect(string(first[0:4])).To(Equal("RIFF"))
+		Expect(strings.HasPrefix(string(first[8:12]), "WAVE")).To(BeTrue())
+	})
+
+	It("clones a voice from the config audio_path reference", func() {
+		// 1s of 24kHz mono audio as a clone reference; the base model carries
+		// a speaker encoder, so audio_path drives x-vector voice cloning.
+		ref := GinkgoT().TempDir() + "/ref.wav"
+		samples := make([]float32, qwen3ttsSampleRate)
+		for i := range samples {
+			samples[i] = float32(0.05 * math.Sin(float64(i)*0.06))
+		}
+		Expect(writeWAV24k(ref, samples)).To(Succeed())
+
+		b := &Qwen3TtsCpp{opts: loadOptions{seed: 42, useFA: true}, audioPath: ref}
+		dst := GinkgoT().TempDir() + "/clone.wav"
+		lang := "english"
+		// Empty Voice -> the config audio_path is used as the clone reference.
+		Expect(b.TTS(ttsReq("Cloned voice test.", "", &lang, dst))).To(Succeed())
+		fi, err := os.Stat(dst)
+		Expect(err).ToNot(HaveOccurred())
+		Expect(fi.Size()).To(BeNumerically(">", int64(44)))
+	})
+})
--- a/backend/go/qwen3-tts-cpp/goqwen3ttscpp.go
+++ b/backend/go/qwen3-tts-cpp/goqwen3ttscpp.go
@@ -5,108 +5,225 @@ import (
 	"os"
 	"path/filepath"
 	"strings"
+	"sync"
+	"unsafe"

+	"github.com/ebitengine/purego"
 	"github.com/mudler/LocalAI/pkg/grpc/base"
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 )

 var (
-	CppLoadModel  func(modelDir string, nThreads int) int
-	CppSynthesize func(text, refAudioPath, dst, language string,
-		temperature, topP float32, topK int,
-		repetitionPenalty float32, maxAudioTokens, nThreads int) int
+	// qt3_load(talker_path, codec_path, use_fa, clamp_fp16) int
+	CppLoad func(talkerPath, codecPath string, useFA, clampFP16 int) int
+	// qt3_tts(text, lang, instruct, speaker, ref_samples, ref_n, ref_text,
+	//         seed, temperature, top_k, top_p, rep_pen, max_new, out_n) -> float*
+	CppTTS func(text, lang, instruct, speaker string, refSamples unsafe.Pointer,
+		refN int, refText string, seed int64, temperature float32, topK int,
+		topP, repPen float32, maxNew int, outN unsafe.Pointer) uintptr
+	// qt3_tts_stream(..., cb, user) int
+	CppTTSStream func(text, lang, instruct, speaker string, refSamples unsafe.Pointer,
+		refN int, refText string, seed int64, temperature float32, topK int,
+		topP, repPen float32, maxNew int, cb uintptr, user uintptr) int
+	CppPCMFree func(ptr uintptr)
+	CppUnload  func()
 )

 type Qwen3TtsCpp struct {
 	base.SingleThread
-	threads int
-}
-
-// languageNameAliases maps common full language names to the canonical
-// two-letter code understood by the C++ language_to_id table.
-var languageNameAliases = map[string]string{
-	"english":    "en",
-	"russian":    "ru",
-	"chinese":    "zh",
-	"japanese":   "ja",
-	"korean":     "ko",
-	"german":     "de",
-	"french":     "fr",
-	"spanish":    "es",
-	"italian":    "it",
-	"portuguese": "pt",
-}
-
-// normalizeLanguage coerces a caller-supplied language into the canonical code
-// the model expects. It lowercases, trims, strips any region/locale suffix
-// (en-US, en_US, ja.JP -> en/ja), and resolves common full names (english -> en).
-// An empty input stays empty so the C++ side applies its English default; an
-// unrecognized value is returned normalized so C++ can log it and default.
-func normalizeLanguage(lang string) string {
-	lang = strings.ToLower(strings.TrimSpace(lang))
-	if lang == "" {
-		return ""
-	}
-
-	// Strip region/locale suffix: keep the segment before the first separator.
-	if i := strings.IndexAny(lang, "-_."); i >= 0 {
-		lang = lang[:i]
-	}
-
-	if code, ok := languageNameAliases[lang]; ok {
-		return code
-	}
-	return lang
+	opts loadOptions
+	// audioPath is the model-config reference voice (tts.audio_path), the
+	// default clone reference when a request omits an audio Voice.
+	audioPath string
 }

 func (q *Qwen3TtsCpp) Load(opts *pb.ModelOptions) error {
-	// ModelFile is the model directory path (containing GGUF files)
-	modelDir := opts.ModelFile
-	if modelDir == "" {
-		modelDir = opts.ModelPath
+	model := opts.ModelFile
+	if model == "" {
+		model = opts.ModelPath
+	}
+	if !filepath.IsAbs(model) && opts.ModelPath != "" {
+		model = filepath.Join(opts.ModelPath, model)
 	}

-	// Resolve relative paths
-	if !filepath.IsAbs(modelDir) && opts.ModelPath != "" {
-		modelDir = filepath.Join(opts.ModelPath, modelDir)
+	q.opts = parseOptions(opts.Options)
+
+	// Resolve the codec/tokenizer GGUF: explicit option, else auto-discover a
+	// *tokenizer*.gguf sibling of the talker model.
+	codec := q.opts.codecPath
+	if codec != "" && !filepath.IsAbs(codec) {
+		codec = filepath.Join(filepath.Dir(model), codec)
+	}
+	if codec == "" {
+		codec = discoverTokenizer(filepath.Dir(model))
+	}
+	if codec == "" {
+		return fmt.Errorf("qwen3-tts: no codec/tokenizer GGUF found; set option 'tokenizer:<file>'")
+	}
+	q.opts.codecPath = codec
+
+	q.audioPath = opts.AudioPath
+	if q.audioPath != "" && !filepath.IsAbs(q.audioPath) {
+		q.audioPath = filepath.Join(filepath.Dir(model), q.audioPath)
 	}

-	threads := int(opts.Threads)
-	if threads <= 0 {
-		threads = 4
+	useFA := boolToInt(q.opts.useFA)
+	clamp := boolToInt(q.opts.clampFP16)
+
+	fmt.Fprintf(os.Stderr, "[qwen3-tts-cpp] Load talker=%s codec=%s use_fa=%d clamp_fp16=%d\n",
+		model, codec, useFA, clamp)
+
+	if rc := CppLoad(model, codec, useFA, clamp); rc != 0 {
+		return fmt.Errorf("qwen3-tts: failed to load model (rc=%d)", rc)
 	}
-	q.threads = threads
-
-	fmt.Fprintf(os.Stderr, "[qwen3-tts-cpp] Loading models from: %s (threads=%d)\n", modelDir, threads)
-
-	if ret := CppLoadModel(modelDir, threads); ret != 0 {
-		return fmt.Errorf("failed to load qwen3-tts model (error code: %d)", ret)
-	}
-
 	return nil
 }

+// discoverTokenizer returns the path of the first non-directory entry in dir
+// whose name contains "tokenizer" and ends in ".gguf" (both matched
+// case-insensitively), or "" when dir cannot be read or nothing matches.
+// os.ReadDir returns entries sorted by filename, so the choice is
+// deterministic. Directories are skipped: a folder that happens to match the
+// pattern is not a loadable codec file.
+func discoverTokenizer(dir string) string {
+	entries, err := os.ReadDir(dir)
+	if err != nil {
+		return ""
+	}
+	for _, e := range entries {
+		if e.IsDir() {
+			continue
+		}
+		name := strings.ToLower(e.Name())
+		if strings.Contains(name, "tokenizer") && strings.HasSuffix(name, ".gguf") {
+			return filepath.Join(dir, e.Name())
+		}
+	}
+	return ""
+}
+
+// boolToInt maps true to 1 and false to 0, the integer flag form passed to
+// the C loader.
+func boolToInt(b bool) int {
+	n := 0
+	if b {
+		n = 1
+	}
+	return n
+}
+
+// optStr dereferences an optional string, yielding "" for a nil pointer.
+func optStr(p *string) string {
+	if p != nil {
+		return *p
+	}
+	return ""
+}
+
+// resolveRequest turns a TTSRequest into the inputs of a synthesis call.
+//
+// lang is the normalized request language and instruct the request
+// instructions (both "" when unset). resolveVoice splits req.Voice into a
+// named speaker or a reference-audio path; when it yields neither, the
+// config-level audioPath is used as the clone reference. A reference path, if
+// any, is decoded with readWAVAsFloat into ref, and a decode failure is
+// returned as err with the sampling settings left at their zero value.
+// refText comes from Params["ref_text"] and s from parseSampling, seeded with
+// the load-time seed.
+func (q *Qwen3TtsCpp) resolveRequest(req *pb.TTSRequest) (lang, instruct, speaker, refText string, ref []float32, s sampling, err error) {
+	lang = normalizeLanguage(optStr(req.Language))
+	instruct = optStr(req.Instructions)
+
+	voiceSpeaker, refPath := resolveVoice(req.Voice)
+	speaker = voiceSpeaker
+	if refPath == "" && speaker == "" {
+		// No per-request voice: fall back to the config clone reference
+		// (which may itself be empty, leaving refPath empty).
+		refPath = q.audioPath
+	}
+
+	if refPath != "" {
+		if ref, err = readWAVAsFloat(refPath); err != nil {
+			return lang, instruct, speaker, refText, ref, s, err
+		}
+	}
+
+	// Indexing a nil map is safe and yields "", so no nil guard is needed.
+	refText = req.Params["ref_text"]
+	s = parseSampling(req.Params, q.opts.seed)
+	return lang, instruct, speaker, refText, ref, s, nil
+}
+
 func (q *Qwen3TtsCpp) TTS(req *pb.TTSRequest) error {
-	text := req.Text
-	voice := req.Voice // reference audio path for voice cloning (empty = no cloning)
-	dst := req.Dst
-	language := ""
-	if req.Language != nil {
-		language = normalizeLanguage(*req.Language)
+	if req.Dst == "" {
+		return fmt.Errorf("qwen3-tts: TTS requires a destination path")
+	}
+	if req.Text == "" {
+		return fmt.Errorf("qwen3-tts: TTS requires text")
+	}
+	lang, instruct, speaker, refText, ref, s, err := q.resolveRequest(req)
+	if err != nil {
+		return err
+	}
+	var refPtr unsafe.Pointer
+	if len(ref) > 0 {
+		refPtr = unsafe.Pointer(&ref[0])
 	}

-	// Synthesis parameters with sensible defaults
-	temperature := float32(0.9)
-	topP := float32(0.8)
-	topK := 50
-	repetitionPenalty := float32(1.05)
-	maxAudioTokens := 4096
+	var n int32
+	ptr := CppTTS(req.Text, lang, instruct, speaker, refPtr, len(ref), refText,
+		s.seed, s.temperature, s.topK, s.topP, s.repPen, s.maxNew, unsafe.Pointer(&n))
+	runtimeKeepAlive(ref)
+	if ptr == 0 {
+		return fmt.Errorf("qwen3-tts: synthesis failed")
+	}
+	// Register the free as soon as we own a non-null buffer, so the n<=0 guard
+	// below cannot leak it (defensive: the C contract returns NULL on failure).
+	defer CppPCMFree(ptr)
+	if n <= 0 {
+		return fmt.Errorf("qwen3-tts: synthesis produced no samples")
+	}
+	src := unsafe.Slice((*float32)(unsafe.Pointer(ptr)), int(n)) //nolint:govet // C-allocated PCM, copied out before free
+	out := make([]float32, int(n))
+	copy(out, src)
+	return writeWAV24k(req.Dst, out)
+}

-	if ret := CppSynthesize(text, voice, dst, language,
-		temperature, topP, topK, repetitionPenalty,
-		maxAudioTokens, q.threads); ret != 0 {
-		return fmt.Errorf("failed to synthesize audio (error code: %d)", ret)
+// The stream* variables carry the active TTSStream channel to the single
+// shared C callback. base.SingleThread serializes TTS/TTSStream, so one global
+// slot is safe and avoids leaking a purego callback per request (purego
+// callbacks cannot be freed and are capped).
+var (
+	streamMu     sync.Mutex  // held by TTSStream for the duration of the C call
+	streamChan   chan []byte // results channel of the in-flight TTSStream; nil when idle
+	streamCbOnce sync.Once   // guards the one-time purego.NewCallback registration
+	streamCbPtr  uintptr     // C-callable pointer to streamCallback
+)
+
+// streamCallback is registered once and relays each PCM chunk it is handed
+// to streamChan, converted with floatToPCM16LE.
+func streamCallback(samples *float32, nSamples int32, _ uintptr) uintptr {
+	// Empty chunks, or chunks arriving while no stream is active, are dropped.
+	if samples != nil && nSamples > 0 && streamChan != nil {
+		// Copy out of C memory before returning.
+		pcm := append([]float32(nil), unsafe.Slice(samples, int(nSamples))...)
+		streamChan <- floatToPCM16LE(pcm)
+	}
+	return 1 // continue
+}
+
+func (q *Qwen3TtsCpp) TTSStream(req *pb.TTSRequest, results chan []byte) error {
+	defer close(results)
+	if req.Text == "" {
+		return fmt.Errorf("qwen3-tts: TTSStream requires text")
 	}

+	streamCbOnce.Do(func() {
+		streamCbPtr = purego.NewCallback(streamCallback)
+	})
+
+	lang, instruct, speaker, refText, ref, s, err := q.resolveRequest(req)
+	if err != nil {
+		return err
+	}
+	var refPtr unsafe.Pointer
+	if len(ref) > 0 {
+		refPtr = unsafe.Pointer(&ref[0])
+	}
+
+	// Emit the WAV header first so the HTTP layer gets a self-describing stream.
+	results <- wavHeader24k()
+
+	streamMu.Lock()
+	streamChan = results
+	rc := CppTTSStream(req.Text, lang, instruct, speaker, refPtr, len(ref), refText,
+		s.seed, s.temperature, s.topK, s.topP, s.repPen, s.maxNew, streamCbPtr, 0)
+	streamChan = nil
+	streamMu.Unlock()
+	runtimeKeepAlive(ref)
+
+	if rc != 0 {
+		return fmt.Errorf("qwen3-tts: streaming synthesis failed (rc=%d)", rc)
+	}
 	return nil
 }
--- a/backend/go/qwen3-tts-cpp/language_test.go
+++ b/backend/go/qwen3-tts-cpp/language_test.go
@@ -1,53 +0,0 @@
-package main
-
-import (
-	"testing"
-
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-func TestLanguageNormalization(t *testing.T) {
-	RegisterFailHandler(Fail)
-	RunSpecs(t, "qwen3-tts-cpp language normalization")
-}
-
-var _ = Describe("normalizeLanguage", func() {
-	DescribeTable("maps caller input to the canonical model language code",
-		func(input, expected string) {
-			Expect(normalizeLanguage(input)).To(Equal(expected))
-		},
-		// Canonical codes pass through unchanged
-		Entry("canonical en", "en", "en"),
-		Entry("canonical zh", "zh", "zh"),
-		Entry("canonical pt", "pt", "pt"),
-
-		// Case-insensitive
-		Entry("uppercase", "EN", "en"),
-		Entry("mixed case", "Ja", "ja"),
-
-		// Surrounding whitespace
-		Entry("trims whitespace", "  en  ", "en"),
-
-		// Region/locale stripping
-		Entry("BCP-47 region", "en-US", "en"),
-		Entry("underscore region", "en_US", "en"),
-		Entry("dotted locale", "ja.JP", "ja"),
-		Entry("region + case", "ZH-CN", "zh"),
-
-		// Full-name aliases
-		Entry("english name", "english", "en"),
-		Entry("chinese name cased", "Chinese", "zh"),
-		Entry("japanese name", "japanese", "ja"),
-		Entry("russian name", "russian", "ru"),
-		Entry("portuguese name", "portuguese", "pt"),
-
-		// Empty stays empty (C++ applies the English default)
-		Entry("empty", "", ""),
-		Entry("whitespace only", "   ", ""),
-
-		// Unknown values pass through normalized so C++ can log + default
-		Entry("unknown code", "klingon", "klingon"),
-		Entry("unknown with region", "xx-YY", "xx"),
-	)
-})
--- a/backend/go/qwen3-tts-cpp/main.go
+++ b/backend/go/qwen3-tts-cpp/main.go
@@ -19,24 +19,25 @@ type LibFuncs struct {
 }

 func main() {
-	// Get library name from environment variable, default to fallback
 	libName := os.Getenv("QWEN3TTS_LIBRARY")
 	if libName == "" {
 		libName = "./libgoqwen3ttscpp-fallback.so"
 	}

-	gosd, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
+	lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
 	if err != nil {
 		panic(err)
 	}

 	libFuncs := []LibFuncs{
-		{&CppLoadModel, "load_model"},
-		{&CppSynthesize, "synthesize"},
+		{&CppLoad, "qt3_load"},
+		{&CppTTS, "qt3_tts"},
+		{&CppTTSStream, "qt3_tts_stream"},
+		{&CppPCMFree, "qt3_pcm_free"},
+		{&CppUnload, "qt3_unload"},
 	}
-
 	for _, lf := range libFuncs {
-		purego.RegisterLibFunc(lf.FuncPtr, gosd, lf.Name)
+		purego.RegisterLibFunc(lf.FuncPtr, lib, lf.Name)
 	}

 	flag.Parse()
--- a/backend/go/qwen3-tts-cpp/options.go
+++ b/backend/go/qwen3-tts-cpp/options.go
@@ -0,0 +1,161 @@
+package main
+
+import (
+	"strconv"
+	"strings"
+)
+
+// loadOptions holds the parsed model-level options.
+type loadOptions struct {
+	codecPath string // "tokenizer:"/"codec:" option: codec/tokenizer GGUF path
+	useFA     bool   // "use_fa:" option; parseOptions defaults it to true
+	clampFP16 bool   // "clamp_fp16:" option; defaults to false
+	seed      int64  // "seed:" option; parseOptions defaults it to -1
+}
+
+// sampling holds per-request generation parameters with qt defaults applied.
+type sampling struct {
+	temperature float32 // "temperature" param, default 0.9
+	topK        int     // "top_k" param, default 50
+	topP        float32 // "top_p" param, default 1.0
+	repPen      float32 // "repetition_penalty" param, default 1.05
+	maxNew      int     // "max_new_tokens" param, default 2048
+	seed        int64   // "seed" param, default is the model-level seed
+}
+
+func splitOption(o string) (key, value string, ok bool) { // "key:value" -> trimmed parts
+	rawKey, rawValue, found := strings.Cut(o, ":")
+	if !found {
+		return "", "", false
+	}
+	return strings.TrimSpace(rawKey), strings.TrimSpace(rawValue), true
+}
+
+func parseBool(v string) bool { return v == "1" || strings.EqualFold(v, "true") } // "1", or "true" in any letter case
+
+// parseOptions decodes the backend "key:value" option slice into loadOptions.
+// Entries without a colon and unknown keys are skipped. Defaults: use_fa true
+// (qt default; CPU still uses the F32 chain), seed -1 (engine random).
+func parseOptions(opts []string) loadOptions {
+	parsed := loadOptions{useFA: true, seed: -1}
+	for _, entry := range opts {
+		key, value, ok := splitOption(entry)
+		switch {
+		case !ok:
+			// Not a "key:value" entry.
+		case key == "tokenizer" || key == "codec":
+			parsed.codecPath = value
+		case key == "use_fa":
+			parsed.useFA = parseBool(value)
+		case key == "clamp_fp16":
+			parsed.clampFP16 = parseBool(value)
+		case key == "seed":
+			// A malformed seed leaves the current value untouched.
+			if n, err := strconv.ParseInt(value, 10, 64); err == nil {
+				parsed.seed = n
+			}
+		}
+	}
+	return parsed
+}
+
+// languageAliases maps language codes, full names and aliases to the upstream
+// qwentts language names. "auto" maps to "" so the engine auto-detects.
+var languageAliases = map[string]string{
+	"english": "english", "en": "english",
+	"chinese": "chinese", "zh": "chinese", "mandarin": "chinese",
+	"japanese": "japanese", "ja": "japanese",
+	"korean": "korean", "ko": "korean",
+	"german": "german", "de": "german",
+	"french": "french", "fr": "french",
+	"spanish": "spanish", "es": "spanish",
+	"italian": "italian", "it": "italian",
+	"portuguese": "portuguese", "pt": "portuguese",
+	"russian": "russian", "ru": "russian",
+	"auto": "",
+}
+
+// normalizeLanguage trims and lowercases lang, drops everything from the first
+// '-', '_' or '.' (en-US -> en) and resolves the rest via languageAliases.
+// Empty stays empty (engine auto-detects); unknown values pass through.
+func normalizeLanguage(lang string) string {
+	code := strings.ToLower(strings.TrimSpace(lang))
+	// Keep only the part before the first region/locale separator, if any.
+	if cut := strings.IndexAny(code, "-_."); cut >= 0 {
+		code = code[:cut]
+	}
+	// "" is not a key of languageAliases, so empty input falls through
+	// to the final return and stays empty.
+	if name, known := languageAliases[code]; known {
+		return name
+	}
+	return code
+}
+
+var refAudioExts = []string{".wav", ".flac", ".mp3", ".ogg", ".m4a"}
+
+// resolveVoice classifies the request Voice field. After trimming, a value
+// whose lower-cased form ends in one of refAudioExts is a clone-reference
+// path; any other non-empty value is a named speaker (custom_voice).
+func resolveVoice(voice string) (speaker, refPath string) {
+	trimmed := strings.TrimSpace(voice)
+	folded := strings.ToLower(trimmed)
+	for _, ext := range refAudioExts {
+		// The path is returned with its original letter case.
+		if strings.HasSuffix(folded, ext) {
+			return "", trimmed
+		}
+	}
+	// Not an audio path: a speaker name, or "" when the input was empty, in
+	// which case neither a speaker nor a reference is reported.
+	return trimmed, ""
+}
+
+func parseFloat32(v string, def float32) float32 {
+	// def is returned for an empty v and for anything strconv rejects
+	// (out-of-range input is rejected too, since ParseFloat reports it).
+	if v != "" {
+		if parsed, err := strconv.ParseFloat(v, 32); err == nil {
+			return float32(parsed)
+		}
+	}
+	return def
+}
+
+func parseInt(v string, def int) int {
+	// An empty or non-integer v falls back to def.
+	if v == "" {
+		return def
+	}
+	if n, err := strconv.Atoi(v); err == nil {
+		return n
+	}
+	return def
+}
+
+func parseInt64(v string, def int64) int64 {
+	// Base-10, 64-bit parse; an empty or malformed v falls back to def.
+	if v == "" {
+		return def
+	}
+	if n, err := strconv.ParseInt(v, 10, 64); err == nil {
+		return n
+	}
+	return def
+}
+
+// parseSampling reads per-request sampling params from the TTSRequest params
+// map, applying qt defaults (matching qt_tts_default_params).
+func parseSampling(params map[string]string, defaultSeed int64) sampling {
+	// Each helper returns its second argument (the default) for "", which is
+	// what indexing yields for a missing key and for a nil map alike.
+	return sampling{
+		temperature: parseFloat32(params["temperature"], 0.9),
+		topK:        parseInt(params["top_k"], 50),
+		topP:        parseFloat32(params["top_p"], 1.0),
+		repPen:      parseFloat32(params["repetition_penalty"], 1.05),
+		maxNew:      parseInt(params["max_new_tokens"], 2048),
+		seed:        parseInt64(params["seed"], defaultSeed),
+	}
+}
--- a/backend/go/qwen3-tts-cpp/qwen3ttscpp_test.go
+++ b/backend/go/qwen3-tts-cpp/qwen3ttscpp_test.go
@@ -1,173 +1,136 @@
 package main

 import (
-	"context"
-	"os"
-	"os/exec"
-	"path/filepath"
+	"bytes"
+	"encoding/binary"
 	"testing"
-	"time"

-	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
-	"google.golang.org/grpc"
-	"google.golang.org/grpc/credentials/insecure"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
 )

-const (
-	testAddr    = "localhost:50051"
-	startupWait = 5 * time.Second
-)
-
-func skipIfNoModel(t *testing.T) string {
-	t.Helper()
-	modelDir := os.Getenv("QWEN3TTS_MODEL_DIR")
-	if modelDir == "" {
-		t.Skip("QWEN3TTS_MODEL_DIR not set, skipping test (set to directory with GGUF models)")
-	}
-	if _, err := os.Stat(filepath.Join(modelDir, "qwen3-tts-0.6b-f16.gguf")); os.IsNotExist(err) {
-		t.Skipf("TTS model file not found in %s, skipping", modelDir)
-	}
-	if _, err := os.Stat(filepath.Join(modelDir, "qwen3-tts-tokenizer-f16.gguf")); os.IsNotExist(err) {
-		t.Skipf("Tokenizer model file not found in %s, skipping", modelDir)
-	}
-	return modelDir
+func TestQwen3TtsCpp(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "qwen3-tts-cpp suite")
 }

-func startServer(t *testing.T) *exec.Cmd {
-	t.Helper()
-	binary := os.Getenv("QWEN3TTS_BINARY")
-	if binary == "" {
-		binary = "./qwen3-tts-cpp"
-	}
-	if _, err := os.Stat(binary); os.IsNotExist(err) {
-		t.Skipf("Backend binary not found at %s, skipping", binary)
-	}
-	cmd := exec.Command(binary, "--addr", testAddr)
-	cmd.Stdout = os.Stderr
-	cmd.Stderr = os.Stderr
-	if err := cmd.Start(); err != nil {
-		t.Fatalf("Failed to start server: %v", err)
-	}
-	time.Sleep(startupWait)
-	return cmd
-}
-
-func stopServer(cmd *exec.Cmd) {
-	if cmd != nil && cmd.Process != nil {
-		cmd.Process.Kill()
-		cmd.Wait()
-	}
-}
-
-func dialGRPC(t *testing.T) *grpc.ClientConn {
-	t.Helper()
-	conn, err := grpc.Dial(testAddr,
-		grpc.WithTransportCredentials(insecure.NewCredentials()),
-		grpc.WithDefaultCallOptions(
-			grpc.MaxCallRecvMsgSize(50*1024*1024),
-			grpc.MaxCallSendMsgSize(50*1024*1024),
-		),
+var _ = Describe("normalizeLanguage", func() {
+	DescribeTable("maps caller language to qwentts language names",
+		func(in, want string) {
+			Expect(normalizeLanguage(in)).To(Equal(want))
+		},
+		Entry("empty stays empty", "", ""),
+		Entry("auto maps to empty", "auto", ""),
+		Entry("english full name", "English", "english"),
+		Entry("english code", "en", "english"),
+		Entry("locale suffix stripped", "en-US", "english"),
+		Entry("underscore locale", "zh_CN", "chinese"),
+		Entry("mandarin alias", "mandarin", "chinese"),
+		Entry("japanese already full", "japanese", "japanese"),
+		Entry("unknown passes through normalized", "xx", "xx"),
 	)
-	if err != nil {
-		t.Fatalf("Failed to dial gRPC: %v", err)
-	}
-	return conn
-}
+})

-func TestServerHealth(t *testing.T) {
-	cmd := startServer(t)
-	defer stopServer(cmd)
-
-	conn := dialGRPC(t)
-	defer conn.Close()
-
-	client := pb.NewBackendClient(conn)
-	resp, err := client.Health(context.Background(), &pb.HealthMessage{})
-	if err != nil {
-		t.Fatalf("Health check failed: %v", err)
-	}
-	if string(resp.Message) != "OK" {
-		t.Fatalf("Expected OK, got %s", string(resp.Message))
-	}
-}
-
-func TestLoadModel(t *testing.T) {
-	modelDir := skipIfNoModel(t)
-	cmd := startServer(t)
-	defer stopServer(cmd)
-
-	conn := dialGRPC(t)
-	defer conn.Close()
-
-	client := pb.NewBackendClient(conn)
-
-	resp, err := client.LoadModel(context.Background(), &pb.ModelOptions{
-		ModelFile: modelDir,
-		Threads:   4,
+var _ = Describe("resolveVoice", func() {
+	It("treats a bare token as a named speaker", func() {
+		sp, ref := resolveVoice("serena")
+		Expect(sp).To(Equal("serena"))
+		Expect(ref).To(BeEmpty())
 	})
-	if err != nil {
-		t.Fatalf("LoadModel failed: %v", err)
-	}
-	if !resp.Success {
-		t.Fatalf("LoadModel returned failure: %s", resp.Message)
-	}
-}
-
-func TestTTS(t *testing.T) {
-	modelDir := skipIfNoModel(t)
-
-	tmpDir, err := os.MkdirTemp("", "qwen3tts-test")
-	if err != nil {
-		t.Fatal(err)
-	}
-	t.Cleanup(func() { os.RemoveAll(tmpDir) })
-
-	outputFile := filepath.Join(tmpDir, "output.wav")
-
-	cmd := startServer(t)
-	defer stopServer(cmd)
-
-	conn := dialGRPC(t)
-	defer conn.Close()
-
-	client := pb.NewBackendClient(conn)
-
-	// Load models
-	loadResp, err := client.LoadModel(context.Background(), &pb.ModelOptions{
-		ModelFile: modelDir,
-		Threads:   4,
+	It("treats an audio path as a clone reference (case-insensitive ext)", func() {
+		sp, ref := resolveVoice("/x/ref.WAV")
+		Expect(sp).To(BeEmpty())
+		Expect(ref).To(Equal("/x/ref.WAV"))
 	})
-	if err != nil {
-		t.Fatalf("LoadModel failed: %v", err)
-	}
-	if !loadResp.Success {
-		t.Fatalf("LoadModel returned failure: %s", loadResp.Message)
-	}
-
-	// Synthesize speech
-	language := "en"
-	_, err = client.TTS(context.Background(), &pb.TTSRequest{
-		Text:     "Hello, this is a test of the Qwen3 text to speech system.",
-		Dst:      outputFile,
-		Language: &language,
+	It("recognizes mp3/flac/ogg/m4a", func() {
+		for _, p := range []string{"a.mp3", "b.flac", "c.ogg", "d.m4a"} {
+			sp, ref := resolveVoice(p)
+			Expect(sp).To(BeEmpty())
+			Expect(ref).To(Equal(p))
+		}
 	})
-	if err != nil {
-		t.Fatalf("TTS failed: %v", err)
-	}
+	It("returns empty for empty input", func() {
+		sp, ref := resolveVoice("  ")
+		Expect(sp).To(BeEmpty())
+		Expect(ref).To(BeEmpty())
+	})
+})

-	// Verify output file exists and has content
-	info, err := os.Stat(outputFile)
-	if os.IsNotExist(err) {
-		t.Fatal("Output audio file was not created")
-	}
-	if err != nil {
-		t.Fatalf("Failed to stat output file: %v", err)
-	}
+var _ = Describe("parseOptions", func() {
+	It("extracts codec, use_fa, clamp_fp16, seed", func() {
+		o := parseOptions([]string{
+			"tokenizer:tok.gguf", "use_fa:false", "clamp_fp16:true",
+			"seed:7", "unknown:ignored",
+		})
+		Expect(o.codecPath).To(Equal("tok.gguf"))
+		Expect(o.useFA).To(BeFalse())
+		Expect(o.clampFP16).To(BeTrue())
+		Expect(o.seed).To(Equal(int64(7)))
+	})
+	It("accepts codec: as an alias for tokenizer:", func() {
+		Expect(parseOptions([]string{"codec:c.gguf"}).codecPath).To(Equal("c.gguf"))
+	})
+	It("defaults use_fa true and seed -1", func() {
+		o := parseOptions(nil)
+		Expect(o.useFA).To(BeTrue())
+		Expect(o.seed).To(Equal(int64(-1)))
+	})
+})

-	t.Logf("Output file size: %d bytes", info.Size())
+var _ = Describe("parseSampling", func() {
+	It("applies qt defaults when params are absent", func() {
+		s := parseSampling(nil, -1)
+		Expect(s.temperature).To(BeNumerically("~", 0.9, 1e-6))
+		Expect(s.topK).To(Equal(50))
+		Expect(s.topP).To(BeNumerically("~", 1.0, 1e-6))
+		Expect(s.repPen).To(BeNumerically("~", 1.05, 1e-6))
+		Expect(s.maxNew).To(Equal(2048))
+		Expect(s.seed).To(Equal(int64(-1)))
+	})
+	It("reads overrides and falls back to default seed", func() {
+		s := parseSampling(map[string]string{
+			"temperature": "0.5", "top_k": "10", "top_p": "0.8",
+			"repetition_penalty": "1.2", "max_new_tokens": "512",
+		}, 99)
+		Expect(s.temperature).To(BeNumerically("~", 0.5, 1e-6))
+		Expect(s.topK).To(Equal(10))
+		Expect(s.topP).To(BeNumerically("~", 0.8, 1e-6))
+		Expect(s.repPen).To(BeNumerically("~", 1.2, 1e-6))
+		Expect(s.maxNew).To(Equal(512))
+		Expect(s.seed).To(Equal(int64(99)))
+	})
+	It("reads an explicit seed override", func() {
+		Expect(parseSampling(map[string]string{"seed": "123"}, -1).seed).To(Equal(int64(123)))
+	})
+})

-	// WAV header is 44 bytes minimum; any real audio should be much larger
-	if info.Size() < 1000 {
-		t.Errorf("Output file too small (%d bytes), expected real audio data", info.Size())
-	}
-}
+var _ = Describe("wavHeader24k", func() {
+	It("emits a 44-byte streaming WAV header at 24 kHz mono 16-bit", func() {
+		h := wavHeader24k()
+		Expect(h).To(HaveLen(44))
+		Expect(string(h[0:4])).To(Equal("RIFF"))
+		Expect(string(h[8:12])).To(Equal("WAVE"))
+		Expect(string(h[12:16])).To(Equal("fmt "))
+		Expect(string(h[36:40])).To(Equal("data"))
+		var sampleRate uint32
+		Expect(binary.Read(bytes.NewReader(h[24:28]), binary.LittleEndian, &sampleRate)).To(Succeed())
+		Expect(sampleRate).To(Equal(uint32(24000)))
+	})
+})
+
+var _ = Describe("floatToPCM16LE", func() {
+	It("clamps and converts float PCM to little-endian int16 bytes", func() {
+		b := floatToPCM16LE([]float32{0, 1.0, -1.0, 2.0, -2.0})
+		Expect(b).To(HaveLen(10))
+		read := func(off int) int16 {
+			var v int16
+			_ = binary.Read(bytes.NewReader(b[off:off+2]), binary.LittleEndian, &v)
+			return v
+		}
+		Expect(read(0)).To(Equal(int16(0)))
+		Expect(read(2)).To(Equal(int16(32767)))
+		Expect(read(4)).To(Equal(int16(-32767)))
+		Expect(read(6)).To(Equal(int16(32767)))  // clamped from 2.0
+		Expect(read(8)).To(Equal(int16(-32767))) // clamped from -2.0
+	})
+})
--- a/backend/go/qwen3-tts-cpp/test.sh
+++ b/backend/go/qwen3-tts-cpp/test.sh
@@ -2,51 +2,30 @@
 set -e

 CURDIR=$(dirname "$(realpath $0)")
+cd "$CURDIR"

 echo "Running qwen3-tts-cpp backend tests..."

-# The test requires:
-#   - QWEN3TTS_MODEL_DIR: path to directory containing GGUF model files
-#   - QWEN3TTS_BINARY: path to the qwen3-tts-cpp binary (defaults to ./qwen3-tts-cpp)
-#
-# Tests that require the model will be skipped if QWEN3TTS_MODEL_DIR is not set
-# or the directory does not contain the required model files.
-
-cd "$CURDIR"
-
-# Only auto-download models when QWEN3TTS_MODEL_DIR is not explicitly set
-if [ -z "$QWEN3TTS_MODEL_DIR" ]; then
-    export QWEN3TTS_MODEL_DIR="./qwen3-tts-models"
-
-    if [ ! -d "$QWEN3TTS_MODEL_DIR" ]; then
-        echo "Creating qwen3-tts-models directory for tests..."
-        mkdir -p "$QWEN3TTS_MODEL_DIR"
-        REPO_ID="endo5501/qwen3-tts.cpp"
-        echo "Repository: ${REPO_ID}"
-        echo ""
-
-        # Files to download (smallest model for testing)
-        FILES=(
-            "qwen3-tts-0.6b-f16.gguf"
-            "qwen3-tts-tokenizer-f16.gguf"
-        )
-
-        BASE_URL="https://huggingface.co/${REPO_ID}/resolve/main"
-
-        for file in "${FILES[@]}"; do
-            dest="${QWEN3TTS_MODEL_DIR}/${file}"
-            if [ -f "${dest}" ]; then
-                echo "  [skip] ${file} (already exists)"
-            else
-                echo "  [download] ${file}..."
-                curl -L -o "${dest}" "${BASE_URL}/${file}" --progress-bar
-                echo "  [done] ${file}"
-            fi
-        done
-    fi
+# Auto-download a small model pair only when QWEN3TTS_MODEL is not set.
+if [ -z "$QWEN3TTS_MODEL" ]; then
+    MODEL_DIR="./qwen3-tts-models"
+    mkdir -p "$MODEL_DIR"
+    REPO_ID="Serveurperso/Qwen3-TTS-GGUF"
+    BASE_URL="https://huggingface.co/${REPO_ID}/resolve/main"
+    FILES=( "qwen-talker-0.6b-base-Q4_K_M.gguf" "qwen-tokenizer-12hz-Q4_K_M.gguf" )
+    for file in "${FILES[@]}"; do
+        dest="${MODEL_DIR}/${file}"
+        if [ -f "${dest}" ]; then
+            echo "  [skip] ${file}"
+        else
+            echo "  [download] ${file}..."
+            curl -L -o "${dest}" "${BASE_URL}/${file}" --progress-bar
+        fi
+    done
+    export QWEN3TTS_MODEL="${MODEL_DIR}/qwen-talker-0.6b-base-Q4_K_M.gguf"
+    export QWEN3TTS_CODEC="${MODEL_DIR}/qwen-tokenizer-12hz-Q4_K_M.gguf"
 fi

-# Run Go tests
-go test -v -timeout 600s .
+go test -v -timeout 1200s .

 echo "All qwen3-tts-cpp tests passed."
--- a/backend/go/sherpa-onnx/backend.go
+++ b/backend/go/sherpa-onnx/backend.go
@@ -62,7 +62,7 @@ var (
 	shimVadConfigSetDebug                    func(uintptr, int32)
 	shimCreateVad                            func(uintptr, float32) uintptr

-	// TTS (offline, VITS) config
+	// TTS (offline, VITS/Piper and Kokoro) config
 	shimTtsConfigNew                  func() uintptr
 	shimTtsConfigFree                 func(uintptr)
 	shimTtsConfigSetVitsModel         func(uintptr, string)
@@ -76,6 +76,14 @@ var (
 	shimTtsConfigSetDebug             func(uintptr, int32)
 	shimTtsConfigSetProvider          func(uintptr, string)
 	shimTtsConfigSetMaxNumSentences   func(uintptr, int32)
+	shimTtsConfigSetKokoroModel       func(uintptr, string)
+	shimTtsConfigSetKokoroVoices      func(uintptr, string)
+	shimTtsConfigSetKokoroTokens      func(uintptr, string)
+	shimTtsConfigSetKokoroDataDir     func(uintptr, string)
+	shimTtsConfigSetKokoroDictDir     func(uintptr, string)
+	shimTtsConfigSetKokoroLexicon     func(uintptr, string)
+	shimTtsConfigSetKokoroLang        func(uintptr, string)
+	shimTtsConfigSetKokoroLengthScale func(uintptr, float32)
 	shimCreateOfflineTts              func(uintptr) uintptr

 	// Offline recognizer config
@@ -101,37 +109,37 @@ var (
 	shimCreateOfflineRecognizer                  func(uintptr) uintptr

 	// Online recognizer config
-	shimOnlineRecogConfigNew                      func() uintptr
-	shimOnlineRecogConfigFree                     func(uintptr)
-	shimOnlineRecogConfigSetTransducerEncoder     func(uintptr, string)
-	shimOnlineRecogConfigSetTransducerDecoder     func(uintptr, string)
-	shimOnlineRecogConfigSetTransducerJoiner      func(uintptr, string)
-	shimOnlineRecogConfigSetTokens                func(uintptr, string)
-	shimOnlineRecogConfigSetNumThreads            func(uintptr, int32)
-	shimOnlineRecogConfigSetDebug                 func(uintptr, int32)
-	shimOnlineRecogConfigSetProvider              func(uintptr, string)
-	shimOnlineRecogConfigSetFeatSampleRate        func(uintptr, int32)
-	shimOnlineRecogConfigSetFeatFeatureDim        func(uintptr, int32)
-	shimOnlineRecogConfigSetDecodingMethod        func(uintptr, string)
-	shimOnlineRecogConfigSetEnableEndpoint        func(uintptr, int32)
+	shimOnlineRecogConfigNew                        func() uintptr
+	shimOnlineRecogConfigFree                       func(uintptr)
+	shimOnlineRecogConfigSetTransducerEncoder       func(uintptr, string)
+	shimOnlineRecogConfigSetTransducerDecoder       func(uintptr, string)
+	shimOnlineRecogConfigSetTransducerJoiner        func(uintptr, string)
+	shimOnlineRecogConfigSetTokens                  func(uintptr, string)
+	shimOnlineRecogConfigSetNumThreads              func(uintptr, int32)
+	shimOnlineRecogConfigSetDebug                   func(uintptr, int32)
+	shimOnlineRecogConfigSetProvider                func(uintptr, string)
+	shimOnlineRecogConfigSetFeatSampleRate          func(uintptr, int32)
+	shimOnlineRecogConfigSetFeatFeatureDim          func(uintptr, int32)
+	shimOnlineRecogConfigSetDecodingMethod          func(uintptr, string)
+	shimOnlineRecogConfigSetEnableEndpoint          func(uintptr, int32)
 	shimOnlineRecogConfigSetRule1MinTrailingSilence func(uintptr, float32)
 	shimOnlineRecogConfigSetRule2MinTrailingSilence func(uintptr, float32)
 	shimOnlineRecogConfigSetRule3MinUtteranceLength func(uintptr, float32)
-	shimCreateOnlineRecognizer                    func(uintptr) uintptr
+	shimCreateOnlineRecognizer                      func(uintptr) uintptr

 	// Result accessors. Pointer returns use unsafe.Pointer so Go's
 	// vet checker doesn't flag them — the returned memory is C-owned,
 	// not subject to Go GC motion.
-	shimWaveSampleRate            func(uintptr) int32
-	shimWaveNumSamples            func(uintptr) int32
-	shimWaveSamples               func(uintptr) unsafe.Pointer
-	shimOfflineResultText         func(uintptr) unsafe.Pointer
-	shimOnlineResultText          func(uintptr) unsafe.Pointer
-	shimGeneratedAudioSampleRate  func(uintptr) int32
-	shimGeneratedAudioN           func(uintptr) int32
-	shimGeneratedAudioSamples     func(uintptr) unsafe.Pointer
-	shimSpeechSegmentStart        func(uintptr) int32
-	shimSpeechSegmentN            func(uintptr) int32
+	shimWaveSampleRate           func(uintptr) int32
+	shimWaveNumSamples           func(uintptr) int32
+	shimWaveSamples              func(uintptr) unsafe.Pointer
+	shimOfflineResultText        func(uintptr) unsafe.Pointer
+	shimOnlineResultText         func(uintptr) unsafe.Pointer
+	shimGeneratedAudioSampleRate func(uintptr) int32
+	shimGeneratedAudioN          func(uintptr) int32
+	shimGeneratedAudioSamples    func(uintptr) unsafe.Pointer
+	shimSpeechSegmentStart       func(uintptr) int32
+	shimSpeechSegmentN           func(uintptr) int32

 	// TTS streaming callback trampoline
 	shimTtsGenerateWithCallback func(tts uintptr, text string, sid int32, speed float32, cb uintptr, ud uintptr) uintptr
@@ -161,13 +169,13 @@ var (
 // pointer returned by the shim or `unsafe.Pointer(&slice[0])` from Go.
 var (
 	// VAD
-	sherpaVadAcceptWaveform       func(vad uintptr, samples unsafe.Pointer, n int32)
-	sherpaVadReset                func(vad uintptr)
-	sherpaVadFlush                func(vad uintptr)
-	sherpaVadEmpty                func(vad uintptr) int32
-	sherpaVadFront                func(vad uintptr) uintptr
-	sherpaVadPop                  func(vad uintptr)
-	sherpaDestroySpeechSegment    func(seg uintptr)
+	sherpaVadAcceptWaveform    func(vad uintptr, samples unsafe.Pointer, n int32)
+	sherpaVadReset             func(vad uintptr)
+	sherpaVadFlush             func(vad uintptr)
+	sherpaVadEmpty             func(vad uintptr) int32
+	sherpaVadFront             func(vad uintptr) uintptr
+	sherpaVadPop               func(vad uintptr)
+	sherpaDestroySpeechSegment func(seg uintptr)

 	// Wave IO
 	sherpaReadWave  func(filename string) uintptr
@@ -175,11 +183,11 @@ var (
 	sherpaWriteWave func(samples unsafe.Pointer, n int32, sampleRate int32, filename string) int32

 	// Offline ASR
-	sherpaCreateOfflineStream           func(rec uintptr) uintptr
-	sherpaDestroyOfflineStream          func(stream uintptr)
-	sherpaAcceptWaveformOffline         func(stream uintptr, sr int32, samples unsafe.Pointer, n int32)
-	sherpaDecodeOfflineStream           func(rec uintptr, stream uintptr)
-	sherpaGetOfflineStreamResult        func(stream uintptr) uintptr
+	sherpaCreateOfflineStream            func(rec uintptr) uintptr
+	sherpaDestroyOfflineStream           func(stream uintptr)
+	sherpaAcceptWaveformOffline          func(stream uintptr, sr int32, samples unsafe.Pointer, n int32)
+	sherpaDecodeOfflineStream            func(rec uintptr, stream uintptr)
+	sherpaGetOfflineStreamResult         func(stream uintptr) uintptr
 	sherpaDestroyOfflineRecognizerResult func(result uintptr)

 	// Online ASR
@@ -195,21 +203,21 @@ var (
 	sherpaOnlineStreamInputFinished     func(stream uintptr)

 	// TTS
-	sherpaOfflineTtsGenerate             func(tts uintptr, text string, sid int32, speed float32) uintptr
+	sherpaOfflineTtsGenerate              func(tts uintptr, text string, sid int32, speed float32) uintptr
 	sherpaDestroyOfflineTtsGeneratedAudio func(audio uintptr)
-	sherpaOfflineTtsSampleRate           func(tts uintptr) int32
+	sherpaOfflineTtsSampleRate            func(tts uintptr) int32

 	// Offline speaker diarization. Result handle owns the segment-array
 	// pointer returned by ResultSortByStartTime; destroy the segment
 	// array first, then the result, then (at backend Free()) the diarizer.
-	sherpaDestroyOfflineSpeakerDiarization                 func(sd uintptr)
-	sherpaOfflineSpeakerDiarizationGetSampleRate           func(sd uintptr) int32
-	sherpaOfflineSpeakerDiarizationProcess                 func(sd uintptr, samples unsafe.Pointer, n int32) uintptr
-	sherpaOfflineSpeakerDiarizationResultGetNumSegments    func(result uintptr) int32
-	sherpaOfflineSpeakerDiarizationResultGetNumSpeakers    func(result uintptr) int32
-	sherpaOfflineSpeakerDiarizationResultSortByStartTime   func(result uintptr) uintptr
-	sherpaOfflineSpeakerDiarizationDestroySegment          func(segs uintptr)
-	sherpaDestroyOfflineSpeakerDiarizationResult           func(result uintptr)
+	sherpaDestroyOfflineSpeakerDiarization               func(sd uintptr)
+	sherpaOfflineSpeakerDiarizationGetSampleRate         func(sd uintptr) int32
+	sherpaOfflineSpeakerDiarizationProcess               func(sd uintptr, samples unsafe.Pointer, n int32) uintptr
+	sherpaOfflineSpeakerDiarizationResultGetNumSegments  func(result uintptr) int32
+	sherpaOfflineSpeakerDiarizationResultGetNumSpeakers  func(result uintptr) int32
+	sherpaOfflineSpeakerDiarizationResultSortByStartTime func(result uintptr) uintptr
+	sherpaOfflineSpeakerDiarizationDestroySegment        func(segs uintptr)
+	sherpaDestroyOfflineSpeakerDiarizationResult         func(result uintptr)
 )

 var (
@@ -278,6 +286,14 @@ func loadSherpaLibsOnce() error {
 		{&shimTtsConfigSetDebug, "sherpa_shim_tts_config_set_debug"},
 		{&shimTtsConfigSetProvider, "sherpa_shim_tts_config_set_provider"},
 		{&shimTtsConfigSetMaxNumSentences, "sherpa_shim_tts_config_set_max_num_sentences"},
+		{&shimTtsConfigSetKokoroModel, "sherpa_shim_tts_config_set_kokoro_model"},
+		{&shimTtsConfigSetKokoroVoices, "sherpa_shim_tts_config_set_kokoro_voices"},
+		{&shimTtsConfigSetKokoroTokens, "sherpa_shim_tts_config_set_kokoro_tokens"},
+		{&shimTtsConfigSetKokoroDataDir, "sherpa_shim_tts_config_set_kokoro_data_dir"},
+		{&shimTtsConfigSetKokoroDictDir, "sherpa_shim_tts_config_set_kokoro_dict_dir"},
+		{&shimTtsConfigSetKokoroLexicon, "sherpa_shim_tts_config_set_kokoro_lexicon"},
+		{&shimTtsConfigSetKokoroLang, "sherpa_shim_tts_config_set_kokoro_lang"},
+		{&shimTtsConfigSetKokoroLengthScale, "sherpa_shim_tts_config_set_kokoro_length_scale"},
 		{&shimCreateOfflineTts, "sherpa_shim_create_offline_tts"},

 		{&shimOfflineRecogConfigNew, "sherpa_shim_offline_recog_config_new"},
@@ -688,21 +704,14 @@ func (s *SherpaBackend) loadTTS(opts *pb.ModelOptions) error {
 	cfg := shimTtsConfigNew()
 	defer shimTtsConfigFree(cfg)

-	shimTtsConfigSetVitsModel(cfg, modelFile)
-
-	if tokensPath := filepath.Join(modelDir, "tokens.txt"); fileExists(tokensPath) {
-		shimTtsConfigSetVitsTokens(cfg, tokensPath)
+	// Kokoro models ship a voices style file alongside the ONNX, whereas
+	// VITS/Piper voices do not. That presence is what tells the two model
+	// families apart, since both arrive as a plain *.onnx in modelDir.
+	if isKokoroModel(modelDir) {
+		s.configureKokoroTTS(cfg, opts, modelFile, modelDir)
+	} else {
+		s.configureVitsTTS(cfg, opts, modelFile, modelDir)
 	}
-	if lexiconPath := filepath.Join(modelDir, "lexicon.txt"); fileExists(lexiconPath) {
-		shimTtsConfigSetVitsLexicon(cfg, lexiconPath)
-	}
-	if dataDir := filepath.Join(modelDir, "espeak-ng-data"); dirExists(dataDir) {
-		shimTtsConfigSetVitsDataDir(cfg, dataDir)
-	}
-
-	shimTtsConfigSetVitsNoiseScale(cfg, findOptionFloat(opts, optionTtsNoiseScale, 0.667))
-	shimTtsConfigSetVitsNoiseScaleW(cfg, findOptionFloat(opts, optionTtsNoiseScaleW, 0.8))
-	shimTtsConfigSetVitsLengthScale(cfg, findOptionFloat(opts, optionTtsLengthScale, 1.0))

 	threads := int32(1)
 	if opts.Threads != 0 {
@@ -723,6 +732,80 @@ func (s *SherpaBackend) loadTTS(opts *pb.ModelOptions) error {
 	return nil
 }

+// kokoroVoicesFile is the speaker-style bank that ships with Kokoro models and
+// is absent from VITS/Piper voices; its presence is how loadTTS tells them apart.
+const kokoroVoicesFile = "voices.bin"
+
+// isKokoroModel reports whether modelDir directly contains the Kokoro voices
+// bank (kokoroVoicesFile) as a non-directory entry, per fileExists.
+func isKokoroModel(modelDir string) bool {
+	return fileExists(filepath.Join(modelDir, kokoroVoicesFile))
+}
+
+// configureVitsTTS fills cfg for a VITS/Piper model: the ONNX graph is always
+// set; tokens.txt, lexicon.txt and espeak-ng-data only when found in modelDir.
+func (s *SherpaBackend) configureVitsTTS(cfg uintptr, opts *pb.ModelOptions, modelFile, modelDir string) {
+	shimTtsConfigSetVitsModel(cfg, modelFile)
+
+	if tokens := filepath.Join(modelDir, "tokens.txt"); fileExists(tokens) {
+		shimTtsConfigSetVitsTokens(cfg, tokens)
+	}
+	if lexicon := filepath.Join(modelDir, "lexicon.txt"); fileExists(lexicon) {
+		shimTtsConfigSetVitsLexicon(cfg, lexicon)
+	}
+	if espeakData := filepath.Join(modelDir, "espeak-ng-data"); dirExists(espeakData) {
+		shimTtsConfigSetVitsDataDir(cfg, espeakData)
+	}
+	// Noise and length scales are looked up in opts via findOptionFloat.
+	shimTtsConfigSetVitsNoiseScale(cfg, findOptionFloat(opts, optionTtsNoiseScale, 0.667))
+	shimTtsConfigSetVitsNoiseScaleW(cfg, findOptionFloat(opts, optionTtsNoiseScaleW, 0.8))
+	shimTtsConfigSetVitsLengthScale(cfg, findOptionFloat(opts, optionTtsLengthScale, 1.0))
+}
+
+// configureKokoroTTS fills cfg for a Kokoro model: the ONNX graph and the
+// voices bank are always set, while tokens.txt, espeak-ng-data, the jieba dict
+// directory and the lexicons the multi-lingual packs ship are set only when
+// found in modelDir. A non-empty `language=` option becomes the language hint.
+func (s *SherpaBackend) configureKokoroTTS(cfg uintptr, opts *pb.ModelOptions, modelFile, modelDir string) {
+	shimTtsConfigSetKokoroModel(cfg, modelFile)
+	shimTtsConfigSetKokoroVoices(cfg, filepath.Join(modelDir, kokoroVoicesFile))
+
+	if tokens := filepath.Join(modelDir, "tokens.txt"); fileExists(tokens) {
+		shimTtsConfigSetKokoroTokens(cfg, tokens)
+	}
+	if espeakData := filepath.Join(modelDir, "espeak-ng-data"); dirExists(espeakData) {
+		shimTtsConfigSetKokoroDataDir(cfg, espeakData)
+	}
+	if dict := filepath.Join(modelDir, "dict"); dirExists(dict) {
+		shimTtsConfigSetKokoroDictDir(cfg, dict)
+	}
+
+	// Lexicons reach the C API as a single comma-separated list. US and GB
+	// English overlap almost entirely, so only one of them is passed (US when
+	// it exists, otherwise GB) to avoid tens of thousands of "duplicated word"
+	// warnings at load. The zh and generic lexicons are additive and follow
+	// the English one, each only if the file is present.
+	english := "lexicon-us-en.txt"
+	if !fileExists(filepath.Join(modelDir, english)) {
+		english = "lexicon-gb-en.txt"
+	}
+	var found []string
+	for _, name := range []string{english, "lexicon-zh.txt", "lexicon.txt"} {
+		if p := filepath.Join(modelDir, name); fileExists(p) {
+			found = append(found, p)
+		}
+	}
+	if len(found) > 0 {
+		shimTtsConfigSetKokoroLexicon(cfg, strings.Join(found, ","))
+	}
+
+	langHint := findOptionValue(opts, optionLanguage, "")
+	if langHint != "" {
+		shimTtsConfigSetKokoroLang(cfg, langHint)
+	}
+	shimTtsConfigSetKokoroLengthScale(cfg, findOptionFloat(opts, optionTtsLengthScale, 1.0))
+}
+
 func fileExists(p string) bool {
 	info, err := os.Stat(p)
 	return err == nil && !info.IsDir()
@@ -1252,7 +1335,7 @@ type ttsStreamState struct {
 var (
 	ttsStates      sync.Map // uint64 → *ttsStreamState
 	ttsNextID      atomic.Uint64
-	ttsCallbackPtr uintptr  // purego.NewCallback return; registered in loadSherpaLibs
+	ttsCallbackPtr uintptr // purego.NewCallback return; registered in loadSherpaLibs
 )

 // ttsStreamCallback is invoked by sherpa-onnx for each PCM chunk VITS
--- a/backend/go/sherpa-onnx/backend_test.go
+++ b/backend/go/sherpa-onnx/backend_test.go
@@ -124,6 +124,20 @@ var _ = Describe("Sherpa-ONNX", func() {
 			Entry("empty", "", false),
 			Entry("other", "other", false),
 		)
+
+		It("isKokoroModel detects a voices file beside the ONNX", func() {
+			dir, err := os.MkdirTemp("", "sherpa-kokoro-*")
+			Expect(err).NotTo(HaveOccurred())
+			defer func() { _ = os.RemoveAll(dir) }()
+
+			// A bare VITS/Piper directory (ONNX only) is not Kokoro.
+			Expect(os.WriteFile(filepath.Join(dir, "model.onnx"), []byte("x"), 0o600)).To(Succeed())
+			Expect(isKokoroModel(dir)).To(BeFalse())
+
+			// Adding the Kokoro voices bank flips detection on.
+			Expect(os.WriteFile(filepath.Join(dir, kokoroVoicesFile), []byte("x"), 0o600)).To(Succeed())
+			Expect(isKokoroModel(dir)).To(BeTrue())
+		})
 	})

 	Context("option parsing", func() {
--- a/backend/go/sherpa-onnx/csrc/shim.c
+++ b/backend/go/sherpa-onnx/csrc/shim.c
@@ -79,6 +79,13 @@ void sherpa_shim_tts_config_free(void *h) {
    free((char *)c->model.vits.tokens);
    free((char *)c->model.vits.lexicon);
    free((char *)c->model.vits.data_dir);
+    free((char *)c->model.kokoro.model);
+    free((char *)c->model.kokoro.voices);
+    free((char *)c->model.kokoro.tokens);
+    free((char *)c->model.kokoro.data_dir);
+    free((char *)c->model.kokoro.dict_dir);
+    free((char *)c->model.kokoro.lexicon);
+    free((char *)c->model.kokoro.lang);
    free((char *)c->model.provider);
    free(c);
 }
@@ -117,6 +124,34 @@ void sherpa_shim_tts_config_set_max_num_sentences(void *h, int32_t v) {
    ((SherpaOnnxOfflineTtsConfig *)h)->max_num_sentences = v;
 }

+// Kokoro multi-speaker / multi-lingual TTS: its own ONNX plus a voices style
+// file (voices.bin); data_dir, dict_dir, lexicon and lang are optional. String
+// setters go via shim_set_str; sherpa_shim_tts_config_free() frees them.
+void sherpa_shim_tts_config_set_kokoro_model(void *h, const char *v) {
+    shim_set_str(&((SherpaOnnxOfflineTtsConfig *)h)->model.kokoro.model, v);
+}
+void sherpa_shim_tts_config_set_kokoro_voices(void *h, const char *v) {
+    shim_set_str(&((SherpaOnnxOfflineTtsConfig *)h)->model.kokoro.voices, v);
+}
+void sherpa_shim_tts_config_set_kokoro_tokens(void *h, const char *v) {
+    shim_set_str(&((SherpaOnnxOfflineTtsConfig *)h)->model.kokoro.tokens, v);
+}
+void sherpa_shim_tts_config_set_kokoro_data_dir(void *h, const char *v) {
+    shim_set_str(&((SherpaOnnxOfflineTtsConfig *)h)->model.kokoro.data_dir, v);
+}
+void sherpa_shim_tts_config_set_kokoro_dict_dir(void *h, const char *v) {
+    shim_set_str(&((SherpaOnnxOfflineTtsConfig *)h)->model.kokoro.dict_dir, v);
+}
+void sherpa_shim_tts_config_set_kokoro_lexicon(void *h, const char *v) {
+    shim_set_str(&((SherpaOnnxOfflineTtsConfig *)h)->model.kokoro.lexicon, v);
+}
+void sherpa_shim_tts_config_set_kokoro_lang(void *h, const char *v) {
+    shim_set_str(&((SherpaOnnxOfflineTtsConfig *)h)->model.kokoro.lang, v);
+}
+void sherpa_shim_tts_config_set_kokoro_length_scale(void *h, float v) {
+    ((SherpaOnnxOfflineTtsConfig *)h)->model.kokoro.length_scale = v;
+}
+
+
 void *sherpa_shim_create_offline_tts(void *h) {
    return (void *)SherpaOnnxCreateOfflineTts(
        (const SherpaOnnxOfflineTtsConfig *)h);
--- a/backend/go/sherpa-onnx/csrc/shim.h
+++ b/backend/go/sherpa-onnx/csrc/shim.h
@@ -37,7 +37,7 @@ void  sherpa_shim_vad_config_set_provider(void *cfg, const char *v);
 void  sherpa_shim_vad_config_set_debug(void *cfg, int32_t v);
 void *sherpa_shim_create_vad(void *cfg, float buffer_size_seconds);

-// --- Offline TTS config (VITS path — the only TTS family the backend uses) ---
+// --- Offline TTS config (VITS/Piper and Kokoro model families) ---
 void *sherpa_shim_tts_config_new(void);
 void  sherpa_shim_tts_config_free(void *cfg);
 void  sherpa_shim_tts_config_set_vits_model(void *cfg, const char *v);
@@ -51,6 +51,14 @@ void  sherpa_shim_tts_config_set_num_threads(void *cfg, int32_t v);
 void  sherpa_shim_tts_config_set_debug(void *cfg, int32_t v);
 void  sherpa_shim_tts_config_set_provider(void *cfg, const char *v);
 void  sherpa_shim_tts_config_set_max_num_sentences(void *cfg, int32_t v);
+void  sherpa_shim_tts_config_set_kokoro_model(void *cfg, const char *v);
+void  sherpa_shim_tts_config_set_kokoro_voices(void *cfg, const char *v);
+void  sherpa_shim_tts_config_set_kokoro_tokens(void *cfg, const char *v);
+void  sherpa_shim_tts_config_set_kokoro_data_dir(void *cfg, const char *v);
+void  sherpa_shim_tts_config_set_kokoro_dict_dir(void *cfg, const char *v);
+void  sherpa_shim_tts_config_set_kokoro_lexicon(void *cfg, const char *v);
+void  sherpa_shim_tts_config_set_kokoro_lang(void *cfg, const char *v);
+void  sherpa_shim_tts_config_set_kokoro_length_scale(void *cfg, float v);
 void *sherpa_shim_create_offline_tts(void *cfg);

 // --- Offline recognizer config (Whisper / Paraformer / SenseVoice / Omnilingual) ---
--- a/backend/index.yaml
+++ b/backend/index.yaml
@@ -366,6 +366,98 @@
    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-locate-anything-cpp"
    intel: "intel-sycl-f32-locate-anything-cpp"
    vulkan: "vulkan-locate-anything-cpp"
+- !!merge <<: *locateanything
+  name: "locate-anything-development"
+  capabilities:
+    default: "cpu-locate-anything-cpp-development"
+    nvidia: "cuda12-locate-anything-cpp-development"
+    nvidia-cuda-12: "cuda12-locate-anything-cpp-development"
+    nvidia-cuda-13: "cuda13-locate-anything-cpp-development"
+    nvidia-l4t: "nvidia-l4t-arm64-locate-anything-cpp-development"
+    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-locate-anything-cpp-development"
+    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-locate-anything-cpp-development"
+    intel: "intel-sycl-f32-locate-anything-cpp-development"
+    vulkan: "vulkan-locate-anything-cpp-development"
+- !!merge <<: *locateanything
+  name: "cpu-locate-anything-cpp"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-locate-anything-cpp"
+  mirrors:
+    - localai/localai-backends:latest-cpu-locate-anything-cpp
+- !!merge <<: *locateanything
+  name: "cpu-locate-anything-cpp-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-cpu-locate-anything-cpp"
+  mirrors:
+    - localai/localai-backends:master-cpu-locate-anything-cpp
+- !!merge <<: *locateanything
+  name: "cuda12-locate-anything-cpp"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-locate-anything-cpp"
+  mirrors:
+    - localai/localai-backends:latest-gpu-nvidia-cuda-12-locate-anything-cpp
+- !!merge <<: *locateanything
+  name: "cuda12-locate-anything-cpp-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-locate-anything-cpp"
+  mirrors:
+    - localai/localai-backends:master-gpu-nvidia-cuda-12-locate-anything-cpp
+- !!merge <<: *locateanything
+  name: "cuda13-locate-anything-cpp"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-locate-anything-cpp"
+  mirrors:
+    - localai/localai-backends:latest-gpu-nvidia-cuda-13-locate-anything-cpp
+- !!merge <<: *locateanything
+  name: "cuda13-locate-anything-cpp-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-locate-anything-cpp"
+  mirrors:
+    - localai/localai-backends:master-gpu-nvidia-cuda-13-locate-anything-cpp
+- !!merge <<: *locateanything
+  name: "nvidia-l4t-arm64-locate-anything-cpp"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-arm64-locate-anything-cpp"
+  mirrors:
+    - localai/localai-backends:latest-nvidia-l4t-arm64-locate-anything-cpp
+- !!merge <<: *locateanything
+  name: "nvidia-l4t-arm64-locate-anything-cpp-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-arm64-locate-anything-cpp"
+  mirrors:
+    - localai/localai-backends:master-nvidia-l4t-arm64-locate-anything-cpp
+- !!merge <<: *locateanything
+  name: "cuda13-nvidia-l4t-arm64-locate-anything-cpp"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-locate-anything-cpp"
+  mirrors:
+    - localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-locate-anything-cpp
+- !!merge <<: *locateanything
+  name: "cuda13-nvidia-l4t-arm64-locate-anything-cpp-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-locate-anything-cpp"
+  mirrors:
+    - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-locate-anything-cpp
+- !!merge <<: *locateanything
+  name: "intel-sycl-f32-locate-anything-cpp"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-locate-anything-cpp"
+  mirrors:
+    - localai/localai-backends:latest-gpu-intel-sycl-f32-locate-anything-cpp
+- !!merge <<: *locateanything
+  name: "intel-sycl-f32-locate-anything-cpp-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f32-locate-anything-cpp"
+  mirrors:
+    - localai/localai-backends:master-gpu-intel-sycl-f32-locate-anything-cpp
+- !!merge <<: *locateanything
+  name: "intel-sycl-f16-locate-anything-cpp"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-locate-anything-cpp"
+  mirrors:
+    - localai/localai-backends:latest-gpu-intel-sycl-f16-locate-anything-cpp
+- !!merge <<: *locateanything
+  name: "intel-sycl-f16-locate-anything-cpp-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f16-locate-anything-cpp"
+  mirrors:
+    - localai/localai-backends:master-gpu-intel-sycl-f16-locate-anything-cpp
+- !!merge <<: *locateanything
+  name: "vulkan-locate-anything-cpp"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-vulkan-locate-anything-cpp"
+  mirrors:
+    - localai/localai-backends:latest-gpu-vulkan-locate-anything-cpp
+- !!merge <<: *locateanything
+  name: "vulkan-locate-anything-cpp-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-vulkan-locate-anything-cpp"
+  mirrors:
+    - localai/localai-backends:master-gpu-vulkan-locate-anything-cpp
 - &vllm
  name: "vllm"
  license: apache-2.0
@@ -455,12 +547,9 @@
    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-vllm-omni"
 - &mlx
  name: "mlx"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-mlx"
  icon: https://avatars.githubusercontent.com/u/102832242?s=200&v=4
  urls:
    - https://github.com/ml-explore/mlx-lm
-  mirrors:
-    - localai/localai-backends:latest-metal-darwin-arm64-mlx
  license: MIT
  description: |
      Run LLMs with MLX
@@ -479,12 +568,9 @@
    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-mlx"
 - &mlx-vlm
  name: "mlx-vlm"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-mlx-vlm"
  icon: https://avatars.githubusercontent.com/u/102832242?s=200&v=4
  urls:
    - https://github.com/Blaizzy/mlx-vlm
-  mirrors:
-    - localai/localai-backends:latest-metal-darwin-arm64-mlx-vlm
  license: MIT
  description: |
      Run Vision-Language Models with MLX
@@ -505,12 +591,9 @@
    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-mlx-vlm"
 - &mlx-audio
  name: "mlx-audio"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-mlx-audio"
  icon: https://avatars.githubusercontent.com/u/102832242?s=200&v=4
  urls:
    - https://github.com/Blaizzy/mlx-audio
-  mirrors:
-    - localai/localai-backends:latest-metal-darwin-arm64-mlx-audio
  license: MIT
  description: |
      Run Audio Models with MLX
@@ -531,12 +614,9 @@
    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-mlx-audio"
 - &mlx-distributed
  name: "mlx-distributed"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-mlx-distributed"
  icon: https://avatars.githubusercontent.com/u/102832242?s=200&v=4
  urls:
    - https://github.com/ml-explore/mlx-lm
-  mirrors:
-    - localai/localai-backends:latest-metal-darwin-arm64-mlx-distributed
  license: MIT
  description: |
      Run distributed LLM inference with MLX across multiple Apple Silicon Macs
@@ -632,7 +712,7 @@
    default: "cpu-diffusers"
    nvidia-cuda-13: "cuda13-diffusers"
    nvidia-cuda-12: "cuda12-diffusers"
-    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-diffusers"
+    nvidia-l4t-cuda-12: "nvidia-l4t-diffusers"
    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-diffusers"
 - &ace-step
  name: "ace-step"
@@ -688,14 +768,17 @@
 - &qwen3ttscpp
  name: "qwen3-tts-cpp"
  description: |
-    Qwen3-TTS C++ backend using GGML. Native C++ text-to-speech with voice cloning support.
-    Generates 24kHz mono audio from text with optional reference audio for voice cloning via ECAPA-TDNN speaker embeddings.
+    Qwen3-TTS C++ backend using GGML (qwentts.cpp). Native C++ text-to-speech
+    with streaming output, named speakers, voice design, and zero-shot voice
+    cloning. 24kHz mono, 11 languages with Mandarin dialects. 0.6B and 1.7B
+    models in Q8_0 / Q4_K_M.
  urls:
-    - https://github.com/predict-woo/qwen3-tts.cpp
+    - https://github.com/ServeurpersoCom/qwentts.cpp
  tags:
    - text-to-speech
    - tts
    - voice-cloning
+    - streaming
  alias: "qwen3-tts-cpp"
  capabilities:
    default: "cpu-qwen3-tts-cpp"
@@ -709,6 +792,33 @@
    nvidia-l4t: "nvidia-l4t-arm64-qwen3-tts-cpp"
    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-qwen3-tts-cpp"
    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-qwen3-tts-cpp"
+- &omnivoicecpp
+  name: "omnivoice-cpp"
+  description: |
+    OmniVoice C++ backend using GGML. Native text-to-speech with voice cloning
+    (reference audio + transcript) and voice design (attribute keywords: gender,
+    age, pitch, style, volume, emotion). 24kHz mono output, 646 languages.
+    Supports streaming synthesis.
+  urls:
+    - https://github.com/ServeurpersoCom/omnivoice.cpp
+  tags:
+    - text-to-speech
+    - tts
+    - voice-cloning
+    - voice-design
+  alias: "omnivoice-cpp"
+  capabilities:
+    default: "cpu-omnivoice-cpp"
+    nvidia: "cuda12-omnivoice-cpp"
+    nvidia-cuda-13: "cuda13-omnivoice-cpp"
+    nvidia-cuda-12: "cuda12-omnivoice-cpp"
+    intel: "intel-sycl-f16-omnivoice-cpp"
+    metal: "metal-omnivoice-cpp"
+    amd: "rocm-omnivoice-cpp"
+    vulkan: "vulkan-omnivoice-cpp"
+    nvidia-l4t: "nvidia-l4t-arm64-omnivoice-cpp"
+    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-omnivoice-cpp"
+    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-omnivoice-cpp"
 - &vibevoicecpp
  name: "vibevoice-cpp"
  description: |
@@ -854,7 +964,7 @@
    metal: "metal-kokoro"
    nvidia-cuda-13: "cuda13-kokoro"
    nvidia-cuda-12: "cuda12-kokoro"
-    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-kokoro"
+    nvidia-l4t-cuda-12: "nvidia-l4t-kokoro"
 - &kokoros
  icon: https://avatars.githubusercontent.com/u/166769057?v=4
  description: |
@@ -897,7 +1007,6 @@
    intel: "intel-coqui"
    amd: "rocm-coqui"
    metal: "metal-coqui"
-    nvidia-cuda-13: "cuda13-coqui"
    nvidia-cuda-12: "cuda12-coqui"
  icon: https://avatars.githubusercontent.com/u/1338804?s=200&v=4
 - &outetts
@@ -1147,27 +1256,27 @@
  icon: https://avatars.githubusercontent.com/u/151010778?s=200&v=4
 - &piper
  name: "piper"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-piper"
  icon: https://github.com/OHF-Voice/piper1-gpl/raw/main/etc/logo.png
  urls:
    - https://github.com/rhasspy/piper
    - https://github.com/mudler/go-piper
-  mirrors:
-    - localai/localai-backends:latest-piper
  license: MIT
  description: |
     A fast, local neural text to speech system
  tags:
    - text-to-speech
    - TTS
+  capabilities:
+    default: "cpu-piper"
+    metal: "metal-piper"
 - &opus
  name: "opus"
  alias: "opus"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-opus"
+  capabilities:
+    default: "cpu-opus"
+    metal: "metal-opus"
  urls:
    - https://opus-codec.org/
-  mirrors:
-    - localai/localai-backends:latest-cpu-opus
  license: BSD-3-Clause
  description: |
    Opus audio codec backend for encoding and decoding audio.
@@ -1177,15 +1286,19 @@
    - opus
    - WebRTC
    - realtime
-    - CPU
+- !!merge <<: *opus
+  name: "opus-development"
+  capabilities:
+    default: "cpu-opus-development"
+    metal: "metal-opus-development"
 - &silero-vad
  name: "silero-vad"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-silero-vad"
  icon: https://user-images.githubusercontent.com/12515440/89997349-b3523080-dc94-11ea-9906-ca2e8bc50535.png
  urls:
    - https://github.com/snakers4/silero-vad
-  mirrors:
-    - localai/localai-backends:latest-cpu-silero-vad
+  capabilities:
+    default: "cpu-silero-vad"
+    metal: "metal-silero-vad"
  description: |
    Silero VAD: pre-trained enterprise-grade Voice Activity Detector.
    Silero VAD is a voice activity detection model that can be used to detect whether a given audio contains speech or not.
@@ -1196,9 +1309,6 @@
    - CPU
 - &local-store
  name: "local-store"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-local-store"
-  mirrors:
-    - localai/localai-backends:latest-cpu-local-store
  urls:
    - https://github.com/mudler/LocalAI
  description: |
@@ -1209,11 +1319,11 @@
    - open-source
    - CPU
  license: MIT
+  capabilities:
+    default: "cpu-local-store"
+    metal: "metal-local-store"
 - &kitten-tts
  name: "kitten-tts"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-kitten-tts"
-  mirrors:
-    - localai/localai-backends:latest-kitten-tts
  urls:
    - https://github.com/KittenML/KittenTTS
  description: |
@@ -1222,6 +1332,9 @@
    - text-to-speech
    - TTS
  license: apache-2.0
+  capabilities:
+    default: "cpu-kitten-tts"
+    metal: "metal-kitten-tts"
 - &neutts
  name: "neutts"
  urls:
@@ -1347,25 +1460,89 @@
  mirrors:
    - localai/localai-backends:master-gpu-rocm-hipblas-neutts
 - !!merge <<: *mlx
-  name: "mlx-development"
+  name: "metal-mlx"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-mlx"
+  mirrors:
+    - localai/localai-backends:latest-metal-darwin-arm64-mlx
+- !!merge <<: *mlx
+  name: "metal-mlx-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-mlx"
  mirrors:
    - localai/localai-backends:master-metal-darwin-arm64-mlx
+- !!merge <<: *mlx
+  name: "mlx-development"
+  capabilities:
+    default: "cpu-mlx-development"
+    nvidia: "cuda12-mlx-development"
+    metal: "metal-mlx-development"
+    nvidia-cuda-12: "cuda12-mlx-development"
+    nvidia-cuda-13: "cuda13-mlx-development"
+    nvidia-l4t: "nvidia-l4t-mlx-development"
+    nvidia-l4t-cuda-12: "nvidia-l4t-mlx-development"
+    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-mlx-development"
 - !!merge <<: *mlx-vlm
-  name: "mlx-vlm-development"
+  name: "metal-mlx-vlm"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-mlx-vlm"
+  mirrors:
+    - localai/localai-backends:latest-metal-darwin-arm64-mlx-vlm
+- !!merge <<: *mlx-vlm
+  name: "metal-mlx-vlm-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-mlx-vlm"
  mirrors:
    - localai/localai-backends:master-metal-darwin-arm64-mlx-vlm
+- !!merge <<: *mlx-vlm
+  name: "mlx-vlm-development"
+  capabilities:
+    default: "cpu-mlx-vlm-development"
+    nvidia: "cuda12-mlx-vlm-development"
+    metal: "metal-mlx-vlm-development"
+    nvidia-cuda-12: "cuda12-mlx-vlm-development"
+    nvidia-cuda-13: "cuda13-mlx-vlm-development"
+    nvidia-l4t: "nvidia-l4t-mlx-vlm-development"
+    nvidia-l4t-cuda-12: "nvidia-l4t-mlx-vlm-development"
+    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-mlx-vlm-development"
 - !!merge <<: *mlx-audio
-  name: "mlx-audio-development"
+  name: "metal-mlx-audio"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-mlx-audio"
+  mirrors:
+    - localai/localai-backends:latest-metal-darwin-arm64-mlx-audio
+- !!merge <<: *mlx-audio
+  name: "metal-mlx-audio-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-mlx-audio"
  mirrors:
    - localai/localai-backends:master-metal-darwin-arm64-mlx-audio
+- !!merge <<: *mlx-audio
+  name: "mlx-audio-development"
+  capabilities:
+    default: "cpu-mlx-audio-development"
+    nvidia: "cuda12-mlx-audio-development"
+    metal: "metal-mlx-audio-development"
+    nvidia-cuda-12: "cuda12-mlx-audio-development"
+    nvidia-cuda-13: "cuda13-mlx-audio-development"
+    nvidia-l4t: "nvidia-l4t-mlx-audio-development"
+    nvidia-l4t-cuda-12: "nvidia-l4t-mlx-audio-development"
+    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-mlx-audio-development"
 - !!merge <<: *mlx-distributed
-  name: "mlx-distributed-development"
+  name: "metal-mlx-distributed"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-mlx-distributed"
+  mirrors:
+    - localai/localai-backends:latest-metal-darwin-arm64-mlx-distributed
+- !!merge <<: *mlx-distributed
+  name: "metal-mlx-distributed-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-mlx-distributed"
  mirrors:
    - localai/localai-backends:master-metal-darwin-arm64-mlx-distributed
+- !!merge <<: *mlx-distributed
+  name: "mlx-distributed-development"
+  capabilities:
+    default: "cpu-mlx-distributed-development"
+    nvidia: "cuda12-mlx-distributed-development"
+    metal: "metal-mlx-distributed-development"
+    nvidia-cuda-12: "cuda12-mlx-distributed-development"
+    nvidia-cuda-13: "cuda13-mlx-distributed-development"
+    nvidia-l4t: "nvidia-l4t-mlx-distributed-development"
+    nvidia-l4t-cuda-12: "nvidia-l4t-mlx-distributed-development"
+    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-mlx-distributed-development"
 ## mlx
 - !!merge <<: *mlx
  name: "cpu-mlx"
@@ -1571,10 +1748,20 @@
  mirrors:
    - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-mlx-distributed
 - !!merge <<: *kitten-tts
-  name: "kitten-tts-development"
+  name: "cpu-kitten-tts"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-kitten-tts"
+  mirrors:
+    - localai/localai-backends:latest-kitten-tts
+- !!merge <<: *kitten-tts
+  name: "cpu-kitten-tts-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-kitten-tts"
  mirrors:
    - localai/localai-backends:master-kitten-tts
+- !!merge <<: *kitten-tts
+  name: "kitten-tts-development"
+  capabilities:
+    default: "cpu-kitten-tts-development"
+    metal: "metal-kitten-tts-development"
 - !!merge <<: *kitten-tts
  name: "metal-kitten-tts"
  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-kitten-tts"
@@ -1586,11 +1773,23 @@
  mirrors:
    - localai/localai-backends:master-metal-darwin-arm64-kitten-tts
 - !!merge <<: *local-store
-  name: "local-store-development"
+  name: "cpu-local-store"
+  alias: "local-store"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-local-store"
+  mirrors:
+    - localai/localai-backends:latest-cpu-local-store
+- !!merge <<: *local-store
+  name: "cpu-local-store-development"
  alias: "local-store"
  uri: "quay.io/go-skynet/local-ai-backends:master-cpu-local-store"
  mirrors:
    - localai/localai-backends:master-cpu-local-store
+- !!merge <<: *local-store
+  name: "local-store-development"
+  alias: "local-store"
+  capabilities:
+    default: "cpu-local-store-development"
+    metal: "metal-local-store-development"
 - !!merge <<: *local-store
  name: "metal-local-store"
  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-local-store"
@@ -1603,7 +1802,12 @@
  mirrors:
    - localai/localai-backends:master-metal-darwin-arm64-local-store
 - !!merge <<: *opus
-  name: "opus-development"
+  name: "cpu-opus"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-opus"
+  mirrors:
+    - localai/localai-backends:latest-cpu-opus
+- !!merge <<: *opus
+  name: "cpu-opus-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-cpu-opus"
  mirrors:
    - localai/localai-backends:master-cpu-opus
@@ -1618,10 +1822,20 @@
  mirrors:
    - localai/localai-backends:master-metal-darwin-arm64-opus
 - !!merge <<: *silero-vad
-  name: "silero-vad-development"
+  name: "cpu-silero-vad"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-silero-vad"
+  mirrors:
+    - localai/localai-backends:latest-cpu-silero-vad
+- !!merge <<: *silero-vad
+  name: "cpu-silero-vad-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-cpu-silero-vad"
  mirrors:
    - localai/localai-backends:master-cpu-silero-vad
+- !!merge <<: *silero-vad
+  name: "silero-vad-development"
+  capabilities:
+    default: "cpu-silero-vad-development"
+    metal: "metal-silero-vad-development"
 - !!merge <<: *silero-vad
  name: "metal-silero-vad"
  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-silero-vad"
@@ -1633,10 +1847,20 @@
  mirrors:
    - localai/localai-backends:master-metal-darwin-arm64-silero-vad
 - !!merge <<: *piper
-  name: "piper-development"
+  name: "cpu-piper"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-piper"
+  mirrors:
+    - localai/localai-backends:latest-piper
+- !!merge <<: *piper
+  name: "cpu-piper-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-piper"
  mirrors:
    - localai/localai-backends:master-piper
+- !!merge <<: *piper
+  name: "piper-development"
+  capabilities:
+    default: "cpu-piper-development"
+    metal: "metal-piper-development"
 - !!merge <<: *piper
  name: "metal-piper"
  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-piper"
@@ -3279,6 +3503,121 @@
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-qwen3-tts-cpp"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-13-qwen3-tts-cpp
+## omnivoice-cpp
+- !!merge <<: *omnivoicecpp
+  name: "omnivoice-cpp-development"
+  capabilities:
+    default: "cpu-omnivoice-cpp-development"
+    nvidia: "cuda12-omnivoice-cpp-development"
+    nvidia-cuda-13: "cuda13-omnivoice-cpp-development"
+    nvidia-cuda-12: "cuda12-omnivoice-cpp-development"
+    intel: "intel-sycl-f16-omnivoice-cpp-development"
+    metal: "metal-omnivoice-cpp-development"
+    amd: "rocm-omnivoice-cpp-development"
+    vulkan: "vulkan-omnivoice-cpp-development"
+    nvidia-l4t: "nvidia-l4t-arm64-omnivoice-cpp-development"
+    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-omnivoice-cpp-development"
+    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-omnivoice-cpp-development"
+- !!merge <<: *omnivoicecpp
+  name: "nvidia-l4t-arm64-omnivoice-cpp"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-arm64-omnivoice-cpp"
+  mirrors:
+    - localai/localai-backends:latest-nvidia-l4t-arm64-omnivoice-cpp
+- !!merge <<: *omnivoicecpp
+  name: "nvidia-l4t-arm64-omnivoice-cpp-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-arm64-omnivoice-cpp"
+  mirrors:
+    - localai/localai-backends:master-nvidia-l4t-arm64-omnivoice-cpp
+- !!merge <<: *omnivoicecpp
+  name: "cuda13-nvidia-l4t-arm64-omnivoice-cpp"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-omnivoice-cpp"
+  mirrors:
+    - localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-omnivoice-cpp
+- !!merge <<: *omnivoicecpp
+  name: "cuda13-nvidia-l4t-arm64-omnivoice-cpp-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-omnivoice-cpp"
+  mirrors:
+    - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-omnivoice-cpp
+- !!merge <<: *omnivoicecpp
+  name: "cpu-omnivoice-cpp"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-omnivoice-cpp"
+  mirrors:
+    - localai/localai-backends:latest-cpu-omnivoice-cpp
+- !!merge <<: *omnivoicecpp
+  name: "metal-omnivoice-cpp"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-omnivoice-cpp"
+  mirrors:
+    - localai/localai-backends:latest-metal-darwin-arm64-omnivoice-cpp
+- !!merge <<: *omnivoicecpp
+  name: "metal-omnivoice-cpp-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-omnivoice-cpp"
+  mirrors:
+    - localai/localai-backends:master-metal-darwin-arm64-omnivoice-cpp
+- !!merge <<: *omnivoicecpp
+  name: "cpu-omnivoice-cpp-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-cpu-omnivoice-cpp"
+  mirrors:
+    - localai/localai-backends:master-cpu-omnivoice-cpp
+- !!merge <<: *omnivoicecpp
+  name: "cuda12-omnivoice-cpp"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-omnivoice-cpp"
+  mirrors:
+    - localai/localai-backends:latest-gpu-nvidia-cuda-12-omnivoice-cpp
+- !!merge <<: *omnivoicecpp
+  name: "rocm-omnivoice-cpp"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-omnivoice-cpp"
+  mirrors:
+    - localai/localai-backends:latest-gpu-rocm-hipblas-omnivoice-cpp
+- !!merge <<: *omnivoicecpp
+  name: "intel-sycl-f32-omnivoice-cpp"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-omnivoice-cpp"
+  mirrors:
+    - localai/localai-backends:latest-gpu-intel-sycl-f32-omnivoice-cpp
+- !!merge <<: *omnivoicecpp
+  name: "intel-sycl-f16-omnivoice-cpp"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-omnivoice-cpp"
+  mirrors:
+    - localai/localai-backends:latest-gpu-intel-sycl-f16-omnivoice-cpp
+- !!merge <<: *omnivoicecpp
+  name: "vulkan-omnivoice-cpp"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-vulkan-omnivoice-cpp"
+  mirrors:
+    - localai/localai-backends:latest-gpu-vulkan-omnivoice-cpp
+- !!merge <<: *omnivoicecpp
+  name: "vulkan-omnivoice-cpp-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-vulkan-omnivoice-cpp"
+  mirrors:
+    - localai/localai-backends:master-gpu-vulkan-omnivoice-cpp
+- !!merge <<: *omnivoicecpp
+  name: "cuda12-omnivoice-cpp-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-omnivoice-cpp"
+  mirrors:
+    - localai/localai-backends:master-gpu-nvidia-cuda-12-omnivoice-cpp
+- !!merge <<: *omnivoicecpp
+  name: "rocm-omnivoice-cpp-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-omnivoice-cpp"
+  mirrors:
+    - localai/localai-backends:master-gpu-rocm-hipblas-omnivoice-cpp
+- !!merge <<: *omnivoicecpp
+  name: "intel-sycl-f32-omnivoice-cpp-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f32-omnivoice-cpp"
+  mirrors:
+    - localai/localai-backends:master-gpu-intel-sycl-f32-omnivoice-cpp
+- !!merge <<: *omnivoicecpp
+  name: "intel-sycl-f16-omnivoice-cpp-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f16-omnivoice-cpp"
+  mirrors:
+    - localai/localai-backends:master-gpu-intel-sycl-f16-omnivoice-cpp
+- !!merge <<: *omnivoicecpp
+  name: "cuda13-omnivoice-cpp"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-omnivoice-cpp"
+  mirrors:
+    - localai/localai-backends:latest-gpu-nvidia-cuda-13-omnivoice-cpp
+- !!merge <<: *omnivoicecpp
+  name: "cuda13-omnivoice-cpp-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-omnivoice-cpp"
+  mirrors:
+    - localai/localai-backends:master-gpu-nvidia-cuda-13-omnivoice-cpp
 ## vibevoice-cpp
 - !!merge <<: *vibevoicecpp
  name: "nvidia-l4t-arm64-vibevoice-cpp"
@@ -4609,24 +4948,24 @@
    - localai/localai-backends:master-cpu-trl
 - !!merge <<: *trl
  name: "cuda12-trl"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-cublas-cuda12-trl"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-trl"
  mirrors:
-    - localai/localai-backends:latest-cublas-cuda12-trl
+    - localai/localai-backends:latest-gpu-nvidia-cuda-12-trl
 - !!merge <<: *trl
  name: "cuda12-trl-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-cublas-cuda12-trl"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-trl"
  mirrors:
-    - localai/localai-backends:master-cublas-cuda12-trl
+    - localai/localai-backends:master-gpu-nvidia-cuda-12-trl
 - !!merge <<: *trl
  name: "cuda13-trl"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-cublas-cuda13-trl"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-trl"
  mirrors:
-    - localai/localai-backends:latest-cublas-cuda13-trl
+    - localai/localai-backends:latest-gpu-nvidia-cuda-13-trl
 - !!merge <<: *trl
  name: "cuda13-trl-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-cublas-cuda13-trl"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-trl"
  mirrors:
-    - localai/localai-backends:master-cublas-cuda13-trl
+    - localai/localai-backends:master-gpu-nvidia-cuda-13-trl
 ## llama.cpp quantization backend
 - &llama-cpp-quantization
  name: "llama-cpp-quantization"
--- a/backend/python/neutts/requirements-cpu.txt
+++ b/backend/python/neutts/requirements-cpu.txt
@@ -1,6 +1,7 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
 accelerate
 torch==2.8.0
+torchaudio==2.8.0
 transformers==4.56.1
 librosa==0.11.0
 neucodec>=0.0.4
--- a/backend/python/neutts/requirements-cublas12.txt
+++ b/backend/python/neutts/requirements-cublas12.txt
@@ -3,6 +3,7 @@ neucodec>=0.0.4
 phonemizer==3.3.0
 soundfile==0.13.1
 torch==2.8.0
+torchaudio==2.8.0
 transformers==4.56.1
 resemble-perth==1.0.1
 accelerate
--- a/backend/python/vllm/requirements-cublas13-after.txt
+++ b/backend/python/vllm/requirements-cublas13-after.txt
@@ -3,5 +3,5 @@
 # on a cu130 host. Pull the cu130-flavoured wheel from vLLM's per-tag index
 # instead — the cublas13 case in install.sh adds --index-strategy=unsafe-best-match
 # so uv consults this index alongside PyPI.
--extra-index-url https://wheels.vllm.ai/0.22.1/cu130
-vllm==0.22.1
+--extra-index-url https://wheels.vllm.ai/0.23.0/cu130
+vllm==0.23.0
--- a/core/application/distributed.go
+++ b/core/application/distributed.go
@@ -161,6 +161,21 @@ func initDistributed(cfg *config.ApplicationConfig, authDB *gorm.DB, configLoade
 	}
 	xlog.Info("Node registry initialized")

+	// Seed declarative per-model scheduling config (LOCALAI_MODEL_SCHEDULING /
+	// LOCALAI_MODEL_SCHEDULING_CONFIG). Authoritative: overwrites matching models
+	// on every boot. Runs before the reconciler starts so the first tick already
+	// sees the desired state. Models not listed are left untouched.
+	if cfg.Distributed.ModelSchedulingJSON != "" || cfg.Distributed.ModelSchedulingConfigPath != "" {
+		schedConfigs, err := nodes.ParseSchedulingSeed(cfg.Distributed.ModelSchedulingJSON, cfg.Distributed.ModelSchedulingConfigPath)
+		if err != nil {
+			return nil, fmt.Errorf("parsing declarative model scheduling config: %w", err)
+		}
+		if err := registry.SeedModelScheduling(context.Background(), schedConfigs); err != nil {
+			return nil, fmt.Errorf("seeding declarative model scheduling config: %w", err)
+		}
+		xlog.Info("Applied declarative model scheduling config", "models", len(schedConfigs))
+	}
+
 	// Collect SmartRouter option values; the router itself is created after all
 	// dependencies (including FileStager and Unloader) are ready.
 	var routerAuthToken string
--- a/core/backend/options.go
+++ b/core/backend/options.go
@@ -307,11 +307,19 @@ func gRPCPredictOpts(c config.ModelConfig, modelPath string) *pb.PredictOptions
 		}
 	}

+	// TopK may be nil after SetDefaults for backends that don't use llama.cpp's
+	// top_k=40 default (issue #6632, e.g. mlx). proto3 int32 can't be unset, so
+	// send 0 — the value mlx actually wants (top-k disabled).
+	var topK int32
+	if c.TopK != nil {
+		topK = int32(*c.TopK)
+	}
+
 	pbOpts := &pb.PredictOptions{
 		Temperature:         float32(*c.Temperature),
 		TopP:                float32(*c.TopP),
 		NDraft:              c.NDraft,
-		TopK:                int32(*c.TopK),
+		TopK:                topK,
 		MinP:                float32(*c.MinP),
 		Tokens:              int32(*c.Maxtokens),
 		Threads:             int32(*c.Threads),
--- a/core/cli/run.go
+++ b/core/cli/run.go
@@ -172,6 +172,8 @@ type RunCMD struct {
 	NatsTLSCert               string `env:"LOCALAI_NATS_TLS_CERT" type:"existingfile" help:"Client certificate for NATS mTLS" group:"distributed"`
 	NatsTLSKey                string `env:"LOCALAI_NATS_TLS_KEY" type:"existingfile" help:"Client private key for NATS mTLS" group:"distributed"`
 	ExposeNodeHeader          bool   `env:"LOCALAI_EXPOSE_NODE_HEADER" default:"false" help:"Set the X-LocalAI-Node response header on inference responses (OpenAI chat/completions/embeddings, Anthropic /v1/messages, Ollama /api/chat,/api/generate,/api/embed) with the ID of the worker that served the request. Disabled by default: the node ID reveals internal topology and should not be exposed on a public endpoint. Best-effort: under heavy concurrency the header may reflect a recent routing decision rather than this exact request's." group:"distributed"`
+	ModelScheduling           string `env:"LOCALAI_MODEL_SCHEDULING" help:"Declarative per-model scheduling config applied at startup (inline JSON list of {model_name,node_selector,min_replicas,max_replicas,replicas:\"all\"}). Authoritative: overwrites matching models on every boot. Distributed mode only." group:"distributed"`
+	ModelSchedulingConfig     string `env:"LOCALAI_MODEL_SCHEDULING_CONFIG" help:"Path to a YAML file with the same per-model scheduling list as LOCALAI_MODEL_SCHEDULING. Distributed mode only." group:"distributed"`

 	Version bool

@@ -347,6 +349,15 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
 	if r.ExposeNodeHeader {
 		opts = append(opts, config.WithExposeNodeHeader(true))
 	}
+	if r.ModelScheduling != "" {
+		opts = append(opts, config.WithModelSchedulingJSON(r.ModelScheduling))
+	}
+	if r.ModelSchedulingConfig != "" {
+		opts = append(opts, config.WithModelSchedulingConfigPath(r.ModelSchedulingConfig))
+	}
+	if !r.Distributed && (r.ModelScheduling != "" || r.ModelSchedulingConfig != "") {
+		xlog.Warn("LOCALAI_MODEL_SCHEDULING / LOCALAI_MODEL_SCHEDULING_CONFIG is set but distributed mode is disabled (LOCALAI_DISTRIBUTED=false) - ignoring")
+	}

 	if r.DisableMetricsEndpoint {
 		opts = append(opts, config.DisableMetricsEndpoint)
--- a/core/config/backend_capabilities.go
+++ b/core/config/backend_capabilities.go
@@ -396,10 +396,10 @@ var BackendCapabilities = map[string]BackendCapability{
 		Description:      "Qwen TTS",
 	},
 	"qwen3-tts-cpp": {
-		GRPCMethods:      []GRPCMethod{MethodTTS},
+		GRPCMethods:      []GRPCMethod{MethodTTS, MethodTTSStream},
 		PossibleUsecases: []string{UsecaseTTS},
 		DefaultUsecases:  []string{UsecaseTTS},
-		Description:      "Qwen3 TTS C++ — text-to-speech, C++ backend",
+		Description:      "Qwen3 TTS C++ - text-to-speech with streaming, named speakers, voice design and cloning (qwentts.cpp / GGML)",
 	},
 	"faster-qwen3-tts": {
 		GRPCMethods:      []GRPCMethod{MethodTTS},
@@ -517,6 +517,33 @@ func NormalizeBackendName(backend string) string {
 	return strings.ReplaceAll(backend, ".", "-")
 }

+// nonLlamaSamplerBackends is the set of backends whose native sampler defaults
+// differ from llama.cpp's, so LocalAI must NOT inject llama.cpp's top_k=40
+// default for them (issue #6632). mlx_lm's intended default is top_k=0
+// (disabled) and mlx does not remap 0->40, so shipping 40 silently changes
+// sampling for clients that omit top_k. Leaving TopK nil sends 0 on the wire.
+//
+// This is intentionally a small opt-out list of KNOWN non-llama backends, keyed
+// by the name UsesLlamaSamplerDefaults looks up after NormalizeBackendName;
+// empty and unknown backends keep the llama.cpp default (GGUF auto-detect).
+var nonLlamaSamplerBackends = map[string]struct{}{
+	"mlx":             {},
+	"mlx-vlm":         {},
+	"mlx-distributed": {},
+}
+
+// UsesLlamaSamplerDefaults reports whether a backend should receive llama.cpp's
+// sampler defaults (e.g. top_k=40). It returns false only for names found in
+// nonLlamaSamplerBackends after NormalizeBackendName; the empty string and any
+// unknown backend return true, preserving the GGUF auto-detect path's behavior.
+func UsesLlamaSamplerDefaults(backend string) bool {
+	if backend == "" {
+		return true
+	}
+	_, isNonLlama := nonLlamaSamplerBackends[NormalizeBackendName(backend)]
+	return !isNonLlama
+}
+
 // GetBackendCapability returns the capability info for a backend, or nil if unknown.
 // Handles backend name normalization.
 func GetBackendCapability(backend string) *BackendCapability {
--- a/core/config/distributed_config.go
+++ b/core/config/distributed_config.go
@@ -84,6 +84,12 @@ type DistributedConfig struct {
 	// drives the background eviction cadence (eviction runs every TTL/2). Zero
 	// means use the prefixcache package default (5m).
 	PrefixCacheTTL time.Duration
+	// ModelSchedulingJSON is an inline JSON list of per-model scheduling configs
+	// applied authoritatively at startup (LOCALAI_MODEL_SCHEDULING).
+	ModelSchedulingJSON string
+	// ModelSchedulingConfigPath is a path to a YAML file with the same list
+	// (LOCALAI_MODEL_SCHEDULING_CONFIG).
+	ModelSchedulingConfigPath string
 }

 // Validate checks that the distributed configuration is internally consistent.
@@ -290,6 +296,21 @@ func WithPrefixCacheTTL(d time.Duration) AppOption {
 	}
 }

+// WithModelSchedulingJSON stores the inline-JSON scheduling list (LOCALAI_MODEL_SCHEDULING) in Distributed.ModelSchedulingJSON.
+func WithModelSchedulingJSON(s string) AppOption {
+	return func(o *ApplicationConfig) {
+		o.Distributed.ModelSchedulingJSON = s
+	}
+}
+
+// WithModelSchedulingConfigPath stores the path of the YAML scheduling file
+// (LOCALAI_MODEL_SCHEDULING_CONFIG) in Distributed.ModelSchedulingConfigPath.
+func WithModelSchedulingConfigPath(path string) AppOption {
+	return func(o *ApplicationConfig) {
+		o.Distributed.ModelSchedulingConfigPath = path
+	}
+}
+
 // Flag names for distributed timeout / interval configuration. These are
 // the kebab-case identifiers kong derives from the matching RunCMD struct
 // fields; they appear in Validate error messages and any other operator-
--- a/core/config/meta/registry.go
+++ b/core/config/meta/registry.go
@@ -355,6 +355,85 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Component:   "toggle",
 			Order:       69,
 		},
+		"pipeline.voice_recognition.model": {
+			Section:              "pipeline",
+			Label:                "Voice Recognition Model",
+			Description:          "Speaker-recognition backend model used to gate the pipeline behind speaker verification. Leave empty to disable the voice gate.",
+			Component:            "model-select",
+			AutocompleteProvider: ProviderModels,
+			Order:                70,
+		},
+		"pipeline.voice_recognition.mode": {
+			Section:     "pipeline",
+			Label:       "Voice Gate Mode",
+			Description: "How callers are authorized: 'identify' matches the speaker 1:N against the voice registry; 'verify' matches 1:few against the configured reference audios.",
+			Component:   "select",
+			Options: []FieldOption{
+				{Value: "identify", Label: "identify (registry)"},
+				{Value: "verify", Label: "verify (references)"},
+			},
+			Order: 71,
+		},
+		"pipeline.voice_recognition.threshold": {
+			Section:     "pipeline",
+			Label:       "Voice Gate Threshold",
+			Description: "Maximum cosine distance between the caller and an authorized speaker that still counts as a match. Lower is stricter. Default 0.25 is tuned for the ECAPA-TDNN encoder on VoxCeleb.",
+			Component:   "slider",
+			Min:         f64(0.01),
+			Max:         f64(2),
+			Step:        f64(0.01),
+			Order:       72,
+		},
+		"pipeline.voice_recognition.when": {
+			Section:     "pipeline",
+			Label:       "Voice Gate When",
+			Description: "How often to verify the speaker: 'every' checks each utterance; 'first' verifies once and then trusts the session.",
+			Component:   "select",
+			Options: []FieldOption{
+				{Value: "every", Label: "every utterance"},
+				{Value: "first", Label: "first only"},
+			},
+			Order: 73,
+		},
+		"pipeline.voice_recognition.on_reject": {
+			Section:     "pipeline",
+			Label:       "Voice Gate On Reject",
+			Description: "What to do with an unauthorized utterance: 'drop_event' drops it and emits an error event to the client; 'drop_silent' drops it quietly.",
+			Component:   "select",
+			Options: []FieldOption{
+				{Value: "drop_event", Label: "drop + error event"},
+				{Value: "drop_silent", Label: "drop silently"},
+			},
+			Order: 74,
+		},
+		"pipeline.voice_recognition.anti_spoofing": {
+			Section:     "pipeline",
+			Label:       "Voice Gate Anti-Spoofing",
+			Description: "Enable the backend liveness/anti-spoofing check (verify mode only) to reject replayed or synthesized audio.",
+			Component:   "toggle",
+			Order:       75,
+		},
+		"pipeline.voice_recognition.allow.names": {
+			Section:     "pipeline",
+			Label:       "Voice Gate Allowed Names",
+			Description: "Identify mode: authorize only registry identities whose name matches one of these exactly. Empty allows any registered identity.",
+			Component:   "string-list",
+			Order:       76,
+		},
+		"pipeline.voice_recognition.allow.labels": {
+			Section:     "pipeline",
+			Label:       "Voice Gate Allowed Labels",
+			Description: "Identify mode: authorize any registry identity carrying one of these label keys. Empty allows any registered identity.",
+			Component:   "string-list",
+			Order:       77,
+		},
+		"pipeline.voice_recognition.references": {
+			Section:     "pipeline",
+			Label:       "Voice Gate References",
+			Description: "Verify mode: the authorized reference speakers, each with a name and an audio file path the caller's voice is matched against.",
+			Component:   "json-editor",
+			Order:       78,
+		},

 		// --- Functions ---
 		"function.grammar.parallel_calls": {
--- a/core/config/model_config.go
+++ b/core/config/model_config.go
@@ -509,6 +509,10 @@ type Pipeline struct {
 	// to enable_thinking=false backend metadata) without editing the underlying
 	// LLM model config. Unset leaves the LLM model config in charge.
 	DisableThinking *bool `yaml:"disable_thinking,omitempty" json:"disable_thinking,omitempty"`
+
+	// VoiceRecognition gates the pipeline behind speaker verification. Nil
+	// (block absent) means no gate, preserving existing behavior.
+	VoiceRecognition *PipelineVoiceRecognition `yaml:"voice_recognition,omitempty" json:"voice_recognition,omitempty"`
 }

 // ApplyReasoningEffort resolves the effective reasoning effort — a per-request
@@ -575,6 +579,123 @@ func (p Pipeline) ThinkingDisabled() bool {
 	return p.DisableThinking != nil && *p.DisableThinking
 }

+// Accepted string values for the PipelineVoiceRecognition Mode, When and OnReject fields.
+const (
+	VoiceGateModeIdentify = "identify"
+	VoiceGateModeVerify   = "verify"
+	VoiceGateWhenEvery    = "every"
+	VoiceGateWhenFirst    = "first"
+	VoiceGateRejectEvent  = "drop_event"
+	VoiceGateRejectSilent = "drop_silent"
+
+	// defaultVoiceGateThreshold is the cosine-distance threshold Normalize applies
+	// when none is set; it is tuned for the ECAPA-TDNN speaker encoder on VoxCeleb.
+	defaultVoiceGateThreshold = 0.25
+)
+
+// @Description PipelineVoiceRecognition gates a realtime pipeline behind speaker verification.
+type PipelineVoiceRecognition struct {
+	// Model is the speaker-recognition backend model name; Validate rejects an empty value.
+	Model string `yaml:"model,omitempty" json:"model,omitempty"`
+	// Mode is "identify" (1:N against the voice registry, the default when empty)
+	// or "verify" (1:few against reference audios).
+	Mode string `yaml:"mode,omitempty" json:"mode,omitempty"`
+	// Threshold is the maximum cosine distance counted as a match; 0 means unset (Normalize applies the default).
+	Threshold float32 `yaml:"threshold,omitempty" json:"threshold,omitempty"`
+	// When is "every" (verify each utterance, the default when empty) or "first"
+	// (verify once, then trust the session).
+	When string `yaml:"when,omitempty" json:"when,omitempty"`
+	// OnReject is "drop_event" (drop + emit an error event, the default when
+	// empty) or "drop_silent" (drop quietly).
+	OnReject string `yaml:"on_reject,omitempty" json:"on_reject,omitempty"`
+	// AntiSpoofing enables the backend liveness check (verify mode only).
+	AntiSpoofing bool `yaml:"anti_spoofing,omitempty" json:"anti_spoofing,omitempty"`
+	// Allow filters which registry identities are authorized (identify mode).
+	Allow VoiceRecognitionAllow `yaml:"allow,omitempty" json:"allow,omitempty"`
+	// References are the authorized reference speakers (verify mode, where Validate requires at least one).
+	References []VoiceReference `yaml:"references,omitempty" json:"references,omitempty"`
+}
+
+// @Description VoiceRecognitionAllow filters authorized registry identities.
+type VoiceRecognitionAllow struct {
+	// Names authorizes identities whose registered Metadata.Name equals an entry exactly.
+	Names []string `yaml:"names,omitempty" json:"names,omitempty"`
+	// Labels authorizes any identity carrying one of these label keys.
+	Labels []string `yaml:"labels,omitempty" json:"labels,omitempty"`
+}
+
+// @Description VoiceReference is one authorized reference speaker for verify mode.
+type VoiceReference struct {
+	Name  string `yaml:"name,omitempty" json:"name,omitempty"`   // label for the speaker; quoted in Validate's error text
+	Audio string `yaml:"audio,omitempty" json:"audio,omitempty"` // reference audio path; Validate rejects an empty value
+}
+
+// VoiceGateEnabled reports whether a voice-recognition gate is configured,
+// i.e. whether the VoiceRecognition block is non-nil. Presence alone is the
+// intent signal: a present-but-incomplete block (e.g. missing model) still
+// returns true so it can fail closed at construction instead of being skipped.
+func (p Pipeline) VoiceGateEnabled() bool {
+	return p.VoiceRecognition != nil
+}
+
+// Normalize fills omitted fields in place: Mode=identify, When=every, OnReject=drop_event, Threshold=defaultVoiceGateThreshold (0.25).
+func (v *PipelineVoiceRecognition) Normalize() {
+	if v.Mode == "" {
+		v.Mode = VoiceGateModeIdentify
+	}
+	if v.When == "" {
+		v.When = VoiceGateWhenEvery
+	}
+	if v.OnReject == "" {
+		v.OnReject = VoiceGateRejectEvent
+	}
+	if v.Threshold == 0 {
+		v.Threshold = defaultVoiceGateThreshold
+	}
+}
+
+// Validate returns an error for the first shape or enum problem found.
+// registryAvailable says whether a VoiceRegistry exists (identify mode needs
+// one). Empty Mode/When/OnReject and a zero Threshold pass: Normalize defaults them.
+func (v PipelineVoiceRecognition) Validate(registryAvailable bool) error {
+	if v.Model == "" {
+		return fmt.Errorf("voice_recognition: model is required")
+	}
+	switch v.Mode {
+	case "", VoiceGateModeIdentify:
+		if !registryAvailable {
+			return fmt.Errorf("voice_recognition mode 'identify' requires a voice registry")
+		}
+	case VoiceGateModeVerify:
+		if len(v.References) == 0 {
+			return fmt.Errorf("voice_recognition mode 'verify' requires at least one reference")
+		}
+		for i, r := range v.References {
+			if r.Audio == "" {
+				return fmt.Errorf("voice_recognition reference %d (%q) is missing an audio path", i, r.Name)
+			}
+		}
+	default:
+		return fmt.Errorf("voice_recognition: unknown mode %q", v.Mode)
+	}
+	switch v.When {
+	case "", VoiceGateWhenEvery, VoiceGateWhenFirst:
+	default:
+		return fmt.Errorf("voice_recognition: unknown when %q", v.When)
+	}
+	switch v.OnReject {
+	case "", VoiceGateRejectEvent, VoiceGateRejectSilent:
+	default:
+		return fmt.Errorf("voice_recognition: unknown on_reject %q", v.OnReject)
+	}
+	// Zero means "unset" (Normalize defaults it) and passes the range test anyway, so
+	// the != 0 guard is documentation only. Cosine distance ranges 0..2. NOTE(review): NaN also passes.
+	if v.Threshold != 0 && (v.Threshold < 0 || v.Threshold > 2) {
+		return fmt.Errorf("voice_recognition: threshold %v out of range (0..2)", v.Threshold)
+	}
+	return nil
+}
+
 // @Description File configuration for model downloads
 type File struct {
 	Filename string         `yaml:"filename,omitempty" json:"filename,omitempty"`
@@ -867,7 +988,12 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
 		cfg.Seed = &defaultSeed
 	}

-	if cfg.TopK == nil {
+	// top_k=40 is llama.cpp's sampling default and is wrong for backends whose
+	// native default differs (issue #6632). Only inject it for the llama.cpp
+	// family and the empty/auto backend; leave TopK nil for known non-llama
+	// backends (e.g. mlx, whose intended default is top_k=0) so the wire value
+	// is 0 rather than a silently-changed 40.
+	if cfg.TopK == nil && UsesLlamaSamplerDefaults(cfg.Backend) {
 		cfg.TopK = &defaultTopK
 	}

--- a/core/config/model_config_test.go
+++ b/core/config/model_config_test.go
@@ -529,4 +529,72 @@ concurrency_groups:
 				"models that template in Go still rely on the Go-generated grammar")
 		})
 	})
+
+	// The default top_k=40 is llama.cpp's sampling default and is WRONG for
+	// backends whose native default differs. mlx_lm's intended default is
+	// top_k=0 (disabled) and mlx does not remap 0->40, so injecting 40 silently
+	// changes sampling for mlx clients that omit top_k (issue #6632). Gate the
+	// injection on backend family: keep 40 for the llama.cpp family and for the
+	// empty/auto backend (the GGUF auto-detect path resolves to llama.cpp), but
+	// leave TopK nil for the mlx family so the wire value is 0.
+	Context("TopK default is backend-gated (issue #6632)", func() {
+		It("injects top_k=40 for the llama.cpp backend", func() {
+			cfg := &ModelConfig{}
+			cfg.Backend = "llama-cpp"
+
+			cfg.SetDefaults()
+
+			Expect(cfg.TopK).NotTo(BeNil(), "llama.cpp must keep its top_k=40 default")
+			Expect(*cfg.TopK).To(Equal(40))
+		})
+
+		It("injects top_k=40 for the empty/auto backend (GGUF auto-detect)", func() {
+			cfg := &ModelConfig{}
+
+			cfg.SetDefaults()
+
+			Expect(cfg.TopK).NotTo(BeNil(), "empty backend resolves to llama.cpp; default unchanged")
+			Expect(*cfg.TopK).To(Equal(40))
+		})
+
+		It("leaves TopK nil for the mlx backend", func() {
+			cfg := &ModelConfig{}
+			cfg.Backend = "mlx"
+
+			cfg.SetDefaults()
+
+			Expect(cfg.TopK).To(BeNil(),
+				"mlx_lm's intended default is top_k=0 (disabled); LocalAI must not inject 40")
+		})
+
+		It("leaves TopK nil for the mlx-vlm backend", func() {
+			cfg := &ModelConfig{}
+			cfg.Backend = "mlx-vlm"
+
+			cfg.SetDefaults()
+
+			Expect(cfg.TopK).To(BeNil())
+		})
+
+		It("leaves TopK nil for the mlx-distributed backend", func() {
+			cfg := &ModelConfig{}
+			cfg.Backend = "mlx-distributed"
+
+			cfg.SetDefaults()
+
+			Expect(cfg.TopK).To(BeNil())
+		})
+
+		It("respects an explicit top_k even for the mlx backend", func() {
+			explicit := 7
+			cfg := &ModelConfig{}
+			cfg.Backend = "mlx"
+			cfg.TopK = &explicit
+
+			cfg.SetDefaults()
+
+			Expect(cfg.TopK).NotTo(BeNil())
+			Expect(*cfg.TopK).To(Equal(7))
+		})
+	})
 })
--- a/core/config/voice_gate_test.go
+++ b/core/config/voice_gate_test.go
@@ -0,0 +1,73 @@
+package config
+
+import (
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("PipelineVoiceRecognition", func() {
+	Describe("Normalize", func() {
+		It("fills defaults for empty fields", func() {
+			v := PipelineVoiceRecognition{Model: "spk"}
+			v.Normalize()
+			Expect(v.Mode).To(Equal(VoiceGateModeIdentify))
+			Expect(v.When).To(Equal(VoiceGateWhenEvery))
+			Expect(v.OnReject).To(Equal(VoiceGateRejectEvent))
+			Expect(v.Threshold).To(BeNumerically("~", defaultVoiceGateThreshold, 1e-6))
+		})
+		It("keeps explicit values", func() {
+			v := PipelineVoiceRecognition{Model: "spk", Mode: VoiceGateModeVerify, When: VoiceGateWhenFirst, OnReject: VoiceGateRejectSilent, Threshold: 0.4}
+			v.Normalize()
+			Expect(v.Mode).To(Equal(VoiceGateModeVerify))
+			Expect(v.When).To(Equal(VoiceGateWhenFirst))
+			Expect(v.OnReject).To(Equal(VoiceGateRejectSilent))
+			Expect(v.Threshold).To(BeNumerically("~", 0.4, 1e-6))
+		})
+	})
+
+	Describe("Validate", func() {
+		It("requires a registry for identify mode", func() {
+			v := PipelineVoiceRecognition{Model: "spk", Mode: VoiceGateModeIdentify}
+			Expect(v.Validate(false)).To(HaveOccurred())
+			Expect(v.Validate(true)).ToNot(HaveOccurred())
+		})
+		It("requires references for verify mode", func() {
+			v := PipelineVoiceRecognition{Model: "spk", Mode: VoiceGateModeVerify}
+			Expect(v.Validate(false)).To(HaveOccurred())
+			v.References = []VoiceReference{{Name: "a", Audio: "/a.wav"}}
+			Expect(v.Validate(false)).ToNot(HaveOccurred())
+		})
+		It("rejects a reference with no audio path", func() {
+			v := PipelineVoiceRecognition{Model: "spk", Mode: VoiceGateModeVerify, References: []VoiceReference{{Name: "a"}}}
+			Expect(v.Validate(false)).To(HaveOccurred())
+		})
+		It("rejects unknown enum values", func() {
+			Expect((PipelineVoiceRecognition{Model: "spk", Mode: "bogus"}).Validate(true)).To(HaveOccurred())
+			Expect((PipelineVoiceRecognition{Model: "spk", Mode: VoiceGateModeIdentify, When: "bogus"}).Validate(true)).To(HaveOccurred())
+			Expect((PipelineVoiceRecognition{Model: "spk", Mode: VoiceGateModeIdentify, OnReject: "bogus"}).Validate(true)).To(HaveOccurred())
+		})
+		It("accepts a zero (unset) threshold", func() {
+			v := PipelineVoiceRecognition{Model: "spk", Mode: VoiceGateModeIdentify, Threshold: 0}
+			Expect(v.Validate(true)).ToNot(HaveOccurred())
+		})
+		It("rejects an out-of-range threshold", func() {
+			Expect((PipelineVoiceRecognition{Model: "spk", Mode: VoiceGateModeIdentify, Threshold: 5}).Validate(true)).To(HaveOccurred())
+			Expect((PipelineVoiceRecognition{Model: "spk", Mode: VoiceGateModeIdentify, Threshold: -1}).Validate(true)).To(HaveOccurred())
+		})
+		It("rejects an empty model", func() {
+			Expect((PipelineVoiceRecognition{Mode: VoiceGateModeIdentify}).Validate(true)).To(HaveOccurred())
+		})
+	})
+
+	Describe("VoiceGateEnabled", func() {
+		It("is false when block absent", func() {
+			Expect((Pipeline{}).VoiceGateEnabled()).To(BeFalse())
+		})
+		It("is true when a model is set", func() {
+			Expect((Pipeline{VoiceRecognition: &PipelineVoiceRecognition{Model: "spk"}}).VoiceGateEnabled()).To(BeTrue())
+		})
+		It("is true when the block is present even without a model (fails closed downstream)", func() {
+			Expect((Pipeline{VoiceRecognition: &PipelineVoiceRecognition{}}).VoiceGateEnabled()).To(BeTrue())
+		})
+	})
+})
--- a/core/gallery/importers/omnivoice_test.go
+++ b/core/gallery/importers/omnivoice_test.go
@@ -0,0 +1,32 @@
+package importers_test
+
+import (
+	"encoding/json"
+	"fmt"
+
+	"github.com/mudler/LocalAI/core/gallery/importers"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("OmniVoice pref-only guard", func() {
+	Context("With only a bare OmniVoice GGUF URI", func() {
+		It("does not auto-import as omnivoice-cpp", func() {
+			// omnivoice-cpp is registered as a preference-only backend
+			// (AutoDetect:false in the /backends/known registry) and no
+			// importer emits it. Discovery of a bare OmniVoice GGUF may match
+			// a generic GGUF importer (e.g. llama-cpp) or fail/be ambiguous;
+			// the single hard requirement is that it never resolves to
+			// omnivoice-cpp on its own.
+			const uri = "huggingface://Serveurperso/OmniVoice-GGUF/omnivoice-base-Q8_0.gguf"
+
+			modelConfig, err := importers.DiscoverModelConfig(uri, json.RawMessage(`{}`))
+			if err == nil {
+				Expect(modelConfig.ConfigFile).ToNot(ContainSubstring("backend: omnivoice-cpp"), fmt.Sprintf("Model config: %+v", modelConfig))
+			}
+			// A discovery error (including an ambiguous match) is an
+			// acceptable outcome for a pref-only backend, so it is not asserted.
+		})
+	})
+})
--- a/core/http/endpoints/localai/backend.go
+++ b/core/http/endpoints/localai/backend.go
@@ -36,6 +36,7 @@ var knownPrefOnlyBackends = []schema.KnownBackend{
 	{Name: "kokoros", Modality: "tts", AutoDetect: false, Description: "Kokoros TTS (preference-only)"},
 	{Name: "qwen-tts", Modality: "tts", AutoDetect: false, Description: "Qwen TTS (preference-only)"},
 	{Name: "qwen3-tts-cpp", Modality: "tts", AutoDetect: false, Description: "Qwen3 TTS C++ (preference-only)"},
+	{Name: "omnivoice-cpp", Modality: "tts", AutoDetect: false, Description: "OmniVoice C++ TTS with voice cloning and voice design (preference-only)"},
 	{Name: "faster-qwen3-tts", Modality: "tts", AutoDetect: false, Description: "Faster Qwen3 TTS (preference-only)"},
 	// Detection
 	{Name: "sam3-cpp", Modality: "detection", AutoDetect: false, Description: "SAM3 C++ object detection (preference-only)"},
--- a/core/http/endpoints/localai/nodes.go
+++ b/core/http/endpoints/localai/nodes.go
@@ -937,12 +937,13 @@ func GetSchedulingEndpoint(registry *nodes.NodeRegistry) echo.HandlerFunc {
 // distinguishable from an explicit zero. On update, an omitted prefix-cache
 // field preserves the model's previously-configured value instead of resetting
 // it (see SetSchedulingEndpoint's PATCH-style merge). ModelName, NodeSelector,
-// MinReplicas and MaxReplicas keep their full-replace PUT semantics.
+// MinReplicas, MaxReplicas and SpreadAll keep their full-replace PUT semantics.
 type SetSchedulingRequest struct {
 	ModelName           string            `json:"model_name"`
 	NodeSelector        map[string]string `json:"node_selector,omitempty"`
 	MinReplicas         int               `json:"min_replicas"`
 	MaxReplicas         int               `json:"max_replicas"`
+	SpreadAll           bool              `json:"spread_all,omitempty"`
 	RoutePolicy         *string           `json:"route_policy,omitempty"`
 	BalanceAbsThreshold *int              `json:"balance_abs_threshold,omitempty"`
 	BalanceRelThreshold *float64          `json:"balance_rel_threshold,omitempty"`
@@ -959,6 +960,9 @@ func validateSchedulingRequest(req SetSchedulingRequest, routePolicy string, abs
 	if req.ModelName == "" {
 		return errors.New("model_name is required")
 	}
+	if req.SpreadAll && (req.MinReplicas != 0 || req.MaxReplicas != 0) {
+		return errors.New("spread_all and min_replicas/max_replicas are mutually exclusive")
+	}
 	if req.MinReplicas < 0 {
 		return errors.New("min_replicas must be >= 0")
 	}
@@ -1045,6 +1049,7 @@ func SetSchedulingEndpoint(registry *nodes.NodeRegistry) echo.HandlerFunc {
 			NodeSelector:        selectorJSON,
 			MinReplicas:         req.MinReplicas,
 			MaxReplicas:         req.MaxReplicas,
+			SpreadAll:           req.SpreadAll,
 			RoutePolicy:         routePolicy,
 			BalanceAbsThreshold: absThr,
 			BalanceRelThreshold: relThr,
--- a/core/http/endpoints/localai/nodes_scheduling_test.go
+++ b/core/http/endpoints/localai/nodes_scheduling_test.go
@@ -0,0 +1,22 @@
+package localai
+
+import (
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("validateSchedulingRequest spread_all", func() {
+	// validate runs req through validateSchedulingRequest with an empty route
+	// policy and zero for every remaining argument.
+	validate := func(req SetSchedulingRequest) error {
+		return validateSchedulingRequest(req, "", 0, 0, 0)
+	}
+
+	It("rejects spread_all combined with min_replicas", func() {
+		req := SetSchedulingRequest{ModelName: "m", SpreadAll: true, MinReplicas: 2}
+		Expect(validate(req)).To(MatchError(ContainSubstring("mutually exclusive")))
+	})
+
+	It("accepts spread_all alone", func() {
+		req := SetSchedulingRequest{ModelName: "m", SpreadAll: true}
+		Expect(validate(req)).ToNot(HaveOccurred())
+	})
+})
--- a/core/http/endpoints/openai/realtime.go
+++ b/core/http/endpoints/openai/realtime.go
@@ -133,6 +133,13 @@ type Session struct {
 	// silently strip Manage Mode's tools.
 	AssistantTools []types.ToolUnion

+	// voiceGate is non-nil when pipeline.voice_recognition is configured. It
+	// authorizes each committed utterance's speaker before the LLM runs.
+	voiceGate *voiceGate
+	// gateMu guards the when:first verification state below.
+	gateMu        sync.Mutex
+	voiceVerified bool
+
 	// Response cancellation: protects activeResponseCancel/activeResponseDone
 	responseMu           sync.Mutex
 	activeResponseCancel context.CancelFunc
@@ -514,6 +521,23 @@ func runRealtimeSession(application *application.Application, t Transport, model
 	}
 	session.ModelInterface = m

+	if cfg.Pipeline.VoiceGateEnabled() {
+		gate, gerr := newVoiceGate(
+			*cfg.Pipeline.VoiceRecognition,
+			application.ModelConfigLoader(),
+			application.ModelLoader(),
+			application.ApplicationConfig(),
+			application.VoiceRegistry(),
+		)
+		if gerr != nil {
+			xlog.Error("failed to initialize voice recognition gate", "error", gerr)
+			sendError(t, "voice_gate_error", gerr.Error(), "", "")
+			return
+		}
+		session.voiceGate = gate
+		xlog.Info("realtime voice recognition gate enabled", "mode", gate.cfg.Mode, "when", gate.cfg.When)
+	}
+
 	// Store the session and notify the transport (for WebRTC audio track handling)
 	sessionLock.Lock()
 	sessions[sessionID] = session
@@ -990,8 +1014,18 @@ func updateSession(session *Session, update *types.SessionUnion, cl *config.Mode
 	}

 	if rt.Audio != nil && rt.Audio.Input != nil && rt.Audio.Input.Transcription != nil {
-		session.InputAudioTranscription = rt.Audio.Input.Transcription
-		session.ModelConfig.Pipeline.Transcription = rt.Audio.Input.Transcription.Model
+		trUpd := rt.Audio.Input.Transcription
+		// A language-only update (e.g. a client forcing the STT language) carries
+		// an empty Model. Preserve the pipeline's configured transcription backend
+		// instead of blanking it — otherwise the next utterance transcribes against
+		// an empty model and the backend RPC fails with "unimplemented".
+		if trUpd.Model == "" && session.InputAudioTranscription != nil {
+			trUpd.Model = session.InputAudioTranscription.Model
+		}
+		session.InputAudioTranscription = trUpd
+		if trUpd.Model != "" {
+			session.ModelConfig.Pipeline.Transcription = trUpd.Model
+		}
 	}

 	if rt.Model != "" || (rt.Audio != nil && rt.Audio.Output != nil && rt.Audio.Output.Voice != "") || (rt.Audio != nil && rt.Audio.Input != nil && rt.Audio.Input.Transcription != nil) {
@@ -1259,6 +1293,39 @@ func commitUtterance(ctx context.Context, utt []byte, session *Session, conv *Co

 	f.Sync()

+	// Start speaker verification concurrently with transcription. This is a
+	// latency optimization only: there is a hard join below before the LLM, so
+	// an unauthorized utterance never reaches generateResponse (no LLM, no
+	// tools, no TTS) regardless of how fast transcription finishes. A rejected
+	// turn wastes only transcription compute, which has no side effects. The
+	// transcript is still emitted to the same peer that sent the audio, which
+	// reveals nothing new to them.
+	type gateOutcome struct {
+		allowed bool
+		matched string
+		reason  string
+		err     error
+	}
+	var gateCh chan gateOutcome
+	runGate := false
+	if session.voiceGate != nil && session.InputAudioTranscription != nil {
+		skip := false
+		if session.voiceGate.cfg.When == config.VoiceGateWhenFirst {
+			session.gateMu.Lock()
+			skip = session.voiceVerified
+			session.gateMu.Unlock()
+		}
+		if !skip {
+			runGate = true
+			gateCh = make(chan gateOutcome, 1)
+			wavPath := f.Name()
+			go func() {
+				allowed, matched, reason, gerr := session.voiceGate.Authorize(ctx, wavPath)
+				gateCh <- gateOutcome{allowed: allowed, matched: matched, reason: reason, err: gerr}
+			}()
+		}
+	}
+
 	// TODO: If we have a real any-to-any model then transcription is optional
 	var transcript string
 	if session.InputAudioTranscription != nil {
@@ -1268,14 +1335,54 @@ func commitUtterance(ctx context.Context, utt []byte, session *Session, conv *Co
 		var err error
 		transcript, err = emitTranscription(ctx, t, session, generateItemID(), f.Name())
 		if err != nil {
+			// Drain the gate goroutine before returning so its in-flight read of
+			// the temp WAV finishes before the deferred os.Remove fires.
+			if runGate {
+				<-gateCh
+			}
 			sendError(t, "transcription_failed", err.Error(), "", "event_TODO")
 			return
 		}
 	} else {
+		// The voice gate runs only on the transcription path above; if an
+		// any-to-any model path is added here, join the gate before responding.
 		sendNotImplemented(t, "any-to-any models")
 		return
 	}

+	// Join on the gate before any side-effecting step.
+	if runGate {
+		out := <-gateCh
+		allowed := out.allowed
+		reason := out.reason
+		if out.err != nil {
+			// Fail closed: a gate that cannot decide must not let audio through.
+			xlog.Error("voice recognition gate error", "error", out.err)
+			allowed = false
+			reason = "verification error"
+		}
+		alreadyVerified := false
+		if session.voiceGate.cfg.When == config.VoiceGateWhenFirst {
+			session.gateMu.Lock()
+			alreadyVerified = session.voiceVerified
+			session.gateMu.Unlock()
+		}
+		proceed, markVerified := session.voiceGate.decide(alreadyVerified, allowed)
+		if !proceed {
+			xlog.Debug("voice recognition gate rejected utterance", "reason", reason)
+			if session.voiceGate.cfg.OnReject == config.VoiceGateRejectEvent {
+				sendError(t, "speaker_not_authorized", "speaker not authorized: "+reason, "", "event_TODO")
+			}
+			return
+		}
+		xlog.Debug("voice recognition gate authorized utterance", "speaker", out.matched)
+		if markVerified {
+			session.gateMu.Lock()
+			session.voiceVerified = true
+			session.gateMu.Unlock()
+		}
+	}
+
 	if !session.TranscriptionOnly {
 		generateResponse(ctx, session, utt, transcript, conv, t)
 	}
--- a/core/http/endpoints/openai/realtime_voicegate.go
+++ b/core/http/endpoints/openai/realtime_voicegate.go
@@ -0,0 +1,212 @@
+package openai
+
+import (
+	"context"
+	"fmt"
+	"math"
+
+	"github.com/mudler/LocalAI/core/backend"
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/services/voicerecognition"
+	"github.com/mudler/LocalAI/pkg/model"
+)
+
+// namedEmbedding pairs a verify-mode reference's name with the embedding
+// computed from that reference's audio.
+type namedEmbedding struct {
+	// name is reported as the matched speaker when this reference matches.
+	name string
+	// emb is the reference embedding compared against an utterance embedding.
+	emb  []float32
+}
+
+// voiceGate decides whether a committed utterance's speaker is authorized to
+// drive the realtime pipeline.
+//
+// Authorize dispatches on cfg.Mode: verify mode compares the utterance against
+// the configured references (refEmbeds, or refAudios when anti-spoofing is
+// enabled); any other mode looks the utterance up in registry.
+type voiceGate struct {
+	cfg       config.PipelineVoiceRecognition // normalized
+	registry  voicerecognition.Registry       // consulted in identify mode only
+	refEmbeds []namedEmbedding                // verify mode, pre-embedded refs
+	refAudios []config.VoiceReference         // verify + anti-spoofing: ref paths
+
+	// Seams for testing; set by newVoiceGate to call the real backend.
+	// embedFn returns the speaker embedding for the WAV at wavPath; the
+	// authorize paths treat an empty embedding as "no speech detected".
+	// verifyFn reports whether uttWav and refWav were verified as a match.
+	embedFn  func(ctx context.Context, wavPath string) ([]float32, error)
+	verifyFn func(ctx context.Context, uttWav, refWav string) (bool, error)
+}
+
+// newVoiceGate constructs the gate described by a pipeline's voice_recognition
+// block.
+//
+// The config is normalized and validated first, so a bad config is rejected
+// before the loader is touched. The recognition model's config is then loaded
+// by name and validated, and the embed/verify seams are bound to the real
+// backend calls. In verify mode the references are prepared up front: with
+// anti-spoofing they are kept as audio paths (each turn calls the backend
+// verify once per reference); otherwise each reference is embedded once here
+// so a turn costs one utterance embed plus cheap cosine comparisons.
+//
+// registry may be nil; whether that is acceptable is decided by cfg.Validate.
+// An error is returned when validation, model-config loading, or any
+// reference embedding fails.
+func newVoiceGate(
+	cfg config.PipelineVoiceRecognition,
+	cl *config.ModelConfigLoader,
+	ml *model.ModelLoader,
+	appConfig *config.ApplicationConfig,
+	registry voicerecognition.Registry,
+) (*voiceGate, error) {
+	cfg.Normalize()
+	if err := cfg.Validate(registry != nil); err != nil {
+		return nil, err
+	}
+
+	recCfg, err := cl.LoadModelConfigFileByName(cfg.Model, ml.ModelPath)
+	if err != nil {
+		return nil, fmt.Errorf("voice_recognition: failed to load model %q: %w", cfg.Model, err)
+	}
+	if valid, _ := recCfg.Validate(); !valid {
+		return nil, fmt.Errorf("voice_recognition: invalid model config %q", cfg.Model)
+	}
+
+	// Real backend seams; both close over the loaded recognition model config.
+	embed := func(ctx context.Context, wavPath string) ([]float32, error) {
+		res, err := backend.VoiceEmbed(ctx, wavPath, ml, appConfig, *recCfg)
+		if err != nil {
+			return nil, err
+		}
+		return res.Embedding, nil
+	}
+	verify := func(ctx context.Context, uttWav, refWav string) (bool, error) {
+		res, err := backend.VoiceVerify(ctx, uttWav, refWav, cfg.Threshold, true, ml, appConfig, *recCfg)
+		if err != nil {
+			return false, err
+		}
+		return res.Verified, nil
+	}
+
+	gate := &voiceGate{
+		cfg:      cfg,
+		registry: registry,
+		embedFn:  embed,
+		verifyFn: verify,
+	}
+
+	// Only verify mode carries references to prepare.
+	if cfg.Mode != config.VoiceGateModeVerify {
+		return gate, nil
+	}
+	if cfg.AntiSpoofing {
+		// Anti-spoofing compares audio to audio per turn, so keep the paths.
+		gate.refAudios = cfg.References
+		return gate, nil
+	}
+	for _, ref := range cfg.References {
+		emb, err := gate.embedFn(context.Background(), ref.Audio)
+		if err != nil {
+			return nil, fmt.Errorf("voice_recognition: failed to embed reference %q: %w", ref.Name, err)
+		}
+		gate.refEmbeds = append(gate.refEmbeds, namedEmbedding{name: ref.Name, emb: emb})
+	}
+	return gate, nil
+}
+
+// Authorize decides allow/deny for the utterance stored at wavPath, using the
+// strategy selected by the gate's mode (verify, or identify for anything else).
+//
+//	allowed: speaker is authorized.
+//	matched: matched person's name (informational), empty if none.
+//	reason:  human-readable deny reason.
+//	err:     backend failure (caller should fail closed).
+func (g *voiceGate) Authorize(ctx context.Context, wavPath string) (allowed bool, matched string, reason string, err error) {
+	switch g.cfg.Mode {
+	case config.VoiceGateModeVerify:
+		return g.authorizeVerify(ctx, wavPath)
+	default:
+		return g.authorizeIdentify(ctx, wavPath)
+	}
+}
+
+// authorizeIdentify embeds the utterance, asks the registry for its single
+// nearest match, and allows the turn only when that match is within the
+// configured threshold and passes the allow list.
+//
+// It returns (allowed, matched, reason, err). err is non-nil only when the
+// embed or the registry lookup fails; every other denial is a policy outcome
+// described by reason. matched carries the nearest speaker's name even when
+// that speaker is denied, and is empty when there was no match at all.
+//
+// A NaN distance is denied explicitly: every ordered comparison involving NaN
+// is false, so a bare "distance > threshold" check would let such a match
+// through and the gate would fail open.
+func (g *voiceGate) authorizeIdentify(ctx context.Context, wavPath string) (bool, string, string, error) {
+	emb, err := g.embedFn(ctx, wavPath)
+	if err != nil {
+		return false, "", "embed failed", err
+	}
+	if len(emb) == 0 {
+		return false, "", "no speech detected", nil
+	}
+	// Only the closest match is considered (topK = 1).
+	matches, err := g.registry.Identify(ctx, emb, 1)
+	if err != nil {
+		return false, "", "identify failed", err
+	}
+	if len(matches) == 0 {
+		return false, "", "unknown speaker", nil
+	}
+	m := matches[0]
+	if math.IsNaN(float64(m.Distance)) {
+		// Fail closed on an undefined distance instead of treating it as a match.
+		return false, m.Metadata.Name, "invalid match distance", nil
+	}
+	if m.Distance > g.cfg.Threshold {
+		return false, m.Metadata.Name, "distance above threshold", nil
+	}
+	if !g.allowMatch(m.Metadata) {
+		return false, m.Metadata.Name, "speaker not in allow list", nil
+	}
+	return true, m.Metadata.Name, "", nil
+}
+
+// allowMatch reports whether a matched identity passes the configured allow
+// list. With no names and no labels configured, every matched speaker passes.
+// Otherwise the speaker passes when its name is listed, or when its metadata
+// carries any of the listed label keys (the label's value is not inspected).
+func (g *voiceGate) allowMatch(meta voicerecognition.Metadata) bool {
+	names, labels := g.cfg.Allow.Names, g.cfg.Allow.Labels
+	if len(names)+len(labels) == 0 {
+		return true
+	}
+	for i := range names {
+		if names[i] == meta.Name {
+			return true
+		}
+	}
+	for i := range labels {
+		if _, present := meta.Labels[labels[i]]; present {
+			return true
+		}
+	}
+	return false
+}
+
+func (g *voiceGate) authorizeVerify(ctx context.Context, wavPath string) (bool, string, string, error) {
+	if g.cfg.AntiSpoofing {
+		for _, r := range g.refAudios {
+			ok, err := g.verifyFn(ctx, wavPath, r.Audio)
+			if err != nil {
+				return false, "", "verify failed", err
+			}
+			if ok {
+				return true, r.Name, "", nil
+			}
+		}
+		return false, "", "no reference matched", nil
+	}
+
+	emb, err := g.embedFn(ctx, wavPath)
+	if err != nil {
+		return false, "", "embed failed", err
+	}
+	if len(emb) == 0 {
+		return false, "", "no speech detected", nil
+	}
+	for _, r := range g.refEmbeds {
+		if cosineDistance(emb, r.emb) <= g.cfg.Threshold {
+			return true, r.name, "", nil
+		}
+	}
+	return false, "", "no reference matched", nil
+}
+
+// decide combines an Authorize result with the gate's when-policy and the
+// session's prior verification state.
+//
+//	proceed:      run the LLM response for this utterance.
+//	markVerified: record a successful first-utterance verification.
+//
+// Under any policy other than when:first, the utterance proceeds iff it was
+// allowed and nothing is ever marked. Under when:first, an already verified
+// session always proceeds (the caller normally skips Authorize in that case,
+// but the answer is the same if it reaches here); otherwise the utterance
+// proceeds and is marked verified exactly when it was allowed.
+func (g *voiceGate) decide(alreadyVerified, allowed bool) (proceed, markVerified bool) {
+	firstOnly := g.cfg.When == config.VoiceGateWhenFirst
+	switch {
+	case !firstOnly:
+		return allowed, false
+	case alreadyVerified:
+		return true, false
+	default:
+		return allowed, allowed
+	}
+}
+
+// cosineDistance computes 1 - cosine_similarity of a and b, so that lower
+// means closer (the same convention the voice registry uses for distances).
+// It returns 1, which callers treat as "no match", when either vector is
+// empty, when the lengths differ, or when either vector has zero magnitude.
+// Accumulation is done in float64 and the result narrowed to float32.
+func cosineDistance(a, b []float32) float32 {
+	if len(a) == 0 || len(b) != len(a) {
+		return 1
+	}
+	var dot, sqA, sqB float64
+	for i, av := range a {
+		x, y := float64(av), float64(b[i])
+		dot += x * y
+		sqA += x * x
+		sqB += y * y
+	}
+	if sqA == 0 || sqB == 0 {
+		return 1
+	}
+	return float32(1 - dot/(math.Sqrt(sqA)*math.Sqrt(sqB)))
+}
--- a/core/http/endpoints/openai/realtime_voicegate_integration_test.go
+++ b/core/http/endpoints/openai/realtime_voicegate_integration_test.go
@@ -0,0 +1,154 @@
+package openai
+
+import (
+	"context"
+	"errors"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	"github.com/mudler/LocalAI/core/backend"
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
+	"github.com/mudler/LocalAI/core/schema"
+	"github.com/mudler/LocalAI/core/services/voicerecognition"
+)
+
+// These specs drive the REAL commitUtterance path end to end (gate goroutine,
+// the hard join before the LLM, the reject event, and when:first session
+// trust) using the existing fakeTransport/fakeModel doubles. They are the
+// integration counterpart to the unit specs in realtime_voicegate_test.go:
+// here the gate is wired into a Session exactly as runRealtimeSession wires it.
+
+// itGate assembles an identify-mode gate for the integration specs. Its fake
+// registry always answers with one match named matchName at distance 0.1
+// (inside the 0.25 threshold), its embedFn always yields embed/embErr, and
+// allowName is the only identity on the allow list. when and onReject select
+// the gate policy.
+func itGate(allowName, matchName string, embed []float32, embErr error, when, onReject string) *voiceGate {
+	policy := config.PipelineVoiceRecognition{
+		Mode:      config.VoiceGateModeIdentify,
+		Threshold: 0.25,
+		When:      when,
+		OnReject:  onReject,
+		Allow:     config.VoiceRecognitionAllow{Names: []string{allowName}},
+	}
+	nearest := voicerecognition.Match{
+		Distance: 0.1,
+		Metadata: voicerecognition.Metadata{Name: matchName},
+	}
+	stubEmbed := func(context.Context, string) ([]float32, error) { return embed, embErr }
+
+	return &voiceGate{
+		cfg:      policy,
+		registry: &fakeRegistry{matches: []voicerecognition.Match{nearest}},
+		embedFn:  stubEmbed,
+	}
+}
+
+// itSession builds a Session with the given gate attached, backed by a
+// fakeModel primed for one complete pipeline turn (a final transcript, two
+// streamed LLM tokens, one TTS chunk at 24 kHz). LLM and TTS streaming are
+// both switched on, mirroring the setup in realtime_stream_test.go so that
+// triggerResponse runs through to response.done. The fakeModel is returned
+// alongside the session.
+func itSession(gate *voiceGate) (*Session, *fakeModel) {
+	streaming := true
+	model := &fakeModel{
+		cfg:             &config.ModelConfig{},
+		transcribeFinal: &schema.TranscriptionResult{Text: "hello"},
+		predictTokens:   []string{"Hi", " there."},
+		predictResp:     backend.LLMResponse{Response: "Hi there."},
+		ttsStreamChunks: [][]byte{{1}},
+		ttsStreamRate:   24000,
+	}
+	pipeline := config.Pipeline{
+		Streaming: config.PipelineStreaming{LLM: &streaming, TTS: &streaming},
+	}
+
+	return &Session{
+		OutputSampleRate:        24000,
+		InputAudioTranscription: &types.AudioTranscription{},
+		ModelInterface:          model,
+		ModelConfig:             &config.ModelConfig{Pipeline: pipeline},
+		voiceGate:               gate,
+	}, model
+}
+
+// hasSpeakerNotAuthorized scans the events recorded by the fake transport and
+// reports whether any of them is an error event carrying the
+// speaker_not_authorized code.
+func hasSpeakerNotAuthorized(tr *fakeTransport) bool {
+	for _, recorded := range tr.events {
+		errEv, isErr := recorded.(types.ErrorEvent)
+		if !isErr {
+			continue
+		}
+		if errEv.Error.Code == "speaker_not_authorized" {
+			return true
+		}
+	}
+	return false
+}
+
+// Each spec calls commitUtterance directly with a gated Session and a fresh
+// fakeTransport, then inspects the events the transport recorded: a
+// response.done event means the LLM/TTS pipeline ran, and a
+// speaker_not_authorized error event means the gate rejected the turn.
+var _ = Describe("realtime voice gate integration (commitUtterance)", func() {
+	utt := make([]byte, 32) // non-empty PCM so commitUtterance proceeds
+
+	It("allows an authorized speaker through to a full response", func() {
+		session, _ := itSession(itGate("alice", "alice", []float32{1, 0, 0}, nil,
+			config.VoiceGateWhenEvery, config.VoiceGateRejectEvent))
+		tr := &fakeTransport{}
+
+		commitUtterance(context.Background(), utt, session, &Conversation{}, tr)
+
+		Expect(hasSpeakerNotAuthorized(tr)).To(BeFalse())
+		// The LLM/TTS pipeline ran to completion.
+		Expect(tr.countEvents(types.ServerEventTypeResponseDone)).To(BeNumerically(">=", 1))
+		// Transcription still happened (parallel with the gate).
+		Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionCompleted)).To(Equal(1))
+	})
+
+	It("drops an unauthorized speaker before the LLM and emits a reject event", func() {
+		// match name "mallory" is not in the allow list → deny.
+		session, _ := itSession(itGate("alice", "mallory", []float32{1, 0, 0}, nil,
+			config.VoiceGateWhenEvery, config.VoiceGateRejectEvent))
+		tr := &fakeTransport{}
+
+		commitUtterance(context.Background(), utt, session, &Conversation{}, tr)
+
+		// Hard barrier: the LLM/TTS pipeline never ran.
+		Expect(tr.countEvents(types.ServerEventTypeResponseDone)).To(Equal(0))
+		// The client was told why.
+		Expect(hasSpeakerNotAuthorized(tr)).To(BeTrue())
+		// Transcription of the rejected utterance still emitted (sent only to the
+		// peer that produced the audio; reveals nothing new).
+		Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionCompleted)).To(Equal(1))
+	})
+
+	It("fails closed on a gate backend error", func() {
+		// The embed stub returns an error, so the gate cannot decide; the turn
+		// must be rejected rather than allowed through.
+		session, _ := itSession(itGate("alice", "alice", nil, errors.New("backend down"),
+			config.VoiceGateWhenEvery, config.VoiceGateRejectEvent))
+		tr := &fakeTransport{}
+
+		commitUtterance(context.Background(), utt, session, &Conversation{}, tr)
+
+		Expect(tr.countEvents(types.ServerEventTypeResponseDone)).To(Equal(0))
+		Expect(hasSpeakerNotAuthorized(tr)).To(BeTrue())
+	})
+
+	It("drops silently when on_reject is drop_silent (no error event)", func() {
+		session, _ := itSession(itGate("alice", "mallory", []float32{1, 0, 0}, nil,
+			config.VoiceGateWhenEvery, config.VoiceGateRejectSilent))
+		tr := &fakeTransport{}
+
+		commitUtterance(context.Background(), utt, session, &Conversation{}, tr)
+
+		Expect(tr.countEvents(types.ServerEventTypeResponseDone)).To(Equal(0))
+		Expect(hasSpeakerNotAuthorized(tr)).To(BeFalse())
+	})
+
+	It("when:first trusts the session after one match, even if later embeds fail", func() {
+		// The two commits share one session on purpose: the second relies on
+		// the verified state left behind by the first.
+		gate := itGate("alice", "alice", []float32{1, 0, 0}, nil,
+			config.VoiceGateWhenFirst, config.VoiceGateRejectEvent)
+		session, _ := itSession(gate)
+
+		// First utterance: authorized, marks the session verified.
+		tr1 := &fakeTransport{}
+		commitUtterance(context.Background(), utt, session, &Conversation{}, tr1)
+		Expect(hasSpeakerNotAuthorized(tr1)).To(BeFalse())
+		Expect(tr1.countEvents(types.ServerEventTypeResponseDone)).To(BeNumerically(">=", 1))
+
+		// Break the gate: any further Authorize would now error.
+		gate.embedFn = func(context.Context, string) ([]float32, error) { return nil, errors.New("boom") }
+
+		// Second utterance still proceeds because when:first skips re-verification.
+		tr2 := &fakeTransport{}
+		commitUtterance(context.Background(), utt, session, &Conversation{}, tr2)
+		Expect(hasSpeakerNotAuthorized(tr2)).To(BeFalse())
+		Expect(tr2.countEvents(types.ServerEventTypeResponseDone)).To(BeNumerically(">=", 1))
+	})
+})
--- a/core/http/endpoints/openai/realtime_voicegate_test.go
+++ b/core/http/endpoints/openai/realtime_voicegate_test.go
@@ -0,0 +1,231 @@
+package openai
+
+import (
+	"context"
+	"errors"
+
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/services/voicerecognition"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// Pins cosineDistance at its landmark values (0 identical, 1 orthogonal,
+// 2 opposite) and at the degenerate inputs that must read as "no match" (1).
+var _ = Describe("cosineDistance", func() {
+	const tol = 1e-6
+
+	It("is 0 for identical vectors", func() {
+		got := cosineDistance([]float32{1, 0, 0}, []float32{1, 0, 0})
+		Expect(got).To(BeNumerically("~", 0, tol))
+	})
+	It("is ~1 for orthogonal vectors", func() {
+		got := cosineDistance([]float32{1, 0}, []float32{0, 1})
+		Expect(got).To(BeNumerically("~", 1, tol))
+	})
+	It("is ~2 for opposite vectors", func() {
+		got := cosineDistance([]float32{1, 0}, []float32{-1, 0})
+		Expect(got).To(BeNumerically("~", 2, tol))
+	})
+	It("returns 1 for length mismatch", func() {
+		got := cosineDistance([]float32{1, 0}, []float32{1})
+		Expect(got).To(BeNumerically("~", 1, tol))
+	})
+	It("returns 1 for a zero vector", func() {
+		got := cosineDistance([]float32{0, 0}, []float32{1, 0})
+		Expect(got).To(BeNumerically("~", 1, tol))
+	})
+})
+
+// fakeRegistry is a canned voicerecognition registry double: Identify always
+// answers with the preset matches and err, Register echoes back the metadata
+// it was given, and Forget always succeeds.
+type fakeRegistry struct {
+	matches []voicerecognition.Match
+	err     error
+}
+
+// Register ignores the embedding and returns the metadata unchanged.
+func (f *fakeRegistry) Register(_ context.Context, _ []float32, meta voicerecognition.Metadata) (voicerecognition.Metadata, error) {
+	return meta, nil
+}
+
+// Identify ignores the probe and topK and returns the preset result.
+func (f *fakeRegistry) Identify(_ context.Context, _ []float32, _ int) ([]voicerecognition.Match, error) {
+	return f.matches, f.err
+}
+
+// Forget is a no-op that reports success.
+func (f *fakeRegistry) Forget(_ context.Context, _ string) error {
+	return nil
+}
+
+// Unit specs for identify mode. Each spec builds a gate around a fakeRegistry
+// and a stubbed embedFn, calls Authorize, and checks the allow/deny outcome,
+// the reported match name, the deny reason, and whether an error surfaced.
+var _ = Describe("voiceGate identify mode", func() {
+	// stubEmbed returns an embedFn that always yields emb/err.
+	stubEmbed := func(emb []float32, err error) func(context.Context, string) ([]float32, error) {
+		return func(context.Context, string) ([]float32, error) { return emb, err }
+	}
+	// mkGate builds an identify-mode gate (threshold 0.25) with the given allow
+	// list, a registry that returns matches, and an embedFn that returns a
+	// fixed non-empty embedding together with embErr.
+	mkGate := func(allow config.VoiceRecognitionAllow, matches []voicerecognition.Match, embErr error) *voiceGate {
+		return &voiceGate{
+			cfg:      config.PipelineVoiceRecognition{Mode: config.VoiceGateModeIdentify, Threshold: 0.25, When: config.VoiceGateWhenEvery, OnReject: config.VoiceGateRejectEvent, Allow: allow},
+			registry: &fakeRegistry{matches: matches},
+			embedFn:  stubEmbed([]float32{1, 0, 0}, embErr),
+		}
+	}
+
+	It("allows a registered speaker within threshold and in the allow list", func() {
+		g := mkGate(config.VoiceRecognitionAllow{Names: []string{"alice"}},
+			[]voicerecognition.Match{{Distance: 0.1, Metadata: voicerecognition.Metadata{Name: "alice"}}}, nil)
+		allowed, matched, _, err := g.Authorize(context.Background(), "x.wav")
+		Expect(err).ToNot(HaveOccurred())
+		Expect(allowed).To(BeTrue())
+		Expect(matched).To(Equal("alice"))
+	})
+	It("allows any registered speaker when the allow list is empty", func() {
+		g := mkGate(config.VoiceRecognitionAllow{},
+			[]voicerecognition.Match{{Distance: 0.1, Metadata: voicerecognition.Metadata{Name: "carol"}}}, nil)
+		allowed, _, _, _ := g.Authorize(context.Background(), "x.wav")
+		Expect(allowed).To(BeTrue())
+	})
+	It("allows by label", func() {
+		g := mkGate(config.VoiceRecognitionAllow{Labels: []string{"family"}},
+			[]voicerecognition.Match{{Distance: 0.1, Metadata: voicerecognition.Metadata{Name: "bob", Labels: map[string]string{"family": "yes"}}}}, nil)
+		allowed, _, _, _ := g.Authorize(context.Background(), "x.wav")
+		Expect(allowed).To(BeTrue())
+	})
+	It("denies a speaker not in the allow list", func() {
+		g := mkGate(config.VoiceRecognitionAllow{Names: []string{"alice"}},
+			[]voicerecognition.Match{{Distance: 0.1, Metadata: voicerecognition.Metadata{Name: "mallory"}}}, nil)
+		allowed, matched, reason, _ := g.Authorize(context.Background(), "x.wav")
+		Expect(allowed).To(BeFalse())
+		Expect(matched).To(Equal("mallory"))
+		Expect(reason).To(ContainSubstring("allow"))
+	})
+	It("denies a match above the threshold", func() {
+		// The denied speaker's name is still reported as the match.
+		g := mkGate(config.VoiceRecognitionAllow{},
+			[]voicerecognition.Match{{Distance: 0.9, Metadata: voicerecognition.Metadata{Name: "alice"}}}, nil)
+		allowed, matched, _, _ := g.Authorize(context.Background(), "x.wav")
+		Expect(allowed).To(BeFalse())
+		Expect(matched).To(Equal("alice"))
+	})
+	It("denies when no registry match", func() {
+		g := mkGate(config.VoiceRecognitionAllow{}, nil, nil)
+		allowed, _, reason, _ := g.Authorize(context.Background(), "x.wav")
+		Expect(allowed).To(BeFalse())
+		Expect(reason).To(ContainSubstring("unknown"))
+	})
+	It("denies (no error) when no speech is detected", func() {
+		g := mkGate(config.VoiceRecognitionAllow{}, nil, nil)
+		// An empty embedding with a nil error is the "no speech" signal.
+		g.embedFn = stubEmbed(nil, nil)
+		allowed, _, reason, err := g.Authorize(context.Background(), "x.wav")
+		Expect(err).ToNot(HaveOccurred())
+		Expect(allowed).To(BeFalse())
+		Expect(reason).To(ContainSubstring("no speech"))
+	})
+	It("denies and surfaces the error when embedding fails", func() {
+		g := mkGate(config.VoiceRecognitionAllow{}, nil, errors.New("boom"))
+		allowed, _, reason, err := g.Authorize(context.Background(), "x.wav")
+		Expect(err).To(HaveOccurred())
+		Expect(allowed).To(BeFalse())
+		Expect(reason).To(ContainSubstring("embed"))
+	})
+	It("denies and surfaces the error when identify fails", func() {
+		g := mkGate(config.VoiceRecognitionAllow{}, nil, nil)
+		g.registry = &fakeRegistry{err: errors.New("boom")}
+		allowed, _, _, err := g.Authorize(context.Background(), "x.wav")
+		Expect(err).To(HaveOccurred())
+		Expect(allowed).To(BeFalse())
+	})
+})
+
+// Unit specs for verify mode. Gates are built by two local helpers: embedGate
+// for the embedding-comparison path and spoofGate for the anti-spoofing path
+// that delegates to verifyFn.
+var _ = Describe("voiceGate verify mode", func() {
+	// verifyCfg is the shared verify-mode policy (threshold 0.25).
+	verifyCfg := func(antiSpoofing bool) config.PipelineVoiceRecognition {
+		return config.PipelineVoiceRecognition{
+			Mode:         config.VoiceGateModeVerify,
+			Threshold:    0.25,
+			AntiSpoofing: antiSpoofing,
+			When:         config.VoiceGateWhenEvery,
+			OnReject:     config.VoiceGateRejectEvent,
+		}
+	}
+	// embedGate holds one pre-embedded reference ("alice") and an embedFn that
+	// always yields emb/err for the utterance.
+	embedGate := func(emb []float32, err error) *voiceGate {
+		return &voiceGate{
+			cfg:       verifyCfg(false),
+			refEmbeds: []namedEmbedding{{name: "alice", emb: []float32{1, 0, 0}}},
+			embedFn:   func(context.Context, string) ([]float32, error) { return emb, err },
+		}
+	}
+	// spoofGate holds one reference audio ("alice") and the given verifyFn.
+	spoofGate := func(verify func(context.Context, string, string) (bool, error)) *voiceGate {
+		return &voiceGate{
+			cfg:       verifyCfg(true),
+			refAudios: []config.VoiceReference{{Name: "alice", Audio: "/alice.wav"}},
+			verifyFn:  verify,
+		}
+	}
+
+	It("allows when the utterance matches a reference embedding", func() {
+		g := embedGate([]float32{1, 0, 0}, nil)
+		allowed, matched, _, err := g.Authorize(context.Background(), "x.wav")
+		Expect(err).ToNot(HaveOccurred())
+		Expect(allowed).To(BeTrue())
+		Expect(matched).To(Equal("alice"))
+	})
+	It("denies when no reference is within threshold", func() {
+		g := embedGate([]float32{0, 1, 0}, nil)
+		allowed, _, reason, _ := g.Authorize(context.Background(), "x.wav")
+		Expect(allowed).To(BeFalse())
+		Expect(reason).To(ContainSubstring("reference"))
+	})
+	It("denies (no error) when no speech is detected", func() {
+		g := embedGate(nil, nil)
+		allowed, _, reason, err := g.Authorize(context.Background(), "x.wav")
+		Expect(err).ToNot(HaveOccurred())
+		Expect(allowed).To(BeFalse())
+		Expect(reason).To(ContainSubstring("no speech"))
+	})
+	It("denies and surfaces the error when embedding fails", func() {
+		g := embedGate(nil, errors.New("boom"))
+		allowed, _, _, err := g.Authorize(context.Background(), "x.wav")
+		Expect(err).To(HaveOccurred())
+		Expect(allowed).To(BeFalse())
+	})
+	It("uses verifyFn when anti-spoofing is enabled", func() {
+		called := false
+		g := spoofGate(func(context.Context, string, string) (bool, error) {
+			called = true
+			return true, nil
+		})
+		allowed, matched, _, err := g.Authorize(context.Background(), "x.wav")
+		Expect(err).ToNot(HaveOccurred())
+		Expect(called).To(BeTrue())
+		Expect(allowed).To(BeTrue())
+		Expect(matched).To(Equal("alice"))
+	})
+	It("denies and surfaces the error when verifyFn fails (anti-spoofing)", func() {
+		g := spoofGate(func(context.Context, string, string) (bool, error) {
+			return false, errors.New("boom")
+		})
+		allowed, _, _, err := g.Authorize(context.Background(), "x.wav")
+		Expect(err).To(HaveOccurred())
+		Expect(allowed).To(BeFalse())
+	})
+})
+
+// Both specs pass nil for every dependency: config validation must reject the
+// request before the loaders are dereferenced.
+var _ = Describe("newVoiceGate", func() {
+	It("fails fast when identify mode has no registry (before touching the loader)", func() {
+		identifyCfg := config.PipelineVoiceRecognition{
+			Model: "spk",
+			Mode:  config.VoiceGateModeIdentify,
+		}
+		gate, err := newVoiceGate(identifyCfg, nil, nil, nil, nil)
+		Expect(err).To(HaveOccurred())
+		Expect(gate).To(BeNil())
+	})
+	It("fails fast when verify mode has no references", func() {
+		verifyCfg := config.PipelineVoiceRecognition{
+			Model: "spk",
+			Mode:  config.VoiceGateModeVerify,
+		}
+		gate, err := newVoiceGate(verifyCfg, nil, nil, nil, nil)
+		Expect(err).To(HaveOccurred())
+		Expect(gate).To(BeNil())
+	})
+})
+
+// Truth-table specs for decide under the "every" and "first" when-policies.
+var _ = Describe("voiceGate decide", func() {
+	// newGate returns a gate whose only configured field is the when-policy.
+	newGate := func(when string) *voiceGate {
+		g := new(voiceGate)
+		g.cfg.When = when
+		return g
+	}
+
+	It("every: proceeds iff allowed, never marks verified", func() {
+		g := newGate(config.VoiceGateWhenEvery)
+
+		proceed, mark := g.decide(false, true)
+		Expect(proceed).To(BeTrue())
+		Expect(mark).To(BeFalse())
+
+		proceed, mark = g.decide(false, false)
+		Expect(proceed).To(BeFalse())
+		Expect(mark).To(BeFalse())
+	})
+	It("first: marks verified on first allow", func() {
+		g := newGate(config.VoiceGateWhenFirst)
+		proceed, mark := g.decide(false, true)
+		Expect(proceed).To(BeTrue())
+		Expect(mark).To(BeTrue())
+	})
+	It("first: denies on first reject without marking", func() {
+		g := newGate(config.VoiceGateWhenFirst)
+		proceed, mark := g.decide(false, false)
+		Expect(proceed).To(BeFalse())
+		Expect(mark).To(BeFalse())
+	})
+	It("first: proceeds without re-check once already verified", func() {
+		g := newGate(config.VoiceGateWhenFirst)
+		proceed, mark := g.decide(true, false)
+		Expect(proceed).To(BeTrue())
+		Expect(mark).To(BeFalse())
+	})
+})
--- a/core/http/react-ui/public/locales/ko/admin.json
+++ b/core/http/react-ui/public/locales/ko/admin.json
@@ -0,0 +1,85 @@
+{
+  "manage": {
+    "title": "시스템",
+    "subtitle": "설치된 모델과 백엔드를 관리합니다"
+  },
+  "settings": {
+    "title": "설정",
+    "subtitle": "LocalAI 런타임 설정을 구성합니다",
+    "saved": "설정이 저장되었습니다",
+    "saveFailed": "저장 실패: {{message}}",
+    "loadFailed": "설정을 불러오지 못했습니다: {{message}}",
+    "sections": {
+      "branding": "브랜딩",
+      "watchdog": "워치독",
+      "memory": "메모리",
+      "backends": "백엔드",
+      "performance": "성능",
+      "tracing": "트레이싱",
+      "api": "API 및 CORS",
+      "p2p": "P2P",
+      "galleries": "갤러리",
+      "apikeys": "API 키",
+      "agents": "에이전트 작업",
+      "agentpool": "에이전트 풀",
+      "assistant": "LocalAI 어시스턴트",
+      "responses": "응답"
+    }
+  },
+  "backends": {
+    "title": "백엔드 관리",
+    "subtitle": "모델을 구동할 AI 백엔드를 탐색하고 설치합니다"
+  },
+  "backendLogs": {
+    "title": "백엔드 로그",
+    "subtitle": "실행 중인 백엔드의 로그를 확인합니다",
+    "empty": "사용 가능한 로그가 없습니다"
+  },
+  "traces": {
+    "title": "트레이스",
+    "subtitle": "기록된 API 요청, 응답, 백엔드 작업을 확인합니다"
+  },
+  "nodes": {
+    "title": "분산 노드",
+    "subtitle": "백엔드 및 에이전트 워커 노드를 관리합니다"
+  },
+  "p2p": {
+    "title": "분산 AI 컴퓨팅",
+    "subtitle": "피어 투 피어 분산으로 여러 기기에 걸쳐 AI 워크로드를 확장합니다"
+  },
+  "users": {
+    "title": "사용자",
+    "subtitle": "등록된 사용자, 역할, 초대를 관리합니다"
+  },
+  "usage": {
+    "title": "사용량",
+    "subtitle": "API 토큰 사용량 통계",
+    "sources": {
+      "tab": "출처",
+      "mixTitle": "출처 구성",
+      "ribbonAria": "API 키 {{apikey}}%, 웹 UI {{web}}%, 레거시 {{legacy}}%",
+      "topSources": "기간별 상위 출처",
+      "searchPlaceholder": "이름 또는 접두사로 검색",
+      "sortBy": "정렬",
+      "sortTokens": "토큰",
+      "sortRequests": "요청",
+      "sortLastUsed": "마지막 사용",
+      "sortName": "이름",
+      "sortUser": "사용자",
+      "webUI": "웹 UI",
+      "legacy": "레거시",
+      "revoked": "해지됨",
+      "filteredTo": "필터: {{name}}",
+      "clearFilter": "필터 지우기",
+      "other": "기타 ({{count}})",
+      "noTrafficShort": "이 기간에는 요청이 없습니다.",
+      "noKeysYet": "요청이 들어오면 여기에 항목별로 표시됩니다.",
+      "createKey": "첫 API 키 만들기",
+      "truncatedWarning": "상위 200개 키를 표시합니다. 필터를 적용해 범위를 좁혀 보세요."
+    }
+  },
+  "explorer": {
+    "title": "탐색기",
+    "subtitle": "파일과 구성을 둘러봅니다"
+  }
+}
--- a/core/http/react-ui/public/locales/ko/agents.json
+++ b/core/http/react-ui/public/locales/ko/agents.json
@@ -0,0 +1,55 @@
+{
+  "title": "에이전트",
+  "subtitle": "자율 AI 에이전트를 관리합니다",
+  "actions": {
+    "agentHub": "에이전트 허브",
+    "import": "가져오기",
+    "createAgent": "에이전트 만들기",
+    "edit": "편집",
+    "chat": "채팅",
+    "export": "내보내기",
+    "delete": "삭제",
+    "pause": "일시중지",
+    "resume": "재개"
+  },
+  "table": {
+    "name": "이름",
+    "status": "상태",
+    "events": "이벤트",
+    "actions": "작업",
+    "eventsTooltip": "이벤트 {{count}}개 - 클릭하여 보기"
+  },
+  "search": {
+    "placeholder": "에이전트 검색...",
+    "summary_one": "에이전트 {{total}}개 중 {{shown}}개",
+    "summary_other": "에이전트 {{total}}개 중 {{shown}}개"
+  },
+  "empty": {
+    "noConfigured": "구성된 에이전트가 없습니다",
+    "noConfiguredText": "에이전트를 만들어 자율 AI 워크플로를 시작하세요.",
+    "browseHub": "어디서 시작할지 모르겠나요? <1>에이전트 허브</1>를 둘러보고 바로 가져올 수 있는 에이전트 구성을 찾아보세요.",
+    "noMatching": "일치하는 에이전트가 없습니다",
+    "noMatchingText": "\"{{query}}\"와(과) 일치하는 에이전트가 없습니다"
+  },
+  "sections": {
+    "yourAgents": "내 에이전트",
+    "otherUsersAgents": "다른 사용자의 에이전트"
+  },
+  "deleteDialog": {
+    "title": "에이전트 삭제",
+    "message": "에이전트 \"{{name}}\"을(를) 삭제하시겠습니까? 이 작업은 되돌릴 수 없습니다.",
+    "confirm": "삭제"
+  },
+  "toasts": {
+    "loadFailed": "에이전트를 불러오지 못했습니다: {{message}}",
+    "deleted": "에이전트 \"{{name}}\"이(가) 삭제되었습니다",
+    "deleteFailed": "에이전트 삭제 실패: {{message}}",
+    "paused": "에이전트 \"{{name}}\"이(가) 일시중지되었습니다",
+    "resumed": "에이전트 \"{{name}}\"이(가) 재개되었습니다",
+    "pauseFailed": "에이전트 일시중지 실패: {{message}}",
+    "resumeFailed": "에이전트 재개 실패: {{message}}",
+    "exported": "에이전트 \"{{name}}\"을(를) 내보냈습니다",
+    "exportFailed": "에이전트 내보내기 실패: {{message}}",
+    "parseFailed": "에이전트 파일을 분석하지 못했습니다: {{message}}"
+  }
+}
--- a/core/http/react-ui/public/locales/ko/auth.json
+++ b/core/http/react-ui/public/locales/ko/auth.json
@@ -0,0 +1,112 @@
+{
+  "login": {
+    "subtitle": "계속하려면 로그인하세요",
+    "registerSubtitle": "계정을 만듭니다",
+    "createAdminSubtitle": "관리자 계정을 만듭니다",
+    "tokenSubtitle": "계속하려면 API 키를 입력하세요",
+    "email": "이메일",
+    "emailPlaceholder": "you@example.com",
+    "name": "이름",
+    "namePlaceholder": "이름 (선택 사항)",
+    "password": "비밀번호",
+    "passwordPlaceholder": "비밀번호 입력...",
+    "newPasswordPlaceholder": "최소 12자 이상",
+    "confirmPassword": "비밀번호 확인",
+    "confirmPasswordPlaceholder": "비밀번호 재입력",
+    "inviteCodeLabel": "초대 코드",
+    "inviteCodeOptional": " (선택 사항 — 승인 대기를 건너뜁니다)",
+    "inviteCodePlaceholder": "초대 코드를 붙여넣으세요...",
+    "tokenPlaceholder": "API 키 입력...",
+    "tokenAltPlaceholder": "API 토큰 입력...",
+    "signIn": "로그인",
+    "signingIn": "로그인 중...",
+    "register": "회원가입",
+    "creatingAccount": "계정 생성 중...",
+    "createAdminAccount": "관리자 계정 만들기",
+    "signInWithGitHub": "GitHub로 로그인",
+    "signInWithSSO": "SSO로 로그인",
+    "loginWithToken": "토큰으로 로그인",
+    "showTokenLogin": "API 토큰으로 로그인",
+    "hideTokenLogin": "토큰 로그인 숨기기",
+    "noAccount": "계정이 없으신가요?",
+    "hasAccount": "이미 계정이 있으신가요?",
+    "or": "또는",
+    "errors": {
+      "loginFailed": "로그인 실패",
+      "registrationFailed": "회원가입 실패",
+      "invalidToken": "유효하지 않은 토큰",
+      "passwordsDoNotMatch": "비밀번호가 일치하지 않습니다",
+      "enterToken": "토큰을 입력해 주세요",
+      "networkError": "네트워크 오류",
+      "inviteRequired": "회원가입하려면 유효한 초대 코드가 필요합니다"
+    },
+    "messages": {
+      "registrationPending": "회원가입이 완료되었습니다. 승인을 기다리고 있습니다."
+    }
+  },
+  "account": {
+    "title": "계정",
+    "subtitle": "프로필, 자격 증명, API 키",
+    "unavailable": "계정을 사용할 수 없습니다",
+    "unavailableText": "계정을 관리하려면 인증이 활성화되어 있어야 합니다.",
+    "tabs": {
+      "profile": "프로필",
+      "security": "보안",
+      "apiKeys": "API 키"
+    },
+    "profile": {
+      "displayName": "표시 이름",
+      "displayNameDescription": "공개적으로 표시되는 이름",
+      "avatarUrl": "아바타 URL",
+      "avatarUrlDescription": "프로필 사진 URL",
+      "avatarUrlPlaceholder": "https://example.com/avatar.png",
+      "save": "저장",
+      "saving": "저장 중...",
+      "updated": "프로필이 업데이트되었습니다",
+      "updateFailed": "프로필 업데이트 실패: {{message}}"
+    },
+    "security": {
+      "currentPassword": "현재 비밀번호",
+      "currentPasswordDescription": "본인 확인을 위해 기존 비밀번호를 입력하세요",
+      "currentPasswordPlaceholder": "현재 비밀번호",
+      "newPassword": "새 비밀번호",
+      "newPasswordDescription": "최소 12자 이상이어야 합니다",
+      "newPasswordPlaceholder": "새 비밀번호",
+      "confirmPassword": "비밀번호 확인",
+      "confirmPasswordDescription": "새 비밀번호를 다시 입력하세요",
+      "confirmPasswordPlaceholder": "새 비밀번호 확인",
+      "changePassword": "비밀번호 변경",
+      "changing": "변경 중...",
+      "changed": "비밀번호가 변경되었습니다",
+      "passwordsDoNotMatch": "비밀번호가 일치하지 않습니다",
+      "tooShort": "새 비밀번호는 최소 12자 이상이어야 합니다",
+      "oauthOnly": "{{provider}} 계정은 비밀번호 관리를 사용할 수 없습니다."
+    },
+    "apiKeys": {
+      "create": "API 키 만들기",
+      "createDescription": "프로그래밍 방식 접근을 위한 키를 생성합니다",
+      "namePlaceholder": "키 이름 (예: my-app)",
+      "createButton": "만들기",
+      "creating": "생성 중...",
+      "createdToast": "API 키가 생성되었습니다",
+      "createFailed": "API 키 생성 실패: {{message}}",
+      "loadFailed": "API 키를 불러오지 못했습니다: {{message}}",
+      "revoke": "해지",
+      "revokeKey": "키 해지",
+      "revokeTitle": "API 키 해지",
+      "revokeMessage": "API 키 \"{{name}}\"을(를) 해지하시겠습니까? 이 작업은 되돌릴 수 없습니다.",
+      "revoked": "API 키가 해지되었습니다",
+      "revokeFailed": "API 키 해지 실패: {{message}}",
+      "copyNow": "지금 복사하세요 — 이 키는 다시 표시되지 않습니다",
+      "copiedToast": "클립보드에 복사되었습니다",
+      "copyFailed": "복사하지 못했습니다",
+      "empty": "아직 API 키가 없습니다. 위에서 하나를 만들어 프로그래밍 방식 접근을 시작하세요.",
+      "lastUsed": "마지막 사용 {{date}}"
+    }
+  },
+  "notFound": {
+    "title": "페이지를 찾을 수 없습니다",
+    "text": "이 페이지가 사라진 것 같습니다. 다시 돌아가도록 도와드리겠습니다.",
+    "goHome": "홈으로 이동"
+  }
+}
--- a/core/http/react-ui/public/locales/ko/chat.json
+++ b/core/http/react-ui/public/locales/ko/chat.json
@@ -0,0 +1,117 @@
+{
+  "activity": {
+    "thought": "생각",
+    "tool": "도구",
+    "result": "결과",
+    "toolResult": "{{name}} 결과",
+    "thinking": "생각 중..."
+  },
+  "header": {
+    "manageModeTooltip": "이 채팅은 LocalAI와 대화하여 모델을 설치하고, 구성을 편집하고, 백엔드를 관리할 수 있습니다.",
+    "modelInfo": "모델 정보",
+    "chatSettings": "채팅 설정",
+    "modelInfoTitle": "모델 정보: {{model}}",
+    "editConfig": "구성 편집",
+    "close": "닫기"
+  },
+  "modelInfo": {
+    "backend": "백엔드",
+    "modelFile": "모델 파일",
+    "contextSize": "컨텍스트 크기",
+    "threads": "스레드",
+    "mcp": "MCP",
+    "configured": "구성됨",
+    "chatTemplate": "채팅 템플릿",
+    "yes": "예",
+    "gpuLayers": "GPU 레이어"
+  },
+  "context": {
+    "label": "컨텍스트: {{percent}}%",
+    "labelWithTokens": "컨텍스트: {{percent}}% ({{tokens}} 토큰)"
+  },
+  "settings": {
+    "title": "채팅 설정",
+    "manageMode": "관리 모드",
+    "manageModeDesc": "이 채팅이 LocalAI와 대화하여 모델을 설치하고, 백엔드를 전환하고, 구성을 편집할 수 있게 합니다.",
+    "systemPrompt": "시스템 프롬프트",
+    "systemPromptPlaceholder": "당신은 도움이 되는 어시스턴트입니다...",
+    "temperature": "Temperature",
+    "topP": "Top P",
+    "topK": "Top K",
+    "contextSize": "컨텍스트 크기",
+    "contextSizePlaceholder": "2048",
+    "clearHistory": "채팅 기록 지우기"
+  },
+  "empty": {
+    "manageTitle": "채팅으로 LocalAI 관리",
+    "manageText": "모델 설치, 백엔드 전환, 구성 편집 또는 상태 확인을 요청하세요. 어시스턴트가 작업을 요약하고 변경하기 전에 확인을 기다립니다.",
+    "startTitle": "대화 시작",
+    "readyText": "{{model}}와(과) 채팅할 준비가 되었습니다",
+    "selectModelText": "시작하려면 위에서 모델을 선택하세요",
+    "suggestionsManage": [
+      "무엇이 설치되어 있나요?",
+      "채팅 모델 설치하기",
+      "시스템 상태 보기",
+      "백엔드 업데이트하기"
+    ],
+    "suggestionsChat": [
+      "이게 어떻게 작동하는지 설명해 줘",
+      "코드 작성을 도와줘",
+      "문서를 요약해 줘",
+      "아이디어를 브레인스토밍해 줘"
+    ],
+    "recent": "최근",
+    "noMessages": "아직 메시지가 없습니다",
+    "hintEnter": "Enter로 전송",
+    "hintShiftEnter": "Shift+Enter로 줄바꿈",
+    "hintAttach": "파일 첨부"
+  },
+  "errors": {
+    "viewTraces": "자세한 내용은 트레이스를 확인하세요"
+  },
+  "actions": {
+    "copy": "복사",
+    "regenerate": "다시 생성"
+  },
+  "streaming": {
+    "transferring": "모델 전송 중...",
+    "transferringTo": "{{node}}(으)로 모델 전송 중..."
+  },
+  "tokens": {
+    "perSec": "{{count}} tok/s",
+    "peak": "최고: {{count}} tok/s",
+    "usage": "{{prompt}}p + {{completion}}c = {{total}}"
+  },
+  "input": {
+    "placeholder": "메시지...",
+    "attachFile": "파일 첨부",
+    "stopGenerating": "생성 중지",
+    "canvasTitle": "캔버스 — 코드 블록과 미디어를 사이드 패널로 추출해 미리보기, 복사, 다운로드할 수 있습니다",
+    "canvasLabel": "캔버스",
+    "openCanvas": "캔버스 패널 열기"
+  },
+  "deleteAllDialog": {
+    "title": "모든 채팅 삭제",
+    "message": "모든 채팅을 삭제하시겠습니까? 이 작업은 되돌릴 수 없습니다.",
+    "confirm": "모두 삭제"
+  },
+  "toasts": {
+    "selectModel": "모델을 선택해 주세요",
+    "copied": "클립보드에 복사되었습니다",
+    "copyFailed": "클립보드에 복사할 수 없습니다"
+  },
+  "menu": {
+    "trigger": "채팅",
+    "triggerTitle": "대화 (Ctrl/Cmd+K)",
+    "search": "대화 검색...",
+    "clearSearch": "검색 지우기",
+    "noMatch": "검색과 일치하는 대화가 없습니다",
+    "noConversations": "아직 대화가 없습니다",
+    "rename": "이름 변경",
+    "exportMarkdown": "Markdown으로 내보내기",
+    "deleteChat": "채팅 삭제",
+    "newChat": "새 채팅",
+    "clearAll": "모두 지우기",
+    "deleteAllTitle": "모든 대화 삭제"
+  }
+}
--- a/core/http/react-ui/public/locales/ko/collections.json
+++ b/core/http/react-ui/public/locales/ko/collections.json
@@ -0,0 +1,43 @@
+{
+  "title": "지식 베이스",
+  "subtitle": "에이전트 RAG를 위한 문서 컬렉션을 관리합니다",
+  "newPlaceholder": "새 컬렉션 이름...",
+  "actions": {
+    "create": "만들기",
+    "creating": "생성 중...",
+    "details": "세부 정보",
+    "reset": "초기화",
+    "delete": "삭제",
+    "viewDetails": "세부 정보 보기",
+    "resetCollection": "컬렉션 초기화",
+    "deleteCollection": "컬렉션 삭제"
+  },
+  "sections": {
+    "yourCollections": "내 컬렉션",
+    "otherUsersCollections": "다른 사용자의 컬렉션"
+  },
+  "empty": {
+    "title": "아직 컬렉션이 없습니다",
+    "text": "컬렉션을 사용하면 문서를 지식 베이스로 정리하여 에이전트가 RAG(검색 증강 생성)로 검색할 수 있습니다. 위에서 컬렉션을 만들어 시작하세요.",
+    "noPersonal": "아직 컬렉션이 없습니다."
+  },
+  "deleteDialog": {
+    "title": "컬렉션 삭제",
+    "message": "컬렉션 \"{{name}}\"을(를) 삭제하시겠습니까? 모든 항목이 제거되며 되돌릴 수 없습니다.",
+    "confirm": "삭제"
+  },
+  "resetDialog": {
+    "title": "컬렉션 초기화",
+    "message": "컬렉션 \"{{name}}\"을(를) 초기화하시겠습니까? 모든 항목은 제거되지만 컬렉션은 유지됩니다.",
+    "confirm": "초기화"
+  },
+  "toasts": {
+    "loadFailed": "컬렉션을 불러오지 못했습니다: {{message}}",
+    "created": "컬렉션 \"{{name}}\"이(가) 생성되었습니다",
+    "createFailed": "컬렉션 생성 실패: {{message}}",
+    "deleted": "컬렉션 \"{{name}}\"이(가) 삭제되었습니다",
+    "deleteFailed": "컬렉션 삭제 실패: {{message}}",
+    "reset": "컬렉션 \"{{name}}\"이(가) 초기화되었습니다",
+    "resetFailed": "컬렉션 초기화 실패: {{message}}"
+  }
+}
--- a/core/http/react-ui/public/locales/ko/common.json
+++ b/core/http/react-ui/public/locales/ko/common.json
@@ -0,0 +1,109 @@
+{
+  "actions": {
+    "save": "저장",
+    "saving": "저장 중...",
+    "cancel": "취소",
+    "close": "닫기",
+    "confirm": "확인",
+    "delete": "삭제",
+    "edit": "편집",
+    "add": "추가",
+    "remove": "제거",
+    "create": "만들기",
+    "update": "업데이트",
+    "refresh": "새로고침",
+    "reload": "다시 불러오기",
+    "retry": "다시 시도",
+    "search": "검색",
+    "filter": "필터",
+    "clear": "지우기",
+    "reset": "초기화",
+    "apply": "적용",
+    "back": "뒤로",
+    "next": "다음",
+    "previous": "이전",
+    "open": "열기",
+    "submit": "제출",
+    "select": "선택",
+    "selectAll": "모두 선택",
+    "copy": "복사",
+    "copied": "복사됨",
+    "download": "다운로드",
+    "upload": "업로드",
+    "import": "가져오기",
+    "export": "내보내기",
+    "view": "보기",
+    "details": "세부 정보",
+    "settings": "설정",
+    "help": "도움말",
+    "yes": "예",
+    "no": "아니요",
+    "loading": "불러오는 중..."
+  },
+  "status": {
+    "loading": "불러오는 중...",
+    "saving": "저장 중...",
+    "saved": "저장됨",
+    "ready": "준비됨",
+    "running": "실행 중",
+    "stopped": "중지됨",
+    "starting": "시작 중...",
+    "stopping": "중지 중...",
+    "pending": "대기 중",
+    "active": "활성",
+    "inactive": "비활성",
+    "enabled": "사용",
+    "disabled": "사용 안 함",
+    "online": "온라인",
+    "offline": "오프라인",
+    "error": "오류",
+    "success": "성공",
+    "warning": "경고",
+    "info": "정보",
+    "empty": "항목 없음",
+    "none": "없음",
+    "unknown": "알 수 없음"
+  },
+  "dialogs": {
+    "confirmDelete": {
+      "title": "삭제 확인",
+      "message": "정말로 삭제하시겠습니까? 이 작업은 되돌릴 수 없습니다.",
+      "confirm": "삭제",
+      "cancel": "취소"
+    },
+    "unsavedChanges": {
+      "title": "저장되지 않은 변경 사항",
+      "message": "저장되지 않은 변경 사항이 있습니다. 폐기하시겠습니까?",
+      "discard": "폐기",
+      "keepEditing": "계속 편집"
+    }
+  },
+  "forms": {
+    "required": "필수",
+    "optional": "선택",
+    "name": "이름",
+    "description": "설명",
+    "type": "유형",
+    "value": "값",
+    "search": "검색...",
+    "selectPlaceholder": "옵션을 선택하세요..."
+  },
+  "time": {
+    "now": "방금",
+    "secondsAgo_one": "{{count}}초 전",
+    "secondsAgo_other": "{{count}}초 전",
+    "minutesAgo_one": "{{count}}분 전",
+    "minutesAgo_other": "{{count}}분 전",
+    "hoursAgo_one": "{{count}}시간 전",
+    "hoursAgo_other": "{{count}}시간 전",
+    "daysAgo_one": "{{count}}일 전",
+    "daysAgo_other": "{{count}}일 전"
+  },
+  "units": {
+    "bytes": "B",
+    "kilobytes": "KB",
+    "megabytes": "MB",
+    "gigabytes": "GB",
+    "terabytes": "TB"
+  }
+}
--- a/core/http/react-ui/public/locales/ko/errors.json
+++ b/core/http/react-ui/public/locales/ko/errors.json
@@ -0,0 +1,17 @@
+{
+  "generic": "문제가 발생했습니다",
+  "network": "네트워크 오류입니다. 연결을 확인하고 다시 시도하세요.",
+  "unauthorized": "이 작업을 수행할 권한이 없습니다.",
+  "forbidden": "접근이 거부되었습니다.",
+  "notFound": "요청한 리소스를 찾을 수 없습니다.",
+  "serverError": "서버 오류입니다. 잠시 후 다시 시도하세요.",
+  "loadFailed": "불러오기 실패: {{message}}",
+  "saveFailed": "저장 실패: {{message}}",
+  "deleteFailed": "삭제 실패: {{message}}",
+  "updateFailed": "업데이트 실패: {{message}}",
+  "createFailed": "생성 실패: {{message}}",
+  "operationFailed": "작업 실패: {{message}}",
+  "invalidInput": "잘못된 입력입니다. 양식을 확인하고 다시 시도하세요.",
+  "tryAgain": "다시 시도해 주세요.",
+  "contactAdmin": "문제가 계속되면 관리자에게 문의하세요."
+}
--- a/core/http/react-ui/public/locales/ko/home.json
+++ b/core/http/react-ui/public/locales/ko/home.json
@@ -0,0 +1,66 @@
+{
+  "cluster": {
+    "vram": "클러스터 VRAM",
+    "ram": "클러스터 RAM",
+    "nodesOnline": "노드 {{total}}개 중 {{healthy}}개 온라인"
+  },
+  "resourceGpu": "GPU",
+  "resourceRam": "RAM",
+  "assistant": {
+    "title": "채팅으로 LocalAI 관리",
+    "description": "LocalAI와 대화하여 모델을 설치하고, 백엔드를 전환하고, 구성을 편집하고, 상태를 확인하세요.",
+    "open": "어시스턴트 열기",
+    "tooltip": "채팅으로 LocalAI 관리"
+  },
+  "input": {
+    "placeholder": "메시지...",
+    "attachImage": "이미지 첨부",
+    "attachAudio": "오디오 첨부",
+    "attachFile": "파일 첨부",
+    "enterToSend": "Enter로 전송",
+    "selectModelFirst": "먼저 모델을 선택하세요",
+    "sendMessage": "메시지 보내기",
+    "selectModelToast": "먼저 모델을 선택해 주세요"
+  },
+  "quickLinks": {
+    "manageByChat": "채팅으로 관리",
+    "installedModels": "설치된 모델",
+    "browseGallery": "갤러리 둘러보기",
+    "importModel": "모델 가져오기",
+    "documentation": "문서"
+  },
+  "loadedModels": {
+    "count_one": "모델 {{count}}개 로드됨",
+    "count_other": "모델 {{count}}개 로드됨",
+    "stop": "모델 중지",
+    "stopAll": "모두 중지"
+  },
+  "stopDialog": {
+    "title": "모델 중지",
+    "message": "모델 {{model}}을(를) 중지하시겠습니까?",
+    "confirm": "{{model}} 중지",
+    "stopAllTitle": "모든 모델 중지",
+    "stopAllMessage": "로드된 모델 {{count}}개를 모두 중지하시겠습니까?",
+    "stopAllConfirm": "모두 중지",
+    "stoppedToast": "{{model}} 중지됨",
+    "allStoppedToast": "모든 모델이 중지되었습니다",
+    "stopFailed": "중지 실패: {{message}}"
+  },
+  "wizard": {
+    "getStarted": "{{name}} 시작하기",
+    "intro": "첫 모델을 설치하여 시작하세요. 갤러리를 둘러보거나 직접 가져올 수 있습니다.",
+    "steps": {
+      "step1Title": "모델 갤러리 둘러보기",
+      "step1Body": "엄선된 컬렉션에서 필요에 맞는 모델을 찾으세요.",
+      "step2Title": "모델 설치",
+      "step2Body": "설치를 클릭하면 자동으로 다운로드되고 구성됩니다.",
+      "step3Title": "채팅 시작",
+      "step3Body": "브라우저에서 바로 모델과 채팅하거나 API를 사용하세요."
+    },
+    "browseGallery": "모델 갤러리 둘러보기",
+    "importModel": "모델 가져오기",
+    "docs": "문서",
+    "noModelsTitle": "사용 가능한 모델 없음",
+    "noModelsBody": "아직 설치된 모델이 없습니다. 채팅을 시작할 수 있도록 관리자에게 모델 설정을 요청하세요."
+  }
+}
--- a/core/http/react-ui/public/locales/ko/importModel.json
+++ b/core/http/react-ui/public/locales/ko/importModel.json
@@ -0,0 +1,142 @@
+{
+  "title": "새 모델 가져오기",
+  "subtitle": {
+    "simple": "URI에서 모델을 가져옵니다 — 자동 감지가 백엔드를 선택합니다.",
+    "powerYaml": "전체 모델 YAML 구성을 작성합니다.",
+    "powerPrefs": "세밀한 가져오기 환경설정."
+  },
+  "actions": {
+    "import": "모델 가져오기",
+    "importing": "가져오는 중...",
+    "create": "만들기",
+    "saving": "저장 중...",
+    "browseHF": "HF에서 모델 둘러보기",
+    "addCustom": "사용자 지정 추가",
+    "copy": "복사"
+  },
+  "form": {
+    "modelUri": "모델 URI",
+    "uriPlaceholder": "huggingface://TheBloke/Llama-2-7B-Chat-GGUF 또는 https://example.com/model.gguf",
+    "uriHint": "가져올 모델 파일의 URI 또는 경로를 입력하세요",
+    "supportedFormats": "지원되는 URI 형식",
+    "options": "옵션",
+    "preferences": "환경설정 (선택 사항)",
+    "commonPreferences": "공통 환경설정",
+    "customPreferences": "사용자 지정 환경설정",
+    "customKeyValueHint": "고급 구성을 위한 사용자 지정 키-값 쌍을 추가합니다.",
+    "preferenceKey": "{{index}}행의 환경설정 키",
+    "preferenceValue": "{{index}}행의 환경설정 값",
+    "removePref": "이 환경설정 제거",
+    "key": "키",
+    "value": "값",
+    "backend": "백엔드",
+    "backendAuto": "자동 감지 (URI 기반)",
+    "backendLoading": "백엔드 불러오는 중…",
+    "backendSearch": "백엔드 검색...",
+    "backendHint": "특정 백엔드를 강제로 지정합니다. 비워 두면 URI에서 자동 감지합니다. \"수동 선택\"으로 표시된 항목은 자동 감지되지 않으므로 모델에 필요한 것을 알고 있다면 직접 선택하세요.",
+    "backendErrorHint": "백엔드 목록을 불러오지 못했습니다 — 자동 감지만 사용합니다.",
+    "backendNotInstalled": "이 백엔드는 아직 설치되지 않았습니다. 가져오기를 제출하면 먼저 다운로드됩니다.",
+    "modelName": "모델 이름",
+    "modelNamePlaceholder": "비워 두면 파일 이름을 사용합니다",
+    "modelNameHint": "모델의 사용자 지정 이름입니다. 비워 두면 파일 이름이 사용됩니다.",
+    "description": "설명",
+    "descriptionPlaceholder": "비워 두면 기본 설명을 사용합니다",
+    "descriptionHint": "모델의 사용자 지정 설명입니다.",
+    "quantizations": "양자화",
+    "quantizationsPlaceholder": "q4_k_m,q4_k_s,q3_k_m (쉼표로 구분)",
+    "quantizationsHint": "선호하는 양자화 (쉼표로 구분). 비워 두면 기본값(q4_k_m)을 사용합니다.",
+    "mmprojQuantizations": "MMProj 양자화",
+    "mmprojQuantizationsPlaceholder": "fp16,fp32 (쉼표로 구분)",
+    "mmprojQuantizationsHint": "선호하는 MMProj 양자화입니다. 비워 두면 기본값(fp16)을 사용합니다.",
+    "embeddings": "임베딩",
+    "embeddingsHint": "이 모델에 대한 임베딩 지원을 활성화합니다.",
+    "modelType": "모델 유형",
+    "modelTypePlaceholder": "AutoModelForCausalLM (transformers 백엔드용)",
+    "modelTypeHint": "transformers 백엔드의 모델 유형입니다. 예: AutoModelForCausalLM, SentenceTransformer, Mamba.",
+    "pipelineType": "파이프라인 유형",
+    "pipelineTypeHint": "diffusers 백엔드의 파이프라인 유형입니다.",
+    "schedulerType": "스케줄러 유형",
+    "schedulerTypePlaceholder": "k_dpmpp_2m (선택 사항)",
+    "schedulerTypeHint": "diffusers 백엔드의 스케줄러 유형입니다. 예: k_dpmpp_2m, euler_a, ddim.",
+    "enableParameters": "활성화 매개변수",
+    "enableParametersPlaceholder": "negative_prompt,num_inference_steps (쉼표로 구분)",
+    "enableParametersHint": "diffusers 백엔드에 대해 활성화된 매개변수 (쉼표로 구분).",
+    "cuda": "CUDA",
+    "cudaHint": "GPU 가속을 위한 CUDA 지원을 활성화합니다.",
+    "yamlEditor": "YAML 구성 편집기",
+    "manualPick": "수동 선택",
+    "manualPickTooltip": "자동 감지는 이 백엔드로 라우팅하지 않습니다. 원하는 것이 무엇인지 안다면 여기서 직접 선택하세요."
+  },
+  "modality": {
+    "text": "텍스트 LLM",
+    "asr": "음성 인식",
+    "tts": "텍스트 음성 변환",
+    "image": "이미지 / 비디오",
+    "embeddings": "임베딩",
+    "reranker": "리랭커",
+    "detection": "객체 감지",
+    "vad": "음성 활동 감지",
+    "other": "기타"
+  },
+  "powerTabs": {
+    "ariaLabel": "고급 모드 탭",
+    "preferences": "환경설정",
+    "yaml": "YAML"
+  },
+  "switchDialog": {
+    "title": "사용자 지정 환경설정을 유지할까요?",
+    "body": "간단 모드로 전환하면 백엔드, 이름, 설명을 제외한 환경설정이 숨겨집니다. 가져오기 시 여전히 전송됩니다.",
+    "cancel": "취소",
+    "discard": "폐기하고 전환",
+    "keep": "유지하고 전환"
+  },
+  "estimate": {
+    "title": "예상 요구 사항",
+    "download": "다운로드: {{size}}",
+    "vram": "VRAM: {{vram}}"
+  },
+  "toasts": {
+    "noUri": "모델 URI를 입력해 주세요",
+    "noYaml": "YAML 구성을 입력해 주세요",
+    "started": "가져오기를 시작했습니다! 진행 상황을 추적하는 중...",
+    "startedWithMeta": "가져오기를 시작했습니다! 진행 상황을 추적하는 중... ({{meta}})",
+    "imported": "모델을 성공적으로 가져왔습니다!",
+    "importedYaml": "모델 구성을 성공적으로 가져왔습니다!",
+    "importFailed": "가져오기 실패: {{message}}",
+    "startImportFailed": "가져오기를 시작하지 못했습니다: {{message}}",
+    "backendsLoadFailed": "백엔드 목록을 불러오지 못했습니다 — 자동 감지만 사용합니다",
+    "modalityClearedBackend": "백엔드 선택을 해제했습니다 — {{label}} 그룹에 없었습니다.",
+    "copied": "클립보드에 복사되었습니다"
+  },
+  "uriFormats": {
+    "huggingface": {
+      "title": "HuggingFace",
+      "standard": "표준 HuggingFace 형식",
+      "short": "짧은 HuggingFace 형식",
+      "fullUrl": "전체 HuggingFace URL"
+    },
+    "http": {
+      "title": "HTTP/HTTPS URL",
+      "direct": "모든 HTTPS URL에서 직접 다운로드"
+    },
+    "local": {
+      "title": "로컬 파일",
+      "filePath": "로컬 파일 경로 (절대 경로)",
+      "directYaml": "직접 로컬 YAML 구성 파일"
+    },
+    "oci": {
+      "title": "OCI 레지스트리",
+      "registry": "OCI 컨테이너 레지스트리",
+      "tarball": "로컬 OCI tarball 파일"
+    },
+    "ollama": {
+      "title": "Ollama",
+      "model": "Ollama 모델 형식"
+    },
+    "yaml": {
+      "title": "YAML 구성 파일",
+      "remote": "원격 YAML 구성 파일",
+      "local": "로컬 YAML 구성 파일"
+    }
+  }
+}
--- a/core/http/react-ui/public/locales/ko/media.json
+++ b/core/http/react-ui/public/locales/ko/media.json
@@ -0,0 +1,154 @@
+{
+  "studio": {
+    "tabs": {
+      "images": "이미지",
+      "video": "비디오",
+      "tts": "TTS",
+      "sound": "사운드"
+    }
+  },
+  "image": {
+    "title": "이미지 생성",
+    "labels": {
+      "model": "모델",
+      "prompt": "프롬프트",
+      "promptPlaceholder": "생성할 이미지를 설명하세요...",
+      "negativePrompt": "네거티브 프롬프트",
+      "negativePromptPlaceholder": "피할 요소...",
+      "size": "크기",
+      "count": "개수 (1-4)",
+      "advanced": "고급 설정",
+      "imageInputs": "이미지 입력",
+      "steps": "스텝",
+      "stepsPlaceholder": "20",
+      "seed": "시드",
+      "seedPlaceholder": "랜덤",
+      "sourceImage": "원본 이미지 (img2img)",
+      "refImages": "참조 이미지",
+      "refImagesAdded_one": "이미지 {{count}}개 추가됨",
+      "refImagesAdded_other": "이미지 {{count}}개 추가됨"
+    },
+    "actions": {
+      "generate": "생성",
+      "generating": "생성 중..."
+    },
+    "empty": "생성된 이미지가 여기에 표시됩니다",
+    "toasts": {
+      "noPrompt": "프롬프트를 입력해 주세요",
+      "noModel": "모델을 선택해 주세요",
+      "noResults": "생성된 이미지가 없습니다"
+    }
+  },
+  "video": {
+    "title": "비디오 생성",
+    "labels": {
+      "model": "모델",
+      "prompt": "프롬프트",
+      "promptPlaceholder": "생성할 비디오를 설명하세요...",
+      "duration": "길이 (초)",
+      "fps": "FPS",
+      "size": "크기",
+      "advanced": "고급 설정",
+      "seed": "시드",
+      "seedPlaceholder": "랜덤"
+    },
+    "actions": {
+      "generate": "생성",
+      "generating": "생성 중..."
+    },
+    "empty": "생성된 비디오가 여기에 표시됩니다",
+    "toasts": {
+      "noPrompt": "프롬프트를 입력해 주세요",
+      "noModel": "모델을 선택해 주세요",
+      "noResults": "생성된 비디오가 없습니다"
+    }
+  },
+  "tts": {
+    "title": "텍스트 음성 변환",
+    "labels": {
+      "model": "모델",
+      "voice": "음성",
+      "voicePlaceholder": "음성 ID (선택 사항)",
+      "input": "텍스트",
+      "inputPlaceholder": "합성할 텍스트를 입력하세요..."
+    },
+    "actions": {
+      "generate": "생성",
+      "generating": "생성 중..."
+    },
+    "empty": "생성된 오디오가 여기에 표시됩니다",
+    "toasts": {
+      "noText": "텍스트를 입력해 주세요",
+      "noModel": "모델을 선택해 주세요",
+      "generateFailed": "생성에 실패했습니다"
+    }
+  },
+  "sound": {
+    "title": "사운드 생성",
+    "labels": {
+      "model": "모델",
+      "prompt": "프롬프트",
+      "promptPlaceholder": "생성할 사운드를 설명하세요...",
+      "duration": "길이 (초)",
+      "language": "언어",
+      "vocalLanguage": "보컬 언어",
+      "lyrics": "가사 (선택 사항)",
+      "lyricsPlaceholder": "보컬 생성을 위한 가사",
+      "advanced": "고급 설정",
+      "seed": "시드",
+      "seedPlaceholder": "랜덤"
+    },
+    "actions": {
+      "generate": "생성",
+      "generating": "생성 중..."
+    },
+    "empty": "생성된 오디오가 여기에 표시됩니다",
+    "toasts": {
+      "noPrompt": "프롬프트를 입력해 주세요",
+      "noModel": "모델을 선택해 주세요",
+      "generateFailed": "생성에 실패했습니다"
+    }
+  },
+  "talk": {
+    "title": "대화",
+    "subtitle": "실시간 음성 대화",
+    "actions": {
+      "start": "세션 시작",
+      "stop": "세션 종료",
+      "connecting": "연결 중...",
+      "muted": "음소거됨",
+      "mute": "음소거",
+      "unmute": "음소거 해제"
+    },
+    "labels": {
+      "model": "모델",
+      "voice": "음성",
+      "voicePlaceholder": "alloy",
+      "language": "언어",
+      "languagePlaceholder": "en",
+      "instructions": "지침",
+      "instructionsPlaceholder": "어시스턴트의 페르소나를 설정하세요..."
+    },
+    "status": {
+      "idle": "대기 중",
+      "connecting": "연결 중...",
+      "listening": "듣는 중...",
+      "speaking": "말하는 중...",
+      "ended": "세션이 종료되었습니다"
+    },
+    "toasts": {
+      "noModel": "먼저 모델을 선택하세요",
+      "connectFailed": "연결에 실패했습니다: {{message}}"
+    }
+  },
+  "history": {
+    "title": "기록",
+    "empty": "아직 기록이 없습니다",
+    "deleteEntry": "항목 삭제",
+    "clear": "기록 지우기",
+    "clearTitle": "모든 기록 지우기",
+    "clearMessage": "모든 기록 항목을 제거하시겠습니까? 이 작업은 되돌릴 수 없습니다.",
+    "clearConfirm": "지우기",
+    "cleared": "기록이 지워졌습니다"
+  }
+}
--- a/core/http/react-ui/public/locales/ko/models.json
+++ b/core/http/react-ui/public/locales/ko/models.json
@@ -0,0 +1,93 @@
+{
+  "title": "모델 설치",
+  "subtitle": "갤러리에서 AI 모델을 둘러보고 설치합니다",
+  "stats": {
+    "available": "사용 가능",
+    "installed": "설치됨"
+  },
+  "actions": {
+    "addModel": "모델 추가",
+    "importModel": "모델 가져오기",
+    "install": "설치",
+    "reinstall": "재설치",
+    "delete": "삭제"
+  },
+  "filters": {
+    "all": "전체",
+    "llm": "채팅",
+    "image": "이미지",
+    "video": "비디오",
+    "multimodal": "멀티모달",
+    "vision": "비전",
+    "tts": "TTS",
+    "stt": "STT",
+    "diarization": "화자 분리",
+    "soundGen": "사운드",
+    "audioTransform": "오디오 FX",
+    "realtimeAudio": "실시간 오디오",
+    "embedding": "임베딩",
+    "rerank": "리랭크",
+    "detection": "감지",
+    "vad": "VAD",
+    "fitsGpu": "GPU에 적합",
+    "allBackends": "모든 백엔드",
+    "searchBackends": "백엔드 검색..."
+  },
+  "search": {
+    "placeholder": "모델 검색...",
+    "clearFilters": "필터 지우기"
+  },
+  "table": {
+    "modelName": "모델 이름",
+    "description": "설명",
+    "backend": "백엔드",
+    "sizeVram": "크기 / VRAM",
+    "status": "상태",
+    "actions": "작업",
+    "size": "크기: {{size}}",
+    "vram": "VRAM: {{vram}}",
+    "fits": "적합",
+    "mayNotFit": "맞지 않을 수 있음",
+    "trustRemoteCode": "원격 코드 신뢰",
+    "installing": "설치 중",
+    "installingPct": "설치 중 · {{percent}}%",
+    "installed": "설치됨",
+    "notInstalled": "설치되지 않음"
+  },
+  "detail": {
+    "description": "설명",
+    "gallery": "갤러리",
+    "backend": "백엔드",
+    "size": "크기",
+    "vram": "VRAM",
+    "license": "라이선스",
+    "tags": "태그",
+    "links": "링크",
+    "warning": "경고",
+    "files": "파일",
+    "fitsGpu": "GPU에 적합",
+    "mayNotFitGpu": "GPU에 맞지 않을 수 있음",
+    "requiresTrustRemoteCode": "원격 코드 신뢰 필요",
+    "fileCount_one": "파일 {{count}}개",
+    "fileCount_other": "파일 {{count}}개",
+    "filename": "파일 이름",
+    "uri": "URI",
+    "sha256": "SHA256"
+  },
+  "empty": {
+    "title": "모델을 찾을 수 없습니다",
+    "withFilters": "현재 검색 또는 필터와 일치하는 모델이 없습니다.",
+    "noFilters": "모델 갤러리가 비어 있습니다."
+  },
+  "deleteDialog": {
+    "title": "모델 삭제",
+    "message": "모델 {{model}}을(를) 삭제하시겠습니까?",
+    "confirm": "{{model}} 삭제",
+    "deletingToast": "{{model}} 삭제 중..."
+  },
+  "errors": {
+    "loadFailed": "모델을 불러오지 못했습니다: {{message}}",
+    "installFailed": "설치 실패: {{message}}",
+    "deleteFailed": "삭제 실패: {{message}}"
+  }
+}
--- a/core/http/react-ui/public/locales/ko/nav.json
+++ b/core/http/react-ui/public/locales/ko/nav.json
@@ -0,0 +1,54 @@
+{
+  "appName": "LocalAI",
+  "openMenu": "메뉴 열기",
+  "closeMenu": "메뉴 닫기",
+  "primaryNavigation": "기본 탐색",
+  "switchToLightMode": "라이트 모드로 전환",
+  "switchToDarkMode": "다크 모드로 전환",
+  "expandSidebar": "사이드바 펼치기",
+  "collapseSidebar": "사이드바 접기",
+  "changeLanguage": "언어 변경",
+  "logout": "로그아웃",
+  "accountSettings": "계정 설정",
+  "account": "계정",
+  "accountFor": "계정: {{name}}",
+  "sections": {
+    "tools": "도구",
+    "enhance": "향상",
+    "biometrics": "생체 인식",
+    "agents": "에이전트",
+    "system": "시스템"
+  },
+  "items": {
+    "home": "홈",
+    "installModels": "모델 설치",
+    "chat": "채팅",
+    "studio": "스튜디오",
+    "talk": "대화",
+    "fineTune": "파인튜닝 (실험적)",
+    "quantize": "양자화 (실험적)",
+    "audioTransform": "오디오 변환",
+    "faceRecognition": "얼굴 인식",
+    "voiceRecognition": "음성 인식",
+    "agents": "에이전트",
+    "skills": "스킬",
+    "memory": "메모리",
+    "mcpJobs": "MCP CI 작업",
+    "usage": "사용량",
+    "users": "사용자",
+    "middleware": "미들웨어",
+    "backends": "백엔드",
+    "traces": "트레이스",
+    "nodes": "노드",
+    "swarm": "Swarm",
+    "system": "시스템",
+    "settings": "설정",
+    "api": "API"
+  },
+  "footer": {
+    "github": "GitHub",
+    "documentation": "문서",
+    "author": "작성자",
+    "copyright": "© 2023-{{year}} {{author}}"
+  }
+}
--- a/core/http/react-ui/public/locales/ko/skills.json
+++ b/core/http/react-ui/public/locales/ko/skills.json
@@ -0,0 +1,79 @@
+{
+  "title": "스킬",
+  "subtitle": "에이전트 스킬(재사용 가능한 지침 및 리소스)을 관리합니다",
+  "unavailable": {
+    "subtitle": "스킬 서비스를 사용할 수 없거나 인덱스를 다시 빌드하는 중입니다. 잠시 후 다시 시도하세요.",
+    "retry": "다시 시도"
+  },
+  "actions": {
+    "newSkill": "새 스킬",
+    "createSkill": "스킬 만들기",
+    "import": "가져오기",
+    "importing": "가져오는 중...",
+    "gitRepos": "Git 저장소",
+    "edit": "편집",
+    "delete": "삭제",
+    "export": "내보내기",
+    "sync": "동기화",
+    "addRepo": "저장소 추가",
+    "adding": "추가 중...",
+    "remove": "제거",
+    "enable": "사용",
+    "disable": "사용 안 함"
+  },
+  "search": {
+    "placeholder": "스킬 검색..."
+  },
+  "git": {
+    "title": "Git 저장소",
+    "description": "스킬을 가져올 Git 저장소를 추가합니다. 동기화 후 스킬이 목록에 표시됩니다.",
+    "urlPlaceholder": "https://github.com/user/repo 또는 git@github.com:user/repo.git",
+    "noRepos": "구성된 Git 저장소가 없습니다. 위에서 추가하세요.",
+    "disabled": "사용 안 함",
+    "removeRepo": "저장소 제거"
+  },
+  "card": {
+    "noDescription": "설명 없음",
+    "readOnly": "읽기 전용",
+    "editTitle": "스킬 편집",
+    "deleteTitle": "스킬 삭제",
+    "exportTitle": ".tar.gz로 내보내기"
+  },
+  "empty": {
+    "title": "스킬을 찾을 수 없습니다",
+    "text": "스킬을 만들거나 가져와서 시작하세요.",
+    "noPersonal": "아직 스킬이 없습니다."
+  },
+  "sections": {
+    "yourSkills": "내 스킬",
+    "otherUsersSkills": "다른 사용자의 스킬"
+  },
+  "deleteDialog": {
+    "title": "스킬 삭제",
+    "message": "스킬 \"{{name}}\"을(를) 삭제하시겠습니까? 이 작업은 되돌릴 수 없습니다.",
+    "confirm": "삭제"
+  },
+  "removeRepoDialog": {
+    "title": "Git 저장소 제거",
+    "message": "이 Git 저장소를 제거하시겠습니까? 해당 저장소의 스킬을 더 이상 사용할 수 없게 됩니다.",
+    "confirm": "제거"
+  },
+  "toasts": {
+    "loadFailed": "스킬을 불러오지 못했습니다",
+    "deleted": "스킬 \"{{name}}\"이(가) 삭제되었습니다",
+    "deleteFailed": "스킬 삭제 실패",
+    "exported": "스킬 \"{{name}}\"이(가) 내보내졌습니다",
+    "exportFailed": "내보내기 실패",
+    "imported": "\"{{file}}\"에서 스킬을 가져왔습니다",
+    "importFailed": "가져오기 실패",
+    "loadReposFailed": "Git 저장소를 불러오지 못했습니다",
+    "repoAdded": "Git 저장소가 추가되고 동기화 중입니다",
+    "addRepoFailed": "저장소 추가 실패",
+    "synced": "저장소가 동기화되었습니다",
+    "syncFailed": "동기화 실패",
+    "toggled": "저장소가 전환되었습니다",
+    "toggleFailed": "전환 실패",
+    "removed": "저장소가 제거되었습니다",
+    "removeFailed": "제거 실패"
+  }
+}
--- a/core/http/react-ui/src/App.css
+++ b/core/http/react-ui/src/App.css
@@ -5017,6 +5017,10 @@ select.input {
  }
  .sidebar:not(.open) .sidebar-user-name,
  .sidebar:not(.open) .sidebar-logout-btn { display: none; }
+  /* Center the avatar in the icon rail (mirrors .sidebar.collapsed, which
+     isn't necessarily present in the tablet :not(.open) state). */
+  .sidebar:not(.open) .sidebar-user { justify-content: center; }
+  .sidebar:not(.open) .sidebar-user-link { flex: 0; margin: 0; padding: 2px; }

  /* Pinned open: overlay the full sidebar on top of content */
  .sidebar.open {
--- a/core/http/react-ui/src/i18n/index.js
+++ b/core/http/react-ui/src/i18n/index.js
@@ -11,6 +11,7 @@ export const SUPPORTED_LANGUAGES = [
  { code: 'de', name: 'Deutsch', flag: 'DE' },
  { code: 'zh-CN', name: '简体中文', flag: 'ZH' },
  { code: 'id', name: 'Bahasa Indonesia', flag: 'ID' },
+  { code: 'ko', name: '한국어', flag: 'KO' },
 ]

 export const NAMESPACES = [
--- a/core/http/react-ui/src/pages/AgentChat.jsx
+++ b/core/http/react-ui/src/pages/AgentChat.jsx
@@ -8,7 +8,7 @@ import CanvasPanel from '../components/CanvasPanel'
 import ResourceCards from '../components/ResourceCards'
 import ConfirmDialog from '../components/ConfirmDialog'
 import { useAgentChat } from '../hooks/useAgentChat'
-import { relativeTime } from '../utils/format'
+import { relativeTime, normalizeTimestampMs } from '../utils/format'
 import { copyToClipboard } from '../utils/clipboard'

 function getLastMessagePreview(conv) {
@@ -139,8 +139,9 @@ export default function AgentChat() {
          id: nextId(),
          sender,
          content: data.content || data.message || '',
-          // Backend sends Unix milliseconds (see core/services/agents events).
-          timestamp: data.timestamp || Date.now(),
+          // Backend timestamp encoding varies by deploy mode (RFC3339 string,
+          // Unix ms, or Unix ns); normalize to JS milliseconds.
+          timestamp: normalizeTimestampMs(data.timestamp),
        }
        if (data.metadata && Object.keys(data.metadata).length > 0) {
          msg.metadata = data.metadata
--- a/core/http/react-ui/src/pages/Nodes.jsx
+++ b/core/http/react-ui/src/pages/Nodes.jsx
@@ -506,6 +506,7 @@ function SchedulingForm({ onSave, onCancel }) {
  const isValid = () => {
    if (!modelName) return false
    if (mode === 'placement') return hasSelector
+    if (mode === 'spread') return true
    return minReplicas > 0 || maxReplicas > 0
  }

@@ -513,8 +514,9 @@ function SchedulingForm({ onSave, onCancel }) {
    onSave({
      model_name: modelName,
      node_selector: hasSelector ? selector : undefined,
-      min_replicas: mode === 'placement' ? 0 : minReplicas,
-      max_replicas: mode === 'placement' ? 0 : maxReplicas,
+      min_replicas: mode === 'autoscaling' ? minReplicas : 0,
+      max_replicas: mode === 'autoscaling' ? maxReplicas : 0,
+      spread_all: mode === 'spread',
      route_policy: routePolicy,
      balance_abs_threshold: balanceAbsThreshold,
      balance_rel_threshold: balanceRelThreshold,
@@ -542,10 +544,19 @@ function SchedulingForm({ onSave, onCancel }) {
        >
          <i className="fas fa-arrows-up-down" aria-hidden="true" /> Auto-scale
        </button>
+        <button
+          type="button" role="radio" aria-checked={mode === 'spread'}
+          className={`segmented__item${mode === 'spread' ? ' is-active' : ''}`}
+          onClick={() => setMode('spread')}
+        >
+          <i className="fas fa-network-wired" aria-hidden="true" /> Spread to all
+        </button>
      </div>
      <p style={{ fontSize: '0.8125rem', color: 'var(--color-text-muted)', margin: '0 0 var(--spacing-lg) 0' }}>
        {mode === 'placement'
          ? 'Restrict this model to specific nodes. Loaded on demand, evictable when idle.'
+          : mode === 'spread'
+          ? 'Run one replica on every node matching the selector (all healthy nodes when empty). Tracks nodes joining and leaving.'
          : 'Maintain a target replica count across the cluster. Min \u2265 1 protects from eviction.'}
      </p>

@@ -1563,10 +1574,11 @@ export default function Nodes() {
                </tr></thead>
                <tbody>
                  {schedulingConfigs.map(cfg => {
-                    const isAutoScaling = cfg.min_replicas > 0 || cfg.max_replicas > 0
+                    const isSpread = !!cfg.spread_all
+                    const isAutoScaling = !isSpread && (cfg.min_replicas > 0 || cfg.max_replicas > 0)
                    const hasSelector = !!cfg.node_selector
-                    const modeLabel = isAutoScaling ? 'Auto-scaling' : hasSelector ? 'Placement' : 'Inactive'
-                    const modeColor = isAutoScaling ? 'var(--color-success)' : hasSelector ? 'var(--color-primary)' : 'var(--color-text-muted)'
+                    const modeLabel = isSpread ? 'Spread' : isAutoScaling ? 'Auto-scaling' : hasSelector ? 'Placement' : 'Inactive'
+                    const modeColor = isSpread ? 'var(--color-warning)' : isAutoScaling ? 'var(--color-success)' : hasSelector ? 'var(--color-primary)' : 'var(--color-text-muted)'
                    // Cooldown: reconciler tripped the circuit breaker because cluster
                    // capacity is exhausted. Surface so the operator sees it instead
                    // of the model silently failing to scale.
@@ -1597,10 +1609,16 @@ export default function Nodes() {
                        })() : <span style={{ color: 'var(--color-text-muted)', fontSize: '0.8125rem' }}>Any node</span>}
                      </td>
                      <td style={{ fontFamily: 'var(--font-mono)' }}>
-                        {isAutoScaling ? cfg.min_replicas : '-'}
+                        {isSpread
+                          ? <span style={{
+                              display: 'inline-block', fontSize: '0.75rem', padding: '2px 8px', borderRadius: "var(--radius-sm)",
+                              background: 'var(--color-bg-tertiary)', border: '1px solid var(--color-warning)',
+                              color: 'var(--color-warning)', fontWeight: 600, fontFamily: 'var(--font-sans)',
+                            }}>Spread: all matching nodes</span>
+                          : isAutoScaling ? cfg.min_replicas : '-'}
                      </td>
                      <td style={{ fontFamily: 'var(--font-mono)' }}>
-                        {isAutoScaling ? (cfg.max_replicas || 'no limit') : '-'}
+                        {isSpread ? '-' : isAutoScaling ? (cfg.max_replicas || 'no limit') : '-'}
                      </td>
                      <td style={{ fontSize: '0.8125rem' }}>
                        {cfg.route_policy || 'default'}
--- a/core/http/react-ui/src/pages/Talk.jsx
+++ b/core/http/react-ui/src/pages/Talk.jsx
@@ -705,7 +705,7 @@ export default function Talk() {
          )}
          {selectedModelInfo && !selectedModelInfo.self_contained && (
            <div style={{
-              display: 'grid', gridTemplateColumns: 'repeat(4, 1fr)', gap: 'var(--spacing-xs)',
+              display: 'grid', gridTemplateColumns: 'repeat(4, minmax(0, 1fr))', gap: 'var(--spacing-xs)',
              marginBottom: 'var(--spacing-xs)', fontSize: '0.75rem',
            }}>
              {[
--- a/core/http/react-ui/src/utils/format.js
+++ b/core/http/react-ui/src/utils/format.js
@@ -12,6 +12,26 @@ export function percentColor(pct) {
  return 'var(--color-success)'
 }

+// normalizeTimestampMs converts a timestamp emitted by the backend into JS
+// milliseconds, regardless of its encoding. The agent SSE bridge emits the
+// json_message timestamp in three different shapes depending on deploy mode:
+// an RFC3339 string (standalone agent pool), Unix milliseconds (local
+// dispatcher), or Unix nanoseconds (older NATS path). A numeric value is
+// classified by magnitude (s / ms / us / ns) so any of them yields a sane
+// epoch. Falls back to Date.now() for null/empty/unparseable input.
+export function normalizeTimestampMs(ts) {
+  if (ts === null || ts === undefined || ts === '') return Date.now()
+  if (typeof ts === 'string') {
+    const parsed = Date.parse(ts)
+    return Number.isNaN(parsed) ? Date.now() : parsed
+  }
+  if (typeof ts !== 'number' || !Number.isFinite(ts)) return Date.now()
+  if (ts > 1e17) return Math.floor(ts / 1e6) // nanoseconds
+  if (ts > 1e14) return Math.floor(ts / 1e3) // microseconds
+  if (ts > 1e11) return ts                    // milliseconds
+  return ts * 1000                            // seconds
+}
+
 export function formatTimestamp(ts) {
  if (!ts) return '-'
  const d = new Date(ts)
--- a/core/services/nodes/reconciler.go
+++ b/core/services/nodes/reconciler.go
@@ -399,6 +399,28 @@ func (rc *ReplicaReconciler) candidateNodeIDsForSelector(ctx context.Context, cf
 }

 func (rc *ReplicaReconciler) reconcileModel(ctx context.Context, cfg ModelSchedulingConfig) {
+	// spread_all: derive a dynamic replica target equal to the number of nodes
+	// currently matching the selector (all healthy backend nodes when the
+	// selector is empty). Feeding it through Min==Max==target reuses every
+	// existing path: the floor scales up toward target (capped at capacity),
+	// Max==target stops busy-burst/pressure overshooting, and idle scale-down
+	// trims above target. The target re-tracks node join/leave each tick. cfg is
+	// a by-value copy, so mutating it here is local to this tick.
+	if cfg.SpreadAll {
+		matched, err := rc.registry.FindNodesBySelector(ctx, parseSelector(cfg.NodeSelector))
+		if err != nil {
+			xlog.Warn("Reconciler: spread_all failed to resolve matching nodes", "model", cfg.ModelName, "error", err)
+			return
+		}
+		if len(matched) == 0 {
+			xlog.Info("Reconciler: spread_all has no matching nodes; nothing to schedule",
+				"model", cfg.ModelName, "selector", cfg.NodeSelector)
+			return
+		}
+		cfg.MinReplicas = len(matched)
+		cfg.MaxReplicas = len(matched)
+	}
+
 	// Cooldown gate: if we previously decided this config is unsatisfiable,
 	// don't even bother checking until the cooldown expires. ClearAllUnsatisfiable
 	// (fired by node lifecycle events) bypasses this by zeroing the column.
--- a/core/services/nodes/reconciler_test.go
+++ b/core/services/nodes/reconciler_test.go
@@ -34,6 +34,13 @@ func (f *fakeScheduler) ScheduleAndLoadModel(_ context.Context, modelName string
 	return f.scheduleNode, f.scheduleErr
 }

+// mustGetSched looks up the stored scheduling config for model and returns it
+// by value. The spec fails if the lookup errors or returns a nil config.
+func mustGetSched(r *NodeRegistry, model string) ModelSchedulingConfig {
+	stored, lookupErr := r.GetModelScheduling(context.Background(), model)
+	Expect(lookupErr).ToNot(HaveOccurred())
+	Expect(stored).ToNot(BeNil())
+	return *stored
+}
+
 var _ = Describe("ReplicaReconciler", func() {
 	var (
 		db       *gorm.DB
@@ -78,6 +85,45 @@ var _ = Describe("ReplicaReconciler", func() {
 		Expect(registry.SetModelScheduling(context.Background(), cfg)).To(Succeed())
 	}

+	Context("spread_all mode", func() {
+		It("targets one replica per matching node (empty selector = all nodes)", func() {
+			n1 := registerNode("s1", "10.1.0.1:50051")
+			registerNode("s2", "10.1.0.2:50051")
+			// spread config, no selector -> all healthy backend nodes (2)
+			Expect(registry.SetModelScheduling(context.Background(), &ModelSchedulingConfig{
+				ModelName: "spread-model", SpreadAll: true,
+			})).To(Succeed())
+
+			scheduler := &fakeScheduler{scheduleNode: n1}
+			reconciler := NewReplicaReconciler(ReplicaReconcilerOptions{
+				Registry:  registry,
+				Scheduler: scheduler,
+			})
+
+			reconciler.reconcileModel(context.Background(), mustGetSched(registry, "spread-model"))
+
+			// With current==0 and a target of 2, the MinReplicas floor path
+			// schedules up to cluster capacity (2 nodes).
+			Expect(len(scheduler.scheduleCalls)).To(Equal(2))
+		})
+
+		It("is a no-op when no nodes match", func() {
+			Expect(registry.SetModelScheduling(context.Background(), &ModelSchedulingConfig{
+				ModelName: "spread-model", SpreadAll: true,
+				NodeSelector: `{"tier":"nope"}`,
+			})).To(Succeed())
+
+			scheduler := &fakeScheduler{}
+			reconciler := NewReplicaReconciler(ReplicaReconcilerOptions{
+				Registry:  registry,
+				Scheduler: scheduler,
+			})
+
+			reconciler.reconcileModel(context.Background(), mustGetSched(registry, "spread-model"))
+			Expect(scheduler.scheduleCalls).To(BeEmpty())
+		})
+	})
+
 	Context("model below min_replicas", func() {
 		It("scales up to min_replicas", func() {
 			node := registerNode("node-1", "10.0.0.1:50051")
--- a/core/services/nodes/registry.go
+++ b/core/services/nodes/registry.go
@@ -135,13 +135,18 @@ type NodeLabel struct {
 //   - Both → auto-scale on matching nodes
 //   - Neither → no-op (default behavior)
 //
-// Auto-scaling is enabled when MinReplicas > 0 or MaxReplicas > 0.
+// Auto-scaling is enabled when MinReplicas > 0, MaxReplicas > 0, or SpreadAll is set.
 type ModelSchedulingConfig struct {
 	ID           string `gorm:"primaryKey;size:36" json:"id"`
 	ModelName    string `gorm:"uniqueIndex;size:255" json:"model_name"`
 	NodeSelector string `gorm:"type:text" json:"node_selector,omitempty"` // JSON {"key":"value",...}
 	MinReplicas  int    `gorm:"default:0" json:"min_replicas"`
 	MaxReplicas  int    `gorm:"default:0" json:"max_replicas"`
+	// SpreadAll requests one replica on every node matching NodeSelector
+	// (every healthy backend node when the selector is empty), tracked as
+	// nodes join and leave. Mutually exclusive with MinReplicas/MaxReplicas.
+	// The reconciler turns this into a dynamic Min==Max target each tick.
+	SpreadAll bool `gorm:"column:spread_all;default:false" json:"spread_all,omitempty"`
 	// Prefix-cache-aware routing (epic #10063). RoutePolicy "" means inherit
 	// the cluster-wide default. Thresholds are per-model overrides; 0 means
 	// inherit the global default.
@@ -1392,7 +1397,7 @@ func (r *NodeRegistry) SetModelScheduling(ctx context.Context, config *ModelSche
 		Clauses(clause.OnConflict{
 			Columns: []clause.Column{{Name: "model_name"}},
 			DoUpdates: clause.AssignmentColumns([]string{
-				"node_selector", "min_replicas", "max_replicas",
+				"node_selector", "min_replicas", "max_replicas", "spread_all",
 				"route_policy", "balance_abs_threshold", "balance_rel_threshold", "min_prefix_match",
 				"updated_at",
 			}),
@@ -1400,6 +1405,20 @@ func (r *NodeRegistry) SetModelScheduling(ctx context.Context, config *ModelSche
 		Create(config).Error
 }

+// SeedModelScheduling applies a batch of scheduling configs at startup by
+// upserting each one through SetModelScheduling, so a listed model's prior row
+// is overwritten while models absent from configs are not touched. It returns
+// on the first failing entry (wrapped with the model name); no rollback of
+// entries already applied is attempted here.
+func (r *NodeRegistry) SeedModelScheduling(ctx context.Context, configs []ModelSchedulingConfig) error {
+	for idx := range configs {
+		// Take the element's address so SetModelScheduling sees the slice entry
+		// itself rather than a copy.
+		entry := &configs[idx]
+		err := r.SetModelScheduling(ctx, entry)
+		if err != nil {
+			return fmt.Errorf("seeding scheduling config for model %q: %w", entry.ModelName, err)
+		}
+		xlog.Info("Seeded model scheduling config", "model", entry.ModelName,
+			"spread_all", entry.SpreadAll, "min", entry.MinReplicas, "max", entry.MaxReplicas)
+	}
+	return nil
+}
+
 // GetModelScheduling returns the scheduling config for a model, or nil if none exists.
 func (r *NodeRegistry) GetModelScheduling(ctx context.Context, modelName string) (*ModelSchedulingConfig, error) {
 	var config ModelSchedulingConfig
@@ -1423,7 +1442,7 @@ func (r *NodeRegistry) ListModelSchedulings(ctx context.Context) ([]ModelSchedul
 // ListAutoScalingConfigs returns scheduling configs where auto-scaling is enabled.
 func (r *NodeRegistry) ListAutoScalingConfigs(ctx context.Context) ([]ModelSchedulingConfig, error) {
 	var configs []ModelSchedulingConfig
-	err := r.db.WithContext(ctx).Where("min_replicas > 0 OR max_replicas > 0").Find(&configs).Error
+	err := r.db.WithContext(ctx).Where("min_replicas > 0 OR max_replicas > 0 OR spread_all = ?", true).Find(&configs).Error
 	return configs, err
 }

--- a/core/services/nodes/registry_test.go
+++ b/core/services/nodes/registry_test.go
@@ -1489,3 +1489,59 @@ var _ = Describe("NodeRegistry", func() {
 		})
 	})
 })
+
+// Specs covering persistence of the SpreadAll flag, its inclusion in the
+// auto-scaling listing, and the overwrite semantics of SeedModelScheduling.
+var _ = Describe("ModelScheduling spread + seeding", func() {
+	var (
+		db       *gorm.DB
+		registry *NodeRegistry
+	)
+
+	BeforeEach(func() {
+		if runtime.GOOS == "darwin" {
+			Skip("testcontainers requires Docker, not available on macOS CI")
+		}
+		db = testutil.SetupTestDB()
+		var err error
+		registry, err = NewNodeRegistry(db)
+		Expect(err).ToNot(HaveOccurred())
+	})
+
+	It("persists and round-trips SpreadAll", func() {
+		Expect(registry.SetModelScheduling(context.Background(), &ModelSchedulingConfig{
+			ModelName: "m", SpreadAll: true,
+		})).To(Succeed())
+		got, err := registry.GetModelScheduling(context.Background(), "m")
+		Expect(err).ToNot(HaveOccurred())
+		// GetModelScheduling returns nil when no row exists; assert before
+		// dereferencing so a missing row fails the spec instead of panicking.
+		Expect(got).ToNot(BeNil())
+		Expect(got.SpreadAll).To(BeTrue())
+	})
+
+	It("includes SpreadAll configs in ListAutoScalingConfigs", func() {
+		Expect(registry.SetModelScheduling(context.Background(), &ModelSchedulingConfig{
+			ModelName: "m", SpreadAll: true,
+		})).To(Succeed())
+		configs, err := registry.ListAutoScalingConfigs(context.Background())
+		Expect(err).ToNot(HaveOccurred())
+		Expect(configs).To(HaveLen(1))
+		Expect(configs[0].ModelName).To(Equal("m"))
+	})
+
+	It("seeds configs with authoritative upsert", func() {
+		// Pre-existing row that the seed must overwrite.
+		Expect(registry.SetModelScheduling(context.Background(), &ModelSchedulingConfig{
+			ModelName: "m", MinReplicas: 9,
+		})).To(Succeed())
+
+		err := registry.SeedModelScheduling(context.Background(), []ModelSchedulingConfig{
+			{ModelName: "m", MinReplicas: 1, MaxReplicas: 2},
+			{ModelName: "n", SpreadAll: true},
+		})
+		Expect(err).ToNot(HaveOccurred())
+
+		// Check the lookup error and nil-ness explicitly: discarding them would
+		// turn a failed read into a nil-pointer panic below.
+		m, err := registry.GetModelScheduling(context.Background(), "m")
+		Expect(err).ToNot(HaveOccurred())
+		Expect(m).ToNot(BeNil())
+		Expect(m.MinReplicas).To(Equal(1))
+		Expect(m.MaxReplicas).To(Equal(2))
+		Expect(m.SpreadAll).To(BeFalse())
+
+		n, err := registry.GetModelScheduling(context.Background(), "n")
+		Expect(err).ToNot(HaveOccurred())
+		Expect(n).ToNot(BeNil())
+		Expect(n.SpreadAll).To(BeTrue())
+	})
+})
--- a/core/services/nodes/scheduling_seed.go
+++ b/core/services/nodes/scheduling_seed.go
@@ -0,0 +1,171 @@
+package nodes
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+	"strings"
+
+	"github.com/mudler/LocalAI/core/services/nodes/prefixcache"
+	"gopkg.in/yaml.v3"
+)
+
+// ReplicasSpec is the decoded form of the "replicas" convenience field in the
+// env/file scheduling config. The string "all" or the boolean true selects
+// "one replica on every matching node"; "", "auto", false and null leave
+// SpreadAll unset so min_replicas/max_replicas apply instead. Numbers are
+// refused with an error that points at min_replicas/max_replicas, the fields
+// meant for fixed counts.
+type ReplicasSpec struct {
+	SpreadAll bool
+}
+
+// set interprets an already-decoded JSON/YAML scalar and stores the result in
+// r.SpreadAll. String values are matched case-insensitively after trimming
+// surrounding whitespace. On error r is left unmodified.
+func (r *ReplicasSpec) set(v any) error {
+	if v == nil {
+		r.SpreadAll = false
+		return nil
+	}
+	if flag, isBool := v.(bool); isBool {
+		r.SpreadAll = flag
+		return nil
+	}
+	text, isString := v.(string)
+	if !isString {
+		// Anything else (notably numbers) is not a valid alias value.
+		return fmt.Errorf("invalid replicas value %v (use min_replicas/max_replicas for a fixed count, or \"all\" to spread)", v)
+	}
+	switch strings.ToLower(strings.TrimSpace(text)) {
+	case "all":
+		r.SpreadAll = true
+	case "", "auto":
+		r.SpreadAll = false
+	default:
+		return fmt.Errorf("invalid replicas value %q (expected \"all\" or \"auto\")", text)
+	}
+	return nil
+}
+
+// UnmarshalJSON implements json.Unmarshaler: it decodes the raw JSON value
+// into an untyped Go value and hands it to set for interpretation.
+func (r *ReplicasSpec) UnmarshalJSON(b []byte) error {
+	var raw any
+	err := json.Unmarshal(b, &raw)
+	if err != nil {
+		return err
+	}
+	return r.set(raw)
+}
+
+// UnmarshalYAML implements yaml.Unmarshaler: it decodes the YAML node into an
+// untyped Go value and hands it to set for interpretation.
+func (r *ReplicasSpec) UnmarshalYAML(value *yaml.Node) error {
+	var raw any
+	err := value.Decode(&raw)
+	if err != nil {
+		return err
+	}
+	return r.set(raw)
+}
+
+// SeedSchedulingEntry is one entry in the env/file scheduling config. It mirrors
+// the API's SetSchedulingRequest shape, plus the "replicas" alias and the
+// canonical "spread_all" boolean.
+type SeedSchedulingEntry struct {
+	// ModelName identifies the model; ValidateSeedEntry rejects an empty value.
+	ModelName    string            `json:"model_name" yaml:"model_name"`
+	// NodeSelector is a label map; toConfig serializes it to a JSON string and
+	// stores "" when the map is empty.
+	NodeSelector map[string]string `json:"node_selector,omitempty" yaml:"node_selector,omitempty"`
+	// MinReplicas and MaxReplicas must be >= 0, and both must be zero when
+	// spread mode is requested (see ValidateSeedEntry).
+	MinReplicas  int               `json:"min_replicas,omitempty" yaml:"min_replicas,omitempty"`
+	MaxReplicas  int               `json:"max_replicas,omitempty" yaml:"max_replicas,omitempty"`
+	// Replicas is the optional "replicas" alias; nil when the key is absent.
+	Replicas     *ReplicasSpec     `json:"replicas,omitempty" yaml:"replicas,omitempty"`
+	// SpreadAll is the canonical spread flag; either it or Replicas enables
+	// spread mode.
+	SpreadAll    bool              `json:"spread_all,omitempty" yaml:"spread_all,omitempty"`
+
+	// Routing fields, checked together by prefixcache.ValidateThresholds and
+	// copied verbatim into the resulting ModelSchedulingConfig.
+	RoutePolicy         string  `json:"route_policy,omitempty" yaml:"route_policy,omitempty"`
+	BalanceAbsThreshold int     `json:"balance_abs_threshold,omitempty" yaml:"balance_abs_threshold,omitempty"`
+	BalanceRelThreshold float64 `json:"balance_rel_threshold,omitempty" yaml:"balance_rel_threshold,omitempty"`
+	MinPrefixMatch      float64 `json:"min_prefix_match,omitempty" yaml:"min_prefix_match,omitempty"`
+}
+
+// spread reports whether the entry asks for spread-to-all-matching-nodes mode.
+// The canonical spread_all field wins outright; otherwise the replicas alias
+// is consulted when it was provided.
+func (e SeedSchedulingEntry) spread() bool {
+	if e.SpreadAll {
+		return true
+	}
+	return e.Replicas != nil && e.Replicas.SpreadAll
+}
+
+// ValidateSeedEntry checks a single scheduling entry and returns the first
+// violated rule as an error, or nil when the entry is acceptable. Rules, in
+// order: model_name present, non-negative min/max replicas, spread mode not
+// combined with explicit replica counts, min <= max when max is set, and
+// finally the routing thresholds via prefixcache.ValidateThresholds.
+func ValidateSeedEntry(e SeedSchedulingEntry) error {
+	model := e.ModelName
+	switch {
+	case model == "":
+		return fmt.Errorf("model_name is required")
+	case e.MinReplicas < 0:
+		return fmt.Errorf("min_replicas must be >= 0 (model %q)", model)
+	case e.MaxReplicas < 0:
+		return fmt.Errorf("max_replicas must be >= 0 (model %q)", model)
+	case e.spread() && (e.MinReplicas != 0 || e.MaxReplicas != 0):
+		return fmt.Errorf("spread (replicas: all) and min_replicas/max_replicas are mutually exclusive (model %q)", model)
+	case e.MaxReplicas > 0 && e.MinReplicas > e.MaxReplicas:
+		// MaxReplicas == 0 is exempt from the ordering check.
+		return fmt.Errorf("min_replicas must be <= max_replicas (model %q)", model)
+	}
+
+	err := prefixcache.ValidateThresholds(e.RoutePolicy, e.BalanceAbsThreshold, e.BalanceRelThreshold, e.MinPrefixMatch)
+	if err != nil {
+		return fmt.Errorf("%w (model %q)", err, model)
+	}
+	return nil
+}
+
+// toConfig converts the entry into a ModelSchedulingConfig row. The selector
+// map is stored as a JSON string (left empty when no labels are given), and
+// SpreadAll is resolved from either spread_all or the replicas alias. A zero
+// config is returned alongside any serialization error.
+func (e SeedSchedulingEntry) toConfig() (ModelSchedulingConfig, error) {
+	cfg := ModelSchedulingConfig{
+		ModelName:           e.ModelName,
+		MinReplicas:         e.MinReplicas,
+		MaxReplicas:         e.MaxReplicas,
+		SpreadAll:           e.spread(),
+		RoutePolicy:         e.RoutePolicy,
+		BalanceAbsThreshold: e.BalanceAbsThreshold,
+		BalanceRelThreshold: e.BalanceRelThreshold,
+		MinPrefixMatch:      e.MinPrefixMatch,
+	}
+	if len(e.NodeSelector) == 0 {
+		return cfg, nil
+	}
+
+	encoded, err := json.Marshal(e.NodeSelector)
+	if err != nil {
+		return ModelSchedulingConfig{}, fmt.Errorf("serializing node_selector for model %q: %w", e.ModelName, err)
+	}
+	cfg.NodeSelector = string(encoded)
+	return cfg, nil
+}
+
+// ParseSchedulingSeed reads scheduling entries from an inline JSON list and/or
+// a YAML file and returns them as validated ModelSchedulingConfig rows ready to
+// upsert. Inline entries come first, followed by the file's entries; either
+// argument may be empty. The first parse, read, validation or conversion
+// failure aborts the whole call with a nil slice.
+func ParseSchedulingSeed(jsonStr, configPath string) ([]ModelSchedulingConfig, error) {
+	var entries []SeedSchedulingEntry
+
+	// Source 1: inline JSON; a blank or whitespace-only string is skipped.
+	if strings.TrimSpace(jsonStr) != "" {
+		var inline []SeedSchedulingEntry
+		err := json.Unmarshal([]byte(jsonStr), &inline)
+		if err != nil {
+			return nil, fmt.Errorf("parsing LOCALAI_MODEL_SCHEDULING JSON: %w", err)
+		}
+		entries = append(entries, inline...)
+	}
+
+	// Source 2: YAML file, appended after the inline entries.
+	if configPath != "" {
+		raw, err := os.ReadFile(configPath)
+		if err != nil {
+			return nil, fmt.Errorf("reading model scheduling config %q: %w", configPath, err)
+		}
+		var fromFile []SeedSchedulingEntry
+		err = yaml.Unmarshal(raw, &fromFile)
+		if err != nil {
+			return nil, fmt.Errorf("parsing model scheduling config %q: %w", configPath, err)
+		}
+		entries = append(entries, fromFile...)
+	}
+
+	// Validate each entry before converting it; nothing is returned on failure.
+	out := make([]ModelSchedulingConfig, 0, len(entries))
+	for idx := range entries {
+		entry := entries[idx]
+		if err := ValidateSeedEntry(entry); err != nil {
+			return nil, err
+		}
+		converted, err := entry.toConfig()
+		if err != nil {
+			return nil, err
+		}
+		out = append(out, converted)
+	}
+	return out, nil
+}
--- a/core/services/nodes/scheduling_seed_test.go
+++ b/core/services/nodes/scheduling_seed_test.go
@@ -0,0 +1,75 @@
+package nodes
+
+import (
+	"os"
+	"path/filepath"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// Specs for ParseSchedulingSeed: inline-JSON parsing, the "replicas" alias and
+// spread_all field, validation failures, empty input, and YAML-file input.
+var _ = Describe("ParseSchedulingSeed", func() {
+	It("parses inline JSON with static min/max replicas", func() {
+		configs, err := ParseSchedulingSeed(`[{"model_name":"m","node_selector":{"tier":"gpu"},"min_replicas":1,"max_replicas":4}]`, "")
+		Expect(err).ToNot(HaveOccurred())
+		Expect(configs).To(HaveLen(1))
+		Expect(configs[0].ModelName).To(Equal("m"))
+		Expect(configs[0].MinReplicas).To(Equal(1))
+		Expect(configs[0].MaxReplicas).To(Equal(4))
+		Expect(configs[0].SpreadAll).To(BeFalse())
+		// The selector map is stored as its JSON serialization.
+		Expect(configs[0].NodeSelector).To(Equal(`{"tier":"gpu"}`))
+	})
+
+	It("maps replicas: all to SpreadAll", func() {
+		configs, err := ParseSchedulingSeed(`[{"model_name":"m","replicas":"all"}]`, "")
+		Expect(err).ToNot(HaveOccurred())
+		Expect(configs[0].SpreadAll).To(BeTrue())
+	})
+
+	It("maps replicas: true to SpreadAll", func() {
+		configs, err := ParseSchedulingSeed(`[{"model_name":"m","replicas":true}]`, "")
+		Expect(err).ToNot(HaveOccurred())
+		Expect(configs[0].SpreadAll).To(BeTrue())
+	})
+
+	It("accepts the spread_all field directly", func() {
+		configs, err := ParseSchedulingSeed(`[{"model_name":"m","spread_all":true}]`, "")
+		Expect(err).ToNot(HaveOccurred())
+		Expect(configs[0].SpreadAll).To(BeTrue())
+	})
+
+	It("rejects spread_all combined with min/max replicas", func() {
+		// Spread is requested here through the replicas alias.
+		_, err := ParseSchedulingSeed(`[{"model_name":"m","replicas":"all","min_replicas":2}]`, "")
+		Expect(err).To(MatchError(ContainSubstring("mutually exclusive")))
+	})
+
+	It("rejects a missing model_name", func() {
+		_, err := ParseSchedulingSeed(`[{"min_replicas":1}]`, "")
+		Expect(err).To(MatchError(ContainSubstring("model_name is required")))
+	})
+
+	It("rejects a numeric replicas value pointing the user at min/max", func() {
+		_, err := ParseSchedulingSeed(`[{"model_name":"m","replicas":3}]`, "")
+		Expect(err).To(MatchError(ContainSubstring("min_replicas")))
+	})
+
+	It("returns no configs for empty input", func() {
+		configs, err := ParseSchedulingSeed("", "")
+		Expect(err).ToNot(HaveOccurred())
+		Expect(configs).To(BeEmpty())
+	})
+
+	It("parses a YAML file with replicas: all and a node_selector", func() {
+		// Write the fixture into a per-spec temp dir supplied by Ginkgo.
+		dir := GinkgoT().TempDir()
+		path := filepath.Join(dir, "scheduling.yaml")
+		yaml := "- model_name: m\n  replicas: all\n  node_selector:\n    tier: gpu\n"
+		Expect(os.WriteFile(path, []byte(yaml), 0o600)).To(Succeed())
+
+		configs, err := ParseSchedulingSeed("", path)
+		Expect(err).ToNot(HaveOccurred())
+		Expect(configs).To(HaveLen(1))
+		Expect(configs[0].ModelName).To(Equal("m"))
+		Expect(configs[0].SpreadAll).To(BeTrue())
+		Expect(configs[0].NodeSelector).To(Equal(`{"tier":"gpu"}`))
+	})
+})
--- a/docs/content/features/distributed-mode.md
+++ b/docs/content/features/distributed-mode.md
@@ -604,6 +604,91 @@ All fields are optional and composable:
 - Replicas only: auto-scale across all nodes
 - Both: auto-scale on matching nodes only

+### Declarative per-model scheduling (unattended installs)
+
+In distributed mode you can declare per-model scheduling at startup, instead of
+using the WebUI/API. Config is **authoritative**: it is re-applied on every boot
+and overwrites the listed models (models not listed are left untouched).
+
+| Variable | Description |
+|----------|-------------|
+| `LOCALAI_MODEL_SCHEDULING` | Inline JSON list of scheduling entries |
+| `LOCALAI_MODEL_SCHEDULING_CONFIG` | Path to a YAML file with the same list |
+
+Entry fields: `model_name` (required), `node_selector` (a label map; **omit it to
+match every node**), and then **one of two replica modes** (they are mutually
+exclusive):
+
+- **`replicas: all`** - static spread: place exactly **one replica on every
+  matching node**, proactively, regardless of load, and keep it in sync as nodes
+  join and leave. Use this for "run model X everywhere (with this label)".
+- **`min_replicas` / `max_replicas`** - elastic auto-scaling: keep at least
+  `min_replicas` running, and burst **up to** `max_replicas` only when all
+  replicas are busy, scaling back down to the minimum when idle. `max_replicas: 0`
+  means **no upper bound** (grow to cluster capacity). To enable this mode you
+  must set `min_replicas >= 1` or `max_replicas >= 1` - an entry with only
+  `max_replicas: 0` (and no `replicas: all`) does nothing.
+
+Net effect at a glance:
+
+| Config | Behavior |
+|--------|----------|
+| `replicas: all` | One replica per matching node, placed immediately, tracks join/leave |
+| `min_replicas: 1, max_replicas: 0` | Always >=1, bursts to cluster capacity under load, back to 1 when idle |
+| `min_replicas: 2, max_replicas: 4` | Always >=2, bursts to at most 4 under load |
+
+`node_selector` constrains which nodes a model may use; with no selector the
+model may use **all** healthy nodes. So "spread model X across all nodes" is just
+`replicas: all` with no `node_selector`. `replicas: all` targets one replica per
+matching node; with the default per-node cap of one replica per model this lands
+exactly one on each node (see the note below about `LOCALAI_MAX_REPLICAS_PER_MODEL`).
+
+YAML example (`scheduling.yaml`):
+
+```yaml
+# One replica on every GPU-labelled node (static spread, tracks join/leave):
+- model_name: gpt-oss
+  node_selector:
+    tier: gpu
+  replicas: all
+
+# One replica on EVERY node in the cluster (no selector = all nodes):
+- model_name: embeddings
+  replicas: all
+
+# Elastic on CPU nodes: always >=1, burst to capacity under load, 0 = no cap:
+- model_name: whisper
+  node_selector:
+    tier: cpu
+  min_replicas: 1
+  max_replicas: 0
+```
+
+```bash
+LOCALAI_DISTRIBUTED=true \
+LOCALAI_MODEL_SCHEDULING_CONFIG=/etc/localai/scheduling.yaml \
+local-ai run
+```
+
+Inline equivalent:
+
+```bash
+LOCALAI_MODEL_SCHEDULING='[{"model_name":"gpt-oss","node_selector":{"tier":"gpu"},"replicas":"all"}]'
+```
+
+Notes:
+
+- Because the config is authoritative, each listed model's **entire** scheduling
+  row is replaced on every boot, including the optional prefix-cache routing
+  overrides (`route_policy`, `balance_abs_threshold`, `balance_rel_threshold`,
+  `min_prefix_match`). For a model you manage via this config, set those fields
+  here too if you need non-default values; values set only through the API are
+  reset on the next restart. Models not listed in the config are never touched.
+- `replicas: all` places one replica per matching node by relying on the default
+  per-node cap of one replica per model. If you raise `LOCALAI_MAX_REPLICAS_PER_MODEL`
+  on a worker above 1, the target count can be met by stacking replicas on fewer
+  nodes rather than spreading one to each.
+
 ## Label Management API

 | Method | Path | Description |
--- a/docs/content/features/openai-realtime.md
+++ b/docs/content/features/openai-realtime.md
@@ -136,3 +136,54 @@ most reliable fix for WebRTC connections that establish and then drop.
 ## Protocol

 The API follows the OpenAI Realtime API protocol for handling sessions, audio buffers, and conversation items.
+
+## Gating a realtime pipeline with voice recognition
+
+A pipeline realtime model can require speaker verification before it responds. Add a `voice_recognition` block under `pipeline`. When present, each committed utterance is verified against authorized speakers; unauthorized utterances are dropped before the LLM runs (no LLM call, no tool execution, no TTS). The session stays open.
+
+```yaml
+name: my-realtime
+pipeline:
+  vad: silero-vad
+  transcription: whisper
+  llm: qwen
+  tts: kokoro
+  voice_recognition:
+    model: speaker-recognition   # the speaker-recognition backend model
+    mode: identify               # "identify" (registry) or "verify" (references)
+    threshold: 0.25              # cosine distance; <= passes
+    when: every                  # "every" (default) or "first"
+    on_reject: drop_event        # "drop_event" (default) or "drop_silent"
+    anti_spoofing: false         # optional liveness check (verify mode)
+
+    # identify mode: authorized registry identities (multiple persons)
+    allow:
+      names: ["alice", "bob"]    # match registered speaker names
+      labels: ["family"]         # OR any identity carrying this label
+      # empty allow = any registered speaker within threshold passes
+
+    # verify mode: reference speakers (multiple persons)
+    references:
+      - name: alice
+        audio: /models/voices/alice.wav
+      - name: bob
+        audio: /models/voices/bob.wav
+```
+
+| Field | Meaning |
+|-------|---------|
+| `model` | Speaker-recognition backend model name. |
+| `mode` | `identify` matches against speakers registered via `/v1/voice/register`; `verify` matches against the `references` audios. |
+| `threshold` | Maximum cosine distance that still counts as a match (default ~0.25). |
+| `when` | `every` verifies each utterance; `first` verifies once then trusts the session. |
+| `on_reject` | `drop_event` drops and emits a `speaker_not_authorized` error event; `drop_silent` drops quietly. |
+| `anti_spoofing` | Verify mode only: runs the backend liveness check (slower). |
+| `allow.names` / `allow.labels` | identify mode: which registry identities are authorized. Empty = any registered speaker. |
+| `references` | verify mode: authorized reference speakers; the utterance passes if it matches any. |
+
+`identify` mode requires the voice registry (speakers registered through `/v1/voice/register`). `verify` mode needs no registry: reference audios are embedded once at model load.
+
+## Examples
+
+- [Realtime voice assistant demo (Go)](https://github.com/localai-org/localai-realtime-demo): a minimal Go client for the Realtime (WebSocket) API with a full talk-back voice loop and an example tool call. Ships a `docker compose` setup that brings up a realtime-capable LocalAI for you.
+- [Realtime voice assistant example (Python)](https://github.com/mudler/LocalAI-examples/tree/main/realtime): thin-client architecture (Silero VAD on the client, heavy lifting on LocalAI), suited to running the client on a Raspberry Pi.
--- a/docs/content/features/text-to-audio.md
+++ b/docs/content/features/text-to-audio.md
@@ -226,6 +226,82 @@ curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
   }' | aplay
 ```

+### OmniVoice
+
+[OmniVoice](https://github.com/ServeurpersoCom/omnivoice.cpp) (`omnivoice-cpp` backend) is a native C++ / GGML text-to-speech engine. It supports voice cloning (from reference audio plus its transcript), voice design (steering the voice with attribute keywords such as gender, age, pitch, style, volume, and emotion), and streaming synthesis. Output is 24kHz mono audio and it covers 646 languages.
+
+#### Setup
+
+Install the `omnivoice-cpp` model in the Model gallery or run `local-ai run models install omnivoice-cpp`. A higher-quality BF16 variant is available as `omnivoice-cpp-hq` (the default `omnivoice-cpp` ships Q8_0 GGUFs).
+
+#### Usage
+
+Use the speech endpoint by specifying the `omnivoice-cpp` model:
+
+```bash
+curl http://localhost:8080/v1/audio/speech -H "Content-Type: application/json" -d '{
+     "model": "omnivoice-cpp",
+     "input": "Hello world, this is a test."
+   }' | aplay
+```
+
+#### Voice cloning
+
+Pass a reference audio file via the `voice` parameter and its transcript via the `ref_text` generation parameter:
+
+```bash
+curl http://localhost:8080/v1/audio/speech -H "Content-Type: application/json" -d '{
+     "model": "omnivoice-cpp",
+     "input": "Hello world, this is a test.",
+     "voice": "path/to/reference_audio.wav",
+     "params": { "ref_text": "This is the transcript of the reference audio." }
+   }' | aplay
+```
+
+You can also pin a default cloned voice in the model config so callers do not have to pass it on every request. Both `tts.voice` and `tts.audio_path` are honored as the reference audio (a per-request `voice` overrides them); paths are resolved relative to the model directory:
+
+```yaml
+name: omnivoice-cpp
+backend: omnivoice-cpp
+parameters:
+  model: omnivoice-cpp/omnivoice-base-Q8_0.gguf
+tts:
+  audio_path: "voices/my_reference.wav"   # default cloning reference (or use tts.voice)
+options:
+  - "tokenizer:omnivoice-cpp/omnivoice-tokenizer-Q8_0.gguf"
+```
+
+#### Voice design
+
+Steer the synthesized voice with attribute keywords (gender, age, pitch, style, volume, emotion) by passing an `instructions` string per request:
+
+```bash
+curl http://localhost:8080/v1/audio/speech -H "Content-Type: application/json" -d '{
+     "model": "omnivoice-cpp",
+     "input": "Hello world, this is a test.",
+     "instructions": "female young high soft emotion:happy"
+   }' | aplay
+```
+
+#### Configuration
+
+The backend loads the base GGUF from `parameters.model` and its tokenizer from the `tokenizer:` option. A few optional generation knobs are available as `options`:
+
+```yaml
+name: omnivoice-cpp
+backend: omnivoice-cpp
+parameters:
+  model: omnivoice-cpp/omnivoice-base-Q8_0.gguf
+options:
+  - "tokenizer:omnivoice-cpp/omnivoice-tokenizer-Q8_0.gguf"
+  - "use_fa:true"      # enable flash attention
+  - "clamp_fp16:true"  # clamp activations for fp16 stability
+  - "seed:42"          # deterministic generation
+  - "denoise:true"     # denoise the generated audio
+```
+
+A per-request `seed` can also be supplied through the `params` map alongside `ref_text`.
+
 ### Pocket TTS

 [Pocket TTS](https://github.com/kyutai-labs/pocket-tts) is a lightweight text-to-speech model designed to run efficiently on CPUs. It supports voice cloning through HuggingFace voice URLs or local audio files.
--- a/docs/content/integrations.md
+++ b/docs/content/integrations.md
@@ -381,7 +381,7 @@ jobs:

 ### Realtime Voice Assistant

-LocalAI supports realtime voice interactions , enabling voice assistant applications with real-time speech-to-speech communication. A complete example implementation is available in the [LocalAI-examples repository](https://github.com/mudler/LocalAI-examples/tree/main/realtime).
+LocalAI supports realtime voice interactions, enabling voice assistant applications with real-time speech-to-speech communication. A complete example implementation is available in the [LocalAI-examples repository](https://github.com/mudler/LocalAI-examples/tree/main/realtime). For a minimal native client, see the [Go realtime voice assistant demo](https://github.com/localai-org/localai-realtime-demo): a tiny Go client for the Realtime (WebSocket) API with a full talk-back loop and an example tool call, plus a `docker compose` setup that brings up a realtime-capable LocalAI for you.

 #### Overview

@@ -457,7 +457,8 @@ The realtime voice assistant example demonstrates how to build a voice assistant

 #### Additional Resources

- [Realtime Voice Assistant Example](https://github.com/mudler/LocalAI-examples/tree/main/realtime)
+- [Realtime Voice Assistant Example (Python)](https://github.com/mudler/LocalAI-examples/tree/main/realtime)
+- [Realtime Voice Assistant Demo (Go)](https://github.com/localai-org/localai-realtime-demo)
 - [LocalAI Realtime API documentation](/features/)
 - [Audio features documentation](/features/text-to-audio/)
 - [Transcription features documentation](/features/audio-to-text/)
--- a/docs/content/reference/compatibility-table.md
+++ b/docs/content/reference/compatibility-table.md
@@ -57,6 +57,7 @@ LocalAI will attempt to automatically load models which are not explicitly confi
 | [VoxCPM](https://github.com/ModelBest/VoxCPM) | Expressive end-to-end TTS | CPU, CUDA 12/13, ROCm, Intel, Metal |
 | [Kitten TTS](https://github.com/KittenML/KittenTTS) | Kitten TTS model | CPU, Metal |
 | [MLX-Audio](https://github.com/Blaizzy/mlx-audio) | Audio models on Apple Silicon | Metal, CPU, CUDA 12/13, Jetson L4T |
+| [OmniVoice](https://github.com/ServeurpersoCom/omnivoice.cpp) | Native C++/GGML TTS with voice cloning, voice design, and streaming | CPU, CUDA 12/13, ROCm, Intel, Metal, Vulkan, Jetson L4T |

 ## Music Generation

--- a/gallery/index.yaml
+++ b/gallery/index.yaml
--- a/go.mod
+++ b/go.mod
@@ -36,7 +36,7 @@ require (
 	github.com/mholt/archiver/v3 v3.5.1
 	github.com/microcosm-cc/bluemonday v1.0.27
 	github.com/modelcontextprotocol/go-sdk v1.5.0
-	github.com/mudler/cogito v0.9.5-0.20260315222927-63abdec7189b
+	github.com/mudler/cogito v0.10.1-0.20260609212329-bf4010d31047
 	github.com/mudler/edgevpn v0.34.0
 	github.com/mudler/go-processmanager v0.1.1
 	github.com/mudler/memory v0.0.0-20260406210934-424c1ecf2cf8
--- a/go.sum
+++ b/go.sum
@@ -968,8 +968,8 @@ github.com/mschoch/smat v0.2.0 h1:8imxQsjDm8yFEAVBe7azKmKSgzSkZXDuKkSq9374khM=
 github.com/mschoch/smat v0.2.0/go.mod h1:kc9mz7DoBKqDyiRL7VZN8KvXQMWeTaVnttLRXOlotKw=
 github.com/mudler/LocalAGI v0.0.0-20260606071251-14aed1ae4336 h1:iKBkSnpisOvMVxFoYsAObvAuOqXBakRPMD0PWxWG5EE=
 github.com/mudler/LocalAGI v0.0.0-20260606071251-14aed1ae4336/go.mod h1:U+g6u8mF2wQxhkdBl3dr8G4db1cv3n7KTKmraoJ7D0c=
-github.com/mudler/cogito v0.9.5-0.20260315222927-63abdec7189b h1:A74T2Lauvg61KodYqsjTYDY05kPLcW+efVZjd23dghU=
-github.com/mudler/cogito v0.9.5-0.20260315222927-63abdec7189b/go.mod h1:6sfja3lcu2nWRzEc0wwqGNu/eCG3EWgij+8s7xyUeQ4=
+github.com/mudler/cogito v0.10.1-0.20260609212329-bf4010d31047 h1:wJ8WbDah1YcpBNRDmovQro8JiR228YFk7TUqPCS4m04=
+github.com/mudler/cogito v0.10.1-0.20260609212329-bf4010d31047/go.mod h1:6sfja3lcu2nWRzEc0wwqGNu/eCG3EWgij+8s7xyUeQ4=
 github.com/mudler/edgevpn v0.34.0 h1:qDrD/rCPFY/FdURbXudIZWihVKY4VOX3nMn3CcbeQEU=
 github.com/mudler/edgevpn v0.34.0/go.mod h1:yki7uMi5LR9gSMrw8PdPieuxsrk8BLV2Ui7VBEmbbIA=
 github.com/mudler/go-piper v0.0.0-20241023091659-2494246fd9fc h1:RxwneJl1VgvikiX28EkpdAyL4yQVnJMrbquKospjHyA=
--- a/pkg/model/loader.go
+++ b/pkg/model/loader.go
@@ -193,8 +193,7 @@ var knownModelsNameSuffixToSkip []string = []string{
 	".pt",
 	".onnx",
 	".md",
-	".MD",
-	".DS_Store",
+	".ds_store",
 	".",
 	".safetensors",
 	".bin",
@@ -203,6 +202,7 @@ var knownModelsNameSuffixToSkip []string = []string{
 	".ckpt",
 	".zip",
 	".tag",
+	".bak",
 	".partial",
 	".tar.gz",
 }
@@ -225,12 +225,18 @@ FILE:
 			}
 		}

-		// Skip templates, YAML, .keep, .json, and .DS_Store files
+		// Skip templates, YAML, .keep, .json, .DS_Store, and other non-model files.
+		// Use case-insensitive matching so e.g. CACHEDIR.TAG is caught by ".tag".
+		lowerName := strings.ToLower(file.Name())
 		for _, skip := range knownModelsNameSuffixToSkip {
-			if strings.HasSuffix(file.Name(), skip) {
+			if strings.HasSuffix(lowerName, skip) {
 				continue FILE
 			}
 		}
+		// Skip backup files created by LocalAI or huggingface_hub (e.g. model.yaml.bak-pre-gpumem072).
+		if strings.Contains(lowerName, ".bak") {
+			continue FILE
+		}

 		// Skip directories
 		if file.IsDir() {
--- a/pkg/xsysinfo/memory.go
+++ b/pkg/xsysinfo/memory.go
@@ -1,9 +1,19 @@
 package xsysinfo

 import (
+	"os"
+
 	"github.com/mudler/memory"
 )

+// cgroup/proc paths used to make the reported RAM total container-aware.
+// They are variables (not consts) so tests could override them if needed.
+var (
+	cgroupV2MaxPath   = "/sys/fs/cgroup/memory.max"
+	cgroupV1LimitPath = "/sys/fs/cgroup/memory/memory.limit_in_bytes"
+	procMemInfoPath   = "/proc/meminfo"
+)
+
 // SystemRAMInfo contains system RAM usage information
 type SystemRAMInfo struct {
 	Total        uint64  `json:"total"`
@@ -13,12 +23,45 @@ type SystemRAMInfo struct {
 	UsagePercent float64 `json:"usage_percent"`
 }

+// readFileBestEffort returns the contents of path as a string, or "" when the
+// file cannot be read. Read errors are deliberately swallowed: the cgroup/proc
+// files this is used for are simply absent on some hosts (e.g. non-Linux).
+func readFileBestEffort(path string) string {
+	if contents, err := os.ReadFile(path); err == nil {
+		return string(contents)
+	}
+	return ""
+}
+
+// systemTotalMemory reports the total system RAM in bytes as seen from inside
+// a container, when one is in use.
+//
+// memory.TotalMemory() yields the HOST kernel total (syscall.Sysinfo on
+// Linux), which lxcfs/LXD does NOT virtualize. Inside a container that value
+// over-reports physical RAM and, together with the virtualized MemAvailable,
+// inflates the reported usage (see issue #8059). The raw cgroup v2/v1 limits
+// and /proc/meminfo are therefore read as well, and chooseTotalMemory picks
+// among all of them.
+func systemTotalMemory() uint64 {
+	v2Max := readFileBestEffort(cgroupV2MaxPath)
+	v1Limit := readFileBestEffort(cgroupV1LimitPath)
+	memInfo := readFileBestEffort(procMemInfoPath)
+	hostTotal := memory.TotalMemory()
+
+	return chooseTotalMemory(v2Max, v1Limit, memInfo, hostTotal)
+}
+
 // GetSystemRAMInfo returns real-time system RAM usage
 func GetSystemRAMInfo() (*SystemRAMInfo, error) {
-	total := memory.TotalMemory()
-	free := memory.AvailableMemory()
+	total := systemTotalMemory()
+	available := memory.AvailableMemory()

-	used := total - free
+	// AvailableMemory (MemAvailable) is virtualized by lxcfs, so in edge
+	// cases it can exceed our corrected total; clamp to avoid an unsigned
+	// underflow when computing Used.
+	if available > total {
+		available = total
+	}
+
+	used := total - available

 	usagePercent := 0.0
 	if total > 0 {
@@ -27,8 +70,8 @@ func GetSystemRAMInfo() (*SystemRAMInfo, error) {
 	return &SystemRAMInfo{
 		Total:        total,
 		Used:         used,
-		Free:         free,
-		Available:    total - used,
+		Free:         available,
+		Available:    available,
 		UsagePercent: usagePercent,
 	}, nil
 }
--- a/pkg/xsysinfo/memory_total.go
+++ b/pkg/xsysinfo/memory_total.go
@@ -0,0 +1,120 @@
+package xsysinfo
+
+import (
+	"strconv"
+	"strings"
+)
+
+// cgroupV1UnlimitedSentinel is the value the kernel writes to
+// memory.limit_in_bytes when no limit is set. It is PAGE_COUNTER_MAX
+// (LONG_MAX rounded down to a page boundary), i.e. 0x7FFFFFFFFFFFF000 on
+// 4 KiB-page systems. Any value at or above this is treated as "no limit".
+const cgroupV1UnlimitedSentinel = uint64(0x7FFFFFFFFFFFF000)
+
+// parseUintField strips surrounding whitespace from raw and parses the rest as
+// a base-10 unsigned 64-bit integer. It reports (0, false) whenever that fails,
+// which includes empty/blank input, non-numeric text and out-of-range values.
+func parseUintField(raw string) (uint64, bool) {
+	// ParseUint rejects the empty string itself, so no separate blank check
+	// is needed after trimming.
+	if n, err := strconv.ParseUint(strings.TrimSpace(raw), 10, 64); err == nil {
+		return n, true
+	}
+	return 0, false
+}
+
+// parseCgroupV2Max converts the contents of a cgroup v2 memory.max file into a
+// byte limit. It returns 0 (meaning "no usable limit") for the literal "max",
+// as well as for empty or unparsable content.
+func parseCgroupV2Max(raw string) uint64 {
+	if strings.TrimSpace(raw) != "max" {
+		if limit, ok := parseUintField(raw); ok {
+			return limit
+		}
+	}
+	return 0
+}
+
+// parseCgroupV1Limit converts the contents of a cgroup v1
+// memory.limit_in_bytes file into a byte limit. It returns 0 (meaning "no
+// usable limit") when the content does not parse, or when the value is at or
+// above cgroupV1UnlimitedSentinel, the kernel's marker for "unlimited".
+func parseCgroupV1Limit(raw string) uint64 {
+	limit, ok := parseUintField(raw)
+	if ok && limit < cgroupV1UnlimitedSentinel {
+		return limit
+	}
+	return 0
+}
+
+// parseMemTotal returns the MemTotal value, in bytes, found in raw
+// /proc/meminfo contents.
+//
+// Only the first line starting with "MemTotal:" is considered. Its numeric
+// field is scaled by the unit suffix that follows it (case-insensitive):
+// kB => x1024, MB => x1024^2, GB => x1024^3. With no suffix, or one that is not
+// recognised, the number is returned unscaled. The result is 0 when the line
+// is missing, has no value field, or the value is not an unsigned integer.
+func parseMemTotal(raw string) uint64 {
+	// Locate the MemTotal line and split it into whitespace-separated fields.
+	// Expected shape: ["MemTotal:", "<value>", "kB"].
+	var fields []string
+	for _, line := range strings.Split(raw, "\n") {
+		if strings.HasPrefix(line, "MemTotal:") {
+			fields = strings.Fields(line)
+			break
+		}
+	}
+	if len(fields) < 2 {
+		return 0
+	}
+
+	amount, err := strconv.ParseUint(fields[1], 10, 64)
+	if err != nil {
+		return 0
+	}
+	if len(fields) == 2 {
+		return amount
+	}
+
+	scale := uint64(1)
+	switch strings.ToLower(fields[2]) {
+	case "kb":
+		scale = 1 << 10
+	case "mb":
+		scale = 1 << 20
+	case "gb":
+		scale = 1 << 30
+	}
+	return amount * scale
+}
+
+// chooseTotalMemory picks the system RAM total, in bytes, from several
+// sources by taking the smallest one that carries a usable value.
+//
+// The inputs are the raw contents of cgroup v2 memory.max, cgroup v1
+// memory.limit_in_bytes and /proc/meminfo, plus sysinfoTotal (the host kernel
+// total from syscall.Sysinfo). On Linux sysinfoTotal is NOT virtualized by
+// lxcfs/LXD, so inside a container it over-reports physical RAM, whereas the
+// cgroup limits and MemTotal reflect the container's view. Candidates are:
+//
+//   - cgroup v2 memory.max ("max" => unlimited, skipped)
+//   - cgroup v1 memory.limit_in_bytes (kernel sentinel => unlimited, skipped)
+//   - /proc/meminfo MemTotal (lxcfs/LXD virtualizes this)
+//   - sysinfoTotal (bare-metal fallback)
+//
+// A candidate of 0 means "unavailable or unlimited" and is ignored. If every
+// candidate is 0 the result is 0. On bare metal the cgroup limits are
+// unlimited and MemTotal == sysinfoTotal, so the result is the host total.
+func chooseTotalMemory(cgroupV2Max, cgroupV1Limit, procMemInfo string, sysinfoTotal uint64) uint64 {
+	var smallest uint64
+	for _, candidate := range [...]uint64{
+		parseCgroupV2Max(cgroupV2Max),
+		parseCgroupV1Limit(cgroupV1Limit),
+		parseMemTotal(procMemInfo),
+		sysinfoTotal,
+	} {
+		switch {
+		case candidate == 0:
+			// Unavailable or unlimited: not a candidate.
+		case smallest == 0, candidate < smallest:
+			smallest = candidate
+		}
+	}
+	return smallest
+}
--- a/pkg/xsysinfo/memory_total_test.go
+++ b/pkg/xsysinfo/memory_total_test.go
@@ -0,0 +1,74 @@
+package xsysinfo
+
+import (
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// Specs for chooseTotalMemory. Each case feeds raw file contents for cgroup v2
+// memory.max, cgroup v1 memory.limit_in_bytes and /proc/meminfo, plus a host
+// sysinfo total, and checks which candidate wins. An empty string stands for
+// "file absent", matching what readFileBestEffort returns on a read error.
+var _ = Describe("chooseTotalMemory", func() {
+	const (
+		gi128 = uint64(128) * 1024 * 1024 * 1024
+		gi20  = uint64(20) * 1024 * 1024 * 1024
+		gi10  = uint64(10) * 1024 * 1024 * 1024
+	)
+
+	// /proc/meminfo MemTotal is in kB; build a snippet for a given byte total.
+	memInfo := func(bytes uint64) string {
+		kb := bytes / 1024
+		return "MemTotal:       " + itoa(kb) + " kB\nMemFree:        123 kB\n"
+	}
+
+	Context("bare metal (no cgroup cap, memory.max == max)", func() {
+		It("uses the host sysinfo total", func() {
+			// MemTotal mirrors sysinfo on bare metal; the cgroup v1 file is
+			// absent, so its contents are the empty string.
+			got := chooseTotalMemory("max\n", "", memInfo(gi128), gi128)
+			Expect(got).To(Equal(gi128))
+		})
+	})
+
+	Context("LXD/lxcfs container (MemTotal virtualized below host, no cap)", func() {
+		It("uses the virtualized MemTotal, not the host sysinfo total", func() {
+			// This is issue #8059: host sysinfo says 128Gi, but lxcfs
+			// virtualizes /proc/meminfo MemTotal to 20Gi and there is no
+			// cgroup cap. The corrected total must be 20Gi.
+			got := chooseTotalMemory("max\n", "", memInfo(gi20), gi128)
+			Expect(got).To(Equal(gi20))
+		})
+	})
+
+	Context("cgroup v2 cap set below MemTotal", func() {
+		It("uses the cgroup cap", func() {
+			got := chooseTotalMemory(itoa(gi10)+"\n", "", memInfo(gi20), gi128)
+			Expect(got).To(Equal(gi10))
+		})
+	})
+
+	Context("cgroup v1 with the kernel unlimited sentinel", func() {
+		It("ignores the sentinel and falls back to MemTotal", func() {
+			// 9223372036854771712 == 0x7FFFFFFFFFFFF000, the v1 "unlimited" value.
+			got := chooseTotalMemory("", "9223372036854771712\n", memInfo(gi20), gi128)
+			Expect(got).To(Equal(gi20))
+		})
+	})
+
+	Context("all candidates empty/unlimited", func() {
+		It("falls back to sysinfo total", func() {
+			got := chooseTotalMemory("max\n", "", "", gi128)
+			Expect(got).To(Equal(gi128))
+		})
+	})
+})
+
+// itoa formats v in base 10. It exists only so this test file does not have to
+// import strconv.
+func itoa(v uint64) string {
+	// Emit digits least-significant first (at least one, so 0 becomes "0"),
+	// then reverse. 20 digits is enough for the largest uint64.
+	digits := make([]byte, 0, 20)
+	for {
+		digits = append(digits, byte('0'+v%10))
+		v /= 10
+		if v == 0 {
+			break
+		}
+	}
+	for lo, hi := 0, len(digits)-1; lo < hi; lo, hi = lo+1, hi-1 {
+		digits[lo], digits[hi] = digits[hi], digits[lo]
+	}
+	return string(digits)
+}
--- a/scripts/changed-backends.js
+++ b/scripts/changed-backends.js
@@ -56,6 +56,11 @@ function inferBackendPathDarwin(item) {
  if (item.backend === "llama-cpp") {
    return `backend/cpp/llama-cpp/`;
  }
+  // ds4 is C++ too (built via `make backends/ds4-darwin`); the matrix entry
+  // carries lang=go for runner/toolchain selection, but the source is C++.
+  if (item.backend === "ds4") {
+    return `backend/cpp/ds4/`;
+  }
  if (!item.lang) {
    return `backend/python/${item.backend}/`;
  }
--- a/tests/e2e/e2e_suite_test.go
+++ b/tests/e2e/e2e_suite_test.go
@@ -236,6 +236,45 @@ var _ = BeforeSuite(func() {
 	Expect(err).ToNot(HaveOccurred())
 	Expect(os.WriteFile(filepath.Join(modelsPath, "realtime-pipeline.yaml"), pipelineData, 0644)).To(Succeed())

+	// Speaker-recognition model (mock-backend) + a voice-recognition-gated
+	// pipeline for the realtime gate e2e. The reference WAV carries a positive
+	// DC bias so the mock embeds it to one orthogonal "speaker"; the test then
+	// drives matching (authorized) and opposite-bias (unauthorized) audio.
+	speakerCfg := map[string]any{
+		"name":       "mock-speaker",
+		"backend":    "mock-backend",
+		"parameters": map[string]any{"model": "mock-speaker.bin"},
+	}
+	speakerData, err := yaml.Marshal(speakerCfg)
+	Expect(err).ToNot(HaveOccurred())
+	Expect(os.WriteFile(filepath.Join(modelsPath, "mock-speaker.yaml"), speakerData, 0644)).To(Succeed())
+
+	voiceRefPath := filepath.Join(modelsPath, "e2e-voice-ref.wav")
+	Expect(os.WriteFile(voiceRefPath, wavFromPCM(pcmWithDC(300, 16000, 1000, 8000), 16000), 0644)).To(Succeed())
+
+	gatedCfg := map[string]any{
+		"name": "realtime-pipeline-gated",
+		"pipeline": map[string]any{
+			"vad":           "mock-vad",
+			"transcription": "mock-stt",
+			"llm":           "mock-llm",
+			"tts":           "mock-tts",
+			"voice_recognition": map[string]any{
+				"model":     "mock-speaker",
+				"mode":      "verify",
+				"threshold": 0.25,
+				"when":      "every",
+				"on_reject": "drop_event",
+				"references": []map[string]any{
+					{"name": "e2e-speaker", "audio": voiceRefPath},
+				},
+			},
+		},
+	}
+	gatedData, err := yaml.Marshal(gatedCfg)
+	Expect(err).ToNot(HaveOccurred())
+	Expect(os.WriteFile(filepath.Join(modelsPath, "realtime-pipeline-gated.yaml"), gatedData, 0644)).To(Succeed())
+
 	// Router model setup: a score classifier (mock-backend Score) selects
 	// between two candidate chat models based on keyword matches against the
 	// candidate label fragments. Exercises the full RouteModel middleware path
--- a/tests/e2e/mock-backend/main.go
+++ b/tests/e2e/mock-backend/main.go
@@ -852,6 +852,70 @@ func (m *MockBackend) ModelMetadata(ctx context.Context, in *pb.ModelOptions) (*
 	}, nil
 }

+// voiceEmbedFromWAV turns the audio file at path into a 2-d mock speaker
+// embedding based on the mean (DC offset) of its samples.
+//
+// The first 44 bytes are skipped as the WAV header without being inspected;
+// the remainder is decoded as 16-bit little-endian signed samples (a trailing
+// odd byte is ignored). A mean above 500 yields {1, 0}, a mean below -500
+// yields {0, 1}, and anything in between yields the neutral {0.7071, 0.7071},
+// which is equidistant from both. This lets e2e tests simulate two distinct
+// "speakers" deterministically; DC is sample-rate independent, so the scheme
+// is meant to survive resampling. Returns nil when the file cannot be read,
+// is shorter than the header, or holds no complete sample.
+func voiceEmbedFromWAV(path string) []float32 {
+	const headerLen = 44
+
+	raw, err := os.ReadFile(path)
+	if err != nil || len(raw) < headerLen {
+		return nil
+	}
+	samples := raw[headerLen:]
+	count := len(samples) / 2
+	if count == 0 {
+		return nil
+	}
+
+	var total float64
+	for i := 0; i < count; i++ {
+		lo, hi := samples[2*i], samples[2*i+1]
+		total += float64(int16(lo) | int16(hi)<<8)
+	}
+
+	dc := total / float64(count)
+	if dc > 500 {
+		return []float32{1, 0}
+	}
+	if dc < -500 {
+		return []float32{0, 1}
+	}
+	return []float32{0.7071, 0.7071}
+}
+
+// VoiceEmbed answers with the deterministic 2-d mock speaker embedding that
+// voiceEmbedFromWAV computes for the request's audio path (a test-only
+// DC-offset discrimination scheme). When no embedding can be derived, the
+// response is returned empty; an error is never returned.
+func (m *MockBackend) VoiceEmbed(ctx context.Context, in *pb.VoiceEmbedRequest) (*pb.VoiceEmbedResponse, error) {
+	audio := in.GetAudio()
+	emb := voiceEmbedFromWAV(audio)
+	xlog.Debug("VoiceEmbed called", "audio", audio, "embedding", emb)
+
+	resp := &pb.VoiceEmbedResponse{}
+	if len(emb) != 0 {
+		resp.Embedding = emb
+		resp.Model = "mock-speaker"
+	}
+	return resp, nil
+}
+
+// VoiceVerify compares the request's two audio clips by cosine distance over
+// their mock embeddings and reports whether the distance is within the
+// threshold. A request threshold of 0 is replaced by 0.25. If either clip
+// yields no 2-d embedding, the distance stays at 1.
+func (m *MockBackend) VoiceVerify(ctx context.Context, in *pb.VoiceVerifyRequest) (*pb.VoiceVerifyResponse, error) {
+	first := voiceEmbedFromWAV(in.GetAudio1())
+	second := voiceEmbedFromWAV(in.GetAudio2())
+
+	distance := float32(1)
+	if len(first) == 2 && len(second) == 2 {
+		// Plain dot product: the mock embeddings are treated as unit vectors,
+		// so no normalisation is applied.
+		distance = 1 - (first[0]*second[0] + first[1]*second[1])
+	}
+
+	limit := in.GetThreshold()
+	if limit == 0 {
+		limit = 0.25
+	}
+
+	xlog.Debug("VoiceVerify called", "distance", distance, "threshold", limit)
+	resp := &pb.VoiceVerifyResponse{
+		Verified:  distance <= limit,
+		Distance:  distance,
+		Threshold: limit,
+		Model:     "mock-speaker",
+	}
+	return resp, nil
+}
+
 func main() {
 	xlog.SetLogger(xlog.NewLogger(xlog.LogLevel(os.Getenv("LOCALAI_LOG_LEVEL")), os.Getenv("LOCALAI_LOG_FORMAT")))

--- a/tests/e2e/realtime_voicegate_test.go
+++ b/tests/e2e/realtime_voicegate_test.go
@@ -0,0 +1,134 @@
+package e2e_test
+
+import (
+	"encoding/base64"
+	"encoding/binary"
+	"math"
+	"time"
+
+	"github.com/gorilla/websocket"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// --- helpers: DC-biased PCM/WAV for the voice-recognition gate e2e ---
+//
+// The mock-backend embeds audio to one of two orthogonal "speaker" vectors
+// based on the signed DC offset of the samples (see voiceEmbedFromWAV in the
+// mock-backend). A positive bias is the authorized speaker (matches the
+// enrolled reference); a negative bias is an unauthorized one.
+
+// pcmWithDC returns durationMs of 16-bit LE mono PCM at sampleRate: a freq-Hz sine wave plus a constant DC bias dc.
+func pcmWithDC(freq float64, sampleRate, durationMs int, dc int16) []byte {
+	numSamples := sampleRate * durationMs / 1000
+	pcm := make([]byte, numSamples*2) // two bytes per 16-bit sample
+	for i := 0; i < numSamples; i++ {
+		t := float64(i) / float64(sampleRate) // time of sample i in seconds
+		v := float64(dc) + math.MaxInt16/4*math.Sin(2*math.Pi*freq*t) // sine amplitude is 8191, about a quarter of full scale
+		if v > math.MaxInt16 { // clamp both ways so the int16 conversion below stays in range
+			v = math.MaxInt16
+		}
+		if v < math.MinInt16 {
+			v = math.MinInt16
+		}
+		s := int16(v)
+		pcm[2*i] = byte(s) // low byte first (little-endian)
+		pcm[2*i+1] = byte(s >> 8)
+	}
+	return pcm
+}
+
+// wavFromPCM wraps 16-bit LE mono PCM in a canonical 44-byte WAV header and returns the header followed by pcm.
+func wavFromPCM(pcm []byte, sampleRate int) []byte {
+	var hdr [44]byte
+	copy(hdr[0:4], "RIFF")
+	binary.LittleEndian.PutUint32(hdr[4:8], uint32(36+len(pcm))) // RIFF size: 44-byte header minus these first 8 bytes, plus the data
+	copy(hdr[8:12], "WAVE")
+	copy(hdr[12:16], "fmt ")
+	binary.LittleEndian.PutUint32(hdr[16:20], 16) // PCM fmt chunk size
+	binary.LittleEndian.PutUint16(hdr[20:22], 1)  // audio format = PCM
+	binary.LittleEndian.PutUint16(hdr[22:24], 1)  // channels = mono
+	binary.LittleEndian.PutUint32(hdr[24:28], uint32(sampleRate))
+	binary.LittleEndian.PutUint32(hdr[28:32], uint32(sampleRate*2)) // byte rate = sample rate * block align
+	binary.LittleEndian.PutUint16(hdr[32:34], 2)                    // block align = 1 channel * 2 bytes per sample
+	binary.LittleEndian.PutUint16(hdr[34:36], 16)                   // bits per sample
+	copy(hdr[36:40], "data")
+	binary.LittleEndian.PutUint32(hdr[40:44], uint32(len(pcm))) // data chunk size in bytes
+	return append(hdr[:], pcm...)
+}
+
+var _ = Describe("Realtime voice recognition gate", Label("Realtime"), func() { // specs: one authorized and one unauthorized clip over "realtime-pipeline-gated"
+	// open connects to the gated pipeline and disables server VAD so we can
+	// commit manually.
+	open := func() *websocket.Conn {
+		c := connectWS("realtime-pipeline-gated")
+		created := readServerEvent(c, 30*time.Second)
+		Expect(created["type"]).To(Equal("session.created")) // the first event read must be session.created
+		sendClientEvent(c, disableVADEvent())
+		drainUntil(c, "session.updated", 10*time.Second)
+		return c
+	}
+
+	// commit appends raw PCM (base64) and commits the input buffer.
+	commit := func(c *websocket.Conn, pcm []byte) {
+		sendClientEvent(c, map[string]any{
+			"type":  "input_audio_buffer.append",
+			"audio": base64.StdEncoding.EncodeToString(pcm),
+		})
+		sendClientEvent(c, map[string]any{"type": "input_audio_buffer.commit"})
+	}
+
+	It("admits an authorized speaker through to a full response", func() {
+		c := open()
+		defer c.Close()
+
+		// Positive DC bias matches the enrolled reference speaker.
+		commit(c, pcmWithDC(300, 16000, 1000, 8000))
+		drainUntil(c, "input_audio_buffer.committed", 30*time.Second)
+
+		var gotDone, gotReject bool
+		deadline := time.Now().Add(60 * time.Second) // overall budget for the whole response
+		for time.Now().Before(deadline) {
+			evt := readServerEvent(c, time.Until(deadline)) // per-read timeout shrinks as the deadline approaches
+			if evt["type"] == "error" {
+				if e, ok := evt["error"].(map[string]any); ok && e["code"] == "speaker_not_authorized" {
+					gotReject = true // recorded but keep reading; only response.done ends the loop early
+				}
+			}
+			if evt["type"] == "response.done" {
+				gotDone = true
+				break
+			}
+		}
+		Expect(gotReject).To(BeFalse(), "authorized speaker must not be rejected")
+		Expect(gotDone).To(BeTrue(), "authorized speaker should reach response.done")
+	})
+
+	It("drops an unauthorized speaker before the LLM with a reject event", func() {
+		c := open()
+		defer c.Close()
+
+		// Negative DC bias is a different speaker, not within threshold.
+		commit(c, pcmWithDC(300, 16000, 1000, -8000))
+		drainUntil(c, "input_audio_buffer.committed", 30*time.Second)
+
+		var gotReject, gotDone bool
+		deadline := time.Now().Add(30 * time.Second)
+		for time.Now().Before(deadline) {
+			evt := readServerEvent(c, time.Until(deadline))
+			switch evt["type"] {
+			case "error":
+				if e, ok := evt["error"].(map[string]any); ok && e["code"] == "speaker_not_authorized" {
+					gotReject = true
+				}
+			case "response.done":
+				gotDone = true
+			}
+			if gotReject { // stop at the first reject; events arriving after it are not inspected
+				break
+			}
+		}
+		Expect(gotReject).To(BeTrue(), "unauthorized speaker should get a speaker_not_authorized event")
+		Expect(gotDone).To(BeFalse(), "unauthorized speaker must not reach the LLM/response.done") // NOTE(review): only covers response.done seen before the reject, since the loop breaks on it
+	})
+})