Compare commits

..

1 Commits

Author SHA1 Message Date
Ettore Di Giacinto
69c7a8e71d fix(mlx): strip file:// LocalPrefix before loading filesystem-imported models
MLX backends passed request.Model verbatim to mlx_lm/mlx_vlm load(). For a
model imported from the filesystem, LocalAI hands the backend a file:// URI
(its LocalPrefix), which load() rejects: the scheme is neither a valid HF
repo id nor an existing path (Path(model).exists() fails on the scheme),
producing "Repo id must be in the form 'repo_name' or 'namespace/repo_name'
... Use repo_type argument if needed".

Add a pure, unit-testable resolve_model_path(model, model_file) helper in the
shared python_utils: it prefers the resolved ModelFile, strips a file://
scheme and percent-decodes the path, and leaves plain repo ids and local
paths untouched. Wire it into the mlx, mlx-vlm and mlx-distributed backends
(load, model_key, and the distributed broadcast all use the normalized path).

Fixes #7461.

Assisted-by: claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-06-12 22:07:06 +00:00
172 changed files with 1014 additions and 13508 deletions

View File

@@ -716,19 +716,6 @@ include:
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "8"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-cuda-12-depth-anything-cpp'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "depth-anything-cpp"
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "8"
@@ -794,19 +781,6 @@ include:
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "8"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-cuda-12-omnivoice-cpp'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "omnivoice-cpp"
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "8"
@@ -1595,19 +1569,6 @@ include:
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "13"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-cuda-13-depth-anything-cpp'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "depth-anything-cpp"
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "13"
cuda-minor-version: "0"
@@ -1647,19 +1608,6 @@ include:
backend: "locate-anything-cpp"
dockerfile: "./backend/Dockerfile.golang"
context: "./"
- build-type: 'cublas'
cuda-major-version: "13"
cuda-minor-version: "0"
platforms: 'linux/arm64'
skip-drivers: 'false'
tag-latest: 'auto'
tag-suffix: '-nvidia-l4t-cuda-13-arm64-depth-anything-cpp'
base-image: "ubuntu:24.04"
ubuntu-version: '2404'
runs-on: 'ubuntu-24.04-arm'
backend: "depth-anything-cpp"
dockerfile: "./backend/Dockerfile.golang"
context: "./"
- build-type: 'cublas'
cuda-major-version: "13"
cuda-minor-version: "0"
@@ -1764,19 +1712,6 @@ include:
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "13"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-cuda-13-omnivoice-cpp'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "omnivoice-cpp"
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "13"
cuda-minor-version: "0"
@@ -1816,19 +1751,6 @@ include:
backend: "qwen3-tts-cpp"
dockerfile: "./backend/Dockerfile.golang"
context: "./"
- build-type: 'cublas'
cuda-major-version: "13"
cuda-minor-version: "0"
platforms: 'linux/arm64'
skip-drivers: 'false'
tag-latest: 'auto'
tag-suffix: '-nvidia-l4t-cuda-13-arm64-omnivoice-cpp'
base-image: "ubuntu:24.04"
ubuntu-version: '2404'
runs-on: 'ubuntu-24.04-arm'
backend: "omnivoice-cpp"
dockerfile: "./backend/Dockerfile.golang"
context: "./"
- build-type: 'cublas'
cuda-major-version: "13"
cuda-minor-version: "0"
@@ -2937,19 +2859,6 @@ include:
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
- build-type: ''
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-cpu-depth-anything-cpp'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "depth-anything-cpp"
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
- build-type: 'sycl_f32'
cuda-major-version: ""
cuda-minor-version: ""
@@ -2963,19 +2872,6 @@ include:
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
- build-type: 'sycl_f32'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-intel-sycl-f32-depth-anything-cpp'
runs-on: 'ubuntu-latest'
base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
skip-drivers: 'false'
backend: "depth-anything-cpp"
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
- build-type: 'sycl_f16'
cuda-major-version: ""
cuda-minor-version: ""
@@ -2989,19 +2885,6 @@ include:
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
- build-type: 'sycl_f16'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-intel-sycl-f16-depth-anything-cpp'
runs-on: 'ubuntu-latest'
base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
skip-drivers: 'false'
backend: "depth-anything-cpp"
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
- build-type: 'vulkan'
cuda-major-version: ""
cuda-minor-version: ""
@@ -3016,20 +2899,6 @@ include:
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
- build-type: 'vulkan'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
platform-tag: 'amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-vulkan-depth-anything-cpp'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "depth-anything-cpp"
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
- build-type: 'vulkan'
cuda-major-version: ""
cuda-minor-version: ""
@@ -3044,20 +2913,6 @@ include:
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
- build-type: 'vulkan'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/arm64'
platform-tag: 'arm64'
tag-latest: 'auto'
tag-suffix: '-gpu-vulkan-depth-anything-cpp'
runs-on: 'ubuntu-24.04-arm'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "depth-anything-cpp"
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
- build-type: 'sycl_f32'
cuda-major-version: ""
cuda-minor-version: ""
@@ -3164,19 +3019,6 @@ include:
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2204'
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "0"
platforms: 'linux/arm64'
skip-drivers: 'false'
tag-latest: 'auto'
tag-suffix: '-nvidia-l4t-arm64-depth-anything-cpp'
base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
runs-on: 'ubuntu-24.04-arm'
backend: "depth-anything-cpp"
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2204'
# whisper
- build-type: ''
cuda-major-version: ""
@@ -3641,35 +3483,6 @@ include:
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
# omnivoice-cpp
- build-type: ''
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
platform-tag: 'amd64'
tag-latest: 'auto'
tag-suffix: '-cpu-omnivoice-cpp'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "omnivoice-cpp"
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
- build-type: ''
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/arm64'
platform-tag: 'arm64'
tag-latest: 'auto'
tag-suffix: '-cpu-omnivoice-cpp'
runs-on: 'ubuntu-24.04-arm'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "omnivoice-cpp"
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
- build-type: 'sycl_f32'
cuda-major-version: ""
cuda-minor-version: ""
@@ -3683,19 +3496,6 @@ include:
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
- build-type: 'sycl_f32'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-intel-sycl-f32-omnivoice-cpp'
runs-on: 'ubuntu-latest'
base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
skip-drivers: 'false'
backend: "omnivoice-cpp"
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
- build-type: 'sycl_f16'
cuda-major-version: ""
cuda-minor-version: ""
@@ -3709,19 +3509,6 @@ include:
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
- build-type: 'sycl_f16'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-intel-sycl-f16-omnivoice-cpp'
runs-on: 'ubuntu-latest'
base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
skip-drivers: 'false'
backend: "omnivoice-cpp"
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
- build-type: 'vulkan'
cuda-major-version: ""
cuda-minor-version: ""
@@ -3736,20 +3523,6 @@ include:
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
- build-type: 'vulkan'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
platform-tag: 'amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-vulkan-omnivoice-cpp'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "omnivoice-cpp"
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
- build-type: 'vulkan'
cuda-major-version: ""
cuda-minor-version: ""
@@ -3764,20 +3537,6 @@ include:
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
- build-type: 'vulkan'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/arm64'
platform-tag: 'arm64'
tag-latest: 'auto'
tag-suffix: '-gpu-vulkan-omnivoice-cpp'
runs-on: 'ubuntu-24.04-arm'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "omnivoice-cpp"
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "0"
@@ -3791,19 +3550,6 @@ include:
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2204'
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "0"
platforms: 'linux/arm64'
skip-drivers: 'false'
tag-latest: 'auto'
tag-suffix: '-nvidia-l4t-arm64-omnivoice-cpp'
base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
runs-on: 'ubuntu-24.04-arm'
backend: "omnivoice-cpp"
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2204'
- build-type: 'hipblas'
cuda-major-version: ""
cuda-minor-version: ""
@@ -3817,19 +3563,6 @@ include:
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
- build-type: 'hipblas'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-rocm-hipblas-omnivoice-cpp'
base-image: "rocm/dev-ubuntu-24.04:6.4.4"
runs-on: 'ubuntu-latest'
skip-drivers: 'false'
backend: "omnivoice-cpp"
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
# vibevoice-cpp
- build-type: ''
cuda-major-version: ""
@@ -4609,36 +4342,6 @@ include:
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
# supertonic CPU (amd64)
- build-type: ''
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
platform-tag: 'amd64'
tag-latest: 'auto'
tag-suffix: '-cpu-supertonic'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "supertonic"
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
# supertonic CPU (arm64)
- build-type: ''
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/arm64'
platform-tag: 'arm64'
tag-latest: 'auto'
tag-suffix: '-cpu-supertonic'
runs-on: 'ubuntu-24.04-arm'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "supertonic"
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
# Darwin matrix (consumed by backend-jobs-darwin).
includeDarwin:
@@ -4690,10 +4393,6 @@ includeDarwin:
tag-suffix: "-metal-darwin-arm64-qwen3-tts-cpp"
build-type: "metal"
lang: "go"
- backend: "omnivoice-cpp"
tag-suffix: "-metal-darwin-arm64-omnivoice-cpp"
build-type: "metal"
lang: "go"
- backend: "vibevoice-cpp"
tag-suffix: "-metal-darwin-arm64-vibevoice-cpp"
build-type: "metal"
@@ -4776,6 +4475,3 @@ includeDarwin:
- backend: "speaker-recognition"
tag-suffix: "-metal-darwin-arm64-speaker-recognition"
build-type: "mps"
- backend: "ds4"
tag-suffix: "-metal-darwin-arm64-ds4"
lang: "go"

View File

@@ -70,10 +70,6 @@ jobs:
variable: "QWEN3TTS_CPP_VERSION"
branch: "main"
file: "backend/go/qwen3-tts-cpp/Makefile"
- repository: "ServeurpersoCom/omnivoice.cpp"
variable: "OMNIVOICE_VERSION"
branch: "master"
file: "backend/go/omnivoice-cpp/Makefile"
- repository: "localai-org/vibevoice.cpp"
variable: "VIBEVOICE_CPP_VERSION"
branch: "master"

View File

@@ -21,10 +21,7 @@ jobs:
uses: securego/gosec@v2.27.1
with:
# we let the report content trigger a failure using the GitHub Security features.
# backend/go/supertonic is excluded: it vendors upstream supertone-inc/supertonic
# (helper.go), whose findings (G304 model-file loads, G404 math/rand for flow-matching
# noise, G104 unhandled errors) are inherent to that upstream code, not ours to rewrite.
args: '-no-fail -exclude-dir=backend/go/supertonic -fmt sarif -out results.sarif ./...'
args: '-no-fail -fmt sarif -out results.sarif ./...'
- name: Upload SARIF file
if: ${{ github.actor != 'dependabot[bot]' }}
uses: github/codeql-action/upload-sarif@v4

View File

@@ -74,8 +74,6 @@ linters:
paths:
# Upstream whisper.cpp source tree fetched by the whisper backend Makefile.
- 'backend/go/whisper/sources'
# Vendored upstream supertonic pipeline (supertone-inc/supertonic go/helper.go).
- 'backend/go/supertonic/helper.go'
- 'docs/'
rules:
# CLI entry points: kong's `env:"..."` tag is the legitimate env→struct

View File

@@ -1,5 +1,5 @@
# Disable parallel execution for backend builds
.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/turboquant backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/crispasr backends/parakeet-cpp backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/rfdetr-cpp backends/insightface backends/speaker-recognition backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/sglang backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/acestep-cpp backends/fish-speech backends/voxtral backends/opus backends/trl backends/llama-cpp-quantization backends/kokoros backends/sam3-cpp backends/qwen3-tts-cpp backends/omnivoice-cpp backends/vibevoice-cpp backends/localvqe backends/tinygrad backends/sherpa-onnx backends/ds4 backends/ds4-darwin backends/liquid-audio backends/supertonic
.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/turboquant backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/crispasr backends/parakeet-cpp backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/rfdetr-cpp backends/insightface backends/speaker-recognition backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/sglang backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/acestep-cpp backends/fish-speech backends/voxtral backends/opus backends/trl backends/llama-cpp-quantization backends/kokoros backends/sam3-cpp backends/qwen3-tts-cpp backends/vibevoice-cpp backends/localvqe backends/tinygrad backends/sherpa-onnx backends/ds4 backends/ds4-darwin backends/liquid-audio
GOCMD=go
GOTEST=$(GOCMD) test
@@ -595,7 +595,6 @@ test-extra: prepare-test-extra
$(MAKE) -C backend/rust/kokoros test
$(MAKE) -C backend/go/rfdetr-cpp test
$(MAKE) -C backend/go/locate-anything-cpp test
$(MAKE) -C backend/go/supertonic test
##
## End-to-end gRPC tests that exercise a built backend container image.
@@ -1177,12 +1176,10 @@ BACKEND_PARAKEET_CPP = parakeet-cpp|golang|.|false|true
BACKEND_VOXTRAL = voxtral|golang|.|false|true
BACKEND_ACESTEP_CPP = acestep-cpp|golang|.|false|true
BACKEND_QWEN3_TTS_CPP = qwen3-tts-cpp|golang|.|false|true
BACKEND_OMNIVOICE_CPP = omnivoice-cpp|golang|.|false|true
BACKEND_VIBEVOICE_CPP = vibevoice-cpp|golang|.|false|true
BACKEND_LOCALVQE = localvqe|golang|.|false|true
BACKEND_OPUS = opus|golang|.|false|true
BACKEND_SHERPA_ONNX = sherpa-onnx|golang|.|false|true
BACKEND_SUPERTONIC = supertonic|golang|.|false|true
# Python backends with root context
BACKEND_RERANKERS = rerankers|python|.|false|true
@@ -1297,7 +1294,6 @@ $(eval $(call generate-docker-build-target,$(BACKEND_WHISPERX)))
$(eval $(call generate-docker-build-target,$(BACKEND_ACE_STEP)))
$(eval $(call generate-docker-build-target,$(BACKEND_ACESTEP_CPP)))
$(eval $(call generate-docker-build-target,$(BACKEND_QWEN3_TTS_CPP)))
$(eval $(call generate-docker-build-target,$(BACKEND_OMNIVOICE_CPP)))
$(eval $(call generate-docker-build-target,$(BACKEND_VIBEVOICE_CPP)))
$(eval $(call generate-docker-build-target,$(BACKEND_LOCALVQE)))
$(eval $(call generate-docker-build-target,$(BACKEND_MLX)))
@@ -1310,13 +1306,12 @@ $(eval $(call generate-docker-build-target,$(BACKEND_KOKOROS)))
$(eval $(call generate-docker-build-target,$(BACKEND_SAM3_CPP)))
$(eval $(call generate-docker-build-target,$(BACKEND_RFDETR_CPP)))
$(eval $(call generate-docker-build-target,$(BACKEND_SHERPA_ONNX)))
$(eval $(call generate-docker-build-target,$(BACKEND_SUPERTONIC)))
# Pattern rule for docker-save targets
docker-save-%: backend-images
docker save local-ai-backend:$* -o backend-images/$*.tar
docker-build-backends: docker-build-llama-cpp docker-build-ik-llama-cpp docker-build-turboquant docker-build-ds4 docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-sglang docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-crispasr docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-liquid-audio docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-fish-speech docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-acestep-cpp docker-build-voxtral docker-build-mlx-distributed docker-build-trl docker-build-llama-cpp-quantization docker-build-tinygrad docker-build-kokoros docker-build-sam3-cpp docker-build-rfdetr-cpp docker-build-qwen3-tts-cpp docker-build-omnivoice-cpp docker-build-vibevoice-cpp docker-build-localvqe docker-build-insightface docker-build-speaker-recognition docker-build-sherpa-onnx docker-build-cloud-proxy docker-build-supertonic
docker-build-backends: docker-build-llama-cpp docker-build-ik-llama-cpp docker-build-turboquant docker-build-ds4 docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-sglang docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-crispasr docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-liquid-audio docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-fish-speech docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-acestep-cpp docker-build-voxtral docker-build-mlx-distributed docker-build-trl docker-build-llama-cpp-quantization docker-build-tinygrad docker-build-kokoros docker-build-sam3-cpp docker-build-rfdetr-cpp docker-build-qwen3-tts-cpp docker-build-vibevoice-cpp docker-build-localvqe docker-build-insightface docker-build-speaker-recognition docker-build-sherpa-onnx docker-build-cloud-proxy
########################################################
### Mock Backend for E2E Tests

View File

@@ -165,10 +165,6 @@ For more details, see the [Getting Started guide](https://localai.io/basics/gett
## Latest News
- **June 2026**: New [realtime voice assistant demo](https://github.com/localai-org/localai-realtime-demo) (a tiny Go client for the Realtime API with a full talk-back voice loop and tool calling), plus [streaming of the realtime LLM / TTS / transcription pipeline stages](https://github.com/mudler/LocalAI/pull/10176) and [configurable WebRTC ICE candidates](https://github.com/mudler/LocalAI/pull/10231).
- **June 2026**: Big speech push: the [parakeet.cpp](https://github.com/mudler/parakeet.cpp) ASR engine gains [NeMo-faithful segment timestamps](https://github.com/mudler/LocalAI/pull/10207), a [multilingual streaming Nemotron-3.5 model](https://github.com/mudler/LocalAI/pull/10199), [dynamic batching for concurrent transcription](https://github.com/mudler/LocalAI/pull/10112) and [CUDA graphs](https://github.com/mudler/LocalAI/pull/10273); the new [CrispASR backend](https://github.com/mudler/LocalAI/pull/10099) adds multi-architecture ASR + TTS, and [60 Piper TTS voices across 42 languages](https://github.com/mudler/LocalAI/pull/10296) land in the gallery (plus [per-request TTS instructions and params](https://github.com/mudler/LocalAI/pull/10172)).
- **June 2026**: New backends and models: [locate-anything.cpp](https://github.com/mudler/LocalAI/pull/10264) for open-vocabulary object detection via ggml, [Ideogram4 image generation](https://github.com/mudler/LocalAI/pull/10201) in stablediffusion-ggml, [llama.cpp video input](https://github.com/mudler/LocalAI/pull/10216), and the [Gemma 4 QAT family with MTP speculative-decoding pairs](https://github.com/mudler/LocalAI/pull/10215). Plus an [interactive CLI chat mode](https://github.com/mudler/LocalAI/pull/10226) and [RAG source citations in agent responses](https://github.com/mudler/LocalAI/pull/10228).
- **June 2026**: Distributed mode hardening: [prefix-cache-aware routing](https://github.com/mudler/LocalAI/pull/10071), a [production-ready request router with auto-sized embedding/rerank batches](https://github.com/mudler/LocalAI/pull/10104), [ds4 layer-split distributed inference](https://github.com/mudler/LocalAI/pull/10098), [NATS JWT auth + TLS/mTLS](https://github.com/mudler/LocalAI/pull/10159), and [resumable file uploads](https://github.com/mudler/LocalAI/pull/10109).
- **May 2026**: **LocalAI 4.3.0** - `llama.cpp` [prompt cache on by default](https://github.com/mudler/LocalAI/pull/9925) (repeated system prompts collapse from minutes to seconds), [keyless cosign signing of backend OCI images](https://github.com/mudler/LocalAI/pull/9823), [per-API-key + per-user usage attribution](https://github.com/mudler/LocalAI/pull/9920), Distributed v3 with [per-request replica routing](https://github.com/mudler/LocalAI/pull/9968). [Release notes](https://github.com/mudler/LocalAI/releases/tag/v4.3.0)
- **May 2026**: **LocalAI 4.2.0** - LocalAI sees and hears: [voice recognition](https://github.com/mudler/LocalAI/pull/9500), [face recognition + antispoofing liveness](https://github.com/mudler/LocalAI/pull/9480), speaker diarization. Plus [drop-in Ollama API](https://github.com/mudler/LocalAI/pull/9284), [video generation](https://github.com/mudler/LocalAI/pull/9420), redesigned UI with i18n + admin-configurable branding, vLLM at feature parity with llama.cpp, and 11 new backends. [Release notes](https://github.com/mudler/LocalAI/releases/tag/v4.2.0)
- **April 2026**: **LocalAI 4.1.0** - LocalAI becomes a control tower: distributed cluster mode with VRAM-aware smart routing + autoscaling, multi-user platform with OIDC and API keys, per-user quotas with predictive analytics, in-UI fine-tuning with TRL (auto-export to GGUF), on-the-fly quantization backend, visual pipeline editor. [Release notes](https://github.com/mudler/LocalAI/releases/tag/v4.1.0)
@@ -221,7 +217,7 @@ See the full [Backend & Model Compatibility Table](https://localai.io/model-comp
- [Integrations & community projects](https://localai.io/docs/integrations/)
- [Installation video walkthrough](https://www.youtube.com/watch?v=cMVNnlqwfw4)
- [Media & blog posts](https://localai.io/basics/news/#media-blogs-social)
- [Examples](https://github.com/mudler/LocalAI-examples) — including the [realtime voice assistant demo](https://github.com/localai-org/localai-realtime-demo) (Go client for the Realtime API with tool calling)
- [Examples](https://github.com/mudler/LocalAI-examples)
## Team

View File

@@ -24,7 +24,6 @@ service Backend {
rpc TokenizeString(PredictOptions) returns (TokenizationResponse) {}
rpc Status(HealthMessage) returns (StatusResponse) {}
rpc Detect(DetectOptions) returns (DetectResponse) {}
rpc Depth(DepthRequest) returns (DepthResponse) {}
rpc FaceVerify(FaceVerifyRequest) returns (FaceVerifyResponse) {}
rpc FaceAnalyze(FaceAnalyzeRequest) returns (FaceAnalyzeResponse) {}
rpc VoiceVerify(VoiceVerifyRequest) returns (VoiceVerifyResponse) {}
@@ -671,35 +670,6 @@ message DetectResponse {
repeated Detection Detections = 1;
}
// --- Depth estimation messages (Depth Anything 3) ---
message DepthRequest {
string src = 1; // input image (filesystem path or base64-encoded payload)
string dst = 2; // optional output directory for exports (glb/colmap)
bool include_depth = 3; // return the per-pixel metric depth map
bool include_confidence = 4; // return the per-pixel confidence map (DualDPT)
bool include_pose = 5; // return camera extrinsics/intrinsics (DualDPT)
bool include_sky = 6; // return the per-pixel sky map (mono models)
bool include_points = 7; // back-project to a 3D point cloud (DualDPT)
float points_conf_thresh = 8; // keep points with confidence >= this threshold
repeated string exports = 9; // requested exports: "glb", "colmap"
}
message DepthResponse {
int32 width = 1; // processed depth-map width
int32 height = 2; // processed depth-map height
repeated float depth = 3; // width*height row-major metric depth
repeated float confidence = 4; // width*height row-major confidence (DualDPT)
repeated float sky = 5; // width*height row-major sky map (mono)
repeated float extrinsics = 6; // 12 floats, 3x4 row-major (world-to-camera)
repeated float intrinsics = 7; // 9 floats, 3x3 row-major
int32 num_points = 8; // number of 3D points
repeated float points = 9; // num_points*3 xyz, world space
bytes point_colors = 10; // num_points*3 uint8 rgb
repeated string export_paths = 11; // paths written for the requested exports
bool is_metric = 12; // depth is in metric units
}
// --- Face recognition messages ---
message FacialArea {

View File

@@ -1,5 +1,5 @@
IK_LLAMA_VERSION?=5f917a64b391b7d31839845153a473a65f630458
IK_LLAMA_VERSION?=e6f8112f3ba126eed3ff5b30cdd08085414a7516
LLAMA_REPO?=https://github.com/ikawrakow/ik_llama.cpp
CMAKE_ARGS?=

View File

@@ -1,5 +1,5 @@
LLAMA_VERSION?=4988f6e866057afd130c1515ecef0c9bab9a15f8
LLAMA_VERSION?=4c6595503fe45d5a39f88d194e270f64c7424677
LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
CMAKE_ARGS?=

View File

@@ -1,7 +0,0 @@
sources/
build*/
package/
libdepthanythingcpp*.so
depth-anything-cpp
test-models/
test-data/

View File

@@ -1,28 +0,0 @@
cmake_minimum_required(VERSION 3.18)
project(libdepthanythingcpp LANGUAGES C CXX)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
# Static-link ggml into the depth-anything shared library so the resulting .so
# has no runtime dependency on an external libggml — only on
# libc/libstdc++/libgomp, which the LocalAI package step bundles into the
# docker image.
set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build static libraries" FORCE)
# depth-anything.cpp build switches: skip CLI/tests, but build libdepthanything
# itself as a SHARED library (DA_SHARED) while ggml stays static
# (BUILD_SHARED_LIBS OFF above). The da_capi_* C ABI is compiled into
# src/da_capi.cpp and re-exported by that shared library, so no extra MODULE
# wrapper is needed (unlike locate-anything.cpp).
set(DA_BUILD_CLI OFF CACHE BOOL "Disable depth-anything CLI" FORCE)
set(DA_BUILD_TESTS OFF CACHE BOOL "Disable depth-anything tests" FORCE)
set(DA_SHARED ON CACHE BOOL "Build libdepthanything as a shared lib" FORCE)
add_subdirectory(./sources/depth-anything.cpp)
# Emit libdepthanything.so into the top-level build dir so the Makefile can
# rename it to the per-variant libdepthanythingcpp-<variant>.so.
set_target_properties(depthanything PROPERTIES
LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})

View File

@@ -1,136 +0,0 @@
CMAKE_ARGS?=
BUILD_TYPE?=
NATIVE?=false
GOCMD?=go
GO_TAGS?=
JOBS?=$(shell nproc --ignore=1)
# depth-anything.cpp. Pin to a specific commit for a stable build; a squash
# merge upstream can orphan a branch, so the native version is pinned by SHA.
DEPTHANYTHING_REPO?=https://github.com/mudler/depth-anything.cpp.git
DEPTHANYTHING_VERSION?=e0b6814d2f58261216da69d63326f1f2d75d4435
ifeq ($(NATIVE),false)
CMAKE_ARGS+=-DGGML_NATIVE=OFF
endif
# Forward LocalAI's BUILD_TYPE to the matching ggml backend switch. depth-anything.cpp
# force-sets GGML_CUDA/GGML_VULKAN/GGML_METAL from its own DA_GGML_* options, so
# those must be toggled via the DA_GGML_* names (a bare -DGGML_CUDA=ON would be
# overridden); the remaining ggml switches pass straight through.
ifeq ($(BUILD_TYPE),cublas)
CMAKE_ARGS+=-DGGML_CUDA=ON -DDA_GGML_CUDA=ON
else ifeq ($(BUILD_TYPE),openblas)
CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
else ifeq ($(BUILD_TYPE),clblas)
CMAKE_ARGS+=-DGGML_CLBLAST=ON
else ifeq ($(BUILD_TYPE),hipblas)
ROCM_HOME ?= /opt/rocm
ROCM_PATH ?= /opt/rocm
export CXX=$(ROCM_HOME)/llvm/bin/clang++
export CC=$(ROCM_HOME)/llvm/bin/clang
AMDGPU_TARGETS?=gfx908,gfx90a,gfx942,gfx950,gfx1030,gfx1100,gfx1101,gfx1102,gfx1200,gfx1201
CMAKE_ARGS+=-DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=$(AMDGPU_TARGETS)
else ifeq ($(BUILD_TYPE),vulkan)
CMAKE_ARGS+=-DGGML_VULKAN=ON -DDA_GGML_VULKAN=ON
else ifeq ($(OS),Darwin)
ifneq ($(BUILD_TYPE),metal)
CMAKE_ARGS+=-DGGML_METAL=OFF
else
CMAKE_ARGS+=-DGGML_METAL=ON
CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
CMAKE_ARGS+=-DDA_GGML_METAL=ON
endif
endif
ifeq ($(BUILD_TYPE),sycl_f16)
CMAKE_ARGS+=-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
-DGGML_SYCL_F16=ON
endif
ifeq ($(BUILD_TYPE),sycl_f32)
CMAKE_ARGS+=-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx
endif
sources/depth-anything.cpp:
mkdir -p sources && \
git clone --recursive $(DEPTHANYTHING_REPO) sources/depth-anything.cpp && \
cd sources/depth-anything.cpp && \
git checkout $(DEPTHANYTHING_VERSION) && \
git submodule update --init --recursive --depth 1 --single-branch
# Detect OS
UNAME_S := $(shell uname -s)
# Only build CPU variants on Linux
ifeq ($(UNAME_S),Linux)
VARIANT_TARGETS = libdepthanythingcpp-avx.so libdepthanythingcpp-avx2.so libdepthanythingcpp-avx512.so libdepthanythingcpp-fallback.so
else
# On non-Linux (e.g., Darwin), build only fallback variant
VARIANT_TARGETS = libdepthanythingcpp-fallback.so
endif
# None of these targets produce a file named after the target. `package` is the
# important one: package.sh creates a package/ directory next to this Makefile,
# and without .PHONY make would compare that directory's timestamp against the
# binary and could skip repackaging.
.PHONY: package build clean purge all test libdepthanythingcpp-custom

# Backend binary. Built with CGO disabled; the native library is loaded at
# runtime (see main.go), so the variant .so files are only prerequisites.
depth-anything-cpp: main.go godepthanythingcpp.go $(VARIANT_TARGETS)
	CGO_ENABLED=0 $(GOCMD) build -tags "$(GO_TAGS)" -o depth-anything-cpp ./

# Assemble the distributable package/ directory.
package: depth-anything-cpp
	bash package.sh

build: package

# Remove every build product, including the fetched upstream sources.
clean: purge
	rm -rf libdepthanythingcpp*.so depth-anything-cpp package sources

# Remove only the per-variant CMake build directories.
purge:
	rm -rf build*
# Build all variants (Linux only)
ifeq ($(UNAME_S),Linux)
libdepthanythingcpp-avx.so: sources/depth-anything.cpp
rm -rfv build-$@
$(info ${GREEN}I depth-anything-cpp build info:avx${RESET})
SO_TARGET=$@ CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) libdepthanythingcpp-custom
rm -rfv build-$@
libdepthanythingcpp-avx2.so: sources/depth-anything.cpp
rm -rfv build-$@
$(info ${GREEN}I depth-anything-cpp build info:avx2${RESET})
SO_TARGET=$@ CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on -DGGML_BMI2=on" $(MAKE) libdepthanythingcpp-custom
rm -rfv build-$@
libdepthanythingcpp-avx512.so: sources/depth-anything.cpp
rm -rfv build-$@
$(info ${GREEN}I depth-anything-cpp build info:avx512${RESET})
SO_TARGET=$@ CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on -DGGML_BMI2=on" $(MAKE) libdepthanythingcpp-custom
rm -rfv build-$@
endif
# Build fallback variant (all platforms)
libdepthanythingcpp-fallback.so: sources/depth-anything.cpp
rm -rfv build-$@
$(info ${GREEN}I depth-anything-cpp build info:fallback${RESET})
SO_TARGET=$@ CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) libdepthanythingcpp-custom
rm -rfv build-$@
libdepthanythingcpp-custom: CMakeLists.txt
mkdir -p build-$(SO_TARGET) && \
cd build-$(SO_TARGET) && \
cmake .. $(CMAKE_ARGS) && \
cmake --build . --config Release -j$(JOBS) && \
cd .. && \
mv build-$(SO_TARGET)/libdepthanything.so ./$(SO_TARGET)
all: depth-anything-cpp package
# `test` is invoked by the top-level Makefile's `test-extra` target. It builds
# the backend binary + the fallback shared library (needed for dlopen at
# runtime), then runs test.sh which downloads a small GGUF + a test image and
# exercises the gRPC Load/Predict wire path via the Go smoke test in
# main_test.go.
test: depth-anything-cpp libdepthanythingcpp-fallback.so
bash test.sh

View File

@@ -1,509 +0,0 @@
package main
// godepthanythingcpp.go - gRPC handlers (Load, Predict, GenerateImage) for the
// depth-anything-cpp backend, wrapping the Depth Anything 3 ggml C-API
// (libdepthanythingcpp-<variant>.so) via purego.
//
// Embeds base.SingleThread to default the unimplemented RPCs to "not supported"
// and to serialize calls — the C side shares a ggml graph allocator and is NOT
// reentrant, so all inference must run one-at-a-time.
//
// Depth has no native OpenAI endpoint, so the model is exposed two ways:
//
// - GenerateImage(src, dst): run depth on the src image and write a
// min-max-normalised grayscale depth PNG to dst.
// - Predict(images[0]): run depth+pose and return a JSON blob with the depth
// dimensions, depth stats and the camera extrinsics (3x4) / intrinsics (3x3).
import (
"encoding/base64"
"encoding/json"
"fmt"
"image"
"image/png"
"math"
"os"
"path/filepath"
"unsafe"
"github.com/mudler/LocalAI/pkg/grpc/base"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
)
// C-API function pointers, registered in main.go via purego. The da_capi_*
// symbols live inside libdepthanything (src/da_capi.cpp) and are re-exported by
// the DA_SHARED build.
var (
// da_capi_load(const char* gguf_path, int n_threads) -> da_ctx* (0 = fail)
CapiLoad func(gguf string, nThreads int32) uintptr
// da_capi_free(da_ctx* ctx) — safe on a 0 handle.
CapiFree func(handle uintptr)
// da_capi_last_error(da_ctx* ctx) -> const char* (owned by ctx, "" if none).
// purego marshals the returned C string into a Go string (a copy), so we
// never free it.
CapiLastError func(handle uintptr) string
// da_capi_depth_path(ctx, image_path, out_h*, out_w*) -> float* depth map
// (row-major H*W); nil on error. Caller frees via da_capi_free_floats.
CapiDepthPath func(handle uintptr, imagePath string, outH *int32, outW *int32) *float32
// da_capi_free_floats(float* p)
CapiFreeFloats func(p *float32)
// da_capi_pose_path(ctx, image_path, out_ext[12], out_intr[9]) -> 0 ok, -1 err
CapiPosePath func(handle uintptr, imagePath string, outExt *float32, outIntr *float32) int32
// da_capi_depth_dense(ctx, image_path, out_h*, out_w*, out_depth**, out_conf**,
// out_sky**, out_ext[12], out_intr[9], out_is_metric*) -> 0 ok, -1 err.
// Each non-NULL out_depth/out_conf/out_sky receives a malloc'd float[H*W] (free
// via da_capi_free_floats); buffers the model doesn't produce are set NULL.
CapiDepthDense func(handle uintptr, imagePath string,
outH, outW *int32,
outDepth, outConf, outSky **float32,
outExt, outIntr *float32,
outIsMetric *int32) int32
// da_capi_points(ctx, image_path, conf_thresh, out_n*, out_xyz**, out_rgb**) ->
// 0 ok, -1 err. *out_xyz = malloc'd float[3*N] (free via da_capi_free_floats),
// *out_rgb = malloc'd uint8[3*N] (free via da_capi_free_bytes).
CapiPoints func(handle uintptr, imagePath string, confThresh float32,
outN *int32, outXyz **float32, outRgb **byte) int32
// da_capi_free_bytes(unsigned char* p)
CapiFreeBytes func(p *byte)
// da_capi_export_glb(ctx, image_path, out_glb) -> 0 ok, -1 err
CapiExportGlb func(handle uintptr, imagePath string, outGlb string) int32
// da_capi_export_colmap(ctx, image_path, out_dir, binary) -> 0 ok, -1 err
CapiExportColmap func(handle uintptr, imagePath string, outDir string, binary int32) int32
)
// DepthAnythingCpp is the backend object registered with the gRPC server. It
// embeds base.SingleThread and carries the native context handle produced by
// CapiLoad.
type DepthAnythingCpp struct {
base.SingleThread
// handle is the value returned by CapiLoad; 0 means no model is loaded.
handle uintptr
}
// Load opens the GGUF model named by opts.ModelFile (falling back to
// opts.Model when ModelFile is empty) and keeps the resulting native context
// handle for later inference calls.
//
// A relative model name is joined onto opts.ModelPath; an absolute one is used
// as-is. The file must exist. A non-positive opts.Threads defaults to 4. Any
// previously loaded model is released before the new one is loaded.
func (r *DepthAnythingCpp) Load(opts *pb.ModelOptions) error {
	name := opts.ModelFile
	if name == "" {
		name = opts.Model
	}
	if name == "" {
		return fmt.Errorf("depth-anything-cpp: ModelFile is empty")
	}

	resolved := name
	if !filepath.IsAbs(name) {
		resolved = filepath.Join(opts.ModelPath, name)
	}
	if _, statErr := os.Stat(resolved); statErr != nil {
		return fmt.Errorf("depth-anything-cpp: model file not found: %s: %w", resolved, statErr)
	}

	nThreads := opts.Threads
	if nThreads <= 0 {
		nThreads = 4
	}

	// Re-Load: drop the old context first so only one model is held at a time.
	if r.handle != 0 {
		CapiFree(r.handle)
		r.handle = 0
	}

	ctx := CapiLoad(resolved, nThreads)
	if ctx != 0 {
		r.handle = ctx
		return nil
	}

	// There is no context to query after a failed load, so the error text is
	// best-effort: ask with a null handle and fall back to a generic message.
	if detail := CapiLastError(0); detail != "" {
		return fmt.Errorf("depth-anything-cpp: da_capi_load failed for %s: %s", resolved, detail)
	}
	return fmt.Errorf("depth-anything-cpp: da_capi_load failed for %s", resolved)
}
// depthResult is the JSON payload returned by Predict.
type depthResult struct {
// Depth map dimensions as reported by runDepthPose.
DepthW int `json:"depth_w"`
DepthH int `json:"depth_h"`
// Smallest and largest depth values as computed by minMax.
DepthMin float32 `json:"depth_min"`
DepthMax float32 `json:"depth_max"`
Extrinsics [12]float32 `json:"extrinsics"` // 3x4 row-major
Intrinsics [9]float32 `json:"intrinsics"` // 3x3 row-major
}
// Predict estimates depth and camera pose for the first entry of opts.Images
// and returns the outcome as a JSON-encoded depthResult (depth dimensions,
// depth min/max, extrinsics and intrinsics). The image entry may be either a
// filesystem path or a base64 payload; see materializeImage.
//
// An error is returned when no image is supplied, when the image cannot be
// materialised, when inference fails, or when the result cannot be marshalled.
func (r *DepthAnythingCpp) Predict(opts *pb.PredictOptions) (string, error) {
	images := opts.GetImages()
	if len(images) == 0 {
		return "", fmt.Errorf("depth-anything-cpp: Predict requires an image in Images[]")
	}

	path, release, matErr := materializeImage(images[0])
	if matErr != nil {
		return "", fmt.Errorf("depth-anything-cpp: %w", matErr)
	}
	defer release()

	depthMap, rows, cols, extrinsics, intrinsics, runErr := r.runDepthPose(path)
	if runErr != nil {
		return "", runErr
	}

	result := depthResult{
		DepthW:     cols,
		DepthH:     rows,
		Extrinsics: extrinsics,
		Intrinsics: intrinsics,
	}
	result.DepthMin, result.DepthMax = minMax(depthMap)

	encoded, encErr := json.Marshal(result)
	if encErr != nil {
		return "", fmt.Errorf("depth-anything-cpp: marshal: %w", encErr)
	}
	return string(encoded), nil
}
// GenerateImage estimates depth for the image named by req.Src and writes the
// normalised grayscale depth PNG to req.Dst. Both fields are required; Src may
// be a filesystem path or a base64 payload (see materializeImage).
func (r *DepthAnythingCpp) GenerateImage(req *pb.GenerateImageRequest) error {
	source, target := req.GetSrc(), req.GetDst()
	switch {
	case source == "":
		return fmt.Errorf("depth-anything-cpp: GenerateImage requires src")
	case target == "":
		return fmt.Errorf("depth-anything-cpp: GenerateImage requires dst")
	}

	path, release, matErr := materializeImage(source)
	if matErr != nil {
		return fmt.Errorf("depth-anything-cpp: %w", matErr)
	}
	defer release()

	// The pose outputs are not needed for the PNG and are discarded.
	depthMap, rows, cols, _, _, runErr := r.runDepthPose(path)
	if runErr != nil {
		return runErr
	}
	return writeDepthPNG(target, depthMap, rows, cols)
}
// Depth is the typed Depth RPC. It runs the Depth Anything 3 pipeline on the
// request's src image and fills a DepthResponse honoring the include_* flags and
// exports: per-pixel metric depth + confidence (DualDPT) or depth + sky (mono),
// camera extrinsics/intrinsics, an optional back-projected 3D point cloud and
// glb/COLMAP exports. The src may be a filesystem path or a base64 payload.
//
// It returns an error when no model is loaded, when src is empty or cannot be
// materialised, or when any of the native calls reports a non-zero status.
func (r *DepthAnythingCpp) Depth(in *pb.DepthRequest) (pb.DepthResponse, error) {
// Accumulate into locals and return a single composite literal at the end:
// returning a named pb.DepthResponse value would copy its embedded mutex
// (go vet copylocks).
if r.handle == 0 {
return pb.DepthResponse{}, fmt.Errorf("depth-anything-cpp: model not loaded")
}
if in.GetSrc() == "" {
return pb.DepthResponse{}, fmt.Errorf("depth-anything-cpp: Depth requires src")
}
imgPath, cleanup, err := materializeImage(in.GetSrc())
if err != nil {
return pb.DepthResponse{}, fmt.Errorf("depth-anything-cpp: %w", err)
}
defer cleanup()
// Dense per-pixel output + pose. Pass buffer pointers only for the
// requested maps so the native side can skip unrequested work; ext/intr
// must always point at 12/9 floats per the C ABI.
var (
h, w, isMetric int32
depthPtr, confPtr *float32
skyPtr *float32
ext [12]float32
intr [9]float32
pDepth, pConf, pSky **float32
)
// pDepth/pConf/pSky stay nil unless the matching include_* flag is set.
if in.GetIncludeDepth() {
pDepth = &depthPtr
}
if in.GetIncludeConfidence() {
pConf = &confPtr
}
if in.GetIncludeSky() {
pSky = &skyPtr
}
rc := CapiDepthDense(r.handle, imgPath, &h, &w, pDepth, pConf, pSky, &ext[0], &intr[0], &isMetric)
if rc != 0 {
return pb.DepthResponse{}, fmt.Errorf("depth-anything-cpp: da_capi_depth_dense failed (rc=%d): %s", rc, r.lastError())
}
// Element count of each dense map (H*W).
n := int(h) * int(w)
var (
depth, conf, sky []float32
extrinsics, intrinsic []float32
numPoints int32
points []float32
pointColors []byte
exportPaths []string
)
// Copy every buffer the native side returned into Go memory, then release
// the native allocation straight away.
if depthPtr != nil {
depth = copyFloats(depthPtr, n)
CapiFreeFloats(depthPtr)
}
if confPtr != nil {
conf = copyFloats(confPtr, n)
CapiFreeFloats(confPtr)
}
if skyPtr != nil {
sky = copyFloats(skyPtr, n)
CapiFreeFloats(skyPtr)
}
// The pose arrays are always filled by the native call but only reported
// when requested.
if in.GetIncludePose() {
extrinsics = append([]float32(nil), ext[:]...)
intrinsic = append([]float32(nil), intr[:]...)
}
// 3D point cloud (DualDPT / pose-capable models only).
if in.GetIncludePoints() {
var (
np int32
xyzPtr *float32
rgbPtr *byte
)
if rc := CapiPoints(r.handle, imgPath, in.GetPointsConfThresh(), &np, &xyzPtr, &rgbPtr); rc != 0 {
return pb.DepthResponse{}, fmt.Errorf("depth-anything-cpp: da_capi_points failed (rc=%d): %s", rc, r.lastError())
}
numPoints = np
// Three components per point for both the xyz and the rgb buffer.
if xyzPtr != nil {
points = copyFloats(xyzPtr, int(np)*3)
CapiFreeFloats(xyzPtr)
}
if rgbPtr != nil {
pointColors = copyBytes(rgbPtr, int(np)*3)
CapiFreeBytes(rgbPtr)
}
}
// Exports (glb / colmap). They are written under in.Dst (a directory); a
// temp dir is used when Dst is empty.
if len(in.GetExports()) > 0 {
exportPaths, err = r.runExports(imgPath, in.GetDst(), in.GetExports())
if err != nil {
return pb.DepthResponse{}, err
}
}
return pb.DepthResponse{
Width: w,
Height: h,
Depth: depth,
Confidence: conf,
Sky: sky,
Extrinsics: extrinsics,
Intrinsics: intrinsic,
NumPoints: numPoints,
Points: points,
PointColors: pointColors,
ExportPaths: exportPaths,
IsMetric: isMetric != 0,
}, nil
}
// runExports produces each requested export of imgPath inside dstDir and
// returns the paths it wrote, in request order. Recognised export names are
// "glb" (written to <dstDir>/pointcloud.glb) and "colmap" (written to the
// <dstDir>/colmap directory). When dstDir is empty a fresh temporary directory
// is created; otherwise dstDir is created if missing. The first failing or
// unknown export aborts the run with an error.
func (r *DepthAnythingCpp) runExports(imgPath, dstDir string, exports []string) ([]string, error) {
	if dstDir != "" {
		if err := os.MkdirAll(dstDir, 0o755); err != nil {
			return nil, fmt.Errorf("depth-anything-cpp: mkdir %s: %w", dstDir, err)
		}
	} else {
		scratch, err := os.MkdirTemp("", "depth-anything-export-*")
		if err != nil {
			return nil, fmt.Errorf("depth-anything-cpp: mkdir export dir: %w", err)
		}
		dstDir = scratch
	}

	var written []string
	for _, kind := range exports {
		var target string
		if kind == "glb" {
			target = filepath.Join(dstDir, "pointcloud.glb")
			if rc := CapiExportGlb(r.handle, imgPath, target); rc != 0 {
				return nil, fmt.Errorf("depth-anything-cpp: da_capi_export_glb failed (rc=%d): %s", rc, r.lastError())
			}
		} else if kind == "colmap" {
			target = filepath.Join(dstDir, "colmap")
			if err := os.MkdirAll(target, 0o755); err != nil {
				return nil, fmt.Errorf("depth-anything-cpp: mkdir %s: %w", target, err)
			}
			// The final argument selects the binary COLMAP format.
			if rc := CapiExportColmap(r.handle, imgPath, target, 1); rc != 0 {
				return nil, fmt.Errorf("depth-anything-cpp: da_capi_export_colmap failed (rc=%d): %s", rc, r.lastError())
			}
		} else {
			return nil, fmt.Errorf("depth-anything-cpp: unknown export %q (want glb|colmap)", kind)
		}
		written = append(written, target)
	}
	return written, nil
}
// copyFloats duplicates the n float32 values starting at p into a newly
// allocated Go slice, so the caller may free the source buffer afterwards.
// It returns nil when p is nil or n is not positive.
func copyFloats(p *float32, n int) []float32 {
	if p != nil && n > 0 {
		dup := make([]float32, n)
		copy(dup, unsafe.Slice(p, n))
		return dup
	}
	return nil
}
// copyBytes duplicates the n bytes starting at p into a newly allocated Go
// slice. It returns nil when p is nil or n is not positive.
func copyBytes(p *byte, n int) []byte {
	if p != nil && n > 0 {
		dup := make([]byte, n)
		copy(dup, unsafe.Slice(p, n))
		return dup
	}
	return nil
}
// runDepthPose performs depth estimation followed by pose recovery on the
// image at imagePath.
//
// It returns the row-major depth map (length h*w), the map's height and width,
// the 3x4 extrinsics (12 floats) and the 3x3 intrinsics (9 floats). If the
// pose step fails, the depth map and dimensions obtained so far are still
// returned together with the error.
func (r *DepthAnythingCpp) runDepthPose(imagePath string) (depth []float32, h, w int, ext [12]float32, intr [9]float32, err error) {
	if r.handle == 0 {
		return nil, 0, 0, ext, intr, fmt.Errorf("depth-anything-cpp: model not loaded")
	}

	var rows, cols int32
	buf := CapiDepthPath(r.handle, imagePath, &rows, &cols)
	if buf == nil {
		return nil, 0, 0, ext, intr, fmt.Errorf("depth-anything-cpp: da_capi_depth_path failed: %s", r.lastError())
	}

	h, w = int(rows), int(cols)
	if count := h * w; count > 0 {
		depth = make([]float32, count)
		copy(depth, unsafe.Slice(buf, count))
	}
	// The native buffer is released whether or not anything was copied.
	CapiFreeFloats(buf)

	if rc := CapiPosePath(r.handle, imagePath, &ext[0], &intr[0]); rc != 0 {
		err = fmt.Errorf("depth-anything-cpp: da_capi_pose_path failed (rc=%d): %s", rc, r.lastError())
	}
	return depth, h, w, ext, intr, err
}
// lastError reports the most recent error text recorded on the loaded context.
// It yields "" when the CapiLastError binding has not been registered or when
// no model is loaded.
func (r *DepthAnythingCpp) lastError() string {
	if CapiLastError != nil && r.handle != 0 {
		return CapiLastError(r.handle)
	}
	return ""
}
// materializeImage returns a filesystem path for an image argument that may be
// either an existing path or a base64-encoded payload.
//
// If arg names something that exists on disk it is returned unchanged and
// cleanup is a no-op. Otherwise arg is treated as standard base64, optionally
// wrapped in a data URL (data:image/...;base64,<payload>), decoded, and written
// to a temp file; cleanup then removes that file. cleanup is never nil, so
// callers can defer it unconditionally once err has been checked.
//
// An error is returned when arg is neither an existing path nor valid base64,
// or when the temp file cannot be created, written or closed.
func materializeImage(arg string) (path string, cleanup func(), err error) {
	cleanup = func() {}
	if _, statErr := os.Stat(arg); statErr == nil {
		return arg, cleanup, nil
	}

	// Strip an optional data URL prefix (data:image/...;base64,<payload>).
	b64 := arg
	if i := indexComma(b64); i >= 0 && hasDataPrefix(b64) {
		b64 = b64[i+1:]
	}
	data, decErr := base64.StdEncoding.DecodeString(b64)
	if decErr != nil {
		return "", cleanup, fmt.Errorf("image is neither an existing path nor valid base64: %v", decErr)
	}

	f, tErr := os.CreateTemp("", "depth-anything-*.img")
	if tErr != nil {
		return "", cleanup, tErr
	}
	name := f.Name()

	// Close can be where a deferred write failure finally surfaces, so its
	// error is treated like a write error rather than ignored: a truncated
	// image must not be handed to the native decoder.
	_, wErr := f.Write(data)
	if cErr := f.Close(); wErr == nil {
		wErr = cErr
	}
	if wErr != nil {
		_ = os.Remove(name)
		return "", cleanup, wErr
	}
	return name, func() { _ = os.Remove(name) }, nil
}
// hasDataPrefix reports whether s begins with the data URL scheme "data:".
func hasDataPrefix(s string) bool {
	const scheme = "data:"
	if len(s) < len(scheme) {
		return false
	}
	return s[:len(scheme)] == scheme
}
// indexComma returns the byte offset of the first ',' in s, or -1 if s
// contains none.
func indexComma(s string) int {
	// Ranging over a string yields the byte offset at which each rune starts.
	// ',' is a single-byte ASCII rune and never occurs inside a multi-byte
	// sequence, so this offset equals the one a byte-wise scan would find.
	for offset, r := range s {
		if r == ',' {
			return offset
		}
	}
	return -1
}
// writeDepthPNG min-max normalises a depth map and writes it to dst as an
// 8-bit grayscale PNG.
//
// The smallest value reported by minMax maps to 0 (black) and the largest to
// 255 (white), so larger values are brighter. For inverse-depth-like outputs
// that means near = bright, far = dark; for plain depth the sense is reversed.
// NaN pixels are rendered as 0, and a zero, negative or NaN value range falls
// back to a span of 1.
//
// depth must hold at least h*w row-major values and both dimensions must be
// positive, otherwise an error is returned. Errors from creating, encoding or
// closing the output file are returned as well.
func writeDepthPNG(dst string, depth []float32, h, w int) error {
	if h <= 0 || w <= 0 || len(depth) < h*w {
		return fmt.Errorf("depth-anything-cpp: writeDepthPNG: bad dims h=%d w=%d len=%d", h, w, len(depth))
	}
	dmin, dmax := minMax(depth)
	span := dmax - dmin
	if span <= 0 || math.IsNaN(float64(span)) {
		span = 1
	}
	img := image.NewGray(image.Rect(0, 0, w, h))
	for y := 0; y < h; y++ {
		for x := 0; x < w; x++ {
			v := depth[y*w+x]
			n := (v - dmin) / span // 0..1
			if math.IsNaN(float64(n)) {
				n = 0
			}
			if n < 0 {
				n = 0
			} else if n > 1 {
				n = 1
			}
			img.Pix[y*img.Stride+x] = uint8(n * 255)
		}
	}
	f, err := os.Create(dst)
	if err != nil {
		return err
	}
	if err := png.Encode(f, img); err != nil {
		_ = f.Close()
		return err
	}
	// Return the Close error: for a file that was just written it can be the
	// only place a failed write is reported.
	return f.Close()
}
func minMax(v []float32) (mn, mx float32) {
if len(v) == 0 {
return 0, 0
}
mn, mx = v[0], v[0]
for _, x := range v {
if math.IsNaN(float64(x)) || math.IsInf(float64(x), 0) {
continue
}
if x < mn {
mn = x
}
if x > mx {
mx = x
}
}
return mn, mx
}

View File

@@ -1,61 +0,0 @@
package main
// main.go - entry point for the depth-anything-cpp gRPC backend.
//
// Dlopens libdepthanythingcpp-<variant>.so via purego at the path in
// DEPTHANYTHING_LIBRARY (set by run.sh based on /proc/cpuinfo), registers the
// da_capi_* C ABI symbols, then starts the gRPC server.
import (
"flag"
"os"
"github.com/ebitengine/purego"
grpc "github.com/mudler/LocalAI/pkg/grpc"
)
var (
addr = flag.String("addr", "localhost:50051", "the address to connect to")
)
// LibFuncs pairs a Go function variable with the name of the native symbol it
// is bound to by purego.RegisterLibFunc in main.
type LibFuncs struct {
// FuncPtr is a pointer to the Go function variable to fill in (e.g. &CapiLoad).
FuncPtr any
// Name is the exported symbol name looked up in the loaded library.
Name string
}
// main loads the native depth-anything library, binds the da_capi_* symbols to
// the package-level Capi* function variables and then serves the gRPC backend
// on the address given by -addr. Any failure along the way panics.
func main() {
	// The library path comes from DEPTHANYTHING_LIBRARY; without it the
	// fallback variant in the working directory is used.
	libPath := os.Getenv("DEPTHANYTHING_LIBRARY")
	if libPath == "" {
		libPath = "./libdepthanythingcpp-fallback.so"
	}

	handle, err := purego.Dlopen(libPath, purego.RTLD_NOW|purego.RTLD_GLOBAL)
	if err != nil {
		panic(err)
	}

	bindings := []LibFuncs{
		{FuncPtr: &CapiLoad, Name: "da_capi_load"},
		{FuncPtr: &CapiFree, Name: "da_capi_free"},
		{FuncPtr: &CapiLastError, Name: "da_capi_last_error"},
		{FuncPtr: &CapiDepthPath, Name: "da_capi_depth_path"},
		{FuncPtr: &CapiFreeFloats, Name: "da_capi_free_floats"},
		{FuncPtr: &CapiPosePath, Name: "da_capi_pose_path"},
		{FuncPtr: &CapiDepthDense, Name: "da_capi_depth_dense"},
		{FuncPtr: &CapiPoints, Name: "da_capi_points"},
		{FuncPtr: &CapiFreeBytes, Name: "da_capi_free_bytes"},
		{FuncPtr: &CapiExportGlb, Name: "da_capi_export_glb"},
		{FuncPtr: &CapiExportColmap, Name: "da_capi_export_colmap"},
	}
	for _, binding := range bindings {
		purego.RegisterLibFunc(binding.FuncPtr, handle, binding.Name)
	}

	flag.Parse()

	if serveErr := grpc.StartServer(*addr, &DepthAnythingCpp{}); serveErr != nil {
		panic(serveErr)
	}
}

View File

@@ -1,167 +0,0 @@
package main
// main_test.go - end-to-end smoke test for the depth-anything-cpp gRPC backend.
//
// Spawns the compiled depth-anything-cpp binary on a free local port, dials it
// via gRPC, and exercises LoadModel + Predict against the test fixtures
// downloaded by test.sh: the small (vits) f32 GGUF of Depth Anything 3 and a
// real photo. Asserts that Predict returns a JSON payload with a positive
// depth-map width/height.
//
// The spec Skip()s cleanly if its fixtures (the model, the test image, the
// built binary, or the fallback .so) are missing, so the test target stays
// usable on a fresh checkout / on CI runners where the model hasn't been
// downloaded.
import (
"context"
"encoding/base64"
"encoding/json"
"fmt"
"net"
"os"
"os/exec"
"path/filepath"
"testing"
"time"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials/insecure"
)
// TestDepth is the `go test` entry point: it installs Ginkgo's Fail as the
// Gomega failure handler and runs the specs declared in this package.
func TestDepth(t *testing.T) {
RegisterFailHandler(Fail)
RunSpecs(t, "depth-anything-cpp backend smoke suite")
}
// freePort asks the kernel for an ephemeral TCP port on 127.0.0.1, closes the
// listener again and returns the port number so the spawned backend can bind
// it. Another process could claim the port in between; that small race is
// accepted for a smoke test.
func freePort() int {
	listener, err := net.Listen("tcp", "127.0.0.1:0")
	Expect(err).ToNot(HaveOccurred(), "freePort listen")
	tcpAddr := listener.Addr().(*net.TCPAddr)
	Expect(listener.Close()).To(Succeed())
	return tcpAddr.Port
}
// startBackend spawns the depth-anything-cpp binary on the given port and waits
// until it accepts TCP connections (up to 10s). It mirrors how main.go resolves
// the purego library: the DEPTHANYTHING_LIBRARY env var points the dlopen at the
// freshly built fallback .so. The returned cleanup func kills the process.
//
// The current spec is skipped when the binary or the fallback library has not
// been built, and failed when the backend does not become reachable in time.
func startBackend(port int) func() {
binary, err := filepath.Abs("./depth-anything-cpp")
Expect(err).ToNot(HaveOccurred())
if _, err := os.Stat(binary); err != nil {
Skip(fmt.Sprintf("backend binary not built: %s (run `make depth-anything-cpp` first)", binary))
}
libPath, err := filepath.Abs("./libdepthanythingcpp-fallback.so")
Expect(err).ToNot(HaveOccurred())
if _, err := os.Stat(libPath); err != nil {
Skip(fmt.Sprintf("fallback library not built: %s (run `make libdepthanythingcpp-fallback.so` first)", libPath))
}
addr := fmt.Sprintf("127.0.0.1:%d", port)
cmd := exec.Command(binary, "--addr", addr)
cmd.Env = append(os.Environ(), "DEPTHANYTHING_LIBRARY="+libPath)
// Send the backend's stdout and stderr to this process's stderr.
cmd.Stdout = os.Stderr
cmd.Stderr = os.Stderr
Expect(cmd.Start()).To(Succeed())
// cleanup kills the child and waits on it.
cleanup := func() {
if cmd.Process != nil {
_ = cmd.Process.Kill()
_, _ = cmd.Process.Wait()
}
}
// Poll every 200ms until a TCP connect succeeds or the 10s deadline passes.
deadline := time.Now().Add(10 * time.Second)
for time.Now().Before(deadline) {
c, err := net.DialTimeout("tcp", addr, 200*time.Millisecond)
if err == nil {
_ = c.Close()
return cleanup
}
time.Sleep(200 * time.Millisecond)
}
// Not ready in time: stop the child before reporting the failure.
cleanup()
Fail(fmt.Sprintf("backend did not become ready on %s within 10s", addr))
return func() {}
}
// loadTestImage returns the base64 encoding of test-data/test.jpg (the image
// fetched by test.sh), which is one of the wire formats Predict accepts. The
// current spec is skipped when the file cannot be read.
func loadTestImage() string {
	abs, err := filepath.Abs("test-data/test.jpg")
	Expect(err).ToNot(HaveOccurred())

	raw, readErr := os.ReadFile(abs)
	if readErr != nil {
		Skip(fmt.Sprintf("test image not present: %s (run test.sh first)", abs))
	}
	return base64.StdEncoding.EncodeToString(raw)
}
// dialBackend creates an insecure (plaintext) gRPC client for the backend
// listening on 127.0.0.1:port. It returns the Backend client together with a
// function that closes the underlying connection.
func dialBackend(port int) (pb.BackendClient, func()) {
	conn, err := grpc.NewClient(
		fmt.Sprintf("127.0.0.1:%d", port),
		grpc.WithTransportCredentials(insecure.NewCredentials()),
	)
	Expect(err).ToNot(HaveOccurred())

	closeConn := func() { _ = conn.Close() }
	return pb.NewBackendClient(conn), closeConn
}
// modelPathOrSkip returns the absolute path of the named model file inside
// ./test-models/. If the file is absent (a fresh checkout, or a CI runner that
// never downloaded it) the current spec is skipped instead of failing.
func modelPathOrSkip(name string) string {
	root, err := filepath.Abs("test-models")
	Expect(err).ToNot(HaveOccurred())

	candidate := filepath.Join(root, name)
	_, statErr := os.Stat(candidate)
	if statErr != nil {
		Skip(fmt.Sprintf("model not present: %s (run test.sh first)", candidate))
	}
	return candidate
}
// End-to-end spec: start the backend, load the small f32 model, run Predict on
// the base64 test image and check the returned JSON reports positive depth-map
// dimensions.
var _ = Describe("depth-anything-cpp backend", func() {
It("runs depth+pose against a known-good image", func() {
// Resolve the fixtures first; either helper skips the spec when its
// fixture is missing.
modelPath := modelPathOrSkip("depth-anything-small-f32.gguf")
imgB64 := loadTestImage()
port := freePort()
cleanup := startBackend(port)
defer cleanup()
client, closeConn := dialBackend(port)
defer closeConn()
// One 20-minute deadline covers both the LoadModel and the Predict call.
ctx, cancel := context.WithTimeout(context.Background(), 20*time.Minute)
defer cancel()
loadResp, err := client.LoadModel(ctx, &pb.ModelOptions{
Model: "depth-anything-small-f32.gguf",
ModelFile: modelPath,
Threads: 4,
})
Expect(err).ToNot(HaveOccurred(), "LoadModel")
Expect(loadResp.GetSuccess()).To(BeTrue(), "LoadModel reported failure: %s", loadResp.GetMessage())
// Predict runs depth+pose and returns the JSON depthResult in Reply.Message.
reply, err := client.Predict(ctx, &pb.PredictOptions{
Images: []string{imgB64},
})
Expect(err).ToNot(HaveOccurred(), "Predict")
var res depthResult
Expect(json.Unmarshal(reply.GetMessage(), &res)).To(Succeed(), "Predict returned non-JSON: %q", string(reply.GetMessage()))
Expect(res.DepthW).To(BeNumerically(">", 0), "depth width should be positive")
Expect(res.DepthH).To(BeNumerically(">", 0), "depth height should be positive")
// Log the reported dimensions and depth range to the Ginkgo writer.
_, _ = fmt.Fprintf(GinkgoWriter, "depth OK: %dx%d min=%.3f max=%.3f\n",
res.DepthW, res.DepthH, res.DepthMin, res.DepthMax)
})
})

View File

@@ -1,59 +0,0 @@
#!/bin/bash
# Assemble the depth-anything-cpp package directory: the backend binary, the
# per-variant shared libraries, run.sh, and the system runtime libraries that
# match the build host's architecture.
#
# Every path expansion is quoted so the script also works when the checkout
# lives under a directory whose name contains spaces.

set -e

CURDIR=$(dirname "$(realpath "$0")")
REPO_ROOT="${CURDIR}/../../.."
PKG_DIR="$CURDIR/package"
PKG_LIB_DIR="$PKG_DIR/lib"

# copy_system_libs LOADER LIBDIR
# Copies the dynamic loader to package/lib/ld.so and the runtime libraries
# found in LIBDIR into package/lib, following symlinks.
copy_system_libs() {
    local loader="$1"
    local libdir="$2"
    local lib

    cp -arfLv "$loader" "$PKG_LIB_DIR/ld.so"
    for lib in libc.so.6 libgcc_s.so.1 libstdc++.so.6 libm.so.6 \
               libgomp.so.1 libdl.so.2 librt.so.1 libpthread.so.0; do
        cp -arfLv "$libdir/$lib" "$PKG_LIB_DIR/$lib"
    done
}

# Create lib directory
mkdir -p "$PKG_LIB_DIR"

# The glob must stay outside the quotes so the shell expands it.
cp -avf "$CURDIR"/libdepthanythingcpp-*.so "$PKG_DIR/"
cp -avf "$CURDIR/depth-anything-cpp" "$PKG_DIR/"
cp -fv "$CURDIR/run.sh" "$PKG_DIR/"

# Detect architecture and copy appropriate libraries
if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
    # x86_64 architecture
    echo "Detected x86_64 architecture, copying x86_64 libraries..."
    copy_system_libs /lib64/ld-linux-x86-64.so.2 /lib/x86_64-linux-gnu
elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
    # ARM64 architecture
    echo "Detected ARM64 architecture, copying ARM64 libraries..."
    copy_system_libs /lib/ld-linux-aarch64.so.1 /lib/aarch64-linux-gnu
elif [ "$(uname -s)" = "Darwin" ]; then
    # No system libraries are bundled on Darwin.
    echo "Detected Darwin"
else
    echo "Error: Could not detect architecture"
    exit 1
fi

# Package GPU libraries based on BUILD_TYPE
GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
if [ -f "$GPU_LIB_SCRIPT" ]; then
    echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
    source "$GPU_LIB_SCRIPT" "$PKG_LIB_DIR"
    package_gpu_libs
fi

echo "Packaging completed successfully"
ls -liah "$PKG_DIR/"
ls -liah "$PKG_LIB_DIR/"

View File

@@ -1,52 +0,0 @@
#!/bin/bash
# Launcher for the depth-anything-cpp backend. Picks the most capable CPU
# variant of the shared library that is both supported by this CPU and present
# next to the script, exports its path as DEPTHANYTHING_LIBRARY, and execs the
# backend binary (through the bundled loader when one was packaged).
set -ex

# Get the absolute current dir where the script is located
CURDIR=$(dirname "$(realpath "$0")")

cd /

echo "CPU info:"
if [ "$(uname)" != "Darwin" ]; then
    grep -e "model\sname" /proc/cpuinfo | head -1
    grep -e "flags" /proc/cpuinfo | head -1
fi

LIBRARY="$CURDIR/libdepthanythingcpp-fallback.so"

# Later checks override earlier ones, so the preference order is
# avx512 > avx2 > avx > fallback.
if [ "$(uname)" != "Darwin" ]; then
    if grep -q -e "\savx\s" /proc/cpuinfo ; then
        echo "CPU: AVX found OK"
        if [ -e "$CURDIR/libdepthanythingcpp-avx.so" ]; then
            LIBRARY="$CURDIR/libdepthanythingcpp-avx.so"
        fi
    fi

    if grep -q -e "\savx2\s" /proc/cpuinfo ; then
        echo "CPU: AVX2 found OK"
        if [ -e "$CURDIR/libdepthanythingcpp-avx2.so" ]; then
            LIBRARY="$CURDIR/libdepthanythingcpp-avx2.so"
        fi
    fi

    # Check avx 512
    if grep -q -e "\savx512f\s" /proc/cpuinfo ; then
        echo "CPU: AVX512F found OK"
        if [ -e "$CURDIR/libdepthanythingcpp-avx512.so" ]; then
            LIBRARY="$CURDIR/libdepthanythingcpp-avx512.so"
        fi
    fi
fi

# Append the existing LD_LIBRARY_PATH only when it is non-empty: a trailing ':'
# is an empty path element, which the dynamic loader reads as "current
# directory".
export LD_LIBRARY_PATH="$CURDIR/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
export DEPTHANYTHING_LIBRARY="$LIBRARY"

# If there is a lib/ld.so, use it
if [ -f "$CURDIR/lib/ld.so" ]; then
    echo "Using lib/ld.so"
    echo "Using library: $LIBRARY"
    exec "$CURDIR/lib/ld.so" "$CURDIR/depth-anything-cpp" "$@"
fi

echo "Using library: $LIBRARY"
exec "$CURDIR/depth-anything-cpp" "$@"

View File

@@ -1,45 +0,0 @@
#!/bin/bash
# Fetches the fixtures for the depth-anything-cpp smoke test (a small GGUF
# model and a test photo) and then runs the Go test suite in this directory.
set -e

CURDIR=$(dirname "$(realpath "$0")")

echo "Running depth-anything-cpp backend tests..."

# fetch URL DEST [extra curl args...]
# Downloads URL into DEST.part and renames it to DEST only after curl
# succeeded. A half-finished download therefore never passes for a complete
# fixture, and the leftover .part file lets `-C -` resume on the next run.
# --fail turns HTTP errors into a non-zero exit instead of saving the error
# page as the fixture.
fetch() {
    local url="$1"
    local dest="$2"
    shift 2
    curl --fail -L "$@" -o "$dest.part" "$url" --progress-bar
    mv "$dest.part" "$dest"
}

# Test model from the mudler/depth-anything.cpp-gguf HuggingFace repo. The small
# (vits) f32 GGUF is the lightest backbone (~131 MB), so it keeps the download
# cheap. It is skipped entirely if already present.
DEPTHANYTHING_MODEL_DIR="${DEPTHANYTHING_MODEL_DIR:-$CURDIR/test-models}"
DEPTHANYTHING_MODEL_FILE="${DEPTHANYTHING_MODEL_FILE:-depth-anything-small-f32.gguf}"
DEPTHANYTHING_MODEL_URL="${DEPTHANYTHING_MODEL_URL:-https://huggingface.co/mudler/depth-anything.cpp-gguf/resolve/main/depth-anything-small-f32.gguf}"

mkdir -p "$DEPTHANYTHING_MODEL_DIR"

if [ ! -f "$DEPTHANYTHING_MODEL_DIR/$DEPTHANYTHING_MODEL_FILE" ]; then
    echo "Downloading depth-anything small f32 model (~131 MB)..."
    # -C - resumes a partial download so an interrupted run doesn't restart from 0.
    fetch "$DEPTHANYTHING_MODEL_URL" "$DEPTHANYTHING_MODEL_DIR/$DEPTHANYTHING_MODEL_FILE" -C -
fi

# Use a real photo (people + cars) from the upstream rf-detr.cpp repo (~46 KB).
# Depth estimation needs real content; a synthetic image would be degenerate.
TEST_IMAGE_DIR="$CURDIR/test-data"
TEST_IMAGE_FILE="$TEST_IMAGE_DIR/test.jpg"
TEST_IMAGE_URL="${TEST_IMAGE_URL:-https://raw.githubusercontent.com/mudler/rf-detr.cpp/main/tests/fixtures/ci/test_image.jpg}"

mkdir -p "$TEST_IMAGE_DIR"

if [ ! -f "$TEST_IMAGE_FILE" ]; then
    echo "Downloading test image..."
    fetch "$TEST_IMAGE_URL" "$TEST_IMAGE_FILE"
fi

echo "depth-anything-cpp test setup complete."
echo " model: $DEPTHANYTHING_MODEL_DIR/$DEPTHANYTHING_MODEL_FILE"
echo " test image: $TEST_IMAGE_FILE"

# Run the Go smoke test: spawns the backend binary on a free port, calls
# LoadModel + Predict via gRPC against the downloaded GGUF + image.
echo ""
echo "Running Go smoke test..."
cd "$CURDIR"
go test -v -timeout 30m ./...

View File

@@ -10,7 +10,7 @@ JOBS?=$(shell nproc --ignore=1)
# this on `master` always picks up the latest C-API surface (incl. the
# per-detection accessor functions used by golocateanythingcpp.go).
LOCATEANYTHING_REPO?=https://github.com/mudler/locate-anything.cpp.git
LOCATEANYTHING_VERSION?=92c1682da792c1e8a5dec91acc2be4b02c742ded
LOCATEANYTHING_VERSION?=60e450945476d5e97e0754a8c0e71a9ea81690e0
ifeq ($(NATIVE),false)
CMAKE_ARGS+=-DGGML_NATIVE=OFF

View File

@@ -1,17 +0,0 @@
# Fetched upstream sources
sources/
# CMake build directories
build*/
# Compiled shared libraries
*.so
# Compiled backend binary
omnivoice-cpp
# Packaging output
package/
# Downloaded e2e models
omnivoice-models/

View File

@@ -1,53 +0,0 @@
# Builds the `gomnivoicecpp` loadable module from cpp/gomnivoicecpp.cpp and
# links it against the upstream `omnivoice-core` target, whose sources are
# expected under sources/omnivoice.cpp.
cmake_minimum_required(VERSION 3.14)
project(gomnivoicecpp LANGUAGES C CXX)
# Build everything as PIC so the upstream static objects can be linked into the
# MODULE library defined below.
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(OMNIVOICE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/sources/omnivoice.cpp)
# Override upstream's CMAKE_CUDA_ARCHITECTURES before add_subdirectory.
# A value passed on the command line (-DCMAKE_CUDA_ARCHITECTURES=...) wins
# because the default is only applied when the variable is not defined.
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
set(CMAKE_CUDA_ARCHITECTURES "75-virtual;80-virtual;86-real;89-real")
endif()
# Add the upstream project. Its own CMakeLists adds ggml + builds
# omnivoice-core (STATIC, contains src/omnivoice.cpp i.e. the ov_* impl).
# EXCLUDE_FROM_ALL keeps its CLI tools/tests from building unless referenced.
add_subdirectory(${OMNIVOICE_DIR} omnivoice EXCLUDE_FROM_ALL)
# Upstream generates version.h into its own CMAKE_CURRENT_BINARY_DIR and adds
# the top-level ${CMAKE_BINARY_DIR} to omnivoice-core's include path. When the
# project is nested under add_subdirectory those two directories differ
# (<build>/omnivoice vs <build>), so omnivoice.cpp cannot find version.h. Point
# omnivoice-core at the subproject binary dir where version.h is actually
# generated. (Fix lives here, never in the fetched upstream checkout.)
target_include_directories(omnivoice-core PRIVATE ${CMAKE_BINARY_DIR}/omnivoice)
# MODULE: a dlopen-only library; nothing links against it at build time.
add_library(gomnivoicecpp MODULE cpp/gomnivoicecpp.cpp)
target_link_libraries(gomnivoicecpp PRIVATE omnivoice-core)
target_include_directories(gomnivoicecpp PRIVATE ${OMNIVOICE_DIR}/src)
# SYSTEM keeps the -Wall/-Wextra flags set below from reporting on ggml headers.
target_include_directories(gomnivoicecpp SYSTEM PRIVATE ${OMNIVOICE_DIR}/ggml/include)
# Link GPU backends if the upstream ggml created them.
foreach(backend blas cuda metal vulkan sycl)
if(TARGET ggml-${backend})
target_link_libraries(gomnivoicecpp PRIVATE ggml-${backend})
if(backend STREQUAL "cuda")
# The CUDA runtime is linked only when the toolkit is actually found (QUIET:
# a missing toolkit is not an error here).
find_package(CUDAToolkit QUIET)
if(CUDAToolkit_FOUND)
target_link_libraries(gomnivoicecpp PRIVATE CUDA::cudart)
endif()
endif()
endif()
endforeach()
# Warning flags for the wrapper translation unit only.
if(MSVC)
target_compile_options(gomnivoicecpp PRIVATE /W4 /wd4100 /wd4505)
else()
target_compile_options(gomnivoicecpp PRIVATE -Wall -Wextra
-Wno-unused-parameter -Wno-unused-function)
endif()
set_property(TARGET gomnivoicecpp PROPERTY CXX_STANDARD 17)
# Emit the module directly into the top-level build directory.
set_target_properties(gomnivoicecpp PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})

View File

@@ -1,122 +0,0 @@
# Makefile for the omnivoice-cpp backend: fetches the pinned upstream sources,
# builds one shared library per CPU variant through CMake, then builds the Go
# binary and packages the result.
CMAKE_ARGS?=
BUILD_TYPE?=
NATIVE?=false
GOCMD?=go
GO_TAGS?=
JOBS?=$(shell nproc --ignore=1)
# omnivoice.cpp version
OMNIVOICE_REPO?=https://github.com/ServeurpersoCom/omnivoice.cpp
OMNIVOICE_VERSION?=2603355a5dfacae5cfc33531d5d0933221843509
SO_TARGET?=libgomnivoicecpp.so
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
ifeq ($(NATIVE),false)
CMAKE_ARGS+=-DGGML_NATIVE=OFF
endif
# Translate BUILD_TYPE into the matching ggml CMake switches.
# NOTE(review): the clblas branch passes the placeholder -DCLBlast_DIR=/some/path;
# confirm whether this build type is actually supported.
# NOTE(review): $(OS) is never assigned in this file (UNAME_S is only set
# further down); confirm the Darwin branch is reachable as intended.
ifeq ($(BUILD_TYPE),cublas)
CMAKE_ARGS+=-DGGML_CUDA=ON
else ifeq ($(BUILD_TYPE),openblas)
CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
else ifeq ($(BUILD_TYPE),clblas)
CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
else ifeq ($(BUILD_TYPE),hipblas)
CMAKE_ARGS+=-DGGML_HIPBLAS=ON
else ifeq ($(BUILD_TYPE),vulkan)
CMAKE_ARGS+=-DGGML_VULKAN=ON
else ifeq ($(OS),Darwin)
ifneq ($(BUILD_TYPE),metal)
CMAKE_ARGS+=-DGGML_METAL=OFF
else
CMAKE_ARGS+=-DGGML_METAL=ON
CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
endif
endif
# SYCL builds switch the C/C++ compilers to icx/icpx; f16 additionally enables
# GGML_SYCL_F16.
ifeq ($(BUILD_TYPE),sycl_f16)
CMAKE_ARGS+=-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
-DGGML_SYCL_F16=ON
endif
ifeq ($(BUILD_TYPE),sycl_f32)
CMAKE_ARGS+=-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx
endif
# Fetch upstream at the pinned OMNIVOICE_VERSION (init + fetch + checkout, then
# shallow submodules). The directory doubles as the target, so an existing
# checkout is not refreshed.
sources/omnivoice.cpp:
mkdir -p sources/omnivoice.cpp
cd sources/omnivoice.cpp && \
git init && \
git remote add origin $(OMNIVOICE_REPO) && \
git fetch origin && \
git checkout $(OMNIVOICE_VERSION) && \
git submodule update --init --recursive --depth 1 --single-branch
# Detect OS
UNAME_S := $(shell uname -s)
# Only build CPU variants on Linux
ifeq ($(UNAME_S),Linux)
VARIANT_TARGETS = libgomnivoicecpp-avx.so libgomnivoicecpp-avx2.so libgomnivoicecpp-avx512.so libgomnivoicecpp-fallback.so
else
VARIANT_TARGETS = libgomnivoicecpp-fallback.so
endif
# The Go binary is built with CGO disabled; the variant libraries are listed
# as prerequisites so they exist alongside it.
omnivoice-cpp: main.go gomnivoicecpp.go $(VARIANT_TARGETS)
CGO_ENABLED=0 $(GOCMD) build -tags "$(GO_TAGS)" -o omnivoice-cpp ./
package: omnivoice-cpp
bash package.sh
build: package
# clean removes build outputs and the fetched sources; purge removes only the
# CMake build directories.
clean: purge
rm -rf libgomnivoicecpp*.so package sources/omnivoice.cpp omnivoice-cpp
purge:
rm -rf build*
# Targets run serially: every variant goes through the shared
# libgomnivoicecpp-custom rule below.
.NOTPARALLEL:
# Each variant re-invokes make with its own SO_TARGET and ISA flags, then
# deletes its per-variant build directory.
# NOTE(review): GREEN and RESET are not defined in this file; presumably they
# come from the environment or expand to nothing.
ifeq ($(UNAME_S),Linux)
libgomnivoicecpp-avx.so: sources/omnivoice.cpp
$(info ${GREEN}I omnivoice-cpp build info:avx${RESET})
SO_TARGET=libgomnivoicecpp-avx.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) libgomnivoicecpp-custom
rm -rf build-libgomnivoicecpp-avx.so
libgomnivoicecpp-avx2.so: sources/omnivoice.cpp
$(info ${GREEN}I omnivoice-cpp build info:avx2${RESET})
SO_TARGET=libgomnivoicecpp-avx2.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on -DGGML_BMI2=on" $(MAKE) libgomnivoicecpp-custom
rm -rf build-libgomnivoicecpp-avx2.so
libgomnivoicecpp-avx512.so: sources/omnivoice.cpp
$(info ${GREEN}I omnivoice-cpp build info:avx512${RESET})
SO_TARGET=libgomnivoicecpp-avx512.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on -DGGML_BMI2=on" $(MAKE) libgomnivoicecpp-custom
rm -rf build-libgomnivoicecpp-avx512.so
endif
# Fallback variant: every optional x86 SIMD extension switched off.
libgomnivoicecpp-fallback.so: sources/omnivoice.cpp
$(info ${GREEN}I omnivoice-cpp build info:fallback${RESET})
SO_TARGET=libgomnivoicecpp-fallback.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) libgomnivoicecpp-custom
rm -rf build-libgomnivoicecpp-fallback.so
# Shared CMake configure + build step; moves the produced module to
# ./$(SO_TARGET).
libgomnivoicecpp-custom: CMakeLists.txt cpp/gomnivoicecpp.cpp cpp/gomnivoicecpp.h
mkdir -p build-$(SO_TARGET) && \
cd build-$(SO_TARGET) && \
cmake .. $(CMAKE_ARGS) && \
cmake --build . --config Release -j$(JOBS) --target gomnivoicecpp && \
cd .. && \
mv build-$(SO_TARGET)/libgomnivoicecpp.so ./$(SO_TARGET)
test: omnivoice-cpp
@echo "Running omnivoice-cpp tests..."
bash test.sh
@echo "omnivoice-cpp tests completed."
all: omnivoice-cpp package

View File

@@ -1,129 +0,0 @@
package main
import (
"bytes"
"encoding/binary"
"fmt"
"os"
"runtime"
"github.com/go-audio/audio"
"github.com/go-audio/wav"
)
const omnivoiceSampleRate = 24000
// wavHeader24k builds the 44-byte RIFF/WAVE preamble for a 24 kHz, mono,
// 16-bit PCM stream of unknown length. Both size fields carry the 0xFFFFFFFF
// placeholder because the total length is not known when streaming starts.
// TTSStream sends it as the first chunk so the HTTP layer receives a
// self-describing WAV (the gRPC TTSStream path never sets Message, so the
// backend owns the header - see core/backend/tts.go:ModelTTSStream).
func wavHeader24k() []byte {
	// Field order and widths mirror the canonical PCM WAV header; the whole
	// struct is serialized in one little-endian write.
	header := struct {
		ChunkID       [4]byte
		ChunkSize     uint32
		Format        [4]byte
		Subchunk1ID   [4]byte
		Subchunk1Size uint32
		AudioFormat   uint16
		NumChannels   uint16
		SampleRate    uint32
		ByteRate      uint32
		BlockAlign    uint16
		BitsPerSample uint16
		Subchunk2ID   [4]byte
		Subchunk2Size uint32
	}{
		ChunkID:       [4]byte{'R', 'I', 'F', 'F'},
		ChunkSize:     0xFFFFFFFF,
		Format:        [4]byte{'W', 'A', 'V', 'E'},
		Subchunk1ID:   [4]byte{'f', 'm', 't', ' '},
		Subchunk1Size: 16,
		AudioFormat:   1, // PCM
		NumChannels:   1, // mono
		SampleRate:    omnivoiceSampleRate,
		ByteRate:      omnivoiceSampleRate * 2, // SR * blockAlign
		BlockAlign:    2,                       // 16-bit mono
		BitsPerSample: 16,
		Subchunk2ID:   [4]byte{'d', 'a', 't', 'a'},
		Subchunk2Size: 0xFFFFFFFF,
	}

	var out bytes.Buffer
	_ = binary.Write(&out, binary.LittleEndian, header)
	return out.Bytes()
}
// floatToPCM16LE converts float PCM samples to little-endian signed 16-bit
// PCM, two bytes per sample.
//
// Each sample is clamped to [-1, 1] and scaled by 32767. NaN samples are
// written as silence (0): converting a NaN to an integer has an
// implementation-dependent result in Go, so it is handled explicitly instead
// of leaking an arbitrary value into the audio.
//
// A nil or empty input yields an empty, non-nil slice.
func floatToPCM16LE(samples []float32) []byte {
	out := make([]byte, 2*len(samples))
	for i, s := range samples {
		switch {
		case s != s: // only NaN compares unequal to itself
			s = 0
		case s > 1:
			s = 1
		case s < -1:
			s = -1
		}
		v := int16(s * 32767)
		out[2*i] = byte(v)        // low byte first (little-endian)
		out[2*i+1] = byte(v >> 8) // high byte
	}
	return out
}
// writeWAV24k encodes samples as a complete (finalized) 24 kHz, mono, 16-bit
// WAV file at dst. Samples are clamped to [-1, 1] before scaling. The file is
// created or truncated; every failure path closes the file handle before
// returning a wrapped error.
func writeWAV24k(dst string, samples []float32) error {
	file, err := os.Create(dst)
	if err != nil {
		return fmt.Errorf("omnivoice: create %q: %w", dst, err)
	}
	encoder := wav.NewEncoder(file, omnivoiceSampleRate, 16, 1, 1)

	// Scale clamped floats to the 16-bit integer range the encoder expects.
	pcm := make([]int, 0, len(samples))
	for _, sample := range samples {
		switch {
		case sample > 1:
			sample = 1
		case sample < -1:
			sample = -1
		}
		pcm = append(pcm, int(sample*32767))
	}
	frame := &audio.IntBuffer{
		Format:         &audio.Format{NumChannels: 1, SampleRate: omnivoiceSampleRate},
		Data:           pcm,
		SourceBitDepth: 16,
	}

	// The encoder is closed exactly once after the write; its close error only
	// matters when the write itself succeeded.
	writeErr := encoder.Write(frame)
	closeErr := encoder.Close()
	if writeErr != nil {
		_ = file.Close()
		return fmt.Errorf("omnivoice: encode WAV: %w", writeErr)
	}
	if closeErr != nil {
		_ = file.Close()
		return fmt.Errorf("omnivoice: finalize WAV: %w", closeErr)
	}
	return file.Close()
}
// readWAVAsFloat loads the WAV file at path and returns it as mono float32
// samples normalized by the source bit depth (roughly [-1, 1]). Multi-channel
// input is downmixed by averaging the channels of each frame. No resampling is
// performed: OmniVoice expects 24 kHz, so callers should supply 24 kHz
// reference clips.
func readWAVAsFloat(path string) ([]float32, error) {
	file, err := os.Open(path)
	if err != nil {
		return nil, fmt.Errorf("omnivoice: open ref %q: %w", path, err)
	}
	defer func() { _ = file.Close() }()

	pcm, err := wav.NewDecoder(file).FullPCMBuffer()
	if err != nil {
		return nil, fmt.Errorf("omnivoice: decode ref %q: %w", path, err)
	}

	channels := int(pcm.Format.NumChannels)
	if channels < 1 {
		channels = 1
	}
	depth := int(pcm.SourceBitDepth)
	if depth == 0 {
		depth = 16 // treat an unreported depth as 16-bit
	}
	// Full-scale magnitude of a signed sample at this depth: 2^(depth-1).
	fullScale := float32(int64(1) << uint(depth-1))

	frames := len(pcm.Data) / channels
	mono := make([]float32, 0, frames)
	for start := 0; start < frames*channels; start += channels {
		sum := 0
		for _, v := range pcm.Data[start : start+channels] {
			sum += v
		}
		mono = append(mono, float32(sum)/float32(channels)/fullScale)
	}
	return mono, nil
}
// runtimeKeepAlive prevents the GC from reclaiming the reference-audio slice
// while its backing pointer is in use across the C call.
// It is a thin wrapper over runtime.KeepAlive; call it after the C call
// returns, passing the value whose memory the C side was reading.
func runtimeKeepAlive(v any) { runtime.KeepAlive(v) }

View File

@@ -1,166 +0,0 @@
#include "gomnivoicecpp.h"
#include "ggml-backend.h"
#include "omnivoice.h"
#include <cstdio>
#include <cstdlib>
#include <cstring>
static ov_context *g_ctx = nullptr;
// ggml log sink: prefixes each message with a fixed-width level tag and writes
// it to stderr, flushing immediately. Null messages are dropped. The message
// is printed as-is (no newline is appended).
static void ggml_log_cb(enum ggml_log_level level, const char *log,
                        void * /*data*/) {
  if (log == nullptr) {
    return;
  }

  const char *tag;
  switch (level) {
  case GGML_LOG_LEVEL_DEBUG:
    tag = "DEBUG";
    break;
  case GGML_LOG_LEVEL_INFO:
    tag = "INFO";
    break;
  case GGML_LOG_LEVEL_WARN:
    tag = "WARN";
    break;
  case GGML_LOG_LEVEL_ERROR:
    tag = "ERROR";
    break;
  default:
    tag = "?????"; // any level without a dedicated tag
    break;
  }

  fprintf(stderr, "[%-5s] %s", tag, log);
  fflush(stderr);
}
int omni_load(const char *model_path, const char *codec_path, int use_fa,
int clamp_fp16) {
ggml_log_set(ggml_log_cb, nullptr);
ggml_backend_load_all();
if (!model_path || model_path[0] == '\0') {
fprintf(stderr, "[omnivoice-cpp] ERROR: model_path is required\n");
return 1;
}
if (!codec_path || codec_path[0] == '\0') {
fprintf(stderr, "[omnivoice-cpp] ERROR: codec_path is required\n");
return 2;
}
ov_init_params p;
ov_init_default_params(&p);
p.model_path = model_path;
p.codec_path = codec_path;
p.use_fa = use_fa != 0;
p.clamp_fp16 = clamp_fp16 != 0;
fprintf(stderr, "[omnivoice-cpp] Loading model=%s codec=%s\n", model_path,
codec_path);
g_ctx = ov_init(&p);
if (!g_ctx) {
fprintf(stderr, "[omnivoice-cpp] FATAL: ov_init failed: %s\n",
ov_last_error());
return 3;
}
fprintf(stderr, "[omnivoice-cpp] Model loaded (%s)\n", ov_version());
return 0;
}
// Populate *tp from the flat wrapper arguments, starting from the library
// defaults.
//   - text / lang: null becomes "".
//   - instruct: applied only when non-empty.
//   - reference audio: applied only when ref_samples is non-null and
//     ref_n > 0; ref_text and denoise are only considered in that case.
//   - seed: applied only when non-negative.
static void fill_params(ov_tts_params *tp, const char *text, const char *lang,
                        const char *instruct, const float *ref_samples,
                        int ref_n, const char *ref_text, long long seed,
                        int denoise) {
  const auto non_empty = [](const char *s) {
    return s != nullptr && s[0] != '\0';
  };

  ov_tts_default_params(tp);
  tp->text = (text != nullptr) ? text : "";
  tp->lang = (lang != nullptr) ? lang : "";

  if (non_empty(instruct)) {
    tp->instruct = instruct;
  }

  const bool has_reference = ref_samples != nullptr && ref_n > 0;
  if (has_reference) {
    tp->ref_audio_24k = ref_samples;
    tp->ref_n_samples = ref_n;
    if (non_empty(ref_text)) {
      tp->ref_text = ref_text;
    }
    tp->denoise = (denoise != 0);
  }

  if (seed >= 0) {
    tp->mg_seed = static_cast<uint64_t>(seed);
  }
}
float *omni_tts(const char *text, const char *lang, const char *instruct,
const float *ref_samples, int ref_n, const char *ref_text,
long long seed, int denoise, int *out_n) {
if (out_n)
*out_n = 0;
if (!g_ctx) {
fprintf(stderr, "[omnivoice-cpp] ERROR: model not loaded\n");
return nullptr;
}
if (!text || text[0] == '\0') {
fprintf(stderr, "[omnivoice-cpp] ERROR: text is required\n");
return nullptr; // omni_tts: out_n already 0
}
ov_tts_params tp;
fill_params(&tp, text, lang, instruct, ref_samples, ref_n, ref_text, seed,
denoise);
ov_audio out = {0};
enum ov_status rc = ov_synthesize(g_ctx, &tp, &out);
if (rc != OV_STATUS_OK || out.n_samples <= 0 || !out.samples) {
fprintf(stderr, "[omnivoice-cpp] ERROR: synthesize failed (rc=%d): %s\n",
(int)rc, ov_last_error());
ov_audio_free(&out);
return nullptr;
}
// Copy into a plain malloc buffer the Go side can free symmetrically via
// omni_pcm_free; then release the ov_audio-owned buffer.
size_t bytes = (size_t)out.n_samples * sizeof(float);
float *buf = (float *)malloc(bytes);
if (!buf) {
fprintf(stderr, "[omnivoice-cpp] ERROR: malloc(%zu) failed\n", bytes);
ov_audio_free(&out);
return nullptr;
}
memcpy(buf, out.samples, bytes);
if (out_n)
*out_n = out.n_samples;
ov_audio_free(&out);
return buf;
}
int omni_tts_stream(const char *text, const char *lang, const char *instruct,
const float *ref_samples, int ref_n, const char *ref_text,
long long seed, int denoise, omni_pcm_chunk_cb cb,
void *user_data) {
if (!g_ctx) {
fprintf(stderr, "[omnivoice-cpp] ERROR: model not loaded\n");
return 1;
}
if (!cb) {
fprintf(stderr, "[omnivoice-cpp] ERROR: stream callback is null\n");
return 2;
}
if (!text || text[0] == '\0') {
fprintf(stderr, "[omnivoice-cpp] ERROR: text is required\n");
return 4;
}
ov_tts_params tp;
fill_params(&tp, text, lang, instruct, ref_samples, ref_n, ref_text, seed,
denoise);
// ov_audio_chunk_cb has the identical signature to omni_pcm_chunk_cb
// (bool vs int return are ABI-compatible; non-zero == true).
tp.on_chunk = (ov_audio_chunk_cb)cb;
tp.on_chunk_user_data = user_data;
ov_audio out = {0}; // stays empty in streaming mode
enum ov_status rc = ov_synthesize(g_ctx, &tp, &out);
ov_audio_free(&out);
if (rc != OV_STATUS_OK && rc != OV_STATUS_CANCELLED) {
fprintf(stderr, "[omnivoice-cpp] ERROR: stream synth failed (rc=%d): %s\n",
(int)rc, ov_last_error());
return 3;
}
return 0;
}
// Release a PCM buffer returned by omni_tts (allocated there with malloc).
// Passing nullptr is a no-op, as with free().
void omni_pcm_free(float *p) { free(p); }
// Release the process-wide OmniVoice context, if any. Safe to call when
// nothing is loaded and safe to call repeatedly.
void omni_unload(void) {
  if (g_ctx == nullptr) {
    return;
  }
  ov_free(g_ctx);
  g_ctx = nullptr;
}

View File

@@ -1,38 +0,0 @@
// C ABI exposed by the gomnivoicecpp module. All functions operate on a single
// process-wide OmniVoice context created by omni_load.
#pragma once
#include <cstdint>
extern "C" {
// Streaming PCM chunk callback. samples is mono float PCM at 24 kHz, valid
// only for the duration of the call. Return non-zero to continue, 0 to abort.
typedef int (*omni_pcm_chunk_cb)(const float *samples, int n_samples,
void *user_data);
// Load the LM (model_path) + codec (codec_path) GGUFs. use_fa / clamp_fp16
// map to ov_init_params. Returns 0 on success, non-zero on failure.
// Failure codes: 1 = model_path missing, 2 = codec_path missing,
// 3 = context initialization failed.
int omni_load(const char *model_path, const char *codec_path, int use_fa,
int clamp_fp16);
// Synthesize to a malloc'd float PCM buffer (caller frees via omni_pcm_free).
// ref_samples != null && ref_n > 0 => voice cloning (ref_text optional).
// instruct != null && non-empty => voice design. seed < 0 keeps the default
// MaskGIT seed. denoise toggles the <|denoise|> marker (only with a reference).
// Writes the sample count to *out_n. Returns NULL on failure (out_n set to 0).
// out_n may be NULL, in which case the sample count is not reported.
float *omni_tts(const char *text, const char *lang, const char *instruct,
const float *ref_samples, int ref_n, const char *ref_text,
long long seed, int denoise, int *out_n);
// Streaming synthesis: cb is invoked per PCM chunk as audio is produced.
// Same reference/design/seed semantics as omni_tts. Returns 0 on success.
// A cancelled synthesis is also reported as 0. Failure codes: 1 = no model
// loaded, 2 = cb is NULL, 3 = synthesis failed, 4 = text missing or empty.
int omni_tts_stream(const char *text, const char *lang, const char *instruct,
const float *ref_samples, int ref_n, const char *ref_text,
long long seed, int denoise, omni_pcm_chunk_cb cb,
void *user_data);
// Free a buffer returned by omni_tts.
void omni_pcm_free(float *p);
// Release the OmniVoice context.
void omni_unload(void);
}

View File

@@ -1,74 +0,0 @@
package main
import (
"os"
"strings"
"github.com/ebitengine/purego"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)
// ttsReq assembles a TTSRequest for the e2e specs from its four commonly used
// fields; every other field keeps its zero value.
func ttsReq(text, voice string, lang *string, dst string) *pb.TTSRequest {
	req := new(pb.TTSRequest)
	req.Text = text
	req.Voice = voice
	req.Language = lang
	req.Dst = dst
	return req
}
// End-to-end specs that exercise the real shared library. They are skipped
// unless OMNIVOICE_MODEL and OMNIVOICE_CODEC point at GGUF files; the library
// path comes from OMNIVOICE_LIBRARY (default: the fallback variant in the
// working directory).
var _ = Describe("OmniVoice e2e", Label("e2e"), func() {
// loaded guards the one-time dlopen + symbol registration + model load so it
// is not repeated for every spec.
var loaded bool
BeforeEach(func() {
modelPath := os.Getenv("OMNIVOICE_MODEL")
codecPath := os.Getenv("OMNIVOICE_CODEC")
if modelPath == "" || codecPath == "" {
Skip("OMNIVOICE_MODEL / OMNIVOICE_CODEC not set; skipping e2e")
}
if !loaded {
lib := os.Getenv("OMNIVOICE_LIBRARY")
if lib == "" {
lib = "./libgomnivoicecpp-fallback.so"
}
h, err := purego.Dlopen(lib, purego.RTLD_NOW|purego.RTLD_GLOBAL)
Expect(err).ToNot(HaveOccurred())
// Bind the package-level function variables to the C symbols.
purego.RegisterLibFunc(&CppLoad, h, "omni_load")
purego.RegisterLibFunc(&CppTTS, h, "omni_tts")
purego.RegisterLibFunc(&CppTTSStream, h, "omni_tts_stream")
purego.RegisterLibFunc(&CppPCMFree, h, "omni_pcm_free")
purego.RegisterLibFunc(&CppUnload, h, "omni_unload")
Expect(CppLoad(modelPath, codecPath, 0, 0)).To(Equal(0))
loaded = true
}
})
It("synthesizes a WAV file via TTS", func() {
b := &OmnivoiceCpp{opts: loadOptions{seed: 42, denoise: true}}
dst := GinkgoT().TempDir() + "/out.wav"
lang := "en"
err := b.TTS(ttsReq("Hello world.", "", &lang, dst))
Expect(err).ToNot(HaveOccurred())
fi, err := os.Stat(dst)
Expect(err).ToNot(HaveOccurred())
// Larger than a bare 44-byte header, i.e. some audio was written.
Expect(fi.Size()).To(BeNumerically(">", int64(44)))
})
It("streams audio chunks via TTSStream", func() {
b := &OmnivoiceCpp{opts: loadOptions{seed: 42, denoise: true}}
results := make(chan []byte, 1024)
lang := "en"
done := make(chan error, 1)
// TTSStream closes results when it returns, which ends the range loop below.
go func() { done <- b.TTSStream(ttsReq("Hello there, streaming test.", "", &lang, ""), results) }()
var chunks int
var first []byte
for c := range results {
if chunks == 0 {
first = c
}
chunks++
}
Expect(<-done).ToNot(HaveOccurred())
// At least the header chunk plus one PCM chunk.
Expect(chunks).To(BeNumerically(">=", 2))
Expect(string(first[0:4])).To(Equal("RIFF"))
Expect(strings.HasPrefix(string(first[8:12]), "WAVE")).To(BeTrue())
})
})

View File

@@ -1,246 +0,0 @@
package main
import (
"fmt"
"os"
"path/filepath"
"strings"
"sync"
"unsafe"
"github.com/ebitengine/purego"
"github.com/mudler/LocalAI/pkg/grpc/base"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
)
// Go-side bindings for the C ABI of the gomnivoicecpp module. Each variable is
// nil until it is bound to its C symbol with purego.RegisterLibFunc; calling
// one before that would panic.
var (
// omni_load(model_path, codec_path, use_fa, clamp_fp16) int
CppLoad func(modelPath, codecPath string, useFA, clampFP16 int) int
// omni_tts(text, lang, instruct, ref_samples, ref_n, ref_text, seed, denoise, out_n) -> float* (uintptr)
CppTTS func(text, lang, instruct string, refSamples unsafe.Pointer, refN int,
refText string, seed int64, denoise int, outN unsafe.Pointer) uintptr
// omni_tts_stream(text, lang, instruct, ref_samples, ref_n, ref_text, seed, denoise, cb, user) int
CppTTSStream func(text, lang, instruct string, refSamples unsafe.Pointer, refN int,
refText string, seed int64, denoise int, cb uintptr, user uintptr) int
// omni_pcm_free(p): releases a buffer returned by omni_tts.
CppPCMFree func(ptr uintptr)
// omni_unload(): releases the loaded context.
CppUnload func()
)
// OmnivoiceCpp is the backend implementation handed to the gRPC server. It
// keeps the parsed model options and the optional config-level reference
// voice; the model itself lives in the C library's process-wide context.
type OmnivoiceCpp struct {
// NOTE(review): base.SingleThread is relied on elsewhere in this file to
// serialize TTS/TTSStream calls; confirm against pkg/grpc/base.
base.SingleThread
// opts holds the options parsed in Load (codec path, flags, seed, denoise).
opts loadOptions
// audioPath is the model-config reference voice (tts.audio_path), used as
// the default voice-cloning reference when a request does not set Voice.
audioPath string
}
// Load resolves the model, codec and default reference-voice paths from opts,
// stores the parsed options on the receiver, and loads the model through the
// C library.
//
// Path rules:
//   - model: opts.ModelFile (or opts.ModelPath when empty); a relative value
//     is joined onto opts.ModelPath when that is set.
//   - codec: the 'tokenizer'/'codec' option, relative to the model directory;
//     when unset, the first *tokenizer*.gguf next to the model.
//   - reference voice: opts.AudioPath (tts.audio_path), relative to the model
//     directory; used when a request omits Voice.
//
// Returns an error when no codec can be found or the C loader reports a
// non-zero status.
func (o *OmnivoiceCpp) Load(opts *pb.ModelOptions) error {
	modelFile := opts.ModelFile
	if modelFile == "" {
		modelFile = opts.ModelPath
	}
	if opts.ModelPath != "" && !filepath.IsAbs(modelFile) {
		modelFile = filepath.Join(opts.ModelPath, modelFile)
	}
	modelDir := filepath.Dir(modelFile)

	o.opts = parseOptions(opts.Options)

	// Codec/tokenizer GGUF: explicit option first, sibling discovery otherwise.
	switch configured := o.opts.codecPath; {
	case configured == "":
		o.opts.codecPath = discoverTokenizer(modelDir)
	case !filepath.IsAbs(configured):
		o.opts.codecPath = filepath.Join(modelDir, configured)
	}
	if o.opts.codecPath == "" {
		return fmt.Errorf("omnivoice: no codec/tokenizer GGUF found; set option 'tokenizer:<file>'")
	}

	// Config-level cloning reference, resolved like the codec.
	if ref := opts.AudioPath; ref == "" || filepath.IsAbs(ref) {
		o.audioPath = ref
	} else {
		o.audioPath = filepath.Join(modelDir, ref)
	}

	faFlag, clampFlag := boolToInt(o.opts.useFA), boolToInt(o.opts.clampFP16)
	fmt.Fprintf(os.Stderr, "[omnivoice-cpp] Load model=%s codec=%s use_fa=%d clamp_fp16=%d\n",
		modelFile, o.opts.codecPath, faFlag, clampFlag)
	if rc := CppLoad(modelFile, o.opts.codecPath, faFlag, clampFlag); rc != 0 {
		return fmt.Errorf("omnivoice: failed to load model (rc=%d)", rc)
	}
	return nil
}
// discoverTokenizer returns the first *tokenizer*.gguf in dir, or "".
func discoverTokenizer(dir string) string {
entries, err := os.ReadDir(dir)
if err != nil {
return ""
}
for _, e := range entries {
name := strings.ToLower(e.Name())
if strings.Contains(name, "tokenizer") && strings.HasSuffix(name, ".gguf") {
return filepath.Join(dir, e.Name())
}
}
return ""
}
// boolToInt maps true to 1 and false to 0, the integer flag convention used
// when calling into the C library.
func boolToInt(b bool) int {
	var flag int
	if b {
		flag = 1
	}
	return flag
}
// refAudio returns the decoded reference clip for voice cloning when voice
// (after trimming whitespace) names something that exists on disk. An empty
// value, or one that cannot be stat'ed, yields (nil, nil): no cloning, voice
// design via Instructions still applies. Decode errors are returned as-is.
func (o *OmnivoiceCpp) refAudio(voice string) ([]float32, error) {
	path := strings.TrimSpace(voice)
	if path != "" {
		if _, statErr := os.Stat(path); statErr == nil {
			return readWAVAsFloat(path)
		}
	}
	return nil, nil
}
// refAudioFor picks the cloning reference for req: a non-blank per-request
// Voice wins, otherwise the model-config audio_path is used. A nil result
// means no cloning (voice design via Instructions still applies).
func (o *OmnivoiceCpp) refAudioFor(req *pb.TTSRequest) ([]float32, error) {
	if requested := strings.TrimSpace(req.Voice); requested != "" {
		return o.refAudio(requested)
	}
	return o.refAudio(o.audioPath)
}
// reqParam looks up key in the request's Params, returning "" when Params is
// nil or the key is absent.
func reqParam(req *pb.TTSRequest, key string) string {
	if params := req.Params; params != nil {
		return params[key]
	}
	return ""
}
// seedFor returns the seed for req: the "seed" request parameter when it is
// present and scans as an integer, otherwise the model-level seed option.
func (o *OmnivoiceCpp) seedFor(req *pb.TTSRequest) int64 {
	raw := reqParam(req, "seed")
	if raw == "" {
		return o.opts.seed
	}
	var parsed int64
	if _, err := fmt.Sscan(raw, &parsed); err != nil {
		return o.opts.seed
	}
	return parsed
}
// optStr dereferences an optional string, treating nil as "".
func optStr(p *string) string {
	var s string
	if p != nil {
		s = *p
	}
	return s
}
// TTS synthesizes req.Text in one shot and writes the result to req.Dst as a
// 24 kHz mono 16-bit WAV. It returns an error when Dst is empty, when the
// reference audio cannot be decoded, when the C call yields no samples, or
// when the file cannot be written.
func (o *OmnivoiceCpp) TTS(req *pb.TTSRequest) error {
if req.Dst == "" {
return fmt.Errorf("omnivoice: TTS requires a destination path")
}
lang := normalizeLanguage(optStr(req.Language))
instruct := optStr(req.Instructions)
refText := reqParam(req, "ref_text")
seed := o.seedFor(req)
ref, err := o.refAudioFor(req)
if err != nil {
return err
}
// A nil refPtr (no reference clip) tells the C side not to clone a voice.
var refPtr unsafe.Pointer
if len(ref) > 0 {
refPtr = unsafe.Pointer(&ref[0])
}
// n receives the sample count written by the C side through out_n.
var n int32
ptr := CppTTS(req.Text, lang, instruct, refPtr, len(ref), refText, seed,
boolToInt(o.opts.denoise), unsafe.Pointer(&n))
// Keep ref reachable until the C call above has returned.
runtimeKeepAlive(ref)
if ptr == 0 || n <= 0 {
return fmt.Errorf("omnivoice: synthesis failed")
}
// The buffer is owned by the C side; release it once the samples are copied.
defer CppPCMFree(ptr)
src := unsafe.Slice((*float32)(unsafe.Pointer(ptr)), int(n)) //nolint:govet // C-allocated PCM, copied out before free
out := make([]float32, int(n))
copy(out, src)
return writeWAV24k(req.Dst, out)
}
// streamState carries the active TTSStream channel to the single shared C
// callback. base.SingleThread serializes TTS/TTSStream, so one global slot is
// safe and avoids leaking a purego callback per request (purego callbacks
// cannot be freed and are capped).
var (
// streamMu is held for the whole duration of a streaming C call.
streamMu sync.Mutex
// streamChan is the destination of the stream in flight; nil when idle.
streamChan chan []byte
// streamCbOnce / streamCbPtr: the callback is created once and reused.
streamCbOnce sync.Once
streamCbPtr uintptr
)
// streamCallback is registered once and forwards each PCM chunk to streamChan
// as 16-bit little-endian PCM bytes.
//
// It always returns 1 ("continue"); empty chunks, and chunks that arrive while
// no stream is active, are ignored.
//
// The samples are only valid for the duration of the call. floatToPCM16LE
// reads them into a freshly allocated byte slice before this function sends
// or returns, so no separate float copy is needed.
func streamCallback(samples *float32, nSamples int32, _ uintptr) uintptr {
	if nSamples <= 0 || samples == nil || streamChan == nil {
		return 1 // continue
	}
	streamChan <- floatToPCM16LE(unsafe.Slice(samples, int(nSamples)))
	return 1 // continue
}
// TTSStream synthesizes req.Text and sends the audio to results: first a
// 44-byte WAV header, then one 16-bit PCM chunk per callback invocation.
// results is always closed before returning. It returns an error when the text
// is empty, the reference audio cannot be decoded, or the C call reports a
// non-zero status.
//
// NOTE(review): sends on results block while the C call is in progress and the
// callback never asks the library to abort; confirm the consumer always drains
// the channel.
func (o *OmnivoiceCpp) TTSStream(req *pb.TTSRequest, results chan []byte) error {
defer close(results)
if req.Text == "" {
return fmt.Errorf("omnivoice: TTSStream requires text")
}
// Create the C-callable callback on first use only; it is reused afterwards.
streamCbOnce.Do(func() {
streamCbPtr = purego.NewCallback(streamCallback)
})
lang := normalizeLanguage(optStr(req.Language))
instruct := optStr(req.Instructions)
refText := reqParam(req, "ref_text")
seed := o.seedFor(req)
ref, err := o.refAudioFor(req)
if err != nil {
return err
}
var refPtr unsafe.Pointer
if len(ref) > 0 {
refPtr = unsafe.Pointer(&ref[0])
}
// Emit the WAV header first so the HTTP layer gets a self-describing stream.
results <- wavHeader24k()
// Publish the destination channel for the shared callback for the duration
// of the C call, then clear it again.
streamMu.Lock()
streamChan = results
rc := CppTTSStream(req.Text, lang, instruct, refPtr, len(ref), refText, seed,
boolToInt(o.opts.denoise), streamCbPtr, 0)
streamChan = nil
streamMu.Unlock()
// Keep ref reachable until the C call above has returned.
runtimeKeepAlive(ref)
if rc != 0 {
return fmt.Errorf("omnivoice: streaming synthesis failed (rc=%d)", rc)
}
return nil
}

View File

@@ -1,90 +0,0 @@
package main
import (
"bytes"
"encoding/binary"
"testing"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)
// TestOmnivoiceCpp is the `go test` entry point: it wires Gomega failures into
// Ginkgo and runs every spec registered in this package.
func TestOmnivoiceCpp(t *testing.T) {
RegisterFailHandler(Fail)
RunSpecs(t, "omnivoice-cpp suite")
}
// Table-driven checks for normalizeLanguage: full names, locale suffixes,
// codes that are already normalized, and unknown values.
var _ = Describe("normalizeLanguage", func() {
DescribeTable("maps caller language to OmniVoice codes",
func(in, want string) {
Expect(normalizeLanguage(in)).To(Equal(want))
},
Entry("empty stays empty", "", ""),
Entry("english full name", "English", "en"),
Entry("chinese full name", "Chinese", "zh"),
Entry("locale suffix stripped", "en-US", "en"),
Entry("underscore locale", "zh_CN", "zh"),
Entry("already a code", "en", "en"),
Entry("unknown passes through normalized", "xx", "xx"),
)
})
// Checks for parseOptions: every recognized key, the codec alias, and the
// defaults applied when no options are given.
var _ = Describe("parseOptions", func() {
It("extracts codec, use_fa, clamp_fp16, seed, denoise", func() {
o := parseOptions([]string{
"tokenizer:tok.gguf",
"use_fa:true",
"clamp_fp16:true",
"seed:7",
"denoise:false",
"unknown:ignored",
})
Expect(o.codecPath).To(Equal("tok.gguf"))
Expect(o.useFA).To(BeTrue())
Expect(o.clampFP16).To(BeTrue())
Expect(o.seed).To(Equal(int64(7)))
Expect(o.denoise).To(BeFalse())
})
It("accepts codec: as an alias for tokenizer:", func() {
o := parseOptions([]string{"codec:c.gguf"})
Expect(o.codecPath).To(Equal("c.gguf"))
})
It("defaults seed to -1 and denoise to true", func() {
o := parseOptions(nil)
Expect(o.seed).To(Equal(int64(-1)))
Expect(o.denoise).To(BeTrue())
})
})
// Checks the layout of the streaming WAV header: total length, the four chunk
// tags at their fixed offsets, and the sample-rate field.
var _ = Describe("wavHeader24k", func() {
It("emits a 44-byte streaming WAV header at 24 kHz mono 16-bit", func() {
h := wavHeader24k()
Expect(h).To(HaveLen(44))
Expect(string(h[0:4])).To(Equal("RIFF"))
Expect(string(h[8:12])).To(Equal("WAVE"))
Expect(string(h[12:16])).To(Equal("fmt "))
Expect(string(h[36:40])).To(Equal("data"))
// The sample rate is the little-endian uint32 at bytes 24..27.
var sampleRate uint32
Expect(binary.Read(bytes.NewReader(h[24:28]), binary.LittleEndian, &sampleRate)).To(Succeed())
Expect(sampleRate).To(Equal(uint32(24000)))
})
})
// Checks float-to-PCM conversion: zero, both full-scale values, and clamping
// of out-of-range samples.
var _ = Describe("floatToPCM16LE", func() {
It("clamps and converts float PCM to little-endian int16 bytes", func() {
b := floatToPCM16LE([]float32{0, 1.0, -1.0, 2.0, -2.0})
Expect(b).To(HaveLen(10)) // 5 samples * 2 bytes
// read decodes the little-endian int16 that starts at byte offset off.
read := func(off int) int16 {
var v int16
_ = binary.Read(bytes.NewReader(b[off:off+2]), binary.LittleEndian, &v)
return v
}
Expect(read(0)).To(Equal(int16(0)))
Expect(read(2)).To(Equal(int16(32767)))
Expect(read(4)).To(Equal(int16(-32767)))
Expect(read(6)).To(Equal(int16(32767))) // clamped from 2.0
Expect(read(8)).To(Equal(int16(-32767))) // clamped from -2.0
})
})

View File

@@ -1,48 +0,0 @@
package main
// Note: this is started internally by LocalAI and a server is allocated for each model
import (
"flag"
"os"
"github.com/ebitengine/purego"
grpc "github.com/mudler/LocalAI/pkg/grpc"
)
var (
// addr is the -addr flag value handed to grpc.StartServer in main.
// NOTE(review): the help text says "connect to", but the value is passed to
// the server start call; presumably it is the listen address.
addr = flag.String("addr", "localhost:50051", "the address to connect to")
)
// LibFuncs pairs a Go function variable with the name of the C symbol it is
// bound to.
type LibFuncs struct {
// FuncPtr is a pointer to the Go function variable to populate.
FuncPtr any
// Name is the exported symbol name in the shared library.
Name string
}
// main loads the OmniVoice shared library (OMNIVOICE_LIBRARY, defaulting to
// the fallback variant in the working directory), binds the C entry points to
// their Go function variables, then serves the backend over gRPC on -addr.
// Any failure is fatal.
func main() {
	libPath := "./libgomnivoicecpp-fallback.so"
	if fromEnv := os.Getenv("OMNIVOICE_LIBRARY"); fromEnv != "" {
		libPath = fromEnv
	}

	handle, err := purego.Dlopen(libPath, purego.RTLD_NOW|purego.RTLD_GLOBAL)
	if err != nil {
		panic(err)
	}

	bindings := []LibFuncs{
		{FuncPtr: &CppLoad, Name: "omni_load"},
		{FuncPtr: &CppTTS, Name: "omni_tts"},
		{FuncPtr: &CppTTSStream, Name: "omni_tts_stream"},
		{FuncPtr: &CppPCMFree, Name: "omni_pcm_free"},
		{FuncPtr: &CppUnload, Name: "omni_unload"},
	}
	for _, binding := range bindings {
		purego.RegisterLibFunc(binding.FuncPtr, handle, binding.Name)
	}

	flag.Parse()

	if err := grpc.StartServer(*addr, &OmnivoiceCpp{}); err != nil {
		panic(err)
	}
}

View File

@@ -1,74 +0,0 @@
package main
import (
"strconv"
"strings"
)
// loadOptions holds the parsed model-level options for OmniVoice.
type loadOptions struct {
	codecPath string // codec/tokenizer GGUF ("tokenizer:" or "codec:" option)
	useFA     bool   // "use_fa" option
	clampFP16 bool   // "clamp_fp16" option
	seed      int64  // "seed" option; -1 keeps the engine default
	denoise   bool   // "denoise" option
}

// splitOption splits a "key:value" option at the first colon and trims
// whitespace around both halves. ok is false when there is no colon.
func splitOption(o string) (key, value string, ok bool) {
	i := strings.Index(o, ":")
	if i < 0 {
		return "", "", false
	}
	return strings.TrimSpace(o[:i]), strings.TrimSpace(o[i+1:]), true
}

// optionIsTrue reports whether a boolean option value enables the option:
// "true" in any letter case, or "1". Every other value counts as false.
func optionIsTrue(value string) bool {
	return value == "1" || strings.EqualFold(value, "true")
}

// parseOptions reads the backend "key:value" option slice. Unknown keys and
// entries without a colon are ignored. Defaults: seed -1 (engine default),
// denoise true.
//
// Boolean options (use_fa, clamp_fp16, denoise) accept "true" case-
// insensitively, so values such as "True" or "TRUE" are no longer silently
// read as false. A seed that does not parse as a base-10 integer leaves the
// current seed unchanged.
func parseOptions(opts []string) loadOptions {
	o := loadOptions{seed: -1, denoise: true}
	for _, oo := range opts {
		key, value, ok := splitOption(oo)
		if !ok {
			continue
		}
		switch key {
		case "tokenizer", "codec":
			o.codecPath = value
		case "use_fa":
			o.useFA = optionIsTrue(value)
		case "clamp_fp16":
			o.clampFP16 = optionIsTrue(value)
		case "seed":
			if n, err := strconv.ParseInt(value, 10, 64); err == nil {
				o.seed = n
			}
		case "denoise":
			o.denoise = optionIsTrue(value)
		}
	}
	return o
}
// languageNameAliases translates full language names into OmniVoice codes.
// Per the upstream convention the lang hint accepts "" (auto), "en" and "zh";
// any other code is passed through and the engine treats unknown hints as
// auto.
var languageNameAliases = map[string]string{
	"english": "en",
	"chinese": "zh",
}

// normalizeLanguage canonicalizes a caller-supplied language hint: it trims
// and lowercases the value, drops everything from the first '-', '_' or '.'
// onward (region/locale suffix), and maps known full names to their code.
// An empty hint stays empty so the engine auto-detects.
func normalizeLanguage(lang string) string {
	code := strings.ToLower(strings.TrimSpace(lang))

	isSeparator := func(r rune) bool { return r == '-' || r == '_' || r == '.' }
	if cut := strings.IndexFunc(code, isSeparator); cut != -1 {
		code = code[:cut]
	}

	if alias, known := languageNameAliases[code]; known {
		return alias
	}
	return code
}

View File

@@ -1,64 +0,0 @@
#!/bin/bash
# Script to copy the appropriate libraries based on architecture.
# This script is used in the final stage of the Dockerfile.
#
# It assembles $CURDIR/package with the backend binary, every
# libgomnivoicecpp-*.so variant, run.sh, and (on Linux) the dynamic loader plus
# the C/C++ runtime libraries under package/lib. If the repository provides
# scripts/build/package-gpu-libs.sh, GPU libraries are added as well.
set -e

# Quote "$0" and every use of CURDIR so a checkout path containing spaces
# does not get word-split.
CURDIR=$(dirname "$(realpath "$0")")
REPO_ROOT="${CURDIR}/../../.."

# Create lib directory
mkdir -p "$CURDIR/package/lib"

cp -avf "$CURDIR/omnivoice-cpp" "$CURDIR/package/"
# The glob must stay outside the quotes so it still expands.
cp -fv "$CURDIR"/libgomnivoicecpp-*.so "$CURDIR/package/"
cp -fv "$CURDIR/run.sh" "$CURDIR/package/"

# Runtime libraries bundled next to the backend, in copy order. Each library
# is listed once (libgcc_s and libstdc++ used to be copied twice).
RUNTIME_LIBS=(
    libc.so.6
    libgcc_s.so.1
    libstdc++.so.6
    libm.so.6
    libgomp.so.1
    libdl.so.2
    librt.so.1
    libpthread.so.0
)

# copy_runtime_libs LOADER LIBDIR
# Copies the dynamic loader to package/lib/ld.so and every entry of
# RUNTIME_LIBS from LIBDIR into package/lib, dereferencing symlinks (-L).
copy_runtime_libs() {
    local loader="$1"
    local libdir="$2"
    local lib

    cp -arfLv "$loader" "$CURDIR/package/lib/ld.so"
    for lib in "${RUNTIME_LIBS[@]}"; do
        cp -arfLv "$libdir/$lib" "$CURDIR/package/lib/$lib"
    done
}

# Detect architecture and copy appropriate libraries
if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
    # x86_64 architecture
    echo "Detected x86_64 architecture, copying x86_64 libraries..."
    copy_runtime_libs /lib64/ld-linux-x86-64.so.2 /lib/x86_64-linux-gnu
elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
    # ARM64 architecture
    echo "Detected ARM64 architecture, copying ARM64 libraries..."
    copy_runtime_libs /lib/ld-linux-aarch64.so.1 /lib/aarch64-linux-gnu
elif [ "$(uname -s)" = "Darwin" ]; then
    # Nothing is bundled on macOS.
    echo "Detected Darwin"
else
    echo "Error: Could not detect architecture"
    exit 1
fi

# Package GPU libraries based on BUILD_TYPE
GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
if [ -f "$GPU_LIB_SCRIPT" ]; then
    echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
    source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib"
    package_gpu_libs
fi

echo "Packaging completed successfully"
ls -liah "$CURDIR/package/"
ls -liah "$CURDIR/package/lib/"

View File

@@ -1,52 +0,0 @@
#!/bin/bash
# Launcher for the omnivoice-cpp backend.
#
# Picks the most capable libgomnivoicecpp-*.so variant that both exists next
# to this script and is supported by the CPU (per /proc/cpuinfo flags),
# exports it as OMNIVOICE_LIBRARY, and execs the backend binary - through the
# bundled lib/ld.so when one was packaged. All arguments are forwarded.
set -ex

# Get the absolute current dir where the script is located
CURDIR=$(dirname "$(realpath "$0")")

cd /

echo "CPU info:"
if [ "$(uname)" != "Darwin" ]; then
    grep -e "model\sname" /proc/cpuinfo | head -1
    grep -e "flags" /proc/cpuinfo | head -1
fi

# Default to the portable build; later checks override it, so the last
# matching (most capable) variant wins.
LIBRARY="$CURDIR/libgomnivoicecpp-fallback.so"

if [ "$(uname)" != "Darwin" ]; then
    if grep -q -e "\savx\s" /proc/cpuinfo ; then
        echo "CPU: AVX found OK"
        if [ -e "$CURDIR/libgomnivoicecpp-avx.so" ]; then
            LIBRARY="$CURDIR/libgomnivoicecpp-avx.so"
        fi
    fi

    if grep -q -e "\savx2\s" /proc/cpuinfo ; then
        echo "CPU: AVX2 found OK"
        if [ -e "$CURDIR/libgomnivoicecpp-avx2.so" ]; then
            LIBRARY="$CURDIR/libgomnivoicecpp-avx2.so"
        fi
    fi

    # Check avx 512
    if grep -q -e "\savx512f\s" /proc/cpuinfo ; then
        echo "CPU: AVX512F found OK"
        if [ -e "$CURDIR/libgomnivoicecpp-avx512.so" ]; then
            LIBRARY="$CURDIR/libgomnivoicecpp-avx512.so"
        fi
    fi
fi

# Append the previous LD_LIBRARY_PATH only when it is non-empty: a trailing
# ':' would add an empty entry, which the dynamic loader interprets as the
# current working directory.
export LD_LIBRARY_PATH="$CURDIR/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
export OMNIVOICE_LIBRARY="$LIBRARY"

# If there is a lib/ld.so, use it
if [ -f "$CURDIR/lib/ld.so" ]; then
    echo "Using lib/ld.so"
    echo "Using library: $LIBRARY"
    exec "$CURDIR/lib/ld.so" "$CURDIR/omnivoice-cpp" "$@"
fi

echo "Using library: $LIBRARY"
exec "$CURDIR/omnivoice-cpp" "$@"

View File

@@ -1,30 +0,0 @@
#!/bin/bash
# Runs the omnivoice-cpp backend Go tests.
#
# When OMNIVOICE_MODEL is not set, the test GGUF files are downloaded into
# ./omnivoice-models (files already present are reused) and OMNIVOICE_MODEL /
# OMNIVOICE_CODEC are exported to point at them.
set -e

CURDIR=$(dirname "$(realpath "$0")")
cd "$CURDIR"

echo "Running omnivoice-cpp backend tests..."

if [ -z "$OMNIVOICE_MODEL" ]; then
    MODEL_DIR="./omnivoice-models"
    mkdir -p "$MODEL_DIR"

    REPO_ID="Serveurperso/OmniVoice-GGUF"
    BASE_URL="https://huggingface.co/${REPO_ID}/resolve/main"
    FILES=( "omnivoice-base-Q4_K_M.gguf" "omnivoice-tokenizer-Q4_K_M.gguf" )

    for file in "${FILES[@]}"; do
        dest="${MODEL_DIR}/${file}"
        if [ -f "${dest}" ]; then
            echo " [skip] ${file}"
        else
            echo " [download] ${file}..."
            # --fail makes curl exit non-zero on an HTTP error instead of
            # saving the error page as the model. Downloading to a .part file
            # and renaming on success keeps an interrupted or failed transfer
            # from being mistaken for a complete file by the [skip] check on
            # the next run.
            if ! curl -fL -o "${dest}.part" "${BASE_URL}/${file}" --progress-bar; then
                rm -f "${dest}.part"
                echo " [error] failed to download ${file}" >&2
                exit 1
            fi
            mv "${dest}.part" "${dest}"
        fi
    done

    export OMNIVOICE_MODEL="${MODEL_DIR}/omnivoice-base-Q4_K_M.gguf"
    export OMNIVOICE_CODEC="${MODEL_DIR}/omnivoice-tokenizer-Q4_K_M.gguf"
fi

go test -v -timeout 1200s .

echo "All omnivoice-cpp e2e tests passed."

View File

@@ -3,36 +3,35 @@ project(goqwen3ttscpp LANGUAGES C CXX)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(QWENTTS_DIR ${CMAKE_CURRENT_SOURCE_DIR}/sources/qwentts.cpp)
set(QWEN3TTS_DIR ${CMAKE_CURRENT_SOURCE_DIR}/sources/qwen3-tts.cpp)
# Override upstream's CMAKE_CUDA_ARCHITECTURES before add_subdirectory.
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
set(CMAKE_CUDA_ARCHITECTURES "75-virtual;80-virtual;86-real;89-real")
endif()
# Add the upstream project. Its own CMakeLists adds ggml + cpp-httplib + yyjson
# and builds qwen-core (STATIC, the qt_* impl). EXCLUDE_FROM_ALL keeps its CLI
# tools / tts-server / tests from building unless referenced.
add_subdirectory(${QWENTTS_DIR} qwentts EXCLUDE_FROM_ALL)
# Build ggml from the upstream's submodule FIRST, so that ggml/ggml-base/ggml-cpu
# CMake targets exist when the upstream project references them by name.
# The upstream CMakeLists.txt uses target_link_libraries(... ggml ggml-base ggml-cpu)
# with target_link_directories pointing at a pre-built ggml/build/. By adding ggml
# as a subdirectory here, CMake resolves those names as targets instead.
add_subdirectory(${QWEN3TTS_DIR}/ggml ggml EXCLUDE_FROM_ALL)
# Upstream generates version.h into its own CMAKE_CURRENT_BINARY_DIR and adds
# the top-level ${CMAKE_BINARY_DIR} to qwen-core's include path. Under
# add_subdirectory those two dirs differ (<build>/qwentts vs <build>), so
# qwen.cpp cannot find version.h. Point qwen-core at the subproject binary dir
# where version.h is actually generated. (Fix lives here, never in the fetched
# upstream checkout.)
target_include_directories(qwen-core PRIVATE ${CMAKE_BINARY_DIR}/qwentts)
# Now add the upstream project
add_subdirectory(${QWEN3TTS_DIR} qwen3tts EXCLUDE_FROM_ALL)
add_library(goqwen3ttscpp MODULE cpp/goqwen3ttscpp.cpp)
target_link_libraries(goqwen3ttscpp PRIVATE qwen-core)
target_link_libraries(goqwen3ttscpp PRIVATE qwen3_tts)
target_include_directories(goqwen3ttscpp PRIVATE ${QWENTTS_DIR}/src)
target_include_directories(goqwen3ttscpp SYSTEM PRIVATE ${QWENTTS_DIR}/ggml/include)
target_include_directories(goqwen3ttscpp PRIVATE ${QWEN3TTS_DIR}/src)
target_include_directories(goqwen3ttscpp SYSTEM PRIVATE ${QWEN3TTS_DIR}/ggml/include)
# Link GPU backends if the upstream ggml created them.
foreach(backend blas cuda metal vulkan sycl)
# Link GPU backends if available
foreach(backend blas cuda metal vulkan)
if(TARGET ggml-${backend})
target_link_libraries(goqwen3ttscpp PRIVATE ggml-${backend})
string(TOUPPER ${backend} BACKEND_UPPER)
target_compile_definitions(goqwen3ttscpp PRIVATE QWEN3TTS_HAVE_${BACKEND_UPPER})
if(backend STREQUAL "cuda")
find_package(CUDAToolkit QUIET)
if(CUDAToolkit_FOUND)
@@ -45,8 +44,12 @@ endforeach()
if(MSVC)
target_compile_options(goqwen3ttscpp PRIVATE /W4 /wd4100 /wd4505)
else()
target_compile_options(goqwen3ttscpp PRIVATE -Wall -Wextra
-Wno-unused-parameter -Wno-unused-function)
target_compile_options(goqwen3ttscpp PRIVATE -Wall -Wextra -Wshadow -Wconversion
-Wno-unused-parameter -Wno-unused-function -Wno-sign-conversion)
endif()
if(CMAKE_CXX_COMPILER_ID MATCHES "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9.0)
target_link_libraries(goqwen3ttscpp PRIVATE stdc++fs)
endif()
set_property(TARGET goqwen3ttscpp PROPERTY CXX_STANDARD 17)

View File

@@ -6,9 +6,9 @@ GOCMD?=go
GO_TAGS?=
JOBS?=$(shell nproc --ignore=1)
# qwentts.cpp version
QWEN3TTS_REPO?=https://github.com/ServeurpersoCom/qwentts.cpp
QWEN3TTS_CPP_VERSION?=0bf4a18b22e8bb8718d95294e9f7f45c0d4270a4
# qwen3-tts.cpp version
QWEN3TTS_REPO?=https://github.com/predict-woo/qwen3-tts.cpp
QWEN3TTS_CPP_VERSION?=136e5d36c17083da0321fd96512dc7b263f94a44
SO_TARGET?=libgoqwen3ttscpp.so
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
@@ -49,9 +49,9 @@ ifeq ($(BUILD_TYPE),sycl_f32)
-DCMAKE_CXX_COMPILER=icpx
endif
sources/qwentts.cpp:
mkdir -p sources/qwentts.cpp
cd sources/qwentts.cpp && \
sources/qwen3-tts.cpp:
mkdir -p sources/qwen3-tts.cpp
cd sources/qwen3-tts.cpp && \
git init && \
git remote add origin $(QWEN3TTS_REPO) && \
git fetch origin && \
@@ -78,7 +78,7 @@ package: qwen3-tts-cpp
build: package
clean: purge
rm -rf libgoqwen3ttscpp*.so package sources/qwentts.cpp qwen3-tts-cpp
rm -rf libgoqwen3ttscpp*.so package sources/qwen3-tts.cpp qwen3-tts-cpp
purge:
rm -rf build*
@@ -88,24 +88,24 @@ purge:
# Build all variants (Linux only)
ifeq ($(UNAME_S),Linux)
libgoqwen3ttscpp-avx.so: sources/qwentts.cpp
libgoqwen3ttscpp-avx.so: sources/qwen3-tts.cpp
$(info ${GREEN}I qwen3-tts-cpp build info:avx${RESET})
SO_TARGET=libgoqwen3ttscpp-avx.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) libgoqwen3ttscpp-custom
rm -rf build-libgoqwen3ttscpp-avx.so
libgoqwen3ttscpp-avx2.so: sources/qwentts.cpp
libgoqwen3ttscpp-avx2.so: sources/qwen3-tts.cpp
$(info ${GREEN}I qwen3-tts-cpp build info:avx2${RESET})
SO_TARGET=libgoqwen3ttscpp-avx2.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on -DGGML_BMI2=on" $(MAKE) libgoqwen3ttscpp-custom
rm -rf build-libgoqwen3ttscpp-avx2.so
libgoqwen3ttscpp-avx512.so: sources/qwentts.cpp
libgoqwen3ttscpp-avx512.so: sources/qwen3-tts.cpp
$(info ${GREEN}I qwen3-tts-cpp build info:avx512${RESET})
SO_TARGET=libgoqwen3ttscpp-avx512.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on -DGGML_BMI2=on" $(MAKE) libgoqwen3ttscpp-custom
rm -rf build-libgoqwen3ttscpp-avx512.so
endif
# Build fallback variant (all platforms)
libgoqwen3ttscpp-fallback.so: sources/qwentts.cpp
libgoqwen3ttscpp-fallback.so: sources/qwen3-tts.cpp
$(info ${GREEN}I qwen3-tts-cpp build info:fallback${RESET})
SO_TARGET=libgoqwen3ttscpp-fallback.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) libgoqwen3ttscpp-custom
rm -rf build-libgoqwen3ttscpp-fallback.so

View File

@@ -1,128 +0,0 @@
package main
import (
"bytes"
"encoding/binary"
"fmt"
"os"
"runtime"
"github.com/go-audio/audio"
"github.com/go-audio/wav"
)
const qwen3ttsSampleRate = 24000

// wavHeader24k builds the 44-byte RIFF/WAVE header that opens a streamed
// 24 kHz, mono, 16-bit PCM payload. Both size fields carry the placeholder
// 0xFFFFFFFF because the total length is not known when streaming starts.
// It is emitted as the first chunk of TTSStream so the HTTP layer receives a
// self-describing WAV (the gRPC TTSStream path never sets Message, so the
// backend owns the header - see core/backend/tts.go:ModelTTSStream).
func wavHeader24k() []byte {
	const (
		sizeUnknown   uint32 = 0xFFFFFFFF
		pcmFormat     uint16 = 1
		monoChannels  uint16 = 1
		frameBytes    uint16 = 2 // block align: one 16-bit mono sample
		bitsPerSample uint16 = 16
	)

	hdr := new(bytes.Buffer)
	// le appends each field in little-endian order; writes into an in-memory
	// buffer, so the returned error is discarded as in the original.
	le := func(fields ...any) {
		for _, field := range fields {
			_ = binary.Write(hdr, binary.LittleEndian, field)
		}
	}

	hdr.WriteString("RIFF")
	le(sizeUnknown)
	hdr.WriteString("WAVE")
	hdr.WriteString("fmt ")
	le(
		uint32(16), // Subchunk1Size
		pcmFormat,
		monoChannels,
		uint32(qwen3ttsSampleRate),   // sample rate
		uint32(qwen3ttsSampleRate*2), // byte rate = SR * blockAlign
		frameBytes,
		bitsPerSample,
	)
	hdr.WriteString("data")
	le(sizeUnknown)
	return hdr.Bytes()
}
// floatToPCM16LE clamps each sample to [-1,1] and encodes it as little-endian
// signed 16-bit PCM: two bytes per sample, scaled by 32767 and truncated
// toward zero. A nil or empty input yields an empty slice.
//
// NaN samples are encoded as silence (0). NaN fails both range comparisons,
// and converting a NaN float to an integer gives an implementation-dependent
// result in Go, so without this guard the emitted bytes could differ between
// platforms.
func floatToPCM16LE(samples []float32) []byte {
	out := make([]byte, len(samples)*2)
	for i, s := range samples {
		switch {
		case s != s: // true only for NaN
			s = 0
		case s > 1:
			s = 1
		case s < -1:
			s = -1
		}
		v := int16(s * 32767)
		out[i*2] = byte(v)        // low byte first (little-endian)
		out[i*2+1] = byte(v >> 8) // high byte
	}
	return out
}
// writeWAV24k stores samples at dst as a 24 kHz, mono, 16-bit WAV file,
// creating the file (or truncating an existing one). Every sample is clamped
// to [-1,1] and scaled by 32767 before being handed to the encoder. The
// encoder is closed before the file; whichever step fails first is returned
// wrapped with a "qwen3-tts:" prefix, and the file handle is closed on every
// path after a successful create.
func writeWAV24k(dst string, samples []float32) error {
	out, err := os.Create(dst)
	if err != nil {
		return fmt.Errorf("qwen3-tts: create %q: %w", dst, err)
	}
	enc := wav.NewEncoder(out, qwen3ttsSampleRate, 16, 1, 1)

	// Convert the float samples into the integer representation the encoder
	// consumes.
	pcm := make([]int, 0, len(samples))
	for _, sample := range samples {
		switch {
		case sample > 1:
			sample = 1
		case sample < -1:
			sample = -1
		}
		pcm = append(pcm, int(sample*32767))
	}

	frame := &audio.IntBuffer{
		Format:         &audio.Format{NumChannels: 1, SampleRate: qwen3ttsSampleRate},
		Data:           pcm,
		SourceBitDepth: 16,
	}

	if writeErr := enc.Write(frame); writeErr != nil {
		// Best-effort cleanup; the encode error is the one worth reporting.
		_ = enc.Close()
		_ = out.Close()
		return fmt.Errorf("qwen3-tts: encode WAV: %w", writeErr)
	}
	if closeErr := enc.Close(); closeErr != nil {
		_ = out.Close()
		return fmt.Errorf("qwen3-tts: finalize WAV: %w", closeErr)
	}
	return out.Close()
}
// readWAVAsFloat decodes a WAV file (any sample rate/channels) to a mono
// float32 slice in [-1,1] for use as cloning reference audio. Multi-channel
// input is downmixed by averaging the channels of each frame, and samples are
// scaled by 2^(bitDepth-1), with a bit depth of 0 treated as 16. No resampling
// is performed: qwentts expects 24 kHz, so callers should supply 24 kHz
// reference clips.
//
// It returns a wrapped error when the file cannot be opened or decoded, or
// when the decoder yields no PCM buffer.
func readWAVAsFloat(path string) ([]float32, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, fmt.Errorf("qwen3-tts: open ref %q: %w", path, err)
	}
	defer func() { _ = f.Close() }()

	dec := wav.NewDecoder(f)
	buf, err := dec.FullPCMBuffer()
	if err != nil {
		return nil, fmt.Errorf("qwen3-tts: decode ref %q: %w", path, err)
	}
	// Defensive guard: buf and buf.Format are dereferenced below, so a
	// decoder that hands back no buffer (or one without a format) alongside a
	// nil error must become an error here rather than a nil-pointer panic.
	if buf == nil || buf.Format == nil {
		return nil, fmt.Errorf("qwen3-tts: decode ref %q: no PCM data", path)
	}

	ch := int(buf.Format.NumChannels)
	if ch < 1 {
		ch = 1
	}
	bitDepth := int(buf.SourceBitDepth)
	if bitDepth == 0 {
		bitDepth = 16
	}
	// Full-scale magnitude for the source bit depth, e.g. 32768 for 16-bit.
	scale := float32(int64(1) << uint(bitDepth-1))

	// Data is interleaved by channel; integer division drops a trailing
	// partial frame.
	n := len(buf.Data) / ch
	out := make([]float32, n)
	for i := 0; i < n; i++ {
		var acc int
		for c := 0; c < ch; c++ {
			acc += buf.Data[i*ch+c]
		}
		out[i] = float32(acc) / float32(ch) / scale
	}
	return out, nil
}
// runtimeKeepAlive forwards v to runtime.KeepAlive. It is used so the GC does
// not reclaim the reference-audio slice while its backing pointer is still in
// use across the C call.
func runtimeKeepAlive(v any) {
	runtime.KeepAlive(v)
}

View File

@@ -1,54 +0,0 @@
package main
import (
"path/filepath"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)
// These specs pin the voice-selection logic in resolveRequest, in particular
// the config-level audio_path (tts.audio_path -> ModelOptions.AudioPath) being
// used as the default voice-cloning reference. No model/C library is needed:
// resolveRequest only reads the reference WAV via readWAVAsFloat (pure Go).
var _ = Describe("resolveRequest voice/clone selection", func() {
// dir is a per-spec temporary directory; refWav is the reference clip
// written into it by BeforeEach.
var dir, refWav string
// Every spec starts from a freshly written reference WAV.
BeforeEach(func() {
dir = GinkgoT().TempDir()
refWav = filepath.Join(dir, "ref.wav")
// 0.5s of non-silent 24kHz mono audio as a clone reference.
samples := make([]float32, qwen3ttsSampleRate/2)
for i := range samples {
samples[i] = 0.1
}
Expect(writeWAV24k(refWav, samples)).To(Succeed())
})
It("uses the config audio_path as the clone reference when Voice is empty", func() {
q := &Qwen3TtsCpp{audioPath: refWav}
_, _, speaker, _, ref, _, err := q.resolveRequest(&pb.TTSRequest{Text: "hi"})
Expect(err).ToNot(HaveOccurred())
Expect(speaker).To(BeEmpty())
// The decoded reference must hold every sample written by BeforeEach.
Expect(len(ref)).To(Equal(qwen3ttsSampleRate / 2))
})
It("lets a per-request audio Voice override audio_path", func() {
// other.wav holds 100 samples, so a reference length of 100 shows the
// request's Voice was used instead of audio_path.
other := filepath.Join(dir, "other.wav")
Expect(writeWAV24k(other, make([]float32, 100))).To(Succeed())
q := &Qwen3TtsCpp{audioPath: refWav}
_, _, speaker, _, ref, _, err := q.resolveRequest(&pb.TTSRequest{Text: "hi", Voice: other})
Expect(err).ToNot(HaveOccurred())
Expect(speaker).To(BeEmpty())
Expect(len(ref)).To(Equal(100))
})
It("does not trigger audio_path cloning for a named-speaker Voice", func() {
// "serena" must come back as the speaker with no reference audio, even
// though audio_path is configured.
q := &Qwen3TtsCpp{audioPath: refWav}
_, _, speaker, _, ref, _, err := q.resolveRequest(&pb.TTSRequest{Text: "hi", Voice: "serena"})
Expect(err).ToNot(HaveOccurred())
Expect(speaker).To(Equal("serena"))
Expect(ref).To(BeNil())
})
})

View File

@@ -1,191 +1,161 @@
#include "goqwen3ttscpp.h"
#include "ggml-backend.h"
#include "qwen.h"
#include "qwen3_tts.h"
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <string>
static qt_context *g_ctx = nullptr;
using namespace qwen3_tts;
static void ggml_log_cb(enum ggml_log_level level, const char *log,
void * /*data*/) {
// Global engine (loaded once, reused across requests)
static Qwen3TTS *g_engine = nullptr;
static bool g_loaded = false;
static int g_threads = 4;
static void ggml_log_cb(enum ggml_log_level level, const char *log, void *data) {
const char *level_str;
if (!log)
return;
const char *lvl = "?????";
switch (level) {
case GGML_LOG_LEVEL_DEBUG: lvl = "DEBUG"; break;
case GGML_LOG_LEVEL_INFO: lvl = "INFO"; break;
case GGML_LOG_LEVEL_WARN: lvl = "WARN"; break;
case GGML_LOG_LEVEL_ERROR: lvl = "ERROR"; break;
default: break;
case GGML_LOG_LEVEL_DEBUG:
level_str = "DEBUG";
break;
case GGML_LOG_LEVEL_INFO:
level_str = "INFO";
break;
case GGML_LOG_LEVEL_WARN:
level_str = "WARN";
break;
case GGML_LOG_LEVEL_ERROR:
level_str = "ERROR";
break;
default:
level_str = "?????";
break;
}
fprintf(stderr, "[%-5s] %s", lvl, log);
fprintf(stderr, "[%-5s] ", level_str);
fputs(log, stderr);
fflush(stderr);
}
int qt3_load(const char *talker_path, const char *codec_path, int use_fa,
int clamp_fp16) {
// Translate a two-letter language code into the language_id token used by the
// model. A null or empty code selects English (2050). A code that is not in
// the table is reported on stderr and also falls back to English.
static int language_to_id(const char *lang) {
    static const int kEnglishId = 2050;

    struct LangEntry {
        const char *code;
        int id;
    };
    static const LangEntry kLanguages[] = {
        {"en", 2050}, {"ru", 2069}, {"zh", 2055}, {"ja", 2058}, {"ko", 2064},
        {"de", 2053}, {"fr", 2061}, {"es", 2054}, {"it", 2056}, {"pt", 2057},
    };

    if (lang == nullptr || *lang == '\0') {
        return kEnglishId; // default: English
    }

    const std::string requested(lang);
    for (const LangEntry &entry : kLanguages) {
        if (requested == entry.code) {
            return entry.id;
        }
    }

    fprintf(stderr, "[qwen3-tts-cpp] Unknown language '%s', defaulting to English\n",
            lang);
    return kEnglishId;
}
int load_model(const char *model_dir, int n_threads) {
ggml_log_set(ggml_log_cb, nullptr);
ggml_backend_load_all();
if (!talker_path || talker_path[0] == '\0') {
fprintf(stderr, "[qwen3-tts-cpp] ERROR: talker_path is required\n");
if (n_threads <= 0)
n_threads = 4;
g_threads = n_threads;
fprintf(stderr, "[qwen3-tts-cpp] Loading models from %s (threads=%d)\n",
model_dir, n_threads);
g_engine = new Qwen3TTS();
if (!g_engine->load_models(model_dir)) {
fprintf(stderr, "[qwen3-tts-cpp] FATAL: failed to load models from %s\n",
model_dir);
delete g_engine;
g_engine = nullptr;
return 1;
}
if (!codec_path || codec_path[0] == '\0') {
fprintf(stderr, "[qwen3-tts-cpp] ERROR: codec_path is required\n");
return 2;
}
qt_init_params p;
qt_init_default_params(&p);
p.talker_path = talker_path;
p.codec_path = codec_path;
p.use_fa = use_fa != 0;
p.clamp_fp16 = clamp_fp16 != 0;
fprintf(stderr, "[qwen3-tts-cpp] Loading talker=%s codec=%s\n", talker_path,
codec_path);
g_ctx = qt_init(&p);
if (!g_ctx) {
fprintf(stderr, "[qwen3-tts-cpp] FATAL: qt_init failed: %s\n",
qt_last_error());
return 3;
}
fprintf(stderr, "[qwen3-tts-cpp] Model loaded (%s)\n", qt_version());
g_loaded = true;
fprintf(stderr, "[qwen3-tts-cpp] Models loaded successfully\n");
return 0;
}
// Fill a qt_tts_params from the flat wrapper arguments. Unset/zero scalars keep
// the qt defaults (temperature 0.9, top_k 50, top_p 1.0, rep 1.05, max 2048).
static void fill_params(qt_tts_params *tp, const char *text, const char *lang,
const char *instruct, const char *speaker,
const float *ref_samples, int ref_n,
const char *ref_text, long long seed, float temperature,
int top_k, float top_p, float repetition_penalty,
int max_new_tokens) {
qt_tts_default_params(tp);
tp->text = text ? text : "";
if (lang && lang[0] != '\0')
tp->lang = lang; // else keep default NULL -> auto
if (instruct && instruct[0] != '\0')
tp->instruct = instruct;
if (speaker && speaker[0] != '\0')
tp->speaker = speaker;
if (ref_samples && ref_n > 0) {
tp->ref_audio_24k = ref_samples;
tp->ref_n_samples = ref_n;
if (ref_text && ref_text[0] != '\0')
tp->ref_text = ref_text;
}
if (seed >= 0)
tp->seed = (int64_t)seed; // else default -1 (random)
if (temperature > 0.0f)
tp->temperature = temperature;
if (top_k > 0)
tp->top_k = top_k;
if (top_p > 0.0f)
tp->top_p = top_p;
if (repetition_penalty > 0.0f)
tp->repetition_penalty = repetition_penalty;
if (max_new_tokens > 0)
tp->max_new_tokens = max_new_tokens;
}
float *qt3_tts(const char *text, const char *lang, const char *instruct,
const char *speaker, const float *ref_samples, int ref_n,
const char *ref_text, long long seed, float temperature,
int top_k, float top_p, float repetition_penalty,
int max_new_tokens, int *out_n) {
if (out_n)
*out_n = 0;
if (!g_ctx) {
fprintf(stderr, "[qwen3-tts-cpp] ERROR: model not loaded\n");
return nullptr;
}
if (!text || text[0] == '\0') {
fprintf(stderr, "[qwen3-tts-cpp] ERROR: text is required\n");
return nullptr;
}
qt_tts_params tp;
fill_params(&tp, text, lang, instruct, speaker, ref_samples, ref_n,
ref_text, seed, temperature, top_k, top_p, repetition_penalty,
max_new_tokens);
qt_audio out = {0};
enum qt_status rc = qt_synthesize(g_ctx, &tp, &out);
if (rc != QT_STATUS_OK || out.n_samples <= 0 || !out.samples) {
fprintf(stderr, "[qwen3-tts-cpp] ERROR: synthesize failed (rc=%d): %s\n",
(int)rc, qt_last_error());
qt_audio_free(&out);
return nullptr;
}
// Copy into a plain malloc buffer the Go side frees via qt3_pcm_free.
size_t bytes = (size_t)out.n_samples * sizeof(float);
float *buf = (float *)malloc(bytes);
if (!buf) {
fprintf(stderr, "[qwen3-tts-cpp] ERROR: malloc(%zu) failed\n", bytes);
qt_audio_free(&out);
return nullptr;
}
memcpy(buf, out.samples, bytes);
if (out_n)
*out_n = out.n_samples;
qt_audio_free(&out);
return buf;
}
int qt3_tts_stream(const char *text, const char *lang, const char *instruct,
const char *speaker, const float *ref_samples, int ref_n,
const char *ref_text, long long seed, float temperature,
int top_k, float top_p, float repetition_penalty,
int max_new_tokens, qt3_chunk_cb cb, void *user_data) {
if (!g_ctx) {
fprintf(stderr, "[qwen3-tts-cpp] ERROR: model not loaded\n");
int synthesize(const char *text, const char *ref_audio_path, const char *dst,
const char *language, float temperature, float top_p,
int top_k, float repetition_penalty, int max_audio_tokens,
int n_threads) {
if (!g_loaded || !g_engine) {
fprintf(stderr, "[qwen3-tts-cpp] ERROR: models not loaded\n");
return 1;
}
if (!cb) {
fprintf(stderr, "[qwen3-tts-cpp] ERROR: stream callback is null\n");
if (!text || !dst) {
fprintf(stderr, "[qwen3-tts-cpp] ERROR: text and dst are required\n");
return 2;
}
if (!text || text[0] == '\0') {
fprintf(stderr, "[qwen3-tts-cpp] ERROR: text is required\n");
tts_params params;
params.max_audio_tokens = max_audio_tokens > 0 ? max_audio_tokens : 4096;
params.temperature = temperature;
params.top_p = top_p;
params.top_k = top_k;
params.repetition_penalty = repetition_penalty;
params.n_threads = n_threads > 0 ? n_threads : g_threads;
params.language_id = language_to_id(language);
fprintf(stderr, "[qwen3-tts-cpp] Synthesizing: text='%.50s%s', lang_id=%d, "
"temp=%.2f, threads=%d\n",
text, (strlen(text) > 50 ? "..." : ""), params.language_id,
temperature, params.n_threads);
tts_result result;
bool has_ref = ref_audio_path && ref_audio_path[0] != '\0';
if (has_ref) {
fprintf(stderr, "[qwen3-tts-cpp] Voice cloning with ref: %s\n",
ref_audio_path);
result = g_engine->synthesize_with_voice(text, ref_audio_path, params);
} else {
result = g_engine->synthesize(text, params);
}
if (!result.success) {
fprintf(stderr, "[qwen3-tts-cpp] ERROR: synthesis failed: %s\n",
result.error_msg.c_str());
return 3;
}
int n_samples = (int)result.audio.size();
if (n_samples == 0) {
fprintf(stderr, "[qwen3-tts-cpp] ERROR: synthesis produced no samples\n");
return 4;
}
qt_tts_params tp;
fill_params(&tp, text, lang, instruct, speaker, ref_samples, ref_n,
ref_text, seed, temperature, top_k, top_p, repetition_penalty,
max_new_tokens);
// qt_audio_chunk_cb has the identical signature to qt3_chunk_cb
// (bool vs int return are ABI-compatible; non-zero == true).
tp.on_chunk = (qt_audio_chunk_cb)cb;
tp.on_chunk_user_data = user_data;
qt_audio out = {0}; // stays empty in streaming mode
enum qt_status rc = qt_synthesize(g_ctx, &tp, &out);
qt_audio_free(&out);
if (rc != QT_STATUS_OK && rc != QT_STATUS_CANCELLED) {
fprintf(stderr, "[qwen3-tts-cpp] ERROR: stream synth failed (rc=%d): %s\n",
(int)rc, qt_last_error());
return 3;
fprintf(stderr,
"[qwen3-tts-cpp] Synthesis done: %d samples (%.2fs @ 24kHz)\n",
n_samples, (float)n_samples / 24000.0f);
if (!save_audio_file(dst, result.audio, result.sample_rate)) {
fprintf(stderr, "[qwen3-tts-cpp] ERROR: failed to write %s\n", dst);
return 5;
}
fprintf(stderr, "[qwen3-tts-cpp] Wrote %s\n", dst);
return 0;
}
void qt3_pcm_free(float *p) { free(p); }
void qt3_unload(void) {
if (g_ctx) {
qt_free(g_ctx);
g_ctx = nullptr;
}
}
int qt3_n_speakers(void) { return g_ctx ? qt_n_speakers(g_ctx) : 0; }
const char *qt3_speaker_name(int i) {
return g_ctx ? qt_speaker_name(g_ctx, i) : nullptr;
}

View File

@@ -1,47 +1,12 @@
#pragma once
#include <cstddef>
#include <cstdint>
extern "C" {
// Streaming PCM chunk callback. samples is mono float PCM at 24 kHz, valid
// only for the duration of the call. Return non-zero to continue, 0 to abort.
typedef int (*qt3_chunk_cb)(const float *samples, int n_samples,
void *user_data);
// Load the talker + codec/tokenizer GGUFs. use_fa / clamp_fp16 map to
// qt_init_params (the qt ABI exposes no thread count; ggml uses its own
// default). Returns 0 on success, non-zero on failure.
int qt3_load(const char *talker_path, const char *codec_path, int use_fa,
int clamp_fp16);
// Synthesize to a malloc'd float PCM buffer (caller frees via qt3_pcm_free).
// The synthesis mode (base / custom_voice / voice_design) is auto-detected by
// qt from the talker GGUF; speaker is honoured only for custom_voice, instruct
// for voice_design / custom_voice, and ref_samples (+ optional ref_text) drive
// base-mode cloning. qt enforces the rules and we surface qt_last_error() on
// QT_STATUS_MODE_INVALID. Writes the sample count to *out_n. Returns NULL on
// failure (out_n set to 0).
float *qt3_tts(const char *text, const char *lang, const char *instruct,
const char *speaker, const float *ref_samples, int ref_n,
const char *ref_text, long long seed, float temperature,
int top_k, float top_p, float repetition_penalty,
int max_new_tokens, int *out_n);
// Streaming synthesis: cb is invoked per PCM chunk as audio is produced. Same
// param semantics as qt3_tts. Returns 0 on success.
int qt3_tts_stream(const char *text, const char *lang, const char *instruct,
const char *speaker, const float *ref_samples, int ref_n,
const char *ref_text, long long seed, float temperature,
int top_k, float top_p, float repetition_penalty,
int max_new_tokens, qt3_chunk_cb cb, void *user_data);
// Free a buffer returned by qt3_tts.
void qt3_pcm_free(float *p);
// Release the qt context.
void qt3_unload(void);
// Named-speaker introspection (custom_voice models). Returns 0 / NULL when no
// model is loaded or the index is out of range.
int qt3_n_speakers(void);
const char *qt3_speaker_name(int i);
int load_model(const char *model_dir, int n_threads);
int synthesize(const char *text, const char *ref_audio_path, const char *dst,
const char *language, float temperature, float top_p,
int top_k, float repetition_penalty, int max_audio_tokens,
int n_threads);
}

View File

@@ -1,95 +0,0 @@
package main
import (
"math"
"os"
"strings"
"github.com/ebitengine/purego"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)
// ttsReq builds a pb.TTSRequest from the given text, voice, optional language
// pointer and destination path; every other field keeps its zero value.
func ttsReq(text, voice string, lang *string, dst string) *pb.TTSRequest {
	req := new(pb.TTSRequest)
	req.Text = text
	req.Voice = voice
	req.Language = lang
	req.Dst = dst
	return req
}
// End-to-end specs for the qwen3-tts-cpp backend. They need real model files
// and the native library, and are skipped unless QWEN3TTS_MODEL and
// QWEN3TTS_CODEC are set.
var _ = Describe("qwen3-tts-cpp e2e", Label("e2e"), func() {
// loaded is shared by all specs in this container so the native library is
// opened and the model loaded only once.
var loaded bool
BeforeEach(func() {
modelPath := os.Getenv("QWEN3TTS_MODEL")
codecPath := os.Getenv("QWEN3TTS_CODEC")
if modelPath == "" || codecPath == "" {
Skip("QWEN3TTS_MODEL / QWEN3TTS_CODEC not set; skipping e2e")
}
if !loaded {
// QWEN3TTS_LIBRARY overrides which shared library is opened; the
// fallback build in the current directory is the default.
lib := os.Getenv("QWEN3TTS_LIBRARY")
if lib == "" {
lib = "./libgoqwen3ttscpp-fallback.so"
}
h, err := purego.Dlopen(lib, purego.RTLD_NOW|purego.RTLD_GLOBAL)
Expect(err).ToNot(HaveOccurred())
// Bind the package-level function variables to the exported C symbols.
purego.RegisterLibFunc(&CppLoad, h, "qt3_load")
purego.RegisterLibFunc(&CppTTS, h, "qt3_tts")
purego.RegisterLibFunc(&CppTTSStream, h, "qt3_tts_stream")
purego.RegisterLibFunc(&CppPCMFree, h, "qt3_pcm_free")
purego.RegisterLibFunc(&CppUnload, h, "qt3_unload")
// Trailing arguments are use_fa=1, clamp_fp16=0; 0 means success.
Expect(CppLoad(modelPath, codecPath, 1, 0)).To(Equal(0))
loaded = true
}
})
It("synthesizes a WAV file via TTS", func() {
b := &Qwen3TtsCpp{opts: loadOptions{seed: 42, useFA: true}}
dst := GinkgoT().TempDir() + "/out.wav"
lang := "english"
err := b.TTS(ttsReq("Hello world.", "", &lang, dst))
Expect(err).ToNot(HaveOccurred())
fi, err := os.Stat(dst)
Expect(err).ToNot(HaveOccurred())
// More than 44 bytes - presumably the bare WAV header size - means some
// audio payload was written.
Expect(fi.Size()).To(BeNumerically(">", int64(44)))
})
It("streams audio chunks via TTSStream", func() {
b := &Qwen3TtsCpp{opts: loadOptions{seed: 42, useFA: true}}
results := make(chan []byte, 1024)
lang := "english"
done := make(chan error, 1)
// Run the stream in the background and drain results on this goroutine.
go func() { done <- b.TTSStream(ttsReq("Hello there, streaming test.", "", &lang, ""), results) }()
var chunks int
var first []byte
// NOTE(review): this loop ends only when results is closed - presumably
// by TTSStream; confirm against its implementation.
for c := range results {
if chunks == 0 {
first = c
}
chunks++
}
Expect(<-done).ToNot(HaveOccurred())
// At least two chunks are expected, and the first must start with a
// RIFF/WAVE header.
Expect(chunks).To(BeNumerically(">=", 2))
Expect(string(first[0:4])).To(Equal("RIFF"))
Expect(strings.HasPrefix(string(first[8:12]), "WAVE")).To(BeTrue())
})
It("clones a voice from the config audio_path reference", func() {
// 1s of 24kHz mono audio as a clone reference; the base model carries
// a speaker encoder, so audio_path drives x-vector voice cloning.
ref := GinkgoT().TempDir() + "/ref.wav"
samples := make([]float32, qwen3ttsSampleRate)
for i := range samples {
samples[i] = float32(0.05 * math.Sin(float64(i)*0.06))
}
Expect(writeWAV24k(ref, samples)).To(Succeed())
b := &Qwen3TtsCpp{opts: loadOptions{seed: 42, useFA: true}, audioPath: ref}
dst := GinkgoT().TempDir() + "/clone.wav"
lang := "english"
// Empty Voice -> the config audio_path is used as the clone reference.
Expect(b.TTS(ttsReq("Cloned voice test.", "", &lang, dst))).To(Succeed())
fi, err := os.Stat(dst)
Expect(err).ToNot(HaveOccurred())
Expect(fi.Size()).To(BeNumerically(">", int64(44)))
})
})

View File

@@ -5,225 +5,108 @@ import (
"os"
"path/filepath"
"strings"
"sync"
"unsafe"
"github.com/ebitengine/purego"
"github.com/mudler/LocalAI/pkg/grpc/base"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
)
var (
// qt3_load(talker_path, codec_path, use_fa, clamp_fp16) int
CppLoad func(talkerPath, codecPath string, useFA, clampFP16 int) int
// qt3_tts(text, lang, instruct, speaker, ref_samples, ref_n, ref_text,
// seed, temperature, top_k, top_p, rep_pen, max_new, out_n) -> float*
CppTTS func(text, lang, instruct, speaker string, refSamples unsafe.Pointer,
refN int, refText string, seed int64, temperature float32, topK int,
topP, repPen float32, maxNew int, outN unsafe.Pointer) uintptr
// qt3_tts_stream(..., cb, user) int
CppTTSStream func(text, lang, instruct, speaker string, refSamples unsafe.Pointer,
refN int, refText string, seed int64, temperature float32, topK int,
topP, repPen float32, maxNew int, cb uintptr, user uintptr) int
CppPCMFree func(ptr uintptr)
CppUnload func()
CppLoadModel func(modelDir string, nThreads int) int
CppSynthesize func(text, refAudioPath, dst, language string,
temperature, topP float32, topK int,
repetitionPenalty float32, maxAudioTokens, nThreads int) int
)
type Qwen3TtsCpp struct {
base.SingleThread
opts loadOptions
// audioPath is the model-config reference voice (tts.audio_path), the
// default clone reference when a request omits an audio Voice.
audioPath string
threads int
}
// languageNameAliases maps common full language names to the canonical
// two-letter code understood by the C++ language_to_id table.
// normalizeLanguage consults it after lowercasing the input and stripping any
// region/locale suffix, so keys must be lowercase and suffix-free.
var languageNameAliases = map[string]string{
"english": "en",
"russian": "ru",
"chinese": "zh",
"japanese": "ja",
"korean": "ko",
"german": "de",
"french": "fr",
"spanish": "es",
"italian": "it",
"portuguese": "pt",
}
// normalizeLanguage coerces a caller-supplied language into the canonical code
// the model expects. It lowercases, trims, strips any region/locale suffix
// (en-US, en_US, ja.JP -> en/ja), and resolves common full names (english -> en).
// An empty input stays empty so the C++ side applies its English default; an
// unrecognized value is returned normalized so C++ can log it and default.
func normalizeLanguage(lang string) string {
lang = strings.ToLower(strings.TrimSpace(lang))
if lang == "" {
return ""
}
// Strip region/locale suffix: keep the segment before the first separator.
if i := strings.IndexAny(lang, "-_."); i >= 0 {
lang = lang[:i]
}
if code, ok := languageNameAliases[lang]; ok {
return code
}
return lang
}
func (q *Qwen3TtsCpp) Load(opts *pb.ModelOptions) error {
model := opts.ModelFile
if model == "" {
model = opts.ModelPath
}
if !filepath.IsAbs(model) && opts.ModelPath != "" {
model = filepath.Join(opts.ModelPath, model)
// ModelFile is the model directory path (containing GGUF files)
modelDir := opts.ModelFile
if modelDir == "" {
modelDir = opts.ModelPath
}
q.opts = parseOptions(opts.Options)
// Resolve the codec/tokenizer GGUF: explicit option, else auto-discover a
// *tokenizer*.gguf sibling of the talker model.
codec := q.opts.codecPath
if codec != "" && !filepath.IsAbs(codec) {
codec = filepath.Join(filepath.Dir(model), codec)
}
if codec == "" {
codec = discoverTokenizer(filepath.Dir(model))
}
if codec == "" {
return fmt.Errorf("qwen3-tts: no codec/tokenizer GGUF found; set option 'tokenizer:<file>'")
}
q.opts.codecPath = codec
q.audioPath = opts.AudioPath
if q.audioPath != "" && !filepath.IsAbs(q.audioPath) {
q.audioPath = filepath.Join(filepath.Dir(model), q.audioPath)
// Resolve relative paths
if !filepath.IsAbs(modelDir) && opts.ModelPath != "" {
modelDir = filepath.Join(opts.ModelPath, modelDir)
}
useFA := boolToInt(q.opts.useFA)
clamp := boolToInt(q.opts.clampFP16)
fmt.Fprintf(os.Stderr, "[qwen3-tts-cpp] Load talker=%s codec=%s use_fa=%d clamp_fp16=%d\n",
model, codec, useFA, clamp)
if rc := CppLoad(model, codec, useFA, clamp); rc != 0 {
return fmt.Errorf("qwen3-tts: failed to load model (rc=%d)", rc)
threads := int(opts.Threads)
if threads <= 0 {
threads = 4
}
q.threads = threads
fmt.Fprintf(os.Stderr, "[qwen3-tts-cpp] Loading models from: %s (threads=%d)\n", modelDir, threads)
if ret := CppLoadModel(modelDir, threads); ret != 0 {
return fmt.Errorf("failed to load qwen3-tts model (error code: %d)", ret)
}
return nil
}
// discoverTokenizer returns the first *tokenizer*.gguf in dir, or "".
func discoverTokenizer(dir string) string {
entries, err := os.ReadDir(dir)
if err != nil {
return ""
}
for _, e := range entries {
name := strings.ToLower(e.Name())
if strings.Contains(name, "tokenizer") && strings.HasSuffix(name, ".gguf") {
return filepath.Join(dir, e.Name())
}
}
return ""
}
// boolToInt converts a Go bool into the 0/1 integer form: 1 for true, 0 for
// false.
func boolToInt(b bool) int {
	var flag int
	if b {
		flag = 1
	}
	return flag
}
// optStr dereferences an optional string, mapping a nil pointer to "".
func optStr(p *string) string {
	if p != nil {
		return *p
	}
	return ""
}
// resolveRequest derives every synthesis input from a TTSRequest.
//
// Results:
//   - lang:     req.Language passed through normalizeLanguage ("" when unset)
//   - instruct: req.Instructions ("" when unset)
//   - speaker:  named speaker reported by resolveVoice(req.Voice)
//   - refText:  the "ref_text" entry of req.Params ("" when absent)
//   - ref:      samples read from the clone-reference WAV, nil when there is
//     no reference
//   - s:        sampling parameters from req.Params, seeded by default with
//     the load-time seed option
//   - err:      the readWAVAsFloat error, if reading the reference fails; the
//     remaining results are then only partially populated
//
// When the request names neither a speaker nor a reference file, the
// model-config audio path (q.audioPath) is used as the clone reference.
func (q *Qwen3TtsCpp) resolveRequest(req *pb.TTSRequest) (lang, instruct, speaker, refText string, ref []float32, s sampling, err error) {
	lang = normalizeLanguage(optStr(req.Language))
	instruct = optStr(req.Instructions)

	speaker, clonePath := resolveVoice(req.Voice)
	useConfigRef := clonePath == "" && speaker == "" && q.audioPath != ""
	if useConfigRef {
		// No per-request voice: fall back to the config clone reference.
		clonePath = q.audioPath
	}

	if clonePath != "" {
		if ref, err = readWAVAsFloat(clonePath); err != nil {
			return
		}
	}

	// Indexing a nil map is safe and yields "", so no nil guard is needed.
	refText = req.Params["ref_text"]
	s = parseSampling(req.Params, q.opts.seed)
	return
}
func (q *Qwen3TtsCpp) TTS(req *pb.TTSRequest) error {
if req.Dst == "" {
return fmt.Errorf("qwen3-tts: TTS requires a destination path")
}
if req.Text == "" {
return fmt.Errorf("qwen3-tts: TTS requires text")
}
lang, instruct, speaker, refText, ref, s, err := q.resolveRequest(req)
if err != nil {
return err
}
var refPtr unsafe.Pointer
if len(ref) > 0 {
refPtr = unsafe.Pointer(&ref[0])
text := req.Text
voice := req.Voice // reference audio path for voice cloning (empty = no cloning)
dst := req.Dst
language := ""
if req.Language != nil {
language = normalizeLanguage(*req.Language)
}
var n int32
ptr := CppTTS(req.Text, lang, instruct, speaker, refPtr, len(ref), refText,
s.seed, s.temperature, s.topK, s.topP, s.repPen, s.maxNew, unsafe.Pointer(&n))
runtimeKeepAlive(ref)
if ptr == 0 {
return fmt.Errorf("qwen3-tts: synthesis failed")
}
// Register the free as soon as we own a non-null buffer, so the n<=0 guard
// below cannot leak it (defensive: the C contract returns NULL on failure).
defer CppPCMFree(ptr)
if n <= 0 {
return fmt.Errorf("qwen3-tts: synthesis produced no samples")
}
src := unsafe.Slice((*float32)(unsafe.Pointer(ptr)), int(n)) //nolint:govet // C-allocated PCM, copied out before free
out := make([]float32, int(n))
copy(out, src)
return writeWAV24k(req.Dst, out)
}
// Synthesis parameters with sensible defaults
temperature := float32(0.9)
topP := float32(0.8)
topK := 50
repetitionPenalty := float32(1.05)
maxAudioTokens := 4096
// streamState carries the active TTSStream channel to the single shared C
// callback. base.SingleThread serializes TTS/TTSStream, so one global slot is
// safe and avoids leaking a purego callback per request (purego callbacks
// cannot be freed and are capped).
var (
streamMu sync.Mutex
streamChan chan []byte
streamCbOnce sync.Once
streamCbPtr uintptr
)
// streamCallback is the single callback handed to the C streaming entry
// point. It converts each PCM chunk to 16-bit little-endian bytes via
// floatToPCM16LE and sends it on streamChan.
//
// Chunks are dropped (but streaming continues) when the sample pointer is
// nil, the count is not positive, or no stream channel is currently set.
// The function always returns 1, which the original code labels "continue".
func streamCallback(samples *float32, nSamples int32, _ uintptr) uintptr {
	const keepGoing uintptr = 1

	deliverable := nSamples > 0 && samples != nil && streamChan != nil
	if !deliverable {
		return keepGoing
	}

	// Copy out of C memory before returning control to the caller.
	chunk := make([]float32, int(nSamples))
	copy(chunk, unsafe.Slice(samples, int(nSamples)))
	streamChan <- floatToPCM16LE(chunk)
	return keepGoing
}
func (q *Qwen3TtsCpp) TTSStream(req *pb.TTSRequest, results chan []byte) error {
defer close(results)
if req.Text == "" {
return fmt.Errorf("qwen3-tts: TTSStream requires text")
if ret := CppSynthesize(text, voice, dst, language,
temperature, topP, topK, repetitionPenalty,
maxAudioTokens, q.threads); ret != 0 {
return fmt.Errorf("failed to synthesize audio (error code: %d)", ret)
}
streamCbOnce.Do(func() {
streamCbPtr = purego.NewCallback(streamCallback)
})
lang, instruct, speaker, refText, ref, s, err := q.resolveRequest(req)
if err != nil {
return err
}
var refPtr unsafe.Pointer
if len(ref) > 0 {
refPtr = unsafe.Pointer(&ref[0])
}
// Emit the WAV header first so the HTTP layer gets a self-describing stream.
results <- wavHeader24k()
streamMu.Lock()
streamChan = results
rc := CppTTSStream(req.Text, lang, instruct, speaker, refPtr, len(ref), refText,
s.seed, s.temperature, s.topK, s.topP, s.repPen, s.maxNew, streamCbPtr, 0)
streamChan = nil
streamMu.Unlock()
runtimeKeepAlive(ref)
if rc != 0 {
return fmt.Errorf("qwen3-tts: streaming synthesis failed (rc=%d)", rc)
}
return nil
}

View File

@@ -0,0 +1,53 @@
package main
import (
"testing"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)
// TestLanguageNormalization is the `go test` entry point for this package's
// Ginkgo specs: it registers Gomega's Fail handler and runs the suite under
// the name "qwen3-tts-cpp language normalization".
func TestLanguageNormalization(t *testing.T) {
RegisterFailHandler(Fail)
RunSpecs(t, "qwen3-tts-cpp language normalization")
}
// Table-driven spec for normalizeLanguage: every Entry is
// (description, input, expected) and asserts that normalizeLanguage(input)
// equals expected. The groups below cover pass-through of canonical codes,
// case folding, whitespace trimming, region/locale suffix stripping,
// full-name aliases, empty input, and unknown values.
var _ = Describe("normalizeLanguage", func() {
DescribeTable("maps caller input to the canonical model language code",
func(input, expected string) {
Expect(normalizeLanguage(input)).To(Equal(expected))
},
// Canonical codes pass through unchanged
Entry("canonical en", "en", "en"),
Entry("canonical zh", "zh", "zh"),
Entry("canonical pt", "pt", "pt"),
// Case-insensitive
Entry("uppercase", "EN", "en"),
Entry("mixed case", "Ja", "ja"),
// Surrounding whitespace
Entry("trims whitespace", " en ", "en"),
// Region/locale stripping
Entry("BCP-47 region", "en-US", "en"),
Entry("underscore region", "en_US", "en"),
Entry("dotted locale", "ja.JP", "ja"),
Entry("region + case", "ZH-CN", "zh"),
// Full-name aliases
Entry("english name", "english", "en"),
Entry("chinese name cased", "Chinese", "zh"),
Entry("japanese name", "japanese", "ja"),
Entry("russian name", "russian", "ru"),
Entry("portuguese name", "portuguese", "pt"),
// Empty stays empty (C++ applies the English default)
Entry("empty", "", ""),
Entry("whitespace only", " ", ""),
// Unknown values pass through normalized so C++ can log + default
Entry("unknown code", "klingon", "klingon"),
Entry("unknown with region", "xx-YY", "xx"),
)
})

View File

@@ -19,25 +19,24 @@ type LibFuncs struct {
}
func main() {
// Get library name from environment variable, default to fallback
libName := os.Getenv("QWEN3TTS_LIBRARY")
if libName == "" {
libName = "./libgoqwen3ttscpp-fallback.so"
}
lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
gosd, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
if err != nil {
panic(err)
}
libFuncs := []LibFuncs{
{&CppLoad, "qt3_load"},
{&CppTTS, "qt3_tts"},
{&CppTTSStream, "qt3_tts_stream"},
{&CppPCMFree, "qt3_pcm_free"},
{&CppUnload, "qt3_unload"},
{&CppLoadModel, "load_model"},
{&CppSynthesize, "synthesize"},
}
for _, lf := range libFuncs {
purego.RegisterLibFunc(lf.FuncPtr, lib, lf.Name)
purego.RegisterLibFunc(lf.FuncPtr, gosd, lf.Name)
}
flag.Parse()

View File

@@ -1,161 +0,0 @@
package main
import (
"strconv"
"strings"
)
// loadOptions holds the parsed model-level options.
// It is produced by parseOptions from the backend "key:value" option list.
type loadOptions struct {
// codecPath is the value of the "tokenizer" (or "codec") option; empty
// when neither was supplied.
codecPath string
// useFA is the "use_fa" option; parseOptions defaults it to true.
useFA bool
// clampFP16 is the "clamp_fp16" option; false unless set.
clampFP16 bool
// seed is the "seed" option; parseOptions defaults it to -1.
seed int64
}
// sampling holds per-request generation parameters with qt defaults applied.
// It is built by parseSampling; the defaults quoted below are the ones
// parseSampling uses when a request parameter is absent or unparsable.
type sampling struct {
// temperature comes from the "temperature" param (default 0.9).
temperature float32
// topK comes from the "top_k" param (default 50).
topK int
// topP comes from the "top_p" param (default 1.0).
topP float32
// repPen comes from the "repetition_penalty" param (default 1.05).
repPen float32
// maxNew comes from the "max_new_tokens" param (default 2048).
maxNew int
// seed comes from the "seed" param; its default is supplied by the caller
// of parseSampling.
seed int64
}
// splitOption splits a "key:value" option at its first colon and trims
// whitespace from both halves. ok is false (and key/value are empty) when the
// option contains no colon at all.
func splitOption(o string) (key, value string, ok bool) {
	halves := strings.SplitN(o, ":", 2)
	if len(halves) != 2 {
		return "", "", false
	}
	key = strings.TrimSpace(halves[0])
	value = strings.TrimSpace(halves[1])
	return key, value, true
}
// parseBool reports whether an option value means "true".
//
// It accepts "1" and the word "true" in any letter case ("true", "True",
// "TRUE"), ignoring surrounding whitespace. Everything else, including the
// empty string, is false. Matching case-insensitively keeps a value such as
// "use_fa:True" from being silently read as false.
func parseBool(v string) bool {
	v = strings.TrimSpace(v)
	return v == "1" || strings.EqualFold(v, "true")
}
// parseOptions builds loadOptions from the backend "key:value" option slice.
//
// Recognized keys:
//   - "tokenizer" / "codec": codec path
//   - "use_fa":              boolean, default true
//   - "clamp_fp16":          boolean, default false
//   - "seed":                base-10 int64, default -1; an unparsable value
//     leaves the current seed untouched
//
// Entries without a colon and entries with an unknown key are ignored. When
// a key is repeated, the last occurrence wins.
func parseOptions(opts []string) loadOptions {
	cfg := loadOptions{useFA: true, seed: -1}
	for _, entry := range opts {
		name, val, valid := splitOption(entry)
		if !valid {
			continue
		}
		switch name {
		case "tokenizer", "codec":
			cfg.codecPath = val
		case "use_fa":
			cfg.useFA = parseBool(val)
		case "clamp_fp16":
			cfg.clampFP16 = parseBool(val)
		case "seed":
			parsed, err := strconv.ParseInt(val, 10, 64)
			if err == nil {
				cfg.seed = parsed
			}
		}
	}
	return cfg
}
// languageAliases resolves language codes and full names (lower-cased, with
// any locale suffix already removed) to the upstream qwentts language name.
// "auto" resolves to "" so the engine auto-detects, as empty input does.
var languageAliases = map[string]string{
	"auto": "",

	"chinese":  "chinese",
	"mandarin": "chinese",
	"zh":       "chinese",

	"en":      "english",
	"english": "english",

	"fr":     "french",
	"french": "french",

	"de":     "german",
	"german": "german",

	"it":      "italian",
	"italian": "italian",

	"ja":       "japanese",
	"japanese": "japanese",

	"ko":     "korean",
	"korean": "korean",

	"pt":         "portuguese",
	"portuguese": "portuguese",

	"ru":      "russian",
	"russian": "russian",

	"es":      "spanish",
	"spanish": "spanish",
}

// normalizeLanguage maps a caller-supplied language to the qwentts language
// name: the input is trimmed and lower-cased, everything from the first '-',
// '_' or '.' onwards is dropped (en-US -> en), and the remainder is looked up
// in languageAliases. Empty input stays empty (engine auto-detects); a value
// with no alias is returned in its normalized form.
func normalizeLanguage(lang string) string {
	key := strings.ToLower(strings.TrimSpace(lang))
	if key == "" {
		return key
	}

	if cut := strings.IndexAny(key, "-_."); cut != -1 {
		key = key[:cut]
	}

	name, known := languageAliases[key]
	if !known {
		return key
	}
	return name
}
// refAudioExts lists the lower-case file extensions that mark a Voice value
// as a clone-reference audio path.
var refAudioExts = []string{".wav", ".flac", ".mp3", ".ogg", ".m4a"}

// resolveVoice interprets the request Voice field.
//
// After trimming whitespace:
//   - an empty value yields no speaker and no reference;
//   - a value ending (case-insensitively) in one of refAudioExts is returned
//     as refPath, with its original casing;
//   - anything else is returned as a named speaker (custom_voice).
func resolveVoice(voice string) (speaker, refPath string) {
	trimmed := strings.TrimSpace(voice)
	if trimmed == "" {
		return
	}

	folded := strings.ToLower(trimmed)
	for i := range refAudioExts {
		if strings.HasSuffix(folded, refAudioExts[i]) {
			refPath = trimmed
			return
		}
	}

	speaker = trimmed
	return
}
// parseFloat32 parses v as a 32-bit float. It returns def when v is empty or
// when strconv.ParseFloat reports an error.
func parseFloat32(v string, def float32) float32 {
	if v != "" {
		if parsed, err := strconv.ParseFloat(v, 32); err == nil {
			return float32(parsed)
		}
	}
	return def
}
// parseInt parses v as a base-10 int. It returns def when v is empty or when
// strconv.Atoi reports an error.
func parseInt(v string, def int) int {
	if v != "" {
		if parsed, err := strconv.Atoi(v); err == nil {
			return parsed
		}
	}
	return def
}
// parseInt64 parses v as a base-10 int64. It returns def when v is empty or
// when strconv.ParseInt reports an error.
func parseInt64(v string, def int64) int64 {
	if v != "" {
		if parsed, err := strconv.ParseInt(v, 10, 64); err == nil {
			return parsed
		}
	}
	return def
}
// parseSampling reads per-request sampling params from the TTSRequest params
// map, applying qt defaults (matching qt_tts_default_params) for every key
// that is absent, empty or unparsable:
//
//	temperature 0.9, top_k 50, top_p 1.0, repetition_penalty 1.05,
//	max_new_tokens 2048, seed defaultSeed.
func parseSampling(params map[string]string, defaultSeed int64) sampling {
	// Indexing a nil map yields "", which every parse helper treats as
	// "use the default", so a nil params map needs no separate branch.
	return sampling{
		temperature: parseFloat32(params["temperature"], 0.9),
		topK:        parseInt(params["top_k"], 50),
		topP:        parseFloat32(params["top_p"], 1.0),
		repPen:      parseFloat32(params["repetition_penalty"], 1.05),
		maxNew:      parseInt(params["max_new_tokens"], 2048),
		seed:        parseInt64(params["seed"], defaultSeed),
	}
}

View File

@@ -1,136 +1,173 @@
package main
import (
"bytes"
"encoding/binary"
"context"
"os"
"os/exec"
"path/filepath"
"testing"
"time"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials/insecure"
)
func TestQwen3TtsCpp(t *testing.T) {
RegisterFailHandler(Fail)
RunSpecs(t, "qwen3-tts-cpp suite")
const (
testAddr = "localhost:50051"
startupWait = 5 * time.Second
)
// skipIfNoModel returns the model directory named by QWEN3TTS_MODEL_DIR.
// The calling test is skipped when the variable is unset, or when either
// expected GGUF file (qwen3-tts-0.6b-f16.gguf, qwen3-tts-tokenizer-f16.gguf)
// does not exist in that directory.
func skipIfNoModel(t *testing.T) string {
	t.Helper()

	modelDir := os.Getenv("QWEN3TTS_MODEL_DIR")
	if modelDir == "" {
		t.Skip("QWEN3TTS_MODEL_DIR not set, skipping test (set to directory with GGUF models)")
	}

	// Only "does not exist" counts as missing; other stat errors fall through.
	missing := func(name string) bool {
		_, err := os.Stat(filepath.Join(modelDir, name))
		return os.IsNotExist(err)
	}

	switch {
	case missing("qwen3-tts-0.6b-f16.gguf"):
		t.Skipf("TTS model file not found in %s, skipping", modelDir)
	case missing("qwen3-tts-tokenizer-f16.gguf"):
		t.Skipf("Tokenizer model file not found in %s, skipping", modelDir)
	}
	return modelDir
}
var _ = Describe("normalizeLanguage", func() {
DescribeTable("maps caller language to qwentts language names",
func(in, want string) {
Expect(normalizeLanguage(in)).To(Equal(want))
},
Entry("empty stays empty", "", ""),
Entry("auto maps to empty", "auto", ""),
Entry("english full name", "English", "english"),
Entry("english code", "en", "english"),
Entry("locale suffix stripped", "en-US", "english"),
Entry("underscore locale", "zh_CN", "chinese"),
Entry("mandarin alias", "mandarin", "chinese"),
Entry("japanese already full", "japanese", "japanese"),
Entry("unknown passes through normalized", "xx", "xx"),
// startServer launches the backend binary listening on testAddr and returns
// the running command.
//
// The binary path comes from QWEN3TTS_BINARY, defaulting to
// ./qwen3-tts-cpp; the test is skipped when that file does not exist and
// fails when the process cannot be started. The child's stdout and stderr
// are both forwarded to this process's stderr. After starting, the function
// sleeps for startupWait before returning.
func startServer(t *testing.T) *exec.Cmd {
	t.Helper()

	binPath := "./qwen3-tts-cpp"
	if fromEnv := os.Getenv("QWEN3TTS_BINARY"); fromEnv != "" {
		binPath = fromEnv
	}

	_, statErr := os.Stat(binPath)
	if os.IsNotExist(statErr) {
		t.Skipf("Backend binary not found at %s, skipping", binPath)
	}

	server := exec.Command(binPath, "--addr", testAddr)
	server.Stdout, server.Stderr = os.Stderr, os.Stderr
	if startErr := server.Start(); startErr != nil {
		t.Fatalf("Failed to start server: %v", startErr)
	}

	time.Sleep(startupWait)
	return server
}
func stopServer(cmd *exec.Cmd) {
if cmd != nil && cmd.Process != nil {
cmd.Process.Kill()
cmd.Wait()
}
}
func dialGRPC(t *testing.T) *grpc.ClientConn {
t.Helper()
conn, err := grpc.Dial(testAddr,
grpc.WithTransportCredentials(insecure.NewCredentials()),
grpc.WithDefaultCallOptions(
grpc.MaxCallRecvMsgSize(50*1024*1024),
grpc.MaxCallSendMsgSize(50*1024*1024),
),
)
})
if err != nil {
t.Fatalf("Failed to dial gRPC: %v", err)
}
return conn
}
var _ = Describe("resolveVoice", func() {
It("treats a bare token as a named speaker", func() {
sp, ref := resolveVoice("serena")
Expect(sp).To(Equal("serena"))
Expect(ref).To(BeEmpty())
})
It("treats an audio path as a clone reference (case-insensitive ext)", func() {
sp, ref := resolveVoice("/x/ref.WAV")
Expect(sp).To(BeEmpty())
Expect(ref).To(Equal("/x/ref.WAV"))
})
It("recognizes mp3/flac/ogg/m4a", func() {
for _, p := range []string{"a.mp3", "b.flac", "c.ogg", "d.m4a"} {
sp, ref := resolveVoice(p)
Expect(sp).To(BeEmpty())
Expect(ref).To(Equal(p))
}
})
It("returns empty for empty input", func() {
sp, ref := resolveVoice(" ")
Expect(sp).To(BeEmpty())
Expect(ref).To(BeEmpty())
})
})
func TestServerHealth(t *testing.T) {
cmd := startServer(t)
defer stopServer(cmd)
var _ = Describe("parseOptions", func() {
It("extracts codec, use_fa, clamp_fp16, seed", func() {
o := parseOptions([]string{
"tokenizer:tok.gguf", "use_fa:false", "clamp_fp16:true",
"seed:7", "unknown:ignored",
})
Expect(o.codecPath).To(Equal("tok.gguf"))
Expect(o.useFA).To(BeFalse())
Expect(o.clampFP16).To(BeTrue())
Expect(o.seed).To(Equal(int64(7)))
})
It("accepts codec: as an alias for tokenizer:", func() {
Expect(parseOptions([]string{"codec:c.gguf"}).codecPath).To(Equal("c.gguf"))
})
It("defaults use_fa true and seed -1", func() {
o := parseOptions(nil)
Expect(o.useFA).To(BeTrue())
Expect(o.seed).To(Equal(int64(-1)))
})
})
conn := dialGRPC(t)
defer conn.Close()
var _ = Describe("parseSampling", func() {
It("applies qt defaults when params are absent", func() {
s := parseSampling(nil, -1)
Expect(s.temperature).To(BeNumerically("~", 0.9, 1e-6))
Expect(s.topK).To(Equal(50))
Expect(s.topP).To(BeNumerically("~", 1.0, 1e-6))
Expect(s.repPen).To(BeNumerically("~", 1.05, 1e-6))
Expect(s.maxNew).To(Equal(2048))
Expect(s.seed).To(Equal(int64(-1)))
})
It("reads overrides and falls back to default seed", func() {
s := parseSampling(map[string]string{
"temperature": "0.5", "top_k": "10", "top_p": "0.8",
"repetition_penalty": "1.2", "max_new_tokens": "512",
}, 99)
Expect(s.temperature).To(BeNumerically("~", 0.5, 1e-6))
Expect(s.topK).To(Equal(10))
Expect(s.topP).To(BeNumerically("~", 0.8, 1e-6))
Expect(s.repPen).To(BeNumerically("~", 1.2, 1e-6))
Expect(s.maxNew).To(Equal(512))
Expect(s.seed).To(Equal(int64(99)))
})
It("reads an explicit seed override", func() {
Expect(parseSampling(map[string]string{"seed": "123"}, -1).seed).To(Equal(int64(123)))
})
})
client := pb.NewBackendClient(conn)
resp, err := client.Health(context.Background(), &pb.HealthMessage{})
if err != nil {
t.Fatalf("Health check failed: %v", err)
}
if string(resp.Message) != "OK" {
t.Fatalf("Expected OK, got %s", string(resp.Message))
}
}
var _ = Describe("wavHeader24k", func() {
It("emits a 44-byte streaming WAV header at 24 kHz mono 16-bit", func() {
h := wavHeader24k()
Expect(h).To(HaveLen(44))
Expect(string(h[0:4])).To(Equal("RIFF"))
Expect(string(h[8:12])).To(Equal("WAVE"))
Expect(string(h[12:16])).To(Equal("fmt "))
Expect(string(h[36:40])).To(Equal("data"))
var sampleRate uint32
Expect(binary.Read(bytes.NewReader(h[24:28]), binary.LittleEndian, &sampleRate)).To(Succeed())
Expect(sampleRate).To(Equal(uint32(24000)))
})
})
func TestLoadModel(t *testing.T) {
modelDir := skipIfNoModel(t)
cmd := startServer(t)
defer stopServer(cmd)
var _ = Describe("floatToPCM16LE", func() {
It("clamps and converts float PCM to little-endian int16 bytes", func() {
b := floatToPCM16LE([]float32{0, 1.0, -1.0, 2.0, -2.0})
Expect(b).To(HaveLen(10))
read := func(off int) int16 {
var v int16
_ = binary.Read(bytes.NewReader(b[off:off+2]), binary.LittleEndian, &v)
return v
}
Expect(read(0)).To(Equal(int16(0)))
Expect(read(2)).To(Equal(int16(32767)))
Expect(read(4)).To(Equal(int16(-32767)))
Expect(read(6)).To(Equal(int16(32767))) // clamped from 2.0
Expect(read(8)).To(Equal(int16(-32767))) // clamped from -2.0
conn := dialGRPC(t)
defer conn.Close()
client := pb.NewBackendClient(conn)
resp, err := client.LoadModel(context.Background(), &pb.ModelOptions{
ModelFile: modelDir,
Threads: 4,
})
})
if err != nil {
t.Fatalf("LoadModel failed: %v", err)
}
if !resp.Success {
t.Fatalf("LoadModel returned failure: %s", resp.Message)
}
}
// TestTTS is an end-to-end check of the backend over gRPC: it starts the
// server binary, loads the models from the directory returned by
// skipIfNoModel, requests synthesis of an English sentence into a temporary
// WAV file, and verifies that the file exists and is at least 1000 bytes.
// The test is skipped when the models (or the binary) are unavailable.
func TestTTS(t *testing.T) {
modelDir := skipIfNoModel(t)
// Scratch directory for the synthesized file; removed when the test ends.
tmpDir, err := os.MkdirTemp("", "qwen3tts-test")
if err != nil {
t.Fatal(err)
}
t.Cleanup(func() { os.RemoveAll(tmpDir) })
outputFile := filepath.Join(tmpDir, "output.wav")
cmd := startServer(t)
defer stopServer(cmd)
conn := dialGRPC(t)
defer conn.Close()
client := pb.NewBackendClient(conn)
// Load models
loadResp, err := client.LoadModel(context.Background(), &pb.ModelOptions{
ModelFile: modelDir,
Threads: 4,
})
if err != nil {
t.Fatalf("LoadModel failed: %v", err)
}
if !loadResp.Success {
t.Fatalf("LoadModel returned failure: %s", loadResp.Message)
}
// Synthesize speech
language := "en"
_, err = client.TTS(context.Background(), &pb.TTSRequest{
Text: "Hello, this is a test of the Qwen3 text to speech system.",
Dst: outputFile,
Language: &language,
})
if err != nil {
t.Fatalf("TTS failed: %v", err)
}
// Verify output file exists and has content
info, err := os.Stat(outputFile)
// A missing file gets its own message; any other stat error is reported below.
if os.IsNotExist(err) {
t.Fatal("Output audio file was not created")
}
if err != nil {
t.Fatalf("Failed to stat output file: %v", err)
}
t.Logf("Output file size: %d bytes", info.Size())
// WAV header is 44 bytes minimum; any real audio should be much larger
if info.Size() < 1000 {
t.Errorf("Output file too small (%d bytes), expected real audio data", info.Size())
}
}

View File

@@ -2,30 +2,51 @@
set -e
CURDIR=$(dirname "$(realpath $0)")
cd "$CURDIR"
echo "Running qwen3-tts-cpp backend tests..."
# Auto-download a small model pair only when QWEN3TTS_MODEL is not set.
if [ -z "$QWEN3TTS_MODEL" ]; then
MODEL_DIR="./qwen3-tts-models"
mkdir -p "$MODEL_DIR"
REPO_ID="Serveurperso/Qwen3-TTS-GGUF"
BASE_URL="https://huggingface.co/${REPO_ID}/resolve/main"
FILES=( "qwen-talker-0.6b-base-Q4_K_M.gguf" "qwen-tokenizer-12hz-Q4_K_M.gguf" )
for file in "${FILES[@]}"; do
dest="${MODEL_DIR}/${file}"
if [ -f "${dest}" ]; then
echo " [skip] ${file}"
else
echo " [download] ${file}..."
curl -L -o "${dest}" "${BASE_URL}/${file}" --progress-bar
fi
done
export QWEN3TTS_MODEL="${MODEL_DIR}/qwen-talker-0.6b-base-Q4_K_M.gguf"
export QWEN3TTS_CODEC="${MODEL_DIR}/qwen-tokenizer-12hz-Q4_K_M.gguf"
# The test requires:
# - QWEN3TTS_MODEL_DIR: path to directory containing GGUF model files
# - QWEN3TTS_BINARY: path to the qwen3-tts-cpp binary (defaults to ./qwen3-tts-cpp)
#
# Tests that require the model will be skipped if QWEN3TTS_MODEL_DIR is not set
# or the directory does not contain the required model files.
cd "$CURDIR"
# Only auto-download models when QWEN3TTS_MODEL_DIR is not explicitly set
if [ -z "$QWEN3TTS_MODEL_DIR" ]; then
export QWEN3TTS_MODEL_DIR="./qwen3-tts-models"
if [ ! -d "$QWEN3TTS_MODEL_DIR" ]; then
echo "Creating qwen3-tts-models directory for tests..."
mkdir -p "$QWEN3TTS_MODEL_DIR"
REPO_ID="endo5501/qwen3-tts.cpp"
echo "Repository: ${REPO_ID}"
echo ""
# Files to download (smallest model for testing)
FILES=(
"qwen3-tts-0.6b-f16.gguf"
"qwen3-tts-tokenizer-f16.gguf"
)
BASE_URL="https://huggingface.co/${REPO_ID}/resolve/main"
for file in "${FILES[@]}"; do
dest="${QWEN3TTS_MODEL_DIR}/${file}"
if [ -f "${dest}" ]; then
echo " [skip] ${file} (already exists)"
else
echo " [download] ${file}..."
curl -L -o "${dest}" "${BASE_URL}/${file}" --progress-bar
echo " [done] ${file}"
fi
done
fi
fi
go test -v -timeout 1200s .
# Run Go tests
go test -v -timeout 600s .
echo "All qwen3-tts-cpp tests passed."

View File

@@ -62,7 +62,7 @@ var (
shimVadConfigSetDebug func(uintptr, int32)
shimCreateVad func(uintptr, float32) uintptr
// TTS (offline, VITS/Piper and Kokoro) config
// TTS (offline, VITS) config
shimTtsConfigNew func() uintptr
shimTtsConfigFree func(uintptr)
shimTtsConfigSetVitsModel func(uintptr, string)
@@ -76,14 +76,6 @@ var (
shimTtsConfigSetDebug func(uintptr, int32)
shimTtsConfigSetProvider func(uintptr, string)
shimTtsConfigSetMaxNumSentences func(uintptr, int32)
shimTtsConfigSetKokoroModel func(uintptr, string)
shimTtsConfigSetKokoroVoices func(uintptr, string)
shimTtsConfigSetKokoroTokens func(uintptr, string)
shimTtsConfigSetKokoroDataDir func(uintptr, string)
shimTtsConfigSetKokoroDictDir func(uintptr, string)
shimTtsConfigSetKokoroLexicon func(uintptr, string)
shimTtsConfigSetKokoroLang func(uintptr, string)
shimTtsConfigSetKokoroLengthScale func(uintptr, float32)
shimCreateOfflineTts func(uintptr) uintptr
// Offline recognizer config
@@ -109,37 +101,37 @@ var (
shimCreateOfflineRecognizer func(uintptr) uintptr
// Online recognizer config
shimOnlineRecogConfigNew func() uintptr
shimOnlineRecogConfigFree func(uintptr)
shimOnlineRecogConfigSetTransducerEncoder func(uintptr, string)
shimOnlineRecogConfigSetTransducerDecoder func(uintptr, string)
shimOnlineRecogConfigSetTransducerJoiner func(uintptr, string)
shimOnlineRecogConfigSetTokens func(uintptr, string)
shimOnlineRecogConfigSetNumThreads func(uintptr, int32)
shimOnlineRecogConfigSetDebug func(uintptr, int32)
shimOnlineRecogConfigSetProvider func(uintptr, string)
shimOnlineRecogConfigSetFeatSampleRate func(uintptr, int32)
shimOnlineRecogConfigSetFeatFeatureDim func(uintptr, int32)
shimOnlineRecogConfigSetDecodingMethod func(uintptr, string)
shimOnlineRecogConfigSetEnableEndpoint func(uintptr, int32)
shimOnlineRecogConfigNew func() uintptr
shimOnlineRecogConfigFree func(uintptr)
shimOnlineRecogConfigSetTransducerEncoder func(uintptr, string)
shimOnlineRecogConfigSetTransducerDecoder func(uintptr, string)
shimOnlineRecogConfigSetTransducerJoiner func(uintptr, string)
shimOnlineRecogConfigSetTokens func(uintptr, string)
shimOnlineRecogConfigSetNumThreads func(uintptr, int32)
shimOnlineRecogConfigSetDebug func(uintptr, int32)
shimOnlineRecogConfigSetProvider func(uintptr, string)
shimOnlineRecogConfigSetFeatSampleRate func(uintptr, int32)
shimOnlineRecogConfigSetFeatFeatureDim func(uintptr, int32)
shimOnlineRecogConfigSetDecodingMethod func(uintptr, string)
shimOnlineRecogConfigSetEnableEndpoint func(uintptr, int32)
shimOnlineRecogConfigSetRule1MinTrailingSilence func(uintptr, float32)
shimOnlineRecogConfigSetRule2MinTrailingSilence func(uintptr, float32)
shimOnlineRecogConfigSetRule3MinUtteranceLength func(uintptr, float32)
shimCreateOnlineRecognizer func(uintptr) uintptr
shimCreateOnlineRecognizer func(uintptr) uintptr
// Result accessors. Pointer returns use unsafe.Pointer so Go's
// vet checker doesn't flag them — the returned memory is C-owned,
// not subject to Go GC motion.
shimWaveSampleRate func(uintptr) int32
shimWaveNumSamples func(uintptr) int32
shimWaveSamples func(uintptr) unsafe.Pointer
shimOfflineResultText func(uintptr) unsafe.Pointer
shimOnlineResultText func(uintptr) unsafe.Pointer
shimGeneratedAudioSampleRate func(uintptr) int32
shimGeneratedAudioN func(uintptr) int32
shimGeneratedAudioSamples func(uintptr) unsafe.Pointer
shimSpeechSegmentStart func(uintptr) int32
shimSpeechSegmentN func(uintptr) int32
shimWaveSampleRate func(uintptr) int32
shimWaveNumSamples func(uintptr) int32
shimWaveSamples func(uintptr) unsafe.Pointer
shimOfflineResultText func(uintptr) unsafe.Pointer
shimOnlineResultText func(uintptr) unsafe.Pointer
shimGeneratedAudioSampleRate func(uintptr) int32
shimGeneratedAudioN func(uintptr) int32
shimGeneratedAudioSamples func(uintptr) unsafe.Pointer
shimSpeechSegmentStart func(uintptr) int32
shimSpeechSegmentN func(uintptr) int32
// TTS streaming callback trampoline
shimTtsGenerateWithCallback func(tts uintptr, text string, sid int32, speed float32, cb uintptr, ud uintptr) uintptr
@@ -169,13 +161,13 @@ var (
// pointer returned by the shim or `unsafe.Pointer(&slice[0])` from Go.
var (
// VAD
sherpaVadAcceptWaveform func(vad uintptr, samples unsafe.Pointer, n int32)
sherpaVadReset func(vad uintptr)
sherpaVadFlush func(vad uintptr)
sherpaVadEmpty func(vad uintptr) int32
sherpaVadFront func(vad uintptr) uintptr
sherpaVadPop func(vad uintptr)
sherpaDestroySpeechSegment func(seg uintptr)
sherpaVadAcceptWaveform func(vad uintptr, samples unsafe.Pointer, n int32)
sherpaVadReset func(vad uintptr)
sherpaVadFlush func(vad uintptr)
sherpaVadEmpty func(vad uintptr) int32
sherpaVadFront func(vad uintptr) uintptr
sherpaVadPop func(vad uintptr)
sherpaDestroySpeechSegment func(seg uintptr)
// Wave IO
sherpaReadWave func(filename string) uintptr
@@ -183,11 +175,11 @@ var (
sherpaWriteWave func(samples unsafe.Pointer, n int32, sampleRate int32, filename string) int32
// Offline ASR
sherpaCreateOfflineStream func(rec uintptr) uintptr
sherpaDestroyOfflineStream func(stream uintptr)
sherpaAcceptWaveformOffline func(stream uintptr, sr int32, samples unsafe.Pointer, n int32)
sherpaDecodeOfflineStream func(rec uintptr, stream uintptr)
sherpaGetOfflineStreamResult func(stream uintptr) uintptr
sherpaCreateOfflineStream func(rec uintptr) uintptr
sherpaDestroyOfflineStream func(stream uintptr)
sherpaAcceptWaveformOffline func(stream uintptr, sr int32, samples unsafe.Pointer, n int32)
sherpaDecodeOfflineStream func(rec uintptr, stream uintptr)
sherpaGetOfflineStreamResult func(stream uintptr) uintptr
sherpaDestroyOfflineRecognizerResult func(result uintptr)
// Online ASR
@@ -203,21 +195,21 @@ var (
sherpaOnlineStreamInputFinished func(stream uintptr)
// TTS
sherpaOfflineTtsGenerate func(tts uintptr, text string, sid int32, speed float32) uintptr
sherpaOfflineTtsGenerate func(tts uintptr, text string, sid int32, speed float32) uintptr
sherpaDestroyOfflineTtsGeneratedAudio func(audio uintptr)
sherpaOfflineTtsSampleRate func(tts uintptr) int32
sherpaOfflineTtsSampleRate func(tts uintptr) int32
// Offline speaker diarization. Result handle owns the segment-array
// pointer returned by ResultSortByStartTime; destroy the segment
// array first, then the result, then (at backend Free()) the diarizer.
sherpaDestroyOfflineSpeakerDiarization func(sd uintptr)
sherpaOfflineSpeakerDiarizationGetSampleRate func(sd uintptr) int32
sherpaOfflineSpeakerDiarizationProcess func(sd uintptr, samples unsafe.Pointer, n int32) uintptr
sherpaOfflineSpeakerDiarizationResultGetNumSegments func(result uintptr) int32
sherpaOfflineSpeakerDiarizationResultGetNumSpeakers func(result uintptr) int32
sherpaOfflineSpeakerDiarizationResultSortByStartTime func(result uintptr) uintptr
sherpaOfflineSpeakerDiarizationDestroySegment func(segs uintptr)
sherpaDestroyOfflineSpeakerDiarizationResult func(result uintptr)
sherpaDestroyOfflineSpeakerDiarization func(sd uintptr)
sherpaOfflineSpeakerDiarizationGetSampleRate func(sd uintptr) int32
sherpaOfflineSpeakerDiarizationProcess func(sd uintptr, samples unsafe.Pointer, n int32) uintptr
sherpaOfflineSpeakerDiarizationResultGetNumSegments func(result uintptr) int32
sherpaOfflineSpeakerDiarizationResultGetNumSpeakers func(result uintptr) int32
sherpaOfflineSpeakerDiarizationResultSortByStartTime func(result uintptr) uintptr
sherpaOfflineSpeakerDiarizationDestroySegment func(segs uintptr)
sherpaDestroyOfflineSpeakerDiarizationResult func(result uintptr)
)
var (
@@ -286,14 +278,6 @@ func loadSherpaLibsOnce() error {
{&shimTtsConfigSetDebug, "sherpa_shim_tts_config_set_debug"},
{&shimTtsConfigSetProvider, "sherpa_shim_tts_config_set_provider"},
{&shimTtsConfigSetMaxNumSentences, "sherpa_shim_tts_config_set_max_num_sentences"},
{&shimTtsConfigSetKokoroModel, "sherpa_shim_tts_config_set_kokoro_model"},
{&shimTtsConfigSetKokoroVoices, "sherpa_shim_tts_config_set_kokoro_voices"},
{&shimTtsConfigSetKokoroTokens, "sherpa_shim_tts_config_set_kokoro_tokens"},
{&shimTtsConfigSetKokoroDataDir, "sherpa_shim_tts_config_set_kokoro_data_dir"},
{&shimTtsConfigSetKokoroDictDir, "sherpa_shim_tts_config_set_kokoro_dict_dir"},
{&shimTtsConfigSetKokoroLexicon, "sherpa_shim_tts_config_set_kokoro_lexicon"},
{&shimTtsConfigSetKokoroLang, "sherpa_shim_tts_config_set_kokoro_lang"},
{&shimTtsConfigSetKokoroLengthScale, "sherpa_shim_tts_config_set_kokoro_length_scale"},
{&shimCreateOfflineTts, "sherpa_shim_create_offline_tts"},
{&shimOfflineRecogConfigNew, "sherpa_shim_offline_recog_config_new"},
@@ -704,14 +688,21 @@ func (s *SherpaBackend) loadTTS(opts *pb.ModelOptions) error {
cfg := shimTtsConfigNew()
defer shimTtsConfigFree(cfg)
// Kokoro models ship a voices style file alongside the ONNX, whereas
// VITS/Piper voices do not. That presence is what tells the two model
// families apart, since both arrive as a plain *.onnx in modelDir.
if isKokoroModel(modelDir) {
s.configureKokoroTTS(cfg, opts, modelFile, modelDir)
} else {
s.configureVitsTTS(cfg, opts, modelFile, modelDir)
shimTtsConfigSetVitsModel(cfg, modelFile)
if tokensPath := filepath.Join(modelDir, "tokens.txt"); fileExists(tokensPath) {
shimTtsConfigSetVitsTokens(cfg, tokensPath)
}
if lexiconPath := filepath.Join(modelDir, "lexicon.txt"); fileExists(lexiconPath) {
shimTtsConfigSetVitsLexicon(cfg, lexiconPath)
}
if dataDir := filepath.Join(modelDir, "espeak-ng-data"); dirExists(dataDir) {
shimTtsConfigSetVitsDataDir(cfg, dataDir)
}
shimTtsConfigSetVitsNoiseScale(cfg, findOptionFloat(opts, optionTtsNoiseScale, 0.667))
shimTtsConfigSetVitsNoiseScaleW(cfg, findOptionFloat(opts, optionTtsNoiseScaleW, 0.8))
shimTtsConfigSetVitsLengthScale(cfg, findOptionFloat(opts, optionTtsLengthScale, 1.0))
threads := int32(1)
if opts.Threads != 0 {
@@ -732,80 +723,6 @@ func (s *SherpaBackend) loadTTS(opts *pb.ModelOptions) error {
return nil
}
// kokoroVoicesFile is the speaker-style bank that ships with Kokoro models and
// is absent from VITS/Piper voices; its presence is how loadTTS tells them apart.
const kokoroVoicesFile = "voices.bin"
// isKokoroModel reports whether modelDir holds a Kokoro model (a voices file
// next to the ONNX) rather than a VITS/Piper single-speaker model.
func isKokoroModel(modelDir string) bool {
return fileExists(filepath.Join(modelDir, kokoroVoicesFile))
}
// configureVitsTTS wires a VITS/Piper single-speaker model into cfg: the ONNX
// plus the optional tokens, lexicon and espeak-ng-data found beside it.
func (s *SherpaBackend) configureVitsTTS(cfg uintptr, opts *pb.ModelOptions, modelFile, modelDir string) {
shimTtsConfigSetVitsModel(cfg, modelFile)
if tokensPath := filepath.Join(modelDir, "tokens.txt"); fileExists(tokensPath) {
shimTtsConfigSetVitsTokens(cfg, tokensPath)
}
if lexiconPath := filepath.Join(modelDir, "lexicon.txt"); fileExists(lexiconPath) {
shimTtsConfigSetVitsLexicon(cfg, lexiconPath)
}
if dataDir := filepath.Join(modelDir, "espeak-ng-data"); dirExists(dataDir) {
shimTtsConfigSetVitsDataDir(cfg, dataDir)
}
shimTtsConfigSetVitsNoiseScale(cfg, findOptionFloat(opts, optionTtsNoiseScale, 0.667))
shimTtsConfigSetVitsNoiseScaleW(cfg, findOptionFloat(opts, optionTtsNoiseScaleW, 0.8))
shimTtsConfigSetVitsLengthScale(cfg, findOptionFloat(opts, optionTtsLengthScale, 1.0))
}
// configureKokoroTTS wires a Kokoro model into cfg: the ONNX, its voices bank,
// tokens, and the optional espeak-ng-data / jieba dict / lexicon assets the
// multi-lingual packs ship. A language hint comes from the `language=` option.
func (s *SherpaBackend) configureKokoroTTS(cfg uintptr, opts *pb.ModelOptions, modelFile, modelDir string) {
shimTtsConfigSetKokoroModel(cfg, modelFile)
shimTtsConfigSetKokoroVoices(cfg, filepath.Join(modelDir, kokoroVoicesFile))
if tokensPath := filepath.Join(modelDir, "tokens.txt"); fileExists(tokensPath) {
shimTtsConfigSetKokoroTokens(cfg, tokensPath)
}
if dataDir := filepath.Join(modelDir, "espeak-ng-data"); dirExists(dataDir) {
shimTtsConfigSetKokoroDataDir(cfg, dataDir)
}
if dictDir := filepath.Join(modelDir, "dict"); dirExists(dictDir) {
shimTtsConfigSetKokoroDictDir(cfg, dictDir)
}
// Multi-lingual Kokoro ships per-language lexicons; the C API takes them as
// a single comma-separated list. US and GB English overlap almost entirely,
// so pass only one (US preferred) to avoid tens of thousands of "duplicated
// word" warnings at load; non-English lexicons (e.g. zh) are additive.
var lexicons []string
addLexicon := func(name string) {
if p := filepath.Join(modelDir, name); fileExists(p) {
lexicons = append(lexicons, p)
}
}
if fileExists(filepath.Join(modelDir, "lexicon-us-en.txt")) {
addLexicon("lexicon-us-en.txt")
} else {
addLexicon("lexicon-gb-en.txt")
}
addLexicon("lexicon-zh.txt")
addLexicon("lexicon.txt")
if len(lexicons) > 0 {
shimTtsConfigSetKokoroLexicon(cfg, strings.Join(lexicons, ","))
}
if lang := findOptionValue(opts, optionLanguage, ""); lang != "" {
shimTtsConfigSetKokoroLang(cfg, lang)
}
shimTtsConfigSetKokoroLengthScale(cfg, findOptionFloat(opts, optionTtsLengthScale, 1.0))
}
func fileExists(p string) bool {
info, err := os.Stat(p)
return err == nil && !info.IsDir()
@@ -1335,7 +1252,7 @@ type ttsStreamState struct {
var (
ttsStates sync.Map // uint64 → *ttsStreamState
ttsNextID atomic.Uint64
ttsCallbackPtr uintptr // purego.NewCallback return; registered in loadSherpaLibs
ttsCallbackPtr uintptr // purego.NewCallback return; registered in loadSherpaLibs
)
// ttsStreamCallback is invoked by sherpa-onnx for each PCM chunk VITS

View File

@@ -124,20 +124,6 @@ var _ = Describe("Sherpa-ONNX", func() {
Entry("empty", "", false),
Entry("other", "other", false),
)
It("isKokoroModel detects a voices file beside the ONNX", func() {
dir, err := os.MkdirTemp("", "sherpa-kokoro-*")
Expect(err).NotTo(HaveOccurred())
defer func() { _ = os.RemoveAll(dir) }()
// A bare VITS/Piper directory (ONNX only) is not Kokoro.
Expect(os.WriteFile(filepath.Join(dir, "model.onnx"), []byte("x"), 0o600)).To(Succeed())
Expect(isKokoroModel(dir)).To(BeFalse())
// Adding the Kokoro voices bank flips detection on.
Expect(os.WriteFile(filepath.Join(dir, kokoroVoicesFile), []byte("x"), 0o600)).To(Succeed())
Expect(isKokoroModel(dir)).To(BeTrue())
})
})
Context("option parsing", func() {

View File

@@ -79,13 +79,6 @@ void sherpa_shim_tts_config_free(void *h) {
free((char *)c->model.vits.tokens);
free((char *)c->model.vits.lexicon);
free((char *)c->model.vits.data_dir);
free((char *)c->model.kokoro.model);
free((char *)c->model.kokoro.voices);
free((char *)c->model.kokoro.tokens);
free((char *)c->model.kokoro.data_dir);
free((char *)c->model.kokoro.dict_dir);
free((char *)c->model.kokoro.lexicon);
free((char *)c->model.kokoro.lang);
free((char *)c->model.provider);
free(c);
}
@@ -124,34 +117,6 @@ void sherpa_shim_tts_config_set_max_num_sentences(void *h, int32_t v) {
((SherpaOnnxOfflineTtsConfig *)h)->max_num_sentences = v;
}
// Kokoro multi-speaker / multi-lingual TTS. Distinct ONNX + a voices style
// file (voices.bin) instead of VITS' single-speaker graph; espeak-ng-data,
// lexicon and a language hint are optional refinements.
void sherpa_shim_tts_config_set_kokoro_model(void *h, const char *v) {
shim_set_str(&((SherpaOnnxOfflineTtsConfig *)h)->model.kokoro.model, v);
}
void sherpa_shim_tts_config_set_kokoro_voices(void *h, const char *v) {
shim_set_str(&((SherpaOnnxOfflineTtsConfig *)h)->model.kokoro.voices, v);
}
void sherpa_shim_tts_config_set_kokoro_tokens(void *h, const char *v) {
shim_set_str(&((SherpaOnnxOfflineTtsConfig *)h)->model.kokoro.tokens, v);
}
void sherpa_shim_tts_config_set_kokoro_data_dir(void *h, const char *v) {
shim_set_str(&((SherpaOnnxOfflineTtsConfig *)h)->model.kokoro.data_dir, v);
}
void sherpa_shim_tts_config_set_kokoro_dict_dir(void *h, const char *v) {
shim_set_str(&((SherpaOnnxOfflineTtsConfig *)h)->model.kokoro.dict_dir, v);
}
void sherpa_shim_tts_config_set_kokoro_lexicon(void *h, const char *v) {
shim_set_str(&((SherpaOnnxOfflineTtsConfig *)h)->model.kokoro.lexicon, v);
}
void sherpa_shim_tts_config_set_kokoro_lang(void *h, const char *v) {
shim_set_str(&((SherpaOnnxOfflineTtsConfig *)h)->model.kokoro.lang, v);
}
void sherpa_shim_tts_config_set_kokoro_length_scale(void *h, float v) {
((SherpaOnnxOfflineTtsConfig *)h)->model.kokoro.length_scale = v;
}
void *sherpa_shim_create_offline_tts(void *h) {
return (void *)SherpaOnnxCreateOfflineTts(
(const SherpaOnnxOfflineTtsConfig *)h);

View File

@@ -37,7 +37,7 @@ void sherpa_shim_vad_config_set_provider(void *cfg, const char *v);
void sherpa_shim_vad_config_set_debug(void *cfg, int32_t v);
void *sherpa_shim_create_vad(void *cfg, float buffer_size_seconds);
// --- Offline TTS config (VITS/Piper and Kokoro model families) ---
// --- Offline TTS config (VITS path — the only TTS family the backend uses) ---
void *sherpa_shim_tts_config_new(void);
void sherpa_shim_tts_config_free(void *cfg);
void sherpa_shim_tts_config_set_vits_model(void *cfg, const char *v);
@@ -51,14 +51,6 @@ void sherpa_shim_tts_config_set_num_threads(void *cfg, int32_t v);
void sherpa_shim_tts_config_set_debug(void *cfg, int32_t v);
void sherpa_shim_tts_config_set_provider(void *cfg, const char *v);
void sherpa_shim_tts_config_set_max_num_sentences(void *cfg, int32_t v);
void sherpa_shim_tts_config_set_kokoro_model(void *cfg, const char *v);
void sherpa_shim_tts_config_set_kokoro_voices(void *cfg, const char *v);
void sherpa_shim_tts_config_set_kokoro_tokens(void *cfg, const char *v);
void sherpa_shim_tts_config_set_kokoro_data_dir(void *cfg, const char *v);
void sherpa_shim_tts_config_set_kokoro_dict_dir(void *cfg, const char *v);
void sherpa_shim_tts_config_set_kokoro_lexicon(void *cfg, const char *v);
void sherpa_shim_tts_config_set_kokoro_lang(void *cfg, const char *v);
void sherpa_shim_tts_config_set_kokoro_length_scale(void *cfg, float v);
void *sherpa_shim_create_offline_tts(void *cfg);
// --- Offline recognizer config (Whisper / Paraformer / SenseVoice / Omnilingual) ---

View File

@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
# stablediffusion.cpp (ggml)
STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
STABLEDIFFUSION_GGML_VERSION?=276025e054555166ec419413c6748ca79986ee93
STABLEDIFFUSION_GGML_VERSION?=19bdfe22d255d5b4dff39d449318b9bc5ea2317f
CMAKE_ARGS+=-DGGML_MAX_NAME=128

View File

@@ -1,4 +0,0 @@
/supertonic
/sources/
/backend-assets/
/package/

View File

@@ -1,62 +0,0 @@
CURRENT_DIR=$(abspath ./)
GOCMD=go
ONNX_VERSION?=1.24.4
ONNX_ARCH?=x64
ONNX_OS?=linux
ifneq (,$(findstring aarch64,$(shell uname -m)))
ONNX_ARCH=aarch64
endif
ifeq ($(OS),Darwin)
ONNX_OS=osx
ifneq (,$(findstring arm64,$(shell uname -m)))
ONNX_ARCH=arm64
else
ONNX_ARCH=x86_64
endif
endif
# CUDA 12 ships as -gpu, CUDA 13 as -gpu_cuda13 (underscore). CPU has no suffix.
ifeq ($(BUILD_TYPE),cublas)
ONNX_PROVIDER=cuda
ifeq ($(CUDA_MAJOR_VERSION),13)
ONNX_VARIANT=-gpu_cuda13
else
ONNX_VARIANT=-gpu
endif
else
ONNX_VARIANT=
ONNX_PROVIDER=cpu
endif
# Download and unpack the prebuilt onnxruntime release matching
# ONNX_OS / ONNX_ARCH / ONNX_VARIANT / ONNX_VERSION into sources/onnxruntime.
#
# -f makes curl fail on an HTTP error instead of saving the error page as the
# tarball. On any failure the directory is removed again: it is the target of
# this rule, so leaving it behind would make the next `make` skip the download
# and fail later when copying sources/onnxruntime/lib/*.
sources/onnxruntime:
	mkdir -p sources/onnxruntime
	curl -fL https://github.com/microsoft/onnxruntime/releases/download/v$(ONNX_VERSION)/onnxruntime-$(ONNX_OS)-$(ONNX_ARCH)$(ONNX_VARIANT)-$(ONNX_VERSION).tgz \
		-o sources/onnxruntime/onnxruntime.tgz \
		|| { rm -rf sources/onnxruntime; exit 1; }
	( cd sources/onnxruntime && tar -xf onnxruntime.tgz --strip-components=1 && rm onnxruntime.tgz ) \
		|| { rm -rf sources/onnxruntime; exit 1; }
backend-assets/lib: sources/onnxruntime
mkdir -p backend-assets/lib
cp -rfLv sources/onnxruntime/lib/* backend-assets/lib/
supertonic: backend-assets/lib
CGO_ENABLED=1 $(GOCMD) build \
-ldflags "$(LD_FLAGS) -X main.onnxProvider=$(ONNX_PROVIDER)" \
-tags "$(GO_TAGS)" -o supertonic ./
package:
bash package.sh
build: supertonic package
# Tests need only the Go toolchain (gcc); yalue dlopens onnxruntime at
# runtime, so no tarball download is required to compile or run unit specs.
test:
CGO_ENABLED=1 $(GOCMD) test -v -timeout 120s ./...
clean:
rm -rf supertonic sources/ backend-assets/ package/
.PHONY: build package clean test

View File

@@ -1,307 +0,0 @@
package main
import (
"bytes"
"encoding/binary"
"fmt"
"os"
"path/filepath"
"strconv"
"strings"
"sync"
laudio "github.com/mudler/LocalAI/pkg/audio"
"github.com/mudler/LocalAI/pkg/grpc/base"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
)
// onnxProvider is set via -ldflags "-X main.onnxProvider=cuda" by the
// CUDA build (later phase). Defaults to CPU.
var onnxProvider = "cpu"
// Per-model generation defaults, overridable via ModelOptions.Options:
//
// supertonic.steps=<int> denoising steps (quality), default 8
// supertonic.speed=<float> speech rate, default 1.05
// supertonic.silence=<float> inter-chunk silence seconds, default 0.3
// supertonic.default_voice=<name> voice-style used when request omits voice
// supertonic.default_lang=<lang> language tag used when request omits it
const (
optionSteps = "supertonic.steps="
optionSpeed = "supertonic.speed="
optionSilence = "supertonic.silence="
optionDefaultVoice = "supertonic.default_voice="
optionDefaultLang = "supertonic.default_lang="
)
// SupertonicBackend is the gRPC backend state for one loaded supertonic
// model. All fields are populated by Load; the styles cache is filled lazily
// by loadStyle.
type SupertonicBackend struct {
base.SingleThread
// tts is the engine returned by LoadTextToSpeech; nil until Load succeeds.
tts *TextToSpeech
// cfg is the configuration returned by LoadCfgs for modelDir.
cfg Config
// modelDir is the directory resolved from ModelOptions.ModelFile.
modelDir string
// voicesDir is the voice_styles directory chosen by resolveVoicesDir.
voicesDir string
// defaultVoice / defaultLang are used when a request omits voice / language.
defaultVoice string
defaultLang string
// steps, speed and silence are passed unchanged to tts.Call on every request.
steps int
speed float32
silence float32
// styleMu guards styles.
styleMu sync.Mutex
styles map[string]*Style // voice name -> loaded style cache
}
// Load prepares the backend for synthesis: it resolves the model directory
// from opts.ModelFile, locates the voice-style directory, reads the model
// configuration, loads the models through LoadTextToSpeech and records the
// generation defaults found in opts.Options (see the option* constants).
//
// It returns an error when the model path cannot be resolved, the
// configuration cannot be read, or the models fail to load.
func (s *SupertonicBackend) Load(opts *pb.ModelOptions) error {
	dir, err := resolveModelDir(opts.ModelFile)
	if err != nil {
		return err
	}
	s.modelDir, s.voicesDir = dir, resolveVoicesDir(dir)

	loadedCfg, err := LoadCfgs(dir)
	if err != nil {
		return fmt.Errorf("loading tts.json from %s: %w", dir, err)
	}
	s.cfg = loadedCfg

	// The CPU build leaves onnxProvider at "cpu"; the CUDA build overrides it
	// to "cuda" through -ldflags. Upstream LoadTextToSpeech still errors on
	// GPU until the CUDA phase wires the execution provider.
	useGPU := onnxProvider == "cuda"
	engine, err := LoadTextToSpeech(dir, useGPU, loadedCfg)
	if err != nil {
		return fmt.Errorf("loading supertonic models from %s: %w", dir, err)
	}
	s.tts = engine

	// Per-model defaults, each overridable through ModelOptions.Options.
	s.styles = map[string]*Style{}
	s.defaultLang = findOptionValue(opts, optionDefaultLang, "na")
	s.defaultVoice = findOptionValue(opts, optionDefaultVoice, "")
	s.silence = findOptionFloat(opts, optionSilence, 0.3)
	s.speed = findOptionFloat(opts, optionSpeed, 1.05)
	s.steps = int(findOptionInt(opts, optionSteps, 8))
	return nil
}
// TTS synthesizes req.Text and writes the result as a WAV file to req.Dst.
// Synthesis errors are returned as-is; write errors are wrapped with the
// destination path.
func (s *SupertonicBackend) TTS(req *pb.TTSRequest) error {
	pcm, rate, err := s.synthesize(req)
	if err != nil {
		return err
	}

	// writeWavFile takes float64 samples, so widen the float32 PCM first.
	samples := make([]float64, 0, len(pcm))
	for _, sample := range pcm {
		samples = append(samples, float64(sample))
	}

	if werr := writeWavFile(req.Dst, samples, rate); werr != nil {
		return fmt.Errorf("writing wav to %s: %w", req.Dst, werr)
	}
	return nil
}
// TTSStream synthesizes req.Text and sends it on results: first a WAV header
// built by streamingWAVHeader, then the audio as 16-bit little-endian PCM in
// chunks of at most 4096 samples. The results channel is always closed before
// returning, including on error.
func (s *SupertonicBackend) TTSStream(req *pb.TTSRequest, results chan []byte) error {
	defer close(results)

	pcm, rate, err := s.synthesize(req)
	if err != nil {
		return err
	}

	results <- streamingWAVHeader(uint32(rate))

	const chunkSamples = 4096
	for rest := pcm; len(rest) > 0; {
		n := chunkSamples
		if len(rest) < n {
			n = len(rest)
		}
		results <- pcmFloatToInt16LE(rest[:n])
		rest = rest[n:]
	}
	return nil
}
// synthesize performs one complete text-to-speech run for req and returns the
// mono float32 PCM, trimmed to the duration reported by the engine, together
// with its sample rate.
//
// It fails when no model is loaded, when the text is blank, when the voice
// style cannot be loaded, or when the engine call itself fails.
func (s *SupertonicBackend) synthesize(req *pb.TTSRequest) ([]float32, int, error) {
	if s.tts == nil {
		return nil, 0, fmt.Errorf("supertonic model not loaded")
	}
	if strings.TrimSpace(req.Text) == "" {
		return nil, 0, fmt.Errorf("empty text")
	}

	style, err := s.loadStyle(s.voiceName(req.Voice))
	if err != nil {
		return nil, 0, err
	}

	// A missing request language is resolved exactly like an empty one.
	requested := ""
	if req.Language != nil {
		requested = *req.Language
	}
	lang := s.resolveLang(requested)

	audio, seconds, err := s.tts.Call(req.Text, lang, style, s.steps, s.speed, s.silence)
	if err != nil {
		return nil, 0, err
	}

	// Call returns concatenated audio; keep only the reported duration,
	// bounded to the samples actually available.
	rate := s.tts.SampleRate
	keep := int(float32(rate) * seconds)
	switch {
	case keep < 0:
		keep = 0
	case keep > len(audio):
		keep = len(audio)
	}
	return audio[:keep], rate, nil
}
// voiceName returns the whitespace-trimmed request voice, or the model's
// default voice when the request voice is blank.
func (s *SupertonicBackend) voiceName(reqVoice string) string {
	if trimmed := strings.TrimSpace(reqVoice); trimmed != "" {
		return trimmed
	}
	return s.defaultVoice
}
// resolveLang picks the language tag to synthesize with. The trimmed request
// language wins when isValidLang accepts it, then the model default under the
// same check; if neither qualifies the result is "na".
func (s *SupertonicBackend) resolveLang(reqLang string) string {
	for _, candidate := range []string{strings.TrimSpace(reqLang), s.defaultLang} {
		if candidate != "" && isValidLang(candidate) {
			return candidate
		}
	}
	return "na"
}
// loadStyle returns the voice style called name, loading it from disk on
// first use and serving it from the styles cache afterwards. The cache is
// accessed under styleMu.
//
// An empty name is rejected: by that point neither the request nor
// supertonic.default_voice supplied a voice, and a style is required.
func (s *SupertonicBackend) loadStyle(name string) (*Style, error) {
	if name == "" {
		return nil, fmt.Errorf("no voice specified and no supertonic.default_voice set")
	}

	s.styleMu.Lock()
	defer s.styleMu.Unlock()

	cached, found := s.styles[name]
	if found {
		return cached, nil
	}

	stylePath := s.voiceStylePath(name)
	loaded, err := LoadVoiceStyle([]string{stylePath}, false)
	if err != nil {
		return nil, fmt.Errorf("loading voice style %q (%s): %w", name, stylePath, err)
	}
	s.styles[name] = loaded
	return loaded, nil
}
// voiceStylePath turns a voice name into the path of its JSON style file.
// A ".json" suffix is appended when missing. Absolute paths are returned
// unchanged, names that contain a path separator are joined onto modelDir,
// and bare names are joined onto voicesDir.
func (s *SupertonicBackend) voiceStylePath(name string) string {
	file := name
	if !strings.HasSuffix(file, ".json") {
		file += ".json"
	}

	switch {
	case filepath.IsAbs(file):
		return file
	case strings.ContainsRune(file, filepath.Separator):
		return filepath.Join(s.modelDir, file)
	default:
		return filepath.Join(s.voicesDir, file)
	}
}
// resolveVoicesDir locates the voice_styles directory. The HF model layout
// puts the ONNX files in an onnx/ subdir with voice_styles/ as its sibling,
// so check modelDir/voice_styles first, then the parent's voice_styles.
func resolveVoicesDir(modelDir string) string {
candidates := []string{
filepath.Join(modelDir, "voice_styles"),
filepath.Join(filepath.Dir(modelDir), "voice_styles"),
}
for _, c := range candidates {
if info, err := os.Stat(c); err == nil && info.IsDir() {
return c
}
}
return candidates[0]
}
// resolveModelDir accepts either a directory (used as-is) or a file (its
// parent dir is used).
func resolveModelDir(modelFile string) (string, error) {
if modelFile == "" {
return "", fmt.Errorf("empty model path")
}
info, err := os.Stat(modelFile)
if err != nil {
return "", fmt.Errorf("stat model path %s: %w", modelFile, err)
}
if info.IsDir() {
return modelFile, nil
}
return filepath.Dir(modelFile), nil
}
// ---- option helpers (mirrors backend/go/sherpa-onnx/backend.go) ----
// findOptionValue scans opts.Options for the first entry that starts with
// prefix and returns the text after the prefix. When no entry matches, def is
// returned.
func findOptionValue(opts *pb.ModelOptions, prefix, def string) string {
	for i := range opts.Options {
		if opt := opts.Options[i]; strings.HasPrefix(opt, prefix) {
			return opt[len(prefix):]
		}
	}
	return def
}
// findOptionFloat reads the option identified by prefix as a float32.
// def is returned when the option is absent, empty, or not parseable.
func findOptionFloat(opts *pb.ModelOptions, prefix string, def float32) float32 {
	text := findOptionValue(opts, prefix, "")
	if text == "" {
		return def
	}
	if parsed, err := strconv.ParseFloat(text, 32); err == nil {
		return float32(parsed)
	}
	return def
}
// findOptionInt reads the option identified by prefix as a base-10 int32.
// def is returned when the option is absent, empty, or not parseable.
func findOptionInt(opts *pb.ModelOptions, prefix string, def int32) int32 {
	text := findOptionValue(opts, prefix, "")
	if text == "" {
		return def
	}
	if parsed, err := strconv.ParseInt(text, 10, 32); err == nil {
		return int32(parsed)
	}
	return def
}
// ---- PCM helpers ----
func pcmFloatToInt16LE(samples []float32) []byte {
buf := make([]byte, len(samples)*2)
for i, f := range samples {
v := int32(f * 32767)
if v > 32767 {
v = 32767
} else if v < -32768 {
v = -32768
}
binary.LittleEndian.PutUint16(buf[2*i:], uint16(int16(v)))
}
return buf
}
// streamingWAVHeader returns the serialized WAV header sent as the first
// chunk of a TTSStream response for the given sample rate.
func streamingWAVHeader(sampleRate uint32) []byte {
// The total length is not passed in, so both the size given to the header
// constructor and ChunkSize are set to the maximum 32-bit value.
// NOTE(review): presumably the usual "unknown length" marker for streamed
// WAV — confirm against laudio.NewWAVHeaderWithRate.
const streamingSize = 0xFFFFFFFF
h := laudio.NewWAVHeaderWithRate(streamingSize, sampleRate)
h.ChunkSize = streamingSize
var buf bytes.Buffer
// The error from Write is deliberately discarded.
_ = h.Write(&buf)
return buf.Bytes()
}

View File

@@ -1,86 +0,0 @@
package main
import (
"os"
"path/filepath"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
)
var _ = Describe("voiceStylePath", func() {
s := &SupertonicBackend{modelDir: "/models/st/onnx", voicesDir: "/models/st/voice_styles"}
It("resolves a bare name under the resolved voicesDir", func() {
Expect(s.voiceStylePath("M1")).To(Equal(filepath.Join("/models/st/voice_styles", "M1.json")))
})
It("keeps an explicit .json suffix", func() {
Expect(s.voiceStylePath("M1.json")).To(Equal(filepath.Join("/models/st/voice_styles", "M1.json")))
})
It("honors absolute paths", func() {
Expect(s.voiceStylePath("/abs/v.json")).To(Equal("/abs/v.json"))
})
})
var _ = Describe("resolveVoicesDir", func() {
It("prefers voice_styles under modelDir", func() {
dir := GinkgoT().TempDir()
Expect(os.MkdirAll(filepath.Join(dir, "voice_styles"), 0o755)).To(Succeed())
Expect(resolveVoicesDir(dir)).To(Equal(filepath.Join(dir, "voice_styles")))
})
It("falls back to the sibling voice_styles next to an onnx subdir", func() {
root := GinkgoT().TempDir()
Expect(os.MkdirAll(filepath.Join(root, "voice_styles"), 0o755)).To(Succeed())
Expect(os.MkdirAll(filepath.Join(root, "onnx"), 0o755)).To(Succeed())
Expect(resolveVoicesDir(filepath.Join(root, "onnx"))).To(Equal(filepath.Join(root, "voice_styles")))
})
})
var _ = Describe("resolveLang", func() {
It("accepts a valid request language", func() {
s := &SupertonicBackend{defaultLang: "na"}
Expect(s.resolveLang("ko")).To(Equal("ko"))
})
It("falls back to the model default for an invalid language", func() {
s := &SupertonicBackend{defaultLang: "en"}
Expect(s.resolveLang("zz")).To(Equal("en"))
})
It("falls back to na when nothing is valid", func() {
s := &SupertonicBackend{defaultLang: ""}
Expect(s.resolveLang("")).To(Equal("na"))
})
})
// Pins the byte-level encoding of pcmFloatToInt16LE: scaling by 32767,
// little-endian byte order, and saturation of out-of-range samples. Every
// input sample is asserted, including the negative one.
var _ = Describe("pcmFloatToInt16LE", func() {
	It("clamps and encodes little-endian", func() {
		out := pcmFloatToInt16LE([]float32{0, 1.0, -1.0, 2.0})
		Expect(out).To(HaveLen(8))
		Expect(out[0:2]).To(Equal([]byte{0x00, 0x00})) // 0
		Expect(out[2:4]).To(Equal([]byte{0xff, 0x7f})) // 32767
		Expect(out[4:6]).To(Equal([]byte{0x01, 0x80})) // -1.0 -> -32767 (0x8001)
		Expect(out[6:8]).To(Equal([]byte{0xff, 0x7f})) // clamp 2.0 -> 32767
	})
})
var _ = Describe("end-to-end synthesis", Ordered, func() {
var modelDir string
BeforeAll(func() {
modelDir = os.Getenv("SUPERTONIC_MODEL_PATH")
if modelDir == "" {
Skip("set SUPERTONIC_MODEL_PATH to a supertonic model dir to run")
}
Expect(InitializeONNXRuntime()).To(Succeed())
})
It("synthesizes a wav file", func() {
b := &SupertonicBackend{}
Expect(b.Load(&pb.ModelOptions{ModelFile: modelDir, Options: []string{"supertonic.default_voice=F1"}})).To(Succeed())
dst := filepath.Join(GinkgoT().TempDir(), "out.wav")
lang := "en"
Expect(b.TTS(&pb.TTSRequest{Text: "Hello from LocalAI.", Dst: dst, Language: &lang})).To(Succeed())
info, err := os.Stat(dst)
Expect(err).ToNot(HaveOccurred())
Expect(info.Size()).To(BeNumerically(">", 44)) // header + PCM
})
})

View File

File diff suppressed because it is too large Load Diff

View File

@@ -1,27 +0,0 @@
package main
// Started internally by LocalAI; a server is allocated per model.
import (
"flag"
grpc "github.com/mudler/LocalAI/pkg/grpc"
ort "github.com/yalue/onnxruntime_go"
)
var addr = flag.String("addr", "localhost:50051", "the address to connect to")
// main parses the command-line flags, initializes the ONNX runtime and then
// serves the supertonic backend over gRPC on the configured address. Failure
// of either step aborts the process with a panic.
func main() {
	flag.Parse()

	// InitializeONNXRuntime reads ONNXRUNTIME_LIB_PATH (set by run.sh) and
	// dlopens libonnxruntime before any session is created in Load().
	if initErr := InitializeONNXRuntime(); initErr != nil {
		panic(initErr)
	}
	defer func() { _ = ort.DestroyEnvironment() }()

	backend := &SupertonicBackend{}
	if serveErr := grpc.StartServer(*addr, backend); serveErr != nil {
		panic(serveErr)
	}
}

View File

@@ -1,13 +0,0 @@
package main
import (
"testing"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)
// TestSupertonic is the `go test` entry point for the package: it installs
// Gomega's fail handler and runs every Ginkgo spec registered in the package.
func TestSupertonic(t *testing.T) {
RegisterFailHandler(Fail)
RunSpecs(t, "Supertonic backend test suite")
}

View File

@@ -1,49 +0,0 @@
#!/bin/bash
# Assemble the self-contained supertonic backend package under ./package:
# the binary, run.sh, the bundled onnxruntime libraries, the system dynamic
# loader with the core runtime libraries and, when the helper script exists,
# the GPU libraries for the current BUILD_TYPE.
set -e

# All path expansions are quoted so the script also works from a checkout
# whose path contains whitespace.
CURDIR=$(dirname "$(realpath "$0")")
REPO_ROOT="${CURDIR}/../../.."

mkdir -p "$CURDIR/package/lib"

cp -avf "$CURDIR/supertonic" "$CURDIR/package/"
cp -avf "$CURDIR/run.sh" "$CURDIR/package/"
# The glob must stay outside the quotes so it still expands.
cp -rfLv "$CURDIR"/backend-assets/lib/* "$CURDIR/package/lib/"

# copy_system_libs LOADER LIBDIR
# Copies the dynamic loader (as lib/ld.so) and the core runtime libraries
# found in LIBDIR into the package.
copy_system_libs() {
    local loader=$1 libdir=$2 lib
    cp -arfLv "$loader" "$CURDIR/package/lib/ld.so"
    for lib in libc.so.6 libgcc_s.so.1 libstdc++.so.6 libm.so.6 libgomp.so.1 \
               libdl.so.2 librt.so.1 libpthread.so.0; do
        cp -arfLv "$libdir/$lib" "$CURDIR/package/lib/$lib"
    done
}

if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
    echo "Detected x86_64 architecture, copying x86_64 libraries..."
    copy_system_libs /lib64/ld-linux-x86-64.so.2 /lib/x86_64-linux-gnu
elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
    echo "Detected ARM64 architecture, copying ARM64 libraries..."
    copy_system_libs /lib/ld-linux-aarch64.so.1 /lib/aarch64-linux-gnu
else
    echo "Error: Could not detect architecture"
    exit 1
fi

GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
if [ -f "$GPU_LIB_SCRIPT" ]; then
    echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
    source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib"
    package_gpu_libs
fi

echo "Packaging completed successfully"
ls -liah "$CURDIR/package/"
ls -liah "$CURDIR/package/lib/"

View File

@@ -1,14 +0,0 @@
#!/bin/bash
# Launch the packaged supertonic backend with its bundled libraries.
set -ex

# Directory this script lives in; quoted so paths with whitespace work.
CURDIR=$(dirname "$(realpath "$0")")

# Put the bundled libraries first on the search path. The previous value is
# appended only when it is non-empty: an unconditional "...:$LD_LIBRARY_PATH"
# leaves a trailing ':' when the variable is unset, and the dynamic loader
# treats that empty entry as the current working directory.
export LD_LIBRARY_PATH="$CURDIR/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
# Read by the backend at startup to locate the onnxruntime shared library.
export ONNXRUNTIME_LIB_PATH="$CURDIR/lib/libonnxruntime.so"

# Prefer the packaged dynamic loader when present so the bundled libc is used.
if [ -f "$CURDIR/lib/ld.so" ]; then
    echo "Using lib/ld.so"
    exec "$CURDIR/lib/ld.so" "$CURDIR/supertonic" "$@"
fi

exec "$CURDIR/supertonic" "$@"

View File

@@ -366,218 +366,6 @@
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-locate-anything-cpp"
intel: "intel-sycl-f32-locate-anything-cpp"
vulkan: "vulkan-locate-anything-cpp"
- !!merge <<: *locateanything
name: "locate-anything-development"
capabilities:
default: "cpu-locate-anything-cpp-development"
nvidia: "cuda12-locate-anything-cpp-development"
nvidia-cuda-12: "cuda12-locate-anything-cpp-development"
nvidia-cuda-13: "cuda13-locate-anything-cpp-development"
nvidia-l4t: "nvidia-l4t-arm64-locate-anything-cpp-development"
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-locate-anything-cpp-development"
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-locate-anything-cpp-development"
intel: "intel-sycl-f32-locate-anything-cpp-development"
vulkan: "vulkan-locate-anything-cpp-development"
- !!merge <<: *locateanything
name: "cpu-locate-anything-cpp"
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-locate-anything-cpp"
mirrors:
- localai/localai-backends:latest-cpu-locate-anything-cpp
- !!merge <<: *locateanything
name: "cpu-locate-anything-cpp-development"
uri: "quay.io/go-skynet/local-ai-backends:master-cpu-locate-anything-cpp"
mirrors:
- localai/localai-backends:master-cpu-locate-anything-cpp
- !!merge <<: *locateanything
name: "cuda12-locate-anything-cpp"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-locate-anything-cpp"
mirrors:
- localai/localai-backends:latest-gpu-nvidia-cuda-12-locate-anything-cpp
- !!merge <<: *locateanything
name: "cuda12-locate-anything-cpp-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-locate-anything-cpp"
mirrors:
- localai/localai-backends:master-gpu-nvidia-cuda-12-locate-anything-cpp
- !!merge <<: *locateanything
name: "cuda13-locate-anything-cpp"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-locate-anything-cpp"
mirrors:
- localai/localai-backends:latest-gpu-nvidia-cuda-13-locate-anything-cpp
- !!merge <<: *locateanything
name: "cuda13-locate-anything-cpp-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-locate-anything-cpp"
mirrors:
- localai/localai-backends:master-gpu-nvidia-cuda-13-locate-anything-cpp
- !!merge <<: *locateanything
name: "nvidia-l4t-arm64-locate-anything-cpp"
uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-arm64-locate-anything-cpp"
mirrors:
- localai/localai-backends:latest-nvidia-l4t-arm64-locate-anything-cpp
- !!merge <<: *locateanything
name: "nvidia-l4t-arm64-locate-anything-cpp-development"
uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-arm64-locate-anything-cpp"
mirrors:
- localai/localai-backends:master-nvidia-l4t-arm64-locate-anything-cpp
- !!merge <<: *locateanything
name: "cuda13-nvidia-l4t-arm64-locate-anything-cpp"
uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-locate-anything-cpp"
mirrors:
- localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-locate-anything-cpp
- !!merge <<: *locateanything
name: "cuda13-nvidia-l4t-arm64-locate-anything-cpp-development"
uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-locate-anything-cpp"
mirrors:
- localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-locate-anything-cpp
- !!merge <<: *locateanything
name: "intel-sycl-f32-locate-anything-cpp"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-locate-anything-cpp"
mirrors:
- localai/localai-backends:latest-gpu-intel-sycl-f32-locate-anything-cpp
- !!merge <<: *locateanything
name: "intel-sycl-f32-locate-anything-cpp-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f32-locate-anything-cpp"
mirrors:
- localai/localai-backends:master-gpu-intel-sycl-f32-locate-anything-cpp
- !!merge <<: *locateanything
name: "intel-sycl-f16-locate-anything-cpp"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-locate-anything-cpp"
mirrors:
- localai/localai-backends:latest-gpu-intel-sycl-f16-locate-anything-cpp
- !!merge <<: *locateanything
name: "intel-sycl-f16-locate-anything-cpp-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f16-locate-anything-cpp"
mirrors:
- localai/localai-backends:master-gpu-intel-sycl-f16-locate-anything-cpp
- !!merge <<: *locateanything
name: "vulkan-locate-anything-cpp"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-vulkan-locate-anything-cpp"
mirrors:
- localai/localai-backends:latest-gpu-vulkan-locate-anything-cpp
- !!merge <<: *locateanything
name: "vulkan-locate-anything-cpp-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-vulkan-locate-anything-cpp"
mirrors:
- localai/localai-backends:master-gpu-vulkan-locate-anything-cpp
## depth-anything
# Meta entry for the depth-anything backend, exposed as the YAML anchor
# `depthanything`. It holds the shared metadata (license, description, urls,
# tags); the entries that follow merge this anchor and set their own
# name / uri / mirrors or capabilities.
- &depthanything
name: "depth-anything"
alias: "depth-anything"
license: apache-2.0
description: |
Depth Anything 3 monocular metric depth + camera pose estimation in C/C++
using GGML. Loads pre-built GGUF weights and, given an image, returns a
dense depth map plus the recovered camera extrinsics (3x4) and intrinsics
(3x3). No Python at inference (purego, cgo-less).
urls:
- https://github.com/mudler/depth-anything.cpp
- https://huggingface.co/depth-anything/Depth-Anything-V3
tags:
- depth-estimation
- camera-pose
- depth-anything
- gpu
- cpu
# Maps a capability key to the `name` of the image entry to use for it; every
# value here matches the `name` of a `latest-*` image entry defined below.
# NOTE(review): presumably `default` is the fallback when no more specific key
# applies - confirm against the gallery resolver.
capabilities:
default: "cpu-depth-anything-cpp"
nvidia: "cuda12-depth-anything-cpp"
nvidia-cuda-12: "cuda12-depth-anything-cpp"
nvidia-cuda-13: "cuda13-depth-anything-cpp"
nvidia-l4t: "nvidia-l4t-arm64-depth-anything-cpp"
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-depth-anything-cpp"
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-depth-anything-cpp"
intel: "intel-sycl-f32-depth-anything-cpp"
vulkan: "vulkan-depth-anything-cpp"
# Development flavour of the depth-anything meta entry: it merges the
# `depthanything` anchor and replaces `name` and `capabilities`, so that every
# capability key points at the matching `-development` image entry.
- !!merge <<: *depthanything
name: "depth-anything-development"
capabilities:
default: "cpu-depth-anything-cpp-development"
nvidia: "cuda12-depth-anything-cpp-development"
nvidia-cuda-12: "cuda12-depth-anything-cpp-development"
nvidia-cuda-13: "cuda13-depth-anything-cpp-development"
nvidia-l4t: "nvidia-l4t-arm64-depth-anything-cpp-development"
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-depth-anything-cpp-development"
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-depth-anything-cpp-development"
intel: "intel-sycl-f32-depth-anything-cpp-development"
vulkan: "vulkan-depth-anything-cpp-development"
# Concrete depth-anything-cpp image entries. Each one merges the
# `depthanything` anchor and sets `name`, `uri` (quay.io image) and `mirrors`
# (the same tag under localai/localai-backends). Entries come in pairs: the
# plain name uses a `latest-*` tag, the `-development` name uses `master-*`.
- !!merge <<: *depthanything
name: "cpu-depth-anything-cpp"
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-depth-anything-cpp"
mirrors:
- localai/localai-backends:latest-cpu-depth-anything-cpp
- !!merge <<: *depthanything
name: "cpu-depth-anything-cpp-development"
uri: "quay.io/go-skynet/local-ai-backends:master-cpu-depth-anything-cpp"
mirrors:
- localai/localai-backends:master-cpu-depth-anything-cpp
- !!merge <<: *depthanything
name: "cuda12-depth-anything-cpp"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-depth-anything-cpp"
mirrors:
- localai/localai-backends:latest-gpu-nvidia-cuda-12-depth-anything-cpp
- !!merge <<: *depthanything
name: "cuda12-depth-anything-cpp-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-depth-anything-cpp"
mirrors:
- localai/localai-backends:master-gpu-nvidia-cuda-12-depth-anything-cpp
- !!merge <<: *depthanything
name: "cuda13-depth-anything-cpp"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-depth-anything-cpp"
mirrors:
- localai/localai-backends:latest-gpu-nvidia-cuda-13-depth-anything-cpp
- !!merge <<: *depthanything
name: "cuda13-depth-anything-cpp-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-depth-anything-cpp"
mirrors:
- localai/localai-backends:master-gpu-nvidia-cuda-13-depth-anything-cpp
- !!merge <<: *depthanything
name: "nvidia-l4t-arm64-depth-anything-cpp"
uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-arm64-depth-anything-cpp"
mirrors:
- localai/localai-backends:latest-nvidia-l4t-arm64-depth-anything-cpp
- !!merge <<: *depthanything
name: "nvidia-l4t-arm64-depth-anything-cpp-development"
uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-arm64-depth-anything-cpp"
mirrors:
- localai/localai-backends:master-nvidia-l4t-arm64-depth-anything-cpp
- !!merge <<: *depthanything
name: "cuda13-nvidia-l4t-arm64-depth-anything-cpp"
uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-depth-anything-cpp"
mirrors:
- localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-depth-anything-cpp
- !!merge <<: *depthanything
name: "cuda13-nvidia-l4t-arm64-depth-anything-cpp-development"
uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-depth-anything-cpp"
mirrors:
- localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-depth-anything-cpp
- !!merge <<: *depthanything
name: "intel-sycl-f32-depth-anything-cpp"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-depth-anything-cpp"
mirrors:
- localai/localai-backends:latest-gpu-intel-sycl-f32-depth-anything-cpp
- !!merge <<: *depthanything
name: "intel-sycl-f32-depth-anything-cpp-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f32-depth-anything-cpp"
mirrors:
- localai/localai-backends:master-gpu-intel-sycl-f32-depth-anything-cpp
# NOTE(review): the two intel-sycl-f16 entries below are not referenced by the
# `capabilities` maps of the depth-anything meta entries (both map `intel` to
# the f32 build) - presumably they are only selectable by explicit name; confirm.
- !!merge <<: *depthanything
name: "intel-sycl-f16-depth-anything-cpp"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-depth-anything-cpp"
mirrors:
- localai/localai-backends:latest-gpu-intel-sycl-f16-depth-anything-cpp
- !!merge <<: *depthanything
name: "intel-sycl-f16-depth-anything-cpp-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f16-depth-anything-cpp"
mirrors:
- localai/localai-backends:master-gpu-intel-sycl-f16-depth-anything-cpp
- !!merge <<: *depthanything
name: "vulkan-depth-anything-cpp"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-vulkan-depth-anything-cpp"
mirrors:
- localai/localai-backends:latest-gpu-vulkan-depth-anything-cpp
- !!merge <<: *depthanything
name: "vulkan-depth-anything-cpp-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-vulkan-depth-anything-cpp"
mirrors:
- localai/localai-backends:master-gpu-vulkan-depth-anything-cpp
- &vllm
name: "vllm"
license: apache-2.0
@@ -667,9 +455,12 @@
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-vllm-omni"
- &mlx
name: "mlx"
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-mlx"
icon: https://avatars.githubusercontent.com/u/102832242?s=200&v=4
urls:
- https://github.com/ml-explore/mlx-lm
mirrors:
- localai/localai-backends:latest-metal-darwin-arm64-mlx
license: MIT
description: |
Run LLMs with MLX
@@ -688,9 +479,12 @@
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-mlx"
- &mlx-vlm
name: "mlx-vlm"
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-mlx-vlm"
icon: https://avatars.githubusercontent.com/u/102832242?s=200&v=4
urls:
- https://github.com/Blaizzy/mlx-vlm
mirrors:
- localai/localai-backends:latest-metal-darwin-arm64-mlx-vlm
license: MIT
description: |
Run Vision-Language Models with MLX
@@ -711,9 +505,12 @@
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-mlx-vlm"
- &mlx-audio
name: "mlx-audio"
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-mlx-audio"
icon: https://avatars.githubusercontent.com/u/102832242?s=200&v=4
urls:
- https://github.com/Blaizzy/mlx-audio
mirrors:
- localai/localai-backends:latest-metal-darwin-arm64-mlx-audio
license: MIT
description: |
Run Audio Models with MLX
@@ -734,9 +531,12 @@
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-mlx-audio"
- &mlx-distributed
name: "mlx-distributed"
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-mlx-distributed"
icon: https://avatars.githubusercontent.com/u/102832242?s=200&v=4
urls:
- https://github.com/ml-explore/mlx-lm
mirrors:
- localai/localai-backends:latest-metal-darwin-arm64-mlx-distributed
license: MIT
description: |
Run distributed LLM inference with MLX across multiple Apple Silicon Macs
@@ -832,7 +632,7 @@
default: "cpu-diffusers"
nvidia-cuda-13: "cuda13-diffusers"
nvidia-cuda-12: "cuda12-diffusers"
nvidia-l4t-cuda-12: "nvidia-l4t-diffusers"
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-diffusers"
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-diffusers"
- &ace-step
name: "ace-step"
@@ -888,17 +688,14 @@
- &qwen3ttscpp
name: "qwen3-tts-cpp"
description: |
Qwen3-TTS C++ backend using GGML (qwentts.cpp). Native C++ text-to-speech
with streaming output, named speakers, voice design, and zero-shot voice
cloning. 24kHz mono, 11 languages with Mandarin dialects. 0.6B and 1.7B
models in Q8_0 / Q4_K_M.
Qwen3-TTS C++ backend using GGML. Native C++ text-to-speech with voice cloning support.
Generates 24kHz mono audio from text with optional reference audio for voice cloning via ECAPA-TDNN speaker embeddings.
urls:
- https://github.com/ServeurpersoCom/qwentts.cpp
- https://github.com/predict-woo/qwen3-tts.cpp
tags:
- text-to-speech
- tts
- voice-cloning
- streaming
alias: "qwen3-tts-cpp"
capabilities:
default: "cpu-qwen3-tts-cpp"
@@ -912,33 +709,6 @@
nvidia-l4t: "nvidia-l4t-arm64-qwen3-tts-cpp"
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-qwen3-tts-cpp"
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-qwen3-tts-cpp"
- &omnivoicecpp
name: "omnivoice-cpp"
description: |
OmniVoice C++ backend using GGML. Native text-to-speech with voice cloning
(reference audio + transcript) and voice design (attribute keywords: gender,
age, pitch, style, volume, emotion). 24kHz mono output, 646 languages.
Supports streaming synthesis.
urls:
- https://github.com/ServeurpersoCom/omnivoice.cpp
tags:
- text-to-speech
- tts
- voice-cloning
- voice-design
alias: "omnivoice-cpp"
capabilities:
default: "cpu-omnivoice-cpp"
nvidia: "cuda12-omnivoice-cpp"
nvidia-cuda-13: "cuda13-omnivoice-cpp"
nvidia-cuda-12: "cuda12-omnivoice-cpp"
intel: "intel-sycl-f16-omnivoice-cpp"
metal: "metal-omnivoice-cpp"
amd: "rocm-omnivoice-cpp"
vulkan: "vulkan-omnivoice-cpp"
nvidia-l4t: "nvidia-l4t-arm64-omnivoice-cpp"
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-omnivoice-cpp"
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-omnivoice-cpp"
- &vibevoicecpp
name: "vibevoice-cpp"
description: |
@@ -1084,7 +854,7 @@
metal: "metal-kokoro"
nvidia-cuda-13: "cuda13-kokoro"
nvidia-cuda-12: "cuda12-kokoro"
nvidia-l4t-cuda-12: "nvidia-l4t-kokoro"
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-kokoro"
- &kokoros
icon: https://avatars.githubusercontent.com/u/166769057?v=4
description: |
@@ -1127,6 +897,7 @@
intel: "intel-coqui"
amd: "rocm-coqui"
metal: "metal-coqui"
nvidia-cuda-13: "cuda13-coqui"
nvidia-cuda-12: "cuda12-coqui"
icon: https://avatars.githubusercontent.com/u/1338804?s=200&v=4
- &outetts
@@ -1376,27 +1147,27 @@
icon: https://avatars.githubusercontent.com/u/151010778?s=200&v=4
- &piper
name: "piper"
uri: "quay.io/go-skynet/local-ai-backends:latest-piper"
icon: https://github.com/OHF-Voice/piper1-gpl/raw/main/etc/logo.png
urls:
- https://github.com/rhasspy/piper
- https://github.com/mudler/go-piper
mirrors:
- localai/localai-backends:latest-piper
license: MIT
description: |
A fast, local neural text to speech system
tags:
- text-to-speech
- TTS
capabilities:
default: "cpu-piper"
metal: "metal-piper"
- &opus
name: "opus"
alias: "opus"
capabilities:
default: "cpu-opus"
metal: "metal-opus"
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-opus"
urls:
- https://opus-codec.org/
mirrors:
- localai/localai-backends:latest-cpu-opus
license: BSD-3-Clause
description: |
Opus audio codec backend for encoding and decoding audio.
@@ -1406,19 +1177,15 @@
- opus
- WebRTC
- realtime
- !!merge <<: *opus
name: "opus-development"
capabilities:
default: "cpu-opus-development"
metal: "metal-opus-development"
- CPU
- &silero-vad
name: "silero-vad"
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-silero-vad"
icon: https://user-images.githubusercontent.com/12515440/89997349-b3523080-dc94-11ea-9906-ca2e8bc50535.png
urls:
- https://github.com/snakers4/silero-vad
capabilities:
default: "cpu-silero-vad"
metal: "metal-silero-vad"
mirrors:
- localai/localai-backends:latest-cpu-silero-vad
description: |
Silero VAD: pre-trained enterprise-grade Voice Activity Detector.
Silero VAD is a voice activity detection model that can be used to detect whether a given audio contains speech or not.
@@ -1429,6 +1196,9 @@
- CPU
- &local-store
name: "local-store"
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-local-store"
mirrors:
- localai/localai-backends:latest-cpu-local-store
urls:
- https://github.com/mudler/LocalAI
description: |
@@ -1439,11 +1209,11 @@
- open-source
- CPU
license: MIT
capabilities:
default: "cpu-local-store"
metal: "metal-local-store"
- &kitten-tts
name: "kitten-tts"
uri: "quay.io/go-skynet/local-ai-backends:latest-kitten-tts"
mirrors:
- localai/localai-backends:latest-kitten-tts
urls:
- https://github.com/KittenML/KittenTTS
description: |
@@ -1452,9 +1222,6 @@
- text-to-speech
- TTS
license: apache-2.0
capabilities:
default: "cpu-kitten-tts"
metal: "metal-kitten-tts"
- &neutts
name: "neutts"
urls:
@@ -1488,20 +1255,6 @@
nvidia: "cuda12-sherpa-onnx"
nvidia-cuda-12: "cuda12-sherpa-onnx"
metal: "metal-sherpa-onnx"
- &supertonic
name: "supertonic"
alias: "supertonic"
urls:
- https://github.com/supertone-inc/supertonic
description: |
Supertonic backend: lightning-fast, on-device multilingual text-to-speech via ONNX Runtime.
Runs Supertone's flow-matching TTS model (Supertone/supertonic-3), 44.1kHz output, 31 languages,
multiple preset voice styles. No espeak-ng dependency.
tags:
- text-to-speech
- TTS
capabilities:
default: "cpu-supertonic"
- !!merge <<: *neutts
name: "neutts-development"
capabilities:
@@ -1594,89 +1347,25 @@
mirrors:
- localai/localai-backends:master-gpu-rocm-hipblas-neutts
- !!merge <<: *mlx
name: "metal-mlx"
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-mlx"
mirrors:
- localai/localai-backends:latest-metal-darwin-arm64-mlx
- !!merge <<: *mlx
name: "metal-mlx-development"
name: "mlx-development"
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-mlx"
mirrors:
- localai/localai-backends:master-metal-darwin-arm64-mlx
- !!merge <<: *mlx
name: "mlx-development"
capabilities:
default: "cpu-mlx-development"
nvidia: "cuda12-mlx-development"
metal: "metal-mlx-development"
nvidia-cuda-12: "cuda12-mlx-development"
nvidia-cuda-13: "cuda13-mlx-development"
nvidia-l4t: "nvidia-l4t-mlx-development"
nvidia-l4t-cuda-12: "nvidia-l4t-mlx-development"
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-mlx-development"
- !!merge <<: *mlx-vlm
name: "metal-mlx-vlm"
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-mlx-vlm"
mirrors:
- localai/localai-backends:latest-metal-darwin-arm64-mlx-vlm
- !!merge <<: *mlx-vlm
name: "metal-mlx-vlm-development"
name: "mlx-vlm-development"
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-mlx-vlm"
mirrors:
- localai/localai-backends:master-metal-darwin-arm64-mlx-vlm
- !!merge <<: *mlx-vlm
name: "mlx-vlm-development"
capabilities:
default: "cpu-mlx-vlm-development"
nvidia: "cuda12-mlx-vlm-development"
metal: "metal-mlx-vlm-development"
nvidia-cuda-12: "cuda12-mlx-vlm-development"
nvidia-cuda-13: "cuda13-mlx-vlm-development"
nvidia-l4t: "nvidia-l4t-mlx-vlm-development"
nvidia-l4t-cuda-12: "nvidia-l4t-mlx-vlm-development"
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-mlx-vlm-development"
- !!merge <<: *mlx-audio
name: "metal-mlx-audio"
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-mlx-audio"
mirrors:
- localai/localai-backends:latest-metal-darwin-arm64-mlx-audio
- !!merge <<: *mlx-audio
name: "metal-mlx-audio-development"
name: "mlx-audio-development"
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-mlx-audio"
mirrors:
- localai/localai-backends:master-metal-darwin-arm64-mlx-audio
- !!merge <<: *mlx-audio
name: "mlx-audio-development"
capabilities:
default: "cpu-mlx-audio-development"
nvidia: "cuda12-mlx-audio-development"
metal: "metal-mlx-audio-development"
nvidia-cuda-12: "cuda12-mlx-audio-development"
nvidia-cuda-13: "cuda13-mlx-audio-development"
nvidia-l4t: "nvidia-l4t-mlx-audio-development"
nvidia-l4t-cuda-12: "nvidia-l4t-mlx-audio-development"
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-mlx-audio-development"
- !!merge <<: *mlx-distributed
name: "metal-mlx-distributed"
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-mlx-distributed"
mirrors:
- localai/localai-backends:latest-metal-darwin-arm64-mlx-distributed
- !!merge <<: *mlx-distributed
name: "metal-mlx-distributed-development"
name: "mlx-distributed-development"
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-mlx-distributed"
mirrors:
- localai/localai-backends:master-metal-darwin-arm64-mlx-distributed
- !!merge <<: *mlx-distributed
name: "mlx-distributed-development"
capabilities:
default: "cpu-mlx-distributed-development"
nvidia: "cuda12-mlx-distributed-development"
metal: "metal-mlx-distributed-development"
nvidia-cuda-12: "cuda12-mlx-distributed-development"
nvidia-cuda-13: "cuda13-mlx-distributed-development"
nvidia-l4t: "nvidia-l4t-mlx-distributed-development"
nvidia-l4t-cuda-12: "nvidia-l4t-mlx-distributed-development"
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-mlx-distributed-development"
## mlx
- !!merge <<: *mlx
name: "cpu-mlx"
@@ -1882,20 +1571,10 @@
mirrors:
- localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-mlx-distributed
- !!merge <<: *kitten-tts
name: "cpu-kitten-tts"
uri: "quay.io/go-skynet/local-ai-backends:latest-kitten-tts"
mirrors:
- localai/localai-backends:latest-kitten-tts
- !!merge <<: *kitten-tts
name: "cpu-kitten-tts-development"
name: "kitten-tts-development"
uri: "quay.io/go-skynet/local-ai-backends:master-kitten-tts"
mirrors:
- localai/localai-backends:master-kitten-tts
- !!merge <<: *kitten-tts
name: "kitten-tts-development"
capabilities:
default: "cpu-kitten-tts-development"
metal: "metal-kitten-tts-development"
- !!merge <<: *kitten-tts
name: "metal-kitten-tts"
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-kitten-tts"
@@ -1907,23 +1586,11 @@
mirrors:
- localai/localai-backends:master-metal-darwin-arm64-kitten-tts
- !!merge <<: *local-store
name: "cpu-local-store"
alias: "local-store"
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-local-store"
mirrors:
- localai/localai-backends:latest-cpu-local-store
- !!merge <<: *local-store
name: "cpu-local-store-development"
name: "local-store-development"
alias: "local-store"
uri: "quay.io/go-skynet/local-ai-backends:master-cpu-local-store"
mirrors:
- localai/localai-backends:master-cpu-local-store
- !!merge <<: *local-store
name: "local-store-development"
alias: "local-store"
capabilities:
default: "cpu-local-store-development"
metal: "metal-local-store-development"
- !!merge <<: *local-store
name: "metal-local-store"
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-local-store"
@@ -1936,12 +1603,7 @@
mirrors:
- localai/localai-backends:master-metal-darwin-arm64-local-store
- !!merge <<: *opus
name: "cpu-opus"
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-opus"
mirrors:
- localai/localai-backends:latest-cpu-opus
- !!merge <<: *opus
name: "cpu-opus-development"
name: "opus-development"
uri: "quay.io/go-skynet/local-ai-backends:master-cpu-opus"
mirrors:
- localai/localai-backends:master-cpu-opus
@@ -1956,20 +1618,10 @@
mirrors:
- localai/localai-backends:master-metal-darwin-arm64-opus
- !!merge <<: *silero-vad
name: "cpu-silero-vad"
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-silero-vad"
mirrors:
- localai/localai-backends:latest-cpu-silero-vad
- !!merge <<: *silero-vad
name: "cpu-silero-vad-development"
name: "silero-vad-development"
uri: "quay.io/go-skynet/local-ai-backends:master-cpu-silero-vad"
mirrors:
- localai/localai-backends:master-cpu-silero-vad
- !!merge <<: *silero-vad
name: "silero-vad-development"
capabilities:
default: "cpu-silero-vad-development"
metal: "metal-silero-vad-development"
- !!merge <<: *silero-vad
name: "metal-silero-vad"
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-silero-vad"
@@ -1981,20 +1633,10 @@
mirrors:
- localai/localai-backends:master-metal-darwin-arm64-silero-vad
- !!merge <<: *piper
name: "cpu-piper"
uri: "quay.io/go-skynet/local-ai-backends:latest-piper"
mirrors:
- localai/localai-backends:latest-piper
- !!merge <<: *piper
name: "cpu-piper-development"
name: "piper-development"
uri: "quay.io/go-skynet/local-ai-backends:master-piper"
mirrors:
- localai/localai-backends:master-piper
- !!merge <<: *piper
name: "piper-development"
capabilities:
default: "cpu-piper-development"
metal: "metal-piper-development"
- !!merge <<: *piper
name: "metal-piper"
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-piper"
@@ -3637,121 +3279,6 @@
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-qwen3-tts-cpp"
mirrors:
- localai/localai-backends:master-gpu-nvidia-cuda-13-qwen3-tts-cpp
## omnivoice-cpp
- !!merge <<: *omnivoicecpp
name: "omnivoice-cpp-development"
capabilities:
default: "cpu-omnivoice-cpp-development"
nvidia: "cuda12-omnivoice-cpp-development"
nvidia-cuda-13: "cuda13-omnivoice-cpp-development"
nvidia-cuda-12: "cuda12-omnivoice-cpp-development"
intel: "intel-sycl-f16-omnivoice-cpp-development"
metal: "metal-omnivoice-cpp-development"
amd: "rocm-omnivoice-cpp-development"
vulkan: "vulkan-omnivoice-cpp-development"
nvidia-l4t: "nvidia-l4t-arm64-omnivoice-cpp-development"
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-omnivoice-cpp-development"
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-omnivoice-cpp-development"
- !!merge <<: *omnivoicecpp
name: "nvidia-l4t-arm64-omnivoice-cpp"
uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-arm64-omnivoice-cpp"
mirrors:
- localai/localai-backends:latest-nvidia-l4t-arm64-omnivoice-cpp
- !!merge <<: *omnivoicecpp
name: "nvidia-l4t-arm64-omnivoice-cpp-development"
uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-arm64-omnivoice-cpp"
mirrors:
- localai/localai-backends:master-nvidia-l4t-arm64-omnivoice-cpp
- !!merge <<: *omnivoicecpp
name: "cuda13-nvidia-l4t-arm64-omnivoice-cpp"
uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-omnivoice-cpp"
mirrors:
- localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-omnivoice-cpp
- !!merge <<: *omnivoicecpp
name: "cuda13-nvidia-l4t-arm64-omnivoice-cpp-development"
uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-omnivoice-cpp"
mirrors:
- localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-omnivoice-cpp
- !!merge <<: *omnivoicecpp
name: "cpu-omnivoice-cpp"
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-omnivoice-cpp"
mirrors:
- localai/localai-backends:latest-cpu-omnivoice-cpp
- !!merge <<: *omnivoicecpp
name: "metal-omnivoice-cpp"
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-omnivoice-cpp"
mirrors:
- localai/localai-backends:latest-metal-darwin-arm64-omnivoice-cpp
- !!merge <<: *omnivoicecpp
name: "metal-omnivoice-cpp-development"
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-omnivoice-cpp"
mirrors:
- localai/localai-backends:master-metal-darwin-arm64-omnivoice-cpp
- !!merge <<: *omnivoicecpp
name: "cpu-omnivoice-cpp-development"
uri: "quay.io/go-skynet/local-ai-backends:master-cpu-omnivoice-cpp"
mirrors:
- localai/localai-backends:master-cpu-omnivoice-cpp
- !!merge <<: *omnivoicecpp
name: "cuda12-omnivoice-cpp"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-omnivoice-cpp"
mirrors:
- localai/localai-backends:latest-gpu-nvidia-cuda-12-omnivoice-cpp
- !!merge <<: *omnivoicecpp
name: "rocm-omnivoice-cpp"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-omnivoice-cpp"
mirrors:
- localai/localai-backends:latest-gpu-rocm-hipblas-omnivoice-cpp
- !!merge <<: *omnivoicecpp
name: "intel-sycl-f32-omnivoice-cpp"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-omnivoice-cpp"
mirrors:
- localai/localai-backends:latest-gpu-intel-sycl-f32-omnivoice-cpp
- !!merge <<: *omnivoicecpp
name: "intel-sycl-f16-omnivoice-cpp"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-omnivoice-cpp"
mirrors:
- localai/localai-backends:latest-gpu-intel-sycl-f16-omnivoice-cpp
- !!merge <<: *omnivoicecpp
name: "vulkan-omnivoice-cpp"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-vulkan-omnivoice-cpp"
mirrors:
- localai/localai-backends:latest-gpu-vulkan-omnivoice-cpp
- !!merge <<: *omnivoicecpp
name: "vulkan-omnivoice-cpp-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-vulkan-omnivoice-cpp"
mirrors:
- localai/localai-backends:master-gpu-vulkan-omnivoice-cpp
- !!merge <<: *omnivoicecpp
name: "cuda12-omnivoice-cpp-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-omnivoice-cpp"
mirrors:
- localai/localai-backends:master-gpu-nvidia-cuda-12-omnivoice-cpp
- !!merge <<: *omnivoicecpp
name: "rocm-omnivoice-cpp-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-omnivoice-cpp"
mirrors:
- localai/localai-backends:master-gpu-rocm-hipblas-omnivoice-cpp
- !!merge <<: *omnivoicecpp
name: "intel-sycl-f32-omnivoice-cpp-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f32-omnivoice-cpp"
mirrors:
- localai/localai-backends:master-gpu-intel-sycl-f32-omnivoice-cpp
- !!merge <<: *omnivoicecpp
name: "intel-sycl-f16-omnivoice-cpp-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f16-omnivoice-cpp"
mirrors:
- localai/localai-backends:master-gpu-intel-sycl-f16-omnivoice-cpp
- !!merge <<: *omnivoicecpp
name: "cuda13-omnivoice-cpp"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-omnivoice-cpp"
mirrors:
- localai/localai-backends:latest-gpu-nvidia-cuda-13-omnivoice-cpp
- !!merge <<: *omnivoicecpp
name: "cuda13-omnivoice-cpp-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-omnivoice-cpp"
mirrors:
- localai/localai-backends:master-gpu-nvidia-cuda-13-omnivoice-cpp
## vibevoice-cpp
- !!merge <<: *vibevoicecpp
name: "nvidia-l4t-arm64-vibevoice-cpp"
@@ -5082,24 +4609,24 @@
- localai/localai-backends:master-cpu-trl
- !!merge <<: *trl
name: "cuda12-trl"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-trl"
uri: "quay.io/go-skynet/local-ai-backends:latest-cublas-cuda12-trl"
mirrors:
- localai/localai-backends:latest-gpu-nvidia-cuda-12-trl
- localai/localai-backends:latest-cublas-cuda12-trl
- !!merge <<: *trl
name: "cuda12-trl-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-trl"
uri: "quay.io/go-skynet/local-ai-backends:master-cublas-cuda12-trl"
mirrors:
- localai/localai-backends:master-gpu-nvidia-cuda-12-trl
- localai/localai-backends:master-cublas-cuda12-trl
- !!merge <<: *trl
name: "cuda13-trl"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-trl"
uri: "quay.io/go-skynet/local-ai-backends:latest-cublas-cuda13-trl"
mirrors:
- localai/localai-backends:latest-gpu-nvidia-cuda-13-trl
- localai/localai-backends:latest-cublas-cuda13-trl
- !!merge <<: *trl
name: "cuda13-trl-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-trl"
uri: "quay.io/go-skynet/local-ai-backends:master-cublas-cuda13-trl"
mirrors:
- localai/localai-backends:master-gpu-nvidia-cuda-13-trl
- localai/localai-backends:master-cublas-cuda13-trl
## llama.cpp quantization backend
- &llama-cpp-quantization
name: "llama-cpp-quantization"
@@ -5266,18 +4793,3 @@
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-sherpa-onnx"
mirrors:
- localai/localai-backends:master-metal-darwin-arm64-sherpa-onnx
## supertonic
- !!merge <<: *supertonic
name: "supertonic-development"
capabilities:
default: "cpu-supertonic-development"
- !!merge <<: *supertonic
name: "cpu-supertonic"
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-supertonic"
mirrors:
- localai/localai-backends:latest-cpu-supertonic
- !!merge <<: *supertonic
name: "cpu-supertonic-development"
uri: "quay.io/go-skynet/local-ai-backends:master-cpu-supertonic"
mirrors:
- localai/localai-backends:master-cpu-supertonic

View File

@@ -5,6 +5,31 @@ imported by any backend that needs to parse LocalAI gRPC options or build a
chat-template-compatible message list from proto Message objects.
"""
import json
from urllib.parse import unquote
def resolve_model_path(model, model_file=""):
    """Turn a LocalAI model reference into a value an HF/MLX loader can open.

    A backend may receive a bare HuggingFace repo id (``namespace/name``), a
    path that is already local, or a ``file://`` URI (LocalAI's
    ``LocalPrefix``) when the model was imported from disk. Loaders such as
    ``mlx_lm.load`` cannot use the URI form, so it is reduced to a plain
    path here.

    Args:
        model: The model reference from the request (repo id, path or
            ``file://`` URI). May be ``None``.
        model_file: The resolved local path LocalAI computed for the model.
            When truthy it takes precedence over ``model``.

    Returns:
        The chosen reference with a leading ``file://`` removed and the
        remainder percent-decoded; any other value is returned unchanged.
        ``None`` is returned when the chosen reference is ``None``.
    """
    scheme = "file://"

    # A non-empty model_file wins; otherwise fall back to the raw model value.
    reference = model_file or model
    if reference is None:
        return None

    # Repo ids and ordinary filesystem paths pass through untouched.
    if not reference.startswith(scheme):
        return reference

    # Drop the scheme, then undo percent-encoding (e.g. "%20" -> " ").
    return unquote(reference[len(scheme):])
def parse_options(options_list):

View File

@@ -28,7 +28,7 @@ import grpc
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'common'))
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'common'))
from grpc_auth import get_auth_interceptors
from python_utils import messages_to_dicts, parse_options as _shared_parse_options
from python_utils import messages_to_dicts, parse_options as _shared_parse_options, resolve_model_path
from mlx_utils import parse_tool_calls, split_reasoning
@@ -99,7 +99,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
from mlx_lm import load
from mlx_lm.models.cache import make_prompt_cache, can_trim_prompt_cache, trim_prompt_cache
print(f"[Rank 0] Loading model: {request.Model}", file=sys.stderr)
# Normalize the model reference: strip LocalAI's file:// LocalPrefix
# and prefer the resolved ModelFile so mlx_lm.load() gets a plain
# repo id or filesystem path (it rejects file:// URIs).
model_path = resolve_model_path(request.Model, request.ModelFile)
print(f"[Rank 0] Loading model: {model_path}", file=sys.stderr)
self.options = parse_options(request.Options)
print(f"Options: {self.options}", file=sys.stderr)
@@ -128,7 +132,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
)
self.coordinator = DistributedCoordinator(self.group)
self.coordinator.broadcast_command(CMD_LOAD_MODEL)
self.coordinator.broadcast_model_name(request.Model)
self.coordinator.broadcast_model_name(model_path)
else:
print("[Rank 0] No hostfile configured, running single-node", file=sys.stderr)
@@ -144,9 +148,9 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
if tokenizer_config:
print(f"Loading with tokenizer_config: {tokenizer_config}", file=sys.stderr)
self.model, self.tokenizer = load(request.Model, tokenizer_config=tokenizer_config)
self.model, self.tokenizer = load(model_path, tokenizer_config=tokenizer_config)
else:
self.model, self.tokenizer = load(request.Model)
self.model, self.tokenizer = load(model_path)
if self.group is not None:
from sharding import pipeline_auto_parallel
@@ -157,7 +161,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
from mlx_cache import ThreadSafeLRUPromptCache
max_cache_entries = self.options.get("max_cache_entries", 10)
self.max_kv_size = self.options.get("max_kv_size", None)
self.model_key = request.Model
self.model_key = model_path
self.lru_cache = ThreadSafeLRUPromptCache(
max_size=max_cache_entries,
can_trim_fn=can_trim_prompt_cache,

View File

@@ -18,7 +18,7 @@ import grpc
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'common'))
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'common'))
from grpc_auth import get_auth_interceptors
from python_utils import messages_to_dicts, parse_options
from python_utils import messages_to_dicts, parse_options, resolve_model_path
from mlx_utils import parse_tool_calls, split_reasoning
from mlx_vlm import load, stream_generate
@@ -67,7 +67,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
backend_pb2.Result: The load model result.
"""
try:
print(f"Loading MLX-VLM model: {request.Model}", file=sys.stderr)
# Normalize the model reference: strip LocalAI's file:// LocalPrefix
# and prefer the resolved ModelFile so mlx_vlm.load() gets a plain
# repo id or filesystem path (it rejects file:// URIs).
model_path = resolve_model_path(request.Model, request.ModelFile)
print(f"Loading MLX-VLM model: {model_path}", file=sys.stderr)
print(f"Request: {request}", file=sys.stderr)
# Parse Options[] key:value strings into a typed dict
@@ -76,10 +80,10 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
# Load model and processor using MLX-VLM
# mlx-vlm load function returns (model, processor) instead of (model, tokenizer)
self.model, self.processor = load(request.Model)
self.model, self.processor = load(model_path)
# Load model config for chat template support
self.config = load_config(request.Model)
self.config = load_config(model_path)
# Auto-infer the tool parser from the chat template. mlx-vlm has
# its own _infer_tool_parser that falls back to mlx-lm parsers.

View File

@@ -17,7 +17,7 @@ import grpc
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'common'))
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'common'))
from grpc_auth import get_auth_interceptors
from python_utils import messages_to_dicts, parse_options
from python_utils import messages_to_dicts, parse_options, resolve_model_path
from mlx_utils import parse_tool_calls, split_reasoning
from mlx_lm import load, stream_generate
@@ -63,7 +63,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
backend_pb2.Result: The load model result.
"""
try:
print(f"Loading MLX model: {request.Model}", file=sys.stderr)
# Normalize the model reference: strip LocalAI's file:// LocalPrefix
# and prefer the resolved ModelFile so mlx_lm.load() gets a plain
# repo id or filesystem path (it rejects file:// URIs).
model_path = resolve_model_path(request.Model, request.ModelFile)
print(f"Loading MLX model: {model_path}", file=sys.stderr)
print(f"Request: {request}", file=sys.stderr)
# Parse Options[] key:value strings into a typed dict (shared helper)
@@ -89,9 +93,9 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
# Load model and tokenizer using MLX
if tokenizer_config:
print(f"Loading with tokenizer_config: {tokenizer_config}", file=sys.stderr)
self.model, self.tokenizer = load(request.Model, tokenizer_config=tokenizer_config)
self.model, self.tokenizer = load(model_path, tokenizer_config=tokenizer_config)
else:
self.model, self.tokenizer = load(request.Model)
self.model, self.tokenizer = load(model_path)
# mlx_lm.load() returns a TokenizerWrapper that detects tool
# calling and thinking markers from the chat template / vocab.
@@ -111,7 +115,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
# Initialize thread-safe LRU prompt cache for efficient generation
max_cache_entries = self.options.get("max_cache_entries", 10)
self.max_kv_size = self.options.get("max_kv_size", None)
self.model_key = request.Model
self.model_key = model_path
self.lru_cache = ThreadSafeLRUPromptCache(
max_size=max_cache_entries,
can_trim_fn=can_trim_prompt_cache,

View File

@@ -12,7 +12,7 @@ import backend_pb2_grpc
# Make the shared helpers importable so we can unit-test them without a
# running gRPC server.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'common'))
from python_utils import messages_to_dicts, parse_options
from python_utils import messages_to_dicts, parse_options, resolve_model_path
from mlx_utils import parse_tool_calls, split_reasoning
class TestBackendServicer(unittest.TestCase):
@@ -322,6 +322,42 @@ class TestSharedHelpers(unittest.TestCase):
self.assertEqual(r, "")
self.assertEqual(c, "just text")
def test_resolve_model_path_file_uri(self):
# file:// LocalPrefix (LocalAI import) is stripped to a plain path.
self.assertEqual(resolve_model_path("file:///a/b"), "/a/b")
def test_resolve_model_path_file_uri_percent_decoded(self):
# Percent-encoded characters (e.g. spaces) are decoded.
self.assertEqual(
resolve_model_path("file:///Users/me/My%20Models/Qwen3"),
"/Users/me/My Models/Qwen3",
)
def test_resolve_model_path_hf_repo_id_unchanged(self):
# Plain HuggingFace repo ids must pass through untouched.
self.assertEqual(
resolve_model_path("mlx-community/Qwen3-Coder-30B"),
"mlx-community/Qwen3-Coder-30B",
)
def test_resolve_model_path_local_path_unchanged(self):
# An already-local absolute path is left as-is.
self.assertEqual(resolve_model_path("/models/Qwen3"), "/models/Qwen3")
def test_resolve_model_path_prefers_model_file(self):
# The resolved ModelFile wins over Model when both are set.
self.assertEqual(
resolve_model_path("file:///ignored", "/resolved/local/path"),
"/resolved/local/path",
)
def test_resolve_model_path_model_file_file_uri(self):
# A ModelFile that is itself a file:// URI is also normalized.
self.assertEqual(
resolve_model_path("ignored", "file:///a/b"),
"/a/b",
)
def test_parse_tool_calls_with_shim(self):
tm = types.SimpleNamespace(
tool_call_start="<tool_call>",

View File

@@ -1,7 +1,6 @@
--extra-index-url https://download.pytorch.org/whl/cpu
accelerate
torch==2.8.0
torchaudio==2.8.0
transformers==4.56.1
librosa==0.11.0
neucodec>=0.0.4

View File

@@ -3,7 +3,6 @@ neucodec>=0.0.4
phonemizer==3.3.0
soundfile==0.13.1
torch==2.8.0
torchaudio==2.8.0
transformers==4.56.1
resemble-perth==1.0.1
accelerate

View File

@@ -3,5 +3,5 @@
# on a cu130 host. Pull the cu130-flavoured wheel from vLLM's per-tag index
# instead — the cublas13 case in install.sh adds --index-strategy=unsafe-best-match
# so uv consults this index alongside PyPI.
--extra-index-url https://wheels.vllm.ai/0.23.0/cu130
vllm==0.23.0
--extra-index-url https://wheels.vllm.ai/0.22.1/cu130
vllm==0.22.1

View File

@@ -1,4 +1,4 @@
grpcio==1.81.1
grpcio==1.81.0
protobuf
certifi
setuptools

View File

@@ -161,21 +161,6 @@ func initDistributed(cfg *config.ApplicationConfig, authDB *gorm.DB, configLoade
}
xlog.Info("Node registry initialized")
// Seed declarative per-model scheduling config (LOCALAI_MODEL_SCHEDULING /
// LOCALAI_MODEL_SCHEDULING_CONFIG). Authoritative: overwrites matching models
// on every boot. Runs before the reconciler starts so the first tick already
// sees the desired state. Models not listed are left untouched.
if cfg.Distributed.ModelSchedulingJSON != "" || cfg.Distributed.ModelSchedulingConfigPath != "" {
schedConfigs, err := nodes.ParseSchedulingSeed(cfg.Distributed.ModelSchedulingJSON, cfg.Distributed.ModelSchedulingConfigPath)
if err != nil {
return nil, fmt.Errorf("parsing declarative model scheduling config: %w", err)
}
if err := registry.SeedModelScheduling(context.Background(), schedConfigs); err != nil {
return nil, fmt.Errorf("seeding declarative model scheduling config: %w", err)
}
xlog.Info("Applied declarative model scheduling config", "models", len(schedConfigs))
}
// Collect SmartRouter option values; the router itself is created after all
// dependencies (including FileStager and Unloader) are ready.
var routerAuthToken string

View File

@@ -1,66 +0,0 @@
package backend
import (
"context"
"fmt"
"time"
"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/core/trace"
"github.com/mudler/LocalAI/pkg/grpc/proto"
"github.com/mudler/LocalAI/pkg/model"
)
// Depth performs depth estimation (Depth Anything 3) for the image described
// by the request and returns the backend's DepthResponse as-is: per-pixel
// metric depth + confidence + sky, camera pose (extrinsics/intrinsics), an
// optional 3D point cloud and any requested exports (glb/colmap). The
// include_* flags and exports mirror the DepthRequest proto so callers can
// ask for less work.
//
// The model is loaded through loader using the options derived from
// modelConfig and appConfig. A load failure is recorded and returned. When
// appConfig.EnableTracing is set, the backend call is timed and a backend
// trace (including any error text and the requested exports) is recorded.
func Depth(
	ctx context.Context,
	in *proto.DepthRequest,
	loader *model.ModelLoader,
	appConfig *config.ApplicationConfig,
	modelConfig config.ModelConfig,
) (*proto.DepthResponse, error) {
	depthModel, loadErr := loader.Load(ModelOptions(modelConfig, appConfig)...)
	if loadErr != nil {
		recordModelLoadFailure(appConfig, modelConfig.Name, modelConfig.Backend, loadErr, nil)
		return nil, loadErr
	}
	if depthModel == nil {
		return nil, fmt.Errorf("could not load depth model")
	}

	// Fast path: without tracing there is nothing to time or record.
	if !appConfig.EnableTracing {
		return depthModel.Depth(ctx, in)
	}

	trace.InitBackendTracingIfEnabled(appConfig.TracingMaxItems, appConfig.TracingMaxBodyBytes)
	begin := time.Now()

	res, err := depthModel.Depth(ctx, in)

	// The trace carries the error as text; an empty string means success.
	var failure string
	if err != nil {
		failure = err.Error()
	}
	trace.RecordBackendTrace(trace.BackendTrace{
		Timestamp: begin,
		Duration:  time.Since(begin),
		Type:      trace.BackendTraceDepth,
		ModelName: modelConfig.Name,
		Backend:   modelConfig.Backend,
		Summary:   trace.TruncateString(in.GetSrc(), 200),
		Error:     failure,
		Data: map[string]any{
			"exports": in.GetExports(),
		},
	})

	return res, err
}

View File

@@ -307,19 +307,11 @@ func gRPCPredictOpts(c config.ModelConfig, modelPath string) *pb.PredictOptions
}
}
// TopK may be nil after SetDefaults for backends that don't use llama.cpp's
// top_k=40 default (issue #6632, e.g. mlx). proto3 int32 can't be unset, so
// send 0 — the value mlx actually wants (top-k disabled).
var topK int32
if c.TopK != nil {
topK = int32(*c.TopK)
}
pbOpts := &pb.PredictOptions{
Temperature: float32(*c.Temperature),
TopP: float32(*c.TopP),
NDraft: c.NDraft,
TopK: topK,
TopK: int32(*c.TopK),
MinP: float32(*c.MinP),
Tokens: int32(*c.Maxtokens),
Threads: int32(*c.Threads),

View File

@@ -172,8 +172,6 @@ type RunCMD struct {
NatsTLSCert string `env:"LOCALAI_NATS_TLS_CERT" type:"existingfile" help:"Client certificate for NATS mTLS" group:"distributed"`
NatsTLSKey string `env:"LOCALAI_NATS_TLS_KEY" type:"existingfile" help:"Client private key for NATS mTLS" group:"distributed"`
ExposeNodeHeader bool `env:"LOCALAI_EXPOSE_NODE_HEADER" default:"false" help:"Set the X-LocalAI-Node response header on inference responses (OpenAI chat/completions/embeddings, Anthropic /v1/messages, Ollama /api/chat,/api/generate,/api/embed) with the ID of the worker that served the request. Disabled by default: the node ID reveals internal topology and should not be exposed on a public endpoint. Best-effort: under heavy concurrency the header may reflect a recent routing decision rather than this exact request's." group:"distributed"`
ModelScheduling string `env:"LOCALAI_MODEL_SCHEDULING" help:"Declarative per-model scheduling config applied at startup (inline JSON list of {model_name,node_selector,min_replicas,max_replicas,replicas:\"all\"}). Authoritative: overwrites matching models on every boot. Distributed mode only." group:"distributed"`
ModelSchedulingConfig string `env:"LOCALAI_MODEL_SCHEDULING_CONFIG" help:"Path to a YAML file with the same per-model scheduling list as LOCALAI_MODEL_SCHEDULING. Distributed mode only." group:"distributed"`
Version bool
@@ -349,15 +347,6 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
if r.ExposeNodeHeader {
opts = append(opts, config.WithExposeNodeHeader(true))
}
if r.ModelScheduling != "" {
opts = append(opts, config.WithModelSchedulingJSON(r.ModelScheduling))
}
if r.ModelSchedulingConfig != "" {
opts = append(opts, config.WithModelSchedulingConfigPath(r.ModelSchedulingConfig))
}
if !r.Distributed && (r.ModelScheduling != "" || r.ModelSchedulingConfig != "") {
xlog.Warn("LOCALAI_MODEL_SCHEDULING / LOCALAI_MODEL_SCHEDULING_CONFIG is set but distributed mode is disabled (LOCALAI_DISTRIBUTED=false) - ignoring")
}
if r.DisableMetricsEndpoint {
opts = append(opts, config.DisableMetricsEndpoint)

View File

@@ -488,16 +488,6 @@ func (o *ApplicationConfig) GetEffectiveMaxActiveBackends() int {
return 0
}
// WatchdogShouldRun tells whether the live watchdog process is expected to be
// running under the current config. It mirrors the gating in
// (*Application).startWatchdog so that the /api/settings start/stop decision
// and the startup path share one source of truth. The watchdog runs when any
// of the following holds:
//   - idle/busy checks are enabled (WatchDog),
//   - LRU eviction is active (effective max active backends > 0),
//   - the memory reclaimer is on.
func (o *ApplicationConfig) WatchdogShouldRun() bool {
	switch {
	case o.WatchDog:
		return true
	case o.GetEffectiveMaxActiveBackends() > 0:
		return true
	default:
		return o.MemoryReclaimerEnabled
	}
}
// WithForceEvictionWhenBusy sets whether to force eviction even when models have active API calls
func WithForceEvictionWhenBusy(enabled bool) AppOption {
return func(o *ApplicationConfig) {
@@ -1208,22 +1198,18 @@ func (o *ApplicationConfig) ApplyRuntimeSettings(settings *RuntimeSettings) (req
}
if settings.WatchdogIdleEnabled != nil {
o.WatchDogIdle = *settings.WatchdogIdleEnabled
if o.WatchDogIdle {
o.WatchDog = true
}
requireRestart = true
}
if settings.WatchdogBusyEnabled != nil {
o.WatchDogBusy = *settings.WatchdogBusyEnabled
if o.WatchDogBusy {
o.WatchDog = true
}
requireRestart = true
}
// The React Settings "Enable Watchdog" master toggle manages only the
// idle/busy checks — watchdog_enabled is vestigial in that UI. Whenever
// either idle/busy field is present in the body, derive the run-state from
// idle||busy so a cold enable starts the watchdog and a full disable stops
// it, instead of trusting the stale watchdog_enabled the UI never updates.
// This mirrors the startup invariant in startup.go. An API client posting
// only watchdog_enabled (idle/busy absent) keeps its explicit value.
if settings.WatchdogIdleEnabled != nil || settings.WatchdogBusyEnabled != nil {
o.WatchDog = o.WatchDogIdle || o.WatchDogBusy
}
if settings.WatchdogIdleTimeout != nil {
if dur, err := time.ParseDuration(*settings.WatchdogIdleTimeout); err == nil {
o.WatchDogIdleTimeout = dur

View File

@@ -223,69 +223,6 @@ var _ = Describe("ApplicationConfig RuntimeSettings Conversion", func() {
Expect(appConfig.WatchDogBusy).To(BeTrue())
})
// Residual #9125: the React Settings "Enable Watchdog" master toggle
// manages only watchdog_idle_enabled / watchdog_busy_enabled — it never
// touches the vestigial watchdog_enabled field. On a cold enable the
// body therefore carries watchdog_enabled=false alongside idle/busy=true.
// The derived run-state (WatchDog) must follow idle||busy so the live
// watchdog actually starts, not the stale watchdog_enabled=false.
It("should derive WatchDog from idle||busy on a cold enable even when watchdog_enabled=false", func() {
appConfig := &ApplicationConfig{WatchDog: false}
watchdogEnabled := false
watchdogIdle := true
watchdogBusy := true
rs := &RuntimeSettings{
WatchdogEnabled: &watchdogEnabled,
WatchdogIdleEnabled: &watchdogIdle,
WatchdogBusyEnabled: &watchdogBusy,
}
appConfig.ApplyRuntimeSettings(rs)
Expect(appConfig.WatchDog).To(BeTrue())
Expect(appConfig.WatchdogShouldRun()).To(BeTrue())
})
// The disable direction: the master toggle off sends idle=false,
// busy=false, but watchdog_enabled may still be the stale true loaded
// before the change. WatchDog must follow idle||busy down to false so
// the live watchdog is stopped (it stays stopped unless LRU / memory
// reclaimer keep it alive, which is gated by WatchdogShouldRun).
It("should disable WatchDog when both idle and busy are turned off", func() {
appConfig := &ApplicationConfig{WatchDog: true, WatchDogIdle: true, WatchDogBusy: true}
watchdogEnabled := true
watchdogIdle := false
watchdogBusy := false
rs := &RuntimeSettings{
WatchdogEnabled: &watchdogEnabled,
WatchdogIdleEnabled: &watchdogIdle,
WatchdogBusyEnabled: &watchdogBusy,
}
appConfig.ApplyRuntimeSettings(rs)
Expect(appConfig.WatchDog).To(BeFalse())
Expect(appConfig.WatchdogShouldRun()).To(BeFalse())
})
// Backward compatibility: an API client that posts only watchdog_enabled
// (idle/busy nil) keeps the explicit value — the idle/busy derivation
// only kicks in when those fields are actually present in the body.
It("should preserve explicit watchdog_enabled when idle/busy are absent", func() {
appConfig := &ApplicationConfig{WatchDog: false}
watchdogEnabled := true
rs := &RuntimeSettings{
WatchdogEnabled: &watchdogEnabled,
}
appConfig.ApplyRuntimeSettings(rs)
Expect(appConfig.WatchDog).To(BeTrue())
})
It("should handle MaxActiveBackends and update SingleBackend accordingly", func() {
appConfig := &ApplicationConfig{}

View File

@@ -21,7 +21,6 @@ const (
UsecaseSoundGeneration = "sound_generation"
UsecaseRerank = "rerank"
UsecaseDetection = "detection"
UsecaseDepth = "depth"
UsecaseVAD = "vad"
UsecaseAudioTransform = "audio_transform"
UsecaseDiarization = "diarization"
@@ -45,7 +44,6 @@ const (
MethodSoundGeneration GRPCMethod = "SoundGeneration"
MethodTokenizeString GRPCMethod = "TokenizeString"
MethodDetect GRPCMethod = "Detect"
MethodDepth GRPCMethod = "Depth"
MethodRerank GRPCMethod = "Rerank"
MethodVAD GRPCMethod = "VAD"
MethodAudioTransform GRPCMethod = "AudioTransform"
@@ -143,11 +141,6 @@ var UsecaseInfoMap = map[string]UsecaseInfo{
GRPCMethod: MethodDetect,
Description: "Object detection via the Detect RPC with bounding boxes.",
},
UsecaseDepth: {
Flag: FLAG_DEPTH,
GRPCMethod: MethodDepth,
Description: "Per-pixel metric depth, camera pose and 3D point cloud via the Depth RPC (Depth Anything 3).",
},
UsecaseVAD: {
Flag: FLAG_VAD,
GRPCMethod: MethodVAD,
@@ -403,10 +396,10 @@ var BackendCapabilities = map[string]BackendCapability{
Description: "Qwen TTS",
},
"qwen3-tts-cpp": {
GRPCMethods: []GRPCMethod{MethodTTS, MethodTTSStream},
GRPCMethods: []GRPCMethod{MethodTTS},
PossibleUsecases: []string{UsecaseTTS},
DefaultUsecases: []string{UsecaseTTS},
Description: "Qwen3 TTS C++ - text-to-speech with streaming, named speakers, voice design and cloning (qwentts.cpp / GGML)",
Description: "Qwen3 TTS C++ text-to-speech, C++ backend",
},
"faster-qwen3-tts": {
GRPCMethods: []GRPCMethod{MethodTTS},
@@ -495,13 +488,6 @@ var BackendCapabilities = map[string]BackendCapability{
DefaultUsecases: []string{UsecaseDetection},
Description: "RF-DETR C++ object detection",
},
"depth-anything": {
GRPCMethods: []GRPCMethod{MethodDepth, MethodPredict, MethodGenerateImage},
PossibleUsecases: []string{UsecaseDepth},
DefaultUsecases: []string{UsecaseDepth},
AcceptsImages: true,
Description: "Depth Anything 3 C++ — per-pixel metric depth, camera pose and 3D point cloud",
},
// --- Face and speaker recognition backends ---
"insightface": {
@@ -531,33 +517,6 @@ func NormalizeBackendName(backend string) string {
return strings.ReplaceAll(backend, ".", "-")
}
// nonLlamaSamplerBackends lists backends whose native sampler defaults differ
// from llama.cpp's, so LocalAI must NOT inject llama.cpp's top_k=40 default for
// them (issue #6632). mlx_lm's intended default is top_k=0 (disabled) and mlx
// does not remap 0->40, so shipping 40 silently changes sampling for clients
// that omit top_k. Leaving TopK nil lets the wire value default to 0.
//
// This is intentionally a small allow-list of KNOWN non-llama backends: empty
// and unknown backends fall through to the llama.cpp default to preserve the
// GGUF auto-detect path's behavior.
var nonLlamaSamplerBackends = map[string]struct{}{
"mlx": {},
"mlx-vlm": {},
"mlx-distributed": {},
}
// UsesLlamaSamplerDefaults reports whether backend should be given llama.cpp's
// sampler defaults (e.g. top_k=40). Only backends listed in
// nonLlamaSamplerBackends (looked up by normalized name) yield false. An empty
// or unknown backend yields true, so the GGUF auto-detect path (which resolves
// to llama.cpp) keeps its current behavior.
func UsesLlamaSamplerDefaults(backend string) bool {
	if backend != "" {
		if _, optedOut := nonLlamaSamplerBackends[NormalizeBackendName(backend)]; optedOut {
			return false
		}
	}
	return true
}
// GetBackendCapability returns the capability info for a backend, or nil if unknown.
// Handles backend name normalization.
func GetBackendCapability(backend string) *BackendCapability {

View File

@@ -84,12 +84,6 @@ type DistributedConfig struct {
// drives the background eviction cadence (eviction runs every TTL/2). Zero
// means use the prefixcache package default (5m).
PrefixCacheTTL time.Duration
// ModelSchedulingJSON is an inline JSON list of per-model scheduling configs
// applied authoritatively at startup (LOCALAI_MODEL_SCHEDULING).
ModelSchedulingJSON string
// ModelSchedulingConfigPath is a path to a YAML file with the same list
// (LOCALAI_MODEL_SCHEDULING_CONFIG).
ModelSchedulingConfigPath string
}
// Validate checks that the distributed configuration is internally consistent.
@@ -296,21 +290,6 @@ func WithPrefixCacheTTL(d time.Duration) AppOption {
}
}
// WithModelSchedulingJSON returns an AppOption that stores s as the
// inline-JSON declarative scheduling config (Distributed.ModelSchedulingJSON).
func WithModelSchedulingJSON(s string) AppOption {
	apply := func(cfg *ApplicationConfig) {
		cfg.Distributed.ModelSchedulingJSON = s
	}
	return apply
}
// WithModelSchedulingConfigPath returns an AppOption that stores path as the
// location of the YAML declarative scheduling config file
// (Distributed.ModelSchedulingConfigPath).
func WithModelSchedulingConfigPath(path string) AppOption {
	apply := func(cfg *ApplicationConfig) {
		cfg.Distributed.ModelSchedulingConfigPath = path
	}
	return apply
}
// Flag names for distributed timeout / interval configuration. These are
// the kebab-case identifiers kong derives from the matching RunCMD struct
// fields; they appear in Validate error messages and any other operator-

View File

@@ -64,7 +64,6 @@ var UsecaseOptions = []FieldOption{
{Value: "image", Label: "Image"},
{Value: "vision", Label: "Vision"},
{Value: "detection", Label: "Detection"},
{Value: "depth", Label: "Depth"},
{Value: "face_recognition", Label: "Face Recognition"},
{Value: "transcript", Label: "Transcript"},
{Value: "diarization", Label: "Diarization"},

View File

@@ -355,92 +355,6 @@ func DefaultRegistry() map[string]FieldMetaOverride {
Component: "toggle",
Order: 69,
},
"pipeline.voice_recognition.model": {
Section: "pipeline",
Label: "Voice Recognition Model",
Description: "Speaker-recognition backend model used to gate the pipeline behind speaker verification. Leave empty to disable the voice gate.",
Component: "model-select",
AutocompleteProvider: ProviderModels,
Order: 70,
},
"pipeline.voice_recognition.mode": {
Section: "pipeline",
Label: "Voice Gate Mode",
Description: "How callers are authorized: 'identify' matches the speaker 1:N against the voice registry; 'verify' matches 1:few against the configured reference audios.",
Component: "select",
Options: []FieldOption{
{Value: "identify", Label: "identify (registry)"},
{Value: "verify", Label: "verify (references)"},
},
Order: 71,
},
"pipeline.voice_recognition.threshold": {
Section: "pipeline",
Label: "Voice Gate Threshold",
Description: "Maximum cosine distance between the caller and an authorized speaker that still counts as a match. Lower is stricter. Default 0.25 is tuned for the ECAPA-TDNN encoder on VoxCeleb.",
Component: "slider",
Min: f64(0.01),
Max: f64(2),
Step: f64(0.01),
Order: 72,
},
"pipeline.voice_recognition.when": {
Section: "pipeline",
Label: "Voice Gate When",
Description: "How often to verify the speaker: 'every' checks each utterance; 'first' verifies once and then trusts the session.",
Component: "select",
Options: []FieldOption{
{Value: "every", Label: "every utterance"},
{Value: "first", Label: "first only"},
},
Order: 73,
},
"pipeline.voice_recognition.on_reject": {
Section: "pipeline",
Label: "Voice Gate On Reject",
Description: "What to do with an unauthorized utterance: 'drop_event' drops it and emits an error event to the client; 'drop_silent' drops it quietly.",
Component: "select",
Options: []FieldOption{
{Value: "drop_event", Label: "drop + error event"},
{Value: "drop_silent", Label: "drop silently"},
},
Order: 74,
},
"pipeline.voice_recognition.anti_spoofing": {
Section: "pipeline",
Label: "Voice Gate Anti-Spoofing",
Description: "Enable the backend liveness/anti-spoofing check (verify mode only) to reject replayed or synthesized audio.",
Component: "toggle",
Order: 75,
},
"pipeline.voice_recognition.allow.names": {
Section: "pipeline",
Label: "Voice Gate Allowed Names",
Description: "Identify mode: authorize only registry identities whose name matches one of these exactly. Empty allows any registered identity.",
Component: "string-list",
Order: 76,
},
"pipeline.voice_recognition.allow.labels": {
Section: "pipeline",
Label: "Voice Gate Allowed Labels",
Description: "Identify mode: authorize any registry identity carrying one of these label keys. Empty allows any registered identity.",
Component: "string-list",
Order: 77,
},
"pipeline.voice_recognition.references": {
Section: "pipeline",
Label: "Voice Gate References",
Description: "Verify mode: the authorized reference speakers, each with a name and an audio file path the caller's voice is matched against.",
Component: "json-editor",
Order: 78,
},
"pipeline.max_history_items": {
Section: "pipeline",
Label: "Max History Items",
Description: "Cap how many trailing conversation items are fed to the LLM each realtime turn (0 = unlimited, rely on the LLM's context window). Set it on a composed pipeline (VAD+STT+LLM+TTS) so a long-running session doesn't grow until the context fills. Unset uses the per-model-type default.",
Component: "number",
Order: 79,
},
// --- Functions ---
"function.grammar.parallel_calls": {

View File

@@ -509,17 +509,6 @@ type Pipeline struct {
// to enable_thinking=false backend metadata) without editing the underlying
// LLM model config. Unset leaves the LLM model config in charge.
DisableThinking *bool `yaml:"disable_thinking,omitempty" json:"disable_thinking,omitempty"`
// MaxHistoryItems caps how many trailing conversation items are fed to the
// LLM each realtime turn (0 = unlimited, rely on the LLM's context window).
// Unset (nil) uses the per-model-type default. Set it on a composed pipeline
// (VAD+STT+LLM+TTS) so a long-running session doesn't grow until the LLM's
// context fills.
MaxHistoryItems *int `yaml:"max_history_items,omitempty" json:"max_history_items,omitempty"`
// VoiceRecognition gates the pipeline behind speaker verification. Nil
// (block absent) means no gate, preserving existing behavior.
VoiceRecognition *PipelineVoiceRecognition `yaml:"voice_recognition,omitempty" json:"voice_recognition,omitempty"`
}
// ApplyReasoningEffort resolves the effective reasoning effort — a per-request
@@ -586,123 +575,6 @@ func (p Pipeline) ThinkingDisabled() bool {
return p.DisableThinking != nil && *p.DisableThinking
}
// Voice-recognition gate enum values.
const (
VoiceGateModeIdentify = "identify"
VoiceGateModeVerify = "verify"
VoiceGateWhenEvery = "every"
VoiceGateWhenFirst = "first"
VoiceGateRejectEvent = "drop_event"
VoiceGateRejectSilent = "drop_silent"
// defaultVoiceGateThreshold is the cosine-distance default tuned for the
// ECAPA-TDNN speaker encoder on VoxCeleb.
defaultVoiceGateThreshold = 0.25
)
// @Description PipelineVoiceRecognition gates a realtime pipeline behind speaker verification.
type PipelineVoiceRecognition struct {
// Model is the speaker-recognition backend model name.
Model string `yaml:"model,omitempty" json:"model,omitempty"`
// Mode is "identify" (1:N against the voice registry) or "verify"
// (1:few against reference audios).
Mode string `yaml:"mode,omitempty" json:"mode,omitempty"`
// Threshold is the maximum cosine distance that still counts as a match.
Threshold float32 `yaml:"threshold,omitempty" json:"threshold,omitempty"`
// When is "every" (verify each utterance) or "first" (verify once, then
// trust the session).
When string `yaml:"when,omitempty" json:"when,omitempty"`
// OnReject is "drop_event" (drop + emit an error event) or "drop_silent"
// (drop quietly).
OnReject string `yaml:"on_reject,omitempty" json:"on_reject,omitempty"`
// AntiSpoofing enables the backend liveness check (verify mode only).
AntiSpoofing bool `yaml:"anti_spoofing,omitempty" json:"anti_spoofing,omitempty"`
// Allow filters which registry identities are authorized (identify mode).
Allow VoiceRecognitionAllow `yaml:"allow,omitempty" json:"allow,omitempty"`
// References are the authorized reference speakers (verify mode).
References []VoiceReference `yaml:"references,omitempty" json:"references,omitempty"`
}
// @Description VoiceRecognitionAllow filters authorized registry identities.
type VoiceRecognitionAllow struct {
// Names matches registered Metadata.Name exactly.
Names []string `yaml:"names,omitempty" json:"names,omitempty"`
// Labels authorizes any identity carrying a matching label key.
Labels []string `yaml:"labels,omitempty" json:"labels,omitempty"`
}
// @Description VoiceReference is one authorized reference speaker for verify mode.
type VoiceReference struct {
Name string `yaml:"name,omitempty" json:"name,omitempty"`
Audio string `yaml:"audio,omitempty" json:"audio,omitempty"`
}
// VoiceGateEnabled reports whether a voice-recognition gate is configured,
// i.e. whether the voice_recognition block is present at all. Presence alone
// is the intent signal: a present-but-incomplete block (e.g. missing model)
// must fail closed at construction rather than be silently skipped here.
func (p Pipeline) VoiceGateEnabled() bool {
	gate := p.VoiceRecognition
	return gate != nil
}
// Normalize fills in defaults, in place, for every omitted field: Mode
// becomes "identify", When becomes "every", OnReject becomes "drop_event" and
// a zero Threshold becomes defaultVoiceGateThreshold. Fields that already
// carry a value are left alone.
func (v *PipelineVoiceRecognition) Normalize() {
	// orDefault overwrites *field only when it is still the empty string.
	orDefault := func(field *string, fallback string) {
		if *field == "" {
			*field = fallback
		}
	}

	orDefault(&v.Mode, VoiceGateModeIdentify)
	orDefault(&v.When, VoiceGateWhenEvery)
	orDefault(&v.OnReject, VoiceGateRejectEvent)

	if v.Threshold == 0 {
		v.Threshold = defaultVoiceGateThreshold
	}
}
// Validate checks the shape and enum values of the voice-recognition block.
//
// registryAvailable indicates whether a VoiceRegistry exists; identify mode
// (including an empty Mode, which Normalize defaults to identify) requires
// one. Empty Mode/When/OnReject are treated as valid because Normalize
// defaults them.
//
// It returns nil when the block is valid, otherwise an error describing the
// first problem found: missing model, mode-specific requirements, unknown
// enum values, or an out-of-range threshold.
func (v PipelineVoiceRecognition) Validate(registryAvailable bool) error {
	if v.Model == "" {
		return fmt.Errorf("voice_recognition: model is required")
	}

	switch v.Mode {
	case "", VoiceGateModeIdentify:
		if !registryAvailable {
			return fmt.Errorf("voice_recognition mode 'identify' requires a voice registry")
		}
	case VoiceGateModeVerify:
		if len(v.References) == 0 {
			return fmt.Errorf("voice_recognition mode 'verify' requires at least one reference")
		}
		for i, r := range v.References {
			if r.Audio == "" {
				return fmt.Errorf("voice_recognition reference %d (%q) is missing an audio path", i, r.Name)
			}
		}
	default:
		return fmt.Errorf("voice_recognition: unknown mode %q", v.Mode)
	}

	switch v.When {
	case "", VoiceGateWhenEvery, VoiceGateWhenFirst:
	default:
		return fmt.Errorf("voice_recognition: unknown when %q", v.When)
	}

	switch v.OnReject {
	case "", VoiceGateRejectEvent, VoiceGateRejectSilent:
	default:
		return fmt.Errorf("voice_recognition: unknown on_reject %q", v.OnReject)
	}

	// Cosine distance ranges 0..2. A zero threshold means "unset" (Normalize
	// defaults it) and already lies inside the range, so it is accepted.
	// The check is deliberately a negated in-range test: every ordered
	// comparison against NaN is false, so "x < 0 || x > 2" would let a NaN
	// threshold through, whereas this form rejects it. Do not "simplify" it
	// with De Morgan.
	if !(v.Threshold >= 0 && v.Threshold <= 2) {
		return fmt.Errorf("voice_recognition: threshold %v out of range (0..2)", v.Threshold)
	}
	return nil
}
// @Description File configuration for model downloads
type File struct {
Filename string `yaml:"filename,omitempty" json:"filename,omitempty"`
@@ -995,12 +867,7 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
cfg.Seed = &defaultSeed
}
// top_k=40 is llama.cpp's sampling default and is wrong for backends whose
// native default differs (issue #6632). Only inject it for the llama.cpp
// family and the empty/auto backend; leave TopK nil for known non-llama
// backends (e.g. mlx, whose intended default is top_k=0) so the wire value
// is 0 rather than a silently-changed 40.
if cfg.TopK == nil && UsesLlamaSamplerDefaults(cfg.Backend) {
if cfg.TopK == nil {
cfg.TopK = &defaultTopK
}
@@ -1291,10 +1158,6 @@ const (
// chat/completion/embeddings.
FLAG_SCORE ModelConfigUsecase = 0b10000000000000000000
// Marks a model as wired for the Depth gRPC primitive (per-pixel
// metric depth + camera pose + 3D point cloud via Depth Anything 3).
FLAG_DEPTH ModelConfigUsecase = 0b100000000000000000000
// Common Subsets
FLAG_LLM ModelConfigUsecase = FLAG_CHAT | FLAG_COMPLETION | FLAG_EDIT
)
@@ -1352,7 +1215,6 @@ func GetAllModelConfigUsecases() map[string]ModelConfigUsecase {
"FLAG_DIARIZATION": FLAG_DIARIZATION,
"FLAG_REALTIME_AUDIO": FLAG_REALTIME_AUDIO,
"FLAG_SCORE": FLAG_SCORE,
"FLAG_DEPTH": FLAG_DEPTH,
}
}
@@ -1496,13 +1358,6 @@ func (c *ModelConfig) GuessUsecases(u ModelConfigUsecase) bool {
}
}
if (u & FLAG_DEPTH) == FLAG_DEPTH {
depthBackends := []string{"depth-anything"}
if !slices.Contains(depthBackends, c.Backend) {
return false
}
}
if (u & FLAG_FACE_RECOGNITION) == FLAG_FACE_RECOGNITION {
faceBackends := []string{"insightface"}
if !slices.Contains(faceBackends, c.Backend) {

View File

@@ -529,72 +529,4 @@ concurrency_groups:
"models that template in Go still rely on the Go-generated grammar")
})
})
// The default top_k=40 is llama.cpp's sampling default and is WRONG for
// backends whose native default differs. mlx_lm's intended default is
// top_k=0 (disabled) and mlx does not remap 0->40, so injecting 40 silently
// changes sampling for mlx clients that omit top_k (issue #6632). Gate the
// injection on backend family: keep 40 for the llama.cpp family and for the
// empty/auto backend (the GGUF auto-detect path resolves to llama.cpp), but
// leave TopK nil for the mlx family so the wire value is 0.
Context("TopK default is backend-gated (issue #6632)", func() {
It("injects top_k=40 for the llama.cpp backend", func() {
cfg := &ModelConfig{}
cfg.Backend = "llama-cpp"
cfg.SetDefaults()
Expect(cfg.TopK).NotTo(BeNil(), "llama.cpp must keep its top_k=40 default")
Expect(*cfg.TopK).To(Equal(40))
})
It("injects top_k=40 for the empty/auto backend (GGUF auto-detect)", func() {
cfg := &ModelConfig{}
cfg.SetDefaults()
Expect(cfg.TopK).NotTo(BeNil(), "empty backend resolves to llama.cpp; default unchanged")
Expect(*cfg.TopK).To(Equal(40))
})
It("leaves TopK nil for the mlx backend", func() {
cfg := &ModelConfig{}
cfg.Backend = "mlx"
cfg.SetDefaults()
Expect(cfg.TopK).To(BeNil(),
"mlx_lm's intended default is top_k=0 (disabled); LocalAI must not inject 40")
})
It("leaves TopK nil for the mlx-vlm backend", func() {
cfg := &ModelConfig{}
cfg.Backend = "mlx-vlm"
cfg.SetDefaults()
Expect(cfg.TopK).To(BeNil())
})
It("leaves TopK nil for the mlx-distributed backend", func() {
cfg := &ModelConfig{}
cfg.Backend = "mlx-distributed"
cfg.SetDefaults()
Expect(cfg.TopK).To(BeNil())
})
It("respects an explicit top_k even for the mlx backend", func() {
explicit := 7
cfg := &ModelConfig{}
cfg.Backend = "mlx"
cfg.TopK = &explicit
cfg.SetDefaults()
Expect(cfg.TopK).NotTo(BeNil())
Expect(*cfg.TopK).To(Equal(7))
})
})
})

View File

@@ -1,73 +0,0 @@
package config
import (
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)
// Specs for PipelineVoiceRecognition (Normalize and Validate) and for
// Pipeline.VoiceGateEnabled.
var _ = Describe("PipelineVoiceRecognition", func() {
// Normalize: unset fields receive defaults, explicit values are left alone.
Describe("Normalize", func() {
It("fills defaults for empty fields", func() {
v := PipelineVoiceRecognition{Model: "spk"}
v.Normalize()
Expect(v.Mode).To(Equal(VoiceGateModeIdentify))
Expect(v.When).To(Equal(VoiceGateWhenEvery))
Expect(v.OnReject).To(Equal(VoiceGateRejectEvent))
Expect(v.Threshold).To(BeNumerically("~", defaultVoiceGateThreshold, 1e-6))
})
It("keeps explicit values", func() {
v := PipelineVoiceRecognition{Model: "spk", Mode: VoiceGateModeVerify, When: VoiceGateWhenFirst, OnReject: VoiceGateRejectSilent, Threshold: 0.4}
v.Normalize()
Expect(v.Mode).To(Equal(VoiceGateModeVerify))
Expect(v.When).To(Equal(VoiceGateWhenFirst))
Expect(v.OnReject).To(Equal(VoiceGateRejectSilent))
Expect(v.Threshold).To(BeNumerically("~", 0.4, 1e-6))
})
})
// Validate: per the first spec, the boolean argument tells Validate whether
// a voice registry is available; identify mode is rejected without one.
Describe("Validate", func() {
It("requires a registry for identify mode", func() {
v := PipelineVoiceRecognition{Model: "spk", Mode: VoiceGateModeIdentify}
Expect(v.Validate(false)).To(HaveOccurred())
Expect(v.Validate(true)).ToNot(HaveOccurred())
})
It("requires references for verify mode", func() {
v := PipelineVoiceRecognition{Model: "spk", Mode: VoiceGateModeVerify}
Expect(v.Validate(false)).To(HaveOccurred())
// Verify mode becomes valid once at least one reference is present,
// even with no registry.
v.References = []VoiceReference{{Name: "a", Audio: "/a.wav"}}
Expect(v.Validate(false)).ToNot(HaveOccurred())
})
It("rejects a reference with no audio path", func() {
v := PipelineVoiceRecognition{Model: "spk", Mode: VoiceGateModeVerify, References: []VoiceReference{{Name: "a"}}}
Expect(v.Validate(false)).To(HaveOccurred())
})
It("rejects unknown enum values", func() {
// One bogus value at a time: Mode, then When, then OnReject.
Expect((PipelineVoiceRecognition{Model: "spk", Mode: "bogus"}).Validate(true)).To(HaveOccurred())
Expect((PipelineVoiceRecognition{Model: "spk", Mode: VoiceGateModeIdentify, When: "bogus"}).Validate(true)).To(HaveOccurred())
Expect((PipelineVoiceRecognition{Model: "spk", Mode: VoiceGateModeIdentify, OnReject: "bogus"}).Validate(true)).To(HaveOccurred())
})
It("accepts a zero (unset) threshold", func() {
v := PipelineVoiceRecognition{Model: "spk", Mode: VoiceGateModeIdentify, Threshold: 0}
Expect(v.Validate(true)).ToNot(HaveOccurred())
})
It("rejects an out-of-range threshold", func() {
Expect((PipelineVoiceRecognition{Model: "spk", Mode: VoiceGateModeIdentify, Threshold: 5}).Validate(true)).To(HaveOccurred())
Expect((PipelineVoiceRecognition{Model: "spk", Mode: VoiceGateModeIdentify, Threshold: -1}).Validate(true)).To(HaveOccurred())
})
It("rejects an empty model", func() {
Expect((PipelineVoiceRecognition{Mode: VoiceGateModeIdentify}).Validate(true)).To(HaveOccurred())
})
})
// VoiceGateEnabled: reports true whenever the voice_recognition block is
// present, including when it carries no model.
Describe("VoiceGateEnabled", func() {
It("is false when block absent", func() {
Expect((Pipeline{}).VoiceGateEnabled()).To(BeFalse())
})
It("is true when a model is set", func() {
Expect((Pipeline{VoiceRecognition: &PipelineVoiceRecognition{Model: "spk"}}).VoiceGateEnabled()).To(BeTrue())
})
It("is true when the block is present even without a model (fails closed downstream)", func() {
Expect((Pipeline{VoiceRecognition: &PipelineVoiceRecognition{}}).VoiceGateEnabled()).To(BeTrue())
})
})
})

View File

@@ -1,32 +0,0 @@
package importers_test
import (
"encoding/json"
"fmt"
"github.com/mudler/LocalAI/core/gallery/importers"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)
// Guards the preference-only contract for omnivoice-cpp: discovery of a bare
// OmniVoice GGUF URI with empty preferences must never produce a config that
// selects the omnivoice-cpp backend.
var _ = Describe("OmniVoice pref-only guard", func() {
	Context("With only a bare OmniVoice GGUF URI", func() {
		It("does not auto-import as omnivoice-cpp", func() {
			// omnivoice-cpp is registered as preference-only (AutoDetect:false)
			// and no importer emits it. Resolving to a generic GGUF importer,
			// returning an error, or being ambiguous are all acceptable
			// outcomes; the single hard requirement is "not omnivoice-cpp".
			const bareURI = "huggingface://Serveurperso/OmniVoice-GGUF/omnivoice-base-Q8_0.gguf"

			discovered, discoverErr := importers.DiscoverModelConfig(bareURI, json.RawMessage(`{}`))
			if discoverErr == nil {
				Expect(discovered.ConfigFile).ToNot(
					ContainSubstring("backend: omnivoice-cpp"),
					fmt.Sprintf("Model config: %+v", discovered),
				)
			}
			// A discovery error needs no assertion: it cannot have selected
			// omnivoice-cpp.
		})
	})
})

View File

@@ -36,9 +36,7 @@ var knownPrefOnlyBackends = []schema.KnownBackend{
{Name: "kokoros", Modality: "tts", AutoDetect: false, Description: "Kokoros TTS (preference-only)"},
{Name: "qwen-tts", Modality: "tts", AutoDetect: false, Description: "Qwen TTS (preference-only)"},
{Name: "qwen3-tts-cpp", Modality: "tts", AutoDetect: false, Description: "Qwen3 TTS C++ (preference-only)"},
{Name: "omnivoice-cpp", Modality: "tts", AutoDetect: false, Description: "OmniVoice C++ TTS with voice cloning and voice design (preference-only)"},
{Name: "faster-qwen3-tts", Modality: "tts", AutoDetect: false, Description: "Faster Qwen3 TTS (preference-only)"},
{Name: "supertonic", Modality: "tts", AutoDetect: false, Description: "Supertonic multilingual ONNX TTS (preference-only)"},
// Detection
{Name: "sam3-cpp", Modality: "detection", AutoDetect: false, Description: "SAM3 C++ object detection (preference-only)"},
// Audio transform (audio-in / audio-out, optional reference signal)

View File

@@ -145,7 +145,6 @@ var _ = Describe("Backend Endpoints", func() {
expectPrefOnly("qwen-tts", "tts")
expectPrefOnly("qwen3-tts-cpp", "tts")
expectPrefOnly("faster-qwen3-tts", "tts")
expectPrefOnly("supertonic", "tts")
expectPrefOnly("sam3-cpp", "detection")
})

View File

@@ -1,95 +0,0 @@
package localai
import (
"encoding/base64"
"github.com/labstack/echo/v4"
"github.com/mudler/LocalAI/core/backend"
"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/core/http/middleware"
"github.com/mudler/LocalAI/core/schema"
"github.com/mudler/LocalAI/pkg/grpc/proto"
"github.com/mudler/LocalAI/pkg/model"
"github.com/mudler/xlog"
)
// DepthEndpoint is the LocalAI Depth endpoint exposing the full Depth Anything 3
// output (per-pixel metric depth + confidence + sky, camera pose, 3D point cloud
// and optional glb/COLMAP exports).
//
// The handler reads the parsed request and the resolved model config from the
// echo context, decodes the input image, forwards a proto.DepthRequest to
// backend.Depth and maps the backend reply onto schema.DepthResponse.
// @Summary Estimates per-pixel depth (and optionally pose/points) from an image.
// @Tags depth
// @Param request body schema.DepthRequest true "query params"
// @Success 200 {object} schema.DepthResponse "Response"
// @Router /v1/depth [post]
func DepthEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) echo.HandlerFunc {
	return func(c echo.Context) error {
		// Both values are placed on the context by upstream middleware; a
		// missing or mistyped entry is treated as a bad request.
		input, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.DepthRequest)
		if !ok || input.Model == "" {
			return echo.ErrBadRequest
		}
		cfg, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.ModelConfig)
		if !ok || cfg == nil {
			return echo.ErrBadRequest
		}

		xlog.Debug("Depth", "image", input.Image, "backend", cfg.Backend)

		src, err := decodeImageInput(input.Image)
		if err != nil {
			return err
		}

		req := &proto.DepthRequest{
			Src:               src,
			Dst:               input.Dst,
			IncludeDepth:      input.IncludeDepth,
			IncludeConfidence: input.IncludeConfidence,
			IncludePose:       input.IncludePose,
			IncludeSky:        input.IncludeSky,
			IncludePoints:     input.IncludePoints,
			PointsConfThresh:  input.PointsConfThresh,
			Exports:           input.Exports,
		}

		// A bare request (no include_* flag set) still has to be useful:
		// switch on depth, confidence, pose and sky. The point cloud is not
		// part of this default and stays opt-in.
		anyRequested := req.IncludeDepth || req.IncludeConfidence || req.IncludePose ||
			req.IncludeSky || req.IncludePoints
		if !anyRequested {
			req.IncludeDepth = true
			req.IncludeConfidence = true
			req.IncludePose = true
			req.IncludeSky = true
		}

		res, err := backend.Depth(c.Request().Context(), req, ml, appConfig, *cfg)
		if err != nil {
			return mapBackendError(err)
		}

		response := schema.DepthResponse{
			Width:       res.GetWidth(),
			Height:      res.GetHeight(),
			Depth:       res.GetDepth(),
			Confidence:  res.GetConfidence(),
			Sky:         res.GetSky(),
			Extrinsics:  res.GetExtrinsics(),
			Intrinsics:  res.GetIntrinsics(),
			NumPoints:   res.GetNumPoints(),
			Points:      res.GetPoints(),
			ExportPaths: res.GetExportPaths(),
			IsMetric:    res.GetIsMetric(),
		}
		// Point colors arrive as raw bytes and are returned base64-encoded;
		// the field is left unset when the backend produced none.
		if colors := res.GetPointColors(); len(colors) > 0 {
			response.PointColors = base64.StdEncoding.EncodeToString(colors)
		}
		return c.JSON(200, response)
	}
}

View File

@@ -937,13 +937,12 @@ func GetSchedulingEndpoint(registry *nodes.NodeRegistry) echo.HandlerFunc {
// distinguishable from an explicit zero. On update, an omitted prefix-cache
// field preserves the model's previously-configured value instead of resetting
// it (see SetSchedulingEndpoint's PATCH-style merge). ModelName, NodeSelector,
// MinReplicas, MaxReplicas and SpreadAll keep their full-replace PUT semantics.
// MinReplicas and MaxReplicas keep their full-replace PUT semantics.
type SetSchedulingRequest struct {
ModelName string `json:"model_name"`
NodeSelector map[string]string `json:"node_selector,omitempty"`
MinReplicas int `json:"min_replicas"`
MaxReplicas int `json:"max_replicas"`
SpreadAll bool `json:"spread_all,omitempty"`
RoutePolicy *string `json:"route_policy,omitempty"`
BalanceAbsThreshold *int `json:"balance_abs_threshold,omitempty"`
BalanceRelThreshold *float64 `json:"balance_rel_threshold,omitempty"`
@@ -960,9 +959,6 @@ func validateSchedulingRequest(req SetSchedulingRequest, routePolicy string, abs
if req.ModelName == "" {
return errors.New("model_name is required")
}
if req.SpreadAll && (req.MinReplicas != 0 || req.MaxReplicas != 0) {
return errors.New("spread_all and min_replicas/max_replicas are mutually exclusive")
}
if req.MinReplicas < 0 {
return errors.New("min_replicas must be >= 0")
}
@@ -1049,7 +1045,6 @@ func SetSchedulingEndpoint(registry *nodes.NodeRegistry) echo.HandlerFunc {
NodeSelector: selectorJSON,
MinReplicas: req.MinReplicas,
MaxReplicas: req.MaxReplicas,
SpreadAll: req.SpreadAll,
RoutePolicy: routePolicy,
BalanceAbsThreshold: absThr,
BalanceRelThreshold: relThr,

View File

@@ -1,22 +0,0 @@
package localai
import (
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)
// Specs for the spread_all rule of validateSchedulingRequest: the flag is
// rejected together with min_replicas and accepted on its own.
var _ = Describe("validateSchedulingRequest spread_all", func() {
	// validate runs the validator with an empty route policy and zeroed
	// thresholds so only the request fields under test vary.
	validate := func(req SetSchedulingRequest) error {
		return validateSchedulingRequest(req, "", 0, 0, 0)
	}

	It("rejects spread_all combined with min_replicas", func() {
		req := SetSchedulingRequest{ModelName: "m", SpreadAll: true, MinReplicas: 2}
		Expect(validate(req)).To(MatchError(ContainSubstring("mutually exclusive")))
	})

	It("accepts spread_all alone", func() {
		req := SetSchedulingRequest{ModelName: "m", SpreadAll: true}
		Expect(validate(req)).ToNot(HaveOccurred())
	})
})

View File

@@ -221,18 +221,9 @@ func UpdateSettingsEndpoint(app *application.Application) echo.HandlerFunc {
// Check if agent job retention changed
agentJobChanged := settings.AgentJobRetentionDays != nil
// Restart watchdog if settings changed.
//
// The live start/stop decision derives from the post-apply config
// (WatchdogShouldRun) rather than the raw watchdog_enabled request
// field: the React master toggle only ever writes the idle/busy flags,
// so keying off watchdog_enabled left the live watchdog stopped on a
// cold enable until the next restart (#9125). WatchdogShouldRun mirrors
// the gating in startWatchdog, so a cold enable starts it immediately
// and a full disable (both checks off, no LRU / memory reclaimer) stops
// it.
// Restart watchdog if settings changed
if watchdogChanged {
if !appConfig.WatchdogShouldRun() {
if settings.WatchdogEnabled != nil && !*settings.WatchdogEnabled {
if err := app.StopWatchdog(); err != nil {
xlog.Error("Failed to stop watchdog", "error", err)
return c.JSON(http.StatusInternalServerError, schema.SettingsResponse{

View File

@@ -108,20 +108,4 @@ var _ = Describe("Settings endpoints", func() {
_, err := os.Stat(filepath.Join(tmp, "runtime_settings.json"))
Expect(err).ToNot(HaveOccurred())
})
// Residual #9125: enabling the watchdog from a cold (off) state via the
// React master toggle must start the live watchdog immediately, without a
// restart. The toggle posts watchdog_idle_enabled/busy_enabled=true while
// the vestigial watchdog_enabled stays false (it was loaded false). The
// old handler keyed its stop decision off that raw watchdog_enabled=false
// and called StopWatchdog(), so the watchdog never started until restart.
It("starts the live watchdog on a cold enable even when watchdog_enabled=false", func() {
Expect(app.ModelLoader().GetWatchDog()).To(BeNil(), "precondition: watchdog should be off")
rec := post(`{"watchdog_enabled":false,"watchdog_idle_enabled":true,"watchdog_busy_enabled":true,"watchdog_idle_timeout":"15m","watchdog_busy_timeout":"5m","watchdog_interval":"1s"}`)
Expect(rec.Code).To(Equal(http.StatusOK))
Expect(app.ModelLoader().GetWatchDog()).ToNot(BeNil(),
"watchdog should be running after a cold enable, without waiting for a restart")
})
})

View File

@@ -133,13 +133,6 @@ type Session struct {
// silently strip Manage Mode's tools.
AssistantTools []types.ToolUnion
// voiceGate is non-nil when pipeline.voice_recognition is configured. It
// authorizes each committed utterance's speaker before the LLM runs.
voiceGate *voiceGate
// gateMu guards the when:first verification state below.
gateMu sync.Mutex
voiceVerified bool
// Response cancellation: protects activeResponseCancel/activeResponseDone
responseMu sync.Mutex
activeResponseCancel context.CancelFunc
@@ -340,17 +333,6 @@ func defaultMaxHistoryItems(cfg *config.ModelConfig) int {
return 0
}
// resolveMaxHistoryItems picks the history cap for a realtime session.
//
// An explicit pipeline.max_history_items wins (including an explicit 0);
// otherwise the per-model-type default from defaultMaxHistoryItems applies.
// This lets a composed pipeline (VAD+STT+LLM+TTS) bound its history so a
// long-running session does not grow until the LLM's context window fills.
// A nil cfg is passed straight through to defaultMaxHistoryItems.
func resolveMaxHistoryItems(cfg *config.ModelConfig) int {
	if cfg == nil {
		return defaultMaxHistoryItems(cfg)
	}
	if override := cfg.Pipeline.MaxHistoryItems; override != nil {
		return *override
	}
	return defaultMaxHistoryItems(cfg)
}
// trimRealtimeItems returns the tail of items capped at maxItems (0 = no cap).
// Walks backwards keeping function_call + function_call_output pairs together
// so we never feed the LLM an orphaned tool result that references a call it
@@ -503,7 +485,7 @@ func runRealtimeSession(application *application.Application, t Transport, model
Conversations: make(map[string]*Conversation),
InputSampleRate: defaultRemoteSampleRate,
OutputSampleRate: defaultRemoteSampleRate,
MaxHistoryItems: resolveMaxHistoryItems(cfg),
MaxHistoryItems: defaultMaxHistoryItems(cfg),
}
// Create a default conversation
@@ -532,23 +514,6 @@ func runRealtimeSession(application *application.Application, t Transport, model
}
session.ModelInterface = m
if cfg.Pipeline.VoiceGateEnabled() {
gate, gerr := newVoiceGate(
*cfg.Pipeline.VoiceRecognition,
application.ModelConfigLoader(),
application.ModelLoader(),
application.ApplicationConfig(),
application.VoiceRegistry(),
)
if gerr != nil {
xlog.Error("failed to initialize voice recognition gate", "error", gerr)
sendError(t, "voice_gate_error", gerr.Error(), "", "")
return
}
session.voiceGate = gate
xlog.Info("realtime voice recognition gate enabled", "mode", gate.cfg.Mode, "when", gate.cfg.When)
}
// Store the session and notify the transport (for WebRTC audio track handling)
sessionLock.Lock()
sessions[sessionID] = session
@@ -1025,18 +990,8 @@ func updateSession(session *Session, update *types.SessionUnion, cl *config.Mode
}
if rt.Audio != nil && rt.Audio.Input != nil && rt.Audio.Input.Transcription != nil {
trUpd := rt.Audio.Input.Transcription
// A language-only update (e.g. a client forcing the STT language) carries
// an empty Model. Preserve the pipeline's configured transcription backend
// instead of blanking it — otherwise the next utterance transcribes against
// an empty model and the backend RPC fails with "unimplemented".
if trUpd.Model == "" && session.InputAudioTranscription != nil {
trUpd.Model = session.InputAudioTranscription.Model
}
session.InputAudioTranscription = trUpd
if trUpd.Model != "" {
session.ModelConfig.Pipeline.Transcription = trUpd.Model
}
session.InputAudioTranscription = rt.Audio.Input.Transcription
session.ModelConfig.Pipeline.Transcription = rt.Audio.Input.Transcription.Model
}
if rt.Model != "" || (rt.Audio != nil && rt.Audio.Output != nil && rt.Audio.Output.Voice != "") || (rt.Audio != nil && rt.Audio.Input != nil && rt.Audio.Input.Transcription != nil) {
@@ -1304,39 +1259,6 @@ func commitUtterance(ctx context.Context, utt []byte, session *Session, conv *Co
f.Sync()
// Start speaker verification concurrently with transcription. This is a
// latency optimization only: there is a hard join below before the LLM, so
// an unauthorized utterance never reaches generateResponse (no LLM, no
// tools, no TTS) regardless of how fast transcription finishes. A rejected
// turn wastes only transcription compute, which has no side effects. The
// transcript is still emitted to the same peer that sent the audio, which
// reveals nothing new to them.
type gateOutcome struct {
allowed bool
matched string
reason string
err error
}
var gateCh chan gateOutcome
runGate := false
if session.voiceGate != nil && session.InputAudioTranscription != nil {
skip := false
if session.voiceGate.cfg.When == config.VoiceGateWhenFirst {
session.gateMu.Lock()
skip = session.voiceVerified
session.gateMu.Unlock()
}
if !skip {
runGate = true
gateCh = make(chan gateOutcome, 1)
wavPath := f.Name()
go func() {
allowed, matched, reason, gerr := session.voiceGate.Authorize(ctx, wavPath)
gateCh <- gateOutcome{allowed: allowed, matched: matched, reason: reason, err: gerr}
}()
}
}
// TODO: If we have a real any-to-any model then transcription is optional
var transcript string
if session.InputAudioTranscription != nil {
@@ -1346,54 +1268,14 @@ func commitUtterance(ctx context.Context, utt []byte, session *Session, conv *Co
var err error
transcript, err = emitTranscription(ctx, t, session, generateItemID(), f.Name())
if err != nil {
// Drain the gate goroutine before returning so its in-flight read of
// the temp WAV finishes before the deferred os.Remove fires.
if runGate {
<-gateCh
}
sendError(t, "transcription_failed", err.Error(), "", "event_TODO")
return
}
} else {
// The voice gate runs only on the transcription path above; if an
// any-to-any model path is added here, join the gate before responding.
sendNotImplemented(t, "any-to-any models")
return
}
// Join on the gate before any side-effecting step.
if runGate {
out := <-gateCh
allowed := out.allowed
reason := out.reason
if out.err != nil {
// Fail closed: a gate that cannot decide must not let audio through.
xlog.Error("voice recognition gate error", "error", out.err)
allowed = false
reason = "verification error"
}
alreadyVerified := false
if session.voiceGate.cfg.When == config.VoiceGateWhenFirst {
session.gateMu.Lock()
alreadyVerified = session.voiceVerified
session.gateMu.Unlock()
}
proceed, markVerified := session.voiceGate.decide(alreadyVerified, allowed)
if !proceed {
xlog.Debug("voice recognition gate rejected utterance", "reason", reason)
if session.voiceGate.cfg.OnReject == config.VoiceGateRejectEvent {
sendError(t, "speaker_not_authorized", "speaker not authorized: "+reason, "", "event_TODO")
}
return
}
xlog.Debug("voice recognition gate authorized utterance", "speaker", out.matched)
if markVerified {
session.gateMu.Lock()
session.voiceVerified = true
session.gateMu.Unlock()
}
}
if !session.TranscriptionOnly {
generateResponse(ctx, session, utt, transcript, conv, t)
}

View File

@@ -107,29 +107,6 @@ var _ = Describe("defaultMaxHistoryItems", func() {
})
})
// Specs for resolveMaxHistoryItems: explicit override, explicit zero,
// fallback to the type default, and nil tolerance.
var _ = Describe("resolveMaxHistoryItems", func() {
	// intPtr returns a pointer to a copy of v, for the optional
	// Pipeline.MaxHistoryItems field.
	intPtr := func(v int) *int { return &v }

	It("uses an explicit pipeline.max_history_items", func() {
		pipeline := config.Pipeline{LLM: "llama", MaxHistoryItems: intPtr(10)}
		got := resolveMaxHistoryItems(&config.ModelConfig{Pipeline: pipeline})
		Expect(got).To(Equal(10))
	})

	It("honors an explicit 0 (unlimited) over the type default", func() {
		got := resolveMaxHistoryItems(&config.ModelConfig{
			KnownUsecases: withUsecases(config.FLAG_REALTIME_AUDIO),
			Pipeline:      config.Pipeline{MaxHistoryItems: intPtr(0)},
		})
		Expect(got).To(Equal(0))
	})

	It("falls back to the type default when unset", func() {
		got := resolveMaxHistoryItems(&config.ModelConfig{
			KnownUsecases: withUsecases(config.FLAG_REALTIME_AUDIO),
		})
		Expect(got).To(Equal(6))
	})

	It("tolerates nil", func() {
		Expect(resolveMaxHistoryItems(nil)).To(Equal(0))
	})
})
var _ = Describe("trimRealtimeItems", func() {
user := func(id string) *types.MessageItemUnion {
return &types.MessageItemUnion{User: &types.MessageItemUser{ID: id}}

View File

@@ -1,212 +0,0 @@
package openai
import (
"context"
"fmt"
"math"
"github.com/mudler/LocalAI/core/backend"
"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/core/services/voicerecognition"
"github.com/mudler/LocalAI/pkg/model"
)
// namedEmbedding pairs a reference speaker's name with the embedding computed
// from that reference's audio. newVoiceGate fills these for verify mode and
// authorizeVerify compares each utterance embedding against them.
type namedEmbedding struct {
name string
emb []float32
}
// voiceGate decides whether a committed utterance's speaker is authorized to
// drive the realtime pipeline.
//
// Authorize dispatches on cfg.Mode: verify mode compares the utterance against
// the configured references, any other mode looks the speaker up in registry.
type voiceGate struct {
cfg config.PipelineVoiceRecognition // normalized
registry voicerecognition.Registry // identify mode (nil otherwise)
refEmbeds []namedEmbedding // verify mode, pre-embedded refs
refAudios []config.VoiceReference // verify + anti-spoofing: ref paths
// Seams for testing; set by newVoiceGate to call the real backend.
// embedFn returns the speaker embedding for the WAV at wavPath.
embedFn func(ctx context.Context, wavPath string) ([]float32, error)
// verifyFn reports whether uttWav and refWav were verified as a match.
verifyFn func(ctx context.Context, uttWav, refWav string) (bool, error)
}
// newVoiceGate constructs a voiceGate from a pipeline's voice_recognition
// block.
//
// Steps, in order:
//  1. Normalize and validate cfg (fail fast, before any model is touched).
//  2. Load and validate the recognition model's own config.
//  3. Wire embedFn/verifyFn to the real backend calls.
//  4. For verify mode, prepare the references: with anti-spoofing the
//     reference audio paths are kept for per-turn VoiceVerify calls; without
//     it every reference is embedded once here, so each turn costs a single
//     utterance embed plus cheap cosine comparisons.
//
// Returns an error when validation fails, the model config cannot be loaded
// or is invalid, or a reference cannot be embedded.
func newVoiceGate(
	cfg config.PipelineVoiceRecognition,
	cl *config.ModelConfigLoader,
	ml *model.ModelLoader,
	appConfig *config.ApplicationConfig,
	registry voicerecognition.Registry,
) (*voiceGate, error) {
	cfg.Normalize()
	if err := cfg.Validate(registry != nil); err != nil {
		return nil, err
	}

	recCfg, err := cl.LoadModelConfigFileByName(cfg.Model, ml.ModelPath)
	if err != nil {
		return nil, fmt.Errorf("voice_recognition: failed to load model %q: %w", cfg.Model, err)
	}
	if valid, _ := recCfg.Validate(); !valid {
		return nil, fmt.Errorf("voice_recognition: invalid model config %q", cfg.Model)
	}

	// Real backend seams; tests substitute their own functions on the struct.
	embed := func(ctx context.Context, wavPath string) ([]float32, error) {
		out, embedErr := backend.VoiceEmbed(ctx, wavPath, ml, appConfig, *recCfg)
		if embedErr != nil {
			return nil, embedErr
		}
		return out.Embedding, nil
	}
	verify := func(ctx context.Context, uttWav, refWav string) (bool, error) {
		out, verifyErr := backend.VoiceVerify(ctx, uttWav, refWav, cfg.Threshold, true, ml, appConfig, *recCfg)
		if verifyErr != nil {
			return false, verifyErr
		}
		return out.Verified, nil
	}

	gate := &voiceGate{
		cfg:      cfg,
		registry: registry,
		embedFn:  embed,
		verifyFn: verify,
	}

	// Only verify mode needs reference preparation.
	if cfg.Mode != config.VoiceGateModeVerify {
		return gate, nil
	}
	if cfg.AntiSpoofing {
		gate.refAudios = cfg.References
		return gate, nil
	}
	for _, ref := range cfg.References {
		vec, embedErr := gate.embedFn(context.Background(), ref.Audio)
		if embedErr != nil {
			return nil, fmt.Errorf("voice_recognition: failed to embed reference %q: %w", ref.Name, embedErr)
		}
		gate.refEmbeds = append(gate.refEmbeds, namedEmbedding{name: ref.Name, emb: vec})
	}
	return gate, nil
}
// Authorize decides whether the speaker of the utterance stored at wavPath
// may drive the pipeline, delegating to the verify or identify strategy
// according to the gate's configured mode.
//
// allowed: speaker is authorized.
// matched: matched person's name (informational), empty if none.
// reason:  human-readable deny reason.
// err:     backend failure (caller should fail closed).
func (g *voiceGate) Authorize(ctx context.Context, wavPath string) (allowed bool, matched string, reason string, err error) {
	switch g.cfg.Mode {
	case config.VoiceGateModeVerify:
		return g.authorizeVerify(ctx, wavPath)
	default:
		// Every non-verify mode is handled as identify.
		return g.authorizeIdentify(ctx, wavPath)
	}
}
// authorizeIdentify authorizes an utterance by looking its speaker up in the
// voice registry.
//
// The utterance is embedded, the single nearest registered identity is
// requested, and the speaker is allowed only when that match is within the
// configured distance threshold AND passes the allow list (see allowMatch).
//
// Returns (allowed, matchedName, denyReason, err). err is non-nil only for
// embed or registry failures; an empty embedding, no match, a too-distant
// match and an allow-list miss are ordinary denials with a nil error.
func (g *voiceGate) authorizeIdentify(ctx context.Context, wavPath string) (bool, string, string, error) {
	emb, err := g.embedFn(ctx, wavPath)
	if err != nil {
		return false, "", "embed failed", err
	}
	if len(emb) == 0 {
		return false, "", "no speech detected", nil
	}
	matches, err := g.registry.Identify(ctx, emb, 1)
	if err != nil {
		return false, "", "identify failed", err
	}
	if len(matches) == 0 {
		return false, "", "unknown speaker", nil
	}
	m := matches[0]
	// Fail closed on NaN: `Distance > Threshold` is false when either operand
	// is NaN, which would wave an undecidable match through the gate. Written
	// as a negated <= so that only a distance provably within the threshold
	// passes.
	if !(m.Distance <= g.cfg.Threshold) {
		return false, m.Metadata.Name, "distance above threshold", nil
	}
	if !g.allowMatch(m.Metadata) {
		return false, m.Metadata.Name, "speaker not in allow list", nil
	}
	return true, m.Metadata.Name, "", nil
}
// allowMatch reports whether a matched identity passes the gate's allow list.
//
// With no names and no labels configured, every registered speaker is
// authorized. Otherwise the identity passes when its name is listed, or when
// any listed label is present as a key in its label set.
func (g *voiceGate) allowMatch(meta voicerecognition.Metadata) bool {
	allowedNames := g.cfg.Allow.Names
	allowedLabels := g.cfg.Allow.Labels

	if len(allowedNames) == 0 && len(allowedLabels) == 0 {
		return true
	}
	for _, name := range allowedNames {
		if meta.Name == name {
			return true
		}
	}
	for _, label := range allowedLabels {
		// Only the label's presence matters, not its value.
		if _, present := meta.Labels[label]; present {
			return true
		}
	}
	return false
}
// authorizeVerify authorizes an utterance against the configured reference
// speakers.
//
// With anti-spoofing enabled, the utterance is checked against each reference
// audio through verifyFn, in order, and the first verified reference wins; a
// verifyFn error aborts immediately. Otherwise the utterance is embedded once
// and compared by cosine distance to the pre-computed reference embeddings,
// the first one within the threshold winning.
//
// Returns (allowed, matchedName, denyReason, err); err is non-nil only for
// backend failures.
func (g *voiceGate) authorizeVerify(ctx context.Context, wavPath string) (bool, string, string, error) {
	const noMatch = "no reference matched"

	if !g.cfg.AntiSpoofing {
		uttEmb, err := g.embedFn(ctx, wavPath)
		if err != nil {
			return false, "", "embed failed", err
		}
		if len(uttEmb) == 0 {
			return false, "", "no speech detected", nil
		}
		for _, ref := range g.refEmbeds {
			if cosineDistance(uttEmb, ref.emb) <= g.cfg.Threshold {
				return true, ref.name, "", nil
			}
		}
		return false, "", noMatch, nil
	}

	// Anti-spoofing path: compare audio to audio, reference by reference.
	for _, ref := range g.refAudios {
		verified, err := g.verifyFn(ctx, wavPath, ref.Audio)
		if err != nil {
			return false, "", "verify failed", err
		}
		if verified {
			return true, ref.Name, "", nil
		}
	}
	return false, "", noMatch, nil
}
// decide turns an Authorize outcome into an action, taking the gate's
// when-policy and the session's prior verification state into account.
//
//	proceed:      run the LLM response for this utterance.
//	markVerified: record a successful first-utterance verification.
//
// Under any policy other than when:first, each utterance stands on its own:
// proceed mirrors allowed and nothing is ever marked. Under when:first, an
// already-verified session always proceeds (callers normally skip Authorize
// in that case, but reaching here is still safe), and an unverified session
// proceeds and is marked only when this utterance was allowed.
func (g *voiceGate) decide(alreadyVerified, allowed bool) (proceed, markVerified bool) {
	if g.cfg.When != config.VoiceGateWhenFirst {
		return allowed, false
	}
	if alreadyVerified {
		return true, false
	}
	return allowed, allowed
}
// cosineDistance returns 1 - cosine_similarity, matching the voice registry's
// distance convention (lower = closer). Returns 1 (treated as "no match") for
// zero-length, mismatched, or zero-magnitude vectors.
func cosineDistance(a, b []float32) float32 {
if len(a) == 0 || len(a) != len(b) {
return 1
}
var dot, na, nb float64
for i := range a {
dot += float64(a[i]) * float64(b[i])
na += float64(a[i]) * float64(a[i])
nb += float64(b[i]) * float64(b[i])
}
if na == 0 || nb == 0 {
return 1
}
return float32(1 - dot/(math.Sqrt(na)*math.Sqrt(nb)))
}

View File

@@ -1,154 +0,0 @@
package openai
import (
"context"
"errors"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
"github.com/mudler/LocalAI/core/backend"
"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
"github.com/mudler/LocalAI/core/schema"
"github.com/mudler/LocalAI/core/services/voicerecognition"
)
// These specs drive the REAL commitUtterance path end to end (gate goroutine,
// the hard join before the LLM, the reject event, and when:first session
// trust) using the existing fakeTransport/fakeModel doubles. They are the
// integration counterpart to the unit specs in realtime_voicegate_test.go:
// here the gate is wired into a Session exactly as runRealtimeSession wires it.
// itGate assembles an identify-mode voiceGate for the integration specs.
//
// The fake registry always answers with one match named matchName at
// distance 0.1 (inside the 0.25 threshold), and embedFn always yields
// embed/embErr. allowName is the only identity on the allow list; when and
// onReject choose the gate policy.
func itGate(allowName, matchName string, embed []float32, embErr error, when, onReject string) *voiceGate {
	gateCfg := config.PipelineVoiceRecognition{
		Mode:      config.VoiceGateModeIdentify,
		Threshold: 0.25,
		When:      when,
		OnReject:  onReject,
		Allow:     config.VoiceRecognitionAllow{Names: []string{allowName}},
	}
	fixedMatch := voicerecognition.Match{
		Distance: 0.1,
		Metadata: voicerecognition.Metadata{Name: matchName},
	}
	return &voiceGate{
		cfg:      gateCfg,
		registry: &fakeRegistry{matches: []voicerecognition.Match{fixedMatch}},
		embedFn: func(context.Context, string) ([]float32, error) {
			return embed, embErr
		},
	}
}
// itSession builds a Session wired for one full pipeline turn with the given
// gate attached, and returns it together with its backing fakeModel.
//
// The fakeModel transcribes to "hello", streams the tokens "Hi" and
// " there." and emits a single TTS chunk at 24 kHz, mirroring the
// streaming-LLM setup used by realtime_stream_test.go so triggerResponse
// runs to a response.done. Streaming is switched on for both LLM and TTS.
func itSession(gate *voiceGate) (*Session, *fakeModel) {
	enabled := true

	model := &fakeModel{
		cfg:             &config.ModelConfig{},
		transcribeFinal: &schema.TranscriptionResult{Text: "hello"},
		predictTokens:   []string{"Hi", " there."},
		predictResp:     backend.LLMResponse{Response: "Hi there."},
		ttsStreamChunks: [][]byte{{1}},
		ttsStreamRate:   24000,
	}

	streaming := config.PipelineStreaming{LLM: &enabled, TTS: &enabled}
	sess := &Session{
		OutputSampleRate:        24000,
		InputAudioTranscription: &types.AudioTranscription{},
		ModelInterface:          model,
		ModelConfig:             &config.ModelConfig{Pipeline: config.Pipeline{Streaming: streaming}},
		voiceGate:               gate,
	}
	return sess, model
}
// hasSpeakerNotAuthorized reports whether the transport recorded at least one
// error event carrying the "speaker_not_authorized" code, i.e. whether the
// client was told that the gate rejected the speaker.
func hasSpeakerNotAuthorized(tr *fakeTransport) bool {
	for _, recorded := range tr.events {
		errEvent, isErr := recorded.(types.ErrorEvent)
		if !isErr {
			continue
		}
		if errEvent.Error.Code == "speaker_not_authorized" {
			return true
		}
	}
	return false
}
// Integration specs driving the voice gate through commitUtterance: each spec
// commits one (or two) utterances on a session built by itSession and inspects
// the events the fake transport recorded.
var _ = Describe("realtime voice gate integration (commitUtterance)", func() {
	// Non-empty PCM so commitUtterance proceeds.
	utt := make([]byte, 32)

	// commit sends utt through commitUtterance using a fresh transport and a
	// fresh Conversation, then returns the transport for inspection.
	commit := func(session *Session) *fakeTransport {
		transport := &fakeTransport{}
		commitUtterance(context.Background(), utt, session, &Conversation{}, transport)
		return transport
	}

	It("allows an authorized speaker through to a full response", func() {
		gate := itGate("alice", "alice", []float32{1, 0, 0}, nil,
			config.VoiceGateWhenEvery, config.VoiceGateRejectEvent)
		session, _ := itSession(gate)

		tr := commit(session)

		Expect(hasSpeakerNotAuthorized(tr)).To(BeFalse())
		// The LLM/TTS pipeline ran to completion.
		Expect(tr.countEvents(types.ServerEventTypeResponseDone)).To(BeNumerically(">=", 1))
		// Transcription still happened (parallel with the gate).
		Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionCompleted)).To(Equal(1))
	})

	It("drops an unauthorized speaker before the LLM and emits a reject event", func() {
		// The match name "mallory" is not in the allow list, so the gate denies.
		gate := itGate("alice", "mallory", []float32{1, 0, 0}, nil,
			config.VoiceGateWhenEvery, config.VoiceGateRejectEvent)
		session, _ := itSession(gate)

		tr := commit(session)

		// Hard barrier: the LLM/TTS pipeline never ran.
		Expect(tr.countEvents(types.ServerEventTypeResponseDone)).To(Equal(0))
		// The client was told why.
		Expect(hasSpeakerNotAuthorized(tr)).To(BeTrue())
		// Transcription of the rejected utterance is still emitted (sent only
		// to the peer that produced the audio; reveals nothing new).
		Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionCompleted)).To(Equal(1))
	})

	It("fails closed on a gate backend error", func() {
		gate := itGate("alice", "alice", nil, errors.New("backend down"),
			config.VoiceGateWhenEvery, config.VoiceGateRejectEvent)
		session, _ := itSession(gate)

		tr := commit(session)

		Expect(tr.countEvents(types.ServerEventTypeResponseDone)).To(Equal(0))
		Expect(hasSpeakerNotAuthorized(tr)).To(BeTrue())
	})

	It("drops silently when on_reject is drop_silent (no error event)", func() {
		gate := itGate("alice", "mallory", []float32{1, 0, 0}, nil,
			config.VoiceGateWhenEvery, config.VoiceGateRejectSilent)
		session, _ := itSession(gate)

		tr := commit(session)

		Expect(tr.countEvents(types.ServerEventTypeResponseDone)).To(Equal(0))
		Expect(hasSpeakerNotAuthorized(tr)).To(BeFalse())
	})

	It("when:first trusts the session after one match, even if later embeds fail", func() {
		gate := itGate("alice", "alice", []float32{1, 0, 0}, nil,
			config.VoiceGateWhenFirst, config.VoiceGateRejectEvent)
		session, _ := itSession(gate)

		// First utterance: authorized, marks the session verified.
		first := commit(session)
		Expect(hasSpeakerNotAuthorized(first)).To(BeFalse())
		Expect(first.countEvents(types.ServerEventTypeResponseDone)).To(BeNumerically(">=", 1))

		// Break the gate: any further Authorize would now error.
		gate.embedFn = func(context.Context, string) ([]float32, error) {
			return nil, errors.New("boom")
		}

		// Second utterance still proceeds because when:first skips re-verification.
		second := commit(session)
		Expect(hasSpeakerNotAuthorized(second)).To(BeFalse())
		Expect(second.countEvents(types.ServerEventTypeResponseDone)).To(BeNumerically(">=", 1))
	})
})

View File

@@ -1,231 +0,0 @@
package openai
import (
"context"
"errors"
"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/core/services/voicerecognition"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)
// Table-driven specs for cosineDistance: one spec per row, each asserting the
// returned distance is within 1e-6 of the expected value.
var _ = Describe("cosineDistance", func() {
	rows := []struct {
		name string
		a, b []float32
		want int
	}{
		{name: "is 0 for identical vectors", a: []float32{1, 0, 0}, b: []float32{1, 0, 0}, want: 0},
		{name: "is ~1 for orthogonal vectors", a: []float32{1, 0}, b: []float32{0, 1}, want: 1},
		{name: "is ~2 for opposite vectors", a: []float32{1, 0}, b: []float32{-1, 0}, want: 2},
		{name: "returns 1 for length mismatch", a: []float32{1, 0}, b: []float32{1}, want: 1},
		{name: "returns 1 for a zero vector", a: []float32{0, 0}, b: []float32{1, 0}, want: 1},
	}

	for _, row := range rows {
		row := row // pin the row for the spec closure, which runs after the loop ends
		It(row.name, func() {
			Expect(cosineDistance(row.a, row.b)).To(BeNumerically("~", row.want, 1e-6))
		})
	}
})
// fakeRegistry is a canned stand-in used as the voiceGate registry in these
// specs. Identify replays the configured matches and err; Register and Forget
// always succeed.
type fakeRegistry struct {
	matches []voicerecognition.Match // returned verbatim by Identify
	err     error                    // returned verbatim by Identify
}

// Register ignores the embedding and echoes the metadata back with no error.
func (r *fakeRegistry) Register(_ context.Context, _ []float32, meta voicerecognition.Metadata) (voicerecognition.Metadata, error) {
	return meta, nil
}

// Identify ignores the probe and topK and returns the canned matches and error.
func (r *fakeRegistry) Identify(_ context.Context, _ []float32, _ int) ([]voicerecognition.Match, error) {
	return r.matches, r.err
}

// Forget is a no-op that always reports success.
func (r *fakeRegistry) Forget(_ context.Context, _ string) error {
	return nil
}
// Specs for voiceGate.Authorize in identify mode. The gate is wired to a
// fakeRegistry and a stubbed embedFn, so each spec controls the registry
// matches and the embedding outcome directly.
var _ = Describe("voiceGate identify mode", func() {
	const probe = "x.wav"
	ctx := context.Background()

	// fixedEmbed yields an embedFn that always returns the given pair.
	fixedEmbed := func(emb []float32, err error) func(context.Context, string) ([]float32, error) {
		return func(context.Context, string) ([]float32, error) {
			return emb, err
		}
	}

	// newIdentifyGate assembles an identify-mode gate (threshold 0.25, checked
	// on every utterance, reject-by-event) with the given allow list, canned
	// registry matches, and embedding error.
	newIdentifyGate := func(allow config.VoiceRecognitionAllow, matches []voicerecognition.Match, embErr error) *voiceGate {
		cfg := config.PipelineVoiceRecognition{
			Mode:      config.VoiceGateModeIdentify,
			Threshold: 0.25,
			When:      config.VoiceGateWhenEvery,
			OnReject:  config.VoiceGateRejectEvent,
			Allow:     allow,
		}
		return &voiceGate{
			cfg:      cfg,
			registry: &fakeRegistry{matches: matches},
			embedFn:  fixedEmbed([]float32{1, 0, 0}, embErr),
		}
	}

	It("allows a registered speaker within threshold and in the allow list", func() {
		allow := config.VoiceRecognitionAllow{Names: []string{"alice"}}
		found := []voicerecognition.Match{
			{Distance: 0.1, Metadata: voicerecognition.Metadata{Name: "alice"}},
		}
		gate := newIdentifyGate(allow, found, nil)

		ok, who, _, err := gate.Authorize(ctx, probe)

		Expect(err).ToNot(HaveOccurred())
		Expect(ok).To(BeTrue())
		Expect(who).To(Equal("alice"))
	})

	It("allows any registered speaker when the allow list is empty", func() {
		found := []voicerecognition.Match{
			{Distance: 0.1, Metadata: voicerecognition.Metadata{Name: "carol"}},
		}
		gate := newIdentifyGate(config.VoiceRecognitionAllow{}, found, nil)

		ok, _, _, _ := gate.Authorize(ctx, probe)

		Expect(ok).To(BeTrue())
	})

	It("allows by label", func() {
		allow := config.VoiceRecognitionAllow{Labels: []string{"family"}}
		found := []voicerecognition.Match{
			{
				Distance: 0.1,
				Metadata: voicerecognition.Metadata{
					Name:   "bob",
					Labels: map[string]string{"family": "yes"},
				},
			},
		}
		gate := newIdentifyGate(allow, found, nil)

		ok, _, _, _ := gate.Authorize(ctx, probe)

		Expect(ok).To(BeTrue())
	})

	It("denies a speaker not in the allow list", func() {
		allow := config.VoiceRecognitionAllow{Names: []string{"alice"}}
		found := []voicerecognition.Match{
			{Distance: 0.1, Metadata: voicerecognition.Metadata{Name: "mallory"}},
		}
		gate := newIdentifyGate(allow, found, nil)

		ok, who, why, _ := gate.Authorize(ctx, probe)

		Expect(ok).To(BeFalse())
		Expect(who).To(Equal("mallory"))
		Expect(why).To(ContainSubstring("allow"))
	})

	It("denies a match above the threshold", func() {
		// Distance 0.9 is well beyond the 0.25 threshold.
		found := []voicerecognition.Match{
			{Distance: 0.9, Metadata: voicerecognition.Metadata{Name: "alice"}},
		}
		gate := newIdentifyGate(config.VoiceRecognitionAllow{}, found, nil)

		ok, who, _, _ := gate.Authorize(ctx, probe)

		Expect(ok).To(BeFalse())
		Expect(who).To(Equal("alice"))
	})

	It("denies when no registry match", func() {
		gate := newIdentifyGate(config.VoiceRecognitionAllow{}, nil, nil)

		ok, _, why, _ := gate.Authorize(ctx, probe)

		Expect(ok).To(BeFalse())
		Expect(why).To(ContainSubstring("unknown"))
	})

	It("denies (no error) when no speech is detected", func() {
		gate := newIdentifyGate(config.VoiceRecognitionAllow{}, nil, nil)
		// A nil embedding with a nil error is the "no speech" signal here.
		gate.embedFn = fixedEmbed(nil, nil)

		ok, _, why, err := gate.Authorize(ctx, probe)

		Expect(err).ToNot(HaveOccurred())
		Expect(ok).To(BeFalse())
		Expect(why).To(ContainSubstring("no speech"))
	})

	It("denies and surfaces the error when embedding fails", func() {
		gate := newIdentifyGate(config.VoiceRecognitionAllow{}, nil, errors.New("boom"))

		ok, _, why, err := gate.Authorize(ctx, probe)

		Expect(err).To(HaveOccurred())
		Expect(ok).To(BeFalse())
		Expect(why).To(ContainSubstring("embed"))
	})

	It("denies and surfaces the error when identify fails", func() {
		gate := newIdentifyGate(config.VoiceRecognitionAllow{}, nil, nil)
		gate.registry = &fakeRegistry{err: errors.New("boom")}

		ok, _, _, err := gate.Authorize(ctx, probe)

		Expect(err).To(HaveOccurred())
		Expect(ok).To(BeFalse())
	})
})
// Specs for voiceGate.Authorize in verify mode: the embedding path compares
// the utterance against reference embeddings, and the anti-spoofing path
// delegates to verifyFn using reference audio files.
var _ = Describe("voiceGate verify mode", func() {
	const probe = "x.wav"
	ctx := context.Background()

	// verifyCfg returns a fresh verify-mode configuration (threshold 0.25,
	// checked on every utterance, reject-by-event) with anti-spoofing left off.
	verifyCfg := func() config.PipelineVoiceRecognition {
		return config.PipelineVoiceRecognition{
			Mode:      config.VoiceGateModeVerify,
			Threshold: 0.25,
			When:      config.VoiceGateWhenEvery,
			OnReject:  config.VoiceGateRejectEvent,
		}
	}

	// antiSpoofCfg is verifyCfg with anti-spoofing switched on.
	antiSpoofCfg := func() config.PipelineVoiceRecognition {
		cfg := verifyCfg()
		cfg.AntiSpoofing = true
		return cfg
	}

	// aliceEmbeds returns a fresh single-entry reference set for "alice".
	aliceEmbeds := func() []namedEmbedding {
		return []namedEmbedding{{name: "alice", emb: []float32{1, 0, 0}}}
	}

	// aliceAudio returns a fresh single-entry reference audio list for "alice".
	aliceAudio := func() []config.VoiceReference {
		return []config.VoiceReference{{Name: "alice", Audio: "/alice.wav"}}
	}

	// embedReturning yields an embedFn that always returns the given pair.
	embedReturning := func(emb []float32, err error) func(context.Context, string) ([]float32, error) {
		return func(context.Context, string) ([]float32, error) {
			return emb, err
		}
	}

	It("allows when the utterance matches a reference embedding", func() {
		gate := &voiceGate{
			cfg:       verifyCfg(),
			refEmbeds: aliceEmbeds(),
			embedFn:   embedReturning([]float32{1, 0, 0}, nil),
		}

		ok, who, _, err := gate.Authorize(ctx, probe)

		Expect(err).ToNot(HaveOccurred())
		Expect(ok).To(BeTrue())
		Expect(who).To(Equal("alice"))
	})

	It("denies when no reference is within threshold", func() {
		// The utterance embedding is orthogonal to the only reference.
		gate := &voiceGate{
			cfg:       verifyCfg(),
			refEmbeds: aliceEmbeds(),
			embedFn:   embedReturning([]float32{0, 1, 0}, nil),
		}

		ok, _, why, _ := gate.Authorize(ctx, probe)

		Expect(ok).To(BeFalse())
		Expect(why).To(ContainSubstring("reference"))
	})

	It("denies (no error) when no speech is detected", func() {
		gate := &voiceGate{
			cfg:       verifyCfg(),
			refEmbeds: aliceEmbeds(),
			embedFn:   embedReturning(nil, nil),
		}

		ok, _, why, err := gate.Authorize(ctx, probe)

		Expect(err).ToNot(HaveOccurred())
		Expect(ok).To(BeFalse())
		Expect(why).To(ContainSubstring("no speech"))
	})

	It("denies and surfaces the error when embedding fails", func() {
		gate := &voiceGate{
			cfg:       verifyCfg(),
			refEmbeds: aliceEmbeds(),
			embedFn:   embedReturning(nil, errors.New("boom")),
		}

		ok, _, _, err := gate.Authorize(ctx, probe)

		Expect(err).To(HaveOccurred())
		Expect(ok).To(BeFalse())
	})

	It("uses verifyFn when anti-spoofing is enabled", func() {
		verifyInvoked := false
		gate := &voiceGate{
			cfg:       antiSpoofCfg(),
			refAudios: aliceAudio(),
			verifyFn: func(context.Context, string, string) (bool, error) {
				verifyInvoked = true
				return true, nil
			},
		}

		ok, who, _, err := gate.Authorize(ctx, probe)

		Expect(err).ToNot(HaveOccurred())
		Expect(verifyInvoked).To(BeTrue())
		Expect(ok).To(BeTrue())
		Expect(who).To(Equal("alice"))
	})

	It("denies and surfaces the error when verifyFn fails (anti-spoofing)", func() {
		gate := &voiceGate{
			cfg:       antiSpoofCfg(),
			refAudios: aliceAudio(),
			verifyFn: func(context.Context, string, string) (bool, error) {
				return false, errors.New("boom")
			},
		}

		ok, _, _, err := gate.Authorize(ctx, probe)

		Expect(err).To(HaveOccurred())
		Expect(ok).To(BeFalse())
	})
})
// Specs for newVoiceGate's up-front validation: with every dependency passed
// as nil, construction must return an error and a nil gate.
var _ = Describe("newVoiceGate", func() {
	// expectRejected builds a gate from cfg with all other arguments nil and
	// asserts that construction fails.
	expectRejected := func(cfg config.PipelineVoiceRecognition) {
		gate, err := newVoiceGate(cfg, nil, nil, nil, nil)
		Expect(err).To(HaveOccurred())
		Expect(gate).To(BeNil())
	}

	It("fails fast when identify mode has no registry (before touching the loader)", func() {
		expectRejected(config.PipelineVoiceRecognition{
			Model: "spk",
			Mode:  config.VoiceGateModeIdentify,
		})
	})

	It("fails fast when verify mode has no references", func() {
		expectRejected(config.PipelineVoiceRecognition{
			Model: "spk",
			Mode:  config.VoiceGateModeVerify,
		})
	})
})
// Specs for voiceGate.decide, called here as decide(alreadyVerified, allowed)
// and returning (proceed, markVerified), across the "every" and "first"
// policies.
var _ = Describe("voiceGate decide", func() {
	// gateWith returns a gate whose only configured field is the When policy.
	gateWith := func(when string) *voiceGate {
		cfg := config.PipelineVoiceRecognition{When: when}
		return &voiceGate{cfg: cfg}
	}

	It("every: proceeds iff allowed, never marks verified", func() {
		// Allowed utterance.
		allowedGate := gateWith(config.VoiceGateWhenEvery)
		proceed, mark := allowedGate.decide(false, true)
		Expect(proceed).To(BeTrue())
		Expect(mark).To(BeFalse())

		// Rejected utterance.
		rejectedGate := gateWith(config.VoiceGateWhenEvery)
		proceed, mark = rejectedGate.decide(false, false)
		Expect(proceed).To(BeFalse())
		Expect(mark).To(BeFalse())
	})

	It("first: marks verified on first allow", func() {
		g := gateWith(config.VoiceGateWhenFirst)
		proceed, mark := g.decide(false, true)
		Expect(proceed).To(BeTrue())
		Expect(mark).To(BeTrue())
	})

	It("first: denies on first reject without marking", func() {
		g := gateWith(config.VoiceGateWhenFirst)
		proceed, mark := g.decide(false, false)
		Expect(proceed).To(BeFalse())
		Expect(mark).To(BeFalse())
	})

	It("first: proceeds without re-check once already verified", func() {
		g := gateWith(config.VoiceGateWhenFirst)
		proceed, mark := g.decide(true, false)
		Expect(proceed).To(BeTrue())
		Expect(mark).To(BeFalse())
	})
})

View File

@@ -44,7 +44,7 @@ test.describe('Model Editor — Back navigation', () => {
await mockEditorEndpoints(page)
})
test('Back returns to Manage with a "Back to System" caption', async ({ page }) => {
test('Back returns to Manage with a "Back to Manage" caption', async ({ page }) => {
await page.goto('/app/manage')
await expect(page.locator('.table')).toBeVisible({ timeout: 10_000 })
@@ -55,7 +55,7 @@ test.describe('Model Editor — Back navigation', () => {
await page.getByRole('menuitem', { name: 'Edit configuration' }).click()
await expect(page).toHaveURL(/\/app\/model-editor\//)
const back = page.getByRole('button', { name: /Back to System/ })
const back = page.getByRole('button', { name: /Back to Manage/ })
await expect(back).toBeVisible({ timeout: 10_000 })
await back.click()
@@ -89,6 +89,6 @@ test.describe('Model Editor — Back navigation', () => {
test('falls back to "Back to Manage" on a direct visit with no origin state', async ({ page }) => {
await page.goto('/app/model-editor/mock-model')
await expect(page.getByRole('button', { name: /Back to System/ })).toBeVisible({ timeout: 10_000 })
await expect(page.getByRole('button', { name: /Back to Manage/ })).toBeVisible({ timeout: 10_000 })
})
})

View File

@@ -86,8 +86,7 @@
"type": "Type",
"value": "Value",
"search": "Search...",
"selectPlaceholder": "Select an option...",
"noMatch": "No matches"
"selectPlaceholder": "Select an option..."
},
"time": {
"now": "now",

View File

@@ -1,38 +0,0 @@
{
"title": {
"add": "Add Model",
"edit": "Model Editor"
},
"subtitle": {
"chooseModelType": "Choose a model type to get started",
"newModel": "New model"
},
"actions": {
"backTo": "Back to {{page}}",
"system": "System",
"templates": "Templates",
"createModel": "Create Model",
"saveChanges": "Save Changes",
"saving": "Saving...",
"saved": "Saved",
"switchWarning": "Save or discard changes before switching tabs.",
"discardAndSwitch": "Discard & Switch"
},
"tabs": {
"interactive": "Interactive",
"yaml": "YAML",
"yamlDescription": "Edit the YAML directly. The model name must be set in the YAML for create to work."
},
"forms": {
"modelName": {
"label": "Model Name",
"placeholder": "my-model-name",
"hint": "Use letters, numbers, hyphens, underscores, and dots only."
},
"empty": {
"nav": "Use the search bar above to add fields",
"title": "No fields configured",
"text": "Use the search bar above to find and add configuration fields."
}
}
}

View File

@@ -1,7 +1,6 @@
{
"title": "Install Models",
"subtitle": "Browse and install AI models from the gallery",
"models": "Models",
"stats": {
"available": "Available",
"installed": "Installed"
@@ -90,11 +89,5 @@
"loadFailed": "Failed to load models: {{message}}",
"installFailed": "Failed to install: {{message}}",
"deleteFailed": "Failed to delete: {{message}}"
},
"selector": {
"loading": "Loading models...",
"selectModel": "Select model...",
"searchPlaceholder": "Search models...",
"noModels": "No models available"
}
}

Some files were not shown because too many files have changed in this diff Show More