diff --git a/.github/backend-matrix.yml b/.github/backend-matrix.yml index 92293b764..953913293 100644 --- a/.github/backend-matrix.yml +++ b/.github/backend-matrix.yml @@ -781,6 +781,19 @@ include: dockerfile: "./backend/Dockerfile.golang" context: "./" ubuntu-version: '2404' + - build-type: 'cublas' + cuda-major-version: "12" + cuda-minor-version: "8" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-nvidia-cuda-12-omnivoice-cpp' + runs-on: 'ubuntu-latest' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "omnivoice-cpp" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2404' - build-type: 'cublas' cuda-major-version: "12" cuda-minor-version: "8" @@ -1712,6 +1725,19 @@ include: dockerfile: "./backend/Dockerfile.golang" context: "./" ubuntu-version: '2404' + - build-type: 'cublas' + cuda-major-version: "13" + cuda-minor-version: "0" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-nvidia-cuda-13-omnivoice-cpp' + runs-on: 'ubuntu-latest' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "omnivoice-cpp" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2404' - build-type: 'cublas' cuda-major-version: "13" cuda-minor-version: "0" @@ -1751,6 +1777,19 @@ include: backend: "qwen3-tts-cpp" dockerfile: "./backend/Dockerfile.golang" context: "./" + - build-type: 'cublas' + cuda-major-version: "13" + cuda-minor-version: "0" + platforms: 'linux/arm64' + skip-drivers: 'false' + tag-latest: 'auto' + tag-suffix: '-nvidia-l4t-cuda-13-arm64-omnivoice-cpp' + base-image: "ubuntu:24.04" + ubuntu-version: '2404' + runs-on: 'ubuntu-24.04-arm' + backend: "omnivoice-cpp" + dockerfile: "./backend/Dockerfile.golang" + context: "./" - build-type: 'cublas' cuda-major-version: "13" cuda-minor-version: "0" @@ -3483,6 +3522,35 @@ include: dockerfile: "./backend/Dockerfile.golang" context: "./" ubuntu-version: '2404' + # omnivoice-cpp + - build-type: '' + 
cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + platform-tag: 'amd64' + tag-latest: 'auto' + tag-suffix: '-cpu-omnivoice-cpp' + runs-on: 'ubuntu-latest' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "omnivoice-cpp" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2404' + - build-type: '' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/arm64' + platform-tag: 'arm64' + tag-latest: 'auto' + tag-suffix: '-cpu-omnivoice-cpp' + runs-on: 'ubuntu-24.04-arm' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "omnivoice-cpp" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2404' - build-type: 'sycl_f32' cuda-major-version: "" cuda-minor-version: "" @@ -3496,6 +3564,19 @@ include: dockerfile: "./backend/Dockerfile.golang" context: "./" ubuntu-version: '2404' + - build-type: 'sycl_f32' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-intel-sycl-f32-omnivoice-cpp' + runs-on: 'ubuntu-latest' + base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04" + skip-drivers: 'false' + backend: "omnivoice-cpp" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2404' - build-type: 'sycl_f16' cuda-major-version: "" cuda-minor-version: "" @@ -3509,6 +3590,19 @@ include: dockerfile: "./backend/Dockerfile.golang" context: "./" ubuntu-version: '2404' + - build-type: 'sycl_f16' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-intel-sycl-f16-omnivoice-cpp' + runs-on: 'ubuntu-latest' + base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04" + skip-drivers: 'false' + backend: "omnivoice-cpp" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2404' - build-type: 'vulkan' cuda-major-version: "" cuda-minor-version: "" @@ -3523,6 +3617,20 @@ include: 
dockerfile: "./backend/Dockerfile.golang" context: "./" ubuntu-version: '2404' + - build-type: 'vulkan' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + platform-tag: 'amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-vulkan-omnivoice-cpp' + runs-on: 'ubuntu-latest' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "omnivoice-cpp" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2404' - build-type: 'vulkan' cuda-major-version: "" cuda-minor-version: "" @@ -3537,6 +3645,20 @@ include: dockerfile: "./backend/Dockerfile.golang" context: "./" ubuntu-version: '2404' + - build-type: 'vulkan' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/arm64' + platform-tag: 'arm64' + tag-latest: 'auto' + tag-suffix: '-gpu-vulkan-omnivoice-cpp' + runs-on: 'ubuntu-24.04-arm' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "omnivoice-cpp" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2404' - build-type: 'cublas' cuda-major-version: "12" cuda-minor-version: "0" @@ -3550,6 +3672,19 @@ include: dockerfile: "./backend/Dockerfile.golang" context: "./" ubuntu-version: '2204' + - build-type: 'cublas' + cuda-major-version: "12" + cuda-minor-version: "0" + platforms: 'linux/arm64' + skip-drivers: 'false' + tag-latest: 'auto' + tag-suffix: '-nvidia-l4t-arm64-omnivoice-cpp' + base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0" + runs-on: 'ubuntu-24.04-arm' + backend: "omnivoice-cpp" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2204' - build-type: 'hipblas' cuda-major-version: "" cuda-minor-version: "" @@ -3563,6 +3698,19 @@ include: dockerfile: "./backend/Dockerfile.golang" context: "./" ubuntu-version: '2404' + - build-type: 'hipblas' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-rocm-hipblas-omnivoice-cpp' + base-image: "rocm/dev-ubuntu-24.04:6.4.4" + 
runs-on: 'ubuntu-latest' + skip-drivers: 'false' + backend: "omnivoice-cpp" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2404' # vibevoice-cpp - build-type: '' cuda-major-version: "" @@ -4393,6 +4541,10 @@ includeDarwin: tag-suffix: "-metal-darwin-arm64-qwen3-tts-cpp" build-type: "metal" lang: "go" + - backend: "omnivoice-cpp" + tag-suffix: "-metal-darwin-arm64-omnivoice-cpp" + build-type: "metal" + lang: "go" - backend: "vibevoice-cpp" tag-suffix: "-metal-darwin-arm64-vibevoice-cpp" build-type: "metal" diff --git a/.github/workflows/bump_deps.yaml b/.github/workflows/bump_deps.yaml index f02414ef9..a2d88e1d1 100644 --- a/.github/workflows/bump_deps.yaml +++ b/.github/workflows/bump_deps.yaml @@ -70,6 +70,10 @@ jobs: variable: "QWEN3TTS_CPP_VERSION" branch: "main" file: "backend/go/qwen3-tts-cpp/Makefile" + - repository: "ServeurpersoCom/omnivoice.cpp" + variable: "OMNIVOICE_VERSION" + branch: "master" + file: "backend/go/omnivoice-cpp/Makefile" - repository: "localai-org/vibevoice.cpp" variable: "VIBEVOICE_CPP_VERSION" branch: "master" diff --git a/Makefile b/Makefile index 8195c06fc..ecca9d3c7 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Disable parallel execution for backend builds -.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/turboquant backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/crispasr backends/parakeet-cpp backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/rfdetr-cpp backends/insightface backends/speaker-recognition backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/sglang backends/moonshine backends/pocket-tts 
backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/acestep-cpp backends/fish-speech backends/voxtral backends/opus backends/trl backends/llama-cpp-quantization backends/kokoros backends/sam3-cpp backends/qwen3-tts-cpp backends/vibevoice-cpp backends/localvqe backends/tinygrad backends/sherpa-onnx backends/ds4 backends/ds4-darwin backends/liquid-audio +.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/turboquant backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/crispasr backends/parakeet-cpp backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/rfdetr-cpp backends/insightface backends/speaker-recognition backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/sglang backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/acestep-cpp backends/fish-speech backends/voxtral backends/opus backends/trl backends/llama-cpp-quantization backends/kokoros backends/sam3-cpp backends/qwen3-tts-cpp backends/omnivoice-cpp backends/vibevoice-cpp backends/localvqe backends/tinygrad backends/sherpa-onnx backends/ds4 backends/ds4-darwin backends/liquid-audio GOCMD=go GOTEST=$(GOCMD) test @@ -1176,6 +1176,7 @@ BACKEND_PARAKEET_CPP = parakeet-cpp|golang|.|false|true BACKEND_VOXTRAL = voxtral|golang|.|false|true BACKEND_ACESTEP_CPP = acestep-cpp|golang|.|false|true BACKEND_QWEN3_TTS_CPP = qwen3-tts-cpp|golang|.|false|true +BACKEND_OMNIVOICE_CPP = omnivoice-cpp|golang|.|false|true BACKEND_VIBEVOICE_CPP = 
vibevoice-cpp|golang|.|false|true BACKEND_LOCALVQE = localvqe|golang|.|false|true BACKEND_OPUS = opus|golang|.|false|true @@ -1294,6 +1295,7 @@ $(eval $(call generate-docker-build-target,$(BACKEND_WHISPERX))) $(eval $(call generate-docker-build-target,$(BACKEND_ACE_STEP))) $(eval $(call generate-docker-build-target,$(BACKEND_ACESTEP_CPP))) $(eval $(call generate-docker-build-target,$(BACKEND_QWEN3_TTS_CPP))) +$(eval $(call generate-docker-build-target,$(BACKEND_OMNIVOICE_CPP))) $(eval $(call generate-docker-build-target,$(BACKEND_VIBEVOICE_CPP))) $(eval $(call generate-docker-build-target,$(BACKEND_LOCALVQE))) $(eval $(call generate-docker-build-target,$(BACKEND_MLX))) @@ -1311,7 +1313,7 @@ $(eval $(call generate-docker-build-target,$(BACKEND_SHERPA_ONNX))) docker-save-%: backend-images docker save local-ai-backend:$* -o backend-images/$*.tar -docker-build-backends: docker-build-llama-cpp docker-build-ik-llama-cpp docker-build-turboquant docker-build-ds4 docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-sglang docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-crispasr docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-liquid-audio docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-fish-speech docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-acestep-cpp docker-build-voxtral docker-build-mlx-distributed docker-build-trl docker-build-llama-cpp-quantization docker-build-tinygrad docker-build-kokoros docker-build-sam3-cpp docker-build-rfdetr-cpp docker-build-qwen3-tts-cpp docker-build-vibevoice-cpp docker-build-localvqe docker-build-insightface docker-build-speaker-recognition docker-build-sherpa-onnx docker-build-cloud-proxy +docker-build-backends: docker-build-llama-cpp docker-build-ik-llama-cpp 
docker-build-turboquant docker-build-ds4 docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-sglang docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-crispasr docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-liquid-audio docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-fish-speech docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-acestep-cpp docker-build-voxtral docker-build-mlx-distributed docker-build-trl docker-build-llama-cpp-quantization docker-build-tinygrad docker-build-kokoros docker-build-sam3-cpp docker-build-rfdetr-cpp docker-build-qwen3-tts-cpp docker-build-omnivoice-cpp docker-build-vibevoice-cpp docker-build-localvqe docker-build-insightface docker-build-speaker-recognition docker-build-sherpa-onnx docker-build-cloud-proxy ######################################################## ### Mock Backend for E2E Tests diff --git a/backend/go/omnivoice-cpp/.gitignore b/backend/go/omnivoice-cpp/.gitignore new file mode 100644 index 000000000..250529067 --- /dev/null +++ b/backend/go/omnivoice-cpp/.gitignore @@ -0,0 +1,17 @@ +# Fetched upstream sources +sources/ + +# CMake build directories +build*/ + +# Compiled shared libraries +*.so + +# Compiled backend binary +omnivoice-cpp + +# Packaging output +package/ + +# Downloaded e2e models +omnivoice-models/ diff --git a/backend/go/omnivoice-cpp/CMakeLists.txt b/backend/go/omnivoice-cpp/CMakeLists.txt new file mode 100644 index 000000000..43fafb02a --- /dev/null +++ b/backend/go/omnivoice-cpp/CMakeLists.txt @@ -0,0 +1,53 @@ +cmake_minimum_required(VERSION 3.14) +project(gomnivoicecpp LANGUAGES C CXX) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +set(OMNIVOICE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/sources/omnivoice.cpp) + +# 
Override upstream's CMAKE_CUDA_ARCHITECTURES before add_subdirectory. +if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + set(CMAKE_CUDA_ARCHITECTURES "75-virtual;80-virtual;86-real;89-real") +endif() + +# Add the upstream project. Its own CMakeLists adds ggml + builds +# omnivoice-core (STATIC, contains src/omnivoice.cpp i.e. the ov_* impl). +# EXCLUDE_FROM_ALL keeps its CLI tools/tests from building unless referenced. +add_subdirectory(${OMNIVOICE_DIR} omnivoice EXCLUDE_FROM_ALL) + +# Upstream generates version.h into its own CMAKE_CURRENT_BINARY_DIR and adds +# the top-level ${CMAKE_BINARY_DIR} to omnivoice-core's include path. When the +# project is nested under add_subdirectory those two directories differ +# (<build>/omnivoice vs <build>), so omnivoice.cpp cannot find version.h. Point +# omnivoice-core at the subproject binary dir where version.h is actually +# generated. (Fix lives here, never in the fetched upstream checkout.) +target_include_directories(omnivoice-core PRIVATE ${CMAKE_BINARY_DIR}/omnivoice) + +add_library(gomnivoicecpp MODULE cpp/gomnivoicecpp.cpp) +target_link_libraries(gomnivoicecpp PRIVATE omnivoice-core) + +target_include_directories(gomnivoicecpp PRIVATE ${OMNIVOICE_DIR}/src) +target_include_directories(gomnivoicecpp SYSTEM PRIVATE ${OMNIVOICE_DIR}/ggml/include) + +# Link GPU backends if the upstream ggml created them. 
+foreach(backend blas cuda metal vulkan sycl) + if(TARGET ggml-${backend}) + target_link_libraries(gomnivoicecpp PRIVATE ggml-${backend}) + if(backend STREQUAL "cuda") + find_package(CUDAToolkit QUIET) + if(CUDAToolkit_FOUND) + target_link_libraries(gomnivoicecpp PRIVATE CUDA::cudart) + endif() + endif() + endif() +endforeach() + +if(MSVC) + target_compile_options(gomnivoicecpp PRIVATE /W4 /wd4100 /wd4505) +else() + target_compile_options(gomnivoicecpp PRIVATE -Wall -Wextra + -Wno-unused-parameter -Wno-unused-function) +endif() + +set_property(TARGET gomnivoicecpp PROPERTY CXX_STANDARD 17) +set_target_properties(gomnivoicecpp PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) diff --git a/backend/go/omnivoice-cpp/Makefile b/backend/go/omnivoice-cpp/Makefile new file mode 100644 index 000000000..7806ce11f --- /dev/null +++ b/backend/go/omnivoice-cpp/Makefile @@ -0,0 +1,122 @@ +CMAKE_ARGS?= +BUILD_TYPE?= +NATIVE?=false + +GOCMD?=go +GO_TAGS?= +JOBS?=$(shell nproc --ignore=1) + +# omnivoice.cpp version +OMNIVOICE_REPO?=https://github.com/ServeurpersoCom/omnivoice.cpp +OMNIVOICE_VERSION?=2603355a5dfacae5cfc33531d5d0933221843509 +SO_TARGET?=libgomnivoicecpp.so + +CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF + +ifeq ($(NATIVE),false) + CMAKE_ARGS+=-DGGML_NATIVE=OFF +endif + +ifeq ($(BUILD_TYPE),cublas) + CMAKE_ARGS+=-DGGML_CUDA=ON +else ifeq ($(BUILD_TYPE),openblas) + CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS +else ifeq ($(BUILD_TYPE),clblas) + CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path +else ifeq ($(BUILD_TYPE),hipblas) + CMAKE_ARGS+=-DGGML_HIPBLAS=ON +else ifeq ($(BUILD_TYPE),vulkan) + CMAKE_ARGS+=-DGGML_VULKAN=ON +else ifeq ($(OS),Darwin) + ifneq ($(BUILD_TYPE),metal) + CMAKE_ARGS+=-DGGML_METAL=OFF + else + CMAKE_ARGS+=-DGGML_METAL=ON + CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON + endif +endif + +ifeq ($(BUILD_TYPE),sycl_f16) + CMAKE_ARGS+=-DGGML_SYCL=ON \ + -DCMAKE_C_COMPILER=icx \ + -DCMAKE_CXX_COMPILER=icpx \ + -DGGML_SYCL_F16=ON +endif + +ifeq 
($(BUILD_TYPE),sycl_f32) + CMAKE_ARGS+=-DGGML_SYCL=ON \ + -DCMAKE_C_COMPILER=icx \ + -DCMAKE_CXX_COMPILER=icpx +endif + +sources/omnivoice.cpp: + mkdir -p sources/omnivoice.cpp + cd sources/omnivoice.cpp && \ + git init && \ + git remote add origin $(OMNIVOICE_REPO) && \ + git fetch origin && \ + git checkout $(OMNIVOICE_VERSION) && \ + git submodule update --init --recursive --depth 1 --single-branch + +# Detect OS +UNAME_S := $(shell uname -s) + +# Only build CPU variants on Linux +ifeq ($(UNAME_S),Linux) + VARIANT_TARGETS = libgomnivoicecpp-avx.so libgomnivoicecpp-avx2.so libgomnivoicecpp-avx512.so libgomnivoicecpp-fallback.so +else + VARIANT_TARGETS = libgomnivoicecpp-fallback.so +endif + +omnivoice-cpp: main.go gomnivoicecpp.go $(VARIANT_TARGETS) + CGO_ENABLED=0 $(GOCMD) build -tags "$(GO_TAGS)" -o omnivoice-cpp ./ + +package: omnivoice-cpp + bash package.sh + +build: package + +clean: purge + rm -rf libgomnivoicecpp*.so package sources/omnivoice.cpp omnivoice-cpp + +purge: + rm -rf build* + +.NOTPARALLEL: + +ifeq ($(UNAME_S),Linux) +libgomnivoicecpp-avx.so: sources/omnivoice.cpp + $(info ${GREEN}I omnivoice-cpp build info:avx${RESET}) + SO_TARGET=libgomnivoicecpp-avx.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) libgomnivoicecpp-custom + rm -rf build-libgomnivoicecpp-avx.so + +libgomnivoicecpp-avx2.so: sources/omnivoice.cpp + $(info ${GREEN}I omnivoice-cpp build info:avx2${RESET}) + SO_TARGET=libgomnivoicecpp-avx2.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on -DGGML_BMI2=on" $(MAKE) libgomnivoicecpp-custom + rm -rf build-libgomnivoicecpp-avx2.so + +libgomnivoicecpp-avx512.so: sources/omnivoice.cpp + $(info ${GREEN}I omnivoice-cpp build info:avx512${RESET}) + SO_TARGET=libgomnivoicecpp-avx512.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on -DGGML_BMI2=on" 
$(MAKE) libgomnivoicecpp-custom
+	rm -rf build-libgomnivoicecpp-avx512.so
+endif
+
+libgomnivoicecpp-fallback.so: sources/omnivoice.cpp
+	$(info ${GREEN}I omnivoice-cpp build info:fallback${RESET})
+	SO_TARGET=libgomnivoicecpp-fallback.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) libgomnivoicecpp-custom
+	rm -rf build-libgomnivoicecpp-fallback.so
+
+libgomnivoicecpp-custom: CMakeLists.txt cpp/gomnivoicecpp.cpp cpp/gomnivoicecpp.h
+	mkdir -p build-$(SO_TARGET) && \
+	cd build-$(SO_TARGET) && \
+	cmake .. $(CMAKE_ARGS) && \
+	cmake --build . --config Release -j$(JOBS) --target gomnivoicecpp && \
+	cd .. && \
+	mv build-$(SO_TARGET)/libgomnivoicecpp.so ./$(SO_TARGET)
+
+test: omnivoice-cpp
+	@echo "Running omnivoice-cpp tests..."
+	bash test.sh
+	@echo "omnivoice-cpp tests completed."
+
+all: omnivoice-cpp package
diff --git a/backend/go/omnivoice-cpp/audio.go b/backend/go/omnivoice-cpp/audio.go
new file mode 100644
index 000000000..97ba8b9c4
--- /dev/null
+++ b/backend/go/omnivoice-cpp/audio.go
@@ -0,0 +1,146 @@
+package main
+
+import (
+	"bytes"
+	"encoding/binary"
+	"fmt"
+	"os"
+	"runtime"
+
+	"github.com/go-audio/audio"
+	"github.com/go-audio/wav"
+)
+
+const omnivoiceSampleRate = 24000
+
+// wavHeader24k returns a 44-byte WAV header for a streaming 24 kHz mono 16-bit
+// PCM stream, with placeholder (0xFFFFFFFF) sizes since the total length is
+// unknown up front. Emitted as the first chunk of TTSStream so the HTTP layer
+// receives a self-describing WAV (the gRPC TTSStream path never sets Message,
+// so the backend owns the header - see core/backend/tts.go:ModelTTSStream).
+func wavHeader24k() []byte {
+	var buf bytes.Buffer
+	// Fixed-size integers written to a bytes.Buffer cannot fail to encode, so
+	// the binary.Write error is deliberately dropped.
+	w := func(v any) { _ = binary.Write(&buf, binary.LittleEndian, v) }
+	buf.WriteString("RIFF")
+	w(uint32(0xFFFFFFFF)) // RIFF chunk size placeholder
+	buf.WriteString("WAVE")
+	buf.WriteString("fmt ")
+	w(uint32(16))                      // Subchunk1Size
+	w(uint16(1))                       // PCM
+	w(uint16(1))                       // mono
+	w(uint32(omnivoiceSampleRate))     // sample rate
+	w(uint32(omnivoiceSampleRate * 2)) // byte rate = SR * blockAlign
+	w(uint16(2))                       // block align (16-bit mono)
+	w(uint16(16))                      // bits per sample
+	buf.WriteString("data")
+	w(uint32(0xFFFFFFFF)) // data chunk size placeholder
+	return buf.Bytes()
+}
+
+// pcm16FromFloat converts one float sample to a signed 16-bit PCM value.
+// Input is clamped to [-1,1]; NaN maps to 0 (silence) because converting a NaN
+// float to an integer is implementation-dependent in Go.
+func pcm16FromFloat(s float32) int16 {
+	switch {
+	case s != s: // true only for NaN
+		return 0
+	case s > 1:
+		s = 1
+	case s < -1:
+		s = -1
+	}
+	return int16(s * 32767)
+}
+
+// floatToPCM16LE clamps each sample to [-1,1] and encodes it as little-endian
+// signed 16-bit PCM (two bytes per sample). NaN samples become silence.
+func floatToPCM16LE(samples []float32) []byte {
+	out := make([]byte, len(samples)*2)
+	for i, s := range samples {
+		binary.LittleEndian.PutUint16(out[i*2:], uint16(pcm16FromFloat(s)))
+	}
+	return out
+}
+
+// writeWAV24k writes samples as a finalized 24 kHz mono 16-bit WAV at dst.
+// Samples are clamped to [-1,1] (NaN becomes silence). Create, encode and
+// finalize failures are returned wrapped; the file is closed on every path.
+func writeWAV24k(dst string, samples []float32) error {
+	f, err := os.Create(dst)
+	if err != nil {
+		return fmt.Errorf("omnivoice: create %q: %w", dst, err)
+	}
+	enc := wav.NewEncoder(f, omnivoiceSampleRate, 16, 1, 1)
+	ints := make([]int, len(samples))
+	for i, s := range samples {
+		ints[i] = int(pcm16FromFloat(s))
+	}
+	b := &audio.IntBuffer{
+		Format:         &audio.Format{NumChannels: 1, SampleRate: omnivoiceSampleRate},
+		Data:           ints,
+		SourceBitDepth: 16,
+	}
+	if err := enc.Write(b); err != nil {
+		_ = enc.Close()
+		_ = f.Close()
+		return fmt.Errorf("omnivoice: encode WAV: %w", err)
+	}
+	if err := enc.Close(); err != nil {
+		_ = f.Close()
+		return fmt.Errorf("omnivoice: finalize WAV: %w", err)
+	}
+	return f.Close()
+}
+
+// readWAVAsFloat decodes the WAV file at path to a mono float32 slice for use
+// as reference audio. Channels are downmixed by averaging and integer samples
+// are divided by 2^(bitDepth-1). No resampling is performed: OmniVoice expects
+// 24 kHz, so a warning is logged when the file uses another sample rate.
+func readWAVAsFloat(path string) ([]float32, error) {
+	f, err := os.Open(path)
+	if err != nil {
+		return nil, fmt.Errorf("omnivoice: open ref %q: %w", path, err)
+	}
+	defer func() { _ = f.Close() }()
+
+	dec := wav.NewDecoder(f)
+	buf, err := dec.FullPCMBuffer()
+	if err != nil {
+		return nil, fmt.Errorf("omnivoice: decode ref %q: %w", path, err)
+	}
+	// Defensive: never dereference a missing buffer/format, even if the decoder
+	// reported no error.
+	if buf == nil || buf.Format == nil {
+		return nil, fmt.Errorf("omnivoice: decode ref %q: no PCM data", path)
+	}
+	if sr := int(buf.Format.SampleRate); sr != omnivoiceSampleRate {
+		fmt.Fprintf(os.Stderr, "[omnivoice-cpp] WARNING: reference %q is %d Hz, expected %d Hz\n",
+			path, sr, omnivoiceSampleRate)
+	}
+	ch := int(buf.Format.NumChannels)
+	if ch < 1 {
+		ch = 1
+	}
+	bitDepth := int(buf.SourceBitDepth)
+	if bitDepth <= 0 {
+		bitDepth = 16 // assume 16-bit when the decoder reports no usable depth
+	}
+	scale := float32(int64(1) << uint(bitDepth-1))
+	n := len(buf.Data) / ch
+	out := make([]float32, n)
+	for i := 0; i < n; i++ {
+		// Downmix to mono by averaging channels.
+		var acc int
+		for c := 0; c < ch; c++ {
+			acc += buf.Data[i*ch+c]
+		}
+		out[i] = float32(acc) / float32(ch) / scale
+	}
+	return out, nil
+}
+
+// runtimeKeepAlive prevents the GC from reclaiming the reference-audio slice
+// while its backing pointer is in use across the C call.
+func runtimeKeepAlive(v any) { runtime.KeepAlive(v) }
diff --git a/backend/go/omnivoice-cpp/cpp/gomnivoicecpp.cpp b/backend/go/omnivoice-cpp/cpp/gomnivoicecpp.cpp
new file mode 100644
index 000000000..3bca0c211
--- /dev/null
+++ b/backend/go/omnivoice-cpp/cpp/gomnivoicecpp.cpp
@@ -0,0 +1,206 @@
+#include "gomnivoicecpp.h"
+#include "ggml-backend.h"
+#include "omnivoice.h"
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+// Process-wide OmniVoice context: set by omni_load, cleared by omni_unload.
+static ov_context *g_ctx = nullptr;
+
+// Forward ggml log lines to stderr, prefixed with a fixed-width level tag.
+static void ggml_log_cb(enum ggml_log_level level, const char *log,
+                        void * /*data*/) {
+  if (!log)
+    return;
+  const char *lvl = "?????";
+  switch (level) {
+  case GGML_LOG_LEVEL_DEBUG: lvl = "DEBUG"; break;
+  case GGML_LOG_LEVEL_INFO: lvl = "INFO"; break;
+  case GGML_LOG_LEVEL_WARN: lvl = "WARN"; break;
+  case GGML_LOG_LEVEL_ERROR: lvl = "ERROR"; break;
+  default: break;
+  }
+  fprintf(stderr, "[%-5s] %s", lvl, log);
+  fflush(stderr);
+}
+
+// Load the LM (model_path) and codec (codec_path) into the global context.
+// Returns 0 on success, 1 when model_path is missing, 2 when codec_path is
+// missing and 3 when ov_init fails. A context left over from an earlier
+// successful load is released first so that reloading does not leak it.
+int omni_load(const char *model_path, const char *codec_path, int use_fa,
+              int clamp_fp16) {
+  ggml_log_set(ggml_log_cb, nullptr);
+  ggml_backend_load_all();
+
+  if (!model_path || model_path[0] == '\0') {
+    fprintf(stderr, "[omnivoice-cpp] ERROR: model_path is required\n");
+    return 1;
+  }
+  if (!codec_path || codec_path[0] == '\0') {
+    fprintf(stderr, "[omnivoice-cpp] ERROR: codec_path is required\n");
+    return 2;
+  }
+
+  // Drop the context from a previous load before replacing the pointer.
+  if (g_ctx) {
+    ov_free(g_ctx);
+    g_ctx = nullptr;
+  }
+
+  ov_init_params p;
+  ov_init_default_params(&p);
+  p.model_path = model_path;
+  p.codec_path = codec_path;
+  p.use_fa = use_fa != 0;
+  p.clamp_fp16 = clamp_fp16 != 0;
+
+  fprintf(stderr, "[omnivoice-cpp] Loading model=%s codec=%s\n", model_path,
+          codec_path);
+
+  g_ctx = ov_init(&p);
+  if (!g_ctx) {
+    fprintf(stderr, "[omnivoice-cpp] FATAL: ov_init failed: %s\n",
+            ov_last_error());
+    return 3;
+  }
+  fprintf(stderr, "[omnivoice-cpp] Model loaded (%s)\n", ov_version());
+  return 0;
+}
+
+// Fill an ov_tts_params from the flat wrapper arguments, starting from the
+// library defaults. Reference audio (plus ref_text and denoise) is applied
+// only when ref_samples is non-null and ref_n > 0; a negative seed leaves the
+// default mg_seed untouched.
+static void fill_params(ov_tts_params *tp, const char *text, const char *lang,
+                        const char *instruct, const float *ref_samples,
+                        int ref_n, const char *ref_text, long long seed,
+                        int denoise) {
+  ov_tts_default_params(tp);
+  tp->text = text ? text : "";
+  tp->lang = lang ? lang : "";
+  if (instruct && instruct[0] != '\0')
+    tp->instruct = instruct;
+  if (ref_samples && ref_n > 0) {
+    tp->ref_audio_24k = ref_samples;
+    tp->ref_n_samples = ref_n;
+    if (ref_text && ref_text[0] != '\0')
+      tp->ref_text = ref_text;
+    tp->denoise = denoise != 0;
+  }
+  if (seed >= 0)
+    tp->mg_seed = (uint64_t)seed;
+}
+
+// One-shot synthesis. Returns a malloc'd float PCM buffer that the caller
+// releases with omni_pcm_free and stores the sample count in *out_n; returns
+// nullptr (with *out_n == 0) when no model is loaded, text is empty,
+// synthesis fails or the copy buffer cannot be allocated.
+float *omni_tts(const char *text, const char *lang, const char *instruct,
+                const float *ref_samples, int ref_n, const char *ref_text,
+                long long seed, int denoise, int *out_n) {
+  if (out_n)
+    *out_n = 0;
+  if (!g_ctx) {
+    fprintf(stderr, "[omnivoice-cpp] ERROR: model not loaded\n");
+    return nullptr;
+  }
+  if (!text || text[0] == '\0') {
+    fprintf(stderr, "[omnivoice-cpp] ERROR: text is required\n");
+    return nullptr; // omni_tts: out_n already 0
+  }
+  ov_tts_params tp;
+  fill_params(&tp, text, lang, instruct, ref_samples, ref_n, ref_text, seed,
+              denoise);
+
+  ov_audio out = {0};
+  enum ov_status rc = ov_synthesize(g_ctx, &tp, &out);
+  if (rc != OV_STATUS_OK || out.n_samples <= 0 || !out.samples) {
+    fprintf(stderr, "[omnivoice-cpp] ERROR: synthesize failed (rc=%d): %s\n",
+            (int)rc, ov_last_error());
+    ov_audio_free(&out);
+    return nullptr;
+  }
+
+  // Copy into a plain malloc buffer the Go side can free symmetrically via
+  // omni_pcm_free; then release the ov_audio-owned buffer.
+  size_t bytes = (size_t)out.n_samples * sizeof(float);
+  float *buf = (float *)malloc(bytes);
+  if (!buf) {
+    fprintf(stderr, "[omnivoice-cpp] ERROR: malloc(%zu) failed\n", bytes);
+    ov_audio_free(&out);
+    return nullptr;
+  }
+  memcpy(buf, out.samples, bytes);
+  if (out_n)
+    *out_n = out.n_samples;
+  ov_audio_free(&out);
+  return buf;
+}
+
+// State handed to OmniVoice as on_chunk_user_data while streaming.
+struct stream_bridge {
+  omni_pcm_chunk_cb cb;
+  void *user_data;
+};
+
+// Chunk callback with OmniVoice's bool-returning signature. It forwards to
+// the int-returning wrapper callback so that no function pointer is ever
+// called through an incompatible type; a non-zero result means "continue".
+static bool stream_chunk_trampoline(const float *samples, int n_samples,
+                                    void *user_data) {
+  const stream_bridge *bridge = (const stream_bridge *)user_data;
+  return bridge->cb(samples, n_samples, bridge->user_data) != 0;
+}
+
+// Streaming synthesis: cb receives each PCM chunk as it is produced.
+// Returns 0 on success or cancellation, 1 when no model is loaded, 2 when cb
+// is null, 4 when text is empty and 3 when synthesis fails.
+int omni_tts_stream(const char *text, const char *lang, const char *instruct,
+                    const float *ref_samples, int ref_n, const char *ref_text,
+                    long long seed, int denoise, omni_pcm_chunk_cb cb,
+                    void *user_data) {
+  if (!g_ctx) {
+    fprintf(stderr, "[omnivoice-cpp] ERROR: model not loaded\n");
+    return 1;
+  }
+  if (!cb) {
+    fprintf(stderr, "[omnivoice-cpp] ERROR: stream callback is null\n");
+    return 2;
+  }
+  if (!text || text[0] == '\0') {
+    fprintf(stderr, "[omnivoice-cpp] ERROR: text is required\n");
+    return 4;
+  }
+  ov_tts_params tp;
+  fill_params(&tp, text, lang, instruct, ref_samples, ref_n, ref_text, seed,
+              denoise);
+  // bridge lives on this stack frame; this assumes ov_synthesize invokes
+  // on_chunk only before it returns.
+  stream_bridge bridge = {cb, user_data};
+  tp.on_chunk = stream_chunk_trampoline;
+  tp.on_chunk_user_data = &bridge;
+
+  ov_audio out = {0}; // stays empty in streaming mode
+  enum ov_status rc = ov_synthesize(g_ctx, &tp, &out);
+  ov_audio_free(&out);
+  if (rc != OV_STATUS_OK && rc != OV_STATUS_CANCELLED) {
+    fprintf(stderr, "[omnivoice-cpp] ERROR: stream synth failed (rc=%d): %s\n",
+            (int)rc, ov_last_error());
+    return 3;
+  }
+  return 0;
+}
+
+// Free a buffer returned by omni_tts (free(nullptr) is a no-op).
+void omni_pcm_free(float *p) { free(p); }
+
+// Release the global context, if any; safe to call repeatedly.
+void omni_unload(void) {
+  if (g_ctx) {
+    ov_free(g_ctx);
+    g_ctx = nullptr;
+  }
+}
diff --git a/backend/go/omnivoice-cpp/cpp/gomnivoicecpp.h b/backend/go/omnivoice-cpp/cpp/gomnivoicecpp.h
new file mode 100644
index 000000000..4d885a7c6
--- /dev/null
+++ b/backend/go/omnivoice-cpp/cpp/gomnivoicecpp.h
@@ -0,0 +1,38 @@
+#pragma once
+
+#include <cstdint>
+
+extern "C" {
+
+// Streaming PCM chunk callback.
samples is mono float PCM at 24 kHz, valid +// only for the duration of the call. Return non-zero to continue, 0 to abort. +typedef int (*omni_pcm_chunk_cb)(const float *samples, int n_samples, + void *user_data); + +// Load the LM (model_path) + codec (codec_path) GGUFs. use_fa / clamp_fp16 +// map to ov_init_params. Returns 0 on success, non-zero on failure. +int omni_load(const char *model_path, const char *codec_path, int use_fa, + int clamp_fp16); + +// Synthesize to a malloc'd float PCM buffer (caller frees via omni_pcm_free). +// ref_samples != null && ref_n > 0 => voice cloning (ref_text optional). +// instruct != null && non-empty => voice design. seed < 0 keeps the default +// MaskGIT seed. denoise toggles the <|denoise|> marker (only with a reference). +// Writes the sample count to *out_n. Returns NULL on failure (out_n set to 0). +float *omni_tts(const char *text, const char *lang, const char *instruct, + const float *ref_samples, int ref_n, const char *ref_text, + long long seed, int denoise, int *out_n); + +// Streaming synthesis: cb is invoked per PCM chunk as audio is produced. +// Same reference/design/seed semantics as omni_tts. Returns 0 on success. +int omni_tts_stream(const char *text, const char *lang, const char *instruct, + const float *ref_samples, int ref_n, const char *ref_text, + long long seed, int denoise, omni_pcm_chunk_cb cb, + void *user_data); + +// Free a buffer returned by omni_tts. +void omni_pcm_free(float *p); + +// Release the OmniVoice context. +void omni_unload(void); +} diff --git a/backend/go/omnivoice-cpp/e2e_test.go b/backend/go/omnivoice-cpp/e2e_test.go new file mode 100644 index 000000000..5dcd6a772 --- /dev/null +++ b/backend/go/omnivoice-cpp/e2e_test.go @@ -0,0 +1,74 @@ +package main + +import ( + "os" + "strings" + + "github.com/ebitengine/purego" + pb "github.com/mudler/LocalAI/pkg/grpc/proto" + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" +) + +func ttsReq(text, voice string, lang *string, dst string) *pb.TTSRequest { + return &pb.TTSRequest{Text: text, Voice: voice, Language: lang, Dst: dst} +} + +var _ = Describe("OmniVoice e2e", Label("e2e"), func() { + var loaded bool + + BeforeEach(func() { + modelPath := os.Getenv("OMNIVOICE_MODEL") + codecPath := os.Getenv("OMNIVOICE_CODEC") + if modelPath == "" || codecPath == "" { + Skip("OMNIVOICE_MODEL / OMNIVOICE_CODEC not set; skipping e2e") + } + if !loaded { + lib := os.Getenv("OMNIVOICE_LIBRARY") + if lib == "" { + lib = "./libgomnivoicecpp-fallback.so" + } + h, err := purego.Dlopen(lib, purego.RTLD_NOW|purego.RTLD_GLOBAL) + Expect(err).ToNot(HaveOccurred()) + purego.RegisterLibFunc(&CppLoad, h, "omni_load") + purego.RegisterLibFunc(&CppTTS, h, "omni_tts") + purego.RegisterLibFunc(&CppTTSStream, h, "omni_tts_stream") + purego.RegisterLibFunc(&CppPCMFree, h, "omni_pcm_free") + purego.RegisterLibFunc(&CppUnload, h, "omni_unload") + Expect(CppLoad(modelPath, codecPath, 0, 0)).To(Equal(0)) + loaded = true + } + }) + + It("synthesizes a WAV file via TTS", func() { + b := &OmnivoiceCpp{opts: loadOptions{seed: 42, denoise: true}} + dst := GinkgoT().TempDir() + "/out.wav" + lang := "en" + err := b.TTS(ttsReq("Hello world.", "", &lang, dst)) + Expect(err).ToNot(HaveOccurred()) + fi, err := os.Stat(dst) + Expect(err).ToNot(HaveOccurred()) + Expect(fi.Size()).To(BeNumerically(">", int64(44))) + }) + + It("streams audio chunks via TTSStream", func() { + b := &OmnivoiceCpp{opts: loadOptions{seed: 42, denoise: true}} + results := make(chan []byte, 1024) + lang := "en" + done := make(chan error, 1) + go func() { done <- b.TTSStream(ttsReq("Hello there, streaming test.", "", &lang, ""), results) }() + + var chunks int + var first []byte + for c := range results { + if chunks == 0 { + first = c + } + chunks++ + } + Expect(<-done).ToNot(HaveOccurred()) + Expect(chunks).To(BeNumerically(">=", 2)) + Expect(string(first[0:4])).To(Equal("RIFF")) + 
Expect(strings.HasPrefix(string(first[8:12]), "WAVE")).To(BeTrue()) + }) +}) diff --git a/backend/go/omnivoice-cpp/gomnivoicecpp.go b/backend/go/omnivoice-cpp/gomnivoicecpp.go new file mode 100644 index 000000000..47bcd6a24 --- /dev/null +++ b/backend/go/omnivoice-cpp/gomnivoicecpp.go @@ -0,0 +1,246 @@ +package main + +import ( + "fmt" + "os" + "path/filepath" + "strings" + "sync" + "unsafe" + + "github.com/ebitengine/purego" + "github.com/mudler/LocalAI/pkg/grpc/base" + pb "github.com/mudler/LocalAI/pkg/grpc/proto" +) + +var ( + // omni_load(model_path, codec_path, use_fa, clamp_fp16) int + CppLoad func(modelPath, codecPath string, useFA, clampFP16 int) int + // omni_tts(text, lang, instruct, ref_samples, ref_n, ref_text, seed, denoise, out_n) -> float* (uintptr) + CppTTS func(text, lang, instruct string, refSamples unsafe.Pointer, refN int, + refText string, seed int64, denoise int, outN unsafe.Pointer) uintptr + // omni_tts_stream(text, lang, instruct, ref_samples, ref_n, ref_text, seed, denoise, cb, user) int + CppTTSStream func(text, lang, instruct string, refSamples unsafe.Pointer, refN int, + refText string, seed int64, denoise int, cb uintptr, user uintptr) int + CppPCMFree func(ptr uintptr) + CppUnload func() +) + +type OmnivoiceCpp struct { + base.SingleThread + opts loadOptions + // audioPath is the model-config reference voice (tts.audio_path), used as + // the default voice-cloning reference when a request does not set Voice. + audioPath string +} + +func (o *OmnivoiceCpp) Load(opts *pb.ModelOptions) error { + model := opts.ModelFile + if model == "" { + model = opts.ModelPath + } + if !filepath.IsAbs(model) && opts.ModelPath != "" { + model = filepath.Join(opts.ModelPath, model) + } + + o.opts = parseOptions(opts.Options) + + // Resolve the codec/tokenizer GGUF: explicit option, else auto-discover a + // *tokenizer*.gguf sibling of the base model. 
+ codec := o.opts.codecPath + if codec != "" && !filepath.IsAbs(codec) { + codec = filepath.Join(filepath.Dir(model), codec) + } + if codec == "" { + codec = discoverTokenizer(filepath.Dir(model)) + } + if codec == "" { + return fmt.Errorf("omnivoice: no codec/tokenizer GGUF found; set option 'tokenizer:'") + } + o.opts.codecPath = codec + + // tts.audio_path (ModelOptions.AudioPath) is the config-level voice-cloning + // reference: a default reference WAV used when a request omits Voice. + // Resolved relative to the model directory like the codec. + o.audioPath = opts.AudioPath + if o.audioPath != "" && !filepath.IsAbs(o.audioPath) { + o.audioPath = filepath.Join(filepath.Dir(model), o.audioPath) + } + + useFA := boolToInt(o.opts.useFA) + clamp := boolToInt(o.opts.clampFP16) + + fmt.Fprintf(os.Stderr, "[omnivoice-cpp] Load model=%s codec=%s use_fa=%d clamp_fp16=%d\n", + model, codec, useFA, clamp) + + if rc := CppLoad(model, codec, useFA, clamp); rc != 0 { + return fmt.Errorf("omnivoice: failed to load model (rc=%d)", rc) + } + return nil +} + +// discoverTokenizer returns the first *tokenizer*.gguf in dir, or "". +func discoverTokenizer(dir string) string { + entries, err := os.ReadDir(dir) + if err != nil { + return "" + } + for _, e := range entries { + name := strings.ToLower(e.Name()) + if strings.Contains(name, "tokenizer") && strings.HasSuffix(name, ".gguf") { + return filepath.Join(dir, e.Name()) + } + } + return "" +} + +func boolToInt(b bool) int { + if b { + return 1 + } + return 0 +} + +// refAudio loads the reference WAV (voice cloning) if voice points to a file. +// Returns nil if no cloning (empty or non-path - voice design uses Instructions). 
+func (o *OmnivoiceCpp) refAudio(voice string) ([]float32, error) { + v := strings.TrimSpace(voice) + if v == "" { + return nil, nil + } + if _, err := os.Stat(v); err != nil { + return nil, nil + } + return readWAVAsFloat(v) +} + +// refAudioFor resolves the cloning reference for a request: the per-request +// Voice takes precedence, falling back to the model-config audio_path. Empty +// result means no cloning (voice design via Instructions still applies). +func (o *OmnivoiceCpp) refAudioFor(req *pb.TTSRequest) ([]float32, error) { + voice := strings.TrimSpace(req.Voice) + if voice == "" { + voice = o.audioPath + } + return o.refAudio(voice) +} + +func reqParam(req *pb.TTSRequest, key string) string { + if req.Params == nil { + return "" + } + return req.Params[key] +} + +func (o *OmnivoiceCpp) seedFor(req *pb.TTSRequest) int64 { + if s := reqParam(req, "seed"); s != "" { + var n int64 + if _, err := fmt.Sscan(s, &n); err == nil { + return n + } + } + return o.opts.seed +} + +func optStr(p *string) string { + if p == nil { + return "" + } + return *p +} + +func (o *OmnivoiceCpp) TTS(req *pb.TTSRequest) error { + if req.Dst == "" { + return fmt.Errorf("omnivoice: TTS requires a destination path") + } + lang := normalizeLanguage(optStr(req.Language)) + instruct := optStr(req.Instructions) + refText := reqParam(req, "ref_text") + seed := o.seedFor(req) + + ref, err := o.refAudioFor(req) + if err != nil { + return err + } + var refPtr unsafe.Pointer + if len(ref) > 0 { + refPtr = unsafe.Pointer(&ref[0]) + } + + var n int32 + ptr := CppTTS(req.Text, lang, instruct, refPtr, len(ref), refText, seed, + boolToInt(o.opts.denoise), unsafe.Pointer(&n)) + runtimeKeepAlive(ref) + if ptr == 0 || n <= 0 { + return fmt.Errorf("omnivoice: synthesis failed") + } + defer CppPCMFree(ptr) + src := unsafe.Slice((*float32)(unsafe.Pointer(ptr)), int(n)) //nolint:govet // C-allocated PCM, copied out before free + out := make([]float32, int(n)) + copy(out, src) + return 
writeWAV24k(req.Dst, out) +} + +// streamState carries the active TTSStream channel to the single shared C +// callback. base.SingleThread serializes TTS/TTSStream, so one global slot is +// safe and avoids leaking a purego callback per request (purego callbacks +// cannot be freed and are capped). +var ( + streamMu sync.Mutex + streamChan chan []byte + streamCbOnce sync.Once + streamCbPtr uintptr +) + +// streamCallback is registered once and forwards each PCM chunk to streamChan. +func streamCallback(samples *float32, nSamples int32, _ uintptr) uintptr { + if nSamples <= 0 || samples == nil || streamChan == nil { + return 1 // continue + } + src := unsafe.Slice(samples, int(nSamples)) + cp := make([]float32, int(nSamples)) // copy out of C memory before returning + copy(cp, src) + streamChan <- floatToPCM16LE(cp) + return 1 // continue +} + +func (o *OmnivoiceCpp) TTSStream(req *pb.TTSRequest, results chan []byte) error { + defer close(results) + if req.Text == "" { + return fmt.Errorf("omnivoice: TTSStream requires text") + } + + streamCbOnce.Do(func() { + streamCbPtr = purego.NewCallback(streamCallback) + }) + + lang := normalizeLanguage(optStr(req.Language)) + instruct := optStr(req.Instructions) + refText := reqParam(req, "ref_text") + seed := o.seedFor(req) + + ref, err := o.refAudioFor(req) + if err != nil { + return err + } + var refPtr unsafe.Pointer + if len(ref) > 0 { + refPtr = unsafe.Pointer(&ref[0]) + } + + // Emit the WAV header first so the HTTP layer gets a self-describing stream. 
+ results <- wavHeader24k() + + streamMu.Lock() + streamChan = results + rc := CppTTSStream(req.Text, lang, instruct, refPtr, len(ref), refText, seed, + boolToInt(o.opts.denoise), streamCbPtr, 0) + streamChan = nil + streamMu.Unlock() + runtimeKeepAlive(ref) + + if rc != 0 { + return fmt.Errorf("omnivoice: streaming synthesis failed (rc=%d)", rc) + } + return nil +} diff --git a/backend/go/omnivoice-cpp/gomnivoicecpp_test.go b/backend/go/omnivoice-cpp/gomnivoicecpp_test.go new file mode 100644 index 000000000..453ba87c2 --- /dev/null +++ b/backend/go/omnivoice-cpp/gomnivoicecpp_test.go @@ -0,0 +1,90 @@ +package main + +import ( + "bytes" + "encoding/binary" + "testing" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +func TestOmnivoiceCpp(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "omnivoice-cpp suite") +} + +var _ = Describe("normalizeLanguage", func() { + DescribeTable("maps caller language to OmniVoice codes", + func(in, want string) { + Expect(normalizeLanguage(in)).To(Equal(want)) + }, + Entry("empty stays empty", "", ""), + Entry("english full name", "English", "en"), + Entry("chinese full name", "Chinese", "zh"), + Entry("locale suffix stripped", "en-US", "en"), + Entry("underscore locale", "zh_CN", "zh"), + Entry("already a code", "en", "en"), + Entry("unknown passes through normalized", "xx", "xx"), + ) +}) + +var _ = Describe("parseOptions", func() { + It("extracts codec, use_fa, clamp_fp16, seed, denoise", func() { + o := parseOptions([]string{ + "tokenizer:tok.gguf", + "use_fa:true", + "clamp_fp16:true", + "seed:7", + "denoise:false", + "unknown:ignored", + }) + Expect(o.codecPath).To(Equal("tok.gguf")) + Expect(o.useFA).To(BeTrue()) + Expect(o.clampFP16).To(BeTrue()) + Expect(o.seed).To(Equal(int64(7))) + Expect(o.denoise).To(BeFalse()) + }) + + It("accepts codec: as an alias for tokenizer:", func() { + o := parseOptions([]string{"codec:c.gguf"}) + Expect(o.codecPath).To(Equal("c.gguf")) + }) + + It("defaults seed to 
-1 and denoise to true", func() { + o := parseOptions(nil) + Expect(o.seed).To(Equal(int64(-1))) + Expect(o.denoise).To(BeTrue()) + }) +}) + +var _ = Describe("wavHeader24k", func() { + It("emits a 44-byte streaming WAV header at 24 kHz mono 16-bit", func() { + h := wavHeader24k() + Expect(h).To(HaveLen(44)) + Expect(string(h[0:4])).To(Equal("RIFF")) + Expect(string(h[8:12])).To(Equal("WAVE")) + Expect(string(h[12:16])).To(Equal("fmt ")) + Expect(string(h[36:40])).To(Equal("data")) + var sampleRate uint32 + Expect(binary.Read(bytes.NewReader(h[24:28]), binary.LittleEndian, &sampleRate)).To(Succeed()) + Expect(sampleRate).To(Equal(uint32(24000))) + }) +}) + +var _ = Describe("floatToPCM16LE", func() { + It("clamps and converts float PCM to little-endian int16 bytes", func() { + b := floatToPCM16LE([]float32{0, 1.0, -1.0, 2.0, -2.0}) + Expect(b).To(HaveLen(10)) // 5 samples * 2 bytes + read := func(off int) int16 { + var v int16 + _ = binary.Read(bytes.NewReader(b[off:off+2]), binary.LittleEndian, &v) + return v + } + Expect(read(0)).To(Equal(int16(0))) + Expect(read(2)).To(Equal(int16(32767))) + Expect(read(4)).To(Equal(int16(-32767))) + Expect(read(6)).To(Equal(int16(32767))) // clamped from 2.0 + Expect(read(8)).To(Equal(int16(-32767))) // clamped from -2.0 + }) +}) diff --git a/backend/go/omnivoice-cpp/main.go b/backend/go/omnivoice-cpp/main.go new file mode 100644 index 000000000..891201f49 --- /dev/null +++ b/backend/go/omnivoice-cpp/main.go @@ -0,0 +1,48 @@ +package main + +// Note: this is started internally by LocalAI and a server is allocated for each model +import ( + "flag" + "os" + + "github.com/ebitengine/purego" + grpc "github.com/mudler/LocalAI/pkg/grpc" +) + +var ( + addr = flag.String("addr", "localhost:50051", "the address to connect to") +) + +type LibFuncs struct { + FuncPtr any + Name string +} + +func main() { + libName := os.Getenv("OMNIVOICE_LIBRARY") + if libName == "" { + libName = "./libgomnivoicecpp-fallback.so" + } + + lib, err := 
purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
+	if err != nil {
+		panic(err)
+	}
+
+	libFuncs := []LibFuncs{
+		{&CppLoad, "omni_load"},
+		{&CppTTS, "omni_tts"},
+		{&CppTTSStream, "omni_tts_stream"},
+		{&CppPCMFree, "omni_pcm_free"},
+		{&CppUnload, "omni_unload"},
+	}
+	for _, lf := range libFuncs {
+		purego.RegisterLibFunc(lf.FuncPtr, lib, lf.Name)
+	}
+
+	flag.Parse()
+
+	if err := grpc.StartServer(*addr, &OmnivoiceCpp{}); err != nil {
+		panic(err)
+	}
+}
diff --git a/backend/go/omnivoice-cpp/options.go b/backend/go/omnivoice-cpp/options.go
new file mode 100644
index 000000000..7f0551c1a
--- /dev/null
+++ b/backend/go/omnivoice-cpp/options.go
@@ -0,0 +1,74 @@
+package main
+
+import (
+	"strconv"
+	"strings"
+)
+
+// loadOptions holds the parsed model-level options for OmniVoice.
+type loadOptions struct {
+	codecPath string
+	useFA     bool
+	clampFP16 bool
+	seed      int64
+	denoise   bool
+}
+
+func splitOption(o string) (key, value string, ok bool) {
+	i := strings.Index(o, ":")
+	if i < 0 {
+		return "", "", false
+	}
+	return strings.TrimSpace(o[:i]), strings.TrimSpace(o[i+1:]), true
+}
+
+// parseOptions reads the backend "key:value" option slice. Entries without a
+// ':' and unknown keys are ignored. The boolean options (use_fa, clamp_fp16,
+// denoise) are switched on by any spelling strconv.ParseBool reads as true
+// ("1", "t", "true", "True", "TRUE", ...) and switched off by anything else.
+// A seed that is not a base-10 integer leaves the current seed untouched.
+// Defaults: seed -1 (engine default), denoise true.
+func parseOptions(opts []string) loadOptions {
+	o := loadOptions{seed: -1, denoise: true}
+	// enabled keeps "true" and "1" working as before and additionally accepts
+	// capitalised spellings such as "True"; a parse error counts as false,
+	// exactly like the plain string comparison it replaces.
+	enabled := func(value string) bool {
+		b, err := strconv.ParseBool(value)
+		return err == nil && b
+	}
+	for _, oo := range opts {
+		key, value, ok := splitOption(oo)
+		if !ok {
+			continue
+		}
+		switch key {
+		case "tokenizer", "codec":
+			o.codecPath = value
+		case "use_fa":
+			o.useFA = enabled(value)
+		case "clamp_fp16":
+			o.clampFP16 = enabled(value)
+		case "seed":
+			if n, err := strconv.ParseInt(value, 10, 64); err == nil {
+				o.seed = n
+			}
+		case "denoise":
+			o.denoise = enabled(value)
+		}
+	}
+	return o
+}
+
+// languageNameAliases maps full language names to OmniVoice codes.
OmniVoice's +// lang hint accepts "" (auto), "en", "zh" per the upstream convention; other +// codes pass through and the engine treats unknown hints as auto. +var languageNameAliases = map[string]string{ + "english": "en", + "chinese": "zh", +} + +// normalizeLanguage lowercases, trims, strips a region/locale suffix, and +// resolves common full names. Empty stays empty so the engine auto-detects. +func normalizeLanguage(lang string) string { + lang = strings.ToLower(strings.TrimSpace(lang)) + if lang == "" { + return "" + } + if i := strings.IndexAny(lang, "-_."); i >= 0 { + lang = lang[:i] + } + if code, ok := languageNameAliases[lang]; ok { + return code + } + return lang +} diff --git a/backend/go/omnivoice-cpp/package.sh b/backend/go/omnivoice-cpp/package.sh new file mode 100755 index 000000000..b8313d9d7 --- /dev/null +++ b/backend/go/omnivoice-cpp/package.sh @@ -0,0 +1,64 @@ +#!/bin/bash + +# Script to copy the appropriate libraries based on architecture +# This script is used in the final stage of the Dockerfile + +set -e + +CURDIR=$(dirname "$(realpath $0)") +REPO_ROOT="${CURDIR}/../../.." + +# Create lib directory +mkdir -p $CURDIR/package/lib + +cp -avf $CURDIR/omnivoice-cpp $CURDIR/package/ +cp -fv $CURDIR/libgomnivoicecpp-*.so $CURDIR/package/ +cp -fv $CURDIR/run.sh $CURDIR/package/ + +# Detect architecture and copy appropriate libraries +if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then + # x86_64 architecture + echo "Detected x86_64 architecture, copying x86_64 libraries..." 
+    # Copy the dynamic loader (as lib/ld.so) and the listed runtime libraries
+    # into package/lib, dereferencing symlinks (-L); each one is copied once.
+    cp -arfLv /lib64/ld-linux-x86-64.so.2 $CURDIR/package/lib/ld.so
+    cp -arfLv /lib/x86_64-linux-gnu/libc.so.6 $CURDIR/package/lib/libc.so.6
+    cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1
+    cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6
+    cp -arfLv /lib/x86_64-linux-gnu/libm.so.6 $CURDIR/package/lib/libm.so.6
+    cp -arfLv /lib/x86_64-linux-gnu/libgomp.so.1 $CURDIR/package/lib/libgomp.so.1
+    cp -arfLv /lib/x86_64-linux-gnu/libdl.so.2 $CURDIR/package/lib/libdl.so.2
+    cp -arfLv /lib/x86_64-linux-gnu/librt.so.1 $CURDIR/package/lib/librt.so.1
+    cp -arfLv /lib/x86_64-linux-gnu/libpthread.so.0 $CURDIR/package/lib/libpthread.so.0
+elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
+    # ARM64 architecture
+    echo "Detected ARM64 architecture, copying ARM64 libraries..."
+    # Same set as the x86_64 branch, taken from the aarch64 multiarch
+    # directory; each library is copied once.
+    cp -arfLv /lib/ld-linux-aarch64.so.1 $CURDIR/package/lib/ld.so
+    cp -arfLv /lib/aarch64-linux-gnu/libc.so.6 $CURDIR/package/lib/libc.so.6
+    cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1
+    cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6
+    cp -arfLv /lib/aarch64-linux-gnu/libm.so.6 $CURDIR/package/lib/libm.so.6
+    cp -arfLv /lib/aarch64-linux-gnu/libgomp.so.1 $CURDIR/package/lib/libgomp.so.1
+    cp -arfLv /lib/aarch64-linux-gnu/libdl.so.2 $CURDIR/package/lib/libdl.so.2
+    cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 $CURDIR/package/lib/librt.so.1
+    cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 $CURDIR/package/lib/libpthread.so.0
+elif [ "$(uname -s)" = "Darwin" ]; then
+    echo "Detected Darwin"
+else
+    echo "Error: Could not detect architecture"
+    exit 1
+fi + +# Package GPU libraries based on BUILD_TYPE +GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh" +if [ -f "$GPU_LIB_SCRIPT" ]; then + echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..." + source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib" + package_gpu_libs +fi + +echo "Packaging completed successfully" +ls -liah $CURDIR/package/ +ls -liah $CURDIR/package/lib/ diff --git a/backend/go/omnivoice-cpp/run.sh b/backend/go/omnivoice-cpp/run.sh new file mode 100755 index 000000000..f677ca21c --- /dev/null +++ b/backend/go/omnivoice-cpp/run.sh @@ -0,0 +1,52 @@ +#!/bin/bash +set -ex + +# Get the absolute current dir where the script is located +CURDIR=$(dirname "$(realpath $0)") + +cd / + +echo "CPU info:" +if [ "$(uname)" != "Darwin" ]; then + grep -e "model\sname" /proc/cpuinfo | head -1 + grep -e "flags" /proc/cpuinfo | head -1 +fi + +LIBRARY="$CURDIR/libgomnivoicecpp-fallback.so" + +if [ "$(uname)" != "Darwin" ]; then + if grep -q -e "\savx\s" /proc/cpuinfo ; then + echo "CPU: AVX found OK" + if [ -e $CURDIR/libgomnivoicecpp-avx.so ]; then + LIBRARY="$CURDIR/libgomnivoicecpp-avx.so" + fi + fi + + if grep -q -e "\savx2\s" /proc/cpuinfo ; then + echo "CPU: AVX2 found OK" + if [ -e $CURDIR/libgomnivoicecpp-avx2.so ]; then + LIBRARY="$CURDIR/libgomnivoicecpp-avx2.so" + fi + fi + + # Check avx 512 + if grep -q -e "\savx512f\s" /proc/cpuinfo ; then + echo "CPU: AVX512F found OK" + if [ -e $CURDIR/libgomnivoicecpp-avx512.so ]; then + LIBRARY="$CURDIR/libgomnivoicecpp-avx512.so" + fi + fi +fi + +export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH +export OMNIVOICE_LIBRARY=$LIBRARY + +# If there is a lib/ld.so, use it +if [ -f $CURDIR/lib/ld.so ]; then + echo "Using lib/ld.so" + echo "Using library: $LIBRARY" + exec $CURDIR/lib/ld.so $CURDIR/omnivoice-cpp "$@" +fi + +echo "Using library: $LIBRARY" +exec $CURDIR/omnivoice-cpp "$@" diff --git a/backend/go/omnivoice-cpp/test.sh b/backend/go/omnivoice-cpp/test.sh new file mode 100755 index 
000000000..9205582fe
--- /dev/null
+++ b/backend/go/omnivoice-cpp/test.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+set -e
+
+CURDIR=$(dirname "$(realpath $0)")
+cd "$CURDIR"
+
+echo "Running omnivoice-cpp backend tests..."
+
+if [ -z "$OMNIVOICE_MODEL" ]; then
+    MODEL_DIR="./omnivoice-models"
+    mkdir -p "$MODEL_DIR"
+    REPO_ID="Serveurperso/OmniVoice-GGUF"
+    BASE_URL="https://huggingface.co/${REPO_ID}/resolve/main"
+    FILES=( "omnivoice-base-Q4_K_M.gguf" "omnivoice-tokenizer-Q4_K_M.gguf" )
+    for file in "${FILES[@]}"; do
+        dest="${MODEL_DIR}/${file}"
+        if [ -f "${dest}" ]; then
+            echo " [skip] ${file}"
+        else
+            echo " [download] ${file}..."
+            curl -fL -o "${dest}" "${BASE_URL}/${file}" --progress-bar  # -f: fail on HTTP errors instead of saving the error body as a model file
+        fi
+    done
+    export OMNIVOICE_MODEL="${MODEL_DIR}/omnivoice-base-Q4_K_M.gguf"
+    export OMNIVOICE_CODEC="${MODEL_DIR}/omnivoice-tokenizer-Q4_K_M.gguf"
+fi
+
+go test -v -timeout 1200s .
+
+echo "All omnivoice-cpp e2e tests passed."
diff --git a/backend/index.yaml b/backend/index.yaml
index ab971cff5..43716a7d1 100644
--- a/backend/index.yaml
+++ b/backend/index.yaml
@@ -789,6 +789,33 @@
     nvidia-l4t: "nvidia-l4t-arm64-qwen3-tts-cpp"
     nvidia-l4t-cuda-12: "nvidia-l4t-arm64-qwen3-tts-cpp"
     nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-qwen3-tts-cpp"
+- &omnivoicecpp
+  name: "omnivoice-cpp"
+  description: |
+    OmniVoice C++ backend using GGML. Native text-to-speech with voice cloning
+    (reference audio + transcript) and voice design (attribute keywords: gender,
+    age, pitch, style, volume, emotion). 24kHz mono output, 646 languages.
+    Supports streaming synthesis.
+ urls: + - https://github.com/ServeurpersoCom/omnivoice.cpp + tags: + - text-to-speech + - tts + - voice-cloning + - voice-design + alias: "omnivoice-cpp" + capabilities: + default: "cpu-omnivoice-cpp" + nvidia: "cuda12-omnivoice-cpp" + nvidia-cuda-13: "cuda13-omnivoice-cpp" + nvidia-cuda-12: "cuda12-omnivoice-cpp" + intel: "intel-sycl-f16-omnivoice-cpp" + metal: "metal-omnivoice-cpp" + amd: "rocm-omnivoice-cpp" + vulkan: "vulkan-omnivoice-cpp" + nvidia-l4t: "nvidia-l4t-arm64-omnivoice-cpp" + nvidia-l4t-cuda-12: "nvidia-l4t-arm64-omnivoice-cpp" + nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-omnivoice-cpp" - &vibevoicecpp name: "vibevoice-cpp" description: | @@ -3473,6 +3500,121 @@ uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-qwen3-tts-cpp" mirrors: - localai/localai-backends:master-gpu-nvidia-cuda-13-qwen3-tts-cpp +## omnivoice-cpp +- !!merge <<: *omnivoicecpp + name: "omnivoice-cpp-development" + capabilities: + default: "cpu-omnivoice-cpp-development" + nvidia: "cuda12-omnivoice-cpp-development" + nvidia-cuda-13: "cuda13-omnivoice-cpp-development" + nvidia-cuda-12: "cuda12-omnivoice-cpp-development" + intel: "intel-sycl-f16-omnivoice-cpp-development" + metal: "metal-omnivoice-cpp-development" + amd: "rocm-omnivoice-cpp-development" + vulkan: "vulkan-omnivoice-cpp-development" + nvidia-l4t: "nvidia-l4t-arm64-omnivoice-cpp-development" + nvidia-l4t-cuda-12: "nvidia-l4t-arm64-omnivoice-cpp-development" + nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-omnivoice-cpp-development" +- !!merge <<: *omnivoicecpp + name: "nvidia-l4t-arm64-omnivoice-cpp" + uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-arm64-omnivoice-cpp" + mirrors: + - localai/localai-backends:latest-nvidia-l4t-arm64-omnivoice-cpp +- !!merge <<: *omnivoicecpp + name: "nvidia-l4t-arm64-omnivoice-cpp-development" + uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-arm64-omnivoice-cpp" + mirrors: + - localai/localai-backends:master-nvidia-l4t-arm64-omnivoice-cpp +- 
!!merge <<: *omnivoicecpp + name: "cuda13-nvidia-l4t-arm64-omnivoice-cpp" + uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-omnivoice-cpp" + mirrors: + - localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-omnivoice-cpp +- !!merge <<: *omnivoicecpp + name: "cuda13-nvidia-l4t-arm64-omnivoice-cpp-development" + uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-omnivoice-cpp" + mirrors: + - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-omnivoice-cpp +- !!merge <<: *omnivoicecpp + name: "cpu-omnivoice-cpp" + uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-omnivoice-cpp" + mirrors: + - localai/localai-backends:latest-cpu-omnivoice-cpp +- !!merge <<: *omnivoicecpp + name: "metal-omnivoice-cpp" + uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-omnivoice-cpp" + mirrors: + - localai/localai-backends:latest-metal-darwin-arm64-omnivoice-cpp +- !!merge <<: *omnivoicecpp + name: "metal-omnivoice-cpp-development" + uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-omnivoice-cpp" + mirrors: + - localai/localai-backends:master-metal-darwin-arm64-omnivoice-cpp +- !!merge <<: *omnivoicecpp + name: "cpu-omnivoice-cpp-development" + uri: "quay.io/go-skynet/local-ai-backends:master-cpu-omnivoice-cpp" + mirrors: + - localai/localai-backends:master-cpu-omnivoice-cpp +- !!merge <<: *omnivoicecpp + name: "cuda12-omnivoice-cpp" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-omnivoice-cpp" + mirrors: + - localai/localai-backends:latest-gpu-nvidia-cuda-12-omnivoice-cpp +- !!merge <<: *omnivoicecpp + name: "rocm-omnivoice-cpp" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-omnivoice-cpp" + mirrors: + - localai/localai-backends:latest-gpu-rocm-hipblas-omnivoice-cpp +- !!merge <<: *omnivoicecpp + name: "intel-sycl-f32-omnivoice-cpp" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-omnivoice-cpp" + mirrors: + - 
localai/localai-backends:latest-gpu-intel-sycl-f32-omnivoice-cpp +- !!merge <<: *omnivoicecpp + name: "intel-sycl-f16-omnivoice-cpp" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-omnivoice-cpp" + mirrors: + - localai/localai-backends:latest-gpu-intel-sycl-f16-omnivoice-cpp +- !!merge <<: *omnivoicecpp + name: "vulkan-omnivoice-cpp" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-vulkan-omnivoice-cpp" + mirrors: + - localai/localai-backends:latest-gpu-vulkan-omnivoice-cpp +- !!merge <<: *omnivoicecpp + name: "vulkan-omnivoice-cpp-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-vulkan-omnivoice-cpp" + mirrors: + - localai/localai-backends:master-gpu-vulkan-omnivoice-cpp +- !!merge <<: *omnivoicecpp + name: "cuda12-omnivoice-cpp-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-omnivoice-cpp" + mirrors: + - localai/localai-backends:master-gpu-nvidia-cuda-12-omnivoice-cpp +- !!merge <<: *omnivoicecpp + name: "rocm-omnivoice-cpp-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-omnivoice-cpp" + mirrors: + - localai/localai-backends:master-gpu-rocm-hipblas-omnivoice-cpp +- !!merge <<: *omnivoicecpp + name: "intel-sycl-f32-omnivoice-cpp-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f32-omnivoice-cpp" + mirrors: + - localai/localai-backends:master-gpu-intel-sycl-f32-omnivoice-cpp +- !!merge <<: *omnivoicecpp + name: "intel-sycl-f16-omnivoice-cpp-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f16-omnivoice-cpp" + mirrors: + - localai/localai-backends:master-gpu-intel-sycl-f16-omnivoice-cpp +- !!merge <<: *omnivoicecpp + name: "cuda13-omnivoice-cpp" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-omnivoice-cpp" + mirrors: + - localai/localai-backends:latest-gpu-nvidia-cuda-13-omnivoice-cpp +- !!merge <<: *omnivoicecpp + name: "cuda13-omnivoice-cpp-development" + uri: 
"quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-omnivoice-cpp" + mirrors: + - localai/localai-backends:master-gpu-nvidia-cuda-13-omnivoice-cpp ## vibevoice-cpp - !!merge <<: *vibevoicecpp name: "nvidia-l4t-arm64-vibevoice-cpp" diff --git a/core/gallery/importers/omnivoice_test.go b/core/gallery/importers/omnivoice_test.go new file mode 100644 index 000000000..e2d46fe93 --- /dev/null +++ b/core/gallery/importers/omnivoice_test.go @@ -0,0 +1,32 @@ +package importers_test + +import ( + "encoding/json" + "fmt" + + "github.com/mudler/LocalAI/core/gallery/importers" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +var _ = Describe("OmniVoice pref-only guard", func() { + Context("With only a bare OmniVoice GGUF URI", func() { + It("does not auto-import as omnivoice-cpp", func() { + // omnivoice-cpp is a preference-only backend (listed in the + // /backends/known registry with AutoDetect:false). No importer + // emits it, so discovering a bare OmniVoice GGUF must never + // silently resolve to omnivoice-cpp. It may legitimately match a + // generic GGUF importer (e.g. llama-cpp) or error/be ambiguous — + // the only hard requirement is that it is NOT omnivoice-cpp. + uri := "huggingface://Serveurperso/OmniVoice-GGUF/omnivoice-base-Q8_0.gguf" + preferences := json.RawMessage(`{}`) + + modelConfig, err := importers.DiscoverModelConfig(uri, preferences) + if err != nil { + // An error (including ambiguous) is acceptable for a pref-only backend. 
+ return + } + Expect(modelConfig.ConfigFile).ToNot(ContainSubstring("backend: omnivoice-cpp"), fmt.Sprintf("Model config: %+v", modelConfig)) + }) + }) +}) diff --git a/core/http/endpoints/localai/backend.go b/core/http/endpoints/localai/backend.go index cbda648d6..331e49e43 100644 --- a/core/http/endpoints/localai/backend.go +++ b/core/http/endpoints/localai/backend.go @@ -36,6 +36,7 @@ var knownPrefOnlyBackends = []schema.KnownBackend{ {Name: "kokoros", Modality: "tts", AutoDetect: false, Description: "Kokoros TTS (preference-only)"}, {Name: "qwen-tts", Modality: "tts", AutoDetect: false, Description: "Qwen TTS (preference-only)"}, {Name: "qwen3-tts-cpp", Modality: "tts", AutoDetect: false, Description: "Qwen3 TTS C++ (preference-only)"}, + {Name: "omnivoice-cpp", Modality: "tts", AutoDetect: false, Description: "OmniVoice C++ TTS with voice cloning and voice design (preference-only)"}, {Name: "faster-qwen3-tts", Modality: "tts", AutoDetect: false, Description: "Faster Qwen3 TTS (preference-only)"}, // Detection {Name: "sam3-cpp", Modality: "detection", AutoDetect: false, Description: "SAM3 C++ object detection (preference-only)"}, diff --git a/docs/content/features/text-to-audio.md b/docs/content/features/text-to-audio.md index 219814e9d..b896aaf6e 100644 --- a/docs/content/features/text-to-audio.md +++ b/docs/content/features/text-to-audio.md @@ -226,6 +226,82 @@ curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{ }' | aplay ``` +### OmniVoice + +[OmniVoice](https://github.com/ServeurpersoCom/omnivoice.cpp) (`omnivoice-cpp` backend) is a native C++ / GGML text-to-speech engine. It supports voice cloning (from reference audio plus its transcript), voice design (steering the voice with attribute keywords such as gender, age, pitch, style, volume, and emotion), and streaming synthesis. Output is 24kHz mono audio and it covers 646 languages. 
+ +#### Setup + +Install the `omnivoice-cpp` model in the Model gallery or run `local-ai run models install omnivoice-cpp`. A higher-quality BF16 variant is available as `omnivoice-cpp-hq` (the default `omnivoice-cpp` ships Q8_0 GGUFs). + +#### Usage + +Use the speech endpoint by specifying the omnivoice-cpp backend: + +```bash +curl http://localhost:8080/v1/audio/speech -H "Content-Type: application/json" -d '{ + "model": "omnivoice-cpp", + "input": "Hello world, this is a test." + }' | aplay +``` + +#### Voice cloning + +Pass a reference audio file via the `voice` parameter and its transcript via the `ref_text` generation parameter: + +```bash +curl http://localhost:8080/v1/audio/speech -H "Content-Type: application/json" -d '{ + "model": "omnivoice-cpp", + "input": "Hello world, this is a test.", + "voice": "path/to/reference_audio.wav", + "params": { "ref_text": "This is the transcript of the reference audio." } + }' | aplay +``` + +You can also pin a default cloned voice in the model config so callers do not have to pass it on every request. 
Both `tts.voice` and `tts.audio_path` are honored as the reference audio (a per-request `voice` overrides them); paths are resolved relative to the model directory:
+
+```yaml
+name: omnivoice-cpp
+backend: omnivoice-cpp
+parameters:
+  model: omnivoice-cpp/omnivoice-base-Q8_0.gguf
+tts:
+  audio_path: "voices/my_reference.wav" # default cloning reference (or use tts.voice)
+options:
+  - "tokenizer:omnivoice-tokenizer-Q8_0.gguf" # relative to the directory of the model file
+```
+
+#### Voice design
+
+Steer the synthesized voice with attribute keywords (gender, age, pitch, style, volume, emotion) by passing an `instructions` string per request:
+
+```bash
+curl http://localhost:8080/v1/audio/speech -H "Content-Type: application/json" -d '{
+    "model": "omnivoice-cpp",
+    "input": "Hello world, this is a test.",
+    "instructions": "female young high soft emotion:happy"
+  }' | aplay
+```
+
+#### Configuration
+
+The backend loads the base GGUF from `parameters.model` and its tokenizer from the `tokenizer:` option (`codec:` is accepted as an alias). A relative tokenizer path is resolved against the directory that contains the model file; when the option is omitted, the backend uses the first `*tokenizer*.gguf` file it finds in that directory. A few optional generation knobs are available as `options`:
+
+```yaml
+name: omnivoice-cpp
+backend: omnivoice-cpp
+parameters:
+  model: omnivoice-cpp/omnivoice-base-Q8_0.gguf
+options:
+  - "tokenizer:omnivoice-tokenizer-Q8_0.gguf"
+  - "use_fa:true" # enable flash attention
+  - "clamp_fp16:true" # clamp activations for fp16 stability
+  - "seed:42" # deterministic generation
+  - "denoise:true" # denoise the generated audio
+```
+
+A per-request `seed` can also be supplied through the `params` map alongside `ref_text`.
+
 ### Pocket TTS
 
 [Pocket TTS](https://github.com/kyutai-labs/pocket-tts) is a lightweight text-to-speech model designed to run efficiently on CPUs. It supports voice cloning through HuggingFace voice URLs or local audio files.
diff --git a/docs/content/reference/compatibility-table.md b/docs/content/reference/compatibility-table.md index e952deba7..53192cedf 100644 --- a/docs/content/reference/compatibility-table.md +++ b/docs/content/reference/compatibility-table.md @@ -57,6 +57,7 @@ LocalAI will attempt to automatically load models which are not explicitly confi | [VoxCPM](https://github.com/ModelBest/VoxCPM) | Expressive end-to-end TTS | CPU, CUDA 12/13, ROCm, Intel, Metal | | [Kitten TTS](https://github.com/KittenML/KittenTTS) | Kitten TTS model | CPU, Metal | | [MLX-Audio](https://github.com/Blaizzy/mlx-audio) | Audio models on Apple Silicon | Metal, CPU, CUDA 12/13, Jetson L4T | +| [OmniVoice](https://github.com/ServeurpersoCom/omnivoice.cpp) | Native C++/GGML TTS with voice cloning, voice design, and streaming | CPU, CUDA 12/13, ROCm, Intel, Metal, Vulkan, Jetson L4T | ## Music Generation diff --git a/gallery/index.yaml b/gallery/index.yaml index 3ca95556e..1a3bf8db2 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -3336,6 +3336,72 @@ - filename: qwen3-tts-cpp/qwen3-tts-tokenizer-f16.gguf sha256: d1ad9660bd99343f4851d5a4b17e31f65648feb3559f6ea062ae6575e5cd9d90 uri: huggingface://endo5501/qwen3-tts.cpp/qwen3-tts-tokenizer-f16.gguf +- name: omnivoice-cpp + url: github:mudler/LocalAI/gallery/virtual.yaml@master + urls: + - https://huggingface.co/Serveurperso/OmniVoice-GGUF + - https://github.com/ServeurpersoCom/omnivoice.cpp + description: | + OmniVoice (C++ / GGML) - native text-to-speech with voice cloning and voice + design. 24kHz mono output, 646 languages, streaming synthesis. Q8_0 GGUFs + (~945 MB total): 612M Qwen3 backbone + RVQ audio codec. 
+ license: apache-2.0 + tags: + - tts + - text-to-speech + - voice-cloning + - voice-design + - omnivoice + - gguf + overrides: + backend: omnivoice-cpp + known_usecases: + - tts + name: omnivoice-cpp + parameters: + model: omnivoice-cpp/omnivoice-base-Q8_0.gguf + options: + - "tokenizer:omnivoice-tokenizer-Q8_0.gguf" + files: + - filename: omnivoice-cpp/omnivoice-base-Q8_0.gguf + sha256: 2882d887921798aea13d45236556bdf8012842ab6f8cd2690943eead6289f298 + uri: huggingface://Serveurperso/OmniVoice-GGUF/omnivoice-base-Q8_0.gguf + - filename: omnivoice-cpp/omnivoice-tokenizer-Q8_0.gguf + sha256: 75204fa566a8e30984e7a1066da6557184c9fd099c8f1bc0cb5b9415edfec255 + uri: huggingface://Serveurperso/OmniVoice-GGUF/omnivoice-tokenizer-Q8_0.gguf +- name: omnivoice-cpp-hq + url: github:mudler/LocalAI/gallery/virtual.yaml@master + urls: + - https://huggingface.co/Serveurperso/OmniVoice-GGUF + - https://github.com/ServeurpersoCom/omnivoice.cpp + description: | + OmniVoice (C++ / GGML), BF16 high-quality variant - text-to-speech with voice + cloning and voice design. 24kHz mono, 646 languages, streaming. BF16 GGUFs + (~1.6 GB total). + license: apache-2.0 + tags: + - tts + - text-to-speech + - voice-cloning + - voice-design + - omnivoice + - gguf + overrides: + backend: omnivoice-cpp + known_usecases: + - tts + name: omnivoice-cpp-hq + parameters: + model: omnivoice-cpp-hq/omnivoice-base-BF16.gguf + options: + - "tokenizer:omnivoice-tokenizer-BF16.gguf" + files: + - filename: omnivoice-cpp-hq/omnivoice-base-BF16.gguf + sha256: c4d2e4e6506a88f9c9900621606470bca6a523c72819bf4a5e5dac80961075bf + uri: huggingface://Serveurperso/OmniVoice-GGUF/omnivoice-base-BF16.gguf + - filename: omnivoice-cpp-hq/omnivoice-tokenizer-BF16.gguf + sha256: c2179e4cf528b19fea22a5be94c34c083877bb5fc28ac0245d2b4299a262dcec + uri: huggingface://Serveurperso/OmniVoice-GGUF/omnivoice-tokenizer-BF16.gguf - name: qwen3-tts-cpp-customvoice url: github:mudler/LocalAI/gallery/virtual.yaml@master urls: