diff --git a/.github/backend-matrix.yml b/.github/backend-matrix.yml index 593e44cde..71c3d968c 100644 --- a/.github/backend-matrix.yml +++ b/.github/backend-matrix.yml @@ -3723,6 +3723,154 @@ include: dockerfile: "./backend/Dockerfile.golang" context: "./" ubuntu-version: '2404' + # voice-detect + - build-type: 'cublas' + cuda-major-version: "12" + cuda-minor-version: "8" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-nvidia-cuda-12-voice-detect' + runs-on: 'ubuntu-latest' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "voice-detect" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2404' + - build-type: 'cublas' + cuda-major-version: "13" + cuda-minor-version: "0" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-nvidia-cuda-13-voice-detect' + runs-on: 'ubuntu-latest' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "voice-detect" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2404' + - build-type: 'cublas' + cuda-major-version: "13" + cuda-minor-version: "0" + platforms: 'linux/arm64' + skip-drivers: 'false' + tag-latest: 'auto' + tag-suffix: '-nvidia-l4t-cuda-13-arm64-voice-detect' + base-image: "ubuntu:24.04" + ubuntu-version: '2404' + runs-on: 'ubuntu-24.04-arm' + backend: "voice-detect" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + - build-type: '' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + platform-tag: 'amd64' + tag-latest: 'auto' + tag-suffix: '-cpu-voice-detect' + runs-on: 'ubuntu-latest' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "voice-detect" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2404' + - build-type: '' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/arm64' + platform-tag: 'arm64' + tag-latest: 'auto' + tag-suffix: '-cpu-voice-detect' + runs-on: 'ubuntu-24.04-arm' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "voice-detect" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2404' + - build-type: 'sycl_f32' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-intel-sycl-f32-voice-detect' + runs-on: 'ubuntu-latest' + base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04" + skip-drivers: 'false' + backend: "voice-detect" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2404' + - build-type: 'sycl_f16' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-intel-sycl-f16-voice-detect' + runs-on: 'ubuntu-latest' + base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04" + skip-drivers: 'false' + backend: "voice-detect" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2404' + - build-type: 'vulkan' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + platform-tag: 'amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-vulkan-voice-detect' + runs-on: 'ubuntu-latest' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "voice-detect" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2404' + - build-type: 'vulkan' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/arm64' + platform-tag: 'arm64' + tag-latest: 'auto' + tag-suffix: '-gpu-vulkan-voice-detect' + runs-on: 'ubuntu-24.04-arm' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "voice-detect" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2404' + - build-type: 'cublas' + cuda-major-version: "12" + cuda-minor-version: "0" + platforms: 'linux/arm64' + skip-drivers: 'false' + tag-latest: 'auto' + tag-suffix: '-nvidia-l4t-arm64-voice-detect' + base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0" + runs-on: 'ubuntu-24.04-arm' + backend: "voice-detect" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2204' + - build-type: 'hipblas' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-rocm-hipblas-voice-detect' + base-image: "rocm/dev-ubuntu-24.04:7.2.1" + runs-on: 'ubuntu-latest' + skip-drivers: 'false' + backend: "voice-detect" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2404' # acestep-cpp - build-type: '' cuda-major-version: "" @@ -4906,6 +5054,10 @@ includeDarwin: tag-suffix: "-metal-darwin-arm64-ced" build-type: "metal" lang: "go" + - backend: "voice-detect" + tag-suffix: "-metal-darwin-arm64-voice-detect" + build-type: "metal" + lang: "go" - backend: "acestep-cpp" tag-suffix: "-metal-darwin-arm64-acestep-cpp" build-type: "metal" diff --git a/.github/workflows/bump_deps.yaml b/.github/workflows/bump_deps.yaml index 481c9a609..c8ad341d4 100644 --- a/.github/workflows/bump_deps.yaml +++ b/.github/workflows/bump_deps.yaml @@ -46,6 +46,10 @@ jobs: variable: "CED_VERSION" branch: "master" file: "backend/go/ced/Makefile" + - repository: "mudler/voice-detect.cpp" + variable: "VOICEDETECT_VERSION" + branch: "master" + file: "backend/go/voice-detect/Makefile" - repository: "mudler/depth-anything.cpp" variable: "DEPTHANYTHING_VERSION" branch: "master" diff --git a/backend/index.yaml b/backend/index.yaml index 3f61f7b4e..466c31314 100644 --- a/backend/index.yaml +++ b/backend/index.yaml @@ -209,6 +209,42 @@ nvidia-cuda-12: "cuda12-ced" nvidia-l4t-cuda-12: "nvidia-l4t-arm64-ced" nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-ced" +- &voicedetect + name: "voice-detect" + alias: "voice-detect" + license: mit + icon: https://avatars.githubusercontent.com/u/95302084 + description: | + voice-detect speaker recognition and voice analysis. + voice-detect.cpp is a C++/ggml engine that produces L2-normalised + speaker embeddings (ECAPA-TDNN, WeSpeaker ResNet34, 3D-Speaker + ERes2Net, CAM++) for voice verification and 1:N identification, plus + a wav2vec2 age / gender / emotion analysis head. It replaces the + Python speaker-recognition backend and is exposed through the Voice* + gRPC rpcs and the /v1/voice/* REST endpoints. It runs on CPU, NVIDIA + CUDA, AMD ROCm/HIP, Intel SYCL, Vulkan and NVIDIA Jetson (L4T) targets. + urls: + - https://github.com/mudler/voice-detect.cpp + tags: + - voice-recognition + - speaker-verification + - speaker-embedding + - CPU + - GPU + - CUDA + - HIP + capabilities: + default: "cpu-voice-detect" + nvidia: "cuda12-voice-detect" + intel: "intel-sycl-f16-voice-detect" + metal: "metal-voice-detect" + amd: "rocm-voice-detect" + vulkan: "vulkan-voice-detect" + nvidia-l4t: "nvidia-l4t-arm64-voice-detect" + nvidia-cuda-13: "cuda13-voice-detect" + nvidia-cuda-12: "cuda12-voice-detect" + nvidia-l4t-cuda-12: "nvidia-l4t-arm64-voice-detect" + nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-voice-detect" - &voxtral name: "voxtral" alias: "voxtral" @@ -2796,6 +2832,121 @@ uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-ced" mirrors: - localai/localai-backends:master-gpu-nvidia-cuda-13-ced +## voice-detect +- !!merge <<: *voicedetect + name: "voice-detect-development" + capabilities: + default: "cpu-voice-detect-development" + nvidia: "cuda12-voice-detect-development" + intel: "intel-sycl-f16-voice-detect-development" + metal: "metal-voice-detect-development" + amd: "rocm-voice-detect-development" + vulkan: "vulkan-voice-detect-development" + nvidia-l4t: "nvidia-l4t-arm64-voice-detect-development" + nvidia-cuda-13: "cuda13-voice-detect-development" + nvidia-cuda-12: "cuda12-voice-detect-development" + nvidia-l4t-cuda-12: "nvidia-l4t-arm64-voice-detect-development" + nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-voice-detect-development" +- !!merge <<: *voicedetect + name: "nvidia-l4t-arm64-voice-detect" + uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-arm64-voice-detect" + mirrors: + - localai/localai-backends:latest-nvidia-l4t-arm64-voice-detect +- !!merge <<: *voicedetect + name: "nvidia-l4t-arm64-voice-detect-development" + uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-arm64-voice-detect" + mirrors: + - localai/localai-backends:master-nvidia-l4t-arm64-voice-detect +- !!merge <<: *voicedetect + name: "cuda13-nvidia-l4t-arm64-voice-detect" + uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-voice-detect" + mirrors: + - localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-voice-detect +- !!merge <<: *voicedetect + name: "cuda13-nvidia-l4t-arm64-voice-detect-development" + uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-voice-detect" + mirrors: + - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-voice-detect +- !!merge <<: *voicedetect + name: "cpu-voice-detect" + uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-voice-detect" + mirrors: + - localai/localai-backends:latest-cpu-voice-detect +- !!merge <<: *voicedetect + name: "cpu-voice-detect-development" + uri: "quay.io/go-skynet/local-ai-backends:master-cpu-voice-detect" + mirrors: + - localai/localai-backends:master-cpu-voice-detect +- !!merge <<: *voicedetect + name: "metal-voice-detect" + uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-voice-detect" + mirrors: + - localai/localai-backends:latest-metal-darwin-arm64-voice-detect +- !!merge <<: *voicedetect + name: "metal-voice-detect-development" + uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-voice-detect" + mirrors: + - localai/localai-backends:master-metal-darwin-arm64-voice-detect +- !!merge <<: *voicedetect + name: "cuda12-voice-detect" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-voice-detect" + mirrors: + - localai/localai-backends:latest-gpu-nvidia-cuda-12-voice-detect +- !!merge <<: *voicedetect + name: "cuda12-voice-detect-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-voice-detect" + mirrors: + - localai/localai-backends:master-gpu-nvidia-cuda-12-voice-detect +- !!merge <<: *voicedetect + name: "rocm-voice-detect" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-voice-detect" + mirrors: + - localai/localai-backends:latest-gpu-rocm-hipblas-voice-detect +- !!merge <<: *voicedetect + name: "rocm-voice-detect-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-voice-detect" + mirrors: + - localai/localai-backends:master-gpu-rocm-hipblas-voice-detect +- !!merge <<: *voicedetect + name: "intel-sycl-f32-voice-detect" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-voice-detect" + mirrors: + - localai/localai-backends:latest-gpu-intel-sycl-f32-voice-detect +- !!merge <<: *voicedetect + name: "intel-sycl-f32-voice-detect-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f32-voice-detect" + mirrors: + - localai/localai-backends:master-gpu-intel-sycl-f32-voice-detect +- !!merge <<: *voicedetect + name: "intel-sycl-f16-voice-detect" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-voice-detect" + mirrors: + - localai/localai-backends:latest-gpu-intel-sycl-f16-voice-detect +- !!merge <<: *voicedetect + name: "intel-sycl-f16-voice-detect-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f16-voice-detect" + mirrors: + - localai/localai-backends:master-gpu-intel-sycl-f16-voice-detect +- !!merge <<: *voicedetect + name: "vulkan-voice-detect" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-vulkan-voice-detect" + mirrors: + - localai/localai-backends:latest-gpu-vulkan-voice-detect +- !!merge <<: *voicedetect + name: "vulkan-voice-detect-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-vulkan-voice-detect" + mirrors: + - localai/localai-backends:master-gpu-vulkan-voice-detect +- !!merge <<: *voicedetect + name: "cuda13-voice-detect" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-voice-detect" + mirrors: + - localai/localai-backends:latest-gpu-nvidia-cuda-13-voice-detect +- !!merge <<: *voicedetect + name: "cuda13-voice-detect-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-voice-detect" + mirrors: + - localai/localai-backends:master-gpu-nvidia-cuda-13-voice-detect ## stablediffusion-ggml - !!merge <<: *stablediffusionggml name: "cpu-stablediffusion-ggml" diff --git a/core/config/backend_capabilities.go b/core/config/backend_capabilities.go index cc9567887..201f2c267 100644 --- a/core/config/backend_capabilities.go +++ b/core/config/backend_capabilities.go @@ -542,6 +542,12 @@ var BackendCapabilities = map[string]BackendCapability{ DefaultUsecases: []string{UsecaseSpeakerRecognition}, Description: "Speaker recognition — voice identity verification and analysis", }, + "voice-detect": { + GRPCMethods: []GRPCMethod{MethodVoiceVerify, MethodVoiceEmbed, MethodVoiceAnalyze}, + PossibleUsecases: []string{UsecaseSpeakerRecognition}, + DefaultUsecases: []string{UsecaseSpeakerRecognition}, + Description: "voice-detect.cpp — C++/ggml speaker embedding, verification and voice analysis (age/gender/emotion)", + }, "silero-vad": { GRPCMethods: []GRPCMethod{MethodVAD}, PossibleUsecases: []string{UsecaseVAD}, diff --git a/gallery/index.yaml b/gallery/index.yaml index cde505d72..43828302d 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -8518,6 +8518,183 @@ - filename: wespeaker_voxceleb_resnet34.onnx sha256: 7bb2f06e9df17cdf1ef14ee8a15ab08ed28e8d0ef5054ee135741560df2ec068 uri: https://huggingface.co/Wespeaker/wespeaker-voxceleb-resnet34-LM/resolve/main/voxceleb_resnet34_LM.onnx +- name: voice-detect-ecapa-tdnn + url: github:mudler/LocalAI/gallery/virtual.yaml@master + urls: + - https://github.com/mudler/voice-detect.cpp + - https://huggingface.co/speechbrain/spkrec-ecapa-voxceleb + description: | + Speaker (voice) recognition with SpeechBrain's ECAPA-TDNN trained + on VoxCeleb, ported to C++/ggml and shipped as a single GGUF for the + `voice-detect` backend. 192-d L2-normalised embeddings, ~1.9% Equal + Error Rate on VoxCeleb1-O. APACHE 2.0 — commercial-safe. + + No Python / torch runtime: voice-detect.cpp reads the embedding + architecture (`voicedetect.arch`) directly from the GGUF metadata, + so installing this entry is all that is needed to select ECAPA-TDNN. + Drives the VoiceVerify / VoiceEmbed gRPC rpcs and the + /v1/voice/{verify,embed,register,identify,forget} REST endpoints. + license: apache-2.0 + icon: https://avatars.githubusercontent.com/u/95302084 + tags: + - voice-recognition + - speaker-verification + - speaker-embedding + - commercial-ok + - cpu + - gpu + last_checked: "2026-06-22" + overrides: + backend: voice-detect + known_usecases: + - speaker_recognition + options: + - verify_threshold:0.25 + parameters: + model: voice-detect-ecapa-tdnn-voxceleb.gguf + files: + # TODO publish: fill sha256 after upload to mudler/voice-detect-gguf + - filename: voice-detect-ecapa-tdnn-voxceleb.gguf + uri: https://huggingface.co/mudler/voice-detect-gguf/resolve/main/ecapa-tdnn-voxceleb.gguf +- name: voice-detect-wespeaker-resnet34 + url: github:mudler/LocalAI/gallery/virtual.yaml@master + urls: + - https://github.com/mudler/voice-detect.cpp + - https://github.com/wenet-e2e/wespeaker + description: | + Speaker recognition with WeSpeaker's ResNet34 trained on VoxCeleb, + converted to a C++/ggml GGUF for the `voice-detect` backend. 256-d + embeddings, CPU-friendly and runtime-free (no onnxruntime or torch). + CC-BY-4.0. + + Use when you want WeSpeaker's ResNet34 topology instead of + ECAPA-TDNN. The embedding architecture (`voicedetect.arch`) is read + from the GGUF metadata, so this entry alone selects the engine. + license: cc-by-4.0 + icon: https://avatars.githubusercontent.com/u/95302084 + tags: + - voice-recognition + - speaker-verification + - speaker-embedding + - commercial-ok + - edge + - cpu + last_checked: "2026-06-22" + overrides: + backend: voice-detect + known_usecases: + - speaker_recognition + options: + - verify_threshold:0.25 + parameters: + model: voice-detect-wespeaker-resnet34.gguf + files: + # TODO publish: fill sha256 after upload to mudler/voice-detect-gguf + - filename: voice-detect-wespeaker-resnet34.gguf + uri: https://huggingface.co/mudler/voice-detect-gguf/resolve/main/wespeaker-resnet34-voxceleb.gguf +- name: voice-detect-eres2net + url: github:mudler/LocalAI/gallery/virtual.yaml@master + urls: + - https://github.com/mudler/voice-detect.cpp + - https://huggingface.co/iic/speech_eres2net_sv_en_voxceleb_16k + description: | + Speaker recognition with 3D-Speaker's ERes2Net trained on VoxCeleb, + converted to a C++/ggml GGUF for the `voice-detect` backend. + 192-d embeddings with strong verification accuracy. APACHE 2.0. + + The embedding architecture (`voicedetect.arch`) is read from the + GGUF metadata, so this entry alone selects the ERes2Net engine. + license: apache-2.0 + icon: https://avatars.githubusercontent.com/u/95302084 + tags: + - voice-recognition + - speaker-verification + - speaker-embedding + - commercial-ok + - cpu + - gpu + last_checked: "2026-06-22" + overrides: + backend: voice-detect + known_usecases: + - speaker_recognition + options: + - verify_threshold:0.25 + parameters: + model: voice-detect-eres2net.gguf + files: + # TODO publish: fill sha256 after upload to mudler/voice-detect-gguf + - filename: voice-detect-eres2net.gguf + uri: https://huggingface.co/mudler/voice-detect-gguf/resolve/main/eres2net-voxceleb.gguf +- name: voice-detect-campplus + url: github:mudler/LocalAI/gallery/virtual.yaml@master + urls: + - https://github.com/mudler/voice-detect.cpp + - https://huggingface.co/iic/speech_campplus_sv_en_voxceleb_16k + description: | + Speaker recognition with 3D-Speaker's CAM++ trained on VoxCeleb, + converted to a C++/ggml GGUF for the `voice-detect` backend. 192-d + embeddings, a fast context-aware masking topology well-suited to + CPU and edge deployments. APACHE 2.0. + + The embedding architecture (`voicedetect.arch`) is read from the + GGUF metadata, so this entry alone selects the CAM++ engine. + license: apache-2.0 + icon: https://avatars.githubusercontent.com/u/95302084 + tags: + - voice-recognition + - speaker-verification + - speaker-embedding + - commercial-ok + - edge + - cpu + last_checked: "2026-06-22" + overrides: + backend: voice-detect + known_usecases: + - speaker_recognition + options: + - verify_threshold:0.25 + parameters: + model: voice-detect-campplus.gguf + files: + # TODO publish: fill sha256 after upload to mudler/voice-detect-gguf + - filename: voice-detect-campplus.gguf + uri: https://huggingface.co/mudler/voice-detect-gguf/resolve/main/campplus-voxceleb.gguf +- name: voice-detect-emotion-wav2vec2 + url: github:mudler/LocalAI/gallery/virtual.yaml@master + urls: + - https://github.com/mudler/voice-detect.cpp + - https://huggingface.co/audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim + description: | + Voice analysis (age / gender / emotion) with audEERING's wav2vec2 + model, converted to a C++/ggml GGUF for the `voice-detect` backend. + Drives the VoiceAnalyze gRPC rpc and the /v1/voice/analyze REST + endpoint, returning a continuous age estimate plus gender and + emotion class scores for a single utterance. CC-BY-NC-SA-4.0 — + research / non-commercial use only. + + The analysis architecture (`voicedetect.arch`) is read from the + GGUF metadata, so this entry alone selects the wav2vec2 analyze head. + license: cc-by-nc-sa-4.0 + icon: https://avatars.githubusercontent.com/u/95302084 + tags: + - voice-recognition + - voice-analysis + - emotion-recognition + - cpu + - gpu + last_checked: "2026-06-22" + overrides: + backend: voice-detect + known_usecases: + - speaker_recognition + parameters: + model: voice-detect-emotion-wav2vec2.gguf + files: + # TODO publish: fill sha256 after upload to mudler/voice-detect-gguf + - filename: voice-detect-emotion-wav2vec2.gguf + uri: https://huggingface.co/mudler/voice-detect-gguf/resolve/main/emotion-wav2vec2-msp.gguf - name: rfdetr-base url: github:mudler/LocalAI/gallery/virtual.yaml@master urls: