mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-22 07:39:02 -04:00
feat(voice-detect): wire backend into index, gallery and build
Register the voice-detect.cpp speaker-recognition + voice-analysis backend (added in Voice-INT-A) into LocalAI's distribution surfaces, mirroring the ced backend (the closest mudler C++/ggml audio analogue):

- backend/index.yaml: add the &voicedetect meta-backend (capabilities platform map, no top-level uri) plus the full set of concrete per-arch image entries (cpu/cuda12/cuda13/metal/rocm/sycl/vulkan/l4t and the -development variants). Referential integrity audited - every alias target resolves.
- gallery/index.yaml: add 5 model entries on backend voice-detect - ECAPA-TDNN, WeSpeaker ResNet34, 3D-Speaker ERes2Net, CAM++ and the wav2vec2 age/gender/emotion analyze model. The engine architecture is read from GGUF metadata (voicedetect.arch) at load. GGUF artifacts are not yet published: each files: entry points at the intended mudler/voice-detect-gguf location with a TODO to fill sha256 after upload (no fabricated hashes).
- .github/backend-matrix.yml: add the linux build matrix block + the darwin metal entry mirroring ced.
- .github/workflows/bump_deps.yaml: track mudler/voice-detect.cpp via VOICEDETECT_VERSION (pin 47546430, = 4754643).
- core/config/backend_capabilities.go: register voice-detect in the backend capability map (VoiceVerify/VoiceEmbed/VoiceAnalyze -> speaker_recognition), mirroring speaker-recognition.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]
This commit is contained in:
152
.github/backend-matrix.yml
vendored
152
.github/backend-matrix.yml
vendored
@@ -3723,6 +3723,154 @@ include:
|
||||
dockerfile: "./backend/Dockerfile.golang"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
# voice-detect
|
||||
- build-type: 'cublas'
|
||||
cuda-major-version: "12"
|
||||
cuda-minor-version: "8"
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-gpu-nvidia-cuda-12-voice-detect'
|
||||
runs-on: 'ubuntu-latest'
|
||||
base-image: "ubuntu:24.04"
|
||||
skip-drivers: 'false'
|
||||
backend: "voice-detect"
|
||||
dockerfile: "./backend/Dockerfile.golang"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: 'cublas'
|
||||
cuda-major-version: "13"
|
||||
cuda-minor-version: "0"
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-gpu-nvidia-cuda-13-voice-detect'
|
||||
runs-on: 'ubuntu-latest'
|
||||
base-image: "ubuntu:24.04"
|
||||
skip-drivers: 'false'
|
||||
backend: "voice-detect"
|
||||
dockerfile: "./backend/Dockerfile.golang"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: 'cublas'
|
||||
cuda-major-version: "13"
|
||||
cuda-minor-version: "0"
|
||||
platforms: 'linux/arm64'
|
||||
skip-drivers: 'false'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-nvidia-l4t-cuda-13-arm64-voice-detect'
|
||||
base-image: "ubuntu:24.04"
|
||||
ubuntu-version: '2404'
|
||||
runs-on: 'ubuntu-24.04-arm'
|
||||
backend: "voice-detect"
|
||||
dockerfile: "./backend/Dockerfile.golang"
|
||||
context: "./"
|
||||
- build-type: ''
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
platforms: 'linux/amd64'
|
||||
platform-tag: 'amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-cpu-voice-detect'
|
||||
runs-on: 'ubuntu-latest'
|
||||
base-image: "ubuntu:24.04"
|
||||
skip-drivers: 'false'
|
||||
backend: "voice-detect"
|
||||
dockerfile: "./backend/Dockerfile.golang"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: ''
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
platforms: 'linux/arm64'
|
||||
platform-tag: 'arm64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-cpu-voice-detect'
|
||||
runs-on: 'ubuntu-24.04-arm'
|
||||
base-image: "ubuntu:24.04"
|
||||
skip-drivers: 'false'
|
||||
backend: "voice-detect"
|
||||
dockerfile: "./backend/Dockerfile.golang"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: 'sycl_f32'
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-gpu-intel-sycl-f32-voice-detect'
|
||||
runs-on: 'ubuntu-latest'
|
||||
base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
|
||||
skip-drivers: 'false'
|
||||
backend: "voice-detect"
|
||||
dockerfile: "./backend/Dockerfile.golang"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: 'sycl_f16'
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-gpu-intel-sycl-f16-voice-detect'
|
||||
runs-on: 'ubuntu-latest'
|
||||
base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
|
||||
skip-drivers: 'false'
|
||||
backend: "voice-detect"
|
||||
dockerfile: "./backend/Dockerfile.golang"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: 'vulkan'
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
platforms: 'linux/amd64'
|
||||
platform-tag: 'amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-gpu-vulkan-voice-detect'
|
||||
runs-on: 'ubuntu-latest'
|
||||
base-image: "ubuntu:24.04"
|
||||
skip-drivers: 'false'
|
||||
backend: "voice-detect"
|
||||
dockerfile: "./backend/Dockerfile.golang"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: 'vulkan'
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
platforms: 'linux/arm64'
|
||||
platform-tag: 'arm64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-gpu-vulkan-voice-detect'
|
||||
runs-on: 'ubuntu-24.04-arm'
|
||||
base-image: "ubuntu:24.04"
|
||||
skip-drivers: 'false'
|
||||
backend: "voice-detect"
|
||||
dockerfile: "./backend/Dockerfile.golang"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: 'cublas'
|
||||
cuda-major-version: "12"
|
||||
cuda-minor-version: "0"
|
||||
platforms: 'linux/arm64'
|
||||
skip-drivers: 'false'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-nvidia-l4t-arm64-voice-detect'
|
||||
base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
|
||||
runs-on: 'ubuntu-24.04-arm'
|
||||
backend: "voice-detect"
|
||||
dockerfile: "./backend/Dockerfile.golang"
|
||||
context: "./"
|
||||
ubuntu-version: '2204'
|
||||
- build-type: 'hipblas'
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-gpu-rocm-hipblas-voice-detect'
|
||||
base-image: "rocm/dev-ubuntu-24.04:7.2.1"
|
||||
runs-on: 'ubuntu-latest'
|
||||
skip-drivers: 'false'
|
||||
backend: "voice-detect"
|
||||
dockerfile: "./backend/Dockerfile.golang"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
# acestep-cpp
|
||||
- build-type: ''
|
||||
cuda-major-version: ""
|
||||
@@ -4906,6 +5054,10 @@ includeDarwin:
|
||||
tag-suffix: "-metal-darwin-arm64-ced"
|
||||
build-type: "metal"
|
||||
lang: "go"
|
||||
- backend: "voice-detect"
|
||||
tag-suffix: "-metal-darwin-arm64-voice-detect"
|
||||
build-type: "metal"
|
||||
lang: "go"
|
||||
- backend: "acestep-cpp"
|
||||
tag-suffix: "-metal-darwin-arm64-acestep-cpp"
|
||||
build-type: "metal"
|
||||
|
||||
4
.github/workflows/bump_deps.yaml
vendored
4
.github/workflows/bump_deps.yaml
vendored
@@ -46,6 +46,10 @@ jobs:
|
||||
variable: "CED_VERSION"
|
||||
branch: "master"
|
||||
file: "backend/go/ced/Makefile"
|
||||
- repository: "mudler/voice-detect.cpp"
|
||||
variable: "VOICEDETECT_VERSION"
|
||||
branch: "master"
|
||||
file: "backend/go/voice-detect/Makefile"
|
||||
- repository: "mudler/depth-anything.cpp"
|
||||
variable: "DEPTHANYTHING_VERSION"
|
||||
branch: "master"
|
||||
|
||||
@@ -209,6 +209,42 @@
|
||||
nvidia-cuda-12: "cuda12-ced"
|
||||
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-ced"
|
||||
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-ced"
|
||||
- &voicedetect
|
||||
name: "voice-detect"
|
||||
alias: "voice-detect"
|
||||
license: mit
|
||||
icon: https://avatars.githubusercontent.com/u/95302084
|
||||
description: |
|
||||
voice-detect speaker recognition and voice analysis.
|
||||
voice-detect.cpp is a C++/ggml engine that produces L2-normalised
|
||||
speaker embeddings (ECAPA-TDNN, WeSpeaker ResNet34, 3D-Speaker
|
||||
ERes2Net, CAM++) for voice verification and 1:N identification, plus
|
||||
a wav2vec2 age / gender / emotion analysis head. It replaces the
|
||||
Python speaker-recognition backend and is exposed through the Voice*
|
||||
gRPC rpcs and the /v1/voice/* REST endpoints. It runs on CPU, NVIDIA
|
||||
CUDA, AMD ROCm/HIP, Intel SYCL, Vulkan, Apple Metal and NVIDIA Jetson (L4T) targets.
|
||||
urls:
|
||||
- https://github.com/mudler/voice-detect.cpp
|
||||
tags:
|
||||
- voice-recognition
|
||||
- speaker-verification
|
||||
- speaker-embedding
|
||||
- CPU
|
||||
- GPU
|
||||
- CUDA
|
||||
- HIP
|
||||
capabilities:
|
||||
default: "cpu-voice-detect"
|
||||
nvidia: "cuda12-voice-detect"
|
||||
intel: "intel-sycl-f16-voice-detect"
|
||||
metal: "metal-voice-detect"
|
||||
amd: "rocm-voice-detect"
|
||||
vulkan: "vulkan-voice-detect"
|
||||
nvidia-l4t: "nvidia-l4t-arm64-voice-detect"
|
||||
nvidia-cuda-13: "cuda13-voice-detect"
|
||||
nvidia-cuda-12: "cuda12-voice-detect"
|
||||
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-voice-detect"
|
||||
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-voice-detect"
|
||||
- &voxtral
|
||||
name: "voxtral"
|
||||
alias: "voxtral"
|
||||
@@ -2796,6 +2832,121 @@
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-ced"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-gpu-nvidia-cuda-13-ced
|
||||
## voice-detect
|
||||
- !!merge <<: *voicedetect
|
||||
name: "voice-detect-development"
|
||||
capabilities:
|
||||
default: "cpu-voice-detect-development"
|
||||
nvidia: "cuda12-voice-detect-development"
|
||||
intel: "intel-sycl-f16-voice-detect-development"
|
||||
metal: "metal-voice-detect-development"
|
||||
amd: "rocm-voice-detect-development"
|
||||
vulkan: "vulkan-voice-detect-development"
|
||||
nvidia-l4t: "nvidia-l4t-arm64-voice-detect-development"
|
||||
nvidia-cuda-13: "cuda13-voice-detect-development"
|
||||
nvidia-cuda-12: "cuda12-voice-detect-development"
|
||||
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-voice-detect-development"
|
||||
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-voice-detect-development"
|
||||
- !!merge <<: *voicedetect
|
||||
name: "nvidia-l4t-arm64-voice-detect"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-arm64-voice-detect"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-nvidia-l4t-arm64-voice-detect
|
||||
- !!merge <<: *voicedetect
|
||||
name: "nvidia-l4t-arm64-voice-detect-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-arm64-voice-detect"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-nvidia-l4t-arm64-voice-detect
|
||||
- !!merge <<: *voicedetect
|
||||
name: "cuda13-nvidia-l4t-arm64-voice-detect"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-voice-detect"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-voice-detect
|
||||
- !!merge <<: *voicedetect
|
||||
name: "cuda13-nvidia-l4t-arm64-voice-detect-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-voice-detect"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-voice-detect
|
||||
- !!merge <<: *voicedetect
|
||||
name: "cpu-voice-detect"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-voice-detect"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-cpu-voice-detect
|
||||
- !!merge <<: *voicedetect
|
||||
name: "cpu-voice-detect-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-cpu-voice-detect"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-cpu-voice-detect
|
||||
- !!merge <<: *voicedetect
|
||||
name: "metal-voice-detect"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-voice-detect"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-metal-darwin-arm64-voice-detect
|
||||
- !!merge <<: *voicedetect
|
||||
name: "metal-voice-detect-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-voice-detect"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-metal-darwin-arm64-voice-detect
|
||||
- !!merge <<: *voicedetect
|
||||
name: "cuda12-voice-detect"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-voice-detect"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-gpu-nvidia-cuda-12-voice-detect
|
||||
- !!merge <<: *voicedetect
|
||||
name: "cuda12-voice-detect-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-voice-detect"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-gpu-nvidia-cuda-12-voice-detect
|
||||
- !!merge <<: *voicedetect
|
||||
name: "rocm-voice-detect"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-voice-detect"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-gpu-rocm-hipblas-voice-detect
|
||||
- !!merge <<: *voicedetect
|
||||
name: "rocm-voice-detect-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-voice-detect"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-gpu-rocm-hipblas-voice-detect
|
||||
- !!merge <<: *voicedetect
|
||||
name: "intel-sycl-f32-voice-detect"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-voice-detect"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-gpu-intel-sycl-f32-voice-detect
|
||||
- !!merge <<: *voicedetect
|
||||
name: "intel-sycl-f32-voice-detect-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f32-voice-detect"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-gpu-intel-sycl-f32-voice-detect
|
||||
- !!merge <<: *voicedetect
|
||||
name: "intel-sycl-f16-voice-detect"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-voice-detect"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-gpu-intel-sycl-f16-voice-detect
|
||||
- !!merge <<: *voicedetect
|
||||
name: "intel-sycl-f16-voice-detect-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f16-voice-detect"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-gpu-intel-sycl-f16-voice-detect
|
||||
- !!merge <<: *voicedetect
|
||||
name: "vulkan-voice-detect"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-vulkan-voice-detect"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-gpu-vulkan-voice-detect
|
||||
- !!merge <<: *voicedetect
|
||||
name: "vulkan-voice-detect-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-vulkan-voice-detect"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-gpu-vulkan-voice-detect
|
||||
- !!merge <<: *voicedetect
|
||||
name: "cuda13-voice-detect"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-voice-detect"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-gpu-nvidia-cuda-13-voice-detect
|
||||
- !!merge <<: *voicedetect
|
||||
name: "cuda13-voice-detect-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-voice-detect"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-gpu-nvidia-cuda-13-voice-detect
|
||||
## stablediffusion-ggml
|
||||
- !!merge <<: *stablediffusionggml
|
||||
name: "cpu-stablediffusion-ggml"
|
||||
|
||||
@@ -542,6 +542,12 @@ var BackendCapabilities = map[string]BackendCapability{
|
||||
DefaultUsecases: []string{UsecaseSpeakerRecognition},
|
||||
Description: "Speaker recognition — voice identity verification and analysis",
|
||||
},
|
||||
"voice-detect": {
|
||||
GRPCMethods: []GRPCMethod{MethodVoiceVerify, MethodVoiceEmbed, MethodVoiceAnalyze},
|
||||
PossibleUsecases: []string{UsecaseSpeakerRecognition},
|
||||
DefaultUsecases: []string{UsecaseSpeakerRecognition},
|
||||
Description: "voice-detect.cpp — C++/ggml speaker embedding, verification and voice analysis (age/gender/emotion)",
|
||||
},
|
||||
"silero-vad": {
|
||||
GRPCMethods: []GRPCMethod{MethodVAD},
|
||||
PossibleUsecases: []string{UsecaseVAD},
|
||||
|
||||
@@ -8518,6 +8518,183 @@
|
||||
- filename: wespeaker_voxceleb_resnet34.onnx
|
||||
sha256: 7bb2f06e9df17cdf1ef14ee8a15ab08ed28e8d0ef5054ee135741560df2ec068
|
||||
uri: https://huggingface.co/Wespeaker/wespeaker-voxceleb-resnet34-LM/resolve/main/voxceleb_resnet34_LM.onnx
|
||||
- name: voice-detect-ecapa-tdnn
|
||||
url: github:mudler/LocalAI/gallery/virtual.yaml@master
|
||||
urls:
|
||||
- https://github.com/mudler/voice-detect.cpp
|
||||
- https://huggingface.co/speechbrain/spkrec-ecapa-voxceleb
|
||||
description: |
|
||||
Speaker (voice) recognition with SpeechBrain's ECAPA-TDNN trained
|
||||
on VoxCeleb, ported to C++/ggml and shipped as a single GGUF for the
|
||||
`voice-detect` backend. 192-d L2-normalised embeddings, ~1.9% Equal
|
||||
Error Rate on VoxCeleb1-O. APACHE 2.0 — commercial-safe.
|
||||
|
||||
No Python / torch runtime: voice-detect.cpp reads the embedding
|
||||
architecture (`voicedetect.arch`) directly from the GGUF metadata,
|
||||
so installing this entry is all that is needed to select ECAPA-TDNN.
|
||||
Drives the VoiceVerify / VoiceEmbed gRPC rpcs and the
|
||||
/v1/voice/{verify,embed,register,identify,forget} REST endpoints.
|
||||
license: apache-2.0
|
||||
icon: https://avatars.githubusercontent.com/u/95302084
|
||||
tags:
|
||||
- voice-recognition
|
||||
- speaker-verification
|
||||
- speaker-embedding
|
||||
- commercial-ok
|
||||
- cpu
|
||||
- gpu
|
||||
last_checked: "2026-06-22"
|
||||
overrides:
|
||||
backend: voice-detect
|
||||
known_usecases:
|
||||
- speaker_recognition
|
||||
options:
|
||||
- verify_threshold:0.25
|
||||
parameters:
|
||||
model: voice-detect-ecapa-tdnn-voxceleb.gguf
|
||||
files:
|
||||
# TODO publish: fill sha256 after upload to mudler/voice-detect-gguf
|
||||
- filename: voice-detect-ecapa-tdnn-voxceleb.gguf
|
||||
uri: https://huggingface.co/mudler/voice-detect-gguf/resolve/main/ecapa-tdnn-voxceleb.gguf
|
||||
- name: voice-detect-wespeaker-resnet34
|
||||
url: github:mudler/LocalAI/gallery/virtual.yaml@master
|
||||
urls:
|
||||
- https://github.com/mudler/voice-detect.cpp
|
||||
- https://github.com/wenet-e2e/wespeaker
|
||||
description: |
|
||||
Speaker recognition with WeSpeaker's ResNet34 trained on VoxCeleb,
|
||||
converted to a C++/ggml GGUF for the `voice-detect` backend. 256-d
|
||||
embeddings, CPU-friendly and runtime-free (no onnxruntime or torch).
|
||||
CC-BY-4.0.
|
||||
|
||||
Use when you want WeSpeaker's ResNet34 topology instead of
|
||||
ECAPA-TDNN. The embedding architecture (`voicedetect.arch`) is read
|
||||
from the GGUF metadata, so this entry alone selects the engine.
|
||||
license: cc-by-4.0
|
||||
icon: https://avatars.githubusercontent.com/u/95302084
|
||||
tags:
|
||||
- voice-recognition
|
||||
- speaker-verification
|
||||
- speaker-embedding
|
||||
- commercial-ok
|
||||
- edge
|
||||
- cpu
|
||||
last_checked: "2026-06-22"
|
||||
overrides:
|
||||
backend: voice-detect
|
||||
known_usecases:
|
||||
- speaker_recognition
|
||||
options:
|
||||
- verify_threshold:0.25
|
||||
parameters:
|
||||
model: voice-detect-wespeaker-resnet34.gguf
|
||||
files:
|
||||
# TODO publish: fill sha256 after upload to mudler/voice-detect-gguf
|
||||
- filename: voice-detect-wespeaker-resnet34.gguf
|
||||
uri: https://huggingface.co/mudler/voice-detect-gguf/resolve/main/wespeaker-resnet34-voxceleb.gguf
|
||||
- name: voice-detect-eres2net
|
||||
url: github:mudler/LocalAI/gallery/virtual.yaml@master
|
||||
urls:
|
||||
- https://github.com/mudler/voice-detect.cpp
|
||||
- https://huggingface.co/iic/speech_eres2net_sv_en_voxceleb_16k
|
||||
description: |
|
||||
Speaker recognition with 3D-Speaker's ERes2Net trained on VoxCeleb,
|
||||
converted to a C++/ggml GGUF for the `voice-detect` backend.
|
||||
192-d embeddings with strong verification accuracy. APACHE 2.0.
|
||||
|
||||
The embedding architecture (`voicedetect.arch`) is read from the
|
||||
GGUF metadata, so this entry alone selects the ERes2Net engine.
|
||||
license: apache-2.0
|
||||
icon: https://avatars.githubusercontent.com/u/95302084
|
||||
tags:
|
||||
- voice-recognition
|
||||
- speaker-verification
|
||||
- speaker-embedding
|
||||
- commercial-ok
|
||||
- cpu
|
||||
- gpu
|
||||
last_checked: "2026-06-22"
|
||||
overrides:
|
||||
backend: voice-detect
|
||||
known_usecases:
|
||||
- speaker_recognition
|
||||
options:
|
||||
- verify_threshold:0.25
|
||||
parameters:
|
||||
model: voice-detect-eres2net.gguf
|
||||
files:
|
||||
# TODO publish: fill sha256 after upload to mudler/voice-detect-gguf
|
||||
- filename: voice-detect-eres2net.gguf
|
||||
uri: https://huggingface.co/mudler/voice-detect-gguf/resolve/main/eres2net-voxceleb.gguf
|
||||
- name: voice-detect-campplus
|
||||
url: github:mudler/LocalAI/gallery/virtual.yaml@master
|
||||
urls:
|
||||
- https://github.com/mudler/voice-detect.cpp
|
||||
- https://huggingface.co/iic/speech_campplus_sv_en_voxceleb_16k
|
||||
description: |
|
||||
Speaker recognition with 3D-Speaker's CAM++ trained on VoxCeleb,
|
||||
converted to a C++/ggml GGUF for the `voice-detect` backend. 192-d
|
||||
embeddings, a fast context-aware masking topology well-suited to
|
||||
CPU and edge deployments. APACHE 2.0.
|
||||
|
||||
The embedding architecture (`voicedetect.arch`) is read from the
|
||||
GGUF metadata, so this entry alone selects the CAM++ engine.
|
||||
license: apache-2.0
|
||||
icon: https://avatars.githubusercontent.com/u/95302084
|
||||
tags:
|
||||
- voice-recognition
|
||||
- speaker-verification
|
||||
- speaker-embedding
|
||||
- commercial-ok
|
||||
- edge
|
||||
- cpu
|
||||
last_checked: "2026-06-22"
|
||||
overrides:
|
||||
backend: voice-detect
|
||||
known_usecases:
|
||||
- speaker_recognition
|
||||
options:
|
||||
- verify_threshold:0.25
|
||||
parameters:
|
||||
model: voice-detect-campplus.gguf
|
||||
files:
|
||||
# TODO publish: fill sha256 after upload to mudler/voice-detect-gguf
|
||||
- filename: voice-detect-campplus.gguf
|
||||
uri: https://huggingface.co/mudler/voice-detect-gguf/resolve/main/campplus-voxceleb.gguf
|
||||
- name: voice-detect-emotion-wav2vec2
|
||||
url: github:mudler/LocalAI/gallery/virtual.yaml@master
|
||||
urls:
|
||||
- https://github.com/mudler/voice-detect.cpp
|
||||
- https://huggingface.co/audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim
|
||||
description: |
|
||||
Voice analysis (age / gender / emotion) with audEERING's wav2vec2
|
||||
model, converted to a C++/ggml GGUF for the `voice-detect` backend.
|
||||
Drives the VoiceAnalyze gRPC rpc and the /v1/voice/analyze REST
|
||||
endpoint, returning a continuous age estimate plus gender and
|
||||
emotion class scores for a single utterance. CC-BY-NC-SA-4.0 —
|
||||
research / non-commercial use only.
|
||||
|
||||
The analysis architecture (`voicedetect.arch`) is read from the
|
||||
GGUF metadata, so this entry alone selects the wav2vec2 analyze head.
|
||||
license: cc-by-nc-sa-4.0
|
||||
icon: https://avatars.githubusercontent.com/u/95302084
|
||||
tags:
|
||||
- voice-recognition
|
||||
- voice-analysis
|
||||
- emotion-recognition
|
||||
- cpu
|
||||
- gpu
|
||||
last_checked: "2026-06-22"
|
||||
overrides:
|
||||
backend: voice-detect
|
||||
known_usecases:
|
||||
- speaker_recognition
|
||||
parameters:
|
||||
model: voice-detect-emotion-wav2vec2.gguf
|
||||
files:
|
||||
# TODO publish: fill sha256 after upload to mudler/voice-detect-gguf
|
||||
- filename: voice-detect-emotion-wav2vec2.gguf
|
||||
uri: https://huggingface.co/mudler/voice-detect-gguf/resolve/main/emotion-wav2vec2-msp.gguf
|
||||
- name: rfdetr-base
|
||||
url: github:mudler/LocalAI/gallery/virtual.yaml@master
|
||||
urls:
|
||||
|
||||
Reference in New Issue
Block a user