mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-22 07:39:02 -04:00
feat(voice-detect): add Go purego backend for voice-detect.cpp
Add backend/go/voice-detect implementing the Backend gRPC voice subset (VoiceEmbed/VoiceVerify/VoiceAnalyze) over libvoicedetect.so via purego, mirroring the parakeet-cpp / omnivoice-cpp backends. The flat voicedetect_capi C ABI is dlopen'd cgo-less; malloc'd string and float-vector returns are owned by Go and released through the matching capi free functions, with the per-ctx last error surfaced into Go errors. Calls are serialized via base.SingleThread since the C context is not reentrant. Proto field mapping: - VoiceEmbed: VoiceEmbedRequest.audio (path) -> embed_path -> Embedding+Model. - VoiceVerify: audio1/audio2 + threshold (<=0 falls back to the verify_threshold option, default 0.25) -> verify_paths -> verified/distance/ threshold/confidence/model/processing_time_ms. - VoiceAnalyze: audio (path) -> analyze_path_json; the JSON age/gender/emotion document maps to a single VoiceAnalysis segment (start/end 0; gender "label" -> dominant_gender with the remaining float scores as the gender map; emotion label/scores -> dominant_emotion/emotion). The Makefile pins voice-detect.cpp to 47546430, clones+builds libvoicedetect.so with ggml static-linked (PIC, GGML_NATIVE off) so dlopen needs no external libggml/libvoicedetect; ldd on the artifact shows only system libs. Ginkgo tests cover option parsing and analyze-JSON mapping; embed/verify smoke specs gate on VOICEDETECT_BACKEND_TEST_MODEL + VOICEDETECT_BACKEND_TEST_WAV. Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:claude-opus-4-8 [Claude Code]
This commit is contained in:
18
backend/go/voice-detect/.gitignore
vendored
Normal file
18
backend/go/voice-detect/.gitignore
vendored
Normal file
@@ -0,0 +1,18 @@
|
||||
# Fetched upstream sources
|
||||
sources/
|
||||
|
||||
# CMake build directories
|
||||
build*/
|
||||
|
||||
# build artifacts staged in-tree by the Makefile (cp from sources/) or
|
||||
# symlinked for local dev; the real sources live in voice-detect.cpp upstream.
|
||||
*.so
|
||||
*.so.*
|
||||
voicedetect_capi.h
|
||||
compile_commands.json
|
||||
|
||||
# Compiled backend binary
|
||||
voice-detect-grpc
|
||||
|
||||
# Packaging output
|
||||
package/
|
||||
92
backend/go/voice-detect/Makefile
Normal file
92
backend/go/voice-detect/Makefile
Normal file
@@ -0,0 +1,92 @@
|
||||
# voice-detect backend Makefile.
|
||||
#
|
||||
# Upstream pin lives below as VOICEDETECT_VERSION?=4754643... (.github/bump_deps.sh
|
||||
# can find and update it - matches the parakeet.cpp / whisper.cpp / ds4 convention).
|
||||
#
|
||||
# Local dev shortcut: if you already have an out-of-tree voice-detect.cpp build,
|
||||
# symlink the .so + header into this directory and skip the clone/cmake steps:
|
||||
#
|
||||
# ln -sf /path/to/voice-detect.cpp/build-shared/libvoicedetect.so .
|
||||
# ln -sf /path/to/voice-detect.cpp/include/voicedetect_capi.h .
|
||||
# go build -o voice-detect-grpc .
|
||||
#
|
||||
# The default target below does the proper clone-at-pin + cmake build so CI does
|
||||
# not need a side-checkout.
|
||||
|
||||
VOICEDETECT_VERSION?=47546430ab0f23713c2f990e43035fb1106a9c74
|
||||
VOICEDETECT_REPO?=https://github.com/mudler/voice-detect.cpp
|
||||
|
||||
GOCMD?=go
|
||||
GO_TAGS?=
|
||||
JOBS?=$(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)
|
||||
|
||||
BUILD_TYPE?=
|
||||
NATIVE?=false
|
||||
|
||||
# Build ggml statically into libvoicedetect.so (PIC) so the shared lib is
|
||||
# self-contained: dlopen needs no libggml*.so alongside it, only system libs
|
||||
# (libstdc++/libgomp/libc) that the runtime image already provides.
|
||||
CMAKE_ARGS?=-DCMAKE_BUILD_TYPE=Release -DVOICEDETECT_SHARED=ON -DVOICEDETECT_BUILD_CLI=OFF -DVOICEDETECT_BUILD_TESTS=OFF -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON
|
||||
|
||||
ifeq ($(NATIVE),false)
|
||||
CMAKE_ARGS+=-DGGML_NATIVE=OFF
|
||||
endif
|
||||
|
||||
# voice-detect.cpp gates its GGML backends behind VOICEDETECT_GGML_* options and
|
||||
# does set(GGML_CUDA ${VOICEDETECT_GGML_CUDA} CACHE BOOL "" FORCE), so a bare
|
||||
# -DGGML_CUDA=ON is overwritten back to OFF. Forward the VOICEDETECT_GGML_*
|
||||
# options instead. (openblas is not gated, so -DGGML_BLAS passes through.)
|
||||
ifeq ($(BUILD_TYPE),cublas)
|
||||
CMAKE_ARGS+=-DVOICEDETECT_GGML_CUDA=ON
|
||||
else ifeq ($(BUILD_TYPE),openblas)
|
||||
CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
|
||||
else ifeq ($(BUILD_TYPE),hipblas)
|
||||
CMAKE_ARGS+=-DVOICEDETECT_GGML_HIP=ON
|
||||
else ifeq ($(BUILD_TYPE),vulkan)
|
||||
CMAKE_ARGS+=-DVOICEDETECT_GGML_VULKAN=ON
|
||||
endif
|
||||
|
||||
.PHONY: voice-detect-grpc package build clean purge test all
|
||||
|
||||
all: voice-detect-grpc
|
||||
|
||||
# Clone the upstream voice-detect.cpp source at the pinned commit. The directory
# itself is the target, so make only re-clones when it is missing. After a
# VOICEDETECT_VERSION bump, run 'make purge && make' to refetch.
#
# The git steps run in a subshell and the directory is removed again if any of
# them fails. Without that cleanup a failed fetch/checkout would leave an empty
# or half-populated directory behind, make would consider the target built, and
# every later run would go straight to cmake on a broken tree.
sources/voice-detect.cpp:
	mkdir -p $@
	( cd $@ && \
	git init -q && \
	git remote add origin $(VOICEDETECT_REPO) && \
	git fetch --depth 1 origin $(VOICEDETECT_VERSION) && \
	git checkout FETCH_HEAD && \
	git submodule update --init --recursive --depth 1 --single-branch ) || \
	{ rm -rf $@; exit 1; }
|
||||
|
||||
# Build the shared lib + header out-of-tree, then stage them next to the Go
# sources so purego.Dlopen("libvoicedetect.so") and the cgo-less build both pick
# them up.
#
# The copy of the .so is allowed to fail the rule: if the build did not produce
# build-shared/libvoicedetect.so*, stopping here is clearer than letting the Go
# build and package steps run without the library this target is named after.
libvoicedetect.so: sources/voice-detect.cpp
	cmake -B $</build-shared -S $< $(CMAKE_ARGS)
	cmake --build $</build-shared --config Release -j$(JOBS) --target voicedetect
	cp -fv $</build-shared/libvoicedetect.so* ./
	cp -fv $</include/voicedetect_capi.h ./
|
||||
|
||||
# Cgo-less build of the gRPC server binary. The shared library is only a
# prerequisite so it is staged before the binary; it is not linked in.
voice-detect-grpc: libvoicedetect.so main.go govoicedetect.go options.go
	CGO_ENABLED=0 $(GOCMD) build -tags "$(GO_TAGS)" -o $@ .

# Bundle the binary and its runtime libraries via package.sh.
package: voice-detect-grpc
	bash package.sh

# Alias: a full build ends with the packaged output.
build: package

# Run the Go test suite with this directory on the library search path. The
# end-to-end embed/verify specs need VOICEDETECT_BACKEND_TEST_MODEL and
# VOICEDETECT_BACKEND_TEST_WAV; without them those specs skip themselves and
# only the pure-Go parsing specs run.
test:
	LD_LIBRARY_PATH=$(CURDIR):$$LD_LIBRARY_PATH $(GOCMD) test ./... -count=1

# Remove everything the build staged in-tree (the fetched sources go via purge).
clean: purge
	rm -rf libvoicedetect.so* voicedetect_capi.h package voice-detect-grpc

# Drop the fetched upstream checkout so the next build re-clones it.
purge:
	rm -rf sources/voice-detect.cpp
|
||||
257
backend/go/voice-detect/govoicedetect.go
Normal file
257
backend/go/voice-detect/govoicedetect.go
Normal file
@@ -0,0 +1,257 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"math"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
"unsafe"
|
||||
|
||||
"github.com/mudler/LocalAI/pkg/grpc/base"
|
||||
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
|
||||
"github.com/mudler/xlog"
|
||||
)
|
||||
|
||||
// purego-bound entry points from libvoicedetect.so. Names match
// voicedetect_capi.h exactly so a `nm libvoicedetect.so | grep voicedetect_capi`
// is enough to spot drift.
//
// The opaque ctx and the malloc'd char*/float* return values are declared as
// uintptr so we get the raw pointer back and can release it via the matching
// capi free function. purego's native string/[]float32 returns would copy and
// forget the original pointer, leaking the C-owned buffer on every call.
//
// These variables are nil until the library has been opened and each one has
// been registered (main.go in the server, ensureLibLoaded in the tests).
var (
	// Bound to voicedetect_capi_abi_version.
	CppAbiVersion func() int32
	// Bound to voicedetect_capi_load; a zero return is treated as a load failure.
	CppLoad func(ggufPath string) uintptr
	// Bound to voicedetect_capi_free.
	CppFree func(ctx uintptr)
	// Bound to voicedetect_capi_last_error.
	CppLastError func(ctx uintptr) string
	// Bound to voicedetect_capi_free_string; releases strings such as the one
	// returned by CppAnalyzeJSON.
	CppFreeString func(s uintptr)
	// Bound to voicedetect_capi_free_vec; releases the vector written by CppEmbedPath.
	CppFreeVec func(v uintptr)
	// Bound to voicedetect_capi_embed_path; outVec/outDim are out-parameters.
	CppEmbedPath func(ctx uintptr, wavPath string, outVec, outDim unsafe.Pointer) int32
	// Bound to voicedetect_capi_embed_pcm.
	CppEmbedPCM func(ctx uintptr, pcm []float32, nSamples, sampleRate int32, outVec, outDim unsafe.Pointer) int32
	// Bound to voicedetect_capi_verify_paths; outDistance/outVerified are out-parameters.
	CppVerifyPaths func(ctx uintptr, a, b string, threshold float32, outDistance, outVerified unsafe.Pointer) int32
	// Bound to voicedetect_capi_analyze_path_json; a zero return is treated as failure.
	CppAnalyzeJSON func(ctx uintptr, wavPath string) uintptr
)
|
||||
|
||||
// VoiceDetect implements the speaker-recognition voice subset of the Backend
// gRPC service over libvoicedetect.so. The C side keeps a single loaded model
// plus a per-ctx last-error buffer and is not reentrant, so base.SingleThread
// serializes every call.
type VoiceDetect struct {
	base.SingleThread
	// opts holds the options parsed by Load (verify threshold, model name).
	opts loadOptions
	// ctxPtr is the opaque handle returned by CppLoad; it stays 0 until Load
	// succeeds, and the request methods reject calls while it is 0.
	ctxPtr uintptr
}
|
||||
|
||||
func (v *VoiceDetect) Load(opts *pb.ModelOptions) error {
|
||||
model := opts.ModelFile
|
||||
if model == "" {
|
||||
model = opts.ModelPath
|
||||
}
|
||||
if !filepath.IsAbs(model) && opts.ModelPath != "" {
|
||||
model = filepath.Join(opts.ModelPath, model)
|
||||
}
|
||||
if model == "" {
|
||||
return errors.New("voice-detect: ModelFile is required")
|
||||
}
|
||||
|
||||
v.opts = parseOptions(opts.Options)
|
||||
if v.opts.modelName == "" {
|
||||
v.opts.modelName = filepath.Base(model)
|
||||
}
|
||||
|
||||
xlog.Info("voice-detect: loading model", "model", model,
|
||||
"verify_threshold", v.opts.verifyThreshold, "abi", CppAbiVersion())
|
||||
|
||||
ctx := CppLoad(model)
|
||||
if ctx == 0 {
|
||||
// The last-error buffer lives on the ctx that was never returned, so
|
||||
// surface the path the operator tried to load instead.
|
||||
return fmt.Errorf("voice-detect: voicedetect_capi_load failed for %q", model)
|
||||
}
|
||||
v.ctxPtr = ctx
|
||||
return nil
|
||||
}
|
||||
|
||||
// VoiceEmbed returns the L2-normalized speaker embedding for an audio clip.
|
||||
// The request carries a filesystem PATH; the HTTP layer materializes
|
||||
// base64/URL/data-URI inputs to a temp file before the gRPC call.
|
||||
func (v *VoiceDetect) VoiceEmbed(req *pb.VoiceEmbedRequest) (pb.VoiceEmbedResponse, error) {
|
||||
if v.ctxPtr == 0 {
|
||||
return pb.VoiceEmbedResponse{}, errors.New("voice-detect: model not loaded")
|
||||
}
|
||||
if req.Audio == "" {
|
||||
return pb.VoiceEmbedResponse{}, errors.New("voice-detect: audio path is required")
|
||||
}
|
||||
emb, err := v.embedPath(req.Audio)
|
||||
if err != nil {
|
||||
return pb.VoiceEmbedResponse{}, err
|
||||
}
|
||||
return pb.VoiceEmbedResponse{Embedding: emb, Model: v.opts.modelName}, nil
|
||||
}
|
||||
|
||||
func (v *VoiceDetect) embedPath(path string) ([]float32, error) {
|
||||
var vec uintptr
|
||||
var dim int32
|
||||
rc := CppEmbedPath(v.ctxPtr, path, unsafe.Pointer(&vec), unsafe.Pointer(&dim))
|
||||
if rc != 0 || vec == 0 || dim <= 0 {
|
||||
return nil, v.lastErr("embed", path)
|
||||
}
|
||||
defer CppFreeVec(vec)
|
||||
// Copy out of the C-owned malloc'd buffer before freeing it. The
|
||||
// uintptr->Pointer conversion trips vet's unsafeptr check, which can't tell
|
||||
// a C heap pointer from Go-managed memory; safe here, the GC neither tracks
|
||||
// nor moves this buffer and we copy immediately.
|
||||
src := unsafe.Slice((*float32)(unsafe.Pointer(vec)), int(dim)) //nolint:govet // C-owned malloc'd vector, copied out before free
|
||||
out := make([]float32, int(dim))
|
||||
copy(out, src)
|
||||
return out, nil
|
||||
}
|
||||
|
||||
// VoiceVerify embeds two clips and reports whether they are the same speaker by
|
||||
// cosine distance against a threshold. A request threshold <= 0 falls back to
|
||||
// the model-configured default (verify_threshold option, 0.25 if unset).
|
||||
func (v *VoiceDetect) VoiceVerify(req *pb.VoiceVerifyRequest) (pb.VoiceVerifyResponse, error) {
|
||||
if v.ctxPtr == 0 {
|
||||
return pb.VoiceVerifyResponse{}, errors.New("voice-detect: model not loaded")
|
||||
}
|
||||
if req.Audio1 == "" || req.Audio2 == "" {
|
||||
return pb.VoiceVerifyResponse{}, errors.New("voice-detect: audio1 and audio2 are required")
|
||||
}
|
||||
|
||||
threshold := req.Threshold
|
||||
if threshold <= 0 {
|
||||
threshold = v.opts.verifyThreshold
|
||||
}
|
||||
|
||||
started := time.Now()
|
||||
var distance float32
|
||||
var verified int32
|
||||
rc := CppVerifyPaths(v.ctxPtr, req.Audio1, req.Audio2, threshold,
|
||||
unsafe.Pointer(&distance), unsafe.Pointer(&verified))
|
||||
if rc != 0 {
|
||||
return pb.VoiceVerifyResponse{}, v.lastErr("verify", req.Audio1+","+req.Audio2)
|
||||
}
|
||||
elapsedMs := float32(time.Since(started).Seconds() * 1000.0)
|
||||
|
||||
// Confidence decays linearly from 100 at distance 0 to 0 at the threshold,
|
||||
// matching the Python speaker-recognition backend's reporting.
|
||||
confidence := float32(0)
|
||||
if threshold > 0 {
|
||||
confidence = float32(math.Max(0, math.Min(100, (1.0-float64(distance)/float64(threshold))*100.0)))
|
||||
}
|
||||
|
||||
return pb.VoiceVerifyResponse{
|
||||
Verified: verified != 0,
|
||||
Distance: distance,
|
||||
Threshold: threshold,
|
||||
Confidence: confidence,
|
||||
Model: v.opts.modelName,
|
||||
ProcessingTimeMs: elapsedMs,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// VoiceAnalyze runs the age/gender/emotion heads on a single clip. The C-API
|
||||
// always evaluates every supported head, so the request's actions filter is
|
||||
// advisory and the full analysis is returned as a single segment (the engine
|
||||
// does not produce time-bounded segments).
|
||||
func (v *VoiceDetect) VoiceAnalyze(req *pb.VoiceAnalyzeRequest) (pb.VoiceAnalyzeResponse, error) {
|
||||
if v.ctxPtr == 0 {
|
||||
return pb.VoiceAnalyzeResponse{}, errors.New("voice-detect: model not loaded")
|
||||
}
|
||||
if req.Audio == "" {
|
||||
return pb.VoiceAnalyzeResponse{}, errors.New("voice-detect: audio path is required")
|
||||
}
|
||||
|
||||
ptr := CppAnalyzeJSON(v.ctxPtr, req.Audio)
|
||||
if ptr == 0 {
|
||||
return pb.VoiceAnalyzeResponse{}, v.lastErr("analyze", req.Audio)
|
||||
}
|
||||
defer CppFreeString(ptr)
|
||||
|
||||
seg, err := parseAnalyzeJSON(goStringFromCPtr(ptr))
|
||||
if err != nil {
|
||||
return pb.VoiceAnalyzeResponse{}, fmt.Errorf("voice-detect: analyze JSON for %q: %w", req.Audio, err)
|
||||
}
|
||||
return pb.VoiceAnalyzeResponse{Segments: []*pb.VoiceAnalysis{seg}}, nil
|
||||
}
|
||||
|
||||
// analyzeJSON mirrors the document returned by voicedetect_capi_analyze_path_json:
//
//	{"age":42.0,
//	 "gender":{"label":"female","female":0.88,"male":0.12},
//	 "emotion":{"label":"neutral","scores":{"neutral":0.7, ...}}}
//
// gender is a mixed object (a "label" string plus per-class float scores), so
// it is decoded into raw messages and split in parseAnalyzeJSON.
type analyzeJSON struct {
	// Age is the top-level "age" number.
	Age float32 `json:"age"`
	// Gender keeps every member of the "gender" object undecoded, because the
	// "label" member is a string while the other members are numbers.
	Gender map[string]json.RawMessage `json:"gender"`
	// Emotion is the "emotion" object: a label plus a score per class.
	Emotion struct {
		Label  string             `json:"label"`
		Scores map[string]float32 `json:"scores"`
	} `json:"emotion"`
}
|
||||
|
||||
// parseAnalyzeJSON maps the engine's analyze document onto a VoiceAnalysis.
|
||||
// start/end stay 0: the model emits a single whole-utterance result, not
|
||||
// time-bounded segments.
|
||||
func parseAnalyzeJSON(doc string) (*pb.VoiceAnalysis, error) {
|
||||
var a analyzeJSON
|
||||
if err := json.Unmarshal([]byte(doc), &a); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
seg := &pb.VoiceAnalysis{
|
||||
Age: a.Age,
|
||||
DominantEmotion: a.Emotion.Label,
|
||||
Emotion: a.Emotion.Scores,
|
||||
}
|
||||
|
||||
if len(a.Gender) > 0 {
|
||||
gender := make(map[string]float32, len(a.Gender))
|
||||
for k, raw := range a.Gender {
|
||||
if k == "label" {
|
||||
_ = json.Unmarshal(raw, &seg.DominantGender)
|
||||
continue
|
||||
}
|
||||
var score float32
|
||||
if err := json.Unmarshal(raw, &score); err == nil {
|
||||
gender[k] = score
|
||||
}
|
||||
}
|
||||
seg.Gender = gender
|
||||
}
|
||||
|
||||
return seg, nil
|
||||
}
|
||||
|
||||
// lastErr wraps the C-API's per-ctx last-error buffer into a Go error.
|
||||
func (v *VoiceDetect) lastErr(op, subject string) error {
|
||||
msg := strings.TrimSpace(CppLastError(v.ctxPtr))
|
||||
if msg == "" {
|
||||
msg = "no error detail"
|
||||
}
|
||||
return fmt.Errorf("voice-detect: %s failed for %q: %s", op, subject, msg)
|
||||
}
|
||||
|
||||
// goStringFromCPtr copies a NUL-terminated C string into Go memory. cptr is a
|
||||
// malloc'd buffer the caller owns; release it via CppFreeString after the copy.
|
||||
//
|
||||
// The uintptr->Pointer conversion trips vet's unsafeptr check, which can't tell
|
||||
// a C heap pointer from Go-managed memory. Safe here: the GC neither tracks nor
|
||||
// moves the buffer and we dereference it immediately to copy the bytes out.
|
||||
func goStringFromCPtr(cptr uintptr) string {
|
||||
if cptr == 0 {
|
||||
return ""
|
||||
}
|
||||
p := unsafe.Pointer(cptr) //nolint:govet // C-owned malloc'd buffer, not Go-GC memory (see doc above)
|
||||
n := 0
|
||||
for *(*byte)(unsafe.Add(p, n)) != 0 {
|
||||
n++
|
||||
}
|
||||
return string(unsafe.Slice((*byte)(p), n))
|
||||
}
|
||||
144
backend/go/voice-detect/govoicedetect_test.go
Normal file
144
backend/go/voice-detect/govoicedetect_test.go
Normal file
@@ -0,0 +1,144 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"os"
|
||||
"sync"
|
||||
"testing"
|
||||
|
||||
"github.com/ebitengine/purego"
|
||||
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
// TestVoiceDetect is the `go test` entry point: it wires Gomega failures into
// Ginkgo and runs every spec registered in this package.
func TestVoiceDetect(t *testing.T) {
	RegisterFailHandler(Fail)
	RunSpecs(t, "voice-detect Backend Suite")
}
|
||||
|
||||
var (
|
||||
libLoadOnce sync.Once
|
||||
libLoadErr error
|
||||
)
|
||||
|
||||
// ensureLibLoaded mirrors main.go's bootstrap so a Go test can drive the C-API
|
||||
// bridge without spinning up the gRPC server. Records the error (the smoke
|
||||
// specs skip themselves) when libvoicedetect.so is not loadable from cwd
|
||||
// (LD_LIBRARY_PATH or a symlink in ./).
|
||||
func ensureLibLoaded() error {
|
||||
libLoadOnce.Do(func() {
|
||||
libName := os.Getenv("VOICEDETECT_LIBRARY")
|
||||
if libName == "" {
|
||||
libName = "libvoicedetect.so"
|
||||
}
|
||||
lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
|
||||
if err != nil {
|
||||
libLoadErr = err
|
||||
return
|
||||
}
|
||||
purego.RegisterLibFunc(&CppAbiVersion, lib, "voicedetect_capi_abi_version")
|
||||
purego.RegisterLibFunc(&CppLoad, lib, "voicedetect_capi_load")
|
||||
purego.RegisterLibFunc(&CppFree, lib, "voicedetect_capi_free")
|
||||
purego.RegisterLibFunc(&CppLastError, lib, "voicedetect_capi_last_error")
|
||||
purego.RegisterLibFunc(&CppFreeString, lib, "voicedetect_capi_free_string")
|
||||
purego.RegisterLibFunc(&CppFreeVec, lib, "voicedetect_capi_free_vec")
|
||||
purego.RegisterLibFunc(&CppEmbedPath, lib, "voicedetect_capi_embed_path")
|
||||
purego.RegisterLibFunc(&CppEmbedPCM, lib, "voicedetect_capi_embed_pcm")
|
||||
purego.RegisterLibFunc(&CppVerifyPaths, lib, "voicedetect_capi_verify_paths")
|
||||
purego.RegisterLibFunc(&CppAnalyzeJSON, lib, "voicedetect_capi_analyze_path_json")
|
||||
})
|
||||
return libLoadErr
|
||||
}
|
||||
|
||||
var _ = Describe("parseOptions", func() {
|
||||
It("defaults verify_threshold to 0.25", func() {
|
||||
o := parseOptions(nil)
|
||||
Expect(o.verifyThreshold).To(Equal(float32(0.25)))
|
||||
Expect(o.modelName).To(Equal(""))
|
||||
})
|
||||
|
||||
It("parses verify_threshold, threshold alias and model_name", func() {
|
||||
o := parseOptions([]string{"verify_threshold:0.4", "model_name:ecapa", "unknown:x"})
|
||||
Expect(o.verifyThreshold).To(Equal(float32(0.4)))
|
||||
Expect(o.modelName).To(Equal("ecapa"))
|
||||
|
||||
o2 := parseOptions([]string{"threshold:0.3"})
|
||||
Expect(o2.verifyThreshold).To(Equal(float32(0.3)))
|
||||
})
|
||||
|
||||
It("ignores non-positive thresholds and keeps the default", func() {
|
||||
o := parseOptions([]string{"verify_threshold:0", "threshold:-1"})
|
||||
Expect(o.verifyThreshold).To(Equal(float32(0.25)))
|
||||
})
|
||||
})
|
||||
|
||||
var _ = Describe("parseAnalyzeJSON", func() {
|
||||
It("maps age, gender label+scores and emotion label+scores", func() {
|
||||
doc := `{"age":42.0,
|
||||
"gender":{"label":"female","female":0.88,"male":0.12},
|
||||
"emotion":{"label":"neutral","scores":{"neutral":0.7,"happy":0.2,"sad":0.1}}}`
|
||||
seg, err := parseAnalyzeJSON(doc)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(seg.Age).To(BeNumerically("~", 42.0, 1e-4))
|
||||
Expect(seg.Start).To(Equal(float32(0)))
|
||||
Expect(seg.End).To(Equal(float32(0)))
|
||||
|
||||
Expect(seg.DominantGender).To(Equal("female"))
|
||||
Expect(seg.Gender).To(HaveKeyWithValue("female", BeNumerically("~", 0.88, 1e-4)))
|
||||
Expect(seg.Gender).To(HaveKeyWithValue("male", BeNumerically("~", 0.12, 1e-4)))
|
||||
// The "label" entry is consumed into DominantGender, not the score map.
|
||||
Expect(seg.Gender).ToNot(HaveKey("label"))
|
||||
|
||||
Expect(seg.DominantEmotion).To(Equal("neutral"))
|
||||
Expect(seg.Emotion).To(HaveKeyWithValue("neutral", BeNumerically("~", 0.7, 1e-4)))
|
||||
Expect(seg.Emotion).To(HaveKeyWithValue("happy", BeNumerically("~", 0.2, 1e-4)))
|
||||
})
|
||||
|
||||
It("tolerates a missing gender block", func() {
|
||||
seg, err := parseAnalyzeJSON(`{"age":30.0,"emotion":{"label":"happy","scores":{"happy":1.0}}}`)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(seg.DominantGender).To(Equal(""))
|
||||
Expect(seg.DominantEmotion).To(Equal("happy"))
|
||||
})
|
||||
|
||||
It("returns an error on malformed JSON", func() {
|
||||
_, err := parseAnalyzeJSON(`{not-json`)
|
||||
Expect(err).To(HaveOccurred())
|
||||
})
|
||||
})
|
||||
|
||||
// The specs below exercise the real C-API end to end. They run only when both a
|
||||
// model GGUF and a test WAV are provided, and skip cleanly otherwise so the
|
||||
// suite stays green without large assets.
|
||||
var _ = Describe("VoiceDetect end-to-end", Ordered, func() {
|
||||
var (
|
||||
v *VoiceDetect
|
||||
modelPath = os.Getenv("VOICEDETECT_BACKEND_TEST_MODEL")
|
||||
wavPath = os.Getenv("VOICEDETECT_BACKEND_TEST_WAV")
|
||||
)
|
||||
|
||||
BeforeAll(func() {
|
||||
if modelPath == "" || wavPath == "" {
|
||||
Skip("set VOICEDETECT_BACKEND_TEST_MODEL and VOICEDETECT_BACKEND_TEST_WAV to run the e2e specs")
|
||||
}
|
||||
if err := ensureLibLoaded(); err != nil {
|
||||
Skip("libvoicedetect.so not loadable: " + err.Error())
|
||||
}
|
||||
v = &VoiceDetect{}
|
||||
Expect(v.Load(&pb.ModelOptions{ModelFile: modelPath})).To(Succeed())
|
||||
})
|
||||
|
||||
It("embeds an audio clip", func() {
|
||||
resp, err := v.VoiceEmbed(&pb.VoiceEmbedRequest{Audio: wavPath})
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(resp.Embedding).ToNot(BeEmpty())
|
||||
Expect(resp.Model).ToNot(BeEmpty())
|
||||
})
|
||||
|
||||
It("verifies a clip against itself as the same speaker", func() {
|
||||
resp, err := v.VoiceVerify(&pb.VoiceVerifyRequest{Audio1: wavPath, Audio2: wavPath})
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(resp.Verified).To(BeTrue())
|
||||
Expect(resp.Distance).To(BeNumerically("<=", resp.Threshold))
|
||||
})
|
||||
})
|
||||
64
backend/go/voice-detect/main.go
Normal file
64
backend/go/voice-detect/main.go
Normal file
@@ -0,0 +1,64 @@
|
||||
package main
|
||||
|
||||
// Started internally by LocalAI - one gRPC server per loaded model.
|
||||
//
|
||||
// Loads libvoicedetect.so via purego and registers the flat C-API entry points
|
||||
// declared in voicedetect_capi.h. The library name can be overridden with
|
||||
// VOICEDETECT_LIBRARY (mirrors the PARAKEET_LIBRARY / OMNIVOICE_LIBRARY
|
||||
// convention in the sibling backends); the default looks for the .so next to
|
||||
// this binary (resolved via LD_LIBRARY_PATH by run.sh).
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"os"
|
||||
|
||||
"github.com/ebitengine/purego"
|
||||
grpc "github.com/mudler/LocalAI/pkg/grpc"
|
||||
)
|
||||
|
||||
var (
|
||||
addr = flag.String("addr", "localhost:50051", "the address to connect to")
|
||||
)
|
||||
|
||||
// LibFuncs pairs a Go function variable with the C symbol it is bound to. main
// builds a table of these and passes each entry to purego.RegisterLibFunc.
type LibFuncs struct {
	// FuncPtr is a pointer to the Go func variable to populate (e.g. &CppLoad).
	FuncPtr any
	// Name is the exported symbol name in the shared library.
	Name string
}
|
||||
|
||||
func main() {
|
||||
libName := os.Getenv("VOICEDETECT_LIBRARY")
|
||||
if libName == "" {
|
||||
libName = "libvoicedetect.so"
|
||||
}
|
||||
|
||||
lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
|
||||
if err != nil {
|
||||
panic(fmt.Errorf("voice-detect: dlopen %q: %w", libName, err))
|
||||
}
|
||||
|
||||
// Bound 1:1 to voicedetect_capi.h. char*/float* returns are registered as
|
||||
// uintptr so the raw pointer can be freed via the matching capi free fn.
|
||||
libFuncs := []LibFuncs{
|
||||
{&CppAbiVersion, "voicedetect_capi_abi_version"},
|
||||
{&CppLoad, "voicedetect_capi_load"},
|
||||
{&CppFree, "voicedetect_capi_free"},
|
||||
{&CppLastError, "voicedetect_capi_last_error"},
|
||||
{&CppFreeString, "voicedetect_capi_free_string"},
|
||||
{&CppFreeVec, "voicedetect_capi_free_vec"},
|
||||
{&CppEmbedPath, "voicedetect_capi_embed_path"},
|
||||
{&CppEmbedPCM, "voicedetect_capi_embed_pcm"},
|
||||
{&CppVerifyPaths, "voicedetect_capi_verify_paths"},
|
||||
{&CppAnalyzeJSON, "voicedetect_capi_analyze_path_json"},
|
||||
}
|
||||
for _, lf := range libFuncs {
|
||||
purego.RegisterLibFunc(lf.FuncPtr, lib, lf.Name)
|
||||
}
|
||||
|
||||
fmt.Fprintf(os.Stderr, "[voice-detect] ABI=%d\n", CppAbiVersion())
|
||||
|
||||
flag.Parse()
|
||||
|
||||
if err := grpc.StartServer(*addr, &VoiceDetect{}); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
46
backend/go/voice-detect/options.go
Normal file
46
backend/go/voice-detect/options.go
Normal file
@@ -0,0 +1,46 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// defaultVerifyThreshold is the cosine-distance cutoff used when a request does
|
||||
// not set one. Matches the Python speaker-recognition backend's default so the
|
||||
// two implementations agree on verdicts out of the box.
|
||||
const defaultVerifyThreshold float32 = 0.25
|
||||
|
||||
// loadOptions holds the parsed model-level options for voice-detect.
type loadOptions struct {
	// verifyThreshold is the threshold VoiceVerify uses when the request does
	// not carry a positive one; parseOptions initialises it to
	// defaultVerifyThreshold.
	verifyThreshold float32
	// modelName is reported in responses; Load fills it with the model file's
	// base name when no model_name option was given.
	modelName string
}
|
||||
|
||||
// splitOption breaks a "key:value" option at its first colon and trims
// surrounding whitespace from both halves. Later colons stay in the value. ok
// is false, and both strings are empty, when o contains no colon.
func splitOption(o string) (key, value string, ok bool) {
	before, after, found := strings.Cut(o, ":")
	if !found {
		return "", "", false
	}
	return strings.TrimSpace(before), strings.TrimSpace(after), true
}
|
||||
|
||||
// parseOptions reads the backend "key:value" option slice. Unknown keys are
|
||||
// ignored. Defaults: verify_threshold 0.25, model_name derived from the file.
|
||||
func parseOptions(opts []string) loadOptions {
|
||||
o := loadOptions{verifyThreshold: defaultVerifyThreshold}
|
||||
for _, oo := range opts {
|
||||
key, value, ok := splitOption(oo)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
switch key {
|
||||
case "verify_threshold", "threshold":
|
||||
if f, err := strconv.ParseFloat(value, 32); err == nil && f > 0 {
|
||||
o.verifyThreshold = float32(f)
|
||||
}
|
||||
case "model_name":
|
||||
o.modelName = value
|
||||
}
|
||||
}
|
||||
return o
|
||||
}
|
||||
68
backend/go/voice-detect/package.sh
Executable file
68
backend/go/voice-detect/package.sh
Executable file
@@ -0,0 +1,68 @@
|
||||
#!/bin/bash
#
# Bundle the voice-detect-grpc binary, libvoicedetect.so, the core runtime libs
# (libc/libstdc++/libgomp + ld.so) and the GPU runtime for the active BUILD_TYPE
# so the package is self-contained. Mirrors backend/go/parakeet-cpp/package.sh;
# run.sh routes the (CGO_ENABLED=0) binary through lib/ld.so so the packaged libc
# is used instead of the host's.

set -e

CURDIR=$(dirname "$(realpath "$0")")
REPO_ROOT="${CURDIR}/../../.."
PACKAGE_LIB="$CURDIR/package/lib"

mkdir -p "$PACKAGE_LIB"

cp -avf "$CURDIR/voice-detect-grpc" "$CURDIR/package/"
cp -avf "$CURDIR/run.sh" "$CURDIR/package/"

# libvoicedetect.so + any soname symlinks. purego.Dlopen resolves it via
# LD_LIBRARY_PATH, which run.sh points at lib/.
cp -avf "$CURDIR"/libvoicedetect.so* "$PACKAGE_LIB/" 2>/dev/null || {
    echo "ERROR: libvoicedetect.so not found in $CURDIR, run 'make' first" >&2
    exit 1
}

# copy_runtime_libs LOADER LIBDIR
#
# Copy the dynamic loader LOADER to lib/ld.so and the core runtime libraries
# from LIBDIR into lib/, dereferencing symlinks so real files are packaged.
copy_runtime_libs() {
    local loader="$1" libdir="$2" lib
    cp -arfLv "$loader" "$PACKAGE_LIB/ld.so"
    for lib in libc.so.6 libgcc_s.so.1 libstdc++.so.6 libm.so.6 \
               libgomp.so.1 libdl.so.2 librt.so.1 libpthread.so.0; do
        cp -arfLv "$libdir/$lib" "$PACKAGE_LIB/$lib"
    done
}

# Detect the architecture from the loader present on the build host and copy
# the matching runtime. Darwin packages no runtime libraries.
if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
    echo "Detected x86_64 architecture, copying x86_64 libraries..."
    copy_runtime_libs /lib64/ld-linux-x86-64.so.2 /lib/x86_64-linux-gnu
elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
    echo "Detected ARM64 architecture, copying ARM64 libraries..."
    copy_runtime_libs /lib/ld-linux-aarch64.so.1 /lib/aarch64-linux-gnu
elif [ "$(uname -s)" = "Darwin" ]; then
    echo "Detected Darwin"
else
    # Fatal diagnostics belong on stderr, matching the missing-library error above.
    echo "Error: Could not detect architecture" >&2
    exit 1
fi

# Package GPU libraries (CUDA/ROCm/Intel/Vulkan loader + ICDs + drivers) based on
# BUILD_TYPE so the backend can reach the GPU without the runtime base image
# shipping those drivers.
GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
if [ -f "$GPU_LIB_SCRIPT" ]; then
    echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
    source "$GPU_LIB_SCRIPT" "$PACKAGE_LIB"
    package_gpu_libs
fi

echo "Packaging completed successfully"
ls -liah "$CURDIR/package/" "$PACKAGE_LIB/"
|
||||
16
backend/go/voice-detect/run.sh
Executable file
16
backend/go/voice-detect/run.sh
Executable file
@@ -0,0 +1,16 @@
|
||||
#!/bin/bash
# Launch the packaged voice-detect-grpc binary with the bundled libraries on the
# loader search path. All arguments are passed through to the binary.
set -e

CURDIR=$(dirname "$(realpath "$0")")

# Prepend the bundled lib directories. The inherited value is appended only when
# it is non-empty: a trailing ':' would add an empty entry, which the dynamic
# loader interprets as the current working directory.
export LD_LIBRARY_PATH="$CURDIR/lib:$CURDIR${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"

# If a self-contained ld.so was packaged, route through it so the packaged
# libc / libstdc++ are used instead of the host's (matches the whisper /
# parakeet backends' runtime layout).
if [ -f "$CURDIR/lib/ld.so" ]; then
    echo "Using lib/ld.so"
    exec "$CURDIR/lib/ld.so" "$CURDIR/voice-detect-grpc" "$@"
fi

exec "$CURDIR/voice-detect-grpc" "$@"
|
||||
14
backend/go/voice-detect/test.sh
Executable file
14
backend/go/voice-detect/test.sh
Executable file
@@ -0,0 +1,14 @@
|
||||
#!/bin/bash
# Run the voice-detect Go test suite from the backend directory, with that
# directory on the library search path so a locally staged libvoicedetect.so
# can be found.
set -e

CURDIR=$(dirname "$(realpath "$0")")
cd "$CURDIR"

echo "Running voice-detect backend tests..."

# The pure-Go parsing specs always run. The end-to-end embed/verify specs need
# VOICEDETECT_BACKEND_TEST_MODEL and VOICEDETECT_BACKEND_TEST_WAV and skip
# themselves when either is unset.
export LD_LIBRARY_PATH="$CURDIR:${LD_LIBRARY_PATH:-}"
go test -v -timeout 1200s .

echo "voice-detect tests completed."
|
||||
Reference in New Issue
Block a user