diff --git a/backend/go/voice-detect/.gitignore b/backend/go/voice-detect/.gitignore
new file mode 100644
index 000000000..812afb9b2
--- /dev/null
+++ b/backend/go/voice-detect/.gitignore
@@ -0,0 +1,18 @@
+# Fetched upstream sources
+sources/
+
+# CMake build directories
+build*/
+
+# build artifacts staged in-tree by the Makefile (cp from sources/) or
+# symlinked for local dev; the real sources live in voice-detect.cpp upstream.
+*.so
+*.so.*
+voicedetect_capi.h
+compile_commands.json
+
+# Compiled backend binary
+voice-detect-grpc
+
+# Packaging output
+package/
diff --git a/backend/go/voice-detect/Makefile b/backend/go/voice-detect/Makefile
new file mode 100644
index 000000000..87999f8ca
--- /dev/null
+++ b/backend/go/voice-detect/Makefile
@@ -0,0 +1,92 @@
+# voice-detect backend Makefile.
+#
+# Upstream pin lives below as VOICEDETECT_VERSION?=4754643... (.github/bump_deps.sh
+# can find and update it - matches the parakeet.cpp / whisper.cpp / ds4 convention).
+#
+# Local dev shortcut: if you already have an out-of-tree voice-detect.cpp build,
+# symlink the .so + header into this directory and skip the clone/cmake steps:
+#
+#   ln -sf /path/to/voice-detect.cpp/build-shared/libvoicedetect.so .
+#   ln -sf /path/to/voice-detect.cpp/include/voicedetect_capi.h .
+#   go build -o voice-detect-grpc .
+#
+# The default target below does the proper clone-at-pin + cmake build so CI does
+# not need a side-checkout.
+
+# Commit of voice-detect.cpp to fetch, and the repository it is fetched from.
+VOICEDETECT_VERSION?=47546430ab0f23713c2f990e43035fb1106a9c74
+VOICEDETECT_REPO?=https://github.com/mudler/voice-detect.cpp
+
+# Go toolchain knobs: GOCMD is the go binary, GO_TAGS is handed to `go build -tags`.
+GOCMD?=go
+GO_TAGS?=
+# Parallel jobs for `cmake --build`: nproc, else sysctl hw.ncpu, else 4.
+JOBS?=$(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)
+
+# BUILD_TYPE picks the GGML backend flags appended to CMAKE_ARGS further down
+# (cublas, openblas, hipblas, vulkan; anything else adds none). NATIVE=false
+# appends -DGGML_NATIVE=OFF.
+BUILD_TYPE?=
+NATIVE?=false
+
+# Build ggml statically into libvoicedetect.so (PIC) so the shared lib is
+# self-contained: dlopen needs no libggml*.so alongside it, only system libs
+# (libstdc++/libgomp/libc) that the runtime image already provides.
+CMAKE_ARGS?=-DCMAKE_BUILD_TYPE=Release -DVOICEDETECT_SHARED=ON -DVOICEDETECT_BUILD_CLI=OFF -DVOICEDETECT_BUILD_TESTS=OFF -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+
+ifeq ($(NATIVE),false)
+  CMAKE_ARGS+=-DGGML_NATIVE=OFF
+endif
+
+# voice-detect.cpp gates its GGML backends behind VOICEDETECT_GGML_* options and
+# does set(GGML_CUDA ${VOICEDETECT_GGML_CUDA} CACHE BOOL "" FORCE), so a bare
+# -DGGML_CUDA=ON is overwritten back to OFF. Forward the VOICEDETECT_GGML_*
+# options instead. (openblas is not gated, so -DGGML_BLAS passes through.)
+ifeq ($(BUILD_TYPE),cublas)
+  CMAKE_ARGS+=-DVOICEDETECT_GGML_CUDA=ON
+else ifeq ($(BUILD_TYPE),openblas)
+  CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
+else ifeq ($(BUILD_TYPE),hipblas)
+  CMAKE_ARGS+=-DVOICEDETECT_GGML_HIP=ON
+else ifeq ($(BUILD_TYPE),vulkan)
+  CMAKE_ARGS+=-DVOICEDETECT_GGML_VULKAN=ON
+endif
+
+.PHONY: voice-detect-grpc package build clean purge test all
+
+all: voice-detect-grpc
+
+# Clone the upstream voice-detect.cpp source at the pinned commit. Directory acts
+# as the target so make only re-clones when missing. After a VOICEDETECT_VERSION
+# bump, run 'make purge && make' to refetch.
+#
+# If any git step fails, the partially populated directory is removed again:
+# otherwise its mere existence would make the target look up to date and the
+# next 'make' would run cmake against an empty or half-fetched checkout.
+sources/voice-detect.cpp:
+	mkdir -p $@
+	cd $@ && \
+	git init -q && \
+	git remote add origin $(VOICEDETECT_REPO) && \
+	git fetch --depth 1 origin $(VOICEDETECT_VERSION) && \
+	git checkout FETCH_HEAD && \
+	git submodule update --init --recursive --depth 1 --single-branch || \
+	{ cd "$(CURDIR)" && rm -rf $@; exit 1; }
+
+# Build the shared lib + header out-of-tree, then stage them next to the Go
+# sources so purego.Dlopen("libvoicedetect.so") and the cgo-less build both pick
+# them up.
+# The staging copies are deliberately fatal: if the built library cannot be
+# copied here the rule must fail, otherwise the cgo-less Go build still succeeds
+# and the missing .so only surfaces later in package.sh or at dlopen time.
+libvoicedetect.so: sources/voice-detect.cpp
+	cmake -B sources/voice-detect.cpp/build-shared -S sources/voice-detect.cpp $(CMAKE_ARGS)
+	cmake --build sources/voice-detect.cpp/build-shared --config Release -j$(JOBS) --target voicedetect
+	cp -fv sources/voice-detect.cpp/build-shared/libvoicedetect.so* ./
+	cp -fv sources/voice-detect.cpp/include/voicedetect_capi.h ./
+
+# CGO is disabled: the library is bound at runtime via purego, not linked.
+voice-detect-grpc: libvoicedetect.so main.go govoicedetect.go options.go
+	CGO_ENABLED=0 $(GOCMD) build -tags "$(GO_TAGS)" -o voice-detect-grpc .
+
+package: voice-detect-grpc
+	bash package.sh
+
+build: package
+
+# Test target. The embed/verify/analyze smoke specs are gated on
+# VOICEDETECT_BACKEND_TEST_MODEL + VOICEDETECT_BACKEND_TEST_WAV; without them the
+# heavy specs auto-skip and only the pure-Go parsing specs run.
+test:
+	LD_LIBRARY_PATH=$(CURDIR):$$LD_LIBRARY_PATH $(GOCMD) test ./... -count=1
+
+# clean removes the staged artifacts and, through purge, the fetched sources.
+clean: purge
+	rm -rf libvoicedetect.so* voicedetect_capi.h package voice-detect-grpc
+
+purge:
+	rm -rf sources/voice-detect.cpp
diff --git a/backend/go/voice-detect/govoicedetect.go b/backend/go/voice-detect/govoicedetect.go
new file mode 100644
index 000000000..ea648e896
--- /dev/null
+++ b/backend/go/voice-detect/govoicedetect.go
@@ -0,0 +1,257 @@
+package main
+
+import (
+	"encoding/json"
+	"errors"
+	"fmt"
+	"math"
+	"path/filepath"
+	"strings"
+	"time"
+	"unsafe"
+
+	"github.com/mudler/LocalAI/pkg/grpc/base"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	"github.com/mudler/xlog"
+)
+
+// purego-bound entry points from libvoicedetect.so. Names match
+// voicedetect_capi.h exactly so a `nm libvoicedetect.so | grep voicedetect_capi`
+// is enough to spot drift.
+//
+// The opaque ctx and the malloc'd char*/float* return values are declared as
+// uintptr so we get the raw pointer back and can release it via the matching
+// capi free function.
purego's native string/[]float32 returns would copy and
+// forget the original pointer, leaking the C-owned buffer on every call.
+var (
+	CppAbiVersion  func() int32
+	CppLoad        func(ggufPath string) uintptr
+	CppFree        func(ctx uintptr)
+	CppLastError   func(ctx uintptr) string
+	CppFreeString  func(s uintptr)
+	CppFreeVec     func(v uintptr)
+	CppEmbedPath   func(ctx uintptr, wavPath string, outVec, outDim unsafe.Pointer) int32
+	CppEmbedPCM    func(ctx uintptr, pcm []float32, nSamples, sampleRate int32, outVec, outDim unsafe.Pointer) int32
+	CppVerifyPaths func(ctx uintptr, a, b string, threshold float32, outDistance, outVerified unsafe.Pointer) int32
+	CppAnalyzeJSON func(ctx uintptr, wavPath string) uintptr
+)
+
+// VoiceDetect implements the speaker-recognition voice subset of the Backend
+// gRPC service over libvoicedetect.so. The C side keeps a single loaded model
+// plus a per-ctx last-error buffer and is not reentrant, so base.SingleThread
+// serializes every call.
+type VoiceDetect struct {
+	base.SingleThread
+	opts   loadOptions
+	ctxPtr uintptr
+}
+
+// Load resolves the model location, parses the model-level options and opens
+// the model through voicedetect_capi_load.
+//
+// Path resolution: ModelFile wins; a relative ModelFile is joined onto
+// ModelPath. When ModelFile is empty, ModelPath itself is used unchanged (it is
+// not joined onto itself, which previously turned a relative "dir" into
+// "dir/dir"). An error is returned when neither is set or the C load fails.
+func (v *VoiceDetect) Load(opts *pb.ModelOptions) error {
+	model := opts.ModelFile
+	switch {
+	case model == "":
+		model = opts.ModelPath
+	case !filepath.IsAbs(model) && opts.ModelPath != "":
+		model = filepath.Join(opts.ModelPath, model)
+	}
+	if model == "" {
+		return errors.New("voice-detect: ModelFile is required")
+	}
+
+	v.opts = parseOptions(opts.Options)
+	if v.opts.modelName == "" {
+		// No explicit model_name option: report the file name instead.
+		v.opts.modelName = filepath.Base(model)
+	}
+
+	xlog.Info("voice-detect: loading model", "model", model,
+		"verify_threshold", v.opts.verifyThreshold, "abi", CppAbiVersion())
+
+	ctx := CppLoad(model)
+	if ctx == 0 {
+		// The last-error buffer lives on the ctx that was never returned, so
+		// surface the path the operator tried to load instead.
+		return fmt.Errorf("voice-detect: voicedetect_capi_load failed for %q", model)
+	}
+	v.ctxPtr = ctx
+	return nil
+}
+
+// VoiceEmbed returns the L2-normalized speaker embedding for an audio clip.
+// The request carries a filesystem PATH; the HTTP layer materializes
+// base64/URL/data-URI inputs to a temp file before the gRPC call.
+func (v *VoiceDetect) VoiceEmbed(req *pb.VoiceEmbedRequest) (pb.VoiceEmbedResponse, error) {
+	if v.ctxPtr == 0 {
+		return pb.VoiceEmbedResponse{}, errors.New("voice-detect: model not loaded")
+	}
+	if req.Audio == "" {
+		return pb.VoiceEmbedResponse{}, errors.New("voice-detect: audio path is required")
+	}
+	emb, err := v.embedPath(req.Audio)
+	if err != nil {
+		return pb.VoiceEmbedResponse{}, err
+	}
+	return pb.VoiceEmbedResponse{Embedding: emb, Model: v.opts.modelName}, nil
+}
+
+// embedPath runs voicedetect_capi_embed_path on path and returns a Go-owned
+// copy of the resulting vector. Any failure is reported through lastErr.
+func (v *VoiceDetect) embedPath(path string) ([]float32, error) {
+	var vec uintptr
+	var dim int32
+	rc := CppEmbedPath(v.ctxPtr, path, unsafe.Pointer(&vec), unsafe.Pointer(&dim))
+	if rc == 0 && vec != 0 {
+		// The call reported success and handed us a buffer, so it is ours to
+		// release on every path below - including the dim <= 0 rejection,
+		// which previously returned without freeing it.
+		defer CppFreeVec(vec)
+	}
+	if rc != 0 || vec == 0 || dim <= 0 {
+		return nil, v.lastErr("embed", path)
+	}
+	// Copy out of the C-owned malloc'd buffer before freeing it. The
+	// uintptr->Pointer conversion trips vet's unsafeptr check, which can't tell
+	// a C heap pointer from Go-managed memory; safe here, the GC neither tracks
+	// nor moves this buffer and we copy immediately.
+	src := unsafe.Slice((*float32)(unsafe.Pointer(vec)), int(dim)) //nolint:govet // C-owned malloc'd vector, copied out before free
+	out := make([]float32, int(dim))
+	copy(out, src)
+	return out, nil
+}
+
+// VoiceVerify embeds two clips and reports whether they are the same speaker by
+// cosine distance against a threshold. A request threshold <= 0 falls back to
+// the model-configured default (verify_threshold option, 0.25 if unset).
+func (v *VoiceDetect) VoiceVerify(req *pb.VoiceVerifyRequest) (pb.VoiceVerifyResponse, error) {
+	switch {
+	case v.ctxPtr == 0:
+		return pb.VoiceVerifyResponse{}, errors.New("voice-detect: model not loaded")
+	case req.Audio1 == "" || req.Audio2 == "":
+		return pb.VoiceVerifyResponse{}, errors.New("voice-detect: audio1 and audio2 are required")
+	}
+
+	// A non-positive request threshold means "use the model default".
+	cutoff := req.Threshold
+	if cutoff <= 0 {
+		cutoff = v.opts.verifyThreshold
+	}
+
+	t0 := time.Now()
+	var (
+		dist float32
+		same int32
+	)
+	if rc := CppVerifyPaths(v.ctxPtr, req.Audio1, req.Audio2, cutoff,
+		unsafe.Pointer(&dist), unsafe.Pointer(&same)); rc != 0 {
+		return pb.VoiceVerifyResponse{}, v.lastErr("verify", req.Audio1+","+req.Audio2)
+	}
+	elapsedMs := float32(time.Since(t0).Seconds() * 1000.0)
+
+	// Confidence is a linear ramp: 100 at distance 0, 0 at the threshold,
+	// clamped to [0, 100]. Mirrors the Python speaker-recognition backend.
+	var confidence float32
+	if cutoff > 0 {
+		pct := (1.0 - float64(dist)/float64(cutoff)) * 100.0
+		confidence = float32(math.Max(0, math.Min(100, pct)))
+	}
+
+	return pb.VoiceVerifyResponse{
+		Verified:         same != 0,
+		Distance:         dist,
+		Threshold:        cutoff,
+		Confidence:       confidence,
+		Model:            v.opts.modelName,
+		ProcessingTimeMs: elapsedMs,
+	}, nil
+}
+
+// VoiceAnalyze runs the age/gender/emotion heads on a single clip. The C-API
+// always evaluates every supported head, so the request's actions filter is
+// advisory and the full analysis is returned as a single segment (the engine
+// does not produce time-bounded segments).
+func (v *VoiceDetect) VoiceAnalyze(req *pb.VoiceAnalyzeRequest) (pb.VoiceAnalyzeResponse, error) {
+	switch {
+	case v.ctxPtr == 0:
+		return pb.VoiceAnalyzeResponse{}, errors.New("voice-detect: model not loaded")
+	case req.Audio == "":
+		return pb.VoiceAnalyzeResponse{}, errors.New("voice-detect: audio path is required")
+	}
+
+	raw := CppAnalyzeJSON(v.ctxPtr, req.Audio)
+	if raw == 0 {
+		return pb.VoiceAnalyzeResponse{}, v.lastErr("analyze", req.Audio)
+	}
+	// Copy the document into Go memory, then hand the C buffer straight back.
+	doc := goStringFromCPtr(raw)
+	CppFreeString(raw)
+
+	seg, err := parseAnalyzeJSON(doc)
+	if err != nil {
+		return pb.VoiceAnalyzeResponse{}, fmt.Errorf("voice-detect: analyze JSON for %q: %w", req.Audio, err)
+	}
+	return pb.VoiceAnalyzeResponse{Segments: []*pb.VoiceAnalysis{seg}}, nil
+}
+
+// analyzeJSON is the decode target for the document produced by
+// voicedetect_capi_analyze_path_json, e.g.:
+//
+//	{"age":42.0,
+//	 "gender":{"label":"female","female":0.88,"male":0.12},
+//	 "emotion":{"label":"neutral","scores":{"neutral":0.7, ...}}}
+//
+// The gender object mixes a string "label" with float per-class scores, so its
+// values are kept as raw messages and separated later in parseAnalyzeJSON.
+type analyzeJSON struct {
+	Age     float32                    `json:"age"`
+	Gender  map[string]json.RawMessage `json:"gender"`
+	Emotion struct {
+		Label  string             `json:"label"`
+		Scores map[string]float32 `json:"scores"`
+	} `json:"emotion"`
+}
+
+// parseAnalyzeJSON maps the engine's analyze document onto a VoiceAnalysis.
+// start/end stay 0: the model emits a single whole-utterance result, not
+// time-bounded segments.
+func parseAnalyzeJSON(doc string) (*pb.VoiceAnalysis, error) {
+	var parsed analyzeJSON
+	if err := json.Unmarshal([]byte(doc), &parsed); err != nil {
+		return nil, err
+	}
+
+	out := &pb.VoiceAnalysis{
+		Age:             parsed.Age,
+		DominantEmotion: parsed.Emotion.Label,
+		Emotion:         parsed.Emotion.Scores,
+	}
+	if len(parsed.Gender) == 0 {
+		// No gender block: leave DominantGender and Gender at their zero values.
+		return out, nil
+	}
+
+	scores := make(map[string]float32, len(parsed.Gender))
+	for name, raw := range parsed.Gender {
+		if name == "label" {
+			// Best effort: a non-string label simply leaves DominantGender empty.
+			_ = json.Unmarshal(raw, &out.DominantGender)
+			continue
+		}
+		var val float32
+		if json.Unmarshal(raw, &val) != nil {
+			// Entries that are not numbers are skipped rather than failing the call.
+			continue
+		}
+		scores[name] = val
+	}
+	out.Gender = scores
+	return out, nil
+}
+
+// lastErr builds a Go error for a failed C-API operation from the per-ctx
+// last-error buffer, substituting a placeholder when the buffer is blank.
+func (v *VoiceDetect) lastErr(op, subject string) error {
+	detail := strings.TrimSpace(CppLastError(v.ctxPtr))
+	if detail == "" {
+		detail = "no error detail"
+	}
+	return fmt.Errorf("voice-detect: %s failed for %q: %s", op, subject, detail)
+}
+
+// goStringFromCPtr copies a NUL-terminated C string into Go memory. cptr is a
+// malloc'd buffer the caller owns; release it via CppFreeString after the copy.
+//
+// The uintptr->Pointer conversion trips vet's unsafeptr check, which can't tell
+// a C heap pointer from Go-managed memory. Safe here: the GC neither tracks nor
+// moves the buffer and we dereference it immediately to copy the bytes out.
+func goStringFromCPtr(cptr uintptr) string {
+	if cptr == 0 {
+		return ""
+	}
+	base := unsafe.Pointer(cptr) //nolint:govet // C-owned malloc'd buffer, not Go-GC memory (see doc above)
+	// Walk forward to the terminating NUL to learn the length, then copy.
+	length := 0
+	for {
+		if *(*byte)(unsafe.Add(base, length)) == 0 {
+			break
+		}
+		length++
+	}
+	return string(unsafe.Slice((*byte)(base), length))
+}
diff --git a/backend/go/voice-detect/govoicedetect_test.go b/backend/go/voice-detect/govoicedetect_test.go
new file mode 100644
index 000000000..2de7fcc8a
--- /dev/null
+++ b/backend/go/voice-detect/govoicedetect_test.go
@@ -0,0 +1,144 @@
+package main
+
+import (
+	"os"
+	"sync"
+	"testing"
+
+	"github.com/ebitengine/purego"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+func TestVoiceDetect(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "voice-detect Backend Suite")
+}
+
+var (
+	libLoadOnce sync.Once
+	libLoadErr  error
+)
+
+// ensureLibLoaded mirrors main.go's bootstrap so a Go test can drive the C-API
+// bridge without spinning up the gRPC server. Records the error (the smoke
+// specs skip themselves) when libvoicedetect.so is not loadable from cwd
+// (LD_LIBRARY_PATH or a symlink in ./).
+func ensureLibLoaded() error {
+	libLoadOnce.Do(func() {
+		libName := "libvoicedetect.so"
+		if override := os.Getenv("VOICEDETECT_LIBRARY"); override != "" {
+			libName = override
+		}
+		lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
+		if err != nil {
+			libLoadErr = err
+			return
+		}
+		// Same symbols, in the same order, as the table in main.go.
+		bindings := []struct {
+			target any
+			symbol string
+		}{
+			{&CppAbiVersion, "voicedetect_capi_abi_version"},
+			{&CppLoad, "voicedetect_capi_load"},
+			{&CppFree, "voicedetect_capi_free"},
+			{&CppLastError, "voicedetect_capi_last_error"},
+			{&CppFreeString, "voicedetect_capi_free_string"},
+			{&CppFreeVec, "voicedetect_capi_free_vec"},
+			{&CppEmbedPath, "voicedetect_capi_embed_path"},
+			{&CppEmbedPCM, "voicedetect_capi_embed_pcm"},
+			{&CppVerifyPaths, "voicedetect_capi_verify_paths"},
+			{&CppAnalyzeJSON, "voicedetect_capi_analyze_path_json"},
+		}
+		for _, b := range bindings {
+			purego.RegisterLibFunc(b.target, lib, b.symbol)
+		}
+	})
+	return libLoadErr
+}
+
+// Pure-Go specs for the "key:value" option parser.
+var _ = Describe("parseOptions", func() {
+	It("defaults verify_threshold to 0.25", func() {
+		o := parseOptions(nil)
+		Expect(o.verifyThreshold).To(Equal(float32(0.25)))
+		Expect(o.modelName).To(Equal(""))
+	})
+
+	It("parses verify_threshold, threshold alias and model_name", func() {
+		o := parseOptions([]string{"verify_threshold:0.4", "model_name:ecapa", "unknown:x"})
+		Expect(o.verifyThreshold).To(Equal(float32(0.4)))
+		Expect(o.modelName).To(Equal("ecapa"))
+
+		o2 := parseOptions([]string{"threshold:0.3"})
+		Expect(o2.verifyThreshold).To(Equal(float32(0.3)))
+	})
+
+	It("ignores non-positive thresholds and keeps the default", func() {
+		o := parseOptions([]string{"verify_threshold:0", "threshold:-1"})
+		Expect(o.verifyThreshold).To(Equal(float32(0.25)))
+	})
+})
+
+// Pure-Go specs for the analyze-document decoder.
+var _ = Describe("parseAnalyzeJSON", func() {
+	It("maps age, gender label+scores and emotion label+scores", func() {
+		doc := `{"age":42.0,
+			"gender":{"label":"female","female":0.88,"male":0.12},
+			"emotion":{"label":"neutral","scores":{"neutral":0.7,"happy":0.2,"sad":0.1}}}`
+		seg, err := parseAnalyzeJSON(doc)
+		Expect(err).ToNot(HaveOccurred())
+		Expect(seg.Age).To(BeNumerically("~", 42.0, 1e-4))
+		Expect(seg.Start).To(Equal(float32(0)))
+		Expect(seg.End).To(Equal(float32(0)))
+
+		Expect(seg.DominantGender).To(Equal("female"))
+		Expect(seg.Gender).To(HaveKeyWithValue("female", BeNumerically("~", 0.88, 1e-4)))
+		Expect(seg.Gender).To(HaveKeyWithValue("male", BeNumerically("~", 0.12, 1e-4)))
+		// The "label" entry is consumed into DominantGender, not the score map.
+		Expect(seg.Gender).ToNot(HaveKey("label"))
+
+		Expect(seg.DominantEmotion).To(Equal("neutral"))
+		Expect(seg.Emotion).To(HaveKeyWithValue("neutral", BeNumerically("~", 0.7, 1e-4)))
+		Expect(seg.Emotion).To(HaveKeyWithValue("happy", BeNumerically("~", 0.2, 1e-4)))
+	})
+
+	It("tolerates a missing gender block", func() {
+		seg, err := parseAnalyzeJSON(`{"age":30.0,"emotion":{"label":"happy","scores":{"happy":1.0}}}`)
+		Expect(err).ToNot(HaveOccurred())
+		Expect(seg.DominantGender).To(Equal(""))
+		Expect(seg.DominantEmotion).To(Equal("happy"))
+	})
+
+	It("returns an error on malformed JSON", func() {
+		_, err := parseAnalyzeJSON(`{not-json`)
+		Expect(err).To(HaveOccurred())
+	})
+})
+
+// The specs below exercise the real C-API end to end. They run only when both a
+// model GGUF and a test WAV are provided, and skip cleanly otherwise so the
+// suite stays green without large assets.
+var _ = Describe("VoiceDetect end-to-end", Ordered, func() {
+	var (
+		v         *VoiceDetect
+		modelPath = os.Getenv("VOICEDETECT_BACKEND_TEST_MODEL")
+		wavPath   = os.Getenv("VOICEDETECT_BACKEND_TEST_WAV")
+	)
+
+	BeforeAll(func() {
+		if modelPath == "" || wavPath == "" {
+			Skip("set VOICEDETECT_BACKEND_TEST_MODEL and VOICEDETECT_BACKEND_TEST_WAV to run the e2e specs")
+		}
+		if err := ensureLibLoaded(); err != nil {
+			Skip("libvoicedetect.so not loadable: " + err.Error())
+		}
+		v = &VoiceDetect{}
+		Expect(v.Load(&pb.ModelOptions{ModelFile: modelPath})).To(Succeed())
+	})
+
+	// Release the native model context the suite loaded. v stays nil when
+	// BeforeAll skipped, and ctxPtr stays 0 when Load failed, so both are
+	// checked before calling into the library.
+	AfterAll(func() {
+		if v != nil && v.ctxPtr != 0 {
+			CppFree(v.ctxPtr)
+			v.ctxPtr = 0
+		}
+	})
+
+	It("embeds an audio clip", func() {
+		resp, err := v.VoiceEmbed(&pb.VoiceEmbedRequest{Audio: wavPath})
+		Expect(err).ToNot(HaveOccurred())
+		Expect(resp.Embedding).ToNot(BeEmpty())
+		Expect(resp.Model).ToNot(BeEmpty())
+	})
+
+	It("verifies a clip against itself as the same speaker", func() {
+		resp, err := v.VoiceVerify(&pb.VoiceVerifyRequest{Audio1: wavPath, Audio2: wavPath})
+		Expect(err).ToNot(HaveOccurred())
+		Expect(resp.Verified).To(BeTrue())
+		Expect(resp.Distance).To(BeNumerically("<=", resp.Threshold))
+	})
+})
diff --git a/backend/go/voice-detect/main.go b/backend/go/voice-detect/main.go
new file mode 100644
index 000000000..35421b5c3
--- /dev/null
+++ b/backend/go/voice-detect/main.go
@@ -0,0 +1,64 @@
+package main
+
+// Started internally by LocalAI - one gRPC server per loaded model.
+//
+// Loads libvoicedetect.so via purego and registers the flat C-API entry points
+// declared in voicedetect_capi.h. The library name can be overridden with
+// VOICEDETECT_LIBRARY (mirrors the PARAKEET_LIBRARY / OMNIVOICE_LIBRARY
+// convention in the sibling backends); the default looks for the .so next to
+// this binary (resolved via LD_LIBRARY_PATH by run.sh).
+import (
+	"flag"
+	"fmt"
+	"os"
+
+	"github.com/ebitengine/purego"
+	grpc "github.com/mudler/LocalAI/pkg/grpc"
+)
+
+var (
+	addr = flag.String("addr", "localhost:50051", "the address to connect to")
+)
+
+// LibFuncs pairs a purego function variable with the C symbol bound to it.
+type LibFuncs struct {
+	FuncPtr any
+	Name    string
+}
+
+// main opens the shared library, binds every C-API symbol, prints the ABI
+// version to stderr and then serves the backend over gRPC on -addr. A dlopen
+// or server start failure panics.
+func main() {
+	libName := "libvoicedetect.so"
+	if override := os.Getenv("VOICEDETECT_LIBRARY"); override != "" {
+		libName = override
+	}
+
+	lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
+	if err != nil {
+		panic(fmt.Errorf("voice-detect: dlopen %q: %w", libName, err))
+	}
+
+	// Bound 1:1 to voicedetect_capi.h. char*/float* returns are registered as
+	// uintptr so the raw pointer can be freed via the matching capi free fn.
+	for _, lf := range []LibFuncs{
+		{FuncPtr: &CppAbiVersion, Name: "voicedetect_capi_abi_version"},
+		{FuncPtr: &CppLoad, Name: "voicedetect_capi_load"},
+		{FuncPtr: &CppFree, Name: "voicedetect_capi_free"},
+		{FuncPtr: &CppLastError, Name: "voicedetect_capi_last_error"},
+		{FuncPtr: &CppFreeString, Name: "voicedetect_capi_free_string"},
+		{FuncPtr: &CppFreeVec, Name: "voicedetect_capi_free_vec"},
+		{FuncPtr: &CppEmbedPath, Name: "voicedetect_capi_embed_path"},
+		{FuncPtr: &CppEmbedPCM, Name: "voicedetect_capi_embed_pcm"},
+		{FuncPtr: &CppVerifyPaths, Name: "voicedetect_capi_verify_paths"},
+		{FuncPtr: &CppAnalyzeJSON, Name: "voicedetect_capi_analyze_path_json"},
+	} {
+		purego.RegisterLibFunc(lf.FuncPtr, lib, lf.Name)
+	}
+
+	fmt.Fprintf(os.Stderr, "[voice-detect] ABI=%d\n", CppAbiVersion())
+
+	flag.Parse()
+
+	if err := grpc.StartServer(*addr, &VoiceDetect{}); err != nil {
+		panic(err)
+	}
+}
diff --git a/backend/go/voice-detect/options.go b/backend/go/voice-detect/options.go
new file mode 100644
index 000000000..c5a6e2595
--- /dev/null
+++ b/backend/go/voice-detect/options.go
@@ -0,0 +1,46 @@
+package main
+
+import (
+	"strconv"
+	"strings"
+)
+
+// defaultVerifyThreshold is the cosine-distance cutoff used when a request does
+// not set one. Matches the Python speaker-recognition backend's default so the
+// two implementations agree on verdicts out of the box.
+const defaultVerifyThreshold float32 = 0.25
+
+// loadOptions is the parsed form of the model-level "key:value" options.
+type loadOptions struct {
+	verifyThreshold float32
+	modelName       string
+}
+
+// splitOption cuts o at its first ':' and trims both halves. ok is false when
+// o contains no ':' at all.
+func splitOption(o string) (key, value string, ok bool) {
+	rawKey, rawValue, found := strings.Cut(o, ":")
+	if !found {
+		return "", "", false
+	}
+	return strings.TrimSpace(rawKey), strings.TrimSpace(rawValue), true
+}
+
+// parseOptions folds the backend option slice into a loadOptions. Entries
+// without a ':' and unknown keys are skipped. verify_threshold (alias:
+// threshold) is only applied when it parses as a float greater than zero;
+// otherwise the 0.25 default stays. model_name is left empty here when unset.
+func parseOptions(opts []string) loadOptions {
+	parsed := loadOptions{verifyThreshold: defaultVerifyThreshold}
+	for _, entry := range opts {
+		key, value, ok := splitOption(entry)
+		if !ok {
+			continue
+		}
+		if key == "model_name" {
+			parsed.modelName = value
+			continue
+		}
+		if key != "verify_threshold" && key != "threshold" {
+			continue
+		}
+		if f, err := strconv.ParseFloat(value, 32); err == nil && f > 0 {
+			parsed.verifyThreshold = float32(f)
+		}
+	}
+	return parsed
+}
diff --git a/backend/go/voice-detect/package.sh b/backend/go/voice-detect/package.sh
new file mode 100755
index 000000000..de95c8ce2
--- /dev/null
+++ b/backend/go/voice-detect/package.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+#
+# Bundle the voice-detect-grpc binary, libvoicedetect.so, the core runtime libs
+# (libc/libstdc++/libgomp + ld.so) and the GPU runtime for the active BUILD_TYPE
+# so the package is self-contained. Mirrors backend/go/parakeet-cpp/package.sh;
+# run.sh routes the (CGO_ENABLED=0) binary through lib/ld.so so the packaged libc
+# is used instead of the host's.
+
+set -e
+
+CURDIR=$(dirname "$(realpath "$0")")
+REPO_ROOT="${CURDIR}/../../.."
+
+mkdir -p "$CURDIR/package/lib"
+
+cp -avf "$CURDIR/voice-detect-grpc" "$CURDIR/package/"
+cp -avf "$CURDIR/run.sh" "$CURDIR/package/"
+
+# libvoicedetect.so + any soname symlinks. purego.Dlopen resolves it via
+# LD_LIBRARY_PATH, which run.sh points at lib/.
+cp -avf "$CURDIR"/libvoicedetect.so* "$CURDIR/package/lib/" 2>/dev/null || {
+    echo "ERROR: libvoicedetect.so not found in $CURDIR, run 'make' first" >&2
+    exit 1
+}
+
+# bundle_runtime_libs LOADER LIBDIR
+# Copies LOADER to lib/ld.so and the fixed set of core runtime libraries from
+# LIBDIR into lib/, dereferencing symlinks (-L) so real files are packaged.
+bundle_runtime_libs() {
+    local loader="$1" libdir="$2" lib
+    cp -arfLv "$loader" "$CURDIR/package/lib/ld.so"
+    for lib in libc.so.6 libgcc_s.so.1 libstdc++.so.6 libm.so.6 \
+               libgomp.so.1 libdl.so.2 librt.so.1 libpthread.so.0; do
+        cp -arfLv "$libdir/$lib" "$CURDIR/package/lib/$lib"
+    done
+}
+
+# The architecture is inferred from which dynamic loader exists on the build
+# host. Darwin gets no bundled runtime libs; anything else is an error.
+if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
+    echo "Detected x86_64 architecture, copying x86_64 libraries..."
+    bundle_runtime_libs /lib64/ld-linux-x86-64.so.2 /lib/x86_64-linux-gnu
+elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
+    echo "Detected ARM64 architecture, copying ARM64 libraries..."
+    bundle_runtime_libs /lib/ld-linux-aarch64.so.1 /lib/aarch64-linux-gnu
+elif [ "$(uname -s)" = "Darwin" ]; then
+    echo "Detected Darwin"
+else
+    echo "Error: Could not detect architecture"
+    exit 1
+fi
+
+# Package GPU libraries (CUDA/ROCm/Intel/Vulkan loader + ICDs + drivers) based on
+# BUILD_TYPE so the backend can reach the GPU without the runtime base image
+# shipping those drivers.
+GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
+if [ -f "$GPU_LIB_SCRIPT" ]; then
+    echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
+    source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib"
+    package_gpu_libs
+fi
+
+echo "Packaging completed successfully"
+ls -liah "$CURDIR/package/" "$CURDIR/package/lib/"
diff --git a/backend/go/voice-detect/run.sh b/backend/go/voice-detect/run.sh
new file mode 100755
index 000000000..ea5fef508
--- /dev/null
+++ b/backend/go/voice-detect/run.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+set -e
+
+CURDIR=$(dirname "$(realpath "$0")")
+
+export LD_LIBRARY_PATH="$CURDIR/lib:$CURDIR:${LD_LIBRARY_PATH:-}"
+
+# If a self-contained ld.so was packaged, route through it so the packaged
+# libc / libstdc++ are used instead of the host's (matches the whisper /
+# parakeet backends' runtime layout).
+if [ -f "$CURDIR/lib/ld.so" ]; then
+    echo "Using lib/ld.so"
+    exec "$CURDIR/lib/ld.so" "$CURDIR/voice-detect-grpc" "$@"
+fi
+
+# Fallback when no lib/ld.so was packaged: exec the binary directly, passing
+# every argument through unchanged.
+exec "$CURDIR/voice-detect-grpc" "$@"
diff --git a/backend/go/voice-detect/test.sh b/backend/go/voice-detect/test.sh
new file mode 100755
index 000000000..17addfebf
--- /dev/null
+++ b/backend/go/voice-detect/test.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+set -e
+
+# Run from the script's own directory so `go test .` targets this package.
+CURDIR=$(dirname "$(realpath "$0")")
+cd "$CURDIR"
+
+echo "Running voice-detect backend tests..."
+
+# The pure-Go parsing specs always run. The embed/verify/analyze smoke specs run
+# only when a model + WAV are provided via VOICEDETECT_BACKEND_TEST_MODEL and
+# VOICEDETECT_BACKEND_TEST_WAV; otherwise they auto-skip.
+# This directory is prepended to LD_LIBRARY_PATH for the test run.
+LD_LIBRARY_PATH="$CURDIR:${LD_LIBRARY_PATH:-}" go test -v -timeout 1200s .
+
+echo "voice-detect tests completed."