feat(voice-detect): add Go purego backend for voice-detect.cpp

Add backend/go/voice-detect implementing the Backend gRPC voice subset (VoiceEmbed/VoiceVerify/VoiceAnalyze) over libvoicedetect.so via purego, mirroring the parakeet-cpp / omnivoice-cpp backends. The flat voicedetect_capi C ABI is dlopen'd cgo-less; malloc'd string and float-vector returns are owned by Go and released through the matching capi free functions, with the per-ctx last error surfaced into Go errors. Calls are serialized via base.SingleThread since the C context is not reentrant. Proto field mapping: - VoiceEmbed: VoiceEmbedRequest.audio (path) -> embed_path -> Embedding+Model. - VoiceVerify: audio1/audio2 + threshold (<=0 falls back to the verify_threshold option, default 0.25) -> verify_paths -> verified/distance/ threshold/confidence/model/processing_time_ms. - VoiceAnalyze: audio (path) -> analyze_path_json; the JSON age/gender/emotion document maps to a single VoiceAnalysis segment (start/end 0; gender "label" -> dominant_gender with the remaining float scores as the gender map; emotion label/scores -> dominant_emotion/emotion). The Makefile pins voice-detect.cpp to 47546430, clones+builds libvoicedetect.so with ggml static-linked (PIC, GGML_NATIVE off) so dlopen needs no external libggml/libvoicedetect; ldd on the artifact shows only system libs. Ginkgo tests cover option parsing and analyze-JSON mapping; embed/verify smoke specs gate on VOICEDETECT_BACKEND_TEST_MODEL + VOICEDETECT_BACKEND_TEST_WAV. Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:claude-opus-4-8 [Claude Code]
2026-06-22 07:39:02 -04:00 · 2026-06-22 00:00:32 +00:00
parent 600dafd20b
commit 01e098a844
9 changed files with 719 additions and 0 deletions
--- a/backend/go/voice-detect/.gitignore
+++ b/backend/go/voice-detect/.gitignore
@@ -0,0 +1,18 @@
+# Fetched upstream sources
+sources/
+
+# CMake build directories
+build*/
+
+# build artifacts staged in-tree by the Makefile (cp from sources/) or
+# symlinked for local dev; the real sources live in voice-detect.cpp upstream.
+*.so
+*.so.*
+voicedetect_capi.h
+compile_commands.json
+
+# Compiled backend binary
+voice-detect-grpc
+
+# Packaging output
+package/
--- a/backend/go/voice-detect/Makefile
+++ b/backend/go/voice-detect/Makefile
@@ -0,0 +1,92 @@
+# voice-detect backend Makefile.
+#
+# Upstream pin lives below as VOICEDETECT_VERSION?=4754643... (.github/bump_deps.sh
+# can find and update it - matches the parakeet.cpp / whisper.cpp / ds4 convention).
+#
+# Local dev shortcut: if you already have an out-of-tree voice-detect.cpp build,
+# symlink the .so + header into this directory and skip the clone/cmake steps:
+#
+#   ln -sf /path/to/voice-detect.cpp/build-shared/libvoicedetect.so .
+#   ln -sf /path/to/voice-detect.cpp/include/voicedetect_capi.h .
+#   go build -o voice-detect-grpc .
+#
+# The default target below does the proper clone-at-pin + cmake build so CI does
+# not need a side-checkout.
+
+# Upstream source pin. Keep the VOICEDETECT_VERSION?= line in this exact shape:
+# per the header of this Makefile, .github/bump_deps.sh looks it up to bump it.
+VOICEDETECT_VERSION?=47546430ab0f23713c2f990e43035fb1106a9c74
+VOICEDETECT_REPO?=https://github.com/mudler/voice-detect.cpp
+
+# Build flavour knobs. An empty BUILD_TYPE adds no accelerator flags below;
+# NATIVE=false switches GGML_NATIVE off.
+BUILD_TYPE?=
+NATIVE?=false
+
+# Go toolchain settings and the cmake job count (nproc, then sysctl hw.ncpu,
+# then a fixed fallback of 4).
+GOCMD?=go
+GO_TAGS?=
+JOBS?=$(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)
+
+# ggml is linked statically (and built PIC) into libvoicedetect.so, which makes
+# the shared library self-contained: dlopen needs no libggml*.so next to it,
+# only the system libs (libstdc++/libgomp/libc) the runtime image already has.
+CMAKE_ARGS?=-DCMAKE_BUILD_TYPE=Release -DVOICEDETECT_SHARED=ON -DVOICEDETECT_BUILD_CLI=OFF -DVOICEDETECT_BUILD_TESTS=OFF -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+
+ifeq ($(NATIVE),false)
+  CMAKE_ARGS+=-DGGML_NATIVE=OFF
+endif
+
+# Accelerator selection. voice-detect.cpp hides its GGML backends behind
+# VOICEDETECT_GGML_* options and then runs
+# set(GGML_CUDA ${VOICEDETECT_GGML_CUDA} CACHE BOOL "" FORCE), which resets a
+# plain -DGGML_CUDA=ON back to OFF, so the VOICEDETECT_GGML_* spelling is
+# forwarded instead. openblas has no such gate, hence -DGGML_BLAS goes through
+# unchanged.
+ifeq ($(BUILD_TYPE),cublas)
+  CMAKE_ARGS+=-DVOICEDETECT_GGML_CUDA=ON
+else ifeq ($(BUILD_TYPE),openblas)
+  CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
+else ifeq ($(BUILD_TYPE),hipblas)
+  CMAKE_ARGS+=-DVOICEDETECT_GGML_HIP=ON
+else ifeq ($(BUILD_TYPE),vulkan)
+  CMAKE_ARGS+=-DVOICEDETECT_GGML_VULKAN=ON
+endif
+
+# Every name here is treated as a command rather than a file. That includes
+# voice-detect-grpc, so its recipe (go build) runs on every invocation.
+.PHONY: all build clean package purge test voice-detect-grpc
+
+# Default goal: build the backend binary (and, through its prerequisites, the
+# shared library).
+all: voice-detect-grpc
+
+# Clone the upstream voice-detect.cpp source at the pinned commit. The directory
+# itself is the target, so make only re-clones when it is missing. After a
+# VOICEDETECT_VERSION bump, run 'make purge && make' to refetch.
+#
+# The directory has to exist before git can run inside it, which means a failed
+# fetch/checkout would otherwise leave an empty or half-populated directory that
+# make considers up to date forever. The clone therefore runs in a subshell and
+# the directory is removed again when any step fails, so the next 'make' retries.
+sources/voice-detect.cpp:
+	mkdir -p sources/voice-detect.cpp
+	( cd sources/voice-detect.cpp && \
+	  git init -q && \
+	  git remote add origin $(VOICEDETECT_REPO) && \
+	  git fetch --depth 1 origin $(VOICEDETECT_VERSION) && \
+	  git checkout FETCH_HEAD && \
+	  git submodule update --init --recursive --depth 1 --single-branch ) || \
+	{ rm -rf sources/voice-detect.cpp; exit 1; }
+
+# Build the shared lib + header out-of-tree, then stage them next to the Go
+# sources so purego.Dlopen("libvoicedetect.so") and the cgo-less build both pick
+# them up.
+#
+# sources/voice-detect.cpp is deliberately a normal (not order-only)
+# prerequisite: after 'make purge' the re-cloned directory is newer than a
+# previously staged libvoicedetect.so, which is what forces the rebuild.
+#
+# The staging copy must succeed: this rule's target IS the staged
+# libvoicedetect.so, so a failed copy is reported here instead of surfacing
+# later as a missing library at package or dlopen time.
+libvoicedetect.so: sources/voice-detect.cpp
+	cmake -B sources/voice-detect.cpp/build-shared -S sources/voice-detect.cpp $(CMAKE_ARGS)
+	cmake --build sources/voice-detect.cpp/build-shared --config Release -j$(JOBS) --target voicedetect
+	cp -fv sources/voice-detect.cpp/build-shared/libvoicedetect.so* ./
+	cp -fv sources/voice-detect.cpp/include/voicedetect_capi.h ./
+
+# Backend binary. Built with cgo disabled; the shared library is listed as a
+# prerequisite so it is staged first, it is not linked into the binary.
+voice-detect-grpc: libvoicedetect.so main.go govoicedetect.go options.go
+	CGO_ENABLED=0 $(GOCMD) build -tags "$(GO_TAGS)" -o $@ .
+
+# Assemble the package/ directory from the built binary via package.sh.
+package: voice-detect-grpc
+	bash package.sh
+
+# Alias for 'package'.
+build: package
+
+# Test target. The embed/verify smoke specs are gated on
+# VOICEDETECT_BACKEND_TEST_MODEL + VOICEDETECT_BACKEND_TEST_WAV; without them the
+# heavy specs auto-skip and only the pure-Go parsing specs (option parsing and
+# analyze-JSON mapping) run. $(CURDIR) is prepended to LD_LIBRARY_PATH so a
+# libvoicedetect.so staged in this directory can be found by the smoke specs.
+test:
+	LD_LIBRARY_PATH=$(CURDIR):$$LD_LIBRARY_PATH $(GOCMD) test ./... -count=1
+
+# purge removes only the fetched upstream checkout (which also holds the cmake
+# build directory), leaving the staged artifacts in place.
+purge:
+	rm -rf sources/voice-detect.cpp
+
+# clean runs purge first and then removes everything staged or built in-tree:
+# the shared library, the copied header, the package directory and the binary.
+clean: purge
+	rm -rf libvoicedetect.so* voicedetect_capi.h package voice-detect-grpc
--- a/backend/go/voice-detect/govoicedetect.go
+++ b/backend/go/voice-detect/govoicedetect.go
@@ -0,0 +1,257 @@
+package main
+
+import (
+	"encoding/json"
+	"errors"
+	"fmt"
+	"math"
+	"path/filepath"
+	"strings"
+	"time"
+	"unsafe"
+
+	"github.com/mudler/LocalAI/pkg/grpc/base"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	"github.com/mudler/xlog"
+)
+
+// Function variables that purego binds to the voicedetect_capi_* symbols of
+// libvoicedetect.so. The Go names track voicedetect_capi.h one-to-one, so
+// `nm libvoicedetect.so | grep voicedetect_capi` is enough to spot drift.
+//
+// The opaque ctx and every malloc'd char*/float* result is typed uintptr: that
+// hands back the raw pointer, which is then released through the matching capi
+// free function. Letting purego convert to string/[]float32 would copy the data
+// and drop the original pointer, leaking the C-owned buffer on each call.
+var (
+	// Library and context lifecycle.
+	CppAbiVersion func() int32
+	CppLoad       func(ggufPath string) uintptr
+	CppFree       func(ctx uintptr)
+	CppLastError  func(ctx uintptr) string
+
+	// Releasers for buffers returned by the inference entry points.
+	CppFreeString func(s uintptr)
+	CppFreeVec    func(v uintptr)
+
+	// Inference entry points. The out* parameters are pointers to Go variables
+	// that the C side fills in.
+	CppEmbedPath   func(ctx uintptr, wavPath string, outVec, outDim unsafe.Pointer) int32
+	CppEmbedPCM    func(ctx uintptr, pcm []float32, nSamples, sampleRate int32, outVec, outDim unsafe.Pointer) int32
+	CppVerifyPaths func(ctx uintptr, a, b string, threshold float32, outDistance, outVerified unsafe.Pointer) int32
+	CppAnalyzeJSON func(ctx uintptr, wavPath string) uintptr
+)
+
+// VoiceDetect implements the speaker-recognition voice subset of the Backend
+// gRPC service over libvoicedetect.so. The C side keeps a single loaded model
+// plus a per-ctx last-error buffer and is not reentrant, so base.SingleThread
+// serializes every call.
+type VoiceDetect struct {
+	base.SingleThread
+	// opts holds the model-level options parsed from ModelOptions.Options in Load.
+	opts   loadOptions
+	// ctxPtr is the opaque handle returned by CppLoad; 0 means no model is loaded.
+	ctxPtr uintptr
+}
+
+// Load resolves the model location from opts, parses the backend options and
+// opens a native context for the model via voicedetect_capi_load.
+//
+// Path resolution:
+//   - a relative ModelFile is joined onto ModelPath (when ModelPath is set);
+//   - an absolute ModelFile is used as-is;
+//   - with no ModelFile, ModelPath itself is the model location. It is NOT
+//     joined onto itself, so a relative ModelPath such as "models" stays
+//     "models" instead of becoming "models/models".
+//
+// State is committed only after the native load succeeds: a failed Load leaves
+// the previously loaded model and its options untouched, and a successful
+// re-Load frees the previous context so it does not leak.
+//
+// Returns an error when no model location is given or the native load fails.
+func (v *VoiceDetect) Load(opts *pb.ModelOptions) error {
+	model := opts.ModelFile
+	switch {
+	case model == "":
+		model = opts.ModelPath
+	case !filepath.IsAbs(model) && opts.ModelPath != "":
+		model = filepath.Join(opts.ModelPath, model)
+	}
+	if model == "" {
+		return errors.New("voice-detect: ModelFile is required")
+	}
+
+	parsed := parseOptions(opts.Options)
+	if parsed.modelName == "" {
+		// No explicit model_name option: report the file name as the model.
+		parsed.modelName = filepath.Base(model)
+	}
+
+	xlog.Info("voice-detect: loading model", "model", model,
+		"verify_threshold", parsed.verifyThreshold, "abi", CppAbiVersion())
+
+	ctx := CppLoad(model)
+	if ctx == 0 {
+		// The last-error buffer lives on the ctx that was never returned, so
+		// surface the path the operator tried to load instead.
+		return fmt.Errorf("voice-detect: voicedetect_capi_load failed for %q", model)
+	}
+
+	if v.ctxPtr != 0 {
+		// Re-load: release the context this instance was holding before.
+		CppFree(v.ctxPtr)
+	}
+	v.opts = parsed
+	v.ctxPtr = ctx
+	return nil
+}
+
+// VoiceEmbed computes the L2-normalized speaker embedding of one audio clip.
+// req.Audio is a filesystem PATH: base64/URL/data-URI inputs are written to a
+// temp file by the HTTP layer before the gRPC call reaches this backend.
+//
+// It fails when no model is loaded, when the path is empty, or when the native
+// embed call reports an error.
+func (v *VoiceDetect) VoiceEmbed(req *pb.VoiceEmbedRequest) (pb.VoiceEmbedResponse, error) {
+	switch {
+	case v.ctxPtr == 0:
+		return pb.VoiceEmbedResponse{}, errors.New("voice-detect: model not loaded")
+	case req.Audio == "":
+		return pb.VoiceEmbedResponse{}, errors.New("voice-detect: audio path is required")
+	}
+
+	vector, embedErr := v.embedPath(req.Audio)
+	if embedErr != nil {
+		return pb.VoiceEmbedResponse{}, embedErr
+	}
+
+	return pb.VoiceEmbedResponse{
+		Embedding: vector,
+		Model:     v.opts.modelName,
+	}, nil
+}
+
+// embedPath runs voicedetect_capi_embed_path on path and returns a Go-owned
+// copy of the resulting embedding.
+//
+// The C side returns the vector through the vec/dim out-parameters as a
+// malloc'd buffer that the caller must release with CppFreeVec. A non-zero
+// return code, a null vector or a non-positive dimension is reported as an
+// error built from the ctx's last-error buffer.
+func (v *VoiceDetect) embedPath(path string) ([]float32, error) {
+	var vec uintptr
+	var dim int32
+	rc := CppEmbedPath(v.ctxPtr, path, unsafe.Pointer(&vec), unsafe.Pointer(&dim))
+	if rc != 0 || vec == 0 || dim <= 0 {
+		err := v.lastErr("embed", path)
+		// A successful call that still handed back a buffer (with an unusable
+		// dimension) has transferred ownership to us: free it so it does not
+		// leak. On rc != 0 the out-pointer is left alone, since the header does
+		// not tell us here whether it is valid after a failure.
+		if rc == 0 && vec != 0 {
+			CppFreeVec(vec)
+		}
+		return nil, err
+	}
+	defer CppFreeVec(vec)
+	// Copy out of the C-owned malloc'd buffer before freeing it. The
+	// uintptr->Pointer conversion trips vet's unsafeptr check, which can't tell
+	// a C heap pointer from Go-managed memory; safe here, the GC neither tracks
+	// nor moves this buffer and we copy immediately.
+	src := unsafe.Slice((*float32)(unsafe.Pointer(vec)), int(dim)) //nolint:govet // C-owned malloc'd vector, copied out before free
+	out := make([]float32, int(dim))
+	copy(out, src)
+	return out, nil
+}
+
+// VoiceVerify compares two clips and reports whether they belong to the same
+// speaker, judged by cosine distance against a threshold. When the request
+// threshold is <= 0 the model-level default applies (the verify_threshold
+// option, 0.25 if unset).
+//
+// It fails when no model is loaded, when either path is empty, or when the
+// native verify call returns a non-zero code.
+func (v *VoiceDetect) VoiceVerify(req *pb.VoiceVerifyRequest) (pb.VoiceVerifyResponse, error) {
+	switch {
+	case v.ctxPtr == 0:
+		return pb.VoiceVerifyResponse{}, errors.New("voice-detect: model not loaded")
+	case req.Audio1 == "" || req.Audio2 == "":
+		return pb.VoiceVerifyResponse{}, errors.New("voice-detect: audio1 and audio2 are required")
+	}
+
+	cutoff := req.Threshold
+	if cutoff <= 0 {
+		cutoff = v.opts.verifyThreshold
+	}
+
+	begin := time.Now()
+	var (
+		dist float32
+		same int32
+	)
+	status := CppVerifyPaths(v.ctxPtr, req.Audio1, req.Audio2, cutoff,
+		unsafe.Pointer(&dist), unsafe.Pointer(&same))
+	if status != 0 {
+		return pb.VoiceVerifyResponse{}, v.lastErr("verify", req.Audio1+","+req.Audio2)
+	}
+	tookMs := float32(time.Since(begin).Seconds() * 1000.0)
+
+	// Confidence falls linearly from 100 (distance 0) to 0 (distance at the
+	// threshold) and is clamped to [0, 100], matching what the Python
+	// speaker-recognition backend reports.
+	var confidence float32
+	if cutoff > 0 {
+		pct := (1.0 - float64(dist)/float64(cutoff)) * 100.0
+		pct = math.Min(100, pct)
+		pct = math.Max(0, pct)
+		confidence = float32(pct)
+	}
+
+	return pb.VoiceVerifyResponse{
+		Verified:         same != 0,
+		Distance:         dist,
+		Threshold:        cutoff,
+		Confidence:       confidence,
+		Model:            v.opts.modelName,
+		ProcessingTimeMs: tookMs,
+	}, nil
+}
+
+// VoiceAnalyze evaluates the age/gender/emotion heads on one clip. The C-API
+// always runs every supported head, so the request's actions filter is only
+// advisory; the complete analysis comes back as a single segment because the
+// engine does not emit time-bounded segments.
+//
+// It fails when no model is loaded, when the path is empty, when the native
+// call returns a null document, or when that document is not valid JSON.
+func (v *VoiceDetect) VoiceAnalyze(req *pb.VoiceAnalyzeRequest) (pb.VoiceAnalyzeResponse, error) {
+	switch {
+	case v.ctxPtr == 0:
+		return pb.VoiceAnalyzeResponse{}, errors.New("voice-detect: model not loaded")
+	case req.Audio == "":
+		return pb.VoiceAnalyzeResponse{}, errors.New("voice-detect: audio path is required")
+	}
+
+	raw := CppAnalyzeJSON(v.ctxPtr, req.Audio)
+	if raw == 0 {
+		return pb.VoiceAnalyzeResponse{}, v.lastErr("analyze", req.Audio)
+	}
+	// Copy the document into Go memory, then hand the malloc'd buffer back to
+	// the C side right away; parsing only touches the Go copy.
+	document := goStringFromCPtr(raw)
+	CppFreeString(raw)
+
+	segment, parseErr := parseAnalyzeJSON(document)
+	if parseErr != nil {
+		return pb.VoiceAnalyzeResponse{}, fmt.Errorf("voice-detect: analyze JSON for %q: %w", req.Audio, parseErr)
+	}
+	return pb.VoiceAnalyzeResponse{Segments: []*pb.VoiceAnalysis{segment}}, nil
+}
+
+// analyzeJSON mirrors the document returned by voicedetect_capi_analyze_path_json:
+//
+//	{"age":42.0,
+//	 "gender":{"label":"female","female":0.88,"male":0.12},
+//	 "emotion":{"label":"neutral","scores":{"neutral":0.7, ...}}}
+//
+// gender is a mixed object (a "label" string plus per-class float scores), so
+// it is decoded into raw messages and split in parseAnalyzeJSON.
+type analyzeJSON struct {
+	// Age is the top-level "age" number.
+	Age     float32                    `json:"age"`
+	// Gender keeps each entry undecoded: "label" is a string, the rest are floats.
+	Gender  map[string]json.RawMessage `json:"gender"`
+	// Emotion carries the dominant label and the per-class scores.
+	Emotion struct {
+		Label  string             `json:"label"`
+		Scores map[string]float32 `json:"scores"`
+	} `json:"emotion"`
+}
+
+// parseAnalyzeJSON converts the engine's analyze document into a VoiceAnalysis.
+// Start/End are left at 0 because the model produces one whole-utterance
+// result rather than time-bounded segments.
+//
+// Within the gender object the "label" entry becomes DominantGender and every
+// other entry that decodes as a number goes into the Gender score map; entries
+// that do not decode are skipped. A missing or empty gender object leaves both
+// fields unset. An error is returned only when doc itself is not valid JSON
+// for the expected shape.
+func parseAnalyzeJSON(doc string) (*pb.VoiceAnalysis, error) {
+	var parsed analyzeJSON
+	if err := json.Unmarshal([]byte(doc), &parsed); err != nil {
+		return nil, err
+	}
+
+	result := &pb.VoiceAnalysis{}
+	result.Age = parsed.Age
+	result.DominantEmotion = parsed.Emotion.Label
+	result.Emotion = parsed.Emotion.Scores
+
+	if len(parsed.Gender) == 0 {
+		return result, nil
+	}
+
+	scores := make(map[string]float32, len(parsed.Gender))
+	for name, rawValue := range parsed.Gender {
+		if name != "label" {
+			var value float32
+			if json.Unmarshal(rawValue, &value) == nil {
+				scores[name] = value
+			}
+			continue
+		}
+		// Best effort: a label that is not a JSON string leaves DominantGender empty.
+		_ = json.Unmarshal(rawValue, &result.DominantGender)
+	}
+	result.Gender = scores
+
+	return result, nil
+}
+
+// lastErr builds a Go error for a failed native operation from the C-API's
+// per-ctx last-error buffer. op names the operation, subject the input it ran
+// on; a blank buffer is reported as "no error detail".
+func (v *VoiceDetect) lastErr(op, subject string) error {
+	detail := CppLastError(v.ctxPtr)
+	if detail = strings.TrimSpace(detail); len(detail) == 0 {
+		detail = "no error detail"
+	}
+	return fmt.Errorf("voice-detect: %s failed for %q: %s", op, subject, detail)
+}
+
+// goStringFromCPtr returns a Go-owned copy of the NUL-terminated C string at
+// cptr, or "" for a null pointer. The buffer stays owned by the caller, who
+// releases it with CppFreeString once the copy has been taken.
+//
+// vet's unsafeptr check flags the uintptr->Pointer conversion because it cannot
+// distinguish a C heap pointer from Go-managed memory. It is safe here: the GC
+// neither tracks nor moves the buffer, and the bytes are read out immediately.
+func goStringFromCPtr(cptr uintptr) string {
+	if cptr == 0 {
+		return ""
+	}
+	base := unsafe.Pointer(cptr) //nolint:govet // C-owned malloc'd buffer, not Go-GC memory (see doc above)
+	length := 0
+	for {
+		if *(*byte)(unsafe.Add(base, length)) == 0 {
+			break
+		}
+		length++
+	}
+	// string(...) copies the bytes, so the result does not alias the C buffer.
+	return string(unsafe.Slice((*byte)(base), length))
+}
--- a/backend/go/voice-detect/govoicedetect_test.go
+++ b/backend/go/voice-detect/govoicedetect_test.go
@@ -0,0 +1,144 @@
+package main
+
+import (
+	"os"
+	"sync"
+	"testing"
+
+	"github.com/ebitengine/purego"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// TestVoiceDetect is the `go test` entry point: it routes Gomega assertion
+// failures to Ginkgo and runs every spec registered in this package.
+func TestVoiceDetect(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "voice-detect Backend Suite")
+}
+
+var (
+	libLoadErr  error
+	libLoadOnce sync.Once
+)
+
+// ensureLibLoaded performs the same bootstrap as main.go (dlopen plus binding
+// of the Cpp* function variables) so a Go test can drive the C-API bridge
+// without starting the gRPC server. The work happens at most once; the dlopen
+// error, if any, is remembered and returned on every call so the smoke specs
+// can skip themselves when the library (VOICEDETECT_LIBRARY, or
+// libvoicedetect.so resolved via LD_LIBRARY_PATH / a symlink in ./) cannot be
+// loaded.
+func ensureLibLoaded() error {
+	libLoadOnce.Do(func() {
+		path := "libvoicedetect.so"
+		if override := os.Getenv("VOICEDETECT_LIBRARY"); override != "" {
+			path = override
+		}
+
+		handle, err := purego.Dlopen(path, purego.RTLD_NOW|purego.RTLD_GLOBAL)
+		if err != nil {
+			libLoadErr = err
+			return
+		}
+
+		bindings := []struct {
+			target any
+			symbol string
+		}{
+			{&CppAbiVersion, "voicedetect_capi_abi_version"},
+			{&CppLoad, "voicedetect_capi_load"},
+			{&CppFree, "voicedetect_capi_free"},
+			{&CppLastError, "voicedetect_capi_last_error"},
+			{&CppFreeString, "voicedetect_capi_free_string"},
+			{&CppFreeVec, "voicedetect_capi_free_vec"},
+			{&CppEmbedPath, "voicedetect_capi_embed_path"},
+			{&CppEmbedPCM, "voicedetect_capi_embed_pcm"},
+			{&CppVerifyPaths, "voicedetect_capi_verify_paths"},
+			{&CppAnalyzeJSON, "voicedetect_capi_analyze_path_json"},
+		}
+		for _, b := range bindings {
+			purego.RegisterLibFunc(b.target, handle, b.symbol)
+		}
+	})
+	return libLoadErr
+}
+
+// Pure-Go specs for parseOptions: defaults, the accepted keys (including the
+// "threshold" alias) and rejection of non-positive thresholds. They need
+// neither the shared library nor a model.
+var _ = Describe("parseOptions", func() {
+	It("defaults verify_threshold to 0.25", func() {
+		o := parseOptions(nil)
+		Expect(o.verifyThreshold).To(Equal(float32(0.25)))
+		Expect(o.modelName).To(Equal(""))
+	})
+
+	It("parses verify_threshold, threshold alias and model_name", func() {
+		// "unknown:x" checks that unrecognized keys are ignored.
+		o := parseOptions([]string{"verify_threshold:0.4", "model_name:ecapa", "unknown:x"})
+		Expect(o.verifyThreshold).To(Equal(float32(0.4)))
+		Expect(o.modelName).To(Equal("ecapa"))
+
+		o2 := parseOptions([]string{"threshold:0.3"})
+		Expect(o2.verifyThreshold).To(Equal(float32(0.3)))
+	})
+
+	It("ignores non-positive thresholds and keeps the default", func() {
+		o := parseOptions([]string{"verify_threshold:0", "threshold:-1"})
+		Expect(o.verifyThreshold).To(Equal(float32(0.25)))
+	})
+})
+
+// Pure-Go specs for parseAnalyzeJSON: the full document mapping, a document
+// without a gender block, and malformed input. They need neither the shared
+// library nor a model.
+var _ = Describe("parseAnalyzeJSON", func() {
+	It("maps age, gender label+scores and emotion label+scores", func() {
+		doc := `{"age":42.0,
+			"gender":{"label":"female","female":0.88,"male":0.12},
+			"emotion":{"label":"neutral","scores":{"neutral":0.7,"happy":0.2,"sad":0.1}}}`
+		seg, err := parseAnalyzeJSON(doc)
+		Expect(err).ToNot(HaveOccurred())
+		Expect(seg.Age).To(BeNumerically("~", 42.0, 1e-4))
+		// Start/End stay at their zero value: the result covers the whole clip.
+		Expect(seg.Start).To(Equal(float32(0)))
+		Expect(seg.End).To(Equal(float32(0)))
+
+		Expect(seg.DominantGender).To(Equal("female"))
+		Expect(seg.Gender).To(HaveKeyWithValue("female", BeNumerically("~", 0.88, 1e-4)))
+		Expect(seg.Gender).To(HaveKeyWithValue("male", BeNumerically("~", 0.12, 1e-4)))
+		// The "label" entry is consumed into DominantGender, not the score map.
+		Expect(seg.Gender).ToNot(HaveKey("label"))
+
+		Expect(seg.DominantEmotion).To(Equal("neutral"))
+		Expect(seg.Emotion).To(HaveKeyWithValue("neutral", BeNumerically("~", 0.7, 1e-4)))
+		Expect(seg.Emotion).To(HaveKeyWithValue("happy", BeNumerically("~", 0.2, 1e-4)))
+	})
+
+	It("tolerates a missing gender block", func() {
+		seg, err := parseAnalyzeJSON(`{"age":30.0,"emotion":{"label":"happy","scores":{"happy":1.0}}}`)
+		Expect(err).ToNot(HaveOccurred())
+		Expect(seg.DominantGender).To(Equal(""))
+		Expect(seg.DominantEmotion).To(Equal("happy"))
+	})
+
+	It("returns an error on malformed JSON", func() {
+		_, err := parseAnalyzeJSON(`{not-json`)
+		Expect(err).To(HaveOccurred())
+	})
+})
+
+// The specs below exercise the real C-API end to end. They run only when both a
+// model GGUF and a test WAV are provided, and skip cleanly otherwise so the
+// suite stays green without large assets.
+//
+// The container is Ordered so the model is loaded once in BeforeAll and shared
+// by the specs; AfterAll releases the native context again.
+var _ = Describe("VoiceDetect end-to-end", Ordered, func() {
+	var (
+		v         *VoiceDetect
+		modelPath = os.Getenv("VOICEDETECT_BACKEND_TEST_MODEL")
+		wavPath   = os.Getenv("VOICEDETECT_BACKEND_TEST_WAV")
+	)
+
+	BeforeAll(func() {
+		if modelPath == "" || wavPath == "" {
+			Skip("set VOICEDETECT_BACKEND_TEST_MODEL and VOICEDETECT_BACKEND_TEST_WAV to run the e2e specs")
+		}
+		if err := ensureLibLoaded(); err != nil {
+			Skip("libvoicedetect.so not loadable: " + err.Error())
+		}
+		v = &VoiceDetect{}
+		Expect(v.Load(&pb.ModelOptions{ModelFile: modelPath})).To(Succeed())
+	})
+
+	AfterAll(func() {
+		// v stays nil when BeforeAll skipped and ctxPtr stays 0 when Load
+		// failed; only a context that was actually opened is freed.
+		if v != nil && v.ctxPtr != 0 {
+			CppFree(v.ctxPtr)
+			v.ctxPtr = 0
+		}
+	})
+
+	It("embeds an audio clip", func() {
+		resp, err := v.VoiceEmbed(&pb.VoiceEmbedRequest{Audio: wavPath})
+		Expect(err).ToNot(HaveOccurred())
+		Expect(resp.Embedding).ToNot(BeEmpty())
+		Expect(resp.Model).ToNot(BeEmpty())
+	})
+
+	It("verifies a clip against itself as the same speaker", func() {
+		resp, err := v.VoiceVerify(&pb.VoiceVerifyRequest{Audio1: wavPath, Audio2: wavPath})
+		Expect(err).ToNot(HaveOccurred())
+		Expect(resp.Verified).To(BeTrue())
+		Expect(resp.Distance).To(BeNumerically("<=", resp.Threshold))
+	})
+})
--- a/backend/go/voice-detect/main.go
+++ b/backend/go/voice-detect/main.go
@@ -0,0 +1,64 @@
+package main
+
+// Started internally by LocalAI - one gRPC server per loaded model.
+//
+// Loads libvoicedetect.so via purego and registers the flat C-API entry points
+// declared in voicedetect_capi.h. The library name can be overridden with
+// VOICEDETECT_LIBRARY (mirrors the PARAKEET_LIBRARY / OMNIVOICE_LIBRARY
+// convention in the sibling backends); the default looks for the .so next to
+// this binary (resolved via LD_LIBRARY_PATH by run.sh).
+import (
+	"flag"
+	"fmt"
+	"os"
+
+	"github.com/ebitengine/purego"
+	grpc "github.com/mudler/LocalAI/pkg/grpc"
+)
+
+var (
+	// addr is the -addr flag value that main passes to grpc.StartServer.
+	addr = flag.String("addr", "localhost:50051", "the address to connect to")
+)
+
+// LibFuncs pairs a pointer to one of the Cpp* function variables with the
+// exported symbol name that purego.RegisterLibFunc binds to it.
+type LibFuncs struct {
+	FuncPtr any
+	Name    string
+}
+
+// main opens the shared library (VOICEDETECT_LIBRARY, or libvoicedetect.so by
+// default), binds every C-API entry point, logs the ABI version to stderr and
+// then serves the VoiceDetect backend over gRPC on -addr. Any failure to open
+// the library or start the server panics.
+func main() {
+	libPath := "libvoicedetect.so"
+	if override := os.Getenv("VOICEDETECT_LIBRARY"); override != "" {
+		libPath = override
+	}
+
+	handle, err := purego.Dlopen(libPath, purego.RTLD_NOW|purego.RTLD_GLOBAL)
+	if err != nil {
+		panic(fmt.Errorf("voice-detect: dlopen %q: %w", libPath, err))
+	}
+
+	// Bound 1:1 to voicedetect_capi.h. char*/float* returns are registered as
+	// uintptr so the raw pointer can be freed via the matching capi free fn.
+	for _, binding := range []LibFuncs{
+		{FuncPtr: &CppAbiVersion, Name: "voicedetect_capi_abi_version"},
+		{FuncPtr: &CppLoad, Name: "voicedetect_capi_load"},
+		{FuncPtr: &CppFree, Name: "voicedetect_capi_free"},
+		{FuncPtr: &CppLastError, Name: "voicedetect_capi_last_error"},
+		{FuncPtr: &CppFreeString, Name: "voicedetect_capi_free_string"},
+		{FuncPtr: &CppFreeVec, Name: "voicedetect_capi_free_vec"},
+		{FuncPtr: &CppEmbedPath, Name: "voicedetect_capi_embed_path"},
+		{FuncPtr: &CppEmbedPCM, Name: "voicedetect_capi_embed_pcm"},
+		{FuncPtr: &CppVerifyPaths, Name: "voicedetect_capi_verify_paths"},
+		{FuncPtr: &CppAnalyzeJSON, Name: "voicedetect_capi_analyze_path_json"},
+	} {
+		purego.RegisterLibFunc(binding.FuncPtr, handle, binding.Name)
+	}
+
+	fmt.Fprintf(os.Stderr, "[voice-detect] ABI=%d\n", CppAbiVersion())
+
+	flag.Parse()
+
+	err = grpc.StartServer(*addr, &VoiceDetect{})
+	if err != nil {
+		panic(err)
+	}
+}
--- a/backend/go/voice-detect/options.go
+++ b/backend/go/voice-detect/options.go
@@ -0,0 +1,46 @@
+package main
+
+import (
+	"strconv"
+	"strings"
+)
+
+// defaultVerifyThreshold is the cosine-distance cutoff used when a request does
+// not set one. Matches the Python speaker-recognition backend's default so the
+// two implementations agree on verdicts out of the box.
+const defaultVerifyThreshold float32 = 0.25
+
+// loadOptions holds the parsed model-level options for voice-detect.
+type loadOptions struct {
+	// verifyThreshold is set by the "verify_threshold" / "threshold" option;
+	// parseOptions starts it at defaultVerifyThreshold.
+	verifyThreshold float32
+	// modelName is set by the "model_name" option and is empty when not given.
+	modelName       string
+}
+
+// splitOption splits one "key:value" option at its FIRST colon and trims
+// surrounding whitespace from both halves. ok is false (with empty key and
+// value) when o contains no colon at all.
+func splitOption(o string) (key, value string, ok bool) {
+	rawKey, rawValue, found := strings.Cut(o, ":")
+	if !found {
+		return "", "", false
+	}
+	key = strings.TrimSpace(rawKey)
+	value = strings.TrimSpace(rawValue)
+	return key, value, true
+}
+
+// parseOptions turns the backend's "key:value" option slice into loadOptions.
+// Entries without a colon and unknown keys are skipped. Defaults:
+// verify_threshold 0.25, model_name empty (the caller derives it from the file).
+//
+// "verify_threshold" and its alias "threshold" only take effect when the value
+// parses as a float and is strictly positive; anything else keeps the value
+// seen so far. Later entries override earlier ones.
+func parseOptions(opts []string) loadOptions {
+	parsed := loadOptions{verifyThreshold: defaultVerifyThreshold}
+	for _, entry := range opts {
+		name, raw, found := splitOption(entry)
+		if !found {
+			continue
+		}
+		if name == "model_name" {
+			parsed.modelName = raw
+			continue
+		}
+		if name != "verify_threshold" && name != "threshold" {
+			continue
+		}
+		threshold, err := strconv.ParseFloat(raw, 32)
+		// Written as !(x > 0) rather than x <= 0 so NaN is rejected as well.
+		if err != nil || !(threshold > 0) {
+			continue
+		}
+		parsed.verifyThreshold = float32(threshold)
+	}
+	return parsed
+}
--- a/backend/go/voice-detect/package.sh
+++ b/backend/go/voice-detect/package.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+#
+# Bundle the voice-detect-grpc binary, libvoicedetect.so, the core runtime libs
+# (libc/libstdc++/libgomp + ld.so) and the GPU runtime for the active BUILD_TYPE
+# so the package is self-contained. Mirrors backend/go/parakeet-cpp/package.sh;
+# run.sh routes the (CGO_ENABLED=0) binary through lib/ld.so so the packaged libc
+# is used instead of the host's.
+
+# Abort on the first failing command.
+set -e
+
+# CURDIR is the directory holding this script; REPO_ROOT is three levels up.
+CURDIR=$(dirname "$(realpath "$0")")
+REPO_ROOT="${CURDIR}/../../.."
+
+mkdir -p "$CURDIR/package/lib"
+
+# The backend binary and its launcher go to the package root.
+cp -avf "$CURDIR/voice-detect-grpc" "$CURDIR/package/"
+cp -avf "$CURDIR/run.sh" "$CURDIR/package/"
+
+# libvoicedetect.so + any soname symlinks. purego.Dlopen resolves it via
+# LD_LIBRARY_PATH, which run.sh points at lib/.
+# cp's own stderr is silenced; any failure is reported by the message below.
+cp -avf "$CURDIR"/libvoicedetect.so* "$CURDIR/package/lib/" 2>/dev/null || {
+	echo "ERROR: libvoicedetect.so not found in $CURDIR, run 'make' first" >&2
+	exit 1
+}
+
+# Detect architecture and copy the core runtime libs libvoicedetect.so links
+# against, plus the matching dynamic loader as lib/ld.so.
+if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
+    echo "Detected x86_64 architecture, copying x86_64 libraries..."
+    cp -arfLv /lib64/ld-linux-x86-64.so.2 "$CURDIR/package/lib/ld.so"
+    cp -arfLv /lib/x86_64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6"
+    cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1"
+    cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6"
+    cp -arfLv /lib/x86_64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6"
+    cp -arfLv /lib/x86_64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1"
+    cp -arfLv /lib/x86_64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2"
+    cp -arfLv /lib/x86_64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1"
+    cp -arfLv /lib/x86_64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0"
+elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
+    echo "Detected ARM64 architecture, copying ARM64 libraries..."
+    cp -arfLv /lib/ld-linux-aarch64.so.1 "$CURDIR/package/lib/ld.so"
+    cp -arfLv /lib/aarch64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6"
+    cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1"
+    cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6"
+    cp -arfLv /lib/aarch64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6"
+    cp -arfLv /lib/aarch64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1"
+    cp -arfLv /lib/aarch64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2"
+    cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1"
+    cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0"
+elif [ "$(uname -s)" = "Darwin" ]; then
+    echo "Detected Darwin"
+else
+    echo "Error: Could not detect architecture"
+    exit 1
+fi
+
+# Package GPU libraries (CUDA/ROCm/Intel/Vulkan loader + ICDs + drivers) based on
+# BUILD_TYPE so the backend can reach the GPU without the runtime base image
+# shipping those drivers.
+# The helper script is optional: when it is absent this step is skipped
+# silently. It is sourced (not executed) with the target lib directory as its
+# argument, and is expected to define package_gpu_libs.
+GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
+if [ -f "$GPU_LIB_SCRIPT" ]; then
+    echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
+    source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib"
+    package_gpu_libs
+fi
+
+# Final listing of what ended up in the package, for the build log.
+echo "Packaging completed successfully"
+ls -liah "$CURDIR/package/" "$CURDIR/package/lib/"
--- a/backend/go/voice-detect/run.sh
+++ b/backend/go/voice-detect/run.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+set -e
+
+CURDIR=$(dirname "$(realpath "$0")")
+
+export LD_LIBRARY_PATH="$CURDIR/lib:$CURDIR:${LD_LIBRARY_PATH:-}"
+
+# If a self-contained ld.so was packaged, route through it so the packaged
+# libc / libstdc++ are used instead of the host's (matches the whisper /
+# parakeet backends' runtime layout).
+if [ -f "$CURDIR/lib/ld.so" ]; then
+	echo "Using lib/ld.so"
+	exec "$CURDIR/lib/ld.so" "$CURDIR/voice-detect-grpc" "$@"
+fi
+
+exec "$CURDIR/voice-detect-grpc" "$@"
--- a/backend/go/voice-detect/test.sh
+++ b/backend/go/voice-detect/test.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+set -e
+
+CURDIR=$(dirname "$(realpath "$0")")
+cd "$CURDIR"
+
+echo "Running voice-detect backend tests..."
+
+# The pure-Go parsing specs always run. The embed/verify/analyze smoke specs run
+# only when a model + WAV are provided via VOICEDETECT_BACKEND_TEST_MODEL and
+# VOICEDETECT_BACKEND_TEST_WAV; otherwise they auto-skip.
+LD_LIBRARY_PATH="$CURDIR:${LD_LIBRARY_PATH:-}" go test -v -timeout 1200s .
+
+echo "voice-detect tests completed."