From 4d3d54d61b083d5b435636c7638dec16b051553f Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Sat, 13 Jun 2026 23:54:27 +0200 Subject: [PATCH] test(e2e): live-server voice-recognition gate test (#10324) Add mock-backend VoiceEmbed/VoiceVerify (deterministic DC-offset speaker discrimination) and a verify-mode gated realtime pipeline, then drive the real HTTP/WS stack: an authorized speaker reaches response.done while an unauthorized one is dropped before the LLM with a speaker_not_authorized event. Assisted-by: Claude:opus-4.8 [Claude Code] Signed-off-by: Ettore Di Giacinto Co-authored-by: Ettore Di Giacinto --- tests/e2e/e2e_suite_test.go | 39 ++++++++ tests/e2e/mock-backend/main.go | 64 +++++++++++++ tests/e2e/realtime_voicegate_test.go | 134 +++++++++++++++++++++++++++ 3 files changed, 237 insertions(+) create mode 100644 tests/e2e/realtime_voicegate_test.go diff --git a/tests/e2e/e2e_suite_test.go b/tests/e2e/e2e_suite_test.go index 6c8a6c9a8..c25054351 100644 --- a/tests/e2e/e2e_suite_test.go +++ b/tests/e2e/e2e_suite_test.go @@ -236,6 +236,45 @@ var _ = BeforeSuite(func() { Expect(err).ToNot(HaveOccurred()) Expect(os.WriteFile(filepath.Join(modelsPath, "realtime-pipeline.yaml"), pipelineData, 0644)).To(Succeed()) + // Speaker-recognition model (mock-backend) + a voice-recognition-gated + // pipeline for the realtime gate e2e. The reference WAV carries a positive + // DC bias so the mock embeds it to one orthogonal "speaker"; the test then + // drives matching (authorized) and opposite-bias (unauthorized) audio. + speakerCfg := map[string]any{ + "name": "mock-speaker", + "backend": "mock-backend", + "parameters": map[string]any{"model": "mock-speaker.bin"}, + } + speakerData, err := yaml.Marshal(speakerCfg) + Expect(err).ToNot(HaveOccurred()) + Expect(os.WriteFile(filepath.Join(modelsPath, "mock-speaker.yaml"), speakerData, 0644)).To(Succeed()) + + voiceRefPath := filepath.Join(modelsPath, "e2e-voice-ref.wav") + Expect(os.WriteFile(voiceRefPath, wavFromPCM(pcmWithDC(300, 16000, 1000, 8000), 16000), 0644)).To(Succeed()) + + gatedCfg := map[string]any{ + "name": "realtime-pipeline-gated", + "pipeline": map[string]any{ + "vad": "mock-vad", + "transcription": "mock-stt", + "llm": "mock-llm", + "tts": "mock-tts", + "voice_recognition": map[string]any{ + "model": "mock-speaker", + "mode": "verify", + "threshold": 0.25, + "when": "every", + "on_reject": "drop_event", + "references": []map[string]any{ + {"name": "e2e-speaker", "audio": voiceRefPath}, + }, + }, + }, + } + gatedData, err := yaml.Marshal(gatedCfg) + Expect(err).ToNot(HaveOccurred()) + Expect(os.WriteFile(filepath.Join(modelsPath, "realtime-pipeline-gated.yaml"), gatedData, 0644)).To(Succeed()) + // Router model setup: a score classifier (mock-backend Score) selects // between two candidate chat models based on keyword matches against the // candidate label fragments. Exercises the full RouteModel middleware path diff --git a/tests/e2e/mock-backend/main.go b/tests/e2e/mock-backend/main.go index 50ac696e2..1a8e0418f 100644 --- a/tests/e2e/mock-backend/main.go +++ b/tests/e2e/mock-backend/main.go @@ -852,6 +852,70 @@ func (m *MockBackend) ModelMetadata(ctx context.Context, in *pb.ModelOptions) (* }, nil } +// voiceEmbedFromWAV reads a 16-bit LE mono WAV and returns a 2-d speaker +// embedding derived from the signed DC offset of the samples. A positive DC +// bias maps to one orthogonal unit vector, a negative bias to the other, so +// e2e tests can deterministically simulate two distinct "speakers" that +// survive resampling (DC is sample-rate independent). Near-zero DC maps to a +// neutral vector equidistant from both. Returns nil for unreadable audio. +func voiceEmbedFromWAV(path string) []float32 { + data, err := os.ReadFile(path) + if err != nil || len(data) < 44 { + return nil + } + pcm := data[44:] + n := len(pcm) / 2 + if n == 0 { + return nil + } + var sum float64 + for i := 0; i < n; i++ { + s := int16(pcm[2*i]) | int16(pcm[2*i+1])<<8 + sum += float64(s) + } + mean := sum / float64(n) + switch { + case mean > 500: + return []float32{1, 0} + case mean < -500: + return []float32{0, 1} + default: + return []float32{0.7071, 0.7071} + } +} + +// VoiceEmbed returns a deterministic 2-d speaker embedding for the audio clip. +// See voiceEmbedFromWAV for the (test-only) DC-offset discrimination scheme. +func (m *MockBackend) VoiceEmbed(ctx context.Context, in *pb.VoiceEmbedRequest) (*pb.VoiceEmbedResponse, error) { + emb := voiceEmbedFromWAV(in.GetAudio()) + xlog.Debug("VoiceEmbed called", "audio", in.GetAudio(), "embedding", emb) + if len(emb) == 0 { + return &pb.VoiceEmbedResponse{}, nil + } + return &pb.VoiceEmbedResponse{Embedding: emb, Model: "mock-speaker"}, nil +} + +// VoiceVerify compares two clips by cosine distance over their mock embeddings. +func (m *MockBackend) VoiceVerify(ctx context.Context, in *pb.VoiceVerifyRequest) (*pb.VoiceVerifyResponse, error) { + a := voiceEmbedFromWAV(in.GetAudio1()) + b := voiceEmbedFromWAV(in.GetAudio2()) + dist := float32(1) + if len(a) == 2 && len(b) == 2 { + dist = 1 - (a[0]*b[0] + a[1]*b[1]) // both unit vectors + } + threshold := in.GetThreshold() + if threshold == 0 { + threshold = 0.25 + } + xlog.Debug("VoiceVerify called", "distance", dist, "threshold", threshold) + return &pb.VoiceVerifyResponse{ + Verified: dist <= threshold, + Distance: dist, + Threshold: threshold, + Model: "mock-speaker", + }, nil +} + func main() { xlog.SetLogger(xlog.NewLogger(xlog.LogLevel(os.Getenv("LOCALAI_LOG_LEVEL")), os.Getenv("LOCALAI_LOG_FORMAT"))) diff --git a/tests/e2e/realtime_voicegate_test.go b/tests/e2e/realtime_voicegate_test.go new file mode 100644 index 000000000..1dad4fc20 --- /dev/null +++ b/tests/e2e/realtime_voicegate_test.go @@ -0,0 +1,134 @@ +package e2e_test + +import ( + "encoding/base64" + "encoding/binary" + "math" + "time" + + "github.com/gorilla/websocket" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +// --- helpers: DC-biased PCM/WAV for the voice-recognition gate e2e --- +// +// The mock-backend embeds audio to one of two orthogonal "speaker" vectors +// based on the signed DC offset of the samples (see voiceEmbedFromWAV in the +// mock-backend). A positive bias is the authorized speaker (matches the +// enrolled reference); a negative bias is an unauthorized one. + +// pcmWithDC returns 16-bit LE mono PCM of a sine wave plus a constant DC bias. +func pcmWithDC(freq float64, sampleRate, durationMs int, dc int16) []byte { + numSamples := sampleRate * durationMs / 1000 + pcm := make([]byte, numSamples*2) + for i := 0; i < numSamples; i++ { + t := float64(i) / float64(sampleRate) + v := float64(dc) + math.MaxInt16/4*math.Sin(2*math.Pi*freq*t) + if v > math.MaxInt16 { + v = math.MaxInt16 + } + if v < math.MinInt16 { + v = math.MinInt16 + } + s := int16(v) + pcm[2*i] = byte(s) + pcm[2*i+1] = byte(s >> 8) + } + return pcm +} + +// wavFromPCM wraps 16-bit LE mono PCM in a canonical 44-byte WAV header. +func wavFromPCM(pcm []byte, sampleRate int) []byte { + var hdr [44]byte + copy(hdr[0:4], "RIFF") + binary.LittleEndian.PutUint32(hdr[4:8], uint32(36+len(pcm))) + copy(hdr[8:12], "WAVE") + copy(hdr[12:16], "fmt ") + binary.LittleEndian.PutUint32(hdr[16:20], 16) // PCM fmt chunk size + binary.LittleEndian.PutUint16(hdr[20:22], 1) // audio format = PCM + binary.LittleEndian.PutUint16(hdr[22:24], 1) // channels = mono + binary.LittleEndian.PutUint32(hdr[24:28], uint32(sampleRate)) + binary.LittleEndian.PutUint32(hdr[28:32], uint32(sampleRate*2)) // byte rate + binary.LittleEndian.PutUint16(hdr[32:34], 2) // block align + binary.LittleEndian.PutUint16(hdr[34:36], 16) // bits per sample + copy(hdr[36:40], "data") + binary.LittleEndian.PutUint32(hdr[40:44], uint32(len(pcm))) + return append(hdr[:], pcm...) +} + +var _ = Describe("Realtime voice recognition gate", Label("Realtime"), func() { + // open connects to the gated pipeline and disables server VAD so we can + // commit manually. + open := func() *websocket.Conn { + c := connectWS("realtime-pipeline-gated") + created := readServerEvent(c, 30*time.Second) + Expect(created["type"]).To(Equal("session.created")) + sendClientEvent(c, disableVADEvent()) + drainUntil(c, "session.updated", 10*time.Second) + return c + } + + // commit appends raw PCM (base64) and commits the input buffer. + commit := func(c *websocket.Conn, pcm []byte) { + sendClientEvent(c, map[string]any{ + "type": "input_audio_buffer.append", + "audio": base64.StdEncoding.EncodeToString(pcm), + }) + sendClientEvent(c, map[string]any{"type": "input_audio_buffer.commit"}) + } + + It("admits an authorized speaker through to a full response", func() { + c := open() + defer c.Close() + + // Positive DC bias matches the enrolled reference speaker. + commit(c, pcmWithDC(300, 16000, 1000, 8000)) + drainUntil(c, "input_audio_buffer.committed", 30*time.Second) + + var gotDone, gotReject bool + deadline := time.Now().Add(60 * time.Second) + for time.Now().Before(deadline) { + evt := readServerEvent(c, time.Until(deadline)) + if evt["type"] == "error" { + if e, ok := evt["error"].(map[string]any); ok && e["code"] == "speaker_not_authorized" { + gotReject = true + } + } + if evt["type"] == "response.done" { + gotDone = true + break + } + } + Expect(gotReject).To(BeFalse(), "authorized speaker must not be rejected") + Expect(gotDone).To(BeTrue(), "authorized speaker should reach response.done") + }) + + It("drops an unauthorized speaker before the LLM with a reject event", func() { + c := open() + defer c.Close() + + // Negative DC bias is a different speaker, not within threshold. + commit(c, pcmWithDC(300, 16000, 1000, -8000)) + drainUntil(c, "input_audio_buffer.committed", 30*time.Second) + + var gotReject, gotDone bool + deadline := time.Now().Add(30 * time.Second) + for time.Now().Before(deadline) { + evt := readServerEvent(c, time.Until(deadline)) + switch evt["type"] { + case "error": + if e, ok := evt["error"].(map[string]any); ok && e["code"] == "speaker_not_authorized" { + gotReject = true + } + case "response.done": + gotDone = true + } + if gotReject { + break + } + } + Expect(gotReject).To(BeTrue(), "unauthorized speaker should get a speaker_not_authorized event") + Expect(gotDone).To(BeFalse(), "unauthorized speaker must not reach the LLM/response.done") + }) +})