test(e2e): live-server voice-recognition gate test (#10324)

Add mock-backend VoiceEmbed/VoiceVerify implementations (deterministic speaker discrimination based on DC offset) and a realtime pipeline gated by voice recognition in verify mode. The test then drives the real HTTP/WS stack: an authorized speaker reaches response.done, while an unauthorized one is dropped before the LLM with a speaker_not_authorized event.

Assisted-by: Claude:opus-4.8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
2026-06-14 03:37:47 -04:00 · 2026-06-13 23:54:27 +02:00
parent 36e3419203
commit 4d3d54d61b
3 changed files with 237 additions and 0 deletions
--- a/tests/e2e/e2e_suite_test.go
+++ b/tests/e2e/e2e_suite_test.go
@@ -236,6 +236,45 @@ var _ = BeforeSuite(func() {
 	Expect(err).ToNot(HaveOccurred())
 	Expect(os.WriteFile(filepath.Join(modelsPath, "realtime-pipeline.yaml"), pipelineData, 0644)).To(Succeed())

+	// Speaker-recognition model (mock-backend) + a voice-recognition-gated
+	// pipeline for the realtime gate e2e. The reference WAV carries a positive
+	// DC bias so the mock embeds it to one orthogonal "speaker"; the test then
+	// drives matching (authorized) and opposite-bias (unauthorized) audio.
+	speakerCfg := map[string]any{
+		"name":       "mock-speaker",
+		"backend":    "mock-backend",
+		"parameters": map[string]any{"model": "mock-speaker.bin"},
+	}
+	speakerData, err := yaml.Marshal(speakerCfg)
+	Expect(err).ToNot(HaveOccurred())
+	Expect(os.WriteFile(filepath.Join(modelsPath, "mock-speaker.yaml"), speakerData, 0644)).To(Succeed())
+
+	voiceRefPath := filepath.Join(modelsPath, "e2e-voice-ref.wav")
+	Expect(os.WriteFile(voiceRefPath, wavFromPCM(pcmWithDC(300, 16000, 1000, 8000), 16000), 0644)).To(Succeed())
+
+	gatedCfg := map[string]any{
+		"name": "realtime-pipeline-gated",
+		"pipeline": map[string]any{
+			"vad":           "mock-vad",
+			"transcription": "mock-stt",
+			"llm":           "mock-llm",
+			"tts":           "mock-tts",
+			"voice_recognition": map[string]any{
+				"model":     "mock-speaker",
+				"mode":      "verify",
+				"threshold": 0.25,
+				"when":      "every",
+				"on_reject": "drop_event",
+				"references": []map[string]any{
+					{"name": "e2e-speaker", "audio": voiceRefPath},
+				},
+			},
+		},
+	}
+	gatedData, err := yaml.Marshal(gatedCfg)
+	Expect(err).ToNot(HaveOccurred())
+	Expect(os.WriteFile(filepath.Join(modelsPath, "realtime-pipeline-gated.yaml"), gatedData, 0644)).To(Succeed())
+
 	// Router model setup: a score classifier (mock-backend Score) selects
 	// between two candidate chat models based on keyword matches against the
 	// candidate label fragments. Exercises the full RouteModel middleware path
--- a/tests/e2e/mock-backend/main.go
+++ b/tests/e2e/mock-backend/main.go
@@ -852,6 +852,70 @@ func (m *MockBackend) ModelMetadata(ctx context.Context, in *pb.ModelOptions) (*
 	}, nil
 }

+// voiceEmbedFromWAV turns the WAV file at path into a 2-d mock speaker
+// embedding. Everything after the first 44 bytes (the canonical header size)
+// is treated as little-endian 16-bit samples and averaged: a mean above 500
+// yields {1, 0}, a mean below -500 yields {0, 1}, and anything in between
+// yields the neutral {0.7071, 0.7071}. A trailing odd byte is ignored. The
+// result is nil when the file cannot be read or holds no complete sample.
+func voiceEmbedFromWAV(path string) []float32 {
+	const headerLen = 44
+	raw, readErr := os.ReadFile(path)
+	if readErr != nil {
+		return nil
+	}
+	if len(raw) < headerLen+2 {
+		return nil // fewer than one whole sample after the header
+	}
+	samples := raw[headerLen:]
+	count := len(samples) / 2
+	var total float64
+	for off := 0; off+1 < len(samples); off += 2 {
+		lo, hi := int16(samples[off]), int16(samples[off+1])
+		total += float64(lo | hi<<8)
+	}
+	avg := total / float64(count)
+	if avg > 500 {
+		return []float32{1, 0}
+	}
+	if avg < -500 {
+		return []float32{0, 1}
+	}
+	return []float32{0.7071, 0.7071}
+}
+
+// VoiceEmbed answers with the mock embedding voiceEmbedFromWAV derives from the
+// request's audio path; unreadable audio gives an empty response, not an error.
+func (m *MockBackend) VoiceEmbed(ctx context.Context, in *pb.VoiceEmbedRequest) (*pb.VoiceEmbedResponse, error) {
+	vec := voiceEmbedFromWAV(in.GetAudio())
+	xlog.Debug("VoiceEmbed called", "audio", in.GetAudio(), "embedding", vec)
+	if len(vec) > 0 {
+		return &pb.VoiceEmbedResponse{Embedding: vec, Model: "mock-speaker"}, nil
+	}
+	return &pb.VoiceEmbedResponse{}, nil
+}
+
+// VoiceVerify reports the cosine distance between the mock embeddings of two clips.
+// A zero threshold defaults to 0.25; the distance stays 1 if either clip has no embedding.
+func (m *MockBackend) VoiceVerify(ctx context.Context, in *pb.VoiceVerifyRequest) (*pb.VoiceVerifyResponse, error) {
+	limit := in.GetThreshold()
+	if limit == 0 {
+		limit = 0.25
+	}
+	first, second := voiceEmbedFromWAV(in.GetAudio1()), voiceEmbedFromWAV(in.GetAudio2())
+	var gap float32 = 1
+	if len(first) == 2 && len(second) == 2 {
+		gap -= first[0]*second[0] + first[1]*second[1] // dot product of unit vectors
+	}
+	xlog.Debug("VoiceVerify called", "distance", gap, "threshold", limit)
+	return &pb.VoiceVerifyResponse{
+		Verified:  gap <= limit,
+		Distance:  gap,
+		Threshold: limit,
+		Model:     "mock-speaker",
+	}, nil
+}
+
 func main() {
 	xlog.SetLogger(xlog.NewLogger(xlog.LogLevel(os.Getenv("LOCALAI_LOG_LEVEL")), os.Getenv("LOCALAI_LOG_FORMAT")))

--- a/tests/e2e/realtime_voicegate_test.go
+++ b/tests/e2e/realtime_voicegate_test.go
@@ -0,0 +1,134 @@
+package e2e_test
+
+import (
+	"encoding/base64"
+	"encoding/binary"
+	"math"
+	"time"
+
+	"github.com/gorilla/websocket"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// --- helpers: DC-biased PCM/WAV for the voice-recognition gate e2e ---
+//
+// The mock-backend embeds audio to one of two orthogonal "speaker" vectors
+// based on the signed DC offset of the samples (see voiceEmbedFromWAV in the
+// mock-backend). A positive bias is the authorized speaker (matches the
+// enrolled reference); a negative bias is an unauthorized one.
+
+// pcmWithDC renders durationMs of mono 16-bit LE PCM at sampleRate: a freq-Hz
+// sine of amplitude math.MaxInt16/4 offset by dc, clamped to the int16 range.
+func pcmWithDC(freq float64, sampleRate, durationMs int, dc int16) []byte {
+	const amplitude = math.MaxInt16 / 4
+	total := sampleRate * durationMs / 1000
+	out := make([]byte, 0, total*2)
+	for n := 0; n < total; n++ {
+		t := float64(n) / float64(sampleRate)
+		v := float64(dc) + amplitude*math.Sin(2*math.Pi*freq*t)
+		switch {
+		case v > math.MaxInt16:
+			v = math.MaxInt16
+		case v < math.MinInt16:
+			v = math.MinInt16
+		}
+		out = append(out, byte(int16(v)), byte(int16(v)>>8))
+	}
+	return out
+}
+
+// wavFromPCM returns pcm prefixed with a 44-byte RIFF/WAVE header for 16-bit mono PCM at sampleRate.
+func wavFromPCM(pcm []byte, sampleRate int) []byte {
+	le := binary.LittleEndian
+	wav := make([]byte, 44, 44+len(pcm))
+	copy(wav[0:], "RIFF")
+	le.PutUint32(wav[4:], uint32(36+len(pcm))) // RIFF chunk size
+	copy(wav[8:], "WAVEfmt ")
+	le.PutUint32(wav[16:], 16)                   // PCM fmt chunk size
+	le.PutUint16(wav[20:], 1)                    // audio format = PCM
+	le.PutUint16(wav[22:], 1)                    // channels = mono
+	le.PutUint32(wav[24:], uint32(sampleRate))   // sample rate
+	le.PutUint32(wav[28:], uint32(sampleRate*2)) // byte rate
+	le.PutUint16(wav[32:], 2)                    // block align
+	le.PutUint16(wav[34:], 16)                   // bits per sample
+	copy(wav[36:], "data")
+	le.PutUint32(wav[40:], uint32(len(pcm))) // data chunk size
+	return append(wav, pcm...)
+}
+
+var _ = Describe("Realtime voice recognition gate", Label("Realtime"), func() {
+	// open connects to the gated pipeline, checks that the first event is
+	// session.created, and disables server VAD so the test commits manually.
+	open := func() *websocket.Conn {
+		c := connectWS("realtime-pipeline-gated")
+		created := readServerEvent(c, 30*time.Second)
+		Expect(created["type"]).To(Equal("session.created"))
+		sendClientEvent(c, disableVADEvent())
+		drainUntil(c, "session.updated", 10*time.Second)
+		return c
+	}
+
+	// commit sends pcm base64-encoded in one append event, then commits the buffer.
+	commit := func(c *websocket.Conn, pcm []byte) {
+		sendClientEvent(c, map[string]any{
+			"type":  "input_audio_buffer.append",
+			"audio": base64.StdEncoding.EncodeToString(pcm),
+		})
+		sendClientEvent(c, map[string]any{"type": "input_audio_buffer.commit"})
+	}
+
+	It("admits an authorized speaker through to a full response", func() {
+		c := open()
+		defer c.Close()
+
+		// Positive DC bias: same bias sign as the enrolled reference WAV.
+		commit(c, pcmWithDC(300, 16000, 1000, 8000))
+		drainUntil(c, "input_audio_buffer.committed", 30*time.Second)
+
+		var gotDone, gotReject bool
+		deadline := time.Now().Add(60 * time.Second)
+		for time.Now().Before(deadline) { // read events until response.done or the deadline
+			evt := readServerEvent(c, time.Until(deadline))
+			if evt["type"] == "error" {
+				if e, ok := evt["error"].(map[string]any); ok && e["code"] == "speaker_not_authorized" {
+					gotReject = true
+				}
+			}
+			if evt["type"] == "response.done" {
+				gotDone = true
+				break
+			}
+		}
+		Expect(gotReject).To(BeFalse(), "authorized speaker must not be rejected")
+		Expect(gotDone).To(BeTrue(), "authorized speaker should reach response.done")
+	})
+
+	It("drops an unauthorized speaker before the LLM with a reject event", func() {
+		c := open()
+		defer c.Close()
+
+		// Negative DC bias: the mock maps it to the other speaker vector, outside the threshold.
+		commit(c, pcmWithDC(300, 16000, 1000, -8000))
+		drainUntil(c, "input_audio_buffer.committed", 30*time.Second)
+
+		var gotReject, gotDone bool
+		deadline := time.Now().Add(30 * time.Second)
+		for time.Now().Before(deadline) {
+			evt := readServerEvent(c, time.Until(deadline))
+			switch evt["type"] {
+			case "error":
+				if e, ok := evt["error"].(map[string]any); ok && e["code"] == "speaker_not_authorized" {
+					gotReject = true
+				}
+			case "response.done":
+				gotDone = true
+			}
+			if gotReject { // stop at the first reject; NOTE(review): a later response.done would go unobserved
+				break
+			}
+		}
+		Expect(gotReject).To(BeTrue(), "unauthorized speaker should get a speaker_not_authorized event")
+		Expect(gotDone).To(BeFalse(), "unauthorized speaker must not reach the LLM/response.done")
+	})
+})