test(e2e): live-server voice-recognition gate test (#10324)

Add mock-backend VoiceEmbed/VoiceVerify (deterministic DC-offset speaker
discrimination) and a verify-mode gated realtime pipeline, then drive the
real HTTP/WS stack: an authorized speaker reaches response.done while an
unauthorized one is dropped before the LLM with a speaker_not_authorized
event.


Assisted-by: Claude:opus-4.8 [Claude Code]

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
LocalAI [bot]
2026-06-13 23:54:27 +02:00
committed by GitHub
parent 36e3419203
commit 4d3d54d61b
3 changed files with 237 additions and 0 deletions

View File

@@ -236,6 +236,45 @@ var _ = BeforeSuite(func() {
Expect(err).ToNot(HaveOccurred())
Expect(os.WriteFile(filepath.Join(modelsPath, "realtime-pipeline.yaml"), pipelineData, 0644)).To(Succeed())
// Speaker-recognition model (mock-backend) + a voice-recognition-gated
// pipeline for the realtime gate e2e. The reference WAV carries a positive
// DC bias so the mock embeds it to one orthogonal "speaker"; the test then
// drives matching (authorized) and opposite-bias (unauthorized) audio.
speakerCfg := map[string]any{
"name": "mock-speaker",
"backend": "mock-backend",
"parameters": map[string]any{"model": "mock-speaker.bin"},
}
speakerData, err := yaml.Marshal(speakerCfg)
Expect(err).ToNot(HaveOccurred())
Expect(os.WriteFile(filepath.Join(modelsPath, "mock-speaker.yaml"), speakerData, 0644)).To(Succeed())
voiceRefPath := filepath.Join(modelsPath, "e2e-voice-ref.wav")
Expect(os.WriteFile(voiceRefPath, wavFromPCM(pcmWithDC(300, 16000, 1000, 8000), 16000), 0644)).To(Succeed())
gatedCfg := map[string]any{
"name": "realtime-pipeline-gated",
"pipeline": map[string]any{
"vad": "mock-vad",
"transcription": "mock-stt",
"llm": "mock-llm",
"tts": "mock-tts",
"voice_recognition": map[string]any{
"model": "mock-speaker",
"mode": "verify",
"threshold": 0.25,
"when": "every",
"on_reject": "drop_event",
"references": []map[string]any{
{"name": "e2e-speaker", "audio": voiceRefPath},
},
},
},
}
gatedData, err := yaml.Marshal(gatedCfg)
Expect(err).ToNot(HaveOccurred())
Expect(os.WriteFile(filepath.Join(modelsPath, "realtime-pipeline-gated.yaml"), gatedData, 0644)).To(Succeed())
// Router model setup: a score classifier (mock-backend Score) selects
// between two candidate chat models based on keyword matches against the
// candidate label fragments. Exercises the full RouteModel middleware path

View File

@@ -852,6 +852,70 @@ func (m *MockBackend) ModelMetadata(ctx context.Context, in *pb.ModelOptions) (*
}, nil
}
// voiceEmbedFromWAV reads a 16-bit LE mono WAV and returns a 2-d speaker
// embedding derived from the signed DC offset of the samples. A positive DC
// bias maps to one orthogonal unit vector, a negative bias to the other, so
// e2e tests can deterministically simulate two distinct "speakers" that
// survive resampling (DC is sample-rate independent). Near-zero DC (mean
// within ±500) maps to a neutral vector equidistant from both.
//
// Samples are taken from the RIFF "data" chunk (see wavPCMPayload), so extra
// chunks such as LIST or fact placed before or after the audio do not skew
// the mean. A trailing odd byte is ignored.
//
// Returns nil for unreadable audio, files shorter than a canonical 44-byte
// header, or files that carry no complete sample.
func voiceEmbedFromWAV(path string) []float32 {
	data, err := os.ReadFile(path)
	if err != nil {
		return nil
	}
	pcm := wavPCMPayload(data)
	n := len(pcm) / 2
	if n == 0 {
		return nil
	}
	var sum float64
	for i := 0; i < n; i++ {
		// Little-endian int16: the high byte carries the sign once shifted.
		s := int16(pcm[2*i]) | int16(pcm[2*i+1])<<8
		sum += float64(s)
	}
	mean := sum / float64(n)
	switch {
	case mean > 500:
		return []float32{1, 0}
	case mean < -500:
		return []float32{0, 1}
	default:
		return []float32{0.7071, 0.7071}
	}
}

// wavPCMPayload returns the sample bytes of a WAV file image. It walks the
// RIFF chunk list and returns the payload of the first "data" chunk, bounded
// by the chunk's declared size. When the declared size is zero or larger than
// what is left in the file (as streaming writers may leave it), everything
// after the chunk header is returned. Input without a RIFF/WAVE signature, or
// whose chunk list is broken before a "data" chunk is found, falls back to
// treating everything after the canonical 44-byte header as PCM. Returns nil
// when data is shorter than 44 bytes.
func wavPCMPayload(data []byte) []byte {
	const canonicalHeaderLen = 44
	if len(data) < canonicalHeaderLen {
		return nil
	}
	if string(data[0:4]) != "RIFF" || string(data[8:12]) != "WAVE" {
		return data[canonicalHeaderLen:]
	}
	// Chunks start right after the 12-byte "RIFF<size>WAVE" preamble; each is
	// a 4-byte id, a 4-byte little-endian size, and the payload padded to an
	// even length.
	for off := 12; off+8 <= len(data); {
		size := int64(data[off+4]) | int64(data[off+5])<<8 |
			int64(data[off+6])<<16 | int64(data[off+7])<<24
		body := off + 8
		avail := int64(len(data) - body)
		if string(data[off:off+4]) == "data" {
			if size == 0 || size > avail {
				return data[body:]
			}
			return data[body : body+int(size)]
		}
		if size > avail {
			break // truncated or corrupt chunk list
		}
		off = body + int(size) + int(size&1)
	}
	return data[canonicalHeaderLen:]
}
// VoiceEmbed answers an embedding request with the mock 2-d speaker vector
// that voiceEmbedFromWAV derives from the clip named in the request. When no
// embedding can be derived the response is returned empty (no embedding, no
// model name) and the error is still nil.
func (m *MockBackend) VoiceEmbed(ctx context.Context, in *pb.VoiceEmbedRequest) (*pb.VoiceEmbedResponse, error) {
	audioPath := in.GetAudio()
	embedding := voiceEmbedFromWAV(audioPath)
	xlog.Debug("VoiceEmbed called", "audio", audioPath, "embedding", embedding)

	resp := &pb.VoiceEmbedResponse{}
	if len(embedding) > 0 {
		resp.Embedding = embedding
		resp.Model = "mock-speaker"
	}
	return resp, nil
}
// VoiceVerify embeds both clips with voiceEmbedFromWAV and reports their
// cosine distance (1 minus the dot product of the two unit vectors). The pair
// is verified when the distance does not exceed the request threshold; a zero
// threshold is replaced by the default of 0.25.
func (m *MockBackend) VoiceVerify(ctx context.Context, in *pb.VoiceVerifyRequest) (*pb.VoiceVerifyResponse, error) {
	first := voiceEmbedFromWAV(in.GetAudio1())
	second := voiceEmbedFromWAV(in.GetAudio2())

	// If either clip yields no 2-d embedding the distance stays at 1, the
	// same value two orthogonal speakers produce.
	var distance float32 = 1
	if len(first) == 2 && len(second) == 2 {
		distance = 1 - (first[0]*second[0] + first[1]*second[1])
	}

	limit := in.GetThreshold()
	if limit == 0 {
		limit = 0.25
	}
	xlog.Debug("VoiceVerify called", "distance", distance, "threshold", limit)

	resp := &pb.VoiceVerifyResponse{
		Verified:  distance <= limit,
		Distance:  distance,
		Threshold: limit,
		Model:     "mock-speaker",
	}
	return resp, nil
}
func main() {
xlog.SetLogger(xlog.NewLogger(xlog.LogLevel(os.Getenv("LOCALAI_LOG_LEVEL")), os.Getenv("LOCALAI_LOG_FORMAT")))

View File

@@ -0,0 +1,134 @@
package e2e_test
import (
"encoding/base64"
"encoding/binary"
"math"
"time"
"github.com/gorilla/websocket"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)
// --- helpers: DC-biased PCM/WAV for the voice-recognition gate e2e ---
//
// The mock-backend embeds audio to one of two orthogonal "speaker" vectors
// based on the signed DC offset of the samples (see voiceEmbedFromWAV in the
// mock-backend). A positive bias is the authorized speaker (matches the
// enrolled reference); a negative bias is an unauthorized one.
// pcmWithDC synthesizes durationMs milliseconds of 16-bit little-endian mono
// PCM at sampleRate: a sine tone of frequency freq, at roughly a quarter of
// full scale, shifted by the constant offset dc. Values that would leave the
// int16 range are clamped to it.
func pcmWithDC(freq float64, sampleRate, durationMs int, dc int16) []byte {
	count := sampleRate * durationMs / 1000
	out := make([]byte, 2*count)
	for off := 0; off < len(out); off += 2 {
		t := float64(off/2) / float64(sampleRate)
		level := float64(dc) + math.MaxInt16/4*math.Sin(2*math.Pi*freq*t)
		switch {
		case level > math.MaxInt16:
			level = math.MaxInt16
		case level < math.MinInt16:
			level = math.MinInt16
		}
		sample := int16(level)
		// Low byte first, then high byte.
		out[off], out[off+1] = byte(sample), byte(sample>>8)
	}
	return out
}
// wavFromPCM prepends the canonical 44-byte RIFF/WAVE header for 16-bit
// little-endian mono PCM at sampleRate to pcm and returns the result as a
// newly allocated slice.
func wavFromPCM(pcm []byte, sampleRate int) []byte {
	const headerLen = 44
	le := binary.LittleEndian
	wav := make([]byte, headerLen, headerLen+len(pcm))

	// RIFF container: the size field counts everything after itself.
	copy(wav[0:], "RIFF")
	le.PutUint32(wav[4:], uint32(36+len(pcm)))
	copy(wav[8:], "WAVE")

	// "fmt " chunk: 16-byte PCM format description.
	copy(wav[12:], "fmt ")
	le.PutUint32(wav[16:], 16)                   // PCM fmt chunk size
	le.PutUint16(wav[20:], 1)                    // audio format = PCM
	le.PutUint16(wav[22:], 1)                    // channels = mono
	le.PutUint32(wav[24:], uint32(sampleRate))   // sample rate
	le.PutUint32(wav[28:], uint32(sampleRate*2)) // byte rate
	le.PutUint16(wav[32:], 2)                    // block align
	le.PutUint16(wav[34:], 16)                   // bits per sample

	// "data" chunk header, followed by the samples themselves.
	copy(wav[36:], "data")
	le.PutUint32(wav[40:], uint32(len(pcm)))
	return append(wav, pcm...)
}
var _ = Describe("Realtime voice recognition gate", Label("Realtime"), func() {
	// openGated connects to the gated pipeline, waits for session.created,
	// and then disables server VAD so the specs can commit audio manually.
	openGated := func() *websocket.Conn {
		conn := connectWS("realtime-pipeline-gated")
		first := readServerEvent(conn, 30*time.Second)
		Expect(first["type"]).To(Equal("session.created"))
		sendClientEvent(conn, disableVADEvent())
		drainUntil(conn, "session.updated", 10*time.Second)
		return conn
	}

	// commitAudio appends raw PCM (base64-encoded) to the input buffer and
	// then commits the buffer.
	commitAudio := func(conn *websocket.Conn, pcm []byte) {
		appendEvt := map[string]any{
			"type":  "input_audio_buffer.append",
			"audio": base64.StdEncoding.EncodeToString(pcm),
		}
		sendClientEvent(conn, appendEvt)
		sendClientEvent(conn, map[string]any{"type": "input_audio_buffer.commit"})
	}

	// isSpeakerReject reports whether the "error" field of a server event
	// is an object whose code is speaker_not_authorized.
	isSpeakerReject := func(errField any) bool {
		details, ok := errField.(map[string]any)
		return ok && details["code"] == "speaker_not_authorized"
	}

	It("admits an authorized speaker through to a full response", func() {
		conn := openGated()
		defer conn.Close()

		// Positive DC bias matches the enrolled reference speaker.
		commitAudio(conn, pcmWithDC(300, 16000, 1000, 8000))
		drainUntil(conn, "input_audio_buffer.committed", 30*time.Second)

		// Read until response.done arrives or the deadline passes, noting
		// any reject seen along the way.
		rejected, done := false, false
		deadline := time.Now().Add(60 * time.Second)
		for !done && time.Now().Before(deadline) {
			evt := readServerEvent(conn, time.Until(deadline))
			switch evt["type"] {
			case "error":
				if isSpeakerReject(evt["error"]) {
					rejected = true
				}
			case "response.done":
				done = true
			}
		}

		Expect(rejected).To(BeFalse(), "authorized speaker must not be rejected")
		Expect(done).To(BeTrue(), "authorized speaker should reach response.done")
	})

	It("drops an unauthorized speaker before the LLM with a reject event", func() {
		conn := openGated()
		defer conn.Close()

		// Negative DC bias is a different speaker, not within threshold.
		commitAudio(conn, pcmWithDC(300, 16000, 1000, -8000))
		drainUntil(conn, "input_audio_buffer.committed", 30*time.Second)

		// Read until the reject arrives or the deadline passes, noting any
		// response.done seen before it.
		rejected, done := false, false
		deadline := time.Now().Add(30 * time.Second)
		for !rejected && time.Now().Before(deadline) {
			evt := readServerEvent(conn, time.Until(deadline))
			switch evt["type"] {
			case "error":
				if isSpeakerReject(evt["error"]) {
					rejected = true
				}
			case "response.done":
				done = true
			}
		}

		Expect(rejected).To(BeTrue(), "unauthorized speaker should get a speaker_not_authorized event")
		Expect(done).To(BeFalse(), "unauthorized speaker must not reach the LLM/response.done")
	})
})