mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-14 03:37:47 -04:00
test(e2e): live-server voice-recognition gate test (#10324)
Add mock-backend VoiceEmbed/VoiceVerify (deterministic DC-offset speaker discrimination) and a verify-mode gated realtime pipeline, then drive the real HTTP/WS stack: an authorized speaker reaches response.done, while an unauthorized one is dropped before the LLM with a speaker_not_authorized event.

Assisted-by: Claude:opus-4.8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
@@ -236,6 +236,45 @@ var _ = BeforeSuite(func() {
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(os.WriteFile(filepath.Join(modelsPath, "realtime-pipeline.yaml"), pipelineData, 0644)).To(Succeed())
|
||||
|
||||
// Speaker-recognition model (mock-backend) + a voice-recognition-gated
|
||||
// pipeline for the realtime gate e2e. The reference WAV carries a positive
|
||||
// DC bias so the mock embeds it to one orthogonal "speaker"; the test then
|
||||
// drives matching (authorized) and opposite-bias (unauthorized) audio.
|
||||
speakerCfg := map[string]any{
|
||||
"name": "mock-speaker",
|
||||
"backend": "mock-backend",
|
||||
"parameters": map[string]any{"model": "mock-speaker.bin"},
|
||||
}
|
||||
speakerData, err := yaml.Marshal(speakerCfg)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(os.WriteFile(filepath.Join(modelsPath, "mock-speaker.yaml"), speakerData, 0644)).To(Succeed())
|
||||
|
||||
voiceRefPath := filepath.Join(modelsPath, "e2e-voice-ref.wav")
|
||||
Expect(os.WriteFile(voiceRefPath, wavFromPCM(pcmWithDC(300, 16000, 1000, 8000), 16000), 0644)).To(Succeed())
|
||||
|
||||
gatedCfg := map[string]any{
|
||||
"name": "realtime-pipeline-gated",
|
||||
"pipeline": map[string]any{
|
||||
"vad": "mock-vad",
|
||||
"transcription": "mock-stt",
|
||||
"llm": "mock-llm",
|
||||
"tts": "mock-tts",
|
||||
"voice_recognition": map[string]any{
|
||||
"model": "mock-speaker",
|
||||
"mode": "verify",
|
||||
"threshold": 0.25,
|
||||
"when": "every",
|
||||
"on_reject": "drop_event",
|
||||
"references": []map[string]any{
|
||||
{"name": "e2e-speaker", "audio": voiceRefPath},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
gatedData, err := yaml.Marshal(gatedCfg)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(os.WriteFile(filepath.Join(modelsPath, "realtime-pipeline-gated.yaml"), gatedData, 0644)).To(Succeed())
|
||||
|
||||
// Router model setup: a score classifier (mock-backend Score) selects
|
||||
// between two candidate chat models based on keyword matches against the
|
||||
// candidate label fragments. Exercises the full RouteModel middleware path
|
||||
|
||||
@@ -852,6 +852,70 @@ func (m *MockBackend) ModelMetadata(ctx context.Context, in *pb.ModelOptions) (*
|
||||
}, nil
|
||||
}
|
||||
|
||||
// voiceEmbedFromWAV reads the file at path, skips a fixed 44-byte WAV header,
// and treats the remaining bytes as 16-bit little-endian PCM samples. It
// returns a 2-d "speaker" embedding selected by the mean (signed DC offset) of
// those samples:
//
//   - mean >  500: {1, 0}
//   - mean < -500: {0, 1}
//   - otherwise:   {0.7071, 0.7071}, a neutral vector equidistant from both
//
// The two biased cases are orthogonal unit vectors, so e2e tests can
// deterministically simulate two distinct "speakers" that survive resampling
// (DC is sample-rate independent).
//
// It returns nil when the file cannot be read, is shorter than the header, or
// holds no complete sample after the header.
func voiceEmbedFromWAV(path string) []float32 {
	const (
		// headerLen is the canonical WAV header size; the header is skipped,
		// not parsed.
		headerLen = 44
		// dcThreshold is the absolute mean sample value a clip must exceed
		// to count as DC-biased.
		dcThreshold = 500
	)

	data, err := os.ReadFile(path)
	if err != nil || len(data) < headerLen {
		return nil
	}
	pcm := data[headerLen:]
	n := len(pcm) / 2 // a trailing odd byte is ignored
	if n == 0 {
		return nil
	}

	var sum float64
	for i := 0; i < n; i++ {
		// Assemble the unsigned little-endian word, then reinterpret it as a
		// signed sample.
		s := int16(uint16(pcm[2*i]) | uint16(pcm[2*i+1])<<8)
		sum += float64(s)
	}

	mean := sum / float64(n)
	switch {
	case mean > dcThreshold:
		return []float32{1, 0}
	case mean < -dcThreshold:
		return []float32{0, 1}
	default:
		return []float32{0.7071, 0.7071}
	}
}
|
||||
|
||||
// VoiceEmbed returns a deterministic 2-d speaker embedding for the audio clip.
|
||||
// See voiceEmbedFromWAV for the (test-only) DC-offset discrimination scheme.
|
||||
func (m *MockBackend) VoiceEmbed(ctx context.Context, in *pb.VoiceEmbedRequest) (*pb.VoiceEmbedResponse, error) {
|
||||
emb := voiceEmbedFromWAV(in.GetAudio())
|
||||
xlog.Debug("VoiceEmbed called", "audio", in.GetAudio(), "embedding", emb)
|
||||
if len(emb) == 0 {
|
||||
return &pb.VoiceEmbedResponse{}, nil
|
||||
}
|
||||
return &pb.VoiceEmbedResponse{Embedding: emb, Model: "mock-speaker"}, nil
|
||||
}
|
||||
|
||||
// VoiceVerify compares two clips by cosine distance over their mock embeddings.
|
||||
func (m *MockBackend) VoiceVerify(ctx context.Context, in *pb.VoiceVerifyRequest) (*pb.VoiceVerifyResponse, error) {
|
||||
a := voiceEmbedFromWAV(in.GetAudio1())
|
||||
b := voiceEmbedFromWAV(in.GetAudio2())
|
||||
dist := float32(1)
|
||||
if len(a) == 2 && len(b) == 2 {
|
||||
dist = 1 - (a[0]*b[0] + a[1]*b[1]) // both unit vectors
|
||||
}
|
||||
threshold := in.GetThreshold()
|
||||
if threshold == 0 {
|
||||
threshold = 0.25
|
||||
}
|
||||
xlog.Debug("VoiceVerify called", "distance", dist, "threshold", threshold)
|
||||
return &pb.VoiceVerifyResponse{
|
||||
Verified: dist <= threshold,
|
||||
Distance: dist,
|
||||
Threshold: threshold,
|
||||
Model: "mock-speaker",
|
||||
}, nil
|
||||
}
|
||||
|
||||
func main() {
|
||||
xlog.SetLogger(xlog.NewLogger(xlog.LogLevel(os.Getenv("LOCALAI_LOG_LEVEL")), os.Getenv("LOCALAI_LOG_FORMAT")))
|
||||
|
||||
|
||||
134
tests/e2e/realtime_voicegate_test.go
Normal file
134
tests/e2e/realtime_voicegate_test.go
Normal file
@@ -0,0 +1,134 @@
|
||||
package e2e_test
|
||||
|
||||
import (
|
||||
"encoding/base64"
|
||||
"encoding/binary"
|
||||
"math"
|
||||
"time"
|
||||
|
||||
"github.com/gorilla/websocket"
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
// --- helpers: DC-biased PCM/WAV for the voice-recognition gate e2e ---
|
||||
//
|
||||
// The mock-backend embeds audio to one of two orthogonal "speaker" vectors
|
||||
// based on the signed DC offset of the samples (see voiceEmbedFromWAV in the
|
||||
// mock-backend). A positive bias is the authorized speaker (matches the
|
||||
// enrolled reference); a negative bias is an unauthorized one.
|
||||
|
||||
// pcmWithDC returns 16-bit little-endian mono PCM holding a sine wave of the
// given frequency (Hz) with a constant DC bias added to every sample.
//
// The clip has sampleRate*durationMs/1000 samples (integer arithmetic). The
// sine has a fixed amplitude of a quarter of the int16 range; each sample is
// clamped to the int16 range before being truncated to an integer, so a large
// bias saturates instead of wrapping around.
func pcmWithDC(freq float64, sampleRate, durationMs int, dc int16) []byte {
	// Integer constant division: amplitude is 8191.
	const amplitude = math.MaxInt16 / 4

	numSamples := sampleRate * durationMs / 1000
	pcm := make([]byte, numSamples*2)
	for i := 0; i < numSamples; i++ {
		t := float64(i) / float64(sampleRate)
		v := float64(dc) + amplitude*math.Sin(2*math.Pi*freq*t)
		switch {
		case v > math.MaxInt16:
			v = math.MaxInt16
		case v < math.MinInt16:
			v = math.MinInt16
		}
		binary.LittleEndian.PutUint16(pcm[2*i:], uint16(int16(v)))
	}
	return pcm
}
|
||||
|
||||
// wavFromPCM wraps 16-bit little-endian mono PCM in a canonical 44-byte WAV
// header (RIFF/WAVE, a 16-byte PCM "fmt " chunk, then the "data" chunk) and
// returns header and samples as one new slice. The pcm argument is not
// modified.
func wavFromPCM(pcm []byte, sampleRate int) []byte {
	const (
		headerLen     = 44
		fmtChunkSize  = 16 // size of the PCM fmt chunk body
		formatPCM     = 1  // audio format tag for uncompressed PCM
		channels      = 1  // mono
		bitsPerSample = 16
		blockAlign    = channels * bitsPerSample / 8 // bytes per sample frame
	)

	le := binary.LittleEndian
	out := make([]byte, headerLen, headerLen+len(pcm))

	copy(out[0:4], "RIFF")
	// RIFF chunk size counts everything after the first 8 bytes.
	le.PutUint32(out[4:8], uint32(headerLen-8+len(pcm)))
	copy(out[8:12], "WAVE")

	copy(out[12:16], "fmt ")
	le.PutUint32(out[16:20], fmtChunkSize)
	le.PutUint16(out[20:22], formatPCM)
	le.PutUint16(out[22:24], channels)
	le.PutUint32(out[24:28], uint32(sampleRate))
	le.PutUint32(out[28:32], uint32(sampleRate*blockAlign)) // byte rate
	le.PutUint16(out[32:34], blockAlign)
	le.PutUint16(out[34:36], bitsPerSample)

	copy(out[36:40], "data")
	le.PutUint32(out[40:44], uint32(len(pcm)))

	return append(out, pcm...)
}
|
||||
|
||||
var _ = Describe("Realtime voice recognition gate", Label("Realtime"), func() {
|
||||
// open connects to the gated pipeline and disables server VAD so we can
|
||||
// commit manually.
|
||||
open := func() *websocket.Conn {
|
||||
c := connectWS("realtime-pipeline-gated")
|
||||
created := readServerEvent(c, 30*time.Second)
|
||||
Expect(created["type"]).To(Equal("session.created"))
|
||||
sendClientEvent(c, disableVADEvent())
|
||||
drainUntil(c, "session.updated", 10*time.Second)
|
||||
return c
|
||||
}
|
||||
|
||||
// commit appends raw PCM (base64) and commits the input buffer.
|
||||
commit := func(c *websocket.Conn, pcm []byte) {
|
||||
sendClientEvent(c, map[string]any{
|
||||
"type": "input_audio_buffer.append",
|
||||
"audio": base64.StdEncoding.EncodeToString(pcm),
|
||||
})
|
||||
sendClientEvent(c, map[string]any{"type": "input_audio_buffer.commit"})
|
||||
}
|
||||
|
||||
It("admits an authorized speaker through to a full response", func() {
|
||||
c := open()
|
||||
defer c.Close()
|
||||
|
||||
// Positive DC bias matches the enrolled reference speaker.
|
||||
commit(c, pcmWithDC(300, 16000, 1000, 8000))
|
||||
drainUntil(c, "input_audio_buffer.committed", 30*time.Second)
|
||||
|
||||
var gotDone, gotReject bool
|
||||
deadline := time.Now().Add(60 * time.Second)
|
||||
for time.Now().Before(deadline) {
|
||||
evt := readServerEvent(c, time.Until(deadline))
|
||||
if evt["type"] == "error" {
|
||||
if e, ok := evt["error"].(map[string]any); ok && e["code"] == "speaker_not_authorized" {
|
||||
gotReject = true
|
||||
}
|
||||
}
|
||||
if evt["type"] == "response.done" {
|
||||
gotDone = true
|
||||
break
|
||||
}
|
||||
}
|
||||
Expect(gotReject).To(BeFalse(), "authorized speaker must not be rejected")
|
||||
Expect(gotDone).To(BeTrue(), "authorized speaker should reach response.done")
|
||||
})
|
||||
|
||||
It("drops an unauthorized speaker before the LLM with a reject event", func() {
|
||||
c := open()
|
||||
defer c.Close()
|
||||
|
||||
// Negative DC bias is a different speaker, not within threshold.
|
||||
commit(c, pcmWithDC(300, 16000, 1000, -8000))
|
||||
drainUntil(c, "input_audio_buffer.committed", 30*time.Second)
|
||||
|
||||
var gotReject, gotDone bool
|
||||
deadline := time.Now().Add(30 * time.Second)
|
||||
for time.Now().Before(deadline) {
|
||||
evt := readServerEvent(c, time.Until(deadline))
|
||||
switch evt["type"] {
|
||||
case "error":
|
||||
if e, ok := evt["error"].(map[string]any); ok && e["code"] == "speaker_not_authorized" {
|
||||
gotReject = true
|
||||
}
|
||||
case "response.done":
|
||||
gotDone = true
|
||||
}
|
||||
if gotReject {
|
||||
break
|
||||
}
|
||||
}
|
||||
Expect(gotReject).To(BeTrue(), "unauthorized speaker should get a speaker_not_authorized event")
|
||||
Expect(gotDone).To(BeFalse(), "unauthorized speaker must not reach the LLM/response.done")
|
||||
})
|
||||
})
|
||||
Reference in New Issue
Block a user