diff --git a/core/config/meta/registry.go b/core/config/meta/registry.go index c404effb2..c03d52ee4 100644 --- a/core/config/meta/registry.go +++ b/core/config/meta/registry.go @@ -355,6 +355,85 @@ func DefaultRegistry() map[string]FieldMetaOverride { Component: "toggle", Order: 69, }, + "pipeline.voice_recognition.model": { + Section: "pipeline", + Label: "Voice Recognition Model", + Description: "Speaker-recognition backend model used to gate the pipeline behind speaker verification. Leave empty to disable the voice gate.", + Component: "model-select", + AutocompleteProvider: ProviderModels, + Order: 70, + }, + "pipeline.voice_recognition.mode": { + Section: "pipeline", + Label: "Voice Gate Mode", + Description: "How callers are authorized: 'identify' matches the speaker 1:N against the voice registry; 'verify' matches 1:few against the configured reference audios.", + Component: "select", + Options: []FieldOption{ + {Value: "identify", Label: "identify (registry)"}, + {Value: "verify", Label: "verify (references)"}, + }, + Order: 71, + }, + "pipeline.voice_recognition.threshold": { + Section: "pipeline", + Label: "Voice Gate Threshold", + Description: "Maximum cosine distance between the caller and an authorized speaker that still counts as a match. Lower is stricter. Default 0.25 is tuned for the ECAPA-TDNN encoder on VoxCeleb.", + Component: "slider", + Min: f64(0.01), + Max: f64(2), + Step: f64(0.01), + Order: 72, + }, + "pipeline.voice_recognition.when": { + Section: "pipeline", + Label: "Voice Gate When", + Description: "How often to verify the speaker: 'every' checks each utterance; 'first' verifies once and then trusts the session.", + Component: "select", + Options: []FieldOption{ + {Value: "every", Label: "every utterance"}, + {Value: "first", Label: "first only"}, + }, + Order: 73, + }, + "pipeline.voice_recognition.on_reject": { + Section: "pipeline", + Label: "Voice Gate On Reject", + Description: "What to do with an unauthorized utterance: 'drop_event' drops it and emits an error event to the client; 'drop_silent' drops it quietly.", + Component: "select", + Options: []FieldOption{ + {Value: "drop_event", Label: "drop + error event"}, + {Value: "drop_silent", Label: "drop silently"}, + }, + Order: 74, + }, + "pipeline.voice_recognition.anti_spoofing": { + Section: "pipeline", + Label: "Voice Gate Anti-Spoofing", + Description: "Enable the backend liveness/anti-spoofing check (verify mode only) to reject replayed or synthesized audio.", + Component: "toggle", + Order: 75, + }, + "pipeline.voice_recognition.allow.names": { + Section: "pipeline", + Label: "Voice Gate Allowed Names", + Description: "Identify mode: authorize only registry identities whose name matches one of these exactly. Empty allows any registered identity.", + Component: "string-list", + Order: 76, + }, + "pipeline.voice_recognition.allow.labels": { + Section: "pipeline", + Label: "Voice Gate Allowed Labels", + Description: "Identify mode: authorize any registry identity carrying one of these label keys. Empty allows any registered identity.", + Component: "string-list", + Order: 77, + }, + "pipeline.voice_recognition.references": { + Section: "pipeline", + Label: "Voice Gate References", + Description: "Verify mode: the authorized reference speakers, each with a name and an audio file path the caller's voice is matched against.", + Component: "json-editor", + Order: 78, + }, // --- Functions --- "function.grammar.parallel_calls": { diff --git a/core/config/model_config.go b/core/config/model_config.go index 6b1de098e..195739654 100644 --- a/core/config/model_config.go +++ b/core/config/model_config.go @@ -509,6 +509,10 @@ type Pipeline struct { // to enable_thinking=false backend metadata) without editing the underlying // LLM model config. Unset leaves the LLM model config in charge. DisableThinking *bool `yaml:"disable_thinking,omitempty" json:"disable_thinking,omitempty"` + + // VoiceRecognition gates the pipeline behind speaker verification. Nil + // (block absent) means no gate, preserving existing behavior. + VoiceRecognition *PipelineVoiceRecognition `yaml:"voice_recognition,omitempty" json:"voice_recognition,omitempty"` } // ApplyReasoningEffort resolves the effective reasoning effort — a per-request @@ -575,6 +579,123 @@ func (p Pipeline) ThinkingDisabled() bool { return p.DisableThinking != nil && *p.DisableThinking } +// Voice-recognition gate enum values. +const ( + VoiceGateModeIdentify = "identify" + VoiceGateModeVerify = "verify" + VoiceGateWhenEvery = "every" + VoiceGateWhenFirst = "first" + VoiceGateRejectEvent = "drop_event" + VoiceGateRejectSilent = "drop_silent" + + // defaultVoiceGateThreshold is the cosine-distance default tuned for the + // ECAPA-TDNN speaker encoder on VoxCeleb. + defaultVoiceGateThreshold = 0.25 +) + +// @Description PipelineVoiceRecognition gates a realtime pipeline behind speaker verification. +type PipelineVoiceRecognition struct { + // Model is the speaker-recognition backend model name. + Model string `yaml:"model,omitempty" json:"model,omitempty"` + // Mode is "identify" (1:N against the voice registry) or "verify" + // (1:few against reference audios). + Mode string `yaml:"mode,omitempty" json:"mode,omitempty"` + // Threshold is the maximum cosine distance that still counts as a match. + Threshold float32 `yaml:"threshold,omitempty" json:"threshold,omitempty"` + // When is "every" (verify each utterance) or "first" (verify once, then + // trust the session). + When string `yaml:"when,omitempty" json:"when,omitempty"` + // OnReject is "drop_event" (drop + emit an error event) or "drop_silent" + // (drop quietly). + OnReject string `yaml:"on_reject,omitempty" json:"on_reject,omitempty"` + // AntiSpoofing enables the backend liveness check (verify mode only). + AntiSpoofing bool `yaml:"anti_spoofing,omitempty" json:"anti_spoofing,omitempty"` + // Allow filters which registry identities are authorized (identify mode). + Allow VoiceRecognitionAllow `yaml:"allow,omitempty" json:"allow,omitempty"` + // References are the authorized reference speakers (verify mode). + References []VoiceReference `yaml:"references,omitempty" json:"references,omitempty"` +} + +// @Description VoiceRecognitionAllow filters authorized registry identities. +type VoiceRecognitionAllow struct { + // Names matches registered Metadata.Name exactly. + Names []string `yaml:"names,omitempty" json:"names,omitempty"` + // Labels authorizes any identity carrying a matching label key. + Labels []string `yaml:"labels,omitempty" json:"labels,omitempty"` +} + +// @Description VoiceReference is one authorized reference speaker for verify mode. +type VoiceReference struct { + Name string `yaml:"name,omitempty" json:"name,omitempty"` + Audio string `yaml:"audio,omitempty" json:"audio,omitempty"` +} + +// VoiceGateEnabled reports whether a voice-recognition gate is configured. The +// mere presence of the block is the intent signal: a present-but-incomplete +// block (e.g. missing model) must fail closed at construction, not be silently +// skipped here. +func (p Pipeline) VoiceGateEnabled() bool { + return p.VoiceRecognition != nil +} + +// Normalize fills in defaults in place for omitted fields. +func (v *PipelineVoiceRecognition) Normalize() { + if v.Mode == "" { + v.Mode = VoiceGateModeIdentify + } + if v.When == "" { + v.When = VoiceGateWhenEvery + } + if v.OnReject == "" { + v.OnReject = VoiceGateRejectEvent + } + if v.Threshold == 0 { + v.Threshold = defaultVoiceGateThreshold + } +} + +// Validate checks shape and enum values. registryAvailable indicates whether a +// VoiceRegistry exists (required by identify mode). Empty When/OnReject/Mode are +// treated as valid because Normalize defaults them. +func (v PipelineVoiceRecognition) Validate(registryAvailable bool) error { + if v.Model == "" { + return fmt.Errorf("voice_recognition: model is required") + } + switch v.Mode { + case "", VoiceGateModeIdentify: + if !registryAvailable { + return fmt.Errorf("voice_recognition mode 'identify' requires a voice registry") + } + case VoiceGateModeVerify: + if len(v.References) == 0 { + return fmt.Errorf("voice_recognition mode 'verify' requires at least one reference") + } + for i, r := range v.References { + if r.Audio == "" { + return fmt.Errorf("voice_recognition reference %d (%q) is missing an audio path", i, r.Name) + } + } + default: + return fmt.Errorf("voice_recognition: unknown mode %q", v.Mode) + } + switch v.When { + case "", VoiceGateWhenEvery, VoiceGateWhenFirst: + default: + return fmt.Errorf("voice_recognition: unknown when %q", v.When) + } + switch v.OnReject { + case "", VoiceGateRejectEvent, VoiceGateRejectSilent: + default: + return fmt.Errorf("voice_recognition: unknown on_reject %q", v.OnReject) + } + // A zero threshold means "unset" (Normalize defaults it); only validate an + // explicitly-set value. Cosine distance ranges 0..2. + if v.Threshold != 0 && (v.Threshold < 0 || v.Threshold > 2) { + return fmt.Errorf("voice_recognition: threshold %v out of range (0..2)", v.Threshold) + } + return nil +} + // @Description File configuration for model downloads type File struct { Filename string `yaml:"filename,omitempty" json:"filename,omitempty"` diff --git a/core/config/voice_gate_test.go b/core/config/voice_gate_test.go new file mode 100644 index 000000000..5c7782f1c --- /dev/null +++ b/core/config/voice_gate_test.go @@ -0,0 +1,73 @@ +package config + +import ( + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +var _ = Describe("PipelineVoiceRecognition", func() { + Describe("Normalize", func() { + It("fills defaults for empty fields", func() { + v := PipelineVoiceRecognition{Model: "spk"} + v.Normalize() + Expect(v.Mode).To(Equal(VoiceGateModeIdentify)) + Expect(v.When).To(Equal(VoiceGateWhenEvery)) + Expect(v.OnReject).To(Equal(VoiceGateRejectEvent)) + Expect(v.Threshold).To(BeNumerically("~", defaultVoiceGateThreshold, 1e-6)) + }) + It("keeps explicit values", func() { + v := PipelineVoiceRecognition{Model: "spk", Mode: VoiceGateModeVerify, When: VoiceGateWhenFirst, OnReject: VoiceGateRejectSilent, Threshold: 0.4} + v.Normalize() + Expect(v.Mode).To(Equal(VoiceGateModeVerify)) + Expect(v.When).To(Equal(VoiceGateWhenFirst)) + Expect(v.OnReject).To(Equal(VoiceGateRejectSilent)) + Expect(v.Threshold).To(BeNumerically("~", 0.4, 1e-6)) + }) + }) + + Describe("Validate", func() { + It("requires a registry for identify mode", func() { + v := PipelineVoiceRecognition{Model: "spk", Mode: VoiceGateModeIdentify} + Expect(v.Validate(false)).To(HaveOccurred()) + Expect(v.Validate(true)).ToNot(HaveOccurred()) + }) + It("requires references for verify mode", func() { + v := PipelineVoiceRecognition{Model: "spk", Mode: VoiceGateModeVerify} + Expect(v.Validate(false)).To(HaveOccurred()) + v.References = []VoiceReference{{Name: "a", Audio: "/a.wav"}} + Expect(v.Validate(false)).ToNot(HaveOccurred()) + }) + It("rejects a reference with no audio path", func() { + v := PipelineVoiceRecognition{Model: "spk", Mode: VoiceGateModeVerify, References: []VoiceReference{{Name: "a"}}} + Expect(v.Validate(false)).To(HaveOccurred()) + }) + It("rejects unknown enum values", func() { + Expect((PipelineVoiceRecognition{Model: "spk", Mode: "bogus"}).Validate(true)).To(HaveOccurred()) + Expect((PipelineVoiceRecognition{Model: "spk", Mode: VoiceGateModeIdentify, When: "bogus"}).Validate(true)).To(HaveOccurred()) + Expect((PipelineVoiceRecognition{Model: "spk", Mode: VoiceGateModeIdentify, OnReject: "bogus"}).Validate(true)).To(HaveOccurred()) + }) + It("accepts a zero (unset) threshold", func() { + v := PipelineVoiceRecognition{Model: "spk", Mode: VoiceGateModeIdentify, Threshold: 0} + Expect(v.Validate(true)).ToNot(HaveOccurred()) + }) + It("rejects an out-of-range threshold", func() { + Expect((PipelineVoiceRecognition{Model: "spk", Mode: VoiceGateModeIdentify, Threshold: 5}).Validate(true)).To(HaveOccurred()) + Expect((PipelineVoiceRecognition{Model: "spk", Mode: VoiceGateModeIdentify, Threshold: -1}).Validate(true)).To(HaveOccurred()) + }) + It("rejects an empty model", func() { + Expect((PipelineVoiceRecognition{Mode: VoiceGateModeIdentify}).Validate(true)).To(HaveOccurred()) + }) + }) + + Describe("VoiceGateEnabled", func() { + It("is false when block absent", func() { + Expect((Pipeline{}).VoiceGateEnabled()).To(BeFalse()) + }) + It("is true when a model is set", func() { + Expect((Pipeline{VoiceRecognition: &PipelineVoiceRecognition{Model: "spk"}}).VoiceGateEnabled()).To(BeTrue()) + }) + It("is true when the block is present even without a model (fails closed downstream)", func() { + Expect((Pipeline{VoiceRecognition: &PipelineVoiceRecognition{}}).VoiceGateEnabled()).To(BeTrue()) + }) + }) +}) diff --git a/core/http/endpoints/openai/realtime.go b/core/http/endpoints/openai/realtime.go index 63b1a1589..f626a895c 100644 --- a/core/http/endpoints/openai/realtime.go +++ b/core/http/endpoints/openai/realtime.go @@ -133,6 +133,13 @@ type Session struct { // silently strip Manage Mode's tools. AssistantTools []types.ToolUnion + // voiceGate is non-nil when pipeline.voice_recognition is configured. It + // authorizes each committed utterance's speaker before the LLM runs. + voiceGate *voiceGate + // gateMu guards the when:first verification state below. + gateMu sync.Mutex + voiceVerified bool + // Response cancellation: protects activeResponseCancel/activeResponseDone responseMu sync.Mutex activeResponseCancel context.CancelFunc @@ -514,6 +521,23 @@ func runRealtimeSession(application *application.Application, t Transport, model } session.ModelInterface = m + if cfg.Pipeline.VoiceGateEnabled() { + gate, gerr := newVoiceGate( + *cfg.Pipeline.VoiceRecognition, + application.ModelConfigLoader(), + application.ModelLoader(), + application.ApplicationConfig(), + application.VoiceRegistry(), + ) + if gerr != nil { + xlog.Error("failed to initialize voice recognition gate", "error", gerr) + sendError(t, "voice_gate_error", gerr.Error(), "", "") + return + } + session.voiceGate = gate + xlog.Info("realtime voice recognition gate enabled", "mode", gate.cfg.Mode, "when", gate.cfg.When) + } + // Store the session and notify the transport (for WebRTC audio track handling) sessionLock.Lock() sessions[sessionID] = session @@ -1269,6 +1293,39 @@ func commitUtterance(ctx context.Context, utt []byte, session *Session, conv *Co f.Sync() + // Start speaker verification concurrently with transcription. This is a + // latency optimization only: there is a hard join below before the LLM, so + // an unauthorized utterance never reaches generateResponse (no LLM, no + // tools, no TTS) regardless of how fast transcription finishes. A rejected + // turn wastes only transcription compute, which has no side effects. The + // transcript is still emitted to the same peer that sent the audio, which + // reveals nothing new to them. + type gateOutcome struct { + allowed bool + matched string + reason string + err error + } + var gateCh chan gateOutcome + runGate := false + if session.voiceGate != nil && session.InputAudioTranscription != nil { + skip := false + if session.voiceGate.cfg.When == config.VoiceGateWhenFirst { + session.gateMu.Lock() + skip = session.voiceVerified + session.gateMu.Unlock() + } + if !skip { + runGate = true + gateCh = make(chan gateOutcome, 1) + wavPath := f.Name() + go func() { + allowed, matched, reason, gerr := session.voiceGate.Authorize(ctx, wavPath) + gateCh <- gateOutcome{allowed: allowed, matched: matched, reason: reason, err: gerr} + }() + } + } + // TODO: If we have a real any-to-any model then transcription is optional var transcript string if session.InputAudioTranscription != nil { @@ -1278,14 +1335,54 @@ func commitUtterance(ctx context.Context, utt []byte, session *Session, conv *Co var err error transcript, err = emitTranscription(ctx, t, session, generateItemID(), f.Name()) if err != nil { + // Drain the gate goroutine before returning so its in-flight read of + // the temp WAV finishes before the deferred os.Remove fires. + if runGate { + <-gateCh + } sendError(t, "transcription_failed", err.Error(), "", "event_TODO") return } } else { + // The voice gate runs only on the transcription path above; if an + // any-to-any model path is added here, join the gate before responding. sendNotImplemented(t, "any-to-any models") return } + // Join on the gate before any side-effecting step. + if runGate { + out := <-gateCh + allowed := out.allowed + reason := out.reason + if out.err != nil { + // Fail closed: a gate that cannot decide must not let audio through. + xlog.Error("voice recognition gate error", "error", out.err) + allowed = false + reason = "verification error" + } + alreadyVerified := false + if session.voiceGate.cfg.When == config.VoiceGateWhenFirst { + session.gateMu.Lock() + alreadyVerified = session.voiceVerified + session.gateMu.Unlock() + } + proceed, markVerified := session.voiceGate.decide(alreadyVerified, allowed) + if !proceed { + xlog.Debug("voice recognition gate rejected utterance", "reason", reason) + if session.voiceGate.cfg.OnReject == config.VoiceGateRejectEvent { + sendError(t, "speaker_not_authorized", "speaker not authorized: "+reason, "", "event_TODO") + } + return + } + xlog.Debug("voice recognition gate authorized utterance", "speaker", out.matched) + if markVerified { + session.gateMu.Lock() + session.voiceVerified = true + session.gateMu.Unlock() + } + } + if !session.TranscriptionOnly { generateResponse(ctx, session, utt, transcript, conv, t) } diff --git a/core/http/endpoints/openai/realtime_voicegate.go b/core/http/endpoints/openai/realtime_voicegate.go new file mode 100644 index 000000000..54332536f --- /dev/null +++ b/core/http/endpoints/openai/realtime_voicegate.go @@ -0,0 +1,212 @@ +package openai + +import ( + "context" + "fmt" + "math" + + "github.com/mudler/LocalAI/core/backend" + "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/services/voicerecognition" + "github.com/mudler/LocalAI/pkg/model" +) + +type namedEmbedding struct { + name string + emb []float32 +} + +// voiceGate decides whether a committed utterance's speaker is authorized to +// drive the realtime pipeline. +type voiceGate struct { + cfg config.PipelineVoiceRecognition // normalized + registry voicerecognition.Registry // identify mode (nil otherwise) + refEmbeds []namedEmbedding // verify mode, pre-embedded refs + refAudios []config.VoiceReference // verify + anti-spoofing: ref paths + + // Seams for testing; set by newVoiceGate to call the real backend. + embedFn func(ctx context.Context, wavPath string) ([]float32, error) + verifyFn func(ctx context.Context, uttWav, refWav string) (bool, error) +} + +// newVoiceGate builds a gate from a pipeline's voice_recognition config. It +// validates fail-fast (before loading the model), loads the recognition model +// config, wires the real backend seams, and pre-embeds references for verify +// mode so per-turn cost is one utterance embed plus cheap cosine comparisons. +func newVoiceGate( + cfg config.PipelineVoiceRecognition, + cl *config.ModelConfigLoader, + ml *model.ModelLoader, + appConfig *config.ApplicationConfig, + registry voicerecognition.Registry, +) (*voiceGate, error) { + cfg.Normalize() + if err := cfg.Validate(registry != nil); err != nil { + return nil, err + } + + recCfg, err := cl.LoadModelConfigFileByName(cfg.Model, ml.ModelPath) + if err != nil { + return nil, fmt.Errorf("voice_recognition: failed to load model %q: %w", cfg.Model, err) + } + if valid, _ := recCfg.Validate(); !valid { + return nil, fmt.Errorf("voice_recognition: invalid model config %q", cfg.Model) + } + + g := &voiceGate{ + cfg: cfg, + registry: registry, + embedFn: func(ctx context.Context, wavPath string) ([]float32, error) { + res, err := backend.VoiceEmbed(ctx, wavPath, ml, appConfig, *recCfg) + if err != nil { + return nil, err + } + return res.Embedding, nil + }, + verifyFn: func(ctx context.Context, uttWav, refWav string) (bool, error) { + res, err := backend.VoiceVerify(ctx, uttWav, refWav, cfg.Threshold, true, ml, appConfig, *recCfg) + if err != nil { + return false, err + } + return res.Verified, nil + }, + } + + if cfg.Mode == config.VoiceGateModeVerify { + if cfg.AntiSpoofing { + g.refAudios = cfg.References + } else { + for _, r := range cfg.References { + emb, err := g.embedFn(context.Background(), r.Audio) + if err != nil { + return nil, fmt.Errorf("voice_recognition: failed to embed reference %q: %w", r.Name, err) + } + g.refEmbeds = append(g.refEmbeds, namedEmbedding{name: r.Name, emb: emb}) + } + } + } + + return g, nil +} + +// Authorize embeds the utterance and decides allow/deny. +// +// allowed: speaker is authorized. +// matched: matched person's name (informational), empty if none. +// reason: human-readable deny reason. +// err: backend failure (caller should fail closed). +func (g *voiceGate) Authorize(ctx context.Context, wavPath string) (allowed bool, matched string, reason string, err error) { + if g.cfg.Mode == config.VoiceGateModeVerify { + return g.authorizeVerify(ctx, wavPath) + } + return g.authorizeIdentify(ctx, wavPath) +} + +func (g *voiceGate) authorizeIdentify(ctx context.Context, wavPath string) (bool, string, string, error) { + emb, err := g.embedFn(ctx, wavPath) + if err != nil { + return false, "", "embed failed", err + } + if len(emb) == 0 { + return false, "", "no speech detected", nil + } + matches, err := g.registry.Identify(ctx, emb, 1) + if err != nil { + return false, "", "identify failed", err + } + if len(matches) == 0 { + return false, "", "unknown speaker", nil + } + m := matches[0] + if m.Distance > g.cfg.Threshold { + return false, m.Metadata.Name, "distance above threshold", nil + } + if !g.allowMatch(m.Metadata) { + return false, m.Metadata.Name, "speaker not in allow list", nil + } + return true, m.Metadata.Name, "", nil +} + +// allowMatch reports whether a matched identity is authorized. An empty allow +// (no names and no labels) authorizes any registered speaker. +func (g *voiceGate) allowMatch(meta voicerecognition.Metadata) bool { + a := g.cfg.Allow + if len(a.Names) == 0 && len(a.Labels) == 0 { + return true + } + for _, n := range a.Names { + if n == meta.Name { + return true + } + } + for _, l := range a.Labels { + if _, ok := meta.Labels[l]; ok { + return true + } + } + return false +} + +func (g *voiceGate) authorizeVerify(ctx context.Context, wavPath string) (bool, string, string, error) { + if g.cfg.AntiSpoofing { + for _, r := range g.refAudios { + ok, err := g.verifyFn(ctx, wavPath, r.Audio) + if err != nil { + return false, "", "verify failed", err + } + if ok { + return true, r.Name, "", nil + } + } + return false, "", "no reference matched", nil + } + + emb, err := g.embedFn(ctx, wavPath) + if err != nil { + return false, "", "embed failed", err + } + if len(emb) == 0 { + return false, "", "no speech detected", nil + } + for _, r := range g.refEmbeds { + if cosineDistance(emb, r.emb) <= g.cfg.Threshold { + return true, r.name, "", nil + } + } + return false, "", "no reference matched", nil +} + +// decide interprets an Authorize result against the gate's when-policy and the +// session's prior verification state. +// proceed: run the LLM response for this utterance. +// markVerified: record a successful first-utterance verification. +// Note: when:first AND alreadyVerified is normally handled by the caller +// skipping Authorize entirely; if it still reaches here, proceed is true. +func (g *voiceGate) decide(alreadyVerified, allowed bool) (proceed, markVerified bool) { + if g.cfg.When == config.VoiceGateWhenFirst { + if alreadyVerified { + return true, false + } + return allowed, allowed + } + return allowed, false +} + +// cosineDistance returns 1 - cosine_similarity, matching the voice registry's +// distance convention (lower = closer). Returns 1 (treated as "no match") for +// zero-length, mismatched, or zero-magnitude vectors. +func cosineDistance(a, b []float32) float32 { + if len(a) == 0 || len(a) != len(b) { + return 1 + } + var dot, na, nb float64 + for i := range a { + dot += float64(a[i]) * float64(b[i]) + na += float64(a[i]) * float64(a[i]) + nb += float64(b[i]) * float64(b[i]) + } + if na == 0 || nb == 0 { + return 1 + } + return float32(1 - dot/(math.Sqrt(na)*math.Sqrt(nb))) +} diff --git a/core/http/endpoints/openai/realtime_voicegate_integration_test.go b/core/http/endpoints/openai/realtime_voicegate_integration_test.go new file mode 100644 index 000000000..f8aae72c5 --- /dev/null +++ b/core/http/endpoints/openai/realtime_voicegate_integration_test.go @@ -0,0 +1,154 @@ +package openai + +import ( + "context" + "errors" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "github.com/mudler/LocalAI/core/backend" + "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/http/endpoints/openai/types" + "github.com/mudler/LocalAI/core/schema" + "github.com/mudler/LocalAI/core/services/voicerecognition" +) + +// These specs drive the REAL commitUtterance path end to end (gate goroutine, +// the hard join before the LLM, the reject event, and when:first session +// trust) using the existing fakeTransport/fakeModel doubles. They are the +// integration counterpart to the unit specs in realtime_voicegate_test.go: +// here the gate is wired into a Session exactly as runRealtimeSession wires it. + +// itGate builds an identify-mode gate whose registry always returns a single +// match named matchName, and whose embedFn returns embed/embErr. allowName is +// the authorized identity. when/onReject select the policy. +func itGate(allowName, matchName string, embed []float32, embErr error, when, onReject string) *voiceGate { + return &voiceGate{ + cfg: config.PipelineVoiceRecognition{ + Mode: config.VoiceGateModeIdentify, + Threshold: 0.25, + When: when, + OnReject: onReject, + Allow: config.VoiceRecognitionAllow{Names: []string{allowName}}, + }, + registry: &fakeRegistry{matches: []voicerecognition.Match{ + {Distance: 0.1, Metadata: voicerecognition.Metadata{Name: matchName}}, + }}, + embedFn: func(context.Context, string) ([]float32, error) { return embed, embErr }, + } +} + +// itSession returns a Session + fakeModel wired for a full pipeline turn, with +// the given gate attached. The fakeModel mirrors the streaming-LLM setup used +// by realtime_stream_test.go so triggerResponse runs to a response.done. +func itSession(gate *voiceGate) (*Session, *fakeModel) { + on := true + m := &fakeModel{ + cfg: &config.ModelConfig{}, + transcribeFinal: &schema.TranscriptionResult{Text: "hello"}, + predictTokens: []string{"Hi", " there."}, + predictResp: backend.LLMResponse{Response: "Hi there."}, + ttsStreamChunks: [][]byte{{1}}, + ttsStreamRate: 24000, + } + session := &Session{ + OutputSampleRate: 24000, + InputAudioTranscription: &types.AudioTranscription{}, + ModelInterface: m, + ModelConfig: &config.ModelConfig{ + Pipeline: config.Pipeline{Streaming: config.PipelineStreaming{LLM: &on, TTS: &on}}, + }, + voiceGate: gate, + } + return session, m +} + +// hasSpeakerNotAuthorized reports whether a speaker_not_authorized error event +// was emitted to the client. +func hasSpeakerNotAuthorized(tr *fakeTransport) bool { + for _, e := range tr.events { + if ev, ok := e.(types.ErrorEvent); ok && ev.Error.Code == "speaker_not_authorized" { + return true + } + } + return false +} + +var _ = Describe("realtime voice gate integration (commitUtterance)", func() { + utt := make([]byte, 32) // non-empty PCM so commitUtterance proceeds + + It("allows an authorized speaker through to a full response", func() { + session, _ := itSession(itGate("alice", "alice", []float32{1, 0, 0}, nil, + config.VoiceGateWhenEvery, config.VoiceGateRejectEvent)) + tr := &fakeTransport{} + + commitUtterance(context.Background(), utt, session, &Conversation{}, tr) + + Expect(hasSpeakerNotAuthorized(tr)).To(BeFalse()) + // The LLM/TTS pipeline ran to completion. + Expect(tr.countEvents(types.ServerEventTypeResponseDone)).To(BeNumerically(">=", 1)) + // Transcription still happened (parallel with the gate). + Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionCompleted)).To(Equal(1)) + }) + + It("drops an unauthorized speaker before the LLM and emits a reject event", func() { + // match name "mallory" is not in the allow list → deny. + session, _ := itSession(itGate("alice", "mallory", []float32{1, 0, 0}, nil, + config.VoiceGateWhenEvery, config.VoiceGateRejectEvent)) + tr := &fakeTransport{} + + commitUtterance(context.Background(), utt, session, &Conversation{}, tr) + + // Hard barrier: the LLM/TTS pipeline never ran. + Expect(tr.countEvents(types.ServerEventTypeResponseDone)).To(Equal(0)) + // The client was told why. + Expect(hasSpeakerNotAuthorized(tr)).To(BeTrue()) + // Transcription of the rejected utterance still emitted (sent only to the + // peer that produced the audio; reveals nothing new). + Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionCompleted)).To(Equal(1)) + }) + + It("fails closed on a gate backend error", func() { + session, _ := itSession(itGate("alice", "alice", nil, errors.New("backend down"), + config.VoiceGateWhenEvery, config.VoiceGateRejectEvent)) + tr := &fakeTransport{} + + commitUtterance(context.Background(), utt, session, &Conversation{}, tr) + + Expect(tr.countEvents(types.ServerEventTypeResponseDone)).To(Equal(0)) + Expect(hasSpeakerNotAuthorized(tr)).To(BeTrue()) + }) + + It("drops silently when on_reject is drop_silent (no error event)", func() { + session, _ := itSession(itGate("alice", "mallory", []float32{1, 0, 0}, nil, + config.VoiceGateWhenEvery, config.VoiceGateRejectSilent)) + tr := &fakeTransport{} + + commitUtterance(context.Background(), utt, session, &Conversation{}, tr) + + Expect(tr.countEvents(types.ServerEventTypeResponseDone)).To(Equal(0)) + Expect(hasSpeakerNotAuthorized(tr)).To(BeFalse()) + }) + + It("when:first trusts the session after one match, even if later embeds fail", func() { + gate := itGate("alice", "alice", []float32{1, 0, 0}, nil, + config.VoiceGateWhenFirst, config.VoiceGateRejectEvent) + session, _ := itSession(gate) + + // First utterance: authorized, marks the session verified. + tr1 := &fakeTransport{} + commitUtterance(context.Background(), utt, session, &Conversation{}, tr1) + Expect(hasSpeakerNotAuthorized(tr1)).To(BeFalse()) + Expect(tr1.countEvents(types.ServerEventTypeResponseDone)).To(BeNumerically(">=", 1)) + + // Break the gate: any further Authorize would now error. + gate.embedFn = func(context.Context, string) ([]float32, error) { return nil, errors.New("boom") } + + // Second utterance still proceeds because when:first skips re-verification. + tr2 := &fakeTransport{} + commitUtterance(context.Background(), utt, session, &Conversation{}, tr2) + Expect(hasSpeakerNotAuthorized(tr2)).To(BeFalse()) + Expect(tr2.countEvents(types.ServerEventTypeResponseDone)).To(BeNumerically(">=", 1)) + }) +}) diff --git a/core/http/endpoints/openai/realtime_voicegate_test.go b/core/http/endpoints/openai/realtime_voicegate_test.go new file mode 100644 index 000000000..bdbbc2f4a --- /dev/null +++ b/core/http/endpoints/openai/realtime_voicegate_test.go @@ -0,0 +1,231 @@ +package openai + +import ( + "context" + "errors" + + "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/services/voicerecognition" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +var _ = Describe("cosineDistance", func() { + It("is 0 for identical vectors", func() { + Expect(cosineDistance([]float32{1, 0, 0}, []float32{1, 0, 0})).To(BeNumerically("~", 0, 1e-6)) + }) + It("is ~1 for orthogonal vectors", func() { + Expect(cosineDistance([]float32{1, 0}, []float32{0, 1})).To(BeNumerically("~", 1, 1e-6)) + }) + It("is ~2 for opposite vectors", func() { + Expect(cosineDistance([]float32{1, 0}, []float32{-1, 0})).To(BeNumerically("~", 2, 1e-6)) + }) + It("returns 1 for length mismatch", func() { + Expect(cosineDistance([]float32{1, 0}, []float32{1})).To(BeNumerically("~", 1, 1e-6)) + }) + It("returns 1 for a zero vector", func() { + Expect(cosineDistance([]float32{0, 0}, []float32{1, 0})).To(BeNumerically("~", 1, 1e-6)) + }) +}) + +type fakeRegistry struct { + matches []voicerecognition.Match + err error +} + +func (f *fakeRegistry) Register(ctx context.Context, emb []float32, m voicerecognition.Metadata) (voicerecognition.Metadata, error) { + return m, nil +} +func (f *fakeRegistry) Identify(ctx context.Context, probe []float32, topK int) ([]voicerecognition.Match, error) { + return f.matches, f.err +} +func (f *fakeRegistry) Forget(ctx context.Context, id string) error { return nil } + +var _ = Describe("voiceGate identify mode", func() { + stubEmbed := func(emb []float32, err error) func(context.Context, string) ([]float32, error) { + return func(context.Context, string) ([]float32, error) { return emb, err } + } + mkGate := func(allow config.VoiceRecognitionAllow, matches []voicerecognition.Match, embErr error) *voiceGate { + return &voiceGate{ + cfg: config.PipelineVoiceRecognition{Mode: config.VoiceGateModeIdentify, Threshold: 0.25, When: config.VoiceGateWhenEvery, OnReject: config.VoiceGateRejectEvent, Allow: allow}, + registry: &fakeRegistry{matches: matches}, + embedFn: stubEmbed([]float32{1, 0, 0}, embErr), + } + } + + It("allows a registered speaker within threshold and in the allow list", func() { + g := mkGate(config.VoiceRecognitionAllow{Names: []string{"alice"}}, + []voicerecognition.Match{{Distance: 0.1, Metadata: voicerecognition.Metadata{Name: "alice"}}}, nil) + allowed, matched, _, err := g.Authorize(context.Background(), "x.wav") + Expect(err).ToNot(HaveOccurred()) + Expect(allowed).To(BeTrue()) + Expect(matched).To(Equal("alice")) + }) + It("allows any registered speaker when the allow list is empty", func() { + g := mkGate(config.VoiceRecognitionAllow{}, + []voicerecognition.Match{{Distance: 0.1, Metadata: voicerecognition.Metadata{Name: "carol"}}}, nil) + allowed, _, _, _ := g.Authorize(context.Background(), "x.wav") + Expect(allowed).To(BeTrue()) + }) + It("allows by label", func() { + g := mkGate(config.VoiceRecognitionAllow{Labels: []string{"family"}}, + []voicerecognition.Match{{Distance: 0.1, Metadata: voicerecognition.Metadata{Name: "bob", Labels: map[string]string{"family": "yes"}}}}, nil) + allowed, _, _, _ := g.Authorize(context.Background(), "x.wav") + Expect(allowed).To(BeTrue()) + }) + It("denies a speaker not in the allow list", func() { + g := mkGate(config.VoiceRecognitionAllow{Names: []string{"alice"}}, + []voicerecognition.Match{{Distance: 0.1, Metadata: voicerecognition.Metadata{Name: "mallory"}}}, nil) + allowed, matched, reason, _ := g.Authorize(context.Background(), "x.wav") + Expect(allowed).To(BeFalse()) + Expect(matched).To(Equal("mallory")) + Expect(reason).To(ContainSubstring("allow")) + }) + It("denies a match above the threshold", func() { + g := mkGate(config.VoiceRecognitionAllow{}, + []voicerecognition.Match{{Distance: 0.9, Metadata: voicerecognition.Metadata{Name: "alice"}}}, nil) + allowed, matched, _, _ := g.Authorize(context.Background(), "x.wav") + Expect(allowed).To(BeFalse()) + Expect(matched).To(Equal("alice")) + }) + It("denies when no registry match", func() { + g := mkGate(config.VoiceRecognitionAllow{}, nil, nil) + allowed, _, reason, _ := g.Authorize(context.Background(), "x.wav") + Expect(allowed).To(BeFalse()) + Expect(reason).To(ContainSubstring("unknown")) + }) + It("denies (no error) when no speech is detected", func() { + g := mkGate(config.VoiceRecognitionAllow{}, nil, nil) + g.embedFn = stubEmbed(nil, nil) + allowed, _, reason, err := g.Authorize(context.Background(), "x.wav") + Expect(err).ToNot(HaveOccurred()) + Expect(allowed).To(BeFalse()) + Expect(reason).To(ContainSubstring("no speech")) + }) + It("denies and surfaces the error when embedding fails", func() { + g := mkGate(config.VoiceRecognitionAllow{}, nil, errors.New("boom")) + allowed, _, reason, err := g.Authorize(context.Background(), "x.wav") + Expect(err).To(HaveOccurred()) + Expect(allowed).To(BeFalse()) + Expect(reason).To(ContainSubstring("embed")) + }) + It("denies and surfaces the error when identify fails", func() { + g := mkGate(config.VoiceRecognitionAllow{}, nil, nil) + g.registry = &fakeRegistry{err: errors.New("boom")} + allowed, _, _, err := g.Authorize(context.Background(), "x.wav") + Expect(err).To(HaveOccurred()) + Expect(allowed).To(BeFalse()) + }) +}) + +var _ = Describe("voiceGate verify mode", func() { + It("allows when the utterance matches a reference embedding", func() { + g := &voiceGate{ + cfg: config.PipelineVoiceRecognition{Mode: config.VoiceGateModeVerify, Threshold: 0.25, When: config.VoiceGateWhenEvery, OnReject: config.VoiceGateRejectEvent}, + refEmbeds: []namedEmbedding{{name: "alice", emb: []float32{1, 0, 0}}}, + embedFn: func(context.Context, string) ([]float32, error) { return []float32{1, 0, 0}, nil }, + } + allowed, matched, _, err := g.Authorize(context.Background(), "x.wav") + Expect(err).ToNot(HaveOccurred()) + Expect(allowed).To(BeTrue()) + Expect(matched).To(Equal("alice")) + }) + It("denies when no reference is within threshold", func() { + g := &voiceGate{ + cfg: config.PipelineVoiceRecognition{Mode: config.VoiceGateModeVerify, Threshold: 0.25, When: config.VoiceGateWhenEvery, OnReject: config.VoiceGateRejectEvent}, + refEmbeds: []namedEmbedding{{name: "alice", emb: []float32{1, 0, 0}}}, + embedFn: func(context.Context, string) ([]float32, error) { return []float32{0, 1, 0}, nil }, + } + allowed, _, reason, _ := g.Authorize(context.Background(), "x.wav") + Expect(allowed).To(BeFalse()) + Expect(reason).To(ContainSubstring("reference")) + }) + It("denies (no error) when no speech is detected", func() { + g := &voiceGate{ + cfg: config.PipelineVoiceRecognition{Mode: config.VoiceGateModeVerify, Threshold: 0.25, When: config.VoiceGateWhenEvery, OnReject: config.VoiceGateRejectEvent}, + refEmbeds: []namedEmbedding{{name: "alice", emb: []float32{1, 0, 0}}}, + embedFn: func(context.Context, string) ([]float32, error) { return nil, nil }, + } + allowed, _, reason, err := g.Authorize(context.Background(), "x.wav") + Expect(err).ToNot(HaveOccurred()) + Expect(allowed).To(BeFalse()) + Expect(reason).To(ContainSubstring("no speech")) + }) + It("denies and surfaces the error when embedding fails", func() { + g := &voiceGate{ + cfg: config.PipelineVoiceRecognition{Mode: config.VoiceGateModeVerify, Threshold: 0.25, When: config.VoiceGateWhenEvery, OnReject: config.VoiceGateRejectEvent}, + refEmbeds: []namedEmbedding{{name: "alice", emb: []float32{1, 0, 0}}}, + embedFn: func(context.Context, string) ([]float32, error) { return nil, errors.New("boom") }, + } + allowed, _, _, err := g.Authorize(context.Background(), "x.wav") + Expect(err).To(HaveOccurred()) + Expect(allowed).To(BeFalse()) + }) + It("uses verifyFn when anti-spoofing is enabled", func() { + called := false + g := &voiceGate{ + cfg: config.PipelineVoiceRecognition{Mode: config.VoiceGateModeVerify, Threshold: 0.25, AntiSpoofing: true, When: config.VoiceGateWhenEvery, OnReject: config.VoiceGateRejectEvent}, + refAudios: []config.VoiceReference{{Name: "alice", Audio: "/alice.wav"}}, + verifyFn: func(context.Context, string, string) (bool, error) { called = true; return true, nil }, + } + allowed, matched, _, err := g.Authorize(context.Background(), "x.wav") + Expect(err).ToNot(HaveOccurred()) + Expect(called).To(BeTrue()) + Expect(allowed).To(BeTrue()) + Expect(matched).To(Equal("alice")) + }) + It("denies and surfaces the error when verifyFn fails (anti-spoofing)", func() { + g := &voiceGate{ + cfg: config.PipelineVoiceRecognition{Mode: config.VoiceGateModeVerify, Threshold: 0.25, AntiSpoofing: true, When: config.VoiceGateWhenEvery, OnReject: config.VoiceGateRejectEvent}, + refAudios: []config.VoiceReference{{Name: "alice", Audio: "/alice.wav"}}, + verifyFn: func(context.Context, string, string) (bool, error) { return false, errors.New("boom") }, + } + allowed, _, _, err := g.Authorize(context.Background(), "x.wav") + Expect(err).To(HaveOccurred()) + Expect(allowed).To(BeFalse()) + }) +}) + +var _ = Describe("newVoiceGate", func() { + It("fails fast when identify mode has no registry (before touching the loader)", func() { + cfg := config.PipelineVoiceRecognition{Model: "spk", Mode: config.VoiceGateModeIdentify} + g, err := newVoiceGate(cfg, nil, nil, nil, nil) + Expect(err).To(HaveOccurred()) + Expect(g).To(BeNil()) + }) + It("fails fast when verify mode has no references", func() { + cfg := config.PipelineVoiceRecognition{Model: "spk", Mode: config.VoiceGateModeVerify} + g, err := newVoiceGate(cfg, nil, nil, nil, nil) + Expect(err).To(HaveOccurred()) + Expect(g).To(BeNil()) + }) +}) + +var _ = Describe("voiceGate decide", func() { + gate := func(when string) *voiceGate { + return &voiceGate{cfg: config.PipelineVoiceRecognition{When: when}} + } + It("every: proceeds iff allowed, never marks verified", func() { + proceed, mark := gate(config.VoiceGateWhenEvery).decide(false, true) + Expect(proceed).To(BeTrue()) + Expect(mark).To(BeFalse()) + proceed, mark = gate(config.VoiceGateWhenEvery).decide(false, false) + Expect(proceed).To(BeFalse()) + Expect(mark).To(BeFalse()) + }) + It("first: marks verified on first allow", func() { + proceed, mark := gate(config.VoiceGateWhenFirst).decide(false, true) + Expect(proceed).To(BeTrue()) + Expect(mark).To(BeTrue()) + }) + It("first: denies on first reject without marking", func() { + proceed, mark := gate(config.VoiceGateWhenFirst).decide(false, false) + Expect(proceed).To(BeFalse()) + Expect(mark).To(BeFalse()) + }) + It("first: proceeds without re-check once already verified", func() { + proceed, mark := gate(config.VoiceGateWhenFirst).decide(true, false) + Expect(proceed).To(BeTrue()) + Expect(mark).To(BeFalse()) + }) +}) diff --git a/docs/content/features/openai-realtime.md b/docs/content/features/openai-realtime.md index d8ae1212b..51f8960bf 100644 --- a/docs/content/features/openai-realtime.md +++ b/docs/content/features/openai-realtime.md @@ -137,6 +137,52 @@ most reliable fix for WebRTC connections that establish and then drop. The API follows the OpenAI Realtime API protocol for handling sessions, audio buffers, and conversation items. +## Gating a realtime pipeline with voice recognition + +A pipeline realtime model can require speaker verification before it responds. Add a `voice_recognition` block under `pipeline`. When present, each committed utterance is verified against authorized speakers; unauthorized utterances are dropped before the LLM runs (no LLM call, no tool execution, no TTS). The session stays open. + +```yaml +name: my-realtime +pipeline: + vad: silero-vad + transcription: whisper + llm: qwen + tts: kokoro + voice_recognition: + model: speaker-recognition # the speaker-recognition backend model + mode: identify # "identify" (registry) or "verify" (references) + threshold: 0.25 # cosine distance; <= passes + when: every # "every" (default) or "first" + on_reject: drop_event # "drop_event" (default) or "drop_silent" + anti_spoofing: false # optional liveness check (verify mode) + + # identify mode: authorized registry identities (multiple persons) + allow: + names: ["alice", "bob"] # match registered speaker names + labels: ["family"] # OR any identity carrying this label + # empty allow = any registered speaker within threshold passes + + # verify mode: reference speakers (multiple persons) + references: + - name: alice + audio: /models/voices/alice.wav + - name: bob + audio: /models/voices/bob.wav +``` + +| Field | Meaning | +|-------|---------| +| `model` | Speaker-recognition backend model name. | +| `mode` | `identify` matches against speakers registered via `/v1/voice/register`; `verify` matches against the `references` audios. | +| `threshold` | Maximum cosine distance that still counts as a match (default ~0.25). | +| `when` | `every` verifies each utterance; `first` verifies once then trusts the session. | +| `on_reject` | `drop_event` drops and emits a `speaker_not_authorized` error event; `drop_silent` drops quietly. | +| `anti_spoofing` | Verify mode only: runs the backend liveness check (slower). | +| `allow.names` / `allow.labels` | identify mode: which registry identities are authorized. Empty = any registered speaker. | +| `references` | verify mode: authorized reference speakers; the utterance passes if it matches any. | + +`identify` mode requires the voice registry (speakers registered through `/v1/voice/register`). `verify` mode needs no registry: reference audios are embedded once at model load. + ## Examples - [Realtime voice assistant demo (Go)](https://github.com/localai-org/localai-realtime-demo): a minimal Go client for the Realtime (WebSocket) API with a full talk-back voice loop and an example tool call. Ships a `docker compose` setup that brings up a realtime-capable LocalAI for you.