diff --git a/core/config/meta/registry.go b/core/config/meta/registry.go index 84fc9afda..b7ffa9290 100644 --- a/core/config/meta/registry.go +++ b/core/config/meta/registry.go @@ -457,6 +457,55 @@ func DefaultRegistry() map[string]FieldMetaOverride { Component: "json-editor", Order: 78, }, + "pipeline.voice_recognition.enforce": { + Section: "pipeline", + Label: "Voice Gate Enforce", + Description: "Whether the gate rejects unauthorized speakers. Enabled (default) drops unauthorized utterances before the LLM. Disabled still resolves and surfaces the speaker (for the conversation.item.speaker event and personalization) but never drops a turn.", + Component: "toggle", + Order: 80, + }, + "pipeline.voice_recognition.identity.announce": { + Section: "pipeline", + Label: "Speaker Identity Announce", + Description: "Emit a conversation.item.speaker event to the client naming the recognized speaker. When set, identity is resolved on every turn even if 'when' is 'first'.", + Component: "toggle", + Order: 81, + }, + "pipeline.voice_recognition.identity.announce_unknown": { + Section: "pipeline", + Label: "Speaker Identity Announce Unknown", + Description: "Also emit the conversation.item.speaker event (with matched=false) when no confident match is found. Default only announces on a match.", + Component: "toggle", + Order: 82, + }, + "pipeline.voice_recognition.identity.personalize": { + Section: "pipeline", + Label: "Speaker Identity Personalize", + Description: "Inform the LLM who is speaking so it can tailor replies. Enables the name and system-note injection below.", + Component: "toggle", + Order: 83, + }, + "pipeline.voice_recognition.identity.inject_name": { + Section: "pipeline", + Label: "Speaker Identity Inject Name", + Description: "Personalization: set the per-message OpenAI 'name' field on each user turn to the recognized speaker.", + Component: "toggle", + Order: 84, + }, + "pipeline.voice_recognition.identity.inject_system_note": { + Section: "pipeline", + Label: "Speaker Identity Inject System Note", + Description: "Personalization: append a 'The current speaker is .' note to the system message reflecting the latest speaker.", + Component: "toggle", + Order: 85, + }, + "pipeline.voice_recognition.identity.note_unknown": { + Section: "pipeline", + Label: "Speaker Identity Note Unknown", + Description: "Personalization: when the speaker is unidentified, append 'The current speaker is unknown.' to the system message so the model can ask who it is talking to.", + Component: "toggle", + Order: 86, + }, "pipeline.max_history_items": { Section: "pipeline", Label: "Max History Items", diff --git a/core/config/model_config.go b/core/config/model_config.go index 9586beea3..5dbfd2026 100644 --- a/core/config/model_config.go +++ b/core/config/model_config.go @@ -769,6 +769,13 @@ type PipelineVoiceRecognition struct { Allow VoiceRecognitionAllow `yaml:"allow,omitempty" json:"allow,omitempty"` // References are the authorized reference speakers (verify mode). References []VoiceReference `yaml:"references,omitempty" json:"references,omitempty"` + // Enforce controls the authorization gate. A nil value or true rejects + // unauthorized speakers (the historical behavior). false resolves the + // speaker's identity for surfacing/personalization but never drops a turn. + Enforce *bool `yaml:"enforce,omitempty" json:"enforce,omitempty"` + // Identity surfaces the recognized speaker to the client and the LLM. It is + // independent of Enforce: identity can be surfaced without gating. + Identity *VoiceIdentityConfig `yaml:"identity,omitempty" json:"identity,omitempty"` } // @Description VoiceRecognitionAllow filters authorized registry identities. @@ -785,6 +792,25 @@ type VoiceReference struct { Audio string `yaml:"audio,omitempty" json:"audio,omitempty"` } +// @Description VoiceIdentityConfig surfaces the recognized speaker to the realtime +// client and the LLM. When set, identity is resolved on every turn even if the +// gate's When is "first" (the gate still authorizes only once). +type VoiceIdentityConfig struct { + // Announce emits a conversation.item.speaker event to the client. + Announce bool `yaml:"announce,omitempty" json:"announce,omitempty"` + // AnnounceUnknown also emits the event when there is no confident match. + AnnounceUnknown bool `yaml:"announce_unknown,omitempty" json:"announce_unknown,omitempty"` + // Personalize informs the LLM who is speaking. + Personalize bool `yaml:"personalize,omitempty" json:"personalize,omitempty"` + // InjectName sets the per-message name field on each user turn. + InjectName bool `yaml:"inject_name,omitempty" json:"inject_name,omitempty"` + // InjectSystemNote maintains a "current speaker" note in the system message. + InjectSystemNote bool `yaml:"inject_system_note,omitempty" json:"inject_system_note,omitempty"` + // NoteUnknown adds a "the current speaker is unknown" note (enables the model + // to ask who it is talking to). + NoteUnknown bool `yaml:"note_unknown,omitempty" json:"note_unknown,omitempty"` +} + // VoiceGateEnabled reports whether a voice-recognition gate is configured. The // mere presence of the block is the intent signal: a present-but-incomplete // block (e.g. missing model) must fail closed at construction, not be silently @@ -793,6 +819,28 @@ func (p Pipeline) VoiceGateEnabled() bool { return p.VoiceRecognition != nil } +// EnforceGate reports whether the gate rejects unauthorized speakers. A nil +// Enforce means "enforce" so existing configs keep gating. +func (p PipelineVoiceRecognition) EnforceGate() bool { + return p.Enforce == nil || *p.Enforce +} + +// IdentityEnabled reports whether the speaker's identity must be resolved for +// surfacing or personalization. +func (p PipelineVoiceRecognition) IdentityEnabled() bool { + return p.Identity != nil && (p.Identity.Announce || p.Identity.Personalize) +} + +// AnnounceEnabled reports whether to emit the conversation.item.speaker event. +func (p PipelineVoiceRecognition) AnnounceEnabled() bool { + return p.Identity != nil && p.Identity.Announce +} + +// PersonalizeEnabled reports whether to inform the LLM of the speaker. +func (p PipelineVoiceRecognition) PersonalizeEnabled() bool { + return p.Identity != nil && p.Identity.Personalize +} + // Normalize fills in defaults in place for omitted fields. func (v *PipelineVoiceRecognition) Normalize() { if v.Mode == "" { diff --git a/core/config/voice_gate_test.go b/core/config/voice_gate_test.go index 5c7782f1c..c0d25bf82 100644 --- a/core/config/voice_gate_test.go +++ b/core/config/voice_gate_test.go @@ -70,4 +70,32 @@ var _ = Describe("PipelineVoiceRecognition", func() { Expect((Pipeline{VoiceRecognition: &PipelineVoiceRecognition{}}).VoiceGateEnabled()).To(BeTrue()) }) }) + + Describe("Enforce / Identity helpers", func() { + It("treats a nil Enforce as enforcing (backward compatible)", func() { + v := PipelineVoiceRecognition{Model: "spk"} + Expect(v.EnforceGate()).To(BeTrue()) + }) + It("honors an explicit enforce:false", func() { + off := false + v := PipelineVoiceRecognition{Model: "spk", Enforce: &off} + Expect(v.EnforceGate()).To(BeFalse()) + }) + It("reports identity disabled when no identity block is set", func() { + v := PipelineVoiceRecognition{Model: "spk"} + Expect(v.IdentityEnabled()).To(BeFalse()) + Expect(v.AnnounceEnabled()).To(BeFalse()) + Expect(v.PersonalizeEnabled()).To(BeFalse()) + }) + It("reports identity enabled when announce or personalize is on", func() { + v := PipelineVoiceRecognition{Model: "spk", Identity: &VoiceIdentityConfig{Announce: true}} + Expect(v.IdentityEnabled()).To(BeTrue()) + Expect(v.AnnounceEnabled()).To(BeTrue()) + Expect(v.PersonalizeEnabled()).To(BeFalse()) + + v2 := PipelineVoiceRecognition{Model: "spk", Identity: &VoiceIdentityConfig{Personalize: true}} + Expect(v2.IdentityEnabled()).To(BeTrue()) + Expect(v2.PersonalizeEnabled()).To(BeTrue()) + }) + }) }) diff --git a/core/http/endpoints/openai/realtime.go b/core/http/endpoints/openai/realtime.go index 343ef4c07..8de50e580 100644 --- a/core/http/endpoints/openai/realtime.go +++ b/core/http/endpoints/openai/realtime.go @@ -1311,28 +1311,32 @@ func commitUtterance(ctx context.Context, utt []byte, session *Session, conv *Co // turn wastes only transcription compute, which has no side effects. The // transcript is still emitted to the same peer that sent the audio, which // reveals nothing new to them. - type gateOutcome struct { - allowed bool - matched string - reason string - err error + // Resolve the speaker when the gate must authorize this turn, or when identity + // surfacing/personalization needs a fresh identity. Identity resolution + // ignores the when:first short-circuit (that only skips re-authorization). + type resolveOutcome struct { + res resolution + err error } - var gateCh chan gateOutcome - runGate := false + var resolveCh chan resolveOutcome + runResolve := false if session.voiceGate != nil && session.InputAudioTranscription != nil { - skip := false - if session.voiceGate.cfg.When == config.VoiceGateWhenFirst { + enforce := session.voiceGate.cfg.EnforceGate() + gateNeedsAuth := enforce + if enforce && session.voiceGate.cfg.When == config.VoiceGateWhenFirst { session.gateMu.Lock() - skip = session.voiceVerified + if session.voiceVerified { + gateNeedsAuth = false + } session.gateMu.Unlock() } - if !skip { - runGate = true - gateCh = make(chan gateOutcome, 1) + if gateNeedsAuth || session.voiceGate.cfg.IdentityEnabled() { + runResolve = true + resolveCh = make(chan resolveOutcome, 1) wavPath := f.Name() go func() { - allowed, matched, reason, gerr := session.voiceGate.Authorize(ctx, wavPath) - gateCh <- gateOutcome{allowed: allowed, matched: matched, reason: reason, err: gerr} + r, rerr := session.voiceGate.Resolve(ctx, wavPath) + resolveCh <- resolveOutcome{res: r, err: rerr} }() } } @@ -1348,8 +1352,8 @@ func commitUtterance(ctx context.Context, utt []byte, session *Session, conv *Co if err != nil { // Drain the gate goroutine before returning so its in-flight read of // the temp WAV finishes before the deferred os.Remove fires. - if runGate { - <-gateCh + if runResolve { + <-resolveCh } sendError(t, "transcription_failed", err.Error(), "", "event_TODO") return @@ -1361,41 +1365,58 @@ func commitUtterance(ctx context.Context, utt []byte, session *Session, conv *Co return } - // Join on the gate before any side-effecting step. - if runGate { - out := <-gateCh - allowed := out.allowed - reason := out.reason + // Join on the resolution before any side-effecting step. + var speaker *types.Speaker + if runResolve { + out := <-resolveCh + enforce := session.voiceGate.cfg.EnforceGate() + if out.err != nil { - // Fail closed: a gate that cannot decide must not let audio through. - xlog.Error("voice recognition gate error", "error", out.err) - allowed = false - reason = "verification error" - } - alreadyVerified := false - if session.voiceGate.cfg.When == config.VoiceGateWhenFirst { - session.gateMu.Lock() - alreadyVerified = session.voiceVerified - session.gateMu.Unlock() - } - proceed, markVerified := session.voiceGate.decide(alreadyVerified, allowed) - if !proceed { - xlog.Debug("voice recognition gate rejected utterance", "reason", reason) - if session.voiceGate.cfg.OnReject == config.VoiceGateRejectEvent { - sendError(t, "speaker_not_authorized", "speaker not authorized: "+reason, "", "event_TODO") + if enforce { + // Fail closed: a gate that cannot decide must not let audio through. + xlog.Error("voice recognition gate error", "error", out.err) + if session.voiceGate.cfg.OnReject == config.VoiceGateRejectEvent { + sendError(t, "speaker_not_authorized", "speaker not authorized: verification error", "", "event_TODO") + } + return } - return + // Non-enforcing: degrade to an unknown speaker and continue. + xlog.Warn("voice identity resolve failed; continuing as unknown speaker", "error", out.err) + } else { + s := out.res.speaker + speaker = &s } - xlog.Debug("voice recognition gate authorized utterance", "speaker", out.matched) - if markVerified { - session.gateMu.Lock() - session.voiceVerified = true - session.gateMu.Unlock() + + if enforce { + alreadyVerified := false + if session.voiceGate.cfg.When == config.VoiceGateWhenFirst { + session.gateMu.Lock() + alreadyVerified = session.voiceVerified + session.gateMu.Unlock() + } + allowed, reason := false, "verification error" + if out.err == nil { + allowed, reason = session.voiceGate.authorize(out.res) + } + proceed, markVerified := session.voiceGate.decide(alreadyVerified, allowed) + if !proceed { + xlog.Debug("voice recognition gate rejected utterance", "reason", reason) + if session.voiceGate.cfg.OnReject == config.VoiceGateRejectEvent { + sendError(t, "speaker_not_authorized", "speaker not authorized: "+reason, "", "event_TODO") + } + return + } + if markVerified { + session.gateMu.Lock() + session.voiceVerified = true + session.gateMu.Unlock() + } + xlog.Debug("voice recognition gate authorized utterance", "speaker", out.res.speaker.Name) } } if !session.TranscriptionOnly { - generateResponse(ctx, session, utt, transcript, conv, t) + generateResponse(ctx, session, utt, transcript, speaker, conv, t) } } @@ -1419,15 +1440,28 @@ func runVAD(ctx context.Context, session *Session, adata []int16) ([]schema.VADS return resp.Segments, nil } +// speakerNote renders the system-prompt note for the current speaker. Returns +// an empty string when there is no name and unknown notes are disabled. +func speakerNote(s *types.Speaker, noteUnknown bool) string { + if s != nil && s.Matched && s.Name != "" { + return "The current speaker is " + s.Name + "." + } + if noteUnknown { + return "The current speaker is unknown." + } + return "" +} + // Function to generate a response based on the conversation -func generateResponse(ctx context.Context, session *Session, utt []byte, transcript string, conv *Conversation, t Transport) { +func generateResponse(ctx context.Context, session *Session, utt []byte, transcript string, speaker *types.Speaker, conv *Conversation, t Transport) { xlog.Debug("Generating realtime response...") // Create user message item item := types.MessageItemUnion{ User: &types.MessageItemUser{ - ID: generateItemID(), - Status: types.ItemStatusCompleted, + ID: generateItemID(), + Status: types.ItemStatusCompleted, + Speaker: speaker, Content: []types.MessageContentInput{ { Type: types.MessageContentTypeInputAudio, @@ -1445,6 +1479,17 @@ func generateResponse(ctx context.Context, session *Session, utt []byte, transcr Item: item, }) + // Surface the recognized speaker to the client. Skip the event for an + // unidentified speaker unless announce_unknown is set. + if speaker != nil && session.voiceGate != nil && session.voiceGate.cfg.AnnounceEnabled() { + if speaker.Matched || session.voiceGate.cfg.Identity.AnnounceUnknown { + sendEvent(t, types.ConversationItemSpeakerEvent{ + ItemID: item.User.ID, + Speaker: *speaker, + }) + } + } + triggerResponse(ctx, session, conv, t, nil) } @@ -1508,6 +1553,8 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa }) imgIndex := 0 + var lastUserSpeaker *types.Speaker + personalize := session.voiceGate != nil && session.voiceGate.cfg.PersonalizeEnabled() conv.Lock.Lock() items := trimRealtimeItems(conv.Items, session.MaxHistoryItems) for _, item := range items { @@ -1515,6 +1562,11 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa msg := schema.Message{ Role: string(types.MessageRoleUser), } + lastUserSpeaker = item.User.Speaker + if personalize && session.voiceGate.cfg.Identity.InjectName && + item.User.Speaker != nil && item.User.Speaker.Matched && item.User.Speaker.Name != "" { + msg.Name = item.User.Speaker.Name + } textContent := "" nrOfImgsInMessage := 0 for _, content := range item.User.Content { @@ -1601,6 +1653,13 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa } conv.Lock.Unlock() + if personalize && session.voiceGate.cfg.Identity.InjectSystemNote { + if note := speakerNote(lastUserSpeaker, session.voiceGate.cfg.Identity.NoteUnknown); note != "" { + conversationHistory[0].StringContent += "\n\n" + note + conversationHistory[0].Content = conversationHistory[0].StringContent + } + } + var images []string for _, m := range conversationHistory { images = append(images, m.StringImages...) diff --git a/core/http/endpoints/openai/realtime_doubles_test.go b/core/http/endpoints/openai/realtime_doubles_test.go index accd6af51..727ce7dcc 100644 --- a/core/http/endpoints/openai/realtime_doubles_test.go +++ b/core/http/endpoints/openai/realtime_doubles_test.go @@ -83,6 +83,8 @@ type fakeModel struct { predictChunkDeltas [][]*proto.ChatDelta predictResp backend.LLMResponse predictErr error + + lastMessages schema.Messages } func (m *fakeModel) VAD(context.Context, *schema.VADRequest) (*schema.VADResponse, error) { @@ -93,7 +95,8 @@ func (m *fakeModel) Transcribe(context.Context, string, string, bool, bool, stri return m.transcribeFinal, nil } -func (m *fakeModel) Predict(_ context.Context, _ schema.Messages, _, _, _ []string, cb func(string, backend.TokenUsage) bool, _ []types.ToolUnion, _ *types.ToolChoiceUnion, _, _ *int, _ map[string]float64) (func() (backend.LLMResponse, error), error) { +func (m *fakeModel) Predict(_ context.Context, msgs schema.Messages, _, _, _ []string, cb func(string, backend.TokenUsage) bool, _ []types.ToolUnion, _ *types.ToolChoiceUnion, _, _ *int, _ map[string]float64) (func() (backend.LLMResponse, error), error) { + m.lastMessages = msgs if m.predictErr != nil { return nil, m.predictErr } diff --git a/core/http/endpoints/openai/realtime_speaker_event_test.go b/core/http/endpoints/openai/realtime_speaker_event_test.go new file mode 100644 index 000000000..fbbe5ded9 --- /dev/null +++ b/core/http/endpoints/openai/realtime_speaker_event_test.go @@ -0,0 +1,54 @@ +package openai + +import ( + "encoding/json" + + "github.com/mudler/LocalAI/core/http/endpoints/openai/types" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +var _ = Describe("ConversationItemSpeakerEvent", func() { + It("marshals with the conversation.item.speaker type and nested speaker", func() { + ev := types.ConversationItemSpeakerEvent{ + ItemID: "item_123", + Speaker: types.Speaker{Name: "Jeremy", ID: "spk_1", Labels: map[string]string{"family": "yes"}, Confidence: 92, Distance: 0.1, Matched: true}, + } + b, err := json.Marshal(ev) + Expect(err).ToNot(HaveOccurred()) + + var got map[string]any + Expect(json.Unmarshal(b, &got)).To(Succeed()) + Expect(got["type"]).To(Equal("conversation.item.speaker")) + Expect(got["item_id"]).To(Equal("item_123")) + + spk := got["speaker"].(map[string]any) + Expect(spk["name"]).To(Equal("Jeremy")) + Expect(spk["id"]).To(Equal("spk_1")) + Expect(spk["matched"]).To(Equal(true)) + Expect(spk["labels"]).To(HaveKeyWithValue("family", "yes")) + }) + + It("omits labels when the speaker has none", func() { + ev := types.ConversationItemSpeakerEvent{ItemID: "i", Speaker: types.Speaker{Name: "Jeremy", Matched: true}} + b, err := json.Marshal(ev) + Expect(err).ToNot(HaveOccurred()) + var got map[string]any + Expect(json.Unmarshal(b, &got)).To(Succeed()) + spk := got["speaker"].(map[string]any) + _, hasLabels := spk["labels"] + Expect(hasLabels).To(BeFalse()) + }) + + It("omits the name for an unknown speaker but keeps matched=false", func() { + ev := types.ConversationItemSpeakerEvent{ItemID: "i", Speaker: types.Speaker{Matched: false}} + b, err := json.Marshal(ev) + Expect(err).ToNot(HaveOccurred()) + var got map[string]any + Expect(json.Unmarshal(b, &got)).To(Succeed()) + spk := got["speaker"].(map[string]any) + _, hasName := spk["name"] + Expect(hasName).To(BeFalse()) + Expect(spk["matched"]).To(Equal(false)) + }) +}) diff --git a/core/http/endpoints/openai/realtime_voicegate.go b/core/http/endpoints/openai/realtime_voicegate.go index 54332536f..9bd6f10f2 100644 --- a/core/http/endpoints/openai/realtime_voicegate.go +++ b/core/http/endpoints/openai/realtime_voicegate.go @@ -7,6 +7,7 @@ import ( "github.com/mudler/LocalAI/core/backend" "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/http/endpoints/openai/types" "github.com/mudler/LocalAI/core/services/voicerecognition" "github.com/mudler/LocalAI/pkg/model" ) @@ -29,6 +30,32 @@ type voiceGate struct { verifyFn func(ctx context.Context, uttWav, refWav string) (bool, error) } +// resolution is the outcome of resolving a committed utterance's speaker. It +// carries the surfacing-facing Speaker plus the metadata the policy layer needs +// (labels for the allow-list) and a human reason when no usable identity exists. +type resolution struct { + speaker types.Speaker // name/id/confidence/distance/matched + labels map[string]string // identify-mode metadata labels, for the allow-list + found bool // a candidate identity existed at all + reason string // why-unknown / deny reason at the resolve level +} + +// confidence maps a cosine distance to a 0..100 score relative to the match +// threshold, mirroring the /v1/voice/identify endpoint. +func confidence(distance, threshold float32) float32 { + if threshold <= 0 { + return 0 + } + c := (1 - distance/threshold) * 100 + if c < 0 { + return 0 + } + if c > 100 { + return 100 + } + return c +} + // newVoiceGate builds a gate from a pipeline's voice_recognition config. It // validates fail-fast (before loading the model), loads the recognition model // config, wires the real backend seams, and pre-embeds references for verify @@ -89,91 +116,143 @@ func newVoiceGate( return g, nil } -// Authorize embeds the utterance and decides allow/deny. -// -// allowed: speaker is authorized. -// matched: matched person's name (informational), empty if none. -// reason: human-readable deny reason. -// err: backend failure (caller should fail closed). -func (g *voiceGate) Authorize(ctx context.Context, wavPath string) (allowed bool, matched string, reason string, err error) { +// Resolve embeds the utterance once and resolves the speaker's identity. It does +// NOT apply the authorization policy (see authorize). On a backend error it +// returns the error and a resolution whose reason explains the failure. +func (g *voiceGate) Resolve(ctx context.Context, wavPath string) (resolution, error) { if g.cfg.Mode == config.VoiceGateModeVerify { - return g.authorizeVerify(ctx, wavPath) + return g.resolveVerify(ctx, wavPath) } - return g.authorizeIdentify(ctx, wavPath) + return g.resolveIdentify(ctx, wavPath) } -func (g *voiceGate) authorizeIdentify(ctx context.Context, wavPath string) (bool, string, string, error) { +func (g *voiceGate) resolveIdentify(ctx context.Context, wavPath string) (resolution, error) { emb, err := g.embedFn(ctx, wavPath) if err != nil { - return false, "", "embed failed", err + return resolution{reason: "embed failed"}, err } if len(emb) == 0 { - return false, "", "no speech detected", nil + return resolution{reason: "no speech detected"}, nil } matches, err := g.registry.Identify(ctx, emb, 1) if err != nil { - return false, "", "identify failed", err + return resolution{reason: "identify failed"}, err } if len(matches) == 0 { - return false, "", "unknown speaker", nil + return resolution{reason: "unknown speaker"}, nil } m := matches[0] - if m.Distance > g.cfg.Threshold { - return false, m.Metadata.Name, "distance above threshold", nil + matched := m.Distance <= g.cfg.Threshold + r := resolution{ + speaker: types.Speaker{ + Name: m.Metadata.Name, + ID: m.Metadata.ID, + Labels: m.Metadata.Labels, + Distance: m.Distance, + Confidence: confidence(m.Distance, g.cfg.Threshold), + Matched: matched, + }, + labels: m.Metadata.Labels, + found: true, } - if !g.allowMatch(m.Metadata) { - return false, m.Metadata.Name, "speaker not in allow list", nil + if !matched { + r.reason = "distance above threshold" } - return true, m.Metadata.Name, "", nil + return r, nil +} + +func (g *voiceGate) resolveVerify(ctx context.Context, wavPath string) (resolution, error) { + if g.cfg.AntiSpoofing { + for _, ref := range g.refAudios { + ok, err := g.verifyFn(ctx, wavPath, ref.Audio) + if err != nil { + return resolution{reason: "verify failed"}, err + } + if ok { + return resolution{ + speaker: types.Speaker{Name: ref.Name, Confidence: 100, Matched: true}, + found: true, + }, nil + } + } + return resolution{reason: "no reference matched"}, nil + } + + emb, err := g.embedFn(ctx, wavPath) + if err != nil { + return resolution{reason: "embed failed"}, err + } + if len(emb) == 0 { + return resolution{reason: "no speech detected"}, nil + } + for _, ref := range g.refEmbeds { + d := cosineDistance(emb, ref.emb) + if d <= g.cfg.Threshold { + return resolution{ + speaker: types.Speaker{Name: ref.name, Distance: d, Confidence: confidence(d, g.cfg.Threshold), Matched: true}, + found: true, + }, nil + } + } + return resolution{reason: "no reference matched"}, nil +} + +// authorize applies the gate's policy to an already-resolved identity. +func (g *voiceGate) authorize(r resolution) (allowed bool, reason string) { + if g.cfg.Mode == config.VoiceGateModeVerify { + if r.speaker.Matched { + return true, "" + } + if r.reason == "" { + return false, "no reference matched" + } + return false, r.reason + } + if !r.found { + return false, r.reason + } + if !r.speaker.Matched { + return false, "distance above threshold" + } + if !g.allowMatch(r.speaker.Name, r.labels) { + return false, "speaker not in allow list" + } + return true, "" } // allowMatch reports whether a matched identity is authorized. An empty allow // (no names and no labels) authorizes any registered speaker. -func (g *voiceGate) allowMatch(meta voicerecognition.Metadata) bool { +func (g *voiceGate) allowMatch(name string, labels map[string]string) bool { a := g.cfg.Allow if len(a.Names) == 0 && len(a.Labels) == 0 { return true } for _, n := range a.Names { - if n == meta.Name { + if n == name { return true } } for _, l := range a.Labels { - if _, ok := meta.Labels[l]; ok { + if _, ok := labels[l]; ok { return true } } return false } -func (g *voiceGate) authorizeVerify(ctx context.Context, wavPath string) (bool, string, string, error) { - if g.cfg.AntiSpoofing { - for _, r := range g.refAudios { - ok, err := g.verifyFn(ctx, wavPath, r.Audio) - if err != nil { - return false, "", "verify failed", err - } - if ok { - return true, r.Name, "", nil - } - } - return false, "", "no reference matched", nil +// Authorize is the legacy convenience wrapper: resolve then apply policy. +// +// allowed: speaker is authorized. +// matched: matched person's name (informational), empty if none. +// reason: human-readable deny reason. +// err: backend failure (caller should fail closed). +func (g *voiceGate) Authorize(ctx context.Context, wavPath string) (allowed bool, matched string, reason string, err error) { + r, rerr := g.Resolve(ctx, wavPath) + if rerr != nil { + return false, "", r.reason, rerr } - - emb, err := g.embedFn(ctx, wavPath) - if err != nil { - return false, "", "embed failed", err - } - if len(emb) == 0 { - return false, "", "no speech detected", nil - } - for _, r := range g.refEmbeds { - if cosineDistance(emb, r.emb) <= g.cfg.Threshold { - return true, r.name, "", nil - } - } - return false, "", "no reference matched", nil + allowed, reason = g.authorize(r) + return allowed, r.speaker.Name, reason, nil } // decide interprets an Authorize result against the gate's when-policy and the diff --git a/core/http/endpoints/openai/realtime_voicegate_integration_test.go b/core/http/endpoints/openai/realtime_voicegate_integration_test.go index f8aae72c5..b0f7f0b49 100644 --- a/core/http/endpoints/openai/realtime_voicegate_integration_test.go +++ b/core/http/endpoints/openai/realtime_voicegate_integration_test.go @@ -152,3 +152,252 @@ var _ = Describe("realtime voice gate integration (commitUtterance)", func() { Expect(tr2.countEvents(types.ServerEventTypeResponseDone)).To(BeNumerically(">=", 1)) }) }) + +var _ = Describe("realtime speaker surfacing (commitUtterance)", func() { + utt := make([]byte, 32) + + It("emits conversation.item.speaker for a confident match when announce is on", func() { + session, _ := itSession(itGate("alice", "alice", []float32{1, 0, 0}, nil, + config.VoiceGateWhenEvery, config.VoiceGateRejectEvent)) + session.voiceGate.cfg.Identity = &config.VoiceIdentityConfig{Announce: true} + tr := &fakeTransport{} + + commitUtterance(context.Background(), utt, session, &Conversation{}, tr) + + Expect(tr.countEvents(types.ServerEventTypeConversationItemSpeaker)).To(Equal(1)) + }) + + It("does not emit the speaker event for an unknown speaker unless announce_unknown is set", func() { + // match distance above threshold => not matched + gate := &voiceGate{ + cfg: config.PipelineVoiceRecognition{ + Mode: config.VoiceGateModeIdentify, Threshold: 0.25, + When: config.VoiceGateWhenEvery, OnReject: config.VoiceGateRejectEvent, + Enforce: boolPtr(false), + Identity: &config.VoiceIdentityConfig{Announce: true}, + }, + registry: &fakeRegistry{matches: []voicerecognition.Match{ + {Distance: 0.9, Metadata: voicerecognition.Metadata{Name: "alice"}}, + }}, + embedFn: func(context.Context, string) ([]float32, error) { return []float32{1, 0, 0}, nil }, + } + session, _ := itSession(gate) + tr := &fakeTransport{} + + commitUtterance(context.Background(), utt, session, &Conversation{}, tr) + Expect(tr.countEvents(types.ServerEventTypeConversationItemSpeaker)).To(Equal(0)) + + gate.cfg.Identity.AnnounceUnknown = true + tr2 := &fakeTransport{} + commitUtterance(context.Background(), utt, session, &Conversation{}, tr2) + Expect(tr2.countEvents(types.ServerEventTypeConversationItemSpeaker)).To(Equal(1)) + }) + + It("never drops a turn when enforce is false even for a disallowed speaker", func() { + session, _ := itSession(itGate("bob", "alice", []float32{1, 0, 0}, nil, + config.VoiceGateWhenEvery, config.VoiceGateRejectEvent)) + session.voiceGate.cfg.Enforce = boolPtr(false) + tr := &fakeTransport{} + + commitUtterance(context.Background(), utt, session, &Conversation{}, tr) + + Expect(hasSpeakerNotAuthorized(tr)).To(BeFalse()) + Expect(tr.countEvents(types.ServerEventTypeResponseDone)).To(BeNumerically(">=", 1)) + }) +}) + +var _ = Describe("realtime speaker personalization (triggerResponseAtTurn)", func() { + utt := make([]byte, 32) + + findRole := func(msgs schema.Messages, role string) *schema.Message { + for i := range msgs { + if msgs[i].Role == role { + return &msgs[i] + } + } + return nil + } + + It("sets the user message name and a current-speaker system note", func() { + session, m := itSession(itGate("alice", "alice", []float32{1, 0, 0}, nil, + config.VoiceGateWhenEvery, config.VoiceGateRejectEvent)) + session.voiceGate.cfg.Identity = &config.VoiceIdentityConfig{ + Personalize: true, InjectName: true, InjectSystemNote: true, + } + session.Instructions = "You are helpful." + tr := &fakeTransport{} + + commitUtterance(context.Background(), utt, session, &Conversation{}, tr) + + user := findRole(m.lastMessages, "user") + Expect(user).ToNot(BeNil()) + Expect(user.Name).To(Equal("alice")) + sys := findRole(m.lastMessages, "system") + Expect(sys).ToNot(BeNil()) + Expect(sys.StringContent).To(ContainSubstring("The current speaker is alice.")) + }) + + It("omits the unknown note unless note_unknown is set", func() { + base := func() (*Session, *fakeModel) { + gate := &voiceGate{ + cfg: config.PipelineVoiceRecognition{ + Mode: config.VoiceGateModeIdentify, Threshold: 0.25, + When: config.VoiceGateWhenEvery, OnReject: config.VoiceGateRejectEvent, + Enforce: boolPtr(false), + Identity: &config.VoiceIdentityConfig{Personalize: true, InjectSystemNote: true}, + }, + registry: &fakeRegistry{matches: []voicerecognition.Match{ + {Distance: 0.9, Metadata: voicerecognition.Metadata{Name: "alice"}}, + }}, + embedFn: func(context.Context, string) ([]float32, error) { return []float32{1, 0, 0}, nil }, + } + s, m := itSession(gate) + s.Instructions = "You are helpful." + return s, m + } + + s1, m1 := base() + commitUtterance(context.Background(), utt, s1, &Conversation{}, &fakeTransport{}) + Expect(findRole(m1.lastMessages, "system").StringContent).ToNot(ContainSubstring("unknown")) + + s2, m2 := base() + s2.voiceGate.cfg.Identity.NoteUnknown = true + commitUtterance(context.Background(), utt, s2, &Conversation{}, &fakeTransport{}) + Expect(findRole(m2.lastMessages, "system").StringContent).To(ContainSubstring("The current speaker is unknown.")) + }) +}) + +var _ = Describe("realtime when:first with identity (commitUtterance)", func() { + utt := make([]byte, 32) + + // statefulIdentityGate builds a when:first identify gate with an Identity + // block (so identity is resolved every turn) whose embedFn is driven by a + // per-turn counter: the failOnSecond flag makes the second and later embeds + // return an error, exercising the stricter fail-closed path on a re-resolve. + statefulIdentityGate := func(failOnSecond bool) *voiceGate { + calls := 0 + return &voiceGate{ + cfg: config.PipelineVoiceRecognition{ + Mode: config.VoiceGateModeIdentify, + Threshold: 0.25, + When: config.VoiceGateWhenFirst, + OnReject: config.VoiceGateRejectEvent, + Allow: config.VoiceRecognitionAllow{Names: []string{"alice"}}, + Identity: &config.VoiceIdentityConfig{Announce: true, Personalize: true, InjectName: true}, + }, + registry: &fakeRegistry{matches: []voicerecognition.Match{ + {Distance: 0.1, Metadata: voicerecognition.Metadata{Name: "alice"}}, + }}, + embedFn: func(context.Context, string) ([]float32, error) { + calls++ + if failOnSecond && calls > 1 { + return nil, errors.New("embed backend down") + } + return []float32{1, 0, 0}, nil + }, + } + } + + It("re-resolves identity every turn and fails closed when a later embed errors", func() { + gate := statefulIdentityGate(true) + session, _ := itSession(gate) + conv := &Conversation{} // shared so voiceVerified persists across turns + + // Turn 1: authorized; identity resolved, speaker surfaced, response runs. + tr1 := &fakeTransport{} + commitUtterance(context.Background(), utt, session, conv, tr1) + Expect(hasSpeakerNotAuthorized(tr1)).To(BeFalse()) + Expect(tr1.countEvents(types.ServerEventTypeConversationItemSpeaker)).To(Equal(1)) + Expect(tr1.countEvents(types.ServerEventTypeResponseDone)).To(BeNumerically(">=", 1)) + + // Turn 2: when:first would skip re-authorization, but the Identity block + // forces a fresh resolve. That resolve now errors, and because the gate + // enforces, the turn is dropped fail-closed rather than riding on the + // cached first verification. + tr2 := &fakeTransport{} + commitUtterance(context.Background(), utt, session, conv, tr2) + Expect(hasSpeakerNotAuthorized(tr2)).To(BeTrue()) + Expect(tr2.countEvents(types.ServerEventTypeResponseDone)).To(Equal(0)) + }) + + It("re-resolves identity every turn so a later turn still surfaces and names the speaker", func() { + gate := statefulIdentityGate(false) + session, m := itSession(gate) + conv := &Conversation{} + + tr1 := &fakeTransport{} + commitUtterance(context.Background(), utt, session, conv, tr1) + Expect(hasSpeakerNotAuthorized(tr1)).To(BeFalse()) + Expect(tr1.countEvents(types.ServerEventTypeResponseDone)).To(BeNumerically(">=", 1)) + + // Turn 2: authorization is skipped (when:first, already verified) but the + // speaker event still fires and the per-message name is set, proving the + // per-turn re-resolution (not the cached first verification) drove it. + tr2 := &fakeTransport{} + commitUtterance(context.Background(), utt, session, conv, tr2) + Expect(tr2.countEvents(types.ServerEventTypeConversationItemSpeaker)).To(Equal(1)) + var lastUser *schema.Message + for i := range m.lastMessages { + if m.lastMessages[i].Role == "user" { + lastUser = &m.lastMessages[i] + } + } + Expect(lastUser).ToNot(BeNil()) + Expect(lastUser.Name).To(Equal("alice")) + }) +}) + +var _ = Describe("realtime multi-speaker history attribution (triggerResponse)", func() { + userAudioItem := func(name, transcript string) *types.MessageItemUnion { + return &types.MessageItemUnion{ + User: &types.MessageItemUser{ + ID: generateItemID(), + Status: types.ItemStatusCompleted, + Speaker: &types.Speaker{Name: name, Matched: true}, + Content: []types.MessageContentInput{ + {Type: types.MessageContentTypeInputAudio, Transcript: transcript}, + }, + }, + } + } + + It("attributes each user turn to its own speaker and notes the latest one", func() { + session, m := itSession(itGate("alice", "alice", []float32{1, 0, 0}, nil, + config.VoiceGateWhenEvery, config.VoiceGateRejectEvent)) + session.Instructions = "You are helpful." + session.MaxHistoryItems = 10 // keep both items; 0 would mean "no trim" too + session.voiceGate.cfg.Identity = &config.VoiceIdentityConfig{ + Personalize: true, InjectName: true, InjectSystemNote: true, + } + + conv := &Conversation{Items: []*types.MessageItemUnion{ + userAudioItem("alice", "hello there"), + userAudioItem("bob", "what is the weather"), + }} + tr := &fakeTransport{} + + triggerResponse(context.Background(), session, conv, tr, nil) + + var users []*schema.Message + var sys *schema.Message + for i := range m.lastMessages { + switch m.lastMessages[i].Role { + case "user": + users = append(users, &m.lastMessages[i]) + case "system": + if sys == nil { + sys = &m.lastMessages[i] + } + } + } + Expect(users).To(HaveLen(2)) + Expect(users[0].Name).To(Equal("alice")) + Expect(users[1].Name).To(Equal("bob")) + + Expect(sys).ToNot(BeNil()) + Expect(sys.StringContent).To(ContainSubstring("The current speaker is bob.")) + Expect(sys.StringContent).ToNot(ContainSubstring("alice")) + }) +}) + +func boolPtr(b bool) *bool { return &b } diff --git a/core/http/endpoints/openai/realtime_voicegate_test.go b/core/http/endpoints/openai/realtime_voicegate_test.go index bdbbc2f4a..3d9b458e1 100644 --- a/core/http/endpoints/openai/realtime_voicegate_test.go +++ b/core/http/endpoints/openai/realtime_voicegate_test.go @@ -10,6 +10,82 @@ import ( . "github.com/onsi/gomega" ) +var _ = Describe("voiceGate.Resolve + authorize", func() { + mkGate := func(allow []string) *voiceGate { + return &voiceGate{ + cfg: config.PipelineVoiceRecognition{ + Mode: config.VoiceGateModeIdentify, + Threshold: 0.25, + Allow: config.VoiceRecognitionAllow{Names: allow}, + }, + registry: &fakeRegistry{matches: []voicerecognition.Match{ + {Distance: 0.1, Metadata: voicerecognition.Metadata{ID: "spk_1", Name: "alice", Labels: map[string]string{"family": "yes"}}}, + }}, + embedFn: func(context.Context, string) ([]float32, error) { return []float32{1, 0, 0}, nil }, + } + } + + It("resolves a confident identity with name, id and a 0..100 confidence", func() { + r, err := mkGate(nil).Resolve(context.Background(), "x.wav") + Expect(err).ToNot(HaveOccurred()) + Expect(r.found).To(BeTrue()) + Expect(r.speaker.Name).To(Equal("alice")) + Expect(r.speaker.ID).To(Equal("spk_1")) + Expect(r.speaker.Matched).To(BeTrue()) + Expect(r.speaker.Confidence).To(BeNumerically(">", 0)) + Expect(r.speaker.Confidence).To(BeNumerically("<=", 100)) + Expect(r.speaker.Labels).To(HaveKeyWithValue("family", "yes")) + }) + + It("marks a candidate above the threshold as not matched", func() { + g := mkGate(nil) + g.registry = &fakeRegistry{matches: []voicerecognition.Match{ + {Distance: 0.9, Metadata: voicerecognition.Metadata{Name: "alice"}}, + }} + r, err := g.Resolve(context.Background(), "x.wav") + Expect(err).ToNot(HaveOccurred()) + Expect(r.found).To(BeTrue()) + Expect(r.speaker.Matched).To(BeFalse()) + Expect(r.speaker.Name).To(Equal("alice")) // name still surfaced + }) + + It("authorize allows a confident match in the allow list", func() { + g := mkGate([]string{"alice"}) + r, _ := g.Resolve(context.Background(), "x.wav") + allowed, reason := g.authorize(r) + Expect(allowed).To(BeTrue()) + Expect(reason).To(BeEmpty()) + }) + + It("authorize denies a confident match outside the allow list", func() { + g := mkGate([]string{"bob"}) + r, _ := g.Resolve(context.Background(), "x.wav") + allowed, reason := g.authorize(r) + Expect(allowed).To(BeFalse()) + Expect(reason).To(Equal("speaker not in allow list")) + }) + + It("authorize allows by label when names do not match", func() { + g := mkGate(nil) + g.cfg.Allow = config.VoiceRecognitionAllow{Labels: []string{"family"}} + r, _ := g.Resolve(context.Background(), "x.wav") + allowed, _ := g.authorize(r) + Expect(allowed).To(BeTrue()) + }) +}) + +var _ = Describe("confidence", func() { + It("is 100 at zero distance", func() { + Expect(confidence(0, 0.25)).To(BeNumerically("~", 100, 1e-4)) + }) + It("clamps to 0 above the threshold", func() { + Expect(confidence(0.5, 0.25)).To(BeNumerically("~", 0, 1e-4)) + }) + It("is 0 for a non-positive threshold", func() { + Expect(confidence(0.1, 0)).To(BeNumerically("~", 0, 1e-4)) + }) +}) + var _ = Describe("cosineDistance", func() { It("is 0 for identical vectors", func() { Expect(cosineDistance([]float32{1, 0, 0}, []float32{1, 0, 0})).To(BeNumerically("~", 0, 1e-6)) diff --git a/core/http/endpoints/openai/types/message_item.go b/core/http/endpoints/openai/types/message_item.go index 52997fe8c..88d680648 100644 --- a/core/http/endpoints/openai/types/message_item.go +++ b/core/http/endpoints/openai/types/message_item.go @@ -102,6 +102,10 @@ type MessageItemUser struct { // The status of the item. Has no effect on the conversation. Status ItemStatus `json:"status,omitempty"` + + // Speaker is the recognized speaker for this audio turn (LocalAI extension). + // Used to attribute past turns when rebuilding the LLM message history. + Speaker *Speaker `json:"speaker,omitempty"` } func (m MessageItemUser) MessageItemType() MessageItemType { diff --git a/core/http/endpoints/openai/types/server_events.go b/core/http/endpoints/openai/types/server_events.go index bae680fd5..8183a8b78 100644 --- a/core/http/endpoints/openai/types/server_events.go +++ b/core/http/endpoints/openai/types/server_events.go @@ -20,6 +20,9 @@ const ( ServerEventTypeConversationItemInputAudioTranscriptionFailed ServerEventType = "conversation.item.input_audio_transcription.failed" ServerEventTypeConversationItemTruncated ServerEventType = "conversation.item.truncated" ServerEventTypeConversationItemDeleted ServerEventType = "conversation.item.deleted" + // ServerEventTypeConversationItemSpeaker is a LocalAI extension: it reports + // the recognized speaker for a user audio item. OpenAI clients ignore it. + ServerEventTypeConversationItemSpeaker ServerEventType = "conversation.item.speaker" ServerEventTypeInputAudioBufferCommitted ServerEventType = "input_audio_buffer.committed" ServerEventTypeInputAudioBufferCleared ServerEventType = "input_audio_buffer.cleared" ServerEventTypeInputAudioBufferSpeechStarted ServerEventType = "input_audio_buffer.speech_started" @@ -335,6 +338,33 @@ func (m ConversationItemAddedEvent) MarshalJSON() ([]byte, error) { return json.Marshal(shadow) } +// ConversationItemSpeakerEvent reports the recognized speaker for a user audio +// item. LocalAI extension; not part of the OpenAI Realtime API. +type ConversationItemSpeakerEvent struct { + ServerEventBase + // ItemID is the conversation item this speaker belongs to. + ItemID string `json:"item_id"` + // Speaker is the recognized identity. + Speaker Speaker `json:"speaker"` +} + +func (m ConversationItemSpeakerEvent) ServerEventType() ServerEventType { + return ServerEventTypeConversationItemSpeaker +} + +func (m ConversationItemSpeakerEvent) MarshalJSON() ([]byte, error) { + type typeAlias ConversationItemSpeakerEvent + type typeWrapper struct { + typeAlias + Type ServerEventType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.ServerEventType(), + } + return json.Marshal(shadow) +} + // Returned when a conversation item is finalized. // // The event will include the full content of the Item except for audio data, which can be retrieved separately with a `conversation.item.retrieve` event if needed. diff --git a/core/http/endpoints/openai/types/speaker.go b/core/http/endpoints/openai/types/speaker.go new file mode 100644 index 000000000..a5b02c927 --- /dev/null +++ b/core/http/endpoints/openai/types/speaker.go @@ -0,0 +1,14 @@ +package types + +// Speaker is the recognized speaker for a committed audio turn. It is a LocalAI +// extension to the OpenAI Realtime schema, carried on the user conversation item +// and surfaced via the conversation.item.speaker event. Confidence is a 0..100 +// score relative to the match threshold (same formula as /v1/voice/identify). +type Speaker struct { + Name string `json:"name,omitempty"` + ID string `json:"id,omitempty"` + Labels map[string]string `json:"labels,omitempty"` + Confidence float32 `json:"confidence"` + Distance float32 `json:"distance"` + Matched bool `json:"matched"` +} diff --git a/docs/content/features/openai-realtime.md b/docs/content/features/openai-realtime.md index 51f8960bf..48cfc9332 100644 --- a/docs/content/features/openai-realtime.md +++ b/docs/content/features/openai-realtime.md @@ -141,6 +141,8 @@ The API follows the OpenAI Realtime API protocol for handling sessions, audio bu A pipeline realtime model can require speaker verification before it responds. Add a `voice_recognition` block under `pipeline`. When present, each committed utterance is verified against authorized speakers; unauthorized utterances are dropped before the LLM runs (no LLM call, no tool execution, no TTS). The session stays open. +The same block also drives two optional, independent behaviors: an authorization gate (`enforce`) and speaker surfacing/personalization (`identity`). Set `enforce: false` to keep recognizing the speaker without ever rejecting a turn. + ```yaml name: my-realtime pipeline: @@ -152,6 +154,7 @@ pipeline: model: speaker-recognition # the speaker-recognition backend model mode: identify # "identify" (registry) or "verify" (references) threshold: 0.25 # cosine distance; <= passes + enforce: true # authorization gate (default true) when: every # "every" (default) or "first" on_reject: drop_event # "drop_event" (default) or "drop_silent" anti_spoofing: false # optional liveness check (verify mode) @@ -170,19 +173,73 @@ pipeline: audio: /models/voices/bob.wav ``` +### Identifying speakers without gating + +To recognize who is speaking and surface it to the client and the LLM without ever rejecting a turn, set `enforce: false` and add an `identity` block. The `identity` block works with or without the gate; when it is set, the speaker is resolved on every turn even if `when: first`. + +```yaml +name: my-realtime +pipeline: + vad: silero-vad + transcription: whisper + llm: qwen + tts: kokoro + voice_recognition: + model: speaker-recognition + mode: identify + threshold: 0.25 + # Authorization gate. Defaults to enforcing (rejects unauthorized speakers). + # Set enforce:false to identify the speaker WITHOUT rejecting anyone. + enforce: false + when: every + # Surface the recognized speaker to the client and the LLM. Works with or + # without enforce; when set, identity is resolved on every turn even if + # when:first. + identity: + announce: true # emit the conversation.item.speaker event + announce_unknown: false # also emit it when there is no confident match + personalize: true # tell the LLM who is speaking + inject_name: true # set the per-message OpenAI name field + inject_system_note: true # append a "current speaker" line to the system message + note_unknown: false # append a "speaker is unknown" note when unidentified +``` + | Field | Meaning | |-------|---------| | `model` | Speaker-recognition backend model name. | | `mode` | `identify` matches against speakers registered via `/v1/voice/register`; `verify` matches against the `references` audios. | | `threshold` | Maximum cosine distance that still counts as a match (default ~0.25). | -| `when` | `every` verifies each utterance; `first` verifies once then trusts the session. | +| `enforce` | Authorization gate. `true` (or omitted) rejects unauthorized speakers (the gating behavior above). `false` resolves and surfaces the speaker without ever dropping a turn. | +| `when` | `every` verifies each utterance; `first` verifies once then trusts the session. When an `identity` block is set, the speaker is still resolved on every turn even with `first`. | | `on_reject` | `drop_event` drops and emits a `speaker_not_authorized` error event; `drop_silent` drops quietly. | | `anti_spoofing` | Verify mode only: runs the backend liveness check (slower). | | `allow.names` / `allow.labels` | identify mode: which registry identities are authorized. Empty = any registered speaker. | | `references` | verify mode: authorized reference speakers; the utterance passes if it matches any. | +| `identity.announce` | Emit the `conversation.item.speaker` event to the client (see below). | +| `identity.announce_unknown` | Also emit that event when there is no confident match. By default the event is emitted only on a match. | +| `identity.personalize` | Inform the LLM who is speaking. | +| `identity.inject_name` | Set the per-message OpenAI `name` field on each user turn. | +| `identity.inject_system_note` | Append a `The current speaker is .` line to the system message. | +| `identity.note_unknown` | When unidentified, append `The current speaker is unknown.` (lets the model ask who it is talking to). | `identify` mode requires the voice registry (speakers registered through `/v1/voice/register`). `verify` mode needs no registry: reference audios are embedded once at model load. +### The `conversation.item.speaker` event + +When `identity.announce` is enabled, the server emits a `conversation.item.speaker` event after the user conversation item, naming the recognized speaker: + +```json +{ + "type": "conversation.item.speaker", + "item_id": "item_abc", + "speaker": { "name": "Jeremy", "id": "spk_1", "labels": { "role": "owner" }, "confidence": 92.0, "distance": 0.1, "matched": true } +} +``` + +`confidence` is a 0-100 score, `distance` is the cosine distance, and `matched` is `true` when a confident match was found. `labels` carries any labels attached to the registered speaker (identify mode); it is omitted when the speaker has none. The `name` and `id` fields are omitted when empty. By default the event is emitted only on a match; set `identity.announce_unknown: true` to also emit it (with `matched: false`) when no speaker is identified. + +This event is a LocalAI extension to the OpenAI Realtime API and is server-emitted only. Standard OpenAI Realtime clients ignore event types they do not recognize, so enabling it is non-breaking. + ## Examples - [Realtime voice assistant demo (Go)](https://github.com/localai-org/localai-realtime-demo): a minimal Go client for the Realtime (WebSocket) API with a full talk-back voice loop and an example tool call. Ships a `docker compose` setup that brings up a realtime-capable LocalAI for you. diff --git a/tests/e2e/e2e_suite_test.go b/tests/e2e/e2e_suite_test.go index 72ac88b74..5a257bdb0 100644 --- a/tests/e2e/e2e_suite_test.go +++ b/tests/e2e/e2e_suite_test.go @@ -275,6 +275,40 @@ var _ = BeforeSuite(func() { Expect(err).ToNot(HaveOccurred()) Expect(os.WriteFile(filepath.Join(modelsPath, "realtime-pipeline-gated.yaml"), gatedData, 0644)).To(Succeed()) + // Identity-surfacing pipeline: the same speaker backend, but enforce:false + // (never drop a turn) plus an identity block so the server emits the + // conversation.item.speaker event and personalizes the LLM turn. Used by the + // speaker-identity e2e specs. + identityCfg := map[string]any{ + "name": "realtime-pipeline-identity", + "pipeline": map[string]any{ + "vad": "mock-vad", + "transcription": "mock-stt", + "llm": "mock-llm", + "tts": "mock-tts", + "voice_recognition": map[string]any{ + "model": "mock-speaker", + "mode": "verify", + "threshold": 0.25, + "when": "every", + "enforce": false, + "references": []map[string]any{ + {"name": "e2e-speaker", "audio": voiceRefPath}, + }, + "identity": map[string]any{ + "announce": true, + "announce_unknown": true, + "personalize": true, + "inject_name": true, + "inject_system_note": true, + }, + }, + }, + } + identityData, err := yaml.Marshal(identityCfg) + Expect(err).ToNot(HaveOccurred()) + Expect(os.WriteFile(filepath.Join(modelsPath, "realtime-pipeline-identity.yaml"), identityData, 0644)).To(Succeed()) + // Router model setup: a score classifier (mock-backend Score) selects // between two candidate chat models based on keyword matches against the // candidate label fragments. Exercises the full RouteModel middleware path diff --git a/tests/e2e/realtime_speaker_identity_test.go b/tests/e2e/realtime_speaker_identity_test.go new file mode 100644 index 000000000..41a15f0fb --- /dev/null +++ b/tests/e2e/realtime_speaker_identity_test.go @@ -0,0 +1,95 @@ +package e2e_test + +import ( + "encoding/base64" + "time" + + "github.com/gorilla/websocket" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +// These specs drive the speaker-identity surfacing end to end against a real +// LocalAI server over a real WebSocket, using the mock backend's VoiceEmbed +// (DC-biased PCM -> one of two orthogonal speaker vectors). The pipeline is +// realtime-pipeline-identity: verify mode with enforce:false plus an identity +// block, so the server resolves the speaker, emits a conversation.item.speaker +// event, and never drops a turn. +var _ = Describe("Realtime speaker identity surfacing", Label("Realtime"), func() { + // open connects to the identity pipeline and disables server VAD so the + // test can commit the input buffer manually. + open := func() *websocket.Conn { + c := connectWS("realtime-pipeline-identity") + created := readServerEvent(c, 30*time.Second) + Expect(created["type"]).To(Equal("session.created")) + sendClientEvent(c, disableVADEvent()) + drainUntil(c, "session.updated", 10*time.Second) + return c + } + + commit := func(c *websocket.Conn, pcm []byte) { + sendClientEvent(c, map[string]any{ + "type": "input_audio_buffer.append", + "audio": base64.StdEncoding.EncodeToString(pcm), + }) + sendClientEvent(c, map[string]any{"type": "input_audio_buffer.commit"}) + } + + // collectUntilDone reads events until response.done (or timeout), returning + // the conversation.item.speaker event (nil if none) and whether the turn + // reached response.done. + collectUntilDone := func(c *websocket.Conn, timeout time.Duration) (speaker map[string]any, gotDone bool) { + deadline := time.Now().Add(timeout) + for time.Now().Before(deadline) { + evt := readServerEvent(c, time.Until(deadline)) + switch evt["type"] { + case "conversation.item.speaker": + speaker = evt + case "response.done": + return speaker, true + } + } + return speaker, false + } + + It("emits conversation.item.speaker naming an authorized speaker and still responds", func() { + c := open() + defer func() { _ = c.Close() }() + + // Positive DC bias matches the enrolled reference speaker. + commit(c, pcmWithDC(300, 16000, 1000, 8000)) + drainUntil(c, "input_audio_buffer.committed", 30*time.Second) + + speaker, gotDone := collectUntilDone(c, 60*time.Second) + Expect(speaker).ToNot(BeNil(), "expected a conversation.item.speaker event") + Expect(speaker["item_id"]).ToNot(BeEmpty()) + + spk, ok := speaker["speaker"].(map[string]any) + Expect(ok).To(BeTrue(), "speaker payload should be an object") + Expect(spk["matched"]).To(Equal(true)) + Expect(spk["name"]).To(Equal("e2e-speaker")) + + Expect(gotDone).To(BeTrue(), "enforce:false should let the turn reach response.done") + }) + + It("emits an unknown speaker event and still responds when enforce is false", func() { + c := open() + defer func() { _ = c.Close() }() + + // Negative DC bias is a different speaker that matches no reference. + commit(c, pcmWithDC(300, 16000, 1000, -8000)) + drainUntil(c, "input_audio_buffer.committed", 30*time.Second) + + speaker, gotDone := collectUntilDone(c, 60*time.Second) + Expect(speaker).ToNot(BeNil(), "announce_unknown should still emit the event") + + spk, ok := speaker["speaker"].(map[string]any) + Expect(ok).To(BeTrue(), "speaker payload should be an object") + Expect(spk["matched"]).To(Equal(false)) + // name is omitted for an unidentified speaker. + _, hasName := spk["name"] + Expect(hasName).To(BeFalse()) + + Expect(gotDone).To(BeTrue(), "enforce:false must not drop an unauthorized speaker") + }) +})