From 32c47706ae65ff50a4bc5b8891075af7f864e34e Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Sun, 21 Jun 2026 21:07:10 +0200 Subject: [PATCH] feat(realtime): speaker-aware conversations - surface identity to client and LLM (#10424) * feat(realtime): add voice_recognition enforce + identity config Add Enforce *bool and Identity *VoiceIdentityConfig to PipelineVoiceRecognition, plus EnforceGate/IdentityEnabled/ AnnounceEnabled/PersonalizeEnabled helpers. Enforce nil defaults to gating (backward compatible); identity surfacing is independent of the gate. Assisted-by: Claude:claude-opus-4-8 Signed-off-by: Ettore Di Giacinto * feat(realtime): add Speaker type and conversation.item.speaker event Assisted-by: Claude:claude-opus-4-8 Signed-off-by: Ettore Di Giacinto * refactor(realtime): split voiceGate into Resolve + authorize Split the speaker authorization into a Resolve step (embed once, produce a types.Speaker identity) and a pure authorize policy step, with a 0..100 confidence score mirroring /v1/voice/identify. The legacy Authorize wrapper is kept so existing specs stay green. Assisted-by: Claude:claude-opus-4-8 Signed-off-by: Ettore Di Giacinto * feat(realtime): resolve speaker per turn and emit conversation.item.speaker Assisted-by: Claude:claude-opus-4-8 Signed-off-by: Ettore Di Giacinto * feat(realtime): personalize LLM turns with recognized speaker Set the per-message name field on each recognized user turn and append a current-speaker note to the system message, both gated by the voice recognition identity config. Assisted-by: Claude:claude-opus-4-8 Signed-off-by: Ettore Di Giacinto * docs(realtime): document speaker identity surfacing and personalization Document the new voice_recognition keys (enforce, identity.*) and the LocalAI-extension conversation.item.speaker server event in the realtime feature docs. Assisted-by: Claude:claude-opus-4-8 Signed-off-by: Ettore Di Giacinto * test(realtime): cover when:first+identity re-resolution and multi-speaker history Add two integration specs to harden the speaker-aware realtime path: - when:first with an Identity block re-resolves the speaker every turn even though re-authorization is skipped after the first match: a later resolve error now fails closed, while a clean later resolve still surfaces and names the speaker. - multi-speaker history attribution: each user turn carries its own per-message name and the injected system note reflects the latest speaker. Test-only change; no production behavior was modified. Assisted-by: Claude:claude-opus-4-8 Signed-off-by: Ettore Di Giacinto * feat(realtime): surface speaker labels in conversation.item.speaker Carry the registered speaker's labels (identify mode) on types.Speaker so they flow into the conversation.item.speaker event and the stored item. Verify mode has no labels, so the field is omitted there. Assisted-by: Claude:claude-opus-4-8 Signed-off-by: Ettore Di Giacinto * test(e2e): cover conversation.item.speaker over a real websocket Add a realtime-pipeline-identity config (verify mode, enforce:false, identity announce+announce_unknown+personalize) and two e2e specs driving the real server over a real WebSocket with the mock VoiceEmbed backend: an authorized speaker yields a conversation.item.speaker event naming e2e-speaker (matched true) and reaches response.done; an unauthorized speaker yields an unknown (matched false, no name) event and still responds, proving enforce:false never drops a turn. Assisted-by: Claude:claude-opus-4-8 Signed-off-by: Ettore Di Giacinto * fix(config): register voice_recognition enforce + identity fields The meta registry coverage test (TestAllFieldsHaveRegistryEntries) requires every config field to have an entry in core/config/meta/registry.go. The new voice_recognition.enforce and voice_recognition.identity.* fields were missing, failing tests-linux and tests-apple. Add registry entries (toggles) so the fields are surfaced in the model-config editor and the coverage test passes. Assisted-by: Claude:claude-opus-4-8 Signed-off-by: Ettore Di Giacinto --------- Signed-off-by: Ettore Di Giacinto Co-authored-by: Ettore Di Giacinto Co-authored-by: Ettore Di Giacinto --- core/config/meta/registry.go | 49 ++++ core/config/model_config.go | 48 ++++ core/config/voice_gate_test.go | 28 ++ core/http/endpoints/openai/realtime.go | 155 +++++++---- .../endpoints/openai/realtime_doubles_test.go | 5 +- .../openai/realtime_speaker_event_test.go | 54 ++++ .../endpoints/openai/realtime_voicegate.go | 175 ++++++++---- .../realtime_voicegate_integration_test.go | 249 ++++++++++++++++++ .../openai/realtime_voicegate_test.go | 76 ++++++ .../endpoints/openai/types/message_item.go | 4 + .../endpoints/openai/types/server_events.go | 30 +++ core/http/endpoints/openai/types/speaker.go | 14 + docs/content/features/openai-realtime.md | 59 ++++- tests/e2e/e2e_suite_test.go | 34 +++ tests/e2e/realtime_speaker_identity_test.go | 95 +++++++ 15 files changed, 977 insertions(+), 98 deletions(-) create mode 100644 core/http/endpoints/openai/realtime_speaker_event_test.go create mode 100644 core/http/endpoints/openai/types/speaker.go create mode 100644 tests/e2e/realtime_speaker_identity_test.go diff --git a/core/config/meta/registry.go b/core/config/meta/registry.go index 84fc9afda..b7ffa9290 100644 --- a/core/config/meta/registry.go +++ b/core/config/meta/registry.go @@ -457,6 +457,55 @@ func DefaultRegistry() map[string]FieldMetaOverride { Component: "json-editor", Order: 78, }, + "pipeline.voice_recognition.enforce": { + Section: "pipeline", + Label: "Voice Gate Enforce", + Description: "Whether the gate rejects unauthorized speakers. Enabled (default) drops unauthorized utterances before the LLM. Disabled still resolves and surfaces the speaker (for the conversation.item.speaker event and personalization) but never drops a turn.", + Component: "toggle", + Order: 80, + }, + "pipeline.voice_recognition.identity.announce": { + Section: "pipeline", + Label: "Speaker Identity Announce", + Description: "Emit a conversation.item.speaker event to the client naming the recognized speaker. When set, identity is resolved on every turn even if 'when' is 'first'.", + Component: "toggle", + Order: 81, + }, + "pipeline.voice_recognition.identity.announce_unknown": { + Section: "pipeline", + Label: "Speaker Identity Announce Unknown", + Description: "Also emit the conversation.item.speaker event (with matched=false) when no confident match is found. Default only announces on a match.", + Component: "toggle", + Order: 82, + }, + "pipeline.voice_recognition.identity.personalize": { + Section: "pipeline", + Label: "Speaker Identity Personalize", + Description: "Inform the LLM who is speaking so it can tailor replies. Enables the name and system-note injection below.", + Component: "toggle", + Order: 83, + }, + "pipeline.voice_recognition.identity.inject_name": { + Section: "pipeline", + Label: "Speaker Identity Inject Name", + Description: "Personalization: set the per-message OpenAI 'name' field on each user turn to the recognized speaker.", + Component: "toggle", + Order: 84, + }, + "pipeline.voice_recognition.identity.inject_system_note": { + Section: "pipeline", + Label: "Speaker Identity Inject System Note", + Description: "Personalization: append a 'The current speaker is .' note to the system message reflecting the latest speaker.", + Component: "toggle", + Order: 85, + }, + "pipeline.voice_recognition.identity.note_unknown": { + Section: "pipeline", + Label: "Speaker Identity Note Unknown", + Description: "Personalization: when the speaker is unidentified, append 'The current speaker is unknown.' to the system message so the model can ask who it is talking to.", + Component: "toggle", + Order: 86, + }, "pipeline.max_history_items": { Section: "pipeline", Label: "Max History Items", diff --git a/core/config/model_config.go b/core/config/model_config.go index 9586beea3..5dbfd2026 100644 --- a/core/config/model_config.go +++ b/core/config/model_config.go @@ -769,6 +769,13 @@ type PipelineVoiceRecognition struct { Allow VoiceRecognitionAllow `yaml:"allow,omitempty" json:"allow,omitempty"` // References are the authorized reference speakers (verify mode). References []VoiceReference `yaml:"references,omitempty" json:"references,omitempty"` + // Enforce controls the authorization gate. A nil value or true rejects + // unauthorized speakers (the historical behavior). false resolves the + // speaker's identity for surfacing/personalization but never drops a turn. + Enforce *bool `yaml:"enforce,omitempty" json:"enforce,omitempty"` + // Identity surfaces the recognized speaker to the client and the LLM. It is + // independent of Enforce: identity can be surfaced without gating. + Identity *VoiceIdentityConfig `yaml:"identity,omitempty" json:"identity,omitempty"` } // @Description VoiceRecognitionAllow filters authorized registry identities. @@ -785,6 +792,25 @@ type VoiceReference struct { Audio string `yaml:"audio,omitempty" json:"audio,omitempty"` } +// @Description VoiceIdentityConfig surfaces the recognized speaker to the realtime +// client and the LLM. When set, identity is resolved on every turn even if the +// gate's When is "first" (the gate still authorizes only once). +type VoiceIdentityConfig struct { + // Announce emits a conversation.item.speaker event to the client. + Announce bool `yaml:"announce,omitempty" json:"announce,omitempty"` + // AnnounceUnknown also emits the event when there is no confident match. + AnnounceUnknown bool `yaml:"announce_unknown,omitempty" json:"announce_unknown,omitempty"` + // Personalize informs the LLM who is speaking. + Personalize bool `yaml:"personalize,omitempty" json:"personalize,omitempty"` + // InjectName sets the per-message name field on each user turn. + InjectName bool `yaml:"inject_name,omitempty" json:"inject_name,omitempty"` + // InjectSystemNote maintains a "current speaker" note in the system message. + InjectSystemNote bool `yaml:"inject_system_note,omitempty" json:"inject_system_note,omitempty"` + // NoteUnknown adds a "the current speaker is unknown" note (enables the model + // to ask who it is talking to). + NoteUnknown bool `yaml:"note_unknown,omitempty" json:"note_unknown,omitempty"` +} + // VoiceGateEnabled reports whether a voice-recognition gate is configured. The // mere presence of the block is the intent signal: a present-but-incomplete // block (e.g. missing model) must fail closed at construction, not be silently @@ -793,6 +819,28 @@ func (p Pipeline) VoiceGateEnabled() bool { return p.VoiceRecognition != nil } +// EnforceGate reports whether the gate rejects unauthorized speakers. A nil +// Enforce means "enforce" so existing configs keep gating. +func (p PipelineVoiceRecognition) EnforceGate() bool { + return p.Enforce == nil || *p.Enforce +} + +// IdentityEnabled reports whether the speaker's identity must be resolved for +// surfacing or personalization. +func (p PipelineVoiceRecognition) IdentityEnabled() bool { + return p.Identity != nil && (p.Identity.Announce || p.Identity.Personalize) +} + +// AnnounceEnabled reports whether to emit the conversation.item.speaker event. +func (p PipelineVoiceRecognition) AnnounceEnabled() bool { + return p.Identity != nil && p.Identity.Announce +} + +// PersonalizeEnabled reports whether to inform the LLM of the speaker. +func (p PipelineVoiceRecognition) PersonalizeEnabled() bool { + return p.Identity != nil && p.Identity.Personalize +} + // Normalize fills in defaults in place for omitted fields. func (v *PipelineVoiceRecognition) Normalize() { if v.Mode == "" { diff --git a/core/config/voice_gate_test.go b/core/config/voice_gate_test.go index 5c7782f1c..c0d25bf82 100644 --- a/core/config/voice_gate_test.go +++ b/core/config/voice_gate_test.go @@ -70,4 +70,32 @@ var _ = Describe("PipelineVoiceRecognition", func() { Expect((Pipeline{VoiceRecognition: &PipelineVoiceRecognition{}}).VoiceGateEnabled()).To(BeTrue()) }) }) + + Describe("Enforce / Identity helpers", func() { + It("treats a nil Enforce as enforcing (backward compatible)", func() { + v := PipelineVoiceRecognition{Model: "spk"} + Expect(v.EnforceGate()).To(BeTrue()) + }) + It("honors an explicit enforce:false", func() { + off := false + v := PipelineVoiceRecognition{Model: "spk", Enforce: &off} + Expect(v.EnforceGate()).To(BeFalse()) + }) + It("reports identity disabled when no identity block is set", func() { + v := PipelineVoiceRecognition{Model: "spk"} + Expect(v.IdentityEnabled()).To(BeFalse()) + Expect(v.AnnounceEnabled()).To(BeFalse()) + Expect(v.PersonalizeEnabled()).To(BeFalse()) + }) + It("reports identity enabled when announce or personalize is on", func() { + v := PipelineVoiceRecognition{Model: "spk", Identity: &VoiceIdentityConfig{Announce: true}} + Expect(v.IdentityEnabled()).To(BeTrue()) + Expect(v.AnnounceEnabled()).To(BeTrue()) + Expect(v.PersonalizeEnabled()).To(BeFalse()) + + v2 := PipelineVoiceRecognition{Model: "spk", Identity: &VoiceIdentityConfig{Personalize: true}} + Expect(v2.IdentityEnabled()).To(BeTrue()) + Expect(v2.PersonalizeEnabled()).To(BeTrue()) + }) + }) }) diff --git a/core/http/endpoints/openai/realtime.go b/core/http/endpoints/openai/realtime.go index 343ef4c07..8de50e580 100644 --- a/core/http/endpoints/openai/realtime.go +++ b/core/http/endpoints/openai/realtime.go @@ -1311,28 +1311,32 @@ func commitUtterance(ctx context.Context, utt []byte, session *Session, conv *Co // turn wastes only transcription compute, which has no side effects. The // transcript is still emitted to the same peer that sent the audio, which // reveals nothing new to them. - type gateOutcome struct { - allowed bool - matched string - reason string - err error + // Resolve the speaker when the gate must authorize this turn, or when identity + // surfacing/personalization needs a fresh identity. Identity resolution + // ignores the when:first short-circuit (that only skips re-authorization). + type resolveOutcome struct { + res resolution + err error } - var gateCh chan gateOutcome - runGate := false + var resolveCh chan resolveOutcome + runResolve := false if session.voiceGate != nil && session.InputAudioTranscription != nil { - skip := false - if session.voiceGate.cfg.When == config.VoiceGateWhenFirst { + enforce := session.voiceGate.cfg.EnforceGate() + gateNeedsAuth := enforce + if enforce && session.voiceGate.cfg.When == config.VoiceGateWhenFirst { session.gateMu.Lock() - skip = session.voiceVerified + if session.voiceVerified { + gateNeedsAuth = false + } session.gateMu.Unlock() } - if !skip { - runGate = true - gateCh = make(chan gateOutcome, 1) + if gateNeedsAuth || session.voiceGate.cfg.IdentityEnabled() { + runResolve = true + resolveCh = make(chan resolveOutcome, 1) wavPath := f.Name() go func() { - allowed, matched, reason, gerr := session.voiceGate.Authorize(ctx, wavPath) - gateCh <- gateOutcome{allowed: allowed, matched: matched, reason: reason, err: gerr} + r, rerr := session.voiceGate.Resolve(ctx, wavPath) + resolveCh <- resolveOutcome{res: r, err: rerr} }() } } @@ -1348,8 +1352,8 @@ func commitUtterance(ctx context.Context, utt []byte, session *Session, conv *Co if err != nil { // Drain the gate goroutine before returning so its in-flight read of // the temp WAV finishes before the deferred os.Remove fires. - if runGate { - <-gateCh + if runResolve { + <-resolveCh } sendError(t, "transcription_failed", err.Error(), "", "event_TODO") return @@ -1361,41 +1365,58 @@ func commitUtterance(ctx context.Context, utt []byte, session *Session, conv *Co return } - // Join on the gate before any side-effecting step. - if runGate { - out := <-gateCh - allowed := out.allowed - reason := out.reason + // Join on the resolution before any side-effecting step. + var speaker *types.Speaker + if runResolve { + out := <-resolveCh + enforce := session.voiceGate.cfg.EnforceGate() + if out.err != nil { - // Fail closed: a gate that cannot decide must not let audio through. - xlog.Error("voice recognition gate error", "error", out.err) - allowed = false - reason = "verification error" - } - alreadyVerified := false - if session.voiceGate.cfg.When == config.VoiceGateWhenFirst { - session.gateMu.Lock() - alreadyVerified = session.voiceVerified - session.gateMu.Unlock() - } - proceed, markVerified := session.voiceGate.decide(alreadyVerified, allowed) - if !proceed { - xlog.Debug("voice recognition gate rejected utterance", "reason", reason) - if session.voiceGate.cfg.OnReject == config.VoiceGateRejectEvent { - sendError(t, "speaker_not_authorized", "speaker not authorized: "+reason, "", "event_TODO") + if enforce { + // Fail closed: a gate that cannot decide must not let audio through. + xlog.Error("voice recognition gate error", "error", out.err) + if session.voiceGate.cfg.OnReject == config.VoiceGateRejectEvent { + sendError(t, "speaker_not_authorized", "speaker not authorized: verification error", "", "event_TODO") + } + return } - return + // Non-enforcing: degrade to an unknown speaker and continue. + xlog.Warn("voice identity resolve failed; continuing as unknown speaker", "error", out.err) + } else { + s := out.res.speaker + speaker = &s } - xlog.Debug("voice recognition gate authorized utterance", "speaker", out.matched) - if markVerified { - session.gateMu.Lock() - session.voiceVerified = true - session.gateMu.Unlock() + + if enforce { + alreadyVerified := false + if session.voiceGate.cfg.When == config.VoiceGateWhenFirst { + session.gateMu.Lock() + alreadyVerified = session.voiceVerified + session.gateMu.Unlock() + } + allowed, reason := false, "verification error" + if out.err == nil { + allowed, reason = session.voiceGate.authorize(out.res) + } + proceed, markVerified := session.voiceGate.decide(alreadyVerified, allowed) + if !proceed { + xlog.Debug("voice recognition gate rejected utterance", "reason", reason) + if session.voiceGate.cfg.OnReject == config.VoiceGateRejectEvent { + sendError(t, "speaker_not_authorized", "speaker not authorized: "+reason, "", "event_TODO") + } + return + } + if markVerified { + session.gateMu.Lock() + session.voiceVerified = true + session.gateMu.Unlock() + } + xlog.Debug("voice recognition gate authorized utterance", "speaker", out.res.speaker.Name) } } if !session.TranscriptionOnly { - generateResponse(ctx, session, utt, transcript, conv, t) + generateResponse(ctx, session, utt, transcript, speaker, conv, t) } } @@ -1419,15 +1440,28 @@ func runVAD(ctx context.Context, session *Session, adata []int16) ([]schema.VADS return resp.Segments, nil } +// speakerNote renders the system-prompt note for the current speaker. Returns +// an empty string when there is no name and unknown notes are disabled. +func speakerNote(s *types.Speaker, noteUnknown bool) string { + if s != nil && s.Matched && s.Name != "" { + return "The current speaker is " + s.Name + "." + } + if noteUnknown { + return "The current speaker is unknown." + } + return "" +} + // Function to generate a response based on the conversation -func generateResponse(ctx context.Context, session *Session, utt []byte, transcript string, conv *Conversation, t Transport) { +func generateResponse(ctx context.Context, session *Session, utt []byte, transcript string, speaker *types.Speaker, conv *Conversation, t Transport) { xlog.Debug("Generating realtime response...") // Create user message item item := types.MessageItemUnion{ User: &types.MessageItemUser{ - ID: generateItemID(), - Status: types.ItemStatusCompleted, + ID: generateItemID(), + Status: types.ItemStatusCompleted, + Speaker: speaker, Content: []types.MessageContentInput{ { Type: types.MessageContentTypeInputAudio, @@ -1445,6 +1479,17 @@ func generateResponse(ctx context.Context, session *Session, utt []byte, transcr Item: item, }) + // Surface the recognized speaker to the client. Skip the event for an + // unidentified speaker unless announce_unknown is set. + if speaker != nil && session.voiceGate != nil && session.voiceGate.cfg.AnnounceEnabled() { + if speaker.Matched || session.voiceGate.cfg.Identity.AnnounceUnknown { + sendEvent(t, types.ConversationItemSpeakerEvent{ + ItemID: item.User.ID, + Speaker: *speaker, + }) + } + } + triggerResponse(ctx, session, conv, t, nil) } @@ -1508,6 +1553,8 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa }) imgIndex := 0 + var lastUserSpeaker *types.Speaker + personalize := session.voiceGate != nil && session.voiceGate.cfg.PersonalizeEnabled() conv.Lock.Lock() items := trimRealtimeItems(conv.Items, session.MaxHistoryItems) for _, item := range items { @@ -1515,6 +1562,11 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa msg := schema.Message{ Role: string(types.MessageRoleUser), } + lastUserSpeaker = item.User.Speaker + if personalize && session.voiceGate.cfg.Identity.InjectName && + item.User.Speaker != nil && item.User.Speaker.Matched && item.User.Speaker.Name != "" { + msg.Name = item.User.Speaker.Name + } textContent := "" nrOfImgsInMessage := 0 for _, content := range item.User.Content { @@ -1601,6 +1653,13 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa } conv.Lock.Unlock() + if personalize && session.voiceGate.cfg.Identity.InjectSystemNote { + if note := speakerNote(lastUserSpeaker, session.voiceGate.cfg.Identity.NoteUnknown); note != "" { + conversationHistory[0].StringContent += "\n\n" + note + conversationHistory[0].Content = conversationHistory[0].StringContent + } + } + var images []string for _, m := range conversationHistory { images = append(images, m.StringImages...) diff --git a/core/http/endpoints/openai/realtime_doubles_test.go b/core/http/endpoints/openai/realtime_doubles_test.go index accd6af51..727ce7dcc 100644 --- a/core/http/endpoints/openai/realtime_doubles_test.go +++ b/core/http/endpoints/openai/realtime_doubles_test.go @@ -83,6 +83,8 @@ type fakeModel struct { predictChunkDeltas [][]*proto.ChatDelta predictResp backend.LLMResponse predictErr error + + lastMessages schema.Messages } func (m *fakeModel) VAD(context.Context, *schema.VADRequest) (*schema.VADResponse, error) { @@ -93,7 +95,8 @@ func (m *fakeModel) Transcribe(context.Context, string, string, bool, bool, stri return m.transcribeFinal, nil } -func (m *fakeModel) Predict(_ context.Context, _ schema.Messages, _, _, _ []string, cb func(string, backend.TokenUsage) bool, _ []types.ToolUnion, _ *types.ToolChoiceUnion, _, _ *int, _ map[string]float64) (func() (backend.LLMResponse, error), error) { +func (m *fakeModel) Predict(_ context.Context, msgs schema.Messages, _, _, _ []string, cb func(string, backend.TokenUsage) bool, _ []types.ToolUnion, _ *types.ToolChoiceUnion, _, _ *int, _ map[string]float64) (func() (backend.LLMResponse, error), error) { + m.lastMessages = msgs if m.predictErr != nil { return nil, m.predictErr } diff --git a/core/http/endpoints/openai/realtime_speaker_event_test.go b/core/http/endpoints/openai/realtime_speaker_event_test.go new file mode 100644 index 000000000..fbbe5ded9 --- /dev/null +++ b/core/http/endpoints/openai/realtime_speaker_event_test.go @@ -0,0 +1,54 @@ +package openai + +import ( + "encoding/json" + + "github.com/mudler/LocalAI/core/http/endpoints/openai/types" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +var _ = Describe("ConversationItemSpeakerEvent", func() { + It("marshals with the conversation.item.speaker type and nested speaker", func() { + ev := types.ConversationItemSpeakerEvent{ + ItemID: "item_123", + Speaker: types.Speaker{Name: "Jeremy", ID: "spk_1", Labels: map[string]string{"family": "yes"}, Confidence: 92, Distance: 0.1, Matched: true}, + } + b, err := json.Marshal(ev) + Expect(err).ToNot(HaveOccurred()) + + var got map[string]any + Expect(json.Unmarshal(b, &got)).To(Succeed()) + Expect(got["type"]).To(Equal("conversation.item.speaker")) + Expect(got["item_id"]).To(Equal("item_123")) + + spk := got["speaker"].(map[string]any) + Expect(spk["name"]).To(Equal("Jeremy")) + Expect(spk["id"]).To(Equal("spk_1")) + Expect(spk["matched"]).To(Equal(true)) + Expect(spk["labels"]).To(HaveKeyWithValue("family", "yes")) + }) + + It("omits labels when the speaker has none", func() { + ev := types.ConversationItemSpeakerEvent{ItemID: "i", Speaker: types.Speaker{Name: "Jeremy", Matched: true}} + b, err := json.Marshal(ev) + Expect(err).ToNot(HaveOccurred()) + var got map[string]any + Expect(json.Unmarshal(b, &got)).To(Succeed()) + spk := got["speaker"].(map[string]any) + _, hasLabels := spk["labels"] + Expect(hasLabels).To(BeFalse()) + }) + + It("omits the name for an unknown speaker but keeps matched=false", func() { + ev := types.ConversationItemSpeakerEvent{ItemID: "i", Speaker: types.Speaker{Matched: false}} + b, err := json.Marshal(ev) + Expect(err).ToNot(HaveOccurred()) + var got map[string]any + Expect(json.Unmarshal(b, &got)).To(Succeed()) + spk := got["speaker"].(map[string]any) + _, hasName := spk["name"] + Expect(hasName).To(BeFalse()) + Expect(spk["matched"]).To(Equal(false)) + }) +}) diff --git a/core/http/endpoints/openai/realtime_voicegate.go b/core/http/endpoints/openai/realtime_voicegate.go index 54332536f..9bd6f10f2 100644 --- a/core/http/endpoints/openai/realtime_voicegate.go +++ b/core/http/endpoints/openai/realtime_voicegate.go @@ -7,6 +7,7 @@ import ( "github.com/mudler/LocalAI/core/backend" "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/http/endpoints/openai/types" "github.com/mudler/LocalAI/core/services/voicerecognition" "github.com/mudler/LocalAI/pkg/model" ) @@ -29,6 +30,32 @@ type voiceGate struct { verifyFn func(ctx context.Context, uttWav, refWav string) (bool, error) } +// resolution is the outcome of resolving a committed utterance's speaker. It +// carries the surfacing-facing Speaker plus the metadata the policy layer needs +// (labels for the allow-list) and a human reason when no usable identity exists. +type resolution struct { + speaker types.Speaker // name/id/confidence/distance/matched + labels map[string]string // identify-mode metadata labels, for the allow-list + found bool // a candidate identity existed at all + reason string // why-unknown / deny reason at the resolve level +} + +// confidence maps a cosine distance to a 0..100 score relative to the match +// threshold, mirroring the /v1/voice/identify endpoint. +func confidence(distance, threshold float32) float32 { + if threshold <= 0 { + return 0 + } + c := (1 - distance/threshold) * 100 + if c < 0 { + return 0 + } + if c > 100 { + return 100 + } + return c +} + // newVoiceGate builds a gate from a pipeline's voice_recognition config. It // validates fail-fast (before loading the model), loads the recognition model // config, wires the real backend seams, and pre-embeds references for verify @@ -89,91 +116,143 @@ func newVoiceGate( return g, nil } -// Authorize embeds the utterance and decides allow/deny. -// -// allowed: speaker is authorized. -// matched: matched person's name (informational), empty if none. -// reason: human-readable deny reason. -// err: backend failure (caller should fail closed). -func (g *voiceGate) Authorize(ctx context.Context, wavPath string) (allowed bool, matched string, reason string, err error) { +// Resolve embeds the utterance once and resolves the speaker's identity. It does +// NOT apply the authorization policy (see authorize). On a backend error it +// returns the error and a resolution whose reason explains the failure. +func (g *voiceGate) Resolve(ctx context.Context, wavPath string) (resolution, error) { if g.cfg.Mode == config.VoiceGateModeVerify { - return g.authorizeVerify(ctx, wavPath) + return g.resolveVerify(ctx, wavPath) } - return g.authorizeIdentify(ctx, wavPath) + return g.resolveIdentify(ctx, wavPath) } -func (g *voiceGate) authorizeIdentify(ctx context.Context, wavPath string) (bool, string, string, error) { +func (g *voiceGate) resolveIdentify(ctx context.Context, wavPath string) (resolution, error) { emb, err := g.embedFn(ctx, wavPath) if err != nil { - return false, "", "embed failed", err + return resolution{reason: "embed failed"}, err } if len(emb) == 0 { - return false, "", "no speech detected", nil + return resolution{reason: "no speech detected"}, nil } matches, err := g.registry.Identify(ctx, emb, 1) if err != nil { - return false, "", "identify failed", err + return resolution{reason: "identify failed"}, err } if len(matches) == 0 { - return false, "", "unknown speaker", nil + return resolution{reason: "unknown speaker"}, nil } m := matches[0] - if m.Distance > g.cfg.Threshold { - return false, m.Metadata.Name, "distance above threshold", nil + matched := m.Distance <= g.cfg.Threshold + r := resolution{ + speaker: types.Speaker{ + Name: m.Metadata.Name, + ID: m.Metadata.ID, + Labels: m.Metadata.Labels, + Distance: m.Distance, + Confidence: confidence(m.Distance, g.cfg.Threshold), + Matched: matched, + }, + labels: m.Metadata.Labels, + found: true, } - if !g.allowMatch(m.Metadata) { - return false, m.Metadata.Name, "speaker not in allow list", nil + if !matched { + r.reason = "distance above threshold" } - return true, m.Metadata.Name, "", nil + return r, nil +} + +func (g *voiceGate) resolveVerify(ctx context.Context, wavPath string) (resolution, error) { + if g.cfg.AntiSpoofing { + for _, ref := range g.refAudios { + ok, err := g.verifyFn(ctx, wavPath, ref.Audio) + if err != nil { + return resolution{reason: "verify failed"}, err + } + if ok { + return resolution{ + speaker: types.Speaker{Name: ref.Name, Confidence: 100, Matched: true}, + found: true, + }, nil + } + } + return resolution{reason: "no reference matched"}, nil + } + + emb, err := g.embedFn(ctx, wavPath) + if err != nil { + return resolution{reason: "embed failed"}, err + } + if len(emb) == 0 { + return resolution{reason: "no speech detected"}, nil + } + for _, ref := range g.refEmbeds { + d := cosineDistance(emb, ref.emb) + if d <= g.cfg.Threshold { + return resolution{ + speaker: types.Speaker{Name: ref.name, Distance: d, Confidence: confidence(d, g.cfg.Threshold), Matched: true}, + found: true, + }, nil + } + } + return resolution{reason: "no reference matched"}, nil +} + +// authorize applies the gate's policy to an already-resolved identity. +func (g *voiceGate) authorize(r resolution) (allowed bool, reason string) { + if g.cfg.Mode == config.VoiceGateModeVerify { + if r.speaker.Matched { + return true, "" + } + if r.reason == "" { + return false, "no reference matched" + } + return false, r.reason + } + if !r.found { + return false, r.reason + } + if !r.speaker.Matched { + return false, "distance above threshold" + } + if !g.allowMatch(r.speaker.Name, r.labels) { + return false, "speaker not in allow list" + } + return true, "" } // allowMatch reports whether a matched identity is authorized. An empty allow // (no names and no labels) authorizes any registered speaker. -func (g *voiceGate) allowMatch(meta voicerecognition.Metadata) bool { +func (g *voiceGate) allowMatch(name string, labels map[string]string) bool { a := g.cfg.Allow if len(a.Names) == 0 && len(a.Labels) == 0 { return true } for _, n := range a.Names { - if n == meta.Name { + if n == name { return true } } for _, l := range a.Labels { - if _, ok := meta.Labels[l]; ok { + if _, ok := labels[l]; ok { return true } } return false } -func (g *voiceGate) authorizeVerify(ctx context.Context, wavPath string) (bool, string, string, error) { - if g.cfg.AntiSpoofing { - for _, r := range g.refAudios { - ok, err := g.verifyFn(ctx, wavPath, r.Audio) - if err != nil { - return false, "", "verify failed", err - } - if ok { - return true, r.Name, "", nil - } - } - return false, "", "no reference matched", nil +// Authorize is the legacy convenience wrapper: resolve then apply policy. +// +// allowed: speaker is authorized. +// matched: matched person's name (informational), empty if none. +// reason: human-readable deny reason. +// err: backend failure (caller should fail closed). +func (g *voiceGate) Authorize(ctx context.Context, wavPath string) (allowed bool, matched string, reason string, err error) { + r, rerr := g.Resolve(ctx, wavPath) + if rerr != nil { + return false, "", r.reason, rerr } - - emb, err := g.embedFn(ctx, wavPath) - if err != nil { - return false, "", "embed failed", err - } - if len(emb) == 0 { - return false, "", "no speech detected", nil - } - for _, r := range g.refEmbeds { - if cosineDistance(emb, r.emb) <= g.cfg.Threshold { - return true, r.name, "", nil - } - } - return false, "", "no reference matched", nil + allowed, reason = g.authorize(r) + return allowed, r.speaker.Name, reason, nil } // decide interprets an Authorize result against the gate's when-policy and the diff --git a/core/http/endpoints/openai/realtime_voicegate_integration_test.go b/core/http/endpoints/openai/realtime_voicegate_integration_test.go index f8aae72c5..b0f7f0b49 100644 --- a/core/http/endpoints/openai/realtime_voicegate_integration_test.go +++ b/core/http/endpoints/openai/realtime_voicegate_integration_test.go @@ -152,3 +152,252 @@ var _ = Describe("realtime voice gate integration (commitUtterance)", func() { Expect(tr2.countEvents(types.ServerEventTypeResponseDone)).To(BeNumerically(">=", 1)) }) }) + +var _ = Describe("realtime speaker surfacing (commitUtterance)", func() { + utt := make([]byte, 32) + + It("emits conversation.item.speaker for a confident match when announce is on", func() { + session, _ := itSession(itGate("alice", "alice", []float32{1, 0, 0}, nil, + config.VoiceGateWhenEvery, config.VoiceGateRejectEvent)) + session.voiceGate.cfg.Identity = &config.VoiceIdentityConfig{Announce: true} + tr := &fakeTransport{} + + commitUtterance(context.Background(), utt, session, &Conversation{}, tr) + + Expect(tr.countEvents(types.ServerEventTypeConversationItemSpeaker)).To(Equal(1)) + }) + + It("does not emit the speaker event for an unknown speaker unless announce_unknown is set", func() { + // match distance above threshold => not matched + gate := &voiceGate{ + cfg: config.PipelineVoiceRecognition{ + Mode: config.VoiceGateModeIdentify, Threshold: 0.25, + When: config.VoiceGateWhenEvery, OnReject: config.VoiceGateRejectEvent, + Enforce: boolPtr(false), + Identity: &config.VoiceIdentityConfig{Announce: true}, + }, + registry: &fakeRegistry{matches: []voicerecognition.Match{ + {Distance: 0.9, Metadata: voicerecognition.Metadata{Name: "alice"}}, + }}, + embedFn: func(context.Context, string) ([]float32, error) { return []float32{1, 0, 0}, nil }, + } + session, _ := itSession(gate) + tr := &fakeTransport{} + + commitUtterance(context.Background(), utt, session, &Conversation{}, tr) + Expect(tr.countEvents(types.ServerEventTypeConversationItemSpeaker)).To(Equal(0)) + + gate.cfg.Identity.AnnounceUnknown = true + tr2 := &fakeTransport{} + commitUtterance(context.Background(), utt, session, &Conversation{}, tr2) + Expect(tr2.countEvents(types.ServerEventTypeConversationItemSpeaker)).To(Equal(1)) + }) + + It("never drops a turn when enforce is false even for a disallowed speaker", func() { + session, _ := itSession(itGate("bob", "alice", []float32{1, 0, 0}, nil, + config.VoiceGateWhenEvery, config.VoiceGateRejectEvent)) + session.voiceGate.cfg.Enforce = boolPtr(false) + tr := &fakeTransport{} + + commitUtterance(context.Background(), utt, session, &Conversation{}, tr) + + Expect(hasSpeakerNotAuthorized(tr)).To(BeFalse()) + Expect(tr.countEvents(types.ServerEventTypeResponseDone)).To(BeNumerically(">=", 1)) + }) +}) + +var _ = Describe("realtime speaker personalization (triggerResponseAtTurn)", func() { + utt := make([]byte, 32) + + findRole := func(msgs schema.Messages, role string) *schema.Message { + for i := range msgs { + if msgs[i].Role == role { + return &msgs[i] + } + } + return nil + } + + It("sets the user message name and a current-speaker system note", func() { + session, m := itSession(itGate("alice", "alice", []float32{1, 0, 0}, nil, + config.VoiceGateWhenEvery, config.VoiceGateRejectEvent)) + session.voiceGate.cfg.Identity = &config.VoiceIdentityConfig{ + Personalize: true, InjectName: true, InjectSystemNote: true, + } + session.Instructions = "You are helpful." + tr := &fakeTransport{} + + commitUtterance(context.Background(), utt, session, &Conversation{}, tr) + + user := findRole(m.lastMessages, "user") + Expect(user).ToNot(BeNil()) + Expect(user.Name).To(Equal("alice")) + sys := findRole(m.lastMessages, "system") + Expect(sys).ToNot(BeNil()) + Expect(sys.StringContent).To(ContainSubstring("The current speaker is alice.")) + }) + + It("omits the unknown note unless note_unknown is set", func() { + base := func() (*Session, *fakeModel) { + gate := &voiceGate{ + cfg: config.PipelineVoiceRecognition{ + Mode: config.VoiceGateModeIdentify, Threshold: 0.25, + When: config.VoiceGateWhenEvery, OnReject: config.VoiceGateRejectEvent, + Enforce: boolPtr(false), + Identity: &config.VoiceIdentityConfig{Personalize: true, InjectSystemNote: true}, + }, + registry: &fakeRegistry{matches: []voicerecognition.Match{ + {Distance: 0.9, Metadata: voicerecognition.Metadata{Name: "alice"}}, + }}, + embedFn: func(context.Context, string) ([]float32, error) { return []float32{1, 0, 0}, nil }, + } + s, m := itSession(gate) + s.Instructions = "You are helpful." + return s, m + } + + s1, m1 := base() + commitUtterance(context.Background(), utt, s1, &Conversation{}, &fakeTransport{}) + Expect(findRole(m1.lastMessages, "system").StringContent).ToNot(ContainSubstring("unknown")) + + s2, m2 := base() + s2.voiceGate.cfg.Identity.NoteUnknown = true + commitUtterance(context.Background(), utt, s2, &Conversation{}, &fakeTransport{}) + Expect(findRole(m2.lastMessages, "system").StringContent).To(ContainSubstring("The current speaker is unknown.")) + }) +}) + +var _ = Describe("realtime when:first with identity (commitUtterance)", func() { + utt := make([]byte, 32) + + // statefulIdentityGate builds a when:first identify gate with an Identity + // block (so identity is resolved every turn) whose embedFn is driven by a + // per-turn counter: the failOnSecond flag makes the second and later embeds + // return an error, exercising the stricter fail-closed path on a re-resolve. + statefulIdentityGate := func(failOnSecond bool) *voiceGate { + calls := 0 + return &voiceGate{ + cfg: config.PipelineVoiceRecognition{ + Mode: config.VoiceGateModeIdentify, + Threshold: 0.25, + When: config.VoiceGateWhenFirst, + OnReject: config.VoiceGateRejectEvent, + Allow: config.VoiceRecognitionAllow{Names: []string{"alice"}}, + Identity: &config.VoiceIdentityConfig{Announce: true, Personalize: true, InjectName: true}, + }, + registry: &fakeRegistry{matches: []voicerecognition.Match{ + {Distance: 0.1, Metadata: voicerecognition.Metadata{Name: "alice"}}, + }}, + embedFn: func(context.Context, string) ([]float32, error) { + calls++ + if failOnSecond && calls > 1 { + return nil, errors.New("embed backend down") + } + return []float32{1, 0, 0}, nil + }, + } + } + + It("re-resolves identity every turn and fails closed when a later embed errors", func() { + gate := statefulIdentityGate(true) + session, _ := itSession(gate) + conv := &Conversation{} // shared so voiceVerified persists across turns + + // Turn 1: authorized; identity resolved, speaker surfaced, response runs. + tr1 := &fakeTransport{} + commitUtterance(context.Background(), utt, session, conv, tr1) + Expect(hasSpeakerNotAuthorized(tr1)).To(BeFalse()) + Expect(tr1.countEvents(types.ServerEventTypeConversationItemSpeaker)).To(Equal(1)) + Expect(tr1.countEvents(types.ServerEventTypeResponseDone)).To(BeNumerically(">=", 1)) + + // Turn 2: when:first would skip re-authorization, but the Identity block + // forces a fresh resolve. That resolve now errors, and because the gate + // enforces, the turn is dropped fail-closed rather than riding on the + // cached first verification. + tr2 := &fakeTransport{} + commitUtterance(context.Background(), utt, session, conv, tr2) + Expect(hasSpeakerNotAuthorized(tr2)).To(BeTrue()) + Expect(tr2.countEvents(types.ServerEventTypeResponseDone)).To(Equal(0)) + }) + + It("re-resolves identity every turn so a later turn still surfaces and names the speaker", func() { + gate := statefulIdentityGate(false) + session, m := itSession(gate) + conv := &Conversation{} + + tr1 := &fakeTransport{} + commitUtterance(context.Background(), utt, session, conv, tr1) + Expect(hasSpeakerNotAuthorized(tr1)).To(BeFalse()) + Expect(tr1.countEvents(types.ServerEventTypeResponseDone)).To(BeNumerically(">=", 1)) + + // Turn 2: authorization is skipped (when:first, already verified) but the + // speaker event still fires and the per-message name is set, proving the + // per-turn re-resolution (not the cached first verification) drove it. + tr2 := &fakeTransport{} + commitUtterance(context.Background(), utt, session, conv, tr2) + Expect(tr2.countEvents(types.ServerEventTypeConversationItemSpeaker)).To(Equal(1)) + var lastUser *schema.Message + for i := range m.lastMessages { + if m.lastMessages[i].Role == "user" { + lastUser = &m.lastMessages[i] + } + } + Expect(lastUser).ToNot(BeNil()) + Expect(lastUser.Name).To(Equal("alice")) + }) +}) + +var _ = Describe("realtime multi-speaker history attribution (triggerResponse)", func() { + userAudioItem := func(name, transcript string) *types.MessageItemUnion { + return &types.MessageItemUnion{ + User: &types.MessageItemUser{ + ID: generateItemID(), + Status: types.ItemStatusCompleted, + Speaker: &types.Speaker{Name: name, Matched: true}, + Content: []types.MessageContentInput{ + {Type: types.MessageContentTypeInputAudio, Transcript: transcript}, + }, + }, + } + } + + It("attributes each user turn to its own speaker and notes the latest one", func() { + session, m := itSession(itGate("alice", "alice", []float32{1, 0, 0}, nil, + config.VoiceGateWhenEvery, config.VoiceGateRejectEvent)) + session.Instructions = "You are helpful." + session.MaxHistoryItems = 10 // keep both items; 0 would mean "no trim" too + session.voiceGate.cfg.Identity = &config.VoiceIdentityConfig{ + Personalize: true, InjectName: true, InjectSystemNote: true, + } + + conv := &Conversation{Items: []*types.MessageItemUnion{ + userAudioItem("alice", "hello there"), + userAudioItem("bob", "what is the weather"), + }} + tr := &fakeTransport{} + + triggerResponse(context.Background(), session, conv, tr, nil) + + var users []*schema.Message + var sys *schema.Message + for i := range m.lastMessages { + switch m.lastMessages[i].Role { + case "user": + users = append(users, &m.lastMessages[i]) + case "system": + if sys == nil { + sys = &m.lastMessages[i] + } + } + } + Expect(users).To(HaveLen(2)) + Expect(users[0].Name).To(Equal("alice")) + Expect(users[1].Name).To(Equal("bob")) + + Expect(sys).ToNot(BeNil()) + Expect(sys.StringContent).To(ContainSubstring("The current speaker is bob.")) + Expect(sys.StringContent).ToNot(ContainSubstring("alice")) + }) +}) + +func boolPtr(b bool) *bool { return &b } diff --git a/core/http/endpoints/openai/realtime_voicegate_test.go b/core/http/endpoints/openai/realtime_voicegate_test.go index bdbbc2f4a..3d9b458e1 100644 --- a/core/http/endpoints/openai/realtime_voicegate_test.go +++ b/core/http/endpoints/openai/realtime_voicegate_test.go @@ -10,6 +10,82 @@ import ( . "github.com/onsi/gomega" ) +var _ = Describe("voiceGate.Resolve + authorize", func() { + mkGate := func(allow []string) *voiceGate { + return &voiceGate{ + cfg: config.PipelineVoiceRecognition{ + Mode: config.VoiceGateModeIdentify, + Threshold: 0.25, + Allow: config.VoiceRecognitionAllow{Names: allow}, + }, + registry: &fakeRegistry{matches: []voicerecognition.Match{ + {Distance: 0.1, Metadata: voicerecognition.Metadata{ID: "spk_1", Name: "alice", Labels: map[string]string{"family": "yes"}}}, + }}, + embedFn: func(context.Context, string) ([]float32, error) { return []float32{1, 0, 0}, nil }, + } + } + + It("resolves a confident identity with name, id and a 0..100 confidence", func() { + r, err := mkGate(nil).Resolve(context.Background(), "x.wav") + Expect(err).ToNot(HaveOccurred()) + Expect(r.found).To(BeTrue()) + Expect(r.speaker.Name).To(Equal("alice")) + Expect(r.speaker.ID).To(Equal("spk_1")) + Expect(r.speaker.Matched).To(BeTrue()) + Expect(r.speaker.Confidence).To(BeNumerically(">", 0)) + Expect(r.speaker.Confidence).To(BeNumerically("<=", 100)) + Expect(r.speaker.Labels).To(HaveKeyWithValue("family", "yes")) + }) + + It("marks a candidate above the threshold as not matched", func() { + g := mkGate(nil) + g.registry = &fakeRegistry{matches: []voicerecognition.Match{ + {Distance: 0.9, Metadata: voicerecognition.Metadata{Name: "alice"}}, + }} + r, err := g.Resolve(context.Background(), "x.wav") + Expect(err).ToNot(HaveOccurred()) + Expect(r.found).To(BeTrue()) + Expect(r.speaker.Matched).To(BeFalse()) + Expect(r.speaker.Name).To(Equal("alice")) // name still surfaced + }) + + It("authorize allows a confident match in the allow list", func() { + g := mkGate([]string{"alice"}) + r, _ := g.Resolve(context.Background(), "x.wav") + allowed, reason := g.authorize(r) + Expect(allowed).To(BeTrue()) + Expect(reason).To(BeEmpty()) + }) + + It("authorize denies a confident match outside the allow list", func() { + g := mkGate([]string{"bob"}) + r, _ := g.Resolve(context.Background(), "x.wav") + allowed, reason := g.authorize(r) + Expect(allowed).To(BeFalse()) + Expect(reason).To(Equal("speaker not in allow list")) + }) + + It("authorize allows by label when names do not match", func() { + g := mkGate(nil) + g.cfg.Allow = config.VoiceRecognitionAllow{Labels: []string{"family"}} + r, _ := g.Resolve(context.Background(), "x.wav") + allowed, _ := g.authorize(r) + Expect(allowed).To(BeTrue()) + }) +}) + +var _ = Describe("confidence", func() { + It("is 100 at zero distance", func() { + Expect(confidence(0, 0.25)).To(BeNumerically("~", 100, 1e-4)) + }) + It("clamps to 0 above the threshold", func() { + Expect(confidence(0.5, 0.25)).To(BeNumerically("~", 0, 1e-4)) + }) + It("is 0 for a non-positive threshold", func() { + Expect(confidence(0.1, 0)).To(BeNumerically("~", 0, 1e-4)) + }) +}) + var _ = Describe("cosineDistance", func() { It("is 0 for identical vectors", func() { Expect(cosineDistance([]float32{1, 0, 0}, []float32{1, 0, 0})).To(BeNumerically("~", 0, 1e-6)) diff --git a/core/http/endpoints/openai/types/message_item.go b/core/http/endpoints/openai/types/message_item.go index 52997fe8c..88d680648 100644 --- a/core/http/endpoints/openai/types/message_item.go +++ b/core/http/endpoints/openai/types/message_item.go @@ -102,6 +102,10 @@ type MessageItemUser struct { // The status of the item. Has no effect on the conversation. Status ItemStatus `json:"status,omitempty"` + + // Speaker is the recognized speaker for this audio turn (LocalAI extension). + // Used to attribute past turns when rebuilding the LLM message history. + Speaker *Speaker `json:"speaker,omitempty"` } func (m MessageItemUser) MessageItemType() MessageItemType { diff --git a/core/http/endpoints/openai/types/server_events.go b/core/http/endpoints/openai/types/server_events.go index bae680fd5..8183a8b78 100644 --- a/core/http/endpoints/openai/types/server_events.go +++ b/core/http/endpoints/openai/types/server_events.go @@ -20,6 +20,9 @@ const ( ServerEventTypeConversationItemInputAudioTranscriptionFailed ServerEventType = "conversation.item.input_audio_transcription.failed" ServerEventTypeConversationItemTruncated ServerEventType = "conversation.item.truncated" ServerEventTypeConversationItemDeleted ServerEventType = "conversation.item.deleted" + // ServerEventTypeConversationItemSpeaker is a LocalAI extension: it reports + // the recognized speaker for a user audio item. OpenAI clients ignore it. + ServerEventTypeConversationItemSpeaker ServerEventType = "conversation.item.speaker" ServerEventTypeInputAudioBufferCommitted ServerEventType = "input_audio_buffer.committed" ServerEventTypeInputAudioBufferCleared ServerEventType = "input_audio_buffer.cleared" ServerEventTypeInputAudioBufferSpeechStarted ServerEventType = "input_audio_buffer.speech_started" @@ -335,6 +338,33 @@ func (m ConversationItemAddedEvent) MarshalJSON() ([]byte, error) { return json.Marshal(shadow) } +// ConversationItemSpeakerEvent reports the recognized speaker for a user audio +// item. LocalAI extension; not part of the OpenAI Realtime API. +type ConversationItemSpeakerEvent struct { + ServerEventBase + // ItemID is the conversation item this speaker belongs to. + ItemID string `json:"item_id"` + // Speaker is the recognized identity. + Speaker Speaker `json:"speaker"` +} + +func (m ConversationItemSpeakerEvent) ServerEventType() ServerEventType { + return ServerEventTypeConversationItemSpeaker +} + +func (m ConversationItemSpeakerEvent) MarshalJSON() ([]byte, error) { + type typeAlias ConversationItemSpeakerEvent + type typeWrapper struct { + typeAlias + Type ServerEventType `json:"type"` + } + shadow := typeWrapper{ + typeAlias: typeAlias(m), + Type: m.ServerEventType(), + } + return json.Marshal(shadow) +} + // Returned when a conversation item is finalized. // // The event will include the full content of the Item except for audio data, which can be retrieved separately with a `conversation.item.retrieve` event if needed. diff --git a/core/http/endpoints/openai/types/speaker.go b/core/http/endpoints/openai/types/speaker.go new file mode 100644 index 000000000..a5b02c927 --- /dev/null +++ b/core/http/endpoints/openai/types/speaker.go @@ -0,0 +1,14 @@ +package types + +// Speaker is the recognized speaker for a committed audio turn. It is a LocalAI +// extension to the OpenAI Realtime schema, carried on the user conversation item +// and surfaced via the conversation.item.speaker event. Confidence is a 0..100 +// score relative to the match threshold (same formula as /v1/voice/identify). +type Speaker struct { + Name string `json:"name,omitempty"` + ID string `json:"id,omitempty"` + Labels map[string]string `json:"labels,omitempty"` + Confidence float32 `json:"confidence"` + Distance float32 `json:"distance"` + Matched bool `json:"matched"` +} diff --git a/docs/content/features/openai-realtime.md b/docs/content/features/openai-realtime.md index 51f8960bf..48cfc9332 100644 --- a/docs/content/features/openai-realtime.md +++ b/docs/content/features/openai-realtime.md @@ -141,6 +141,8 @@ The API follows the OpenAI Realtime API protocol for handling sessions, audio bu A pipeline realtime model can require speaker verification before it responds. Add a `voice_recognition` block under `pipeline`. When present, each committed utterance is verified against authorized speakers; unauthorized utterances are dropped before the LLM runs (no LLM call, no tool execution, no TTS). The session stays open. +The same block also drives two optional, independent behaviors: an authorization gate (`enforce`) and speaker surfacing/personalization (`identity`). Set `enforce: false` to keep recognizing the speaker without ever rejecting a turn. + ```yaml name: my-realtime pipeline: @@ -152,6 +154,7 @@ pipeline: model: speaker-recognition # the speaker-recognition backend model mode: identify # "identify" (registry) or "verify" (references) threshold: 0.25 # cosine distance; <= passes + enforce: true # authorization gate (default true) when: every # "every" (default) or "first" on_reject: drop_event # "drop_event" (default) or "drop_silent" anti_spoofing: false # optional liveness check (verify mode) @@ -170,19 +173,73 @@ pipeline: audio: /models/voices/bob.wav ``` +### Identifying speakers without gating + +To recognize who is speaking and surface it to the client and the LLM without ever rejecting a turn, set `enforce: false` and add an `identity` block. The `identity` block works with or without the gate; when it is set, the speaker is resolved on every turn even if `when: first`. + +```yaml +name: my-realtime +pipeline: + vad: silero-vad + transcription: whisper + llm: qwen + tts: kokoro + voice_recognition: + model: speaker-recognition + mode: identify + threshold: 0.25 + # Authorization gate. Defaults to enforcing (rejects unauthorized speakers). + # Set enforce:false to identify the speaker WITHOUT rejecting anyone. + enforce: false + when: every + # Surface the recognized speaker to the client and the LLM. Works with or + # without enforce; when set, identity is resolved on every turn even if + # when:first. + identity: + announce: true # emit the conversation.item.speaker event + announce_unknown: false # also emit it when there is no confident match + personalize: true # tell the LLM who is speaking + inject_name: true # set the per-message OpenAI name field + inject_system_note: true # append a "current speaker" line to the system message + note_unknown: false # append a "speaker is unknown" note when unidentified +``` + | Field | Meaning | |-------|---------| | `model` | Speaker-recognition backend model name. | | `mode` | `identify` matches against speakers registered via `/v1/voice/register`; `verify` matches against the `references` audios. | | `threshold` | Maximum cosine distance that still counts as a match (default ~0.25). | -| `when` | `every` verifies each utterance; `first` verifies once then trusts the session. | +| `enforce` | Authorization gate. `true` (or omitted) rejects unauthorized speakers (the gating behavior above). `false` resolves and surfaces the speaker without ever dropping a turn. | +| `when` | `every` verifies each utterance; `first` verifies once then trusts the session. When an `identity` block is set, the speaker is still resolved on every turn even with `first`. | | `on_reject` | `drop_event` drops and emits a `speaker_not_authorized` error event; `drop_silent` drops quietly. | | `anti_spoofing` | Verify mode only: runs the backend liveness check (slower). | | `allow.names` / `allow.labels` | identify mode: which registry identities are authorized. Empty = any registered speaker. | | `references` | verify mode: authorized reference speakers; the utterance passes if it matches any. | +| `identity.announce` | Emit the `conversation.item.speaker` event to the client (see below). | +| `identity.announce_unknown` | Also emit that event when there is no confident match. By default the event is emitted only on a match. | +| `identity.personalize` | Inform the LLM who is speaking. | +| `identity.inject_name` | Set the per-message OpenAI `name` field on each user turn. | +| `identity.inject_system_note` | Append a `The current speaker is .` line to the system message. | +| `identity.note_unknown` | When unidentified, append `The current speaker is unknown.` (lets the model ask who it is talking to). | `identify` mode requires the voice registry (speakers registered through `/v1/voice/register`). `verify` mode needs no registry: reference audios are embedded once at model load. +### The `conversation.item.speaker` event + +When `identity.announce` is enabled, the server emits a `conversation.item.speaker` event after the user conversation item, naming the recognized speaker: + +```json +{ + "type": "conversation.item.speaker", + "item_id": "item_abc", + "speaker": { "name": "Jeremy", "id": "spk_1", "labels": { "role": "owner" }, "confidence": 92.0, "distance": 0.1, "matched": true } +} +``` + +`confidence` is a 0-100 score, `distance` is the cosine distance, and `matched` is `true` when a confident match was found. `labels` carries any labels attached to the registered speaker (identify mode); it is omitted when the speaker has none. The `name` and `id` fields are omitted when empty. By default the event is emitted only on a match; set `identity.announce_unknown: true` to also emit it (with `matched: false`) when no speaker is identified. + +This event is a LocalAI extension to the OpenAI Realtime API and is server-emitted only. Standard OpenAI Realtime clients ignore event types they do not recognize, so enabling it is non-breaking. + ## Examples - [Realtime voice assistant demo (Go)](https://github.com/localai-org/localai-realtime-demo): a minimal Go client for the Realtime (WebSocket) API with a full talk-back voice loop and an example tool call. Ships a `docker compose` setup that brings up a realtime-capable LocalAI for you. diff --git a/tests/e2e/e2e_suite_test.go b/tests/e2e/e2e_suite_test.go index 72ac88b74..5a257bdb0 100644 --- a/tests/e2e/e2e_suite_test.go +++ b/tests/e2e/e2e_suite_test.go @@ -275,6 +275,40 @@ var _ = BeforeSuite(func() { Expect(err).ToNot(HaveOccurred()) Expect(os.WriteFile(filepath.Join(modelsPath, "realtime-pipeline-gated.yaml"), gatedData, 0644)).To(Succeed()) + // Identity-surfacing pipeline: the same speaker backend, but enforce:false + // (never drop a turn) plus an identity block so the server emits the + // conversation.item.speaker event and personalizes the LLM turn. Used by the + // speaker-identity e2e specs. + identityCfg := map[string]any{ + "name": "realtime-pipeline-identity", + "pipeline": map[string]any{ + "vad": "mock-vad", + "transcription": "mock-stt", + "llm": "mock-llm", + "tts": "mock-tts", + "voice_recognition": map[string]any{ + "model": "mock-speaker", + "mode": "verify", + "threshold": 0.25, + "when": "every", + "enforce": false, + "references": []map[string]any{ + {"name": "e2e-speaker", "audio": voiceRefPath}, + }, + "identity": map[string]any{ + "announce": true, + "announce_unknown": true, + "personalize": true, + "inject_name": true, + "inject_system_note": true, + }, + }, + }, + } + identityData, err := yaml.Marshal(identityCfg) + Expect(err).ToNot(HaveOccurred()) + Expect(os.WriteFile(filepath.Join(modelsPath, "realtime-pipeline-identity.yaml"), identityData, 0644)).To(Succeed()) + // Router model setup: a score classifier (mock-backend Score) selects // between two candidate chat models based on keyword matches against the // candidate label fragments. Exercises the full RouteModel middleware path diff --git a/tests/e2e/realtime_speaker_identity_test.go b/tests/e2e/realtime_speaker_identity_test.go new file mode 100644 index 000000000..41a15f0fb --- /dev/null +++ b/tests/e2e/realtime_speaker_identity_test.go @@ -0,0 +1,95 @@ +package e2e_test + +import ( + "encoding/base64" + "time" + + "github.com/gorilla/websocket" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +// These specs drive the speaker-identity surfacing end to end against a real +// LocalAI server over a real WebSocket, using the mock backend's VoiceEmbed +// (DC-biased PCM -> one of two orthogonal speaker vectors). The pipeline is +// realtime-pipeline-identity: verify mode with enforce:false plus an identity +// block, so the server resolves the speaker, emits a conversation.item.speaker +// event, and never drops a turn. +var _ = Describe("Realtime speaker identity surfacing", Label("Realtime"), func() { + // open connects to the identity pipeline and disables server VAD so the + // test can commit the input buffer manually. + open := func() *websocket.Conn { + c := connectWS("realtime-pipeline-identity") + created := readServerEvent(c, 30*time.Second) + Expect(created["type"]).To(Equal("session.created")) + sendClientEvent(c, disableVADEvent()) + drainUntil(c, "session.updated", 10*time.Second) + return c + } + + commit := func(c *websocket.Conn, pcm []byte) { + sendClientEvent(c, map[string]any{ + "type": "input_audio_buffer.append", + "audio": base64.StdEncoding.EncodeToString(pcm), + }) + sendClientEvent(c, map[string]any{"type": "input_audio_buffer.commit"}) + } + + // collectUntilDone reads events until response.done (or timeout), returning + // the conversation.item.speaker event (nil if none) and whether the turn + // reached response.done. + collectUntilDone := func(c *websocket.Conn, timeout time.Duration) (speaker map[string]any, gotDone bool) { + deadline := time.Now().Add(timeout) + for time.Now().Before(deadline) { + evt := readServerEvent(c, time.Until(deadline)) + switch evt["type"] { + case "conversation.item.speaker": + speaker = evt + case "response.done": + return speaker, true + } + } + return speaker, false + } + + It("emits conversation.item.speaker naming an authorized speaker and still responds", func() { + c := open() + defer func() { _ = c.Close() }() + + // Positive DC bias matches the enrolled reference speaker. + commit(c, pcmWithDC(300, 16000, 1000, 8000)) + drainUntil(c, "input_audio_buffer.committed", 30*time.Second) + + speaker, gotDone := collectUntilDone(c, 60*time.Second) + Expect(speaker).ToNot(BeNil(), "expected a conversation.item.speaker event") + Expect(speaker["item_id"]).ToNot(BeEmpty()) + + spk, ok := speaker["speaker"].(map[string]any) + Expect(ok).To(BeTrue(), "speaker payload should be an object") + Expect(spk["matched"]).To(Equal(true)) + Expect(spk["name"]).To(Equal("e2e-speaker")) + + Expect(gotDone).To(BeTrue(), "enforce:false should let the turn reach response.done") + }) + + It("emits an unknown speaker event and still responds when enforce is false", func() { + c := open() + defer func() { _ = c.Close() }() + + // Negative DC bias is a different speaker that matches no reference. + commit(c, pcmWithDC(300, 16000, 1000, -8000)) + drainUntil(c, "input_audio_buffer.committed", 30*time.Second) + + speaker, gotDone := collectUntilDone(c, 60*time.Second) + Expect(speaker).ToNot(BeNil(), "announce_unknown should still emit the event") + + spk, ok := speaker["speaker"].(map[string]any) + Expect(ok).To(BeTrue(), "speaker payload should be an object") + Expect(spk["matched"]).To(Equal(false)) + // name is omitted for an unidentified speaker. + _, hasName := spk["name"] + Expect(hasName).To(BeFalse()) + + Expect(gotDone).To(BeTrue(), "enforce:false must not drop an unauthorized speaker") + }) +})