mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-27 09:57:14 -04:00
* feat(realtime): add voice_recognition enforce + identity config Add Enforce *bool and Identity *VoiceIdentityConfig to PipelineVoiceRecognition, plus EnforceGate/IdentityEnabled/ AnnounceEnabled/PersonalizeEnabled helpers. Enforce nil defaults to gating (backward compatible); identity surfacing is independent of the gate. Assisted-by: Claude:claude-opus-4-8 Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * feat(realtime): add Speaker type and conversation.item.speaker event Assisted-by: Claude:claude-opus-4-8 Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * refactor(realtime): split voiceGate into Resolve + authorize Split the speaker authorization into a Resolve step (embed once, produce a types.Speaker identity) and a pure authorize policy step, with a 0..100 confidence score mirroring /v1/voice/identify. The legacy Authorize wrapper is kept so existing specs stay green. Assisted-by: Claude:claude-opus-4-8 Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * feat(realtime): resolve speaker per turn and emit conversation.item.speaker Assisted-by: Claude:claude-opus-4-8 Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * feat(realtime): personalize LLM turns with recognized speaker Set the per-message name field on each recognized user turn and append a current-speaker note to the system message, both gated by the voice recognition identity config. Assisted-by: Claude:claude-opus-4-8 Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * docs(realtime): document speaker identity surfacing and personalization Document the new voice_recognition keys (enforce, identity.*) and the LocalAI-extension conversation.item.speaker server event in the realtime feature docs. Assisted-by: Claude:claude-opus-4-8 Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * test(realtime): cover when:first+identity re-resolution and multi-speaker history Add two integration specs to harden the speaker-aware realtime path: - when:first with an Identity block re-resolves the speaker every turn even though re-authorization is skipped after the first match: a later resolve error now fails closed, while a clean later resolve still surfaces and names the speaker. - multi-speaker history attribution: each user turn carries its own per-message name and the injected system note reflects the latest speaker. Test-only change; no production behavior was modified. Assisted-by: Claude:claude-opus-4-8 Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * feat(realtime): surface speaker labels in conversation.item.speaker Carry the registered speaker's labels (identify mode) on types.Speaker so they flow into the conversation.item.speaker event and the stored item. Verify mode has no labels, so the field is omitted there. Assisted-by: Claude:claude-opus-4-8 Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * test(e2e): cover conversation.item.speaker over a real websocket Add a realtime-pipeline-identity config (verify mode, enforce:false, identity announce+announce_unknown+personalize) and two e2e specs driving the real server over a real WebSocket with the mock VoiceEmbed backend: an authorized speaker yields a conversation.item.speaker event naming e2e-speaker (matched true) and reaches response.done; an unauthorized speaker yields an unknown (matched false, no name) event and still responds, proving enforce:false never drops a turn. Assisted-by: Claude:claude-opus-4-8 Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * fix(config): register voice_recognition enforce + identity fields The meta registry coverage test (TestAllFieldsHaveRegistryEntries) requires every config field to have an entry in core/config/meta/registry.go. The new voice_recognition.enforce and voice_recognition.identity.* fields were missing, failing tests-linux and tests-apple. Add registry entries (toggles) so the fields are surfaced in the model-config editor and the coverage test passes. Assisted-by: Claude:claude-opus-4-8 Signed-off-by: Ettore Di Giacinto <mudler@localai.io> --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Co-authored-by: Ettore Di Giacinto <mudler@localai.io> Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
96 lines
3.5 KiB
Go
96 lines
3.5 KiB
Go
package e2e_test
|
|
|
|
import (
|
|
"encoding/base64"
|
|
"time"
|
|
|
|
"github.com/gorilla/websocket"
|
|
. "github.com/onsi/ginkgo/v2"
|
|
. "github.com/onsi/gomega"
|
|
)
|
|
|
|
// These specs drive the speaker-identity surfacing end to end against a real
|
|
// LocalAI server over a real WebSocket, using the mock backend's VoiceEmbed
|
|
// (DC-biased PCM -> one of two orthogonal speaker vectors). The pipeline is
|
|
// realtime-pipeline-identity: verify mode with enforce:false plus an identity
|
|
// block, so the server resolves the speaker, emits a conversation.item.speaker
|
|
// event, and never drops a turn.
|
|
var _ = Describe("Realtime speaker identity surfacing", Label("Realtime"), func() {
|
|
// open connects to the identity pipeline and disables server VAD so the
|
|
// test can commit the input buffer manually.
|
|
open := func() *websocket.Conn {
|
|
c := connectWS("realtime-pipeline-identity")
|
|
created := readServerEvent(c, 30*time.Second)
|
|
Expect(created["type"]).To(Equal("session.created"))
|
|
sendClientEvent(c, disableVADEvent())
|
|
drainUntil(c, "session.updated", 10*time.Second)
|
|
return c
|
|
}
|
|
|
|
commit := func(c *websocket.Conn, pcm []byte) {
|
|
sendClientEvent(c, map[string]any{
|
|
"type": "input_audio_buffer.append",
|
|
"audio": base64.StdEncoding.EncodeToString(pcm),
|
|
})
|
|
sendClientEvent(c, map[string]any{"type": "input_audio_buffer.commit"})
|
|
}
|
|
|
|
// collectUntilDone reads events until response.done (or timeout), returning
|
|
// the conversation.item.speaker event (nil if none) and whether the turn
|
|
// reached response.done.
|
|
collectUntilDone := func(c *websocket.Conn, timeout time.Duration) (speaker map[string]any, gotDone bool) {
|
|
deadline := time.Now().Add(timeout)
|
|
for time.Now().Before(deadline) {
|
|
evt := readServerEvent(c, time.Until(deadline))
|
|
switch evt["type"] {
|
|
case "conversation.item.speaker":
|
|
speaker = evt
|
|
case "response.done":
|
|
return speaker, true
|
|
}
|
|
}
|
|
return speaker, false
|
|
}
|
|
|
|
It("emits conversation.item.speaker naming an authorized speaker and still responds", func() {
|
|
c := open()
|
|
defer func() { _ = c.Close() }()
|
|
|
|
// Positive DC bias matches the enrolled reference speaker.
|
|
commit(c, pcmWithDC(300, 16000, 1000, 8000))
|
|
drainUntil(c, "input_audio_buffer.committed", 30*time.Second)
|
|
|
|
speaker, gotDone := collectUntilDone(c, 60*time.Second)
|
|
Expect(speaker).ToNot(BeNil(), "expected a conversation.item.speaker event")
|
|
Expect(speaker["item_id"]).ToNot(BeEmpty())
|
|
|
|
spk, ok := speaker["speaker"].(map[string]any)
|
|
Expect(ok).To(BeTrue(), "speaker payload should be an object")
|
|
Expect(spk["matched"]).To(Equal(true))
|
|
Expect(spk["name"]).To(Equal("e2e-speaker"))
|
|
|
|
Expect(gotDone).To(BeTrue(), "enforce:false should let the turn reach response.done")
|
|
})
|
|
|
|
It("emits an unknown speaker event and still responds when enforce is false", func() {
|
|
c := open()
|
|
defer func() { _ = c.Close() }()
|
|
|
|
// Negative DC bias is a different speaker that matches no reference.
|
|
commit(c, pcmWithDC(300, 16000, 1000, -8000))
|
|
drainUntil(c, "input_audio_buffer.committed", 30*time.Second)
|
|
|
|
speaker, gotDone := collectUntilDone(c, 60*time.Second)
|
|
Expect(speaker).ToNot(BeNil(), "announce_unknown should still emit the event")
|
|
|
|
spk, ok := speaker["speaker"].(map[string]any)
|
|
Expect(ok).To(BeTrue(), "speaker payload should be an object")
|
|
Expect(spk["matched"]).To(Equal(false))
|
|
// name is omitted for an unidentified speaker.
|
|
_, hasName := spk["name"]
|
|
Expect(hasName).To(BeFalse())
|
|
|
|
Expect(gotDone).To(BeTrue(), "enforce:false must not drop an unauthorized speaker")
|
|
})
|
|
})
|