diff --git a/core/config/meta/registry.go b/core/config/meta/registry.go
index c404effb2..c03d52ee4 100644
--- a/core/config/meta/registry.go
+++ b/core/config/meta/registry.go
@@ -355,6 +355,85 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Component:   "toggle",
 			Order:       69,
 		},
+		"pipeline.voice_recognition.model": {
+			Section:              "pipeline",
+			Label:                "Voice Recognition Model",
+			Description:          "Speaker-recognition backend model used to gate the pipeline behind speaker verification. Leave empty to disable the voice gate.",
+			Component:            "model-select",
+			AutocompleteProvider: ProviderModels,
+			Order:                70,
+		},
+		"pipeline.voice_recognition.mode": {
+			Section:     "pipeline",
+			Label:       "Voice Gate Mode",
+			Description: "How callers are authorized: 'identify' matches the speaker 1:N against the voice registry; 'verify' matches 1:few against the configured reference audios.",
+			Component:   "select",
+			Options: []FieldOption{
+				{Value: "identify", Label: "identify (registry)"},
+				{Value: "verify", Label: "verify (references)"},
+			},
+			Order: 71,
+		},
+		"pipeline.voice_recognition.threshold": {
+			Section:     "pipeline",
+			Label:       "Voice Gate Threshold",
+			Description: "Maximum cosine distance between the caller and an authorized speaker that still counts as a match. Lower is stricter. Default 0.25 is tuned for the ECAPA-TDNN encoder on VoxCeleb.",
+			Component:   "slider",
+			Min:         f64(0.01),
+			Max:         f64(2),
+			Step:        f64(0.01),
+			Order:       72,
+		},
+		"pipeline.voice_recognition.when": {
+			Section:     "pipeline",
+			Label:       "Voice Gate When",
+			Description: "How often to verify the speaker: 'every' checks each utterance; 'first' verifies once and then trusts the session.",
+			Component:   "select",
+			Options: []FieldOption{
+				{Value: "every", Label: "every utterance"},
+				{Value: "first", Label: "first only"},
+			},
+			Order: 73,
+		},
+		"pipeline.voice_recognition.on_reject": {
+			Section:     "pipeline",
+			Label:       "Voice Gate On Reject",
+			Description: "What to do with an unauthorized utterance: 'drop_event' drops it and emits an error event to the client; 'drop_silent' drops it quietly.",
+			Component:   "select",
+			Options: []FieldOption{
+				{Value: "drop_event", Label: "drop + error event"},
+				{Value: "drop_silent", Label: "drop silently"},
+			},
+			Order: 74,
+		},
+		"pipeline.voice_recognition.anti_spoofing": {
+			Section:     "pipeline",
+			Label:       "Voice Gate Anti-Spoofing",
+			Description: "Enable the backend liveness/anti-spoofing check (verify mode only) to reject replayed or synthesized audio.",
+			Component:   "toggle",
+			Order:       75,
+		},
+		"pipeline.voice_recognition.allow.names": {
+			Section:     "pipeline",
+			Label:       "Voice Gate Allowed Names",
+			Description: "Identify mode: authorize only registry identities whose name matches one of these exactly. Empty allows any registered identity.",
+			Component:   "string-list",
+			Order:       76,
+		},
+		"pipeline.voice_recognition.allow.labels": {
+			Section:     "pipeline",
+			Label:       "Voice Gate Allowed Labels",
+			Description: "Identify mode: authorize any registry identity carrying one of these label keys. Empty allows any registered identity.",
+			Component:   "string-list",
+			Order:       77,
+		},
+		"pipeline.voice_recognition.references": {
+			Section:     "pipeline",
+			Label:       "Voice Gate References",
+			Description: "Verify mode: the authorized reference speakers, each with a name and an audio file path the caller's voice is matched against.",
+			Component:   "json-editor",
+			Order:       78,
+		},
 
 		// --- Functions ---
 		"function.grammar.parallel_calls": {
diff --git a/core/config/model_config.go b/core/config/model_config.go
index 6b1de098e..195739654 100644
--- a/core/config/model_config.go
+++ b/core/config/model_config.go
@@ -509,6 +509,10 @@ type Pipeline struct {
 	// to enable_thinking=false backend metadata) without editing the underlying
 	// LLM model config. Unset leaves the LLM model config in charge.
 	DisableThinking *bool `yaml:"disable_thinking,omitempty" json:"disable_thinking,omitempty"`
+
+	// VoiceRecognition gates the pipeline behind speaker verification. Nil
+	// (block absent) means no gate, preserving existing behavior.
+	VoiceRecognition *PipelineVoiceRecognition `yaml:"voice_recognition,omitempty" json:"voice_recognition,omitempty"`
 }
 
 // ApplyReasoningEffort resolves the effective reasoning effort — a per-request
@@ -575,6 +579,123 @@ func (p Pipeline) ThinkingDisabled() bool {
 	return p.DisableThinking != nil && *p.DisableThinking
 }
 
+// Voice-recognition gate enum values.
+const (
+	VoiceGateModeIdentify = "identify"
+	VoiceGateModeVerify   = "verify"
+	VoiceGateWhenEvery    = "every"
+	VoiceGateWhenFirst    = "first"
+	VoiceGateRejectEvent  = "drop_event"
+	VoiceGateRejectSilent = "drop_silent"
+
+	// defaultVoiceGateThreshold is the cosine-distance default tuned for the
+	// ECAPA-TDNN speaker encoder on VoxCeleb.
+	defaultVoiceGateThreshold = 0.25
+)
+
+// @Description PipelineVoiceRecognition gates a realtime pipeline behind speaker verification.
+type PipelineVoiceRecognition struct {
+	// Model is the speaker-recognition backend model name.
+	Model string `yaml:"model,omitempty" json:"model,omitempty"`
+	// Mode is "identify" (1:N against the voice registry) or "verify"
+	// (1:few against reference audios).
+	Mode string `yaml:"mode,omitempty" json:"mode,omitempty"`
+	// Threshold is the maximum cosine distance that still counts as a match.
+	Threshold float32 `yaml:"threshold,omitempty" json:"threshold,omitempty"`
+	// When is "every" (verify each utterance) or "first" (verify once, then
+	// trust the session).
+	When string `yaml:"when,omitempty" json:"when,omitempty"`
+	// OnReject is "drop_event" (drop + emit an error event) or "drop_silent"
+	// (drop quietly).
+	OnReject string `yaml:"on_reject,omitempty" json:"on_reject,omitempty"`
+	// AntiSpoofing enables the backend liveness check (verify mode only).
+	AntiSpoofing bool `yaml:"anti_spoofing,omitempty" json:"anti_spoofing,omitempty"`
+	// Allow filters which registry identities are authorized (identify mode).
+	Allow VoiceRecognitionAllow `yaml:"allow,omitempty" json:"allow,omitempty"`
+	// References are the authorized reference speakers (verify mode).
+	References []VoiceReference `yaml:"references,omitempty" json:"references,omitempty"`
+}
+
+// @Description VoiceRecognitionAllow filters authorized registry identities.
+type VoiceRecognitionAllow struct {
+	// Names matches registered Metadata.Name exactly.
+	Names []string `yaml:"names,omitempty" json:"names,omitempty"`
+	// Labels authorizes any identity carrying a matching label key.
+	Labels []string `yaml:"labels,omitempty" json:"labels,omitempty"`
+}
+
+// @Description VoiceReference is one authorized reference speaker for verify mode.
+type VoiceReference struct {
+	Name  string `yaml:"name,omitempty" json:"name,omitempty"`
+	Audio string `yaml:"audio,omitempty" json:"audio,omitempty"`
+}
+
+// VoiceGateEnabled reports whether a voice-recognition gate is configured. The
+// mere presence of the block is the intent signal: a present-but-incomplete
+// block (e.g. missing model) must fail closed at construction, not be silently
+// skipped here.
+func (p Pipeline) VoiceGateEnabled() bool {
+	return p.VoiceRecognition != nil
+}
+
+// Normalize fills in defaults in place for omitted fields.
+func (v *PipelineVoiceRecognition) Normalize() {
+	if v.Mode == "" {
+		v.Mode = VoiceGateModeIdentify
+	}
+	if v.When == "" {
+		v.When = VoiceGateWhenEvery
+	}
+	if v.OnReject == "" {
+		v.OnReject = VoiceGateRejectEvent
+	}
+	if v.Threshold == 0 {
+		v.Threshold = defaultVoiceGateThreshold
+	}
+}
+
+// Validate checks shape and enum values. registryAvailable indicates whether a
+// VoiceRegistry exists (required by identify mode). Empty When/OnReject/Mode are
+// treated as valid because Normalize defaults them.
+func (v PipelineVoiceRecognition) Validate(registryAvailable bool) error {
+	if v.Model == "" {
+		return fmt.Errorf("voice_recognition: model is required")
+	}
+	switch v.Mode {
+	case "", VoiceGateModeIdentify:
+		if !registryAvailable {
+			return fmt.Errorf("voice_recognition mode 'identify' requires a voice registry")
+		}
+	case VoiceGateModeVerify:
+		if len(v.References) == 0 {
+			return fmt.Errorf("voice_recognition mode 'verify' requires at least one reference")
+		}
+		for i, r := range v.References {
+			if r.Audio == "" {
+				return fmt.Errorf("voice_recognition reference %d (%q) is missing an audio path", i, r.Name)
+			}
+		}
+	default:
+		return fmt.Errorf("voice_recognition: unknown mode %q", v.Mode)
+	}
+	switch v.When {
+	case "", VoiceGateWhenEvery, VoiceGateWhenFirst:
+	default:
+		return fmt.Errorf("voice_recognition: unknown when %q", v.When)
+	}
+	switch v.OnReject {
+	case "", VoiceGateRejectEvent, VoiceGateRejectSilent:
+	default:
+		return fmt.Errorf("voice_recognition: unknown on_reject %q", v.OnReject)
+	}
+	// A zero threshold means "unset" (Normalize defaults it); only validate an
+	// explicitly-set value. Cosine distance ranges 0..2.
+	if v.Threshold != 0 && (v.Threshold < 0 || v.Threshold > 2) {
+		return fmt.Errorf("voice_recognition: threshold %v out of range (0..2)", v.Threshold)
+	}
+	return nil
+}
+
 // @Description File configuration for model downloads
 type File struct {
 	Filename string         `yaml:"filename,omitempty" json:"filename,omitempty"`
diff --git a/core/config/voice_gate_test.go b/core/config/voice_gate_test.go
new file mode 100644
index 000000000..5c7782f1c
--- /dev/null
+++ b/core/config/voice_gate_test.go
@@ -0,0 +1,73 @@
+package config
+
+import (
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("PipelineVoiceRecognition", func() {
+	Describe("Normalize", func() {
+		It("fills defaults for empty fields", func() {
+			v := PipelineVoiceRecognition{Model: "spk"}
+			v.Normalize()
+			Expect(v.Mode).To(Equal(VoiceGateModeIdentify))
+			Expect(v.When).To(Equal(VoiceGateWhenEvery))
+			Expect(v.OnReject).To(Equal(VoiceGateRejectEvent))
+			Expect(v.Threshold).To(BeNumerically("~", defaultVoiceGateThreshold, 1e-6))
+		})
+		It("keeps explicit values", func() {
+			v := PipelineVoiceRecognition{Model: "spk", Mode: VoiceGateModeVerify, When: VoiceGateWhenFirst, OnReject: VoiceGateRejectSilent, Threshold: 0.4}
+			v.Normalize()
+			Expect(v.Mode).To(Equal(VoiceGateModeVerify))
+			Expect(v.When).To(Equal(VoiceGateWhenFirst))
+			Expect(v.OnReject).To(Equal(VoiceGateRejectSilent))
+			Expect(v.Threshold).To(BeNumerically("~", 0.4, 1e-6))
+		})
+	})
+
+	Describe("Validate", func() {
+		It("requires a registry for identify mode", func() {
+			v := PipelineVoiceRecognition{Model: "spk", Mode: VoiceGateModeIdentify}
+			Expect(v.Validate(false)).To(HaveOccurred())
+			Expect(v.Validate(true)).ToNot(HaveOccurred())
+		})
+		It("requires references for verify mode", func() {
+			v := PipelineVoiceRecognition{Model: "spk", Mode: VoiceGateModeVerify}
+			Expect(v.Validate(false)).To(HaveOccurred())
+			v.References = []VoiceReference{{Name: "a", Audio: "/a.wav"}}
+			Expect(v.Validate(false)).ToNot(HaveOccurred())
+		})
+		It("rejects a reference with no audio path", func() {
+			v := PipelineVoiceRecognition{Model: "spk", Mode: VoiceGateModeVerify, References: []VoiceReference{{Name: "a"}}}
+			Expect(v.Validate(false)).To(HaveOccurred())
+		})
+		It("rejects unknown enum values", func() {
+			Expect((PipelineVoiceRecognition{Model: "spk", Mode: "bogus"}).Validate(true)).To(HaveOccurred())
+			Expect((PipelineVoiceRecognition{Model: "spk", Mode: VoiceGateModeIdentify, When: "bogus"}).Validate(true)).To(HaveOccurred())
+			Expect((PipelineVoiceRecognition{Model: "spk", Mode: VoiceGateModeIdentify, OnReject: "bogus"}).Validate(true)).To(HaveOccurred())
+		})
+		It("accepts a zero (unset) threshold", func() {
+			v := PipelineVoiceRecognition{Model: "spk", Mode: VoiceGateModeIdentify, Threshold: 0}
+			Expect(v.Validate(true)).ToNot(HaveOccurred())
+		})
+		It("rejects an out-of-range threshold", func() {
+			Expect((PipelineVoiceRecognition{Model: "spk", Mode: VoiceGateModeIdentify, Threshold: 5}).Validate(true)).To(HaveOccurred())
+			Expect((PipelineVoiceRecognition{Model: "spk", Mode: VoiceGateModeIdentify, Threshold: -1}).Validate(true)).To(HaveOccurred())
+		})
+		It("rejects an empty model", func() {
+			Expect((PipelineVoiceRecognition{Mode: VoiceGateModeIdentify}).Validate(true)).To(HaveOccurred())
+		})
+	})
+
+	Describe("VoiceGateEnabled", func() {
+		It("is false when block absent", func() {
+			Expect((Pipeline{}).VoiceGateEnabled()).To(BeFalse())
+		})
+		It("is true when a model is set", func() {
+			Expect((Pipeline{VoiceRecognition: &PipelineVoiceRecognition{Model: "spk"}}).VoiceGateEnabled()).To(BeTrue())
+		})
+		It("is true when the block is present even without a model (fails closed downstream)", func() {
+			Expect((Pipeline{VoiceRecognition: &PipelineVoiceRecognition{}}).VoiceGateEnabled()).To(BeTrue())
+		})
+	})
+})
diff --git a/core/http/endpoints/openai/realtime.go b/core/http/endpoints/openai/realtime.go
index 63b1a1589..f626a895c 100644
--- a/core/http/endpoints/openai/realtime.go
+++ b/core/http/endpoints/openai/realtime.go
@@ -133,6 +133,13 @@ type Session struct {
 	// silently strip Manage Mode's tools.
 	AssistantTools []types.ToolUnion
 
+	// voiceGate is non-nil when pipeline.voice_recognition is configured. It
+	// authorizes each committed utterance's speaker before the LLM runs.
+	voiceGate *voiceGate
+	// gateMu guards the when:first verification state below.
+	gateMu        sync.Mutex
+	voiceVerified bool
+
 	// Response cancellation: protects activeResponseCancel/activeResponseDone
 	responseMu           sync.Mutex
 	activeResponseCancel context.CancelFunc
@@ -514,6 +521,23 @@ func runRealtimeSession(application *application.Application, t Transport, model
 	}
 	session.ModelInterface = m
 
+	if cfg.Pipeline.VoiceGateEnabled() {
+		gate, gerr := newVoiceGate(
+			*cfg.Pipeline.VoiceRecognition,
+			application.ModelConfigLoader(),
+			application.ModelLoader(),
+			application.ApplicationConfig(),
+			application.VoiceRegistry(),
+		)
+		if gerr != nil {
+			xlog.Error("failed to initialize voice recognition gate", "error", gerr)
+			sendError(t, "voice_gate_error", gerr.Error(), "", "")
+			return
+		}
+		session.voiceGate = gate
+		xlog.Info("realtime voice recognition gate enabled", "mode", gate.cfg.Mode, "when", gate.cfg.When)
+	}
+
 	// Store the session and notify the transport (for WebRTC audio track handling)
 	sessionLock.Lock()
 	sessions[sessionID] = session
@@ -1269,6 +1293,39 @@ func commitUtterance(ctx context.Context, utt []byte, session *Session, conv *Co
 
 	f.Sync()
 
+	// Start speaker verification concurrently with transcription. This is a
+	// latency optimization only: there is a hard join below before the LLM, so
+	// an unauthorized utterance never reaches generateResponse (no LLM, no
+	// tools, no TTS) regardless of how fast transcription finishes. A rejected
+	// turn wastes only transcription compute, which has no side effects. The
+	// transcript is still emitted to the same peer that sent the audio, which
+	// reveals nothing new to them.
+	type gateOutcome struct {
+		allowed bool
+		matched string
+		reason  string
+		err     error
+	}
+	var gateCh chan gateOutcome
+	runGate := false
+	if session.voiceGate != nil && session.InputAudioTranscription != nil {
+		skip := false
+		if session.voiceGate.cfg.When == config.VoiceGateWhenFirst {
+			session.gateMu.Lock()
+			skip = session.voiceVerified
+			session.gateMu.Unlock()
+		}
+		if !skip {
+			runGate = true
+			gateCh = make(chan gateOutcome, 1)
+			wavPath := f.Name()
+			go func() {
+				allowed, matched, reason, gerr := session.voiceGate.Authorize(ctx, wavPath)
+				gateCh <- gateOutcome{allowed: allowed, matched: matched, reason: reason, err: gerr}
+			}()
+		}
+	}
+
 	// TODO: If we have a real any-to-any model then transcription is optional
 	var transcript string
 	if session.InputAudioTranscription != nil {
@@ -1278,14 +1335,54 @@ func commitUtterance(ctx context.Context, utt []byte, session *Session, conv *Co
 		var err error
 		transcript, err = emitTranscription(ctx, t, session, generateItemID(), f.Name())
 		if err != nil {
+			// Drain the gate goroutine before returning so its in-flight read of
+			// the temp WAV finishes before the deferred os.Remove fires.
+			if runGate {
+				<-gateCh
+			}
 			sendError(t, "transcription_failed", err.Error(), "", "event_TODO")
 			return
 		}
 	} else {
+		// The voice gate runs only on the transcription path above; if an
+		// any-to-any model path is added here, join the gate before responding.
 		sendNotImplemented(t, "any-to-any models")
 		return
 	}
 
+	// Join on the gate before any side-effecting step.
+	if runGate {
+		out := <-gateCh
+		allowed := out.allowed
+		reason := out.reason
+		if out.err != nil {
+			// Fail closed: a gate that cannot decide must not let audio through.
+			xlog.Error("voice recognition gate error", "error", out.err)
+			allowed = false
+			reason = "verification error"
+		}
+		alreadyVerified := false
+		if session.voiceGate.cfg.When == config.VoiceGateWhenFirst {
+			session.gateMu.Lock()
+			alreadyVerified = session.voiceVerified
+			session.gateMu.Unlock()
+		}
+		proceed, markVerified := session.voiceGate.decide(alreadyVerified, allowed)
+		if !proceed {
+			xlog.Debug("voice recognition gate rejected utterance", "reason", reason)
+			if session.voiceGate.cfg.OnReject == config.VoiceGateRejectEvent {
+				sendError(t, "speaker_not_authorized", "speaker not authorized: "+reason, "", "event_TODO")
+			}
+			return
+		}
+		xlog.Debug("voice recognition gate authorized utterance", "speaker", out.matched)
+		if markVerified {
+			session.gateMu.Lock()
+			session.voiceVerified = true
+			session.gateMu.Unlock()
+		}
+	}
+
 	if !session.TranscriptionOnly {
 		generateResponse(ctx, session, utt, transcript, conv, t)
 	}
diff --git a/core/http/endpoints/openai/realtime_voicegate.go b/core/http/endpoints/openai/realtime_voicegate.go
new file mode 100644
index 000000000..54332536f
--- /dev/null
+++ b/core/http/endpoints/openai/realtime_voicegate.go
@@ -0,0 +1,212 @@
+package openai
+
+import (
+	"context"
+	"fmt"
+	"math"
+
+	"github.com/mudler/LocalAI/core/backend"
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/services/voicerecognition"
+	"github.com/mudler/LocalAI/pkg/model"
+)
+
+type namedEmbedding struct {
+	name string
+	emb  []float32
+}
+
+// voiceGate decides whether a committed utterance's speaker is authorized to
+// drive the realtime pipeline.
+type voiceGate struct {
+	cfg       config.PipelineVoiceRecognition // normalized
+	registry  voicerecognition.Registry       // identify mode (nil otherwise)
+	refEmbeds []namedEmbedding                // verify mode, pre-embedded refs
+	refAudios []config.VoiceReference         // verify + anti-spoofing: ref paths
+
+	// Seams for testing; set by newVoiceGate to call the real backend.
+	embedFn  func(ctx context.Context, wavPath string) ([]float32, error)
+	verifyFn func(ctx context.Context, uttWav, refWav string) (bool, error)
+}
+
+// newVoiceGate builds a gate from a pipeline's voice_recognition config. It
+// validates fail-fast (before loading the model), loads the recognition model
+// config, wires the real backend seams, and pre-embeds references for verify
+// mode so per-turn cost is one utterance embed plus cheap cosine comparisons.
+func newVoiceGate(
+	cfg config.PipelineVoiceRecognition,
+	cl *config.ModelConfigLoader,
+	ml *model.ModelLoader,
+	appConfig *config.ApplicationConfig,
+	registry voicerecognition.Registry,
+) (*voiceGate, error) {
+	cfg.Normalize()
+	if err := cfg.Validate(registry != nil); err != nil {
+		return nil, err
+	}
+
+	recCfg, err := cl.LoadModelConfigFileByName(cfg.Model, ml.ModelPath)
+	if err != nil {
+		return nil, fmt.Errorf("voice_recognition: failed to load model %q: %w", cfg.Model, err)
+	}
+	if valid, _ := recCfg.Validate(); !valid {
+		return nil, fmt.Errorf("voice_recognition: invalid model config %q", cfg.Model)
+	}
+
+	g := &voiceGate{
+		cfg:      cfg,
+		registry: registry,
+		embedFn: func(ctx context.Context, wavPath string) ([]float32, error) {
+			res, err := backend.VoiceEmbed(ctx, wavPath, ml, appConfig, *recCfg)
+			if err != nil {
+				return nil, err
+			}
+			return res.Embedding, nil
+		},
+		verifyFn: func(ctx context.Context, uttWav, refWav string) (bool, error) {
+			res, err := backend.VoiceVerify(ctx, uttWav, refWav, cfg.Threshold, true, ml, appConfig, *recCfg)
+			if err != nil {
+				return false, err
+			}
+			return res.Verified, nil
+		},
+	}
+
+	if cfg.Mode == config.VoiceGateModeVerify {
+		if cfg.AntiSpoofing {
+			g.refAudios = cfg.References
+		} else {
+			for _, r := range cfg.References {
+				emb, err := g.embedFn(context.Background(), r.Audio)
+				if err != nil {
+					return nil, fmt.Errorf("voice_recognition: failed to embed reference %q: %w", r.Name, err)
+				}
+				g.refEmbeds = append(g.refEmbeds, namedEmbedding{name: r.Name, emb: emb})
+			}
+		}
+	}
+
+	return g, nil
+}
+
+// Authorize embeds the utterance and decides allow/deny.
+//
+//	allowed: speaker is authorized.
+//	matched: matched person's name (informational), empty if none.
+//	reason:  human-readable deny reason.
+//	err:     backend failure (caller should fail closed).
+func (g *voiceGate) Authorize(ctx context.Context, wavPath string) (allowed bool, matched string, reason string, err error) {
+	if g.cfg.Mode == config.VoiceGateModeVerify {
+		return g.authorizeVerify(ctx, wavPath)
+	}
+	return g.authorizeIdentify(ctx, wavPath)
+}
+
+func (g *voiceGate) authorizeIdentify(ctx context.Context, wavPath string) (bool, string, string, error) {
+	emb, err := g.embedFn(ctx, wavPath)
+	if err != nil {
+		return false, "", "embed failed", err
+	}
+	if len(emb) == 0 {
+		return false, "", "no speech detected", nil
+	}
+	matches, err := g.registry.Identify(ctx, emb, 1)
+	if err != nil {
+		return false, "", "identify failed", err
+	}
+	if len(matches) == 0 {
+		return false, "", "unknown speaker", nil
+	}
+	m := matches[0]
+	if m.Distance > g.cfg.Threshold {
+		return false, m.Metadata.Name, "distance above threshold", nil
+	}
+	if !g.allowMatch(m.Metadata) {
+		return false, m.Metadata.Name, "speaker not in allow list", nil
+	}
+	return true, m.Metadata.Name, "", nil
+}
+
+// allowMatch reports whether a matched identity is authorized. An empty allow
+// (no names and no labels) authorizes any registered speaker.
+func (g *voiceGate) allowMatch(meta voicerecognition.Metadata) bool {
+	a := g.cfg.Allow
+	if len(a.Names) == 0 && len(a.Labels) == 0 {
+		return true
+	}
+	for _, n := range a.Names {
+		if n == meta.Name {
+			return true
+		}
+	}
+	for _, l := range a.Labels {
+		if _, ok := meta.Labels[l]; ok {
+			return true
+		}
+	}
+	return false
+}
+
+func (g *voiceGate) authorizeVerify(ctx context.Context, wavPath string) (bool, string, string, error) {
+	if g.cfg.AntiSpoofing {
+		for _, r := range g.refAudios {
+			ok, err := g.verifyFn(ctx, wavPath, r.Audio)
+			if err != nil {
+				return false, "", "verify failed", err
+			}
+			if ok {
+				return true, r.Name, "", nil
+			}
+		}
+		return false, "", "no reference matched", nil
+	}
+
+	emb, err := g.embedFn(ctx, wavPath)
+	if err != nil {
+		return false, "", "embed failed", err
+	}
+	if len(emb) == 0 {
+		return false, "", "no speech detected", nil
+	}
+	for _, r := range g.refEmbeds {
+		if cosineDistance(emb, r.emb) <= g.cfg.Threshold {
+			return true, r.name, "", nil
+		}
+	}
+	return false, "", "no reference matched", nil
+}
+
+// decide interprets an Authorize result against the gate's when-policy and the
+// session's prior verification state.
+//   proceed:      run the LLM response for this utterance.
+//   markVerified: record a successful first-utterance verification.
+// Note: when:first AND alreadyVerified is normally handled by the caller
+// skipping Authorize entirely; if it still reaches here, proceed is true.
+func (g *voiceGate) decide(alreadyVerified, allowed bool) (proceed, markVerified bool) {
+	if g.cfg.When == config.VoiceGateWhenFirst {
+		if alreadyVerified {
+			return true, false
+		}
+		return allowed, allowed
+	}
+	return allowed, false
+}
+
+// cosineDistance returns 1 - cosine_similarity, matching the voice registry's
+// distance convention (lower = closer). Returns 1 (treated as "no match") for
+// zero-length, mismatched, or zero-magnitude vectors.
+func cosineDistance(a, b []float32) float32 {
+	if len(a) == 0 || len(a) != len(b) {
+		return 1
+	}
+	var dot, na, nb float64
+	for i := range a {
+		dot += float64(a[i]) * float64(b[i])
+		na += float64(a[i]) * float64(a[i])
+		nb += float64(b[i]) * float64(b[i])
+	}
+	if na == 0 || nb == 0 {
+		return 1
+	}
+	return float32(1 - dot/(math.Sqrt(na)*math.Sqrt(nb)))
+}
diff --git a/core/http/endpoints/openai/realtime_voicegate_integration_test.go b/core/http/endpoints/openai/realtime_voicegate_integration_test.go
new file mode 100644
index 000000000..f8aae72c5
--- /dev/null
+++ b/core/http/endpoints/openai/realtime_voicegate_integration_test.go
@@ -0,0 +1,154 @@
+package openai
+
+import (
+	"context"
+	"errors"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	"github.com/mudler/LocalAI/core/backend"
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
+	"github.com/mudler/LocalAI/core/schema"
+	"github.com/mudler/LocalAI/core/services/voicerecognition"
+)
+
+// These specs drive the REAL commitUtterance path end to end (gate goroutine,
+// the hard join before the LLM, the reject event, and when:first session
+// trust) using the existing fakeTransport/fakeModel doubles. They are the
+// integration counterpart to the unit specs in realtime_voicegate_test.go:
+// here the gate is wired into a Session exactly as runRealtimeSession wires it.
+
+// itGate builds an identify-mode gate whose registry always returns a single
+// match named matchName, and whose embedFn returns embed/embErr. allowName is
+// the authorized identity. when/onReject select the policy.
+func itGate(allowName, matchName string, embed []float32, embErr error, when, onReject string) *voiceGate {
+	return &voiceGate{
+		cfg: config.PipelineVoiceRecognition{
+			Mode:      config.VoiceGateModeIdentify,
+			Threshold: 0.25,
+			When:      when,
+			OnReject:  onReject,
+			Allow:     config.VoiceRecognitionAllow{Names: []string{allowName}},
+		},
+		registry: &fakeRegistry{matches: []voicerecognition.Match{
+			{Distance: 0.1, Metadata: voicerecognition.Metadata{Name: matchName}},
+		}},
+		embedFn: func(context.Context, string) ([]float32, error) { return embed, embErr },
+	}
+}
+
+// itSession returns a Session + fakeModel wired for a full pipeline turn, with
+// the given gate attached. The fakeModel mirrors the streaming-LLM setup used
+// by realtime_stream_test.go so triggerResponse runs to a response.done.
+func itSession(gate *voiceGate) (*Session, *fakeModel) {
+	on := true
+	m := &fakeModel{
+		cfg:             &config.ModelConfig{},
+		transcribeFinal: &schema.TranscriptionResult{Text: "hello"},
+		predictTokens:   []string{"Hi", " there."},
+		predictResp:     backend.LLMResponse{Response: "Hi there."},
+		ttsStreamChunks: [][]byte{{1}},
+		ttsStreamRate:   24000,
+	}
+	session := &Session{
+		OutputSampleRate:        24000,
+		InputAudioTranscription: &types.AudioTranscription{},
+		ModelInterface:          m,
+		ModelConfig: &config.ModelConfig{
+			Pipeline: config.Pipeline{Streaming: config.PipelineStreaming{LLM: &on, TTS: &on}},
+		},
+		voiceGate: gate,
+	}
+	return session, m
+}
+
+// hasSpeakerNotAuthorized reports whether a speaker_not_authorized error event
+// was emitted to the client.
+func hasSpeakerNotAuthorized(tr *fakeTransport) bool {
+	for _, e := range tr.events {
+		if ev, ok := e.(types.ErrorEvent); ok && ev.Error.Code == "speaker_not_authorized" {
+			return true
+		}
+	}
+	return false
+}
+
+var _ = Describe("realtime voice gate integration (commitUtterance)", func() {
+	utt := make([]byte, 32) // non-empty PCM so commitUtterance proceeds
+
+	It("allows an authorized speaker through to a full response", func() {
+		session, _ := itSession(itGate("alice", "alice", []float32{1, 0, 0}, nil,
+			config.VoiceGateWhenEvery, config.VoiceGateRejectEvent))
+		tr := &fakeTransport{}
+
+		commitUtterance(context.Background(), utt, session, &Conversation{}, tr)
+
+		Expect(hasSpeakerNotAuthorized(tr)).To(BeFalse())
+		// The LLM/TTS pipeline ran to completion.
+		Expect(tr.countEvents(types.ServerEventTypeResponseDone)).To(BeNumerically(">=", 1))
+		// Transcription still happened (parallel with the gate).
+		Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionCompleted)).To(Equal(1))
+	})
+
+	It("drops an unauthorized speaker before the LLM and emits a reject event", func() {
+		// match name "mallory" is not in the allow list → deny.
+		session, _ := itSession(itGate("alice", "mallory", []float32{1, 0, 0}, nil,
+			config.VoiceGateWhenEvery, config.VoiceGateRejectEvent))
+		tr := &fakeTransport{}
+
+		commitUtterance(context.Background(), utt, session, &Conversation{}, tr)
+
+		// Hard barrier: the LLM/TTS pipeline never ran.
+		Expect(tr.countEvents(types.ServerEventTypeResponseDone)).To(Equal(0))
+		// The client was told why.
+		Expect(hasSpeakerNotAuthorized(tr)).To(BeTrue())
+		// Transcription of the rejected utterance still emitted (sent only to the
+		// peer that produced the audio; reveals nothing new).
+		Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionCompleted)).To(Equal(1))
+	})
+
+	It("fails closed on a gate backend error", func() {
+		session, _ := itSession(itGate("alice", "alice", nil, errors.New("backend down"),
+			config.VoiceGateWhenEvery, config.VoiceGateRejectEvent))
+		tr := &fakeTransport{}
+
+		commitUtterance(context.Background(), utt, session, &Conversation{}, tr)
+
+		Expect(tr.countEvents(types.ServerEventTypeResponseDone)).To(Equal(0))
+		Expect(hasSpeakerNotAuthorized(tr)).To(BeTrue())
+	})
+
+	It("drops silently when on_reject is drop_silent (no error event)", func() {
+		session, _ := itSession(itGate("alice", "mallory", []float32{1, 0, 0}, nil,
+			config.VoiceGateWhenEvery, config.VoiceGateRejectSilent))
+		tr := &fakeTransport{}
+
+		commitUtterance(context.Background(), utt, session, &Conversation{}, tr)
+
+		Expect(tr.countEvents(types.ServerEventTypeResponseDone)).To(Equal(0))
+		Expect(hasSpeakerNotAuthorized(tr)).To(BeFalse())
+	})
+
+	It("when:first trusts the session after one match, even if later embeds fail", func() {
+		gate := itGate("alice", "alice", []float32{1, 0, 0}, nil,
+			config.VoiceGateWhenFirst, config.VoiceGateRejectEvent)
+		session, _ := itSession(gate)
+
+		// First utterance: authorized, marks the session verified.
+		tr1 := &fakeTransport{}
+		commitUtterance(context.Background(), utt, session, &Conversation{}, tr1)
+		Expect(hasSpeakerNotAuthorized(tr1)).To(BeFalse())
+		Expect(tr1.countEvents(types.ServerEventTypeResponseDone)).To(BeNumerically(">=", 1))
+
+		// Break the gate: any further Authorize would now error.
+		gate.embedFn = func(context.Context, string) ([]float32, error) { return nil, errors.New("boom") }
+
+		// Second utterance still proceeds because when:first skips re-verification.
+		tr2 := &fakeTransport{}
+		commitUtterance(context.Background(), utt, session, &Conversation{}, tr2)
+		Expect(hasSpeakerNotAuthorized(tr2)).To(BeFalse())
+		Expect(tr2.countEvents(types.ServerEventTypeResponseDone)).To(BeNumerically(">=", 1))
+	})
+})
diff --git a/core/http/endpoints/openai/realtime_voicegate_test.go b/core/http/endpoints/openai/realtime_voicegate_test.go
new file mode 100644
index 000000000..bdbbc2f4a
--- /dev/null
+++ b/core/http/endpoints/openai/realtime_voicegate_test.go
@@ -0,0 +1,231 @@
+package openai
+
+import (
+	"context"
+	"errors"
+
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/services/voicerecognition"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("cosineDistance", func() {
+	It("is 0 for identical vectors", func() {
+		Expect(cosineDistance([]float32{1, 0, 0}, []float32{1, 0, 0})).To(BeNumerically("~", 0, 1e-6))
+	})
+	It("is ~1 for orthogonal vectors", func() {
+		Expect(cosineDistance([]float32{1, 0}, []float32{0, 1})).To(BeNumerically("~", 1, 1e-6))
+	})
+	It("is ~2 for opposite vectors", func() {
+		Expect(cosineDistance([]float32{1, 0}, []float32{-1, 0})).To(BeNumerically("~", 2, 1e-6))
+	})
+	It("returns 1 for length mismatch", func() {
+		Expect(cosineDistance([]float32{1, 0}, []float32{1})).To(BeNumerically("~", 1, 1e-6))
+	})
+	It("returns 1 for a zero vector", func() {
+		Expect(cosineDistance([]float32{0, 0}, []float32{1, 0})).To(BeNumerically("~", 1, 1e-6))
+	})
+})
+
+type fakeRegistry struct {
+	matches []voicerecognition.Match
+	err     error
+}
+
+func (f *fakeRegistry) Register(ctx context.Context, emb []float32, m voicerecognition.Metadata) (voicerecognition.Metadata, error) {
+	return m, nil
+}
+func (f *fakeRegistry) Identify(ctx context.Context, probe []float32, topK int) ([]voicerecognition.Match, error) {
+	return f.matches, f.err
+}
+func (f *fakeRegistry) Forget(ctx context.Context, id string) error { return nil }
+
+var _ = Describe("voiceGate identify mode", func() {
+	stubEmbed := func(emb []float32, err error) func(context.Context, string) ([]float32, error) {
+		return func(context.Context, string) ([]float32, error) { return emb, err }
+	}
+	mkGate := func(allow config.VoiceRecognitionAllow, matches []voicerecognition.Match, embErr error) *voiceGate {
+		return &voiceGate{
+			cfg:      config.PipelineVoiceRecognition{Mode: config.VoiceGateModeIdentify, Threshold: 0.25, When: config.VoiceGateWhenEvery, OnReject: config.VoiceGateRejectEvent, Allow: allow},
+			registry: &fakeRegistry{matches: matches},
+			embedFn:  stubEmbed([]float32{1, 0, 0}, embErr),
+		}
+	}
+
+	It("allows a registered speaker within threshold and in the allow list", func() {
+		g := mkGate(config.VoiceRecognitionAllow{Names: []string{"alice"}},
+			[]voicerecognition.Match{{Distance: 0.1, Metadata: voicerecognition.Metadata{Name: "alice"}}}, nil)
+		allowed, matched, _, err := g.Authorize(context.Background(), "x.wav")
+		Expect(err).ToNot(HaveOccurred())
+		Expect(allowed).To(BeTrue())
+		Expect(matched).To(Equal("alice"))
+	})
+	It("allows any registered speaker when the allow list is empty", func() {
+		g := mkGate(config.VoiceRecognitionAllow{},
+			[]voicerecognition.Match{{Distance: 0.1, Metadata: voicerecognition.Metadata{Name: "carol"}}}, nil)
+		allowed, _, _, _ := g.Authorize(context.Background(), "x.wav")
+		Expect(allowed).To(BeTrue())
+	})
+	It("allows by label", func() {
+		g := mkGate(config.VoiceRecognitionAllow{Labels: []string{"family"}},
+			[]voicerecognition.Match{{Distance: 0.1, Metadata: voicerecognition.Metadata{Name: "bob", Labels: map[string]string{"family": "yes"}}}}, nil)
+		allowed, _, _, _ := g.Authorize(context.Background(), "x.wav")
+		Expect(allowed).To(BeTrue())
+	})
+	It("denies a speaker not in the allow list", func() {
+		g := mkGate(config.VoiceRecognitionAllow{Names: []string{"alice"}},
+			[]voicerecognition.Match{{Distance: 0.1, Metadata: voicerecognition.Metadata{Name: "mallory"}}}, nil)
+		allowed, matched, reason, _ := g.Authorize(context.Background(), "x.wav")
+		Expect(allowed).To(BeFalse())
+		Expect(matched).To(Equal("mallory"))
+		Expect(reason).To(ContainSubstring("allow"))
+	})
+	It("denies a match above the threshold", func() {
+		g := mkGate(config.VoiceRecognitionAllow{},
+			[]voicerecognition.Match{{Distance: 0.9, Metadata: voicerecognition.Metadata{Name: "alice"}}}, nil)
+		allowed, matched, _, _ := g.Authorize(context.Background(), "x.wav")
+		Expect(allowed).To(BeFalse())
+		Expect(matched).To(Equal("alice"))
+	})
+	It("denies when no registry match", func() {
+		g := mkGate(config.VoiceRecognitionAllow{}, nil, nil)
+		allowed, _, reason, _ := g.Authorize(context.Background(), "x.wav")
+		Expect(allowed).To(BeFalse())
+		Expect(reason).To(ContainSubstring("unknown"))
+	})
+	It("denies (no error) when no speech is detected", func() {
+		g := mkGate(config.VoiceRecognitionAllow{}, nil, nil)
+		g.embedFn = stubEmbed(nil, nil)
+		allowed, _, reason, err := g.Authorize(context.Background(), "x.wav")
+		Expect(err).ToNot(HaveOccurred())
+		Expect(allowed).To(BeFalse())
+		Expect(reason).To(ContainSubstring("no speech"))
+	})
+	It("denies and surfaces the error when embedding fails", func() {
+		g := mkGate(config.VoiceRecognitionAllow{}, nil, errors.New("boom"))
+		allowed, _, reason, err := g.Authorize(context.Background(), "x.wav")
+		Expect(err).To(HaveOccurred())
+		Expect(allowed).To(BeFalse())
+		Expect(reason).To(ContainSubstring("embed"))
+	})
+	It("denies and surfaces the error when identify fails", func() {
+		g := mkGate(config.VoiceRecognitionAllow{}, nil, nil)
+		g.registry = &fakeRegistry{err: errors.New("boom")}
+		allowed, _, _, err := g.Authorize(context.Background(), "x.wav")
+		Expect(err).To(HaveOccurred())
+		Expect(allowed).To(BeFalse())
+	})
+})
+
+var _ = Describe("voiceGate verify mode", func() {
+	It("allows when the utterance matches a reference embedding", func() {
+		g := &voiceGate{
+			cfg:       config.PipelineVoiceRecognition{Mode: config.VoiceGateModeVerify, Threshold: 0.25, When: config.VoiceGateWhenEvery, OnReject: config.VoiceGateRejectEvent},
+			refEmbeds: []namedEmbedding{{name: "alice", emb: []float32{1, 0, 0}}},
+			embedFn:   func(context.Context, string) ([]float32, error) { return []float32{1, 0, 0}, nil },
+		}
+		allowed, matched, _, err := g.Authorize(context.Background(), "x.wav")
+		Expect(err).ToNot(HaveOccurred())
+		Expect(allowed).To(BeTrue())
+		Expect(matched).To(Equal("alice"))
+	})
+	It("denies when no reference is within threshold", func() {
+		g := &voiceGate{
+			cfg:       config.PipelineVoiceRecognition{Mode: config.VoiceGateModeVerify, Threshold: 0.25, When: config.VoiceGateWhenEvery, OnReject: config.VoiceGateRejectEvent},
+			refEmbeds: []namedEmbedding{{name: "alice", emb: []float32{1, 0, 0}}},
+			embedFn:   func(context.Context, string) ([]float32, error) { return []float32{0, 1, 0}, nil },
+		}
+		allowed, _, reason, _ := g.Authorize(context.Background(), "x.wav")
+		Expect(allowed).To(BeFalse())
+		Expect(reason).To(ContainSubstring("reference"))
+	})
+	It("denies (no error) when no speech is detected", func() {
+		g := &voiceGate{
+			cfg:       config.PipelineVoiceRecognition{Mode: config.VoiceGateModeVerify, Threshold: 0.25, When: config.VoiceGateWhenEvery, OnReject: config.VoiceGateRejectEvent},
+			refEmbeds: []namedEmbedding{{name: "alice", emb: []float32{1, 0, 0}}},
+			embedFn:   func(context.Context, string) ([]float32, error) { return nil, nil },
+		}
+		allowed, _, reason, err := g.Authorize(context.Background(), "x.wav")
+		Expect(err).ToNot(HaveOccurred())
+		Expect(allowed).To(BeFalse())
+		Expect(reason).To(ContainSubstring("no speech"))
+	})
+	It("denies and surfaces the error when embedding fails", func() {
+		g := &voiceGate{
+			cfg:       config.PipelineVoiceRecognition{Mode: config.VoiceGateModeVerify, Threshold: 0.25, When: config.VoiceGateWhenEvery, OnReject: config.VoiceGateRejectEvent},
+			refEmbeds: []namedEmbedding{{name: "alice", emb: []float32{1, 0, 0}}},
+			embedFn:   func(context.Context, string) ([]float32, error) { return nil, errors.New("boom") },
+		}
+		allowed, _, _, err := g.Authorize(context.Background(), "x.wav")
+		Expect(err).To(HaveOccurred())
+		Expect(allowed).To(BeFalse())
+	})
+	It("uses verifyFn when anti-spoofing is enabled", func() {
+		called := false
+		g := &voiceGate{
+			cfg:       config.PipelineVoiceRecognition{Mode: config.VoiceGateModeVerify, Threshold: 0.25, AntiSpoofing: true, When: config.VoiceGateWhenEvery, OnReject: config.VoiceGateRejectEvent},
+			refAudios: []config.VoiceReference{{Name: "alice", Audio: "/alice.wav"}},
+			verifyFn:  func(context.Context, string, string) (bool, error) { called = true; return true, nil },
+		}
+		allowed, matched, _, err := g.Authorize(context.Background(), "x.wav")
+		Expect(err).ToNot(HaveOccurred())
+		Expect(called).To(BeTrue())
+		Expect(allowed).To(BeTrue())
+		Expect(matched).To(Equal("alice"))
+	})
+	It("denies and surfaces the error when verifyFn fails (anti-spoofing)", func() {
+		g := &voiceGate{
+			cfg:       config.PipelineVoiceRecognition{Mode: config.VoiceGateModeVerify, Threshold: 0.25, AntiSpoofing: true, When: config.VoiceGateWhenEvery, OnReject: config.VoiceGateRejectEvent},
+			refAudios: []config.VoiceReference{{Name: "alice", Audio: "/alice.wav"}},
+			verifyFn:  func(context.Context, string, string) (bool, error) { return false, errors.New("boom") },
+		}
+		allowed, _, _, err := g.Authorize(context.Background(), "x.wav")
+		Expect(err).To(HaveOccurred())
+		Expect(allowed).To(BeFalse())
+	})
+})
+
+var _ = Describe("newVoiceGate", func() {
+	It("fails fast when identify mode has no registry (before touching the loader)", func() {
+		cfg := config.PipelineVoiceRecognition{Model: "spk", Mode: config.VoiceGateModeIdentify}
+		g, err := newVoiceGate(cfg, nil, nil, nil, nil)
+		Expect(err).To(HaveOccurred())
+		Expect(g).To(BeNil())
+	})
+	It("fails fast when verify mode has no references", func() {
+		cfg := config.PipelineVoiceRecognition{Model: "spk", Mode: config.VoiceGateModeVerify}
+		g, err := newVoiceGate(cfg, nil, nil, nil, nil)
+		Expect(err).To(HaveOccurred())
+		Expect(g).To(BeNil())
+	})
+})
+
+var _ = Describe("voiceGate decide", func() {
+	gate := func(when string) *voiceGate {
+		return &voiceGate{cfg: config.PipelineVoiceRecognition{When: when}}
+	}
+	It("every: proceeds iff allowed, never marks verified", func() {
+		proceed, mark := gate(config.VoiceGateWhenEvery).decide(false, true)
+		Expect(proceed).To(BeTrue())
+		Expect(mark).To(BeFalse())
+		proceed, mark = gate(config.VoiceGateWhenEvery).decide(false, false)
+		Expect(proceed).To(BeFalse())
+		Expect(mark).To(BeFalse())
+	})
+	It("first: marks verified on first allow", func() {
+		proceed, mark := gate(config.VoiceGateWhenFirst).decide(false, true)
+		Expect(proceed).To(BeTrue())
+		Expect(mark).To(BeTrue())
+	})
+	It("first: denies on first reject without marking", func() {
+		proceed, mark := gate(config.VoiceGateWhenFirst).decide(false, false)
+		Expect(proceed).To(BeFalse())
+		Expect(mark).To(BeFalse())
+	})
+	It("first: proceeds without re-check once already verified", func() {
+		proceed, mark := gate(config.VoiceGateWhenFirst).decide(true, false)
+		Expect(proceed).To(BeTrue())
+		Expect(mark).To(BeFalse())
+	})
+})
diff --git a/docs/content/features/openai-realtime.md b/docs/content/features/openai-realtime.md
index d8ae1212b..51f8960bf 100644
--- a/docs/content/features/openai-realtime.md
+++ b/docs/content/features/openai-realtime.md
@@ -137,6 +137,52 @@ most reliable fix for WebRTC connections that establish and then drop.
 
 The API follows the OpenAI Realtime API protocol for handling sessions, audio buffers, and conversation items.
 
+## Gating a realtime pipeline with voice recognition
+
+A pipeline realtime model can require speaker verification before it responds. Add a `voice_recognition` block under `pipeline`. When present, each committed utterance is verified against authorized speakers; unauthorized utterances are dropped before the LLM runs (no LLM call, no tool execution, no TTS). The session stays open.
+
+```yaml
+name: my-realtime
+pipeline:
+  vad: silero-vad
+  transcription: whisper
+  llm: qwen
+  tts: kokoro
+  voice_recognition:
+    model: speaker-recognition   # the speaker-recognition backend model
+    mode: identify               # "identify" (registry) or "verify" (references)
+    threshold: 0.25              # cosine distance; <= passes
+    when: every                  # "every" (default) or "first"
+    on_reject: drop_event        # "drop_event" (default) or "drop_silent"
+    anti_spoofing: false         # optional liveness check (verify mode)
+
+    # identify mode: authorized registry identities (multiple persons)
+    allow:
+      names: ["alice", "bob"]    # match registered speaker names
+      labels: ["family"]         # OR any identity carrying this label
+      # empty allow = any registered speaker within threshold passes
+
+    # verify mode: reference speakers (multiple persons)
+    references:
+      - name: alice
+        audio: /models/voices/alice.wav
+      - name: bob
+        audio: /models/voices/bob.wav
+```
+
+| Field | Meaning |
+|-------|---------|
+| `model` | Speaker-recognition backend model name. |
+| `mode` | `identify` matches against speakers registered via `/v1/voice/register`; `verify` matches against the `references` audios. |
+| `threshold` | Maximum cosine distance that still counts as a match (default ~0.25). |
+| `when` | `every` verifies each utterance; `first` verifies once then trusts the session. |
+| `on_reject` | `drop_event` drops and emits a `speaker_not_authorized` error event; `drop_silent` drops quietly. |
+| `anti_spoofing` | Verify mode only: runs the backend liveness check (slower). |
+| `allow.names` / `allow.labels` | identify mode: which registry identities are authorized. Empty = any registered speaker. |
+| `references` | verify mode: authorized reference speakers; the utterance passes if it matches any. |
+
+`identify` mode requires the voice registry (speakers registered through `/v1/voice/register`). `verify` mode needs no registry: reference audios are embedded once at model load.
+
 ## Examples
 
 - [Realtime voice assistant demo (Go)](https://github.com/localai-org/localai-realtime-demo): a minimal Go client for the Realtime (WebSocket) API with a full talk-back voice loop and an example tool call. Ships a `docker compose` setup that brings up a realtime-capable LocalAI for you.