diff --git a/.githooks/pre-commit b/.githooks/pre-commit
index c09f68772..025ecfafd 100755
--- a/.githooks/pre-commit
+++ b/.githooks/pre-commit
@@ -7,8 +7,11 @@
 # Runs only the checks relevant to what's staged:
 #   - Go files          -> make lint + make test-coverage-check
 #   - core/http/react-ui -> make test-ui-coverage-check (Playwright e2e + gate)
-# A commit touching neither is skipped entirely (docs/YAML/etc. can't change
-# lint findings, Go coverage, or the UI).
+#   - realtime state machines / specs -> make test-realtime-conformance
+#       (openai/{coordinator,respcoord,turncoord,conncoord,compactcoord,ttscoord}/**
+#        or formal-verification/** -- a pure .fizz spec edit must still re-verify the design, so it is detected separately from Go)
+# A commit touching none of these is skipped entirely (other docs/YAML can't
+# change lint findings, Go coverage, the UI, or the realtime conformance gate).
 #
 # To bypass for a single commit (e.g. a WIP checkpoint): git commit --no-verify
 set -eu
@@ -20,11 +23,13 @@ staged="$(git diff --cached --name-only --diff-filter=ACMRD)"
 
 go_changed=0
 ui_changed=0
+rt_changed=0
 if echo "$staged" | grep -qE '\.go$'; then go_changed=1; fi
 if echo "$staged" | grep -qE '^core/http/react-ui/'; then ui_changed=1; fi
+if echo "$staged" | grep -qE '^(core/http/endpoints/openai/(coordinator|respcoord|turncoord|conncoord|compactcoord|ttscoord)/|formal-verification/)'; then rt_changed=1; fi
 
-if [ "$go_changed" -eq 0 ] && [ "$ui_changed" -eq 0 ]; then
-	echo "pre-commit: no Go or React UI changes staged — skipping."
+if [ "$go_changed" -eq 0 ] && [ "$ui_changed" -eq 0 ] && [ "$rt_changed" -eq 0 ]; then
+	echo "pre-commit: no Go, React UI, or realtime-spec changes staged — skipping."
 	exit 0
 fi
 
@@ -57,4 +62,11 @@ if [ "$ui_changed" -eq 1 ]; then
 	make test-ui-coverage-check
 fi
 
+if [ "$rt_changed" -eq 1 ]; then
+	echo "pre-commit ▶ realtime state-machine conformance (make test-realtime-conformance) —"
+	echo "             Go transition/rapid tests under -race + FizzBee model check of the"
+	echo "             authoritative specs. Fail-closed: needs FizzBee (make install-fizzbee)."
+	make test-realtime-conformance
+fi
+
 echo "pre-commit ✓ all relevant checks passed"
diff --git a/.github/workflows/realtime-conformance.yml b/.github/workflows/realtime-conformance.yml
new file mode 100644
index 000000000..c844a3003
--- /dev/null
+++ b/.github/workflows/realtime-conformance.yml
@@ -0,0 +1,69 @@
+---
+name: 'realtime-conformance'
+
+# Verifies the realtime state-machine implementations conform to their formal
+# designs (docs/design/realtime-state-machines.md, formal-verification/). BOTH
+# layers are enforced and the gate is fail-closed: the Go conformance layer
+# (respcoord + turncoord transition/rapid tests under -race) AND the FizzBee model check of
+# the authoritative specs. FizzBee is pinned + checksum-verified
+# (formal-verification/fizzbee.sha256), so a failed install fails the job rather
+# than silently skipping verification.
+
+on:
+  pull_request:
+    paths:
+      - 'core/http/endpoints/openai/coordinator/**'
+      - 'core/http/endpoints/openai/respcoord/**'
+      - 'core/http/endpoints/openai/turncoord/**'
+      - 'core/http/endpoints/openai/conncoord/**'
+      - 'core/http/endpoints/openai/compactcoord/**'
+      - 'core/http/endpoints/openai/ttscoord/**'
+      - 'formal-verification/**'
+      - 'scripts/realtime-conformance.sh'
+      - 'scripts/install-fizzbee.sh'
+      - '.github/workflows/realtime-conformance.yml'
+  push:
+    branches:
+      - master
+    paths:
+      - 'core/http/endpoints/openai/coordinator/**'
+      - 'core/http/endpoints/openai/respcoord/**'
+      - 'core/http/endpoints/openai/turncoord/**'
+      - 'core/http/endpoints/openai/conncoord/**'
+      - 'core/http/endpoints/openai/compactcoord/**'
+      - 'core/http/endpoints/openai/ttscoord/**'
+      - 'formal-verification/**'
+      - 'scripts/realtime-conformance.sh'
+
+concurrency:
+  group: realtime-conformance-${{ github.event.pull_request.number || github.sha }}-${{ github.repository }}
+  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
+
+jobs:
+  conformance:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        go-version: ['1.26.x']
+    steps:
+      - name: Clone
+        uses: actions/checkout@v7
+      - name: Setup Go ${{ matrix.go-version }}
+        uses: actions/setup-go@v5
+        with:
+          go-version: ${{ matrix.go-version }}
+          cache: false
+      - name: Cache FizzBee
+        uses: actions/cache@v4
+        with:
+          path: .tools/fizzbee
+          key: fizzbee-v0.5.2-${{ runner.os }}-${{ hashFiles('formal-verification/fizzbee.sha256') }}
+      - name: Install FizzBee (pinned, checksum-verified)
+        # No `|| true`: a failed/forged download must fail the job, not silently
+        # drop the design verification. install-fizzbee.sh is a no-op if the
+        # cached binary is already present and valid.
+        run: ./scripts/install-fizzbee.sh
+      - name: Run conformance gate (fail-closed)
+        # No skip env: both the Go conformance and the FizzBee model check are
+        # required. The gate auto-detects .tools/fizzbee/fizz.
+        run: make test-realtime-conformance
diff --git a/.gitignore b/.gitignore
index 91582c006..666b81df9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -97,3 +97,12 @@ core/http/react-ui/test-results/
 
 # Local Apple signing material (never commit)
 .certs/
+
+# Pinned dev tools (e.g. FizzBee for the realtime-conformance gate)
+.tools/
+
+# FizzBee model-check artifacts: the parser emits <spec>.json next to each
+# .fizz and the checker writes run dirs under out/. Both are regenerated by
+# the realtime-conformance gate; only the .fizz sources are authoritative.
+formal-verification/*.json
+formal-verification/out/
diff --git a/Makefile b/Makefile
index 2a8edc3fc..3e640a3b7 100644
--- a/Makefile
+++ b/Makefile
@@ -405,6 +405,18 @@ test-realtime: build-mock-backend
 	@echo 'Running realtime e2e tests (mock backend)'
 	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="Realtime && !real-models" --flake-attempts $(TEST_FLAKES) -v -r ./tests/e2e
 
+# Verify the realtime state-machine implementations conform to their formal
+# designs (Go transition/rapid tests under -race + FizzBee model check of the
+# authoritative specs). See docs/design/realtime-state-machines.md (Part 6) and
+# docs/design/specs/README.md.
+test-realtime-conformance:
+	GOCMD=$(GOCMD) ./scripts/realtime-conformance.sh
+
+# Install the pinned, checksum-verified FizzBee model checker (into .tools/,
+# gitignored) used by test-realtime-conformance. Idempotent; no-op if present.
+install-fizzbee:
+	./scripts/install-fizzbee.sh
+
 # Container-based real-model realtime testing. Build env vars / pipeline
 # definition kept here so test-realtime-models-docker can drive a fully wired
 # pipeline (VAD + STT + LLM + TTS) from inside a containerised runner.
@@ -1027,7 +1039,7 @@ test-extra-backend-whisper-transcription: docker-build-whisper
 ## is reachable.
 test-extra-backend-parakeet-cpp-transcription: docker-build-parakeet-cpp
 	BACKEND_IMAGE=local-ai-backend:parakeet-cpp \
-	BACKEND_TEST_MODEL_URL=https://huggingface.co/mudler/parakeet-cpp-gguf/resolve/main/tdt_ctc-110m-f16.gguf \
+	BACKEND_TEST_MODEL_URL=https://huggingface.co/mudler/parakeet-cpp-gguf/resolve/main/realtime_eou_120m-v1-f16.gguf \
 	BACKEND_TEST_AUDIO_URL=https://github.com/ggml-org/whisper.cpp/raw/master/samples/jfk.wav \
 	BACKEND_TEST_CAPS=health,load,transcription \
 	$(MAKE) test-extra-backend
diff --git a/backend/backend.proto b/backend/backend.proto
index 2a575426e..01c5b63a7 100644
--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -18,6 +18,18 @@ service Backend {
   rpc GenerateVideo(GenerateVideoRequest) returns (Result) {}
   rpc AudioTranscription(TranscriptRequest) returns (TranscriptResult) {}
   rpc AudioTranscriptionStream(TranscriptRequest) returns (stream TranscriptStreamResponse) {}
+  // AudioTranscriptionLive is the bidirectional live-microphone ASR RPC. The
+  // first message MUST carry a Config; subsequent messages carry Audio frames
+  // (mono float PCM at config.sample_rate, 16 kHz default). After a
+  // successful open the backend replies with a single ready ack
+  // (TranscriptLiveResponse{ready:true}); backends or models without
+  // cache-aware streaming support return UNIMPLEMENTED instead. Newly
+  // finalized text streams back as deltas; eou=true marks the model's
+  // end-of-utterance token. One stream spans many utterances (the decoder
+  // resets itself after each EOU). Closing the send side finalizes: the
+  // backend flushes the decoder tail and emits a terminal message carrying
+  // final_result. A second Config mid-stream resets the decode session.
+  rpc AudioTranscriptionLive(stream TranscriptLiveRequest) returns (stream TranscriptLiveResponse) {}
   rpc TTS(TTSRequest) returns (Result) {}
   rpc TTSStream(TTSRequest) returns (stream Reply) {}
   rpc SoundGeneration(SoundGenerationRequest) returns (Result) {}
@@ -479,6 +491,10 @@ message TranscriptResult {
   string text = 2;
   string language = 3;
   float duration = 4;
+  // True when the decode ended on the model's end-of-utterance token (<EOU>,
+  // emitted by cache-aware streaming models such as parakeet_realtime_eou_120m-v1);
+  // a trailing <EOB> backchannel does not set it. Both markers are stripped from text.
+  bool eou = 5;
 }
 
 message TranscriptStreamResponse {
@@ -486,6 +502,34 @@ message TranscriptStreamResponse {
   TranscriptResult final_result = 2;
 }
 
+// === AudioTranscriptionLive messages =====================================
+
+message TranscriptLiveRequest {
+  oneof payload {
+    TranscriptLiveConfig config = 1;
+    TranscriptLiveAudio  audio  = 2;
+  }
+}
+
+message TranscriptLiveConfig {
+  string language = 1;             // "" => model default
+  int32 sample_rate = 2;           // 0 => 16000; backends may reject others
+  map<string, string> params = 3;  // backend-specific tuning
+}
+
+message TranscriptLiveAudio {
+  repeated float pcm = 1;          // mono PCM in [-1,1] at config.sample_rate
+}
+
+message TranscriptLiveResponse {
+  bool ready = 1;                       // open ack: sent once, before any delta
+  string delta = 2;                     // newly-finalized text since previous response
+  bool eou = 3;                         // <EOU> fired during this feed (the user yielded the turn)
+  repeated TranscriptWord words = 4;    // words finalized by this feed (stream-relative ns)
+  TranscriptResult final_result = 5;    // terminal message only, after the send side closes
+  bool eob = 6;                         // <EOB> fired: a backchannel ("uh-huh") ended — NOT a turn boundary
+}
+
 message TranscriptWord {
   int64 start = 1;
   int64 end = 2;
diff --git a/backend/go/parakeet-cpp/boundary.go b/backend/go/parakeet-cpp/boundary.go
new file mode 100644
index 000000000..9c960cbc7
--- /dev/null
+++ b/backend/go/parakeet-cpp/boundary.go
@@ -0,0 +1,81 @@
+package main
+
+// utteranceBoundary is the single definition of a small state machine that was
+// previously open-coded three times — as a bare `finalEou` bool with an ad-hoc
+// toggle — in the live feed (live.go), the file-stream text path, and the
+// file-stream JSON path (goparakeetcpp.go).
+//
+// It answers one running question: does the decode currently rest on an
+// end-of-utterance boundary? That is the value a closing FinalResult reports as
+// .Eou and the realtime turn detector treats as a commit point.
+//
+// parakeet auto-resets its decoder after every <EOU>/<EOB>, so one streaming
+// session is a sequence of utterances and this is a LATCH, not a monotonic
+// flag: it closes on an <EOU> and reopens as soon as the next utterance starts.
+// (Contrast the realtime API's per-turn `eouSeen`, which only ever goes
+// false->true because each turn gets a fresh stream. Here the stream outlives
+// the turn, so the boundary status must be able to reopen.)
+//
+// The only transitions, over the events one streamFeedResult carries — an
+// <EOU>, an <EOB> (backchannel), or plain speech output (text and/or words):
+//
+//	            <EOU>
+//	   open ───────────► closed
+//	    ▲ ▲ │             │ │
+//	    │ └─┘ <EOB>|speech │ │ <EOU>
+//	    │   (stay open)    │ └─┘ (stay closed)
+//	    └──────────────────┘
+//	         <EOB>|speech
+//
+//	open   = NOT on an utterance boundary: mid-utterance, the last boundary was
+//	         a backchannel <EOB>, or the stream just began (the initial state).
+//	closed = the last meaningful event was an <EOU> with no later speech: a real
+//	         turn boundary.
+//
+// A feed that carries nothing (no eou/eob/text/words — e.g. a finalize flush
+// that produced no tail) is a no-op and leaves the state unchanged, matching
+// the legacy "leave finalEou as it was" behaviour.
+//
+// The state carries no data, so it is modelled as a two-valued type (a named
+// bool) rather than an int enum: every inhabitant is legal, so illegal states
+// are unrepresentable — the payload-free analog of the sealed sum types the
+// realtime machines use (those need interfaces because their states carry data,
+// e.g. Active{ID}, where "Active with no ID" is the illegal combination a scalar
+// cannot even express).
+type utteranceBoundary bool
+
+const (
+	// boundaryOpen is the zero value (false), so a fresh decode starts open —
+	// exactly the legacy `var finalEou bool` (false) initial condition.
+	boundaryOpen   utteranceBoundary = false
+	boundaryClosed utteranceBoundary = true
+)
+
+// observe folds one decode increment into the latch and returns the new state.
+//
+// <EOU> takes priority when a single feed carries both an <EOU> and speech
+// (e.g. {"text":"hello","eou":1}): the utterance both produced that text AND
+// ended, so the decode rests on the boundary. This matches the legacy
+// eou-checked-first ordering at every call site.
+func (b utteranceBoundary) observe(r streamFeedResult) utteranceBoundary {
+	switch {
+	case r.Eou:
+		return boundaryClosed
+	case r.Eob || r.Delta != "" || len(r.Words) > 0:
+		return boundaryOpen
+	default:
+		return b
+	}
+}
+
+// ended reports whether the decode currently rests on an end-of-utterance
+// boundary (a real <EOU>, not a backchannel <EOB>). This is what a closing
+// FinalResult carries as .Eou.
+func (b utteranceBoundary) ended() bool { return b == boundaryClosed }
+
+func (b utteranceBoundary) String() string {
+	if b == boundaryClosed {
+		return "closed"
+	}
+	return "open"
+}
diff --git a/backend/go/parakeet-cpp/boundary_test.go b/backend/go/parakeet-cpp/boundary_test.go
new file mode 100644
index 000000000..affd79bf0
--- /dev/null
+++ b/backend/go/parakeet-cpp/boundary_test.go
@@ -0,0 +1,92 @@
+package main
+
+import (
+	"math/rand/v2"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("utteranceBoundary (decode end-of-utterance latch)", func() {
+	It("starts open: a fresh decode is not on a boundary", func() {
+		var b utteranceBoundary
+		Expect(b).To(Equal(boundaryOpen))
+		Expect(b.ended()).To(BeFalse())
+	})
+
+	DescribeTable("single feed transition from the open state",
+		func(r streamFeedResult, wantEnded bool) {
+			Expect(boundaryOpen.observe(r).ended()).To(Equal(wantEnded))
+		},
+		Entry("<EOU> closes it", streamFeedResult{Eou: true}, true),
+		Entry("<EOU> with text closes it (eou wins)", streamFeedResult{Delta: "hi", Eou: true}, true),
+		Entry("<EOB> stays open (backchannel is not a turn boundary)", streamFeedResult{Eob: true}, false),
+		Entry("plain text stays open", streamFeedResult{Delta: "hello"}, false),
+		Entry("words-only stays open", streamFeedResult{Words: []transcriptWord{{W: "x"}}}, false),
+		Entry("empty feed is a no-op (stays open)", streamFeedResult{}, false),
+	)
+
+	DescribeTable("single feed transition from the closed state",
+		func(r streamFeedResult, wantEnded bool) {
+			Expect(boundaryClosed.observe(r).ended()).To(Equal(wantEnded))
+		},
+		Entry("another <EOU> stays closed", streamFeedResult{Eou: true}, true),
+		Entry("trailing speech reopens it", streamFeedResult{Delta: "and more"}, false),
+		Entry("words reopen it", streamFeedResult{Words: []transcriptWord{{W: "x"}}}, false),
+		Entry("a backchannel <EOB> reopens it", streamFeedResult{Eob: true}, false),
+		Entry("empty feed is a no-op (stays closed)", streamFeedResult{}, true),
+	)
+
+	It("is a latch: <EOU> then trailing speech reopens, then <EOU> closes again", func() {
+		b := boundaryOpen
+		b = b.observe(streamFeedResult{Delta: "turn one", Eou: true})
+		Expect(b.ended()).To(BeTrue())
+		b = b.observe(streamFeedResult{Delta: " and more"})
+		Expect(b.ended()).To(BeFalse(), "trailing speech without an EOU is an open utterance")
+		b = b.observe(streamFeedResult{Eou: true})
+		Expect(b.ended()).To(BeTrue())
+	})
+
+	It("treats a backchannel before a real EOU correctly", func() {
+		b := boundaryOpen
+		b = b.observe(streamFeedResult{Delta: "uh huh", Eob: true})
+		Expect(b.ended()).To(BeFalse(), "a backchannel must not masquerade as a turn boundary")
+		b = b.observe(streamFeedResult{Delta: "done", Eou: true})
+		Expect(b.ended()).To(BeTrue())
+	})
+
+	It("matches the reference fold over seeded random feed sequences", func() {
+		// The invariant: after any sequence of feeds, ended() is true iff the
+		// last feed that carried ANY event was an <EOU>. <EOU> takes priority
+		// when a feed carries both an EOU and speech; empty feeds are ignored.
+		for seed := uint64(1); seed <= 200; seed++ {
+			rng := rand.New(rand.NewPCG(seed, seed*2654435761))
+			b := boundaryOpen
+			lastWasEou := false // reference: did the last meaningful feed end on EOU?
+			steps := rng.IntN(30)
+			for i := 0; i < steps; i++ {
+				var r streamFeedResult
+				switch rng.IntN(5) {
+				case 0:
+					r = streamFeedResult{Eou: true}
+				case 1:
+					r = streamFeedResult{Eob: true}
+				case 2:
+					r = streamFeedResult{Delta: "w"}
+				case 3:
+					r = streamFeedResult{Delta: "w", Eou: true} // eou + speech, eou wins
+				case 4:
+					r = streamFeedResult{} // empty: no-op
+				}
+				b = b.observe(r)
+				if r.Eou {
+					lastWasEou = true
+				} else if r.Eob || r.Delta != "" || len(r.Words) > 0 {
+					lastWasEou = false
+				}
+			}
+			Expect(b.ended()).To(Equal(lastWasEou),
+				"seed %d: latch disagreed with the reference fold", seed)
+		}
+	})
+})
diff --git a/backend/go/parakeet-cpp/driver.go b/backend/go/parakeet-cpp/driver.go
new file mode 100644
index 000000000..cf832b165
--- /dev/null
+++ b/backend/go/parakeet-cpp/driver.go
@@ -0,0 +1,82 @@
+package main
+
+import (
+	"context"
+
+	"google.golang.org/grpc/codes"
+	"google.golang.org/grpc/status"
+)
+
+// streamFeedResult is one decode increment from a cache-aware streaming session:
+// the newly-finalized text plus the model's own per-feed boundary tokens
+// (<EOU>/<EOB>) and word timings. It is the single event type both the live
+// (bidi) and file (server-stream) paths fold over, hiding the ABI v4 JSON vs
+// older text-only entry-point split behind one shape.
+type streamFeedResult struct {
+	Delta string
+	Eou   bool
+	Eob   bool
+	Words []transcriptWord
+}
+
+// feedChunk feeds one PCM chunk to the streaming session (or finalizes it, when
+// finalize is true) and returns the unified decode increment. It prefers the
+// ABI v4 JSON entry points (which also carry per-word timestamps) and falls
+// back to the older text-only entry points against an older libparakeet.so.
+//
+// This is the one place the JSON-vs-text choice is made; every consumer works
+// in terms of streamFeedResult.
+func (p *ParakeetCpp) feedChunk(stream uintptr, pcm []float32, finalize bool) (streamFeedResult, error) {
+	if CppStreamFeedJSON != nil {
+		doc, err := p.streamFeedDoc(stream, pcm, finalize)
+		if err != nil {
+			return streamFeedResult{}, err
+		}
+		return streamFeedResult{Delta: doc.Text, Eou: doc.Eou != 0, Eob: doc.Eob != 0, Words: doc.Words}, nil
+	}
+	delta, eou, eob, err := p.streamFeedText(stream, pcm, finalize)
+	if err != nil {
+		return streamFeedResult{}, err
+	}
+	return streamFeedResult{Delta: delta, Eou: eou, Eob: eob}, nil
+}
+
+// feedSlices feeds pcm through the session in streamChunkSamples slices,
+// invoking onFeed for each decode increment. It does NOT finalize: callers
+// decide when the send side is done. The file path finalizes after the whole
+// file; the live path finalizes only when its request channel closes, never
+// between audio messages. Slicing keeps each per-call engineMu hold short so
+// concurrent unary transcription interleaves fairly (the C session buffers
+// internally).
+//
+// If ctx is non-nil it is checked before each slice so a cancelled file
+// transcription stops promptly; the live path passes nil (it is bounded by its
+// request channel instead of a ctx).
+func (p *ParakeetCpp) feedSlices(ctx context.Context, stream uintptr, pcm []float32, onFeed func(streamFeedResult) error) error {
+	for off := 0; off < len(pcm); off += streamChunkSamples {
+		if ctx != nil {
+			if err := ctx.Err(); err != nil {
+				return status.Error(codes.Canceled, "transcription cancelled")
+			}
+		}
+		end := min(off+streamChunkSamples, len(pcm))
+		res, err := p.feedChunk(stream, pcm[off:end], false)
+		if err != nil {
+			return err
+		}
+		if err := onFeed(res); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// flushTail finalizes the session once and folds the flushed tail (the last
+// ~2 encoder frames of text, which only appear on finalize) through onFeed.
+func (p *ParakeetCpp) flushTail(stream uintptr, onFeed func(streamFeedResult) error) error {
+	res, err := p.feedChunk(stream, nil, true)
+	if err != nil {
+		return err
+	}
+	return onFeed(res)
+}
diff --git a/backend/go/parakeet-cpp/goparakeetcpp.go b/backend/go/parakeet-cpp/goparakeetcpp.go
index e87409255..5e023e927 100644
--- a/backend/go/parakeet-cpp/goparakeetcpp.go
+++ b/backend/go/parakeet-cpp/goparakeetcpp.go
@@ -103,12 +103,13 @@ type transcriptJSON struct {
 //	{"text":"...","eou":0,"eob":0,"frame_sec":0.080000,
 //	 "words":[{"w":"...","start":0.480,"end":0.640,"conf":0.9100}, ...]}
 //
-// "text" is the newly-finalized text since the last call; "eou" is 1 when an
-// <EOU> (end of utterance) fired this feed and "eob" is 1 when an <EOB>
-// (backchannel) fired. ABI v4 conflated the two into "eou"; v5 split them, so
-// we read both and treat either as an utterance boundary for segmentation.
-// "words" are the words finalized this call with absolute (stream-relative)
-// start/end seconds.
+// "text" is the newly-finalized text since the last call. Under ABI v5 "eou"
+// is 1 iff an <EOU> fired this feed (the user yielded the turn) and "eob" 1
+// iff an <EOB> fired (a backchannel like "uh-huh" ended — NOT a turn
+// boundary). A v4 library has no "eob" field and its "eou" conflates both
+// tokens: Eob stays 0 and Eou keeps the old any-event meaning. "words" are
+// the words finalized this call with absolute (stream-relative) start/end
+// seconds.
 type streamFeedJSON struct {
 	Text     string           `json:"text"`
 	Eou      int              `json:"eou"`
@@ -364,7 +365,7 @@ var segmentSeparators = []rune{'.', '?', '!'}
 // the caller requested word granularity; token ids populate each segment's
 // Tokens by time-window membership. Shared by the batched and direct paths.
 func transcriptResultFromDoc(doc transcriptJSON, opts *pb.TranscriptRequest, gapFrames int) pb.TranscriptResult {
-	text := strings.TrimSpace(doc.Text)
+	text, eou := stripEouMarker(strings.TrimSpace(doc.Text))
 
 	// Frame-unit gap threshold -> seconds (NeMo segment_gap_threshold). 0 = off.
 	gapSeconds := 0.0
@@ -383,6 +384,7 @@ func transcriptResultFromDoc(doc transcriptJSON, opts *pb.TranscriptRequest, gap
 		return pb.TranscriptResult{
 			Text:     text,
 			Segments: []*pb.TranscriptSegment{{Id: 0, Text: text}},
+			Eou:      eou,
 		}
 	}
 
@@ -409,7 +411,25 @@ func transcriptResultFromDoc(doc transcriptJSON, opts *pb.TranscriptRequest, gap
 		}
 		segments = append(segments, seg)
 	}
-	return pb.TranscriptResult{Text: text, Segments: segments}
+	return pb.TranscriptResult{Text: text, Segments: segments, Eou: eou}
+}
+
+// stripEouMarker removes a trailing literal <EOU>/<EOB> from offline-decode
+// text and reports whether the decode ended on an end-of-UTTERANCE token. The
+// realtime EOU model's offline decode keeps the special token in the
+// detokenized text (the streaming path strips it and surfaces it as flags
+// instead); user-visible transcripts must never carry either marker, but only
+// <EOU> may confirm the semantic_vad retranscribe cross-check — a decode
+// ending on <EOB> means the last thing heard was a backchannel, not the user
+// yielding the turn.
+func stripEouMarker(text string) (string, bool) {
+	if strings.HasSuffix(text, "<EOU>") {
+		return strings.TrimSpace(strings.TrimSuffix(text, "<EOU>")), true
+	}
+	if strings.HasSuffix(text, "<EOB>") {
+		return strings.TrimSpace(strings.TrimSuffix(text, "<EOB>")), false
+	}
+	return text, false
 }
 
 // splitWordsIntoSegments groups words into segments exactly as NeMo's
@@ -476,41 +496,55 @@ func tokensInWindow(tokens []transcriptToken, start, end float64) []int32 {
 	return ids
 }
 
-// streamSegmenter accumulates streaming words into per-utterance segments. EOU
-// is the model's own utterance boundary; each closed segment takes its start/end
-// from its first/last accumulated word.
+// streamSegmenter accumulates streaming decode increments into per-utterance
+// segments. <EOU>/<EOB> are the model's own utterance boundaries; each closes a
+// segment. When the feed carries per-word timings (ABI v4 JSON), a closed
+// segment takes its start/end from its first/last word; against an older
+// text-only library (no words) it falls back to segmenting the delta text, so
+// the same assembler serves both paths.
 type streamSegmenter struct {
-	segs   []*pb.TranscriptSegment
-	cur    []transcriptWord
-	nextID int32
+	segs    []*pb.TranscriptSegment
+	cur     []transcriptWord // words for the open segment (ABI v4 JSON path)
+	curText []string         // delta text for the open segment (text-only path)
+	nextID  int32
 }
 
-func (s *streamSegmenter) add(doc streamFeedJSON) {
-	s.cur = append(s.cur, doc.Words...)
-	// Close the segment on either turn signal: <EOU> (end of utterance) or
-	// <EOB> (backchannel). ABI v4 reported both via "eou"; v5 split them, so we
-	// OR them here to keep the v4 segmentation boundaries.
-	if doc.Eou != 0 || doc.Eob != 0 {
+func (s *streamSegmenter) add(r streamFeedResult) {
+	s.cur = append(s.cur, r.Words...)
+	if len(r.Words) == 0 && r.Delta != "" {
+		// Older libparakeet.so with no per-word timing: segment from the text.
+		s.curText = append(s.curText, r.Delta)
+	}
+	// Both <EOU> and <EOB> reset the decoder, so both close a segment.
+	if r.Eou || r.Eob {
 		s.flush()
 	}
 }
 
 func (s *streamSegmenter) flush() {
-	if len(s.cur) == 0 {
-		return
+	switch {
+	case len(s.cur) > 0:
+		parts := make([]string, len(s.cur))
+		for i, w := range s.cur {
+			parts[i] = w.W
+		}
+		s.segs = append(s.segs, &pb.TranscriptSegment{
+			Id:    s.nextID,
+			Start: secondsToNanos(s.cur[0].Start),
+			End:   secondsToNanos(s.cur[len(s.cur)-1].End),
+			Text:  strings.TrimSpace(strings.Join(parts, " ")),
+		})
+		s.nextID++
+	case len(s.curText) > 0:
+		// No words this segment: emit a text-only segment (no timestamps),
+		// skipping a purely-whitespace one as the legacy text path did.
+		if t := strings.TrimSpace(strings.Join(s.curText, "")); t != "" {
+			s.segs = append(s.segs, &pb.TranscriptSegment{Id: s.nextID, Text: t})
+			s.nextID++
+		}
 	}
-	parts := make([]string, len(s.cur))
-	for i, w := range s.cur {
-		parts[i] = w.W
-	}
-	s.segs = append(s.segs, &pb.TranscriptSegment{
-		Id:    s.nextID,
-		Start: secondsToNanos(s.cur[0].Start),
-		End:   secondsToNanos(s.cur[len(s.cur)-1].End),
-		Text:  strings.TrimSpace(strings.Join(parts, " ")),
-	})
-	s.nextID++
 	s.cur = nil
+	s.curText = nil
 }
 
 func (s *streamSegmenter) segments() []*pb.TranscriptSegment { return s.segs }
@@ -535,18 +569,119 @@ func secondsToNanos(sec float64) int64 {
 	return int64(sec * 1e9)
 }
 
+// Per-C-call engine serialization for the streaming paths.
+//
+// Every individual C call (begin / feed / finalize / free) takes engineMu and
+// re-checks ctxPtr under the lock; the lock is NEVER held across a stream's
+// lifetime. This is safe because each parakeet.cpp call builds its own ggml
+// graph and all streaming caches live in the session object, not the ctx —
+// the only ctx-shared mutable state is last_error, which is why it is read
+// under the same lock as the failing call. Holding the lock per call (rather
+// than per stream, as this file previously did) keeps a long-lived live
+// session from starving batched unary transcription and vice versa.
+//
+// A stream must not outlive its ctx (C-API contract). Free() takes engineMu
+// and zeroes ctxPtr, so a racing per-call helper returns ModelNotLoaded
+// instead of feeding a freed engine; streamFree of an orphaned session only
+// runs the session destructor, which does not touch the ctx.
+
+// streamBegin opens a cache-aware streaming session. A 0 stream with nil
+// error means the loaded model is not a streaming model.
+func (p *ParakeetCpp) streamBegin(lang string) (uintptr, error) {
+	p.engineMu.Lock()
+	defer p.engineMu.Unlock()
+	if p.ctxPtr == 0 {
+		return 0, grpcerrors.ModelNotLoaded("parakeet-cpp")
+	}
+	if CppStreamBeginLang != nil {
+		return CppStreamBeginLang(p.ctxPtr, lang), nil
+	}
+	return CppStreamBegin(p.ctxPtr), nil
+}
+
+func (p *ParakeetCpp) streamFree(stream uintptr) {
+	if stream == 0 {
+		return
+	}
+	p.engineMu.Lock()
+	defer p.engineMu.Unlock()
+	CppStreamFree(stream)
+}
+
+// streamFeedText runs one text-mode feed (or the finalize flush when
+// finalize is true) under engineMu, returning the newly-finalized delta and
+// whether <EOU>/<EOB> fired (always false on finalize: it has no events out-param).
+func (p *ParakeetCpp) streamFeedText(stream uintptr, pcm []float32, finalize bool) (delta string, eou, eob bool, err error) {
+	p.engineMu.Lock()
+	defer p.engineMu.Unlock()
+	if p.ctxPtr == 0 {
+		return "", false, false, grpcerrors.ModelNotLoaded("parakeet-cpp")
+	}
+	var ret uintptr
+	var events int32
+	if finalize {
+		ret = CppStreamFinalize(stream)
+	} else {
+		ret = CppStreamFeed(stream, pcm, int32(len(pcm)), unsafe.Pointer(&events))
+	}
+	if ret == 0 {
+		// last_error is ctx-shared: read it under the same lock as the call.
+		msg := CppLastError(p.ctxPtr)
+		if msg == "" {
+			msg = "unknown error"
+		}
+		return "", false, false, fmt.Errorf("parakeet-cpp: stream feed/finalize failed: %s", msg)
+	}
+	delta = goStringFromCPtr(ret)
+	CppFreeString(ret)
+	// ABI v5: eou_out is a bitmask (bit 0 = <EOU>, bit 1 = <EOB>). A v4
+	// library sets 0/1 for either token, which the bit-0 test reads as the
+	// old conflated eou — the EOB distinction simply isn't available there.
+	return delta, events&1 != 0, events&2 != 0, nil
+}
+
+// streamFeedDoc runs one ABI v4 JSON feed (or finalize) under engineMu and
+// returns the parsed {text,eou,eob,frame_sec,words} document.
+func (p *ParakeetCpp) streamFeedDoc(stream uintptr, pcm []float32, finalize bool) (streamFeedJSON, error) {
+	p.engineMu.Lock()
+	defer p.engineMu.Unlock()
+	if p.ctxPtr == 0 {
+		return streamFeedJSON{}, grpcerrors.ModelNotLoaded("parakeet-cpp")
+	}
+	var ret uintptr
+	if finalize {
+		ret = CppStreamFinalizeJSON(stream)
+	} else {
+		ret = CppStreamFeedJSON(stream, pcm, int32(len(pcm)))
+	}
+	if ret == 0 {
+		msg := CppLastError(p.ctxPtr)
+		if msg == "" {
+			msg = "unknown error"
+		}
+		return streamFeedJSON{}, fmt.Errorf("parakeet-cpp: stream feed/finalize failed: %s", msg)
+	}
+	raw := goStringFromCPtr(ret)
+	CppFreeString(ret)
+	var doc streamFeedJSON
+	if err := json.Unmarshal([]byte(raw), &doc); err != nil {
+		return streamFeedJSON{}, fmt.Errorf("parakeet-cpp: decode stream json: %w", err)
+	}
+	return doc, nil
+}
+
 // AudioTranscriptionStream drives the cache-aware streaming RNN-T over the
-// audio at opts.Dst: it decodes the file to 16 kHz mono PCM, feeds it in
-// chunks to parakeet_capi_stream_feed, and emits each newly-finalized text
-// run as a TranscriptStreamResponse delta. <EOU>/<EOB> events close the
-// current segment; a closing FinalResult carries the full transcript and the
-// per-utterance segments.
+// audio at opts.Dst: it decodes the file to 16 kHz mono PCM, feeds it through
+// the shared decode driver (feedSlices/flushTail), and emits each
+// newly-finalized text run as a TranscriptStreamResponse delta. <EOU>/<EOB>
+// events close the current segment; a closing FinalResult carries the full
+// transcript, the per-utterance segments, and whether the file ended on an
+// utterance boundary.
 //
 // stream_begin returns 0 for models that are not cache-aware streaming models
-// (only e.g. nvidia/parakeet_realtime_eou_120m-v1 qualifies). For those we fall
-// back to a single offline transcription emitted as one delta plus a closing
-// FinalResult, matching LocalAI's non-streaming streaming contract (and the
-// whisper backend), so the streaming endpoint works for every model.
+// (only e.g. nvidia/parakeet_realtime_eou_120m-v1 qualifies). For every other
+// model this returns codes.Unimplemented rather than faking a stream from an
+// offline decode — see the stream==0 branch and grpcerrors.StreamTranscriptionUnsupported.
 func (p *ParakeetCpp) AudioTranscriptionStream(ctx context.Context, opts *pb.TranscriptRequest, results chan *pb.TranscriptStreamResponse) error {
 	defer close(results)
 
@@ -560,185 +695,73 @@ func (p *ParakeetCpp) AudioTranscriptionStream(ctx context.Context, opts *pb.Tra
 		return status.Error(codes.Canceled, "transcription cancelled")
 	}
 
-	var stream uintptr
-	if CppStreamBeginLang != nil {
-		stream = CppStreamBeginLang(p.ctxPtr, opts.GetLanguage())
-	} else {
-		stream = CppStreamBegin(p.ctxPtr)
+	stream, err := p.streamBegin(opts.GetLanguage())
+	if err != nil {
+		return err
 	}
 	if stream == 0 {
-		// Not a cache-aware streaming model: run a normal offline
-		// transcription and emit it as one delta + a closing final result.
-		res, err := p.AudioTranscription(ctx, opts)
-		if err != nil {
-			return err
-		}
-		if t := strings.TrimSpace(res.Text); t != "" {
-			results <- &pb.TranscriptStreamResponse{Delta: t}
-		}
-		results <- &pb.TranscriptStreamResponse{FinalResult: &res}
-		return nil
+		// Not a cache-aware streaming model. Report the missing capability
+		// honestly instead of decoding offline and emitting it as one "delta"
+		// + final: a client that asked for streaming must learn the model
+		// cannot stream, not receive a batch result dressed as a stream (which
+		// is indistinguishable except qualitatively, and silently breaks any
+		// feature that genuinely needs incremental output). Callers wanting a
+		// plain transcript use the unary AudioTranscription path. This mirrors
+		// AudioTranscriptionLive, which already returns Unimplemented here.
+		return grpcerrors.StreamTranscriptionUnsupported("parakeet-cpp",
+			"loaded model is not a cache-aware streaming model")
 	}
-	defer CppStreamFree(stream)
-	// The C engine is a single shared context: a streaming session and a batched
-	// unary dispatch must never touch it at once, so hold engineMu for the whole
-	// stream. This lock is intentionally taken AFTER the non-streaming fallback
-	// above returns: that fallback goes through AudioTranscription -> the batcher
-	// -> runBatch, which itself acquires engineMu, so locking here first would
-	// deadlock. Do not hoist this lock above the fallback.
-	p.engineMu.Lock()
-	defer p.engineMu.Unlock()
+	defer p.streamFree(stream)
 
 	data, duration, err := decodeWavMono16k(opts.Dst)
 	if err != nil {
 		return err
 	}
 
-	// ABI v4: when the streaming JSON entry points are present, drive them so the
-	// per-utterance segments carry per-word start/end timestamps. Falls through to
-	// the text-only loop below against an older libparakeet.so. Runs under the
-	// engineMu already held above.
-	if CppStreamFeedJSON != nil {
-		return p.streamJSON(ctx, stream, data, duration, results)
-	}
-
+	// Fold the shared decode driver's per-feed increments into the streamed
+	// deltas and the closing batch result: words/text accumulate into
+	// per-utterance segments (streamSegmenter), and the utterance-boundary
+	// latch (boundary.go) records whether the file ended on an <EOU>. These
+	// are the offline path's concern — the live RPC carries none of them.
 	var (
 		full     strings.Builder
-		segText  strings.Builder
-		segments []*pb.TranscriptSegment
-		segID    int32
+		seg      streamSegmenter
+		boundary utteranceBoundary
 	)
-
-	flushSegment := func() {
-		t := strings.TrimSpace(segText.String())
-		segText.Reset()
-		if t == "" {
-			return
+	emit := func(r streamFeedResult) error {
+		if r.Delta != "" {
+			full.WriteString(r.Delta)
+			results <- &pb.TranscriptStreamResponse{Delta: r.Delta}
 		}
-		segments = append(segments, &pb.TranscriptSegment{Id: segID, Text: t})
-		segID++
-	}
-
-	// emitDelta consumes the malloc'd char* returned by feed/finalize: frees
-	// it, accumulates the text, and sends a delta when non-empty. A 0 return
-	// is an error (vs the "" empty-but-non-NULL no-new-text case).
-	emitDelta := func(ret uintptr) error {
-		if ret == 0 {
-			msg := CppLastError(p.ctxPtr)
-			if msg == "" {
-				msg = "unknown error"
-			}
-			return fmt.Errorf("parakeet-cpp: stream feed/finalize failed: %s", msg)
-		}
-		delta := goStringFromCPtr(ret)
-		CppFreeString(ret)
-		if delta == "" {
-			return nil
-		}
-		full.WriteString(delta)
-		segText.WriteString(delta)
-		results <- &pb.TranscriptStreamResponse{Delta: delta}
+		seg.add(r)
+		boundary = boundary.observe(r)
 		return nil
 	}
 
-	for off := 0; off < len(data); off += streamChunkSamples {
-		if err := ctx.Err(); err != nil {
-			return status.Error(codes.Canceled, "transcription cancelled")
-		}
-		end := min(off+streamChunkSamples, len(data))
-		chunk := data[off:end]
-
-		var eou int32
-		ret := CppStreamFeed(stream, chunk, int32(len(chunk)), unsafe.Pointer(&eou))
-		if err := emitDelta(ret); err != nil {
-			return err
-		}
-		if eou != 0 {
-			flushSegment()
-		}
-	}
-
-	// Flush the streaming tail (final encoder chunk).
-	if err := emitDelta(CppStreamFinalize(stream)); err != nil {
+	if err := p.feedSlices(ctx, stream, data, emit); err != nil {
 		return err
 	}
-	flushSegment()
-
-	text := strings.TrimSpace(full.String())
-	if len(segments) == 0 && text != "" {
-		segments = append(segments, &pb.TranscriptSegment{Id: 0, Text: text})
-	}
-	results <- &pb.TranscriptStreamResponse{
-		FinalResult: &pb.TranscriptResult{
-			Text:     text,
-			Segments: segments,
-			Duration: duration,
-		},
-	}
-	return nil
-}
-
-// streamJSON drives the streaming JSON entry points (present since ABI v4): each
-// feed/finalize returns a {text,eou,eob,frame_sec,words} document. The
-// newly-finalized text is emitted as a delta (unchanged streaming contract)
-// while words are accumulated into per-utterance segments (closed on <EOU> or
-// <EOB>) so the closing FinalResult carries timestamped segments. Runs under
-// engineMu (already held by the caller).
-func (p *ParakeetCpp) streamJSON(ctx context.Context, stream uintptr, data []float32,
-	duration float32, results chan *pb.TranscriptStreamResponse) error {
-	var (
-		full strings.Builder
-		seg  streamSegmenter
-	)
-	// consume frees the malloc'd char* (a 0 return is an error), parses the JSON,
-	// emits the delta, and routes words through the segmenter.
-	consume := func(ret uintptr) error {
-		if ret == 0 {
-			msg := CppLastError(p.ctxPtr)
-			if msg == "" {
-				msg = "unknown error"
-			}
-			return fmt.Errorf("parakeet-cpp: stream feed/finalize failed: %s", msg)
-		}
-		raw := goStringFromCPtr(ret)
-		CppFreeString(ret)
-		var doc streamFeedJSON
-		if err := json.Unmarshal([]byte(raw), &doc); err != nil {
-			return fmt.Errorf("parakeet-cpp: decode stream json: %w", err)
-		}
-		if doc.Text != "" {
-			full.WriteString(doc.Text)
-			results <- &pb.TranscriptStreamResponse{Delta: doc.Text}
-		}
-		seg.add(doc)
-		return nil
-	}
-
-	for off := 0; off < len(data); off += streamChunkSamples {
-		if err := ctx.Err(); err != nil {
-			return status.Error(codes.Canceled, "transcription cancelled")
-		}
-		end := min(off+streamChunkSamples, len(data))
-		chunk := data[off:end]
-		if err := consume(CppStreamFeedJSON(stream, chunk, int32(len(chunk)))); err != nil {
-			return err
-		}
-	}
-	if err := consume(CppStreamFinalizeJSON(stream)); err != nil {
+	if err := p.flushTail(stream, emit); err != nil {
 		return err
 	}
-	seg.flush() // close any trailing utterance that never saw an EOU
+	seg.flush() // close a trailing utterance that never saw an <EOU>
 
-	text := strings.TrimSpace(full.String())
+	// final.Text is the exact concatenation of the streamed deltas (full is
+	// their accumulation), so concat(deltas) == FinalResult.Text holds even
+	// when the model prepends a leading space to the first word (SentencePiece
+	// detokenization). This matches the whisper backend's streaming contract.
+	// The single-segment fallback stays trimmed.
+	fullText := full.String()
 	segments := seg.segments()
-	if len(segments) == 0 && text != "" {
-		segments = append(segments, &pb.TranscriptSegment{Id: 0, Text: text})
+	if trimmed := strings.TrimSpace(fullText); len(segments) == 0 && trimmed != "" {
+		segments = append(segments, &pb.TranscriptSegment{Id: 0, Text: trimmed})
 	}
 	results <- &pb.TranscriptStreamResponse{
 		FinalResult: &pb.TranscriptResult{
-			Text:     text,
+			Text:     fullText,
 			Segments: segments,
 			Duration: duration,
+			Eou:      boundary.ended(),
 		},
 	}
 	return nil
@@ -803,6 +826,10 @@ func (p *ParakeetCpp) Free() error {
 		close(p.batStop)
 		p.batStop = nil
 	}
+	// Hold engineMu so an in-flight streaming call (which locks per C call and
+	// re-checks ctxPtr under the lock) can never feed into a freed ctx.
+	p.engineMu.Lock()
+	defer p.engineMu.Unlock()
 	if p.ctxPtr != 0 {
 		CppFree(p.ctxPtr)
 		p.ctxPtr = 0
diff --git a/backend/go/parakeet-cpp/goparakeetcpp_test.go b/backend/go/parakeet-cpp/goparakeetcpp_test.go
index 0cfcc37e5..a6f6af1f0 100644
--- a/backend/go/parakeet-cpp/goparakeetcpp_test.go
+++ b/backend/go/parakeet-cpp/goparakeetcpp_test.go
@@ -14,6 +14,8 @@ import (
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
+	"google.golang.org/grpc/codes"
+	"google.golang.org/grpc/status"
 )
 
 func TestParakeetCpp(t *testing.T) {
@@ -201,6 +203,29 @@ var _ = Describe("ParakeetCpp", func() {
 	})
 
 	Context("AudioTranscriptionStream", func() {
+		It("returns the typed Unimplemented signal for non-streaming models (no offline fallback)", func() {
+			// stream_begin == 0 means the loaded model is not a cache-aware
+			// streaming model. The backend must surface that, not silently
+			// decode offline and fake a one-shot "stream".
+			savedBegin, savedBeginLang := CppStreamBegin, CppStreamBeginLang
+			defer func() { CppStreamBegin, CppStreamBeginLang = savedBegin, savedBeginLang }()
+			CppStreamBeginLang = nil
+			CppStreamBegin = func(ctx uintptr) uintptr { return 0 }
+
+			p := &ParakeetCpp{ctxPtr: 1}
+			results := make(chan *pb.TranscriptStreamResponse, 8)
+			err := p.AudioTranscriptionStream(context.Background(),
+				&pb.TranscriptRequest{Dst: "ignored.wav"}, results)
+			Expect(status.Code(err)).To(Equal(codes.Unimplemented))
+
+			// Honest signal: nothing was emitted — no faked batch result.
+			var emitted []*pb.TranscriptStreamResponse
+			for r := range results {
+				emitted = append(emitted, r)
+			}
+			Expect(emitted).To(BeEmpty())
+		})
+
 		It("streams deltas and a closing FinalResult from a cache-aware model", func() {
 			// Streaming needs a cache-aware streaming model (e.g.
 			// realtime_eou); the offline test model would fail stream_begin.
diff --git a/backend/go/parakeet-cpp/live.go b/backend/go/parakeet-cpp/live.go
new file mode 100644
index 000000000..3d68a2914
--- /dev/null
+++ b/backend/go/parakeet-cpp/live.go
@@ -0,0 +1,186 @@
+package main
+
+import (
+	"strings"
+	"time"
+
+	"github.com/mudler/LocalAI/pkg/grpc/grpcerrors"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	"github.com/mudler/xlog"
+	"google.golang.org/grpc/codes"
+	"google.golang.org/grpc/status"
+)
+
+// liveSampleRate is the only PCM rate the parakeet C streaming API accepts.
+const liveSampleRate = 16000
+
+// AudioTranscriptionLive drives one cache-aware streaming session over audio
+// fed incrementally by the caller (the realtime API's semantic_vad turn
+// detection). Contract:
+//
+//   - the first request must carry a Config; a Config mid-stream resets the
+//     decode session (free + begin) and drops accumulated transcript state;
+//   - a Ready ack is sent right after a successful stream_begin so callers
+//     can degrade synchronously when the model has no streaming support
+//     (LiveTranscriptionUnsupported, codes.Unimplemented);
+//   - every feed that produced output is forwarded as {delta, eou, words};
+//     the <EOU>/<EOB> flag is the model's own utterance boundary and the
+//     decoder auto-resets after it, so one session spans many utterances;
+//   - closing the send side finalizes: the held-back tail chunk is flushed
+//     (the last ~2 encoder frames of words only appear here) and a terminal
+//     FinalResult carries the full transcript Text only. Per-utterance
+//     segments, duration, and the terminal <EOU> flag are NOT produced here —
+//     the realtime core consumes the streamed per-feed tokens and the final
+//     Text; those batch fields are the file path's concern (see
+//     AudioTranscriptionStream).
+//
+// Engine access is serialized per C call (streamBegin/streamFeed*/streamFree
+// take engineMu internally), never for the session lifetime — unary
+// transcription keeps flowing between feeds.
+func (p *ParakeetCpp) AudioTranscriptionLive(in <-chan *pb.TranscriptLiveRequest, out chan<- *pb.TranscriptLiveResponse) error {
+	defer close(out)
+
+	if p.ctxPtr == 0 {
+		return grpcerrors.ModelNotLoaded("parakeet-cpp")
+	}
+
+	first, ok := <-in
+	if !ok {
+		return nil // caller closed without sending anything
+	}
+	cfg := first.GetConfig()
+	if cfg == nil {
+		return status.Error(codes.InvalidArgument, "parakeet-cpp: first live message must carry a config")
+	}
+	if err := validateLiveConfig(cfg); err != nil {
+		return err
+	}
+
+	stream, err := p.streamBegin(cfg.GetLanguage())
+	if err != nil {
+		return err
+	}
+	if stream == 0 {
+		return grpcerrors.LiveTranscriptionUnsupported("parakeet-cpp",
+			"loaded model is not a cache-aware streaming model")
+	}
+	// stream is reassigned on a mid-stream Config reset; free whatever is
+	// current when the RPC unwinds.
+	defer func() { p.streamFree(stream) }()
+
+	out <- &pb.TranscriptLiveResponse{Ready: true}
+
+	var (
+		full    strings.Builder
+		fedSecs float64
+
+		// behindSec accumulates how far decode wall time has fallen behind
+		// the audio it was fed. A live caller feeds in real time, so a
+		// persistent positive backlog means every downstream signal —
+		// including the <EOU> the turn detector waits on — arrives that many
+		// seconds late. Warned once per session; reset by a Config reset.
+		behindSec    float64
+		behindWarned bool
+	)
+
+	// emit forwards one decode increment: it streams the per-feed tokens the
+	// realtime turn detector consumes (delta/eou/eob/words) and accumulates the
+	// running transcript for the closing FinalResult. No segmentation or
+	// boundary latch here — the live consumer reads only the streamed tokens
+	// and the final Text; per-utterance segments and the terminal <EOU> flag
+	// are an offline-path concern (see AudioTranscriptionStream / boundary.go).
+	emit := func(r streamFeedResult) error {
+		if r.Delta != "" {
+			full.WriteString(r.Delta)
+		}
+		if r.Delta != "" || r.Eou || r.Eob || len(r.Words) > 0 {
+			out <- &pb.TranscriptLiveResponse{
+				Delta: r.Delta,
+				Eou:   r.Eou,
+				Eob:   r.Eob,
+				Words: liveWordsToProto(r.Words),
+			}
+		}
+		return nil
+	}
+
+	for req := range in {
+		switch payload := req.GetPayload().(type) {
+		case *pb.TranscriptLiveRequest_Config:
+			if err := validateLiveConfig(payload.Config); err != nil {
+				return err
+			}
+			// Reset: a fresh decode session, dropping accumulated state.
+			p.streamFree(stream)
+			stream, err = p.streamBegin(payload.Config.GetLanguage())
+			if err != nil {
+				return err
+			}
+			if stream == 0 {
+				return grpcerrors.LiveTranscriptionUnsupported("parakeet-cpp",
+					"loaded model is not a cache-aware streaming model")
+			}
+			full.Reset()
+			fedSecs, behindSec, behindWarned = 0, 0, false // backlog + warn latch are per session
+		case *pb.TranscriptLiveRequest_Audio:
+			pcm := payload.Audio.GetPcm()
+			audioSec := float64(len(pcm)) / liveSampleRate
+			fedSecs += audioSec
+			start := time.Now()
+			// nil ctx: a live session is bounded by this request channel, not a
+			// context — cancellation is the caller closing the stream.
+			if err := p.feedSlices(nil, stream, pcm, emit); err != nil {
+				return err
+			}
+			wallSec := time.Since(start).Seconds()
+			behindSec += wallSec - audioSec
+			if behindSec < 0 {
+				behindSec = 0
+			}
+			xlog.Debug("parakeet-cpp: live feed",
+				"audio_ms", int(audioSec*1000), "wall_ms", int(wallSec*1000),
+				"behind_ms", int(behindSec*1000), "fed_s", fedSecs)
+			if behindSec > 1 && !behindWarned {
+				behindWarned = true
+				xlog.Warn("parakeet-cpp: live decode is falling behind real time; "+
+					"end-of-utterance signals will arrive late",
+					"behind_s", behindSec, "fed_s", fedSecs)
+			}
+		}
+	}
+
+	// Send side closed: flush the streaming tail and emit the final transcript.
+	// The live FinalResult carries only Text — the authoritative full-turn
+	// transcript the realtime core commits. Per-utterance segments, duration,
+	// and the terminal <EOU> flag are not produced on the live path.
+	if err := p.flushTail(stream, emit); err != nil {
+		return err
+	}
+	out <- &pb.TranscriptLiveResponse{
+		FinalResult: &pb.TranscriptResult{Text: strings.TrimSpace(full.String())},
+	}
+	return nil
+}
+
+// validateLiveConfig accepts an unset (0) sample rate or exactly liveSampleRate.
+func validateLiveConfig(cfg *pb.TranscriptLiveConfig) error {
+	if rate := cfg.GetSampleRate(); rate != 0 && rate != liveSampleRate {
+		return status.Errorf(codes.InvalidArgument, "parakeet-cpp: unsupported live sample_rate %d (only %d)", rate, liveSampleRate)
+	}
+	return nil
+}
+
+// liveWordsToProto converts decoder words to their protobuf form, passing the
+// start/end timestamps through secondsToNanos; no words yields a nil slice.
+func liveWordsToProto(words []transcriptWord) []*pb.TranscriptWord {
+	if len(words) == 0 {
+		return nil
+	}
+	// Pre-size for exactly one output word per input word.
+	converted := make([]*pb.TranscriptWord, 0, len(words))
+	for _, word := range words {
+		start, end := secondsToNanos(word.Start), secondsToNanos(word.End)
+		converted = append(converted, &pb.TranscriptWord{Start: start, End: end, Text: word.W})
+	}
+	return converted
+}
diff --git a/backend/go/parakeet-cpp/live_test.go b/backend/go/parakeet-cpp/live_test.go
new file mode 100644
index 000000000..0462ee521
--- /dev/null
+++ b/backend/go/parakeet-cpp/live_test.go
@@ -0,0 +1,417 @@
+package main
+
+import (
+	"sync"
+	"time"
+	"unsafe"
+
+	"github.com/mudler/LocalAI/pkg/grpc/grpcerrors"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+	"google.golang.org/grpc/codes"
+	"google.golang.org/grpc/status"
+)
+
+// The live-RPC specs drive AudioTranscriptionLive entirely against stubbed
+// Cpp* package vars (the same seam batcher_test.go uses), so they run
+// without libparakeet.so.
+
+// liveCstrPool hands out NUL-terminated C-style strings backed by Go memory and
+// retains them for the whole spec: goStringFromCPtr reads through the raw
+// pointer, so the GC must not collect a buffer while a stub's result is in use.
+type liveCstrPool struct {
+	mu   sync.Mutex
+	bufs [][]byte
+}
+
+// cstr copies s into a pooled NUL-terminated buffer and returns its address.
+func (p *liveCstrPool) cstr(s string) uintptr {
+	buf := append([]byte(s), 0)
+	p.mu.Lock()
+	defer p.mu.Unlock()
+	p.bufs = append(p.bufs, buf)
+	return uintptr(unsafe.Pointer(&buf[0]))
+}
+
+// liveStubs snapshots every C entry point the live path touches and returns
+// a restore func for AfterEach that reinstates the saved values.
+func liveStubs() (restore func()) {
+	begin, beginLang := CppStreamBegin, CppStreamBeginLang
+	feed, feedJSON := CppStreamFeed, CppStreamFeedJSON
+	finalize, finalizeJSON := CppStreamFinalize, CppStreamFinalizeJSON
+	free, lastError := CppStreamFree, CppLastError
+	freeString := CppFreeString
+	restore = func() {
+		CppStreamBegin, CppStreamBeginLang = begin, beginLang
+		CppStreamFeed, CppStreamFeedJSON = feed, feedJSON
+		CppStreamFinalize, CppStreamFinalizeJSON = finalize, finalizeJSON
+		CppStreamFree, CppLastError, CppFreeString = free, lastError, freeString
+	}
+	return restore
+}
+
+// runLive launches the RPC on its own goroutine; it returns the request channel,
+// the buffered response channel, and a channel delivering the RPC's error.
+func runLive(p *ParakeetCpp) (chan *pb.TranscriptLiveRequest, chan *pb.TranscriptLiveResponse, chan error) {
+	requests := make(chan *pb.TranscriptLiveRequest)
+	responses := make(chan *pb.TranscriptLiveResponse, 32)
+	done := make(chan error, 1)
+	go func() { done <- p.AudioTranscriptionLive(requests, responses) }()
+	return requests, responses, done
+}
+
+// liveConfig builds a Config request carrying only the given language.
+func liveConfig(lang string) *pb.TranscriptLiveRequest {
+	cfg := &pb.TranscriptLiveConfig{Language: lang}
+	return &pb.TranscriptLiveRequest{Payload: &pb.TranscriptLiveRequest_Config{Config: cfg}}
+}
+
+// liveAudio builds an Audio request carrying the given PCM samples.
+func liveAudio(pcm []float32) *pb.TranscriptLiveRequest {
+	audio := &pb.TranscriptLiveAudio{Pcm: pcm}
+	return &pb.TranscriptLiveRequest{Payload: &pb.TranscriptLiveRequest_Audio{Audio: audio}}
+}
+
+// collectLive drains out until it is closed and returns everything received.
+func collectLive(out chan *pb.TranscriptLiveResponse) (got []*pb.TranscriptLiveResponse) {
+	for resp := range out {
+		got = append(got, resp)
+	}
+	return got
+}
+
+var _ = Describe("AudioTranscriptionLive (stubbed C API)", func() {
+	var (
+		pool    *liveCstrPool
+		restore func()
+		p       *ParakeetCpp
+	)
+
+	BeforeEach(func() {
+		pool = &liveCstrPool{}
+		restore = liveStubs()
+		p = &ParakeetCpp{ctxPtr: 1}
+
+		CppStreamBeginLang = nil
+		CppStreamBegin = func(ctx uintptr) uintptr { return 7 }
+		CppStreamFree = func(s uintptr) {}
+		CppFreeString = func(s uintptr) {}
+		CppLastError = func(ctx uintptr) string { return "stub error" }
+		CppStreamFeed = nil
+		CppStreamFeedJSON = nil
+		CppStreamFinalize = nil
+		CppStreamFinalizeJSON = nil
+	})
+
+	AfterEach(func() { restore() })
+
+	It("rejects a stream whose first message is not a config", func() {
+		in, out, errCh := runLive(p)
+		in <- liveAudio([]float32{0.1})
+		close(in)
+
+		err := <-errCh
+		Expect(status.Code(err)).To(Equal(codes.InvalidArgument))
+		Expect(collectLive(out)).To(BeEmpty())
+	})
+
+	It("rejects a non-16k sample rate", func() {
+		in, _, errCh := runLive(p)
+		in <- &pb.TranscriptLiveRequest{
+			Payload: &pb.TranscriptLiveRequest_Config{Config: &pb.TranscriptLiveConfig{SampleRate: 8000}},
+		}
+		close(in)
+		Expect(status.Code(<-errCh)).To(Equal(codes.InvalidArgument))
+	})
+
+	It("returns the typed Unimplemented signal for non-streaming models, before any ack", func() {
+		CppStreamBegin = func(ctx uintptr) uintptr { return 0 }
+
+		in, out, errCh := runLive(p)
+		in <- liveConfig("")
+		close(in)
+
+		err := <-errCh
+		Expect(grpcerrors.IsLiveTranscriptionUnsupported(err)).To(BeTrue())
+		Expect(collectLive(out)).To(BeEmpty())
+	})
+
+	It("streams deltas, eou flags and words on the JSON path and finalizes on close", func() {
+		var freed []uintptr
+		CppStreamFree = func(s uintptr) { freed = append(freed, s) }
+		feeds := 0
+		CppStreamFeedJSON = func(s uintptr, pcm []float32, n int32) uintptr {
+			feeds++
+			switch feeds {
+			case 1:
+				return pool.cstr(`{"text":"hello ","eou":0,"frame_sec":0.08,` +
+					`"words":[{"w":"hello","start":0.1,"end":0.4,"conf":0.9}]}`)
+			default:
+				return pool.cstr(`{"text":"world","eou":1,"frame_sec":0.08,` +
+					`"words":[{"w":"world","start":0.5,"end":0.8,"conf":0.9}]}`)
+			}
+		}
+		CppStreamFinalizeJSON = func(s uintptr) uintptr {
+			return pool.cstr(`{"text":"","eou":0,"frame_sec":0.08,"words":[]}`)
+		}
+
+		in, out, errCh := runLive(p)
+		in <- liveConfig("en")
+		in <- liveAudio(make([]float32, 100))
+		in <- liveAudio(make([]float32, 200))
+		close(in)
+		Expect(<-errCh).NotTo(HaveOccurred())
+
+		got := collectLive(out)
+		Expect(got).To(HaveLen(4)) // ready, two deltas, final
+
+		Expect(got[0].Ready).To(BeTrue())
+
+		Expect(got[1].Delta).To(Equal("hello "))
+		Expect(got[1].Eou).To(BeFalse())
+		Expect(got[1].Words).To(HaveLen(1))
+		Expect(got[1].Words[0].Text).To(Equal("hello"))
+
+		Expect(got[2].Delta).To(Equal("world"))
+		Expect(got[2].Eou).To(BeTrue())
+
+		final := got[3].FinalResult
+		Expect(final).NotTo(BeNil())
+		Expect(final.Text).To(Equal("hello world"))
+		// The live FinalResult carries only Text. Per-utterance segments,
+		// duration and the terminal eou flag are an offline-path concern (see
+		// boundary.go / AudioTranscriptionStream); the realtime core reads the
+		// streamed per-feed tokens above plus this Text.
+		Expect(final.Eou).To(BeFalse())
+		Expect(final.Segments).To(BeEmpty())
+		Expect(final.Duration).To(BeZero())
+
+		Expect(freed).To(Equal([]uintptr{7}))
+	})
+
+	It("falls back to the text feed (eou out-param) when the JSON entry points are absent", func() {
+		feeds := 0
+		CppStreamFeed = func(s uintptr, pcm []float32, n int32, eouOut unsafe.Pointer) uintptr {
+			feeds++
+			if feeds == 2 {
+				*(*int32)(eouOut) = 1
+				return pool.cstr("done")
+			}
+			return pool.cstr("first ")
+		}
+		CppStreamFinalize = func(s uintptr) uintptr { return pool.cstr("") }
+
+		in, out, errCh := runLive(p)
+		in <- liveConfig("")
+		in <- liveAudio(make([]float32, 10))
+		in <- liveAudio(make([]float32, 10))
+		close(in)
+		Expect(<-errCh).NotTo(HaveOccurred())
+
+		got := collectLive(out)
+		Expect(got).To(HaveLen(4))
+		Expect(got[1].Delta).To(Equal("first "))
+		Expect(got[1].Eou).To(BeFalse())
+		Expect(got[2].Delta).To(Equal("done"))
+		Expect(got[2].Eou).To(BeTrue())
+		Expect(got[3].FinalResult.Text).To(Equal("first done"))
+	})
+
+	It("forwards <EOB> as eob — a backchannel, never an eou (ABI v5 JSON)", func() {
+		feeds := 0
+		CppStreamFeedJSON = func(s uintptr, pcm []float32, n int32) uintptr {
+			feeds++
+			if feeds == 1 {
+				return pool.cstr(`{"text":"uh-huh","eou":0,"eob":1,"frame_sec":0.08,` +
+					`"words":[{"w":"uh-huh","start":0.1,"end":0.3,"conf":0.9}]}`)
+			}
+			return pool.cstr(`{"text":"the turn","eou":1,"eob":0,"frame_sec":0.08,` +
+				`"words":[{"w":"the","start":0.5,"end":0.6,"conf":0.9},{"w":"turn","start":0.6,"end":0.8,"conf":0.9}]}`)
+		}
+		CppStreamFinalizeJSON = func(s uintptr) uintptr {
+			return pool.cstr(`{"text":"","eou":0,"eob":0,"frame_sec":0.08,"words":[]}`)
+		}
+
+		in, out, errCh := runLive(p)
+		in <- liveConfig("")
+		in <- liveAudio(make([]float32, 10))
+		in <- liveAudio(make([]float32, 10))
+		close(in)
+		Expect(<-errCh).NotTo(HaveOccurred())
+
+		got := collectLive(out)
+		Expect(got).To(HaveLen(4))
+		Expect(got[1].Eob).To(BeTrue())
+		Expect(got[1].Eou).To(BeFalse(), "a backchannel must not masquerade as a turn boundary")
+		Expect(got[2].Eou).To(BeTrue())
+	})
+
+	It("maps the v5 eou_out bitmask on the text path (bit0 <EOU>, bit1 <EOB>)", func() {
+		feeds := 0
+		CppStreamFeed = func(s uintptr, pcm []float32, n int32, eouOut unsafe.Pointer) uintptr {
+			feeds++
+			if feeds == 1 {
+				*(*int32)(eouOut) = 2 // <EOB> only
+				return pool.cstr("uh-huh")
+			}
+			*(*int32)(eouOut) = 1 // <EOU>
+			return pool.cstr(" done")
+		}
+		CppStreamFinalize = func(s uintptr) uintptr { return pool.cstr("") }
+
+		in, out, errCh := runLive(p)
+		in <- liveConfig("")
+		in <- liveAudio(make([]float32, 10))
+		in <- liveAudio(make([]float32, 10))
+		close(in)
+		Expect(<-errCh).NotTo(HaveOccurred())
+
+		got := collectLive(out)
+		Expect(got).To(HaveLen(4))
+		Expect(got[1].Eob).To(BeTrue())
+		Expect(got[1].Eou).To(BeFalse())
+		Expect(got[2].Eou).To(BeTrue())
+		Expect(got[2].Eob).To(BeFalse())
+	})
+
+	It("accumulates trailing text after an EOU into the final transcript", func() {
+		feeds := 0
+		CppStreamFeedJSON = func(s uintptr, pcm []float32, n int32) uintptr {
+			feeds++
+			if feeds == 1 {
+				return pool.cstr(`{"text":"turn one","eou":1,"frame_sec":0.08,"words":[]}`)
+			}
+			return pool.cstr(`{"text":" and more","eou":0,"frame_sec":0.08,"words":[]}`)
+		}
+		CppStreamFinalizeJSON = func(s uintptr) uintptr {
+			return pool.cstr(`{"text":"","eou":0,"frame_sec":0.08,"words":[]}`)
+		}
+
+		in, out, errCh := runLive(p)
+		in <- liveConfig("")
+		in <- liveAudio(make([]float32, 10))
+		in <- liveAudio(make([]float32, 10))
+		close(in)
+		Expect(<-errCh).NotTo(HaveOccurred())
+
+		got := collectLive(out)
+		final := got[len(got)-1].FinalResult
+		Expect(final.Text).To(Equal("turn one and more"))
+	})
+
+	It("resets the decode session on a mid-stream config", func() {
+		var begun, freed int
+		CppStreamBegin = func(ctx uintptr) uintptr { begun++; return uintptr(10 + begun) }
+		CppStreamFree = func(s uintptr) { freed++ }
+		CppStreamFeedJSON = func(s uintptr, pcm []float32, n int32) uintptr {
+			return pool.cstr(`{"text":"x","eou":0,"frame_sec":0.08,"words":[]}`)
+		}
+		CppStreamFinalizeJSON = func(s uintptr) uintptr {
+			return pool.cstr(`{"text":"","eou":0,"frame_sec":0.08,"words":[]}`)
+		}
+
+		in, out, errCh := runLive(p)
+		in <- liveConfig("")
+		in <- liveAudio(make([]float32, 10))
+		in <- liveConfig("") // reset
+		in <- liveAudio(make([]float32, 10))
+		close(in)
+		Expect(<-errCh).NotTo(HaveOccurred())
+
+		got := collectLive(out)
+		final := got[len(got)-1].FinalResult
+		Expect(final.Text).To(Equal("x"), "pre-reset transcript dropped")
+		Expect(begun).To(Equal(2))
+		Expect(freed).To(Equal(2), "old session freed on reset, new one on unwind")
+	})
+
+	It("does not hold engineMu between feeds (unary work interleaves with a live session)", func() {
+		CppStreamFeedJSON = func(s uintptr, pcm []float32, n int32) uintptr {
+			return pool.cstr(`{"text":"","eou":0,"frame_sec":0.08,"words":[]}`)
+		}
+		CppStreamFinalizeJSON = func(s uintptr) uintptr {
+			return pool.cstr(`{"text":"","eou":0,"frame_sec":0.08,"words":[]}`)
+		}
+
+		in, out, errCh := runLive(p)
+		in <- liveConfig("")
+		in <- liveAudio(make([]float32, 10))
+
+		// The session is open and idle between feeds: the engine lock must be
+		// acquirable, which is what lets batched unary transcription proceed
+		// mid-session. Under stream-lifetime locking this probe would block
+		// until the stream ended and the Eventually would time out.
+		locked := make(chan struct{})
+		go func() {
+			p.engineMu.Lock()
+			p.engineMu.Unlock() //nolint:staticcheck // probe: acquire-release proves availability
+			close(locked)
+		}()
+		Eventually(locked, time.Second).Should(BeClosed())
+
+		close(in)
+		Expect(<-errCh).NotTo(HaveOccurred())
+		collectLive(out)
+	})
+
+	It("errors out and reads last_error under the lock when a feed fails", func() {
+		CppStreamFeedJSON = func(s uintptr, pcm []float32, n int32) uintptr { return 0 }
+
+		in, out, errCh := runLive(p)
+		in <- liveConfig("")
+		in <- liveAudio(make([]float32, 10))
+
+		err := <-errCh
+		Expect(err).To(MatchError(ContainSubstring("stub error")))
+		got := collectLive(out)
+		Expect(got).To(HaveLen(1)) // just the ready ack
+		close(in)
+	})
+})
+
+var _ = Describe("stripEouMarker", func() {
+	It("strips a trailing <EOU> and reports it", func() {
+		text, eou := stripEouMarker("it is certainly very like the old portrait<EOU>")
+		Expect(text).To(Equal("it is certainly very like the old portrait"))
+		Expect(eou).To(BeTrue())
+	})
+
+	It("strips a trailing <EOB> WITHOUT reporting an utterance end", func() {
+		// A decode ending on a backchannel must not confirm the
+		// retranscribe gate — the user was acknowledging, not yielding.
+		text, eou := stripEouMarker("uh-huh<EOB>")
+		Expect(text).To(Equal("uh-huh"))
+		Expect(eou).To(BeFalse())
+	})
+
+	It("leaves marker-free text alone", func() {
+		text, eou := stripEouMarker("plain transcript")
+		Expect(text).To(Equal("plain transcript"))
+		Expect(eou).To(BeFalse())
+	})
+
+	It("does not strip a marker in the middle of the text", func() {
+		text, eou := stripEouMarker("a<EOU>b")
+		Expect(text).To(Equal("a<EOU>b"))
+		Expect(eou).To(BeFalse())
+	})
+})
+
+var _ = Describe("transcriptResultFromDoc EOU handling", func() {
+	It("strips the offline marker from text and sets the result flag", func() {
+		doc := transcriptJSON{Text: "the old portrait<EOU>"}
+		res := transcriptResultFromDoc(doc, &pb.TranscriptRequest{}, 0)
+		Expect(res.Text).To(Equal("the old portrait"))
+		Expect(res.Eou).To(BeTrue())
+		Expect(res.Segments).To(HaveLen(1))
+		Expect(res.Segments[0].Text).To(Equal("the old portrait"))
+	})
+
+	It("reports eou=false for marker-free decodes", func() {
+		doc := transcriptJSON{Text: "no marker here"}
+		res := transcriptResultFromDoc(doc, &pb.TranscriptRequest{}, 0)
+		Expect(res.Text).To(Equal("no marker here"))
+		Expect(res.Eou).To(BeFalse())
+	})
+})
diff --git a/backend/go/parakeet-cpp/segments_test.go b/backend/go/parakeet-cpp/segments_test.go
index 9d8e9f8d5..0295e771f 100644
--- a/backend/go/parakeet-cpp/segments_test.go
+++ b/backend/go/parakeet-cpp/segments_test.go
@@ -106,7 +106,7 @@ var _ = Describe("transcriptResultFromDoc (multi-segment)", func() {
 var _ = Describe("streaming segment assembly", func() {
 	It("closes a segment with start/end from its words on EOU", func() {
 		acc := &streamSegmenter{}
-		acc.add(streamFeedJSON{Text: "hello world", Eou: 1, Words: []transcriptWord{
+		acc.add(streamFeedResult{Delta: "hello world", Eou: true, Words: []transcriptWord{
 			{W: "hello", Start: 0.0, End: 0.4}, {W: "world", Start: 0.4, End: 0.9},
 		}})
 		segs := acc.segments()
@@ -118,9 +118,9 @@ var _ = Describe("streaming segment assembly", func() {
 
 	It("buffers words across feeds until EOU", func() {
 		acc := &streamSegmenter{}
-		acc.add(streamFeedJSON{Text: "hi", Eou: 0, Words: []transcriptWord{{W: "hi", Start: 0, End: 0.3}}})
+		acc.add(streamFeedResult{Delta: "hi", Words: []transcriptWord{{W: "hi", Start: 0, End: 0.3}}})
 		Expect(acc.segments()).To(BeEmpty())
-		acc.add(streamFeedJSON{Text: "there", Eou: 1, Words: []transcriptWord{{W: "there", Start: 0.3, End: 0.7}}})
+		acc.add(streamFeedResult{Delta: "there", Eou: true, Words: []transcriptWord{{W: "there", Start: 0.3, End: 0.7}}})
 		Expect(acc.segments()).To(HaveLen(1))
 		Expect(acc.segments()[0].Text).To(Equal("hi there"))
 	})
@@ -129,7 +129,7 @@ var _ = Describe("streaming segment assembly", func() {
 	// field; a backchannel must still close the segment as it did in v4.
 	It("closes a segment on EOB (backchannel) too", func() {
 		acc := &streamSegmenter{}
-		acc.add(streamFeedJSON{Text: "uh huh", Eou: 0, Eob: 1, Words: []transcriptWord{
+		acc.add(streamFeedResult{Delta: "uh huh", Eob: true, Words: []transcriptWord{
 			{W: "uh", Start: 0.0, End: 0.2}, {W: "huh", Start: 0.2, End: 0.5},
 		}})
 		segs := acc.segments()
@@ -137,4 +137,18 @@ var _ = Describe("streaming segment assembly", func() {
 		Expect(segs[0].Text).To(Equal("uh huh"))
 		Expect(segs[0].End).To(Equal(secondsToNanos(0.5)))
 	})
+
+	// Older text-only libparakeet.so: no per-word timings, so a segment is cut
+	// from the delta text on each <EOU>/<EOB> (no timestamps), one per utterance.
+	It("falls back to text segments when the feed carries no words", func() {
+		acc := &streamSegmenter{}
+		acc.add(streamFeedResult{Delta: "first turn", Eou: true})
+		acc.add(streamFeedResult{Delta: "second turn", Eou: true})
+		segs := acc.segments()
+		Expect(segs).To(HaveLen(2))
+		Expect(segs[0].Text).To(Equal("first turn"))
+		Expect(segs[1].Text).To(Equal("second turn"))
+		Expect(segs[0].Start).To(Equal(int64(0)), "no per-word timing on the text path")
+		Expect(segs[0].End).To(Equal(int64(0)))
+	})
 })
diff --git a/core/application/application.go b/core/application/application.go
index 52f8618f1..83057c9cd 100644
--- a/core/application/application.go
+++ b/core/application/application.go
@@ -103,6 +103,11 @@ func newApplication(appConfig *config.ApplicationConfig) *Application {
 		mcpTools.CloseMCPSessions(modelName)
 	})
 
+	// Record a model_load backend trace for every real backend load, so the
+	// Traces UI shows which backend runtime served each model and how long
+	// the load took. Load failures are traced by the modality wrappers.
+	ml.SetLoadObserver(corebackend.ModelLoadTraceObserver(appConfig))
+
 	app := &Application{
 		backendLoader:      config.NewModelConfigLoader(appConfig.SystemState.Model.ModelsPath),
 		modelLoader:        ml,
diff --git a/core/backend/model_load_trace_test.go b/core/backend/model_load_trace_test.go
new file mode 100644
index 000000000..1cce5da26
--- /dev/null
+++ b/core/backend/model_load_trace_test.go
@@ -0,0 +1,72 @@
+package backend_test
+
+import (
+	"errors"
+	"time"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	"github.com/mudler/LocalAI/core/backend"
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/trace"
+	"github.com/mudler/LocalAI/pkg/model"
+)
+
+// ModelLoadTraceObserver is what makes successful loads visible on the
+// Traces page: one model_load row per real backend load, carrying the
+// resolved backend runtime. Failures must NOT be recorded here — the
+// modality wrappers own those — and the observer must respect the runtime
+// tracing toggle.
+var _ = Describe("ModelLoadTraceObserver", func() {
+	var appConfig *config.ApplicationConfig
+
+	successEvent := model.BackendLoadEvent{
+		ModelID:    "parakeet-cpp-realtime_eou_120m-v1",
+		ModelName:  "realtime_eou_120m.gguf",
+		Backend:    "parakeet-cpp",
+		BackendURI: "/backends/intel-sycl-f16-parakeet-cpp-development/run.sh",
+		Duration:   1500 * time.Millisecond,
+	}
+
+	BeforeEach(func() {
+		appConfig = &config.ApplicationConfig{
+			EnableTracing:   true,
+			TracingMaxItems: 64,
+		}
+		trace.InitBackendTracingIfEnabled(appConfig.TracingMaxItems, appConfig.TracingMaxBodyBytes)
+		trace.ClearBackendTraces()
+	})
+
+	It("records a model_load trace with the backend runtime on success", func() {
+		backend.ModelLoadTraceObserver(appConfig)(successEvent)
+
+		Eventually(trace.GetBackendTraces).Should(HaveLen(1))
+		got := trace.GetBackendTraces()[0]
+		Expect(got.Type).To(Equal(trace.BackendTraceModelLoad))
+		Expect(got.Summary).To(Equal("Model loaded"))
+		Expect(got.ModelName).To(Equal("parakeet-cpp-realtime_eou_120m-v1"))
+		Expect(got.Backend).To(Equal("parakeet-cpp"))
+		Expect(got.Duration).To(Equal(1500 * time.Millisecond))
+		Expect(got.Data["backend_runtime"]).To(Equal("/backends/intel-sycl-f16-parakeet-cpp-development/run.sh"))
+		Expect(got.Data["model_file"]).To(Equal("realtime_eou_120m.gguf"))
+		Expect(got.Error).To(BeEmpty())
+	})
+
+	It("skips failed loads — the modality wrappers trace those with request context", func() {
+		failed := successEvent
+		failed.Err = errors.New("grpc service not ready")
+
+		backend.ModelLoadTraceObserver(appConfig)(failed)
+
+		Consistently(trace.GetBackendTraces, "100ms", "20ms").Should(BeEmpty())
+	})
+
+	It("records nothing when tracing is disabled", func() {
+		appConfig.EnableTracing = false
+
+		backend.ModelLoadTraceObserver(appConfig)(successEvent)
+
+		Consistently(trace.GetBackendTraces, "100ms", "20ms").Should(BeEmpty())
+	})
+})
diff --git a/core/backend/options.go b/core/backend/options.go
index 528c10e52..9ae22dd22 100644
--- a/core/backend/options.go
+++ b/core/backend/options.go
@@ -19,6 +19,39 @@ import (
 	"github.com/mudler/xlog"
 )
 
+// ModelLoadTraceObserver returns the ModelLoader load observer that records
+// a model_load backend trace for every successful real load (backend process
+// spawn + LoadModel RPC; cache hits never reach the observer). Failures are
+// deliberately skipped here: the modality wrappers already record them via
+// recordModelLoadFailure with request context, and the backend auto-discovery
+// scan probes several backends before one succeeds — tracing every probe
+// failure would bury the buffer in noise.
+//
+// The traced data includes the resolved backend runtime (the installed
+// backend's launcher path, which names the variant directory) — that is what
+// identifies WHICH build served the load. A stale installed backend is
+// invisible in the model config but obvious here.
+func ModelLoadTraceObserver(appConfig *config.ApplicationConfig) func(model.BackendLoadEvent) {
+	return func(ev model.BackendLoadEvent) {
+		if ev.Err != nil || !appConfig.EnableTracing { // toggle is read per event, not captured when the observer is built
+			return
+		}
+		trace.InitBackendTracingIfEnabled(appConfig.TracingMaxItems, appConfig.TracingMaxBodyBytes)
+		trace.RecordBackendTrace(trace.BackendTrace{
+			Timestamp: time.Now(),
+			Duration:  ev.Duration,
+			Type:      trace.BackendTraceModelLoad,
+			ModelName: ev.ModelID, // the trace's ModelName is the model ID; the event's ModelName (file) goes in Data["model_file"]
+			Backend:   ev.Backend,
+			Summary:   "Model loaded",
+			Data: map[string]any{
+				"model_file":      ev.ModelName,
+				"backend_runtime": ev.BackendURI,
+			},
+		})
+	}
+}
+
 // recordModelLoadFailure records a backend trace when model loading fails.
 func recordModelLoadFailure(appConfig *config.ApplicationConfig, modelName, backend string, err error, data map[string]any) {
 	if !appConfig.EnableTracing {
diff --git a/core/backend/transcript.go b/core/backend/transcript.go
index e6da923cc..211269160 100644
--- a/core/backend/transcript.go
+++ b/core/backend/transcript.go
@@ -181,6 +181,7 @@ func transcriptResultFromProto(r *proto.TranscriptResult) *schema.TranscriptionR
 		Text:     r.Text,
 		Language: r.Language,
 		Duration: float64(r.Duration),
+		Eou:      r.Eou,
 	}
 
 	for _, s := range r.Segments {
diff --git a/core/backend/transcript_live.go b/core/backend/transcript_live.go
new file mode 100644
index 000000000..956e7e717
--- /dev/null
+++ b/core/backend/transcript_live.go
@@ -0,0 +1,297 @@
+package backend
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"io"
+	"maps"
+	"sync"
+	"time"
+
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/schema"
+	"github.com/mudler/LocalAI/core/trace"
+	grpcPkg "github.com/mudler/LocalAI/pkg/grpc"
+	"github.com/mudler/LocalAI/pkg/grpc/proto"
+	"github.com/mudler/LocalAI/pkg/model"
+	"github.com/mudler/LocalAI/pkg/sound"
+	"github.com/mudler/xlog"
+)
+
+// LiveTranscriptionEvent is one streamed event from a live (bidirectional)
+// transcription session. Delta/Eou/Eob/Words arrive as the user speaks; Final
+// is set exactly once, on the terminal event after Close flushes the decode
+// tail. Eou means the model judged the user yielded the turn; Eob means a
+// backchannel ("uh-huh") ended — callers must NOT treat Eob as a turn
+// boundary.
+type LiveTranscriptionEvent struct {
+	Delta string
+	Eou   bool
+	Eob   bool
+	Words []schema.TranscriptionWord
+	Final *schema.TranscriptionResult
+}
+
+// LiveTranscriptionSession is a handle on an open live transcription stream.
+// Feed pushes 16 kHz mono float PCM; Close signals end-of-audio, waits for
+// the backend's terminal Final event to be delivered, and releases the
+// stream.
+type LiveTranscriptionSession interface {
+	Feed(pcm []float32) error
+	Close() error
+}
+
+// liveCloseDrainTimeout bounds how long Close waits for the backend to flush
+// the decode tail before force-cancelling the stream. Finalize is one short
+// engine call; seconds here means the backend is wedged.
+const liveCloseDrainTimeout = 10 * time.Second
+
+type liveTranscriptionSession struct {
+	stream    grpcPkg.AudioTranscriptionLiveClient
+	cancel    context.CancelFunc
+	recvDone  chan struct{}
+	recvErr   error // written by the recv goroutine before recvDone closes
+	closeOnce sync.Once
+	closeErr  error
+	trace     *liveTraceState // nil when tracing was disabled at open
+}
+
+func (s *liveTranscriptionSession) Feed(pcm []float32) error {
+	s.trace.addPCM(pcm) // nil-safe; the trace copy is taken before (and regardless of) the Send result
+	return s.stream.Send(&proto.TranscriptLiveRequest{
+		Payload: &proto.TranscriptLiveRequest_Audio{Audio: &proto.TranscriptLiveAudio{Pcm: pcm}},
+	})
+}
+
+func (s *liveTranscriptionSession) Close() error {
+	s.closeOnce.Do(func() { // runs once; repeat calls just return the stored closeErr
+		err := s.stream.CloseSend()
+		select {
+		case <-s.recvDone: // recv goroutine exited (stream ended or failed)
+		case <-time.After(liveCloseDrainTimeout):
+			xlog.Warn("live transcription: backend did not finalize in time; cancelling stream")
+			s.cancel() // cancel the stream context so the recv goroutine can exit
+			<-s.recvDone
+		}
+		s.cancel() // also release the stream context on the normal path
+		if err == nil {
+			err = s.recvErr // NOTE(review): stays nil after a drain timeout, since the recv loop drops errors once the context is cancelled — confirm intended
+		}
+		s.closeErr = err
+		s.trace.record(err) // nil-safe: no-op when tracing was disabled at open
+	})
+	return s.closeErr
+}
+
+// liveSampleRate is the PCM rate of a live transcription session, fixed by
+// the session config sent in ModelTranscriptionLive.
+const liveSampleRate = 16000
+
+// liveTraceState accumulates what the per-turn backend trace needs while a
+// live session runs: a bounded copy of the fed PCM for the audio snippet,
+// the decode outputs, and timing. One trace is recorded at Close — the live
+// path never touches the unary transcription wrapper, so without this a
+// streaming-only pipeline produced no transcription traces at all. Feed and
+// the recv goroutine run concurrently; mu guards the accumulators.
+type liveTraceState struct {
+	appConfig *config.ApplicationConfig
+	modelName string
+	backend   string
+	language  string
+	started   time.Time // set at construction; used as the trace Timestamp and Duration base
+
+	mu          sync.Mutex
+	pcm         []byte // first trace.MaxSnippetSeconds of fed audio, int16 LE
+	fedSamples  int    // ALL samples fed, including those past the snippet cap
+	deltaEvents int    // events that carried a non-empty Delta
+	eouEvents   int    // events flagged Eou
+	eobEvents   int    // events flagged Eob
+	finalText   string // Text of the Final result, if one was observed
+}
+
+func newLiveTraceState(modelConfig config.ModelConfig, appConfig *config.ApplicationConfig, language string) *liveTraceState {
+	if !appConfig.EnableTracing {
+		return nil // nil state: addPCM/observe/record are all nil-safe no-ops
+	}
+	trace.InitBackendTracingIfEnabled(appConfig.TracingMaxItems, appConfig.TracingMaxBodyBytes)
+	return &liveTraceState{
+		appConfig: appConfig,
+		modelName: modelConfig.Name,
+		backend:   modelConfig.Backend,
+		language:  language,
+		started:   time.Now(), // session start; becomes the trace Timestamp and the base for Duration
+	}
+}
+
+func (ts *liveTraceState) addPCM(pcm []float32) {
+	if ts == nil { // tracing disabled: a nil receiver is a no-op
+		return
+	}
+	ts.mu.Lock()
+	defer ts.mu.Unlock()
+	ts.fedSamples += len(pcm)
+	maxBytes := trace.MaxSnippetSeconds * liveSampleRate * 2 // snippet cap in bytes: 2 bytes per int16 sample, hence the /2 below to get room in samples
+	if room := (maxBytes - len(ts.pcm)) / 2; room > 0 {
+		if len(pcm) > room {
+			pcm = pcm[:room] // keep only the samples that still fit under the cap
+		}
+		ts.pcm = append(ts.pcm, sound.Float32sToInt16LEBytes(pcm)...)
+	}
+}
+
+func (ts *liveTraceState) observe(ev LiveTranscriptionEvent) {
+	if ts == nil { // tracing disabled: nothing to accumulate
+		return
+	}
+	ts.mu.Lock()
+	defer ts.mu.Unlock()
+	if ev.Delta != "" { // count only events that actually carry text
+		ts.deltaEvents++
+	}
+	if ev.Eou {
+		ts.eouEvents++
+	}
+	if ev.Eob {
+		ts.eobEvents++
+	}
+	if ev.Final != nil { // remember the Final text for the trace summary and result_text
+		ts.finalText = ev.Final.Text
+	}
+}
+
+func (ts *liveTraceState) record(closeErr error) {
+	if ts == nil || !ts.appConfig.EnableTracing { // toggle is re-read here, so tracing switched off mid-session drops the trace
+		return
+	}
+	ts.mu.Lock()
+	data := map[string]any{
+		"source":       "live_stream",
+		"language":     ts.language,
+		"result_text":  ts.finalText,
+		"eou_events":   ts.eouEvents,
+		"eob_events":   ts.eobEvents,
+		"delta_events": ts.deltaEvents,
+	}
+	if snippet := trace.AudioSnippetFromPCM(ts.pcm, liveSampleRate, ts.fedSamples*2, ts.appConfig.TracingMaxBodyBytes); snippet != nil { // fedSamples*2 = total bytes fed as int16, including audio past the snippet cap
+		maps.Copy(data, snippet)
+	}
+	summary := "live -> " + ts.finalText
+	ts.mu.Unlock() // accumulators are snapshotted; the rest reads only fields set at construction
+
+	bt := trace.BackendTrace{
+		Timestamp: ts.started, // session start, not the time of Close
+		Duration:  time.Since(ts.started),
+		Type:      trace.BackendTraceTranscription,
+		ModelName: ts.modelName,
+		Backend:   ts.backend,
+		Summary:   trace.TruncateString(summary, 200),
+		Data:      data,
+	}
+	if closeErr != nil {
+		bt.Error = closeErr.Error()
+	}
+	trace.RecordBackendTrace(bt)
+}
+
+// ModelTranscriptionLive loads the transcription backend, opens the
+// bidirectional AudioTranscriptionLive RPC, sends the session config, and
+// BLOCKS until the backend's ready ack. A
+// grpcerrors.IsLiveTranscriptionUnsupported error means the backend (or the
+// loaded model) cannot do live transcription and the caller should degrade to
+// the unary/file path. After a successful return, onEvent is invoked from a
+// background goroutine — in order, one event at a time — for every non-empty
+// response the backend streams, ending with the Final event triggered by Close.
+func ModelTranscriptionLive(ctx context.Context, language string,
+	ml *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig,
+	onEvent func(LiveTranscriptionEvent)) (LiveTranscriptionSession, error) {
+
+	transcriptionModel, err := loadTranscriptionModel(ctx, ml, modelConfig, appConfig)
+	if err != nil {
+		return nil, err
+	}
+
+	// The derived cancel out-lives this call inside the session: Close uses
+	// it to unwind the stream (and, in embed mode, the server-side recv
+	// pump, which only stops on send-close or context cancellation).
+	streamCtx, cancel := context.WithCancel(ctx)
+	stream, err := transcriptionModel.AudioTranscriptionLive(streamCtx)
+	if err != nil {
+		cancel()
+		return nil, err
+	}
+
+	fail := func(err error) (LiveTranscriptionSession, error) { // shared teardown for every failure after the stream is open
+		_ = stream.CloseSend()
+		cancel()
+		return nil, err
+	}
+
+	if err := stream.Send(&proto.TranscriptLiveRequest{
+		Payload: &proto.TranscriptLiveRequest_Config{Config: &proto.TranscriptLiveConfig{
+			Language:   language,
+			SampleRate: liveSampleRate,
+		}},
+	}); err != nil {
+		return fail(err)
+	}
+
+	// Ready-ack contract: the backend answers a successful open with a
+	// {ready:true} response before any transcript data; unsupported
+	// backends surface Unimplemented here instead.
+	ack, err := stream.Recv()
+	if err != nil {
+		return fail(err)
+	}
+	if !ack.GetReady() {
+		return fail(fmt.Errorf("live transcription: backend %q broke the ready-ack contract (first response carried data)", modelConfig.Backend))
+	}
+
+	s := &liveTranscriptionSession{
+		stream:   stream,
+		cancel:   cancel,
+		recvDone: make(chan struct{}),
+		trace:    newLiveTraceState(modelConfig, appConfig, language),
+	}
+
+	go func() {
+		defer close(s.recvDone)
+		for {
+			resp, err := stream.Recv()
+			if err != nil {
+				if !errors.Is(err, io.EOF) && streamCtx.Err() == nil { // EOF and errors after cancellation are treated as normal shutdown, not failures
+					xlog.Warn("live transcription stream ended unexpectedly", "error", err)
+					s.recvErr = err
+				}
+				return
+			}
+			ev := liveEventFromProto(resp)
+			if ev.Delta == "" && !ev.Eou && !ev.Eob && len(ev.Words) == 0 && ev.Final == nil {
+				continue // duplicate ready ack / keep-alive: nothing to deliver
+			}
+			s.trace.observe(ev)
+			onEvent(ev) // NOTE(review): called synchronously — a blocking callback stalls Recv and therefore Close, which waits on recvDone
+		}
+	}()
+
+	return s, nil
+}
+
+func liveEventFromProto(r *proto.TranscriptLiveResponse) LiveTranscriptionEvent {
+	ev := LiveTranscriptionEvent{ // Ready is not mapped, so a bare ready ack yields an all-empty event
+		Delta: r.GetDelta(),
+		Eou:   r.GetEou(),
+		Eob:   r.GetEob(),
+	}
+	for _, w := range r.GetWords() {
+		ev.Words = append(ev.Words, schema.TranscriptionWord{ // plain casts: presumably proto Start/End are already nanoseconds — confirm against the proto
+			Start: time.Duration(w.Start),
+			End:   time.Duration(w.End),
+			Text:  w.Text,
+		})
+	}
+	if r.GetFinalResult() != nil { // Final stays nil unless the response carries a final result
+		ev.Final = transcriptResultFromProto(r.GetFinalResult())
+	}
+	return ev
+}
diff --git a/core/backend/transcript_live_internal_test.go b/core/backend/transcript_live_internal_test.go
new file mode 100644
index 000000000..cbd7fac54
--- /dev/null
+++ b/core/backend/transcript_live_internal_test.go
@@ -0,0 +1,162 @@
+package backend
+
+import (
+	"errors"
+	"time"
+
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/schema"
+	"github.com/mudler/LocalAI/core/trace"
+	"github.com/mudler/LocalAI/pkg/grpc/proto"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("liveEventFromProto", func() {
+	It("maps deltas, eou flags and words (ns -> duration)", func() {
+		ev := liveEventFromProto(&proto.TranscriptLiveResponse{
+			Delta: "hello ",
+			Eou:   true,
+			Words: []*proto.TranscriptWord{
+				{Start: int64(100 * time.Millisecond), End: int64(400 * time.Millisecond), Text: "hello"},
+			},
+		})
+		Expect(ev.Delta).To(Equal("hello "))
+		Expect(ev.Eou).To(BeTrue())
+		Expect(ev.Words).To(HaveLen(1))
+		Expect(ev.Words[0].Text).To(Equal("hello"))
+		Expect(ev.Words[0].Start).To(Equal(100 * time.Millisecond))
+		Expect(ev.Words[0].End).To(Equal(400 * time.Millisecond))
+		Expect(ev.Final).To(BeNil())
+	})
+
+	It("maps the terminal final result including the eou flag", func() {
+		ev := liveEventFromProto(&proto.TranscriptLiveResponse{
+			FinalResult: &proto.TranscriptResult{
+				Text:     "hello world",
+				Duration: 1.5,
+				Eou:      true,
+				Segments: []*proto.TranscriptSegment{{Id: 0, Text: "hello world"}},
+			},
+		})
+		Expect(ev.Final).NotTo(BeNil())
+		Expect(ev.Final.Text).To(Equal("hello world"))
+		Expect(ev.Final.Duration).To(BeNumerically("~", 1.5, 1e-6))
+		Expect(ev.Final.Eou).To(BeTrue())
+		Expect(ev.Final.Segments).To(HaveLen(1))
+	})
+
+	It("yields an empty event for a bare ready ack (filtered by the recv loop)", func() {
+		ev := liveEventFromProto(&proto.TranscriptLiveResponse{Ready: true})
+		Expect(ev.Delta).To(BeEmpty())
+		Expect(ev.Eou).To(BeFalse())
+		Expect(ev.Words).To(BeEmpty())
+		Expect(ev.Final).To(BeNil())
+	})
+
+	It("maps the eob backchannel flag separately from eou", func() {
+		ev := liveEventFromProto(&proto.TranscriptLiveResponse{Delta: "uh-huh", Eob: true})
+		Expect(ev.Eob).To(BeTrue())
+		Expect(ev.Eou).To(BeFalse())
+	})
+})
+
+// liveTraceState is what makes streaming-only pipelines visible on the
+// Traces page: without it a semantic_vad session with retranscribe off
+// produced no transcription trace at all. One trace per session (= one per
+// realtime turn), recorded at Close.
+var _ = Describe("liveTraceState", func() {
+	var appConfig *config.ApplicationConfig
+
+	BeforeEach(func() {
+		appConfig = &config.ApplicationConfig{
+			EnableTracing:   true,
+			TracingMaxItems: 64,
+		}
+		trace.InitBackendTracingIfEnabled(appConfig.TracingMaxItems, appConfig.TracingMaxBodyBytes)
+		trace.ClearBackendTraces()
+	})
+
+	modelCfg := func() config.ModelConfig {
+		cfg := config.ModelConfig{Backend: "parakeet-cpp"}
+		cfg.Name = "parakeet-live"
+		return cfg
+	}
+
+	It("is disabled (nil) when tracing is off, and nil receivers are no-ops", func() {
+		appConfig.EnableTracing = false
+		ts := newLiveTraceState(modelCfg(), appConfig, "en")
+		Expect(ts).To(BeNil())
+
+		// The session calls these unconditionally; nil must be safe.
+		ts.addPCM([]float32{0.5})
+		ts.observe(LiveTranscriptionEvent{Eou: true})
+		ts.record(nil)
+		Consistently(trace.GetBackendTraces, "100ms", "20ms").Should(BeEmpty())
+	})
+
+	It("records one transcription trace with text, eou event counts and audio snippet at Close", func() {
+		ts := newLiveTraceState(modelCfg(), appConfig, "en")
+		Expect(ts).NotTo(BeNil())
+
+		// One second of a loud-ish constant tone so the snippet has signal.
+		pcm := make([]float32, liveSampleRate)
+		for i := range pcm {
+			pcm[i] = 0.25
+		}
+		ts.addPCM(pcm)
+		ts.observe(LiveTranscriptionEvent{Delta: "hello "})
+		ts.observe(LiveTranscriptionEvent{Delta: "world", Eou: true})
+		ts.observe(LiveTranscriptionEvent{Final: &schema.TranscriptionResult{Text: "hello world", Eou: true}})
+
+		ts.record(nil)
+
+		Eventually(trace.GetBackendTraces).Should(HaveLen(1))
+		got := trace.GetBackendTraces()[0]
+		Expect(got.Type).To(Equal(trace.BackendTraceTranscription))
+		Expect(got.ModelName).To(Equal("parakeet-live"))
+		Expect(got.Backend).To(Equal("parakeet-cpp"))
+		Expect(got.Summary).To(ContainSubstring("hello world"))
+		Expect(got.Data["source"]).To(Equal("live_stream"))
+		Expect(got.Data["result_text"]).To(Equal("hello world"))
+		// The trace carries no terminal "eou" key, even though this Final
+		// result sets Eou; the per-feed eou_events count is recorded instead.
+		Expect(got.Data).NotTo(HaveKey("eou"))
+		Expect(got.Data["eou_events"]).To(Equal(1))
+		Expect(got.Data["delta_events"]).To(Equal(2))
+		Expect(got.Data["audio_duration_s"]).To(BeNumerically("~", 1.0, 0.01))
+		Expect(got.Data["audio_wav_base64"]).NotTo(BeEmpty())
+		Expect(got.Error).To(BeEmpty())
+	})
+
+	It("caps the stored snippet but keeps counting the full fed duration", func() {
+		ts := newLiveTraceState(modelCfg(), appConfig, "")
+
+		// Feed past the snippet cap in two chunks (cap + one extra second).
+		ts.addPCM(make([]float32, trace.MaxSnippetSeconds*liveSampleRate))
+		ts.addPCM(make([]float32, liveSampleRate))
+
+		Expect(len(ts.pcm)).To(Equal(trace.MaxSnippetSeconds * liveSampleRate * 2))
+		Expect(ts.fedSamples).To(Equal((trace.MaxSnippetSeconds + 1) * liveSampleRate))
+
+		ts.record(nil)
+		Eventually(trace.GetBackendTraces).Should(HaveLen(1))
+		got := trace.GetBackendTraces()[0]
+		Expect(got.Data["audio_duration_s"]).To(BeNumerically("~", float64(trace.MaxSnippetSeconds+1), 0.01))
+		Expect(got.Data["audio_snippet_s"]).To(BeNumerically("~", float64(trace.MaxSnippetSeconds), 0.01))
+	})
+
+	It("clamps out-of-range float samples instead of wrapping", func() {
+		ts := newLiveTraceState(modelCfg(), appConfig, "")
+		ts.addPCM([]float32{2.0, -2.0})
+		Expect(ts.pcm).To(Equal([]byte{0xff, 0x7f, 0x00, 0x80})) // 32767, -32768
+	})
+
+	It("stamps the close error on the trace", func() {
+		ts := newLiveTraceState(modelCfg(), appConfig, "")
+		ts.record(errors.New("stream torn down"))
+
+		Eventually(trace.GetBackendTraces).Should(HaveLen(1))
+		Expect(trace.GetBackendTraces()[0].Error).To(Equal("stream torn down"))
+	})
+})
diff --git a/core/config/meta/registry.go b/core/config/meta/registry.go
index 3476076e1..b8200cd41 100644
--- a/core/config/meta/registry.go
+++ b/core/config/meta/registry.go
@@ -567,6 +567,38 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Advanced:    true,
 			Order:       83,
 		},
+		"pipeline.turn_detection.type": {
+			Section:     "pipeline",
+			Label:       "Turn Detection",
+			Description: "Default turn-detection mode for realtime sessions on this pipeline. server_vad commits after a fixed silence window; semantic_vad lets the transcription model's end-of-utterance token drive a dynamic window (fast commit after the token, long eagerness fallback without it). semantic_vad requires a streaming-EOU transcription model (e.g. parakeet-cpp-realtime_eou_120m-v1) and degrades to silence-only otherwise. Clients can override per session via session.update.",
+			Component:   "select",
+			Options: []FieldOption{
+				{Value: "", Label: "Default (server_vad)"},
+				{Value: "server_vad", Label: "server_vad (silence-based)"},
+				{Value: "semantic_vad", Label: "semantic_vad (end-of-utterance token)"},
+			},
+			Order: 87,
+		},
+		"pipeline.turn_detection.eagerness": {
+			Section:     "pipeline",
+			Label:       "Eagerness",
+			Description: "semantic_vad fallback silence window used when no end-of-utterance token was seen: low waits 8s, medium/auto 4s, high 2s.",
+			Component:   "select",
+			Options: []FieldOption{
+				{Value: "", Label: "Default (auto)"},
+				{Value: "low", Label: "low (8s)"},
+				{Value: "medium", Label: "medium (4s)"},
+				{Value: "high", Label: "high (2s)"},
+			},
+			Order: 88,
+		},
+		"pipeline.turn_detection.retranscribe": {
+			Section:     "pipeline",
+			Label:       "Retranscribe on Commit",
+			Description: "Cross-check every semantic_vad commit with an offline decode of the buffered turn: commit only proceeds when the batch decode also ends in the end-of-utterance token, and its transcript is used. Logs a streamed-vs-batch comparison — useful to gauge streaming/batch alignment — at the cost of one extra decode per turn.",
+			Component:   "toggle",
+			Order:       89,
+		},
 
 		// --- Functions ---
 		"function.grammar.parallel_calls": {
diff --git a/core/config/model_config.go b/core/config/model_config.go
index 2d1e18cc7..69dda331b 100644
--- a/core/config/model_config.go
+++ b/core/config/model_config.go
@@ -650,6 +650,12 @@ type Pipeline struct {
 	// VoiceRecognition gates the pipeline behind speaker verification. Nil
 	// (block absent) means no gate, preserving existing behavior.
 	VoiceRecognition *PipelineVoiceRecognition `yaml:"voice_recognition,omitempty" json:"voice_recognition,omitempty"`
+
+	// TurnDetection sets the server-side default turn-detection mode for
+	// realtime sessions on this pipeline, so clients need no session.update
+	// to benefit. A client session.update still overrides type and eagerness
+	// per session; retranscribe is server-side only. Unset keeps server_vad.
+	TurnDetection PipelineTurnDetection `yaml:"turn_detection,omitempty" json:"turn_detection,omitempty"`
 }
 
 // PipelineCompaction configures summarize-then-drop for a realtime pipeline.
@@ -934,6 +940,38 @@ func (v PipelineVoiceRecognition) Validate(registryAvailable bool) error {
 	return nil
 }
 
+// @Description PipelineTurnDetection sets realtime turn-detection defaults.
+type PipelineTurnDetection struct {
+	// Type selects the default turn_detection mode for sessions on this
+	// pipeline: "server_vad" (silence-based) or "semantic_vad" (the
+	// transcription model's end-of-utterance token drives a dynamic silence
+	// window; needs a streaming-EOU transcription model such as
+	// parakeet-cpp-realtime_eou_120m-v1, degrades to silence-only otherwise).
+	Type string `yaml:"type,omitempty" json:"type,omitempty"`
+	// Eagerness is the semantic_vad fallback when no end-of-utterance token
+	// was seen: low waits 8s of silence, medium/auto 4s, high 2s.
+	Eagerness string `yaml:"eagerness,omitempty" json:"eagerness,omitempty"`
+	// Retranscribe (semantic_vad only) cross-checks every EOU-triggered
+	// commit with an offline decode of the buffered turn: the commit only
+	// proceeds when the batch decode also ends in the end-of-utterance token,
+	// and its transcript is the one used. The streamed and batch transcripts
+	// are compared in the logs — a diagnostic for streaming/batch alignment
+	// at the cost of one extra decode per turn. Nil (unset) counts as off.
+	Retranscribe *bool `yaml:"retranscribe,omitempty" json:"retranscribe,omitempty"`
+}
+
+// TurnDetectionSemantic reports whether this pipeline defaults sessions to
+// semantic (EOU-driven) turn detection. Type is trimmed and matched case-insensitively.
+func (p Pipeline) TurnDetectionSemantic() bool {
+	return strings.EqualFold(strings.TrimSpace(p.TurnDetection.Type), "semantic_vad") // any other value, including empty, is non-semantic
+}
+
+// TurnDetectionRetranscribe reports whether semantic_vad commits should be
+// cross-checked (and transcribed) by an offline decode of the buffered turn.
+func (p Pipeline) TurnDetectionRetranscribe() bool {
+	return p.TurnDetection.Retranscribe != nil && *p.TurnDetection.Retranscribe // nil (unset) counts as off; NOTE(review): Type is not checked here, callers must gate on semantic_vad
+}
+
 // @Description File configuration for model downloads
 type File struct {
 	Filename string         `yaml:"filename,omitempty" json:"filename,omitempty"`
diff --git a/core/config/pipeline_turn_detection_test.go b/core/config/pipeline_turn_detection_test.go
new file mode 100644
index 000000000..d2b11a115
--- /dev/null
+++ b/core/config/pipeline_turn_detection_test.go
@@ -0,0 +1,61 @@
+package config
+
+import (
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+	"gopkg.in/yaml.v3"
+)
+
+// pipeline.turn_detection sets the server-side default turn-detection mode
+// for realtime sessions. Unset keeps server_vad, so existing configs are
+// unaffected; retranscribe is opt-in.
+var _ = Describe("Pipeline turn_detection config", func() {
+	It("defaults to non-semantic with retranscribe off when unset", func() {
+		var p Pipeline
+		Expect(p.TurnDetectionSemantic()).To(BeFalse())
+		Expect(p.TurnDetectionRetranscribe()).To(BeFalse())
+	})
+
+	It("parses the nested turn_detection block from YAML", func() {
+		var c ModelConfig
+		err := yaml.Unmarshal([]byte(`
+name: gpt-realtime
+pipeline:
+  transcription: parakeet-cpp-realtime_eou_120m-v1
+  turn_detection:
+    type: semantic_vad
+    eagerness: high
+    retranscribe: true
+`), &c)
+		Expect(err).ToNot(HaveOccurred())
+		Expect(c.Pipeline.TurnDetectionSemantic()).To(BeTrue())
+		Expect(c.Pipeline.TurnDetection.Eagerness).To(Equal("high"))
+		Expect(c.Pipeline.TurnDetectionRetranscribe()).To(BeTrue())
+	})
+
+	It("treats server_vad and unknown types as non-semantic", func() {
+		var p Pipeline
+		p.TurnDetection.Type = "server_vad"
+		Expect(p.TurnDetectionSemantic()).To(BeFalse())
+		p.TurnDetection.Type = "something_else"
+		Expect(p.TurnDetectionSemantic()).To(BeFalse())
+	})
+
+	It("matches semantic_vad case-insensitively with surrounding space", func() {
+		var p Pipeline
+		p.TurnDetection.Type = " Semantic_VAD "
+		Expect(p.TurnDetectionSemantic()).To(BeTrue())
+	})
+
+	It("treats an explicit retranscribe false as off", func() {
+		var c ModelConfig
+		err := yaml.Unmarshal([]byte(`
+pipeline:
+  turn_detection:
+    type: semantic_vad
+    retranscribe: false
+`), &c)
+		Expect(err).ToNot(HaveOccurred())
+		Expect(c.Pipeline.TurnDetectionRetranscribe()).To(BeFalse())
+	})
+})
diff --git a/core/http/endpoints/openai/compactcoord/compactcoord.go b/core/http/endpoints/openai/compactcoord/compactcoord.go
new file mode 100644
index 000000000..62b7156ed
--- /dev/null
+++ b/core/http/endpoints/openai/compactcoord/compactcoord.go
@@ -0,0 +1,149 @@
+// Package compactcoord is the explicit state machine for the realtime API's
+// conversation-compaction concern (machine "M4" in
+// docs/design/realtime-state-machines.md).
+//
+// In the legacy code this machine is an implicit single-flight guard: a
+// per-conversation `compacting atomic.Bool` that maybeCompact CAS-flips to start
+// a background summarize+evict and a deferred Store(false) clears. The intent —
+// at most one compaction running per conversation at a time, so two goroutines
+// never summarize and evict the same overflow concurrently (Part 4, invariant
+// #9) — is correct but implicit in a bare atomic.
+//
+// This package makes it explicit:
+//   - a sealed sum type for State (Idle | Running | Terminated) — "two
+//     compactions running" is unrepresentable,
+//   - a total, pure transition function Next(state, event) -> (state, effects),
+//   - a single-writer Coordinator that serializes every transition.
+//
+// Unlike respcoord (M3), a Trigger while Running is NOT a supersede: compaction
+// is idempotent work on the same overflow, so a concurrent trigger is simply
+// dropped (matching the legacy CAS-fails-so-skip), not queued or restarted.
+package compactcoord
+
+import (
+	"fmt"
+
+	"github.com/mudler/LocalAI/core/http/endpoints/openai/coordinator"
+)
+
+// State is the sealed sum type of compaction states: Idle | Running | Terminated.
+// The unexported isState marker keeps other packages from adding variants.
+type State interface {
+	isState()
+	String() string
+}
+
+// Idle: no compaction is running. This is the state New starts in.
+type Idle struct{}
+
+// Running: exactly one compaction is in flight (entered only via Idle+Trigger).
+type Running struct{}
+
+// Terminated: the conversation/session is torn down. Absorbing — no compaction
+// can start from here, so the M1 (connection) parent's teardown can cancel +
+// join the in-flight compaction and guarantee none outlives the session (see
+// formal-verification/session_lifecycle.fizz). This closes the legacy gap where
+// the fire-and-forget compaction goroutine could outlive the session.
+type Terminated struct{}
+
+func (Idle) isState()       {}
+func (Running) isState()    {}
+func (Terminated) isState() {}
+
+func (Idle) String() string       { return "Idle" }
+func (Running) String() string    { return "Running" }
+func (Terminated) String() string { return "Terminated" }
+
+// Event is the sealed sum type of inputs: Trigger | Finished | Shutdown.
+// The unexported isEvent marker keeps other packages from adding variants.
+type Event interface {
+	isEvent()
+	String() string
+}
+
+// Trigger requests a compaction (the live buffer grew past the trigger). It
+// starts one only when Idle; while Running or Terminated it is a no-op.
+type Trigger struct{}
+
+// Finished reports that the compaction goroutine ended. It is sent on every exit
+// path (success, error, timeout) so the machine can never stick in Running.
+type Finished struct{}
+
+// Shutdown moves the machine to Terminated; no compaction can start afterwards.
+// NOTE(review): Next emits no effect here, so cancel+join must happen elsewhere.
+type Shutdown struct{}
+
+func (Trigger) isEvent()  {}
+func (Finished) isEvent() {}
+func (Shutdown) isEvent() {}
+
+func (Trigger) String() string  { return "Trigger" }
+func (Finished) String() string { return "Finished" }
+func (Shutdown) String() string { return "Shutdown" }
+
+// Effect is a side effect Next returns as data. The only variant: StartCompaction.
+type Effect interface {
+	isEffect()
+	String() string
+}
+
+// StartCompaction: spawn the summarize+evict goroutine (Idle+Trigger emits it).
+type StartCompaction struct{}
+
+func (StartCompaction) isEffect() {}
+
+func (StartCompaction) String() string { return "StartCompaction" }
+
+// Next is the pure, total transition table of the compaction machine: given the
+// current state and an incoming event it yields the successor state plus the
+// effects to run, in order. Every pair of known states and events has a defined
+// result (many are deliberate no-ops); the error is reserved for a State or
+// Event implementation this function does not know.
+//
+// Single-flight hinges on one row: StartCompaction is produced solely by
+// Idle+Trigger, so a second compaction cannot start while one is Running.
+func Next(s State, e Event) (State, []Effect, error) {
+	switch e.(type) {
+	case Trigger:
+		switch s.(type) {
+		case Idle:
+			return Running{}, []Effect{StartCompaction{}}, nil
+		case Running:
+			// Single-flight: a compaction is already in flight, drop the request.
+			return Running{}, nil, nil
+		case Terminated:
+			// Absorbing: nothing may start once the session is torn down.
+			return Terminated{}, nil, nil
+		}
+	case Finished:
+		switch s.(type) {
+		case Idle, Running:
+			// Running -> Idle is the normal completion; from Idle the report is
+			// stale and changes nothing.
+			return Idle{}, nil, nil
+		case Terminated:
+			// A late report from a compaction that was in flight at Shutdown.
+			return Terminated{}, nil, nil
+		}
+	case Shutdown:
+		switch s.(type) {
+		case Idle, Running, Terminated:
+			// Teardown wins from every state and is idempotent. NOTE(review): no
+			// effect is emitted, so cancelling in-flight work happens elsewhere.
+			return Terminated{}, nil, nil
+		}
+	}
+	return s, nil, fmt.Errorf("compactcoord: unhandled transition %s <- %s", s, e)
+}
+
+// EffectSink performs the effects produced by a transition (see coordinator.Sink).
+// Its StartCompaction handler is expected to only spawn a goroutine, never block.
+type EffectSink = coordinator.Sink[Effect]
+
+// Coordinator serializes the compaction transitions. See coordinator.Coordinator.
+type Coordinator = coordinator.Coordinator[State, Event, Effect]
+
+// New returns a Coordinator that starts in Idle and performs effects via sink.
+func New(sink EffectSink) *Coordinator {
+	return coordinator.New[State, Event, Effect](Idle{}, Next, sink)
+}
diff --git a/core/http/endpoints/openai/compactcoord/compactcoord_suite_test.go b/core/http/endpoints/openai/compactcoord/compactcoord_suite_test.go
new file mode 100644
index 000000000..0dae15f80
--- /dev/null
+++ b/core/http/endpoints/openai/compactcoord/compactcoord_suite_test.go
@@ -0,0 +1,13 @@
+package compactcoord
+
+import (
+	"testing"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+func TestCompactcoord(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "compactcoord (realtime M4) Suite")
+}
diff --git a/core/http/endpoints/openai/compactcoord/compactcoord_test.go b/core/http/endpoints/openai/compactcoord/compactcoord_test.go
new file mode 100644
index 000000000..caba28ecd
--- /dev/null
+++ b/core/http/endpoints/openai/compactcoord/compactcoord_test.go
@@ -0,0 +1,202 @@
+package compactcoord
+
+import (
+	"math/rand/v2"
+	"sync"
+	"sync/atomic"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// recordingSink is a test sink that appends each effect to log in the order
+// Perform is called; mu lets the spec goroutine read log safely.
+type recordingSink struct {
+	mu  sync.Mutex
+	log []Effect
+}
+
+func (rs *recordingSink) Perform(eff Effect) {
+	rs.mu.Lock()
+	defer rs.mu.Unlock()
+	rs.log = append(rs.log, eff)
+}
+
+func (rs *recordingSink) count() int {
+	rs.mu.Lock()
+	defer rs.mu.Unlock()
+	return len(rs.log)
+}
+
+type unknownEvent struct{}
+
+func (unknownEvent) isEvent()       {}
+func (unknownEvent) String() string { return "unknownEvent" }
+
+type unknownState struct{}
+
+func (unknownState) isState()       {}
+func (unknownState) String() string { return "unknownState" }
+
+var _ = Describe("compactcoord.Next", func() {
+	DescribeTable("transitions",
+		func(from State, ev Event, wantNext State, wantEffects []Effect) {
+			next, effects, err := Next(from, ev)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(next).To(Equal(wantNext))
+			Expect(effects).To(Equal(wantEffects))
+		},
+		Entry("idle+trigger -> running: start",
+			Idle{}, Trigger{}, Running{}, []Effect{StartCompaction{}}),
+		Entry("idle+finished -> idle, no-op (stale)",
+			Idle{}, Finished{}, Idle{}, []Effect(nil)),
+		Entry("running+trigger -> running, no-op (single-flight)",
+			Running{}, Trigger{}, Running{}, []Effect(nil)),
+		Entry("running+finished -> idle",
+			Running{}, Finished{}, Idle{}, []Effect(nil)),
+		Entry("idle+shutdown -> terminated",
+			Idle{}, Shutdown{}, Terminated{}, []Effect(nil)),
+		Entry("running+shutdown -> terminated",
+			Running{}, Shutdown{}, Terminated{}, []Effect(nil)),
+		Entry("terminated+trigger -> terminated, REJECTED",
+			Terminated{}, Trigger{}, Terminated{}, []Effect(nil)),
+		Entry("terminated+finished -> terminated, no-op (stale)",
+			Terminated{}, Finished{}, Terminated{}, []Effect(nil)),
+		Entry("terminated+shutdown -> terminated, idempotent",
+			Terminated{}, Shutdown{}, Terminated{}, []Effect(nil)),
+	)
+
+	It("is total over the defined (state, event) pairs", func() {
+		for _, from := range []State{Idle{}, Running{}, Terminated{}} {
+			for _, ev := range []Event{Trigger{}, Finished{}, Shutdown{}} {
+				_, _, err := Next(from, ev)
+				Expect(err).ToNot(HaveOccurred(), "Next(%s, %s)", from, ev)
+			}
+		}
+	})
+
+	It("errors on an unknown event type", func() {
+		_, _, failure := Next(Idle{}, unknownEvent{})
+		Expect(failure).To(HaveOccurred())
+	})
+
+	It("errors on an unknown state type", func() {
+		_, _, failure := Next(unknownState{}, Trigger{})
+		Expect(failure).To(HaveOccurred())
+	})
+})
+
+var _ = Describe("compactcoord.Coordinator", func() {
+	// Random walk over Trigger/Finished against a one-bool model. The sink's
+	// effect count may only grow on a Trigger applied while the model is idle,
+	// which is what "at most one compaction in flight" means here.
+	It("starts at most one compaction at a time over random sequences", func() {
+		for _, seed := range []uint64{1, 2, 3, 42, 1337, 0xC0FFEE} {
+			rng := rand.New(rand.NewPCG(seed, 0xA5A5A5A5))
+			sink := &recordingSink{}
+			coord := New(sink)
+			inFlight, starts := false, 0
+
+			for range 5000 {
+				switch rng.IntN(2) {
+				case 0:
+					seen := sink.count()
+					Expect(coord.Apply(Trigger{})).To(Succeed())
+					if sink.count() > seen {
+						// An effect was emitted, so the model must have been idle.
+						Expect(inFlight).To(BeFalse(), "seed=%d: started while already running", seed)
+						inFlight = true
+						starts++
+					}
+				default:
+					Expect(coord.Apply(Finished{})).To(Succeed())
+					inFlight = false
+				}
+				// The coordinator must agree with the model after every step.
+				want := State(Idle{})
+				if inFlight {
+					want = State(Running{})
+				}
+				Expect(coord.State()).To(Equal(want), "seed=%d", seed)
+			}
+			Expect(starts).To(BeNumerically(">", 0), "seed=%d: walk should have started at least one", seed)
+		}
+	})
+
+	// Concurrency spec: each StartCompaction spawns a worker that bumps a live
+	// counter, records the peak, and then reports Finished to the coordinator.
+	// Eight goroutines hammer Trigger; the peak must never exceed one. Meant to
+	// be run under -race.
+	It("never runs two compactions concurrently", func() {
+		var live, peak int32
+		var coord *Coordinator
+		var jobs sync.WaitGroup
+		sink := &spawnSink{onStart: func() {
+			jobs.Add(1)
+			go func() {
+				defer jobs.Done()
+				cur := atomic.AddInt32(&live, 1)
+				for {
+					old := atomic.LoadInt32(&peak)
+					if cur <= old || atomic.CompareAndSwapInt32(&peak, old, cur) {
+						break
+					}
+				}
+				atomic.AddInt32(&live, -1)
+				_ = coord.Apply(Finished{})
+			}()
+		}}
+		coord = New(sink)
+
+		var triggers sync.WaitGroup
+		triggers.Add(8)
+		for range 8 {
+			go func() {
+				defer triggers.Done()
+				for range 1000 {
+					_ = coord.Apply(Trigger{})
+				}
+			}()
+		}
+		triggers.Wait()
+		jobs.Wait() // every spawned compaction has reported Finished by now
+
+		Expect(atomic.LoadInt32(&peak)).To(BeNumerically("<=", 1))
+		Expect(coord.State()).To(Equal(State(Idle{})))
+	})
+
+	It("terminates on shutdown and rejects later triggers", func() {
+		sink := &recordingSink{}
+		coord := New(sink)
+		Expect(coord.Apply(Trigger{})).To(Succeed()) // Idle -> Running, emits StartCompaction
+		Expect(coord.Apply(Shutdown{})).To(Succeed())
+		Expect(coord.State()).To(Equal(State(Terminated{})))
+
+		emitted := sink.count()
+		Expect(coord.Apply(Trigger{})).To(Succeed()) // must be absorbed
+		Expect(sink.count()).To(Equal(emitted), "no StartCompaction after shutdown")
+		Expect(coord.Apply(Finished{})).To(Succeed()) // late report, absorbed too
+		Expect(coord.State()).To(Equal(State(Terminated{})))
+	})
+})
+
+// spawnSink calls onStart (which must not block) for every StartCompaction effect.
+type spawnSink struct{ onStart func() }
+
+func (sp *spawnSink) Perform(eff Effect) {
+	if _, isStart := eff.(StartCompaction); !isStart {
+		return
+	}
+	sp.onStart()
+}
+
+var _ = DescribeTable("compactcoord stringers",
+	func(rendered, expected string) { Expect(rendered).To(Equal(expected)) },
+	Entry(nil, Idle{}.String(), "Idle"),
+	Entry(nil, Running{}.String(), "Running"),
+	Entry(nil, Terminated{}.String(), "Terminated"),
+	Entry(nil, Trigger{}.String(), "Trigger"),
+	Entry(nil, Finished{}.String(), "Finished"),
+	Entry(nil, Shutdown{}.String(), "Shutdown"),
+	Entry(nil, StartCompaction{}.String(), "StartCompaction"),
+)
diff --git a/core/http/endpoints/openai/conncoord/conncoord.go b/core/http/endpoints/openai/conncoord/conncoord.go
new file mode 100644
index 000000000..f6e7e0e03
--- /dev/null
+++ b/core/http/endpoints/openai/conncoord/conncoord.go
@@ -0,0 +1,164 @@
+// Package conncoord is the explicit state machine for the realtime API's
+// connection lifecycle (machine "M1" in docs/design/realtime-state-machines.md).
+//
+// In the legacy code this machine is implicit and fragile. The session handler
+// keeps a `vadServerStarted` bool plus a `done` channel that is REASSIGNED to a
+// fresh channel every time turn detection is toggled on (session.update) and
+// closed both at toggle-off and at teardown (Part 2, failure mode 6). It is
+// correct today only because one goroutine owns it; "one variable name meaning
+// different channels over time, closed from two sites guarded by a bool" is a
+// structural hazard, not an explicit lifecycle. Teardown likewise depends on the
+// bool to avoid closing an already-closed channel.
+//
+// This package makes the lifecycle explicit:
+//   - a sealed sum type for State (Live{VADRunning} | Torn) — illegal states
+//     such as "running after teardown" are unrepresentable,
+//   - a total, pure transition function Next(state, event) -> (state, effects),
+//   - a single-writer Coordinator that serializes every transition.
+//
+// The guarantees the spec checks:
+//   - the VAD goroutine's done channel is closed exactly once per start (StopVAD
+//     is emitted only while running, so never a double close / close of nil),
+//   - teardown runs exactly once (Close from Live; any later Close is a no-op),
+//   - nothing is started after teardown (no resurrection / no send-after-close).
+//
+// Like turncoord (M2), the connection machine is driven by the single session
+// goroutine; the Coordinator's lock keeps State() race-free and guards against a
+// future second writer. The effects are performed by a sink that owns the actual
+// channels/goroutines (see realtime_conncoord.go).
+package conncoord
+
+import (
+	"fmt"
+
+	"github.com/mudler/LocalAI/core/http/endpoints/openai/coordinator"
+)
+
+// State is the sealed sum type of connection states: Live | Torn. The unexported
+// isState marker keeps other packages from adding implementations.
+type State interface {
+	isState()
+	String() string
+}
+
+// Live: the session is active. VADRunning records whether the turn-detection
+// (handleVAD) goroutine is currently running — the single source of truth that
+// replaces the legacy vadServerStarted bool, so the per-run done channel is
+// closed exactly once. New starts in Live{VADRunning: false}.
+type Live struct{ VADRunning bool }
+
+// Torn: the session has been torn down. Terminal — every later event leaves the
+// machine in Torn and produces no effects.
+type Torn struct{}
+
+func (Live) isState() {}
+func (Torn) isState() {}
+
+func (s Live) String() string { return fmt.Sprintf("Live(vad=%t)", s.VADRunning) }
+func (Torn) String() string   { return "Torn" }
+
+// Event is the sealed sum type of inputs. Exhaustively: SetVAD | Close.
+type Event interface {
+	isEvent()
+	String() string
+}
+
+// SetVAD requests the turn-detection goroutine be running (Active) or stopped,
+// e.g. when session.update toggles turn detection. It is idempotent (asking for
+// the state the machine is already in is a no-op) and is ignored once Torn.
+type SetVAD struct{ Active bool }
+
+// Close requests teardown (the transport read loop ended, or the session is
+// closing). Only the first Close from Live tears down; later ones are no-ops.
+type Close struct{}
+
+func (SetVAD) isEvent() {}
+func (Close) isEvent()  {}
+
+func (e SetVAD) String() string { return fmt.Sprintf("SetVAD(%t)", e.Active) }
+func (Close) String() string    { return "Close" }
+
+// Effect is a side effect returned by Next as data; the Coordinator hands each
+// one to the EffectSink in order. Exhaustively: StartVAD | StopVAD | Teardown.
+type Effect interface {
+	isEffect()
+	String() string
+}
+
+// StartVAD: create a fresh done channel and spawn the handleVAD goroutine on it.
+type StartVAD struct{}
+
+// StopVAD: close the running VAD goroutine's done channel (signal it to exit).
+type StopVAD struct{}
+
+// Teardown: the once-only teardown — stop the remaining input goroutines (opus
+// decode, sound window), join them, cancel in-flight responses, and remove the
+// session from the registry. Emitted at most once, by the first Close from Live.
+type Teardown struct{}
+
+func (StartVAD) isEffect() {}
+func (StopVAD) isEffect()  {}
+func (Teardown) isEffect() {}
+
+func (StartVAD) String() string { return "StartVAD" }
+func (StopVAD) String() string  { return "StopVAD" }
+func (Teardown) String() string { return "Teardown" }
+
+// Next is the pure, total transition table of the connection machine: for the
+// current state and an incoming event it yields the successor state and the
+// effects to perform, in order. Every pair of known states and events has a
+// defined result (no-ops included); the error is reserved for a State or Event
+// implementation this function does not know.
+//
+// Torn is absorbing and silent: once Close has moved the machine there, a
+// repeated Close cannot run Teardown again and a SetVAD cannot emit StartVAD,
+// so teardown happens at most once and a torn session is never resurrected.
+func Next(s State, e Event) (State, []Effect, error) {
+	switch cur := s.(type) {
+	case Torn:
+		switch e.(type) {
+		case SetVAD, Close:
+			// Teardown already ran: ignore toggles and repeated closes alike.
+			return Torn{}, nil, nil
+		}
+	case Live:
+		switch in := e.(type) {
+		case Close:
+			if !cur.VADRunning {
+				return Torn{}, []Effect{Teardown{}}, nil
+			}
+			// Stop the running VAD goroutine first, then tear the session down.
+			return Torn{}, []Effect{StopVAD{}, Teardown{}}, nil
+		case SetVAD:
+			if in.Active == cur.VADRunning {
+				// Already in the requested state: idempotent no-op.
+				return Live{VADRunning: cur.VADRunning}, nil, nil
+			}
+			if in.Active {
+				// Stopped -> running.
+				return Live{VADRunning: true}, []Effect{StartVAD{}}, nil
+			}
+			// Running -> stopped. This and Close-while-running are the only
+			// places StopVAD is emitted, so it never fires while stopped.
+			return Live{VADRunning: false}, []Effect{StopVAD{}}, nil
+		}
+	}
+	return s, nil, fmt.Errorf("conncoord: unhandled transition %s <- %s", s, e)
+}
+
+// EffectSink performs the effects produced by a transition, under the coordinator
+// lock (see coordinator.Sink). Teardown is a deliberate exception to the
+// non-blocking rule: it joins goroutines, which is tolerated because it runs once,
+// at the end of the single session goroutine. NOTE(review): any concurrent Apply
+// or State call on this coordinator waits until that join completes.
+type EffectSink = coordinator.Sink[Effect]
+
+// Coordinator serializes the connection-lifecycle transitions.
+// See coordinator.Coordinator.
+type Coordinator = coordinator.Coordinator[State, Event, Effect]
+
+// New returns a Coordinator that starts in Live{VADRunning: false} (session up,
+// turn detection not yet running) and performs effects via sink.
+func New(sink EffectSink) *Coordinator {
+	return coordinator.New[State, Event, Effect](Live{VADRunning: false}, Next, sink)
+}
diff --git a/core/http/endpoints/openai/conncoord/conncoord_suite_test.go b/core/http/endpoints/openai/conncoord/conncoord_suite_test.go
new file mode 100644
index 000000000..3344a2355
--- /dev/null
+++ b/core/http/endpoints/openai/conncoord/conncoord_suite_test.go
@@ -0,0 +1,13 @@
+package conncoord
+
+import (
+	"testing"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+func TestConncoord(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "conncoord (realtime M1) Suite")
+}
diff --git a/core/http/endpoints/openai/conncoord/conncoord_test.go b/core/http/endpoints/openai/conncoord/conncoord_test.go
new file mode 100644
index 000000000..8fb3c5051
--- /dev/null
+++ b/core/http/endpoints/openai/conncoord/conncoord_test.go
@@ -0,0 +1,212 @@
+package conncoord
+
+import (
+	"math/rand/v2"
+	"sync"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// recordingSink is a test sink that appends every effect to log in the order
+// Perform is called, so invariants can be checked on the effect stream itself
+// rather than on the transition function. The coordinator already serializes
+// Perform calls; mu exists so the spec goroutine can read log safely.
+type recordingSink struct {
+	mu  sync.Mutex
+	log []Effect
+}
+
+func (rs *recordingSink) Perform(eff Effect) {
+	rs.mu.Lock()
+	defer rs.mu.Unlock()
+	rs.log = append(rs.log, eff)
+}
+
+func (rs *recordingSink) snapshot() []Effect {
+	rs.mu.Lock()
+	defer rs.mu.Unlock()
+	dup := make([]Effect, len(rs.log))
+	copy(dup, rs.log)
+	return dup
+}
+
+// checkLog walks a recorded effect log and fails the running spec if it breaks
+// one of the lifecycle safety properties (docs/design/realtime-state-machines.md,
+// Part 4: invariants #8, #10 and failure mode 6):
+//
+//	(1) StartVAD appears only while the VAD is stopped and StopVAD only while it
+//	    is running, i.e. the done channel is closed exactly once per start;
+//	(2) Teardown appears at most once;
+//	(3) StartVAD never appears after Teardown (no resurrection).
+func checkLog(log []Effect) {
+	vadUp, tornDown, teardowns := false, false, 0
+	for idx, effect := range log {
+		switch effect.(type) {
+		case Teardown:
+			// Must be the first teardown seen in this log.
+			Expect(tornDown).To(BeFalse(), "invariant (2): Teardown twice (effect #%d)\nlog=%v", idx, log)
+			tornDown = true
+			teardowns++
+		case StopVAD:
+			Expect(vadUp).To(BeTrue(), "invariant (1): StopVAD while not running (effect #%d)\nlog=%v", idx, log)
+			vadUp = false
+		case StartVAD:
+			// Resurrection is reported before the double-start check.
+			Expect(tornDown).To(BeFalse(), "invariant (3): StartVAD after teardown (effect #%d)\nlog=%v", idx, log)
+			Expect(vadUp).To(BeFalse(), "invariant (1): StartVAD while already running (effect #%d)\nlog=%v", idx, log)
+			vadUp = true
+		}
+	}
+	Expect(teardowns).To(BeNumerically("<=", 1), "invariant (2): teardown ran %d times\nlog=%v", teardowns, log)
+}
+
+type unknownEvent struct{}
+
+func (unknownEvent) isEvent()       {}
+func (unknownEvent) String() string { return "unknownEvent" }
+
+type unknownState struct{}
+
+func (unknownState) isState()       {}
+func (unknownState) String() string { return "unknownState" }
+
+var _ = Describe("conncoord.Next", func() {
+	DescribeTable("transitions",
+		func(from State, ev Event, wantNext State, wantEffects []Effect) {
+			next, effects, err := Next(from, ev)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(next).To(Equal(wantNext))
+			Expect(effects).To(Equal(wantEffects))
+		},
+		Entry("stopped+setvad(on) -> running: start",
+			Live{VADRunning: false}, SetVAD{Active: true},
+			Live{VADRunning: true}, []Effect{StartVAD{}}),
+		Entry("running+setvad(on) -> running, no-op",
+			Live{VADRunning: true}, SetVAD{Active: true},
+			Live{VADRunning: true}, []Effect(nil)),
+		Entry("stopped+setvad(off) -> stopped, no-op",
+			Live{VADRunning: false}, SetVAD{Active: false},
+			Live{VADRunning: false}, []Effect(nil)),
+		Entry("running+setvad(off) -> stopped: stop",
+			Live{VADRunning: true}, SetVAD{Active: false},
+			Live{VADRunning: false}, []Effect{StopVAD{}}),
+		Entry("stopped+close -> torn: teardown",
+			Live{VADRunning: false}, Close{},
+			Torn{}, []Effect{Teardown{}}),
+		Entry("running+close -> torn: stop + teardown",
+			Live{VADRunning: true}, Close{},
+			Torn{}, []Effect{StopVAD{}, Teardown{}}),
+		Entry("torn+setvad(on) -> torn, no-op (no resurrection)",
+			Torn{}, SetVAD{Active: true},
+			Torn{}, []Effect(nil)),
+		Entry("torn+close -> torn, no-op (idempotent)",
+			Torn{}, Close{},
+			Torn{}, []Effect(nil)),
+	)
+
+	It("is total over the defined (state, event) pairs", func() {
+		allStates := []State{Live{VADRunning: false}, Live{VADRunning: true}, Torn{}}
+		allEvents := []Event{SetVAD{Active: true}, SetVAD{Active: false}, Close{}}
+		for _, from := range allStates {
+			for _, ev := range allEvents {
+				_, _, err := Next(from, ev)
+				Expect(err).ToNot(HaveOccurred(), "Next(%s, %s)", from, ev)
+			}
+		}
+	})
+
+	It("errors on an unknown event type", func() {
+		_, _, failure := Next(Live{}, unknownEvent{})
+		Expect(failure).To(HaveOccurred())
+	})
+
+	It("errors on an unknown state type", func() {
+		_, _, failure := Next(unknownState{}, Close{})
+		Expect(failure).To(HaveOccurred())
+	})
+})
+
+var _ = Describe("conncoord.Coordinator", func() {
+	It("upholds the lifecycle invariants over random event sequences", func() {
+		for _, seed := range []uint64{1, 2, 3, 42, 1337, 0xC0FFEE} {
+			rng := rand.New(rand.NewPCG(seed, 0xA5A5A5A5))
+			sink := &recordingSink{}
+			coord := New(sink)
+			// Reference model the coordinator is compared against after each step.
+			vadUp, torn := false, false
+
+			for range 5000 {
+				switch pick := rng.IntN(3); pick {
+				case 0, 1:
+					// 0 requests VAD on, 1 requests VAD off; neither may change a
+					// session that is already torn down.
+					wantOn := pick == 0
+					Expect(coord.Apply(SetVAD{Active: wantOn})).To(Succeed())
+					if !torn {
+						vadUp = wantOn
+					}
+				case 2:
+					// Close always lands in Torn and leaves no VAD running.
+					Expect(coord.Apply(Close{})).To(Succeed())
+					torn = true
+					vadUp = false
+				}
+				// The coordinator's state must match the model exactly.
+				want := State(Live{VADRunning: vadUp})
+				if torn {
+					want = State(Torn{})
+				}
+				Expect(coord.State()).To(Equal(want), "seed=%d", seed)
+			}
+			// The recorded effects must also satisfy the log-level invariants.
+			checkLog(sink.snapshot())
+		}
+	})
+
+	It("tears down at most once under concurrent SetVAD/Close from two goroutines", func() {
+		const perGoroutine = 2000
+		sink := &recordingSink{}
+		coord := New(sink)
+
+		var drivers sync.WaitGroup
+		drive := func(active bool) {
+			defer drivers.Done()
+			for step := range perGoroutine {
+				switch step % 3 {
+				case 2:
+					if step > perGoroutine/2 {
+						_ = coord.Apply(Close{})
+					}
+				case 1:
+					_ = coord.Apply(SetVAD{Active: !active})
+				case 0:
+					_ = coord.Apply(SetVAD{Active: active})
+				}
+			}
+		}
+
+		drivers.Add(2)
+		go drive(true)
+		go drive(false)
+		drivers.Wait()
+		_ = coord.Apply(Close{})
+
+		checkLog(sink.snapshot())
+		Expect(coord.State()).To(Equal(State(Torn{})))
+	})
+})
+
+var _ = DescribeTable("conncoord stringers",
+	func(rendered, expected string) { Expect(rendered).To(Equal(expected)) },
+	Entry(nil, Live{VADRunning: true}.String(), "Live(vad=true)"),
+	Entry(nil, Live{VADRunning: false}.String(), "Live(vad=false)"),
+	Entry(nil, Torn{}.String(), "Torn"),
+
+	Entry(nil, SetVAD{Active: true}.String(), "SetVAD(true)"),
+	Entry(nil, Close{}.String(), "Close"),
+
+	Entry(nil, StartVAD{}.String(), "StartVAD"),
+	Entry(nil, StopVAD{}.String(), "StopVAD"),
+	Entry(nil, Teardown{}.String(), "Teardown"),
+)
diff --git a/core/http/endpoints/openai/coordinator/coordinator.go b/core/http/endpoints/openai/coordinator/coordinator.go
new file mode 100644
index 000000000..d8ae2fa43
--- /dev/null
+++ b/core/http/endpoints/openai/coordinator/coordinator.go
@@ -0,0 +1,82 @@
+// Package coordinator is the shared single-writer state-machine runtime for the
+// realtime API's explicit coordinators (machines M1–M5 in
+// docs/design/realtime-state-machines.md).
+//
+// Each machine package (respcoord, turncoord, conncoord, compactcoord, ttscoord)
+// defines its OWN sealed sum types for State/Event/Effect and a total, pure
+// transition function Next(state, event) -> (state, []effect, error). The
+// plumbing around that — a single-writer Coordinator that serializes every
+// transition behind one lock and performs the returned effects in order — is
+// identical across all five, so it lives here once instead of being copied.
+//
+// A machine package wires itself up with three lines:
+//
+//	type EffectSink = coordinator.Sink[Effect]
+//	type Coordinator = coordinator.Coordinator[State, Event, Effect]
+//	func New(sink EffectSink) *Coordinator { return coordinator.New[State, Event, Effect](Idle{}, Next, sink) }
+//
+// The aliases keep each package's public API (Coordinator, New, EffectSink,
+// Apply, State) unchanged. The single-writer serialization — the load-bearing
+// concurrency guarantee the FizzBee specs check — is therefore implemented and
+// reasoned about in exactly one place.
+package coordinator
+
+import "sync"
+
+// TransitionFunc is a machine's total, pure transition: (state, event) -> (next
+// state, ordered effects, error). The error is non-nil ONLY for an unhandled
+// (programmer-error) pair; Apply then discards the returned state and effects.
+// It runs under the Coordinator lock, so it must not perform I/O or block: side
+// effects are returned as data (F) for the Coordinator to hand to the Sink.
+type TransitionFunc[S, E, F any] func(state S, event E) (S, []F, error)
+
+// Sink performs the effects a transition produces. Perform runs while the
+// Coordinator holds its (non-reentrant) mutex, so it MUST NOT call Apply or State
+// on the same Coordinator from the calling goroutine (that would deadlock) and it
+// should not block: spawn a goroutine, call a cancel func, or do a non-blocking
+// channel send instead.
+type Sink[F any] interface {
+	Perform(F)
+}
+
+// Coordinator wraps a pure transition function with a single mutex. Every Apply
+// holds mu for the whole transition, so goroutines can drive the machine
+// concurrently without racing, and the effects of one transition are performed,
+// in order, before any later Apply or State call can observe the new state.
+type Coordinator[S, E, F any] struct {
+	mu    sync.Mutex
+	state S
+	next  TransitionFunc[S, E, F]
+	sink  Sink[F]
+}
+
+// New builds a Coordinator that starts in initial, steps via next and uses sink.
+func New[S, E, F any](initial S, next TransitionFunc[S, E, F], sink Sink[F]) *Coordinator[S, E, F] {
+	co := new(Coordinator[S, E, F])
+	co.state, co.next, co.sink = initial, next, sink
+	return co
+}
+
+// Apply feeds one event to the machine while holding the lock. When the
+// transition function succeeds the new state is stored and its effects are
+// handed to the sink in order; when it fails (an unhandled state/event) nothing
+// changes, no effect runs, and the error is returned to the caller.
+func (co *Coordinator[S, E, F]) Apply(e E) error {
+	co.mu.Lock()
+	defer co.mu.Unlock()
+	successor, effects, err := co.next(co.state, e)
+	if err == nil {
+		co.state = successor
+		for i := range effects {
+			co.sink.Perform(effects[i])
+		}
+	}
+	return err
+}
+
+// State returns the current state value; it is safe for concurrent use.
+func (co *Coordinator[S, E, F]) State() S {
+	co.mu.Lock()
+	defer co.mu.Unlock()
+	return co.state
+}
diff --git a/core/http/endpoints/openai/coordinator/coordinator_suite_test.go b/core/http/endpoints/openai/coordinator/coordinator_suite_test.go
new file mode 100644
index 000000000..8ea84eeea
--- /dev/null
+++ b/core/http/endpoints/openai/coordinator/coordinator_suite_test.go
@@ -0,0 +1,13 @@
+package coordinator
+
+import (
+	"testing"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+func TestCoordinator(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "coordinator (shared runtime) Suite")
+}
diff --git a/core/http/endpoints/openai/coordinator/coordinator_test.go b/core/http/endpoints/openai/coordinator/coordinator_test.go
new file mode 100644
index 000000000..2eec77124
--- /dev/null
+++ b/core/http/endpoints/openai/coordinator/coordinator_test.go
@@ -0,0 +1,124 @@
+package coordinator
+
+import (
+	"errors"
+	"fmt"
+	"sync"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// A tiny toy machine exercises the generic runtime directly (the five real
+// machines exercise it via their aliases, but the gate measures this package's
+// own coverage). off <-toggle-> on; burst emits three ordered effects; boom is
+// the unhandled/error path.
+type tstate int
+
+const (
+	off tstate = iota
+	on
+)
+
+type tevent int
+
+const (
+	toggle tevent = iota // flips off <-> on and emits "on" or "off"
+	burst                // keeps the state and emits "a", "b", "c"
+	boom                 // always rejected with an error
+)
+
+type teffect string // effect label; recordingSink logs these
+
+func tnext(s tstate, e tevent) (tstate, []teffect, error) { // transition function of the toy machine
+	switch e {
+	case toggle:
+		if s == off {
+			return on, []teffect{"on"}, nil
+		}
+		return off, []teffect{"off"}, nil // any state other than off goes to off
+	case burst:
+		return s, []teffect{"a", "b", "c"}, nil // state is unchanged
+	case boom:
+		return s, nil, errors.New("boom: unhandled")
+	}
+	return s, nil, fmt.Errorf("unknown event %d", int(e)) // e matched none of the cases above
+}
+
+type recordingSink struct {
+	mu  sync.Mutex // guards log
+	log []teffect  // effects in the order Perform received them
+}
+
+func (s *recordingSink) Perform(e teffect) { // appends e to the log under the sink's own lock
+	s.mu.Lock()
+	s.log = append(s.log, e)
+	s.mu.Unlock()
+}
+
+func (s *recordingSink) snapshot() []teffect {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	out := make([]teffect, len(s.log)) // copy, so the caller never aliases s.log
+	copy(out, s.log)
+	return out
+}
+
+var _ = Describe("coordinator.Coordinator", func() {
+	It("starts in the initial state", func() {
+		c := New[tstate, tevent, teffect](off, tnext, &recordingSink{})
+		Expect(c.State()).To(Equal(off))
+	})
+
+	It("advances state and performs the transition's effects", func() {
+		sink := &recordingSink{}
+		c := New[tstate, tevent, teffect](off, tnext, sink)
+
+		Expect(c.Apply(toggle)).To(Succeed())
+		Expect(c.State()).To(Equal(on))
+		Expect(c.Apply(toggle)).To(Succeed())
+		Expect(c.State()).To(Equal(off))
+
+		Expect(sink.snapshot()).To(Equal([]teffect{"on", "off"}))
+	})
+
+	It("performs multiple effects in order", func() {
+		sink := &recordingSink{}
+		c := New[tstate, tevent, teffect](off, tnext, sink)
+		Expect(c.Apply(burst)).To(Succeed())
+		Expect(sink.snapshot()).To(Equal([]teffect{"a", "b", "c"}))
+	})
+
+	It("returns the transition error and leaves state unchanged", func() {
+		sink := &recordingSink{}
+		c := New[tstate, tevent, teffect](on, tnext, sink)
+		err := c.Apply(boom)
+		Expect(err).To(HaveOccurred())
+		Expect(c.State()).To(Equal(on), "state unchanged on error")
+		Expect(sink.snapshot()).To(BeEmpty(), "no effects performed on error")
+	})
+
+	It("serializes concurrent Apply from many goroutines (run with -race)", func() {
+		const goroutines = 8
+		const each = 1000
+		sink := &recordingSink{}
+		c := New[tstate, tevent, teffect](off, tnext, sink)
+
+		var wg sync.WaitGroup
+		wg.Add(goroutines)
+		for range goroutines {
+			go func() {
+				defer wg.Done()
+				for range each {
+					_ = c.Apply(toggle) // tnext never returns an error for toggle
+				}
+			}()
+		}
+		wg.Wait()
+
+		// goroutines*each is even, so the toggles end back at off, and each one
+		// emits a single effect. The real check is race-freedom under -race.
+		Expect(c.State()).To(Equal(off))
+		Expect(sink.snapshot()).To(HaveLen(goroutines * each))
+	})
+})
diff --git a/core/http/endpoints/openai/realtime.go b/core/http/endpoints/openai/realtime.go
index d4d6a0ac4..94c8a1a65 100644
--- a/core/http/endpoints/openai/realtime.go
+++ b/core/http/endpoints/openai/realtime.go
@@ -12,7 +12,6 @@ import (
 	"os"
 	"strconv"
 	"sync"
-	"sync/atomic"
 	"time"
 
 	"net/http"
@@ -26,6 +25,8 @@ import (
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/http/auth"
 	mcpTools "github.com/mudler/LocalAI/core/http/endpoints/mcp"
+	"github.com/mudler/LocalAI/core/http/endpoints/openai/respcoord"
+	"github.com/mudler/LocalAI/core/http/endpoints/openai/turncoord"
 	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/core/templates"
@@ -168,44 +169,12 @@ type Session struct {
 	gateMu        sync.Mutex
 	voiceVerified bool
 
-	// Response cancellation: protects activeResponseCancel/activeResponseDone
-	responseMu           sync.Mutex
-	activeResponseCancel context.CancelFunc
-	activeResponseDone   chan struct{}
-}
-
-// cancelActiveResponse cancels any in-flight response and waits for its
-// goroutine to exit. This ensures we never have overlapping responses and
-// that interrupted responses are fully cleaned up before starting a new one.
-func (s *Session) cancelActiveResponse() {
-	s.responseMu.Lock()
-	cancel := s.activeResponseCancel
-	done := s.activeResponseDone
-	s.responseMu.Unlock()
-
-	if cancel != nil {
-		cancel()
-	}
-	if done != nil {
-		<-done
-	}
-}
-
-// startResponse cancels any active response and returns a new context for
-// the replacement response. The caller MUST close the returned done channel
-// when the response goroutine exits.
-func (s *Session) startResponse(parent context.Context) (context.Context, chan struct{}) {
-	s.cancelActiveResponse()
-
-	ctx, cancel := context.WithCancel(parent)
-	done := make(chan struct{})
-
-	s.responseMu.Lock()
-	s.activeResponseCancel = cancel
-	s.activeResponseDone = done
-	s.responseMu.Unlock()
-
-	return ctx, done
+	// respSink is the explicit response-coordination state machine (respcoord,
+	// machine M3). It replaces the legacy startResponse/cancelActiveResponse
+	// pair and its dual-writer activeResponse* fields: every start/cancel/finish
+	// decision is serialized through respcoord.Coordinator, guaranteeing at most
+	// one live response. See realtime_respcoord.go.
+	respSink *responseSink
 }
 
 func (s *Session) FromClient(session *types.SessionUnion) {
@@ -258,8 +227,10 @@ type Conversation struct {
 	// is kept out of Items (so trimRealtimeItems never drops it) and rendered
 	// as a system message right after the session instructions.
 	Memory string
-	// compacting ensures at most one background compaction runs per conversation.
-	compacting atomic.Bool
+	// compaction is the explicit single-flight compaction coordinator (M4): at
+	// most one background summarize+evict runs per conversation at a time. It
+	// replaces the legacy `compacting atomic.Bool`. See realtime_compactcoord.go.
+	compaction *compactionSink
 }
 
 func (c *Conversation) ToServer() types.Conversation {
@@ -288,6 +259,12 @@ type Model interface {
 	// sound-event tags. topK caps the number of returned tags (0 = backend
 	// default), threshold drops tags below the given score (0 = keep all).
 	SoundDetection(ctx context.Context, audio string, topK int, threshold float32) (*schema.SoundClassificationResult, error)
+	// TranscribeLive opens a live (bidirectional) transcription session on the
+	// pipeline's transcription backend, used by semantic_vad turn detection;
+	// onEvent fires from a background goroutine for every delta/EOU/final
+	// event. Backends without live support fail with an error satisfying
+	// grpcerrors.IsLiveTranscriptionUnsupported.
+	TranscribeLive(ctx context.Context, language string, onEvent func(backend.LiveTranscriptionEvent)) (backend.LiveTranscriptionSession, error)
 	PredictConfig() *config.ModelConfig
 }
 
@@ -513,14 +490,10 @@ func runRealtimeSession(application *application.Application, t Transport, model
 	// input_audio_buffer.commit. There is no transcription stage in that case.
 	soundOnly := cfg.Pipeline.SoundDetection != "" && cfg.Pipeline.Transcription == "" && cfg.Pipeline.LLM == ""
 
-	turnDetection := &types.TurnDetectionUnion{
-		ServerVad: &types.ServerVad{
-			Threshold:         0.5,
-			PrefixPaddingMs:   300,
-			SilenceDurationMs: 500,
-			CreateResponse:    true,
-		},
-	}
+	// defaultTurnDetection seeds server_vad by default, or semantic_vad when the
+	// pipeline opts in (turn_detection.type: semantic_vad); clients can still
+	// override per session via session.update.
+	turnDetection := defaultTurnDetection(cfg)
 	inputAudioTranscription := &types.AudioTranscription{Model: sttModel}
 	if soundOnly {
 		turnDetection = nil           // turn_detection none: no VAD
@@ -561,12 +534,27 @@ func runRealtimeSession(application *application.Application, t Transport, model
 	}
 	session.CompactionEnabled, session.CompactionTrigger, session.MaxSummaryTokens, session.SummaryModel = resolveCompaction(cfg, session.MaxHistoryItems)
 
+	// Single-writer response coordinator (machine M3). All response starts and
+	// cancels go through this, so the read-loop and VAD goroutine can never race
+	// into two overlapping responses (see realtime_respcoord.go).
+	session.respSink = newResponseSink()
+
 	// Create a default conversation
 	conversationID := generateConversationID()
 	conversation := &Conversation{
 		ID:    conversationID,
 		Items: []*types.MessageItemUnion{},
 	}
+	// The compaction coordinator's work closure resolves the summarizer (lazily
+	// loading a configured summary_model) and runs the summarize+evict off the
+	// response path — only when a compaction actually starts.
+	conversation.compaction = newCompactionSink(func(ctx context.Context) {
+		model := session.summarizerModel()
+		if model == nil {
+			return
+		}
+		session.compact(ctx, conversation, model)
+	})
 	session.Conversations[conversationID] = conversation
 	session.DefaultConversationID = conversationID
 
@@ -648,34 +636,22 @@ func runRealtimeSession(application *application.Application, t Transport, model
 	})
 
 	var (
-		msg  []byte
-		wg   sync.WaitGroup
-		done = make(chan struct{})
+		msg []byte
+		wg  sync.WaitGroup
 	)
 
-	vadServerStarted := false
-	toggleVAD := func() {
-		if session.TurnDetection != nil && session.TurnDetection.ServerVad != nil && !vadServerStarted {
-			xlog.Debug("Starting VAD goroutine...")
-			done = make(chan struct{})
-			wg.Go(func() {
-				conversation := session.Conversations[session.DefaultConversationID]
-				handleVAD(session, conversation, t, done)
-			})
-			vadServerStarted = true
-		} else if (session.TurnDetection == nil || session.TurnDetection.ServerVad == nil) && vadServerStarted {
-			xlog.Debug("Stopping VAD goroutine...")
-			close(done)
-			vadServerStarted = false
-		}
-	}
+	// M1 connection lifecycle. The VAD goroutine's run/stop (and its done channel)
+	// and the once-only teardown are owned by this coordinator, so the channel is
+	// closed exactly once and never resurrected after teardown (Part 2, failure
+	// mode 6; invariants #8, #10). See realtime_conncoord.go and conncoord/.
+	conn := newConnSink(session, sessionID, t, &wg)
+	toggleVAD := func() { conn.setVAD(turnDetectionActive(session.TurnDetection)) }
 
 	// For WebRTC sessions, start the Opus decode loop before VAD so that
 	// decoded PCM is already flowing when VAD's first tick fires.
-	var decodeDone chan struct{}
 	if wt, ok := t.(*WebRTCTransport); ok {
-		decodeDone = make(chan struct{})
-		go decodeOpusLoop(session, wt.opusBackend, decodeDone)
+		conn.decodeDone = make(chan struct{})
+		go decodeOpusLoop(session, wt.opusBackend, conn.decodeDone)
 	}
 
 	toggleVAD()
@@ -684,9 +660,9 @@ func runRealtimeSession(application *application.Application, t Transport, model
 	// with window/hop configured, the server classifies the last window of
 	// streamed audio on a timer, so the client only has to stream (no commits).
 	// This runs independent of VAD (sound events are not speech).
-	var soundWindowDone chan struct{}
 	if soundOnly && session.SoundDetectionWindowMs > 0 && session.SoundDetectionHopMs > 0 {
-		soundWindowDone = make(chan struct{})
+		conn.soundWindowDone = make(chan struct{})
+		soundWindowDone := conn.soundWindowDone
 		wg.Go(func() {
 			handleSoundWindow(session, t, soundWindowDone)
 		})
@@ -811,11 +787,11 @@ func runRealtimeSession(application *application.Application, t Transport, model
 			xlog.Debug("recv", "message", string(msg))
 
 			sessionLock.Lock()
-			isServerVAD := session.TurnDetection != nil && session.TurnDetection.ServerVad != nil
+			autoTurnDetection := turnDetectionActive(session.TurnDetection)
 			sessionLock.Unlock()
 
 			// TODO: At the least need to check locking and timer state in the VAD Go routine before allowing this
-			if isServerVAD {
+			if autoTurnDetection {
 				sendNotImplemented(t, "input_audio_buffer.commit in conjunction with VAD")
 				continue
 			}
@@ -831,11 +807,9 @@ func runRealtimeSession(application *application.Application, t Transport, model
 				ItemID:          generateItemID(),
 			})
 
-			respCtx, respDone := session.startResponse(context.Background())
-			go func() {
-				defer close(respDone)
-				commitUtterance(respCtx, allAudio, session, conversation, t)
-			}()
+			session.respSink.issue(context.Background(), respcoord.SourceClient, func(ctx context.Context) {
+				commitUtterance(ctx, allAudio, session, conversation, t)
+			})
 
 		case types.InputAudioBufferClearEvent:
 			xlog.Debug("recv", "message", string(msg))
@@ -968,15 +942,14 @@ func runRealtimeSession(application *application.Application, t Transport, model
 				conversation.Lock.Unlock()
 			}
 
-			respCtx, respDone := session.startResponse(context.Background())
-			go func() {
-				defer close(respDone)
-				triggerResponse(respCtx, session, conversation, t, &e.Response)
-			}()
+			resp := e.Response
+			session.respSink.issue(context.Background(), respcoord.SourceClient, func(ctx context.Context) {
+				triggerResponse(ctx, session, conversation, t, &resp)
+			})
 
 		case types.ResponseCancelEvent:
 			xlog.Debug("recv", "message", string(msg))
-			session.cancelActiveResponse()
+			session.respSink.cancel(respcoord.SourceClient)
 
 		default:
 			xlog.Error("unknown message type")
@@ -984,28 +957,11 @@ func runRealtimeSession(application *application.Application, t Transport, model
 		}
 	}
 
-	// Cancel any in-flight response before tearing down
-	session.cancelActiveResponse()
-
-	// Stop the Opus decode goroutine (if running)
-	if decodeDone != nil {
-		close(decodeDone)
-	}
-
-	// Signal any running VAD goroutine to exit.
-	if vadServerStarted {
-		close(done)
-	}
-	// Stop the server-side sound-detection windowing goroutine (if running).
-	if soundWindowDone != nil {
-		close(soundWindowDone)
-	}
-	wg.Wait()
-
-	// Remove the session from the sessions map
-	sessionLock.Lock()
-	delete(sessions, sessionID)
-	sessionLock.Unlock()
+	// Tear down through the connection coordinator (once). It stops any running
+	// VAD goroutine, then the opus-decode and sound-window goroutines, joins them,
+	// cancels the in-flight response and drains all response goroutines, and
+	// finally removes the session — all in dependency order, exactly once.
+	conn.close()
 }
 
 // sendEvent sends a server event via the transport, logging any errors.
@@ -1285,8 +1241,38 @@ func decodeOpusLoop(session *Session, opusBackend grpc.Backend, done chan struct
 	}
 }
 
+// noSpeechHoldbackSec is the length, in seconds, of the tail of an inspected,
+// segment-free buffer that survives the periodic no-speech clear. It must cover
+// the VAD's onset-detection latency: a word can already be underway in the
+// newest part of the window without silero having crossed its threshold yet,
+// and clearing it cuts the start of the utterance the next tick will detect.
+const noSpeechHoldbackSec = 0.5
+
+// dropInspectedPrefix removes the head of the audio buffer that a VAD tick
+// inspected. inspected is the byte length of that tick's snapshot; the newest
+// holdbackBytes of it are kept, along with everything appended while the tick
+// ran (audio the VAD never saw). If nothing would be cut, buf is returned
+// unchanged; otherwise the result is a fresh copy, never a sub-slice, so later
+// appends can't scribble on memory shared with the old backing array.
+func dropInspectedPrefix(buf []byte, inspected, holdbackBytes int) []byte {
+	cut := inspected - holdbackBytes
+	if cut <= 0 {
+		return buf
+	}
+	if cut > len(buf) {
+		cut = len(buf) // buf is now shorter than the cut point: drop all of it
+	}
+	return append([]byte(nil), buf[cut:]...)
+}
+
 // handleVAD is a goroutine that listens for audio data from the client,
-// runs VAD on the audio data, and commits utterances to the conversation
+// runs VAD on the audio data, and commits utterances to the conversation.
+//
+// With turn_detection.type == "semantic_vad" (sv != nil below) the silero
+// loop is augmented by a live transcription stream: the buffer's new audio
+// is fed to the transcription model every tick and its end-of-utterance
+// token switches the commit threshold between a short post-EOU window and
+// the long eagerness fallback. The server_vad path is untouched.
 func handleVAD(session *Session, conv *Conversation, t Transport, done chan struct{}) {
 	vadContext, cancel := context.WithCancel(context.Background())
 	go func() {
@@ -1299,9 +1285,22 @@ func handleVAD(session *Session, conv *Conversation, t Transport, done chan stru
 		silenceThreshold = float64(session.TurnDetection.ServerVad.SilenceDurationMs) / 1000
 	}
 
-	speechStarted := false
+	lts := newLiveTurnState(session, t)
 	startTime := time.Now()
 
+	// M2 turn-detection state machine. "Speech started" and "a turn's live ASR
+	// stream is open" are ONE coordinator state (Idle/Speaking), so they cannot
+	// desync the way the legacy speechStarted bool and lts.open() could (Part 2,
+	// failure mode 4). See realtime_turncoord.go and turncoord/.
+	sink := newTurnSink(session, conv, t, lts, vadContext, startTime)
+	// Teardown: end any open turn through the coordinator (DiscardTurn closes the
+	// live stream; no-op if already idle). Replaces the bare lts.discardTurn().
+	defer func() {
+		if err := sink.coord.Apply(turncoord.Abort{Reason: turncoord.AbortTeardown}); err != nil {
+			xlog.Error("turncoord: abort(teardown) failed", "error", err)
+		}
+	}()
+
 	ticker := time.NewTicker(300 * time.Millisecond)
 	defer ticker.Stop()
 
@@ -1310,6 +1309,30 @@ func handleVAD(session *Session, conv *Conversation, t Transport, done chan stru
 		case <-done:
 			return
 		case <-ticker.C:
+			// Semantic mode is re-read each tick: session.update can switch
+			// turn-detection modes (and the retranscribe gate) mid-session.
+			sessionLock.Lock()
+			var sv *types.RealtimeSessionSemanticVad
+			if session.TurnDetection != nil {
+				sv = session.TurnDetection.SemanticVad
+			}
+			retranscribe := sv != nil && session.ModelConfig != nil &&
+				session.ModelConfig.Pipeline.TurnDetectionRetranscribe()
+			sessionLock.Unlock()
+
+			// The turn coordinator's data-heavy effects (OpenTurn/CommitTurn)
+			// need this tick's mode; set it before any Apply below.
+			sink.sv = sv
+
+			// session.update switched semantic -> server mid-turn: drop the
+			// orphaned live stream. This is NOT a turn abort — the turn continues
+			// under server_vad (a config change must not cut off a mid-utterance
+			// speaker), so the coordinator stays Speaking; only the orphaned live
+			// stream is closed.
+			if sv == nil && lts.open() {
+				lts.discardTurn()
+			}
+
 			session.AudioBufferLock.Lock()
 			allAudio := make([]byte, len(session.InputAudioBuffer))
 			copy(allAudio, session.InputAudioBuffer)
@@ -1323,6 +1346,13 @@ func handleVAD(session *Session, conv *Conversation, t Transport, done chan stru
 			// Resample from InputSampleRate to 16kHz
 			aints = sound.ResampleInt16(aints, session.InputSampleRate, localSampleRate)
 
+			audioLength := float64(len(aints)) / localSampleRate
+
+			if sv != nil && lts.open() {
+				lts.feedNewAudio(aints)
+				lts.drainEvents(audioLength)
+			}
+
 			segments, err := runVAD(vadContext, session, aints)
 			if err != nil {
 				if err.Error() == "unexpected speech end" {
@@ -1334,31 +1364,52 @@ func handleVAD(session *Session, conv *Conversation, t Transport, done chan stru
 				continue
 			}
 
-			audioLength := float64(len(aints)) / localSampleRate
-
-			// TODO: When resetting the buffer we should retain a small postfix
+			// NOTE: the no-speech clear and the min-buffer gate above stay on
+			// the short silenceThreshold even in semantic mode — the eagerness
+			// fallback applies only to the end-of-speech commit decision, or a
+			// low eagerness would delay speech_started/barge-in by seconds.
 			if len(segments) == 0 && audioLength > silenceThreshold {
+				// "No segments" is not "no speech": silero (threshold 0.5)
+				// crosses up to a few hundred ms into a soft word onset, so
+				// the newest audio in the inspected window may be the start
+				// of a word the next tick will recognize — and more audio
+				// arrived while this tick ran. Keep both; drop only the
+				// older, confirmed-silent head, or utterance onsets get cut.
+				holdback := int(noSpeechHoldbackSec*float64(session.InputSampleRate)) * 2
 				session.AudioBufferLock.Lock()
-				session.InputAudioBuffer = nil
+				session.InputAudioBuffer = dropInspectedPrefix(session.InputAudioBuffer, len(allAudio), holdback)
 				session.AudioBufferLock.Unlock()
 
+				// No-speech clear: end any open turn (Speaking -> Idle, discarding
+				// the partial). Returning to Idle is the fix for failure mode 4 —
+				// the legacy discardTurn left speechStarted true, suppressing the
+				// next onset. Idle while not speaking is a no-op.
+				if err := sink.coord.Apply(turncoord.Abort{Reason: turncoord.AbortNoSpeech}); err != nil {
+					xlog.Error("turncoord: abort(no_speech) failed", "error", err)
+				}
 				continue
 			} else if len(segments) == 0 {
 				continue
 			}
 
-			if !speechStarted {
-				// Barge-in: cancel any in-flight response so we stop
-				// sending audio and don't keep the interrupted reply in history.
-				session.cancelActiveResponse()
+			// Speech detected this tick: open the turn (Idle -> Speaking) through
+			// the coordinator. On that transition it opens the turn's live ASR
+			// stream + feeds the buffered prefix (OpenTurn), cancels any in-flight
+			// response (BargeIn, non-blocking — the VAD tick is never stalled), and
+			// emits speech_started. While already Speaking it is a no-op, so "turn
+			// open" and "speech started" can never disagree. The turn id is minted
+			// here and carried by the coordinator through to the committed event.
+			sink.onsetAudio = aints
+			if err := sink.coord.Apply(turncoord.Onset{Turn: turncoord.TurnID(generateItemID())}); err != nil {
+				xlog.Error("turncoord: onset failed", "error", err)
+			}
 
-				sendEvent(t, types.InputAudioBufferSpeechStartedEvent{
-					ServerEventBase: types.ServerEventBase{
-						EventID: "event_TODO",
-					},
-					AudioStartMs: time.Since(startTime).Milliseconds(),
-				})
-				speechStarted = true
+			if sv != nil {
+				// Drain again: events produced by THIS tick's feed have
+				// usually arrived by the time runVAD returns, and leaving
+				// them for the next tick adds 300ms to every EOU-triggered
+				// commit.
+				lts.drainEvents(audioLength)
 			}
 
 			// Segment still in progress when audio ended
@@ -1367,41 +1418,90 @@ func handleVAD(session *Session, conv *Conversation, t Transport, done chan stru
 				continue
 			}
 
-			if float32(audioLength)-segEndTime > float32(silenceThreshold) {
+			threshold := silenceThreshold
+			eouPending := false
+			if sv != nil {
+				eouPending = lts.eouPending(segments)
+				threshold = lts.thresholdSec(eouPending, sv)
+			}
+
+			if float32(audioLength)-segEndTime > float32(threshold) {
+				if sv != nil {
+					trigger, eouLag := lts.commitTrigger(eouPending, float64(segEndTime))
+					xlog.Info("semantic_vad: committing turn",
+						"trigger", trigger,
+						"speech_end_s", segEndTime,
+						"eou_lag_s", eouLag,
+						"silence_s", audioLength-float64(segEndTime),
+						"audio_s", audioLength)
+				}
+				// Retranscribe gate (semantic mode, EOU-triggered commits
+				// only): cross-check the streamed EOU with an offline decode
+				// of the buffered turn before committing. Runs synchronously
+				// on the tick — the engine would serialize a concurrent feed
+				// against it anyway. Timeout-triggered commits skip the gate.
+				var gated *schema.TranscriptionResult
+				if retranscribe && eouPending {
+					batch, gerr := transcribeUtterance(vadContext, sound.Int16toBytesLE(aints), session)
+					switch {
+					case gerr != nil:
+						xlog.Warn("semantic_vad: retranscribe gate failed; committing via the file path", "error", gerr)
+					case !batch.Eou:
+						xlog.Info("semantic_vad: batch decode did not confirm the streamed EOU; continuing to listen",
+							"streamed", lts.previewText(), "batch", batch.Text)
+						// The batch decode rejected the streamed EOU as a false
+						// positive: consume the recorded EOU so the next tick
+						// falls back to the eagerness window instead of
+						// re-triggering on the same token.
+						lts.eouAtSec = 0
+						continue
+					default:
+						xlog.Info("semantic_vad: batch decode confirmed the streamed EOU",
+							"streamed", lts.previewText(), "batch", batch.Text)
+						gated = batch
+					}
+				}
+
 				xlog.Debug("Detected end of speech segment")
 				session.AudioBufferLock.Lock()
-				session.InputAudioBuffer = nil
+				// Keep audio appended while this tick ran — it belongs to
+				// the next turn (in any mode: nil-ing it dropped the onset
+				// of an utterance started right after a commit).
+				session.InputAudioBuffer = dropInspectedPrefix(session.InputAudioBuffer, len(allAudio), 0)
 				session.AudioBufferLock.Unlock()
 
-				sendEvent(t, types.InputAudioBufferSpeechStoppedEvent{
-					ServerEventBase: types.ServerEventBase{
-						EventID: "event_TODO",
-					},
-					AudioEndMs: time.Since(startTime).Milliseconds(),
-				})
-				speechStarted = false
-
-				sendEvent(t, types.InputAudioBufferCommittedEvent{
-					ServerEventBase: types.ServerEventBase{
-						EventID: "event_TODO",
-					},
-					ItemID:         generateItemID(),
-					PreviousItemID: "TODO",
-				})
-
-				abytes := sound.Int16toBytesLE(aints)
-				// TODO: Remove prefix silence that is is over TurnDetectionParams.PrefixPaddingMs
-				respCtx, respDone := session.startResponse(vadContext)
-				go func() {
-					defer close(respDone)
-					commitUtterance(respCtx, abytes, session, conv, t)
-				}()
+				// Commit the turn through the coordinator: it emits speech_stopped
+				// (EmitSpeechStopped) then the committed event, finalizes the live
+				// stream, and issues the response (CommitTurn). The committed item
+				// id is the coordinator's turn id (== the id the live captions
+				// streamed under), so the client replaces the partial text.
+				sink.commitAudio = sound.Int16toBytesLE(aints)
+				sink.commitAudioLength = audioLength
+				sink.commitRetranscribe = retranscribe
+				sink.commitGated = gated
+				// TODO: Remove prefix silence that is over TurnDetectionParams.PrefixPaddingMs
+				if err := sink.coord.Apply(turncoord.Silence{}); err != nil {
+					xlog.Error("turncoord: commit failed", "error", err)
+				}
 			}
 		}
 	}
 }
 
 func commitUtterance(ctx context.Context, utt []byte, session *Session, conv *Conversation, t Transport) {
+	commitUtteranceWithTranscript(ctx, utt, nil, nil, "", session, conv, t)
+}
+
+// commitUtteranceWithTranscript commits one user turn. live carries the
+// transcript semantic_vad's live stream already produced (its caption deltas
+// were streamed to the client during the turn, so only the completed event
+// is emitted here); gated carries the retranscribe gate's batch decode (the
+// authoritative transcript in that mode). With neither — server_vad, manual
+// commits, semantic degrade, or a live stream that heard nothing — the audio
+// is written to a temp WAV and transcribed via the file path as before.
+// itemID is the turn's conversation item id ("" mints a fresh one); it must
+// match the id any live deltas were sent under.
+func commitUtteranceWithTranscript(ctx context.Context, utt []byte, live *liveUtterance, gated *schema.TranscriptionResult, itemID string, session *Session, conv *Conversation, t Transport) {
 	if len(utt) == 0 {
 		return
 	}
@@ -1466,14 +1566,37 @@ func commitUtterance(ctx context.Context, utt []byte, session *Session, conv *Co
 	}
 
 	// TODO: If we have a real any-to-any model then transcription is optional
+
+	// The turn's live captions (semantic_vad) already streamed under this
+	// itemID; the completed event below reuses it so the client replaces the
+	// partial text. server_vad / manual commits arrive with no itemID, so mint
+	// one here.
+	if itemID == "" {
+		itemID = generateItemID()
+	}
+
 	var transcript string
 	switch {
+	case gated != nil:
+		// semantic_vad retranscribe gate: the batch decode is authoritative.
+		transcript = gated.Text
+		if err := emitPrecomputedTranscription(t, itemID, nil, transcript); err != nil {
+			sendError(t, "transcription_failed", err.Error(), "", "event_TODO")
+			return
+		}
+	case live != nil && live.Text != "":
+		// The caption deltas already streamed during the turn under this
+		// itemID; the completed event replaces the partial text client-side.
+		transcript = live.Text
+		if err := emitPrecomputedTranscription(t, itemID, nil, transcript); err != nil {
+			sendError(t, "transcription_failed", err.Error(), "", "event_TODO")
+			return
+		}
 	case session.InputAudioTranscription != nil:
 		// emitTranscription streams transcript deltas when
 		// pipeline.streaming.transcription is set, otherwise emits a single
 		// completed event; either way it returns the final transcript text.
-		var err error
-		transcript, err = emitTranscription(ctx, t, session, generateItemID(), f.Name())
+		transcript, err = emitTranscription(ctx, t, session, itemID, f.Name())
 		if err != nil {
 			// Drain the gate goroutine before returning so its in-flight read of
 			// the temp WAV finishes before the deferred os.Remove fires.
@@ -1642,6 +1765,56 @@ func writeWindowWAV(pcm []byte, sampleRate int) (string, error) {
 	return f.Name(), nil
 }
 
+// writeUtteranceWAV persists raw 16 kHz mono PCM to a temp WAV and returns its
+// path for the file-based transcription paths. The caller must invoke cleanup.
+func writeUtteranceWAV(utt []byte) (string, func(), error) {
+	f, err := os.CreateTemp("", "realtime-audio-chunk-*.wav")
+	if err != nil {
+		return "", nil, err // cleanup is nil on every error return
+	}
+	cleanup := func() { // closes and deletes the temp file
+		_ = f.Close()
+		_ = os.Remove(f.Name())
+	}
+	xlog.Debug("Writing to file", "file", f.Name())
+
+	hdr := laudio.NewWAVHeader(uint32(len(utt)))
+	if err := hdr.Write(f); err != nil {
+		cleanup()
+		return "", nil, err
+	}
+	if _, err := f.Write(utt); err != nil {
+		cleanup()
+		return "", nil, err
+	}
+	_ = f.Sync() // a Sync failure is ignored
+	return f.Name(), cleanup, nil // f stays open until cleanup runs
+}
+
+// transcribeUtterance runs one offline (unary) decode of the buffered turn via
+// a temp WAV — the semantic_vad retranscribe gate. Language and prompt come from
+// session.InputAudioTranscription when set; a nil result is returned as an error.
+func transcribeUtterance(ctx context.Context, utt []byte, session *Session) (*schema.TranscriptionResult, error) {
+	path, cleanup, err := writeUtteranceWAV(utt)
+	if err != nil {
+		return nil, err
+	}
+	defer cleanup()
+
+	language, prompt := "", ""
+	if cfg := session.InputAudioTranscription; cfg != nil {
+		language, prompt = cfg.Language, cfg.Prompt
+	}
+	tr, err := session.ModelInterface.Transcribe(ctx, path, language, false, false, prompt)
+	if err != nil {
+		return nil, err
+	}
+	if tr == nil {
+		return nil, fmt.Errorf("transcribe result is nil")
+	}
+	return tr, nil // tr.Eou: whether the batch decode also ended on the end-of-utterance token
+}
+
 func runVAD(ctx context.Context, session *Session, adata []int16) ([]schema.VADSegment, error) {
 	soundIntBuffer := &audio.IntBuffer{
 		Format:         &audio.Format{SampleRate: localSampleRate, NumChannels: 1},
@@ -1721,14 +1894,100 @@ func generateResponse(ctx context.Context, session *Session, utt []byte, transcr
 // without another response cycle.
 const maxAssistantToolTurns = 10
 
+// responseOutcome is how a response ended, decided by the response body and
+// read once by triggerResponse to emit the single terminal event.
+type responseOutcome int
+
+const (
+	outcomeCompleted responseOutcome = iota // zero value: terminal is response.done{completed}
+	outcomeCancelled                        // terminal is response.done{cancelled}
+	outcomeFailed                           // an error event was already sent; emit no terminal (legacy behavior)
+)
+
+// liveResponse accumulates the wire-visible result of ONE response.create across
+// the whole agentic tool-turn recursion: a single id, the output items as they
+// complete, the summed token usage, and the final outcome. triggerResponse owns
+// it; triggerResponseAtTurn / streamLLMResponse / emitToolCallItems fill it in.
+// This is what makes "exactly one response.done per response.create, with Output
+// and Usage populated" true — the body no longer emits per-turn terminals.
+type liveResponse struct {
+	id      string                   // response id shared by every turn of this response
+	output  []types.MessageItemUnion // completed items, in the order addItem was called
+	usage   backend.TokenUsage       // running Prompt/Completion totals (see addUsage)
+	outcome responseOutcome          // zero value is outcomeCompleted
+}
+
+func (r *liveResponse) addItem(it types.MessageItemUnion) { r.output = append(r.output, it) }
+
+func (r *liveResponse) addUsage(u backend.TokenUsage) {
+	r.usage.Prompt += u.Prompt
+	r.usage.Completion += u.Completion
+}
+
+// responseUsage converts the backend's prompt/completion token counts into the
+// OpenAI Realtime response.usage shape (input, output, total). When both
+// counts are zero it returns nil so the field is omitted rather than sent as
+// zeros.
+func responseUsage(u backend.TokenUsage) *types.TokenUsage {
+	in, out := u.Prompt, u.Completion
+	if in == 0 && out == 0 {
+		return nil
+	}
+	usage := types.TokenUsage{InputTokens: in, OutputTokens: out}
+	usage.TotalTokens = in + out
+	return &usage
+}
+
 func triggerResponse(ctx context.Context, session *Session, conv *Conversation, t Transport, overrides *types.ResponseCreateParams) {
-	triggerResponseAtTurn(ctx, session, conv, t, overrides, 0)
+	// A response.create yields exactly one response.created and at most one
+	// response.done, no matter how many inference turns the server-side tool
+	// loop runs. Tool turns are internal to this one response; the per-turn
+	// terminals of the legacy code (empty Output/Usage) no longer exist.
+	live := &liveResponse{id: generateUniqueID()}
+	sendEvent(t, types.ResponseCreatedEvent{
+		ServerEventBase: types.ServerEventBase{},
+		Response: types.Response{
+			ID:     live.id,
+			Object: "realtime.response",
+			Status: types.ResponseStatusInProgress,
+		},
+	})
+
+	triggerResponseAtTurn(ctx, session, conv, t, overrides, 0, live)
+
+	// Exactly one terminal, picked from the outcome the body recorded:
+	//   - failed:    a specific error event already went out, so no
+	//                response.done follows (matches the legacy behavior);
+	//   - cancelled: response.done{cancelled} with the items that completed
+	//                before the interruption and no usage;
+	//   - otherwise: response.done{completed} with the accumulated output and
+	//                the summed usage (nil when both token counts are zero).
+	// NOTE(review): usage gathered before a cancel is dropped here — confirm
+	// that is intended.
+	if live.outcome != outcomeFailed {
+		done := types.Response{
+			ID:     live.id,
+			Object: "realtime.response",
+			Status: types.ResponseStatusCompleted,
+			Output: live.output,
+		}
+		if live.outcome == outcomeCancelled {
+			done.Status = types.ResponseStatusCancelled
+		} else {
+			done.Usage = responseUsage(live.usage)
+		}
+		sendEvent(t, types.ResponseDoneEvent{
+			ServerEventBase: types.ServerEventBase{},
+			Response:        done,
+		})
+	}
+
 	// Fold aged-out turns into the rolling memory off the critical path; the
 	// next turn reaps the smaller buffer.
 	session.maybeCompact(conv)
 }
 
-func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversation, t Transport, overrides *types.ResponseCreateParams, toolTurn int) {
+func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversation, t Transport, overrides *types.ResponseCreateParams, toolTurn int, r *liveResponse) {
 	config := session.ModelInterface.PredictConfig()
 
 	// Default values
@@ -1891,15 +2150,9 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
 		images = append(images, m.StringImages...)
 	}
 
-	responseID := generateUniqueID()
-	sendEvent(t, types.ResponseCreatedEvent{
-		ServerEventBase: types.ServerEventBase{},
-		Response: types.Response{
-			ID:     responseID,
-			Object: "realtime.response",
-			Status: types.ResponseStatusInProgress,
-		},
-	})
+	// response.created/done are emitted once per response.create by triggerResponse;
+	// every turn (including agentic recursion) shares this id.
+	responseID := r.id
 
 	// Streamed LLM path: when the pipeline opts into LLM streaming, stream the
 	// transcript to the client as it is generated and synthesize the buffered
@@ -1915,7 +2168,7 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
 			respMods = overrides.OutputModalities
 		}
 		if canStream && modalitiesContainAudio(resolveOutputModalities(session.OutputModalities, respMods)) {
-			if streamLLMResponse(ctx, session, conv, t, responseID, conversationHistory, images, config, tools, toolChoice, toolTurn) {
+			if streamLLMResponse(ctx, session, conv, t, r, conversationHistory, images, config, tools, toolChoice, toolTurn) {
 				return
 			}
 		}
@@ -1924,26 +2177,22 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
 	predFunc, err := session.ModelInterface.Predict(ctx, conversationHistory, images, nil, nil, nil, tools, toolChoice, nil, nil, nil)
 	if err != nil {
 		sendError(t, "inference_failed", fmt.Sprintf("backend error: %v", err), "", "") // item.Assistant.ID is unknown here
+		r.outcome = outcomeFailed
 		return
 	}
 
 	pred, err := predFunc()
 	if err != nil {
 		sendError(t, "prediction_failed", fmt.Sprintf("backend error: %v", err), "", "")
+		r.outcome = outcomeFailed
 		return
 	}
+	r.addUsage(pred.Usage)
 
 	// Check for cancellation after LLM inference (barge-in may have fired)
 	if ctx.Err() != nil {
 		xlog.Debug("Response cancelled after LLM inference (barge-in)")
-		sendEvent(t, types.ResponseDoneEvent{
-			ServerEventBase: types.ServerEventBase{},
-			Response: types.Response{
-				ID:     responseID,
-				Object: "realtime.response",
-				Status: types.ResponseStatusCancelled,
-			},
-		})
+		r.outcome = outcomeCancelled
 		return
 	}
 
@@ -2103,18 +2352,12 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
 			conv.Lock.Unlock()
 		}
 
-		// sendCancelledResponse emits the cancelled status and cleans up the
-		// assistant item so the interrupted reply is not in chat history.
+		// sendCancelledResponse records the cancelled outcome (triggerResponse
+		// emits the single terminal) and cleans up the partial assistant item so
+		// the interrupted reply is not in chat history.
 		sendCancelledResponse := func() {
 			removeItemFromConv(item.Assistant.ID)
-			sendEvent(t, types.ResponseDoneEvent{
-				ServerEventBase: types.ServerEventBase{},
-				Response: types.Response{
-					ID:     responseID,
-					Object: "realtime.response",
-					Status: types.ResponseStatusCancelled,
-				},
-			})
+			r.outcome = outcomeCancelled
 		}
 
 		var audioString string
@@ -2163,6 +2406,7 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
 				}
 				xlog.Error("TTS failed", "error", err)
 				sendError(t, "tts_error", fmt.Sprintf("TTS generation failed: %v", err), "", item.Assistant.ID)
+				r.outcome = outcomeFailed
 				return
 			}
 			if !isWebRTC {
@@ -2220,12 +2464,13 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
 			OutputIndex:     0,
 			Item:            item,
 		})
+		r.addItem(item)
 	}
 
-	// Emit the parsed tool calls, the terminal response.done, and (for
-	// server-side assistant tools) the follow-up response. Shared with the
-	// streamed path so both finalize tool calls identically.
-	emitToolCallItems(ctx, session, conv, t, responseID, finalToolCalls, finalSpeech != "", toolTurn)
+	// Emit the parsed tool calls and (for server-side assistant tools) the
+	// follow-up turn. Shared with the streamed path so both finalize tool calls
+	// identically. The single terminal is emitted by triggerResponse.
+	emitToolCallItems(ctx, session, conv, t, r, finalToolCalls, finalSpeech != "", toolTurn)
 }
 
 // emitToolCallItems emits the realtime function_call items for the parsed tool
@@ -2239,7 +2484,8 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
 //   - All other tools follow the standard OpenAI flow: emit
 //     function_call_arguments.done and wait for the client to send
 //     conversation.item.create back.
-func emitToolCallItems(ctx context.Context, session *Session, conv *Conversation, t Transport, responseID string, toolCalls []functions.FuncCallResults, hasContent bool, toolTurn int) {
+func emitToolCallItems(ctx context.Context, session *Session, conv *Conversation, t Transport, r *liveResponse, toolCalls []functions.FuncCallResults, hasContent bool, toolTurn int) {
+	responseID := r.id
 	xlog.Debug("About to handle tool calls", "finalToolCallsCount", len(toolCalls))
 	executedAssistantTool := false
 	for i, tc := range toolCalls {
@@ -2302,6 +2548,7 @@ func emitToolCallItems(ctx context.Context, session *Session, conv *Conversation
 				OutputIndex:     outputIndex,
 				Item:            fcItem,
 			})
+			r.addItem(fcItem)
 			sendEvent(t, types.ResponseOutputItemAddedEvent{
 				ServerEventBase: types.ServerEventBase{},
 				ResponseID:      responseID,
@@ -2314,6 +2561,7 @@ func emitToolCallItems(ctx context.Context, session *Session, conv *Conversation
 				OutputIndex:     outputIndex,
 				Item:            foItem,
 			})
+			r.addItem(foItem)
 			executedAssistantTool = true
 			continue
 		}
@@ -2343,28 +2591,25 @@ func emitToolCallItems(ctx context.Context, session *Session, conv *Conversation
 			OutputIndex:     outputIndex,
 			Item:            fcItem,
 		})
+		r.addItem(fcItem)
 	}
 
-	sendEvent(t, types.ResponseDoneEvent{
-		ServerEventBase: types.ServerEventBase{},
-		Response: types.Response{
-			ID:     responseID,
-			Object: "realtime.response",
-			Status: types.ResponseStatusCompleted,
-		},
-	})
+	// No terminal here: triggerResponse emits the single response.done once the
+	// whole turn (including the agentic recursion below) completes.
 
 	// If we executed any assistant tools inproc, run another response cycle
 	// so the model can speak the result. Mirrors the chat-side agentic loop
 	// but driven server-side rather than by client round-trip. Bounded so a
-	// degenerate "model keeps calling tools" doesn't blow the stack.
+	// degenerate "model keeps calling tools" doesn't blow the stack. The
+	// follow-up turn shares the same liveResponse, so its output accumulates
+	// into the one response.done.
 	if executedAssistantTool {
 		if toolTurn+1 >= maxAssistantToolTurns {
 			xlog.Warn("realtime: assistant tool-turn limit reached, stopping the agentic loop",
 				"limit", maxAssistantToolTurns, "model", session.Model)
 			return
 		}
-		triggerResponseAtTurn(ctx, session, conv, t, nil, toolTurn+1)
+		triggerResponseAtTurn(ctx, session, conv, t, nil, toolTurn+1, r)
 	}
 }
 
diff --git a/core/http/endpoints/openai/realtime_compactcoord.go b/core/http/endpoints/openai/realtime_compactcoord.go
new file mode 100644
index 000000000..10c6304f4
--- /dev/null
+++ b/core/http/endpoints/openai/realtime_compactcoord.go
@@ -0,0 +1,79 @@
+package openai
+
+import (
+	"context"
+	"sync"
+
+	"github.com/mudler/LocalAI/core/http/endpoints/openai/compactcoord"
+	"github.com/mudler/xlog"
+)
+
+// compactionSink wires the explicit compaction state machine
+// (compactcoord.Coordinator — machine "M4" in docs/design/realtime-state-machines.md)
+// into a conversation.
+//
+// It replaces the legacy `compacting atomic.Bool` single-flight guard: the
+// coordinator owns whether a compaction is running, so a Trigger while one is
+// already in flight is dropped (single-flight) and the background goroutine
+// always reports Finished — the flag can never stick (invariant #9).
+//
+// run is the summarize+evict work for this conversation (captured at
+// construction); StartCompaction spawns it and reports Finished when it returns.
+// It takes a context derived from the sink's session-scoped ctx, so shutdown()
+// can cancel an in-flight compaction.
+type compactionSink struct {
+	coord  *compactcoord.Coordinator // state machine; invokes Perform for its effects
+	run    func(ctx context.Context) // compaction work; may be nil (then nothing runs)
+	ctx    context.Context           // handed to every run; cancelled by shutdown
+	cancel context.CancelFunc        // cancels ctx
+	wg     sync.WaitGroup            // tracks the goroutine spawned by StartCompaction
+}
+
+func newCompactionSink(run func(ctx context.Context)) *compactionSink {
+	ctx, cancel := context.WithCancel(context.Background())
+	sink := &compactionSink{run: run, ctx: ctx, cancel: cancel}
+	sink.coord = compactcoord.New(sink) // the sink is handed over as the effect handler
+	return sink
+}
+
+// trigger applies a Trigger event to the coordinator; an Apply error is only
+// logged. A no-op while one is already running or after shutdown. Non-blocking.
+func (s *compactionSink) trigger() {
+	if err := s.coord.Apply(compactcoord.Trigger{}); err != nil {
+		xlog.Error("compactcoord: trigger failed", "error", err)
+	}
+}
+
+// shutdown is called by the connection (M1) parent's teardown: cancel any
+// in-flight compaction, join it, then move the coordinator to Terminated so no
+// compaction can start afterwards. This closes the legacy gap where the
+// fire-and-forget compaction goroutine could outlive the session. Cancelling the
+// context first makes the in-flight summarizer Predict return promptly, so the
+// join is bounded.
+func (s *compactionSink) shutdown() {
+	s.cancel()  // cancel the ctx handed to run
+	s.wg.Wait() // join the goroutine spawned by StartCompaction, if any
+	if err := s.coord.Apply(compactcoord.Shutdown{}); err != nil {
+		xlog.Error("compactcoord: shutdown apply failed", "error", err)
+	}
+}
+
+// Perform executes one effect. Called under the coordinator lock; StartCompaction
+// only spawns a goroutine, so it does not block.
+func (s *compactionSink) Perform(e compactcoord.Effect) {
+	switch e.(type) {
+	case compactcoord.StartCompaction:
+		s.wg.Add(1) // counted before the goroutine starts so a later Wait covers it
+		go func() {
+			defer s.wg.Done() // deferred first, so it runs last: after Finished is applied
+			defer func() {
+				if err := s.coord.Apply(compactcoord.Finished{}); err != nil {
+					xlog.Error("compactcoord: finished apply failed", "error", err)
+				}
+			}()
+			if s.run != nil {
+				s.run(s.ctx)
+			}
+		}()
+	}
+}
diff --git a/core/http/endpoints/openai/realtime_compaction.go b/core/http/endpoints/openai/realtime_compaction.go
index f79a2d7a2..3b1967465 100644
--- a/core/http/endpoints/openai/realtime_compaction.go
+++ b/core/http/endpoints/openai/realtime_compaction.go
@@ -222,7 +222,7 @@ func prefixMatches(items, snapshot []*types.MessageItemUnion) bool {
 // conv.Lock across the summarizer call: snapshot under lock, summarize unlocked,
 // commit under lock (re-validating the head is unchanged). On any error it
 // leaves the conversation untouched — items are never dropped without a summary.
-func (s *Session) compact(conv *Conversation, model Model) {
+func (s *Session) compact(ctx context.Context, conv *Conversation, model Model) {
 	if model == nil {
 		return
 	}
@@ -241,9 +241,10 @@ func (s *Session) compact(conv *Conversation, model Model) {
 	prior := conv.Memory
 	conv.Lock.Unlock()
 
-	// Summarize (unlocked).
+	// Summarize (unlocked). The timeout is derived from the caller's ctx so the
+	// connection teardown can cancel an in-flight summary (bounding the join).
 	msgs := buildSummaryMessages(prior, renderItemsTranscript(overflow), s.MaxSummaryTokens)
-	ctx, cancel := context.WithTimeout(context.Background(), compactionTimeout)
+	ctx, cancel := context.WithTimeout(ctx, compactionTimeout)
 	defer cancel()
 	predFunc, err := model.Predict(ctx, msgs, nil, nil, nil, nil, nil, nil, nil, nil, nil)
 	if err != nil {
@@ -298,9 +299,13 @@ func (s *Session) summarizerModel() Model {
 }
 
 // maybeCompact schedules a background compaction when the live buffer has grown
-// past the trigger and none is already running. Returns immediately.
+// past the trigger and none is already running. Returns immediately. The
+// single-flight guarantee (at most one compaction per conversation) is owned by
+// the compaction coordinator (M4); see realtime_compactcoord.go. The actual
+// summarize+evict work (and the lazy summary_model load) is the conversation's
+// compaction-sink run closure, so it stays off the response path.
 func (s *Session) maybeCompact(conv *Conversation) {
-	if !s.CompactionEnabled {
+	if !s.CompactionEnabled || conv.compaction == nil {
 		return
 	}
 	conv.Lock.Lock()
@@ -309,18 +314,5 @@ func (s *Session) maybeCompact(conv *Conversation) {
 	if !over {
 		return
 	}
-	if !conv.compacting.CompareAndSwap(false, true) {
-		return
-	}
-	go func() {
-		defer conv.compacting.Store(false)
-		// Resolve (and, for a configured summary_model, lazily load) the
-		// summarizer only when a compaction actually runs, off the response
-		// path — so the model load never blocks a user turn.
-		model := s.summarizerModel()
-		if model == nil {
-			return
-		}
-		s.compact(conv, model)
-	}()
+	conv.compaction.trigger()
 }
diff --git a/core/http/endpoints/openai/realtime_compaction_test.go b/core/http/endpoints/openai/realtime_compaction_test.go
index 5b19a8259..dd8180497 100644
--- a/core/http/endpoints/openai/realtime_compaction_test.go
+++ b/core/http/endpoints/openai/realtime_compaction_test.go
@@ -1,6 +1,7 @@
 package openai
 
 import (
+	"context"
 	"errors"
 
 	. "github.com/onsi/ginkgo/v2"
@@ -198,7 +199,7 @@ var _ = Describe("compact", func() {
 		s := &Session{CompactionEnabled: true, CompactionTrigger: 7, MaxHistoryItems: 4, MaxSummaryTokens: 512}
 		m := &fakeModel{predictResp: backend.LLMResponse{Response: "ROLLED UP"}}
 
-		s.compact(conv, m)
+		s.compact(context.Background(), conv, m)
 
 		Expect(conv.Memory).To(Equal("ROLLED UP"))
 		Expect(len(conv.Items)).To(Equal(4))
@@ -213,7 +214,7 @@ var _ = Describe("compact", func() {
 		s := &Session{CompactionEnabled: true, CompactionTrigger: 2, MaxHistoryItems: 1, MaxSummaryTokens: 512}
 		m := &fakeModel{predictErr: errors.New("boom")}
 
-		s.compact(conv, m)
+		s.compact(context.Background(), conv, m)
 
 		Expect(conv.Memory).To(Equal(""))
 		Expect(len(conv.Items)).To(Equal(3))
@@ -227,7 +228,7 @@ var _ = Describe("compact", func() {
 		s := &Session{CompactionEnabled: true, CompactionTrigger: 7, MaxHistoryItems: 4, MaxSummaryTokens: 512}
 		m := &fakeModel{predictResp: backend.LLMResponse{Response: "<think>planning the summary</think>CLEAN SUMMARY"}}
 
-		s.compact(conv, m)
+		s.compact(context.Background(), conv, m)
 
 		Expect(conv.Memory).To(Equal("CLEAN SUMMARY"))
 		Expect(conv.Memory).ToNot(ContainSubstring("planning"))
@@ -236,7 +237,7 @@ var _ = Describe("compact", func() {
 	It("does nothing when items are at or below the trigger", func() {
 		conv := &Conversation{Items: []*types.MessageItemUnion{user("1", "a")}}
 		s := &Session{CompactionEnabled: true, CompactionTrigger: 7, MaxHistoryItems: 4}
-		s.compact(conv, &fakeModel{predictResp: backend.LLMResponse{Response: "x"}})
+		s.compact(context.Background(), conv, &fakeModel{predictResp: backend.LLMResponse{Response: "x"}})
 		Expect(conv.Memory).To(Equal(""))
 		Expect(len(conv.Items)).To(Equal(1))
 	})
diff --git a/core/http/endpoints/openai/realtime_conncoord.go b/core/http/endpoints/openai/realtime_conncoord.go
new file mode 100644
index 000000000..0dc6016bf
--- /dev/null
+++ b/core/http/endpoints/openai/realtime_conncoord.go
@@ -0,0 +1,122 @@
+package openai
+
+import (
+	"sync"
+
+	"github.com/mudler/LocalAI/core/http/endpoints/openai/conncoord"
+	"github.com/mudler/xlog"
+)
+
+// connSink wires the explicit connection-lifecycle state machine
+// (conncoord.Coordinator — machine "M1" in docs/design/realtime-state-machines.md)
+// into the realtime session handler.
+//
+// It replaces the legacy vadServerStarted bool + the `done` channel that was
+// reassigned on every turn-detection toggle and closed from two sites (Part 2,
+// failure mode 6). The coordinator owns whether the VAD goroutine is running, so
+// the per-run done channel is created and closed in lockstep with that one state
+// — closed exactly once, never resurrected after teardown.
+//
+// The connection machine is driven by the single session goroutine (the handler
+// loop and its teardown), so this sink and its coordinator are loop-local; the
+// Coordinator's lock only keeps State() race-free.
+//
+// Effects:
+//   - StartVAD: create a fresh done channel and spawn handleVAD on it (joined via wg).
+//   - StopVAD:  close that done channel.
+//   - Teardown: stop the remaining input goroutines (opus decode, sound window),
+//     join everything, cancel in-flight responses, and remove the session — once.
+type connSink struct {
+	session   *Session        // session whose respSink and conversations Teardown shuts down
+	sessionID string          // key deleted from the sessions map at Teardown
+	transport Transport       // passed through to handleVAD
+	wg        *sync.WaitGroup // runs the VAD goroutine; waited on in Teardown
+
+	coord *conncoord.Coordinator // state machine; invokes Perform for its effects
+
+	// vadDone is the current VAD run's stop signal — recreated on each StartVAD,
+	// closed by StopVAD / Teardown. Owned solely by Perform (single goroutine).
+	vadDone chan struct{}
+
+	// One-shot stop signals for the other input goroutines, registered by the
+	// handler when it starts them; closed once by Teardown.
+	decodeDone      chan struct{}
+	soundWindowDone chan struct{}
+}
+
+func newConnSink(session *Session, sessionID string, t Transport, wg *sync.WaitGroup) *connSink {
+	sink := new(connSink)
+	sink.session = session
+	sink.sessionID = sessionID
+	sink.transport = t
+	sink.wg = wg
+	// Wire the coordinator last: it is handed the sink as its effect handler.
+	sink.coord = conncoord.New(sink)
+	return sink
+}
+
+// setVAD applies SetVAD{active} so the VAD goroutine matches active. Idempotent; errors are only logged.
+func (s *connSink) setVAD(active bool) {
+	if err := s.coord.Apply(conncoord.SetVAD{Active: active}); err != nil {
+		xlog.Error("conncoord: setVAD failed", "error", err)
+	}
+}
+
+// close applies Close to tear the session down (once); errors are only logged. Safe from multiple exit paths.
+func (s *connSink) close() {
+	if err := s.coord.Apply(conncoord.Close{}); err != nil {
+		xlog.Error("conncoord: close failed", "error", err)
+	}
+}
+
+// Perform executes one effect. Called by Coordinator.Apply under the coordinator
+// lock; the connection coordinator is single-writer and torn down exactly once at
+// the end of the session goroutine, so the blocking joins in Teardown never
+// contend the lock.
+func (s *connSink) Perform(e conncoord.Effect) {
+	switch e.(type) {
+	case conncoord.StartVAD:
+		xlog.Debug("Starting VAD goroutine...")
+		s.vadDone = make(chan struct{})
+		done := s.vadDone // capture this run's channel: StopVAD later resets s.vadDone to nil
+		s.wg.Go(func() {
+			conversation := s.session.Conversations[s.session.DefaultConversationID]
+			handleVAD(s.session, conversation, s.transport, done)
+		})
+	case conncoord.StopVAD:
+		xlog.Debug("Stopping VAD goroutine...")
+		close(s.vadDone) // NOTE(review): panics on a nil channel — relies on StopVAD only following StartVAD
+		s.vadDone = nil
+	case conncoord.Teardown:
+		// Tear down in dependency order, driving every child machine to its
+		// terminal state so none outlives the session (the hierarchy invariant in
+		// formal-verification/session_lifecycle.fizz: conn Torn => children terminal).
+		//
+		// 1. Stop the remaining input goroutines and join them. wg.Wait also joins
+		//    the VAD goroutine (M2), assumed already signalled via StopVAD — verify.
+		if s.decodeDone != nil {
+			close(s.decodeDone)
+		}
+		if s.soundWindowDone != nil {
+			close(s.soundWindowDone)
+		}
+		s.wg.Wait()
+
+		// 2. Terminate the response coordinator (M3): cancel the in-flight response
+		//    and join all response goroutines (which also closes their TTS
+		//    pipelines, M5). After this no response can start.
+		s.session.respSink.shutdown()
+
+		// 3. Terminate every conversation's compaction coordinator (M4): cancel +
+		//    join any in-flight summarize+evict so it cannot outlive the session.
+		for _, conv := range s.session.Conversations {
+			if conv.compaction != nil {
+				conv.compaction.shutdown()
+			}
+		}
+
+		sessionLock.Lock()
+		delete(sessions, s.sessionID) // last step: only after every child above has been joined
+		sessionLock.Unlock()
+	}
+}
diff --git a/core/http/endpoints/openai/realtime_doubles_test.go b/core/http/endpoints/openai/realtime_doubles_test.go
index 10e608c17..6dc1c6ca5 100644
--- a/core/http/endpoints/openai/realtime_doubles_test.go
+++ b/core/http/endpoints/openai/realtime_doubles_test.go
@@ -74,6 +74,16 @@ type fakeModel struct {
 
 	transcribeDeltas []string
 	transcribeFinal  *schema.TranscriptionResult
+	transcribeErr    error
+
+	// TranscribeLive scripting: liveErr makes the open fail (degrade path);
+	// liveEvents are delivered to onEvent synchronously at open;
+	// liveCloseEvents are delivered during Close (the finalize flush).
+	liveErr         error
+	liveEvents      []backend.LiveTranscriptionEvent
+	liveCloseEvents []backend.LiveTranscriptionEvent
+	liveOpened      int
+	liveSession     *fakeLiveSession
 
 	// soundDetectionResult/soundDetectionErr drive the SoundDetection double so
 	// the sound-event path can be exercised deterministically.
@@ -97,7 +107,7 @@ func (m *fakeModel) VAD(context.Context, *schema.VADRequest) (*schema.VADRespons
 }
 
 func (m *fakeModel) Transcribe(context.Context, string, string, bool, bool, string) (*schema.TranscriptionResult, error) {
-	return m.transcribeFinal, nil
+	return m.transcribeFinal, m.transcribeErr // scripted result and error, returned together
 }
 
 func (m *fakeModel) SoundDetection(context.Context, string, int, float32) (*schema.SoundClassificationResult, error) {
@@ -150,4 +160,43 @@ func (m *fakeModel) TranscribeStream(_ context.Context, _, _ string, _, _ bool,
 	return m.transcribeFinal, nil
 }
 
+func (m *fakeModel) TranscribeLive(_ context.Context, _ string, onEvent func(backend.LiveTranscriptionEvent)) (backend.LiveTranscriptionSession, error) {
+	if m.liveErr != nil {
+		return nil, m.liveErr // scripted open failure; liveOpened is not incremented
+	}
+	m.liveOpened++
+	for _, ev := range m.liveEvents {
+		onEvent(ev) // delivered synchronously, before the session is returned
+	}
+	m.liveSession = &fakeLiveSession{onEvent: onEvent, closeEvents: m.liveCloseEvents} // kept so tests can inspect it
+	return m.liveSession, nil
+}
+
 func (m *fakeModel) PredictConfig() *config.ModelConfig { return m.cfg }
+
+// fakeLiveSession records what semantic_vad fed and closed; closeEvents are
+// replayed through onEvent during Close, mimicking the backend's finalize
+// flush (trailing delta + Final) landing before Close returns.
+type fakeLiveSession struct {
+	onEvent     func(backend.LiveTranscriptionEvent) // callback captured when the session was opened
+	closeEvents []backend.LiveTranscriptionEvent     // replayed through onEvent by Close
+	fed         [][]float32                          // private copy of each Feed payload, in call order
+	feedErr     error                                // when non-nil, Feed returns it and records nothing
+	closed      int                                  // number of Close calls
+}
+
+func (s *fakeLiveSession) Feed(pcm []float32) error {
+	if s.feedErr == nil {
+		// Record a private copy so the caller reusing pcm cannot alter it.
+		s.fed = append(s.fed, append([]float32(nil), pcm...))
+	}
+	return s.feedErr
+}
+
+func (s *fakeLiveSession) Close() error {
+	s.closed++ // counted on every call, so tests can detect a double Close
+	for _, ev := range s.closeEvents {
+		s.onEvent(ev) // replayed synchronously, i.e. before Close returns
+	}
+	return nil
+}
diff --git a/core/http/endpoints/openai/realtime_model.go b/core/http/endpoints/openai/realtime_model.go
index 0dafa0a35..71f553980 100644
--- a/core/http/endpoints/openai/realtime_model.go
+++ b/core/http/endpoints/openai/realtime_model.go
@@ -102,6 +102,10 @@ func (m *transcriptOnlyModel) TranscribeStream(ctx context.Context, audio, langu
 	return transcribeStream(ctx, m.modelLoader, *m.TranscriptionConfig, m.appConfig, audio, language, translate, diarize, prompt, onDelta)
 }
 
+func (m *transcriptOnlyModel) TranscribeLive(ctx context.Context, language string, onEvent func(backend.LiveTranscriptionEvent)) (backend.LiveTranscriptionSession, error) {
+	return backend.ModelTranscriptionLive(ctx, language, m.modelLoader, *m.TranscriptionConfig, m.appConfig, onEvent) // dereferences TranscriptionConfig: must be non-nil
+}
+
 func (m *transcriptOnlyModel) PredictConfig() *config.ModelConfig {
 	return nil
 }
@@ -348,6 +352,10 @@ func (m *wrappedModel) TranscribeStream(ctx context.Context, audio, language str
 	return transcribeStream(ctx, m.modelLoader, *m.TranscriptionConfig, m.appConfig, audio, language, translate, diarize, prompt, onDelta)
 }
 
+func (m *wrappedModel) TranscribeLive(ctx context.Context, language string, onEvent func(backend.LiveTranscriptionEvent)) (backend.LiveTranscriptionSession, error) {
+	return backend.ModelTranscriptionLive(ctx, language, m.modelLoader, *m.TranscriptionConfig, m.appConfig, onEvent) // dereferences TranscriptionConfig: must be non-nil
+}
+
 func (m *wrappedModel) PredictConfig() *config.ModelConfig {
 	return m.LLMConfig
 }
diff --git a/core/http/endpoints/openai/realtime_respcoord.go b/core/http/endpoints/openai/realtime_respcoord.go
new file mode 100644
index 000000000..c34ef8bd4
--- /dev/null
+++ b/core/http/endpoints/openai/realtime_respcoord.go
@@ -0,0 +1,143 @@
+package openai
+
+import (
+	"context"
+	"sync"
+	"sync/atomic"
+
+	"github.com/mudler/LocalAI/core/http/endpoints/openai/respcoord"
+	"github.com/mudler/xlog"
+)
+
+// responseSink wires the explicit response-coordination state machine
+// (respcoord.Coordinator — machine "M3" in docs/design/realtime-state-machines.md)
+// into a realtime session.
+//
+// It replaces the legacy startResponse/cancelActiveResponse pair, whose
+// activeResponse* fields were written from two goroutines (the client read-loop
+// and the VAD goroutine) with the <-done wait performed outside the lock — the
+// dual-writer race documented in Part 2 (failure mode 2). The coordinator
+// serializes every start/cancel/finish decision behind one lock and guarantees
+// at most one live response, so the two callers can no longer interleave into
+// two overlapping responses.
+//
+// Each response runs as a goroutine spawned here. The effects map as:
+//   - StartResponse:  spawn the registered body with a fresh cancelable context.
+//   - CancelResponse: cancel that context (cooperative — the body stops at its
+//     next ctx checkpoint and records the cancelled outcome for its terminal).
+//   - EmitTerminal:   a no-op in this sink. The one response.done per
+//     response.create (with Output, and Usage when completed) is sent by the
+//     response body (triggerResponse); moving it here would not change the
+//     coordination guarantees.
+type responseSink struct {
+	mu      sync.Mutex                                  // guards cancels and bodies
+	coord   *respcoord.Coordinator                      // state machine; invokes Perform for its effects
+	cancels map[respcoord.ResponseID]context.CancelFunc // cancel func of each running response
+	bodies  map[respcoord.ResponseID]responseBody       // registered by issue, consumed by StartResponse
+	seq     atomic.Uint64                               // source of fresh response ids
+	wg      sync.WaitGroup                              // tracks every response goroutine
+}
+
+type responseBody struct {
+	parent context.Context           // parent of the per-response ctx; nil means context.Background()
+	run    func(ctx context.Context) // the response work; nil means nothing runs
+}
+
+func newResponseSink() *responseSink {
+	sink := new(responseSink)
+	sink.cancels = make(map[respcoord.ResponseID]context.CancelFunc)
+	sink.bodies = make(map[respcoord.ResponseID]responseBody)
+	// Wire the coordinator last: it is handed the sink as its effect handler.
+	sink.coord = respcoord.New(sink)
+	return sink
+}
+
+// issue registers run under a fresh id and applies Start; the coordinator
+// supersedes (cancels) any in-flight response atomically, so callers need no
+// locking. Non-blocking: a superseded response drains concurrently and its
+// later Finished is ignored as stale. NOTE(review): if Start is rejected, the
+// registered body is never consumed by Perform and stays in s.bodies — confirm.
+func (s *responseSink) issue(parent context.Context, source respcoord.Source, run func(ctx context.Context)) {
+	id := respcoord.ResponseID(s.seq.Add(1))
+	s.mu.Lock()
+	s.bodies[id] = responseBody{parent: parent, run: run}
+	s.mu.Unlock()
+	if err := s.coord.Apply(respcoord.Start{ID: id, Source: source}); err != nil {
+		xlog.Error("respcoord: start failed", "error", err)
+	}
+}
+
+// cancel cancels the in-flight response, if any; an Apply error is only logged.
+// Non-blocking (barge-in must not stall the VAD tick).
+func (s *responseSink) cancel(source respcoord.Source) {
+	if err := s.coord.Apply(respcoord.Cancel{Source: source}); err != nil {
+		xlog.Error("respcoord: cancel failed", "error", err)
+	}
+}
+
+// wait blocks until every response goroutine (the active one plus any draining
+// superseded ones) has exited. Used at teardown so the session is never deleted
+// out from under a running response.
+func (s *responseSink) wait() {
+	s.wg.Wait() // every StartResponse goroutine is launched via s.wg.Go
+}
+
+// shutdown terminates the coordinator (cancelling any in-flight response) and
+// then joins all response goroutines. After this the coordinator is in its
+// absorbing Terminated state, so no further response can be issued — the
+// connection (M1) parent's teardown uses this to guarantee no response outlives
+// the session (see formal-verification/session_lifecycle.fizz).
+func (s *responseSink) shutdown() {
+	if err := s.coord.Apply(respcoord.Shutdown{}); err != nil {
+		xlog.Error("respcoord: shutdown failed", "error", err)
+	}
+	s.wait() // joins even when Apply failed, so teardown never skips the wait
+}
+
+// Perform executes one effect. It is called by Coordinator.Apply while the
+// coordinator lock is held, so it must not block. It briefly takes s.mu but
+// never acquires the coordinator lock while holding s.mu; the spawned
+// goroutine's Finished apply takes the coordinator lock only AFTER releasing
+// s.mu, so there is no lock cycle.
+func (s *responseSink) Perform(e respcoord.Effect) {
+	switch eff := e.(type) {
+	case respcoord.StartResponse:
+		s.mu.Lock()
+		body := s.bodies[eff.ID] // an unknown id yields the zero body (nil parent, nil run)
+		delete(s.bodies, eff.ID)
+		parent := body.parent
+		if parent == nil {
+			parent = context.Background()
+		}
+		ctx, cancel := context.WithCancel(parent)
+		s.cancels[eff.ID] = cancel
+		s.mu.Unlock()
+
+		s.wg.Go(func() {
+			defer func() {
+				s.mu.Lock()
+				delete(s.cancels, eff.ID)
+				s.mu.Unlock()
+				// Report completion. If this response was superseded/cancelled
+				// the id is stale and the coordinator ignores it (so the
+				// terminal is never emitted twice).
+				if err := s.coord.Apply(respcoord.Finished{ID: eff.ID}); err != nil {
+					xlog.Error("respcoord: finished apply failed", "error", err)
+				}
+			}()
+			if body.run != nil {
+				body.run(ctx)
+			}
+		})
+	case respcoord.CancelResponse:
+		s.mu.Lock()
+		cancel := s.cancels[eff.ID]
+		s.mu.Unlock()
+		if cancel != nil {
+			cancel() // called outside s.mu; a missing id (already finished) is a no-op
+		}
+	case respcoord.EmitTerminal:
+		// No-op: the response body sends response.done itself (a single one per
+		// response.create, from triggerResponse), so nothing is emitted here.
+	}
+}
diff --git a/core/http/endpoints/openai/realtime_semantic_vad.go b/core/http/endpoints/openai/realtime_semantic_vad.go
new file mode 100644
index 000000000..66dfc6efe
--- /dev/null
+++ b/core/http/endpoints/openai/realtime_semantic_vad.go
@@ -0,0 +1,350 @@
+package openai
+
+import (
+	"context"
+	"strings"
+
+	"github.com/mudler/LocalAI/core/backend"
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
+	"github.com/mudler/LocalAI/core/schema"
+	"github.com/mudler/xlog"
+)
+
+// Semantic (EOU-driven) turn detection.
+//
+// With turn_detection.type == "semantic_vad", the transcription model is fed
+// the microphone audio live while the user speaks and its end-of-utterance
+// token turns the silence window dynamic: an immediate commit once the
+// token fires (the model judged the user finished and expects a reply), the
+// much longer eagerness fallback when it does not (mid-thought pause). The
+// silero VAD stays in charge of speech_started/barge-in and the actual
+// silence measurement, so a spurious EOU mid-speech cannot cut the user off
+// — the commit still requires real silence.
+
+const (
+	// semanticEouSilenceSec is the extra silence, in seconds, required to
+	// commit once the end-of-utterance token has fired. It is zero because
+	// the token already trails the audio by the encoder chunk schedule plus a
+	// VAD tick (~0.3-0.9s), and the commit check only runs after silero closes
+	// the speech segment — which itself takes real silence — so any window on
+	// top would be pure added response delay.
+	semanticEouSilenceSec = 0.0
+
+	// liveEventsBuffer is the capacity of the recv-callback → VAD-tick handoff
+	// channel. Events arrive at a few per second and the ticker drains every
+	// 300ms; a full channel means the loop is wedged, and dropping (with a
+	// warning) beats blocking the backend's recv goroutine.
+	liveEventsBuffer = 64
+)
+
+// eagernessMaxSilenceSec translates an OpenAI semantic_vad eagerness level
+// into the fallback silence window (seconds) applied when no end-of-utterance
+// token was seen. Matching ignores case and surrounding whitespace: "low"
+// waits 8s, "high" 2s, and everything else ("medium", "auto", "") gets 4s.
+func eagernessMaxSilenceSec(eagerness string) float64 {
+	level := strings.ToLower(strings.TrimSpace(eagerness))
+	if level == "low" {
+		return 8
+	}
+	if level == "high" {
+		return 2
+	}
+	return 4
+}
+
+// liveUtterance carries one committed turn's transcript as produced by the
+// live stream. The turn's delta events were already streamed to the client
+// (keyed by the turn's item id), so only the final text travels here.
+type liveUtterance struct {
+	Text string
+}
+
+// liveTurnState is handleVAD's per-session live-ASR companion for
+// semantic_vad. One live stream is opened per user turn (begun when the VAD
+// first reports speech, finalized at commit) — the underlying decode session
+// grows with fed audio, so per-turn streams keep it bounded. All fields are
+// owned by the handleVAD goroutine; the backend's recv callback only writes
+// into the buffered events channel (and drops the event when it is full).
+type liveTurnState struct {
+	session   *Session
+	transport Transport // live caption deltas are sent here as they drain
+	events    chan backend.LiveTranscriptionEvent
+
+	live        backend.LiveTranscriptionSession // nil between turns
+	unavailable bool                             // sticky: backend can't do live ASR, degrade for the session
+
+	fed16k int // 16k samples of the current buffer already fed
+	// eouAtSec is the audio time of the most recent EOU this turn (0 = none).
+	// It is a recorded fact: set when an EOU drains and cleared only by
+	// resetTurn, never toggled off mid-turn. Whether it still governs the
+	// trailing silence is derived by eouPending() from it and the VAD segments.
+	eouAtSec   float64
+	parts      []string // deltas accumulated for the current turn
+	finalText  string   // authoritative full-turn text from the Final event
+	itemID     string   // the turn's conversation item id, passed to openTurn ("" between turns)
+	deltasSent bool     // at least one caption delta reached the client this turn
+}
+
+// newLiveTurnState builds the live-ASR companion for one session, no turn open.
+func newLiveTurnState(session *Session, transport Transport) *liveTurnState {
+	// Buffered handoff: openTurn's recv callback drops events once this is full.
+	queue := make(chan backend.LiveTranscriptionEvent, liveEventsBuffer)
+	state := &liveTurnState{session: session, transport: transport, events: queue}
+	return state
+}
+
+func (l *liveTurnState) open() bool { return l.live != nil } // true while a turn's live stream is attached
+
+// openTurn attaches a live transcription stream to the turn identified by
+// itemID and reports whether one is now open. It returns true immediately if
+// a stream is already open (the id is left untouched) and false if the session
+// was previously marked unavailable. Any TranscribeLive error — most commonly
+// the backend's typed "live transcription unsupported" signal — marks the
+// session unavailable for good (warned once), degrading to silence-only
+// detection.
+//
+// The item id comes from the turn coordinator (turncoord) rather than being
+// minted here: it exists from turn START so caption deltas can stream while the
+// user is still speaking, and the committed event and final transcript reuse it.
+func (l *liveTurnState) openTurn(ctx context.Context, itemID string) bool {
+	switch {
+	case l.live != nil:
+		return true
+	case l.unavailable:
+		return false
+	}
+	var language string
+	if cfg := l.session.InputAudioTranscription; cfg != nil {
+		language = cfg.Language
+	}
+	enqueue := func(ev backend.LiveTranscriptionEvent) {
+		select {
+		case l.events <- ev:
+		default:
+			xlog.Warn("semantic_vad: live transcription event dropped (event channel full)")
+		}
+	}
+	stream, err := l.session.ModelInterface.TranscribeLive(ctx, language, enqueue)
+	if err != nil {
+		l.unavailable = true
+		xlog.Warn("semantic_vad: live transcription unavailable; degrading to silence-only turn detection",
+			"error", err)
+		return false
+	}
+	l.resetTurn()
+	l.live, l.itemID = stream, itemID
+	return true
+}
+
+// feedNewAudio sends the part of the resampled buffer that has not been fed
+// yet to the live stream. The last sample is always withheld: ResampleInt16 is
+// prefix-stable except for its final output sample, so leaving it out keeps
+// successive whole-buffer resamples bit-identical over the range already fed.
+func (l *liveTurnState) feedNewAudio(aints16k []int16) {
+	limit := len(aints16k) - 1 // exclusive end: hold back the final sample
+	if l.live == nil || limit <= l.fed16k {
+		return
+	}
+	chunk := int16sToFloat32(aints16k[l.fed16k:limit])
+	err := l.live.Feed(chunk)
+	if err == nil {
+		l.fed16k = limit
+		return
+	}
+	// Feed failed: drop the turn and stop using live ASR for this session.
+	xlog.Warn("semantic_vad: live feed failed; degrading to silence-only turn detection", "error", err)
+	l.discardTurn()
+	l.unavailable = true
+}
+
+// drainEvents empties the events channel without blocking and folds each
+// queued event into the turn state. audioSec (the current buffer length in
+// seconds) is recorded as the moment an EOU was observed, so later VAD
+// segments can tell whether speech resumed after it.
+func (l *liveTurnState) drainEvents(audioSec float64) {
+	for {
+		var ev backend.LiveTranscriptionEvent
+		select {
+		case ev = <-l.events:
+		default:
+			return // channel empty: everything queued so far has been folded in
+		}
+		if delta := ev.Delta; delta != "" {
+			l.parts = append(l.parts, delta)
+			// Live captions: push the delta out right away under the turn's
+			// item id so the browser shows text mid-speech; the completed
+			// event at commit replaces it with the authoritative transcript.
+			if l.transport != nil && l.itemID != "" {
+				sendEvent(l.transport, types.ConversationItemInputAudioTranscriptionDeltaEvent{
+					ServerEventBase: types.ServerEventBase{EventID: "event_TODO"},
+					ItemID:          l.itemID,
+					ContentIndex:    0,
+					Delta:           delta,
+				})
+				l.deltasSent = true
+			}
+		}
+		if ev.Eou {
+			// Only the position is recorded — no flag is flipped. eouPending()
+			// later decides, from this and the live VAD segments, whether the
+			// EOU still applies to the trailing silence.
+			l.eouAtSec = audioSec
+			xlog.Debug("semantic_vad: EOU token observed", "audio_s", audioSec)
+		}
+		if ev.Eob {
+			// End of a backchannel ("uh-huh"): the user is still listening,
+			// not yielding the turn, so this is deliberately NOT a commit
+			// trigger.
+			xlog.Debug("semantic_vad: EOB (backchannel) observed", "audio_s", audioSec)
+		}
+		if final := ev.Final; final != nil && strings.TrimSpace(final.Text) != "" {
+			l.finalText = final.Text
+		}
+	}
+}
+
+// eouPending reports whether the EOU recorded for this turn still governs the
+// current trailing silence. It is computed purely from the recorded EOU time
+// and the VAD's current segments — no stored boolean that could drift.
+//
+// The EOU stops applying only once the user STARTS a new utterance after it,
+// i.e. the last segment's start lies past the EOU: that is genuine resumed
+// speech, so the earlier yield no longer holds. A last segment that began
+// BEFORE the EOU — even one still in progress — is not resumed speech; it is
+// just silero padding before it closes the segment, the normal state when the
+// (predictive) EOU fires. Reading that as resumed speech was the earlier bug
+// that cleared the flag on the very tick the token arrived, sending almost
+// every EOU to the eagerness timeout.
+func (l *liveTurnState) eouPending(segments []schema.VADSegment) bool {
+	n := len(segments)
+	if n == 0 || l.eouAtSec == 0 {
+		return false // no VAD view to compare against, or no EOU recorded
+	}
+	return l.eouAtSec >= float64(segments[n-1].Start)
+}
+
+// thresholdSec picks the silence (seconds) required before committing:
+// semanticEouSilenceSec while an EOU is pending, otherwise the eagerness
+// fallback read from sv (which must be non-nil on that path).
+func (l *liveTurnState) thresholdSec(eouPending bool, sv *types.RealtimeSessionSemanticVad) float64 {
+	if !eouPending {
+		return eagernessMaxSilenceSec(sv.Eagerness)
+	}
+	return semanticEouSilenceSec
+}
+
+// commitTrigger labels how a commit decision was reached, for the per-turn
+// timing log. With an EOU pending it returns "eou" and the token's lag behind
+// the VAD's speech end (l.eouAtSec - speechEndSec); otherwise "timeout" and 0,
+// meaning the eagerness fallback elapsed without a token.
+func (l *liveTurnState) commitTrigger(eouPending bool, speechEndSec float64) (trigger string, eouLagSec float64) {
+	if eouPending {
+		return "eou", l.eouAtSec - speechEndSec
+	}
+	return "timeout", 0
+}
+
+// finishTurn closes the live stream — which flushes the decode tail; the last
+// ~2 encoder frames of text only appear then — drains the terminal events and
+// returns the turn's transcript, resetting the state for the next turn. It
+// returns nil when no stream is open or the stream produced no text.
+func (l *liveTurnState) finishTurn(audioSec float64) *liveUtterance {
+	if l.live == nil {
+		return nil
+	}
+	closeErr := l.live.Close()
+	l.live = nil
+	if closeErr != nil {
+		xlog.Warn("semantic_vad: live transcription finalize failed", "error", closeErr)
+	}
+	l.drainEvents(audioSec)
+
+	transcript := strings.TrimSpace(l.finalText)
+	if transcript == "" {
+		transcript = l.previewText()
+	}
+	l.resetTurn()
+	if transcript == "" {
+		return nil
+	}
+	return &liveUtterance{Text: transcript}
+}
+
+// discardTurn abandons the current turn (no-speech buffer clear, feed failure,
+// session teardown): the stream is closed and whatever it transcribed is
+// dropped. If caption deltas were already shown for the turn, a failed event
+// retracts them so the client is not left with a stuck partial entry.
+func (l *liveTurnState) discardTurn() {
+	// Close errors are deliberately ignored: the transcript is thrown away anyway.
+	if stream := l.live; stream != nil {
+		_ = stream.Close()
+		l.live = nil
+	}
+	l.drainEvents(0)
+	// Retract only when at least one caption delta actually reached the client.
+	captionsShown := l.deltasSent && l.transport != nil && l.itemID != ""
+	if captionsShown {
+		sendEvent(l.transport, types.ConversationItemInputAudioTranscriptionFailedEvent{
+			ServerEventBase: types.ServerEventBase{EventID: "event_TODO"},
+			ItemID:          l.itemID,
+			ContentIndex:    0,
+			Error:           types.Error{Type: "transcription_discarded", Message: "turn discarded before commit"},
+		})
+	}
+	l.resetTurn()
+}
+
+// resetTurn clears every per-turn field (feed offset, EOU time, accumulated
+// deltas, final text, item id, caption flag) back to its zero value. It leaves
+// live, unavailable and the events channel untouched.
+func (l *liveTurnState) resetTurn() {
+	l.fed16k, l.eouAtSec = 0, 0
+	l.parts, l.finalText = nil, ""
+	l.itemID, l.deltasSent = "", false
+}
+
+// previewText is the turn's transcript so far: all deltas joined, then trimmed.
+func (l *liveTurnState) previewText() string {
+	joined := strings.Join(l.parts, "")
+	return strings.TrimSpace(joined)
+}
+
+// int16sToFloat32 rescales 16-bit PCM samples to float32 by dividing by 32768.
+func int16sToFloat32(samples []int16) []float32 {
+	const fullScale = 32768.0
+	floats := make([]float32, 0, len(samples))
+	for _, pcm := range samples {
+		floats = append(floats, float32(pcm)/fullScale)
+	}
+	return floats
+}
+
+// turnDetectionActive reports whether td selects server or semantic VAD.
+func turnDetectionActive(td *types.TurnDetectionUnion) bool {
+	hasDetector := td != nil && (td.SemanticVad != nil || td.ServerVad != nil)
+	return hasDetector
+}
+
+// defaultTurnDetection builds the turn detection a new session starts with,
+// taken from the pipeline's server-side default. A pipeline configured for
+// semantic turn detection yields semantic_vad carrying its eagerness (clients
+// can still override via session.update); a nil cfg or any other pipeline
+// keeps the historical server_vad defaults.
+func defaultTurnDetection(cfg *config.ModelConfig) *types.TurnDetectionUnion {
+	td := &types.TurnDetectionUnion{}
+	if cfg == nil || !cfg.Pipeline.TurnDetectionSemantic() {
+		td.ServerVad = &types.ServerVad{
+			Threshold:         0.5,
+			PrefixPaddingMs:   300,
+			SilenceDurationMs: 500,
+			CreateResponse:    true,
+		}
+		return td
+	}
+	td.SemanticVad = &types.RealtimeSessionSemanticVad{
+		CreateResponse: true,
+		Eagerness:      cfg.Pipeline.TurnDetection.Eagerness,
+	}
+	return td
+}
diff --git a/core/http/endpoints/openai/realtime_semantic_vad_test.go b/core/http/endpoints/openai/realtime_semantic_vad_test.go
new file mode 100644
index 000000000..c3f5d7ef8
--- /dev/null
+++ b/core/http/endpoints/openai/realtime_semantic_vad_test.go
@@ -0,0 +1,414 @@
+package openai
+
+import (
+	"context"
+	"errors"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	"github.com/mudler/LocalAI/core/backend"
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
+	"github.com/mudler/LocalAI/core/schema"
+)
+
+var _ = Describe("eagernessMaxSilenceSec", func() {
+	DescribeTable("maps eagerness to the no-EOU fallback window", // Entry args: eagerness, expected seconds
+		func(eagerness string, want float64) {
+			Expect(eagernessMaxSilenceSec(eagerness)).To(Equal(want))
+		},
+		Entry("low", "low", 8.0),
+		Entry("medium", "medium", 4.0),
+		Entry("high", "high", 2.0),
+		Entry("auto equals medium", "auto", 4.0),
+		Entry("empty equals medium", "", 4.0),
+		Entry("case and space insensitive", " High ", 2.0),
+		Entry("unknown equals medium", "frantic", 4.0), // anything unrecognised hits the default
+	)
+})
+
+var _ = Describe("turnDetectionActive", func() {
+	It("is active for server and semantic VAD, inactive otherwise", func() {
+		Expect(turnDetectionActive(nil)).To(BeFalse())
+		Expect(turnDetectionActive(&types.TurnDetectionUnion{})).To(BeFalse()) // non-nil union with neither VAD set
+		Expect(turnDetectionActive(&types.TurnDetectionUnion{ServerVad: &types.ServerVad{}})).To(BeTrue())
+		Expect(turnDetectionActive(&types.TurnDetectionUnion{SemanticVad: &types.RealtimeSessionSemanticVad{}})).To(BeTrue())
+	})
+})
+
+var _ = Describe("defaultTurnDetection", func() {
+	It("keeps the historical server_vad defaults for non-semantic pipelines", func() {
+		td := defaultTurnDetection(&config.ModelConfig{}) // zero-value config: no semantic turn detection requested
+		Expect(td.ServerVad).NotTo(BeNil())
+		Expect(td.SemanticVad).To(BeNil())
+		Expect(td.ServerVad.SilenceDurationMs).To(Equal(int64(500)))
+		Expect(td.ServerVad.CreateResponse).To(BeTrue())
+	})
+
+	It("seeds semantic_vad with the pipeline's eagerness", func() {
+		cfg := &config.ModelConfig{}
+		cfg.Pipeline.TurnDetection.Type = "semantic_vad"
+		cfg.Pipeline.TurnDetection.Eagerness = "high"
+		td := defaultTurnDetection(cfg)
+		Expect(td.SemanticVad).NotTo(BeNil())
+		Expect(td.ServerVad).To(BeNil()) // exactly one member of the union is populated
+		Expect(td.SemanticVad.Eagerness).To(Equal("high"))
+		Expect(td.SemanticVad.CreateResponse).To(BeTrue())
+	})
+
+	It("treats a nil config as server_vad", func() {
+		Expect(defaultTurnDetection(nil).ServerVad).NotTo(BeNil())
+	})
+})
+
+var _ = Describe("int16sToFloat32", func() {
+	It("scales like the VAD conversion", func() {
+		out := int16sToFloat32([]int16{0, 16384, -32768}) // zero, half scale, negative full scale
+		Expect(out).To(HaveLen(3))
+		Expect(out[0]).To(BeNumerically("~", 0.0, 1e-6))
+		Expect(out[1]).To(BeNumerically("~", 0.5, 1e-6))
+		Expect(out[2]).To(BeNumerically("~", -1.0, 1e-6))
+	})
+})
+
+var _ = Describe("liveTurnState", func() {
+	var (
+		m   *fakeModel     // fake backend handed to the session under test
+		lts *liveTurnState // subject under test, rebuilt before each spec
+		ftr *fakeTransport // transport passed to newLiveTurnState
+	)
+
+	newSemanticSession := func(m *fakeModel) *Session {
+		return &Session{
+			InputAudioTranscription: &types.AudioTranscription{},
+			ModelInterface:          m,
+		}
+	}
+
+	BeforeEach(func() {
+		m = &fakeModel{}
+		ftr = &fakeTransport{}
+		lts = newLiveTurnState(newSemanticSession(m), ftr)
+	})
+
+	Describe("openTurn", func() {
+		It("opens once per turn and reports open()", func() {
+			Expect(lts.open()).To(BeFalse())
+			Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
+			Expect(lts.open()).To(BeTrue())
+			Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue(), "idempotent while open")
+			Expect(m.liveOpened).To(Equal(1)) // the second call must not reach the backend
+		})
+
+		It("degrades stickily when the backend cannot do live transcription", func() {
+			m.liveErr = errors.New("rpc error: code = Unimplemented desc = live transcription unsupported")
+			Expect(lts.openTurn(context.Background(), "item1")).To(BeFalse())
+			Expect(lts.unavailable).To(BeTrue())
+
+			// Later turns never retry, even once the backend would succeed: the failure is per-session sticky.
+			m.liveErr = nil
+			Expect(lts.openTurn(context.Background(), "item1")).To(BeFalse())
+			Expect(m.liveOpened).To(Equal(0))
+		})
+	})
+
+	Describe("feedNewAudio", func() {
+		It("feeds only the unfed tail and holds back the final resampled sample", func() {
+			Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
+
+			lts.feedNewAudio([]int16{1, 2, 3, 4})
+			Expect(m.liveSession.fed).To(HaveLen(1))
+			Expect(m.liveSession.fed[0]).To(HaveLen(3), "last sample held back")
+
+			// Same buffer grown by two samples: only the delta (two samples) is fed.
+			lts.feedNewAudio([]int16{1, 2, 3, 4, 5, 6})
+			Expect(m.liveSession.fed).To(HaveLen(2))
+			Expect(m.liveSession.fed[1]).To(HaveLen(2))
+
+			// No growth past the holdback: nothing fed, so no third Feed call.
+			lts.feedNewAudio([]int16{1, 2, 3, 4, 5, 6})
+			Expect(m.liveSession.fed).To(HaveLen(2))
+		})
+
+		It("degrades and closes the turn when a feed fails", func() {
+			Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
+			m.liveSession.feedErr = errors.New("backend gone")
+			sess := m.liveSession // keep a handle: the state drops its stream on failure
+
+			lts.feedNewAudio([]int16{1, 2, 3, 4})
+
+			Expect(lts.open()).To(BeFalse())
+			Expect(lts.unavailable).To(BeTrue())
+			Expect(sess.closed).To(Equal(1))
+		})
+	})
+
+	Describe("event handling and the dynamic threshold", func() {
+		sv := &types.RealtimeSessionSemanticVad{Eagerness: "high"}
+
+		It("uses the eagerness fallback until an EOU is recorded, then commits without an extra window", func() {
+			Expect(lts.thresholdSec(false, sv)).To(Equal(2.0))
+			Expect(lts.thresholdSec(true, sv)).To(Equal(semanticEouSilenceSec))
+
+			Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
+			m.liveSession.onEvent(backend.LiveTranscriptionEvent{Delta: "hello ", Eou: false})
+			m.liveSession.onEvent(backend.LiveTranscriptionEvent{Eou: true})
+			lts.drainEvents(3.3)
+
+			Expect(lts.eouAtSec).To(BeNumerically("~", 3.3, 1e-9))
+			Expect(lts.previewText()).To(Equal("hello"))
+		})
+
+		// The bug this replaces: the (predictive) EOU routinely arrives while
+		// silero is still padding the speech segment open. eouPending must NOT
+		// read that as resumed speech.
+		It("keeps the EOU pending while silero is still closing the same segment", func() {
+			lts.eouAtSec = 3.3
+			Expect(lts.eouPending([]schema.VADSegment{{Start: 0, End: 0}})).To(BeTrue(), "segment began before the EOU and is merely unclosed")
+			Expect(lts.eouPending([]schema.VADSegment{{Start: 0, End: 3.0}})).To(BeTrue(), "and still pending once it closes")
+		})
+
+		It("drops the EOU only when a new utterance starts after it (resumed speech)", func() {
+			lts.eouAtSec = 3.3
+			Expect(lts.eouPending([]schema.VADSegment{{Start: 0, End: 3.0}, {Start: 4.0, End: 0}})).To(BeFalse())
+			Expect(lts.eouPending([]schema.VADSegment{{Start: 0, End: 3.0}, {Start: 4.0, End: 5.0}})).To(BeFalse())
+		})
+
+		It("has no pending EOU before one is recorded", func() {
+			Expect(lts.eouPending([]schema.VADSegment{{Start: 0, End: 3.0}})).To(BeFalse())
+			Expect(lts.eouPending(nil)).To(BeFalse())
+		})
+
+		It("does not arm the commit threshold on an EOB backchannel", func() {
+			Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
+			m.liveSession.onEvent(backend.LiveTranscriptionEvent{Delta: "uh-huh", Eob: true})
+			lts.drainEvents(2.0)
+
+			Expect(lts.eouAtSec).To(BeZero(), "a backchannel is not the user yielding the turn")
+			Expect(lts.eouPending([]schema.VADSegment{{Start: 0, End: 1.8}})).To(BeFalse(), "still on the eagerness fallback")
+			Expect(lts.previewText()).To(Equal("uh-huh"), "the backchannel text still lands in the transcript")
+		})
+
+		It("reports the commit trigger and the EOU token's lag behind speech end", func() {
+			trigger, lag := lts.commitTrigger(false, 3.2)
+			Expect(trigger).To(Equal("timeout"))
+			Expect(lag).To(BeZero())
+
+			lts.eouAtSec = 3.5
+			trigger, lag = lts.commitTrigger(true, 3.2)
+			Expect(trigger).To(Equal("eou"))
+			Expect(lag).To(BeNumerically("~", 0.3, 1e-9))
+		})
+	})
+
+	Describe("finishTurn", func() {
+		It("finalizes the stream, prefers the Final text, and resets for the next turn", func() {
+			m.liveCloseEvents = []backend.LiveTranscriptionEvent{
+				{Delta: " world"},
+				{Final: &schema.TranscriptionResult{Text: "hello world", Eou: true}},
+			}
+			Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
+			sess := m.liveSession // keep a handle: finishTurn drops the state's stream
+			sess.onEvent(backend.LiveTranscriptionEvent{Delta: "hello", Eou: true})
+			lts.drainEvents(2.0)
+
+			ut := lts.finishTurn(2.5)
+
+			Expect(sess.closed).To(Equal(1))
+			Expect(ut).NotTo(BeNil())
+			Expect(ut.Text).To(Equal("hello world"), "Final event text wins over joined deltas")
+			Expect(lts.open()).To(BeFalse())
+			Expect(lts.eouAtSec).To(BeZero())
+			Expect(lts.parts).To(BeEmpty())
+			Expect(lts.fed16k).To(BeZero())
+		})
+
+		It("returns nil when the stream heard nothing", func() {
+			Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
+			Expect(lts.finishTurn(1.0)).To(BeNil())
+			Expect(m.liveSession.closed).To(Equal(1)) // the stream is still closed even with no text
+		})
+
+		It("is a no-op without an open stream", func() {
+			Expect(lts.finishTurn(1.0)).To(BeNil())
+		})
+	})
+
+	Describe("discardTurn", func() {
+		It("closes the stream, drops the transcript and retracts streamed captions", func() {
+			Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
+			sess := m.liveSession
+			sess.onEvent(backend.LiveTranscriptionEvent{Delta: "noise"})
+			lts.drainEvents(1.0) // forwards the delta to ftr, which sets deltasSent
+
+			lts.discardTurn()
+
+			Expect(sess.closed).To(Equal(1))
+			Expect(lts.open()).To(BeFalse())
+			Expect(lts.parts).To(BeEmpty())
+			Expect(ftr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionFailed)).To(Equal(1),
+				"the client saw caption deltas for this turn — it must be told to drop them")
+		})
+
+		It("sends no failed event when no captions ever reached the client", func() {
+			Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
+			lts.discardTurn()
+			Expect(ftr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionFailed)).To(Equal(0))
+		})
+	})
+
+	Describe("live captions", func() {
+		It("streams each delta to the client under the turn's item id as it drains", func() {
+			Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
+			turnID := lts.itemID
+			Expect(turnID).NotTo(BeEmpty(), "the item id exists from turn open so captions can reference it")
+
+			m.liveSession.onEvent(backend.LiveTranscriptionEvent{Delta: "hel"})
+			m.liveSession.onEvent(backend.LiveTranscriptionEvent{Delta: "lo"})
+			lts.drainEvents(1.0)
+
+			var got []types.ConversationItemInputAudioTranscriptionDeltaEvent // delta events only, in send order
+			for _, e := range ftr.events {
+				if d, ok := e.(types.ConversationItemInputAudioTranscriptionDeltaEvent); ok {
+					got = append(got, d)
+				}
+			}
+			Expect(got).To(HaveLen(2))
+			Expect(got[0].Delta).To(Equal("hel"))
+			Expect(got[1].Delta).To(Equal("lo"))
+			Expect(got[0].ItemID).To(Equal(turnID))
+			Expect(got[1].ItemID).To(Equal(turnID))
+			Expect(lts.deltasSent).To(BeTrue())
+		})
+
+		It("finishTurn does not retract captions — the commit's completed event supersedes them", func() {
+			Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
+			m.liveSession.onEvent(backend.LiveTranscriptionEvent{Delta: "hello"})
+			lts.drainEvents(1.0)
+
+			Expect(lts.finishTurn(1.5)).NotTo(BeNil())
+			Expect(ftr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionFailed)).To(Equal(0))
+		})
+	})
+})
+
+// commitUtteranceWithTranscript routes the three transcript sources: the
+// retranscribe gate's batch decode, the live stream's accumulated text, and
+// the historical file path. Each spec below exercises one of those sources.
+var _ = Describe("commitUtteranceWithTranscript", func() {
+	newTranscriptionOnlySession := func(m *fakeModel, streamTranscription bool) *Session {
+		cfg := &config.ModelConfig{}
+		if streamTranscription {
+			on := true
+			cfg.Pipeline.Streaming.Transcription = &on
+		}
+		return &Session{
+			TranscriptionOnly:       true, // stop after the transcript: no LLM/TTS in these specs
+			InputAudioTranscription: &types.AudioTranscription{},
+			ModelConfig:             cfg,
+			ModelInterface:          m,
+		}
+	}
+
+	It("uses the gate's batch transcript and never re-runs the backend", func() {
+		m := &fakeModel{transcribeErr: errors.New("must not be called")}
+		session := newTranscriptionOnlySession(m, true)
+		tr := &fakeTransport{}
+
+		commitUtteranceWithTranscript(context.Background(), []byte{1, 2}, nil,
+			&schema.TranscriptionResult{Text: "batch text", Eou: true}, "item_turn", session, &Conversation{}, tr)
+
+		Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionDelta)).To(Equal(0))
+		Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionCompleted)).To(Equal(1))
+	})
+
+	It("emits only the completed event for a live transcript — captions already streamed during the turn", func() {
+		m := &fakeModel{transcribeErr: errors.New("must not be called")}
+		session := newTranscriptionOnlySession(m, true)
+		tr := &fakeTransport{}
+
+		commitUtteranceWithTranscript(context.Background(), []byte{1, 2},
+			&liveUtterance{Text: "hello"}, nil, "item_turn", session, &Conversation{}, tr)
+
+		Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionDelta)).To(Equal(0))
+		Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionCompleted)).To(Equal(1))
+
+		var completed types.ConversationItemInputAudioTranscriptionCompletedEvent // the last completed event sent
+		for _, e := range tr.events {
+			if c, ok := e.(types.ConversationItemInputAudioTranscriptionCompletedEvent); ok {
+				completed = c
+			}
+		}
+		Expect(completed.ItemID).To(Equal("item_turn"),
+			"completed must reuse the caption deltas' item id so the client replaces, not duplicates")
+		Expect(completed.Transcript).To(Equal("hello"))
+	})
+
+	It("falls back to the file path when the live stream heard nothing", func() {
+		m := &fakeModel{transcribeFinal: &schema.TranscriptionResult{Text: "from file"}}
+		session := newTranscriptionOnlySession(m, false)
+		tr := &fakeTransport{}
+
+		commitUtteranceWithTranscript(context.Background(), []byte{1, 2},
+			&liveUtterance{}, nil, "", session, &Conversation{}, tr)
+
+		Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionCompleted)).To(Equal(1))
+	})
+})
+
+// transcribeUtterance is the retranscribe gate's offline decode of the
+// buffered turn; these specs cover its success and error returns.
+var _ = Describe("transcribeUtterance", func() {
+	It("returns the batch decode with its Eou flag", func() {
+		m := &fakeModel{transcribeFinal: &schema.TranscriptionResult{Text: "confirmed", Eou: true}}
+		session := &Session{
+			InputAudioTranscription: &types.AudioTranscription{},
+			ModelInterface:          m,
+		}
+
+		tr, err := transcribeUtterance(context.Background(), []byte{0, 0, 1, 1}, session)
+		Expect(err).ToNot(HaveOccurred())
+		Expect(tr.Text).To(Equal("confirmed"))
+		Expect(tr.Eou).To(BeTrue())
+	})
+
+	It("propagates backend errors", func() {
+		m := &fakeModel{transcribeErr: errors.New("engine fell over")}
+		session := &Session{
+			InputAudioTranscription: &types.AudioTranscription{},
+			ModelInterface:          m,
+		}
+
+		_, err := transcribeUtterance(context.Background(), []byte{0, 0}, session)
+		Expect(err).To(MatchError(ContainSubstring("engine fell over"))) // substring match: wrapping is allowed
+	})
+})
+
+// emitPrecomputedTranscription replays an already-produced transcript as the
+// standard delta/completed event sequence, all under one item id.
+var _ = Describe("emitPrecomputedTranscription", func() {
+	It("emits deltas then completed, sharing the item id", func() {
+		tr := &fakeTransport{}
+		Expect(emitPrecomputedTranscription(tr, "item42", []string{"a", "", "b"}, "ab")).To(Succeed())
+
+		Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionDelta)).To(Equal(2), "empty deltas skipped")
+		Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionCompleted)).To(Equal(1))
+		for _, e := range tr.events {
+			switch ev := e.(type) { // other event types, if any, are not checked
+			case types.ConversationItemInputAudioTranscriptionDeltaEvent:
+				Expect(ev.ItemID).To(Equal("item42"))
+			case types.ConversationItemInputAudioTranscriptionCompletedEvent:
+				Expect(ev.ItemID).To(Equal("item42"))
+				Expect(ev.Transcript).To(Equal("ab"))
+			}
+		}
+	})
+
+	It("emits only the completed event with no deltas", func() {
+		tr := &fakeTransport{}
+		Expect(emitPrecomputedTranscription(tr, "item1", nil, "hi")).To(Succeed())
+		Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionDelta)).To(Equal(0))
+		Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionCompleted)).To(Equal(1))
+	})
+})
diff --git a/core/http/endpoints/openai/realtime_stream.go b/core/http/endpoints/openai/realtime_stream.go
index 909fc50dc..7c37f7aff 100644
--- a/core/http/endpoints/openai/realtime_stream.go
+++ b/core/http/endpoints/openai/realtime_stream.go
@@ -86,7 +86,8 @@ func (s *transcriptStreamer) content() string {
 // tool calls. It returns true when it has fully handled the response so the
 // caller can return; callers must only invoke it for an audio modality, and with
 // tools only when the model uses its tokenizer template (see triggerResponseAtTurn).
-func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation, t Transport, responseID string, history schema.Messages, images []string, llmCfg *config.ModelConfig, tools []types.ToolUnion, toolChoice *types.ToolChoiceUnion, toolTurn int) bool {
+func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation, t Transport, r *liveResponse, history schema.Messages, images []string, llmCfg *config.ModelConfig, tools []types.ToolUnion, toolChoice *types.ToolChoiceUnion, toolTurn int) bool {
+	responseID := r.id
 	itemID := generateItemID()
 	item := types.MessageItemUnion{
 		Assistant: &types.MessageItemAssistant{
@@ -121,6 +122,8 @@ func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation
 		})
 	}
 
+	// cancel rolls back the partial item and records the cancelled outcome; the
+	// single terminal is emitted by triggerResponse.
 	cancel := func() {
 		if announced {
 			conv.Lock.Lock()
@@ -132,10 +135,7 @@ func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation
 			}
 			conv.Lock.Unlock()
 		}
-		sendEvent(t, types.ResponseDoneEvent{
-			ServerEventBase: types.ServerEventBase{},
-			Response:        types.Response{ID: responseID, Object: "realtime.response", Status: types.ResponseStatusCancelled},
-		})
+		r.outcome = outcomeCancelled
 	}
 
 	var template string
@@ -161,24 +161,30 @@ func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation
 	streamer.announce = announce
 
 	// Clause chunking (opt-in): synthesize each clause as soon as it completes
-	// instead of buffering the whole reply. streamedAudio accumulates the PCM
-	// across clauses for the conversation item record; ttsErr captures the first
-	// synthesis failure so the token callback can stop the prediction. emitSpeech
-	// runs synchronously here — the LLM keeps generating into the gRPC stream
-	// while a clause is synthesized, so audio still starts mid-generation.
+	// instead of buffering the whole reply. Synthesis runs on a worker goroutine
+	// (ttsPipeline) rather than inline in the token callback: emitSpeech blocks
+	// until the whole clause is synthesized (and, for WebRTC, played back at
+	// real time), and the callback runs on the goroutine that drains the LLM
+	// gRPC stream — so speaking inline stalls generation and freezes the
+	// assistant transcript at every clause boundary. The worker lets generation
+	// and the transcript stream keep flowing while audio is produced behind them.
 	var chunker *clauseChunker
+	var ttsPipe *ttsPipeline
 	if session.ModelConfig != nil && session.ModelConfig.Pipeline.ChunkClauses() {
 		chunker = newClauseChunker(defaultClauseMinRunes, defaultClauseMaxRunes)
+		ttsPipe = newTTSPipeline(func(clause string) ([]byte, error) {
+			return emitSpeech(ctx, t, session, responseID, itemID, clause)
+		})
 	}
 	var streamedAudio []byte
 	var ttsErr error
-	speakClause := func(clause string) error {
-		a, err := emitSpeech(ctx, t, session, responseID, itemID, clause)
-		if err != nil {
-			return err
-		}
-		streamedAudio = append(streamedAudio, a...)
-		return nil
+
+	// Backstop: always join the TTS worker, even on an unexpected early return.
+	// wait() is idempotent, so the explicit drain below (which captures the
+	// streamed audio and first error) stays authoritative; this only guarantees
+	// the goroutine can never leak if a new return path is added.
+	if ttsPipe != nil {
+		defer func() { _, _ = ttsPipe.wait() }()
 	}
 
 	// fail reports a mid-stream failure. A cancelled context means the client
@@ -188,6 +194,7 @@ func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation
 			cancel()
 		} else {
 			sendError(t, code, fmt.Sprintf("%s: %v", msg, err), "", itemID)
+			r.outcome = outcomeFailed
 		}
 		return true
 	}
@@ -207,8 +214,12 @@ func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation
 		delta := streamer.onToken(text)
 		if chunker != nil && delta != "" {
 			for _, clause := range chunker.push(delta) {
-				if ttsErr = speakClause(clause); ttsErr != nil {
-					return false // stop the prediction; reported after predFunc returns
+				// Hand the clause to the worker and keep going — never block the
+				// recv loop on synthesis. A false return means a prior clause
+				// already failed; stop the prediction (the error is collected
+				// from the pipeline after predFunc returns).
+				if !ttsPipe.enqueue(clause) {
+					return false
 				}
 			}
 		}
@@ -217,10 +228,27 @@ func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation
 
 	predFunc, err := session.ModelInterface.Predict(ctx, history, images, nil, nil, cb, tools, toolChoice, nil, nil, nil)
 	if err != nil {
+		// The deferred wait() joins the (idle) worker.
 		sendError(t, "inference_failed", fmt.Sprintf("backend error: %v", err), "", itemID)
 		return true
 	}
 	pred, err := predFunc()
+
+	// Drain the TTS worker. On a clean finish, enqueue the trailing clause(s) the
+	// chunker was still holding; on an error or barge-in, stop synthesizing.
+	// wait() runs on every path so the worker goroutine never leaks, and it
+	// returns the audio streamed so far plus the first synthesis failure.
+	if ttsPipe != nil {
+		if err == nil && ctx.Err() == nil {
+			for _, clause := range chunker.flush() {
+				if !ttsPipe.enqueue(clause) {
+					break
+				}
+			}
+		}
+		streamedAudio, ttsErr = ttsPipe.wait()
+	}
+
 	// A clause synthesis failed mid-stream (the callback stopped the prediction);
 	// report it as a TTS error rather than a prediction error.
 	if ttsErr != nil {
@@ -233,6 +261,7 @@ func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation
 		cancel()
 		return true
 	}
+	r.addUsage(pred.Usage)
 
 	content := streamer.content()
 	toolCalls := functions.ToolCallsFromChatDeltas(pred.ChatDeltas)
@@ -244,24 +273,19 @@ func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation
 			announce()
 		}
 
-		// Synthesize the audio. With clause chunking the completed clauses were
-		// already spoken inside the token callback; flush the trailing clause(s)
-		// the segmenter was still holding. Otherwise buffer the whole message and
-		// synthesize it once. emitSpeech streams the audio chunks when the TTS
-		// backend supports TTSStream, otherwise it sends a single unary delta.
+		// With clause chunking the clauses were synthesized on the worker as the
+		// reply streamed (including the trailing flush drained above), so the
+		// audio is already accumulated. Otherwise buffer the whole message and
+		// synthesize it once now — emitSpeech streams the audio chunks when the
+		// TTS backend supports TTSStream, otherwise it sends a single unary delta.
 		var audio []byte
 		if chunker != nil {
-			for _, clause := range chunker.flush() {
-				if ttsErr = speakClause(clause); ttsErr != nil {
-					break
-				}
-			}
 			audio = streamedAudio
 		} else {
 			audio, ttsErr = emitSpeech(ctx, t, session, responseID, itemID, content)
-		}
-		if ttsErr != nil {
-			return fail("tts_error", "TTS generation failed", ttsErr)
+			if ttsErr != nil {
+				return fail("tts_error", "TTS generation failed", ttsErr)
+			}
 		}
 
 		_, isWebRTC := t.(*WebRTCTransport)
@@ -306,10 +330,12 @@ func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation
 			OutputIndex:     0,
 			Item:            item,
 		})
+		r.addItem(item)
 	}
 
-	// Emit any tool calls, the terminal response.done, and (for server-side
-	// assistant tools) the follow-up turn — shared with the buffered path.
-	emitToolCallItems(ctx, session, conv, t, responseID, toolCalls, content != "", toolTurn)
+	// Emit any tool calls and (for server-side assistant tools) the follow-up
+	// turn — shared with the buffered path. The single terminal is emitted by
+	// triggerResponse.
+	emitToolCallItems(ctx, session, conv, t, r, toolCalls, content != "", toolTurn)
 	return true
 }
diff --git a/core/http/endpoints/openai/realtime_stream_test.go b/core/http/endpoints/openai/realtime_stream_test.go
index 5150feb21..439f3240e 100644
--- a/core/http/endpoints/openai/realtime_stream_test.go
+++ b/core/http/endpoints/openai/realtime_stream_test.go
@@ -102,7 +102,8 @@ var _ = Describe("streamLLMResponse", func() {
 		t := &fakeTransport{}
 		llmCfg := &config.ModelConfig{}
 
-		handled := streamLLMResponse(context.Background(), session, conv, t, "resp1", nil, nil, llmCfg, nil, nil, 0)
+		r := &liveResponse{id: "resp1"}
+		handled := streamLLMResponse(context.Background(), session, conv, t, r, nil, nil, llmCfg, nil, nil, 0)
 
 		Expect(handled).To(BeTrue())
 		// One live transcript delta per streamed token.
@@ -132,7 +133,8 @@ var _ = Describe("streamLLMResponse", func() {
 		t := &fakeTransport{}
 		llmCfg := &config.ModelConfig{}
 
-		handled := streamLLMResponse(context.Background(), session, conv, t, "resp1", nil, nil, llmCfg, nil, nil, 0)
+		r := &liveResponse{id: "resp1"}
+		handled := streamLLMResponse(context.Background(), session, conv, t, r, nil, nil, llmCfg, nil, nil, 0)
 
 		Expect(handled).To(BeTrue())
 		// Two clauses ("Hello world." mid-stream, "How are you?" on flush) → two
@@ -140,8 +142,10 @@ var _ = Describe("streamLLMResponse", func() {
 		Expect(t.countEvents(types.ServerEventTypeResponseOutputAudioDelta)).To(Equal(2))
 		// The full transcript still streams verbatim.
 		Expect(t.transcriptDeltaText()).To(Equal("Hello world. How are you?"))
-		// Exactly one terminal response.done.
-		Expect(t.countEvents(types.ServerEventTypeResponseDone)).To(Equal(1))
+		// The terminal response.done is emitted by triggerResponse, not by
+		// streamLLMResponse — so at this layer there are none.
+		Expect(t.countEvents(types.ServerEventTypeResponseDone)).To(Equal(0))
+		Expect(r.outcome).To(Equal(outcomeCompleted))
 	})
 
 	It("streams content deltas and emits tool-call items (autoparser tool turn)", func() {
@@ -169,15 +173,18 @@ var _ = Describe("streamLLMResponse", func() {
 		llmCfg := &config.ModelConfig{}
 		llmCfg.TemplateConfig.UseTokenizerTemplate = true
 
-		handled := streamLLMResponse(context.Background(), session, conv, t, "resp1", nil, nil, llmCfg, nil, nil, 0)
+		r := &liveResponse{id: "resp1"}
+		handled := streamLLMResponse(context.Background(), session, conv, t, r, nil, nil, llmCfg, nil, nil, 0)
 
 		Expect(handled).To(BeTrue())
 		// The spoken content was streamed live.
 		Expect(t.transcriptDeltaText()).To(Equal("Let me check."))
 		// The tool call is emitted as a function_call item.
 		Expect(t.countEvents(types.ServerEventTypeResponseFunctionCallArgumentsDone)).To(Equal(1))
-		// Exactly one terminal response.done.
-		Expect(t.countEvents(types.ServerEventTypeResponseDone)).To(Equal(1))
+		// The terminal response.done is emitted by triggerResponse, not by
+		// streamLLMResponse — so at this layer there are none.
+		Expect(t.countEvents(types.ServerEventTypeResponseDone)).To(Equal(0))
+		Expect(r.outcome).To(Equal(outcomeCompleted))
 	})
 
 	It("emits only tool-call items for a content-less tool turn (no empty assistant item)", func() {
@@ -200,7 +207,8 @@ var _ = Describe("streamLLMResponse", func() {
 		llmCfg := &config.ModelConfig{}
 		llmCfg.TemplateConfig.UseTokenizerTemplate = true
 
-		handled := streamLLMResponse(context.Background(), session, conv, t, "resp1", nil, nil, llmCfg, nil, nil, 0)
+		r := &liveResponse{id: "resp1"}
+		handled := streamLLMResponse(context.Background(), session, conv, t, r, nil, nil, llmCfg, nil, nil, 0)
 
 		Expect(handled).To(BeTrue())
 		// No content → no transcript deltas and no spurious assistant content item.
@@ -208,6 +216,51 @@ var _ = Describe("streamLLMResponse", func() {
 		Expect(t.countEvents(types.ServerEventTypeResponseOutputAudioTranscriptDelta)).To(Equal(0))
 		// The tool call is still emitted.
 		Expect(t.countEvents(types.ServerEventTypeResponseFunctionCallArgumentsDone)).To(Equal(1))
-		Expect(t.countEvents(types.ServerEventTypeResponseDone)).To(Equal(1))
+		Expect(t.countEvents(types.ServerEventTypeResponseDone)).To(Equal(0))
+		Expect(r.outcome).To(Equal(outcomeCompleted))
+	})
+})
+
+var _ = Describe("triggerResponse", func() {
+	It("emits exactly one response.created and one response.done with output and usage", func() {
+		m := &fakeModel{
+			cfg: &config.ModelConfig{},
+			predictResp: backend.LLMResponse{
+				Response: "Hi there.",
+				Usage:    backend.TokenUsage{Prompt: 5, Completion: 3},
+			},
+		}
+		session := &Session{
+			OutputSampleRate: 24000,
+			ModelInterface:   m,
+			ModelConfig:      &config.ModelConfig{},
+			// Text-only so the buffered path skips TTS and the assertion focuses
+			// on the terminal's Output + Usage.
+			OutputModalities: []types.Modality{types.ModalityText},
+		}
+		conv := &Conversation{}
+		t := &fakeTransport{}
+
+		triggerResponse(context.Background(), session, conv, t, nil)
+
+		// Exactly one of each lifecycle event for the whole response.create.
+		Expect(t.countEvents(types.ServerEventTypeResponseCreated)).To(Equal(1))
+		Expect(t.countEvents(types.ServerEventTypeResponseDone)).To(Equal(1))
+
+		// The single terminal carries the produced output item and the usage —
+		// both empty in the legacy code.
+		var done *types.ResponseDoneEvent
+		for i := range t.events {
+			if d, ok := t.events[i].(types.ResponseDoneEvent); ok {
+				done = &d // keeps the last match; the count check above pins it to exactly one
+			}
+		}
+		Expect(done).NotTo(BeNil())
+		Expect(done.Response.Status).To(Equal(types.ResponseStatusCompleted))
+		Expect(done.Response.Output).To(HaveLen(1))
+		Expect(done.Response.Usage).NotTo(BeNil())
+		Expect(done.Response.Usage.InputTokens).To(Equal(5))
+		Expect(done.Response.Usage.OutputTokens).To(Equal(3))
+		Expect(done.Response.Usage.TotalTokens).To(Equal(8)) // 5 prompt + 3 completion from predictResp
 	})
 })
diff --git a/core/http/endpoints/openai/realtime_transcription.go b/core/http/endpoints/openai/realtime_transcription.go
index 44456101c..28a5147c1 100644
--- a/core/http/endpoints/openai/realtime_transcription.go
+++ b/core/http/endpoints/openai/realtime_transcription.go
@@ -7,6 +7,33 @@ import (
 	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
 )
 
+// emitPrecomputedTranscription emits the transcription events for a turn whose
+// transcript already exists (semantic_vad's live stream, or the retranscribe
+// gate's batch decode) without running the backend again. Like emitTranscription
+// it replays the non-empty deltas in order and then sends one completed event,
+// all sharing itemID. It stops at, and returns, the first SendEvent error.
+func emitPrecomputedTranscription(t Transport, itemID string, deltas []string, transcript string) error {
+	for _, d := range deltas {
+		if d == "" {
+			continue // nothing to replay for an empty fragment
+		}
+		if err := t.SendEvent(types.ConversationItemInputAudioTranscriptionDeltaEvent{
+			ServerEventBase: types.ServerEventBase{EventID: "event_TODO"},
+			ItemID:          itemID,
+			ContentIndex:    0,
+			Delta:           d,
+		}); err != nil {
+			return err // abort: remaining deltas and the completed event are not sent
+		}
+	}
+	return t.SendEvent(types.ConversationItemInputAudioTranscriptionCompletedEvent{
+		ServerEventBase: types.ServerEventBase{EventID: "event_TODO"},
+		ItemID:          itemID,
+		ContentIndex:    0,
+		Transcript:      transcript,
+	})
+}
+
 // emitTranscription transcribes a committed utterance and emits the transcription
 // events for it, returning the final transcript text. With
 // pipeline.streaming.transcription enabled it streams each transcript fragment as
diff --git a/core/http/endpoints/openai/realtime_tts_pipeline.go b/core/http/endpoints/openai/realtime_tts_pipeline.go
new file mode 100644
index 000000000..c9828b0aa
--- /dev/null
+++ b/core/http/endpoints/openai/realtime_tts_pipeline.go
@@ -0,0 +1,153 @@
+package openai
+
+import (
+	"sync"
+	"sync/atomic"
+
+	"github.com/mudler/LocalAI/core/http/endpoints/openai/ttscoord"
+)
+
+// ttsPipeline decouples speech synthesis from LLM token generation.
+//
+// The LLM token callback runs on the same goroutine that drains the model's
+// gRPC stream, so anything it does serially — including a blocking TTS call —
+// stops the stream from being read and stalls generation (and, since the same
+// goroutine also sends the assistant transcript, freezes the transcript the
+// client sees). ttsPipeline lets the callback hand each completed clause to a
+// single worker goroutine that synthesizes them in order, concurrently with
+// continued generation. One worker preserves clause — and therefore audio —
+// ordering.
+//
+// The clause queue is intentionally unbounded: clauses are short strings and a
+// reply has a bounded number of them, while the expensive product (audio) is
+// paced by the TTS backend regardless. So enqueue never blocks the callback,
+// and the transcript streams to the client at generation speed while audio is
+// produced behind it.
+type ttsPipeline struct {
+	speak func(clause string) ([]byte, error) // synthesizes one clause; invoked only by the worker (run)
+
+	mu    sync.Mutex    // guards queue
+	queue []string      // pending clauses in FIFO order
+	wake  chan struct{} // buffered(1) wakeup signal for the worker
+
+	// coord owns the open->closing->closed lifecycle (machine M5). It replaces the
+	// legacy `closed bool`: the producer raises Close (wait()), the worker raises
+	// WorkerExited. See ttscoord/ and realtime-state-machines.md.
+	coord *ttscoord.Coordinator
+
+	done   chan struct{} // closed by the worker when run returns
+	failed atomic.Bool   // set after the first synthesis error; enqueue then returns false
+
+	// audio and firstErr are owned by the worker goroutine and only safe to
+	// read after wait() has returned (it joins on the worker via done).
+	audio    []byte
+	firstErr error
+}
+
+// newTTSPipeline starts the worker goroutine; the caller must eventually call
+// wait(), otherwise the worker never exits. speak performs the synthesis and returns
+// the PCM for the conversation-item record (empty when audio goes out-of-band, e.g. WebRTC).
+func newTTSPipeline(speak func(clause string) ([]byte, error)) *ttsPipeline {
+	p := &ttsPipeline{
+		speak: speak,
+		wake:  make(chan struct{}, 1), // capacity 1: signal() never blocks and extra wakeups coalesce
+		done:  make(chan struct{}),    // never sent on; closed by run on exit
+	}
+	p.coord = ttscoord.New(p)
+	go p.run() // the single worker; wait() joins it through done
+	return p
+}
+
+// closing reports whether the lifecycle has left Open, i.e. wait() has been
+// called. run and enqueue call it while holding p.mu so that their queue
+// access and the close check are observed consistently.
+func (p *ttsPipeline) closing() bool {
+	_, open := p.coord.State().(ttscoord.Open)
+	return !open // every state other than Open counts as closing
+}
+
+// Perform executes a coordinator effect: Wake nudges the worker (non-blocking); any other effect is ignored.
+func (p *ttsPipeline) Perform(e ttscoord.Effect) {
+	if _, ok := e.(ttscoord.Wake); ok {
+		p.signal()
+	}
+}
+
+// run is the worker loop: it pops clauses in FIFO order and speaks each one,
+// exiting once the queue is empty and the lifecycle has left Open.
+func (p *ttsPipeline) run() {
+	defer close(p.done) // unblocks every wait(), current and future
+	for {
+		p.mu.Lock()
+		for len(p.queue) == 0 && !p.closing() {
+			p.mu.Unlock()
+			<-p.wake // sleep until signal() is called (enqueue, or the Wake effect)
+			p.mu.Lock()
+		}
+		if len(p.queue) == 0 && p.closing() {
+			p.mu.Unlock()
+			// Drained and closed: mark the lifecycle Closed, then exit (closing done releases wait()).
+			_ = p.coord.Apply(ttscoord.WorkerExited{})
+			return
+		}
+		clause := p.queue[0]
+		p.queue = p.queue[1:]
+		p.mu.Unlock() // released before speak, so enqueue never waits on synthesis
+
+		// After a failure, drain without speaking so wait() returns promptly with the first error.
+		if p.failed.Load() {
+			continue
+		}
+		a, err := p.speak(clause)
+		if err != nil {
+			p.firstErr = err
+			p.failed.Store(true)
+			continue
+		}
+		p.audio = append(p.audio, a...)
+	}
+}
+
+// enqueue offers a clause for synthesis without waiting on the worker. It returns
+// false, and the clause is dropped, once synthesis has failed or wait() was called.
+func (p *ttsPipeline) enqueue(clause string) bool {
+	if p.failed.Load() {
+		return false // a clause already failed: tells the caller to stop the prediction
+	}
+	p.mu.Lock()
+	// Reject once closing/closed: the worker may have already drained and exited,
+	// so a clause queued now would be silently dropped. The lifecycle (Open) and
+	// the append are checked under the same lock, so the worker cannot exit between
+	// the gate and the enqueue (it takes p.mu to observe the empty queue).
+	if p.closing() {
+		p.mu.Unlock()
+		return false
+	}
+	p.queue = append(p.queue, clause)
+	p.mu.Unlock()
+	p.signal()
+	return true
+}
+
+// signal wakes the worker without blocking; the buffered channel coalesces
+// signals, which is safe because the worker drains the whole queue per wake.
+func (p *ttsPipeline) signal() {
+	select {
+	case p.wake <- struct{}{}:
+	default: // a wakeup is already pending; dropping this one loses nothing
+	}
+}
+
+// wait closes the queue and blocks until the worker has drained every enqueued
+// clause (speaking them unless one failed), then returns the accumulated audio and
+// the first synthesis error. It is idempotent: calling it again returns the same
+// result without blocking, so callers can drain it explicitly and still defer a
+// wait() as a leak-proof backstop. enqueue rejects clauses offered after the first wait().
+func (p *ttsPipeline) wait() ([]byte, error) {
+	// Close the lifecycle (Open->Closing) and wake the worker. Idempotent: a
+	// second Close is absorbed (no second wake), and <-p.done returns immediately
+	// once the worker has exited.
+	_ = p.coord.Apply(ttscoord.Close{})
+	<-p.done
+	return p.audio, p.firstErr
+}
diff --git a/core/http/endpoints/openai/realtime_tts_pipeline_test.go b/core/http/endpoints/openai/realtime_tts_pipeline_test.go
new file mode 100644
index 000000000..a5e070248
--- /dev/null
+++ b/core/http/endpoints/openai/realtime_tts_pipeline_test.go
@@ -0,0 +1,114 @@
+package openai
+
+import (
+	"errors"
+	"sync"
+	"time"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("ttsPipeline", func() {
+	It("synthesizes clauses in order and accumulates their audio", func() {
+		p := newTTSPipeline(func(clause string) ([]byte, error) {
+			return []byte(clause), nil
+		})
+		Expect(p.enqueue("a")).To(BeTrue())
+		Expect(p.enqueue("b")).To(BeTrue())
+		Expect(p.enqueue("c")).To(BeTrue())
+
+		audio, err := p.wait()
+		Expect(err).NotTo(HaveOccurred())
+		Expect(string(audio)).To(Equal("abc"))
+	})
+
+	It("never blocks the producer even when synthesis is slow", func() {
+		var started sync.WaitGroup
+		started.Add(1)
+		release := make(chan struct{})
+		first := true // read and written only inside speak, i.e. on the single worker goroutine
+		p := newTTSPipeline(func(clause string) ([]byte, error) {
+			if first {
+				first = false
+				started.Done()
+				<-release // hold the worker on the first clause
+			}
+			return []byte(clause), nil
+		})
+
+		Expect(p.enqueue("1")).To(BeTrue())
+		started.Wait() // worker is now blocked synthesizing the first clause
+
+		// Enqueuing many more clauses must return immediately, not block on the
+		// stalled worker — this is what keeps the LLM recv loop flowing.
+		done := make(chan struct{})
+		go func() {
+			defer close(done)
+			for _, c := range []string{"2", "3", "4", "5"} {
+				p.enqueue(c) // result ignored: only the non-blocking property is under test here
+			}
+		}()
+		Eventually(done, time.Second).Should(BeClosed())
+
+		close(release)
+		audio, err := p.wait()
+		Expect(err).NotTo(HaveOccurred())
+		Expect(string(audio)).To(Equal("12345"))
+	})
+
+	It("keeps the first error, stops speaking, and signals the producer to stop", func() {
+		boom := errors.New("backend gone")
+		var spoken []string
+		var mu sync.Mutex
+		p := newTTSPipeline(func(clause string) ([]byte, error) {
+			mu.Lock()
+			spoken = append(spoken, clause)
+			mu.Unlock()
+			if clause == "b" {
+				return nil, boom
+			}
+			return []byte(clause), nil
+		})
+
+		Expect(p.enqueue("a")).To(BeTrue())
+		Expect(p.enqueue("b")).To(BeTrue()) // always accepted: "b" can only fail after it has been queued
+
+		// Once the failure is observed, enqueue reports it so the caller stops
+		// the prediction; any further clauses are dropped, not spoken.
+		Eventually(func() bool { return !p.enqueue("c") }, time.Second).Should(BeTrue())
+
+		_, err := p.wait()
+		Expect(err).To(MatchError(boom))
+
+		mu.Lock()
+		defer mu.Unlock()
+		Expect(spoken).NotTo(ContainElement("c"), "clauses after the failure are not synthesized")
+	})
+
+	It("is idempotent: a second wait returns the same result without blocking", func() {
+		p := newTTSPipeline(func(clause string) ([]byte, error) {
+			return []byte(clause), nil
+		})
+		Expect(p.enqueue("x")).To(BeTrue())
+
+		audio1, err1 := p.wait()
+		// A deferred backstop wait() in the caller runs after the explicit one;
+		// it must not block or change the result.
+		audio2, err2 := p.wait()
+
+		Expect(err1).NotTo(HaveOccurred())
+		Expect(err2).NotTo(HaveOccurred())
+		Expect(string(audio1)).To(Equal("x"))
+		Expect(string(audio2)).To(Equal("x"))
+	})
+
+	It("returns cleanly when no clause was ever enqueued", func() {
+		p := newTTSPipeline(func(clause string) ([]byte, error) {
+			return []byte(clause), nil
+		})
+		audio, err := p.wait() // the worker is idle on wake; wait() must still release it
+		Expect(err).NotTo(HaveOccurred())
+		Expect(audio).To(BeEmpty())
+	})
+})
diff --git a/core/http/endpoints/openai/realtime_turncoord.go b/core/http/endpoints/openai/realtime_turncoord.go
new file mode 100644
index 000000000..30ffffc66
--- /dev/null
+++ b/core/http/endpoints/openai/realtime_turncoord.go
@@ -0,0 +1,127 @@
+package openai
+
+import (
+	"context"
+	"time"
+
+	"github.com/mudler/LocalAI/core/http/endpoints/openai/respcoord"
+	"github.com/mudler/LocalAI/core/http/endpoints/openai/turncoord"
+	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
+	"github.com/mudler/LocalAI/core/schema"
+)
+
+// turnSink wires the explicit turn-detection state machine (turncoord.Coordinator
+// — machine "M2" in docs/design/realtime-state-machines.md) into handleVAD.
+//
+// In the legacy code the turn lifecycle was split across two variables that could
+// disagree: handleVAD's goroutine-local speechStarted bool and the semantic_vad
+// liveTurnState's "is the live stream open" flag (lts.open()). A discardTurn (the
+// no-speech clear, or teardown) closed the live stream but left speechStarted
+// true, so the next speech onset was suppressed by `if !speechStarted` — no
+// speech_started, no barge-in, no commit (Part 2, failure mode 4). Here "speech
+// started" and "a turn is open" are ONE coordinator state, so they cannot desync.
+//
+// Unlike responseSink (M3), which is a genuine dual-writer race, the turn machine
+// is owned by the single handleVAD goroutine; this sink and its coordinator are
+// loop-local. The coordinator's lock only matters for the teardown-time Abort and
+// for keeping State() readable — there is no second writer.
+//
+// The effects map onto the existing turn I/O:
+//   - OpenTurn:          open the live ASR stream (semantic_vad) + feed the onset
+//     audio. A failed open degrades the turn to silence-only — the turn still
+//     proceeds (server_vad-like), matching the legacy behaviour.
+//   - BargeIn:           cancel any in-flight response (non-blocking).
+//   - EmitSpeechStarted: input_audio_buffer.speech_started.
+//   - EmitSpeechStopped: input_audio_buffer.speech_stopped.
+//   - CommitTurn:        committed event + finalize the live stream + issue the
+//     response (via responseSink/respcoord).
+//   - DiscardTurn:       close the live stream and retract any captions.
+//
+// The data-heavy effects (OpenTurn, CommitTurn) need the current tick's audio and
+// transcription context. Because Apply performs effects synchronously on the same
+// (handleVAD) goroutine, the loop sets the relevant scratch fields immediately
+// before each Apply; there is no cross-goroutine sharing.
+type turnSink struct {
+	session    *Session
+	conv       *Conversation
+	transport  Transport
+	lts        *liveTurnState  // live ASR stream driven by OpenTurn / CommitTurn / DiscardTurn
+	vadContext context.Context // passed to lts.openTurn and to respSink.issue
+	startTime  time.Time       // origin of the AudioStartMs / AudioEndMs offsets
+
+	coord *turncoord.Coordinator // created in newTurnSink with this sink passed to turncoord.New
+
+	// per-tick context, set by handleVAD before each Apply (single goroutine).
+	sv                 *types.RealtimeSessionSemanticVad // nil = server_vad
+	onsetAudio         []int16                           // OpenTurn feeds this
+	commitAudio        []byte                            // CommitTurn issues this
+	commitAudioLength  float64                           // for finishTurn (flush tail)
+	commitRetranscribe bool                              // gated batch is authoritative
+	commitGated        *schema.TranscriptionResult       // retranscribe batch decode
+}
+
+func newTurnSink(session *Session, conv *Conversation, t Transport, lts *liveTurnState, vadContext context.Context, startTime time.Time) *turnSink {
+	s := &turnSink{ // the per-tick scratch fields (sv, onsetAudio, commit*) start at their zero values
+		session:    session,
+		conv:       conv,
+		transport:  t,
+		lts:        lts,
+		vadContext: vadContext,
+		startTime:  startTime,
+	}
+	s.coord = turncoord.New(s) // assigned after construction because New takes the sink itself
+	return s
+}
+
+// Perform executes one effect. It is called by Coordinator.Apply while the
+// coordinator lock is held. The turn coordinator is single-writer (handleVAD), so
+// the synchronous network writes / lts operations here are the same ones the
+// legacy loop did inline on this goroutine; they never contend the lock.
+func (s *turnSink) Perform(e turncoord.Effect) {
+	switch eff := e.(type) { // effect types without a case below are ignored
+	case turncoord.OpenTurn:
+		if s.sv != nil && s.lts.openTurn(s.vadContext, string(eff.Turn)) { // sv == nil (server_vad): no live stream
+			s.lts.feedNewAudio(s.onsetAudio)
+		}
+	case turncoord.BargeIn:
+		s.session.respSink.cancel(respcoord.SourceVAD)
+	case turncoord.EmitSpeechStarted:
+		sendEvent(s.transport, types.InputAudioBufferSpeechStartedEvent{
+			ServerEventBase: types.ServerEventBase{EventID: "event_TODO"},
+			AudioStartMs:    time.Since(s.startTime).Milliseconds(),
+		})
+	case turncoord.EmitSpeechStopped:
+		sendEvent(s.transport, types.InputAudioBufferSpeechStoppedEvent{
+			ServerEventBase: types.ServerEventBase{EventID: "event_TODO"},
+			AudioEndMs:      time.Since(s.startTime).Milliseconds(),
+		})
+	case turncoord.CommitTurn:
+		// The committed item id is the coordinator's turn id (== the live caption
+		// id), so the client's completed event replaces the partial text.
+		itemID := string(eff.Turn)
+		sendEvent(s.transport, types.InputAudioBufferCommittedEvent{
+			ServerEventBase: types.ServerEventBase{EventID: "event_TODO"},
+			ItemID:          itemID,
+			PreviousItemID:  "TODO", // NOTE(review): placeholder literal, not a real previous-item id
+		})
+		// Finalize the turn's live stream (flushes the decode tail). In
+		// retranscribe mode the batch decode is authoritative, so the streamed
+		// transcript is dropped.
+		var live *liveUtterance
+		if s.sv != nil {
+			ut := s.lts.finishTurn(s.commitAudioLength)
+			if !s.commitRetranscribe {
+				live = ut
+			}
+		}
+		audio := s.commitAudio // copy the per-tick scratch fields into locals for the closure below
+		gated := s.commitGated
+		conv := s.conv
+		s.session.respSink.issue(s.vadContext, respcoord.SourceVAD, func(ctx context.Context) {
+			commitUtteranceWithTranscript(ctx, audio, live, gated, itemID, s.session, conv, s.transport)
+		})
+	case turncoord.DiscardTurn:
+		// No-op if the stream was never open (server_vad / already idle).
+		s.lts.discardTurn()
+	}
+}
diff --git a/core/http/endpoints/openai/realtime_vad_buffer_test.go b/core/http/endpoints/openai/realtime_vad_buffer_test.go
new file mode 100644
index 000000000..0fbef3e6b
--- /dev/null
+++ b/core/http/endpoints/openai/realtime_vad_buffer_test.go
@@ -0,0 +1,54 @@
+package openai
+
+import (
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// dropInspectedPrefix is what stands between the VAD loop's buffer clears and
+// cutting the first word off an utterance: the no-speech clear must keep the
+// holdback tail (silero hasn't crossed its onset threshold yet) and both
+// clears must keep audio appended while the tick ran (the VAD never saw it).
+var _ = Describe("dropInspectedPrefix", func() {
+	It("keeps the holdback tail of the inspected window and everything appended mid-tick", func() {
+		inspected := []byte{1, 2, 3, 4, 5, 6}
+		appended := []byte{7, 8}
+		buf := append(append([]byte(nil), inspected...), appended...)
+
+		out := dropInspectedPrefix(buf, len(inspected), 2) // args read as (buffer, inspected length, holdback) — inferred from these cases
+
+		Expect(out).To(Equal([]byte{5, 6, 7, 8}), "older confirmed-silent head dropped, possible onset + fresh audio kept")
+	})
+
+	It("returns the buffer unchanged when the inspected window fits in the holdback", func() {
+		buf := []byte{1, 2, 3}
+
+		Expect(dropInspectedPrefix(buf, len(buf), 4)).To(Equal(buf))        // holdback larger than the inspected window
+		Expect(dropInspectedPrefix(buf, len(buf), len(buf))).To(Equal(buf)) // holdback exactly equal to it
+	})
+
+	It("drops the whole inspected window with zero holdback, keeping only mid-tick appends", func() {
+		// The commit-time clear: the inspected audio was committed, audio
+		// appended while the tick ran belongs to the next turn.
+		buf := []byte{1, 2, 3, 4}
+
+		Expect(dropInspectedPrefix(buf, 4, 0)).To(BeEmpty())
+		Expect(dropInspectedPrefix(append(buf, 9), 4, 0)).To(Equal([]byte{9}))
+	})
+
+	It("clamps when told more was inspected than the buffer holds", func() {
+		buf := []byte{1, 2}
+
+		Expect(dropInspectedPrefix(buf, 10, 0)).To(BeEmpty()) // inspected (10) exceeds len(buf) (2)
+	})
+
+	It("returns a copy, not a sub-slice, when bytes are dropped", func() {
+		buf := []byte{1, 2, 3, 4}
+
+		out := dropInspectedPrefix(buf, 4, 2)
+
+		Expect(out).To(Equal([]byte{3, 4}))
+		buf[2] = 99 // index 2 held the first surviving byte (3); a shared backing array would show 99 in out
+		Expect(out).To(Equal([]byte{3, 4}), "mutating the old backing array must not leak into the published buffer")
+	})
+})
diff --git a/core/http/endpoints/openai/respcoord/respcoord.go b/core/http/endpoints/openai/respcoord/respcoord.go
new file mode 100644
index 000000000..6c8c6d80f
--- /dev/null
+++ b/core/http/endpoints/openai/respcoord/respcoord.go
@@ -0,0 +1,267 @@
+// Package respcoord is the explicit state machine for the realtime API's
+// response-coordination concern (machine "M3" in
+// docs/design/realtime-state-machines.md).
+//
+// In the legacy code this machine is implicit: a response is "active" iff
+// Session.activeResponseDone is a non-nil, unclosed channel, and the lifecycle
+// is driven from TWO goroutines (the client read-loop and the VAD goroutine)
+// that both call startResponse/cancelActiveResponse. responseMu guards only the
+// field swap, while the <-done wait happens outside the lock, so two concurrent
+// starts can briefly leave two live response goroutines both appending to the
+// conversation. See docs/design/realtime-state-machines.md, Part 2 (failure
+// mode 2) and the ResponseLifecycle spec under formal-verification/.
+//
+// This package replaces that with:
+//   - a sealed sum type for State (illegal states are unrepresentable),
+//   - a total, pure transition function Next(state, event) -> (state, effects),
+//   - a single-writer Coordinator that serializes every transition.
+//
+// The design guarantees the invariants the specs check:
+//   - at most one live response at any instant,
+//   - exactly one terminal (response.done) per started response,
+//   - no response is started after its terminal (no resurrection).
+package respcoord
+
+import (
+	"fmt"
+
+	"github.com/mudler/LocalAI/core/http/endpoints/openai/coordinator"
+)
+
+// ResponseID identifies one response attempt. Callers mint a fresh, strictly
+// increasing id for every Start and never reuse one. Because ids are unique,
+// Next can recognise a Finished whose ID differs from the Active one as
+// "stale" (that response was already superseded or cancelled) and ignore it.
+type ResponseID uint64
+
+// Source names the goroutine that raised an event. It exists purely for
+// logging/observability: no transition branches on it, because both writers
+// carry equal authority. Carrying it on the event keeps the dual-writer
+// nature of the machine visible instead of hidden.
+type Source int
+
+const (
+	// SourceClient marks the read-loop: a response.create or a manual
+	// input_audio_buffer.commit.
+	SourceClient Source = iota
+	// SourceVAD marks the turn-detection goroutine: an end-of-speech commit
+	// or a barge-in cancel.
+	SourceVAD
+)
+
+// String renders the source for logs; unknown values print as "Source(n)".
+func (s Source) String() string {
+	switch {
+	case s == SourceClient:
+		return "client"
+	case s == SourceVAD:
+		return "vad"
+	}
+	return fmt.Sprintf("Source(%d)", int(s))
+}
+
+// Status is the outcome carried by a response's terminal (response.done).
+type Status int
+
+const (
+	// StatusCompleted: the response ran to its own end.
+	StatusCompleted Status = iota
+	// StatusCancelled: the response was cut short by a barge-in, an explicit
+	// response.cancel, or by being superseded by a newer response.
+	StatusCancelled
+)
+
+// String renders the status for logs; unknown values print as "Status(n)".
+func (s Status) String() string {
+	switch {
+	case s == StatusCompleted:
+		return "completed"
+	case s == StatusCancelled:
+		return "cancelled"
+	}
+	return fmt.Sprintf("Status(%d)", int(s))
+}
+
+// State is the sealed sum type of coordinator states: Idle | Active |
+// Terminated. The isState marker is unexported, so types outside this package
+// cannot implement State directly and fabricate an out-of-band state.
+type State interface {
+	isState()
+	String() string
+}
+
+// Idle: no response is in flight.
+type Idle struct{}
+
+// Active: exactly one response (ID) is in flight. The struct holds a single id,
+// so "two active responses" is not representable.
+type Active struct{ ID ResponseID }
+
+// Terminated: the session is torn down. Absorbing — no response can start from
+// here, so the M1 (connection) parent's teardown can guarantee no response
+// outlives the session (see formal-verification/session_lifecycle.fizz).
+type Terminated struct{}
+
+// isState marks each concrete state as a member of the sealed State set.
+func (Idle) isState()       {}
+func (Active) isState()     {}
+func (Terminated) isState() {}
+
+func (Idle) String() string       { return "Idle" }
+func (a Active) String() string   { return fmt.Sprintf("Active(%d)", a.ID) }
+func (Terminated) String() string { return "Terminated" }
+
+// Event is the sealed sum type of inputs. Exhaustively:
+// Start | Finished | Cancel | Shutdown.
+type Event interface {
+	isEvent()
+	String() string
+}
+
+// Start requests a new response. ID must be a fresh, never-before-used id.
+type Start struct {
+	ID     ResponseID
+	Source Source
+}
+
+// Finished: the response goroutine for ID reached its own terminal. If ID is not
+// the active response it is stale (already superseded/cancelled) and is ignored.
+type Finished struct{ ID ResponseID }
+
+// Cancel requests cancellation of the in-flight response (barge-in or explicit
+// response.cancel). It is a no-op when idle.
+type Cancel struct{ Source Source }
+
+// Shutdown terminates the coordinator at session teardown: it cancels any
+// in-flight response and moves to the absorbing Terminated state, after which no
+// response can start. Raised by the connection (M1) parent's teardown.
+type Shutdown struct{}
+
+// isEvent marks each concrete event as a member of the sealed Event set.
+func (Start) isEvent()    {}
+func (Finished) isEvent() {}
+func (Cancel) isEvent()   {}
+func (Shutdown) isEvent() {}
+
+func (e Start) String() string    { return fmt.Sprintf("Start(%d,%s)", e.ID, e.Source) }
+func (e Finished) String() string { return fmt.Sprintf("Finished(%d)", e.ID) }
+func (e Cancel) String() string   { return fmt.Sprintf("Cancel(%s)", e.Source) }
+func (Shutdown) String() string   { return "Shutdown" }
+
+// Effect is a side effect that Next returns as data for the caller to perform.
+// Returning data instead of firing callbacks inside the transition keeps Next
+// pure and exhaustively testable, and leaves how/when to perform the effects to
+// the Coordinator. Exhaustively: CancelResponse | StartResponse | EmitTerminal.
+type Effect interface {
+	isEffect()
+	String() string
+}
+
+// CancelResponse: cancel the context of the running response ID.
+type CancelResponse struct{ ID ResponseID }
+
+// StartResponse: spawn the response goroutine for ID.
+type StartResponse struct{ ID ResponseID }
+
+// EmitTerminal: send response.done for ID with Status.
+type EmitTerminal struct {
+	ID     ResponseID
+	Status Status
+}
+
+// isEffect marks each concrete effect as a member of the sealed Effect set.
+func (CancelResponse) isEffect() {}
+func (StartResponse) isEffect()  {}
+func (EmitTerminal) isEffect()   {}
+
+func (e CancelResponse) String() string { return fmt.Sprintf("CancelResponse(%d)", e.ID) }
+func (e StartResponse) String() string  { return fmt.Sprintf("StartResponse(%d)", e.ID) }
+func (e EmitTerminal) String() string {
+	return fmt.Sprintf("EmitTerminal(%d,%s)", e.ID, e.Status)
+}
+
+// Next is the pure, total transition function of the machine: given the
+// current state and an incoming event it yields the successor state plus the
+// effects to perform, in order. The error is non-nil only when s or e is an
+// implementation this function has no case for (a programmer error, e.g. a
+// new type added without updating Next); callers must surface it rather than
+// drop it. Every in-domain (state, event) pair is defined — nothing is
+// "forbidden", stale or idle inputs are simply no-ops.
+//
+// Supersede (Active + Start) is the crux of the fix. Within one serialized
+// transition the old response is cancelled and given its cancelled terminal
+// BEFORE the replacement is started. When the old goroutine later reports
+// Finished, its id no longer matches the active one, so the event is stale
+// and ignored: every id gets exactly one terminal and at most one response is
+// live at any time.
+func Next(s State, e Event) (State, []Effect, error) {
+	switch cur := s.(type) {
+	case Idle:
+		switch in := e.(type) {
+		case Start:
+			return Active{ID: in.ID}, []Effect{StartResponse{ID: in.ID}}, nil
+		case Cancel, Finished:
+			// Nothing is in flight: a cancel is an idempotent no-op and a Finished
+			// can only be the stale terminal of an already-ended response.
+			return Idle{}, nil, nil
+		case Shutdown:
+			// Teardown with nothing in flight: go terminal.
+			return Terminated{}, nil, nil
+		}
+	case Active:
+		switch in := e.(type) {
+		case Start:
+			// Supersede: end cur (cancel, then its cancelled terminal) before the
+			// replacement is started.
+			return Active{ID: in.ID}, []Effect{
+				CancelResponse{ID: cur.ID},
+				EmitTerminal{ID: cur.ID, Status: StatusCancelled},
+				StartResponse{ID: in.ID},
+			}, nil
+		case Finished:
+			if in.ID != cur.ID {
+				// Stale finish from a superseded response — already terminal-ed.
+				return cur, nil, nil
+			}
+			return Idle{}, []Effect{EmitTerminal{ID: cur.ID, Status: StatusCompleted}}, nil
+		case Cancel, Shutdown:
+			// Both end cur identically (cancel + cancelled terminal); only the
+			// resting state differs: Idle after a cancel, Terminated after teardown
+			// so nothing can start afterwards.
+			var rest State = Idle{}
+			if _, teardown := in.(Shutdown); teardown {
+				rest = Terminated{}
+			}
+			return rest, []Effect{
+				CancelResponse{ID: cur.ID},
+				EmitTerminal{ID: cur.ID, Status: StatusCancelled},
+			}, nil
+		}
+	case Terminated:
+		// Absorbing: every event is a no-op. A Start after teardown is rejected
+		// (no StartResponse), so no response can outlive the session.
+		switch e.(type) {
+		case Start, Finished, Cancel, Shutdown:
+			return Terminated{}, nil, nil
+		}
+	}
+	return s, nil, fmt.Errorf("respcoord: unhandled transition %s <- %s", s, e)
+}
+
+// EffectSink performs the effects produced by a transition. See coordinator.Sink
+// for the non-blocking contract: Perform runs under the coordinator lock, so it
+// must not block and must not re-enter Apply (the spawned response goroutine's
+// Finished apply happens only after the sink returns).
+type EffectSink = coordinator.Sink[Effect]
+
+// Coordinator serializes every Start/Finished/Cancel/Shutdown transition behind
+// one lock, so the read-loop and VAD goroutines can call Apply concurrently
+// without the legacy dual-writer race. Effects run in order under that lock,
+// keeping the supersede ordering (cancel old, emit old terminal, start new).
+type Coordinator = coordinator.Coordinator[State, Event, Effect]
+
+// New returns a Coordinator that starts in Idle, steps with Next, and hands
+// every resulting effect to sink.
+func New(sink EffectSink) *Coordinator {
+	return coordinator.New[State, Event, Effect](Idle{}, Next, sink)
+}
diff --git a/core/http/endpoints/openai/respcoord/respcoord_suite_test.go b/core/http/endpoints/openai/respcoord/respcoord_suite_test.go
new file mode 100644
index 000000000..df26a1813
--- /dev/null
+++ b/core/http/endpoints/openai/respcoord/respcoord_suite_test.go
@@ -0,0 +1,13 @@
+package respcoord
+
+import (
+	"testing"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+// TestRespcoord is the `go test` entry point: it routes Gomega failures to Ginkgo's Fail and runs the suite.
+func TestRespcoord(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "respcoord (realtime M3) Suite")
+}
diff --git a/core/http/endpoints/openai/respcoord/respcoord_test.go b/core/http/endpoints/openai/respcoord/respcoord_test.go
new file mode 100644
index 000000000..6a3c7c297
--- /dev/null
+++ b/core/http/endpoints/openai/respcoord/respcoord_test.go
@@ -0,0 +1,370 @@
+package respcoord
+
+import (
+	"math/rand/v2"
+	"sync"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// recordingSink is the test double for EffectSink: it appends every effect it
+// is handed to an in-memory log so the invariants can be checked from the
+// outside. Coordinator.Apply already calls Perform under its own lock; the
+// mutex here only protects reads made from the spec goroutine.
+type recordingSink struct {
+	mu  sync.Mutex
+	log []Effect
+}
+
+// Perform records e at the end of the log.
+func (s *recordingSink) Perform(e Effect) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	s.log = append(s.log, e)
+}
+
+// snapshot returns a copy of the log so callers can iterate without the lock.
+func (s *recordingSink) snapshot() []Effect {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	return append(make([]Effect, 0, len(s.log)), s.log...)
+}
+
+// checkInvariants replays the effect log and asserts the three core safety
+// properties from docs/design/realtime-state-machines.md, Part 4:
+//
+//	(1) at most one live response at any instant
+//	    -- after every effect, the number of started-but-not-terminated ids <= 1;
+//	(2) exactly one terminal per started response
+//	    -- each id is started at most once, terminated at most once, and only if started;
+//	(3) no resurrection
+//	    -- an id is never started after it has been terminated.
+func checkInvariants(log []Effect) {
+	started := map[ResponseID]int{}
+	terminated := map[ResponseID]int{}
+	live := map[ResponseID]bool{}
+	for i, eff := range log {
+		switch e := eff.(type) {
+		case StartResponse:
+			Expect(terminated[e.ID]).To(Equal(0), "invariant (3): StartResponse(%d) after it was terminated (effect #%d)\nlog=%v", e.ID, i, log)
+			started[e.ID]++
+			Expect(started[e.ID]).To(Equal(1), "invariant (2): id %d started %d times (effect #%d)\nlog=%v", e.ID, started[e.ID], i, log)
+			live[e.ID] = true
+		case EmitTerminal:
+			Expect(started[e.ID]).To(Equal(1), "invariant (2): id %d terminated without ever being started (effect #%d)\nlog=%v", e.ID, i, log)
+			terminated[e.ID]++
+			Expect(terminated[e.ID]).To(Equal(1), "invariant (2): id %d terminated %d times (effect #%d)\nlog=%v", e.ID, terminated[e.ID], i, log)
+			delete(live, e.ID)
+		case CancelResponse:
+			// no count assertion; cancellation is paired with a terminal
+		}
+		Expect(len(live)).To(BeNumerically("<=", 1), "invariant (1): %d live responses after effect #%d (%s)\nlog=%v", len(live), i, eff, log)
+	}
+}
+
+// unknownEvent satisfies Event but has no case in Next, so passing it
+// exercises Next's defensive "unhandled transition" error path.
+type unknownEvent struct{}
+
+func (unknownEvent) isEvent()       {}
+func (unknownEvent) String() string { return "unknownEvent" }
+
+var _ = Describe("respcoord.Next", func() {
+	// The table pins every (state, event) cell of the pure transition function,
+	// stale / idle no-op cells included. It is the practical stand-in for "no
+	// transition leads to an inconsistent state": whenever a cell changes, this
+	// table has to change with it.
+	DescribeTable("transitions",
+		func(from State, in Event, wantNext State, wantEffects []Effect) {
+			next, effects, err := Next(from, in)
+			Expect(err).NotTo(HaveOccurred())
+			Expect(next).To(Equal(wantNext))
+			Expect(effects).To(Equal(wantEffects))
+		},
+		Entry("idle+start -> active, spawns response",
+			Idle{}, Start{ID: 1, Source: SourceClient},
+			Active{ID: 1}, []Effect{StartResponse{ID: 1}}),
+		Entry("idle+cancel -> idle, no-op",
+			Idle{}, Cancel{Source: SourceVAD},
+			Idle{}, []Effect(nil)),
+		Entry("idle+finished(stale) -> idle, no-op",
+			Idle{}, Finished{ID: 7},
+			Idle{}, []Effect(nil)),
+		Entry("active+start -> supersede: cancel+terminal(old)+start(new)",
+			Active{ID: 1}, Start{ID: 2, Source: SourceVAD},
+			Active{ID: 2},
+			[]Effect{
+				CancelResponse{ID: 1},
+				EmitTerminal{ID: 1, Status: StatusCancelled},
+				StartResponse{ID: 2},
+			}),
+		Entry("active+finished(current) -> idle, completed terminal",
+			Active{ID: 3}, Finished{ID: 3},
+			Idle{}, []Effect{EmitTerminal{ID: 3, Status: StatusCompleted}}),
+		Entry("active+finished(stale) -> stay active, no-op",
+			Active{ID: 3}, Finished{ID: 2},
+			Active{ID: 3}, []Effect(nil)),
+		Entry("active+cancel -> idle, cancel+cancelled terminal",
+			Active{ID: 5}, Cancel{Source: SourceClient},
+			Idle{},
+			[]Effect{
+				CancelResponse{ID: 5},
+				EmitTerminal{ID: 5, Status: StatusCancelled},
+			}),
+		Entry("idle+shutdown -> terminated, no-op",
+			Idle{}, Shutdown{},
+			Terminated{}, []Effect(nil)),
+		Entry("active+shutdown -> terminated: cancel+cancelled terminal",
+			Active{ID: 6}, Shutdown{},
+			Terminated{},
+			[]Effect{
+				CancelResponse{ID: 6},
+				EmitTerminal{ID: 6, Status: StatusCancelled},
+			}),
+		Entry("terminated+start -> terminated, REJECTED (no resurrection)",
+			Terminated{}, Start{ID: 9, Source: SourceClient},
+			Terminated{}, []Effect(nil)),
+		Entry("terminated+finished -> terminated, no-op (stale)",
+			Terminated{}, Finished{ID: 9},
+			Terminated{}, []Effect(nil)),
+		Entry("terminated+cancel -> terminated, no-op",
+			Terminated{}, Cancel{Source: SourceVAD},
+			Terminated{}, []Effect(nil)),
+		Entry("terminated+shutdown -> terminated, idempotent",
+			Terminated{}, Shutdown{},
+			Terminated{}, []Effect(nil)),
+	)
+
+	It("is total: every defined (state, event) pair is handled without error", func() {
+		allStates := []State{Idle{}, Active{ID: 1}, Terminated{}}
+		allEvents := []Event{
+			Start{ID: 2, Source: SourceClient},
+			Finished{ID: 1},
+			Finished{ID: 99},
+			Cancel{Source: SourceVAD},
+			Shutdown{},
+		}
+		for _, from := range allStates {
+			for _, in := range allEvents {
+				_, _, err := Next(from, in)
+				Expect(err).NotTo(HaveOccurred(), "Next(%s, %s)", from, in)
+			}
+		}
+	})
+
+	It("errors on an unknown event type", func() {
+		_, _, err := Next(Active{ID: 1}, unknownEvent{})
+		Expect(err).To(HaveOccurred())
+	})
+})
+
+var _ = Describe("respcoord.Coordinator", func() {
+	// This replaces the previous rapid stateful test: a seeded random walk over
+	// the event space whose recorded effects must satisfy the invariants at every
+	// step. Seeds are fixed so any failure reproduces deterministically.
+	It("upholds the safety invariants over random event sequences", func() {
+		seeds := []uint64{1, 2, 3, 42, 1337, 0xC0FFEE}
+		for _, seed := range seeds {
+			rng := rand.New(rand.NewPCG(seed, 0xA5A5A5A5))
+			rec := &recordingSink{}
+			coord := New(rec)
+			// Picks 0 and 1 index into starters: one Start arm per driving goroutine.
+			starters := [...]Source{SourceClient, SourceVAD}
+			// lastID follows the caller contract: ids only grow and are never reused.
+			var lastID uint64
+
+			for range 3000 {
+				switch pick := rng.IntN(4); pick {
+				case 0, 1: // start from client (0) or VAD (1), always with a fresh id
+					lastID++
+					Expect(coord.Apply(Start{ID: ResponseID(lastID), Source: starters[pick]})).To(Succeed())
+				case 2: // possibly-stale finish from any plausible id (incl. future)
+					target := rng.Uint64N(lastID + 3)
+					Expect(coord.Apply(Finished{ID: ResponseID(target)})).To(Succeed())
+				case 3: // explicit cancel
+					Expect(coord.Apply(Cancel{Source: SourceClient})).To(Succeed())
+				}
+			}
+			// A single replay of the full log per seed is enough: checkInvariants
+			// walks every effect in order, so a violation at any step is caught
+			// without paying O(n^2) for a replay after each Apply.
+			checkInvariants(rec.snapshot())
+		}
+	})
+
+	// Hammer Apply from two goroutines -- the read-loop and the VAD goroutine,
+	// the exact dual-writer scenario that races in the legacy code -- and assert
+	// that no Apply errors and the invariants still hold. Run under -race to also
+	// catch any data race in the coordinator itself.
+	It("upholds the invariants under concurrent dual-writer Apply", func() {
+		const perGoroutine = 2000
+		sink := &recordingSink{}
+		c := New(sink)
+		var idCounter uint64
+		var idMu sync.Mutex
+		nextID := func() ResponseID {
+			idMu.Lock()
+			defer idMu.Unlock()
+			idCounter++
+			return ResponseID(idCounter)
+		}
+
+		var wg sync.WaitGroup
+		drive := func(src Source) {
+			defer wg.Done()
+			defer GinkgoRecover() // the Expects below run off the spec goroutine
+			for i := range perGoroutine {
+				switch i % 3 {
+				case 0:
+					Expect(c.Apply(Start{ID: nextID(), Source: src})).To(Succeed())
+				case 1:
+					if a, ok := c.State().(Active); ok {
+						Expect(c.Apply(Finished{ID: a.ID})).To(Succeed())
+					}
+				case 2:
+					Expect(c.Apply(Cancel{Source: src})).To(Succeed())
+				}
+			}
+		}
+
+		wg.Add(2)
+		go drive(SourceClient)
+		go drive(SourceVAD)
+		wg.Wait()
+
+		checkInvariants(sink.snapshot())
+	})
+
+	It("rejects the dual-writer interleaving the legacy mechanism allowed", func() {
+		// Same sequence as the legacy double-start race: id1 is started, then two
+		// superseding starts (id2, id3) arrive, as the read-loop and VAD would each
+		// issue. The coordinator serializes every Start, so each supersede
+		// cancels+terminates its predecessor -- never two live at once.
+		rec := &recordingSink{}
+		coord := New(rec)
+
+		for _, start := range []Start{{ID: 1, Source: SourceClient}, {ID: 2, Source: SourceVAD}, {ID: 3, Source: SourceClient}} {
+			Expect(coord.Apply(start)).To(Succeed())
+		}
+
+		checkInvariants(rec.snapshot())
+
+		final, isActive := coord.State().(Active)
+		Expect(isActive).To(BeTrue(), "state = %s, want Active(3)", coord.State())
+		Expect(final.ID).To(Equal(ResponseID(3)))
+	})
+
+	It("terminates on shutdown and rejects any later response (no resurrection)", func() {
+		sink := &recordingSink{}
+		c := New(sink)
+
+		Expect(c.Apply(Start{ID: 1, Source: SourceClient})).To(Succeed())
+		Expect(c.Apply(Shutdown{})).To(Succeed()) // cancels id 1 + goes terminal
+		Expect(c.State()).To(Equal(State(Terminated{})))
+
+		// A late response.create after teardown is structurally rejected.
+		Expect(c.Apply(Start{ID: 2, Source: SourceClient})).To(Succeed())
+		Expect(c.State()).To(Equal(State(Terminated{})))
+		// And a stale Finished from the cancelled response is absorbed.
+		Expect(c.Apply(Finished{ID: 1})).To(Succeed())
+
+		checkInvariants(sink.snapshot())
+		starts := 0
+		for _, e := range sink.snapshot() {
+			if _, ok := e.(StartResponse); ok {
+				starts++
+			}
+		}
+		Expect(starts).To(Equal(1), "only id 1 ever started; the post-shutdown Start was rejected")
+	})
+})
+
+// legacyCoord models the LEGACY startResponse/cancelActiveResponse mechanism,
+// where snapshot ("lock" read), cancel-and-wait and spawn are NOT atomic with
+// respect to each other across the two driving goroutines. Test-only: it shows
+// the dual-writer race (Part 2, failure mode 2) respcoord.Coordinator removes.
+//
+// Mapping to the legacy code:
+//   - startStep1  = snapshot Session.activeResponse* under responseMu
+//   - startStep2  = cancelActiveResponse: cancel() then <-done (outside the lock);
+//     a second waiter on an already-closed done returns immediately and does NOT
+//     decrement again (modeled by the snap==registered guard)
+//   - startStep3  = store the new cancel/done pair and spawn the goroutine
+type legacyCoord struct {
+	live       int    // count of live response goroutines; the bug lets it exceed 1
+	registered uint64 // id of the response currently registered, 0 when none
+	nextID     uint64
+}
+
+func (l *legacyCoord) startStep1() uint64 { return l.registered } // snapshot
+
+func (l *legacyCoord) startStep2(snap uint64) { // cancel-and-wait
+	if snap == 0 || snap != l.registered {
+		return // nothing was snapshotted, or another waiter already reaped it
+	}
+	l.registered = 0
+	l.live--
+}
+
+func (l *legacyCoord) startStep3() { // spawn + register
+	l.live++
+	l.nextID++
+	l.registered = l.nextID
+}
+
+var _ = DescribeTable("respcoord stringers",
+	func(got, want string) { Expect(got).To(Equal(want)) },
+	Entry(nil, SourceClient.String(), "client"),
+	Entry(nil, SourceVAD.String(), "vad"),
+	Entry(nil, Source(99).String(), "Source(99)"),
+	// Status values, including the numeric fallback for an out-of-range value.
+	Entry(nil, StatusCompleted.String(), "completed"),
+	Entry(nil, StatusCancelled.String(), "cancelled"),
+	Entry(nil, Status(99).String(), "Status(99)"),
+	// States.
+	Entry(nil, Idle{}.String(), "Idle"),
+	Entry(nil, Active{ID: 7}.String(), "Active(7)"),
+	Entry(nil, Terminated{}.String(), "Terminated"),
+	// Events.
+	Entry(nil, Start{ID: 1, Source: SourceVAD}.String(), "Start(1,vad)"),
+	Entry(nil, Finished{ID: 2}.String(), "Finished(2)"),
+	Entry(nil, Cancel{Source: SourceClient}.String(), "Cancel(client)"),
+	Entry(nil, Shutdown{}.String(), "Shutdown"),
+	// Effects.
+	Entry(nil, CancelResponse{ID: 3}.String(), "CancelResponse(3)"),
+	Entry(nil, StartResponse{ID: 4}.String(), "StartResponse(4)"),
+	Entry(nil, EmitTerminal{ID: 5, Status: StatusCompleted}.String(), "EmitTerminal(5,completed)"),
+)
+
+var _ = Describe("legacy dual-writer characterization", func() {
+	// Characterization of the bug: pins the exact interleaving in which the
+	// read-loop and the VAD goroutine both start a response and the machine ends
+	// up with TWO live responses. If a future change to the legacy model
+	// accidentally fixes it, this spec flips and we delete the legacy model.
+	// The production path uses respcoord.Coordinator, proven safe above.
+	It("can reach two live responses (the bug respcoord eliminates)", func() {
+		legacy := &legacyCoord{}
+
+		// First response established normally: snapshot (nothing registered yet),
+		// cancel-and-wait on that empty snapshot, then spawn.
+		legacy.startStep2(legacy.startStep1())
+		legacy.startStep3() // live=1, registered=1
+		Expect(legacy.live).To(Equal(1), "setup")
+
+		// The race: both goroutines snapshot the SAME active response (id 1)...
+		fromVAD := legacy.startStep1()    // 1
+		fromClient := legacy.startStep1() // 1
+
+		// ...both "cancel-and-wait" it. The first decrements; the second finds it
+		// already gone and does nothing.
+		legacy.startStep2(fromVAD)    // live=0, registered=0
+		legacy.startStep2(fromClient) // no-op (already 0)
+
+		// ...then both spawn their replacement.
+		legacy.startStep3() // live=1
+		legacy.startStep3() // live=2  <-- two live responses
+
+		Expect(legacy.live).To(Equal(2), "expected the legacy race to reach 2 live responses")
+	})
+})
diff --git a/core/http/endpoints/openai/ttscoord/ttscoord.go b/core/http/endpoints/openai/ttscoord/ttscoord.go
new file mode 100644
index 000000000..9b4510347
--- /dev/null
+++ b/core/http/endpoints/openai/ttscoord/ttscoord.go
@@ -0,0 +1,150 @@
+// Package ttscoord is the explicit state machine for the realtime API's
+// TTS-pipeline lifecycle (machine "M5" in docs/design/realtime-state-machines.md).
+//
+// The realtime TTS pipeline (realtime_tts_pipeline.go) decouples synthesis from
+// LLM token generation: the token callback enqueues clauses, a single worker
+// goroutine synthesizes them in order, and wait() closes the queue and joins the
+// worker. In the legacy code the lifecycle is an implicit `closed bool` (guarded
+// by the pipeline mutex) plus a `done` channel closed once by the worker. Two
+// gaps: enqueue does NOT check `closed`, so a clause offered after wait() is
+// silently appended to a worker that may have already exited (dropped); and the
+// open/closed lifecycle is inferred from a bool rather than stored.
+//
+// This package makes the lifecycle explicit:
+//   - a sealed sum type for State (Open | Closing | Closed) — monotonic; illegal
+//     reversals are unrepresentable,
+//   - a total, pure transition function Next(state, event) -> (state, effects),
+//   - a single-writer Coordinator that serializes every transition.
+//
+// It is a genuine two-writer machine: the producer goroutine raises Close (from
+// wait()), and the worker goroutine raises WorkerExited when it has drained the
+// queue and seen the close — so serializing the transition matters. The poison
+// `failed` latch stays a lock-free atomic.Bool in the pipeline (it is read per
+// clause on the worker's hot path and is orthogonal to open/closed); this machine
+// owns only the open->closing->closed lifecycle.
+//
+// Guarantees the spec checks:
+//   - Close wakes the worker to exit exactly once (idempotent wait(); invariant
+//     #10),
+//   - the lifecycle is monotonic and Closed is terminal — so a clause is never
+//     accepted after close (enqueue is gated on Open) and the worker is joined
+//     exactly once (no leak; invariant #8).
+package ttscoord
+
+import (
+	"fmt"
+
+	"github.com/mudler/LocalAI/core/http/endpoints/openai/coordinator"
+)
+
+// State is the sealed sum type of TTS-pipeline lifecycle states. Exhaustively:
+// Open | Closing | Closed.
+type State interface {
+	isState()
+	String() string
+}
+
+// Open: the worker is running and accepting clauses.
+type Open struct{}
+
+// Closing: wait() was called; the worker drains the queue and exits. No new clause is accepted.
+type Closing struct{}
+
+// Closed: the worker has exited (its done channel is closed). Terminal.
+type Closed struct{}
+
+// isState marks each concrete state as a member of the sealed State set.
+func (Open) isState()    {}
+func (Closing) isState() {}
+func (Closed) isState()  {}
+
+func (Open) String() string    { return "Open" }
+func (Closing) String() string { return "Closing" }
+func (Closed) String() string  { return "Closed" }
+
+// Event is the sealed sum type of inputs. Exhaustively: Close | WorkerExited.
+type Event interface {
+	isEvent()
+	String() string
+}
+
+// Close (raised by the producer goroutine's wait()): close the queue and ask the worker to finish. Idempotent.
+type Close struct{}
+
+// WorkerExited is raised by the worker goroutine when it has drained the queue
+// and observed the close, just before it closes its done channel.
+type WorkerExited struct{}
+
+// isEvent marks each concrete event as a member of the sealed Event set.
+func (Close) isEvent()        {}
+func (WorkerExited) isEvent() {}
+
+func (Close) String() string        { return "Close" }
+func (WorkerExited) String() string { return "WorkerExited" }
+
+// Effect is a side effect Next returns as data for the sink to perform. Exhaustively: Wake.
+type Effect interface {
+	isEffect()
+	String() string
+}
+
+// Wake: signal the worker (via the buffered wake channel) so it re-checks the
+// lifecycle and exits. Next emits it only for Open + Close, i.e. at most once.
+type Wake struct{}
+
+func (Wake) isEffect() {}
+
+func (Wake) String() string { return "Wake" }
+
+// Next is the total, pure transition function. For every (state, event) it
+// returns the next state and the ordered effects. It returns a non-nil error
+// only for an unknown State/Event implementation. Every in-domain pair is
+// defined; there are no forbidden transitions, only no-ops.
+//
+// The lifecycle is monotonic Open -> Closing -> Closed:
+//
+//	state    | Close            | WorkerExited
+//	Open     | Closing + Wake   | Closed (defensive)
+//	Closing  | Closing (no-op)  | Closed
+//	Closed   | Closed (no-op)   | Closed (no-op)
+//
+// Close wakes the worker only on the first Open->Closing transition, which
+// makes wait() idempotent; any later Close is absorbed. WorkerExited always
+// ends in Closed. The three states are handled together because Open + Close
+// is the only cell whose outcome depends on the state; every other cell either
+// keeps the current state or lands in Closed.
+func Next(s State, e Event) (State, []Effect, error) {
+	switch s.(type) {
+	case Open, Closing, Closed:
+		switch e.(type) {
+		case Close:
+			if _, open := s.(Open); open {
+				// First close: ask the worker to drain and exit.
+				return Closing{}, []Effect{Wake{}}, nil
+			}
+			// Idempotent wait(): already closing or closed, so no second wake.
+			return s, nil, nil
+		case WorkerExited:
+			// From Closing this is the normal join. From Open it is defensive
+			// (e.g. never any clause and an early close race) and from Closed it
+			// is a no-op; all three land in Closed, which keeps Next total.
+			return Closed{}, nil, nil
+		}
+	}
+	return s, nil, fmt.Errorf("ttscoord: unhandled transition %s <- %s", s, e)
+}
+
+// EffectSink performs the effects produced by a transition. See coordinator.Sink:
+// Wake does a non-blocking send on a buffered channel, so Perform does not block
+// under the lock.
+type EffectSink = coordinator.Sink[Effect]
+
+// Coordinator serializes the TTS-pipeline transitions. The producer (Close) and
+// worker (WorkerExited) goroutines both call Apply, so the lock serializes the
+// two writers. See coordinator.Coordinator.
+type Coordinator = coordinator.Coordinator[State, Event, Effect]
+
+// New returns a Coordinator that starts Open, steps with Next, and performs effects via sink.
+func New(sink EffectSink) *Coordinator {
+	return coordinator.New[State, Event, Effect](Open{}, Next, sink)
+}
diff --git a/core/http/endpoints/openai/ttscoord/ttscoord_suite_test.go b/core/http/endpoints/openai/ttscoord/ttscoord_suite_test.go
new file mode 100644
index 000000000..3f58e120d
--- /dev/null
+++ b/core/http/endpoints/openai/ttscoord/ttscoord_suite_test.go
@@ -0,0 +1,13 @@
+package ttscoord
+
+import (
+	"testing"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+// TestTtscoord is the `go test` entry point: it routes Gomega failures to Ginkgo's Fail and runs the suite.
+func TestTtscoord(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "ttscoord (realtime M5) Suite")
+}
diff --git a/core/http/endpoints/openai/ttscoord/ttscoord_test.go b/core/http/endpoints/openai/ttscoord/ttscoord_test.go
new file mode 100644
index 000000000..97524b816
--- /dev/null
+++ b/core/http/endpoints/openai/ttscoord/ttscoord_test.go
@@ -0,0 +1,165 @@
+package ttscoord
+
+import (
+	"math/rand/v2"
+	"sync"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// recordingSink captures the ordered stream of effects.
+type recordingSink struct {
+	mu  sync.Mutex
+	log []Effect
+}
+
+func (s *recordingSink) Perform(e Effect) {
+	s.mu.Lock()
+	s.log = append(s.log, e)
+	s.mu.Unlock()
+}
+
+func (s *recordingSink) wakes() int {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	n := 0
+	for _, e := range s.log {
+		if _, ok := e.(Wake); ok {
+			n++
+		}
+	}
+	return n
+}
+
+type unknownEvent struct{}
+
+func (unknownEvent) isEvent()       {}
+func (unknownEvent) String() string { return "unknownEvent" }
+
+type unknownState struct{}
+
+func (unknownState) isState()       {}
+func (unknownState) String() string { return "unknownState" }
+
+var _ = Describe("ttscoord.Next", func() {
+	DescribeTable("transitions",
+		func(state State, event Event, wantState State, wantEff []Effect) {
+			gotState, gotEff, err := Next(state, event)
+			Expect(err).NotTo(HaveOccurred())
+			Expect(gotState).To(Equal(wantState))
+			Expect(gotEff).To(Equal(wantEff))
+		},
+		Entry("open+close -> closing: wake",
+			Open{}, Close{}, Closing{}, []Effect{Wake{}}),
+		Entry("open+workerexited -> closed (defensive)",
+			Open{}, WorkerExited{}, Closed{}, []Effect(nil)),
+		Entry("closing+close -> closing, no-op (idempotent wait)",
+			Closing{}, Close{}, Closing{}, []Effect(nil)),
+		Entry("closing+workerexited -> closed",
+			Closing{}, WorkerExited{}, Closed{}, []Effect(nil)),
+		Entry("closed+close -> closed, no-op",
+			Closed{}, Close{}, Closed{}, []Effect(nil)),
+		Entry("closed+workerexited -> closed, no-op",
+			Closed{}, WorkerExited{}, Closed{}, []Effect(nil)),
+	)
+
+	It("is total over the defined (state, event) pairs", func() {
+		for _, s := range []State{Open{}, Closing{}, Closed{}} {
+			for _, e := range []Event{Close{}, WorkerExited{}} {
+				_, _, err := Next(s, e)
+				Expect(err).NotTo(HaveOccurred(), "Next(%s, %s)", s, e)
+			}
+		}
+	})
+
+	It("errors on an unknown event type", func() {
+		_, _, err := Next(Open{}, unknownEvent{})
+		Expect(err).To(HaveOccurred())
+	})
+
+	It("errors on an unknown state type", func() {
+		_, _, err := Next(unknownState{}, Close{})
+		Expect(err).To(HaveOccurred())
+	})
+})
+
+// phaseOf maps a state to a monotonic rank for the "never goes backwards" check.
+func phaseOf(s State) int {
+	switch s.(type) {
+	case Open:
+		return 0
+	case Closing:
+		return 1
+	case Closed:
+		return 2
+	default:
+		return -1
+	}
+}
+
+var _ = Describe("ttscoord.Coordinator", func() {
+	It("keeps the lifecycle monotonic and wakes at most once over random sequences", func() {
+		seeds := []uint64{1, 2, 3, 42, 1337, 0xC0FFEE}
+		for _, seed := range seeds {
+			r := rand.New(rand.NewPCG(seed, 0xA5A5A5A5))
+			sink := &recordingSink{}
+			c := New(sink)
+			prev := 0
+
+			for range 5000 {
+				if r.IntN(2) == 0 {
+					Expect(c.Apply(Close{})).To(Succeed())
+				} else {
+					Expect(c.Apply(WorkerExited{})).To(Succeed())
+				}
+				cur := phaseOf(c.State())
+				Expect(cur).To(BeNumerically(">=", prev), "seed=%d: lifecycle went backwards", seed)
+				prev = cur
+			}
+			Expect(sink.wakes()).To(BeNumerically("<=", 1), "seed=%d: woke more than once", seed)
+		}
+	})
+
+	// Two-writer test: a producer raises Close while the "worker" raises
+	// WorkerExited, the real concurrency. The lifecycle must stay monotonic and
+	// Wake must fire at most once. Run under -race.
+	It("is two-writer safe (producer Close vs worker WorkerExited)", func() {
+		const iterations = 200
+		for range iterations {
+			sink := &recordingSink{}
+			c := New(sink)
+			var wg sync.WaitGroup
+			wg.Add(2)
+			go func() { defer wg.Done(); _ = c.Apply(Close{}) }()
+			go func() { defer wg.Done(); _ = c.Apply(WorkerExited{}) }()
+			wg.Wait()
+			// Drive to terminal on the spec goroutine; re-applying must not error.
+			Expect(c.Apply(Close{})).To(Succeed())
+			Expect(c.Apply(WorkerExited{})).To(Succeed())
+			Expect(c.State()).To(Equal(State(Closed{})))
+			Expect(sink.wakes()).To(BeNumerically("<=", 1))
+		}
+	})
+
+	It("only Open accepts (a gate query never panics across states)", func() {
+		// Mirrors the pipeline's enqueue gate (accepted iff Open), queried in every state.
+		c := New(&recordingSink{})
+		accepts := func() bool { _, ok := c.State().(Open); return ok }
+		Expect(accepts()).To(BeTrue())
+		Expect(c.Apply(Close{})).To(Succeed())
+		Expect(accepts()).To(BeFalse())
+		Expect(c.Apply(WorkerExited{})).To(Succeed())
+		Expect(accepts()).To(BeFalse())
+	})
+})
+
+var _ = DescribeTable("ttscoord stringers",
+	func(got, want string) { Expect(got).To(Equal(want)) },
+	Entry(nil, Open{}.String(), "Open"),
+	Entry(nil, Closing{}.String(), "Closing"),
+	Entry(nil, Closed{}.String(), "Closed"),
+	Entry(nil, Close{}.String(), "Close"),
+	Entry(nil, WorkerExited{}.String(), "WorkerExited"),
+	Entry(nil, Wake{}.String(), "Wake"),
+)
diff --git a/core/http/endpoints/openai/turncoord/turncoord.go b/core/http/endpoints/openai/turncoord/turncoord.go
new file mode 100644
index 000000000..ac9e85052
--- /dev/null
+++ b/core/http/endpoints/openai/turncoord/turncoord.go
@@ -0,0 +1,255 @@
+// Package turncoord is the explicit state machine for the realtime API's
+// turn-detection concern (machine "M2" in
+// docs/design/realtime-state-machines.md).
+//
+// In the legacy code this machine is implicit and, worse, split across TWO
+// variables that can disagree: handleVAD's goroutine-local speechStarted bool
+// and the semantic_vad liveTurnState's "is the live stream open" flag
+// (lts.open()). They are set and cleared at separate points, so a discardTurn
+// (no-speech clear, a semantic->server mode switch mid-turn, or teardown)
+// closes the live stream but leaves speechStarted true. The two then disagree,
+// and the next speech onset is suppressed because `if !speechStarted` is false
+// — the user's next utterance silently produces no speech_started, no barge-in,
+// and no commit. See docs/design/realtime-state-machines.md, Part 2 (failure
+// mode 4) and the turn_lifecycle spec under formal-verification/.
+//
+// This package replaces that with:
+//   - a sealed sum type for State (illegal states are unrepresentable),
+//   - a total, pure transition function Next(state, event) -> (state, effects),
+//   - a single-writer Coordinator that serializes every transition.
+//
+// "Speech detected" and "a turn is open" become ONE state (Speaking), so they
+// can no longer fall out of sync: every path that ends a turn returns to Idle
+// and necessarily clears both. The design guarantees the invariants the specs
+// check:
+//   - speechStarted ⟺ a turn is open (Part 4, invariant #4) — structural here,
+//   - a barge-in cancel precedes the next turn's commit (you must pass through
+//     Speaking, which barges in on entry, before a Silence can commit),
+//   - every opened turn is finished (commit) or discarded (abort) exactly once.
+//
+// Unlike M3 (respcoord), which is a genuine dual-writer race, M2's turn
+// lifecycle is driven by the single handleVAD goroutine: the value here is
+// making the speechStarted/turn-open desync unrepresentable, not serializing
+// concurrent writers. The Coordinator still serializes transitions so that
+// State() is race-free and a teardown-time Abort from another goroutine (or a
+// future second writer) stays safe.
+//
+// Mode note: in server_vad mode there is no live ASR stream, so OpenTurn /
+// DiscardTurn have nothing to open or close — the sink performs them as no-ops
+// and "turn open" is satisfied vacuously. The state coupling (Speaking ⟺ turn
+// open) still holds; it is only semantic_vad that had two real variables to
+// desync.
+package turncoord
+
+import (
+	"fmt"
+
+	"github.com/mudler/LocalAI/core/http/endpoints/openai/coordinator"
+)
+
+// TurnID identifies one user turn. The caller mints it when speech begins (it
+// is the conversation item id the live caption deltas stream under, reused by
+// the committed event so the client replaces the partial text). Carrying it in
+// the state makes "commit/discard refer to the turn that was opened" explicit.
+type TurnID string
+
+// AbortReason records why a turn was dropped without committing. Like
+// respcoord.Source it is observability only — every reason aborts the same way;
+// keeping it in the event makes the distinct legacy discardTurn sites explicit
+// rather than collapsed into one anonymous code path.
+type AbortReason int
+
+const (
+	// AbortNoSpeech: the no-speech clear — the VAD found no segments and the
+	// buffer is past the holdback, so the inspected audio was not speech.
+	AbortNoSpeech AbortReason = iota
+	// AbortTeardown: the session is closing.
+	AbortTeardown
+)
+
+// NOTE: a semantic->server turn-detection switch mid-turn is deliberately NOT an
+// Abort: it only drops the orphaned live ASR stream and lets the turn continue
+// under server_vad (so a config change can't cut off a mid-utterance speaker).
+// That orphan cleanup stays inline in handleVAD; only the two reasons above end
+// a turn (return to Idle).
+
+func (r AbortReason) String() string {
+	switch r {
+	case AbortNoSpeech:
+		return "no_speech"
+	case AbortTeardown:
+		return "teardown"
+	default:
+		return fmt.Sprintf("AbortReason(%d)", int(r))
+	}
+}
+
+// State is the sealed sum type of turn-detection states. The only
+// implementations are the marker-method structs in this file, so callers
+// outside the package cannot fabricate an out-of-band state. Exhaustively:
+// Idle | Speaking.
+type State interface {
+	isState()
+	String() string
+}
+
+// Idle: no turn is open and no speech is in progress (legacy: speechStarted ==
+// false AND the live stream is closed — here a single state, so they cannot
+// disagree).
+type Idle struct{}
+
+// Speaking: a turn is open and speech is in progress (legacy: speechStarted ==
+// true AND, in semantic mode, the live stream open). Turn is the open turn's id.
+type Speaking struct{ Turn TurnID }
+
+func (Idle) isState()     {}
+func (Speaking) isState() {}
+
+func (Idle) String() string       { return "Idle" }
+func (s Speaking) String() string { return fmt.Sprintf("Speaking(%s)", s.Turn) }
+
+// Event is the sealed sum type of inputs. Exhaustively: Onset | Silence | Abort.
+type Event interface {
+	isEvent()
+	String() string
+}
+
+// Onset reports that the VAD found speech this tick. Turn is the id to open the
+// turn under (allocated by the caller so caption deltas can stream immediately).
+// While already Speaking it is a no-op: re-detection of ongoing speech does not
+// reopen a turn (legacy `if !speechStarted`).
+type Onset struct{ Turn TurnID }
+
+// Silence reports VAD-confirmed silence past the dynamic commit threshold (the
+// end-of-speech commit trigger). The threshold itself — semantic_vad's EOU vs
+// eagerness fallback — is computed by the caller before raising this event; the
+// machine only sequences the commit. It is a no-op while Idle (nothing to
+// commit).
+type Silence struct{}
+
+// Abort drops the open turn without committing (no-speech clear or teardown;
+// a mid-turn mode switch is deliberately not an Abort). No-op while Idle.
+type Abort struct{ Reason AbortReason }
+
+func (Onset) isEvent()   {}
+func (Silence) isEvent() {}
+func (Abort) isEvent()   {}
+
+func (e Onset) String() string { return fmt.Sprintf("Onset(%s)", e.Turn) }
+func (Silence) String() string { return "Silence" }
+func (e Abort) String() string { return fmt.Sprintf("Abort(%s)", e.Reason) }
+
+// Effect is a side effect returned by Next as data for the caller to perform.
+// Returning effects as data (rather than firing callbacks inside the
+// transition) keeps Next pure and exhaustively testable. Exhaustively:
+// BargeIn | OpenTurn | EmitSpeechStarted | EmitSpeechStopped | CommitTurn |
+// DiscardTurn.
+type Effect interface {
+	isEffect()
+	String() string
+}
+
+// BargeIn: cancel any in-flight response (the M2->M3 edge). Emitted on the
+// Idle->Speaking onset, before the new turn can ever commit — so a barge-in
+// always precedes the next commit.
+type BargeIn struct{}
+
+// OpenTurn: open the live ASR stream for Turn (semantic_vad). No-op in
+// server_vad mode.
+type OpenTurn struct{ Turn TurnID }
+
+// EmitSpeechStarted: send input_audio_buffer.speech_started.
+type EmitSpeechStarted struct{}
+
+// EmitSpeechStopped: send input_audio_buffer.speech_stopped.
+type EmitSpeechStopped struct{}
+
+// CommitTurn: finalize the turn's live stream, emit input_audio_buffer.committed
+// for Turn, and issue the response (via respcoord). The completion of one turn.
+type CommitTurn struct{ Turn TurnID }
+
+// DiscardTurn: close the turn's live stream and retract any caption deltas
+// already shown for Turn (the failed transcription event). No commit, no
+// response.
+type DiscardTurn struct{ Turn TurnID }
+
+func (BargeIn) isEffect()           {}
+func (OpenTurn) isEffect()          {}
+func (EmitSpeechStarted) isEffect() {}
+func (EmitSpeechStopped) isEffect() {}
+func (CommitTurn) isEffect()        {}
+func (DiscardTurn) isEffect()       {}
+
+func (BargeIn) String() string           { return "BargeIn" }
+func (e OpenTurn) String() string        { return fmt.Sprintf("OpenTurn(%s)", e.Turn) }
+func (EmitSpeechStarted) String() string { return "EmitSpeechStarted" }
+func (EmitSpeechStopped) String() string { return "EmitSpeechStopped" }
+func (e CommitTurn) String() string      { return fmt.Sprintf("CommitTurn(%s)", e.Turn) }
+func (e DiscardTurn) String() string     { return fmt.Sprintf("DiscardTurn(%s)", e.Turn) }
+
+// Next is the total, pure transition function. For every (state, event) it
+// returns the next state and the ordered effects to perform. It returns a
+// non-nil error only for an unknown State/Event implementation (a programmer
+// error / future type added without updating this function) — callers must
+// surface that, never silently ignore it. Every in-domain (state, event) pair
+// is defined; there are no "forbidden" transitions, only no-ops for events that
+// don't apply to the current state.
+//
+// The crux of the fix is that both turn-ending transitions (Silence commit and
+// Abort) go to Idle, which carries no turn data: there is no way to clear "turn
+// open" while leaving "speech started" set, because they are the same state.
+// The legacy desync (discardTurn closed the live stream but left speechStarted
+// true) is therefore unrepresentable.
+//
+// Effect ordering on onset mirrors the live handleVAD: OpenTurn (start the live
+// stream), then BargeIn (cancel the prior response), then EmitSpeechStarted.
+func Next(s State, e Event) (State, []Effect, error) {
+	switch st := s.(type) {
+	case Idle:
+		switch ev := e.(type) {
+		case Onset:
+			return Speaking{Turn: ev.Turn}, []Effect{
+				OpenTurn{Turn: ev.Turn},
+				BargeIn{},
+				EmitSpeechStarted{},
+			}, nil
+		case Silence:
+			// Nothing in flight to commit: idempotent no-op.
+			return Idle{}, nil, nil
+		case Abort:
+			// No open turn: idempotent no-op (discardTurn on a closed stream).
+			return Idle{}, nil, nil
+		}
+	case Speaking:
+		switch e.(type) {
+		case Onset:
+			// Speech already in progress: re-detection does not reopen a turn
+			// or re-emit speech_started (legacy `if !speechStarted`). The turn
+			// id stays the one allocated at onset.
+			return Speaking{Turn: st.Turn}, nil, nil
+		case Silence:
+			return Idle{}, []Effect{
+				EmitSpeechStopped{},
+				CommitTurn{Turn: st.Turn},
+			}, nil
+		case Abort:
+			return Idle{}, []Effect{DiscardTurn{Turn: st.Turn}}, nil
+		}
+	}
+	return s, nil, fmt.Errorf("turncoord: unhandled transition %s <- %s", s, e)
+}
+
+// EffectSink performs the effects produced by a transition. See coordinator.Sink
+// for the non-blocking contract: Perform runs under the coordinator lock, so it
+// must not block and must not re-enter Apply.
+type EffectSink = coordinator.Sink[Effect]
+
+// Coordinator serializes turn transitions. In practice the handleVAD goroutine is
+// the only writer, but serializing keeps State() race-free and a teardown-time
+// Abort from another goroutine safe. See coordinator.Coordinator.
+type Coordinator = coordinator.Coordinator[State, Event, Effect]
+
+// New returns an idle Coordinator that performs effects via sink.
+func New(sink EffectSink) *Coordinator {
+	return coordinator.New[State, Event, Effect](Idle{}, Next, sink)
+}
diff --git a/core/http/endpoints/openai/turncoord/turncoord_suite_test.go b/core/http/endpoints/openai/turncoord/turncoord_suite_test.go
new file mode 100644
index 000000000..8e34feb74
--- /dev/null
+++ b/core/http/endpoints/openai/turncoord/turncoord_suite_test.go
@@ -0,0 +1,13 @@
+package turncoord
+
+import (
+	"testing"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+func TestTurncoord(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "turncoord (realtime M2) Suite")
+}
diff --git a/core/http/endpoints/openai/turncoord/turncoord_test.go b/core/http/endpoints/openai/turncoord/turncoord_test.go
new file mode 100644
index 000000000..a3c342187
--- /dev/null
+++ b/core/http/endpoints/openai/turncoord/turncoord_test.go
@@ -0,0 +1,242 @@
+package turncoord
+
+import (
+	"fmt"
+	"math/rand/v2"
+	"sync"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// recordingSink captures the ordered stream of effects so the invariants can be
+// checked independently of the transition function's internals. Perform is
+// called by Coordinator.Apply under the coordinator lock, so it is already
+// serialized; the mutex here only guards reads from the spec goroutine.
+type recordingSink struct {
+	mu  sync.Mutex
+	log []Effect
+}
+
+func (s *recordingSink) Perform(e Effect) {
+	s.mu.Lock()
+	s.log = append(s.log, e)
+	s.mu.Unlock()
+}
+
+func (s *recordingSink) snapshot() []Effect {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	out := make([]Effect, len(s.log))
+	copy(out, s.log)
+	return out
+}
+
+// checkLog replays the effect log and asserts the turn-lifecycle safety
+// properties from docs/design/realtime-state-machines.md, Part 4 (invariant #4
+// and the discardTurn/speechStarted desync, failure mode 4):
+//
+//	(1) at most one turn open at any instant -- OpenTurn never fires while a
+//	    turn is already open;
+//	(2) every turn id is opened at most once;
+//	(3) no orphan close -- CommitTurn/DiscardTurn fire only for the open turn's id.
+//
+// The wire pairing of speech_started/speech_stopped is intentionally NOT
+// reconstructed here: like the legacy no-speech clear, an Abort discards the
+// turn without a speech_stopped (the failed-transcription event is its closure
+// signal). The guarantee this package adds is the *state* coupling (Speaking
+// <=> a turn is open), checked inline in the property spec below.
+func checkLog(log []Effect) {
+	open, cur := false, TurnID("") // cur is the id of the currently open turn
+	opens := map[TurnID]int{}
+	for i, eff := range log {
+		switch e := eff.(type) {
+		case OpenTurn:
+			Expect(open).To(BeFalse(), "invariant (1): OpenTurn(%s) while a turn is already open (effect #%d)\nlog=%v", e.Turn, i, log)
+			open, cur = true, e.Turn
+			opens[e.Turn]++
+			Expect(opens[e.Turn]).To(Equal(1), "invariant (2): turn %s opened %d times (effect #%d)\nlog=%v", e.Turn, opens[e.Turn], i, log)
+		case CommitTurn:
+			Expect(open && e.Turn == cur).To(BeTrue(), "invariant (3): CommitTurn(%s) does not close the open turn (effect #%d)\nlog=%v", e.Turn, i, log)
+			open = false
+		case DiscardTurn:
+			Expect(open && e.Turn == cur).To(BeTrue(), "invariant (3): DiscardTurn(%s) does not close the open turn (effect #%d)\nlog=%v", e.Turn, i, log)
+			open = false
+		}
+	}
+}
+
+// unknownEvent / unknownState exercise the defensive error path for a type that
+// Next does not know about (a future variant added without updating Next).
+type unknownEvent struct{}
+
+func (unknownEvent) isEvent()       {}
+func (unknownEvent) String() string { return "unknownEvent" }
+
+type unknownState struct{}
+
+func (unknownState) isState()       {}
+func (unknownState) String() string { return "unknownState" }
+
+var _ = Describe("turncoord.Next", func() {
+	// DescribeTable exhaustively pins every (state, event) cell of the pure
+	// transition function, including the idle no-op cells. This is the practical
+	// stand-in for "no transition leads to an inconsistent state": if a cell
+	// changes, this table must change with it.
+	DescribeTable("transitions",
+		func(state State, event Event, wantState State, wantEff []Effect) {
+			gotState, gotEff, err := Next(state, event)
+			Expect(err).NotTo(HaveOccurred())
+			Expect(gotState).To(Equal(wantState))
+			Expect(gotEff).To(Equal(wantEff))
+		},
+		Entry("idle+onset -> speaking: open, barge-in, speech_started",
+			Idle{}, Onset{Turn: "t1"},
+			Speaking{Turn: "t1"},
+			[]Effect{OpenTurn{Turn: "t1"}, BargeIn{}, EmitSpeechStarted{}}),
+		Entry("idle+silence -> idle, no-op (nothing to commit)",
+			Idle{}, Silence{},
+			Idle{}, []Effect(nil)),
+		Entry("idle+abort -> idle, no-op (nothing open)",
+			Idle{}, Abort{Reason: AbortNoSpeech},
+			Idle{}, []Effect(nil)),
+		Entry("speaking+onset -> stay speaking, no-op (already speaking)",
+			Speaking{Turn: "t1"}, Onset{Turn: "t2"}, // a fresh id is ignored mid-turn
+			Speaking{Turn: "t1"}, []Effect(nil)),
+		Entry("speaking+silence -> idle: speech_stopped + commit",
+			Speaking{Turn: "t1"}, Silence{},
+			Idle{}, []Effect{EmitSpeechStopped{}, CommitTurn{Turn: "t1"}}),
+		Entry("speaking+abort(no_speech) -> idle: discard",
+			Speaking{Turn: "t1"}, Abort{Reason: AbortNoSpeech},
+			Idle{}, []Effect{DiscardTurn{Turn: "t1"}}),
+		Entry("speaking+abort(teardown) -> idle: discard",
+			Speaking{Turn: "t9"}, Abort{Reason: AbortTeardown},
+			Idle{}, []Effect{DiscardTurn{Turn: "t9"}}),
+	)
+
+	It("is total: every defined (state, event) pair is handled without error", func() {
+		states := []State{Idle{}, Speaking{Turn: "t1"}}
+		events := []Event{
+			Onset{Turn: "t2"},
+			Silence{},
+			Abort{Reason: AbortNoSpeech},
+			Abort{Reason: AbortTeardown},
+		}
+		for _, s := range states {
+			for _, e := range events {
+				_, _, err := Next(s, e)
+				Expect(err).NotTo(HaveOccurred(), "Next(%s, %s)", s, e)
+			}
+		}
+	})
+
+	It("errors on an unknown event type", func() {
+		_, _, err := Next(Speaking{Turn: "t1"}, unknownEvent{})
+		Expect(err).To(HaveOccurred())
+	})
+
+	It("errors on an unknown state type", func() {
+		_, _, err := Next(unknownState{}, Onset{Turn: "t1"})
+		Expect(err).To(HaveOccurred())
+	})
+})
+
+var _ = Describe("turncoord.Coordinator", func() {
+	// This replaces the previous rapid stateful test: a seeded random walk over
+	// the event space, asserting after every step both the log invariants and
+	// the core state coupling -- the machine is in Speaking IFF a turn is
+	// currently open. That coupling is the whole point of M2: in the legacy code
+	// speechStarted and the live-stream-open flag were separate variables a
+	// discard could desync; here they are one state and cannot. Seeds are fixed
+	// so any failure reproduces deterministically (the failing seed/step is in
+	// the assertion message).
+	It("keeps state coupled to turn-open over random event sequences", func() {
+		seeds := []uint64{1, 2, 3, 42, 1337, 0xC0FFEE}
+		for _, seed := range seeds {
+			r := rand.New(rand.NewPCG(seed, 0xA5A5A5A5))
+			sink := &recordingSink{}
+			c := New(sink)
+			var nextTurn uint64
+			open := false // independent model of "is a turn open"
+
+			for step := range 5000 {
+				switch r.IntN(3) {
+				case 0:
+					nextTurn++
+					Expect(c.Apply(Onset{Turn: TurnID(fmt.Sprintf("t%d", nextTurn))})).To(Succeed())
+					open = true // onset opens a turn (or is a no-op if already open)
+				case 1:
+					Expect(c.Apply(Silence{})).To(Succeed())
+					open = false // commit (or no-op if already idle)
+				case 2:
+					Expect(c.Apply(Abort{Reason: AbortReason(r.IntN(2))})).To(Succeed())
+					open = false // discard (or no-op if already idle)
+				}
+				_, speaking := c.State().(Speaking)
+				Expect(speaking).To(Equal(open), "coupling: seed=%d step=%d state=%s", seed, step, c.State())
+			}
+			checkLog(sink.snapshot())
+		}
+	})
+
+	// M2 is single-writer in practice (handleVAD), but teardown can Abort from
+	// another goroutine, so the Coordinator must be race-safe. Run under -race;
+	// the log invariants must hold regardless of interleaving.
+	It("is race-safe under concurrent Apply from two goroutines", func() {
+		const perGoroutine = 2000
+		sink := &recordingSink{}
+		c := New(sink)
+
+		var idCounter uint64
+		var idMu sync.Mutex
+		nextTurn := func() TurnID {
+			idMu.Lock()
+			defer idMu.Unlock()
+			idCounter++
+			return TurnID(fmt.Sprintf("t%d", idCounter))
+		}
+
+		var wg sync.WaitGroup
+		drive := func(reason AbortReason) {
+			defer wg.Done()
+			for i := range perGoroutine {
+				switch i % 3 {
+				case 0:
+					_ = c.Apply(Onset{Turn: nextTurn()})
+				case 1:
+					_ = c.Apply(Silence{})
+				case 2:
+					_ = c.Apply(Abort{Reason: reason})
+				}
+			}
+		}
+
+		wg.Add(2)
+		go drive(AbortNoSpeech)
+		go drive(AbortTeardown)
+		wg.Wait()
+
+		checkLog(sink.snapshot())
+	})
+})
+
+var _ = DescribeTable("turncoord stringers",
+	func(got, want string) { Expect(got).To(Equal(want)) },
+	Entry(nil, AbortNoSpeech.String(), "no_speech"),
+	Entry(nil, AbortTeardown.String(), "teardown"),
+	Entry(nil, AbortReason(99).String(), "AbortReason(99)"),
+
+	Entry(nil, Idle{}.String(), "Idle"),
+	Entry(nil, Speaking{Turn: "t7"}.String(), "Speaking(t7)"),
+
+	Entry(nil, Onset{Turn: "t1"}.String(), "Onset(t1)"),
+	Entry(nil, Silence{}.String(), "Silence"),
+	Entry(nil, Abort{Reason: AbortTeardown}.String(), "Abort(teardown)"),
+
+	Entry(nil, BargeIn{}.String(), "BargeIn"),
+	Entry(nil, OpenTurn{Turn: "t2"}.String(), "OpenTurn(t2)"),
+	Entry(nil, EmitSpeechStarted{}.String(), "EmitSpeechStarted"),
+	Entry(nil, EmitSpeechStopped{}.String(), "EmitSpeechStopped"),
+	Entry(nil, CommitTurn{Turn: "t3"}.String(), "CommitTurn(t3)"),
+	Entry(nil, DiscardTurn{Turn: "t4"}.String(), "DiscardTurn(t4)"),
+)
diff --git a/core/http/react-ui/e2e/traces-audio.spec.js b/core/http/react-ui/e2e/traces-audio.spec.js
new file mode 100644
index 000000000..567fd56c2
--- /dev/null
+++ b/core/http/react-ui/e2e/traces-audio.spec.js
@@ -0,0 +1,87 @@
+import { test, expect } from './coverage-fixtures.js'
+
+// Audio snippets on the Traces page must play through a blob: object URL —
+// the CSP's connect-src allows blob: but not data:, and the waveform peaks
+// renderer fetch()es the player src — and must degrade to a readable note
+// (not a broken player) when the stored payload is the "<truncated: N bytes>"
+// marker an older server stamped into oversized fields.
+
+// Minimal valid 16 kHz mono 16-bit PCM WAV (0.1s 440 Hz sine), base64-encoded.
+function wavBase64(samples = 1600, rate = 16000) {
+  const dataSize = samples * 2
+  const buf = Buffer.alloc(44 + dataSize)
+  buf.write('RIFF', 0)
+  buf.writeUInt32LE(36 + dataSize, 4)
+  buf.write('WAVE', 8)
+  buf.write('fmt ', 12)
+  buf.writeUInt32LE(16, 16)
+  buf.writeUInt16LE(1, 20) // PCM
+  buf.writeUInt16LE(1, 22) // mono
+  buf.writeUInt32LE(rate, 24)
+  buf.writeUInt32LE(rate * 2, 28)
+  buf.writeUInt16LE(2, 32)
+  buf.writeUInt16LE(16, 34)
+  buf.write('data', 36)
+  buf.writeUInt32LE(dataSize, 40)
+  for (let i = 0; i < samples; i++) {
+    buf.writeInt16LE(Math.round(8000 * Math.sin((2 * Math.PI * 440 * i) / rate)), 44 + i * 2)
+  }
+  return buf.toString('base64')
+}
+
+function transcriptionTrace(audioWavBase64) {
+  return {
+    type: 'transcription',
+    timestamp: Date.now() * 1_000_000,
+    model_name: 'parakeet-test',
+    summary: 'transcribed utterance',
+    duration: 500_000_000,
+    error: null,
+    data: {
+      audio_wav_base64: audioWavBase64,
+      audio_duration_s: 0.1,
+      audio_snippet_s: 0.1,
+      audio_sample_rate: 16000,
+      audio_samples: 1600,
+      audio_rms_dbfs: -12.0,
+      audio_peak_dbfs: -6.0,
+      audio_dc_offset: 0,
+    },
+  }
+}
+
+async function openBackendTraceRow(page, traces) {
+  await page.route('**/api/traces', (route) => {
+    route.fulfill({ contentType: 'application/json', body: JSON.stringify([]) })
+  })
+  await page.route('**/api/backend-traces', (route) => {
+    route.fulfill({ contentType: 'application/json', body: JSON.stringify(traces) })
+  })
+  await page.goto('/app/traces')
+  await expect(page.locator('text=Tracing is')).toBeVisible({ timeout: 10_000 })
+  await page.locator('button', { hasText: 'Backend Traces' }).click()
+  await page.locator('td', { hasText: 'parakeet-test' }).first().click()
+}
+
+test.describe('Traces - Audio Snippets', () => {
+  test('plays a clip through a blob: URL, not a CSP-blocked data: URL', async ({ page }) => {
+    await openBackendTraceRow(page, [transcriptionTrace(wavBase64())])
+
+    // The expanded row carries the snippet metrics and a player whose source
+    // is an object URL (connect-src allows blob:, so the peaks fetch works).
+    await expect(page.locator('text=Audio Snippet')).toBeVisible()
+    const audio = page.locator('audio')
+    await expect(audio).toHaveCount(1)
+    const src = await audio.getAttribute('src')
+    expect(src).toMatch(/^blob:/)
+    await expect(page.getByTestId('audio-snippet-unavailable')).toHaveCount(0)
+  })
+
+  test('shows a readable note instead of a broken player for truncated payloads', async ({ page }) => {
+    await openBackendTraceRow(page, [transcriptionTrace('<truncated: 281660 bytes>')])
+
+    await expect(page.locator('text=Audio Snippet')).toBeVisible()
+    await expect(page.getByTestId('audio-snippet-unavailable')).toBeVisible()
+    await expect(page.locator('audio')).toHaveCount(0)
+  })
+})
diff --git a/core/http/react-ui/src/pages/Talk.jsx b/core/http/react-ui/src/pages/Talk.jsx
index 5a6857a9e..b25643aa7 100644
--- a/core/http/react-ui/src/pages/Talk.jsx
+++ b/core/http/react-ui/src/pages/Talk.jsx
@@ -19,24 +19,31 @@ const STATUS_STYLES = {
   error:        { icon: 'fa-solid fa-circle', color: 'var(--color-error)', bg: 'var(--color-error-light)' },
 }
 
-// upsertAssistant merges a streamed transcript fragment into the assistant entry
-// identified by the server's item_id, or appends a new entry if none exists yet.
-// Keying by item_id (not a mutable index tracked across handler/updater
-// boundaries) makes streamed deltas idempotent and order-independent, so React's
-// batching of non-React data-channel events cannot produce a duplicate bubble.
-// mode 'append' adds to the running text; 'replace' sets the final transcript.
-function upsertAssistant(prev, itemId, text, mode) {
-  // Only assistant entries carry an id, and the streaming entry is almost
-  // always the newest — search from the tail so per-delta cost stays constant.
+// upsertEntry merges a streamed transcript fragment into the entry identified
+// by the server's item_id, or appends a new entry (with the given role) if
+// none exists yet. Keying by item_id (not a mutable index tracked across
+// handler/updater boundaries) makes streamed deltas idempotent and
+// order-independent, so React's batching of non-React data-channel events
+// cannot produce a duplicate bubble. mode 'append' adds to the running text;
+// 'replace' sets the final transcript — the server sends a completed event
+// whose authoritative text supersedes any live captions (e.g. the
+// semantic_vad retranscribe gate's batch decode).
+function upsertEntry(prev, itemId, role, text, mode) {
+  // The streaming entry is almost always the newest — search from the tail
+  // so per-delta cost stays constant.
   const i = prev.findLastIndex(e => e.id === itemId)
   if (i === -1) {
-    return [...prev, { role: 'assistant', id: itemId, text }]
+    return [...prev, { role, id: itemId, text }]
   }
   const next = [...prev]
   next[i] = { ...next[i], text: mode === 'append' ? next[i].text + text : text }
   return next
 }
 
+function upsertAssistant(prev, itemId, text, mode) {
+  return upsertEntry(prev, itemId, 'assistant', text, mode)
+}
+
 export default function Talk() {
   const { addToast } = useOutletContext()
   const navigate = useNavigate()
@@ -252,12 +259,33 @@ export default function Talk() {
       case 'input_audio_buffer.speech_stopped':
         updateStatus('thinking', 'Processing...')
         break
+      case 'conversation.item.input_audio_transcription.delta':
+        // Live captions: semantic_vad streams the user's words while they
+        // are still speaking, keyed by the item id the commit will reuse.
+        if (event.delta && event.item_id) {
+          setTranscript(prev => upsertEntry(prev, event.item_id, 'user', event.delta, 'append'))
+        }
+        break
       case 'conversation.item.input_audio_transcription.completed':
         if (event.transcript) {
-          setTranscript(prev => [...prev, { role: 'user', text: event.transcript }])
+          if (event.item_id) {
+            // Replaces any live captions with the authoritative transcript
+            // (which may differ, e.g. the retranscribe gate's batch decode);
+            // creates the entry when there were none (server_vad).
+            setTranscript(prev => upsertEntry(prev, event.item_id, 'user', event.transcript, 'replace'))
+          } else {
+            setTranscript(prev => [...prev, { role: 'user', text: event.transcript }])
+          }
         }
         updateStatus('thinking', 'Generating response...')
         break
+      case 'conversation.item.input_audio_transcription.failed':
+        // The turn was discarded after captions were shown (e.g. the buffer
+        // was cleared as silence) — retract the partial entry.
+        if (event.item_id) {
+          setTranscript(prev => prev.filter(e => e.id !== event.item_id))
+        }
+        break
       case 'response.output_audio_transcript.delta':
         if (event.delta) {
           inProgressIdRef.current = event.item_id
@@ -712,7 +740,7 @@ export default function Talk() {
           )}
           {selectedModelInfo && !selectedModelInfo.self_contained && (
             <div style={{
-              display: 'grid', gridTemplateColumns: 'repeat(4, minmax(0, 1fr))', gap: 'var(--spacing-xs)',
+              display: 'flex', flexDirection: 'column', gap: 'var(--spacing-xs)',
               marginBottom: 'var(--spacing-xs)', fontSize: '0.75rem',
             }}>
               {[
@@ -724,9 +752,12 @@ export default function Talk() {
                 <div key={item.label} style={{
                   background: 'var(--color-bg-secondary)', borderRadius: 'var(--radius-sm)',
                   padding: 'var(--spacing-xs)', border: '1px solid var(--color-border)',
+                  display: 'flex', alignItems: 'baseline', gap: 'var(--spacing-sm)',
                 }}>
-                  <div style={{ color: 'var(--color-text-secondary)', marginBottom: 2 }}>{item.label}</div>
-                  <div style={{ fontFamily: 'var(--font-mono)', overflow: 'hidden', textOverflow: 'ellipsis', whiteSpace: 'nowrap' }}>{item.value}</div>
+                  <div style={{ color: 'var(--color-text-secondary)', whiteSpace: 'nowrap' }}>{item.label}</div>
+                  {/* full width for the value; wrap rather than overflow when the
+                      model name is long (minWidth:0 lets the flex item shrink) */}
+                  <div style={{ fontFamily: 'var(--font-mono)', minWidth: 0, marginLeft: 'auto', textAlign: 'right', overflowWrap: 'anywhere' }}>{item.value || '—'}</div>
                 </div>
               ))}
             </div>
diff --git a/core/http/react-ui/src/pages/Traces.jsx b/core/http/react-ui/src/pages/Traces.jsx
index 85387f815..933acf344 100644
--- a/core/http/react-ui/src/pages/Traces.jsx
+++ b/core/http/react-ui/src/pages/Traces.jsx
@@ -86,8 +86,40 @@ function typeBadgeStyle(type) {
   return { background: c.bg, color: c.color, padding: '2px 8px', borderRadius: 'var(--radius-sm)', fontSize: '0.75rem', fontWeight: 500 }
 }
 
+// useWavObjectURL — decode a base64 WAV payload into a blob: object URL for
+// the waveform player. A data: URL would render in <audio> (media-src allows
+// data:) but the peaks renderer fetch()es the src and the CSP's connect-src
+// only allows blob:, so playback broke with a CSP violation. Decoding to a
+// Blob also tolerates payloads that aren't valid base64 — e.g. the
+// "<truncated: N bytes>" marker older servers stamped into oversized fields —
+// by yielding null instead of a broken player.
+function useWavObjectURL(b64) {
+  const [url, setUrl] = useState(null)
+  useEffect(() => {
+    if (!b64) {
+      setUrl(null)
+      return undefined
+    }
+    let objectUrl = null
+    try {
+      const bin = atob(b64)
+      const bytes = new Uint8Array(bin.length)
+      for (let i = 0; i < bin.length; i++) bytes[i] = bin.charCodeAt(i)
+      objectUrl = URL.createObjectURL(new Blob([bytes], { type: 'audio/wav' }))
+      setUrl(objectUrl)
+    } catch {
+      setUrl(null)
+    }
+    return () => {
+      if (objectUrl) URL.revokeObjectURL(objectUrl)
+    }
+  }, [b64])
+  return url
+}
+
 // Audio player + metrics for transcription traces
 function AudioSnippet({ data }) {
+  const audioUrl = useWavObjectURL(data?.audio_wav_base64)
   if (!data?.audio_wav_base64) return null
   const metrics = [
     { label: 'Duration', value: data.audio_duration_s + 's' },
@@ -104,7 +136,11 @@ function AudioSnippet({ data }) {
         <i className="fas fa-headphones" style={{ color: 'var(--color-primary)' }} /> Audio Snippet
       </h4>
       <div style={{ background: 'var(--color-bg-primary)', border: '1px solid var(--color-border)', borderRadius: 'var(--radius-md)', padding: 'var(--spacing-sm)' }}>
-        <WaveformPlayer src={`data:audio/wav;base64,${data.audio_wav_base64}`} height={64} />
+        {audioUrl
+          ? <WaveformPlayer src={audioUrl} height={64} />
+          : <div data-testid="audio-snippet-unavailable" style={{ fontSize: '0.75rem', color: 'var(--color-text-secondary)', padding: 'var(--spacing-xs)' }}>
+              <i className="fas fa-triangle-exclamation" /> Audio clip not playable — it was truncated when recorded (raise Max Body Bytes in the tracing settings).
+            </div>}
         <div style={{ display: 'grid', gridTemplateColumns: 'repeat(auto-fill, minmax(120px, 1fr))', gap: 'var(--spacing-xs)', fontSize: '0.75rem', marginTop: 'var(--spacing-sm)' }}>
           {metrics.map(m => (
             <div key={m.label} style={{ background: 'var(--color-bg-secondary)', borderRadius: 'var(--radius-sm)', padding: 'var(--spacing-xs)' }}>
diff --git a/core/schema/transcription.go b/core/schema/transcription.go
index 747adac94..8414fd0ba 100644
--- a/core/schema/transcription.go
+++ b/core/schema/transcription.go
@@ -24,6 +24,11 @@ type TranscriptionResult struct {
 	Text     string                 `json:"text"`
 	Language string                 `json:"language,omitempty"`
 	Duration float64                `json:"duration,omitempty"`
+	// Eou reports that the decode ended on the model's end-of-utterance
+	// special token (emitted by streaming-EOU models such as
+	// parakeet_realtime_eou_120m-v1; always false elsewhere). The marker
+	// itself never appears in Text.
+	Eou bool `json:"eou,omitempty"`
 }
 
 type TranscriptionSegmentSeconds struct {
diff --git a/core/services/nodes/health_mock_test.go b/core/services/nodes/health_mock_test.go
index 86ac5cdcb..429087f52 100644
--- a/core/services/nodes/health_mock_test.go
+++ b/core/services/nodes/health_mock_test.go
@@ -241,6 +241,9 @@ func (c *fakeBackendClient) AudioTransformStream(_ context.Context, _ ...ggrpc.C
 func (c *fakeBackendClient) AudioToAudioStream(_ context.Context, _ ...ggrpc.CallOption) (grpc.AudioToAudioStreamClient, error) {
 	return nil, nil
 }
+func (c *fakeBackendClient) AudioTranscriptionLive(_ context.Context, _ ...ggrpc.CallOption) (grpc.AudioTranscriptionLiveClient, error) {
+	return nil, nil
+}
 func (c *fakeBackendClient) Forward(_ context.Context, _ ...ggrpc.CallOption) (grpc.ForwardClient, error) {
 	return nil, nil
 }
diff --git a/core/services/nodes/inflight_test.go b/core/services/nodes/inflight_test.go
index 5fc9820e7..1b5755cd6 100644
--- a/core/services/nodes/inflight_test.go
+++ b/core/services/nodes/inflight_test.go
@@ -195,6 +195,10 @@ func (f *fakeGRPCBackend) AudioToAudioStream(_ context.Context, _ ...ggrpc.CallO
 	return nil, nil
 }
 
+func (f *fakeGRPCBackend) AudioTranscriptionLive(_ context.Context, _ ...ggrpc.CallOption) (grpc.AudioTranscriptionLiveClient, error) {
+	return nil, nil
+}
+
 func (f *fakeGRPCBackend) Forward(_ context.Context, _ ...ggrpc.CallOption) (grpc.ForwardClient, error) {
 	return nil, nil
 }
diff --git a/core/trace/backend_trace.go b/core/trace/backend_trace.go
index d0a88c923..a3d04d466 100644
--- a/core/trace/backend_trace.go
+++ b/core/trace/backend_trace.go
@@ -75,8 +75,8 @@ var (
 // trace) or any TTS run (~1.3 MiB of audio_wav_base64 per trace) blows the
 // payload past tens of MiB and locks the Traces page in a loading state.
 //
-// 0 disables the cap. Set on the first InitBackendTracingIfEnabled call only,
-// matching the sync.Once-guarded maxItems semantics.
+// 0 disables the cap. Guarded by backendMu; refreshed on EVERY
+// InitBackendTracingIfEnabled call — see below.
 var backendMaxBodyBytes int
 
 func InitBackendTracingIfEnabled(maxItems, maxBodyBytes int) {
@@ -86,7 +86,6 @@ func InitBackendTracingIfEnabled(maxItems, maxBodyBytes int) {
 		}
 		backendMu.Lock()
 		backendTraceBuffer = circularbuffer.New[*BackendTrace](maxItems)
-		backendMaxBodyBytes = maxBodyBytes
 		backendMu.Unlock()
 
 		go func() {
@@ -99,11 +98,26 @@ func InitBackendTracingIfEnabled(maxItems, maxBodyBytes int) {
 			}
 		}()
 	})
+
+	// The body cap tracks the LATEST call, not the first: tracing_max_body_bytes
+	// is runtime-mutable via the settings API (ApplyRuntimeSettings), and every
+	// recording path calls this right before RecordBackendTrace with the current
+	// appConfig value. Freezing the cap on first init meant a raised setting let
+	// producers (e.g. trace.AudioSnippet, which reads the live value) embed
+	// payloads that this recorder then stomped with the "<truncated: N bytes>"
+	// marker — corrupting audio_wav_base64 into an unplayable string. maxItems
+	// keeps first-call semantics: resizing the ring buffer would drop entries.
+	backendMu.Lock()
+	backendMaxBodyBytes = maxBodyBytes
+	backendMu.Unlock()
 }
 
 func RecordBackendTrace(t BackendTrace) {
-	if t.Data != nil && backendMaxBodyBytes > 0 {
-		t.Data = capDataStrings(t.Data, backendMaxBodyBytes)
+	backendMu.Lock()
+	maxBody := backendMaxBodyBytes
+	backendMu.Unlock()
+	if t.Data != nil && maxBody > 0 {
+		t.Data = capDataStrings(t.Data, maxBody)
 	}
 	select {
 	case backendLogChan <- &t:
diff --git a/core/trace/backend_trace_cap_test.go b/core/trace/backend_trace_cap_test.go
index b850bd1ae..6636ab438 100644
--- a/core/trace/backend_trace_cap_test.go
+++ b/core/trace/backend_trace_cap_test.go
@@ -28,8 +28,9 @@ const (
 
 var _ = Describe("RecordBackendTrace Data capping", func() {
 	BeforeEach(func() {
-		// Init is sync.Once so the first test wins; subsequent tests just
-		// clear the buffer. The cap value below has to match the first call.
+		// The ring buffer is allocated once (sync.Once) but the body cap
+		// follows the latest call, so each spec re-establishes smallCap here
+		// regardless of what a previous spec set.
 		trace.InitBackendTracingIfEnabled(64, smallCap)
 		trace.ClearBackendTraces()
 	})
@@ -131,6 +132,30 @@ var _ = Describe("RecordBackendTrace Data capping", func() {
 		got := trace.GetBackendTraces()[0]
 		Expect(got.Data["messages"]).To(Equal(preTruncated))
 	})
+
+	It("applies a runtime-raised cap without a restart", func() {
+		// tracing_max_body_bytes is runtime-mutable via the settings API.
+		// Producers like AudioSnippet read the live value, so the recorder
+		// must too — under the old first-call-wins behaviour a raised cap
+		// kept truncating audio_wav_base64 payloads the producer had already
+		// let through, corrupting them into "<truncated: N bytes>" markers.
+		oversizedForOldCap := strings.Repeat("w", smallCap*4)
+
+		trace.InitBackendTracingIfEnabled(64, smallCap*8) // simulate the settings raise
+		trace.RecordBackendTrace(trace.BackendTrace{
+			Timestamp: time.Now(),
+			Type:      trace.BackendTraceTranscription,
+			ModelName: "m",
+			Data: map[string]any{
+				"audio_wav_base64": oversizedForOldCap,
+			},
+		})
+
+		Eventually(trace.GetBackendTraces).Should(HaveLen(1))
+		got := trace.GetBackendTraces()[0]
+		Expect(got.Data["audio_wav_base64"]).To(Equal(oversizedForOldCap),
+			"a payload under the raised cap must survive intact")
+	})
 })
 
 var _ = Describe("TruncateToBytes", func() {
diff --git a/coverage-baseline.txt b/coverage-baseline.txt
index 4d966c627..0b320ba1d 100644
--- a/coverage-baseline.txt
+++ b/coverage-baseline.txt
@@ -1 +1 @@
-45.0
+48.5
diff --git a/docs/content/features/openai-realtime.md b/docs/content/features/openai-realtime.md
index a6e99267e..f339b21d7 100644
--- a/docs/content/features/openai-realtime.md
+++ b/docs/content/features/openai-realtime.md
@@ -56,6 +56,41 @@ pipeline:
 
 All streaming flags are off by default, so existing pipelines are unaffected.
 
+### Turn detection
+
+Turn detection decides when the user has finished speaking and the pipeline should respond. Two modes are supported, matching the OpenAI session schema:
+
+- **`server_vad`** (default): silence-based. The VAD model watches the audio and the turn commits after `silence_duration_ms` (default 500 ms) of silence. Simple and model-agnostic, but a fixed silence window forces a trade-off: too short and it cuts the user off during mid-sentence pauses, too long and responses feel sluggish.
+- **`semantic_vad`**: model-driven. The transcription model itself signals end-of-utterance and the silence window becomes dynamic: short right after the model emits its end-of-utterance token, much longer when it does not — so pausing to think no longer gets cut off, while finished sentences get a fast response.
+
+`semantic_vad` requires a transcription model that emits an end-of-utterance token over a cache-aware streaming decode — currently `parakeet-cpp-realtime_eou_120m-v1` (the model is trained to distinguish "paused, expecting a reply" from "paused mid-thought"). The realtime pipeline feeds it the microphone audio live while the user speaks. With any other transcription backend the session degrades gracefully to silence-only detection using the eagerness timeout below (a warning is logged once). The model also emits a distinct end-of-backchannel token (`<EOB>`) for short acknowledgments like "uh-huh": those are transcribed but never treated as the user yielding the turn.
+
+Sessions can opt in via `session.update` (`turn_detection: {"type": "semantic_vad", "eagerness": "medium"}`), or the pipeline can set a server-side default so clients need no changes:
+
+```yaml
+name: gpt-realtime
+pipeline:
+  vad: silero-vad-ggml
+  transcription: parakeet-cpp-realtime_eou_120m-v1
+  llm: qwen3-4b
+  tts: tts-1
+  turn_detection:
+    type: semantic_vad   # default for sessions on this model (server_vad if unset)
+    eagerness: medium    # low | medium | high | auto (auto == medium)
+    retranscribe: false  # see below
+```
+
+A client `session.update` still overrides `type` and `eagerness` per session.
+
+**Eagerness** sets the fallback silence window used when no end-of-utterance token was seen (the model missed it, or the user genuinely trails off): `low` waits 8 s, `medium`/`auto` 4 s, `high` 2 s — the same max-timeout semantics OpenAI documents. After the token is seen, the turn commits on the next VAD tick (~300 ms).
+
+**Live captions**: while the user speaks, `semantic_vad` streams `conversation.item.input_audio_transcription.delta` events under the item id the commit will later reuse, so clients can render the words as they are recognized. The `completed` event at commit carries the authoritative transcript and replaces the partial text (with `retranscribe: true` it may differ from the captions); a turn discarded before commit emits `conversation.item.input_audio_transcription.failed` so clients can retract its captions.
+
+**`retranscribe`** (server-side only, semantic_vad only) cross-checks the streaming decode against a batch decode at commit time:
+
+- `false` (default): the transcript accumulated from the live stream is used as-is — the model runs once per utterance and the LLM starts immediately at commit.
+- `true`: the committed audio is re-transcribed offline. If the batch decode also ends with the end-of-utterance token the turn proceeds (using the batch transcript); if it does **not**, the commit is cancelled and the session keeps listening — treating the streaming token as a false positive. Both transcripts are compared and logged, which makes this mode a useful diagnostic for how well the streaming and batch decodes align, at the cost of one extra decode per turn.
+
 ### Disabling thinking
 
 For reasoning models, you can force the pipeline LLM's thinking off without editing the LLM model config:
diff --git a/docs/design/realtime-state-machines.md b/docs/design/realtime-state-machines.md
new file mode 100644
index 000000000..d88313760
--- /dev/null
+++ b/docs/design/realtime-state-machines.md
@@ -0,0 +1,603 @@
+# Realtime API state machines — map & re-architecture research
+
+Status: research / design (compaction phase). No code changes implied yet.
+
+The realtime API (`core/http/endpoints/openai/realtime*.go`) grew feature-by-feature
+(server_vad → semantic_vad/EOU, streaming pipeline, tool turns, compaction, voice
+gate, sound detection, WebRTC). The result is several **implicit** state machines
+whose states and transitions are scattered across goroutine-local variables, shared
+`Session`/`Conversation` fields under five different mutexes, raw channels, and
+`context` cancellation. State is *inferred* from variable combinations rather than
+*stored*; several illegal/inconsistent states are reachable.
+
+This document (1) inventories the implicit machines, (2) catalogues the cross-cutting
+failure modes, (3) researches how to re-implement them explicitly and verifiably, and
+(4) lists the invariants a correct implementation must guarantee.
+
+All line numbers are against the current `feat/realtime-semantic-vad-eou` branch and
+will drift; treat them as anchors.
+
+---
+
+## Part 1 — Inventory of the implicit state machines
+
+There is **no `state`/`status` field anywhere** in `Session` or `Conversation`. Every
+machine below is reconstructed from variable combinations.
+
+### M1. Connection / transport lifecycle
+
+Two transports implement one `Transport` interface; their lifecycles differ sharply.
+
+- **WebSocket** (`realtime_transport_ws.go`): essentially stateless — a `*websocket.Conn`
+  plus a write `sync.Mutex`. No send queue, no send goroutine, no closed flag. "Closed"
+  = `ReadEvent` returns an error.
+- **WebRTC** (`realtime_transport_webrtc.go`): an explicit-ish machine built from raw
+  channels — `dcReady` (closed by `dcDone sync.OnceFunc`), `closed` (closed by
+  `closeDone sync.OnceFunc` from *either* `OnConnectionStateChange` or `Close()`),
+  `flushed`, `sessionCh` (cap 1), `inEvents`/`outEvents` (cap 256), plus a `sendLoop`
+  goroutine and RTP counters under `rtpMu`.
+
+Conceptual states (`connecting → data-channel-open → session-created → active →
+closing → closed`) are **not stored**; the only persisted membership state is the
+`sessions[sessionID]` map entry (exists `realtime.go:631`→`:1009`). `session-created`
+and `session-updated` are *events*, not states.
+
+Teardown order (`realtime.go:989-1010`): `cancelActiveResponse` → `close(decodeDone)`
+→ `close(done)` (if VAD running) → `close(soundWindowDone)` → `wg.Wait()` →
+`delete(sessions,…)`. Then, WebRTC only, `defer transport.Close()` → `closeDone()` →
+`<-flushed` → `pc.Close()`.
+
+### M2. Audio-input / turn-detection (server_vad + semantic_vad + EOU)
+
+One `handleVAD` goroutine (`realtime.go:1322`) on a 300 ms ticker. Mode is
+**re-evaluated every tick** under `sessionLock` (`:1350-1357`) so it can flip mid-turn.
+
+- **server_vad** states are encoded by the goroutine-local `speechStarted bool`
+  (`:1337`) plus silence *measured* (not timed) as `audioLength - segEndTime >
+  silenceThreshold` recomputed each tick (`:1461`). States: idle → inspecting →
+  speech-detected → awaiting-commit → committing → transcribing/responding.
+  "Holdback" is a byte count (`noSpeechHoldbackSec*rate*2`), not a timer.
+- **semantic_vad** adds the `liveTurnState` struct (`realtime_semantic_vad.go`):
+  `live` (nil = closed), `unavailable` (sticky degrade → behaves as server_vad),
+  `eouAtSec`, `parts`, `itemID` (allocated at turn open so captions can stream),
+  `deltasSent`. Extra states: closed, open/streaming-ASR, EOU-pending, EOU-fallback
+  (dynamic silence threshold 0 s when EOU pending, else eagerness 8/4/2 s),
+  retranscribe-gate, EOU-rejected, finished, discarded.
+  The one cross-goroutine edge: the backend recv callback pushes onto `events`
+  (buffered 64, **non-blocking — drops on overflow**, `:116-117`); `drainEvents`
+  reads it on the tick.
+- **Voice gate** (`realtime_voicegate.go`) runs *inside* the commit goroutine:
+  resolving → authorized/rejected, with a sticky `voiceVerified` (under `gateMu`) for
+  `when:first`.
+
+### M3. Response lifecycle (+ synchronous tool-turn recursion)
+
+A response is "active" iff `Session.activeResponseDone` is non-nil and unclosed
+(`responseMu`, `:172`). One goroutine owns it; its lifetime == that channel's. State
+is observable only through the `response.*` event stream and `ItemStatus*` on the
+assistant item. Logical states: idle → starting → generating-text →
+generating-audio → tool-call-pending → tool-executing → awaiting-next-tool-turn →
+cancelling → done(completed|cancelled) | failed.
+
+- Cancellation is **cooperative at discrete checkpoints** (`ctx.Err()` at
+  `:2172,2364,2394`, `realtime_stream.go:193,202,241,259`).
+- The tool loop is **synchronous recursion on the same goroutine**, bounded by
+  `maxAssistantToolTurns = 10`; each level mints a fresh `responseID` and emits a full
+  `response.created … response.done{Completed}` cycle — so one user turn can emit
+  *several* `response.done{Completed}` events under different IDs.
+- Terminal events are **not exactly-once**: failed paths `return` with no
+  `response.done`; cancelled paths emit `done{Cancelled}`; the completed terminal is
+  unconditional at the tail of `emitToolCallItems`.
+
+### M4. Conversation / compaction
+
+`Conversation`: `Items` + `Memory` (rolling summary) under `Lock`; `compacting
+atomic.Bool`. States: normal ↔ compacting. Compaction (`realtime_compaction.go`)
+snapshots overflow under `Lock`, summarizes **unlocked**, re-locks and commits guarded
+by an optimistic head-`prefixMatches` check. It is launched **only by turn-0
+`triggerResponse`** (`:1963`), off the response path — so a long agentic turn
+(recursion calls `triggerResponseAtTurn` directly) can append many tool items and
+**never compact** until the next user turn (compaction starvation).
+
+### M5. Streaming sub-machines (transcription, chunker, TTS)
+
+Backend LLM/TTS/transcription streams are **synchronous callback recv loops on the
+caller's goroutine** — no internal goroutines/channels. The only true concurrent FSM is:
+
+- **TTS pipeline** (`realtime_tts_pipeline.go`): one worker goroutine, an **unbounded**
+  mutex-guarded `queue`, a coalesced `wake` chan (cap 1), a `closed` flag, a `done`
+  chan closed once by the worker's `defer`, a lock-free `failed atomic.Bool`, and
+  worker-owned `audio`/`firstErr` that are safe to read only after `wait()` joins via
+  `done`. Idempotent `wait()`; deferred `wait()` backstop guarantees no worker leak.
+- **Chunker** (`realtime_chunker.go`): a pure single-buffer FSM (buffering ↔ emitting,
+  `flush` = hard boundary). **No concurrency guard** — correctness depends entirely on
+  `push`/`flush` being called from one goroutine (the LLM recv loop). On cancel the
+  flush is skipped, so the buffered partial clause is intentionally dropped.
+- **Transcription** (`realtime_transcription.go`): stateless straight-line function;
+  "streaming" is just repeated synchronous callbacks.
+
+---
+
+## Part 2 — Cross-cutting failure modes (why it's a mess)
+
+1. **Shared mutable `Session` config with inconsistent locking (the core problem).**
+   `updateSession`/`updateTransSession` mutate `Voice`, `Instructions`, `Tools`,
+   `OutputModalities`, `ModelConfig`, **`ModelInterface`**, sample rates, and the
+   shared `InputAudioTranscription` pointer under `sessionLock`. But in-flight
+   response/speech/transcription goroutines read those same fields **without any
+   lock** (`realtime_speech.go:72-79`, `realtime_stream.go:228`, semantic_vad
+   `:110`). Reloading `ModelInterface` mid-response is a data race against a running
+   Predict/TTS/Transcribe, and the swapped-out model is dropped without Close.
+   `sessionLock` actually guards the *global `sessions` map*; it only mutually excludes
+   the handful of other sites that happen to also take it (handleVAD tick, the commit
+   branch). Response goroutines never take it.
+
+2. **Two writers of the active-response pair.** `startResponse`/`cancelActiveResponse`
+   are called from both the main read loop (`:836,973,981,990`) **and** the VAD
+   goroutine (barge-in `:1429`, end-of-speech `:1543`). `responseMu` guards only the
+   field swap; the `<-done` wait is outside the lock. A read-loop `ResponseCreate`
+   racing a VAD `speech_stopped` can have both read the same prior pair, both
+   overwrite, and briefly leave **two live response goroutines** both appending to
+   `conv.Items`. The "never overlapping" guarantee holds only under the unstated
+   assumption that responses are driven from a single goroutine — which is false.
+
+3. **State is inferred, not stored.** Whether a response is active, whether a turn is
+   open, whether audio is being buffered — all are derived from combinations of
+   booleans, nil-checks, channel state, and `context` error. No single source of truth;
+   no place to assert an invariant.
+
+4. **Reachable inconsistent states.** e.g. after a semantic-VAD `discardTurn`,
+   `speechStarted` stays true while `lts` is closed, so they disagree and the next
+   onset suppresses `SpeechStarted`. Mid-stream cancel leaves the client having seen
+   `output_item.added`/`content_part.added` with no matching `…done`. `events`-channel
+   overflow silently drops an EOU, degrading EOU-pending to the 2–8 s fallback.
+
+5. **Lifecycle/ownership gaps.** `decodeOpusLoop` is a bare `go` (not in `wg`) and can
+   run after `delete(sessions,…)`. `handleIncomingAudioTrack` (pion `OnTrack`
+   goroutine) has **no shutdown signal** — it appends to `OpusFrames` until `ReadRTP`
+   errors, unjoined by `wg`. WebRTC `outEvents` enqueued before the DC opens are lost
+   on early failure.
+
+6. **The `done`-channel/`vadServerStarted` toggle dance.** A single `done` local
+   (`:655`) is reassigned to a fresh channel on each VAD start (`:662`) and closed at
+   toggle-off (`:670`) and teardown (`:999`). Safe today only because one goroutine
+   owns it — one variable name meaning different channels over time is a structural
+   fragility, not an explicit lifecycle.
+
+---
+
+## Part 3 — Research: explicit, verifiable re-implementation
+
+The stated goal: **transitions cannot lead to an inconsistent state, and we
+can verify that.** Four layered techniques, from architecture down to runtime.
+
+### 3.1 Architecture: single-writer session actor (share by communicating)
+
+The root cause of (1) and (2) is *shared mutable state across goroutines*. The most
+effective, idiomatic-Go fix is to give each session **one owning goroutine** that holds
+all session state with **no locks**, and have every other goroutine communicate with it
+over channels:
+
+```
+            ┌────────── inbound events ──────────┐
+ transport ─┤  client events (ReadEvent)         │
+   VAD     ─┤  vad: speech_started/stopped, EOU   ├─►  session actor  ──► outbound
+ model I/O ─┤  llm/tts/asr results, errors        │   (owns ALL state,    events
+ timers    ─┤  ticks, deadlines                   │    single goroutine)
+            └────────────────────────────────────┘
+```
+
+- All state mutation happens in one place; `sessionLock`, `responseMu`, `gateMu`,
+  `AudioBufferLock`, `OpusFramesLock`, `Conversation.Lock` collapse into "the actor owns
+  it." Worker goroutines (Predict/TTS/ASR, opus decode, RTP read) become **stateless
+  effects** that take an immutable snapshot in and send results back as events.
+- `ModelInterface` reload becomes an event the actor sequences relative to responses
+  (e.g. drain/cancel the active response first), eliminating the mid-call swap race.
+- Cancellation stays `context`-based but the actor is the only thing that starts/stops
+  responses, killing the dual-writer race (2).
+
+This is the actor / CSP model. It does not by itself prove correctness — that's what
+3.2–3.4 add — but it makes the state *centralized and explicit*, which is the
+precondition for verification.
+
+### 3.2 Make illegal states unrepresentable (type-level)
+
+Inside the actor, model each machine as an explicit state with a **pure transition
+function** `next(state, event) (state, []effect, error)`:
+
+- Represent states as a Go **sealed sum type** (interface with an unexported marker
+  method, one struct per state carrying only that state's data) so e.g. `EOU-pending`
+  data cannot be accessed while `Closed`. This is the Go equivalent of an ADT and is the
+  single biggest lever for "inconsistent state unrepresentable."
+- The transition function is **total and pure** (no I/O, no goroutines): it returns the
+  next state plus a list of *effects* (send event, start Predict, arm timer) that the
+  actor executes. Pure transition functions are trivially unit-testable and
+  property-testable.
+- An unexpected `(state, event)` pair returns an explicit error / stays put and logs —
+  never a silent half-transition.
+
+The five machines (M1–M5) are **hierarchical** (a statechart): Connection ⊃ Turn(M2) and
+Response(M3) ⊃ Tool-turn; Conversation(M4) and the TTS sub-machine(M5) are largely
+orthogonal regions. Model them as nested states rather than one flat enum.
+
+Library options (all guard *logic*, none give concurrency safety — that's 3.1's job):
+- `qmuntal/stateless` — declarative, hierarchical, guard/entry/exit actions; closest fit.
+- `looplab/fsm` — simpler, flat, event-callback based.
+- Hand-rolled transition tables — most control, no dep; recommended here given the
+  hierarchy and the desire to keep transitions auditable. `go.mod` currently pulls no
+  FSM lib.
+
+### 3.3 Design-time formal verification (prove the protocol)
+
+Before/while coding, model the *protocol* (not the Go) in a model checker to prove the
+hard concurrency properties exhaustively:
+
+- **FizzBee** (the adopted tool) to specify the actor's event/state space and check: no
+  two concurrent active responses; barge-in + ResponseCancel + speech_stopped
+  interleavings never deadlock or drop a turn; every `response.created` is eventually
+  followed by exactly one terminal; teardown joins all goroutines. The
+  cancel/startResponse/barge-in interplay (failure mode 2) is exactly the kind of
+  liveness/safety property model checkers exist for.
+- Keep the spec small and focused on the M2↔M3 boundary (turn detection ↔ response),
+  which is where the real races live.
+
+### 3.4 Implementation-time & runtime verification
+
+- **Exhaustive table-driven transition tests**: since transitions are a pure function,
+  enumerate `(state × event)` and assert the result for every cell, including the
+  illegal cells (assert they error / no-op). This is the practical stand-in for a proof
+  that "no transition leads to inconsistent state."
+- **Property-based testing**: feed random event sequences into the actor and assert
+  global invariants hold after every step (Part 4). This catches reachable-bad-state
+  bugs the example tests miss. (Implemented as Ginkgo/Gomega seeded random-walk specs
+  — see Part 6.2 for why not `rapid`.)
+- **Race detector under load**: run the property tests with `-race`; with 3.1 there
+  should be *zero* shared mutable state, so `-race` cleanliness becomes a meaningful
+  signal rather than noise.
+- **Runtime invariant assertions + structured transition logging**: log every
+  `state --event--> state` with the session ID; assert invariants in dev builds.
+  Replace today's silent degradations (dropped EOU, suppressed SpeechStarted) with
+  explicit, observable transitions.
+
+### 3.5 Recommended path for LocalAI
+
+1. Specify the M2↔M3 protocol in FizzBee; nail the cancel/barge-in invariants.
+2. Introduce a per-session actor (3.1) that owns existing state behind the current
+   `Transport` interface — incremental, keeps the event types.
+3. Replace each implicit machine with an explicit sealed-state transition function
+   (3.2), one at a time: Response first (highest-risk dual-writer), then Turn/VAD, then
+   Connection, then leave TTS/Chunker/Compaction (already mostly self-contained) for
+   last.
+4. Land the table-driven + property-based test suites alongside each machine; gate on
+   `-race`.
+
+---
+
+## Part 4 — Invariants a correct implementation must guarantee
+
+These are the "cannot reach inconsistent state" properties to encode as assertions,
+property-test oracles, and FizzBee invariants:
+
+1. **At most one active response per session** at any instant (no overlapping response
+   goroutines; no two appenders to `conv.Items` from response logic).
+2. **Exactly one terminal per `response.created`**: every emitted `response.created` is
+   followed by exactly one of `response.done{completed|cancelled}` or a defined failure
+   terminal — never zero, never two. (Decide whether agentic tool turns are one
+   response or many; make it explicit either way.)
+3. **No `response.*` content events after that response's terminal.** No
+   `output_item.added`/`content_part.added` without a matching `…done` (even on cancel).
+4. **Turn/response coupling**: `speechStarted` ⟺ a live turn is open; barge-in cancels
+   the active response *before* a new turn's commit starts.
+5. **No config field is read by a worker while being mutated** (reload is sequenced
+   against in-flight work; a response uses an immutable snapshot of model/voice/tools).
+6. **Audio buffer monotonic & consistent**: commit/clear/append/VAD-drop never lose or
+   double-consume bytes; `clear` resets *all* turn state (including `lts`).
+7. **No dropped control events**: an EOU/Final is never silently lost (no overflow-drop
+   on a bounded channel that changes turn outcome).
+8. **Clean teardown**: every spawned goroutine (incl. `decodeOpusLoop`,
+   `handleIncomingAudioTrack`) is signalled and joined before the session is deleted; no
+   sends after transport close.
+9. **Compaction safety & liveness**: compaction never races a reader into a torn
+   `Items`; and it actually runs when the trigger is exceeded, including inside long
+   agentic turns.
+10. **Idempotent close**: every channel/resource closed exactly once on every path.
+
+---
+
+## Implementation status
+
+- **M3 (response coordination) — first vertical slice landed.** Explicit machine in
+  `core/http/endpoints/openai/respcoord/` (sealed `State`/`Event`/`Effect` sum types, a
+  total pure `Next`, a single-writer `Coordinator`); transition-table + Ginkgo/Gomega
+  seeded-property + concurrent conformance tests (green under `-race`); a deterministic
+  characterization test pinning the legacy dual-writer race. Authoritative spec:
+  `formal-verification/response_lifecycle.fizz`. Gate:
+  `scripts/realtime-conformance.sh` (Go layer always; FizzBee when pinned) wired as
+  `make test-realtime-conformance` and `.github/workflows/realtime-conformance.yml`. See
+  `formal-verification/README.md`.
+- **Gate is fail-closed and pinned (done).** `fizzbee.sha256` pins all four platforms;
+  the gate hard-fails without FizzBee; CI installs+caches the verified binary with no skip;
+  pre-commit runs the gate on `respcoord/**` or `formal-verification/**` changes.
+- **M3 wired into the live session (done).** `realtime_respcoord.go` adds `responseSink`
+  (the `respcoord.Coordinator` + a goroutine-spawning effect sink) to `Session`. The legacy
+  `startResponse`/`cancelActiveResponse` and the dual-writer `activeResponse*`/`responseMu`
+  fields are gone; all six call sites (manual commit, `response.create`, VAD speech-stopped,
+  `response.cancel`, barge-in, teardown) route through it. Barge-in/cancel are now
+  non-blocking (removes the legacy ~300 ms VAD stall); teardown stops input goroutines, then
+  cancels + `wait()`s all response goroutines before deleting the session. `EmitTerminal` is
+  a no-op for now (the response body still emits its own `response.done`) — coordination is
+  fixed without changing wire behavior. Verified: builds, `go vet` clean, all 300 openai
+  specs pass under `-race`, and `make test-realtime` (the mock-backend realtime e2e suite,
+  12 specs over WS + WebRTC) passes.
+- **Single authoritative terminal + populated Output/Usage (done).** One
+  `response.created` and one `response.done` per `response.create`, even across the
+  server-side agentic tool loop (which is now internal turns of one response, not one
+  terminal each). A `liveResponse` accumulator threads through
+  `triggerResponse`→`triggerResponseAtTurn`→`emitToolCallItems`/`streamLLMResponse`,
+  collecting output items as they complete and summing token usage; `triggerResponse`
+  emits the one terminal (completed/cancelled; failed still emits none, matching legacy)
+  with `Output` + `Usage` filled in (both were always empty before). Verified: 301 openai
+  specs under `-race` (incl. a new `triggerResponse` terminal test) + `make test-realtime`.
+  Design note: emission is hoisted to `triggerResponse` (the body owns it) rather than the
+  coordinator's `EmitTerminal` effect — at cancel/supersede time the coordinator doesn't
+  yet have the body's partial Output, so the body, which does, is the natural emitter. The
+  coordinator still guarantees one body run per `response.create`, so "exactly one terminal"
+  holds transitively; `EmitTerminal` remains the spec's logical marker (no-op in the sink).
+- **M2 (turn detection) — model + spec landed AND wired into the live session.**
+  Explicit machine in `core/http/endpoints/openai/turncoord/` (sealed `State` =
+  `Idle | Speaking{Turn}`, `Event` = `Onset | Silence | Abort{Reason}`, `Effect` =
+  `BargeIn | OpenTurn | EmitSpeechStarted | EmitSpeechStopped | CommitTurn |
+  DiscardTurn`, a total pure `Next`, a single-writer `Coordinator`);
+  transition-table + Ginkgo/Gomega seeded-property + concurrent conformance tests
+  (green under `-race`). The fix it encodes: "speech detected" and "a turn is open"
+  — the two legacy variables (`speechStarted` and `lts.open()`) that a `discardTurn`
+  could desync (failure mode 4) — become ONE state, so the next-onset suppression
+  bug is unrepresentable. Authoritative spec:
+  `formal-verification/turn_lifecycle.fizz`, with an `always assertion Coupled`
+  (speech ⟺ turn-open), verified non-vacuous (deleting `self.speech = 0` in `Abort`
+  makes the checker report `Coupled` violated). The gate
+  (`scripts/realtime-conformance.sh`, pre-commit, CI) covers `turncoord` and the
+  spec. **Wired (done):** `realtime_turncoord.go` adds `turnSink` (the
+  `turncoord.Coordinator` + a loop-local effect sink) to `handleVAD`. The legacy
+  `speechStarted` bool is gone; onset/no-speech-clear/commit/teardown route through
+  `coord.Apply(Onset|Abort{NoSpeech}|Silence|Abort{Teardown})`. The turn id is
+  minted at onset and carried by the coordinator to the committed event (so it
+  matches the live captions); `liveTurnState.openTurn` now takes that id instead of
+  minting its own. A semantic→server mode switch mid-turn is deliberately NOT an
+  abort (it only drops the orphaned live stream and lets the turn continue under
+  server_vad), so it stays inline. Verified: builds, `go vet`/`gofmt`/golangci-lint
+  clean, all openai specs under `-race`, and `make test-realtime` (12 e2e specs over
+  WS + WebRTC) pass.
+- **M1 (connection lifecycle) — model + spec landed AND wired.** Explicit machine
+  in `core/http/endpoints/openai/conncoord/` (sealed `State` = `Live{VADRunning} |
+  Torn`, `Event` = `SetVAD | Close`, `Effect` = `StartVAD | StopVAD | Teardown`, a
+  total pure `Next`, a single-writer `Coordinator`); transition-table +
+  Ginkgo/Gomega seeded-property + concurrent conformance tests (green under
+  `-race`). It replaces the legacy `vadServerStarted` bool + the `done` channel
+  reassigned on every turn-detection toggle and closed from two sites (failure
+  mode 6): the coordinator owns whether the VAD goroutine runs, so its done channel
+  is closed exactly once and never resurrected after teardown; `Close` moves to
+  `Torn`, which absorbs every later event so teardown runs exactly once even from
+  multiple exit paths (invariants #8, #10). Spec:
+  `formal-verification/conn_lifecycle.fizz` (`always assertion TeardownOnce` +
+  `NoRunAfterTorn`), verified non-vacuous (deleting `self.torn = 1` in `Close`
+  fails `TeardownOnce`). **Wired (done):** `realtime_conncoord.go` adds `connSink`;
+  the handler's setup/`toggleVAD`/teardown now route through
+  `conn.setVAD(...)`/`conn.close()`; the `done`/`vadServerStarted` locals and the
+  manual ordered-teardown block are gone (the Teardown effect performs that
+  sequence). Verified: builds, vet/gofmt/golangci-lint clean, openai specs under
+  `-race`, `make test-realtime` (12 e2e WS+WebRTC), full conformance gate green
+  (3 Go packages + 3 fizz specs PASSED).
+- **M4 (conversation compaction) — model + spec landed AND wired.** Explicit
+  machine in `core/http/endpoints/openai/compactcoord/` (sealed `State` =
+  `Idle | Running`, `Event` = `Trigger | Finished`, `Effect` = `StartCompaction`,
+  a total pure `Next`, a single-writer `Coordinator`); transition-table +
+  Ginkgo/Gomega seeded-property + concurrent (effect-spawns-work-reports-Finished)
+  conformance tests (green under `-race`). It makes the legacy `compacting
+  atomic.Bool` single-flight guard explicit: a `Trigger` while `Running` is dropped
+  (not superseded — compaction is idempotent work on the same overflow), so at most
+  one summarize+evict runs per conversation (invariant #9). Spec:
+  `formal-verification/compaction.fizz` (`always assertion SingleFlight`), verified
+  non-vacuous (deleting the `if self.active == 0` guard fails `SingleFlight`).
+  **Wired (done):** `realtime_compactcoord.go` adds `compactionSink`; the
+  `Conversation.compacting atomic.Bool` is replaced by `Conversation.compaction
+  *compactionSink` (built at conversation creation with the summarize+evict run
+  closure); `maybeCompact` now calls `conv.compaction.trigger()`. The summarizer
+  resolution + `compact()` stay in the sink's spawned goroutine (off the response
+  path); `compact()` itself (snapshot/summarize-unlocked/optimistic-commit) is
+  unchanged. Verified: builds, vet/gofmt/golangci-lint clean, openai specs under
+  `-race`, `make test-realtime` (12 e2e), full conformance gate green (4 Go
+  packages + 4 fizz specs PASSED).
+- **M5 (TTS pipeline lifecycle) — model + spec landed AND wired.** Explicit
+  machine in `core/http/endpoints/openai/ttscoord/` (sealed `State` =
+  `Open | Closing | Closed`, `Event` = `Close | WorkerExited`, `Effect` = `Wake`, a
+  total pure `Next`, a single-writer `Coordinator`); transition-table +
+  Ginkgo/Gomega seeded-property + two-writer conformance tests (green under
+  `-race`). It is a genuine two-writer machine (producer `Close` from `wait()` vs
+  worker `WorkerExited`); it makes the legacy `closed bool` lifecycle explicit and
+  monotonic, fixes the latent enqueue-after-close silent drop (enqueue is now gated
+  on `Open`), and guarantees idempotent `wait()` (one wake / one worker join). The
+  poison `failed` latch stays a lock-free `atomic.Bool` (orthogonal, read per
+  clause on the worker's hot path). Spec: `formal-verification/tts_pipeline.fizz`
+  (`always assertion WakeOnce` + `Monotonic`), verified non-vacuous (deleting the
+  `if self.phase == 0` guard in `Close` fails `WakeOnce`). **Wired (done):**
+  `realtime_tts_pipeline.go`'s `ttsPipeline` embeds the coordinator (and is its
+  effect sink — `Wake` → `signal()`); `closed bool` is gone; the worker checks
+  `closing()` and raises `WorkerExited` on drain, `enqueue` rejects once not
+  `Open`, `wait()` raises `Close`. The wake/done channel mechanics are unchanged.
+  Verified: builds, vet/gofmt/golangci-lint clean, openai specs under `-race`,
+  `make test-realtime` (12 e2e), full conformance gate green (5 Go packages + 5
+  fizz specs PASSED).
+- **All five mapped machines (M1–M5) are now explicit, wired, and verified.** The
+  realtime-conformance gate model-checks all `.fizz` specs and runs all five Go
+  conformance suites under `-race`, fail-closed.
+- **The machines form a hierarchy, and that relationship is now modeled and
+  enforced.** M1 (connection) is the parent region; when it tears down, every child
+  must be terminal. Previously this was only an imperative side effect of
+  `conncoord`'s teardown ordering, with a real gap (M4 compaction was
+  fire-and-forget and could outlive the torn session). Now:
+  - `formal-verification/session_lifecycle.fizz` is a **composition spec** that
+    models conn + its direct children (vad/M2, resp/M3, compaction/M4) as one
+    statechart and asserts `ChildrenDieWithParent` (conn torn ⟹ all children
+    terminal) plus "no child starts after teardown". Its non-vacuity reproduces the
+    exact M4 gap (drop the compaction-terminate line → assertion fails).
+  - `respcoord` (M3) and `compactcoord` (M4) gained an absorbing **`Terminated`**
+    state + a `Shutdown` event, so a response/compaction cannot start after
+    teardown (structural "no resurrection").
+  - `conncoord`'s `Teardown` effect now explicitly drives the children terminal:
+    stop+join the VAD goroutine (M2), `respSink.shutdown()` (M3 → Terminated, joins
+    response goroutines and their M5 pipelines), and `compaction.shutdown()` for
+    every conversation (M4: cancel the in-flight summary via a session-scoped
+    context, then join — **closing the gap**). `compact` now takes a `context` so
+    teardown can bound the join. M2's terminal is realized by the goroutine join and
+    M5's by its existing `Closed`; the persistent coordinators (M3/M4) carry the
+    explicit `Terminated` state.
+
+## Part 5 — Library vs hand-rolled (Go ecosystem, verified 2026-06)
+
+Researched against live GitHub/pkg.go.dev data. **Verdict: hand-roll a typed transition
+table over sealed sum-type states for the per-connection machines.** No Go library gives
+the two properties we most want — *compile-time-illegal states* and a *pure
+`next(state,event)->(state,[]effect,error)`*; every library models states as
+`string`/`int`/`any` and fires side-effecting callbacks mid-transition. And since the
+actor (Part 3.1) drives everything from one goroutine, the libraries' main value-add —
+internal locking — is dead weight.
+
+Library landscape:
+
+| Option | Stars / status | Hierarchy | Typed states | Illegal-transition | Viz | Fit |
+|---|---|---|---|---|---|---|
+| **hand-rolled table + sealed sum types** | — | DIY (parent field / nested switch) | **yes** (sealed iface) | explicit `default:` | ~30 LOC Mermaid emitter | **best** |
+| **qmuntal/stateless** (port of .NET Stateless) | 1.36k, v1.8.0 2026-02, maintained | yes (substates, guards, entry/exit, internal/ignored) | `any` | `error` + `OnUnhandledTrigger` + `PermittedTriggers` | DOT | best library fallback if hierarchy grows |
+| **looplab/fsm** | 3.4k, v1.0.3 2025-05, maintained | flat | strings | typed errors | **DOT+Mermaid** | only for flat machines wanting free diagrams |
+| cocoonspace/fsm | 89, dormant 2021 | flat | int | `bool` no-op | — | lock-free but dead; DIY beats it |
+| true Harel statecharts (gstate, statechartx) | ≤10, <1yr, single-author | parallel+history | varies | varies | varies | only if we truly need parallel regions; unproven |
+| Temporal / Cadence | large, maintained | n/a | n/a | n/a | n/a | **overkill** — external cluster+DB, durable replay, wrong latency class |
+
+Decision: hand-roll; keep **qmuntal/stateless** as the fallback if one machine grows deep
+hierarchy/guards faster than we want to hand-maintain (its `error`-on-illegal-trigger and
+`PermittedTriggers()` are the most useful library features for our "reject illegal
+transitions" requirement, at the cost of `any`-typed states). Add a tiny Mermaid emitter
+over the hand-rolled table so we keep the visualization the libraries advertise.
+
+## Part 6 — Formal design tied to code, and making it authoritative
+
+The user requirement: the formal design is **authoritative** — a coding agent should be
+unable to silently change implementation behavior without it being caught against the
+spec; the default path is "update the spec and re-verify," not "edit the code and ignore
+the spec." This is a *conformance + enforcement* problem, in three layers.
+
+### 6.1 The source of truth & design-time check
+
+Write the concurrency-critical core — the **M2↔M3 boundary** (turn detection ↔ response:
+barge-in, ResponseCancel, speech_stopped, the dual-writer race) — as a **FizzBee** spec
+and **model-check it in CI**. Keep the spec small and focused on M2↔M3; that is where the
+real safety/liveness properties (Part 4 invariants 1–4) live. (FizzBee is the adopted
+model checker — see Part 6.4.)
+
+### 6.2 The conformance bridge (code ↔ spec)
+
+The honest finding: design-time model checking is well-supported; the *Go conformance
+bridge is thin everywhere* and needs per-spec glue. Two layers, adopted together:
+
+1. **FizzBee MBT** — the authoritative layer. The `.fizz` spec is model-checked, and
+   `fizz mbt-scaffold --lang go` generates Go interfaces + a `go test` harness; you
+   implement adapters mapping model actions→code and `StateGetter`→state. Conformance
+   runs as plain `go test` — the cleanest CI fit. Risk: pre-1.0, essentially one
+   maintainer (pin a version + sha256, vendor examples).
+2. **Ginkgo/Gomega seeded property tests** — the Go-native floor. A small Go model
+   (the test's `open`/`registered` shadow) is the oracle; a fixed-seed random walk
+   drives random event sequences against the `Coordinator`, asserting the Part-4
+   invariants after each step / per seed. It checks the *implementation* against a Go
+   oracle — it complements, but does not replace, the FizzBee check of the *design*.
+   (We originally specced `pgregory.net/rapid` here for its `(*T).Repeat` driver and
+   automatic shrinking, but LocalAI mandates Ginkgo/Gomega for all tests — its
+   `forbidigo` lint forbids stdlib `testing` assertions — and `rapid.Check` needs a
+   concrete `*testing.T`/`*rapid.T` that cannot run inside a Ginkgo `It`. Rather than
+   weaken the lint gate with an exclusion, the property layer is hand-rolled seeded
+   walks: fixed seeds make every failure reproducible, at the cost of `rapid`'s
+   automatic shrinking. `rapid` is consequently not a direct dependency.)
+
+These compose: model-check the design (6.1) for "the design is right"; conformance-test
+the code (6.2) for "the code matches the design." Add `go test -race` (with `-cpu=1,2,4`,
+repeated runs) over the stateful tests for interleaving-bug discovery, and Go native
+fuzzing over the *same* harness for coverage-guided sequence exploration + a committable
+regression corpus. (`testing/quick` is frozen — do not use.)
+
+There is no viable single-source-of-truth codegen (one spec compiled into both the runtime
+Go and the model) for retrofitting existing Go — the candidates are research-grade and
+greenfield-only. Our practical substitute is the CI gate below plus a single Go transition
+table that emits both the diagram and the test action set.
+
+### 6.3 Enforcement — making the design un-ignorable for agents
+
+Structural enforcement, leveraging this repo's existing non-bypassable gate culture
+(pre-commit + monotonic ratchets; `--no-verify` is forbidden, baselines never lowered):
+
+1. **Add a `realtime-conformance` gate** to the pre-commit/CI pipeline that runs (a) the
+   model check (6.1) and (b) the conformance bridge (6.2). A behavior change that does not
+   conform turns the gate **red**; the only green paths are *make the code conform* or
+   *update the spec* — and updating the spec re-triggers the model check, so an illegal
+   design is rejected too. This is the actual mechanism that makes "update the design and
+   verify" the default rather than optional.
+2. **Treat the spec as a ratchet artifact** like coverage: the gate must not be weakened,
+   the spec not deleted, the build tag not silently disabled.
+3. **Write an `.agents/realtime-state-machines.md` guide** (indexed from `CLAUDE.md`)
+   stating the spec is the source of truth: change the spec first, re-run the gate, then
+   implement. The doc is secondary; the gate is what enforces it.
+
+### 6.4 Decided stack
+
+- **Implementation:** hand-rolled sealed-state transition functions + single-writer actor
+  (Parts 3.1–3.2).
+- **Design-time + conformance:** **FizzBee** (decided). `.fizz` spec is model-checked, and
+  `fizz`'s Go MBT generator (`mbt/generator/templates/go` → interfaces/adapters/test;
+  driven via a gRPC plugin in `mbt/lib/go`) produces a `go test` conformance harness
+  whose adapters map model actions → our actor and `StateGetter` → our state. Go is a
+  first-class MBT target (Go + Rust are the only two). Verified 2026-06: Apache-2.0,
+  v0.5.2, prebuilt linux/macos×x86/arm binaries, ships Claude Code skills
+  (`/fizz-spec|check|debug|mbt`) for the spec-authoring loop.
+- **Go-native layer:** **Ginkgo/Gomega seeded property tests** run alongside — they
+  check the *implementation*, complementing (not substituting for) the FizzBee check
+  of the *design*. Skipping FizzBee is NOT "degrading to the Go layer": the design
+  authority would be gone. The gate is therefore **fail-closed** (see Enforcement).
+  (Originally specced as `rapid`; switched to Ginkgo/Gomega to satisfy LocalAI's
+  Ginkgo-only `forbidigo` lint without weakening that gate — see Part 6.2.)
+- **Enforcement:** the `realtime-conformance` pre-commit/CI gate + `.agents/` guide
+  (Part 6.3).
+
+FizzBee risk mitigations (decided):
+- The gate is **fail-closed**: a missing FizzBee is a hard failure, never a silent skip.
+  The only bypass is the explicit, loud `REALTIME_CONFORMANCE_SKIP_FIZZBEE=1` (local
+  only; CI never sets it; pre-commit runs the gate on any `coordinator/**`,
+  `{resp,turn,conn,compact,tts}coord/**`, or `formal-verification/**` change so a pure `.fizz` edit still re-verifies).
+- CI **pins the FizzBee release binary by version + sha256** (`formal-verification/fizzbee.sha256`,
+  all four platforms, digests from the GitHub release; installer verifies before extract,
+  CI caches it). Not go-gettable: `pkg/modelchecker` imports the Bazel-internal `fizz/proto`
+  with no committed `.pb.go`, so a plain `go get` won't build — hence the pinned binary.
+- Keep the `.fizz` model **portable** (no exotic features) so it stays re-expressible in
+  another model checker if FizzBee is ever abandoned — lock-in is at the tooling layer
+  only, not the design.
+
+## Open questions (decide before implementing)
+
+- **Scope of the actor refactor**: full single-writer per session, or incrementally
+  migrate one machine at a time behind the existing locks? (Suggest: M3 response
+  coordination first — it has the load-bearing dual-writer bug.)
+
+Resolved: **FSM library vs hand-rolled** → hand-rolled sealed-state tables,
+qmuntal/stateless fallback (Part 5). **Conformance bridge** → FizzBee (model-check + Go
+MBT) with a Ginkgo/Gomega seeded-property Go-native floor as hedge (Part 6.4). **Single-source-of-truth codegen**
+(PGo/MPCal) → not viable (research-grade, greenfield-only); substitute is the CI
+conformance gate (Part 6.3).
+
+**Agentic turn semantics** → invariant #2 is **one `response.done` per `response.create`**
+(OpenAI-faithful); the server-side `AssistantExecutor` tool loop becomes internal
+sub-states of a single response rather than emitting one terminal per turn. Verified safe
+in-tree: the current `response.done` carries only `{id, object, status}` (`Output`/`Usage`
+never populated), the React UI (`Talk.jsx:330`) reads only `status`, every unit test
+already asserts `ResponseDone == 1` for tool turns, no test expects multiplicity, and the
+server-side recursion is untested. Collapsing also fixes a latent "Listening…" flicker
+mid-agentic-loop. The client-driven tool loop (fresh `response.create` per round-trip)
+legitimately keeps one terminal each — unaffected. Follow-up (since done — see
+Implementation status): populate `Output` + `Usage` in the single terminal.
diff --git a/formal-verification/README.md b/formal-verification/README.md
new file mode 100644
index 000000000..7f38e1462
--- /dev/null
+++ b/formal-verification/README.md
@@ -0,0 +1,142 @@
+# Formal verification — realtime state machines
+
+Formal designs (FizzBee specs) for the realtime API state machines and the harness
+that keeps the Go implementation provably in step with them. Background and
+rationale: [../docs/design/realtime-state-machines.md](../docs/design/realtime-state-machines.md) (Part 6).
+
+The design is **authoritative**: behaviour changes go through the spec first, then
+the implementation is checked against it. The `realtime-conformance` gate makes
+that the path of least resistance — you cannot land a non-conforming change green.
+
+## What's here
+
+| File | Role |
+|------|------|
+| `response_lifecycle.fizz` | **Authoritative** FizzBee model of machine M3 (response coordination). Model-checked + drives the Go MBT conformance harness. |
+| `turn_lifecycle.fizz` | **Authoritative** FizzBee model of machine M2 (turn detection): the speechStarted / turn-open coupling. |
+| `conn_lifecycle.fizz` | **Authoritative** FizzBee model of machine M1 (connection lifecycle): VAD toggle + once-only teardown. |
+| `compaction.fizz` | **Authoritative** FizzBee model of machine M4 (conversation compaction): single-flight. |
+| `tts_pipeline.fizz` | **Authoritative** FizzBee model of machine M5 (TTS pipeline): open->closing->closed, idempotent close. |
+| `session_lifecycle.fizz` | **Composition** spec: the M1–M5 hierarchy — conn (M1) is the parent; when it is torn down, every child (vad/M2, resp/M3, compaction/M4) is terminal. Models the relationship the per-machine specs can't express. |
+| `fizzbee.sha256` | Pinned checksum(s) of the FizzBee release the gate uses (created on first `install-fizzbee.sh` run). |
+
+The implementations under test live in
+[`core/http/endpoints/openai/respcoord`](../core/http/endpoints/openai/respcoord) (M3),
+[`core/http/endpoints/openai/turncoord`](../core/http/endpoints/openai/turncoord) (M2),
+[`core/http/endpoints/openai/conncoord`](../core/http/endpoints/openai/conncoord) (M1),
+[`core/http/endpoints/openai/compactcoord`](../core/http/endpoints/openai/compactcoord) (M4),
+and [`core/http/endpoints/openai/ttscoord`](../core/http/endpoints/openai/ttscoord) (M5).
+
+## Running the gate
+
+```sh
+make test-realtime-conformance
+# or directly:
+./scripts/realtime-conformance.sh
+```
+
+Two layers, **both required — the gate is fail-closed**:
+
+1. **Go-native conformance** — the `respcoord` + `turncoord` + `conncoord` + `compactcoord` + `ttscoord` transition-table
+   tests + Ginkgo/Gomega seeded property (random-walk) tests under `-race`
+   (checks the implementation), plus the shared `coordinator` runtime they all
+   build on. Also run as part of `make test` (they're ordinary Go packages with a
+   Ginkgo suite each). The five machines reduce to their sealed State/Event/Effect
+   types + a pure `Next`; the single-writer Coordinator/Sink plumbing lives once in
+   `core/http/endpoints/openai/coordinator` (a generic `Coordinator[S,E,F]`).
+2. **FizzBee model check** — model-checks the authoritative `.fizz` specs (checks
+   the design). **A missing FizzBee is a hard failure, not a skip** — otherwise
+   the design verification silently disappears whenever the tool is inconvenient,
+   which is the whole thing we're trying to prevent.
+
+FizzBee is pinned and checksum-verified (`fizzbee.sha256`), so "couldn't install"
+is not a reason to skip — run `make install-fizzbee`. The **only** way to skip is
+the explicit, loud `REALTIME_CONFORMANCE_SKIP_FIZZBEE=1` opt-out, intended for
+local work on unrelated code. CI never sets it, and `pre-commit` runs the full
+gate whenever `coordinator/**`, `respcoord/**`, `turncoord/**`, `conncoord/**`, `compactcoord/**`, `ttscoord/**`, or `formal-verification/**` is
+staged (so a pure `.fizz` edit still re-verifies).
+
+## Installing FizzBee (pinned)
+
+FizzBee is pre-1.0 and single-maintainer, so we pin a version + sha256 and use the
+prebuilt release tarball (its primary build is Bazel — it is **not** go-gettable:
+the `pkg/modelchecker` library imports the Bazel-internal `fizz/proto` with no
+committed `.pb.go`, so a plain `go get` won't build it).
+
+```sh
+make install-fizzbee                  # = scripts/install-fizzbee.sh (default v0.5.2)
+```
+
+The four platform assets are pinned by sha256 in `fizzbee.sha256` (digests taken
+from the GitHub release); the installer verifies before extracting. Heads-up: the
+Linux bundles are large (~290–350 MB, because `parser_bin` embeds a full runtime),
+macOS ~36 MB. CI caches `.tools/fizzbee` keyed on the pin so it downloads once.
+
+This unpacks a **self-contained** directory under `.tools/fizzbee/` (gitignored):
+
+```
+.tools/fizzbee/
+  fizz                              -> stable symlink the gate auto-detects
+  fizzbee-v0.5.2-linux_x86/
+    fizz            # CLI wrapper (entrypoint)
+    parser/parser_bin # the .fizz frontend, BUNDLED (no system Python needed)
+    fizzbee         # Go model-checker binary
+    fizz.env        # resolves the above paths relative to `fizz`
+    mbt_gen.zip     # MBT generator (this one DOES need system python)
+```
+
+Keep the directory intact — `fizz.env` resolves its siblings relative to the
+`fizz` wrapper. The gate auto-detects `.tools/fizzbee/fizz`; override with
+`FIZZBEE_BIN` only if you installed elsewhere (point it at the `fizz` wrapper,
+not the raw `fizzbee` binary).
+
+First `install-fizzbee.sh` run prints the computed sha256; record it in
+`fizzbee.sha256` as `<sha256>  <asset>` and commit so later runs verify the pin.
+
+> CLI facts (validate against the pinned version — FizzBee is pre-1.0): the CLI
+> is `fizz [flags] <spec.fizz>` (default = exhaustive BFS); there is **no `run`
+> subcommand**. The checker can print `FAILED`/`DEADLOCK` while still exiting 0,
+> so the gate scans output for those markers in addition to the exit code.
+> Model-checking needs only the bundled `parser_bin` (no Python); only
+> `mbt-scaffold` shells out to system `python`.
+
+## Reproducing the bug the spec catches
+
+Each spec models the **correct** design, so it passes; each documents how to
+reproduce the legacy bug it guards against:
+
+- `response_lifecycle.fizz` (M3): change `atomic func start()` to
+  `serial func start()` — the checker reports `AtMostOneLive` violated (the
+  dual-writer race). Pinned deterministically in Go by the respcoord
+  "legacy dual-writer characterization" spec.
+- `turn_lifecycle.fizz` (M2): in `Abort`, delete `self.speech = 0` (clear only
+  the turn, as the legacy `discardTurn` did) — the checker reports `Coupled`
+  violated (the speechStarted/turn-open desync that suppressed the next onset).
+- `conn_lifecycle.fizz` (M1): in `Close`, delete `self.torn = 1` — the checker
+  reports `TeardownOnce` violated (the legacy double-teardown / double-close
+  hazard when a session reaches teardown from more than one exit path).
+- `compaction.fizz` (M4): in `Trigger`, drop the `self.active == 0` condition from
+  the guard — the checker reports `SingleFlight` violated (two goroutines compacting
+  the same overflow concurrently, the race the `compacting` CAS prevents).
+- `tts_pipeline.fizz` (M5): in `Close`, delete the `if self.phase == 0` guard —
+  the checker reports `WakeOnce` violated (a non-idempotent wait() that wakes /
+  joins the worker more than once).
+- `session_lifecycle.fizz` (hierarchy): in `Teardown`, delete `self.compaction = 2`
+  — the checker reports `ChildrenDieWithParent` violated. This is the real M4 gap:
+  a fire-and-forget compaction outliving the torn session. The fix is `conncoord`'s
+  teardown cancelling + joining each conversation's compaction (and respcoord/
+  compactcoord gained an absorbing `Terminated` state so no child can start after
+  teardown).
+
+## Adding another machine
+
+All five mapped machines (M1–M5) have landed. To add a new sealed-state machine:
+
+1. Add `<machine>.fizz` here (with an `always assertion`; verify non-vacuity by
+   breaking one guard and confirming the checker fails).
+2. Implement it as a sealed-state package under `core/http/endpoints/openai/`.
+3. Add transition-table + Ginkgo/Gomega seeded property conformance tests
+   (one `*_suite_test.go` bootstrap per package; LocalAI mandates Ginkgo/Gomega).
+4. The gate picks up new `*.fizz` specs automatically; add the new Go package to
+   the `-race` test list in `scripts/realtime-conformance.sh` (and the path
+   filters in `.githooks/pre-commit` + `.github/workflows/realtime-conformance.yml`).
diff --git a/formal-verification/compaction.fizz b/formal-verification/compaction.fizz
new file mode 100644
index 000000000..b96955204
--- /dev/null
+++ b/formal-verification/compaction.fizz
@@ -0,0 +1,57 @@
+---
+# Authoritative formal design for realtime machine M4: conversation compaction.
+#
+# Companion to:
+#   - docs/design/realtime-state-machines.md  (the map + invariants)
+#   - core/http/endpoints/openai/compactcoord (the Go implementation)
+#
+# The Go MBT adapter maps each action below onto compactcoord.Coordinator.Apply
+# and the StateGetter onto compactcoord.Coordinator.State, so this spec is the
+# source of truth the implementation is checked against.
+#
+# The property: at most one background compaction runs per conversation at a time,
+# so two goroutines never summarize+evict the same overflow concurrently (Part 4,
+# invariant #9). The legacy guard is a `compacting atomic.Bool` CAS; here `active`
+# is the number of in-flight compactions, started only from Idle.
+#
+# NOTE: FizzBee is pre-1.0. Validate the exact syntax/CLI against the version
+# pinned in formal-verification/README.md before trusting the gate.
+deadlock_detection: false
+---
+
+role Compactor:
+    action Init:
+        self.active = 0   # compactions in flight -- MUST stay in {0,1}
+        self.torn = 0     # session torn down (Terminated) -- absorbing
+
+    # maybeCompact wants to start a compaction. THE FIX: it starts one only when
+    # none is running (single-flight) and not after teardown. To reproduce the
+    # legacy race where two goroutines could both compact the same overflow,
+    # drop the `self.active == 0` condition (so every untorn Trigger increments):
+    # the checker then reports SingleFlight violated.
+    atomic action Trigger:
+        if self.active == 0 and self.torn == 0:
+            self.active += 1   # StartCompaction
+
+    # The background compaction goroutine finished (success, error, or timeout).
+    atomic action Finished:
+        if self.active > 0:
+            self.active -= 1
+
+    # Teardown: the connection (M1) parent cancels + joins the in-flight
+    # compaction, then terminates the coordinator so none can start afterwards.
+    atomic action Shutdown:
+        self.active = 0    # cancelled + joined
+        self.torn = 1
+
+action Init:
+    c = Compactor()
+
+# SAFETY: at most one compaction is ever in flight (Part 4, invariant #9).
+always assertion SingleFlight:
+    return c.active >= 0 and c.active <= 1
+
+# SAFETY: no compaction is in flight once torn (it was cancelled + joined at
+# teardown), so none outlives the session.
+always assertion NoneAfterTeardown:
+    return c.torn == 0 or c.active == 0
diff --git a/formal-verification/conn_lifecycle.fizz b/formal-verification/conn_lifecycle.fizz
new file mode 100644
index 000000000..7aabf3a1f
--- /dev/null
+++ b/formal-verification/conn_lifecycle.fizz
@@ -0,0 +1,60 @@
+---
+# Authoritative formal design for realtime machine M1: connection lifecycle.
+#
+# Companion to:
+#   - docs/design/realtime-state-machines.md  (the map + invariants)
+#   - core/http/endpoints/openai/conncoord    (the Go implementation)
+#
+# The Go MBT adapter maps each action below onto conncoord.Coordinator.Apply and
+# the StateGetter onto conncoord.Coordinator.State, so this spec is the source of
+# truth the implementation is checked against.
+#
+# The legacy hazard (Part 2, failure mode 6 / invariants #8, #10): a single `done`
+# channel reassigned on every VAD toggle and closed from two sites (toggle-off and
+# teardown) guarded only by a vadServerStarted bool. Modeled here as `running`
+# (the VAD goroutine's done channel is live) and `torn` (teardown happened).
+#
+# NOTE: FizzBee is pre-1.0. Validate the exact syntax/CLI against the version
+# pinned in formal-verification/README.md before trusting the gate.
+deadlock_detection: false
+---
+
+role Conn:
+    action Init:
+        self.running = 0     # VAD goroutine running (its done channel is live)
+        self.torn = 0        # teardown has happened
+        self.teardowns = 0   # how many times teardown ran -- MUST stay <= 1
+
+    # session.update toggled turn detection on. No-op after teardown (the legacy
+    # reassign-and-spawn must never resurrect a torn session).
+    atomic action VadOn:
+        if self.torn == 0:
+            self.running = 1
+
+    # session.update toggled turn detection off (close the running done channel).
+    atomic action VadOff:
+        if self.torn == 0:
+            self.running = 0
+
+    # Transport read loop ended / session closing. THE FIX: setting torn absorbs
+    # every later Close, so teardown's channel closes happen exactly once. To
+    # reproduce the legacy double-teardown hazard, delete `self.torn = 1` below:
+    # the checker then reports TeardownOnce violated (Close runs teardown again).
+    atomic action Close:
+        if self.torn == 0:
+            self.running = 0    # StopVAD if it was running (close-once)
+            self.teardowns += 1 # Teardown
+            self.torn = 1
+
+action Init:
+    c = Conn()
+
+# SAFETY: teardown runs at most once -- the done/decode/sound channels are closed
+# exactly once, never double-closed (Part 4, invariant #10).
+always assertion TeardownOnce:
+    return c.teardowns <= 1
+
+# SAFETY: the VAD goroutine is never (re)started after teardown -- no
+# send-after-close / no goroutine outliving the session (Part 4, invariant #8).
+always assertion NoRunAfterTorn:
+    return not (c.torn == 1 and c.running == 1)
diff --git a/formal-verification/fizzbee.sha256 b/formal-verification/fizzbee.sha256
new file mode 100644
index 000000000..342abb569
--- /dev/null
+++ b/formal-verification/fizzbee.sha256
@@ -0,0 +1,4 @@
+00011bbfe9bf4c7bcb03a5bf1f5b7fe7390111ad6f0611c6be71e8692504da4e  fizzbee-v0.5.2-linux_arm.tar.gz
+f494b7b2afcc7ce24575ed91a389b46bbbbe5976f9e4b5cd717327012f5e0395  fizzbee-v0.5.2-linux_x86.tar.gz
+aab223e0bac8f0c052cf774dc25872f72c138da30f4079b914bb9c8921910904  fizzbee-v0.5.2-macos_arm.tar.gz
+6293bd7ab90c79b8607dc9fb2f09407fde0e11ac6596e884bef7f660178597fa  fizzbee-v0.5.2-macos_x86.tar.gz
diff --git a/formal-verification/response_lifecycle.fizz b/formal-verification/response_lifecycle.fizz
new file mode 100644
index 000000000..e1394181c
--- /dev/null
+++ b/formal-verification/response_lifecycle.fizz
@@ -0,0 +1,83 @@
+---
+# Authoritative formal design for realtime machine M3: response coordination.
+#
+# Companion to:
+#   - docs/design/realtime-state-machines.md  (the map + invariants)
+#   - core/http/endpoints/openai/respcoord    (the Go implementation)
+#
+# The Go MBT adapter maps each action below onto respcoord.Coordinator.Apply
+# and the StateGetter onto respcoord.Coordinator.State, so this spec is the
+# source of truth the implementation is checked against.
+#
+# NOTE: FizzBee is pre-1.0. Validate the exact syntax/CLI against the version
+# pinned in formal-verification/README.md before trusting the gate.
+deadlock_detection: false
+---
+
+# Bound the number of responses so the state space is finite.
+MAX_RESPONSES = 4
+
+role Session:
+    action Init:
+        self.live = 0          # number of live responses -- MUST stay in {0,1}
+        self.registered = 0    # id of the active response (0 = none)
+        self.next_id = 0
+        self.torn = 0          # session torn down (Terminated) -- absorbing
+
+    # startResponse as ONE indivisible transition -- this is the single-writer
+    # actor guarantee. Superseding an active response emits its cancelled
+    # terminal (live -= 1) BEFORE spawning the replacement (live += 1), so the
+    # net live count never exceeds 1.
+    #
+    # To reproduce the LEGACY dual-writer race from Part 2 of the design doc,
+    # change `atomic func` to `serial func`: the checker then interleaves two
+    # callers between the cancel and the spawn and reports AtMostOneLive
+    # violated -- exactly the bug TestLegacyMechanismCanDoubleStart pins in Go.
+    atomic func start():
+        if self.registered != 0:
+            self.live -= 1         # cancel + cancelled-terminal for the old
+            self.registered = 0
+        self.next_id += 1
+        self.live += 1             # spawn + register the replacement
+        self.registered = self.next_id
+
+    # client read-loop path: response.create / manual input_audio_buffer.commit.
+    # Rejected once torn (no response starts after teardown).
+    atomic action StartFromClient:
+        require self.next_id < MAX_RESPONSES
+        require self.torn == 0
+        self.start()
+
+    # VAD goroutine path: end-of-speech commit / barge-in. Rejected once torn.
+    atomic action StartFromVad:
+        require self.next_id < MAX_RESPONSES
+        require self.torn == 0
+        self.start()
+
+    # a response reaches its own terminal (response.done completed)
+    atomic action FinishCurrent:
+        if self.registered != 0:
+            self.live -= 1
+            self.registered = 0
+
+    # explicit response.cancel with nothing newer queued
+    atomic action CancelReq:
+        if self.registered != 0:
+            self.live -= 1
+            self.registered = 0
+
+    # session teardown (M1 parent): cancel any in-flight response and go to the
+    # absorbing Terminated state, after which no response can start. This is what
+    # lets the connection's teardown guarantee no response outlives the session.
+    atomic action Shutdown:
+        if self.registered != 0:
+            self.live -= 1
+            self.registered = 0
+        self.torn = 1
+
+action Init:
+    s = Session()
+
+# SAFETY: at most one live response at any instant (Part 4, invariant #1).
+always assertion AtMostOneLive:
+    return s.live >= 0 and s.live <= 1
diff --git a/formal-verification/session_lifecycle.fizz b/formal-verification/session_lifecycle.fizz
new file mode 100644
index 000000000..e767831cc
--- /dev/null
+++ b/formal-verification/session_lifecycle.fizz
@@ -0,0 +1,84 @@
+---
+# Authoritative formal design for the realtime session lifecycle HIERARCHY:
+# how the per-machine coordinators (M1-M5) relate as one statechart.
+#
+# The five machines (respcoord/turncoord/conncoord/compactcoord/ttscoord) are
+# implemented as separate single-writer coordinators, but they are not
+# independent: M1 (connection) is the PARENT region, and its children must
+# terminate when it does. This spec models that relationship — the property no
+# single per-machine spec can express — without merging the Go code.
+#
+# Companion to:
+#   - docs/design/realtime-state-machines.md  (the map + invariants #8/#10)
+#   - the per-machine specs (response_lifecycle / turn_lifecycle / conn_lifecycle
+#     / compaction / tts_pipeline) which check each machine in isolation.
+#
+# Regions modeled here are M1's DIRECT children — the ones the connection
+# goroutine owns and tears down:
+#   conn        M1: 0 live, 1 torn
+#   vad         M2: 0 stopped, 1 running, 2 terminated (handleVAD goroutine joined)
+#   resp        M3: 0 idle,   1 active,  2 terminated (respcoord Terminated)
+#   compaction  M4: 0 idle,   1 running, 2 terminated (compactcoord Terminated)
+# M5 (TTS) is nested UNDER a response (each response owns its TTS pipeline), so
+# "resp terminated => tts closed" is an M3-internal relationship, not a direct
+# child of conn; it is covered by tts_pipeline.fizz + the response path.
+#
+# NOTE: FizzBee is pre-1.0. Validate the exact syntax/CLI against the version
+# pinned in formal-verification/README.md before trusting the gate.
+deadlock_detection: false
+---
+
+role Session:
+    action Init:
+        self.conn = 0
+        self.vad = 0
+        self.resp = 0
+        self.compaction = 0
+
+    # Children may only START work while the connection is live: no goroutine is
+    # spawned after teardown (no resurrection / no send-after-close).
+    atomic action VadStart:
+        if self.conn == 0 and self.vad == 0:
+            self.vad = 1
+    atomic action VadStop:
+        if self.conn == 0 and self.vad == 1:
+            self.vad = 0
+    atomic action RespStart:
+        if self.conn == 0 and self.resp != 2:
+            self.resp = 1
+    atomic action RespFinish:
+        if self.resp == 1:
+            self.resp = 0
+    atomic action CompTrigger:
+        if self.conn == 0 and self.compaction == 0:
+            self.compaction = 1
+    atomic action CompFinish:
+        if self.compaction == 1:
+            self.compaction = 0
+
+    # Parent teardown drives EVERY child to its terminal state in one step: the
+    # connection goroutine stops + joins the VAD goroutine (vad->2), shuts down
+    # the response coordinator (resp->2), and cancels + joins the in-flight
+    # compaction (compaction->2). THE RELATIONSHIP: a torn parent implies all
+    # children terminal.
+    #
+    # To reproduce the real M4 gap (compaction left fire-and-forget, able to
+    # outlive the session), delete `self.compaction = 2` below: the checker then
+    # reports ChildrenDieWithParent violated (conn torn while compaction still
+    # running). Likewise dropping vad/resp reproduces a leaked VAD/response.
+    atomic action Teardown:
+        if self.conn == 0:
+            self.conn = 1
+            self.vad = 2
+            self.resp = 2
+            self.compaction = 2
+
+action Init:
+    s = Session()
+
+# SAFETY (the hierarchy invariant): once the connection is torn, every child is
+# terminal — no VAD goroutine, response, or compaction outlives the session
+# (Part 4, invariants #8/#10). The start guards above additionally make a child
+# starting after teardown unreachable.
+always assertion ChildrenDieWithParent:
+    return s.conn == 0 or (s.vad == 2 and s.resp == 2 and s.compaction == 2)
diff --git a/formal-verification/tts_pipeline.fizz b/formal-verification/tts_pipeline.fizz
new file mode 100644
index 000000000..36896f80f
--- /dev/null
+++ b/formal-verification/tts_pipeline.fizz
@@ -0,0 +1,53 @@
+---
+# Authoritative formal design for realtime machine M5: TTS pipeline lifecycle.
+#
+# Companion to:
+#   - docs/design/realtime-state-machines.md  (the map + invariants)
+#   - core/http/endpoints/openai/ttscoord     (the Go implementation)
+#
+# The Go MBT adapter maps each action below onto ttscoord.Coordinator.Apply and
+# the StateGetter onto ttscoord.Coordinator.State, so this spec is the source of
+# truth the implementation is checked against.
+#
+# The TTS pipeline's open->closing->closed lifecycle (the legacy `closed` bool +
+# `done` channel). Two writers: the producer raises Close (wait()), the worker
+# raises WorkerExited. `phase` is 0=open, 1=closing, 2=closed; `wakes` counts how
+# many times Close woke the worker to exit.
+#
+# NOTE: FizzBee is pre-1.0. Validate the exact syntax/CLI against the version
+# pinned in formal-verification/README.md before trusting the gate.
+deadlock_detection: false
+---
+
+role Pipeline:
+    action Init:
+        self.phase = 0   # 0 open, 1 closing, 2 closed -- monotonic
+        self.wakes = 0   # Close->Closing transitions (worker wakeups to exit)
+
+    # wait() called (producer). THE FIX: it advances to closing and wakes the
+    # worker only from open, so wait() is idempotent. To reproduce the legacy
+    # double-wake hazard, drop the `if self.phase == 0` guard (always wake): the
+    # checker then reports WakeOnce violated.
+    atomic action Close:
+        if self.phase == 0:
+            self.phase = 1
+            self.wakes += 1
+
+    # The worker drained the queue and observed the close (worker goroutine).
+    atomic action WorkerExited:
+        if self.phase == 1:
+            self.phase = 2
+
+action Init:
+    p = Pipeline()
+
+# SAFETY: the worker is woken-to-exit at most once -- the done channel is joined
+# exactly once, wait() is idempotent (Part 4, invariant #10).
+always assertion WakeOnce:
+    return p.wakes <= 1
+
+# SAFETY: phase stays in {0,1,2}. This is a range check only: the monotonic
+# open -> closing -> closed order comes from the action guards above, and clause
+# gating on phase 0 plus join-once live in Go / WakeOnce (Part 4, invariant #8).
+always assertion Monotonic:
+    return p.phase >= 0 and p.phase <= 2
diff --git a/formal-verification/turn_lifecycle.fizz b/formal-verification/turn_lifecycle.fizz
new file mode 100644
index 000000000..115a5c53a
--- /dev/null
+++ b/formal-verification/turn_lifecycle.fizz
@@ -0,0 +1,79 @@
+---
+# Authoritative formal design for realtime machine M2: turn detection.
+#
+# Companion to:
+#   - docs/design/realtime-state-machines.md  (the map + invariants)
+#   - core/http/endpoints/openai/turncoord    (the Go implementation)
+#
+# The Go MBT adapter maps each action below onto turncoord.Coordinator.Apply
+# and the StateGetter onto turncoord.Coordinator.State, so this spec is the
+# source of truth the implementation is checked against.
+#
+# The property this machine must guarantee is the COUPLING of two facts the
+# legacy code tracked in two separate variables that could disagree:
+#   - speech  -- handleVAD's speechStarted bool
+#   - turn    -- the semantic_vad live-stream-open flag (lts.open())
+# A discardTurn (no-speech clear / mode switch / teardown) closed the live
+# stream (turn -> 0) but left speechStarted set (speech stays 1). They then
+# disagreed, and the next onset was suppressed by `if !speechStarted` -- no
+# speech_started, no barge-in, no commit. See Part 2, failure mode 4.
+#
+# Here speech and turn are only ever driven TOGETHER, modelling the single
+# turncoord State (Idle <-> Speaking) where both facts are one value.
+#
+# NOTE: FizzBee is pre-1.0. Validate the exact syntax/CLI against the version
+# pinned in formal-verification/README.md before trusting the gate.
+deadlock_detection: false
+---
+
+# Bound the number of turns so the state space is finite.
+MAX_TURNS = 4
+
+role Detector:
+    action Init:
+        self.speech = 0   # speechStarted (0/1)
+        self.turn = 0     # live-stream / turn open (0/1)
+        self.turns = 0    # how many turns have been opened (bound)
+
+    # Onset: VAD reports speech while idle -> open a turn. ONE indivisible
+    # transition sets BOTH facts, so they cannot be left disagreeing. Re-onset
+    # while already speaking is a no-op (legacy `if !speechStarted`).
+    atomic action Onset:
+        require self.turns < MAX_TURNS
+        if self.speech == 0:
+            self.turns += 1
+            self.speech = 1
+            self.turn = 1
+
+    # Silence: VAD-confirmed end-of-speech past the dynamic threshold -> commit.
+    # Both facts clear together (EmitSpeechStopped + CommitTurn return to Idle).
+    atomic action Silence:
+        if self.speech == 1:
+            self.speech = 0
+            self.turn = 0
+
+    # Abort: no-speech clear / teardown -> discard. BOTH facts clear together.
+    # (A semantic->server mode switch only drops the orphaned live stream and
+    # lets the turn continue, so it is NOT an Abort -- see turncoord.go.)
+    # THE FIX: both facts are cleared in this one transition. To reproduce the
+    # legacy discardTurn bug, delete `self.speech = 0` below (clear only
+    # `self.turn`): the checker then reports Coupled violated, exactly the desync
+    # that suppressed the next onset.
+    atomic action Abort:
+        if self.turn == 1:
+            self.turn = 0
+            self.speech = 0
+
+action Init:
+    d = Detector()
+
+# SAFETY: speechStarted and turn-open never disagree -- they are one state, so
+# the legacy desync that suppressed the next onset is unrepresentable
+# (Part 4, invariant #4; failure mode 4).
+always assertion Coupled:
+    return d.speech == d.turn
+
+# SAFETY: at most one turn open at any instant -- `turn` is a 0/1 fact, never
+# incremented twice without a clear between (onset is a no-op while speaking).
+always assertion AtMostOneTurnOpen:
+    return d.turn >= 0 and d.turn <= 1
diff --git a/pkg/grpc/backend.go b/pkg/grpc/backend.go
index 838ab9865..de4c51488 100644
--- a/pkg/grpc/backend.go
+++ b/pkg/grpc/backend.go
@@ -119,6 +119,7 @@ type ControlBackend interface {
 	// NOT tracked as a single in-flight unit.
 	AudioTransformStream(ctx context.Context, opts ...grpc.CallOption) (AudioTransformStreamClient, error)
 	AudioToAudioStream(ctx context.Context, opts ...grpc.CallOption) (AudioToAudioStreamClient, error)
+	AudioTranscriptionLive(ctx context.Context, opts ...grpc.CallOption) (AudioTranscriptionLiveClient, error)
 
 	// Forward proxies a raw HTTP request to an upstream provider for
 	// passthrough-mode cloud-proxy backends. Caller streams a single
diff --git a/pkg/grpc/base/base.go b/pkg/grpc/base/base.go
index 55b0d96b6..d1aef3cb6 100644
--- a/pkg/grpc/base/base.go
+++ b/pkg/grpc/base/base.go
@@ -7,6 +7,7 @@ import (
 	"fmt"
 	"os"
 
+	"github.com/mudler/LocalAI/pkg/grpc/grpcerrors"
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 	gopsutil "github.com/shirou/gopsutil/v3/process"
 )
@@ -166,6 +167,11 @@ func (llm *Base) AudioTransformStream(in <-chan *pb.AudioTransformFrameRequest,
 	return fmt.Errorf("unimplemented")
 }
 
+func (llm *Base) AudioTranscriptionLive(in <-chan *pb.TranscriptLiveRequest, out chan<- *pb.TranscriptLiveResponse) error {
+	close(out)
+	return grpcerrors.LiveTranscriptionUnsupported("base", "not implemented by this backend")
+}
+
 func (llm *Base) AudioToAudioStream(in <-chan *pb.AudioToAudioRequest, out chan<- *pb.AudioToAudioResponse) error {
 	close(out)
 	return fmt.Errorf("unimplemented")
diff --git a/pkg/grpc/client.go b/pkg/grpc/client.go
index b80c74bcd..62865d53e 100644
--- a/pkg/grpc/client.go
+++ b/pkg/grpc/client.go
@@ -900,19 +900,22 @@ type AudioTransformStreamClient interface {
 }
 
 // audioTransformStreamClient is the concrete wrapper. It also owns the
-// underlying gRPC connection so it can be closed when the caller is done.
+// underlying gRPC connection, released once the receive side terminates —
+// NOT at CloseSend, because the server still streams responses (the tail of
+// the transform) after the client closes its send side. Same lifecycle as
+// forwardClient.
 type audioTransformStreamClient struct {
 	pb.Backend_AudioTransformStreamClient
-	conn   *grpc.ClientConn
-	closer func()
+	closeOnce sync.Once
+	closer    func()
 }
 
-func (s *audioTransformStreamClient) CloseSend() error {
-	err := s.Backend_AudioTransformStreamClient.CloseSend()
-	if s.closer != nil {
-		s.closer()
+func (s *audioTransformStreamClient) Recv() (*pb.AudioTransformFrameResponse, error) {
+	resp, err := s.Backend_AudioTransformStreamClient.Recv()
+	if err != nil && s.closer != nil {
+		s.closeOnce.Do(s.closer)
 	}
-	return err
+	return resp, err
 }
 
 func (c *Client) AudioTransformStream(ctx context.Context, opts ...grpc.CallOption) (AudioTransformStreamClient, error) {
@@ -944,7 +947,85 @@ func (c *Client) AudioTransformStream(ctx context.Context, opts ...grpc.CallOpti
 	}
 	return &audioTransformStreamClient{
 		Backend_AudioTransformStreamClient: stream,
-		conn:                               conn,
+		closer: func() {
+			_ = conn.Close()
+			cleanup()
+		},
+	}, nil
+}
+
+// AudioTranscriptionLiveClient is the duplex interface returned by
+// (*Client).AudioTranscriptionLive. Wraps the generated bidi client without
+// leaking the proto package across the public boundary.
+type AudioTranscriptionLiveClient interface {
+	Send(*pb.TranscriptLiveRequest) error
+	Recv() (*pb.TranscriptLiveResponse, error)
+	CloseSend() error
+	Context() context.Context
+}
+
+type audioTranscriptionLiveClient struct {
+	pb.Backend_AudioTranscriptionLiveClient
+	closeOnce sync.Once
+	closer    func()
+}
+
+// Recv releases the connection once the stream reaches a terminal state
+// (io.EOF after the server finishes, or any error). The conn MUST survive
+// CloseSend: the live protocol is close-send -> backend flushes the decode
+// tail -> terminal FinalResult arrives. Closing the conn inside CloseSend
+// killed that pending Recv with "grpc: the client connection is closing",
+// losing the final transcript (and its tail words) on every turn.
+func (s *audioTranscriptionLiveClient) Recv() (*pb.TranscriptLiveResponse, error) {
+	resp, err := s.Backend_AudioTranscriptionLiveClient.Recv()
+	if err != nil {
+		s.release()
+	}
+	return resp, err
+}
+
+func (s *audioTranscriptionLiveClient) release() {
+	s.closeOnce.Do(func() {
+		if s.closer != nil {
+			s.closer()
+		}
+	})
+}
+
+// AudioTranscriptionLive opens the bidirectional live ASR stream. Note the
+// same caveat as AudioToAudioStream: the watchdog busy-mark (and, on
+// non-parallel backends, opMutex) is held for the stream's lifetime, which
+// for a realtime session can be minutes — enable parallel requests on
+// backends meant to serve live sessions alongside unary work.
+func (c *Client) AudioTranscriptionLive(ctx context.Context, opts ...grpc.CallOption) (AudioTranscriptionLiveClient, error) {
+	if !c.parallel {
+		c.opMutex.Lock()
+	}
+	c.setBusy(true)
+	c.wdMark()
+
+	cleanup := func() {
+		c.wdUnMark()
+		c.setBusy(false)
+		if !c.parallel {
+			c.opMutex.Unlock()
+		}
+	}
+
+	conn, err := c.dial()
+	if err != nil {
+		cleanup()
+		return nil, err
+	}
+	client := pb.NewBackendClient(conn)
+	stream, err := client.AudioTranscriptionLive(ctx, opts...)
+	if err != nil {
+		_ = conn.Close()
+		cleanup()
+		return nil, err
+	}
+	return &audioTranscriptionLiveClient{
+		Backend_AudioTranscriptionLiveClient: stream,
 		closer: func() {
 			_ = conn.Close()
 			cleanup()
@@ -962,18 +1043,22 @@ type AudioToAudioStreamClient interface {
 	Context() context.Context
 }
 
+// audioToAudioStreamClient owns its gRPC connection, released once the
+// receive side terminates — NOT at CloseSend, because the server still
+// streams the response tail after the client closes its send side. Same
+// lifecycle as forwardClient.
 type audioToAudioStreamClient struct {
 	pb.Backend_AudioToAudioStreamClient
-	conn   *grpc.ClientConn
-	closer func()
+	closeOnce sync.Once
+	closer    func()
 }
 
-func (s *audioToAudioStreamClient) CloseSend() error {
-	err := s.Backend_AudioToAudioStreamClient.CloseSend()
-	if s.closer != nil {
-		s.closer()
+func (s *audioToAudioStreamClient) Recv() (*pb.AudioToAudioResponse, error) {
+	resp, err := s.Backend_AudioToAudioStreamClient.Recv()
+	if err != nil && s.closer != nil {
+		s.closeOnce.Do(s.closer)
 	}
-	return err
+	return resp, err
 }
 
 func (c *Client) AudioToAudioStream(ctx context.Context, opts ...grpc.CallOption) (AudioToAudioStreamClient, error) {
@@ -1005,7 +1090,6 @@ func (c *Client) AudioToAudioStream(ctx context.Context, opts ...grpc.CallOption
 	}
 	return &audioToAudioStreamClient{
 		Backend_AudioToAudioStreamClient: stream,
-		conn:                             conn,
 		closer: func() {
 			_ = conn.Close()
 			cleanup()
diff --git a/pkg/grpc/embed.go b/pkg/grpc/embed.go
index 2251dc707..265297669 100644
--- a/pkg/grpc/embed.go
+++ b/pkg/grpc/embed.go
@@ -198,6 +198,34 @@ func (e *embedBackend) AudioTransformStream(ctx context.Context, opts ...grpc.Ca
 	}, nil
 }
 
+func (e *embedBackend) AudioTranscriptionLive(ctx context.Context, opts ...grpc.CallOption) (AudioTranscriptionLiveClient, error) {
+	reqs := make(chan *pb.TranscriptLiveRequest, 4)
+	resps := make(chan *pb.TranscriptLiveResponse, 4)
+	srvDone := make(chan error, 1)
+
+	server := &embedBackendAudioTranscriptionLiveStream{
+		ctx:   ctx,
+		reqs:  reqs,
+		resps: resps,
+	}
+
+	go func() {
+		err := e.s.AudioTranscriptionLive(server)
+		// Stash the terminal error BEFORE closing resps: a caller blocked in
+		// Recv wakes on the close and must find the error (the ready-ack
+		// contract surfaces Unimplemented through that first Recv).
+		srvDone <- err
+		close(resps)
+	}()
+
+	return &embedBackendAudioTranscriptionLiveStreamClient{
+		ctx:     ctx,
+		reqs:    reqs,
+		resps:   resps,
+		srvDone: srvDone,
+	}, nil
+}
+
 func (e *embedBackend) Forward(ctx context.Context, opts ...grpc.CallOption) (ForwardClient, error) {
 	reqs := make(chan *pb.ForwardRequest, 8)
 	resps := make(chan *pb.ForwardReply, 8)
@@ -301,6 +329,8 @@ var _ pb.Backend_AudioTransformStreamServer = new(embedBackendAudioTransformStre
 var _ AudioTransformStreamClient = new(embedBackendAudioTransformStreamClient)
 var _ pb.Backend_AudioToAudioStreamServer = new(embedBackendAudioToAudioStream)
 var _ AudioToAudioStreamClient = new(embedBackendAudioToAudioStreamClient)
+var _ pb.Backend_AudioTranscriptionLiveServer = new(embedBackendAudioTranscriptionLiveStream)
+var _ AudioTranscriptionLiveClient = new(embedBackendAudioTranscriptionLiveStreamClient)
 
 // embedBackendAudioTransformStream is the server side of an in-process bidi
 // stream. The hosted server reads requests from `reqs` (closed by client when
@@ -397,6 +427,102 @@ func (e *embedBackendAudioTransformStreamClient) CloseSend() error {
 
 func (e *embedBackendAudioTransformStreamClient) Context() context.Context { return e.ctx }
 
+// embedBackendAudioTranscriptionLiveStream is the in-process server-side
+// handle for the bidirectional live ASR RPC. Mirrors
+// embedBackendAudioTransformStream — the hosted server reads requests from
+// `reqs` (closed by client when done sending) and writes responses to `resps`.
+type embedBackendAudioTranscriptionLiveStream struct {
+	ctx   context.Context
+	reqs  <-chan *pb.TranscriptLiveRequest
+	resps chan<- *pb.TranscriptLiveResponse
+}
+
+func (e *embedBackendAudioTranscriptionLiveStream) Send(resp *pb.TranscriptLiveResponse) error {
+	select {
+	case e.resps <- resp:
+		return nil
+	case <-e.ctx.Done():
+		return e.ctx.Err()
+	}
+}
+
+func (e *embedBackendAudioTranscriptionLiveStream) Recv() (*pb.TranscriptLiveRequest, error) {
+	select {
+	case req, ok := <-e.reqs:
+		if !ok {
+			return nil, io.EOF
+		}
+		return req, nil
+	case <-e.ctx.Done():
+		return nil, e.ctx.Err()
+	}
+}
+
+func (e *embedBackendAudioTranscriptionLiveStream) SetHeader(md metadata.MD) error  { return nil }
+func (e *embedBackendAudioTranscriptionLiveStream) SendHeader(md metadata.MD) error { return nil }
+func (e *embedBackendAudioTranscriptionLiveStream) SetTrailer(md metadata.MD)       {}
+func (e *embedBackendAudioTranscriptionLiveStream) Context() context.Context        { return e.ctx }
+func (e *embedBackendAudioTranscriptionLiveStream) SendMsg(m any) error {
+	if x, ok := m.(*pb.TranscriptLiveResponse); ok {
+		return e.Send(x)
+	}
+	return nil
+}
+func (e *embedBackendAudioTranscriptionLiveStream) RecvMsg(m any) error {
+	// gRPC bidi streaming uses Recv() directly; RecvMsg is unused on this path.
+	return nil
+}
+
+// embedBackendAudioTranscriptionLiveStreamClient is the caller-facing side.
+// It mirrors the server-side stream over the same channels.
+type embedBackendAudioTranscriptionLiveStreamClient struct {
+	ctx       context.Context
+	reqs      chan<- *pb.TranscriptLiveRequest
+	resps     <-chan *pb.TranscriptLiveResponse
+	srvDone   <-chan error
+	closeOnce bool
+}
+
+func (e *embedBackendAudioTranscriptionLiveStreamClient) Send(req *pb.TranscriptLiveRequest) error {
+	select {
+	case e.reqs <- req:
+		return nil
+	case <-e.ctx.Done():
+		return e.ctx.Err()
+	}
+}
+
+func (e *embedBackendAudioTranscriptionLiveStreamClient) Recv() (*pb.TranscriptLiveResponse, error) {
+	select {
+	case resp, ok := <-e.resps:
+		if !ok {
+			// Server-side finished. Surface its terminal error if any.
+			select {
+			case err := <-e.srvDone:
+				if err != nil {
+					return nil, err
+				}
+			default:
+			}
+			return nil, io.EOF
+		}
+		return resp, nil
+	case <-e.ctx.Done():
+		return nil, e.ctx.Err()
+	}
+}
+
+func (e *embedBackendAudioTranscriptionLiveStreamClient) CloseSend() error {
+	if e.closeOnce {
+		return nil
+	}
+	e.closeOnce = true
+	close(e.reqs)
+	return nil
+}
+
+func (e *embedBackendAudioTranscriptionLiveStreamClient) Context() context.Context { return e.ctx }
+
 // embedBackendAudioToAudioStream is the in-process server-side handle for
 // the bidirectional any-to-any audio RPC. Mirrors embedBackendAudioTransform
 // Stream — the hosted server reads requests from `reqs` (closed by client
diff --git a/pkg/grpc/grpcerrors/errors.go b/pkg/grpc/grpcerrors/errors.go
index a5b8c32ee..724d63547 100644
--- a/pkg/grpc/grpcerrors/errors.go
+++ b/pkg/grpc/grpcerrors/errors.go
@@ -33,3 +33,41 @@ func IsModelNotLoaded(err error) bool {
 	}
 	return strings.Contains(strings.ToLower(err.Error()), "model not loaded")
 }
+
+// LiveTranscriptionUnsupported returns the canonical error a backend returns
+// when it (or the loaded model) cannot serve the bidirectional
+// AudioTranscriptionLive RPC. It carries codes.Unimplemented deliberately:
+// that is also what gRPC itself returns for backends whose stubs predate the
+// RPC, so callers get one uniform "degrade to non-live transcription" signal.
+// (codes.FailedPrecondition is not used here — IsModelNotLoaded claims it.)
+func LiveTranscriptionUnsupported(backend, reason string) error {
+	return status.Errorf(codes.Unimplemented, "%s: live transcription unsupported: %s", backend, reason)
+}
+
+// IsLiveTranscriptionUnsupported reports whether err signals that live
+// transcription is not available for this backend/model: a gRPC Unimplemented
+// status, or — for errors that lost their status across non-gRPC boundaries — a
+// message containing "unimplemented". StreamTranscriptionUnsupported matches too.
+func IsLiveTranscriptionUnsupported(err error) bool {
+	if err == nil {
+		return false
+	}
+	if status.Code(err) == codes.Unimplemented {
+		return true
+	}
+	return strings.Contains(strings.ToLower(err.Error()), "unimplemented")
+}
+
+// StreamTranscriptionUnsupported returns the canonical error a backend returns
+// when it (or the loaded model) cannot serve the server-streaming
+// AudioTranscriptionStream RPC. It carries codes.Unimplemented like the live
+// signal, but its intent is the opposite: it is meant to be SURFACED to the
+// caller, not silently degraded. A backend must not decode the audio offline
+// and emit it as a single "delta" + final to fake a stream — a client that
+// asked for streaming has to learn the model cannot stream (qualitatively
+// identical output would otherwise hide a missing, possibly required,
+// capability). Callers wanting a plain transcript use the unary
+// AudioTranscription / non-streaming endpoint instead.
+func StreamTranscriptionUnsupported(backend, reason string) error {
+	return status.Errorf(codes.Unimplemented, "%s: streaming transcription unsupported: %s", backend, reason)
+}
diff --git a/pkg/grpc/grpcerrors/errors_test.go b/pkg/grpc/grpcerrors/errors_test.go
index a4b087761..932633510 100644
--- a/pkg/grpc/grpcerrors/errors_test.go
+++ b/pkg/grpc/grpcerrors/errors_test.go
@@ -34,4 +34,30 @@ var _ = Describe("grpcerrors", func() {
 	It("ModelNotLoaded carries FailedPrecondition", func() {
 		Expect(status.Code(grpcerrors.ModelNotLoaded("whisper"))).To(Equal(codes.FailedPrecondition))
 	})
+
+	DescribeTable("IsLiveTranscriptionUnsupported",
+		func(err error, want bool) {
+			Expect(grpcerrors.IsLiveTranscriptionUnsupported(err)).To(Equal(want))
+		},
+		Entry("nil", nil, false),
+		Entry("typed via constructor", grpcerrors.LiveTranscriptionUnsupported("parakeet-cpp", "not a streaming model"), true),
+		Entry("typed code only", status.Error(codes.Unimplemented, "anything"), true),
+		Entry("stale stub message (Unknown code)", errors.New("rpc error: method AudioTranscriptionLive unimplemented"), true),
+		Entry("unrelated error", errors.New("context deadline exceeded"), false),
+		Entry("model not loaded is NOT unsupported", grpcerrors.ModelNotLoaded("parakeet-cpp"), false),
+	)
+
+	It("LiveTranscriptionUnsupported carries Unimplemented, not FailedPrecondition", func() {
+		err := grpcerrors.LiveTranscriptionUnsupported("parakeet-cpp", "reason")
+		Expect(status.Code(err)).To(Equal(codes.Unimplemented))
+		// FailedPrecondition is claimed by IsModelNotLoaded — the two
+		// signals must never alias.
+		Expect(grpcerrors.IsModelNotLoaded(err)).To(BeFalse())
+	})
+
+	It("StreamTranscriptionUnsupported carries Unimplemented and is not ModelNotLoaded", func() {
+		err := grpcerrors.StreamTranscriptionUnsupported("parakeet-cpp", "not a streaming model")
+		Expect(status.Code(err)).To(Equal(codes.Unimplemented))
+		Expect(grpcerrors.IsModelNotLoaded(err)).To(BeFalse())
+	})
 })
diff --git a/pkg/grpc/interface.go b/pkg/grpc/interface.go
index 282735612..c968e7461 100644
--- a/pkg/grpc/interface.go
+++ b/pkg/grpc/interface.go
@@ -27,6 +27,7 @@ type AIModel interface {
 	VoiceEmbed(*pb.VoiceEmbedRequest) (pb.VoiceEmbedResponse, error)
 	AudioTranscription(context.Context, *pb.TranscriptRequest) (pb.TranscriptResult, error)
 	AudioTranscriptionStream(context.Context, *pb.TranscriptRequest, chan *pb.TranscriptStreamResponse) error
+	AudioTranscriptionLive(in <-chan *pb.TranscriptLiveRequest, out chan<- *pb.TranscriptLiveResponse) error
 	TTS(*pb.TTSRequest) error
 	TTSStream(*pb.TTSRequest, chan []byte) error
 	SoundGeneration(*pb.SoundGenerationRequest) error
diff --git a/pkg/grpc/server.go b/pkg/grpc/server.go
index 53522f114..c4c2785c8 100644
--- a/pkg/grpc/server.go
+++ b/pkg/grpc/server.go
@@ -266,6 +266,7 @@ func (s *server) AudioTranscription(ctx context.Context, in *pb.TranscriptReques
 	tresult.Text = result.Text
 	tresult.Language = result.Language
 	tresult.Duration = result.Duration
+	tresult.Eou = result.Eou
 	return tresult, nil
 }
 
@@ -290,6 +291,75 @@ func (s *server) AudioTranscriptionStream(in *pb.TranscriptRequest, stream pb.Ba
 	return err
 }
 
+// AudioTranscriptionLive is the bidirectional live ASR handler, shaped like
+// AudioTransformStream (recv → in chan, out chan → send). The backend must close
+// `out` on every return path: absent a Send error, sendDone fires only after that.
+func (s *server) AudioTranscriptionLive(stream pb.Backend_AudioTranscriptionLiveServer) error {
+	if s.llm.Locking() {
+		s.llm.Lock()
+		defer s.llm.Unlock()
+	}
+
+	in := make(chan *pb.TranscriptLiveRequest, 4)
+	out := make(chan *pb.TranscriptLiveResponse, 4)
+
+	// Pump incoming messages from the gRPC stream into `in`. EOF closes the
+	// channel, which signals the backend to finalize the decode session.
+	recvErrCh := make(chan error, 1)
+	go func() {
+		defer close(in)
+		for {
+			req, err := stream.Recv()
+			if err != nil {
+				if errors.Is(err, io.EOF) {
+					recvErrCh <- nil
+					return
+				}
+				recvErrCh <- err
+				return
+			}
+			select {
+			case in <- req:
+			case <-stream.Context().Done():
+				recvErrCh <- stream.Context().Err()
+				return
+			}
+		}
+	}()
+
+	// Pump outgoing responses from `out` to the gRPC stream. The backend
+	// closes `out` on completion.
+	sendDone := make(chan error, 1)
+	go func() {
+		for resp := range out {
+			if err := stream.Send(resp); err != nil {
+				sendDone <- err
+				// Drain `out` so the backend can finish.
+				for range out {
+				}
+				return
+			}
+		}
+		sendDone <- nil
+	}()
+
+	backendErr := s.llm.AudioTranscriptionLive(in, out)
+	sendErr := <-sendDone
+
+	// Unlike AudioTransformStream, do NOT wait for the recv pump when the
+	// backend failed: callers block on the first Recv for the ready ack, so
+	// an unsupported backend (Unimplemented) must surface immediately, not
+	// after the client gives up and closes its send side. Returning cancels
+	// the stream context, which unwinds the recv goroutine.
+	if backendErr != nil {
+		return backendErr
+	}
+	if sendErr != nil {
+		return sendErr
+	}
+	return <-recvErrCh
+}
+
 func (s *server) PredictStream(in *pb.PredictOptions, stream pb.Backend_PredictStreamServer) error {
 	if s.llm.Locking() {
 		s.llm.Lock()
diff --git a/pkg/grpc/transcription_live_conn_test.go b/pkg/grpc/transcription_live_conn_test.go
new file mode 100644
index 000000000..fbee3ed97
--- /dev/null
+++ b/pkg/grpc/transcription_live_conn_test.go
@@ -0,0 +1,63 @@
+package grpc
+
+import (
+	"context"
+	"io"
+	"net"
+
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+	gogrpc "google.golang.org/grpc"
+)
+
+// The embedded (test://) path never dials a *grpc.ClientConn, so it cannot
+// catch connection-lifecycle bugs. This suite runs the same live-ASR contract
+// over a real TCP connection: the terminal FinalResult arrives AFTER the
+// client closes its send side, so the conn must be released on terminal Recv
+// — releasing it inside CloseSend killed the pending Recv with "grpc: the
+// client connection is closing" and lost the decode tail on every turn.
+func startLiveASRServer() string {
+	lis, err := net.Listen("tcp", "127.0.0.1:0")
+	Expect(err).ToNot(HaveOccurred())
+
+	s := gogrpc.NewServer(serverOpts()...)
+	pb.RegisterBackendServer(s, &server{llm: &echoLiveASRModel{}})
+	go func() { _ = s.Serve(lis) }()
+	DeferCleanup(s.GracefulStop)
+
+	return lis.Addr().String()
+}
+
+var _ = Describe("AudioTranscriptionLive over a real connection", func() {
+	It("delivers the post-CloseSend FinalResult, then EOF releases the conn", func() {
+		c := NewClient(startLiveASRServer(), true, nil, false)
+
+		stream, err := c.AudioTranscriptionLive(context.Background())
+		Expect(err).NotTo(HaveOccurred())
+
+		Expect(stream.Send(&pb.TranscriptLiveRequest{
+			Payload: &pb.TranscriptLiveRequest_Config{Config: &pb.TranscriptLiveConfig{Language: "en"}},
+		})).To(Succeed())
+		ack, err := stream.Recv()
+		Expect(err).NotTo(HaveOccurred())
+		Expect(ack.GetReady()).To(BeTrue())
+
+		Expect(stream.Send(&pb.TranscriptLiveRequest{
+			Payload: &pb.TranscriptLiveRequest_Audio{Audio: &pb.TranscriptLiveAudio{Pcm: []float32{0.1, 0.2, 0.3}}},
+		})).To(Succeed())
+		delta, err := stream.Recv()
+		Expect(err).NotTo(HaveOccurred())
+		Expect(delta.GetDelta()).To(Equal("[3]"))
+
+		// The decisive step: the terminal message arrives after CloseSend.
+		Expect(stream.CloseSend()).To(Succeed())
+		final, err := stream.Recv()
+		Expect(err).NotTo(HaveOccurred(), "FinalResult must survive CloseSend — the conn may only close on terminal Recv")
+		Expect(final.GetFinalResult()).NotTo(BeNil())
+		Expect(final.GetFinalResult().GetText()).To(Equal("[3]"))
+
+		_, err = stream.Recv()
+		Expect(err).To(MatchError(io.EOF))
+	})
+})
diff --git a/pkg/grpc/transcription_live_test.go b/pkg/grpc/transcription_live_test.go
new file mode 100644
index 000000000..c4b428b2e
--- /dev/null
+++ b/pkg/grpc/transcription_live_test.go
@@ -0,0 +1,120 @@
+package grpc
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"io"
+	"strings"
+
+	"github.com/mudler/LocalAI/pkg/grpc/base"
+	"github.com/mudler/LocalAI/pkg/grpc/grpcerrors"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// echoLiveASRModel is a minimal AIModel exercising the AudioTranscriptionLive
+// contract: ready ack after the config, one delta per audio frame (eou set on
+// frames whose first sample is negative), final_result on close.
+type echoLiveASRModel struct {
+	base.SingleThread
+}
+
+func (m *echoLiveASRModel) AudioTranscriptionLive(in <-chan *pb.TranscriptLiveRequest, out chan<- *pb.TranscriptLiveResponse) error {
+	defer close(out)
+
+	first, ok := <-in
+	if !ok || first.GetConfig() == nil {
+		return errors.New("first message must carry a config")
+	}
+	out <- &pb.TranscriptLiveResponse{Ready: true}
+
+	var full strings.Builder
+	for req := range in {
+		audio := req.GetAudio()
+		if audio == nil {
+			continue
+		}
+		delta := fmt.Sprintf("[%d]", len(audio.Pcm))
+		full.WriteString(delta)
+		out <- &pb.TranscriptLiveResponse{
+			Delta: delta,
+			Eou:   len(audio.Pcm) > 0 && audio.Pcm[0] < 0,
+		}
+	}
+	out <- &pb.TranscriptLiveResponse{FinalResult: &pb.TranscriptResult{Text: full.String(), Eou: true}}
+	return nil
+}
+
+var _ = Describe("AudioTranscriptionLive RPC (in-process)", func() {
+	It("acks the config, streams deltas with eou flags, and finalizes on CloseSend", func() {
+		addr := "test://transcription-live-echo"
+		Provide(addr, &echoLiveASRModel{})
+		c := NewClient(addr, true, nil, false)
+
+		stream, err := c.AudioTranscriptionLive(context.Background())
+		Expect(err).NotTo(HaveOccurred())
+
+		Expect(stream.Send(&pb.TranscriptLiveRequest{
+			Payload: &pb.TranscriptLiveRequest_Config{Config: &pb.TranscriptLiveConfig{Language: "en"}},
+		})).To(Succeed())
+
+		// The ready ack must arrive before any audio is sent — this is what
+		// lets callers degrade synchronously when live ASR is unsupported.
+		ack, err := stream.Recv()
+		Expect(err).NotTo(HaveOccurred())
+		Expect(ack.Ready).To(BeTrue())
+
+		Expect(stream.Send(&pb.TranscriptLiveRequest{
+			Payload: &pb.TranscriptLiveRequest_Audio{Audio: &pb.TranscriptLiveAudio{Pcm: []float32{0.1, 0.2}}},
+		})).To(Succeed())
+		r1, err := stream.Recv()
+		Expect(err).NotTo(HaveOccurred())
+		Expect(r1.Delta).To(Equal("[2]"))
+		Expect(r1.Eou).To(BeFalse())
+
+		Expect(stream.Send(&pb.TranscriptLiveRequest{
+			Payload: &pb.TranscriptLiveRequest_Audio{Audio: &pb.TranscriptLiveAudio{Pcm: []float32{-0.5, 0.0, 0.5}}},
+		})).To(Succeed())
+		r2, err := stream.Recv()
+		Expect(err).NotTo(HaveOccurred())
+		Expect(r2.Delta).To(Equal("[3]"))
+		Expect(r2.Eou).To(BeTrue())
+
+		Expect(stream.CloseSend()).To(Succeed())
+
+		final, err := stream.Recv()
+		Expect(err).NotTo(HaveOccurred())
+		Expect(final.FinalResult).NotTo(BeNil())
+		Expect(final.FinalResult.Text).To(Equal("[2][3]"))
+		Expect(final.FinalResult.Eou).To(BeTrue())
+
+		_, err = stream.Recv()
+		Expect(errors.Is(err, io.EOF)).To(BeTrue())
+	})
+
+	It("surfaces Unimplemented from base.Base on the first Recv without CloseSend", func() {
+		// The ready-ack contract: a caller that sent its config and blocks on
+		// the first Recv must get the unsupported signal immediately — NOT
+		// after it gives up and closes the send side.
+		addr := "test://transcription-live-base"
+		Provide(addr, &base.SingleThread{})
+		c := NewClient(addr, true, nil, false)
+
+		stream, err := c.AudioTranscriptionLive(context.Background())
+		Expect(err).NotTo(HaveOccurred())
+
+		Expect(stream.Send(&pb.TranscriptLiveRequest{
+			Payload: &pb.TranscriptLiveRequest_Config{Config: &pb.TranscriptLiveConfig{}},
+		})).To(Succeed())
+
+		_, err = stream.Recv()
+		Expect(err).To(HaveOccurred())
+		Expect(grpcerrors.IsLiveTranscriptionUnsupported(err)).To(BeTrue())
+
+		// Degrading callers close the send side; in embed mode this is also
+		// what unwinds the server-side recv pump.
+		Expect(stream.CloseSend()).To(Succeed())
+	})
+})
diff --git a/pkg/model/initializers.go b/pkg/model/initializers.go
index 509e58e68..43273a662 100644
--- a/pkg/model/initializers.go
+++ b/pkg/model/initializers.go
@@ -9,8 +9,10 @@ import (
 	"time"
 
 	grpc "github.com/mudler/LocalAI/pkg/grpc"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 	"github.com/mudler/xlog"
 	"github.com/phayes/freeport"
+	"google.golang.org/protobuf/proto"
 )
 
 const (
@@ -53,7 +55,9 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string
 
 		xlog.Debug("Loading Model with gRPC", "modelID", modelID, "file", modelFile, "backend", backend, "options", *o)
 
-		// Distributed mode: delegate to the model router if set
+		// Distributed mode: delegate to the model router if set. No load
+		// event is emitted here: this branch runs per inference request and
+		// the actual load happens on the worker node.
 		ml.mu.Lock()
 		router := ml.modelRouter
 		ml.mu.Unlock()
@@ -62,110 +66,140 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string
 			return router(o.context, backend, modelID, modelName, modelFile, o.gRPCOptions, o.parallelRequests)
 		}
 
-		var client *Model
+		uri := ml.GetAllExternalBackends(o)[backend]
+		start := time.Now()
+		m, err := ml.spawnGRPCModel(backend, uri, o, modelID, modelName, modelFile)
+		ml.notifyLoadObserver(BackendLoadEvent{
+			ModelID:    modelID,
+			ModelName:  modelName,
+			Backend:    backend,
+			BackendURI: uri,
+			Duration:   time.Since(start),
+			Err:        err,
+		})
+		return m, err
+	}
+}
 
-		getFreeAddress := func() (string, error) {
-			port, err := freeport.GetFreePort()
-			if err != nil {
-				return "", fmt.Errorf("failed allocating free ports: %s", err.Error())
-			}
-			return fmt.Sprintf("127.0.0.1:%d", port), nil
-		}
+// spawnGRPCModel starts the backend process (or attaches to a remote
+// address), waits for it to come up, and issues the LoadModel RPC. Reached
+// only for actual loads: LoadModel resolves cache hits and coalesces
+// concurrent loads before invoking the grpcModel closure. uri is the
+// resolved external-backend runtime (empty when the backend isn't
+// registered).
+func (ml *ModelLoader) spawnGRPCModel(backend, uri string, o *Options, modelID, modelName, modelFile string) (*Model, error) {
+	var client *Model
 
-		// If no specific model path is set for transformers/HF, set it to the model path
-		for _, env := range []string{"HF_HOME", "TRANSFORMERS_CACHE", "HUGGINGFACE_HUB_CACHE"} {
-			if os.Getenv(env) == "" {
-				err := os.Setenv(env, ml.ModelPath)
-				if err != nil {
-					xlog.Error("unable to set environment variable to modelPath", "error", err, "name", env, "modelPath", ml.ModelPath)
-				}
-			}
-		}
-
-		// Check if the backend is provided as external
-		if uri, ok := ml.GetAllExternalBackends(o)[backend]; ok {
-			xlog.Debug("Loading external backend", "uri", uri)
-			// check if uri is a file or an address
-			if fi, err := os.Stat(uri); err == nil {
-				xlog.Debug("external backend is file", "file", fi)
-				serverAddress, err := getFreeAddress()
-				if err != nil {
-					return nil, fmt.Errorf("failed allocating free ports: %s", err.Error())
-				}
-				// Make sure the process is executable
-				process, err := ml.startProcess(uri, modelID, serverAddress)
-				if err != nil {
-					xlog.Error("failed to launch", "error", err, "path", uri)
-					return nil, err
-				}
-
-				xlog.Debug("GRPC Service Started")
-
-				client = NewModel(modelID, serverAddress, process)
-			} else {
-				xlog.Debug("external backend is a uri")
-				// address
-				client = NewModel(modelID, uri, nil)
-			}
-		} else {
-			xlog.Error("Backend not found", "backend", backend)
-			return nil, fmt.Errorf("backend not found: %s", backend)
-		}
-
-		xlog.Debug("Wait for the service to start up")
-		xlog.Debug("Options", "options", o.gRPCOptions)
-
-		// Wait for the service to start up
-		ready := false
-		for i := range o.grpcAttempts {
-			alive, err := client.GRPC(o.parallelRequests, ml.wd).HealthCheck(context.Background())
-			if alive {
-				xlog.Debug("GRPC Service Ready")
-				ready = true
-				break
-			}
-			if err != nil && i == o.grpcAttempts-1 {
-				xlog.Error("failed starting/connecting to the gRPC service", "error", err)
-			}
-			time.Sleep(time.Duration(o.grpcAttemptsDelay) * time.Second)
-		}
-
-		if !ready {
-			xlog.Debug("GRPC Service NOT ready")
-			if process := client.Process(); process != nil {
-				process.Stop()
-			}
-			return nil, fmt.Errorf("grpc service not ready")
-		}
-
-		options := *o.gRPCOptions
-		options.Model = modelName
-		options.ModelFile = modelFile
-		options.ModelPath = ml.ModelPath
-
-		xlog.Debug("GRPC: Loading model with options", "options", options)
-
-		res, err := client.GRPC(o.parallelRequests, ml.wd).LoadModel(o.context, &options)
+	getFreeAddress := func() (string, error) {
+		port, err := freeport.GetFreePort()
 		if err != nil {
-			if process := client.Process(); process != nil {
-				process.Stop()
-			}
-			return nil, fmt.Errorf("could not load model: %w", err)
-		}
-		if !res.Success {
-			if process := client.Process(); process != nil {
-				process.Stop()
-			}
-			return nil, fmt.Errorf("could not load model (no success): %s", res.Message)
+			return "", fmt.Errorf("failed allocating free ports: %s", err.Error())
 		}
+		return fmt.Sprintf("127.0.0.1:%d", port), nil
+	}
 
-		// Register size for size-aware eviction using the caller-supplied estimate
-		// (computed via pkg/vram, which handles multi-file and non-GGUF models).
-		if ml.wd != nil && o.modelSizeBytes > 0 {
-			ml.wd.RegisterModelSize(modelID, o.modelSizeBytes)
+	// If no specific model path is set for transformers/HF, set it to the model path
+	for _, env := range []string{"HF_HOME", "TRANSFORMERS_CACHE", "HUGGINGFACE_HUB_CACHE"} {
+		if os.Getenv(env) == "" {
+			err := os.Setenv(env, ml.ModelPath)
+			if err != nil {
+				xlog.Error("unable to set environment variable to modelPath", "error", err, "name", env, "modelPath", ml.ModelPath)
+			}
 		}
+	}
 
-		return client, nil
+	// Check if the backend is provided as external
+	if uri != "" {
+		xlog.Debug("Loading external backend", "uri", uri)
+		// check if uri is a file or an address
+		if fi, err := os.Stat(uri); err == nil {
+			xlog.Debug("external backend is file", "file", fi)
+			serverAddress, err := getFreeAddress()
+			if err != nil {
+				return nil, fmt.Errorf("failed allocating free ports: %s", err.Error())
+			}
+			// Make sure the process is executable
+			process, err := ml.startProcess(uri, modelID, serverAddress)
+			if err != nil {
+				xlog.Error("failed to launch", "error", err, "path", uri)
+				return nil, err
+			}
+
+			xlog.Debug("GRPC Service Started")
+
+			client = NewModel(modelID, serverAddress, process)
+		} else {
+			xlog.Debug("external backend is a uri")
+			// address
+			client = NewModel(modelID, uri, nil)
+		}
+	} else {
+		xlog.Error("Backend not found", "backend", backend)
+		return nil, fmt.Errorf("backend not found: %s", backend)
+	}
+
+	xlog.Debug("Wait for the service to start up")
+	xlog.Debug("Options", "options", o.gRPCOptions)
+
+	// Wait for the service to start up
+	ready := false
+	for i := range o.grpcAttempts {
+		alive, err := client.GRPC(o.parallelRequests, ml.wd).HealthCheck(context.Background())
+		if alive {
+			xlog.Debug("GRPC Service Ready")
+			ready = true
+			break
+		}
+		if err != nil && i == o.grpcAttempts-1 {
+			xlog.Error("failed starting/connecting to the gRPC service", "error", err)
+		}
+		time.Sleep(time.Duration(o.grpcAttemptsDelay) * time.Second)
+	}
+
+	if !ready {
+		xlog.Debug("GRPC Service NOT ready")
+		stopLoadProcess(client, modelID)
+		return nil, fmt.Errorf("grpc service not ready")
+	}
+
+	// Clone before setting the per-load fields: o.gRPCOptions is shared by
+	// retried/auto-discovery attempts, and a plain struct copy would copy
+	// the protobuf message's internal mutex.
+	options := proto.Clone(o.gRPCOptions).(*pb.ModelOptions)
+	options.Model = modelName
+	options.ModelFile = modelFile
+	options.ModelPath = ml.ModelPath
+
+	xlog.Debug("GRPC: Loading model with options", "options", options)
+
+	res, err := client.GRPC(o.parallelRequests, ml.wd).LoadModel(o.context, options)
+	if err != nil {
+		stopLoadProcess(client, modelID)
+		return nil, fmt.Errorf("could not load model: %w", err)
+	}
+	if !res.Success {
+		stopLoadProcess(client, modelID)
+		return nil, fmt.Errorf("could not load model (no success): %s", res.Message)
+	}
+
+	// Register size for size-aware eviction using the caller-supplied estimate
+	// (computed via pkg/vram, which handles multi-file and non-GGUF models).
+	if ml.wd != nil && o.modelSizeBytes > 0 {
+		ml.wd.RegisterModelSize(modelID, o.modelSizeBytes)
+	}
+
+	return client, nil
+}
+
+// stopLoadProcess tears down a backend process whose load did not complete.
+// The stop error is only logged: the load error is what the caller reports.
+func stopLoadProcess(client *Model, modelID string) {
+	process := client.Process()
+	if process == nil {
+		return
+	}
+	if err := process.Stop(); err != nil {
+		xlog.Warn("failed to stop backend process after failed load", "error", err, "modelID", modelID)
 	}
 }
 
diff --git a/pkg/model/load_observer_test.go b/pkg/model/load_observer_test.go
new file mode 100644
index 000000000..7061542df
--- /dev/null
+++ b/pkg/model/load_observer_test.go
@@ -0,0 +1,94 @@
+package model_test
+
+import (
+	"os"
+
+	"github.com/mudler/LocalAI/pkg/model"
+	"github.com/mudler/LocalAI/pkg/system"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// The load observer is the hook the core wires to backend tracing: it must
+// fire once per actual load attempt with the alias-resolved backend and the
+// resolved runtime URI, because that pair is what tells an operator WHICH
+// installed backend build served a model (a stale install is invisible in
+// the model config but shows up here).
+var _ = Describe("ModelLoader load observer", func() {
+	var (
+		modelLoader *model.ModelLoader
+		modelPath   string
+		events      []model.BackendLoadEvent
+	)
+
+	BeforeEach(func() {
+		var err error
+		modelPath, err = os.MkdirTemp("", "load_observer")
+		Expect(err).ToNot(HaveOccurred())
+
+		systemState, err := system.GetSystemState(
+			system.WithModelPath(modelPath),
+		)
+		Expect(err).ToNot(HaveOccurred())
+		modelLoader = model.NewModelLoader(systemState)
+
+		events = nil
+		modelLoader.SetLoadObserver(func(ev model.BackendLoadEvent) {
+			events = append(events, ev)
+		})
+	})
+
+	AfterEach(func() {
+		Expect(os.RemoveAll(modelPath)).To(Succeed())
+	})
+
+	It("fires with the resolved runtime URI when a load attempt fails", func() {
+		// A non-file external backend URI is treated as a remote gRPC
+		// address; nothing listens on port 1, so the health check fails
+		// after the single configured attempt — a real, fast load attempt.
+		modelLoader.SetExternalBackend("fakebackend", "127.0.0.1:1")
+
+		_, err := modelLoader.Load(
+			model.WithModelID("m"),
+			model.WithModel("m.bin"),
+			model.WithBackendString("fakebackend"),
+			model.WithGRPCAttempts(1),
+			model.WithGRPCAttemptsDelay(0),
+		)
+		Expect(err).To(HaveOccurred())
+
+		Expect(events).To(HaveLen(1))
+		ev := events[0]
+		Expect(ev.ModelID).To(Equal("m"))
+		Expect(ev.ModelName).To(Equal("m.bin"))
+		Expect(ev.Backend).To(Equal("fakebackend"))
+		Expect(ev.BackendURI).To(Equal("127.0.0.1:1"))
+		Expect(ev.Err).To(HaveOccurred())
+	})
+
+	It("fires for unknown backends with an empty runtime URI", func() {
+		_, err := modelLoader.Load(
+			model.WithModelID("m"),
+			model.WithModel("m.bin"),
+			model.WithBackendString("no-such-backend"),
+		)
+		Expect(err).To(HaveOccurred())
+
+		Expect(events).To(HaveLen(1))
+		Expect(events[0].Backend).To(Equal("no-such-backend"))
+		Expect(events[0].BackendURI).To(BeEmpty())
+		Expect(events[0].Err).To(MatchError(ContainSubstring("backend not found")))
+	})
+
+	It("is optional: loads proceed when no observer is registered", func() {
+		modelLoader.SetLoadObserver(nil)
+
+		_, err := modelLoader.Load(
+			model.WithModelID("m"),
+			model.WithModel("m.bin"),
+			model.WithBackendString("no-such-backend"),
+		)
+		Expect(err).To(HaveOccurred())
+		Expect(events).To(BeEmpty())
+	})
+})
diff --git a/pkg/model/loader.go b/pkg/model/loader.go
index 5eb40cdb9..b4018a979 100644
--- a/pkg/model/loader.go
+++ b/pkg/model/loader.go
@@ -39,6 +39,27 @@ type RemoteModelUnloader interface {
 type ModelRouter func(ctx context.Context, backend, modelID, modelName, modelFile string,
 	opts *pb.ModelOptions, parallel bool) (*Model, error)
 
+// BackendLoadEvent describes one actual backend load attempt: a backend
+// process spawn (or remote-address attach) followed by its LoadModel RPC.
+// Cache hits and loads coalesced onto another goroutine's in-flight attempt
+// never produce an event, so observers see real loads only. Distributed-mode
+// routing is excluded too: there grpcModel runs per inference request and the
+// worker node owns the actual load.
+type BackendLoadEvent struct {
+	ModelID   string
+	ModelName string
+	// Backend is the alias-resolved backend string (e.g. "parakeet-cpp").
+	Backend string
+	// BackendURI is the resolved runtime serving the load: the installed
+	// backend's launcher path (which names the variant directory) or a
+	// remote gRPC address. This is what identifies WHICH build served the
+	// model — a stale installed backend is invisible in the model config
+	// but obvious here.
+	BackendURI string
+	Duration   time.Duration
+	Err        error
+}
+
 type ModelLoader struct {
 	ModelPath                string
 	mu                       sync.Mutex
@@ -49,6 +70,7 @@ type ModelLoader struct {
 	lruEvictionMaxRetries    int           // Maximum number of retries when waiting for busy models
 	lruEvictionRetryInterval time.Duration // Interval between retries when waiting for busy models
 	onUnloadHooks            []ModelUnloadHook
+	loadObserver             func(BackendLoadEvent)
 	remoteUnloader           RemoteModelUnloader
 	modelRouter              ModelRouter // distributed mode: route to remote node
 	backendLogs              *BackendLogStore
@@ -98,6 +120,23 @@ func (ml *ModelLoader) SetWatchDog(wd *WatchDog) {
 	ml.wd = wd
 }
 
+// SetLoadObserver registers a callback fired after every actual backend load
+// attempt, successful or not. See BackendLoadEvent for what counts as one.
+func (ml *ModelLoader) SetLoadObserver(obs func(BackendLoadEvent)) {
+	ml.mu.Lock()
+	defer ml.mu.Unlock()
+	ml.loadObserver = obs
+}
+
+func (ml *ModelLoader) notifyLoadObserver(ev BackendLoadEvent) {
+	ml.mu.Lock()
+	obs := ml.loadObserver
+	ml.mu.Unlock()
+	if obs != nil {
+		obs(ev)
+	}
+}
+
 // SetRemoteUnloader sets the handler for unloading models on remote nodes.
 // In distributed mode, this should be set to the SmartRouter adapter.
 func (ml *ModelLoader) SetRemoteUnloader(u RemoteModelUnloader) {
diff --git a/pkg/sound/float32.go b/pkg/sound/float32.go
index f42a04e53..f031ecd39 100644
--- a/pkg/sound/float32.go
+++ b/pkg/sound/float32.go
@@ -10,3 +10,19 @@ func BytesFloat32(bytes []byte) float32 {
 	float := math.Float32frombits(bits)
 	return float
 }
+
+// Float32sToInt16LEBytes converts [-1,1] float PCM samples to int16
+// little-endian bytes, clamping out-of-range values (±Inf too); NaN becomes 0.
+func Float32sToInt16LEBytes(samples []float32) []byte {
+	out := make([]byte, len(samples)*2)
+	for i, f := range samples {
+		// Clamp in float space: converting a float that does not fit the
+		// integer type (huge, ±Inf, NaN) is implementation-dependent in Go.
+		s := math.Max(-32768, math.Min(32767, float64(f*32767)))
+		if math.IsNaN(s) {
+			s = 0 // NaN carries no amplitude to preserve; emit silence
+		}
+		binary.LittleEndian.PutUint16(out[i*2:], uint16(int16(s)))
+	}
+	return out
+}
diff --git a/pkg/sound/float32_test.go b/pkg/sound/float32_test.go
new file mode 100644
index 000000000..8cffa8596
--- /dev/null
+++ b/pkg/sound/float32_test.go
@@ -0,0 +1,22 @@
+package sound
+
+import (
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("Float32sToInt16LEBytes", func() {
+	It("converts in-range samples to int16 little-endian bytes", func() {
+		out := Float32sToInt16LEBytes([]float32{0, 0.5, -0.5})
+		Expect(BytesToInt16sLE(out)).To(Equal([]int16{0, 16383, -16383}))
+	})
+
+	It("clamps out-of-range samples instead of wrapping", func() {
+		out := Float32sToInt16LEBytes([]float32{2.0, -2.0})
+		Expect(out).To(Equal([]byte{0xff, 0x7f, 0x00, 0x80})) // 32767, -32768
+	})
+
+	It("returns an empty slice for empty input", func() {
+		Expect(Float32sToInt16LEBytes(nil)).To(BeEmpty())
+	})
+})
diff --git a/scripts/install-fizzbee.sh b/scripts/install-fizzbee.sh
new file mode 100755
index 000000000..2539964b8
--- /dev/null
+++ b/scripts/install-fizzbee.sh
@@ -0,0 +1,128 @@
+#!/bin/sh
+# install-fizzbee: download the pinned FizzBee release and verify its checksum.
+# FizzBee is pre-1.0 and single-maintainer, so we PIN a version + sha256 rather
+# than tracking latest or building from source (its primary build is Bazel).
+# See formal-verification/README.md.
+#
+# Usage:
+#   scripts/install-fizzbee.sh [dest_dir]   (default dest: ./.tools/fizzbee)
+#
+# Environment:
+#   FIZZBEE_VERSION   release tag to install (default: v0.5.2)
+#
+# First-time pinning: run once with no recorded checksum; the script prints the
+# computed sha256. Record it in formal-verification/fizzbee.sha256 as
+# "<sha256>  <asset>" and commit, then CI verifies it on every run.
+set -eu
+
+VERSION=${FIZZBEE_VERSION:-v0.5.2}
+DEST=${1:-".tools/fizzbee"}
+ROOT=$(CDPATH= cd "$(dirname "$0")/.." && pwd)
+SHA_FILE="$ROOT/formal-verification/fizzbee.sha256"
+
+# sha256_of FILE: print the hex sha256 digest of FILE on stdout.
+# Linux ships sha256sum (coreutils); stock macOS may only have `shasum`, so try
+# both. Returns non-zero when neither tool is available.
+sha256_of() {
+    if command -v sha256sum >/dev/null 2>&1; then
+        sha256sum "$1" | awk '{print $1}'
+    elif command -v shasum >/dev/null 2>&1; then
+        shasum -a 256 "$1" | awk '{print $1}'
+    else
+        echo "ERROR: neither sha256sum nor shasum found; cannot verify download" >&2
+        return 1
+    fi
+}
+
+# Detect platform -> release asset name.
+os=$(uname -s)
+arch=$(uname -m)
+case "$os" in
+    Linux)  plat="linux" ;;
+    Darwin) plat="macos" ;;
+    *) echo "unsupported OS: $os" >&2; exit 1 ;;
+esac
+case "$arch" in
+    x86_64|amd64) cpu="x86" ;;
+    arm64|aarch64) cpu="arm" ;;
+    *) echo "unsupported arch: $arch" >&2; exit 1 ;;
+esac
+
+asset="fizzbee-${VERSION}-${plat}_${cpu}.tar.gz"
+url="https://github.com/fizzbee-io/fizzbee/releases/download/${VERSION}/${asset}"
+inner="fizzbee-${VERSION}-${plat}_${cpu}"
+
+# Idempotent: if the pinned version is already extracted (e.g. restored from a
+# CI cache), do nothing. This keeps the install step a no-op on cache hits and
+# avoids re-downloading the (large) bundle.
+if [ -x "$DEST/$inner/fizz" ] && [ -L "$DEST/fizz" ]; then
+    echo "==> FizzBee $VERSION already installed at $DEST/$inner"
+    exit 0
+fi
+
+tmp=$(mktemp -d)
+trap 'rm -rf "$tmp"' EXIT
+
+echo "==> downloading $url"
+curl -fL -o "$tmp/$asset" "$url"
+
+# A pipeline's status is that of its last command, so a broken hashing tool can
+# yield an empty digest without tripping `set -e`; reject anything that is not
+# a 64-character digest before trusting or printing it.
+got=$(sha256_of "$tmp/$asset")
+if [ "${#got}" -ne 64 ]; then
+    echo "ERROR: could not compute sha256 of $asset" >&2
+    exit 1
+fi
+if [ -f "$SHA_FILE" ]; then
+    want=$(awk -v a="$asset" '$2==a {print $1}' "$SHA_FILE")
+    if [ -z "$want" ]; then
+        echo "ERROR: no recorded sha256 for $asset in $SHA_FILE" >&2
+        echo "       computed: $got  $asset" >&2
+        exit 1
+    fi
+    if [ "$got" != "$want" ]; then
+        echo "ERROR: sha256 mismatch for $asset" >&2
+        echo "       want: $want" >&2
+        echo "       got:  $got" >&2
+        exit 1
+    fi
+    echo "==> checksum verified"
+else
+    echo "WARNING: $SHA_FILE not found -- record this line and commit it:" >&2
+    echo "$got  $asset"
+fi
+
+# The tarball unpacks to a self-contained, version+platform-named directory:
+#   fizzbee-<version>-<plat>_<cpu>/
+#     fizz             <- the CLI wrapper (entrypoint; invoke THIS)
+#     parser/parser_bin <- the .fizz frontend (bundled; no system Python needed)
+#     fizzbee          <- the Go model-checker binary
+#     fizz.env         <- resolves the above RELATIVE to the dir holding `fizz`
+#     mbt_gen.zip      <- MBT generator (only this needs system python)
+# The whole directory must stay intact (fizz.env is path-relative), so we keep
+# it and publish a STABLE symlink `$DEST/fizz` -> the versioned wrapper. The
+# wrapper does readlink -f on itself, so the symlink still resolves fizz.env.
+mkdir -p "$DEST"
+rm -rf "$DEST/$inner"
+tar -xzf "$tmp/$asset" -C "$DEST"
+if [ ! -x "$DEST/$inner/fizz" ]; then
+    echo "ERROR: expected $DEST/$inner/fizz after extraction; tarball layout changed" >&2
+    exit 1
+fi
+ln -sfn "$inner/fizz" "$DEST/fizz"
+
+echo "==> installed FizzBee $VERSION to $DEST/$inner"
+echo "    frontend:  $DEST/$inner/parser/parser_bin"
+echo "    entrypoint: $DEST/fizz  (stable symlink -> $inner/fizz)"
+echo
+# scripts/realtime-conformance.sh only probes .tools/fizzbee/fizz under the repo
+# root (then `fizz` on PATH); any other destination needs FIZZBEE_BIN.
+dest_abs=$(CDPATH= cd "$DEST" && pwd)
+if [ "$dest_abs" = "$ROOT/.tools/fizzbee" ]; then
+    echo "The realtime-conformance gate auto-detects $DEST/fizz; nothing else needed."
+else
+    echo "The realtime-conformance gate only auto-detects $ROOT/.tools/fizzbee/fizz;"
+    echo "for this location run it with FIZZBEE_BIN=$dest_abs/fizz"
+fi
+echo "To use it directly, run:  $DEST/fizz <spec.fizz>"
diff --git a/scripts/realtime-conformance.sh b/scripts/realtime-conformance.sh
new file mode 100755
index 000000000..eed5cbecd
--- /dev/null
+++ b/scripts/realtime-conformance.sh
@@ -0,0 +1,98 @@
+#!/bin/sh
+# realtime-conformance: verify the realtime state-machine implementations conform
+# to their formal designs. See docs/design/realtime-state-machines.md (Part 6).
+#
+# Two layers, BOTH required by default -- this gate is FAIL-CLOSED:
+#   1. Go-native conformance: the respcoord + turncoord transition tables +
+#      Ginkgo/Gomega seeded property tests under the race detector (checks the
+#      implementation).
+#   2. FizzBee model check of the authoritative .fizz specs (checks the design,
+#      and is the thing that makes the design authoritative).
+#
+# A missing FizzBee is a HARD FAILURE, not a skip -- otherwise verification
+# silently evaporates the moment the tool is inconvenient, which defeats the
+# whole point. The pinned binary installs reproducibly via scripts/install-fizzbee.sh
+# (checksums in formal-verification/fizzbee.sha256), so "couldn't install" is not a
+# reason to skip; fix the install. The ONLY way to skip is the explicit, loud
+# REALTIME_CONFORMANCE_SKIP_FIZZBEE=1 opt-out, for the rare case of hacking on
+# unrelated code locally -- never in CI.
+#
+# POSIX sh (no bashisms): the project requires tooling scripts to be portable.
+set -eu
+
+ROOT=$(CDPATH= cd "$(dirname "$0")/.." && pwd)
+GOCMD=${GOCMD:-go}
+SPEC_DIR="$ROOT/formal-verification"
+
+echo "==> [1/2] Go conformance (coordinator, respcoord, turncoord, conncoord, compactcoord, ttscoord) with -race"
+# Run from the repo root in a subshell: the go command resolves the module from
+# the working directory, so this keeps the gate usable from any directory
+# without changing the cwd seen by the rest of the script (relative FIZZBEE_BIN).
+(
+    cd "$ROOT"
+    "$GOCMD" test -race -count=1 \
+        "$ROOT/core/http/endpoints/openai/coordinator/..." \
+        "$ROOT/core/http/endpoints/openai/respcoord/..." \
+        "$ROOT/core/http/endpoints/openai/turncoord/..." \
+        "$ROOT/core/http/endpoints/openai/conncoord/..." \
+        "$ROOT/core/http/endpoints/openai/compactcoord/..." \
+        "$ROOT/core/http/endpoints/openai/ttscoord/..."
+)
+
+# Locate the FizzBee CLI wrapper. install-fizzbee.sh publishes a stable symlink
+# at .tools/fizzbee/fizz; otherwise fall back to `fizz` on PATH. FIZZBEE_BIN
+# overrides both. NOTE: this must be the `fizz` WRAPPER (which runs the bundled
+# parser then the checker), not the raw `fizzbee` binary, and its sibling
+# parser/ + fizz.env must be intact.
+FIZZBEE_BIN=${FIZZBEE_BIN:-}
+if [ -z "$FIZZBEE_BIN" ]; then
+    if [ -x "$ROOT/.tools/fizzbee/fizz" ]; then
+        FIZZBEE_BIN="$ROOT/.tools/fizzbee/fizz"
+    elif command -v fizz >/dev/null 2>&1; then
+        FIZZBEE_BIN=fizz
+    fi
+fi
+
+echo "==> [2/2] FizzBee model check of authoritative specs"
+if [ -n "$FIZZBEE_BIN" ] && { [ -x "$FIZZBEE_BIN" ] || command -v "$FIZZBEE_BIN" >/dev/null 2>&1; }; then
+    checked=0
+    for spec in "$SPEC_DIR"/*.fizz; do
+        # An unmatched glob is left as the literal pattern; skip it here and let
+        # the count below turn "no specs" into an explicit failure.
+        [ -f "$spec" ] || continue
+        checked=$((checked + 1))
+        echo "    checking $spec"
+        # CLI is `fizz [flags] <spec.fizz>` (default = exhaustive BFS); there is
+        # NO `run` subcommand. The checker may print FAILED/DEADLOCK but still
+        # exit 0, so detect violations from the output as well as the exit code.
+        set +e
+        out=$("$FIZZBEE_BIN" "$spec" 2>&1)
+        rc=$?
+        set -e
+        printf '%s\n' "$out"
+        if [ "$rc" -ne 0 ]; then
+            echo "ERROR: FizzBee exited $rc on $spec" >&2
+            exit 1
+        fi
+        if printf '%s\n' "$out" | grep -qE '^(FAILED|DEADLOCK)'; then
+            echo "ERROR: FizzBee reported an invariant/deadlock violation in $spec" >&2
+            exit 1
+        fi
+    done
+    if [ "$checked" -eq 0 ]; then
+        echo "ERROR: no .fizz specs found in $SPEC_DIR -- nothing was verified." >&2
+        exit 1
+    fi
+    echo "==> realtime-conformance OK (Go + FizzBee)"
+elif [ "${REALTIME_CONFORMANCE_SKIP_FIZZBEE:-0}" = "1" ]; then
+    echo "    !! FizzBee model check EXPLICITLY SKIPPED (REALTIME_CONFORMANCE_SKIP_FIZZBEE=1)" >&2
+    echo "    !! The authoritative design was NOT verified this run. Do not use in CI." >&2
+    echo "==> realtime-conformance INCOMPLETE (Go only; design check skipped by request)"
+else
+    echo "ERROR: FizzBee not found -- the authoritative design cannot be verified." >&2
+    echo "       This gate is fail-closed; verification is not optional." >&2
+    echo "       Install the pinned, checksum-verified binary:  make install-fizzbee" >&2
+    echo "       (see formal-verification/README.md). To deliberately skip locally while" >&2
+    echo "       hacking on unrelated code, set REALTIME_CONFORMANCE_SKIP_FIZZBEE=1." >&2
+    exit 1
+fi