fix(watchdog): persist a UI-saved Check Interval across restarts (#10601)

The watchdog Check Interval saved via /api/settings reverted to 500ms on every restart, while the idle/busy timeouts persisted correctly. Root cause: NewApplicationConfig baseline-defaulted WatchDogInterval to 500ms, whereas the idle/busy timeouts default to 0. The startup loader (loadRuntimeSettingsFromFile) applies a persisted runtime_settings.json value only when the field is still at its zero default - its heuristic for "this wasn't set by an env var". Because the interval was always 500ms at that point, the loader never read the persisted value back, so the saved interval was silently discarded on each boot. Fix: drop the non-zero baseline default so the interval behaves like the sibling timeouts (0 = unset). The effective 500ms default is now supplied at the watchdog layer: WithWatchdogInterval ignores a non-positive value so DefaultWatchDogOptions' 500ms is preserved (and a 0 interval can never turn the watchdog loop into a busy spin). Also mirror the interval in the live config file watcher alongside idle/busy, and report the real 500ms default (not the stale "2s") from ToRuntimeSettings. Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:claude-opus-4-8 [Claude Code]
chore: ⬆️ Update ikawrakow/ik_llama.cpp to f74a6fb87b315b2c3154166e075360e15021a61d (#10598)
2026-06-30 11:26:32 -04:00 · 2026-06-30 08:04:12 +00:00 · 2026-06-30 09:17:48 +02:00 · 2026-06-30 09:17:11 +02:00 · 2026-06-30 09:16:52 +02:00 · 2026-06-30 09:16:37 +02:00
126 changed files with 10156 additions and 977 deletions
--- a/.githooks/pre-commit
+++ b/.githooks/pre-commit
@@ -7,8 +7,11 @@
 # Runs only the checks relevant to what's staged:
 #   - Go files          -> make lint + make test-coverage-check
 #   - core/http/react-ui -> make test-ui-coverage-check (Playwright e2e + gate)
-# A commit touching neither is skipped entirely (docs/YAML/etc. can't change
-# lint findings, Go coverage, or the UI).
+#   - realtime state machines / specs -> make test-realtime-conformance
+#       (core/http/endpoints/openai/{coordinator,respcoord,turncoord,conncoord,compactcoord,ttscoord}/**
+#        or formal-verification/** -- a pure .fizz spec edit must still re-verify the design, detected separately from Go)
+# A commit touching none of these is skipped entirely (other docs/YAML can't
+# change lint findings, Go coverage, the UI, or the realtime conformance gate).
 #
 # To bypass for a single commit (e.g. a WIP checkpoint): git commit --no-verify
 set -eu
@@ -20,11 +23,13 @@ staged="$(git diff --cached --name-only --diff-filter=ACMRD)"

 go_changed=0
 ui_changed=0
+rt_changed=0
 if echo "$staged" | grep -qE '\.go$'; then go_changed=1; fi
 if echo "$staged" | grep -qE '^core/http/react-ui/'; then ui_changed=1; fi
+if echo "$staged" | grep -qE '^(core/http/endpoints/openai/(coordinator|respcoord|turncoord|conncoord|compactcoord|ttscoord)/|formal-verification/)'; then rt_changed=1; fi

-if [ "$go_changed" -eq 0 ] && [ "$ui_changed" -eq 0 ]; then
-	echo "pre-commit: no Go or React UI changes staged — skipping."
+if [ "$go_changed" -eq 0 ] && [ "$ui_changed" -eq 0 ] && [ "$rt_changed" -eq 0 ]; then
+	echo "pre-commit: no Go, React UI, or realtime-spec changes staged — skipping."
 	exit 0
 fi

@@ -57,4 +62,11 @@ if [ "$ui_changed" -eq 1 ]; then
 	make test-ui-coverage-check
 fi

+if [ "$rt_changed" -eq 1 ]; then
+	echo "pre-commit ▶ realtime state-machine conformance (make test-realtime-conformance) —"
+	echo "             Go transition/rapid tests under -race + FizzBee model check of the"
+	echo "             authoritative specs. Fail-closed: needs FizzBee (make install-fizzbee)."
+	make test-realtime-conformance
+fi
+
 echo "pre-commit ✓ all relevant checks passed"
--- a/.github/workflows/backend_build_darwin.yml
+++ b/.github/workflows/backend_build_darwin.yml
@@ -82,7 +82,7 @@ jobs:
      # as the Linux registry cache.
      - name: Restore Homebrew cache
        id: brew-cache
-        uses: actions/cache/restore@v4
+        uses: actions/cache/restore@v6
        with:
          path: |
            ~/Library/Caches/Homebrew/downloads
@@ -142,7 +142,7 @@ jobs:

      - name: Save Homebrew cache
        if: github.event_name != 'pull_request' && steps.brew-cache.outputs.cache-hit != 'true'
-        uses: actions/cache/save@v4
+        uses: actions/cache/save@v6
        with:
          path: |
            ~/Library/Caches/Homebrew/downloads
@@ -178,7 +178,7 @@ jobs:
      - name: Restore ccache
        if: inputs.backend == 'llama-cpp'
        id: ccache-cache
-        uses: actions/cache/restore@v4
+        uses: actions/cache/restore@v6
        with:
          path: ~/Library/Caches/ccache
          key: ccache-llama-${{ runner.arch }}-${{ steps.llama-version.outputs.version }}-${{ github.run_id }}
@@ -211,7 +211,7 @@ jobs:
      - name: Restore Python wheel cache
        if: inputs.lang == 'python'
        id: pyenv-cache
-        uses: actions/cache/restore@v4
+        uses: actions/cache/restore@v6
        with:
          path: |
            ~/Library/Caches/pip
@@ -256,14 +256,14 @@ jobs:

      - name: Save ccache
        if: inputs.backend == 'llama-cpp' && github.event_name != 'pull_request'
-        uses: actions/cache/save@v4
+        uses: actions/cache/save@v6
        with:
          path: ~/Library/Caches/ccache
          key: ccache-llama-${{ runner.arch }}-${{ steps.llama-version.outputs.version }}-${{ github.run_id }}

      - name: Save Python wheel cache
        if: inputs.lang == 'python' && github.event_name != 'pull_request' && steps.pyenv-cache.outputs.cache-hit != 'true'
-        uses: actions/cache/save@v4
+        uses: actions/cache/save@v6
        with:
          path: |
            ~/Library/Caches/pip
--- a/.github/workflows/realtime-conformance.yml
+++ b/.github/workflows/realtime-conformance.yml
@@ -0,0 +1,69 @@
+---
+name: 'realtime-conformance'
+
+# Verifies the realtime state-machine implementations conform to their formal
+# designs (docs/design/realtime-state-machines.md, formal-verification/). BOTH
+# layers are enforced and the gate is fail-closed: the Go conformance layer
+# (respcoord + turncoord transition/rapid tests under -race) AND the FizzBee model check of
+# the authoritative specs. FizzBee is pinned + checksum-verified
+# (formal-verification/fizzbee.sha256), so a failed install fails the job rather
+# than silently skipping verification.
+
+on:
+  pull_request:
+    paths:
+      - 'core/http/endpoints/openai/coordinator/**'
+      - 'core/http/endpoints/openai/respcoord/**'
+      - 'core/http/endpoints/openai/turncoord/**'
+      - 'core/http/endpoints/openai/conncoord/**'
+      - 'core/http/endpoints/openai/compactcoord/**'
+      - 'core/http/endpoints/openai/ttscoord/**'
+      - 'formal-verification/**'
+      - 'scripts/realtime-conformance.sh'
+      - 'scripts/install-fizzbee.sh'
+      - '.github/workflows/realtime-conformance.yml'
+  push:
+    branches:
+      - master
+    paths:
+      - 'core/http/endpoints/openai/coordinator/**'
+      - 'core/http/endpoints/openai/respcoord/**'
+      - 'core/http/endpoints/openai/turncoord/**'
+      - 'core/http/endpoints/openai/conncoord/**'
+      - 'core/http/endpoints/openai/compactcoord/**'
+      - 'core/http/endpoints/openai/ttscoord/**'
+      - 'formal-verification/**'
+      - 'scripts/realtime-conformance.sh'
+
+concurrency:
+  group: realtime-conformance-${{ github.event.pull_request.number || github.sha }}-${{ github.repository }}
+  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
+
+jobs:
+  conformance:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        go-version: ['1.26.x']
+    steps:
+      - name: Clone
+        uses: actions/checkout@v7
+      - name: Setup Go ${{ matrix.go-version }}
+        uses: actions/setup-go@v5
+        with:
+          go-version: ${{ matrix.go-version }}
+          cache: false
+      - name: Cache FizzBee
+        uses: actions/cache@v4
+        with:
+          path: .tools/fizzbee
+          key: fizzbee-v0.5.2-${{ runner.os }}-${{ hashFiles('formal-verification/fizzbee.sha256') }}
+      - name: Install FizzBee (pinned, checksum-verified)
+        # No `|| true`: a failed/forged download must fail the job, not silently
+        # drop the design verification. install-fizzbee.sh is a no-op if the
+        # cached binary is already present and valid.
+        run: ./scripts/install-fizzbee.sh
+      - name: Run conformance gate (fail-closed)
+        # No skip env: both the Go conformance and the FizzBee model check are
+        # required. The gate auto-detects .tools/fizzbee/fizz.
+        run: make test-realtime-conformance
--- a/.gitignore
+++ b/.gitignore
@@ -97,3 +97,12 @@ core/http/react-ui/test-results/

 # Local Apple signing material (never commit)
 .certs/
+
+# Pinned dev tools (e.g. FizzBee for the realtime-conformance gate)
+.tools/
+
+# FizzBee model-check artifacts: the parser emits <spec>.json next to each
+# .fizz and the checker writes run dirs under out/. Both are regenerated by
+# the realtime-conformance gate; only the .fizz sources are authoritative.
+formal-verification/*.json
+formal-verification/out/
--- a/Makefile
+++ b/Makefile
@@ -405,6 +405,18 @@ test-realtime: build-mock-backend
 	@echo 'Running realtime e2e tests (mock backend)'
 	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="Realtime && !real-models" --flake-attempts $(TEST_FLAKES) -v -r ./tests/e2e

+# Verify the realtime state-machine implementations conform to their formal
+# designs (Go transition/rapid tests under -race + FizzBee model check of the
+# authoritative specs). See docs/design/realtime-state-machines.md (Part 6) and
+# docs/design/specs/README.md.
+test-realtime-conformance:
+	GOCMD=$(GOCMD) ./scripts/realtime-conformance.sh
+
+# Install the pinned, checksum-verified FizzBee model checker (into .tools/,
+# gitignored) used by test-realtime-conformance. Idempotent; no-op if present.
+install-fizzbee:
+	./scripts/install-fizzbee.sh
+
 # Container-based real-model realtime testing. Build env vars / pipeline
 # definition kept here so test-realtime-models-docker can drive a fully wired
 # pipeline (VAD + STT + LLM + TTS) from inside a containerised runner.
@@ -1027,7 +1039,7 @@ test-extra-backend-whisper-transcription: docker-build-whisper
 ## is reachable.
 test-extra-backend-parakeet-cpp-transcription: docker-build-parakeet-cpp
 	BACKEND_IMAGE=local-ai-backend:parakeet-cpp \
-	BACKEND_TEST_MODEL_URL=https://huggingface.co/mudler/parakeet-cpp-gguf/resolve/main/tdt_ctc-110m-f16.gguf \
+	BACKEND_TEST_MODEL_URL=https://huggingface.co/mudler/parakeet-cpp-gguf/resolve/main/realtime_eou_120m-v1-f16.gguf \
 	BACKEND_TEST_AUDIO_URL=https://github.com/ggml-org/whisper.cpp/raw/master/samples/jfk.wav \
 	BACKEND_TEST_CAPS=health,load,transcription \
 	$(MAKE) test-extra-backend
--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -18,6 +18,18 @@ service Backend {
  rpc GenerateVideo(GenerateVideoRequest) returns (Result) {}
  rpc AudioTranscription(TranscriptRequest) returns (TranscriptResult) {}
  rpc AudioTranscriptionStream(TranscriptRequest) returns (stream TranscriptStreamResponse) {}
+  // AudioTranscriptionLive is the bidirectional live-microphone ASR RPC. The
+  // first message MUST carry a Config; subsequent messages carry Audio frames
+  // (mono float PCM at config.sample_rate, 16 kHz default). After a
+  // successful open the backend replies with a single ready ack
+  // (TranscriptLiveResponse{ready:true}); backends or models without
+  // cache-aware streaming support return UNIMPLEMENTED instead. Newly
+  // finalized text streams back as deltas; eou=true marks the model's
+  // end-of-utterance token. One stream spans many utterances (the decoder
+  // resets itself after each EOU). Closing the send side finalizes: the
+  // backend flushes the decoder tail and emits a terminal message carrying
+  // final_result. A second Config mid-stream resets the decode session.
+  rpc AudioTranscriptionLive(stream TranscriptLiveRequest) returns (stream TranscriptLiveResponse) {}
  rpc TTS(TTSRequest) returns (Result) {}
  rpc TTSStream(TTSRequest) returns (stream Reply) {}
  rpc SoundGeneration(SoundGenerationRequest) returns (Result) {}
@@ -479,6 +491,10 @@ message TranscriptResult {
  string text = 2;
  string language = 3;
  float duration = 4;
+  // True when the decode ended on the model's end-of-utterance special token
+  // (<EOU>/<EOB>, emitted by cache-aware streaming models such as
+  // parakeet_realtime_eou_120m-v1). The marker itself is stripped from text.
+  bool eou = 5;
 }

 message TranscriptStreamResponse {
@@ -486,6 +502,34 @@ message TranscriptStreamResponse {
  TranscriptResult final_result = 2;
 }

+// === AudioTranscriptionLive messages =====================================
+
+message TranscriptLiveRequest {
+  oneof payload {
+    TranscriptLiveConfig config = 1;
+    TranscriptLiveAudio  audio  = 2;
+  }
+}
+
+message TranscriptLiveConfig {
+  string language = 1;             // "" => model default
+  int32 sample_rate = 2;           // 0 => 16000; backends may reject others
+  map<string, string> params = 3;  // backend-specific tuning
+}
+
+message TranscriptLiveAudio {
+  repeated float pcm = 1;          // mono PCM in [-1,1] at config.sample_rate
+}
+
+message TranscriptLiveResponse {
+  bool ready = 1;                       // open ack: sent once, before any delta
+  string delta = 2;                     // newly-finalized text since previous response
+  bool eou = 3;                         // <EOU> fired during this feed (the user yielded the turn)
+  repeated TranscriptWord words = 4;    // words finalized by this feed (stream-relative ns)
+  TranscriptResult final_result = 5;    // terminal message only, after the send side closes
+  bool eob = 6;                         // <EOB> fired: a backchannel ("uh-huh") ended — NOT a turn boundary
+}
+
 message TranscriptWord {
  int64 start = 1;
  int64 end = 2;
--- a/backend/cpp/ik-llama-cpp/Makefile
+++ b/backend/cpp/ik-llama-cpp/Makefile
@@ -1,5 +1,5 @@

-IK_LLAMA_VERSION?=f96eaddba8bed6a9a5e628bbf6a566775c70b49c
+IK_LLAMA_VERSION?=f74a6fb87b315b2c3154166e075360e15021a61d
 LLAMA_REPO?=https://github.com/ikawrakow/ik_llama.cpp

 CMAKE_ARGS?=
--- a/backend/cpp/llama-cpp/Makefile
+++ b/backend/cpp/llama-cpp/Makefile
@@ -1,5 +1,5 @@

-LLAMA_VERSION?=0ed235ea2c17a19fc8238668653946721ed136fd
+LLAMA_VERSION?=6f4f53f2b7da54fcdbbecaaa734337c337ad6176
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp

 CMAKE_ARGS?=
--- a/backend/cpp/privacy-filter/Makefile
+++ b/backend/cpp/privacy-filter/Makefile
@@ -8,7 +8,7 @@
 # Local development: point at a working checkout instead of cloning, e.g.
 #   make PRIVACY_FILTER_SRC=$HOME/c/privacy-filter.cpp grpc-server

-PRIVACY_FILTER_VERSION?=98f52c5ef2250f207cc6b9a6aef05393a120cb7c
+PRIVACY_FILTER_VERSION?=595f59630c69d361b5196f2aba2c71c873d0c13c
 PRIVACY_FILTER_REPO?=https://github.com/localai-org/privacy-filter.cpp
 PRIVACY_FILTER_SRC?=

--- a/backend/go/crispasr/Makefile
+++ b/backend/go/crispasr/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # CrispASR version (release tag)
 CRISPASR_REPO?=https://github.com/CrispStrobe/CrispASR
-CRISPASR_VERSION?=6514c9da00b03a2f0f1b49a43fae4f3a01a41844
+CRISPASR_VERSION?=3b93758f9725d400eca82976f895e4cec3f31260
 SO_TARGET?=libgocrispasr.so

 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
--- a/backend/go/face-detect/Makefile
+++ b/backend/go/face-detect/Makefile
@@ -1,6 +1,6 @@
 # face-detect backend Makefile.
 #
-# Upstream pin lives below as FACEDETECT_VERSION?=06914b0... (.github/bump_deps.sh
+# Upstream pin lives below as FACEDETECT_VERSION?=e22260d5d5490b37b021b7f795079f386d553afd (.github/bump_deps.sh
 # can find and update it - matches the voice-detect / parakeet.cpp / whisper.cpp
 # convention).
 #
@@ -14,7 +14,7 @@
 # The default target below does the proper clone-at-pin + cmake build so CI does
 # not need a side-checkout.

-FACEDETECT_VERSION?=06914b077d52f90d5421299138e7be6bdd06b5e8
+FACEDETECT_VERSION?=e22260d5d5490b37b021b7f795079f386d553afd
 FACEDETECT_REPO?=https://github.com/mudler/face-detect.cpp

 GOCMD?=go
--- a/backend/go/parakeet-cpp/boundary.go
+++ b/backend/go/parakeet-cpp/boundary.go
@@ -0,0 +1,81 @@
+package main
+
+// utteranceBoundary is the single definition of a small state machine that was
+// previously open-coded three times — as a bare `finalEou` bool with an ad-hoc
+// toggle — in the live feed (live.go), the file-stream text path, and the
+// file-stream JSON path (goparakeetcpp.go).
+//
+// It answers one running question: does the decode currently rest on an
+// end-of-utterance boundary? That is the value a closing FinalResult reports as
+// .Eou and the realtime turn detector treats as a commit point.
+//
+// parakeet auto-resets its decoder after every <EOU>/<EOB>, so one streaming
+// session is a sequence of utterances and this is a LATCH, not a monotonic
+// flag: it closes on an <EOU> and reopens as soon as the next utterance starts.
+// (Contrast the realtime API's per-turn `eouSeen`, which only ever goes
+// false->true because each turn gets a fresh stream. Here the stream outlives
+// the turn, so the boundary status must be able to reopen.)
+//
+// The only transitions, over the events one streamFeedResult carries — an
+// <EOU>, an <EOB> (backchannel), or plain speech output (text and/or words):
+//
+//	            <EOU>
+//	   open ───────────► closed
+//	    ▲ ▲ │             │ │
+//	    │ └─┘ <EOB>|speech │ │ <EOU>
+//	    │   (stay open)    │ └─┘ (stay closed)
+//	    └──────────────────┘
+//	         <EOB>|speech
+//
+//	open   = NOT on an utterance boundary: mid-utterance, the last boundary was
+//	         a backchannel <EOB>, or the stream just began (the initial state).
+//	closed = the last meaningful event was an <EOU> with no later speech: a real
+//	         turn boundary.
+//
+// A feed that carries nothing (no eou/eob/text/words — e.g. a finalize flush
+// that produced no tail) is a no-op and leaves the state unchanged, matching
+// the legacy "leave finalEou as it was" behaviour.
+//
+// The state carries no data, so it is modelled as a two-valued type (a named
+// bool) rather than an int enum: every inhabitant is legal, so illegal states
+// are unrepresentable — the payload-free analog of the sealed sum types the
+// realtime machines use (those need interfaces because their states carry data,
+// e.g. Active{ID}, where "Active with no ID" is the illegal combination a scalar
+// cannot even express).
+type utteranceBoundary bool
+
+const (
+	// boundaryOpen is the zero value (false), so a fresh decode starts open —
+	// exactly the legacy `var finalEou bool` (false) initial condition.
+	boundaryOpen   utteranceBoundary = false
+	boundaryClosed utteranceBoundary = true
+)
+
+// observe folds one decode increment into the latch and returns the new state.
+//
+// <EOU> takes priority when a single feed carries both an <EOU> and speech
+// (e.g. {"text":"hello","eou":1}): the utterance both produced that text AND
+// ended, so the decode rests on the boundary. This matches the legacy
+// eou-checked-first ordering at every call site.
+func (b utteranceBoundary) observe(r streamFeedResult) utteranceBoundary {
+	switch {
+	case r.Eou:
+		return boundaryClosed
+	case r.Eob || r.Delta != "" || len(r.Words) > 0:
+		return boundaryOpen
+	default:
+		return b
+	}
+}
+
+// ended reports whether the decode currently rests on an end-of-utterance
+// boundary (a real <EOU>, not a backchannel <EOB>). This is what a closing
+// FinalResult carries as .Eou.
+func (b utteranceBoundary) ended() bool { return b == boundaryClosed }
+
+func (b utteranceBoundary) String() string {
+	if b == boundaryClosed {
+		return "closed"
+	}
+	return "open"
+}
--- a/backend/go/parakeet-cpp/boundary_test.go
+++ b/backend/go/parakeet-cpp/boundary_test.go
@@ -0,0 +1,92 @@
+package main
+
+import (
+	"math/rand/v2"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("utteranceBoundary (decode end-of-utterance latch)", func() {
+	It("starts open: a fresh decode is not on a boundary", func() {
+		var b utteranceBoundary
+		Expect(b).To(Equal(boundaryOpen))
+		Expect(b.ended()).To(BeFalse())
+	})
+
+	DescribeTable("single feed transition from the open state",
+		func(r streamFeedResult, wantEnded bool) {
+			Expect(boundaryOpen.observe(r).ended()).To(Equal(wantEnded))
+		},
+		Entry("<EOU> closes it", streamFeedResult{Eou: true}, true),
+		Entry("<EOU> with text closes it (eou wins)", streamFeedResult{Delta: "hi", Eou: true}, true),
+		Entry("<EOB> stays open (backchannel is not a turn boundary)", streamFeedResult{Eob: true}, false),
+		Entry("plain text stays open", streamFeedResult{Delta: "hello"}, false),
+		Entry("words-only stays open", streamFeedResult{Words: []transcriptWord{{W: "x"}}}, false),
+		Entry("empty feed is a no-op (stays open)", streamFeedResult{}, false),
+	)
+
+	DescribeTable("single feed transition from the closed state",
+		func(r streamFeedResult, wantEnded bool) {
+			Expect(boundaryClosed.observe(r).ended()).To(Equal(wantEnded))
+		},
+		Entry("another <EOU> stays closed", streamFeedResult{Eou: true}, true),
+		Entry("trailing speech reopens it", streamFeedResult{Delta: "and more"}, false),
+		Entry("words reopen it", streamFeedResult{Words: []transcriptWord{{W: "x"}}}, false),
+		Entry("a backchannel <EOB> reopens it", streamFeedResult{Eob: true}, false),
+		Entry("empty feed is a no-op (stays closed)", streamFeedResult{}, true),
+	)
+
+	It("is a latch: <EOU> then trailing speech reopens, then <EOU> closes again", func() {
+		b := boundaryOpen
+		b = b.observe(streamFeedResult{Delta: "turn one", Eou: true})
+		Expect(b.ended()).To(BeTrue())
+		b = b.observe(streamFeedResult{Delta: " and more"})
+		Expect(b.ended()).To(BeFalse(), "trailing speech without an EOU is an open utterance")
+		b = b.observe(streamFeedResult{Eou: true})
+		Expect(b.ended()).To(BeTrue())
+	})
+
+	It("treats a backchannel before a real EOU correctly", func() {
+		b := boundaryOpen
+		b = b.observe(streamFeedResult{Delta: "uh huh", Eob: true})
+		Expect(b.ended()).To(BeFalse(), "a backchannel must not masquerade as a turn boundary")
+		b = b.observe(streamFeedResult{Delta: "done", Eou: true})
+		Expect(b.ended()).To(BeTrue())
+	})
+
+	It("matches the reference fold over seeded random feed sequences", func() {
+		// The invariant: after any sequence of feeds, ended() is true iff the
+		// last feed that carried ANY event was an <EOU>. <EOU> takes priority
+		// when a feed carries both an EOU and speech; empty feeds are ignored.
+		for seed := uint64(1); seed <= 200; seed++ {
+			rng := rand.New(rand.NewPCG(seed, seed*2654435761))
+			b := boundaryOpen
+			lastWasEou := false // reference: did the last meaningful feed end on EOU?
+			steps := rng.IntN(30)
+			for i := 0; i < steps; i++ {
+				var r streamFeedResult
+				switch rng.IntN(5) {
+				case 0:
+					r = streamFeedResult{Eou: true}
+				case 1:
+					r = streamFeedResult{Eob: true}
+				case 2:
+					r = streamFeedResult{Delta: "w"}
+				case 3:
+					r = streamFeedResult{Delta: "w", Eou: true} // eou + speech, eou wins
+				case 4:
+					r = streamFeedResult{} // empty: no-op
+				}
+				b = b.observe(r)
+				if r.Eou {
+					lastWasEou = true
+				} else if r.Eob || r.Delta != "" || len(r.Words) > 0 {
+					lastWasEou = false
+				}
+			}
+			Expect(b.ended()).To(Equal(lastWasEou),
+				"seed %d: latch disagreed with the reference fold", seed)
+		}
+	})
+})
--- a/backend/go/parakeet-cpp/driver.go
+++ b/backend/go/parakeet-cpp/driver.go
@@ -0,0 +1,82 @@
+package main
+
+import (
+	"context"
+
+	"google.golang.org/grpc/codes"
+	"google.golang.org/grpc/status"
+)
+
+// streamFeedResult is one decode increment from a cache-aware streaming session:
+// the newly-finalized text plus the model's own per-feed boundary tokens
+// (<EOU>/<EOB>) and word timings. It is the single event type both the live
+// (bidi) and file (server-stream) paths fold over, hiding the ABI v4 JSON vs
+// older text-only entry-point split behind one shape.
+type streamFeedResult struct {
+	Delta string
+	Eou   bool
+	Eob   bool
+	Words []transcriptWord
+}
+
+// feedChunk feeds one PCM chunk to the streaming session (or finalizes it, when
+// finalize is true) and returns the unified decode increment. It prefers the
+// ABI v4 JSON entry points (which also carry per-word timestamps) and falls
+// back to the older text-only entry points against an older libparakeet.so.
+//
+// This is the one place the JSON-vs-text choice is made; every consumer works
+// in terms of streamFeedResult.
+func (p *ParakeetCpp) feedChunk(stream uintptr, pcm []float32, finalize bool) (streamFeedResult, error) {
+	if CppStreamFeedJSON != nil {
+		doc, err := p.streamFeedDoc(stream, pcm, finalize)
+		if err != nil {
+			return streamFeedResult{}, err
+		}
+		return streamFeedResult{Delta: doc.Text, Eou: doc.Eou != 0, Eob: doc.Eob != 0, Words: doc.Words}, nil
+	}
+	delta, eou, eob, err := p.streamFeedText(stream, pcm, finalize)
+	if err != nil {
+		return streamFeedResult{}, err
+	}
+	return streamFeedResult{Delta: delta, Eou: eou, Eob: eob}, nil
+}
+
+// feedSlices feeds pcm through the session in streamChunkSamples slices,
+// invoking onFeed for each decode increment. It does NOT finalize: callers
+// decide when the send side is done. The file path finalizes after the whole
+// file; the live path finalizes only when its request channel closes, never
+// between audio messages. Slicing keeps each per-call engineMu hold short so
+// concurrent unary transcription interleaves fairly (the C session buffers
+// internally).
+//
+// If ctx is non-nil it is checked before each slice so a cancelled file
+// transcription stops promptly; the live path passes nil (it is bounded by its
+// request channel instead of a ctx).
+func (p *ParakeetCpp) feedSlices(ctx context.Context, stream uintptr, pcm []float32, onFeed func(streamFeedResult) error) error {
+	for off := 0; off < len(pcm); off += streamChunkSamples {
+		if ctx != nil {
+			if err := ctx.Err(); err != nil {
+				return status.Error(codes.Canceled, "transcription cancelled")
+			}
+		}
+		end := min(off+streamChunkSamples, len(pcm))
+		res, err := p.feedChunk(stream, pcm[off:end], false)
+		if err != nil {
+			return err
+		}
+		if err := onFeed(res); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// flushTail finalizes the session once and folds the flushed tail (the last
+// ~2 encoder frames of text, which only appear on finalize) through onFeed.
+func (p *ParakeetCpp) flushTail(stream uintptr, onFeed func(streamFeedResult) error) error {
+	res, err := p.feedChunk(stream, nil, true)
+	if err != nil {
+		return err
+	}
+	return onFeed(res)
+}
--- a/backend/go/parakeet-cpp/goparakeetcpp.go
+++ b/backend/go/parakeet-cpp/goparakeetcpp.go
@@ -103,12 +103,13 @@ type transcriptJSON struct {
 //	{"text":"...","eou":0,"eob":0,"frame_sec":0.080000,
 //	 "words":[{"w":"...","start":0.480,"end":0.640,"conf":0.9100}, ...]}
 //
-// "text" is the newly-finalized text since the last call; "eou" is 1 when an
-// <EOU> (end of utterance) fired this feed and "eob" is 1 when an <EOB>
-// (backchannel) fired. ABI v4 conflated the two into "eou"; v5 split them, so
-// we read both and treat either as an utterance boundary for segmentation.
-// "words" are the words finalized this call with absolute (stream-relative)
-// start/end seconds.
+// "text" is the newly-finalized text since the last call. Under ABI v5 "eou"
+// is 1 iff an <EOU> fired this feed (the user yielded the turn) and "eob" 1
+// iff an <EOB> fired (a backchannel like "uh-huh" ended — NOT a turn
+// boundary). A v4 library has no "eob" field and its "eou" conflates both
+// tokens: Eob stays 0 and Eou keeps the old any-event meaning. "words" are
+// the words finalized this call with absolute (stream-relative) start/end
+// seconds.
 type streamFeedJSON struct {
 	Text     string           `json:"text"`
 	Eou      int              `json:"eou"`
@@ -364,7 +365,7 @@ var segmentSeparators = []rune{'.', '?', '!'}
 // the caller requested word granularity; token ids populate each segment's
 // Tokens by time-window membership. Shared by the batched and direct paths.
 func transcriptResultFromDoc(doc transcriptJSON, opts *pb.TranscriptRequest, gapFrames int) pb.TranscriptResult {
-	text := strings.TrimSpace(doc.Text)
+	text, eou := stripEouMarker(strings.TrimSpace(doc.Text))

 	// Frame-unit gap threshold -> seconds (NeMo segment_gap_threshold). 0 = off.
 	gapSeconds := 0.0
@@ -383,6 +384,7 @@ func transcriptResultFromDoc(doc transcriptJSON, opts *pb.TranscriptRequest, gap
 		return pb.TranscriptResult{
 			Text:     text,
 			Segments: []*pb.TranscriptSegment{{Id: 0, Text: text}},
+			Eou:      eou,
 		}
 	}

@@ -409,7 +411,25 @@ func transcriptResultFromDoc(doc transcriptJSON, opts *pb.TranscriptRequest, gap
 		}
 		segments = append(segments, seg)
 	}
-	return pb.TranscriptResult{Text: text, Segments: segments}
+	return pb.TranscriptResult{Text: text, Segments: segments, Eou: eou}
+}
+
+// stripEouMarker removes a trailing literal <EOU>/<EOB> from offline-decode
+// text and reports whether the decode ended on an end-of-UTTERANCE token. The
+// realtime EOU model's offline decode keeps the special token in the
+// detokenized text (the streaming path strips it and surfaces it as flags
+// instead); user-visible transcripts must never carry either marker, but only
+// <EOU> may confirm the semantic_vad retranscribe cross-check — a decode
+// ending on <EOB> means the last thing heard was a backchannel, not the user
+// yielding the turn.
+func stripEouMarker(text string) (string, bool) {
+	if strings.HasSuffix(text, "<EOU>") {
+		return strings.TrimSpace(strings.TrimSuffix(text, "<EOU>")), true
+	}
+	if strings.HasSuffix(text, "<EOB>") {
+		return strings.TrimSpace(strings.TrimSuffix(text, "<EOB>")), false
+	}
+	return text, false
 }

 // splitWordsIntoSegments groups words into segments exactly as NeMo's
@@ -476,41 +496,55 @@ func tokensInWindow(tokens []transcriptToken, start, end float64) []int32 {
 	return ids
 }

-// streamSegmenter accumulates streaming words into per-utterance segments. EOU
-// is the model's own utterance boundary; each closed segment takes its start/end
-// from its first/last accumulated word.
+// streamSegmenter accumulates streaming decode increments into per-utterance
+// segments. <EOU>/<EOB> are the model's own utterance boundaries; each closes a
+// segment. When the feed carries per-word timings (ABI v4 JSON), a closed
+// segment takes its start/end from its first/last word; against an older
+// text-only library (no words) it falls back to segmenting the delta text, so
+// the same assembler serves both paths.
 type streamSegmenter struct {
-	segs   []*pb.TranscriptSegment
-	cur    []transcriptWord
-	nextID int32
+	segs    []*pb.TranscriptSegment
+	cur     []transcriptWord // words for the open segment (ABI v4 JSON path)
+	curText []string         // delta text for the open segment (text-only path)
+	nextID  int32
 }

-func (s *streamSegmenter) add(doc streamFeedJSON) {
-	s.cur = append(s.cur, doc.Words...)
-	// Close the segment on either turn signal: <EOU> (end of utterance) or
-	// <EOB> (backchannel). ABI v4 reported both via "eou"; v5 split them, so we
-	// OR them here to keep the v4 segmentation boundaries.
-	if doc.Eou != 0 || doc.Eob != 0 {
+func (s *streamSegmenter) add(r streamFeedResult) {
+	s.cur = append(s.cur, r.Words...)
+	if len(r.Words) == 0 && r.Delta != "" {
+		// Older libparakeet.so with no per-word timing: segment from the text.
+		s.curText = append(s.curText, r.Delta)
+	}
+	// Both <EOU> and <EOB> reset the decoder, so both close a segment.
+	if r.Eou || r.Eob {
 		s.flush()
 	}
 }

 func (s *streamSegmenter) flush() {
-	if len(s.cur) == 0 {
-		return
+	switch {
+	case len(s.cur) > 0:
+		parts := make([]string, len(s.cur))
+		for i, w := range s.cur {
+			parts[i] = w.W
+		}
+		s.segs = append(s.segs, &pb.TranscriptSegment{
+			Id:    s.nextID,
+			Start: secondsToNanos(s.cur[0].Start),
+			End:   secondsToNanos(s.cur[len(s.cur)-1].End),
+			Text:  strings.TrimSpace(strings.Join(parts, " ")),
+		})
+		s.nextID++
+	case len(s.curText) > 0:
+		// No words this segment: emit a text-only segment (no timestamps),
+		// skipping a purely-whitespace one as the legacy text path did.
+		if t := strings.TrimSpace(strings.Join(s.curText, "")); t != "" {
+			s.segs = append(s.segs, &pb.TranscriptSegment{Id: s.nextID, Text: t})
+			s.nextID++
+		}
 	}
-	parts := make([]string, len(s.cur))
-	for i, w := range s.cur {
-		parts[i] = w.W
-	}
-	s.segs = append(s.segs, &pb.TranscriptSegment{
-		Id:    s.nextID,
-		Start: secondsToNanos(s.cur[0].Start),
-		End:   secondsToNanos(s.cur[len(s.cur)-1].End),
-		Text:  strings.TrimSpace(strings.Join(parts, " ")),
-	})
-	s.nextID++
 	s.cur = nil
+	s.curText = nil
 }

 func (s *streamSegmenter) segments() []*pb.TranscriptSegment { return s.segs }
@@ -535,18 +569,119 @@ func secondsToNanos(sec float64) int64 {
 	return int64(sec * 1e9)
 }

+// Per-C-call engine serialization for the streaming paths.
+//
+// Every individual C call (begin / feed / finalize / free) takes engineMu and
+// re-checks ctxPtr under the lock; the lock is NEVER held across a stream's
+// lifetime. This is safe because each parakeet.cpp call builds its own ggml
+// graph and all streaming caches live in the session object, not the ctx —
+// the only ctx-shared mutable state is last_error, which is why it is read
+// under the same lock as the failing call. Holding the lock per call (rather
+// than per stream, as this file previously did) keeps a long-lived live
+// session from starving batched unary transcription and vice versa.
+//
+// A stream must not outlive its ctx (C-API contract). Free() takes engineMu
+// and zeroes ctxPtr, so a racing per-call helper returns ModelNotLoaded
+// instead of feeding a freed engine; streamFree of an orphaned session only
+// runs the session destructor, which does not touch the ctx.
+
+// streamBegin opens a cache-aware streaming session. A 0 stream with nil
+// error means the loaded model is not a streaming model.
+func (p *ParakeetCpp) streamBegin(lang string) (uintptr, error) {
+	p.engineMu.Lock()
+	defer p.engineMu.Unlock()
+	if p.ctxPtr == 0 { // checked under engineMu, which Free() also takes before zeroing ctxPtr
+		return 0, grpcerrors.ModelNotLoaded("parakeet-cpp")
+	}
+	if CppStreamBeginLang != nil { // the language-aware begin is used whenever that entry point is non-nil
+		return CppStreamBeginLang(p.ctxPtr, lang), nil
+	}
+	return CppStreamBegin(p.ctxPtr), nil // lang is not passed on this path
+}
+
+func (p *ParakeetCpp) streamFree(stream uintptr) { // releases a session handle under engineMu
+	if stream == 0 { // 0 is the "no session" value streamBegin can return; nothing to free
+		return
+	}
+	p.engineMu.Lock()
+	defer p.engineMu.Unlock()
+	CppStreamFree(stream) // no ctxPtr check here, unlike the begin/feed helpers
+}
+
+// streamFeedText runs one text-mode feed (or the finalize flush when
+// finalize is true) under engineMu, returning the newly-finalized delta and
+// whether an <EOU>/<EOB> fired during the call (always false on finalize).
+func (p *ParakeetCpp) streamFeedText(stream uintptr, pcm []float32, finalize bool) (delta string, eou, eob bool, err error) {
+	p.engineMu.Lock()
+	defer p.engineMu.Unlock()
+	if p.ctxPtr == 0 {
+		return "", false, false, grpcerrors.ModelNotLoaded("parakeet-cpp")
+	}
+	var ret uintptr
+	var events int32 // written only by CppStreamFeed through the out-pointer; stays 0 on the finalize path
+	if finalize {
+		ret = CppStreamFinalize(stream) // takes no events out-param, so eou/eob are reported as false below
+	} else {
+		ret = CppStreamFeed(stream, pcm, int32(len(pcm)), unsafe.Pointer(&events))
+	}
+	if ret == 0 {
+		// last_error is ctx-shared: read it under the same lock as the call.
+		msg := CppLastError(p.ctxPtr)
+		if msg == "" {
+			msg = "unknown error"
+		}
+		return "", false, false, fmt.Errorf("parakeet-cpp: stream feed/finalize failed: %s", msg)
+	}
+	delta = goStringFromCPtr(ret) // copy into a Go string before the C buffer is released
+	CppFreeString(ret)
+	// ABI v5: eou_out is a bitmask (bit 0 = <EOU>, bit 1 = <EOB>). A v4
+	// library sets 0/1 for either token, which the bit-0 test reads as the
+	// old conflated eou — the EOB distinction simply isn't available there.
+	return delta, events&1 != 0, events&2 != 0, nil
+}
+
+// streamFeedDoc runs one JSON feed (or the JSON finalize) under engineMu and
+// returns the parsed {text,eou,eob,frame_sec,words} document (ABI v4+; eob since v5).
+func (p *ParakeetCpp) streamFeedDoc(stream uintptr, pcm []float32, finalize bool) (streamFeedJSON, error) {
+	p.engineMu.Lock()
+	defer p.engineMu.Unlock()
+	if p.ctxPtr == 0 {
+		return streamFeedJSON{}, grpcerrors.ModelNotLoaded("parakeet-cpp")
+	}
+	var ret uintptr
+	if finalize {
+		ret = CppStreamFinalizeJSON(stream)
+	} else {
+		ret = CppStreamFeedJSON(stream, pcm, int32(len(pcm)))
+	}
+	if ret == 0 { // a 0 return is the failure signal; the reason is fetched from the ctx under the same lock
+		msg := CppLastError(p.ctxPtr)
+		if msg == "" {
+			msg = "unknown error"
+		}
+		return streamFeedJSON{}, fmt.Errorf("parakeet-cpp: stream feed/finalize failed: %s", msg)
+	}
+	raw := goStringFromCPtr(ret) // copy into a Go string before the C buffer is released
+	CppFreeString(ret)
+	var doc streamFeedJSON
+	if err := json.Unmarshal([]byte(raw), &doc); err != nil {
+		return streamFeedJSON{}, fmt.Errorf("parakeet-cpp: decode stream json: %w", err)
+	}
+	return doc, nil
+}
+
 // AudioTranscriptionStream drives the cache-aware streaming RNN-T over the
-// audio at opts.Dst: it decodes the file to 16 kHz mono PCM, feeds it in
-// chunks to parakeet_capi_stream_feed, and emits each newly-finalized text
-// run as a TranscriptStreamResponse delta. <EOU>/<EOB> events close the
-// current segment; a closing FinalResult carries the full transcript and the
-// per-utterance segments.
+// audio at opts.Dst: it decodes the file to 16 kHz mono PCM, feeds it through
+// the shared decode driver (feedSlices/flushTail), and emits each
+// newly-finalized text run as a TranscriptStreamResponse delta. <EOU>/<EOB>
+// events close the current segment; a closing FinalResult carries the full
+// transcript, the per-utterance segments, and whether the file ended on an
+// utterance boundary.
 //
 // stream_begin returns 0 for models that are not cache-aware streaming models
-// (only e.g. nvidia/parakeet_realtime_eou_120m-v1 qualifies). For those we fall
-// back to a single offline transcription emitted as one delta plus a closing
-// FinalResult, matching LocalAI's non-streaming streaming contract (and the
-// whisper backend), so the streaming endpoint works for every model.
+// (only e.g. nvidia/parakeet_realtime_eou_120m-v1 qualifies). For those this
+// returns codes.Unimplemented rather than faking a stream from an offline
+// decode — see the stream==0 branch and grpcerrors.StreamTranscriptionUnsupported.
 func (p *ParakeetCpp) AudioTranscriptionStream(ctx context.Context, opts *pb.TranscriptRequest, results chan *pb.TranscriptStreamResponse) error {
 	defer close(results)

@@ -560,185 +695,73 @@ func (p *ParakeetCpp) AudioTranscriptionStream(ctx context.Context, opts *pb.Tra
 		return status.Error(codes.Canceled, "transcription cancelled")
 	}

-	var stream uintptr
-	if CppStreamBeginLang != nil {
-		stream = CppStreamBeginLang(p.ctxPtr, opts.GetLanguage())
-	} else {
-		stream = CppStreamBegin(p.ctxPtr)
+	stream, err := p.streamBegin(opts.GetLanguage())
+	if err != nil {
+		return err
 	}
 	if stream == 0 {
-		// Not a cache-aware streaming model: run a normal offline
-		// transcription and emit it as one delta + a closing final result.
-		res, err := p.AudioTranscription(ctx, opts)
-		if err != nil {
-			return err
-		}
-		if t := strings.TrimSpace(res.Text); t != "" {
-			results <- &pb.TranscriptStreamResponse{Delta: t}
-		}
-		results <- &pb.TranscriptStreamResponse{FinalResult: &res}
-		return nil
+		// Not a cache-aware streaming model. Report the missing capability
+		// honestly instead of decoding offline and emitting it as one "delta"
+		// + final: a client that asked for streaming must learn the model
+		// cannot stream, not receive a batch result dressed as a stream (which
+		// is indistinguishable except qualitatively, and silently breaks any
+		// feature that genuinely needs incremental output). Callers wanting a
+		// plain transcript use the unary AudioTranscription path. This mirrors
+		// AudioTranscriptionLive, which already returns Unimplemented here.
+		return grpcerrors.StreamTranscriptionUnsupported("parakeet-cpp",
+			"loaded model is not a cache-aware streaming model")
 	}
-	defer CppStreamFree(stream)
-	// The C engine is a single shared context: a streaming session and a batched
-	// unary dispatch must never touch it at once, so hold engineMu for the whole
-	// stream. This lock is intentionally taken AFTER the non-streaming fallback
-	// above returns: that fallback goes through AudioTranscription -> the batcher
-	// -> runBatch, which itself acquires engineMu, so locking here first would
-	// deadlock. Do not hoist this lock above the fallback.
-	p.engineMu.Lock()
-	defer p.engineMu.Unlock()
+	defer p.streamFree(stream)

 	data, duration, err := decodeWavMono16k(opts.Dst)
 	if err != nil {
 		return err
 	}

-	// ABI v4: when the streaming JSON entry points are present, drive them so the
-	// per-utterance segments carry per-word start/end timestamps. Falls through to
-	// the text-only loop below against an older libparakeet.so. Runs under the
-	// engineMu already held above.
-	if CppStreamFeedJSON != nil {
-		return p.streamJSON(ctx, stream, data, duration, results)
-	}
-
+	// Fold the shared decode driver's per-feed increments into the streamed
+	// deltas and the closing batch result: words/text accumulate into
+	// per-utterance segments (streamSegmenter), and the utterance-boundary
+	// latch (boundary.go) records whether the file ended on an <EOU>. These
+	// are the offline path's concern — the live RPC carries none of them.
 	var (
 		full     strings.Builder
-		segText  strings.Builder
-		segments []*pb.TranscriptSegment
-		segID    int32
+		seg      streamSegmenter
+		boundary utteranceBoundary
 	)
-
-	flushSegment := func() {
-		t := strings.TrimSpace(segText.String())
-		segText.Reset()
-		if t == "" {
-			return
+	emit := func(r streamFeedResult) error {
+		if r.Delta != "" {
+			full.WriteString(r.Delta)
+			results <- &pb.TranscriptStreamResponse{Delta: r.Delta}
 		}
-		segments = append(segments, &pb.TranscriptSegment{Id: segID, Text: t})
-		segID++
-	}
-
-	// emitDelta consumes the malloc'd char* returned by feed/finalize: frees
-	// it, accumulates the text, and sends a delta when non-empty. A 0 return
-	// is an error (vs the "" empty-but-non-NULL no-new-text case).
-	emitDelta := func(ret uintptr) error {
-		if ret == 0 {
-			msg := CppLastError(p.ctxPtr)
-			if msg == "" {
-				msg = "unknown error"
-			}
-			return fmt.Errorf("parakeet-cpp: stream feed/finalize failed: %s", msg)
-		}
-		delta := goStringFromCPtr(ret)
-		CppFreeString(ret)
-		if delta == "" {
-			return nil
-		}
-		full.WriteString(delta)
-		segText.WriteString(delta)
-		results <- &pb.TranscriptStreamResponse{Delta: delta}
+		seg.add(r)
+		boundary = boundary.observe(r)
 		return nil
 	}

-	for off := 0; off < len(data); off += streamChunkSamples {
-		if err := ctx.Err(); err != nil {
-			return status.Error(codes.Canceled, "transcription cancelled")
-		}
-		end := min(off+streamChunkSamples, len(data))
-		chunk := data[off:end]
-
-		var eou int32
-		ret := CppStreamFeed(stream, chunk, int32(len(chunk)), unsafe.Pointer(&eou))
-		if err := emitDelta(ret); err != nil {
-			return err
-		}
-		if eou != 0 {
-			flushSegment()
-		}
-	}
-
-	// Flush the streaming tail (final encoder chunk).
-	if err := emitDelta(CppStreamFinalize(stream)); err != nil {
+	if err := p.feedSlices(ctx, stream, data, emit); err != nil {
 		return err
 	}
-	flushSegment()
-
-	text := strings.TrimSpace(full.String())
-	if len(segments) == 0 && text != "" {
-		segments = append(segments, &pb.TranscriptSegment{Id: 0, Text: text})
-	}
-	results <- &pb.TranscriptStreamResponse{
-		FinalResult: &pb.TranscriptResult{
-			Text:     text,
-			Segments: segments,
-			Duration: duration,
-		},
-	}
-	return nil
-}
-
-// streamJSON drives the streaming JSON entry points (present since ABI v4): each
-// feed/finalize returns a {text,eou,eob,frame_sec,words} document. The
-// newly-finalized text is emitted as a delta (unchanged streaming contract)
-// while words are accumulated into per-utterance segments (closed on <EOU> or
-// <EOB>) so the closing FinalResult carries timestamped segments. Runs under
-// engineMu (already held by the caller).
-func (p *ParakeetCpp) streamJSON(ctx context.Context, stream uintptr, data []float32,
-	duration float32, results chan *pb.TranscriptStreamResponse) error {
-	var (
-		full strings.Builder
-		seg  streamSegmenter
-	)
-	// consume frees the malloc'd char* (a 0 return is an error), parses the JSON,
-	// emits the delta, and routes words through the segmenter.
-	consume := func(ret uintptr) error {
-		if ret == 0 {
-			msg := CppLastError(p.ctxPtr)
-			if msg == "" {
-				msg = "unknown error"
-			}
-			return fmt.Errorf("parakeet-cpp: stream feed/finalize failed: %s", msg)
-		}
-		raw := goStringFromCPtr(ret)
-		CppFreeString(ret)
-		var doc streamFeedJSON
-		if err := json.Unmarshal([]byte(raw), &doc); err != nil {
-			return fmt.Errorf("parakeet-cpp: decode stream json: %w", err)
-		}
-		if doc.Text != "" {
-			full.WriteString(doc.Text)
-			results <- &pb.TranscriptStreamResponse{Delta: doc.Text}
-		}
-		seg.add(doc)
-		return nil
-	}
-
-	for off := 0; off < len(data); off += streamChunkSamples {
-		if err := ctx.Err(); err != nil {
-			return status.Error(codes.Canceled, "transcription cancelled")
-		}
-		end := min(off+streamChunkSamples, len(data))
-		chunk := data[off:end]
-		if err := consume(CppStreamFeedJSON(stream, chunk, int32(len(chunk)))); err != nil {
-			return err
-		}
-	}
-	if err := consume(CppStreamFinalizeJSON(stream)); err != nil {
+	if err := p.flushTail(stream, emit); err != nil {
 		return err
 	}
-	seg.flush() // close any trailing utterance that never saw an EOU
+	seg.flush() // close a trailing utterance that never saw an <EOU>

-	text := strings.TrimSpace(full.String())
+	// final.Text is the exact concatenation of the streamed deltas (full is
+	// their accumulation), so concat(deltas) == FinalResult.Text holds even
+	// when the model prepends a leading space to the first word (SentencePiece
+	// detokenization). This matches the whisper backend's streaming contract.
+	// The single-segment fallback stays trimmed.
+	fullText := full.String()
 	segments := seg.segments()
-	if len(segments) == 0 && text != "" {
-		segments = append(segments, &pb.TranscriptSegment{Id: 0, Text: text})
+	if trimmed := strings.TrimSpace(fullText); len(segments) == 0 && trimmed != "" {
+		segments = append(segments, &pb.TranscriptSegment{Id: 0, Text: trimmed})
 	}
 	results <- &pb.TranscriptStreamResponse{
 		FinalResult: &pb.TranscriptResult{
-			Text:     text,
+			Text:     fullText,
 			Segments: segments,
 			Duration: duration,
+			Eou:      boundary.ended(),
 		},
 	}
 	return nil
@@ -803,6 +826,10 @@ func (p *ParakeetCpp) Free() error {
 		close(p.batStop)
 		p.batStop = nil
 	}
+	// engineMu so an in-flight streaming call (which locks per C call and
+	// re-checks ctxPtr under the lock) can never feed into a freed ctx.
+	p.engineMu.Lock()
+	defer p.engineMu.Unlock()
 	if p.ctxPtr != 0 {
 		CppFree(p.ctxPtr)
 		p.ctxPtr = 0
--- a/backend/go/parakeet-cpp/goparakeetcpp_test.go
+++ b/backend/go/parakeet-cpp/goparakeetcpp_test.go
@@ -14,6 +14,8 @@ import (
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
+	"google.golang.org/grpc/codes"
+	"google.golang.org/grpc/status"
 )

 func TestParakeetCpp(t *testing.T) {
@@ -201,6 +203,29 @@ var _ = Describe("ParakeetCpp", func() {
 	})

 	Context("AudioTranscriptionStream", func() {
+		It("returns the typed Unimplemented signal for non-streaming models (no offline fallback)", func() {
+			// stream_begin == 0 means the loaded model is not a cache-aware
+			// streaming model. The backend must surface that, not silently
+			// decode offline and fake a one-shot "stream".
+			savedBegin, savedBeginLang := CppStreamBegin, CppStreamBeginLang
+			defer func() { CppStreamBegin, CppStreamBeginLang = savedBegin, savedBeginLang }()
+			CppStreamBeginLang = nil
+			CppStreamBegin = func(ctx uintptr) uintptr { return 0 }
+
+			p := &ParakeetCpp{ctxPtr: 1}
+			results := make(chan *pb.TranscriptStreamResponse, 8)
+			err := p.AudioTranscriptionStream(context.Background(),
+				&pb.TranscriptRequest{Dst: "ignored.wav"}, results)
+			Expect(status.Code(err)).To(Equal(codes.Unimplemented))
+
+			// Honest signal: nothing was emitted — no faked batch result.
+			var emitted []*pb.TranscriptStreamResponse
+			for r := range results {
+				emitted = append(emitted, r)
+			}
+			Expect(emitted).To(BeEmpty())
+		})
+
 		It("streams deltas and a closing FinalResult from a cache-aware model", func() {
 			// Streaming needs a cache-aware streaming model (e.g.
 			// realtime_eou); the offline test model would fail stream_begin.
--- a/backend/go/parakeet-cpp/live.go
+++ b/backend/go/parakeet-cpp/live.go
@@ -0,0 +1,186 @@
+package main
+
+import (
+	"strings"
+	"time"
+
+	"github.com/mudler/LocalAI/pkg/grpc/grpcerrors"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	"github.com/mudler/xlog"
+	"google.golang.org/grpc/codes"
+	"google.golang.org/grpc/status"
+)
+
+// liveSampleRate is the only PCM rate the parakeet C streaming API accepts.
+const liveSampleRate = 16000
+
+// AudioTranscriptionLive drives one cache-aware streaming session over audio
+// fed incrementally by the caller (the realtime API's semantic_vad turn
+// detection). Contract:
+//
+//   - the first request must carry a Config; a Config mid-stream resets the
+//     decode session (free + begin) and drops accumulated transcript state;
+//   - a Ready ack is sent right after a successful stream_begin so callers
+//     can degrade synchronously when the model has no streaming support
+//     (LiveTranscriptionUnsupported, codes.Unimplemented);
+//   - every feed that produced output is forwarded as {delta, eou, eob, words};
+//     the <EOU>/<EOB> flag is the model's own utterance boundary and the
+//     decoder auto-resets after it, so one session spans many utterances;
+//   - closing the send side finalizes: the held-back tail chunk is flushed
+//     (the last ~2 encoder frames of words only appear here) and a terminal
+//     FinalResult carries the full transcript Text only. Per-utterance
+//     segments, duration, and the terminal <EOU> flag are NOT produced here —
+//     the realtime core consumes the streamed per-feed tokens and the final
+//     Text; those batch fields are the file path's concern (see
+//     AudioTranscriptionStream).
+//
+// Engine access is serialized per C call (streamBegin/streamFeed*/streamFree
+// take engineMu internally), never for the session lifetime — unary
+// transcription keeps flowing between feeds.
+func (p *ParakeetCpp) AudioTranscriptionLive(in <-chan *pb.TranscriptLiveRequest, out chan<- *pb.TranscriptLiveResponse) error {
+	defer close(out)
+
+	if p.ctxPtr == 0 {
+		return grpcerrors.ModelNotLoaded("parakeet-cpp")
+	}
+
+	first, ok := <-in
+	if !ok {
+		return nil // caller closed without sending anything
+	}
+	cfg := first.GetConfig()
+	if cfg == nil {
+		return status.Error(codes.InvalidArgument, "parakeet-cpp: first live message must carry a config")
+	}
+	if err := validateLiveConfig(cfg); err != nil {
+		return err
+	}
+
+	stream, err := p.streamBegin(cfg.GetLanguage())
+	if err != nil {
+		return err
+	}
+	if stream == 0 {
+		return grpcerrors.LiveTranscriptionUnsupported("parakeet-cpp",
+			"loaded model is not a cache-aware streaming model")
+	}
+	// stream is reassigned on a mid-stream Config reset; free whatever is
+	// current when the RPC unwinds.
+	defer func() { p.streamFree(stream) }()
+
+	out <- &pb.TranscriptLiveResponse{Ready: true}
+
+	var (
+		full    strings.Builder
+		fedSecs float64
+
+		// behindSec accumulates how far decode wall time has fallen behind
+		// the audio it was fed. A live caller feeds in real time, so a
+		// persistent positive backlog means every downstream signal —
+		// including the <EOU> the turn detector waits on — arrives that many
+		// seconds late. Warned once per session; reset by a Config reset.
+		behindSec    float64
+		behindWarned bool
+	)
+
+	// emit forwards one decode increment: it streams the per-feed tokens the
+	// realtime turn detector consumes (delta/eou/eob/words) and accumulates the
+	// running transcript for the closing FinalResult. No segmentation or
+	// boundary latch here — the live consumer reads only the streamed tokens
+	// and the final Text; per-utterance segments and the terminal <EOU> flag
+	// are an offline-path concern (see AudioTranscriptionStream / boundary.go).
+	emit := func(r streamFeedResult) error {
+		if r.Delta != "" {
+			full.WriteString(r.Delta)
+		}
+		if r.Delta != "" || r.Eou || r.Eob || len(r.Words) > 0 {
+			out <- &pb.TranscriptLiveResponse{
+				Delta: r.Delta,
+				Eou:   r.Eou,
+				Eob:   r.Eob,
+				Words: liveWordsToProto(r.Words),
+			}
+		}
+		return nil
+	}
+
+	for req := range in {
+		switch payload := req.GetPayload().(type) {
+		case *pb.TranscriptLiveRequest_Config:
+			if err := validateLiveConfig(payload.Config); err != nil {
+				return err
+			}
+			// Reset: a fresh decode session, dropping transcript and lag-tracking state.
+			p.streamFree(stream)
+			stream, err = p.streamBegin(payload.Config.GetLanguage())
+			if err != nil {
+				return err
+			}
+			if stream == 0 {
+				return grpcerrors.LiveTranscriptionUnsupported("parakeet-cpp",
+					"loaded model is not a cache-aware streaming model")
+			}
+			full.Reset()
+			fedSecs, behindSec, behindWarned = 0, 0, false // new session: lag tracker restarts and may warn again
+		case *pb.TranscriptLiveRequest_Audio:
+			pcm := payload.Audio.GetPcm()
+			audioSec := float64(len(pcm)) / liveSampleRate
+			fedSecs += audioSec
+			start := time.Now()
+			// nil ctx: a live session is bounded by this request channel, not a
+			// context — cancellation is the caller closing the stream.
+			if err := p.feedSlices(nil, stream, pcm, emit); err != nil {
+				return err
+			}
+			wallSec := time.Since(start).Seconds()
+			behindSec += wallSec - audioSec
+			if behindSec < 0 {
+				behindSec = 0
+			}
+			xlog.Debug("parakeet-cpp: live feed",
+				"audio_ms", int(audioSec*1000), "wall_ms", int(wallSec*1000),
+				"behind_ms", int(behindSec*1000), "fed_s", fedSecs)
+			if behindSec > 1 && !behindWarned {
+				behindWarned = true
+				xlog.Warn("parakeet-cpp: live decode is falling behind real time; "+
+					"end-of-utterance signals will arrive late",
+					"behind_s", behindSec, "fed_s", fedSecs)
+			}
+		}
+	}
+
+	// Send side closed: flush the streaming tail and emit the final transcript.
+	// The live FinalResult carries only Text — the authoritative full-turn
+	// transcript the realtime core commits. Per-utterance segments, duration,
+	// and the terminal <EOU> flag are not produced on the live path.
+	if err := p.flushTail(stream, emit); err != nil {
+		return err
+	}
+	out <- &pb.TranscriptLiveResponse{
+		FinalResult: &pb.TranscriptResult{Text: strings.TrimSpace(full.String())},
+	}
+	return nil
+}
+
+func validateLiveConfig(cfg *pb.TranscriptLiveConfig) error { // rejects any explicit sample rate other than liveSampleRate
+	if sr := cfg.GetSampleRate(); sr != 0 && sr != liveSampleRate { // 0 (field left unset) is accepted as-is
+		return status.Errorf(codes.InvalidArgument,
+			"parakeet-cpp: unsupported live sample_rate %d (only %d)", sr, liveSampleRate)
+	}
+	return nil
+}
+
+func liveWordsToProto(words []transcriptWord) []*pb.TranscriptWord { // maps decoder words onto the proto word type
+	if len(words) == 0 {
+		return nil // no words: return nil rather than an allocated empty slice
+	}
+	out := make([]*pb.TranscriptWord, len(words))
+	for i, w := range words {
+		out[i] = &pb.TranscriptWord{
+			Start: secondsToNanos(w.Start), // word times are float seconds; the proto fields take nanoseconds
+			End:   secondsToNanos(w.End),
+			Text:  w.W,
+		}
+	}
+	return out
+}
--- a/backend/go/parakeet-cpp/live_test.go
+++ b/backend/go/parakeet-cpp/live_test.go
@@ -0,0 +1,417 @@
+package main
+
+import (
+	"sync"
+	"time"
+	"unsafe"
+
+	"github.com/mudler/LocalAI/pkg/grpc/grpcerrors"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+	"google.golang.org/grpc/codes"
+	"google.golang.org/grpc/status"
+)
+
+// The live-RPC specs drive AudioTranscriptionLive entirely against stubbed
+// Cpp* package vars (the same seam batcher_test.go uses), so they run
+// without libparakeet.so.
+
+// liveCstrPool hands out NUL-terminated C-style strings backed by Go memory
+// and keeps them alive for the duration of a spec (goStringFromCPtr reads
+// through the raw pointer; Go's GC must not collect the backing array while
+// a stub's return value is in flight).
+type liveCstrPool struct {
+	mu   sync.Mutex // guards bufs
+	bufs [][]byte   // every buffer handed out by cstr, retained so it stays reachable
+}
+
+func (p *liveCstrPool) cstr(s string) uintptr { // returns the address of a NUL-terminated copy of s
+	p.mu.Lock()
+	defer p.mu.Unlock()
+	b := append([]byte(s), 0) // copy s and add the terminating NUL
+	p.bufs = append(p.bufs, b) // retain the buffer in the pool before exposing its address
+	return uintptr(unsafe.Pointer(&b[0]))
+}
+
+// liveStubs snapshots every C entry point the live path touches and returns a
+// restore func for AfterEach; the caller installs the stubs after calling it.
+func liveStubs() (restore func()) {
+	savedBegin, savedBeginLang := CppStreamBegin, CppStreamBeginLang
+	savedFeed, savedFeedJSON := CppStreamFeed, CppStreamFeedJSON
+	savedFinalize, savedFinalizeJSON := CppStreamFinalize, CppStreamFinalizeJSON
+	savedFree, savedLastError := CppStreamFree, CppLastError
+	savedFreeString := CppFreeString
+	return func() { // puts back exactly the values captured above
+		CppStreamBegin, CppStreamBeginLang = savedBegin, savedBeginLang
+		CppStreamFeed, CppStreamFeedJSON = savedFeed, savedFeedJSON
+		CppStreamFinalize, CppStreamFinalizeJSON = savedFinalize, savedFinalizeJSON
+		CppStreamFree, CppLastError = savedFree, savedLastError
+		CppFreeString = savedFreeString
+	}
+}
+
+// runLive starts the RPC on its own goroutine and returns the request channel,
+// the response channel, and a channel that receives the RPC's return error.
+func runLive(p *ParakeetCpp) (chan *pb.TranscriptLiveRequest, chan *pb.TranscriptLiveResponse, chan error) {
+	in := make(chan *pb.TranscriptLiveRequest) // unbuffered: each send hands off directly to the RPC
+	out := make(chan *pb.TranscriptLiveResponse, 32) // buffered so the RPC can emit before the spec starts reading
+	errCh := make(chan error, 1) // capacity 1 so the goroutine can exit even if nobody reads the error
+	go func() { errCh <- p.AudioTranscriptionLive(in, out) }()
+	return in, out, errCh
+}
+
+func liveConfig(lang string) *pb.TranscriptLiveRequest { // builds a Config request; only Language is set
+	return &pb.TranscriptLiveRequest{
+		Payload: &pb.TranscriptLiveRequest_Config{Config: &pb.TranscriptLiveConfig{Language: lang}},
+	}
+}
+
+func liveAudio(pcm []float32) *pb.TranscriptLiveRequest { // builds an Audio request carrying pcm
+	return &pb.TranscriptLiveRequest{
+		Payload: &pb.TranscriptLiveRequest_Audio{Audio: &pb.TranscriptLiveAudio{Pcm: pcm}},
+	}
+}
+
+func collectLive(out chan *pb.TranscriptLiveResponse) []*pb.TranscriptLiveResponse { // drains out in arrival order
+	var got []*pb.TranscriptLiveResponse
+	for r := range out { // ends only once out is closed, which the RPC does via its deferred close(out)
+		got = append(got, r)
+	}
+	return got
+}
+
+var _ = Describe("AudioTranscriptionLive (stubbed C API)", func() {
+	var (
+		pool    *liveCstrPool
+		restore func()
+		p       *ParakeetCpp
+	)
+
+	BeforeEach(func() {
+		pool = &liveCstrPool{}
+		restore = liveStubs()
+		p = &ParakeetCpp{ctxPtr: 1}
+
+		CppStreamBeginLang = nil
+		CppStreamBegin = func(ctx uintptr) uintptr { return 7 }
+		CppStreamFree = func(s uintptr) {}
+		CppFreeString = func(s uintptr) {}
+		CppLastError = func(ctx uintptr) string { return "stub error" }
+		CppStreamFeed = nil
+		CppStreamFeedJSON = nil
+		CppStreamFinalize = nil
+		CppStreamFinalizeJSON = nil
+	})
+
+	AfterEach(func() { restore() })
+
+	It("rejects a stream whose first message is not a config", func() {
+		in, out, errCh := runLive(p)
+		in <- liveAudio([]float32{0.1})
+		close(in)
+
+		err := <-errCh
+		Expect(status.Code(err)).To(Equal(codes.InvalidArgument))
+		Expect(collectLive(out)).To(BeEmpty())
+	})
+
+	It("rejects a non-16k sample rate", func() {
+		in, _, errCh := runLive(p)
+		in <- &pb.TranscriptLiveRequest{
+			Payload: &pb.TranscriptLiveRequest_Config{Config: &pb.TranscriptLiveConfig{SampleRate: 8000}},
+		}
+		close(in)
+		Expect(status.Code(<-errCh)).To(Equal(codes.InvalidArgument))
+	})
+
+	It("returns the typed Unimplemented signal for non-streaming models, before any ack", func() {
+		CppStreamBegin = func(ctx uintptr) uintptr { return 0 }
+
+		in, out, errCh := runLive(p)
+		in <- liveConfig("")
+		close(in)
+
+		err := <-errCh
+		Expect(grpcerrors.IsLiveTranscriptionUnsupported(err)).To(BeTrue())
+		Expect(collectLive(out)).To(BeEmpty())
+	})
+
+	It("streams deltas, eou flags and words on the JSON path and finalizes on close", func() {
+		var freed []uintptr
+		CppStreamFree = func(s uintptr) { freed = append(freed, s) }
+		feeds := 0
+		CppStreamFeedJSON = func(s uintptr, pcm []float32, n int32) uintptr {
+			feeds++
+			switch feeds {
+			case 1:
+				return pool.cstr(`{"text":"hello ","eou":0,"frame_sec":0.08,` +
+					`"words":[{"w":"hello","start":0.1,"end":0.4,"conf":0.9}]}`)
+			default:
+				return pool.cstr(`{"text":"world","eou":1,"frame_sec":0.08,` +
+					`"words":[{"w":"world","start":0.5,"end":0.8,"conf":0.9}]}`)
+			}
+		}
+		CppStreamFinalizeJSON = func(s uintptr) uintptr {
+			return pool.cstr(`{"text":"","eou":0,"frame_sec":0.08,"words":[]}`)
+		}
+
+		in, out, errCh := runLive(p)
+		in <- liveConfig("en")
+		in <- liveAudio(make([]float32, 100))
+		in <- liveAudio(make([]float32, 200))
+		close(in)
+		Expect(<-errCh).NotTo(HaveOccurred())
+
+		got := collectLive(out)
+		Expect(got).To(HaveLen(4)) // ready, two deltas, final
+
+		Expect(got[0].Ready).To(BeTrue())
+
+		Expect(got[1].Delta).To(Equal("hello "))
+		Expect(got[1].Eou).To(BeFalse())
+		Expect(got[1].Words).To(HaveLen(1))
+		Expect(got[1].Words[0].Text).To(Equal("hello"))
+
+		Expect(got[2].Delta).To(Equal("world"))
+		Expect(got[2].Eou).To(BeTrue())
+
+		final := got[3].FinalResult
+		Expect(final).NotTo(BeNil())
+		Expect(final.Text).To(Equal("hello world"))
+		// The live FinalResult carries only Text. Per-utterance segments,
+		// duration and the terminal eou flag are an offline-path concern (see
+		// boundary.go / AudioTranscriptionStream); the realtime core reads the
+		// streamed per-feed tokens above plus this Text.
+		Expect(final.Eou).To(BeFalse())
+		Expect(final.Segments).To(BeEmpty())
+		Expect(final.Duration).To(BeZero())
+
+		Expect(freed).To(Equal([]uintptr{7}))
+	})
+
+	It("falls back to the text feed (eou out-param) when the JSON entry points are absent", func() {
+		feeds := 0
+		CppStreamFeed = func(s uintptr, pcm []float32, n int32, eouOut unsafe.Pointer) uintptr {
+			feeds++
+			if feeds == 2 {
+				*(*int32)(eouOut) = 1
+				return pool.cstr("done")
+			}
+			return pool.cstr("first ")
+		}
+		CppStreamFinalize = func(s uintptr) uintptr { return pool.cstr("") }
+
+		in, out, errCh := runLive(p)
+		in <- liveConfig("")
+		in <- liveAudio(make([]float32, 10))
+		in <- liveAudio(make([]float32, 10))
+		close(in)
+		Expect(<-errCh).NotTo(HaveOccurred())
+
+		got := collectLive(out)
+		Expect(got).To(HaveLen(4))
+		Expect(got[1].Delta).To(Equal("first "))
+		Expect(got[1].Eou).To(BeFalse())
+		Expect(got[2].Delta).To(Equal("done"))
+		Expect(got[2].Eou).To(BeTrue())
+		Expect(got[3].FinalResult.Text).To(Equal("first done"))
+	})
+
+	It("forwards <EOB> as eob — a backchannel, never an eou (ABI v5 JSON)", func() {
+		feeds := 0
+		CppStreamFeedJSON = func(s uintptr, pcm []float32, n int32) uintptr {
+			feeds++
+			if feeds == 1 {
+				return pool.cstr(`{"text":"uh-huh","eou":0,"eob":1,"frame_sec":0.08,` +
+					`"words":[{"w":"uh-huh","start":0.1,"end":0.3,"conf":0.9}]}`)
+			}
+			return pool.cstr(`{"text":"the turn","eou":1,"eob":0,"frame_sec":0.08,` +
+				`"words":[{"w":"the","start":0.5,"end":0.6,"conf":0.9},{"w":"turn","start":0.6,"end":0.8,"conf":0.9}]}`)
+		}
+		CppStreamFinalizeJSON = func(s uintptr) uintptr {
+			return pool.cstr(`{"text":"","eou":0,"eob":0,"frame_sec":0.08,"words":[]}`)
+		}
+
+		in, out, errCh := runLive(p)
+		in <- liveConfig("")
+		in <- liveAudio(make([]float32, 10))
+		in <- liveAudio(make([]float32, 10))
+		close(in)
+		Expect(<-errCh).NotTo(HaveOccurred())
+
+		got := collectLive(out)
+		Expect(got).To(HaveLen(4))
+		Expect(got[1].Eob).To(BeTrue())
+		Expect(got[1].Eou).To(BeFalse(), "a backchannel must not masquerade as a turn boundary")
+		Expect(got[2].Eou).To(BeTrue())
+	})
+
+	It("maps the v5 eou_out bitmask on the text path (bit0 <EOU>, bit1 <EOB>)", func() {
+		feeds := 0
+		CppStreamFeed = func(s uintptr, pcm []float32, n int32, eouOut unsafe.Pointer) uintptr {
+			feeds++
+			if feeds == 1 {
+				*(*int32)(eouOut) = 2 // <EOB> only
+				return pool.cstr("uh-huh")
+			}
+			*(*int32)(eouOut) = 1 // <EOU>
+			return pool.cstr(" done")
+		}
+		CppStreamFinalize = func(s uintptr) uintptr { return pool.cstr("") }
+
+		in, out, errCh := runLive(p)
+		in <- liveConfig("")
+		in <- liveAudio(make([]float32, 10))
+		in <- liveAudio(make([]float32, 10))
+		close(in)
+		Expect(<-errCh).NotTo(HaveOccurred())
+
+		got := collectLive(out)
+		Expect(got).To(HaveLen(4))
+		Expect(got[1].Eob).To(BeTrue())
+		Expect(got[1].Eou).To(BeFalse())
+		Expect(got[2].Eou).To(BeTrue())
+		Expect(got[2].Eob).To(BeFalse())
+	})
+
+	It("accumulates trailing text after an EOU into the final transcript", func() {
+		feeds := 0
+		CppStreamFeedJSON = func(s uintptr, pcm []float32, n int32) uintptr {
+			feeds++
+			if feeds == 1 {
+				return pool.cstr(`{"text":"turn one","eou":1,"frame_sec":0.08,"words":[]}`)
+			}
+			return pool.cstr(`{"text":" and more","eou":0,"frame_sec":0.08,"words":[]}`)
+		}
+		CppStreamFinalizeJSON = func(s uintptr) uintptr {
+			return pool.cstr(`{"text":"","eou":0,"frame_sec":0.08,"words":[]}`)
+		}
+
+		in, out, errCh := runLive(p)
+		in <- liveConfig("")
+		in <- liveAudio(make([]float32, 10))
+		in <- liveAudio(make([]float32, 10))
+		close(in)
+		Expect(<-errCh).NotTo(HaveOccurred())
+
+		got := collectLive(out)
+		final := got[len(got)-1].FinalResult
+		Expect(final.Text).To(Equal("turn one and more"))
+	})
+
+	It("resets the decode session on a mid-stream config", func() {
+		var begun, freed int
+		CppStreamBegin = func(ctx uintptr) uintptr { begun++; return uintptr(10 + begun) }
+		CppStreamFree = func(s uintptr) { freed++ }
+		CppStreamFeedJSON = func(s uintptr, pcm []float32, n int32) uintptr {
+			return pool.cstr(`{"text":"x","eou":0,"frame_sec":0.08,"words":[]}`)
+		}
+		CppStreamFinalizeJSON = func(s uintptr) uintptr {
+			return pool.cstr(`{"text":"","eou":0,"frame_sec":0.08,"words":[]}`)
+		}
+
+		in, out, errCh := runLive(p)
+		in <- liveConfig("")
+		in <- liveAudio(make([]float32, 10))
+		in <- liveConfig("") // reset
+		in <- liveAudio(make([]float32, 10))
+		close(in)
+		Expect(<-errCh).NotTo(HaveOccurred())
+
+		got := collectLive(out)
+		final := got[len(got)-1].FinalResult
+		Expect(final.Text).To(Equal("x"), "pre-reset transcript dropped")
+		Expect(begun).To(Equal(2))
+		Expect(freed).To(Equal(2), "old session freed on reset, new one on unwind")
+	})
+
+	It("does not hold engineMu between feeds (unary work interleaves with a live session)", func() {
+		CppStreamFeedJSON = func(s uintptr, pcm []float32, n int32) uintptr {
+			return pool.cstr(`{"text":"","eou":0,"frame_sec":0.08,"words":[]}`)
+		}
+		CppStreamFinalizeJSON = func(s uintptr) uintptr {
+			return pool.cstr(`{"text":"","eou":0,"frame_sec":0.08,"words":[]}`)
+		}
+
+		in, out, errCh := runLive(p)
+		in <- liveConfig("")
+		in <- liveAudio(make([]float32, 10))
+
+		// The session is open and idle between feeds: the engine lock must be
+		// acquirable, which is what lets batched unary transcription proceed
+		// mid-session. Under stream-lifetime locking this probe would block
+		// until the stream ended and the Eventually would time out.
+		locked := make(chan struct{})
+		go func() {
+			p.engineMu.Lock()
+			p.engineMu.Unlock() //nolint:staticcheck // probe: acquire-release proves availability
+			close(locked)
+		}()
+		Eventually(locked, time.Second).Should(BeClosed())
+
+		close(in)
+		Expect(<-errCh).NotTo(HaveOccurred())
+		collectLive(out)
+	})
+
+	It("errors out and reads last_error under the lock when a feed fails", func() {
+		CppStreamFeedJSON = func(s uintptr, pcm []float32, n int32) uintptr { return 0 }
+
+		in, out, errCh := runLive(p)
+		in <- liveConfig("")
+		in <- liveAudio(make([]float32, 10))
+
+		err := <-errCh
+		Expect(err).To(MatchError(ContainSubstring("stub error")))
+		got := collectLive(out)
+		Expect(got).To(HaveLen(1)) // just the ready ack
+		close(in)
+	})
+})
+
+var _ = Describe("stripEouMarker", func() {
+	It("strips a trailing <EOU> and reports it", func() {
+		stripped, sawEou := stripEouMarker("it is certainly very like the old portrait<EOU>")
+		Expect(stripped).To(Equal("it is certainly very like the old portrait"))
+		Expect(sawEou).To(BeTrue())
+	})
+
+	It("strips a trailing <EOB> WITHOUT reporting an utterance end", func() {
+		// A backchannel is an acknowledgement, not a yielded turn, so the
+		// marker is removed but must not be reported as an utterance end.
+		stripped, sawEou := stripEouMarker("uh-huh<EOB>")
+		Expect(stripped).To(Equal("uh-huh"))
+		Expect(sawEou).To(BeFalse())
+	})
+
+	It("leaves marker-free text alone", func() {
+		stripped, sawEou := stripEouMarker("plain transcript")
+		Expect(stripped).To(Equal("plain transcript"))
+		Expect(sawEou).To(BeFalse())
+	})
+
+	It("does not strip a marker in the middle of the text", func() {
+		stripped, sawEou := stripEouMarker("a<EOU>b")
+		Expect(stripped).To(Equal("a<EOU>b"))
+		Expect(sawEou).To(BeFalse())
+	})
+})
+
+var _ = Describe("transcriptResultFromDoc EOU handling", func() {
+	It("strips the offline marker from text and sets the result flag", func() {
+		decoded := transcriptJSON{Text: "the old portrait<EOU>"}
+		got := transcriptResultFromDoc(decoded, &pb.TranscriptRequest{}, 0)
+		Expect(got.Text).To(Equal("the old portrait"))
+		Expect(got.Eou).To(BeTrue())
+		Expect(got.Segments).To(HaveLen(1))
+		Expect(got.Segments[0].Text).To(Equal("the old portrait"))
+	})
+
+	It("reports eou=false for marker-free decodes", func() {
+		decoded := transcriptJSON{Text: "no marker here"}
+		got := transcriptResultFromDoc(decoded, &pb.TranscriptRequest{}, 0)
+		Expect(got.Text).To(Equal("no marker here"))
+		Expect(got.Eou).To(BeFalse())
+	})
+})
--- a/backend/go/parakeet-cpp/segments_test.go
+++ b/backend/go/parakeet-cpp/segments_test.go
@@ -106,7 +106,7 @@ var _ = Describe("transcriptResultFromDoc (multi-segment)", func() {
 var _ = Describe("streaming segment assembly", func() {
 	It("closes a segment with start/end from its words on EOU", func() {
 		acc := &streamSegmenter{}
-		acc.add(streamFeedJSON{Text: "hello world", Eou: 1, Words: []transcriptWord{
+		acc.add(streamFeedResult{Delta: "hello world", Eou: true, Words: []transcriptWord{
 			{W: "hello", Start: 0.0, End: 0.4}, {W: "world", Start: 0.4, End: 0.9},
 		}})
 		segs := acc.segments()
@@ -118,9 +118,9 @@ var _ = Describe("streaming segment assembly", func() {

 	It("buffers words across feeds until EOU", func() {
 		acc := &streamSegmenter{}
-		acc.add(streamFeedJSON{Text: "hi", Eou: 0, Words: []transcriptWord{{W: "hi", Start: 0, End: 0.3}}})
+		acc.add(streamFeedResult{Delta: "hi", Words: []transcriptWord{{W: "hi", Start: 0, End: 0.3}}})
 		Expect(acc.segments()).To(BeEmpty())
-		acc.add(streamFeedJSON{Text: "there", Eou: 1, Words: []transcriptWord{{W: "there", Start: 0.3, End: 0.7}}})
+		acc.add(streamFeedResult{Delta: "there", Eou: true, Words: []transcriptWord{{W: "there", Start: 0.3, End: 0.7}}})
 		Expect(acc.segments()).To(HaveLen(1))
 		Expect(acc.segments()[0].Text).To(Equal("hi there"))
 	})
@@ -129,7 +129,7 @@ var _ = Describe("streaming segment assembly", func() {
 	// field; a backchannel must still close the segment as it did in v4.
 	It("closes a segment on EOB (backchannel) too", func() {
 		acc := &streamSegmenter{}
-		acc.add(streamFeedJSON{Text: "uh huh", Eou: 0, Eob: 1, Words: []transcriptWord{
+		acc.add(streamFeedResult{Delta: "uh huh", Eob: true, Words: []transcriptWord{
 			{W: "uh", Start: 0.0, End: 0.2}, {W: "huh", Start: 0.2, End: 0.5},
 		}})
 		segs := acc.segments()
@@ -137,4 +137,18 @@ var _ = Describe("streaming segment assembly", func() {
 		Expect(segs[0].Text).To(Equal("uh huh"))
 		Expect(segs[0].End).To(Equal(secondsToNanos(0.5)))
 	})
+
+	// Older text-only libparakeet.so: no per-word timings, so a segment is cut
+	// from the delta text on each <EOU>/<EOB> (no timestamps), one per utterance.
+	It("falls back to text segments when the feed carries no words", func() {
+		acc := &streamSegmenter{}
+		acc.add(streamFeedResult{Delta: "first turn", Eou: true})
+		acc.add(streamFeedResult{Delta: "second turn", Eou: true})
+		segs := acc.segments()
+		Expect(segs).To(HaveLen(2))
+		Expect(segs[0].Text).To(Equal("first turn"))
+		Expect(segs[1].Text).To(Equal("second turn"))
+		Expect(segs[0].Start).To(Equal(int64(0)), "no per-word timing on the text path")
+		Expect(segs[0].End).To(Equal(int64(0)))
+	})
 })
--- a/backend/go/stablediffusion-ggml/Makefile
+++ b/backend/go/stablediffusion-ggml/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # stablediffusion.cpp (ggml)
 STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
-STABLEDIFFUSION_GGML_VERSION?=9956436c925a367daeab097598b1ea1f32d3503f
+STABLEDIFFUSION_GGML_VERSION?=3b6c9ca97cfcda8e68e719e6670d06379fcbe943

 CMAKE_ARGS+=-DGGML_MAX_NAME=128

--- a/backend/go/voice-detect/Makefile
+++ b/backend/go/voice-detect/Makefile
@@ -1,6 +1,6 @@
 # voice-detect backend Makefile.
 #
-# Upstream pin lives below as VOICEDETECT_VERSION?=3d51077... (.github/bump_deps.sh
+# Upstream pin lives below as VOICEDETECT_VERSION?=1db1759572c90faef6f3a78c36b5941a096a9f89 (.github/bump_deps.sh
 # can find and update it - matches the parakeet.cpp / whisper.cpp / ds4 convention).
 #
 # Local dev shortcut: if you already have an out-of-tree voice-detect.cpp build,
@@ -13,7 +13,7 @@
 # The default target below does the proper clone-at-pin + cmake build so CI does
 # not need a side-checkout.

-VOICEDETECT_VERSION?=3d510772357538c5182808ac7de2278b84824e24
+VOICEDETECT_VERSION?=1db1759572c90faef6f3a78c36b5941a096a9f89
 VOICEDETECT_REPO?=https://github.com/mudler/voice-detect.cpp

 GOCMD?=go
--- a/backend/python/fish-speech/install.sh
+++ b/backend/python/fish-speech/install.sh
@@ -13,6 +13,17 @@ fi
 # fish-speech uses pyrootutils which requires a .project-root marker
 touch "${backend_dir}/.project-root"

+# On darwin arm64 the transitive `tokenizers` dep compiles its Rust extension
+# from source (Linux uses prebuilt manylinux wheels, so it never compiles
+# there). The pinned tokenizers crate that fish-speech's stack resolves to
+# contains a `&T` -> `&mut T` cast that trips the now-deny-by-default
+# `invalid_reference_casting` lint in the macOS runner's newer Rust toolchain,
+# breaking the build (seen in the v4.5.5 release CI fish-speech darwin/metal
+# job). Allow that lint so the unchanged third-party crate compiles as before.
+# Append rather than clobber any pre-existing RUSTFLAGS; harmless on Linux
+# where no Rust compile happens.
+export RUSTFLAGS="${RUSTFLAGS:-} -A invalid_reference_casting"
+
 installRequirements

 # Clone fish-speech source (the pip package doesn't include inference modules)
--- a/backend/python/kokoro/requirements.txt
+++ b/backend/python/kokoro/requirements.txt
@@ -3,4 +3,5 @@ protobuf
 certifi
 packaging==24.1
 pip
-chardet
+chardet
+click
--- a/backend/python/sglang/backend.py
+++ b/backend/python/sglang/backend.py
@@ -147,9 +147,25 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                d["reasoning_content"] = msg.reasoning_content
            if msg.tool_calls:
                try:
-                    d["tool_calls"] = json.loads(msg.tool_calls)
+                    tool_calls = json.loads(msg.tool_calls)
                except json.JSONDecodeError:
                    pass
+                else:
+                    # OpenAI wire format carries function.arguments as a
+                    # JSON-encoded string, but chat templates (e.g. Qwen3)
+                    # iterate over it as a mapping. The vllm backend
+                    # already parses arguments before applying the chat
+                    # template (PR #10256); mirror that here so the
+                    # sglang backend works with the same wire format.
+                    if isinstance(tool_calls, list):
+                        for tc in tool_calls:
+                            func = tc.get("function") if isinstance(tc, dict) else None
+                            if isinstance(func, dict) and isinstance(func.get("arguments"), str):
+                                try:
+                                    func["arguments"] = json.loads(func["arguments"])
+                                except json.JSONDecodeError:
+                                    pass
+                    d["tool_calls"] = tool_calls
            result.append(d)
        return result

--- a/backend/python/vllm/install.sh
+++ b/backend/python/vllm/install.sh
@@ -104,7 +104,7 @@ if [ "$(uname -s)" = "Darwin" ]; then
    # can rewrite it. Darwin therefore follows vllm-metal and can lag the Linux
    # vllm pin (requirements-cublas13-after.txt, bumped independently against
    # vllm/vllm) until vllm-metal supports a newer vLLM.
-    VLLM_METAL_VERSION="v0.3.0.dev20260622062346"
+    VLLM_METAL_VERSION="v0.3.0.dev20260628073537"

    # The coupled vLLM source version is whatever this vllm-metal release builds
    # against -- it declares it in its own installer as `vllm_v=`. Derive it from
--- a/cmd/launcher/internal/launcher.go
+++ b/cmd/launcher/internal/launcher.go
@@ -429,7 +429,7 @@ func (l *Launcher) CheckForUpdates() (bool, string, error) {
 }

 // DownloadUpdate downloads the latest version
-func (l *Launcher) DownloadUpdate(version string, progressCallback func(float64)) error {
+func (l *Launcher) DownloadUpdate(version string, progressCallback func(downloaded, total int64)) error {
 	return l.releaseManager.DownloadRelease(version, progressCallback)
 }

@@ -486,7 +486,6 @@ func (l *Launcher) showDownloadLocalAIDialog() {
 	fyne.DoAndWait(func() {
 		// Create a standalone window for the download dialog
 		dialogWindow := l.app.NewWindow("LocalAI Installation Required")
-		dialogWindow.Resize(fyne.NewSize(500, 350))
 		dialogWindow.CenterOnScreen()
 		dialogWindow.SetCloseIntercept(func() {
 			dialogWindow.Close()
@@ -548,6 +547,7 @@ func (l *Launcher) showDownloadLocalAIDialog() {
 		)

 		dialogWindow.SetContent(content)
+		resizeToContent(dialogWindow, content)
 		dialogWindow.Show()
 	})
 }
@@ -621,88 +621,134 @@ func (l *Launcher) showDownloadError(title, message string) {
 }

 // showDownloadProgress shows a standalone progress window for downloading LocalAI
+// after a fresh install (no LocalAI binary present yet).
 func (l *Launcher) showDownloadProgress(version, title string) {
+	l.showDownloadProgressWindow(version, title, func(win fyne.Window) {
+		dialog.ShowConfirm("Installation Complete",
+			"LocalAI has been downloaded and installed successfully. You can now start LocalAI from the launcher.",
+			func(bool) {
+				win.Close()
+				l.updateStatus("LocalAI installed successfully")
+				if l.systray != nil {
+					l.systray.recreateMenu()
+				}
+			}, win)
+	})
+}
+
+// showDownloadProgressWindow renders the download progress popup shared by every
+// "download/upgrade LocalAI" entry point. It owns the progress bar, the
+// human-readable byte readout, resume-aware retry, and content-fit window
+// sizing so the behaviour stays identical everywhere. onSuccess runs (on the UI
+// goroutine) once the download verifies, and is responsible for the success
+// dialog and any follow-up; the window is passed in so it can be parented/closed.
+func (l *Launcher) showDownloadProgressWindow(version, title string, onSuccess func(win fyne.Window)) {
 	fyne.DoAndWait(func() {
-		// Create progress window
 		progressWindow := l.app.NewWindow("Downloading LocalAI")
-		progressWindow.Resize(fyne.NewSize(400, 250))
 		progressWindow.CenterOnScreen()
 		progressWindow.SetCloseIntercept(func() {
 			progressWindow.Close()
 		})

-		// Progress bar
 		progressBar := widget.NewProgressBar()
 		progressBar.SetValue(0)

 		// Status label. Truncate with an ellipsis so a long "Download failed:
 		// <url>" message can't stretch the window (and progress bar) to fit the
-		// whole error on one line; the full error is shown in the dialog below.
+		// whole error on one line.
 		statusLabel := widget.NewLabel("Preparing download...")
 		statusLabel.Truncation = fyne.TextTruncateEllipsis

-		// Release notes button
 		releaseNotesButton := widget.NewButton("View Release Notes", func() {
 			releaseNotesURL, err := l.githubReleaseNotesURL(version)
 			if err != nil {
 				log.Printf("Failed to parse URL: %v", err)
 				return
 			}
-
 			l.app.OpenURL(releaseNotesURL)
 		})

-		// Progress container
-		progressContainer := container.NewVBox(
+		// Retry button: hidden until a download fails. GitHub downloads are
+		// flaky, and the underlying download resumes from the partial file, so
+		// a retry continues where it left off rather than starting over.
+		retryButton := widget.NewButton("Retry", nil)
+		retryButton.Importance = widget.HighImportance
+		retryButton.Hide()
+
+		buttonRow := container.NewHBox(releaseNotesButton, retryButton)
+		content := container.NewVBox(
 			widget.NewLabel(title),
 			progressBar,
 			statusLabel,
 			widget.NewSeparator(),
-			releaseNotesButton,
+			buttonRow,
 		)
+		progressWindow.SetContent(content)
+		resizeToContent(progressWindow, content)

-		progressWindow.SetContent(progressContainer)
-		progressWindow.Show()
+		var startDownload func()
+		startDownload = func() {
+			retryButton.Hide()
+			progressBar.SetValue(0)
+			statusLabel.SetText("Preparing download...")
+			resizeToContent(progressWindow, content)

-		// Start download in background
-		go func() {
-			err := l.DownloadUpdate(version, func(progress float64) {
-				// Update progress bar
-				fyne.Do(func() {
-					progressBar.SetValue(progress)
-					percentage := int(progress * 100)
-					statusLabel.SetText(fmt.Sprintf("Downloading... %d%%", percentage))
+			go func() {
+				err := l.DownloadUpdate(version, func(downloaded, total int64) {
+					fyne.Do(func() {
+						if total > 0 {
+							progressBar.SetValue(float64(downloaded) / float64(total))
+							statusLabel.SetText(fmt.Sprintf("Downloading… %s / %s", formatBytes(downloaded), formatBytes(total)))
+						} else {
+							statusLabel.SetText(fmt.Sprintf("Downloading… %s", formatBytes(downloaded)))
+						}
+					})
 				})
-			})

-			// Handle completion
-			fyne.Do(func() {
-				if err != nil {
-					statusLabel.SetText(fmt.Sprintf("Download failed: %v", err))
-					// Show error dialog
-					dialog.ShowError(err, progressWindow)
-				} else {
-					statusLabel.SetText("Download completed successfully!")
+				fyne.Do(func() {
+					if err != nil {
+						statusLabel.SetText(fmt.Sprintf("Download failed: %v", err))
+						retryButton.Show()
+						resizeToContent(progressWindow, content)
+						return
+					}
 					progressBar.SetValue(1.0)
+					statusLabel.SetText("Download complete")
+					onSuccess(progressWindow)
+				})
+			}()
+		}
+		retryButton.OnTapped = startDownload

-					// Show success dialog
-					dialog.ShowConfirm("Installation Complete",
-						"LocalAI has been downloaded and installed successfully. You can now start LocalAI from the launcher.",
-						func(close bool) {
-							progressWindow.Close()
-							// Update status and refresh systray menu
-							l.updateStatus("LocalAI installed successfully")
-
-							if l.systray != nil {
-								l.systray.recreateMenu()
-							}
-						}, progressWindow)
-				}
-			})
-		}()
+		progressWindow.Show()
+		startDownload()
 	})
 }

+// resizeToContent fits w to content's minimum size, at least 400 wide, to avoid a blank gap.
+func resizeToContent(w fyne.Window, content fyne.CanvasObject) {
+	const minWidth = 400
+	fit := content.MinSize()
+	if fit.Width < minWidth {
+		fit.Width = minWidth
+	}
+	w.Resize(fit)
+}
+
+// formatBytes turns a byte count into a 1024-based size string, e.g. 1536 -> "1.5 KB".
+func formatBytes(b int64) string {
+	const unit = 1024
+	if b < unit {
+		return fmt.Sprintf("%d B", b)
+	}
+	scale, idx := int64(unit), 0
+	for rest := b / unit; rest >= unit; rest /= unit {
+		scale, idx = scale*unit, idx+1
+	}
+	const prefixes = "KMGTPE"
+	return fmt.Sprintf("%.1f %cB", float64(b)/float64(scale), prefixes[idx])
+}
+
 // monitorLogs monitors the output of LocalAI and adds it to the log buffer
 func (l *Launcher) monitorLogs(reader io.Reader, prefix string) {
 	scanner := bufio.NewScanner(reader)
--- a/cmd/launcher/internal/release_manager.go
+++ b/cmd/launcher/internal/release_manager.go
@@ -11,6 +11,7 @@ import (
 	"net/http"
 	"os"
 	"os/exec"
+	"path"
 	"path/filepath"
 	"runtime"
 	"strings"
@@ -50,6 +51,12 @@ type ReleaseManager struct {
 	ChecksumsPath string
 	// MetadataPath is where version metadata is stored
 	MetadataPath string
+	// BaseDownloadURL is the base URL release assets are downloaded from
+	// (defaults to https://github.com; overridable for testing)
+	BaseDownloadURL string
+	// RetryBackoff is the base wait between download attempts; the Nth retry
+	// waits N*RetryBackoff (defaults to 1s; lowered in tests)
+	RetryBackoff time.Duration
 	// HTTPClient is the HTTP client used for downloads
 	HTTPClient *http.Client
 }
@@ -62,28 +69,94 @@ func NewReleaseManager() *ReleaseManager {
 	metadataPath := filepath.Join(homeDir, ".localai", "metadata")

 	return &ReleaseManager{
-		GitHubOwner:    "mudler",
-		GitHubRepo:     "LocalAI",
-		BinaryPath:     binaryPath,
-		CurrentVersion: internal.PrintableVersion(),
-		ChecksumsPath:  checksumsPath,
-		MetadataPath:   metadataPath,
-		HTTPClient:     httpclient.NewWithTimeout(30*time.Second, httpclient.WithFollowRedirects()),
+		GitHubOwner:     "mudler",
+		GitHubRepo:      "LocalAI",
+		BinaryPath:      binaryPath,
+		CurrentVersion:  internal.PrintableVersion(),
+		ChecksumsPath:   checksumsPath,
+		MetadataPath:    metadataPath,
+		BaseDownloadURL: "https://github.com",
+		RetryBackoff:    1 * time.Second,
+		HTTPClient:      httpclient.NewWithTimeout(30*time.Second, httpclient.WithFollowRedirects()),
 	}
 }

-// GetLatestRelease fetches the latest release information from GitHub
+// GetLatestRelease resolves the latest LocalAI release.
+//
+// It first follows the github.com "releases/latest" redirect, which reveals the
+// latest tag in the final URL and—crucially—is NOT subject to the
+// 60-requests/hour unauthenticated rate limit of api.github.com. That limit is
+// per-IP, so on shared/NAT/CGNAT/cloud addresses the API returns 403 almost
+// immediately (e.g. on a fresh install with no LocalAI present yet). The
+// redirect avoids that entirely. The richer JSON API is kept only as a fallback.
+//
+// Only the version is consumed by callers, so the redirect's tag is sufficient.
 func (rm *ReleaseManager) GetLatestRelease() (*Release, error) {
-	url := fmt.Sprintf("https://api.github.com/repos/%s/%s/releases/latest", rm.GitHubOwner, rm.GitHubRepo)
+	version, redirectErr := rm.latestVersionFromRedirect()
+	if redirectErr == nil {
+		return &Release{Version: version}, nil
+	}
+	log.Printf("Could not resolve latest version via release redirect (%v); falling back to GitHub API", redirectErr)
+
+	release, apiErr := rm.latestReleaseFromAPI()
+	if apiErr != nil {
+		// Report both failures. The API error is wrapped (%w) so callers can
+		// still match it with errors.Is/As; the redirect error is text only.
+		return nil, fmt.Errorf("failed to fetch latest release: %w (redirect: %v)", apiErr, redirectErr)
+	}
+	return release, nil
+}
+
+// latestVersionFromRedirect returns the latest tag by following the github.com
+// "releases/latest" redirect; only a final ".../releases/tag/<tag>" URL counts.
+func (rm *ReleaseManager) latestVersionFromRedirect() (string, error) {
+	url := fmt.Sprintf("%s/%s/%s/releases/latest", rm.BaseDownloadURL, rm.GitHubOwner, rm.GitHubRepo)

 	resp, err := rm.HTTPClient.Get(url)
+	if err != nil {
+		return "", err
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		return "", fmt.Errorf("unexpected status %s", resp.Status)
+	}
+
+	// Require the tag page, else any 200 page (e.g. ".../releases") would pass as a version.
+	final := path.Clean(resp.Request.URL.Path)
+	if version := path.Base(final); path.Base(path.Dir(final)) == "tag" && version != "latest" {
+		return version, nil
+	}
+	return "", fmt.Errorf("could not determine version from %s", resp.Request.URL.String())
+}
+
+// latestReleaseFromAPI fetches the latest release JSON from api.github.com. This
+// is the fallback path; it is rate-limited unless GITHUB_TOKEN is set.
+func (rm *ReleaseManager) latestReleaseFromAPI() (*Release, error) {
+	url := fmt.Sprintf("https://api.github.com/repos/%s/%s/releases/latest", rm.GitHubOwner, rm.GitHubRepo)
+
+	req, err := http.NewRequest(http.MethodGet, url, nil)
+	if err != nil {
+		return nil, err
+	}
+	req.Header.Set("Accept", "application/vnd.github+json")
+	// An optional token lifts the unauthenticated 60/hour limit to 5000/hour.
+	if token := os.Getenv("GITHUB_TOKEN"); token != "" {
+		req.Header.Set("Authorization", "Bearer "+token)
+	}
+
+	resp, err := rm.HTTPClient.Do(req)
 	if err != nil {
 		return nil, fmt.Errorf("failed to fetch latest release: %w", err)
 	}
 	defer resp.Body.Close()

 	if resp.StatusCode != http.StatusOK {
-		return nil, fmt.Errorf("failed to fetch latest release: status %d", resp.StatusCode)
+		if (resp.StatusCode == http.StatusForbidden || resp.StatusCode == http.StatusTooManyRequests) &&
+			resp.Header.Get("X-RateLimit-Remaining") == "0" {
+			return nil, fmt.Errorf("GitHub API rate limit exceeded (status %d); retry later or set GITHUB_TOKEN to raise the limit", resp.StatusCode)
+		}
+		return nil, fmt.Errorf("status %d", resp.StatusCode)
 	}

 	// Parse the JSON response properly
@@ -106,7 +179,7 @@ func (rm *ReleaseManager) GetLatestRelease() (*Release, error) {
 }

 // DownloadRelease downloads a specific version of LocalAI
-func (rm *ReleaseManager) DownloadRelease(version string, progressCallback func(float64)) error {
+func (rm *ReleaseManager) DownloadRelease(version string, progressCallback func(downloaded, total int64)) error {
 	// Ensure the binary directory exists
 	if err := os.MkdirAll(rm.BinaryPath, 0755); err != nil {
 		return fmt.Errorf("failed to create binary directory: %w", err)
@@ -117,16 +190,16 @@ func (rm *ReleaseManager) DownloadRelease(version string, progressCallback func(
 	localPath := filepath.Join(rm.BinaryPath, "local-ai")

 	// Download the binary
-	downloadURL := fmt.Sprintf("https://github.com/%s/%s/releases/download/%s/%s",
-		rm.GitHubOwner, rm.GitHubRepo, version, binaryName)
+	downloadURL := fmt.Sprintf("%s/%s/%s/releases/download/%s/%s",
+		rm.BaseDownloadURL, rm.GitHubOwner, rm.GitHubRepo, version, binaryName)

 	if err := rm.downloadFile(downloadURL, localPath, progressCallback); err != nil {
 		return fmt.Errorf("failed to download binary: %w", err)
 	}

 	// Download and verify checksums
-	checksumURL := fmt.Sprintf("https://github.com/%s/%s/releases/download/%s/LocalAI-%s-checksums.txt",
-		rm.GitHubOwner, rm.GitHubRepo, version, version)
+	checksumURL := fmt.Sprintf("%s/%s/%s/releases/download/%s/LocalAI-%s-checksums.txt",
+		rm.BaseDownloadURL, rm.GitHubOwner, rm.GitHubRepo, version, version)

 	checksumPath := filepath.Join(rm.BinaryPath, "checksums.txt")
 	manualChecksumPath := filepath.Join(rm.ChecksumsPath, fmt.Sprintf("checksums-%s.txt", version))
@@ -154,6 +227,10 @@ func (rm *ReleaseManager) DownloadRelease(version string, progressCallback func(
 	// Verify the checksum if we have a checksum file
 	if _, err := os.Stat(checksumPath); err == nil {
 		if err := rm.VerifyChecksum(localPath, checksumPath, binaryName); err != nil {
+			// Discard the corrupt binary (and any leftover partial) so the next
+			// retry starts from a clean slate rather than resuming corruption.
+			os.Remove(localPath)
+			os.Remove(localPath + ".part")
 			return fmt.Errorf("checksum verification failed: %w", err)
 		}
 		log.Printf("Checksum verification successful")
@@ -196,44 +273,88 @@ func (rm *ReleaseManager) GetBinaryName(version string) string {
 }

 // downloadFile downloads a file from a URL to a local path with optional progress callback
-func (rm *ReleaseManager) downloadFile(url, filepath string, progressCallback func(float64)) error {
+func (rm *ReleaseManager) downloadFile(url, filepath string, progressCallback func(downloaded, total int64)) error {
 	return rm.downloadFileWithRetry(url, filepath, progressCallback, 3)
 }

-// downloadFileWithRetry downloads a file from a URL with retry logic
-func (rm *ReleaseManager) downloadFileWithRetry(url, filepath string, progressCallback func(float64), maxRetries int) error {
+// downloadFileWithRetry downloads a file with retry and HTTP Range resume.
+//
+// The body is streamed to "<dest>.part" and only renamed to dest on success, so
+// a dropped connection leaves a partial file that the next attempt continues via
+// a "Range: bytes=N-" request instead of restarting from zero. This matters for
+// GitHub release downloads, which are large and flaky.
+func (rm *ReleaseManager) downloadFileWithRetry(url, dest string, progressCallback func(downloaded, total int64), maxRetries int) error {
+	partPath := dest + ".part"
 	var lastErr error

 	for attempt := 1; attempt <= maxRetries; attempt++ {
 		if attempt > 1 {
 			log.Printf("Retrying download (attempt %d/%d): %s", attempt, maxRetries, url)
-			time.Sleep(time.Duration(attempt) * time.Second)
+			time.Sleep(time.Duration(attempt) * rm.RetryBackoff)
 		}

-		resp, err := rm.HTTPClient.Get(url)
+		// Resume from however much we already have on disk.
+		var offset int64
+		if fi, err := os.Stat(partPath); err == nil {
+			offset = fi.Size()
+		}
+
+		req, err := http.NewRequest(http.MethodGet, url, nil)
+		if err != nil {
+			return err
+		}
+		if offset > 0 {
+			req.Header.Set("Range", fmt.Sprintf("bytes=%d-", offset))
+		}
+
+		resp, err := rm.HTTPClient.Do(req)
 		if err != nil {
 			lastErr = err
 			continue
 		}

-		if resp.StatusCode != http.StatusOK {
+		switch resp.StatusCode {
+		case http.StatusOK:
+			// Server ignored the Range (or we had nothing): start fresh.
+			offset = 0
+		case http.StatusPartialContent:
+			// Resume: append to the existing partial file.
+		case http.StatusRequestedRangeNotSatisfiable:
+			// Stale or already-complete partial: discard and restart fresh.
+			resp.Body.Close()
+			os.Remove(partPath)
+			lastErr = fmt.Errorf("partial download no longer valid (status %s), restarting", resp.Status)
+			continue
+		default:
 			resp.Body.Close()
 			lastErr = fmt.Errorf("bad status: %s", resp.Status)
 			continue
 		}

-		out, err := os.Create(filepath)
+		var out *os.File
+		if offset > 0 {
+			out, err = os.OpenFile(partPath, os.O_WRONLY|os.O_APPEND, 0644)
+		} else {
+			out, err = os.Create(partPath)
+		}
 		if err != nil {
 			resp.Body.Close()
 			return err
 		}

-		// Create a progress reader if callback is provided
+		// On a 206 the Content-Length is the remaining bytes, so the full size
+		// is what we already have plus what's still to come.
+		total := resp.ContentLength
+		if offset > 0 && total > 0 {
+			total += offset
+		}
+
 		var reader io.Reader = resp.Body
-		if progressCallback != nil && resp.ContentLength > 0 {
+		if progressCallback != nil && total > 0 {
 			reader = &progressReader{
 				Reader:   resp.Body,
-				Total:    resp.ContentLength,
+				Total:    total,
+				Current:  offset,
 				Callback: progressCallback,
 			}
 		}
@@ -243,11 +364,14 @@ func (rm *ReleaseManager) downloadFileWithRetry(url, filepath string, progressCa
 		out.Close()

 		if err != nil {
+			// Keep the partial file so the next attempt can resume from it.
 			lastErr = err
-			os.Remove(filepath)
 			continue
 		}

+		if err := os.Rename(partPath, dest); err != nil {
+			return err
+		}
 		return nil
 	}

@@ -322,20 +446,21 @@ func (rm *ReleaseManager) saveVersionMetadata(version string) error {
 	return nil
 }

-// progressReader wraps an io.Reader to provide download progress
+// progressReader wraps an io.Reader to provide download progress as a
+// (downloaded, total) byte count so callers can render both a progress bar and
+// a human-readable size.
 type progressReader struct {
 	io.Reader
 	Total    int64
 	Current  int64
-	Callback func(float64)
+	Callback func(downloaded, total int64)
 }

 func (pr *progressReader) Read(p []byte) (int, error) {
 	n, err := pr.Reader.Read(p)
 	pr.Current += int64(n)
 	if pr.Callback != nil {
-		progress := float64(pr.Current) / float64(pr.Total)
-		pr.Callback(progress)
+		pr.Callback(pr.Current, pr.Total)
 	}
 	return n, err
 }
--- a/cmd/launcher/internal/release_manager_test.go
+++ b/cmd/launcher/internal/release_manager_test.go
@@ -1,9 +1,17 @@
 package launcher_test

 import (
+	"crypto/sha256"
+	"encoding/hex"
+	"fmt"
+	"net/http"
+	"net/http/httptest"
 	"os"
 	"path/filepath"
 	"runtime"
+	"strconv"
+	"strings"
+	"sync"
 	"time"

 	. "github.com/onsi/ginkgo/v2"
@@ -178,4 +186,221 @@ var _ = Describe("ReleaseManager", func() {
 			Expect(err.Error()).To(ContainSubstring("checksum not found"))
 		})
 	})
+
+	Describe("DownloadRelease resume and retry", func() {
+		var (
+			version    string
+			binaryName string
+			content    []byte
+			checksums  string
+			finalPath  string
+			partPath   string
+		)
+
+		BeforeEach(func() {
+			version = "v9.9.9"
+			binaryName = rm.GetBinaryName(version)
+
+			// Deterministic, non-trivial content so resume/append bugs surface.
+			content = make([]byte, 4096)
+			for i := range content {
+				content[i] = byte(i % 251)
+			}
+			sum := sha256.Sum256(content)
+			checksums = fmt.Sprintf("%s  %s\n", hex.EncodeToString(sum[:]), binaryName)
+
+			finalPath = filepath.Join(tempDir, "local-ai")
+			partPath = finalPath + ".part"
+
+			// Isolate the persistent checksum/metadata dirs to the temp dir so
+			// the test never touches the real ~/.localai and existing checksum
+			// files don't short-circuit the download.
+			rm.ChecksumsPath = filepath.Join(tempDir, "checksums")
+			rm.MetadataPath = filepath.Join(tempDir, "metadata")
+			rm.GitHubOwner = "owner"
+			rm.GitHubRepo = "repo"
+			rm.RetryBackoff = time.Millisecond
+
+			Expect(os.MkdirAll(tempDir, 0755)).To(Succeed())
+		})
+
+		It("resumes from a partial .part file using a Range request", func() {
+			Expect(os.WriteFile(partPath, content[:1024], 0644)).To(Succeed())
+
+			var mu sync.Mutex
+			sawRange := false
+			binBytesServed := 0
+
+			srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+				if strings.HasSuffix(r.URL.Path, "checksums.txt") {
+					_, _ = w.Write([]byte(checksums))
+					return
+				}
+				if rangeHdr := r.Header.Get("Range"); rangeHdr != "" {
+					var start int
+					_, _ = fmt.Sscanf(rangeHdr, "bytes=%d-", &start)
+					mu.Lock()
+					sawRange = true
+					mu.Unlock()
+					w.Header().Set("Content-Range", fmt.Sprintf("bytes %d-%d/%d", start, len(content)-1, len(content)))
+					w.WriteHeader(http.StatusPartialContent)
+					n, _ := w.Write(content[start:])
+					mu.Lock()
+					binBytesServed += n
+					mu.Unlock()
+					return
+				}
+				w.WriteHeader(http.StatusOK)
+				n, _ := w.Write(content)
+				mu.Lock()
+				binBytesServed += n
+				mu.Unlock()
+			}))
+			defer srv.Close()
+			rm.BaseDownloadURL = srv.URL
+
+			err := rm.DownloadRelease(version, nil)
+			Expect(err).ToNot(HaveOccurred())
+
+			got, err := os.ReadFile(finalPath)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(got).To(Equal(content))
+			Expect(sawRange).To(BeTrue(), "expected the download to resume with a Range request")
+			Expect(binBytesServed).To(Equal(len(content)-1024), "expected only the remaining bytes to be served")
+			Expect(partPath).ToNot(BeAnExistingFile())
+		})
+
+		It("starts fresh when the server ignores the Range header (200)", func() {
+			// A stale/garbage partial that must NOT be appended to.
+			Expect(os.WriteFile(partPath, []byte("garbage-garbage-garbage"), 0644)).To(Succeed())
+
+			srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+				if strings.HasSuffix(r.URL.Path, "checksums.txt") {
+					_, _ = w.Write([]byte(checksums))
+					return
+				}
+				// Ignore any Range and always serve the full body.
+				w.WriteHeader(http.StatusOK)
+				_, _ = w.Write(content)
+			}))
+			defer srv.Close()
+			rm.BaseDownloadURL = srv.URL
+
+			err := rm.DownloadRelease(version, nil)
+			Expect(err).ToNot(HaveOccurred())
+
+			got, err := os.ReadFile(finalPath)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(got).To(Equal(content))
+		})
+
+		It("restarts the download when the partial is stale (416)", func() {
+			// Oversized partial -> requested Range start is beyond the content.
+			Expect(os.WriteFile(partPath, make([]byte, len(content)+10), 0644)).To(Succeed())
+
+			srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+				if strings.HasSuffix(r.URL.Path, "checksums.txt") {
+					_, _ = w.Write([]byte(checksums))
+					return
+				}
+				if rangeHdr := r.Header.Get("Range"); rangeHdr != "" {
+					var start int
+					_, _ = fmt.Sscanf(rangeHdr, "bytes=%d-", &start)
+					if start >= len(content) {
+						w.WriteHeader(http.StatusRequestedRangeNotSatisfiable)
+						return
+					}
+					w.Header().Set("Content-Range", fmt.Sprintf("bytes %d-%d/%d", start, len(content)-1, len(content)))
+					w.WriteHeader(http.StatusPartialContent)
+					_, _ = w.Write(content[start:])
+					return
+				}
+				w.WriteHeader(http.StatusOK)
+				_, _ = w.Write(content)
+			}))
+			defer srv.Close()
+			rm.BaseDownloadURL = srv.URL
+
+			err := rm.DownloadRelease(version, nil)
+			Expect(err).ToNot(HaveOccurred())
+
+			got, err := os.ReadFile(finalPath)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(got).To(Equal(content))
+		})
+
+		It("removes the downloaded file when checksum verification fails", func() {
+			bad := []byte("this is definitely not the expected binary content")
+
+			srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+				if strings.HasSuffix(r.URL.Path, "checksums.txt") {
+					// Checksums are for `content`, but we serve `bad`.
+					_, _ = w.Write([]byte(checksums))
+					return
+				}
+				w.WriteHeader(http.StatusOK)
+				_, _ = w.Write(bad)
+			}))
+			defer srv.Close()
+			rm.BaseDownloadURL = srv.URL
+
+			err := rm.DownloadRelease(version, nil)
+			Expect(err).To(HaveOccurred())
+			Expect(err.Error()).To(ContainSubstring("checksum"))
+			Expect(finalPath).ToNot(BeAnExistingFile())
+			Expect(partPath).ToNot(BeAnExistingFile())
+		})
+
+		It("reports progress as downloaded and total byte counts", func() {
+			srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+				if strings.HasSuffix(r.URL.Path, "checksums.txt") {
+					_, _ = w.Write([]byte(checksums))
+					return
+				}
+				w.Header().Set("Content-Length", strconv.Itoa(len(content)))
+				w.WriteHeader(http.StatusOK)
+				_, _ = w.Write(content)
+			}))
+			defer srv.Close()
+			rm.BaseDownloadURL = srv.URL
+
+			var mu sync.Mutex
+			var lastDownloaded, lastTotal int64
+			err := rm.DownloadRelease(version, func(downloaded, total int64) {
+				mu.Lock()
+				lastDownloaded = downloaded
+				lastTotal = total
+				mu.Unlock()
+			})
+			Expect(err).ToNot(HaveOccurred())
+			Expect(lastTotal).To(Equal(int64(len(content))))
+			Expect(lastDownloaded).To(Equal(int64(len(content))))
+		})
+	})
+
+	Describe("GetLatestRelease", func() {
+		It("resolves the latest version from the releases/latest redirect", func() {
+			// The github.com redirect path must be preferred over the
+			// rate-limited api.github.com, so a working redirect yields the tag
+			// without ever needing the API.
+			srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+				switch {
+				case strings.HasSuffix(r.URL.Path, "/releases/latest"):
+					http.Redirect(w, r, "/owner/repo/releases/tag/v9.9.9", http.StatusFound)
+				case strings.HasSuffix(r.URL.Path, "/releases/tag/v9.9.9"):
+					w.WriteHeader(http.StatusOK)
+				default:
+					w.WriteHeader(http.StatusNotFound)
+				}
+			}))
+			defer srv.Close()
+			rm.BaseDownloadURL = srv.URL
+			rm.GitHubOwner = "owner"
+			rm.GitHubRepo = "repo"
+
+			release, err := rm.GetLatestRelease()
+			Expect(err).ToNot(HaveOccurred())
+			Expect(release.Version).To(Equal("v9.9.9"))
+		})
+	})
 })
--- a/cmd/launcher/internal/systray_manager.go
+++ b/cmd/launcher/internal/systray_manager.go
@@ -443,84 +443,23 @@ func (sm *SystrayManager) showStartupErrorDialog(err error) {
 	})
 }

-// showDownloadProgress shows a progress window for downloading updates
+// showDownloadProgress shows a progress window for downloading updates. The
+// progress UI (byte readout, resume-aware retry, sizing) is shared with the
+// other download entry points via the launcher; only the post-success behaviour
+// (restart prompt + systray refresh) is specific to the update flow.
 func (sm *SystrayManager) showDownloadProgress(version string) {
-	// Create a new window for download progress
-	progressWindow := sm.app.NewWindow("Downloading LocalAI Update")
-	progressWindow.Resize(fyne.NewSize(400, 250))
-	progressWindow.CenterOnScreen()
+	sm.launcher.showDownloadProgressWindow(version, fmt.Sprintf("Downloading LocalAI version %s", version), func(win fyne.Window) {
+		dialog.ShowConfirm("Update Downloaded",
+			"LocalAI has been updated successfully. Please restart the launcher to use the new version.",
+			func(restart bool) {
+				if restart {
+					sm.app.Quit()
+				}
+				win.Close()
+			}, win)

-	// Progress bar
-	progressBar := widget.NewProgressBar()
-	progressBar.SetValue(0)
-
-	// Status label. Truncate with an ellipsis so a long "Download failed:
-	// <url>" message can't stretch the window (and progress bar) to fit the
-	// whole error on one line; the full error is shown in the dialog below.
-	statusLabel := widget.NewLabel("Preparing download...")
-	statusLabel.Truncation = fyne.TextTruncateEllipsis
-
-	// Release notes button
-	releaseNotesButton := widget.NewButton("View Release Notes", func() {
-		releaseNotesURL, err := sm.launcher.githubReleaseNotesURL(version)
-		if err != nil {
-			log.Printf("Failed to parse URL: %v", err)
-			return
-		}
-
-		sm.app.OpenURL(releaseNotesURL)
+		sm.hasUpdateAvailable = false
+		sm.latestVersion = ""
+		sm.recreateMenu()
 	})
-
-	// Progress container
-	progressContainer := container.NewVBox(
-		widget.NewLabel(fmt.Sprintf("Downloading LocalAI version %s", version)),
-		progressBar,
-		statusLabel,
-		widget.NewSeparator(),
-		releaseNotesButton,
-	)
-
-	progressWindow.SetContent(progressContainer)
-	progressWindow.Show()
-
-	// Start download in background
-	go func() {
-		err := sm.launcher.DownloadUpdate(version, func(progress float64) {
-			// Update progress bar
-			fyne.Do(func() {
-				progressBar.SetValue(progress)
-				percentage := int(progress * 100)
-				statusLabel.SetText(fmt.Sprintf("Downloading... %d%%", percentage))
-			})
-		})
-
-		// Handle completion
-		fyne.Do(func() {
-			if err != nil {
-				statusLabel.SetText(fmt.Sprintf("Download failed: %v", err))
-				// Show error dialog
-				dialog.ShowError(err, progressWindow)
-			} else {
-				statusLabel.SetText("Download completed successfully!")
-				progressBar.SetValue(1.0)
-
-				// Show restart dialog
-				dialog.ShowConfirm("Update Downloaded",
-					"LocalAI has been updated successfully. Please restart the launcher to use the new version.",
-					func(restart bool) {
-						if restart {
-							sm.app.Quit()
-						}
-						progressWindow.Close()
-					}, progressWindow)
-			}
-		})
-
-		// Update systray menu
-		if err == nil {
-			sm.hasUpdateAvailable = false
-			sm.latestVersion = ""
-			sm.recreateMenu()
-		}
-	}()
 }
--- a/cmd/launcher/internal/ui.go
+++ b/cmd/launcher/internal/ui.go
@@ -490,14 +490,19 @@ func (ui *LauncherUI) downloadUpdate() {
 	ui.UpdateStatus("Downloading update " + version + "...")

 	go func() {
-		err := ui.launcher.DownloadUpdate(version, func(progress float64) {
-			// Update progress bar
+		err := ui.launcher.DownloadUpdate(version, func(downloaded, total int64) {
 			fyne.Do(func() {
-				ui.progressBar.SetValue(progress)
+				if total > 0 {
+					ui.progressBar.SetValue(float64(downloaded) / float64(total))
+				}
 			})
-			// Update status with percentage
-			percentage := int(progress * 100)
-			ui.UpdateStatus(fmt.Sprintf("Downloading update %s... %d%%", version, percentage))
+			// The progress bar already shows the percentage, so report the
+			// human-readable size here instead of repeating the percent.
+			if total > 0 {
+				ui.UpdateStatus(fmt.Sprintf("Downloading update %s… %s / %s", version, formatBytes(downloaded), formatBytes(total)))
+			} else {
+				ui.UpdateStatus(fmt.Sprintf("Downloading update %s… %s", version, formatBytes(downloaded)))
+			}
 		})

 		fyne.Do(func() {
@@ -598,82 +603,6 @@ func (ui *LauncherUI) LoadConfiguration() {
 	log.Printf("UI LoadConfiguration: configuration loaded successfully")
 }

-// showDownloadProgress shows a progress window for downloading LocalAI
-func (ui *LauncherUI) showDownloadProgress(version, title string) {
-	fyne.DoAndWait(func() {
-		// Create progress window using the launcher's app
-		progressWindow := ui.launcher.app.NewWindow("Downloading LocalAI")
-		progressWindow.Resize(fyne.NewSize(400, 250))
-		progressWindow.CenterOnScreen()
-
-		// Progress bar
-		progressBar := widget.NewProgressBar()
-		progressBar.SetValue(0)
-
-		// Status label. Truncate with an ellipsis so a long "Download failed:
-		// <url>" message can't stretch the window (and progress bar) to fit the
-		// whole error on one line; the full error is shown in the dialog below.
-		statusLabel := widget.NewLabel("Preparing download...")
-		statusLabel.Truncation = fyne.TextTruncateEllipsis
-
-		// Release notes button
-		releaseNotesButton := widget.NewButton("View Release Notes", func() {
-			releaseNotesURL, err := ui.launcher.githubReleaseNotesURL(version)
-			if err != nil {
-				log.Printf("Failed to parse URL: %v", err)
-				return
-			}
-
-			ui.launcher.app.OpenURL(releaseNotesURL)
-		})
-
-		// Progress container
-		progressContainer := container.NewVBox(
-			widget.NewLabel(title),
-			progressBar,
-			statusLabel,
-			widget.NewSeparator(),
-			releaseNotesButton,
-		)
-
-		progressWindow.SetContent(progressContainer)
-		progressWindow.Show()
-
-		// Start download in background
-		go func() {
-			err := ui.launcher.DownloadUpdate(version, func(progress float64) {
-				// Update progress bar
-				fyne.Do(func() {
-					progressBar.SetValue(progress)
-					percentage := int(progress * 100)
-					statusLabel.SetText(fmt.Sprintf("Downloading... %d%%", percentage))
-				})
-			})
-
-			// Handle completion
-			fyne.Do(func() {
-				if err != nil {
-					statusLabel.SetText(fmt.Sprintf("Download failed: %v", err))
-					// Show error dialog
-					dialog.ShowError(err, progressWindow)
-				} else {
-					statusLabel.SetText("Download completed successfully!")
-					progressBar.SetValue(1.0)
-
-					// Show success dialog
-					dialog.ShowConfirm("Installation Complete",
-						"LocalAI has been downloaded and installed successfully. You can now start LocalAI from the launcher.",
-						func(close bool) {
-							progressWindow.Close()
-							// Update status
-							ui.UpdateStatus("LocalAI installed successfully")
-						}, progressWindow)
-				}
-			})
-		}()
-	})
-}
-
 // UpdateRunningState updates UI based on LocalAI running state
 func (ui *LauncherUI) UpdateRunningState(isRunning bool) {
 	fyne.Do(func() {
--- a/core/application/application.go
+++ b/core/application/application.go
@@ -103,6 +103,11 @@ func newApplication(appConfig *config.ApplicationConfig) *Application {
 		mcpTools.CloseMCPSessions(modelName)
 	})

+	// Record a model_load backend trace for every real backend load, so the
+	// Traces UI shows which backend runtime served each model and how long
+	// the load took. Load failures are traced by the modality wrappers.
+	ml.SetLoadObserver(corebackend.ModelLoadTraceObserver(appConfig))
+
 	app := &Application{
 		backendLoader:      config.NewModelConfigLoader(appConfig.SystemState.Model.ModelsPath),
 		modelLoader:        ml,
--- a/core/application/config_file_watcher.go
+++ b/core/application/config_file_watcher.go
@@ -197,6 +197,7 @@ func readRuntimeSettingsJson(startupAppConfig config.ApplicationConfig) fileHand
 		envWatchdogBusy := appConfig.WatchDogBusy == startupAppConfig.WatchDogBusy
 		envWatchdogIdleTimeout := appConfig.WatchDogIdleTimeout == startupAppConfig.WatchDogIdleTimeout
 		envWatchdogBusyTimeout := appConfig.WatchDogBusyTimeout == startupAppConfig.WatchDogBusyTimeout
+		envWatchdogInterval := appConfig.WatchDogInterval == startupAppConfig.WatchDogInterval
 		envSingleBackend := appConfig.SingleBackend == startupAppConfig.SingleBackend
 		envMaxActiveBackends := appConfig.MaxActiveBackends == startupAppConfig.MaxActiveBackends
 		envMemoryReclaimerEnabled := appConfig.MemoryReclaimerEnabled == startupAppConfig.MemoryReclaimerEnabled
@@ -257,6 +258,14 @@ func readRuntimeSettingsJson(startupAppConfig config.ApplicationConfig) fileHand
 					xlog.Warn("invalid watchdog busy timeout in runtime_settings.json", "error", err, "timeout", *settings.WatchdogBusyTimeout)
 				}
 			}
+			if settings.WatchdogInterval != nil && !envWatchdogInterval {
+				dur, err := time.ParseDuration(*settings.WatchdogInterval)
+				if err == nil {
+					appConfig.WatchDogInterval = dur
+				} else {
+					xlog.Warn("invalid watchdog interval in runtime_settings.json", "error", err, "interval", *settings.WatchdogInterval)
+				}
+			}
 			// Handle MaxActiveBackends (new) and SingleBackend (deprecated)
 			if settings.MaxActiveBackends != nil && !envMaxActiveBackends {
 				appConfig.MaxActiveBackends = *settings.MaxActiveBackends
--- a/core/application/runtime_settings_branding_test.go
+++ b/core/application/runtime_settings_branding_test.go
@@ -87,6 +87,31 @@ var _ = Describe("loadRuntimeSettingsFromFile", func() {
 		})
 	})

+	// Watchdog check interval (issue #10601). Unlike the idle/busy timeouts
+	// (which default to 0), NewApplicationConfig used to baseline-default the
+	// interval to 500ms. The loader's "apply file value only if still at the
+	// zero default" env-detection therefore never fired for the interval, so
+	// a UI-saved Check Interval silently reverted to 500ms on every restart
+	// while the idle/busy timeouts persisted. These specs construct the
+	// config the same way boot does (NewApplicationConfig) so they observe
+	// the real default the loader sees.
+	Describe("watchdog interval", func() {
+		It("loads a UI-saved watchdog_interval on the next startup", func() {
+			cfg := config.NewApplicationConfig()
+			cfg.DynamicConfigsDir = seedSettings(`{"watchdog_interval": "2s"}`)
+			loadRuntimeSettingsFromFile(cfg)
+			Expect(cfg.WatchDogInterval).To(Equal(2 * time.Second))
+		})
+
+		It("does not override an explicit env/CLI interval", func() {
+			cfg := config.NewApplicationConfig()
+			cfg.DynamicConfigsDir = seedSettings(`{"watchdog_interval": "2s"}`)
+			cfg.WatchDogInterval = 1 * time.Second // simulate SetWatchDogInterval from env
+			loadRuntimeSettingsFromFile(cfg)
+			Expect(cfg.WatchDogInterval).To(Equal(1*time.Second), "env/CLI interval must win over the persisted file value")
+		})
+	})
+
 	// MITM listener address. The file is the only source — no env var
 	// exists — so a regression here means an admin who configured the
 	// listener via /api/settings loses it after a reboot, even though
--- a/core/backend/model_load_trace_test.go
+++ b/core/backend/model_load_trace_test.go
@@ -0,0 +1,72 @@
+package backend_test
+
+import (
+	"errors"
+	"time"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	"github.com/mudler/LocalAI/core/backend"
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/trace"
+	"github.com/mudler/LocalAI/pkg/model"
+)
+
+// ModelLoadTraceObserver is what makes successful loads visible on the
+// Traces page: one model_load row per real backend load, carrying the
+// resolved backend runtime. Failures must NOT be recorded here — the
+// modality wrappers own those — and the observer must respect the runtime
+// tracing toggle.
+var _ = Describe("ModelLoadTraceObserver", func() {
+	var appConfig *config.ApplicationConfig
+
+	successEvent := model.BackendLoadEvent{
+		ModelID:    "parakeet-cpp-realtime_eou_120m-v1",
+		ModelName:  "realtime_eou_120m.gguf",
+		Backend:    "parakeet-cpp",
+		BackendURI: "/backends/intel-sycl-f16-parakeet-cpp-development/run.sh",
+		Duration:   1500 * time.Millisecond,
+	}
+
+	BeforeEach(func() {
+		appConfig = &config.ApplicationConfig{
+			EnableTracing:   true,
+			TracingMaxItems: 64,
+		}
+		trace.InitBackendTracingIfEnabled(appConfig.TracingMaxItems, appConfig.TracingMaxBodyBytes)
+		trace.ClearBackendTraces()
+	})
+
+	It("records a model_load trace with the backend runtime on success", func() {
+		backend.ModelLoadTraceObserver(appConfig)(successEvent)
+
+		Eventually(trace.GetBackendTraces).Should(HaveLen(1))
+		got := trace.GetBackendTraces()[0]
+		Expect(got.Type).To(Equal(trace.BackendTraceModelLoad))
+		Expect(got.Summary).To(Equal("Model loaded"))
+		Expect(got.ModelName).To(Equal("parakeet-cpp-realtime_eou_120m-v1"))
+		Expect(got.Backend).To(Equal("parakeet-cpp"))
+		Expect(got.Duration).To(Equal(1500 * time.Millisecond))
+		Expect(got.Data["backend_runtime"]).To(Equal("/backends/intel-sycl-f16-parakeet-cpp-development/run.sh"))
+		Expect(got.Data["model_file"]).To(Equal("realtime_eou_120m.gguf"))
+		Expect(got.Error).To(BeEmpty())
+	})
+
+	It("skips failed loads — the modality wrappers trace those with request context", func() {
+		failed := successEvent
+		failed.Err = errors.New("grpc service not ready")
+
+		backend.ModelLoadTraceObserver(appConfig)(failed)
+
+		Consistently(trace.GetBackendTraces, "100ms", "20ms").Should(BeEmpty())
+	})
+
+	It("records nothing when tracing is disabled", func() {
+		appConfig.EnableTracing = false
+
+		backend.ModelLoadTraceObserver(appConfig)(successEvent)
+
+		Consistently(trace.GetBackendTraces, "100ms", "20ms").Should(BeEmpty())
+	})
+})
--- a/core/backend/options.go
+++ b/core/backend/options.go
@@ -19,6 +19,39 @@ import (
 	"github.com/mudler/xlog"
 )

+// ModelLoadTraceObserver returns the ModelLoader load observer that records
+// a model_load backend trace for every successful real load (backend process
+// spawn + LoadModel RPC; cache hits never reach the observer). Failures are
+// deliberately skipped here: the modality wrappers already record them via
+// recordModelLoadFailure with request context, and the backend auto-discovery
+// scan probes several backends before one succeeds — tracing every probe
+// failure would bury the buffer in noise.
+//
+// The traced data includes the resolved backend runtime (the installed
+// backend's launcher path, which names the variant directory) — that is what
+// identifies WHICH build served the load. A stale installed backend is
+// invisible in the model config but obvious here.
+func ModelLoadTraceObserver(appConfig *config.ApplicationConfig) func(model.BackendLoadEvent) {
+	return func(ev model.BackendLoadEvent) {
+		if ev.Err != nil || !appConfig.EnableTracing {
+			return
+		}
+		trace.InitBackendTracingIfEnabled(appConfig.TracingMaxItems, appConfig.TracingMaxBodyBytes)
+		trace.RecordBackendTrace(trace.BackendTrace{
+			Timestamp: time.Now(),
+			Duration:  ev.Duration,
+			Type:      trace.BackendTraceModelLoad,
+			ModelName: ev.ModelID,
+			Backend:   ev.Backend,
+			Summary:   "Model loaded",
+			Data: map[string]any{
+				"model_file":      ev.ModelName,
+				"backend_runtime": ev.BackendURI,
+			},
+		})
+	}
+}
+
 // recordModelLoadFailure records a backend trace when model loading fails.
 func recordModelLoadFailure(appConfig *config.ApplicationConfig, modelName, backend string, err error, data map[string]any) {
 	if !appConfig.EnableTracing {
--- a/core/backend/transcript.go
+++ b/core/backend/transcript.go
@@ -181,6 +181,7 @@ func transcriptResultFromProto(r *proto.TranscriptResult) *schema.TranscriptionR
 		Text:     r.Text,
 		Language: r.Language,
 		Duration: float64(r.Duration),
+		Eou:      r.Eou,
 	}

 	for _, s := range r.Segments {
--- a/core/backend/transcript_live.go
+++ b/core/backend/transcript_live.go
@@ -0,0 +1,297 @@
+package backend
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"io"
+	"maps"
+	"sync"
+	"time"
+
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/schema"
+	"github.com/mudler/LocalAI/core/trace"
+	grpcPkg "github.com/mudler/LocalAI/pkg/grpc"
+	"github.com/mudler/LocalAI/pkg/grpc/proto"
+	"github.com/mudler/LocalAI/pkg/model"
+	"github.com/mudler/LocalAI/pkg/sound"
+	"github.com/mudler/xlog"
+)
+
+// LiveTranscriptionEvent is one streamed event from a live (bidirectional)
+// transcription session. Delta/Eou/Eob/Words arrive as the user speaks; Final
+// is set exactly once, on the terminal event after Close flushes the decode
+// tail. Eou means the model judged the user yielded the turn; Eob means a
+// backchannel ("uh-huh") ended — callers must NOT treat Eob as a turn
+// boundary.
+type LiveTranscriptionEvent struct {
+	Delta string
+	Eou   bool
+	Eob   bool
+	Words []schema.TranscriptionWord
+	Final *schema.TranscriptionResult
+}
+
+// LiveTranscriptionSession is a handle on an open live transcription stream.
+// Feed pushes 16 kHz mono float PCM; Close signals end-of-audio, waits for
+// the backend's terminal Final event to be delivered, and releases the
+// stream.
+type LiveTranscriptionSession interface {
+	Feed(pcm []float32) error
+	Close() error
+}
+
+// liveCloseDrainTimeout bounds how long Close waits for the backend to flush
+// the decode tail before force-cancelling the stream. Finalize is one short
+// engine call; seconds here means the backend is wedged.
+const liveCloseDrainTimeout = 10 * time.Second
+
+type liveTranscriptionSession struct {
+	stream    grpcPkg.AudioTranscriptionLiveClient
+	cancel    context.CancelFunc
+	recvDone  chan struct{}
+	recvErr   error // written by the recv goroutine before recvDone closes
+	closeOnce sync.Once
+	closeErr  error
+	trace     *liveTraceState // nil when tracing was disabled at open
+}
+
+func (s *liveTranscriptionSession) Feed(pcm []float32) error {
+	s.trace.addPCM(pcm)
+	return s.stream.Send(&proto.TranscriptLiveRequest{
+		Payload: &proto.TranscriptLiveRequest_Audio{Audio: &proto.TranscriptLiveAudio{Pcm: pcm}},
+	})
+}
+
+func (s *liveTranscriptionSession) Close() error {
+	s.closeOnce.Do(func() {
+		err := s.stream.CloseSend()
+		select {
+		case <-s.recvDone:
+		case <-time.After(liveCloseDrainTimeout):
+			xlog.Warn("live transcription: backend did not finalize in time; cancelling stream")
+			s.cancel()
+			<-s.recvDone
+		}
+		s.cancel()
+		if err == nil {
+			err = s.recvErr
+		}
+		s.closeErr = err
+		s.trace.record(err)
+	})
+	return s.closeErr
+}
+
+// liveSampleRate is the PCM rate of a live transcription session, fixed by
+// the session config sent in ModelTranscriptionLive.
+const liveSampleRate = 16000
+
+// liveTraceState accumulates what the per-turn backend trace needs while a
+// live session runs: a bounded copy of the fed PCM for the audio snippet,
+// the decode outputs, and timing. One trace is recorded at Close — the live
+// path never touches the unary transcription wrapper, so without this a
+// streaming-only pipeline produced no transcription traces at all. Feed and
+// the recv goroutine run concurrently; mu guards the accumulators.
+type liveTraceState struct {
+	appConfig *config.ApplicationConfig
+	modelName string
+	backend   string
+	language  string
+	started   time.Time
+
+	mu          sync.Mutex
+	pcm         []byte // first trace.MaxSnippetSeconds of fed audio, int16 LE
+	fedSamples  int    // ALL samples fed, including those beyond the snippet cap
+	deltaEvents int
+	eouEvents   int
+	eobEvents   int
+	finalText   string
+}
+
+func newLiveTraceState(modelConfig config.ModelConfig, appConfig *config.ApplicationConfig, language string) *liveTraceState {
+	if !appConfig.EnableTracing {
+		return nil
+	}
+	trace.InitBackendTracingIfEnabled(appConfig.TracingMaxItems, appConfig.TracingMaxBodyBytes)
+	return &liveTraceState{
+		appConfig: appConfig,
+		modelName: modelConfig.Name,
+		backend:   modelConfig.Backend,
+		language:  language,
+		started:   time.Now(),
+	}
+}
+
+func (ts *liveTraceState) addPCM(pcm []float32) {
+	if ts == nil {
+		return
+	}
+	ts.mu.Lock()
+	defer ts.mu.Unlock()
+	ts.fedSamples += len(pcm)
+	maxBytes := trace.MaxSnippetSeconds * liveSampleRate * 2
+	if room := (maxBytes - len(ts.pcm)) / 2; room > 0 {
+		if len(pcm) > room {
+			pcm = pcm[:room]
+		}
+		ts.pcm = append(ts.pcm, sound.Float32sToInt16LEBytes(pcm)...)
+	}
+}
+
+func (ts *liveTraceState) observe(ev LiveTranscriptionEvent) {
+	if ts == nil {
+		return
+	}
+	ts.mu.Lock()
+	defer ts.mu.Unlock()
+	if ev.Delta != "" {
+		ts.deltaEvents++
+	}
+	if ev.Eou {
+		ts.eouEvents++
+	}
+	if ev.Eob {
+		ts.eobEvents++
+	}
+	if ev.Final != nil {
+		ts.finalText = ev.Final.Text
+	}
+}
+
+func (ts *liveTraceState) record(closeErr error) {
+	if ts == nil || !ts.appConfig.EnableTracing {
+		return
+	}
+	ts.mu.Lock()
+	data := map[string]any{
+		"source":       "live_stream",
+		"language":     ts.language,
+		"result_text":  ts.finalText,
+		"eou_events":   ts.eouEvents,
+		"eob_events":   ts.eobEvents,
+		"delta_events": ts.deltaEvents,
+	}
+	if snippet := trace.AudioSnippetFromPCM(ts.pcm, liveSampleRate, ts.fedSamples*2, ts.appConfig.TracingMaxBodyBytes); snippet != nil {
+		maps.Copy(data, snippet)
+	}
+	summary := "live -> " + ts.finalText
+	ts.mu.Unlock()
+
+	bt := trace.BackendTrace{
+		Timestamp: ts.started,
+		Duration:  time.Since(ts.started),
+		Type:      trace.BackendTraceTranscription,
+		ModelName: ts.modelName,
+		Backend:   ts.backend,
+		Summary:   trace.TruncateString(summary, 200),
+		Data:      data,
+	}
+	if closeErr != nil {
+		bt.Error = closeErr.Error()
+	}
+	trace.RecordBackendTrace(bt)
+}
+
+// ModelTranscriptionLive loads the transcription backend, opens the
+// bidirectional AudioTranscriptionLive RPC, sends the session config, and
+// BLOCKS until the backend's ready ack arrives. A
+// grpcerrors.IsLiveTranscriptionUnsupported error means the backend (or the
+// loaded model) cannot do live transcription and the caller should degrade to
+// the unary/file path. After a successful return, onEvent is invoked from a
+// background goroutine — in order, one event at a time — for every non-empty
+// response the backend streams, ending with the Final event triggered by Close.
+func ModelTranscriptionLive(ctx context.Context, language string,
+	ml *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig,
+	onEvent func(LiveTranscriptionEvent)) (LiveTranscriptionSession, error) {
+
+	transcriptionModel, err := loadTranscriptionModel(ctx, ml, modelConfig, appConfig)
+	if err != nil {
+		return nil, err
+	}
+
+	// The derived cancel outlives this call inside the session: Close uses
+	// it to unwind the stream (and, in embed mode, the server-side recv
+	// pump, which only stops on send-close or context cancellation).
+	streamCtx, cancel := context.WithCancel(ctx)
+	stream, err := transcriptionModel.AudioTranscriptionLive(streamCtx)
+	if err != nil {
+		cancel()
+		return nil, err
+	}
+
+	fail := func(err error) (LiveTranscriptionSession, error) {
+		_ = stream.CloseSend()
+		cancel()
+		return nil, err
+	}
+
+	if err := stream.Send(&proto.TranscriptLiveRequest{
+		Payload: &proto.TranscriptLiveRequest_Config{Config: &proto.TranscriptLiveConfig{
+			Language:   language,
+			SampleRate: liveSampleRate,
+		}},
+	}); err != nil {
+		return fail(err)
+	}
+
+	// Ready-ack contract: the backend answers a successful open with a
+	// {ready:true} response before any transcript data; unsupported
+	// backends surface Unimplemented here instead.
+	ack, err := stream.Recv()
+	if err != nil {
+		return fail(err)
+	}
+	if !ack.GetReady() {
+		return fail(fmt.Errorf("live transcription: backend %q broke the ready-ack contract (first response carried data)", modelConfig.Backend))
+	}
+
+	s := &liveTranscriptionSession{
+		stream:   stream,
+		cancel:   cancel,
+		recvDone: make(chan struct{}),
+		trace:    newLiveTraceState(modelConfig, appConfig, language),
+	}
+
+	go func() {
+		defer close(s.recvDone)
+		for {
+			resp, err := stream.Recv()
+			if err != nil {
+				if !errors.Is(err, io.EOF) && streamCtx.Err() == nil {
+					xlog.Warn("live transcription stream ended unexpectedly", "error", err)
+					s.recvErr = err
+				}
+				return
+			}
+			ev := liveEventFromProto(resp)
+			if ev.Delta == "" && !ev.Eou && !ev.Eob && len(ev.Words) == 0 && ev.Final == nil {
+				continue // duplicate ready ack / keep-alive: nothing to deliver
+			}
+			s.trace.observe(ev)
+			onEvent(ev)
+		}
+	}()
+
+	return s, nil
+}
+
+func liveEventFromProto(r *proto.TranscriptLiveResponse) LiveTranscriptionEvent {
+	ev := LiveTranscriptionEvent{
+		Delta: r.GetDelta(),
+		Eou:   r.GetEou(),
+		Eob:   r.GetEob(),
+	}
+	for _, w := range r.GetWords() {
+		ev.Words = append(ev.Words, schema.TranscriptionWord{
+			Start: time.Duration(w.Start),
+			End:   time.Duration(w.End),
+			Text:  w.Text,
+		})
+	}
+	if r.GetFinalResult() != nil {
+		ev.Final = transcriptResultFromProto(r.GetFinalResult())
+	}
+	return ev
+}
--- a/core/backend/transcript_live_internal_test.go
+++ b/core/backend/transcript_live_internal_test.go
@@ -0,0 +1,162 @@
+package backend
+
+import (
+	"errors"
+	"time"
+
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/schema"
+	"github.com/mudler/LocalAI/core/trace"
+	"github.com/mudler/LocalAI/pkg/grpc/proto"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// Specs for liveEventFromProto: each case builds a TranscriptLiveResponse and
+// checks the fields of the LiveTranscriptionEvent it is mapped to.
+var _ = Describe("liveEventFromProto", func() {
+	It("maps deltas, eou flags and words (ns -> duration)", func() {
+		ev := liveEventFromProto(&proto.TranscriptLiveResponse{
+			Delta: "hello ",
+			Eou:   true,
+			Words: []*proto.TranscriptWord{
+				// Word offsets are supplied as int64 nanosecond counts
+				// (a time.Duration cast to int64).
+				{Start: int64(100 * time.Millisecond), End: int64(400 * time.Millisecond), Text: "hello"},
+			},
+		})
+		Expect(ev.Delta).To(Equal("hello "))
+		Expect(ev.Eou).To(BeTrue())
+		Expect(ev.Words).To(HaveLen(1))
+		Expect(ev.Words[0].Text).To(Equal("hello"))
+		Expect(ev.Words[0].Start).To(Equal(100 * time.Millisecond))
+		Expect(ev.Words[0].End).To(Equal(400 * time.Millisecond))
+		// No FinalResult was set on the response, so Final must stay nil.
+		Expect(ev.Final).To(BeNil())
+	})
+
+	It("maps the terminal final result including the eou flag", func() {
+		ev := liveEventFromProto(&proto.TranscriptLiveResponse{
+			FinalResult: &proto.TranscriptResult{
+				Text:     "hello world",
+				Duration: 1.5,
+				Eou:      true,
+				Segments: []*proto.TranscriptSegment{{Id: 0, Text: "hello world"}},
+			},
+		})
+		Expect(ev.Final).NotTo(BeNil())
+		Expect(ev.Final.Text).To(Equal("hello world"))
+		// Duration is compared with a tolerance rather than for exact equality.
+		Expect(ev.Final.Duration).To(BeNumerically("~", 1.5, 1e-6))
+		Expect(ev.Final.Eou).To(BeTrue())
+		Expect(ev.Final.Segments).To(HaveLen(1))
+	})
+
+	It("yields an empty event for a bare ready ack (filtered by the recv loop)", func() {
+		// Ready is not copied onto the event, so every event field stays zero.
+		ev := liveEventFromProto(&proto.TranscriptLiveResponse{Ready: true})
+		Expect(ev.Delta).To(BeEmpty())
+		Expect(ev.Eou).To(BeFalse())
+		Expect(ev.Words).To(BeEmpty())
+		Expect(ev.Final).To(BeNil())
+	})
+
+	It("maps the eob backchannel flag separately from eou", func() {
+		ev := liveEventFromProto(&proto.TranscriptLiveResponse{Delta: "uh-huh", Eob: true})
+		Expect(ev.Eob).To(BeTrue())
+		Expect(ev.Eou).To(BeFalse())
+	})
+})
+
+// liveTraceState is what makes streaming-only pipelines visible on the
+// Traces page: without it a semantic_vad session with retranscribe off
+// produced no transcription trace at all. One trace per session (= one per
+// realtime turn), recorded at Close.
+//
+// These specs drive a liveTraceState directly (addPCM / observe / record)
+// and inspect the result through trace.GetBackendTraces.
+var _ = Describe("liveTraceState", func() {
+	var appConfig *config.ApplicationConfig
+
+	BeforeEach(func() {
+		// Fresh tracing-enabled config for every spec.
+		appConfig = &config.ApplicationConfig{
+			EnableTracing:   true,
+			TracingMaxItems: 64,
+		}
+		// NOTE(review): presumably (re)initialises the backend trace buffer and
+		// empties it so each spec starts with no recorded traces — the
+		// HaveLen(1) assertions below rely on that; confirm in core/trace.
+		trace.InitBackendTracingIfEnabled(appConfig.TracingMaxItems, appConfig.TracingMaxBodyBytes)
+		trace.ClearBackendTraces()
+	})
+
+	// modelCfg builds the model config shared by the specs. Name is assigned
+	// after the literal rather than inside it — presumably because it is a
+	// promoted field of an embedded struct; confirm against config.ModelConfig.
+	modelCfg := func() config.ModelConfig {
+		cfg := config.ModelConfig{Backend: "parakeet-cpp"}
+		cfg.Name = "parakeet-live"
+		return cfg
+	}
+
+	It("is disabled (nil) when tracing is off, and nil receivers are no-ops", func() {
+		appConfig.EnableTracing = false
+		ts := newLiveTraceState(modelCfg(), appConfig, "en")
+		Expect(ts).To(BeNil())
+
+		// The session calls these unconditionally; nil must be safe.
+		ts.addPCM([]float32{0.5})
+		ts.observe(LiveTranscriptionEvent{Eou: true})
+		ts.record(nil)
+		// No trace may show up at any point in a 100ms window, polled every 20ms.
+		Consistently(trace.GetBackendTraces, "100ms", "20ms").Should(BeEmpty())
+	})
+
+	It("records one transcription trace with text, eou event counts and audio snippet at Close", func() {
+		ts := newLiveTraceState(modelCfg(), appConfig, "en")
+		Expect(ts).NotTo(BeNil())
+
+		// One second of a loud-ish constant tone so the snippet has signal.
+		pcm := make([]float32, liveSampleRate)
+		for i := range pcm {
+			pcm[i] = 0.25
+		}
+		ts.addPCM(pcm)
+		// Two delta events (one of them flagged eou) followed by a final
+		// result; the counts asserted below are derived from these.
+		ts.observe(LiveTranscriptionEvent{Delta: "hello "})
+		ts.observe(LiveTranscriptionEvent{Delta: "world", Eou: true})
+		ts.observe(LiveTranscriptionEvent{Final: &schema.TranscriptionResult{Text: "hello world", Eou: true}})
+
+		ts.record(nil)
+
+		// Eventually polls until the recorded trace becomes visible.
+		Eventually(trace.GetBackendTraces).Should(HaveLen(1))
+		got := trace.GetBackendTraces()[0]
+		Expect(got.Type).To(Equal(trace.BackendTraceTranscription))
+		Expect(got.ModelName).To(Equal("parakeet-live"))
+		Expect(got.Backend).To(Equal("parakeet-cpp"))
+		Expect(got.Summary).To(ContainSubstring("hello world"))
+		Expect(got.Data["source"]).To(Equal("live_stream"))
+		Expect(got.Data["result_text"]).To(Equal("hello world"))
+		// The live FinalResult no longer carries a terminal eou flag; the
+		// per-feed eou_events count is what the trace records instead.
+		Expect(got.Data).NotTo(HaveKey("eou"))
+		Expect(got.Data["eou_events"]).To(Equal(1))
+		Expect(got.Data["delta_events"]).To(Equal(2))
+		Expect(got.Data["audio_duration_s"]).To(BeNumerically("~", 1.0, 0.01))
+		Expect(got.Data["audio_wav_base64"]).NotTo(BeEmpty())
+		Expect(got.Error).To(BeEmpty())
+	})
+
+	It("caps the stored snippet but keeps counting the full fed duration", func() {
+		ts := newLiveTraceState(modelCfg(), appConfig, "")
+
+		// Feed past the snippet cap in two chunks (cap + one extra second).
+		ts.addPCM(make([]float32, trace.MaxSnippetSeconds*liveSampleRate))
+		ts.addPCM(make([]float32, liveSampleRate))
+
+		// ts.pcm is a byte buffer holding two bytes per sample (the clamp spec
+		// below shows two samples becoming four bytes), hence the * 2.
+		Expect(len(ts.pcm)).To(Equal(trace.MaxSnippetSeconds * liveSampleRate * 2))
+		// fedSamples counts every sample fed, including the part past the cap.
+		Expect(ts.fedSamples).To(Equal((trace.MaxSnippetSeconds + 1) * liveSampleRate))
+
+		ts.record(nil)
+		Eventually(trace.GetBackendTraces).Should(HaveLen(1))
+		got := trace.GetBackendTraces()[0]
+		Expect(got.Data["audio_duration_s"]).To(BeNumerically("~", float64(trace.MaxSnippetSeconds+1), 0.01))
+		Expect(got.Data["audio_snippet_s"]).To(BeNumerically("~", float64(trace.MaxSnippetSeconds), 0.01))
+	})
+
+	It("clamps out-of-range float samples instead of wrapping", func() {
+		ts := newLiveTraceState(modelCfg(), appConfig, "")
+		ts.addPCM([]float32{2.0, -2.0})
+		// Expected bytes are the two int16 extremes in little-endian order.
+		Expect(ts.pcm).To(Equal([]byte{0xff, 0x7f, 0x00, 0x80})) // 32767, -32768
+	})
+
+	It("stamps the close error on the trace", func() {
+		ts := newLiveTraceState(modelCfg(), appConfig, "")
+		ts.record(errors.New("stream torn down"))
+
+		Eventually(trace.GetBackendTraces).Should(HaveLen(1))
+		Expect(trace.GetBackendTraces()[0].Error).To(Equal("stream torn down"))
+	})
+})
--- a/core/config/application_config.go
+++ b/core/config/application_config.go
@@ -6,6 +6,7 @@ import (
 	"regexp"
 	"time"

+	"github.com/mudler/LocalAI/pkg/model"
 	"github.com/mudler/LocalAI/pkg/system"
 	"github.com/mudler/LocalAI/pkg/xsysinfo"
 	"github.com/mudler/xlog"
@@ -241,12 +242,19 @@ func NewApplicationConfig(o ...AppOption) *ApplicationConfig {
 		Context:                  context.Background(),
 		UploadLimitMB:            15,
 		Debug:                    true,
-		AgentJobRetentionDays:    30,                     // Default: 30 days
-		LRUEvictionMaxRetries:    30,                     // Default: 30 retries
-		LRUEvictionRetryInterval: 1 * time.Second,        // Default: 1 second
-		WatchDogInterval:         500 * time.Millisecond, // Default: 500ms
-		TracingMaxItems:          1024,
-		TracingMaxBodyBytes:      64 * 1024, // 64 KiB - caps each request/response body in the trace buffer
+		AgentJobRetentionDays:    30,              // Default: 30 days
+		LRUEvictionMaxRetries:    30,              // Default: 30 retries
+		LRUEvictionRetryInterval: 1 * time.Second, // Default: 1 second
+		// WatchDogInterval is intentionally left at the zero value here.
+		// The startup loader applies a persisted runtime_settings.json value
+		// only when the interval is still 0 (its "not set by env var"
+		// heuristic, matching the idle/busy timeouts); a non-zero baseline
+		// default would defeat that and silently revert a UI-saved Check
+		// Interval to the default on every restart (#10601). The effective
+		// 500ms default is supplied at the watchdog layer (DefaultWatchdogInterval)
+		// when the value is still 0.
+		TracingMaxItems:     1024,
+		TracingMaxBodyBytes: 64 * 1024, // 64 KiB - caps each request/response body in the trace buffer
 		AgentPool: AgentPoolConfig{
 			Enabled:         true,
 			Timeout:         "5m",
@@ -1097,7 +1105,7 @@ func (o *ApplicationConfig) ToRuntimeSettings() RuntimeSettings {
 	if o.WatchDogInterval > 0 {
 		watchdogInterval = o.WatchDogInterval.String()
 	} else {
-		watchdogInterval = "2s" // default
+		watchdogInterval = model.DefaultWatchdogInterval.String() // default: 500ms
 	}
 	var lruEvictionRetryInterval string
 	if o.LRUEvictionRetryInterval > 0 {
--- a/core/config/meta/registry.go
+++ b/core/config/meta/registry.go
@@ -567,6 +567,38 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Advanced:    true,
 			Order:       83,
 		},
+		"pipeline.turn_detection.type": {
+			Section:     "pipeline",
+			Label:       "Turn Detection",
+			Description: "Default turn-detection mode for realtime sessions on this pipeline. server_vad commits after a fixed silence window; semantic_vad lets the transcription model's end-of-utterance token drive a dynamic window (fast commit after the token, long eagerness fallback without it). semantic_vad requires a streaming-EOU transcription model (e.g. parakeet-cpp-realtime_eou_120m-v1) and degrades to silence-only otherwise. Clients can override per session via session.update.",
+			Component:   "select",
+			Options: []FieldOption{
+				{Value: "", Label: "Default (server_vad)"},
+				{Value: "server_vad", Label: "server_vad (silence-based)"},
+				{Value: "semantic_vad", Label: "semantic_vad (end-of-utterance token)"},
+			},
+			Order: 87,
+		},
+		"pipeline.turn_detection.eagerness": {
+			Section:     "pipeline",
+			Label:       "Eagerness",
+			Description: "semantic_vad fallback silence window used when no end-of-utterance token was seen: low waits 8s, medium/auto 4s, high 2s.",
+			Component:   "select",
+			Options: []FieldOption{
+				{Value: "", Label: "Default (auto)"},
+				{Value: "low", Label: "low (8s)"},
+				{Value: "medium", Label: "medium (4s)"},
+				{Value: "high", Label: "high (2s)"},
+			},
+			Order: 88,
+		},
+		"pipeline.turn_detection.retranscribe": {
+			Section:     "pipeline",
+			Label:       "Retranscribe on Commit",
+			Description: "Cross-check every semantic_vad commit with an offline decode of the buffered turn: commit only proceeds when the batch decode also ends in the end-of-utterance token, and its transcript is used. Logs a streamed-vs-batch comparison — useful to gauge streaming/batch alignment — at the cost of one extra decode per turn.",
+			Component:   "toggle",
+			Order:       89,
+		},

 		// --- Functions ---
 		"function.grammar.parallel_calls": {
--- a/core/config/model_config.go
+++ b/core/config/model_config.go
@@ -650,6 +650,12 @@ type Pipeline struct {
 	// VoiceRecognition gates the pipeline behind speaker verification. Nil
 	// (block absent) means no gate, preserving existing behavior.
 	VoiceRecognition *PipelineVoiceRecognition `yaml:"voice_recognition,omitempty" json:"voice_recognition,omitempty"`
+
+	// TurnDetection sets the server-side default turn-detection mode for
+	// realtime sessions on this pipeline, so clients need no session.update
+	// to benefit. A client session.update still overrides type and eagerness
+	// per session; retranscribe is server-side only. Unset keeps server_vad.
+	TurnDetection PipelineTurnDetection `yaml:"turn_detection,omitempty" json:"turn_detection,omitempty"`
 }

 // PipelineCompaction configures summarize-then-drop for a realtime pipeline.
@@ -934,6 +940,38 @@ func (v PipelineVoiceRecognition) Validate(registryAvailable bool) error {
 	return nil
 }

+// @Description PipelineTurnDetection sets realtime turn-detection defaults.
+type PipelineTurnDetection struct {
+	// Type selects the default turn_detection mode for sessions on this
+	// pipeline: "server_vad" (silence-based) or "semantic_vad" (the
+	// transcription model's end-of-utterance token drives a dynamic silence
+	// window; needs a streaming-EOU transcription model such as
+	// parakeet_realtime_eou_120m-v1, degrades to silence-only otherwise).
+	// Pipeline.TurnDetectionSemantic compares this value case-insensitively
+	// after trimming surrounding whitespace.
+	Type string `yaml:"type,omitempty" json:"type,omitempty"`
+	// Eagerness is the semantic_vad fallback when no end-of-utterance token
+	// was seen: low waits 8s of silence, medium/auto 4s, high 2s.
+	Eagerness string `yaml:"eagerness,omitempty" json:"eagerness,omitempty"`
+	// Retranscribe (semantic_vad only) cross-checks every EOU-triggered
+	// commit with an offline decode of the buffered turn: the commit only
+	// proceeds when the batch decode also ends in the end-of-utterance token,
+	// and its transcript is the one used. The streamed and batch transcripts
+	// are compared in the logs — a diagnostic for streaming/batch alignment
+	// at the cost of one extra decode per turn.
+	// A nil pointer means the key was not set;
+	// Pipeline.TurnDetectionRetranscribe reports false in that case.
+	Retranscribe *bool `yaml:"retranscribe,omitempty" json:"retranscribe,omitempty"`
+}
+
+// TurnDetectionSemantic returns true when the pipeline's configured
+// turn_detection type is "semantic_vad". The comparison ignores case and
+// surrounding whitespace; any other value, including empty, yields false.
+func (p Pipeline) TurnDetectionSemantic() bool {
+	mode := strings.TrimSpace(p.TurnDetection.Type)
+	return strings.EqualFold(mode, "semantic_vad")
+}
+
+// TurnDetectionRetranscribe returns the configured retranscribe flag,
+// treating an unset (nil) value as false. It does not itself check that the
+// turn-detection type is semantic_vad.
+func (p Pipeline) TurnDetectionRetranscribe() bool {
+	flag := p.TurnDetection.Retranscribe
+	if flag == nil {
+		return false
+	}
+	return *flag
+}
+
 // @Description File configuration for model downloads
 type File struct {
 	Filename string         `yaml:"filename,omitempty" json:"filename,omitempty"`
--- a/core/config/pipeline_turn_detection_test.go
+++ b/core/config/pipeline_turn_detection_test.go
@@ -0,0 +1,61 @@
+package config
+
+import (
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+	"gopkg.in/yaml.v3"
+)
+
+// pipeline.turn_detection sets the server-side default turn-detection mode
+// for realtime sessions. Unset keeps server_vad, so existing configs are
+// unaffected; retranscribe is opt-in.
+//
+// The specs cover the zero value, YAML parsing of the nested block, and the
+// two helper predicates on Pipeline.
+var _ = Describe("Pipeline turn_detection config", func() {
+	It("defaults to non-semantic with retranscribe off when unset", func() {
+		// Zero-value Pipeline: empty Type and nil Retranscribe.
+		var p Pipeline
+		Expect(p.TurnDetectionSemantic()).To(BeFalse())
+		Expect(p.TurnDetectionRetranscribe()).To(BeFalse())
+	})
+
+	It("parses the nested turn_detection block from YAML", func() {
+		var c ModelConfig
+		err := yaml.Unmarshal([]byte(`
+name: gpt-realtime
+pipeline:
+  transcription: parakeet-cpp-realtime_eou_120m-v1
+  turn_detection:
+    type: semantic_vad
+    eagerness: high
+    retranscribe: true
+`), &c)
+		Expect(err).ToNot(HaveOccurred())
+		Expect(c.Pipeline.TurnDetectionSemantic()).To(BeTrue())
+		Expect(c.Pipeline.TurnDetection.Eagerness).To(Equal("high"))
+		Expect(c.Pipeline.TurnDetectionRetranscribe()).To(BeTrue())
+	})
+
+	It("treats server_vad and unknown types as non-semantic", func() {
+		var p Pipeline
+		p.TurnDetection.Type = "server_vad"
+		Expect(p.TurnDetectionSemantic()).To(BeFalse())
+		// Only the exact semantic_vad token counts; anything else is false.
+		p.TurnDetection.Type = "something_else"
+		Expect(p.TurnDetectionSemantic()).To(BeFalse())
+	})
+
+	It("matches semantic_vad case-insensitively with surrounding space", func() {
+		var p Pipeline
+		p.TurnDetection.Type = " Semantic_VAD "
+		Expect(p.TurnDetectionSemantic()).To(BeTrue())
+	})
+
+	It("treats an explicit retranscribe false as off", func() {
+		// Retranscribe is a *bool, so an explicit false is a non-nil pointer,
+		// distinct from an absent key; both must read as off.
+		var c ModelConfig
+		err := yaml.Unmarshal([]byte(`
+pipeline:
+  turn_detection:
+    type: semantic_vad
+    retranscribe: false
+`), &c)
+		Expect(err).ToNot(HaveOccurred())
+		Expect(c.Pipeline.TurnDetectionRetranscribe()).To(BeFalse())
+	})
+})
--- a/core/gallery/importers/importers_test.go
+++ b/core/gallery/importers/importers_test.go
@@ -22,11 +22,13 @@ var _ = Describe("DiscoverModelConfig", func() {
 			modelConfig, err := importers.DiscoverModelConfig(uri, preferences)

 			Expect(err).ToNot(HaveOccurred(), fmt.Sprintf("Error: %v", err))
-			Expect(modelConfig.Name).To(Equal("LocalAI-functioncall-qwen2.5-7b-v0.5-Q4_K_M-GGUF"), fmt.Sprintf("Model config: %+v", modelConfig))
+			// No name preference + repo-root URI: the name follows the selected
+			// GGUF file, not the repo (issue #10587).
+			Expect(modelConfig.Name).To(Equal("localai-functioncall-qwen2.5-7b-v0.5-q4_k_m"), fmt.Sprintf("Model config: %+v", modelConfig))
 			Expect(modelConfig.Description).To(Equal("Imported from https://huggingface.co/mudler/LocalAI-functioncall-qwen2.5-7b-v0.5-Q4_K_M-GGUF"), fmt.Sprintf("Model config: %+v", modelConfig))
 			Expect(modelConfig.ConfigFile).To(ContainSubstring("backend: llama-cpp"), fmt.Sprintf("Model config: %+v", modelConfig))
 			Expect(len(modelConfig.Files)).To(Equal(1), fmt.Sprintf("Model config: %+v", modelConfig))
-			Expect(modelConfig.Files[0].Filename).To(Equal("llama-cpp/models/LocalAI-functioncall-qwen2.5-7b-v0.5-Q4_K_M-GGUF/localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
+			Expect(modelConfig.Files[0].Filename).To(Equal("llama-cpp/models/localai-functioncall-qwen2.5-7b-v0.5-q4_k_m/localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
 			Expect(modelConfig.Files[0].URI).To(Equal("https://huggingface.co/mudler/LocalAI-functioncall-qwen2.5-7b-v0.5-Q4_K_M-GGUF/resolve/main/localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
 			Expect(modelConfig.Files[0].SHA256).To(Equal("4e7b7fe1d54b881f1ef90799219dc6cc285d29db24f559c8998d1addb35713d4"), fmt.Sprintf("Model config: %+v", modelConfig))
 		})
@@ -38,16 +40,17 @@ var _ = Describe("DiscoverModelConfig", func() {
 			modelConfig, err := importers.DiscoverModelConfig(uri, preferences)

 			Expect(err).ToNot(HaveOccurred(), fmt.Sprintf("Error: %v", err))
-			Expect(modelConfig.Name).To(Equal("Qwen3-VL-2B-Instruct-GGUF"), fmt.Sprintf("Model config: %+v", modelConfig))
+			// No name preference: name follows the selected model GGUF (issue #10587).
+			Expect(modelConfig.Name).To(Equal("Qwen3VL-2B-Instruct-Q4_K_M"), fmt.Sprintf("Model config: %+v", modelConfig))
 			Expect(modelConfig.Description).To(Equal("Imported from https://huggingface.co/Qwen/Qwen3-VL-2B-Instruct-GGUF"), fmt.Sprintf("Model config: %+v", modelConfig))
 			Expect(modelConfig.ConfigFile).To(ContainSubstring("backend: llama-cpp"), fmt.Sprintf("Model config: %+v", modelConfig))
-			Expect(modelConfig.ConfigFile).To(ContainSubstring("mmproj: llama-cpp/mmproj/Qwen3-VL-2B-Instruct-GGUF/mmproj-Qwen3VL-2B-Instruct-Q8_0.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
-			Expect(modelConfig.ConfigFile).To(ContainSubstring("model: llama-cpp/models/Qwen3-VL-2B-Instruct-GGUF/Qwen3VL-2B-Instruct-Q4_K_M.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
+			Expect(modelConfig.ConfigFile).To(ContainSubstring("mmproj: llama-cpp/mmproj/Qwen3VL-2B-Instruct-Q4_K_M/mmproj-Qwen3VL-2B-Instruct-Q8_0.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
+			Expect(modelConfig.ConfigFile).To(ContainSubstring("model: llama-cpp/models/Qwen3VL-2B-Instruct-Q4_K_M/Qwen3VL-2B-Instruct-Q4_K_M.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
 			Expect(len(modelConfig.Files)).To(Equal(2), fmt.Sprintf("Model config: %+v", modelConfig))
-			Expect(modelConfig.Files[0].Filename).To(Equal("llama-cpp/models/Qwen3-VL-2B-Instruct-GGUF/Qwen3VL-2B-Instruct-Q4_K_M.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
+			Expect(modelConfig.Files[0].Filename).To(Equal("llama-cpp/models/Qwen3VL-2B-Instruct-Q4_K_M/Qwen3VL-2B-Instruct-Q4_K_M.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
 			Expect(modelConfig.Files[0].URI).To(Equal("https://huggingface.co/Qwen/Qwen3-VL-2B-Instruct-GGUF/resolve/main/Qwen3VL-2B-Instruct-Q4_K_M.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
 			Expect(modelConfig.Files[0].SHA256).ToNot(BeEmpty(), fmt.Sprintf("Model config: %+v", modelConfig))
-			Expect(modelConfig.Files[1].Filename).To(Equal("llama-cpp/mmproj/Qwen3-VL-2B-Instruct-GGUF/mmproj-Qwen3VL-2B-Instruct-Q8_0.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
+			Expect(modelConfig.Files[1].Filename).To(Equal("llama-cpp/mmproj/Qwen3VL-2B-Instruct-Q4_K_M/mmproj-Qwen3VL-2B-Instruct-Q8_0.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
 			Expect(modelConfig.Files[1].URI).To(Equal("https://huggingface.co/Qwen/Qwen3-VL-2B-Instruct-GGUF/resolve/main/mmproj-Qwen3VL-2B-Instruct-Q8_0.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
 			Expect(modelConfig.Files[1].SHA256).ToNot(BeEmpty(), fmt.Sprintf("Model config: %+v", modelConfig))
 		})
@@ -59,16 +62,17 @@ var _ = Describe("DiscoverModelConfig", func() {
 			modelConfig, err := importers.DiscoverModelConfig(uri, preferences)

 			Expect(err).ToNot(HaveOccurred(), fmt.Sprintf("Error: %v", err))
-			Expect(modelConfig.Name).To(Equal("Qwen3-VL-2B-Instruct-GGUF"), fmt.Sprintf("Model config: %+v", modelConfig))
+			// No name preference: name follows the selected Q8_0 model GGUF (issue #10587).
+			Expect(modelConfig.Name).To(Equal("Qwen3VL-2B-Instruct-Q8_0"), fmt.Sprintf("Model config: %+v", modelConfig))
 			Expect(modelConfig.Description).To(Equal("Imported from https://huggingface.co/Qwen/Qwen3-VL-2B-Instruct-GGUF"), fmt.Sprintf("Model config: %+v", modelConfig))
 			Expect(modelConfig.ConfigFile).To(ContainSubstring("backend: llama-cpp"), fmt.Sprintf("Model config: %+v", modelConfig))
-			Expect(modelConfig.ConfigFile).To(ContainSubstring("mmproj: llama-cpp/mmproj/Qwen3-VL-2B-Instruct-GGUF/mmproj-Qwen3VL-2B-Instruct-F16.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
-			Expect(modelConfig.ConfigFile).To(ContainSubstring("model: llama-cpp/models/Qwen3-VL-2B-Instruct-GGUF/Qwen3VL-2B-Instruct-Q8_0.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
+			Expect(modelConfig.ConfigFile).To(ContainSubstring("mmproj: llama-cpp/mmproj/Qwen3VL-2B-Instruct-Q8_0/mmproj-Qwen3VL-2B-Instruct-F16.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
+			Expect(modelConfig.ConfigFile).To(ContainSubstring("model: llama-cpp/models/Qwen3VL-2B-Instruct-Q8_0/Qwen3VL-2B-Instruct-Q8_0.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
 			Expect(len(modelConfig.Files)).To(Equal(2), fmt.Sprintf("Model config: %+v", modelConfig))
-			Expect(modelConfig.Files[0].Filename).To(Equal("llama-cpp/models/Qwen3-VL-2B-Instruct-GGUF/Qwen3VL-2B-Instruct-Q8_0.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
+			Expect(modelConfig.Files[0].Filename).To(Equal("llama-cpp/models/Qwen3VL-2B-Instruct-Q8_0/Qwen3VL-2B-Instruct-Q8_0.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
 			Expect(modelConfig.Files[0].URI).To(Equal("https://huggingface.co/Qwen/Qwen3-VL-2B-Instruct-GGUF/resolve/main/Qwen3VL-2B-Instruct-Q8_0.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
 			Expect(modelConfig.Files[0].SHA256).ToNot(BeEmpty(), fmt.Sprintf("Model config: %+v", modelConfig))
-			Expect(modelConfig.Files[1].Filename).To(Equal("llama-cpp/mmproj/Qwen3-VL-2B-Instruct-GGUF/mmproj-Qwen3VL-2B-Instruct-F16.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
+			Expect(modelConfig.Files[1].Filename).To(Equal("llama-cpp/mmproj/Qwen3VL-2B-Instruct-Q8_0/mmproj-Qwen3VL-2B-Instruct-F16.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
 			Expect(modelConfig.Files[1].URI).To(Equal("https://huggingface.co/Qwen/Qwen3-VL-2B-Instruct-GGUF/resolve/main/mmproj-Qwen3VL-2B-Instruct-F16.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
 			Expect(modelConfig.Files[1].SHA256).ToNot(BeEmpty(), fmt.Sprintf("Model config: %+v", modelConfig))
 		})
--- a/core/gallery/importers/llama-cpp.go
+++ b/core/gallery/importers/llama-cpp.go
@@ -98,8 +98,13 @@ func (i *LlamaCPPImporter) Import(details Details) (gallery.ModelConfig, error)
 		}
 	}

-	name, ok := preferencesMap["name"].(string)
-	if !ok {
+	// nameProvided tracks whether the user supplied an explicit model name.
+	// When they didn't, the URI base is only a fallback: for a HuggingFace
+	// repo-root URI (no file component) it would be the repo name, so the HF
+	// branch below re-derives the name from the selected GGUF file instead
+	// (issue #10587).
+	name, nameProvided := preferencesMap["name"].(string)
+	if !nameProvided {
 		name = filepath.Base(details.URI)
 	}

@@ -227,10 +232,23 @@ func (i *LlamaCPPImporter) Import(details Details) (gallery.ModelConfig, error)
 		mmprojGroups := hfapi.GroupShards(mmprojFiles)
 		ggufGroups := hfapi.GroupShards(ggufFiles)

+		modelGroup := pickPreferredGroup(ggufGroups, quants)
+
+		// A repo-root URI has no file component, so the URI-base fallback
+		// above produced the repo name. When the user left the name blank,
+		// derive it from the GGUF file actually selected from the listing so
+		// the gallery entry and `model:` directory reflect the model, not the
+		// repository (issue #10587). An explicit name preference always wins.
+		if !nameProvided && modelGroup != nil {
+			name = modelNameFromShardGroup(*modelGroup)
+			modelConfig.Name = name
+			cfg.Name = name
+		}
+
 		// Emit the model group first so cfg.Files[0] is the model — callers
 		// and tests rely on the model file preceding any mmproj companion.
-		if group := pickPreferredGroup(ggufGroups, quants); group != nil {
-			appendShardGroup(&cfg, *group, filepath.Join("llama-cpp", "models", name))
+		if modelGroup != nil {
+			appendShardGroup(&cfg, *modelGroup, filepath.Join("llama-cpp", "models", name))
 		}
 		if group := pickPreferredGroup(mmprojGroups, mmprojQuantsList); group != nil {
 			appendShardGroup(&cfg, *group, filepath.Join("llama-cpp", "mmproj", name))
@@ -281,6 +299,20 @@ func (i *LlamaCPPImporter) Import(details Details) (gallery.ModelConfig, error)
 	return cfg, nil
 }

+// modelNameFromShardGroup returns the display name for a selected GGUF
+// group: group.Base with a trailing ".gguf" extension (matched
+// case-insensitively) removed. Base is the shared prefix of a sharded set,
+// without the -NNNNN-of-MMMMM suffix, or the lone basename of a single-file
+// model, so the result reads like "model-Q4_K_M" instead of a shard
+// filename or the repo-root URI base. A Base with any other extension, or
+// none, is returned unchanged.
+func modelNameFromShardGroup(group hfapi.ShardGroup) string {
+	name := group.Base
+	suffix := filepath.Ext(name)
+	if !strings.EqualFold(suffix, ".gguf") {
+		return name
+	}
+	return strings.TrimSuffix(name, suffix)
+}
+
 // pickPreferredGroup walks the preference list in priority order and returns
 // the first group whose base filename contains any preference. When nothing
 // matches, the last group wins — this preserves the historical "if the user
--- a/core/gallery/importers/llama-cpp_test.go
+++ b/core/gallery/importers/llama-cpp_test.go
@@ -372,6 +372,62 @@ var _ = Describe("LlamaCPPImporter", func() {
 			Expect(err).ToNot(HaveOccurred())
 			Expect(modelConfig.Files).To(BeEmpty())
 		})
+
+		It("derives the model name from the selected GGUF when no name is given", func() {
+			// Regression for #10587: a repo-root URI has no file component, so
+			// the URI base ("example-GGUF") is just the repo name. With the
+			// name field left blank, the emitted name and model directory must
+			// follow the GGUF file actually selected, not the repository.
+			details := withHF(`{"quantizations":"Q4_K_M"}`,
+				hfFile("Meta-Llama-3-8B-Instruct.Q4_K_M.gguf", "aaa"),
+				hfFile("Meta-Llama-3-8B-Instruct.Q3_K_M.gguf", "bbb"),
+			)
+
+			modelConfig, err := importer.Import(details)
+
+			Expect(err).ToNot(HaveOccurred())
+			Expect(modelConfig.Name).To(Equal("Meta-Llama-3-8B-Instruct.Q4_K_M"))
+			Expect(modelConfig.Files).To(HaveLen(1), fmt.Sprintf("%+v", modelConfig))
+			Expect(modelConfig.Files[0].Filename).To(Equal(
+				"llama-cpp/models/Meta-Llama-3-8B-Instruct.Q4_K_M/Meta-Llama-3-8B-Instruct.Q4_K_M.gguf"))
+			Expect(modelConfig.ConfigFile).To(ContainSubstring("name: Meta-Llama-3-8B-Instruct.Q4_K_M"))
+			Expect(modelConfig.ConfigFile).To(ContainSubstring(
+				"model: llama-cpp/models/Meta-Llama-3-8B-Instruct.Q4_K_M/Meta-Llama-3-8B-Instruct.Q4_K_M.gguf"))
+		})
+
+		It("derives a clean name from the shard base for split GGUFs when no name is given", func() {
+			// The selected primary file is shard 1; using its raw basename
+			// would leak the -00001-of-00002 suffix into the name. The shard
+			// base must be used so the name is the logical model.
+			details := withHF(``,
+				hfFile("Qwen3-30B-A3B-Q4_K_M-00001-of-00002.gguf", "p1"),
+				hfFile("Qwen3-30B-A3B-Q4_K_M-00002-of-00002.gguf", "p2"),
+			)
+
+			modelConfig, err := importer.Import(details)
+
+			Expect(err).ToNot(HaveOccurred())
+			Expect(modelConfig.Name).To(Equal("Qwen3-30B-A3B-Q4_K_M"))
+			Expect(modelConfig.Files).To(HaveLen(2), fmt.Sprintf("%+v", modelConfig))
+			Expect(modelConfig.Files[0].Filename).To(Equal(
+				"llama-cpp/models/Qwen3-30B-A3B-Q4_K_M/Qwen3-30B-A3B-Q4_K_M-00001-of-00002.gguf"))
+			Expect(modelConfig.ConfigFile).To(ContainSubstring(
+				"model: llama-cpp/models/Qwen3-30B-A3B-Q4_K_M/Qwen3-30B-A3B-Q4_K_M-00001-of-00002.gguf"))
+		})
+
+		It("keeps an explicit name over the selected GGUF filename", func() {
+			// Precedence guard: when the user supplies a name it always wins,
+			// even though a GGUF file was selected from the listing.
+			details := withHF(`{"name":"my-custom-name","quantizations":"Q4_K_M"}`,
+				hfFile("model-Q4_K_M.gguf", "aaa"),
+			)
+
+			modelConfig, err := importer.Import(details)
+
+			Expect(err).ToNot(HaveOccurred())
+			Expect(modelConfig.Name).To(Equal("my-custom-name"))
+			Expect(modelConfig.Files[0].Filename).To(Equal("llama-cpp/models/my-custom-name/model-Q4_K_M.gguf"))
+		})
 	})

 	Context("quant token boundary matching", func() {
--- a/core/http/endpoints/openai/chat.go
+++ b/core/http/endpoints/openai/chat.go
@@ -618,6 +618,10 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 					finishReason = FinishReasonToolCalls
 				} else if toolsCalled {
 					finishReason = FinishReasonFunctionCall
+				} else if reachedTokenBudget(finalUsage.Completion, config.Maxtokens) {
+					// Generation stopped because it hit the max_tokens ceiling
+					// rather than a natural stop — report "length" (issue #9716).
+					finishReason = FinishReasonLength
 				}

 				// Final delta chunk: empty delta with finish_reason set. Per
@@ -984,6 +988,18 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 					}
 				}

+				// If generation hit the max_tokens ceiling, report "length"
+				// instead of a natural "stop" (issue #9716). Mirrors the
+				// streaming path; tool/function finish reasons are untouched.
+				if reachedTokenBudget(tokenUsage.Completion, config.Maxtokens) {
+					for i := range result {
+						if result[i].FinishReason != nil && *result[i].FinishReason == FinishReasonStop {
+							lengthReason := FinishReasonLength
+							result[i].FinishReason = &lengthReason
+						}
+					}
+				}
+
 				// No MCP tools to execute (or no MCP tools configured), return response
 				usage := schema.OpenAIUsage{
 					PromptTokens:     tokenUsage.Prompt,
--- a/core/http/endpoints/openai/compactcoord/compactcoord.go
+++ b/core/http/endpoints/openai/compactcoord/compactcoord.go
@@ -0,0 +1,149 @@
+// Package compactcoord is the explicit state machine for the realtime API's
+// conversation-compaction concern (machine "M4" in
+// docs/design/realtime-state-machines.md).
+//
+// In the legacy code this machine is an implicit single-flight guard: a
+// per-conversation `compacting atomic.Bool` that maybeCompact CAS-flips to start
+// a background summarize+evict and a deferred Store(false) clears. The intent —
+// at most one compaction running per conversation at a time, so two goroutines
+// never summarize and evict the same overflow concurrently (Part 4, invariant
+// #9) — is correct but implicit in a bare atomic.
+//
+// This package makes it explicit:
+//   - a sealed sum type for State (Idle | Running | Terminated) — "two compactions
+//     running" is unrepresentable,
+//   - a total, pure transition function Next(state, event) -> (state, effects),
+//   - a single-writer Coordinator that serializes every transition.
+//
+// Unlike respcoord (M3), a Trigger while Running does NOT supersede the run in
+// flight: compaction is idempotent work on the same overflow, so a concurrent
+// trigger is simply dropped (matching the legacy CAS-fails-so-skip), not queued or restarted.
+package compactcoord
+
+import (
+	"fmt"
+
+	"github.com/mudler/LocalAI/core/http/endpoints/openai/coordinator"
+)
+
+// State is the sealed sum type of compaction states. Exhaustively:
+// Idle | Running | Terminated.
+type State interface {
+	isState()
+	String() string
+}
+
+// Idle: no compaction is running.
+type Idle struct{}
+
+// Running: exactly one compaction is in flight.
+type Running struct{}
+
+// Terminated: the conversation/session is torn down. Absorbing — no compaction
+// can start from here, so the M1 (connection) parent's teardown can cancel +
+// join the in-flight compaction and guarantee none outlives the session (see
+// formal-verification/session_lifecycle.fizz). This closes the legacy gap where
+// the fire-and-forget compaction goroutine could outlive the session.
+type Terminated struct{}
+
+func (Idle) isState()       {}
+func (Running) isState()    {}
+func (Terminated) isState() {}
+
+func (Idle) String() string       { return "Idle" }
+func (Running) String() string    { return "Running" }
+func (Terminated) String() string { return "Terminated" }
+
+// Event is the sealed sum type of inputs. Exhaustively:
+// Trigger | Finished | Shutdown.
+type Event interface {
+	isEvent()
+	String() string
+}
+
+// Trigger requests a compaction (the live buffer grew past the trigger). It
+// starts one only when Idle; while Running it is a no-op (single-flight).
+type Trigger struct{}
+
+// Finished reports that the running compaction goroutine ended (success, error, or
+// timeout — it always reports Finished, so the machine can never stick in Running).
+type Finished struct{}
+
+// Shutdown terminates the coordinator at teardown: the in-flight compaction is
+// cancelled + joined by the sink, and no compaction can start afterwards.
+type Shutdown struct{}
+
+func (Trigger) isEvent()  {}
+func (Finished) isEvent() {}
+func (Shutdown) isEvent() {}
+
+func (Trigger) String() string  { return "Trigger" }
+func (Finished) String() string { return "Finished" }
+func (Shutdown) String() string { return "Shutdown" }
+
+// Effect is a side effect returned by Next as data. Exhaustively: StartCompaction.
+type Effect interface {
+	isEffect()
+	String() string
+}
+
+// StartCompaction: spawn the background summarize+evict goroutine.
+type StartCompaction struct{}
+
+func (StartCompaction) isEffect() {}
+
+func (StartCompaction) String() string { return "StartCompaction" }
+
+// Next is the total, pure transition function. For every (state, event) it
+// returns the next state and the ordered effects. It returns a non-nil error
+// only for an unknown State/Event implementation. Every in-domain pair is
+// defined; there are no forbidden transitions, only no-ops.
+//
+// Single-flight crux: StartCompaction is emitted only on Idle+Trigger, and a
+// Trigger while Running is a no-op — so at most one compaction ever runs.
+func Next(s State, e Event) (State, []Effect, error) {
+	switch s.(type) {
+	case Idle:
+		switch e.(type) {
+		case Trigger:
+			return Running{}, []Effect{StartCompaction{}}, nil
+		case Finished:
+			// No compaction to finish: stale/idempotent no-op.
+			return Idle{}, nil, nil
+		case Shutdown:
+			return Terminated{}, nil, nil
+		}
+	case Running:
+		switch e.(type) {
+		case Trigger:
+			// Already compacting: drop (single-flight).
+			return Running{}, nil, nil
+		case Finished:
+			return Idle{}, nil, nil
+		case Shutdown:
+			// Teardown while compacting: no effect is emitted here. The sink cancels +
+			// joins the goroutine; its later Finished is absorbed by Terminated.
+			return Terminated{}, nil, nil
+		}
+	case Terminated:
+		// Absorbing: a Trigger after teardown is rejected (no StartCompaction), so
+		// no compaction outlives the session.
+		switch e.(type) {
+		case Trigger, Finished, Shutdown:
+			return Terminated{}, nil, nil
+		}
+	}
+	return s, nil, fmt.Errorf("compactcoord: unhandled transition %s <- %s", s, e)
+}
+
+// EffectSink performs the effects produced by a transition. See coordinator.Sink:
+// StartCompaction spawns a goroutine, so Perform does not block under the lock.
+type EffectSink = coordinator.Sink[Effect]
+
+// Coordinator serializes the compaction transitions. See coordinator.Coordinator.
+type Coordinator = coordinator.Coordinator[State, Event, Effect]
+
+// New returns an idle Coordinator that performs effects via sink.
+func New(sink EffectSink) *Coordinator {
+	return coordinator.New[State, Event, Effect](Idle{}, Next, sink)
+}
--- a/core/http/endpoints/openai/compactcoord/compactcoord_suite_test.go
+++ b/core/http/endpoints/openai/compactcoord/compactcoord_suite_test.go
@@ -0,0 +1,13 @@
+package compactcoord
+
+import (
+	"testing"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+func TestCompactcoord(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "compactcoord (realtime M4) Suite")
+}
--- a/core/http/endpoints/openai/compactcoord/compactcoord_test.go
+++ b/core/http/endpoints/openai/compactcoord/compactcoord_test.go
@@ -0,0 +1,202 @@
+package compactcoord
+
+import (
+	"math/rand/v2"
+	"sync"
+	"sync/atomic"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// recordingSink captures the ordered stream of effects. Perform is called under
+// the coordinator lock; the mutex here guards reads from the spec goroutine.
+type recordingSink struct {
+	mu  sync.Mutex
+	log []Effect
+}
+
+func (s *recordingSink) Perform(e Effect) {
+	s.mu.Lock()
+	s.log = append(s.log, e)
+	s.mu.Unlock()
+}
+
+func (s *recordingSink) count() int {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	return len(s.log)
+}
+
+type unknownEvent struct{}
+
+func (unknownEvent) isEvent()       {}
+func (unknownEvent) String() string { return "unknownEvent" }
+
+type unknownState struct{}
+
+func (unknownState) isState()       {}
+func (unknownState) String() string { return "unknownState" }
+
+var _ = Describe("compactcoord.Next", func() {
+	DescribeTable("transitions",
+		func(state State, event Event, wantState State, wantEff []Effect) {
+			gotState, gotEff, err := Next(state, event)
+			Expect(err).NotTo(HaveOccurred())
+			Expect(gotState).To(Equal(wantState))
+			Expect(gotEff).To(Equal(wantEff))
+		},
+		Entry("idle+trigger -> running: start",
+			Idle{}, Trigger{}, Running{}, []Effect{StartCompaction{}}),
+		Entry("idle+finished -> idle, no-op (stale)",
+			Idle{}, Finished{}, Idle{}, []Effect(nil)),
+		Entry("running+trigger -> running, no-op (single-flight)",
+			Running{}, Trigger{}, Running{}, []Effect(nil)),
+		Entry("running+finished -> idle",
+			Running{}, Finished{}, Idle{}, []Effect(nil)),
+		Entry("idle+shutdown -> terminated",
+			Idle{}, Shutdown{}, Terminated{}, []Effect(nil)),
+		Entry("running+shutdown -> terminated",
+			Running{}, Shutdown{}, Terminated{}, []Effect(nil)),
+		Entry("terminated+trigger -> terminated, REJECTED",
+			Terminated{}, Trigger{}, Terminated{}, []Effect(nil)),
+		Entry("terminated+finished -> terminated, no-op (stale)",
+			Terminated{}, Finished{}, Terminated{}, []Effect(nil)),
+		Entry("terminated+shutdown -> terminated, idempotent",
+			Terminated{}, Shutdown{}, Terminated{}, []Effect(nil)),
+	)
+
+	It("is total over the defined (state, event) pairs", func() {
+		for _, s := range []State{Idle{}, Running{}, Terminated{}} {
+			for _, e := range []Event{Trigger{}, Finished{}, Shutdown{}} {
+				_, _, err := Next(s, e)
+				Expect(err).NotTo(HaveOccurred(), "Next(%s, %s)", s, e)
+			}
+		}
+	})
+
+	It("errors on an unknown event type", func() {
+		_, _, err := Next(Idle{}, unknownEvent{})
+		Expect(err).To(HaveOccurred())
+	})
+
+	It("errors on an unknown state type", func() {
+		_, _, err := Next(unknownState{}, Trigger{})
+		Expect(err).To(HaveOccurred())
+	})
+})
+
+var _ = Describe("compactcoord.Coordinator", func() {
+	// A StartCompaction is only ever produced while Idle (verified by checking the
+	// effect count grows exactly when the model transitions Idle->Running), so at
+	// most one compaction is ever in flight.
+	It("starts at most one compaction at a time over random sequences", func() {
+		seeds := []uint64{1, 2, 3, 42, 1337, 0xC0FFEE}
+		for _, seed := range seeds {
+			r := rand.New(rand.NewPCG(seed, 0xA5A5A5A5))
+			sink := &recordingSink{}
+			c := New(sink)
+			running := false
+			starts := 0
+
+			for range 5000 {
+				if r.IntN(2) == 0 {
+					before := sink.count()
+					Expect(c.Apply(Trigger{})).To(Succeed())
+					if sink.count() > before {
+						// A StartCompaction was produced: must have been Idle.
+						Expect(running).To(BeFalse(), "seed=%d: started while already running", seed)
+						running = true
+						starts++
+					}
+				} else {
+					Expect(c.Apply(Finished{})).To(Succeed())
+					running = false
+				}
+				if running {
+					Expect(c.State()).To(Equal(State(Running{})), "seed=%d", seed)
+				} else {
+					Expect(c.State()).To(Equal(State(Idle{})), "seed=%d", seed)
+				}
+			}
+			Expect(starts).To(BeNumerically(">", 0), "seed=%d: walk should have started at least one", seed)
+		}
+	})
+
+	// Faithful concurrent test: StartCompaction spawns "work" that bumps an active
+	// counter, runs, and reports Finished back to the coordinator (exactly how the
+	// real sink behaves). Single-flight must hold even under many concurrent
+	// Triggers: the active counter never exceeds 1. Run under -race.
+	It("never runs two compactions concurrently", func() {
+		var active, maxActive int32
+		var c *Coordinator
+		var work sync.WaitGroup
+		sink := &spawnSink{onStart: func() {
+			work.Add(1)
+			go func() {
+				defer work.Done()
+				n := atomic.AddInt32(&active, 1)
+				for {
+					m := atomic.LoadInt32(&maxActive)
+					if n <= m || atomic.CompareAndSwapInt32(&maxActive, m, n) {
+						break
+					}
+				}
+				atomic.AddInt32(&active, -1)
+				_ = c.Apply(Finished{})
+			}()
+		}}
+		c = New(sink)
+
+		var wg sync.WaitGroup
+		for g := 0; g < 8; g++ {
+			wg.Add(1)
+			go func() {
+				defer wg.Done()
+				for range 1000 {
+					_ = c.Apply(Trigger{})
+				}
+			}()
+		}
+		wg.Wait()
+		work.Wait() // let any in-flight compaction report Finished
+
+		Expect(atomic.LoadInt32(&maxActive)).To(BeNumerically("<=", 1))
+		Expect(c.State()).To(Equal(State(Idle{})))
+	})
+
+	It("terminates on shutdown and rejects later triggers", func() {
+		sink := &recordingSink{}
+		c := New(sink)
+		Expect(c.Apply(Trigger{})).To(Succeed()) // Idle -> Running (StartCompaction)
+		Expect(c.Apply(Shutdown{})).To(Succeed())
+		Expect(c.State()).To(Equal(State(Terminated{})))
+
+		before := sink.count()
+		Expect(c.Apply(Trigger{})).To(Succeed()) // rejected
+		Expect(sink.count()).To(Equal(before), "no StartCompaction after shutdown")
+		Expect(c.Apply(Finished{})).To(Succeed()) // stale, absorbed
+		Expect(c.State()).To(Equal(State(Terminated{})))
+	})
+})
+
+// spawnSink invokes onStart for each StartCompaction (called under the coord lock;
+// onStart must be non-blocking — it spawns the work goroutine).
+type spawnSink struct{ onStart func() }
+
+func (s *spawnSink) Perform(e Effect) {
+	if _, ok := e.(StartCompaction); ok {
+		s.onStart()
+	}
+}
+
+var _ = DescribeTable("compactcoord stringers",
+	func(got, want string) { Expect(got).To(Equal(want)) },
+	Entry(nil, Idle{}.String(), "Idle"),
+	Entry(nil, Running{}.String(), "Running"),
+	Entry(nil, Terminated{}.String(), "Terminated"),
+	Entry(nil, Trigger{}.String(), "Trigger"),
+	Entry(nil, Finished{}.String(), "Finished"),
+	Entry(nil, Shutdown{}.String(), "Shutdown"),
+	Entry(nil, StartCompaction{}.String(), "StartCompaction"),
+)
--- a/core/http/endpoints/openai/conncoord/conncoord.go
+++ b/core/http/endpoints/openai/conncoord/conncoord.go
@@ -0,0 +1,164 @@
+// Package conncoord is the explicit state machine for the realtime API's
+// connection lifecycle (machine "M1" in docs/design/realtime-state-machines.md).
+//
+// In the legacy code this machine is implicit and fragile. The session handler
+// keeps a `vadServerStarted` bool plus a `done` channel that is REASSIGNED to a
+// fresh channel every time turn detection is toggled on (session.update) and
+// closed both at toggle-off and at teardown (Part 2, failure mode 6). It is
+// correct today only because one goroutine owns it; "one variable name meaning
+// different channels over time, closed from two sites guarded by a bool" is a
+// structural hazard, not an explicit lifecycle. Teardown likewise depends on the
+// bool to avoid closing an already-closed channel.
+//
+// This package makes the lifecycle explicit:
+//   - a sealed sum type for State (Live{VADRunning} | Torn) — illegal states
+//     such as "running after teardown" are unrepresentable,
+//   - a total, pure transition function Next(state, event) -> (state, effects),
+//   - a single-writer Coordinator that serializes every transition.
+//
+// The guarantees the spec checks:
+//   - the VAD goroutine's done channel is closed exactly once per start (StopVAD
+//     is emitted only while running, so never a double close / close of nil),
+//   - teardown runs exactly once (Close from Live; any later Close is a no-op),
+//   - nothing is started after teardown (no resurrection / no send-after-close).
+//
+// Like turncoord (M2), the connection machine is driven by the single session
+// goroutine; the Coordinator's lock keeps State() race-free and guards against a
+// future second writer. The effects are performed by a sink that owns the actual
+// channels/goroutines (see realtime_conncoord.go).
+package conncoord
+
+import (
+	"fmt"
+
+	"github.com/mudler/LocalAI/core/http/endpoints/openai/coordinator"
+)
+
+// State is the sealed sum type of connection states. The only implementations
+// are the marker-method structs in this file. Exhaustively: Live | Torn.
+type State interface {
+	isState()
+	String() string
+}
+
+// Live: the session is active. VADRunning records whether the turn-detection
+// (handleVAD) goroutine is currently running — the single source of truth that
+// replaces the legacy vadServerStarted bool, so the per-run done channel is
+// closed exactly once.
+type Live struct{ VADRunning bool }
+
+// Torn: the session has been torn down. Terminal — no effect is ever produced
+// from here again.
+type Torn struct{}
+
+func (Live) isState() {}
+func (Torn) isState() {}
+
+func (s Live) String() string { return fmt.Sprintf("Live(vad=%t)", s.VADRunning) }
+func (Torn) String() string   { return "Torn" }
+
+// Event is the sealed sum type of inputs. Exhaustively: SetVAD | Close.
+type Event interface {
+	isEvent()
+	String() string
+}
+
+// SetVAD requests the turn-detection goroutine be running (Active) or not. It is
+// raised whenever session.update changes whether turn detection is active. It is
+// idempotent: setting the state it is already in is a no-op.
+type SetVAD struct{ Active bool }
+
+// Close requests teardown (the transport read loop ended, or the session is
+// closing). It is idempotent — only the first Close from Live tears down.
+type Close struct{}
+
+func (SetVAD) isEvent() {}
+func (Close) isEvent()  {}
+
+func (e SetVAD) String() string { return fmt.Sprintf("SetVAD(%t)", e.Active) }
+func (Close) String() string    { return "Close" }
+
+// Effect is a side effect returned by Next as data for the caller to perform.
+// Exhaustively: StartVAD | StopVAD | Teardown.
+type Effect interface {
+	isEffect()
+	String() string
+}
+
+// StartVAD: create a fresh done channel and spawn the handleVAD goroutine on it.
+type StartVAD struct{}
+
+// StopVAD: close the running VAD goroutine's done channel (signal it to exit).
+type StopVAD struct{}
+
+// Teardown: the once-only teardown — stop the remaining input goroutines (opus
+// decode, sound window), join them, cancel in-flight responses, and remove the
+// session from the registry. Emitted exactly once.
+type Teardown struct{}
+
+func (StartVAD) isEffect() {}
+func (StopVAD) isEffect()  {}
+func (Teardown) isEffect() {}
+
+func (StartVAD) String() string { return "StartVAD" }
+func (StopVAD) String() string  { return "StopVAD" }
+func (Teardown) String() string { return "Teardown" }
+
+// Next is the total, pure transition function. For every (state, event) it
+// returns the next state and the ordered effects to perform. It returns a
+// non-nil error only for an unknown State/Event implementation. Every in-domain
+// pair is defined; there are no forbidden transitions, only no-ops.
+//
+// The crux: Close moves to Torn, which absorbs every later event with no
+// effects. So teardown's channel closes happen exactly once even if Close is
+// raised again (e.g. an error path and the normal return both reaching it), and
+// no StartVAD can resurrect a torn session.
+func Next(s State, e Event) (State, []Effect, error) {
+	switch st := s.(type) {
+	case Live:
+		switch ev := e.(type) {
+		case SetVAD:
+			switch {
+			case ev.Active && !st.VADRunning:
+				return Live{VADRunning: true}, []Effect{StartVAD{}}, nil
+			case !ev.Active && st.VADRunning:
+				return Live{VADRunning: false}, []Effect{StopVAD{}}, nil
+			default:
+				// Already in the requested state: idempotent no-op.
+				return Live{VADRunning: st.VADRunning}, nil, nil
+			}
+		case Close:
+			if st.VADRunning {
+				return Torn{}, []Effect{StopVAD{}, Teardown{}}, nil
+			}
+			return Torn{}, []Effect{Teardown{}}, nil
+		}
+	case Torn:
+		switch e.(type) {
+		case SetVAD:
+			// No resurrection: a toggle after teardown is ignored.
+			return Torn{}, nil, nil
+		case Close:
+			// Idempotent: teardown already ran.
+			return Torn{}, nil, nil
+		}
+	}
+	return s, nil, fmt.Errorf("conncoord: unhandled transition %s <- %s", s, e)
+}
+
+// EffectSink performs the effects produced by a transition. See coordinator.Sink:
+// Perform runs under the coordinator lock. The Teardown effect does join
+// goroutines (which can block) — acceptable here because the connection
+// coordinator is single-writer and torn down exactly once at the end of the
+// session goroutine, so no other Apply is contending the lock.
+type EffectSink = coordinator.Sink[Effect]
+
+// Coordinator serializes the connection-lifecycle transitions.
+// See coordinator.Coordinator.
+type Coordinator = coordinator.Coordinator[State, Event, Effect]
+
+// New returns a Coordinator in Live{VADRunning:false} that performs effects via
+// sink.
+func New(sink EffectSink) *Coordinator {
+	return coordinator.New[State, Event, Effect](Live{VADRunning: false}, Next, sink)
+}
--- a/core/http/endpoints/openai/conncoord/conncoord_suite_test.go
+++ b/core/http/endpoints/openai/conncoord/conncoord_suite_test.go
@@ -0,0 +1,13 @@
+package conncoord
+
+import (
+	"testing"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+func TestConncoord(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "conncoord (realtime M1) Suite")
+}
--- a/core/http/endpoints/openai/conncoord/conncoord_test.go
+++ b/core/http/endpoints/openai/conncoord/conncoord_test.go
@@ -0,0 +1,212 @@
+package conncoord
+
+import (
+	"math/rand/v2"
+	"sync"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// recordingSink captures the ordered stream of effects so the invariants can be
+// checked independently of the transition function. Perform is called by
+// Coordinator.Apply under the coordinator lock; the mutex here only guards reads
+// from the spec goroutine.
+type recordingSink struct {
+	mu  sync.Mutex
+	log []Effect
+}
+
+func (s *recordingSink) Perform(e Effect) {
+	s.mu.Lock()
+	s.log = append(s.log, e)
+	s.mu.Unlock()
+}
+
+func (s *recordingSink) snapshot() []Effect {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	out := make([]Effect, len(s.log))
+	copy(out, s.log)
+	return out
+}
+
+// checkLog replays the effect log and asserts the lifecycle safety properties
+// from docs/design/realtime-state-machines.md, Part 4 (invariants #8, #10 and
+// failure mode 6):
+//
+//	(1) the VAD done channel is closed exactly once per start -- StartVAD only
+//	    while stopped, StopVAD only while running (no double close / close-of-nil);
+//	(2) teardown runs at most once;
+//	(3) no resurrection -- no StartVAD after Teardown.
+func checkLog(log []Effect) {
+	running := false
+	torn := false
+	teardowns := 0
+	for i, eff := range log {
+		switch eff.(type) {
+		case StartVAD:
+			Expect(torn).To(BeFalse(), "invariant (3): StartVAD after teardown (effect #%d)\nlog=%v", i, log)
+			Expect(running).To(BeFalse(), "invariant (1): StartVAD while already running (effect #%d)\nlog=%v", i, log)
+			running = true
+		case StopVAD:
+			Expect(running).To(BeTrue(), "invariant (1): StopVAD while not running (effect #%d)\nlog=%v", i, log)
+			running = false
+		case Teardown:
+			Expect(torn).To(BeFalse(), "invariant (2): Teardown twice (effect #%d)\nlog=%v", i, log)
+			torn = true
+			teardowns++
+		}
+	}
+	Expect(teardowns).To(BeNumerically("<=", 1), "invariant (2): teardown ran %d times\nlog=%v", teardowns, log)
+}
+
+type unknownEvent struct{}
+
+func (unknownEvent) isEvent()       {}
+func (unknownEvent) String() string { return "unknownEvent" }
+
+type unknownState struct{}
+
+func (unknownState) isState()       {}
+func (unknownState) String() string { return "unknownState" }
+
+var _ = Describe("conncoord.Next", func() {
+	DescribeTable("transitions",
+		func(state State, event Event, wantState State, wantEff []Effect) {
+			gotState, gotEff, err := Next(state, event)
+			Expect(err).NotTo(HaveOccurred())
+			Expect(gotState).To(Equal(wantState))
+			Expect(gotEff).To(Equal(wantEff))
+		},
+		Entry("stopped+setvad(on) -> running: start",
+			Live{VADRunning: false}, SetVAD{Active: true},
+			Live{VADRunning: true}, []Effect{StartVAD{}}),
+		Entry("running+setvad(on) -> running, no-op",
+			Live{VADRunning: true}, SetVAD{Active: true},
+			Live{VADRunning: true}, []Effect(nil)),
+		Entry("stopped+setvad(off) -> stopped, no-op",
+			Live{VADRunning: false}, SetVAD{Active: false},
+			Live{VADRunning: false}, []Effect(nil)),
+		Entry("running+setvad(off) -> stopped: stop",
+			Live{VADRunning: true}, SetVAD{Active: false},
+			Live{VADRunning: false}, []Effect{StopVAD{}}),
+		Entry("stopped+close -> torn: teardown",
+			Live{VADRunning: false}, Close{},
+			Torn{}, []Effect{Teardown{}}),
+		Entry("running+close -> torn: stop + teardown",
+			Live{VADRunning: true}, Close{},
+			Torn{}, []Effect{StopVAD{}, Teardown{}}),
+		Entry("torn+setvad(on) -> torn, no-op (no resurrection)",
+			Torn{}, SetVAD{Active: true},
+			Torn{}, []Effect(nil)),
+		Entry("torn+close -> torn, no-op (idempotent)",
+			Torn{}, Close{},
+			Torn{}, []Effect(nil)),
+	)
+
+	It("is total over the defined (state, event) pairs", func() {
+		states := []State{Live{VADRunning: false}, Live{VADRunning: true}, Torn{}}
+		events := []Event{SetVAD{Active: true}, SetVAD{Active: false}, Close{}}
+		for _, s := range states {
+			for _, e := range events {
+				_, _, err := Next(s, e)
+				Expect(err).NotTo(HaveOccurred(), "Next(%s, %s)", s, e)
+			}
+		}
+	})
+
+	It("errors on an unknown event type", func() {
+		_, _, err := Next(Live{}, unknownEvent{})
+		Expect(err).To(HaveOccurred())
+	})
+
+	It("errors on an unknown state type", func() {
+		_, _, err := Next(unknownState{}, Close{})
+		Expect(err).To(HaveOccurred())
+	})
+})
+
+var _ = Describe("conncoord.Coordinator", func() {
+	It("upholds the lifecycle invariants over random event sequences", func() {
+		seeds := []uint64{1, 2, 3, 42, 1337, 0xC0FFEE}
+		for _, seed := range seeds {
+			r := rand.New(rand.NewPCG(seed, 0xA5A5A5A5))
+			sink := &recordingSink{}
+			c := New(sink)
+			running := false
+			torn := false
+
+			for range 5000 {
+				switch r.IntN(3) {
+				case 0:
+					Expect(c.Apply(SetVAD{Active: true})).To(Succeed())
+					if !torn {
+						running = true
+					}
+				case 1:
+					Expect(c.Apply(SetVAD{Active: false})).To(Succeed())
+					if !torn {
+						running = false
+					}
+				case 2:
+					Expect(c.Apply(Close{})).To(Succeed())
+					torn = true
+					running = false
+				}
+				if torn {
+					Expect(c.State()).To(Equal(State(Torn{})), "seed=%d", seed)
+				} else {
+					Expect(c.State()).To(Equal(State(Live{VADRunning: running})), "seed=%d", seed)
+				}
+			}
+			checkLog(sink.snapshot())
+		}
+	})
+
+	It("tears down at most once under concurrent SetVAD/Close from two goroutines", func() {
+		const perGoroutine = 2000
+		sink := &recordingSink{}
+		c := New(sink)
+
+		var wg sync.WaitGroup
+		drive := func(active bool) {
+			defer wg.Done()
+			for i := range perGoroutine {
+				switch i % 3 {
+				case 0:
+					_ = c.Apply(SetVAD{Active: active})
+				case 1:
+					_ = c.Apply(SetVAD{Active: !active})
+				case 2:
+					if i > perGoroutine/2 {
+						_ = c.Apply(Close{})
+					}
+				}
+			}
+		}
+
+		wg.Add(2)
+		go drive(true)
+		go drive(false)
+		wg.Wait()
+		_ = c.Apply(Close{})
+
+		checkLog(sink.snapshot())
+		Expect(c.State()).To(Equal(State(Torn{})))
+	})
+})
+
+var _ = DescribeTable("conncoord stringers",
+	func(got, want string) { Expect(got).To(Equal(want)) },
+	Entry(nil, Live{VADRunning: true}.String(), "Live(vad=true)"),
+	Entry(nil, Live{VADRunning: false}.String(), "Live(vad=false)"),
+	Entry(nil, Torn{}.String(), "Torn"),
+
+	Entry(nil, SetVAD{Active: true}.String(), "SetVAD(true)"),
+	Entry(nil, Close{}.String(), "Close"),
+
+	Entry(nil, StartVAD{}.String(), "StartVAD"),
+	Entry(nil, StopVAD{}.String(), "StopVAD"),
+	Entry(nil, Teardown{}.String(), "Teardown"),
+)
--- a/core/http/endpoints/openai/constants.go
+++ b/core/http/endpoints/openai/constants.go
@@ -5,4 +5,7 @@ const (
 	FinishReasonStop         = "stop"
 	FinishReasonToolCalls    = "tool_calls"
 	FinishReasonFunctionCall = "function_call"
+	// FinishReasonLength is reported when generation stopped because it
+	// reached the max_tokens budget rather than a natural stop (issue #9716).
+	FinishReasonLength = "length"
 )
--- a/core/http/endpoints/openai/coordinator/coordinator.go
+++ b/core/http/endpoints/openai/coordinator/coordinator.go
@@ -0,0 +1,82 @@
+// Package coordinator is the shared single-writer state-machine runtime for the
+// realtime API's explicit coordinators (machines M1–M5 in
+// docs/design/realtime-state-machines.md).
+//
+// Each machine package (respcoord, turncoord, conncoord, compactcoord, ttscoord)
+// defines its OWN sealed sum types for State/Event/Effect and a total, pure
+// transition function Next(state, event) -> (state, []effect, error). The
+// plumbing around that — a single-writer Coordinator that serializes every
+// transition behind one lock and performs the returned effects in order — is
+// identical across all five, so it lives here once instead of being copied.
+//
+// A machine package wires itself up with three lines:
+//
+//	type EffectSink = coordinator.Sink[Effect]
+//	type Coordinator = coordinator.Coordinator[State, Event, Effect]
+//	func New(sink EffectSink) *Coordinator { return coordinator.New[State, Event, Effect](Idle{}, Next, sink) }
+//
+// The aliases keep each package's public API (Coordinator, New, EffectSink,
+// Apply, State) unchanged. The single-writer serialization — the load-bearing
+// concurrency guarantee the FizzBee specs check — is therefore implemented and
+// reasoned about in exactly one place.
+package coordinator
+
+import "sync"
+
+// TransitionFunc is a machine's total, pure transition: given the current state
+// and an event it returns the next state, the ordered effects to perform, and a
+// non-nil error ONLY for an unhandled (programmer-error) state/event pair. It
+// must not perform I/O or block; side effects are returned as data (F) for the
+// Coordinator to hand to the Sink.
+type TransitionFunc[S, E, F any] func(state S, event E) (S, []F, error)
+
+// Sink performs the effects a transition produces. Perform is called while the
+// Coordinator holds its lock, so implementations MUST NOT block (spawn a
+// goroutine, call a cancel func, or do a non-blocking channel send instead) and
+// MUST NOT call back into the same Coordinator's Apply or State: the mutex is not
+// reentrant, so a synchronous re-entry deadlocks.
+type Sink[F any] interface {
+	Perform(F)
+}
+
+// Coordinator is the single-writer wrapper around a pure transition function.
+// Every Apply is serialized by mu, so multiple goroutines can drive the machine
+// without racing, and a transition's effects are performed in order under the
+// lock (before any subsequent Apply can observe the new state).
+type Coordinator[S, E, F any] struct {
+	mu    sync.Mutex
+	state S
+	next  TransitionFunc[S, E, F]
+	sink  Sink[F]
+}
+
+// New returns a Coordinator in the given initial state that transitions via next
+// and performs effects via sink.
+func New[S, E, F any](initial S, next TransitionFunc[S, E, F], sink Sink[F]) *Coordinator[S, E, F] {
+	return &Coordinator[S, E, F]{state: initial, next: next, sink: sink}
+}
+
+// Apply runs one transition under the lock and performs its effects in order. If
+// the transition function returns an error (an unhandled state/event), the state
+// is left unchanged and the error is returned to the caller — never silently
+// swallowed.
+func (c *Coordinator[S, E, F]) Apply(e E) error {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	ns, effects, err := c.next(c.state, e)
+	if err != nil {
+		return err
+	}
+	c.state = ns
+	for _, eff := range effects {
+		c.sink.Perform(eff)
+	}
+	return nil
+}
+
+// State returns the current state (a value; safe to call concurrently).
+func (c *Coordinator[S, E, F]) State() S {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	return c.state
+}
--- a/core/http/endpoints/openai/coordinator/coordinator_suite_test.go
+++ b/core/http/endpoints/openai/coordinator/coordinator_suite_test.go
@@ -0,0 +1,13 @@
+package coordinator
+
+import (
+	"testing"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+func TestCoordinator(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "coordinator (shared runtime) Suite")
+}
--- a/core/http/endpoints/openai/coordinator/coordinator_test.go
+++ b/core/http/endpoints/openai/coordinator/coordinator_test.go
@@ -0,0 +1,124 @@
+package coordinator
+
+import (
+	"errors"
+	"fmt"
+	"sync"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// A tiny toy machine exercises the generic runtime directly (the five real
+// machines exercise it via their aliases, but the gate measures this package's
+// own coverage). off <-toggle-> on; burst emits three ordered effects; boom is
+// the unhandled/error path.
+type tstate int
+
+const (
+	off tstate = iota
+	on
+)
+
+type tevent int
+
+const (
+	toggle tevent = iota
+	burst
+	boom
+)
+
+type teffect string
+
+func tnext(s tstate, e tevent) (tstate, []teffect, error) {
+	switch {
+	case e == toggle && s == off:
+		return on, []teffect{"on"}, nil
+	case e == toggle:
+		return off, []teffect{"off"}, nil
+	case e == burst:
+		return s, []teffect{"a", "b", "c"}, nil
+	case e == boom:
+		return s, nil, errors.New("boom: unhandled")
+	default:
+		return s, nil, fmt.Errorf("unknown event %d", int(e))
+	}
+}
+
+type recordingSink struct {
+	mu  sync.Mutex // guards log
+	log []teffect  // effects in the order Perform received them
+}
+
+func (s *recordingSink) Perform(e teffect) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	s.log = append(s.log, e)
+}
+
+func (s *recordingSink) snapshot() []teffect {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	// Hand back a private copy so callers can inspect it without holding mu
+	// while Perform keeps appending to the live log.
+	return append(make([]teffect, 0, len(s.log)), s.log...)
+}
+
+var _ = Describe("coordinator.Coordinator", func() {
+	It("starts in the initial state", func() {
+		c := New[tstate, tevent, teffect](off, tnext, &recordingSink{})
+		Expect(c.State()).To(Equal(off))
+	})
+
+	It("advances state and performs the transition's effects", func() {
+		sink := &recordingSink{}
+		c := New[tstate, tevent, teffect](off, tnext, sink)
+
+		Expect(c.Apply(toggle)).To(Succeed())
+		Expect(c.State()).To(Equal(on))
+		Expect(c.Apply(toggle)).To(Succeed())
+		Expect(c.State()).To(Equal(off))
+
+		Expect(sink.snapshot()).To(Equal([]teffect{"on", "off"}))
+	})
+
+	It("performs multiple effects in order", func() {
+		sink := &recordingSink{}
+		c := New[tstate, tevent, teffect](off, tnext, sink)
+		Expect(c.Apply(burst)).To(Succeed())
+		Expect(sink.snapshot()).To(Equal([]teffect{"a", "b", "c"}))
+	})
+
+	It("returns the transition error and leaves state unchanged", func() {
+		sink := &recordingSink{}
+		c := New[tstate, tevent, teffect](on, tnext, sink)
+		err := c.Apply(boom)
+		Expect(err).To(HaveOccurred())
+		Expect(c.State()).To(Equal(on), "state unchanged on error")
+		Expect(sink.snapshot()).To(BeEmpty(), "no effects performed on error")
+	})
+
+	It("serializes concurrent Apply from many goroutines (run with -race)", func() {
+		const goroutines = 8
+		const each = 1000
+		sink := &recordingSink{}
+		c := New[tstate, tevent, teffect](off, tnext, sink)
+
+		var wg sync.WaitGroup
+		wg.Add(goroutines)
+		for range goroutines {
+			go func() {
+				defer wg.Done()
+				for range each {
+					_ = c.Apply(toggle)
+				}
+			}()
+		}
+		wg.Wait()
+
+		// goroutines*each toggles from off; an even total returns to off. The
+		// point is race-freedom + a consistent final state, not the value itself.
+		Expect(c.State()).To(Equal(off))
+		Expect(sink.snapshot()).To(HaveLen(goroutines * each))
+	})
+})
--- a/core/http/endpoints/openai/inference.go
+++ b/core/http/endpoints/openai/inference.go
@@ -13,6 +13,14 @@ import (
 	"github.com/mudler/xlog"
 )

+// reachedTokenBudget reports whether generation ran into the configured
+// max_tokens ceiling; a nil or non-positive maxTokens means "no limit". It gates
+// regeneration retries and the "length" finish_reason (issue #9716).
+func reachedTokenBudget(completion int, maxTokens *int) bool {
+	unlimited := maxTokens == nil || *maxTokens <= 0
+	return !unlimited && completion >= *maxTokens
+}
+
 func ComputeChoices(
 	req *schema.OpenAIRequest,
 	predInput string,
@@ -113,11 +121,21 @@ func ComputeChoices(
 			}
 			prediction = p

+			// budgetExhausted is true when the model stopped because it reached
+			// the configured max_tokens ceiling. None of the retry paths below
+			// should fire in that case: regenerating would just hit the same
+			// ceiling again and multiply token consumption (issue #9716). A
+			// thinking model that spends its whole budget on the reasoning block
+			// produces an empty content / reasoning-only response, which would
+			// otherwise look like a failed generation worth retrying. This is a
+			// "length" finish, not an empty one.
+			budgetExhausted := reachedTokenBudget(prediction.Usage.Completion, config.Maxtokens)
+
 			// Built-in: retry on truly empty response (no tokens at all).
 			// However, when the C++ autoparser is active, it clears the raw
 			// message and delivers content via ChatDeltas instead. Do NOT
 			// retry if ChatDeltas contain tool calls or content.
-			if strings.TrimSpace(prediction.Response) == "" && attempt < maxRetries {
+			if strings.TrimSpace(prediction.Response) == "" && attempt < maxRetries && !budgetExhausted {
 				hasChatDeltaData := false
 				for _, d := range prediction.ChatDeltas {
 					if d.Content != "" || len(d.ToolCalls) > 0 {
@@ -159,7 +177,7 @@ func ComputeChoices(
 					}
 				}
 			}
-			if shouldRetryFn != nil && !skipCallerRetry && shouldRetryFn(attempt) && attempt < maxRetries {
+			if shouldRetryFn != nil && !skipCallerRetry && !budgetExhausted && shouldRetryFn(attempt) && attempt < maxRetries {
 				// Caller has already reset its state inside shouldRetry
 				result = result[:0]
 				allChatDeltas = nil
--- a/core/http/endpoints/openai/inference_test.go
+++ b/core/http/endpoints/openai/inference_test.go
@@ -393,6 +393,73 @@ var _ = Describe("ComputeChoices", func() {
 		})
 	})

+	Context("reachedTokenBudget", func() {
+		ptr := func(i int) *int { return &i }
+		It("is false when no limit is configured", func() {
+			Expect(reachedTokenBudget(1000, nil)).To(BeFalse())
+			Expect(reachedTokenBudget(1000, ptr(0))).To(BeFalse())
+			Expect(reachedTokenBudget(1000, ptr(-1))).To(BeFalse())
+		})
+		It("is false when generation stopped below the limit", func() {
+			Expect(reachedTokenBudget(99, ptr(100))).To(BeFalse())
+		})
+		It("is true when generation reached or exceeded the limit", func() {
+			Expect(reachedTokenBudget(100, ptr(100))).To(BeTrue())
+			Expect(reachedTokenBudget(101, ptr(100))).To(BeTrue())
+		})
+	})
+
+	Context("max_tokens budget exhausted on reasoning (issue #9716)", func() {
+		// Reproduces the streaming retry loop: when a thinking model spends its
+		// entire max_tokens budget on the reasoning block, the C++ autoparser
+		// clears the raw Response and delivers reasoning-only ChatDeltas (no
+		// content, no tool calls). The built-in empty-response retry then fires
+		// and regenerates from scratch up to maxRetries times, each re-consuming
+		// the whole budget — instead of terminating with finish_reason "length".
+		It("should NOT retry when the token budget was exhausted", func() {
+			maxTokens := 100
+			cfg.Maxtokens = &maxTokens
+
+			calls := 0
+			backend.ModelInferenceFunc = func(
+				ctx context.Context, s string, messages schema.Messages,
+				images, videos, audios []string,
+				loader *model.ModelLoader, c *config.ModelConfig, cl *config.ModelConfigLoader,
+				o *config.ApplicationConfig,
+				tokenCallback func(string, backend.TokenUsage) bool,
+				tools, toolChoice string,
+				logprobs, topLogprobs *int,
+				logitBias map[string]float64,
+				metadata map[string]string,
+			) (func() (backend.LLMResponse, error), error) {
+				predFunc := func() (backend.LLMResponse, error) {
+					calls++
+					// Autoparser cleared Response; only reasoning was produced,
+					// and the completion count reached the max_tokens budget.
+					return backend.LLMResponse{
+						Response:   "",
+						ChatDeltas: []*pb.ChatDelta{{ReasoningContent: "thinking..."}},
+						Usage:      backend.TokenUsage{Prompt: 5, Completion: maxTokens},
+					}, nil
+				}
+				return predFunc, nil
+			}
+
+			_, usage, _, err := ComputeChoices(
+				makeReq(), "test", cfg, nil, appCfg, nil,
+				func(s string, c *[]schema.Choice) {
+					*c = append(*c, schema.Choice{Text: s})
+				},
+				nil,
+			)
+			Expect(err).ToNot(HaveOccurred())
+			// The model hit its token ceiling; regenerating would just hit it
+			// again and multiply token consumption. Exactly one call expected.
+			Expect(calls).To(Equal(1), "budget-exhausted generation must not be retried")
+			Expect(usage.Completion).To(Equal(maxTokens))
+		})
+	})
+
 	Context("with streaming token callback", func() {
 		It("should call tokenCallback for streaming responses", func() {
 			var streamedTokens []string
--- a/core/http/endpoints/openai/realtime.go
+++ b/core/http/endpoints/openai/realtime.go
@@ -12,7 +12,6 @@ import (
 	"os"
 	"strconv"
 	"sync"
-	"sync/atomic"
 	"time"

 	"net/http"
@@ -26,6 +25,8 @@ import (
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/http/auth"
 	mcpTools "github.com/mudler/LocalAI/core/http/endpoints/mcp"
+	"github.com/mudler/LocalAI/core/http/endpoints/openai/respcoord"
+	"github.com/mudler/LocalAI/core/http/endpoints/openai/turncoord"
 	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/core/templates"
@@ -168,44 +169,12 @@ type Session struct {
 	gateMu        sync.Mutex
 	voiceVerified bool

-	// Response cancellation: protects activeResponseCancel/activeResponseDone
-	responseMu           sync.Mutex
-	activeResponseCancel context.CancelFunc
-	activeResponseDone   chan struct{}
-}
-
-// cancelActiveResponse cancels any in-flight response and waits for its
-// goroutine to exit. This ensures we never have overlapping responses and
-// that interrupted responses are fully cleaned up before starting a new one.
-func (s *Session) cancelActiveResponse() {
-	s.responseMu.Lock()
-	cancel := s.activeResponseCancel
-	done := s.activeResponseDone
-	s.responseMu.Unlock()
-
-	if cancel != nil {
-		cancel()
-	}
-	if done != nil {
-		<-done
-	}
-}
-
-// startResponse cancels any active response and returns a new context for
-// the replacement response. The caller MUST close the returned done channel
-// when the response goroutine exits.
-func (s *Session) startResponse(parent context.Context) (context.Context, chan struct{}) {
-	s.cancelActiveResponse()
-
-	ctx, cancel := context.WithCancel(parent)
-	done := make(chan struct{})
-
-	s.responseMu.Lock()
-	s.activeResponseCancel = cancel
-	s.activeResponseDone = done
-	s.responseMu.Unlock()
-
-	return ctx, done
+	// respSink is the explicit response-coordination state machine (respcoord,
+	// machine M3). It replaces the legacy startResponse/cancelActiveResponse
+	// pair and its dual-writer activeResponse* fields: every start/cancel/finish
+	// decision is serialized through respcoord.Coordinator, guaranteeing at most
+	// one live response. See realtime_respcoord.go.
+	respSink *responseSink
 }

 func (s *Session) FromClient(session *types.SessionUnion) {
@@ -258,8 +227,10 @@ type Conversation struct {
 	// is kept out of Items (so trimRealtimeItems never drops it) and rendered
 	// as a system message right after the session instructions.
 	Memory string
-	// compacting ensures at most one background compaction runs per conversation.
-	compacting atomic.Bool
+	// compaction is the explicit single-flight compaction coordinator (M4): at
+	// most one background summarize+evict runs per conversation at a time. It
+	// replaces the legacy `compacting atomic.Bool`. See realtime_compactcoord.go.
+	compaction *compactionSink
 }

 func (c *Conversation) ToServer() types.Conversation {
@@ -288,6 +259,12 @@ type Model interface {
 	// sound-event tags. topK caps the number of returned tags (0 = backend
 	// default), threshold drops tags below the given score (0 = keep all).
 	SoundDetection(ctx context.Context, audio string, topK int, threshold float32) (*schema.SoundClassificationResult, error)
+	// TranscribeLive opens a live (bidirectional) transcription session on the
+	// pipeline's transcription backend, used by semantic_vad turn detection;
+	// onEvent fires from a background goroutine for every delta/EOU/final
+	// event. Backends without live support fail with an error satisfying
+	// grpcerrors.IsLiveTranscriptionUnsupported.
+	TranscribeLive(ctx context.Context, language string, onEvent func(backend.LiveTranscriptionEvent)) (backend.LiveTranscriptionSession, error)
 	PredictConfig() *config.ModelConfig
 }

@@ -513,14 +490,10 @@ func runRealtimeSession(application *application.Application, t Transport, model
 	// input_audio_buffer.commit. There is no transcription stage in that case.
 	soundOnly := cfg.Pipeline.SoundDetection != "" && cfg.Pipeline.Transcription == "" && cfg.Pipeline.LLM == ""

-	turnDetection := &types.TurnDetectionUnion{
-		ServerVad: &types.ServerVad{
-			Threshold:         0.5,
-			PrefixPaddingMs:   300,
-			SilenceDurationMs: 500,
-			CreateResponse:    true,
-		},
-	}
+	// defaultTurnDetection seeds server_vad by default, or semantic_vad when the
+	// pipeline opts in (turn_detection.type: semantic_vad); clients can still
+	// override per session via session.update.
+	turnDetection := defaultTurnDetection(cfg)
 	inputAudioTranscription := &types.AudioTranscription{Model: sttModel}
 	if soundOnly {
 		turnDetection = nil           // turn_detection none: no VAD
@@ -561,12 +534,27 @@ func runRealtimeSession(application *application.Application, t Transport, model
 	}
 	session.CompactionEnabled, session.CompactionTrigger, session.MaxSummaryTokens, session.SummaryModel = resolveCompaction(cfg, session.MaxHistoryItems)

+	// Single-writer response coordinator (machine M3). All response starts and
+	// cancels go through this, so the read-loop and VAD goroutine can never race
+	// into two overlapping responses (see realtime_respcoord.go).
+	session.respSink = newResponseSink()
+
 	// Create a default conversation
 	conversationID := generateConversationID()
 	conversation := &Conversation{
 		ID:    conversationID,
 		Items: []*types.MessageItemUnion{},
 	}
+	// The compaction coordinator's work closure resolves the summarizer (lazily
+	// loading a configured summary_model) and runs the summarize+evict off the
+	// response path — only when a compaction actually starts.
+	conversation.compaction = newCompactionSink(func(ctx context.Context) {
+		model := session.summarizerModel()
+		if model == nil {
+			return
+		}
+		session.compact(ctx, conversation, model)
+	})
 	session.Conversations[conversationID] = conversation
 	session.DefaultConversationID = conversationID

@@ -648,34 +636,22 @@ func runRealtimeSession(application *application.Application, t Transport, model
 	})

 	var (
-		msg  []byte
-		wg   sync.WaitGroup
-		done = make(chan struct{})
+		msg []byte
+		wg  sync.WaitGroup
 	)

-	vadServerStarted := false
-	toggleVAD := func() {
-		if session.TurnDetection != nil && session.TurnDetection.ServerVad != nil && !vadServerStarted {
-			xlog.Debug("Starting VAD goroutine...")
-			done = make(chan struct{})
-			wg.Go(func() {
-				conversation := session.Conversations[session.DefaultConversationID]
-				handleVAD(session, conversation, t, done)
-			})
-			vadServerStarted = true
-		} else if (session.TurnDetection == nil || session.TurnDetection.ServerVad == nil) && vadServerStarted {
-			xlog.Debug("Stopping VAD goroutine...")
-			close(done)
-			vadServerStarted = false
-		}
-	}
+	// M1 connection lifecycle. The VAD goroutine's run/stop (and its done channel)
+	// and the once-only teardown are owned by this coordinator, so the channel is
+	// closed exactly once and never resurrected after teardown (Part 2, failure
+	// mode 6; invariants #8, #10). See realtime_conncoord.go and conncoord/.
+	conn := newConnSink(session, sessionID, t, &wg)
+	toggleVAD := func() { conn.setVAD(turnDetectionActive(session.TurnDetection)) }

 	// For WebRTC sessions, start the Opus decode loop before VAD so that
 	// decoded PCM is already flowing when VAD's first tick fires.
-	var decodeDone chan struct{}
 	if wt, ok := t.(*WebRTCTransport); ok {
-		decodeDone = make(chan struct{})
-		go decodeOpusLoop(session, wt.opusBackend, decodeDone)
+		conn.decodeDone = make(chan struct{})
+		go decodeOpusLoop(session, wt.opusBackend, conn.decodeDone)
 	}

 	toggleVAD()
@@ -684,9 +660,9 @@ func runRealtimeSession(application *application.Application, t Transport, model
 	// with window/hop configured, the server classifies the last window of
 	// streamed audio on a timer, so the client only has to stream (no commits).
 	// This runs independent of VAD (sound events are not speech).
-	var soundWindowDone chan struct{}
 	if soundOnly && session.SoundDetectionWindowMs > 0 && session.SoundDetectionHopMs > 0 {
-		soundWindowDone = make(chan struct{})
+		conn.soundWindowDone = make(chan struct{})
+		soundWindowDone := conn.soundWindowDone
 		wg.Go(func() {
 			handleSoundWindow(session, t, soundWindowDone)
 		})
@@ -811,11 +787,11 @@ func runRealtimeSession(application *application.Application, t Transport, model
 			xlog.Debug("recv", "message", string(msg))

 			sessionLock.Lock()
-			isServerVAD := session.TurnDetection != nil && session.TurnDetection.ServerVad != nil
+			autoTurnDetection := turnDetectionActive(session.TurnDetection)
 			sessionLock.Unlock()

 			// TODO: At the least need to check locking and timer state in the VAD Go routine before allowing this
-			if isServerVAD {
+			if autoTurnDetection {
 				sendNotImplemented(t, "input_audio_buffer.commit in conjunction with VAD")
 				continue
 			}
@@ -831,11 +807,9 @@ func runRealtimeSession(application *application.Application, t Transport, model
 				ItemID:          generateItemID(),
 			})

-			respCtx, respDone := session.startResponse(context.Background())
-			go func() {
-				defer close(respDone)
-				commitUtterance(respCtx, allAudio, session, conversation, t)
-			}()
+			session.respSink.issue(context.Background(), respcoord.SourceClient, func(ctx context.Context) {
+				commitUtterance(ctx, allAudio, session, conversation, t)
+			})

 		case types.InputAudioBufferClearEvent:
 			xlog.Debug("recv", "message", string(msg))
@@ -968,15 +942,14 @@ func runRealtimeSession(application *application.Application, t Transport, model
 				conversation.Lock.Unlock()
 			}

-			respCtx, respDone := session.startResponse(context.Background())
-			go func() {
-				defer close(respDone)
-				triggerResponse(respCtx, session, conversation, t, &e.Response)
-			}()
+			resp := e.Response
+			session.respSink.issue(context.Background(), respcoord.SourceClient, func(ctx context.Context) {
+				triggerResponse(ctx, session, conversation, t, &resp)
+			})

 		case types.ResponseCancelEvent:
 			xlog.Debug("recv", "message", string(msg))
-			session.cancelActiveResponse()
+			session.respSink.cancel(respcoord.SourceClient)

 		default:
 			xlog.Error("unknown message type")
@@ -984,28 +957,11 @@ func runRealtimeSession(application *application.Application, t Transport, model
 		}
 	}

-	// Cancel any in-flight response before tearing down
-	session.cancelActiveResponse()
-
-	// Stop the Opus decode goroutine (if running)
-	if decodeDone != nil {
-		close(decodeDone)
-	}
-
-	// Signal any running VAD goroutine to exit.
-	if vadServerStarted {
-		close(done)
-	}
-	// Stop the server-side sound-detection windowing goroutine (if running).
-	if soundWindowDone != nil {
-		close(soundWindowDone)
-	}
-	wg.Wait()
-
-	// Remove the session from the sessions map
-	sessionLock.Lock()
-	delete(sessions, sessionID)
-	sessionLock.Unlock()
+	// Tear down through the connection coordinator (once). It stops any running
+	// VAD goroutine, then the opus-decode and sound-window goroutines, joins them,
+	// cancels the in-flight response and drains all response goroutines, and
+	// finally removes the session — all in dependency order, exactly once.
+	conn.close()
 }

 // sendEvent sends a server event via the transport, logging any errors.
@@ -1285,8 +1241,38 @@ func decodeOpusLoop(session *Session, opusBackend grpc.Backend, done chan struct
 	}
 }

+// noSpeechHoldbackSec is how much of the tail of an inspected, segment-free
+// buffer survives the periodic no-speech clear. It must cover the VAD's
+// onset-detection latency: a word can already be underway in the newest part
+// of the window without silero having crossed its threshold yet, and clearing
+// it cuts the start of the utterance the next tick will detect.
+const noSpeechHoldbackSec = 0.5
+
+// dropInspectedPrefix discards the first inspected-holdbackBytes bytes of buf:
+// the part of the window a VAD tick already examined, minus a holdbackBytes
+// tail that is kept. Bytes past inspected (appended while the tick ran) always
+// survive. If that count is not positive, buf itself is returned; otherwise the
+// remainder is copied into a new slice so it shares no backing array with buf,
+// and nil is returned when nothing remains.
+func dropInspectedPrefix(buf []byte, inspected, holdbackBytes int) []byte {
+	drop := inspected - holdbackBytes
+	switch {
+	case drop <= 0:
+		return buf
+	case drop >= len(buf):
+		return nil
+	}
+	return append([]byte(nil), buf[drop:]...)
+}
+
 // handleVAD is a goroutine that listens for audio data from the client,
-// runs VAD on the audio data, and commits utterances to the conversation
+// runs VAD on the audio data, and commits utterances to the conversation.
+//
+// With turn_detection.type == "semantic_vad" (sv != nil below) the silero
+// loop is augmented by a live transcription stream: the buffer's new audio
+// is fed to the transcription model every tick and its end-of-utterance
+// token switches the commit threshold between a short post-EOU window and
+// the long eagerness fallback. The server_vad path is untouched.
 func handleVAD(session *Session, conv *Conversation, t Transport, done chan struct{}) {
 	vadContext, cancel := context.WithCancel(context.Background())
 	go func() {
@@ -1299,9 +1285,22 @@ func handleVAD(session *Session, conv *Conversation, t Transport, done chan stru
 		silenceThreshold = float64(session.TurnDetection.ServerVad.SilenceDurationMs) / 1000
 	}

-	speechStarted := false
+	lts := newLiveTurnState(session, t)
 	startTime := time.Now()

+	// M2 turn-detection state machine. "Speech started" and "a turn's live ASR
+	// stream is open" are ONE coordinator state (Idle/Speaking), so they cannot
+	// desync the way the legacy speechStarted bool and lts.open() could (Part 2,
+	// failure mode 4). See realtime_turncoord.go and turncoord/.
+	sink := newTurnSink(session, conv, t, lts, vadContext, startTime)
+	// Teardown: end any open turn through the coordinator (DiscardTurn closes the
+	// live stream; no-op if already idle). Replaces the bare lts.discardTurn().
+	defer func() {
+		if err := sink.coord.Apply(turncoord.Abort{Reason: turncoord.AbortTeardown}); err != nil {
+			xlog.Error("turncoord: abort(teardown) failed", "error", err)
+		}
+	}()
+
 	ticker := time.NewTicker(300 * time.Millisecond)
 	defer ticker.Stop()

@@ -1310,6 +1309,30 @@ func handleVAD(session *Session, conv *Conversation, t Transport, done chan stru
 		case <-done:
 			return
 		case <-ticker.C:
+			// Semantic mode is re-read each tick: session.update can switch
+			// turn-detection modes (and the retranscribe gate) mid-session.
+			sessionLock.Lock()
+			var sv *types.RealtimeSessionSemanticVad
+			if session.TurnDetection != nil {
+				sv = session.TurnDetection.SemanticVad
+			}
+			retranscribe := sv != nil && session.ModelConfig != nil &&
+				session.ModelConfig.Pipeline.TurnDetectionRetranscribe()
+			sessionLock.Unlock()
+
+			// The turn coordinator's data-heavy effects (OpenTurn/CommitTurn)
+			// need this tick's mode; set it before any Apply below.
+			sink.sv = sv
+
+			// session.update switched semantic -> server mid-turn: drop the
+			// orphaned live stream. This is NOT a turn abort — the turn continues
+			// under server_vad (a config change must not cut off a mid-utterance
+			// speaker), so the coordinator stays Speaking; only the orphaned live
+			// stream is closed.
+			if sv == nil && lts.open() {
+				lts.discardTurn()
+			}
+
 			session.AudioBufferLock.Lock()
 			allAudio := make([]byte, len(session.InputAudioBuffer))
 			copy(allAudio, session.InputAudioBuffer)
@@ -1323,6 +1346,13 @@ func handleVAD(session *Session, conv *Conversation, t Transport, done chan stru
 			// Resample from InputSampleRate to 16kHz
 			aints = sound.ResampleInt16(aints, session.InputSampleRate, localSampleRate)

+			audioLength := float64(len(aints)) / localSampleRate
+
+			if sv != nil && lts.open() {
+				lts.feedNewAudio(aints)
+				lts.drainEvents(audioLength)
+			}
+
 			segments, err := runVAD(vadContext, session, aints)
 			if err != nil {
 				if err.Error() == "unexpected speech end" {
@@ -1334,31 +1364,52 @@ func handleVAD(session *Session, conv *Conversation, t Transport, done chan stru
 				continue
 			}

-			audioLength := float64(len(aints)) / localSampleRate
-
-			// TODO: When resetting the buffer we should retain a small postfix
+			// NOTE: the no-speech clear and the min-buffer gate above stay on
+			// the short silenceThreshold even in semantic mode — the eagerness
+			// fallback applies only to the end-of-speech commit decision, or a
+			// low eagerness would delay speech_started/barge-in by seconds.
 			if len(segments) == 0 && audioLength > silenceThreshold {
+				// "No segments" is not "no speech": silero (threshold 0.5)
+				// crosses up to a few hundred ms into a soft word onset, so
+				// the newest audio in the inspected window may be the start
+				// of a word the next tick will recognize — and more audio
+				// arrived while this tick ran. Keep both; drop only the
+				// older, confirmed-silent head, or utterance onsets get cut.
+				holdback := int(noSpeechHoldbackSec*float64(session.InputSampleRate)) * 2
 				session.AudioBufferLock.Lock()
-				session.InputAudioBuffer = nil
+				session.InputAudioBuffer = dropInspectedPrefix(session.InputAudioBuffer, len(allAudio), holdback)
 				session.AudioBufferLock.Unlock()

+				// No-speech clear: end any open turn (Speaking -> Idle, discarding
+				// the partial). Returning to Idle is the fix for failure mode 4 —
+				// the legacy discardTurn left speechStarted true, suppressing the
+				// next onset. Aborting while already Idle is a no-op.
+				if err := sink.coord.Apply(turncoord.Abort{Reason: turncoord.AbortNoSpeech}); err != nil {
+					xlog.Error("turncoord: abort(no_speech) failed", "error", err)
+				}
 				continue
 			} else if len(segments) == 0 {
 				continue
 			}

-			if !speechStarted {
-				// Barge-in: cancel any in-flight response so we stop
-				// sending audio and don't keep the interrupted reply in history.
-				session.cancelActiveResponse()
+			// Speech detected this tick: open the turn (Idle -> Speaking) through
+			// the coordinator. On that transition it opens the turn's live ASR
+			// stream + feeds the buffered prefix (OpenTurn), cancels any in-flight
+			// response (BargeIn, non-blocking — the VAD tick is never stalled), and
+			// emits speech_started. While already Speaking it is a no-op, so "turn
+			// open" and "speech started" can never disagree. The turn id is minted
+			// here and carried by the coordinator through to the committed event.
+			sink.onsetAudio = aints
+			if err := sink.coord.Apply(turncoord.Onset{Turn: turncoord.TurnID(generateItemID())}); err != nil {
+				xlog.Error("turncoord: onset failed", "error", err)
+			}

-				sendEvent(t, types.InputAudioBufferSpeechStartedEvent{
-					ServerEventBase: types.ServerEventBase{
-						EventID: "event_TODO",
-					},
-					AudioStartMs: time.Since(startTime).Milliseconds(),
-				})
-				speechStarted = true
+			if sv != nil {
+				// Drain again: events produced by THIS tick's feed have
+				// usually arrived by the time runVAD returns, and leaving
+				// them for the next tick adds 300ms to every EOU-triggered
+				// commit.
+				lts.drainEvents(audioLength)
 			}

 			// Segment still in progress when audio ended
@@ -1367,41 +1418,90 @@ func handleVAD(session *Session, conv *Conversation, t Transport, done chan stru
 				continue
 			}

-			if float32(audioLength)-segEndTime > float32(silenceThreshold) {
+			threshold := silenceThreshold
+			eouPending := false
+			if sv != nil {
+				eouPending = lts.eouPending(segments)
+				threshold = lts.thresholdSec(eouPending, sv)
+			}
+
+			if float32(audioLength)-segEndTime > float32(threshold) {
+				if sv != nil {
+					trigger, eouLag := lts.commitTrigger(eouPending, float64(segEndTime))
+					xlog.Info("semantic_vad: committing turn",
+						"trigger", trigger,
+						"speech_end_s", segEndTime,
+						"eou_lag_s", eouLag,
+						"silence_s", audioLength-float64(segEndTime),
+						"audio_s", audioLength)
+				}
+				// Retranscribe gate (semantic mode, EOU-triggered commits
+				// only): cross-check the streamed EOU with an offline decode
+				// of the buffered turn before committing. Runs synchronously
+				// on the tick — the engine would serialize a concurrent feed
+				// against it anyway. Timeout-triggered commits skip the gate.
+				var gated *schema.TranscriptionResult
+				if retranscribe && eouPending {
+					batch, gerr := transcribeUtterance(vadContext, sound.Int16toBytesLE(aints), session)
+					switch {
+					case gerr != nil:
+						xlog.Warn("semantic_vad: retranscribe gate failed; committing via the file path", "error", gerr)
+					case !batch.Eou:
+						xlog.Info("semantic_vad: batch decode did not confirm the streamed EOU; continuing to listen",
+							"streamed", lts.previewText(), "batch", batch.Text)
+						// The batch decode rejected the streamed EOU as a false
+						// positive: consume the recorded EOU so the next tick
+						// falls back to the eagerness window instead of
+						// re-triggering on the same token.
+						lts.eouAtSec = 0
+						continue
+					default:
+						xlog.Info("semantic_vad: batch decode confirmed the streamed EOU",
+							"streamed", lts.previewText(), "batch", batch.Text)
+						gated = batch
+					}
+				}
+
 				xlog.Debug("Detected end of speech segment")
 				session.AudioBufferLock.Lock()
-				session.InputAudioBuffer = nil
+				// Keep audio appended while this tick ran — it belongs to
+				// the next turn (in any mode: nil-ing it dropped the onset
+				// of an utterance started right after a commit).
+				session.InputAudioBuffer = dropInspectedPrefix(session.InputAudioBuffer, len(allAudio), 0)
 				session.AudioBufferLock.Unlock()

-				sendEvent(t, types.InputAudioBufferSpeechStoppedEvent{
-					ServerEventBase: types.ServerEventBase{
-						EventID: "event_TODO",
-					},
-					AudioEndMs: time.Since(startTime).Milliseconds(),
-				})
-				speechStarted = false
-
-				sendEvent(t, types.InputAudioBufferCommittedEvent{
-					ServerEventBase: types.ServerEventBase{
-						EventID: "event_TODO",
-					},
-					ItemID:         generateItemID(),
-					PreviousItemID: "TODO",
-				})
-
-				abytes := sound.Int16toBytesLE(aints)
-				// TODO: Remove prefix silence that is is over TurnDetectionParams.PrefixPaddingMs
-				respCtx, respDone := session.startResponse(vadContext)
-				go func() {
-					defer close(respDone)
-					commitUtterance(respCtx, abytes, session, conv, t)
-				}()
+				// Commit the turn through the coordinator: it emits speech_stopped
+				// (EmitSpeechStopped) then the committed event, finalizes the live
+				// stream, and issues the response (CommitTurn). The committed item
+				// id is the coordinator's turn id (== the id the live captions
+				// streamed under), so the client replaces the partial text.
+				sink.commitAudio = sound.Int16toBytesLE(aints)
+				sink.commitAudioLength = audioLength
+				sink.commitRetranscribe = retranscribe
+				sink.commitGated = gated
+				// TODO: Remove prefix silence that is over TurnDetectionParams.PrefixPaddingMs
+				if err := sink.coord.Apply(turncoord.Silence{}); err != nil {
+					xlog.Error("turncoord: commit failed", "error", err)
+				}
 			}
 		}
 	}
 }

 func commitUtterance(ctx context.Context, utt []byte, session *Session, conv *Conversation, t Transport) {
+	commitUtteranceWithTranscript(ctx, utt, nil, nil, "", session, conv, t)
+}
+
+// commitUtteranceWithTranscript commits one user turn. live carries the
+// transcript semantic_vad's live stream already produced (its caption deltas
+// were streamed to the client during the turn, so only the completed event
+// is emitted here); gated carries the retranscribe gate's batch decode (the
+// authoritative transcript in that mode). With neither — server_vad, manual
+// commits, semantic degrade, or a live stream that heard nothing — the audio
+// is written to a temp WAV and transcribed via the file path as before.
+// itemID is the turn's conversation item id ("" mints a fresh one); it must
+// match the id any live deltas were sent under.
+func commitUtteranceWithTranscript(ctx context.Context, utt []byte, live *liveUtterance, gated *schema.TranscriptionResult, itemID string, session *Session, conv *Conversation, t Transport) {
 	if len(utt) == 0 {
 		return
 	}
@@ -1466,14 +1566,37 @@ func commitUtterance(ctx context.Context, utt []byte, session *Session, conv *Co
 	}

 	// TODO: If we have a real any-to-any model then transcription is optional
+
+	// The turn's live captions (semantic_vad) already streamed under this
+	// itemID; the completed event below reuses it so the client replaces the
+	// partial text. server_vad / manual commits arrive with no itemID, so mint
+	// one here.
+	if itemID == "" {
+		itemID = generateItemID()
+	}
+
 	var transcript string
 	switch {
+	case gated != nil:
+		// semantic_vad retranscribe gate: the batch decode is authoritative.
+		transcript = gated.Text
+		if err := emitPrecomputedTranscription(t, itemID, nil, transcript); err != nil {
+			sendError(t, "transcription_failed", err.Error(), "", "event_TODO")
+			return
+		}
+	case live != nil && live.Text != "":
+		// The caption deltas already streamed during the turn under this
+		// itemID; the completed event replaces the partial text client-side.
+		transcript = live.Text
+		if err := emitPrecomputedTranscription(t, itemID, nil, transcript); err != nil {
+			sendError(t, "transcription_failed", err.Error(), "", "event_TODO")
+			return
+		}
 	case session.InputAudioTranscription != nil:
 		// emitTranscription streams transcript deltas when
 		// pipeline.streaming.transcription is set, otherwise emits a single
 		// completed event; either way it returns the final transcript text.
-		var err error
-		transcript, err = emitTranscription(ctx, t, session, generateItemID(), f.Name())
+		transcript, err = emitTranscription(ctx, t, session, itemID, f.Name())
 		if err != nil {
 			// Drain the gate goroutine before returning so its in-flight read of
 			// the temp WAV finishes before the deferred os.Remove fires.
@@ -1642,6 +1765,56 @@ func writeWindowWAV(pcm []byte, sampleRate int) (string, error) {
 	return f.Name(), nil
 }

+// writeUtteranceWAV dumps raw 16 kHz mono PCM into a temporary WAV file for
+// the file-based transcription paths. The caller must invoke cleanup.
+func writeUtteranceWAV(utt []byte) (string, func(), error) {
+	tmp, createErr := os.CreateTemp("", "realtime-audio-chunk-*.wav")
+	if createErr != nil {
+		return "", nil, createErr
+	}
+	discard := func() {
+		_ = tmp.Close()
+		_ = os.Remove(tmp.Name())
+	}
+	xlog.Debug("Writing to file", "file", tmp.Name())
+
+	header := laudio.NewWAVHeader(uint32(len(utt)))
+	writeErr := header.Write(tmp)
+	if writeErr == nil {
+		_, writeErr = tmp.Write(utt)
+	}
+	if writeErr != nil {
+		discard()
+		return "", nil, writeErr
+	}
+	_ = tmp.Sync()
+	return tmp.Name(), discard, nil
+}
+
+// transcribeUtterance performs a single offline (unary) decode of the buffered
+// turn: the semantic_vad retranscribe gate. The result's Eou flag reports
+// whether the batch decode also ended on the end-of-utterance token.
+func transcribeUtterance(ctx context.Context, utt []byte, session *Session) (*schema.TranscriptionResult, error) {
+	wavPath, removeWAV, err := writeUtteranceWAV(utt)
+	if err != nil {
+		return nil, err
+	}
+	defer removeWAV()
+
+	var lang, hint string
+	if tc := session.InputAudioTranscription; tc != nil {
+		lang, hint = tc.Language, tc.Prompt
+	}
+	result, err := session.ModelInterface.Transcribe(ctx, wavPath, lang, false, false, hint)
+	switch {
+	case err != nil:
+		return nil, err
+	case result == nil:
+		return nil, fmt.Errorf("transcribe result is nil")
+	}
+	return result, nil
+}
+
 func runVAD(ctx context.Context, session *Session, adata []int16) ([]schema.VADSegment, error) {
 	soundIntBuffer := &audio.IntBuffer{
 		Format:         &audio.Format{SampleRate: localSampleRate, NumChannels: 1},
@@ -1721,14 +1894,100 @@ func generateResponse(ctx context.Context, session *Session, utt []byte, transcr
 // without another response cycle.
 const maxAssistantToolTurns = 10

+// responseOutcome is how a response ended; the body sets it and triggerResponse
+// reads it once to emit the single terminal event. Zero value: outcomeCompleted.
+type responseOutcome int
+
+const (
+	outcomeCompleted responseOutcome = iota
+	outcomeCancelled
+	outcomeFailed // an error event was already sent; emit no terminal (legacy behavior)
+)
+
+// liveResponse accumulates the wire-visible result of ONE response.create across
+// the whole agentic tool-turn recursion: a single id, the output items as they
+// complete, the summed token usage, and the final outcome. triggerResponse owns
+// it; triggerResponseAtTurn / streamLLMResponse / emitToolCallItems fill it in.
+// This is what makes "exactly one response.done per response.create, with Output
+// and Usage populated" true — the body no longer emits per-turn terminals.
+type liveResponse struct {
+	id      string                   // response id shared by every turn of this response.create
+	output  []types.MessageItemUnion // items appended via addItem, reported in response.done
+	usage   backend.TokenUsage       // running Prompt/Completion totals fed by addUsage
+	outcome responseOutcome          // zero value is outcomeCompleted
+}
+
+func (r *liveResponse) addItem(it types.MessageItemUnion) { r.output = append(r.output, it) }
+
+func (r *liveResponse) addUsage(u backend.TokenUsage) {
+	r.usage.Prompt += u.Prompt
+	r.usage.Completion += u.Completion
+}
+
+// responseUsage converts the backend token counters into the OpenAI Realtime
+// response.usage shape. It returns nil when both counters are zero so the
+// field is omitted rather than sent as zeros.
+func responseUsage(u backend.TokenUsage) *types.TokenUsage {
+	in, out := u.Prompt, u.Completion
+	if in != 0 || out != 0 {
+		usage := types.TokenUsage{InputTokens: in, OutputTokens: out}
+		usage.TotalTokens = in + out
+		return &usage
+	}
+	// Nothing was counted: report no usage at all.
+	return nil
+}
+
 func triggerResponse(ctx context.Context, session *Session, conv *Conversation, t Transport, overrides *types.ResponseCreateParams) {
-	triggerResponseAtTurn(ctx, session, conv, t, overrides, 0)
+	// One response.created and one response.done per response.create — even when
+	// the server-side tool loop runs several inference turns: r is shared by every
+	// turn and accumulates the output items, token usage and outcome. Completed
+	// and cancelled terminals both report that usage; failures emit no terminal.
+	r := &liveResponse{id: generateUniqueID()}
+	sendEvent(t, types.ResponseCreatedEvent{
+		ServerEventBase: types.ServerEventBase{},
+		Response: types.Response{
+			ID:     r.id,
+			Object: "realtime.response",
+			Status: types.ResponseStatusInProgress,
+		},
+	})
+
+	triggerResponseAtTurn(ctx, session, conv, t, overrides, 0, r)
+
+	switch r.outcome {
+	case outcomeCancelled:
+		sendEvent(t, types.ResponseDoneEvent{
+			ServerEventBase: types.ServerEventBase{},
+			Response: types.Response{
+				ID:     r.id,
+				Object: "realtime.response",
+				Status: types.ResponseStatusCancelled,
+				Output: r.output,
+				Usage:  responseUsage(r.usage),
+			},
+		})
+	case outcomeFailed:
+		// A specific error event was already sent; emit no terminal (legacy behavior).
+	default:
+		sendEvent(t, types.ResponseDoneEvent{
+			ServerEventBase: types.ServerEventBase{},
+			Response: types.Response{
+				ID:     r.id,
+				Object: "realtime.response",
+				Status: types.ResponseStatusCompleted,
+				Output: r.output,
+				Usage:  responseUsage(r.usage),
+			},
+		})
+	}
+
 	// Fold aged-out turns into the rolling memory off the critical path; the
 	// next turn reaps the smaller buffer.
 	session.maybeCompact(conv)
 }

-func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversation, t Transport, overrides *types.ResponseCreateParams, toolTurn int) {
+func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversation, t Transport, overrides *types.ResponseCreateParams, toolTurn int, r *liveResponse) {
 	config := session.ModelInterface.PredictConfig()

 	// Default values
@@ -1891,15 +2150,9 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
 		images = append(images, m.StringImages...)
 	}

-	responseID := generateUniqueID()
-	sendEvent(t, types.ResponseCreatedEvent{
-		ServerEventBase: types.ServerEventBase{},
-		Response: types.Response{
-			ID:     responseID,
-			Object: "realtime.response",
-			Status: types.ResponseStatusInProgress,
-		},
-	})
+	// response.created/done are emitted once per response.create by triggerResponse;
+	// every turn (including agentic recursion) shares this id.
+	responseID := r.id

 	// Streamed LLM path: when the pipeline opts into LLM streaming, stream the
 	// transcript to the client as it is generated and synthesize the buffered
@@ -1915,7 +2168,7 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
 			respMods = overrides.OutputModalities
 		}
 		if canStream && modalitiesContainAudio(resolveOutputModalities(session.OutputModalities, respMods)) {
-			if streamLLMResponse(ctx, session, conv, t, responseID, conversationHistory, images, config, tools, toolChoice, toolTurn) {
+			if streamLLMResponse(ctx, session, conv, t, r, conversationHistory, images, config, tools, toolChoice, toolTurn) {
 				return
 			}
 		}
@@ -1924,26 +2177,22 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
 	predFunc, err := session.ModelInterface.Predict(ctx, conversationHistory, images, nil, nil, nil, tools, toolChoice, nil, nil, nil)
 	if err != nil {
 		sendError(t, "inference_failed", fmt.Sprintf("backend error: %v", err), "", "") // item.Assistant.ID is unknown here
+		r.outcome = outcomeFailed
 		return
 	}

 	pred, err := predFunc()
 	if err != nil {
 		sendError(t, "prediction_failed", fmt.Sprintf("backend error: %v", err), "", "")
+		r.outcome = outcomeFailed
 		return
 	}
+	r.addUsage(pred.Usage)

 	// Check for cancellation after LLM inference (barge-in may have fired)
 	if ctx.Err() != nil {
 		xlog.Debug("Response cancelled after LLM inference (barge-in)")
-		sendEvent(t, types.ResponseDoneEvent{
-			ServerEventBase: types.ServerEventBase{},
-			Response: types.Response{
-				ID:     responseID,
-				Object: "realtime.response",
-				Status: types.ResponseStatusCancelled,
-			},
-		})
+		r.outcome = outcomeCancelled
 		return
 	}

@@ -2103,18 +2352,12 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
 			conv.Lock.Unlock()
 		}

-		// sendCancelledResponse emits the cancelled status and cleans up the
-		// assistant item so the interrupted reply is not in chat history.
+		// sendCancelledResponse records the cancelled outcome (triggerResponse
+		// emits the single terminal) and cleans up the partial assistant item so
+		// the interrupted reply is not in chat history.
 		sendCancelledResponse := func() {
 			removeItemFromConv(item.Assistant.ID)
-			sendEvent(t, types.ResponseDoneEvent{
-				ServerEventBase: types.ServerEventBase{},
-				Response: types.Response{
-					ID:     responseID,
-					Object: "realtime.response",
-					Status: types.ResponseStatusCancelled,
-				},
-			})
+			r.outcome = outcomeCancelled
 		}

 		var audioString string
@@ -2163,6 +2406,7 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
 				}
 				xlog.Error("TTS failed", "error", err)
 				sendError(t, "tts_error", fmt.Sprintf("TTS generation failed: %v", err), "", item.Assistant.ID)
+				r.outcome = outcomeFailed
 				return
 			}
 			if !isWebRTC {
@@ -2220,12 +2464,13 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
 			OutputIndex:     0,
 			Item:            item,
 		})
+		r.addItem(item)
 	}

-	// Emit the parsed tool calls, the terminal response.done, and (for
-	// server-side assistant tools) the follow-up response. Shared with the
-	// streamed path so both finalize tool calls identically.
-	emitToolCallItems(ctx, session, conv, t, responseID, finalToolCalls, finalSpeech != "", toolTurn)
+	// Emit the parsed tool calls and (for server-side assistant tools) the
+	// follow-up turn. Shared with the streamed path so both finalize tool calls
+	// identically. The single terminal is emitted by triggerResponse.
+	emitToolCallItems(ctx, session, conv, t, r, finalToolCalls, finalSpeech != "", toolTurn)
 }

 // emitToolCallItems emits the realtime function_call items for the parsed tool
@@ -2239,7 +2484,8 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
 //   - All other tools follow the standard OpenAI flow: emit
 //     function_call_arguments.done and wait for the client to send
 //     conversation.item.create back.
-func emitToolCallItems(ctx context.Context, session *Session, conv *Conversation, t Transport, responseID string, toolCalls []functions.FuncCallResults, hasContent bool, toolTurn int) {
+func emitToolCallItems(ctx context.Context, session *Session, conv *Conversation, t Transport, r *liveResponse, toolCalls []functions.FuncCallResults, hasContent bool, toolTurn int) {
+	responseID := r.id
 	xlog.Debug("About to handle tool calls", "finalToolCallsCount", len(toolCalls))
 	executedAssistantTool := false
 	for i, tc := range toolCalls {
@@ -2302,6 +2548,7 @@ func emitToolCallItems(ctx context.Context, session *Session, conv *Conversation
 				OutputIndex:     outputIndex,
 				Item:            fcItem,
 			})
+			r.addItem(fcItem)
 			sendEvent(t, types.ResponseOutputItemAddedEvent{
 				ServerEventBase: types.ServerEventBase{},
 				ResponseID:      responseID,
@@ -2314,6 +2561,7 @@ func emitToolCallItems(ctx context.Context, session *Session, conv *Conversation
 				OutputIndex:     outputIndex,
 				Item:            foItem,
 			})
+			r.addItem(foItem)
 			executedAssistantTool = true
 			continue
 		}
@@ -2343,28 +2591,25 @@ func emitToolCallItems(ctx context.Context, session *Session, conv *Conversation
 			OutputIndex:     outputIndex,
 			Item:            fcItem,
 		})
+		r.addItem(fcItem)
 	}

-	sendEvent(t, types.ResponseDoneEvent{
-		ServerEventBase: types.ServerEventBase{},
-		Response: types.Response{
-			ID:     responseID,
-			Object: "realtime.response",
-			Status: types.ResponseStatusCompleted,
-		},
-	})
+	// No terminal here: triggerResponse emits the single response.done once the
+	// whole turn (including the agentic recursion below) completes.

 	// If we executed any assistant tools inproc, run another response cycle
 	// so the model can speak the result. Mirrors the chat-side agentic loop
 	// but driven server-side rather than by client round-trip. Bounded so a
-	// degenerate "model keeps calling tools" doesn't blow the stack.
+	// degenerate "model keeps calling tools" doesn't blow the stack. The
+	// follow-up turn shares the same liveResponse, so its output accumulates
+	// into the one response.done.
 	if executedAssistantTool {
 		if toolTurn+1 >= maxAssistantToolTurns {
 			xlog.Warn("realtime: assistant tool-turn limit reached, stopping the agentic loop",
 				"limit", maxAssistantToolTurns, "model", session.Model)
 			return
 		}
-		triggerResponseAtTurn(ctx, session, conv, t, nil, toolTurn+1)
+		triggerResponseAtTurn(ctx, session, conv, t, nil, toolTurn+1, r)
 	}
 }

--- a/core/http/endpoints/openai/realtime_compactcoord.go
+++ b/core/http/endpoints/openai/realtime_compactcoord.go
@@ -0,0 +1,79 @@
+package openai
+
+import (
+	"context"
+	"sync"
+
+	"github.com/mudler/LocalAI/core/http/endpoints/openai/compactcoord"
+	"github.com/mudler/xlog"
+)
+
+// compactionSink wires the explicit compaction state machine
+// (compactcoord.Coordinator — machine "M4" in docs/design/realtime-state-machines.md)
+// into a conversation.
+//
+// It replaces the legacy `compacting atomic.Bool` single-flight guard: the
+// coordinator owns whether a compaction is running, so a Trigger while one is
+// already in flight is dropped (single-flight) and the background goroutine
+// always reports Finished — the flag can never stick (invariant #9).
+//
+// run is the summarize+evict work for this conversation (captured at
+// construction); StartCompaction spawns it and reports Finished when it returns.
+// It takes a context derived from the sink's session-scoped ctx, so shutdown()
+// can cancel an in-flight compaction.
+type compactionSink struct {
+	coord  *compactcoord.Coordinator // state machine; this sink is passed to it as the effect performer
+	run    func(ctx context.Context) // compaction work; may be nil, in which case a run does nothing
+	ctx    context.Context           // handed to run; cancelled by shutdown
+	cancel context.CancelFunc        // cancels ctx
+	wg     sync.WaitGroup            // tracks the spawned run goroutine; waited on in shutdown
+}
+
+func newCompactionSink(run func(ctx context.Context)) *compactionSink {
+	ctx, cancel := context.WithCancel(context.Background())
+	sink := &compactionSink{run: run, ctx: ctx, cancel: cancel}
+	sink.coord = compactcoord.New(sink) // the sink performs its own coordinator's effects
+	return sink
+}
+
+// trigger asks the coordinator to start a compaction; a no-op while one is
+// already running or after shutdown. Non-blocking.
+func (s *compactionSink) trigger() {
+	if err := s.coord.Apply(compactcoord.Trigger{}); err != nil {
+		xlog.Error("compactcoord: trigger failed", "error", err) // logged only; the caller gets no error
+	}
+}
+
+// shutdown is called by the connection (M1) parent's teardown: cancel any
+// in-flight compaction, join it, then move the coordinator to Terminated so no
+// compaction can start afterwards. This closes the legacy gap where the
+// fire-and-forget compaction goroutine could outlive the session. Cancelling the
+// context first makes the in-flight summarizer Predict return promptly, so the
+// join is bounded.
+func (s *compactionSink) shutdown() {
+	s.cancel()  // 1. cancel the context the run closure received
+	s.wg.Wait() // 2. join the run goroutine (it applies Finished before wg.Done)
+	if err := s.coord.Apply(compactcoord.Shutdown{}); err != nil {
+		xlog.Error("compactcoord: shutdown apply failed", "error", err)
+	}
+}
+
+// Perform carries out a single coordinator effect. It runs under the coordinator
+// lock, which is fine because StartCompaction merely spawns a goroutine.
+func (s *compactionSink) Perform(e compactcoord.Effect) {
+	if _, isStart := e.(compactcoord.StartCompaction); !isStart {
+		return
+	}
+	s.wg.Add(1)
+	go func() {
+		defer s.wg.Done()
+		defer func() {
+			if err := s.coord.Apply(compactcoord.Finished{}); err != nil {
+				xlog.Error("compactcoord: finished apply failed", "error", err)
+			}
+		}()
+		if work := s.run; work != nil {
+			work(s.ctx)
+		}
+	}()
+}
--- a/core/http/endpoints/openai/realtime_compaction.go
+++ b/core/http/endpoints/openai/realtime_compaction.go
@@ -222,7 +222,7 @@ func prefixMatches(items, snapshot []*types.MessageItemUnion) bool {
 // conv.Lock across the summarizer call: snapshot under lock, summarize unlocked,
 // commit under lock (re-validating the head is unchanged). On any error it
 // leaves the conversation untouched — items are never dropped without a summary.
-func (s *Session) compact(conv *Conversation, model Model) {
+func (s *Session) compact(ctx context.Context, conv *Conversation, model Model) {
 	if model == nil {
 		return
 	}
@@ -241,9 +241,10 @@ func (s *Session) compact(conv *Conversation, model Model) {
 	prior := conv.Memory
 	conv.Lock.Unlock()

-	// Summarize (unlocked).
+	// Summarize (unlocked). The timeout is derived from the caller's ctx so the
+	// connection teardown can cancel an in-flight summary (bounding the join).
 	msgs := buildSummaryMessages(prior, renderItemsTranscript(overflow), s.MaxSummaryTokens)
-	ctx, cancel := context.WithTimeout(context.Background(), compactionTimeout)
+	ctx, cancel := context.WithTimeout(ctx, compactionTimeout)
 	defer cancel()
 	predFunc, err := model.Predict(ctx, msgs, nil, nil, nil, nil, nil, nil, nil, nil, nil)
 	if err != nil {
@@ -298,9 +299,13 @@ func (s *Session) summarizerModel() Model {
 }

 // maybeCompact schedules a background compaction when the live buffer has grown
-// past the trigger and none is already running. Returns immediately.
+// past the trigger and none is already running. Returns immediately. The
+// single-flight guarantee (at most one compaction per conversation) is owned by
+// the compaction coordinator (M4); see realtime_compactcoord.go. The actual
+// summarize+evict work (and the lazy summary_model load) is the conversation's
+// compaction-sink run closure, so it stays off the response path.
 func (s *Session) maybeCompact(conv *Conversation) {
-	if !s.CompactionEnabled {
+	if !s.CompactionEnabled || conv.compaction == nil {
 		return
 	}
 	conv.Lock.Lock()
@@ -309,18 +314,5 @@ func (s *Session) maybeCompact(conv *Conversation) {
 	if !over {
 		return
 	}
-	if !conv.compacting.CompareAndSwap(false, true) {
-		return
-	}
-	go func() {
-		defer conv.compacting.Store(false)
-		// Resolve (and, for a configured summary_model, lazily load) the
-		// summarizer only when a compaction actually runs, off the response
-		// path — so the model load never blocks a user turn.
-		model := s.summarizerModel()
-		if model == nil {
-			return
-		}
-		s.compact(conv, model)
-	}()
+	conv.compaction.trigger()
 }
--- a/core/http/endpoints/openai/realtime_compaction_test.go
+++ b/core/http/endpoints/openai/realtime_compaction_test.go
@@ -1,6 +1,7 @@
 package openai

 import (
+	"context"
 	"errors"

 	. "github.com/onsi/ginkgo/v2"
@@ -198,7 +199,7 @@ var _ = Describe("compact", func() {
 		s := &Session{CompactionEnabled: true, CompactionTrigger: 7, MaxHistoryItems: 4, MaxSummaryTokens: 512}
 		m := &fakeModel{predictResp: backend.LLMResponse{Response: "ROLLED UP"}}

-		s.compact(conv, m)
+		s.compact(context.Background(), conv, m)

 		Expect(conv.Memory).To(Equal("ROLLED UP"))
 		Expect(len(conv.Items)).To(Equal(4))
@@ -213,7 +214,7 @@ var _ = Describe("compact", func() {
 		s := &Session{CompactionEnabled: true, CompactionTrigger: 2, MaxHistoryItems: 1, MaxSummaryTokens: 512}
 		m := &fakeModel{predictErr: errors.New("boom")}

-		s.compact(conv, m)
+		s.compact(context.Background(), conv, m)

 		Expect(conv.Memory).To(Equal(""))
 		Expect(len(conv.Items)).To(Equal(3))
@@ -227,7 +228,7 @@ var _ = Describe("compact", func() {
 		s := &Session{CompactionEnabled: true, CompactionTrigger: 7, MaxHistoryItems: 4, MaxSummaryTokens: 512}
 		m := &fakeModel{predictResp: backend.LLMResponse{Response: "<think>planning the summary</think>CLEAN SUMMARY"}}

-		s.compact(conv, m)
+		s.compact(context.Background(), conv, m)

 		Expect(conv.Memory).To(Equal("CLEAN SUMMARY"))
 		Expect(conv.Memory).ToNot(ContainSubstring("planning"))
@@ -236,7 +237,7 @@ var _ = Describe("compact", func() {
 	It("does nothing when items are at or below the trigger", func() {
 		conv := &Conversation{Items: []*types.MessageItemUnion{user("1", "a")}}
 		s := &Session{CompactionEnabled: true, CompactionTrigger: 7, MaxHistoryItems: 4}
-		s.compact(conv, &fakeModel{predictResp: backend.LLMResponse{Response: "x"}})
+		s.compact(context.Background(), conv, &fakeModel{predictResp: backend.LLMResponse{Response: "x"}})
 		Expect(conv.Memory).To(Equal(""))
 		Expect(len(conv.Items)).To(Equal(1))
 	})
--- a/core/http/endpoints/openai/realtime_conncoord.go
+++ b/core/http/endpoints/openai/realtime_conncoord.go
@@ -0,0 +1,122 @@
+package openai
+
+import (
+	"sync"
+
+	"github.com/mudler/LocalAI/core/http/endpoints/openai/conncoord"
+	"github.com/mudler/xlog"
+)
+
+// connSink wires the explicit connection-lifecycle state machine
+// (conncoord.Coordinator — machine "M1" in docs/design/realtime-state-machines.md)
+// into the realtime session handler.
+//
+// It replaces the legacy vadServerStarted bool + the `done` channel that was
+// reassigned on every turn-detection toggle and closed from two sites (Part 2,
+// failure mode 6). The coordinator owns whether the VAD goroutine is running, so
+// the per-run done channel is created and closed in lockstep with that one state
+// — closed exactly once, never resurrected after teardown.
+//
+// The connection machine is driven by the single session goroutine (the handler
+// loop and its teardown), so this sink and its coordinator are loop-local; the
+// Coordinator's lock only keeps State() race-free.
+//
+// Effects:
+//   - StartVAD: create a fresh done channel and spawn handleVAD on it (joined via wg).
+//   - StopVAD:  close that done channel.
+//   - Teardown: stop the remaining input goroutines (opus decode, sound window),
+//     join everything, cancel in-flight responses, and remove the session — once.
+type connSink struct {
+	session   *Session        // supplies the default conversation, respSink and per-conversation compaction sinks
+	sessionID string          // key deleted from the sessions map on Teardown
+	transport Transport       // passed to handleVAD
+	wg        *sync.WaitGroup // caller-owned; the VAD goroutine is started on it and Teardown waits on it
+
+	coord *conncoord.Coordinator // receives SetVAD / Close and calls back into Perform
+
+	// vadDone is the current VAD run's stop signal — recreated on each StartVAD,
+	// closed by StopVAD / Teardown. Owned solely by Perform (single goroutine).
+	vadDone chan struct{}
+
+	// One-shot stop signals for the other input goroutines, registered by the
+	// handler when it starts them; closed once by Teardown.
+	decodeDone      chan struct{}
+	soundWindowDone chan struct{}
+}
+
+func newConnSink(session *Session, sessionID string, t Transport, wg *sync.WaitGroup) *connSink {
+	sink := new(connSink)
+	sink.session = session
+	sink.sessionID = sessionID
+	sink.transport = t
+	sink.wg = wg
+	// The sink performs the effects of its own coordinator.
+	sink.coord = conncoord.New(sink)
+	return sink
+}
+
+// setVAD requests the turn-detection goroutine match active. Idempotent.
+func (s *connSink) setVAD(active bool) {
+	if err := s.coord.Apply(conncoord.SetVAD{Active: active}); err != nil {
+		xlog.Error("conncoord: setVAD failed", "error", err) // logged only; the caller gets no error
+	}
+}
+
+// close tears the session down (once). Safe to call from multiple exit paths.
+func (s *connSink) close() {
+	if err := s.coord.Apply(conncoord.Close{}); err != nil {
+		xlog.Error("conncoord: close failed", "error", err) // logged only; the caller gets no error
+	}
+}
+
+// Perform executes one effect. Called by Coordinator.Apply under the coordinator
+// lock; the connection coordinator is single-writer and torn down exactly once at
+// the end of the session goroutine, so the blocking joins in Teardown never
+// contend the lock.
+func (s *connSink) Perform(e conncoord.Effect) {
+	switch e.(type) {
+	case conncoord.StartVAD:
+		xlog.Debug("Starting VAD goroutine...")
+		s.vadDone = make(chan struct{})
+		done := s.vadDone // captured so the goroutine keeps this run's channel after the field is reset
+		s.wg.Go(func() {
+			conversation := s.session.Conversations[s.session.DefaultConversationID]
+			handleVAD(s.session, conversation, s.transport, done)
+		})
+	case conncoord.StopVAD:
+		xlog.Debug("Stopping VAD goroutine...")
+		close(s.vadDone) // NOTE(review): panics if vadDone is nil — relies on StopVAD only ever following StartVAD; confirm
+		s.vadDone = nil
+	case conncoord.Teardown:
+		// Tear down in dependency order, driving every child machine to its
+		// terminal state so none outlives the session (the hierarchy invariant in
+		// formal-verification/session_lifecycle.fizz: conn Torn => children terminal).
+		//
+		// 1. Stop the remaining input goroutines and join them (this joins the VAD
+		//    goroutine, M2, via the StopVAD above + wg).
+		if s.decodeDone != nil {
+			close(s.decodeDone)
+		}
+		if s.soundWindowDone != nil {
+			close(s.soundWindowDone)
+		}
+		s.wg.Wait()
+
+		// 2. Terminate the response coordinator (M3): cancel the in-flight response
+		//    and join all response goroutines (which also closes their TTS
+		//    pipelines, M5). After this no response can start.
+		s.session.respSink.shutdown()
+
+		// 3. Terminate every conversation's compaction coordinator (M4): cancel +
+		//    join any in-flight summarize+evict so it cannot outlive the session.
+		for _, conv := range s.session.Conversations {
+			if conv.compaction != nil {
+				conv.compaction.shutdown()
+			}
+		}
+
+		sessionLock.Lock()
+		delete(sessions, s.sessionID) // finally drop the session from the registry map
+		sessionLock.Unlock()
+	}
+}
--- a/core/http/endpoints/openai/realtime_doubles_test.go
+++ b/core/http/endpoints/openai/realtime_doubles_test.go
@@ -74,6 +74,16 @@ type fakeModel struct {

 	transcribeDeltas []string
 	transcribeFinal  *schema.TranscriptionResult
+	transcribeErr    error
+
+	// TranscribeLive scripting: liveErr makes the open fail (degrade path);
+	// liveEvents are delivered to onEvent synchronously at open;
+	// liveCloseEvents are delivered during Close (the finalize flush).
+	liveErr         error
+	liveEvents      []backend.LiveTranscriptionEvent
+	liveCloseEvents []backend.LiveTranscriptionEvent
+	liveOpened      int
+	liveSession     *fakeLiveSession

 	// soundDetectionResult/soundDetectionErr drive the SoundDetection double so
 	// the sound-event path can be exercised deterministically.
@@ -97,7 +107,7 @@ func (m *fakeModel) VAD(context.Context, *schema.VADRequest) (*schema.VADRespons
 }

 func (m *fakeModel) Transcribe(context.Context, string, string, bool, bool, string) (*schema.TranscriptionResult, error) {
-	return m.transcribeFinal, nil
+	return m.transcribeFinal, m.transcribeErr // transcribeErr scripts a failing decode; nil unless a test sets it
 }

 func (m *fakeModel) SoundDetection(context.Context, string, int, float32) (*schema.SoundClassificationResult, error) {
@@ -150,4 +160,43 @@ func (m *fakeModel) TranscribeStream(_ context.Context, _, _ string, _, _ bool,
 	return m.transcribeFinal, nil
 }

+func (m *fakeModel) TranscribeLive(_ context.Context, _ string, onEvent func(backend.LiveTranscriptionEvent)) (backend.LiveTranscriptionSession, error) {
+	if m.liveErr != nil {
+		return nil, m.liveErr // scripted open failure; liveOpened is not incremented
+	}
+	m.liveOpened++
+	for _, ev := range m.liveEvents {
+		onEvent(ev) // delivered synchronously, before the session is returned
+	}
+	m.liveSession = &fakeLiveSession{onEvent: onEvent, closeEvents: m.liveCloseEvents} // kept on the double (presumably for test inspection)
+	return m.liveSession, nil
+}
+
 func (m *fakeModel) PredictConfig() *config.ModelConfig { return m.cfg }
+
+// fakeLiveSession records what semantic_vad fed and closed; closeEvents are
+// replayed through onEvent during Close, mimicking the backend's finalize
+// flush (trailing delta + Final) landing before Close returns.
+type fakeLiveSession struct {
+	onEvent     func(backend.LiveTranscriptionEvent)
+	closeEvents []backend.LiveTranscriptionEvent
+	fed         [][]float32
+	feedErr     error
+	closed      int
+}
+
+func (s *fakeLiveSession) Feed(pcm []float32) error {
+	if s.feedErr != nil {
+		return s.feedErr // scripted failure: the chunk is not recorded
+	}
+	s.fed = append(s.fed, append([]float32(nil), pcm...)) // store a copy so later writes to pcm cannot alter the record
+	return nil
+}
+
+func (s *fakeLiveSession) Close() error {
+	s.closed++ // counts Close calls; closeEvents replay on every call
+	for _, ev := range s.closeEvents {
+		s.onEvent(ev)
+	}
+	return nil
+}
--- a/core/http/endpoints/openai/realtime_model.go
+++ b/core/http/endpoints/openai/realtime_model.go
@@ -102,6 +102,10 @@ func (m *transcriptOnlyModel) TranscribeStream(ctx context.Context, audio, langu
 	return transcribeStream(ctx, m.modelLoader, *m.TranscriptionConfig, m.appConfig, audio, language, translate, diarize, prompt, onDelta)
 }

+func (m *transcriptOnlyModel) TranscribeLive(ctx context.Context, language string, onEvent func(backend.LiveTranscriptionEvent)) (backend.LiveTranscriptionSession, error) {
+	return backend.ModelTranscriptionLive(ctx, language, m.modelLoader, *m.TranscriptionConfig, m.appConfig, onEvent) // NOTE(review): dereferences m.TranscriptionConfig unchecked, like TranscribeStream — confirm it is always set
+}
+
 func (m *transcriptOnlyModel) PredictConfig() *config.ModelConfig {
 	return nil
 }
@@ -348,6 +352,10 @@ func (m *wrappedModel) TranscribeStream(ctx context.Context, audio, language str
 	return transcribeStream(ctx, m.modelLoader, *m.TranscriptionConfig, m.appConfig, audio, language, translate, diarize, prompt, onDelta)
 }

+func (m *wrappedModel) TranscribeLive(ctx context.Context, language string, onEvent func(backend.LiveTranscriptionEvent)) (backend.LiveTranscriptionSession, error) {
+	return backend.ModelTranscriptionLive(ctx, language, m.modelLoader, *m.TranscriptionConfig, m.appConfig, onEvent) // NOTE(review): dereferences m.TranscriptionConfig unchecked, like TranscribeStream — confirm it is always set
+}
+
 func (m *wrappedModel) PredictConfig() *config.ModelConfig {
 	return m.LLMConfig
 }
--- a/core/http/endpoints/openai/realtime_respcoord.go
+++ b/core/http/endpoints/openai/realtime_respcoord.go
@@ -0,0 +1,143 @@
+package openai
+
+import (
+	"context"
+	"sync"
+	"sync/atomic"
+
+	"github.com/mudler/LocalAI/core/http/endpoints/openai/respcoord"
+	"github.com/mudler/xlog"
+)
+
+// responseSink wires the explicit response-coordination state machine
+// (respcoord.Coordinator — machine "M3" in docs/design/realtime-state-machines.md)
+// into a realtime session.
+//
+// It replaces the legacy startResponse/cancelActiveResponse pair, whose
+// activeResponse* fields were written from two goroutines (the client read-loop
+// and the VAD goroutine) with the <-done wait performed outside the lock — the
+// dual-writer race documented in Part 2 (failure mode 2). The coordinator
+// serializes every start/cancel/finish decision behind one lock and guarantees
+// at most one live response, so the two callers can no longer interleave into
+// two overlapping responses.
+//
+// Each response runs as a goroutine spawned here. The effects map as:
+//   - StartResponse:  spawn the registered body with a fresh cancelable context.
+//   - CancelResponse: cancel that context (cooperative — the body stops at its
+//     next ctx checkpoint and emits its own response.done{cancelled}).
+//   - EmitTerminal:   currently a no-op. response.done is still emitted by the
+//     response body itself; making this the single authoritative terminal (one
+//     response.done per response.create, with Output+Usage populated) is the
+//     next step and does not change the coordination guarantees here.
+type responseSink struct {
+	// Field roles, as used by the methods in this file:
+	//   mu      - guards cancels and bodies (never held across coord.Apply).
+	//   coord   - the coordinator; created in newResponseSink with this sink
+	//             as its effect performer.
+	//   cancels - cancel func of each started response, keyed by id; the entry
+	//             is removed when the response goroutine exits.
+	//   bodies  - bodies registered by issue and consumed by the matching
+	//             StartResponse effect in Perform.
+	//   seq     - counter from which issue allocates response ids.
+	//   wg      - tracks every response goroutine so wait/shutdown can join.
+	mu      sync.Mutex
+	coord   *respcoord.Coordinator
+	cancels map[respcoord.ResponseID]context.CancelFunc
+	bodies  map[respcoord.ResponseID]responseBody
+	seq     atomic.Uint64
+	wg      sync.WaitGroup
+}
+
+// responseBody is the work registered for one response: the function to run
+// and the context its per-response cancelable context is derived from.
+type responseBody struct {
+	// parent may be nil, in which case Perform substitutes context.Background().
+	// run may be nil, in which case the response goroutine does nothing and
+	// reports Finished immediately.
+	parent context.Context
+	run    func(ctx context.Context)
+}
+
+// newResponseSink builds a sink with empty cancel/body registries and a fresh
+// coordinator that uses the sink itself as its effect performer.
+func newResponseSink() *responseSink {
+	sink := new(responseSink)
+	sink.cancels = make(map[respcoord.ResponseID]context.CancelFunc)
+	sink.bodies = make(map[respcoord.ResponseID]responseBody)
+	sink.coord = respcoord.New(sink)
+	return sink
+}
+
+// issue registers a response body and asks the coordinator to start it. Any
+// in-flight response is superseded (cancelled, with its own terminal) first,
+// atomically inside the coordinator — no caller-side locking, no dual-writer
+// race. Non-blocking: the superseded response drains concurrently and its later
+// Finished is ignored as stale.
+//
+// parent is the context the response's cancelable context is derived from (nil
+// is treated as context.Background() by Perform); source identifies the caller
+// to the coordinator; run is the response body.
+//
+// If the coordinator rejects the Start, the body registered for this id is
+// removed again so a rejected response does not stay in s.bodies (and keep its
+// closure and parent context alive) for the lifetime of the sink. The delete is
+// a no-op when Perform already consumed the entry.
+func (s *responseSink) issue(parent context.Context, source respcoord.Source, run func(ctx context.Context)) {
+	id := respcoord.ResponseID(s.seq.Add(1))
+	s.mu.Lock()
+	s.bodies[id] = responseBody{parent: parent, run: run}
+	s.mu.Unlock()
+	if err := s.coord.Apply(respcoord.Start{ID: id, Source: source}); err != nil {
+		// Apply has returned, so the coordinator lock is no longer held and
+		// taking s.mu here cannot form a lock cycle.
+		s.mu.Lock()
+		delete(s.bodies, id)
+		s.mu.Unlock()
+		xlog.Error("respcoord: start failed", "error", err)
+	}
+}
+
+// cancel asks the coordinator to cancel whatever response is in flight, if
+// any, attributing the request to source. It does not wait for the response to
+// stop (barge-in must not stall the VAD tick); a coordinator error is logged
+// and otherwise ignored.
+func (s *responseSink) cancel(source respcoord.Source) {
+	err := s.coord.Apply(respcoord.Cancel{Source: source})
+	if err == nil {
+		return
+	}
+	xlog.Error("respcoord: cancel failed", "error", err)
+}
+
+// wait blocks until every response goroutine started through s.wg (the active
+// one plus any draining superseded ones) has exited. It cancels nothing
+// itself; shutdown is the variant that terminates the coordinator first. Used
+// at teardown so the session is never deleted out from under a running
+// response.
+func (s *responseSink) wait() {
+	s.wg.Wait()
+}
+
+// shutdown applies respcoord.Shutdown to the coordinator (cancelling any
+// in-flight response) and then joins all response goroutines via wait. After
+// this the coordinator is in its absorbing Terminated state, so no further
+// response can be issued — the connection (M1) parent's teardown uses this to
+// guarantee no response outlives the session (see
+// formal-verification/session_lifecycle.fizz). A coordinator error is logged;
+// the join still happens.
+func (s *responseSink) shutdown() {
+	err := s.coord.Apply(respcoord.Shutdown{})
+	if err != nil {
+		xlog.Error("respcoord: shutdown failed", "error", err)
+	}
+	s.wait()
+}
+
+// Perform executes one effect. It is called by Coordinator.Apply while the
+// coordinator lock is held, so it must not block. It briefly takes s.mu but
+// never acquires the coordinator lock while holding s.mu; the spawned
+// goroutine's Finished apply takes the coordinator lock only AFTER releasing
+// s.mu, so there is no lock cycle.
+//
+// Effects:
+//   - StartResponse:  consume the body registered for the id, derive a
+//     cancelable context from its parent (context.Background() when nil) and
+//     run the body on a goroutine tracked by s.wg.
+//   - CancelResponse: call the stored cancel func for the id, if still present.
+//   - EmitTerminal:   no-op (see the case below).
+func (s *responseSink) Perform(e respcoord.Effect) {
+	switch eff := e.(type) {
+	case respcoord.StartResponse:
+		s.mu.Lock()
+		body := s.bodies[eff.ID]
+		delete(s.bodies, eff.ID)
+		parent := body.parent
+		if parent == nil {
+			parent = context.Background()
+		}
+		ctx, cancel := context.WithCancel(parent)
+		s.cancels[eff.ID] = cancel
+		s.mu.Unlock()
+
+		s.wg.Go(func() {
+			defer func() {
+				s.mu.Lock()
+				delete(s.cancels, eff.ID)
+				s.mu.Unlock()
+				// Always release the derived context. Without this a response
+				// that finishes normally never has its CancelFunc called, so
+				// the child context stays registered with the (session-long)
+				// parent until the parent itself is cancelled. Calling it
+				// after an earlier CancelResponse is a harmless no-op.
+				cancel()
+				// Report completion. If this response was superseded/cancelled
+				// the id is stale and the coordinator ignores it (so the
+				// terminal is never emitted twice).
+				if err := s.coord.Apply(respcoord.Finished{ID: eff.ID}); err != nil {
+					xlog.Error("respcoord: finished apply failed", "error", err)
+				}
+			}()
+			if body.run != nil {
+				body.run(ctx)
+			}
+		})
+	case respcoord.CancelResponse:
+		// Read the cancel func under the lock but invoke it outside, so the
+		// cancellation callbacks never run while s.mu is held.
+		s.mu.Lock()
+		cancel := s.cancels[eff.ID]
+		s.mu.Unlock()
+		if cancel != nil {
+			cancel()
+		}
+	case respcoord.EmitTerminal:
+		// No-op for now: the response body still emits its own response.done.
+		// Wiring the authoritative single terminal here is the next step.
+	}
+}
--- a/core/http/endpoints/openai/realtime_semantic_vad.go
+++ b/core/http/endpoints/openai/realtime_semantic_vad.go
@@ -0,0 +1,350 @@
+package openai
+
+import (
+	"context"
+	"strings"
+
+	"github.com/mudler/LocalAI/core/backend"
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
+	"github.com/mudler/LocalAI/core/schema"
+	"github.com/mudler/xlog"
+)
+
+// Semantic (EOU-driven) turn detection.
+//
+// With turn_detection.type == "semantic_vad", the transcription model is fed
+// the microphone audio live while the user speaks and its end-of-utterance
+// token turns the silence window dynamic: an immediate commit once the
+// token fires (the model judged the user finished and expects a reply), the
+// much longer eagerness fallback when it does not (mid-thought pause). The
+// silero VAD stays in charge of speech_started/barge-in and the actual
+// silence measurement, so a spurious EOU mid-speech cannot cut the user off
+// — the commit still requires real silence.
+
+const (
+	// semanticEouSilenceSec is the extra silence required to commit once the
+	// end-of-utterance token has fired. Zero: the token already trails the
+	// audio by the encoder chunk schedule plus a VAD tick (~0.3-0.9s), and
+	// the commit check only runs after silero closes the speech segment —
+	// which itself takes real silence — so any window on top is pure added
+	// response delay. It is the value thresholdSec returns while an EOU is
+	// pending.
+	semanticEouSilenceSec = 0.0
+
+	// liveEventsBuffer sizes the recv-callback → VAD-tick handoff channel
+	// allocated in newLiveTurnState.
+	// Events arrive at a few per second and the ticker drains every 300ms;
+	// a full channel means the loop is wedged, and dropping (with a warning)
+	// beats blocking the backend's recv goroutine.
+	liveEventsBuffer = 64
+)
+
+// eagernessMaxSilenceSec translates an OpenAI semantic_vad eagerness setting
+// into the fallback silence window (in seconds) applied when no
+// end-of-utterance token was seen. Matching ignores case and surrounding
+// whitespace: "low" waits 8s, "high" waits 2s, and everything else — "medium",
+// "auto", the empty string or an unrecognised value — waits 4s. These are the
+// same 8s/4s/2s max timeouts OpenAI documents.
+func eagernessMaxSilenceSec(eagerness string) float64 {
+	normalized := strings.ToLower(strings.TrimSpace(eagerness))
+	if normalized == "low" {
+		return 8
+	}
+	if normalized == "high" {
+		return 2
+	}
+	// "medium", "auto", "" and anything unknown.
+	return 4
+}
+
+// liveUtterance is one committed turn's transcript as produced by the live
+// stream. Its delta events were already streamed to the client as they
+// arrived (keyed by the turn's item id), so only the final text travels here.
+type liveUtterance struct {
+	// Text is the whitespace-trimmed transcript; finishTurn fills it from the
+	// Final event's text, falling back to the joined deltas when that is empty.
+	Text string
+}
+
+// liveTurnState is handleVAD's per-session live-ASR companion for
+// semantic_vad. One live stream is opened per user turn (begun when the VAD
+// first reports speech, finalized at commit) — the underlying decode session
+// grows with fed audio, so per-turn streams keep it bounded. All fields are
+// owned by the handleVAD goroutine; the backend's recv callback only writes
+// into the buffered events channel.
+type liveTurnState struct {
+	// session supplies the model (ModelInterface.TranscribeLive) and the
+	// optional transcription language read in openTurn. events is the buffered
+	// channel (capacity liveEventsBuffer) that the backend callback writes to
+	// and drainEvents empties.
+	session   *Session
+	transport Transport // live caption deltas are sent here as they drain
+	events    chan backend.LiveTranscriptionEvent
+
+	live        backend.LiveTranscriptionSession // nil between turns
+	unavailable bool                             // sticky: backend can't do live ASR, degrade for the session
+
+	fed16k int // 16k samples of the current buffer already fed
+	// eouAtSec is the audio time of the most recent EOU this turn (0 = none).
+	// It is a recorded fact: set when an EOU drains and never toggled off
+	// mid-turn. Whether it still governs the trailing silence is derived
+	// purely by eouPending() from this plus the live VAD segments.
+	eouAtSec   float64
+	parts      []string // deltas accumulated for the current turn
+	finalText  string   // authoritative full-turn text from the Final event
+	itemID     string   // the turn's conversation item id, allocated at openTurn
+	deltasSent bool     // at least one caption delta reached the client this turn
+}
+
+// newLiveTurnState returns an idle live-turn tracker (no stream open) bound to
+// session and transport, with its event handoff channel pre-allocated at
+// liveEventsBuffer capacity.
+func newLiveTurnState(session *Session, transport Transport) *liveTurnState {
+	state := new(liveTurnState)
+	state.session = session
+	state.transport = transport
+	state.events = make(chan backend.LiveTranscriptionEvent, liveEventsBuffer)
+	return state
+}
+
+// open reports whether a live transcription stream is currently attached,
+// i.e. a turn has been opened and not yet finished or discarded.
+func (l *liveTurnState) open() bool {
+	return l.live != nil
+}
+
+// openTurn starts the turn's live stream under the caller-supplied item id. A
+// failure (most commonly the backend's typed "live transcription unsupported"
+// signal) degrades the whole session to silence-only detection — warned once,
+// then sticky.
+//
+// The item id is supplied by the turn coordinator (turncoord) rather than minted
+// here: it is allocated when the turn STARTS so caption deltas can stream to the
+// client while the user is still speaking, and the committed event and final
+// transcript reuse it (replacing the partial text). The coordinator carries the
+// same id on its CommitTurn/DiscardTurn effects, so the committed event always
+// matches the captions.
+func (l *liveTurnState) openTurn(ctx context.Context, itemID string) bool {
+	if l.live != nil {
+		return true
+	}
+	if l.unavailable {
+		return false
+	}
+	language := ""
+	if l.session.InputAudioTranscription != nil {
+		language = l.session.InputAudioTranscription.Language
+	}
+	live, err := l.session.ModelInterface.TranscribeLive(ctx, language, func(ev backend.LiveTranscriptionEvent) {
+		select {
+		case l.events <- ev:
+		default:
+			xlog.Warn("semantic_vad: live transcription event dropped (event channel full)")
+		}
+	})
+	if err != nil {
+		l.unavailable = true
+		xlog.Warn("semantic_vad: live transcription unavailable; degrading to silence-only turn detection",
+			"error", err)
+		return false
+	}
+	l.resetTurn()
+	l.live = live
+	l.itemID = itemID
+	return true
+}
+
+// feedNewAudio sends the part of the resampled 16k buffer that has not been
+// fed yet to the live stream; it does nothing when no stream is open. The
+// final sample is held back: ResampleInt16 is prefix-stable except for its
+// last output sample, so excluding it keeps successive whole-buffer resamples
+// bit-identical over the fed range.
+//
+// A feed error discards the turn and marks live transcription unavailable for
+// the rest of the session.
+func (l *liveTurnState) feedNewAudio(aints16k []int16) {
+	if l.live == nil {
+		return
+	}
+	limit := len(aints16k) - 1
+	if limit <= l.fed16k {
+		// Nothing new beyond the held-back sample.
+		return
+	}
+	pending := int16sToFloat32(aints16k[l.fed16k:limit])
+	err := l.live.Feed(pending)
+	if err == nil {
+		l.fed16k = limit
+		return
+	}
+	xlog.Warn("semantic_vad: live feed failed; degrading to silence-only turn detection", "error", err)
+	l.discardTurn()
+	l.unavailable = true
+}
+
+// drainEvents folds everything the live stream produced since the last tick
+// into the turn state. audioSec (the current buffer length in seconds) marks
+// WHEN an EOU was observed, so later VAD segments can distinguish speech
+// that resumed after it.
+func (l *liveTurnState) drainEvents(audioSec float64) {
+	for {
+		select {
+		case ev := <-l.events:
+			if ev.Delta != "" {
+				l.parts = append(l.parts, ev.Delta)
+				// Live captions: forward the delta immediately under the
+				// turn's item id — the browser shows text while the user
+				// is still speaking; the completed event at commit
+				// replaces it with the authoritative transcript.
+				if l.transport != nil && l.itemID != "" {
+					sendEvent(l.transport, types.ConversationItemInputAudioTranscriptionDeltaEvent{
+						ServerEventBase: types.ServerEventBase{EventID: "event_TODO"},
+						ItemID:          l.itemID,
+						ContentIndex:    0,
+						Delta:           ev.Delta,
+					})
+					l.deltasSent = true
+				}
+			}
+			if ev.Eou {
+				// Record the position; do not flip a flag. Whether this EOU
+				// still applies to the trailing silence is decided later by
+				// eouPending(), purely from this and the live VAD segments.
+				l.eouAtSec = audioSec
+				xlog.Debug("semantic_vad: EOU token observed", "audio_s", audioSec)
+			}
+			if ev.Eob {
+				// A backchannel ended ("uh-huh") — the user is still
+				// listening, not yielding the turn. Deliberately NOT a
+				// commit trigger.
+				xlog.Debug("semantic_vad: EOB (backchannel) observed", "audio_s", audioSec)
+			}
+			if ev.Final != nil && strings.TrimSpace(ev.Final.Text) != "" {
+				l.finalText = ev.Final.Text
+			}
+		default:
+			return
+		}
+	}
+}
+
+// eouPending reports whether the recorded EOU still applies to the current
+// trailing silence. It is derived only from the recorded EOU position and the
+// VAD segments passed in — no stored boolean exists that could drift out of
+// sync. It is false when no EOU was recorded (eouAtSec == 0) or when there are
+// no segments.
+//
+// An EOU stops applying only once the user has STARTED a new utterance after
+// it (the last segment's start is past the EOU): that is genuine resumed
+// speech, so the earlier yield no longer holds. An in-progress segment whose
+// speech began BEFORE the EOU is NOT resumed speech — it is just silero still
+// padding before it closes the segment, which is the normal state at the
+// instant the (predictive) EOU fires. Treating that as resumed speech was the
+// bug that cleared the flag on the very tick the token arrived, dropping
+// almost every EOU to the eagerness timeout.
+func (l *liveTurnState) eouPending(segments []schema.VADSegment) bool {
+	count := len(segments)
+	if count == 0 || l.eouAtSec == 0 {
+		return false
+	}
+	lastStart := float64(segments[count-1].Start)
+	return lastStart <= l.eouAtSec
+}
+
+// thresholdSec is the dynamic commit threshold, in seconds of silence: zero
+// (semanticEouSilenceSec) once the model said the utterance is over — any
+// VAD-confirmed silence commits — and the eagerness fallback otherwise.
+//
+// sv may be nil (for example if the session's turn detection was swapped while
+// the loop was mid-tick); the fallback then uses the default eagerness window
+// instead of dereferencing a nil pointer and panicking the VAD goroutine.
+func (l *liveTurnState) thresholdSec(eouPending bool, sv *types.RealtimeSessionSemanticVad) float64 {
+	if eouPending {
+		return semanticEouSilenceSec
+	}
+	if sv == nil {
+		// Empty eagerness maps to the medium/auto default.
+		return eagernessMaxSilenceSec("")
+	}
+	return eagernessMaxSilenceSec(sv.Eagerness)
+}
+
+// commitTrigger labels how a commit decision was reached, for the per-turn
+// timing log. With a pending EOU it returns "eou" and the token's lag behind
+// the VAD's speech end (eouAtSec - speechEndSec); otherwise it returns
+// "timeout" and a zero lag, meaning the eagerness fallback elapsed. The lag is
+// the number the user needs to tell a slow EOU emission apart from loop
+// overhead.
+func (l *liveTurnState) commitTrigger(eouPending bool, speechEndSec float64) (trigger string, eouLagSec float64) {
+	if eouPending {
+		return "eou", l.eouAtSec - speechEndSec
+	}
+	return "timeout", 0
+}
+
+// finishTurn closes the live stream (flushing the decode tail — the last ~2
+// encoder frames of text only appear here), drains the terminal events it
+// produced, resets the per-turn state and returns the turn's transcript.
+//
+// The transcript is the trimmed Final text when present, otherwise the joined
+// deltas. It returns nil when no stream is open, or when the stream never
+// produced text (the VAD triggered on something the model heard nothing in).
+// A Close error is logged and the turn is still finalized.
+func (l *liveTurnState) finishTurn(audioSec float64) *liveUtterance {
+	stream := l.live
+	if stream == nil {
+		return nil
+	}
+	if err := stream.Close(); err != nil {
+		xlog.Warn("semantic_vad: live transcription finalize failed", "error", err)
+	}
+	l.live = nil
+	l.drainEvents(audioSec)
+
+	// Resolve the text before resetTurn wipes finalText and parts.
+	transcript := strings.TrimSpace(l.finalText)
+	if transcript == "" {
+		transcript = l.previewText()
+	}
+	l.resetTurn()
+
+	if transcript == "" {
+		return nil
+	}
+	return &liveUtterance{Text: transcript}
+}
+
+// discardTurn abandons the current turn (no-speech buffer clear, feed failure,
+// session teardown): any open stream is closed with its error ignored, queued
+// events are drained, and the per-turn state is reset so the transcript is
+// thrown away. If caption deltas were already sent for the turn they are
+// retracted with a transcription-failed event, so the client doesn't keep a
+// stuck partial entry.
+func (l *liveTurnState) discardTurn() {
+	if stream := l.live; stream != nil {
+		_ = stream.Close()
+		l.live = nil
+	}
+	l.drainEvents(0)
+
+	// Evaluated after the drain: deltas flushed by Close count as sent too.
+	needsRetraction := l.deltasSent && l.transport != nil && l.itemID != ""
+	if needsRetraction {
+		retraction := types.ConversationItemInputAudioTranscriptionFailedEvent{
+			ServerEventBase: types.ServerEventBase{EventID: "event_TODO"},
+			ItemID:          l.itemID,
+			ContentIndex:    0,
+			Error: types.Error{
+				Type:    "transcription_discarded",
+				Message: "turn discarded before commit",
+			},
+		}
+		sendEvent(l.transport, retraction)
+	}
+	l.resetTurn()
+}
+
+// resetTurn returns every per-turn field to its zero value: feed position,
+// recorded EOU time, accumulated deltas, final text, item id and the
+// captions-sent flag. It does not touch the stream, the event channel or the
+// sticky unavailable flag.
+func (l *liveTurnState) resetTurn() {
+	l.fed16k, l.eouAtSec = 0, 0
+	l.parts = nil
+	l.finalText, l.itemID = "", ""
+	l.deltasSent = false
+}
+
+// previewText returns the turn's transcript so far: the accumulated deltas
+// concatenated with no separator and trimmed of surrounding whitespace. It
+// serves the retranscribe comparison log and is the fallback when no Final
+// event arrived.
+func (l *liveTurnState) previewText() string {
+	joined := strings.Join(l.parts, "")
+	return strings.TrimSpace(joined)
+}
+
+// int16sToFloat32 converts PCM to the [-1,1] float form the live stream
+// feeds the model (the same scaling runVAD's go-audio conversion applies).
+func int16sToFloat32(samples []int16) []float32 {
+	out := make([]float32, len(samples))
+	for i, s := range samples {
+		out[i] = float32(s) / 32768.0
+	}
+	return out
+}
+
+// turnDetectionActive reports whether the session has any automatic turn
+// detection configured — server VAD or semantic VAD — and so should run the
+// handleVAD loop. A nil union, or one with neither variant set, is inactive.
+func turnDetectionActive(td *types.TurnDetectionUnion) bool {
+	if td == nil {
+		return false
+	}
+	return td.ServerVad != nil || td.SemanticVad != nil
+}
+
+// defaultTurnDetection picks a new session's initial turn detection from the
+// pipeline's server-side default. Pipelines configured for semantic_vad start
+// in semantic mode with the pipeline's eagerness (clients can still override
+// via session.update); a nil config or any other pipeline keeps the historical
+// server_vad defaults. Both variants have CreateResponse enabled.
+func defaultTurnDetection(cfg *config.ModelConfig) *types.TurnDetectionUnion {
+	semantic := cfg != nil && cfg.Pipeline.TurnDetectionSemantic()
+	if !semantic {
+		serverVad := &types.ServerVad{
+			Threshold:         0.5,
+			PrefixPaddingMs:   300,
+			SilenceDurationMs: 500,
+			CreateResponse:    true,
+		}
+		return &types.TurnDetectionUnion{ServerVad: serverVad}
+	}
+
+	semanticVad := &types.RealtimeSessionSemanticVad{
+		CreateResponse: true,
+		Eagerness:      cfg.Pipeline.TurnDetection.Eagerness,
+	}
+	return &types.TurnDetectionUnion{SemanticVad: semanticVad}
+}
--- a/core/http/endpoints/openai/realtime_semantic_vad_test.go
+++ b/core/http/endpoints/openai/realtime_semantic_vad_test.go
@@ -0,0 +1,414 @@
+package openai
+
+import (
+	"context"
+	"errors"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	"github.com/mudler/LocalAI/core/backend"
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
+	"github.com/mudler/LocalAI/core/schema"
+)
+
+// Table-driven spec pinning the eagerness → fallback-silence mapping,
+// including the default taken by "auto", empty and unknown values.
+var _ = Describe("eagernessMaxSilenceSec", func() {
+	DescribeTable("maps eagerness to the no-EOU fallback window",
+		func(eagerness string, want float64) {
+			Expect(eagernessMaxSilenceSec(eagerness)).To(Equal(want))
+		},
+		Entry("low", "low", 8.0),
+		Entry("medium", "medium", 4.0),
+		Entry("high", "high", 2.0),
+		Entry("auto equals medium", "auto", 4.0),
+		Entry("empty equals medium", "", 4.0),
+		Entry("case and space insensitive", " High ", 2.0),
+		Entry("unknown equals medium", "frantic", 4.0),
+	)
+})
+
+// Covers all four inputs of turnDetectionActive: nil, an empty union, and a
+// union carrying each VAD variant.
+var _ = Describe("turnDetectionActive", func() {
+	It("is active for server and semantic VAD, inactive otherwise", func() {
+		Expect(turnDetectionActive(nil)).To(BeFalse())
+		Expect(turnDetectionActive(&types.TurnDetectionUnion{})).To(BeFalse())
+		Expect(turnDetectionActive(&types.TurnDetectionUnion{ServerVad: &types.ServerVad{}})).To(BeTrue())
+		Expect(turnDetectionActive(&types.TurnDetectionUnion{SemanticVad: &types.RealtimeSessionSemanticVad{}})).To(BeTrue())
+	})
+})
+
+// Checks which turn-detection variant a new session is seeded with for a
+// non-semantic pipeline, a semantic_vad pipeline, and a nil config.
+var _ = Describe("defaultTurnDetection", func() {
+	It("keeps the historical server_vad defaults for non-semantic pipelines", func() {
+		td := defaultTurnDetection(&config.ModelConfig{})
+		Expect(td.ServerVad).NotTo(BeNil())
+		Expect(td.SemanticVad).To(BeNil())
+		Expect(td.ServerVad.SilenceDurationMs).To(Equal(int64(500)))
+		Expect(td.ServerVad.CreateResponse).To(BeTrue())
+	})
+
+	It("seeds semantic_vad with the pipeline's eagerness", func() {
+		cfg := &config.ModelConfig{}
+		cfg.Pipeline.TurnDetection.Type = "semantic_vad"
+		cfg.Pipeline.TurnDetection.Eagerness = "high"
+		td := defaultTurnDetection(cfg)
+		Expect(td.SemanticVad).NotTo(BeNil())
+		Expect(td.ServerVad).To(BeNil())
+		Expect(td.SemanticVad.Eagerness).To(Equal("high"))
+		Expect(td.SemanticVad.CreateResponse).To(BeTrue())
+	})
+
+	It("treats a nil config as server_vad", func() {
+		Expect(defaultTurnDetection(nil).ServerVad).NotTo(BeNil())
+	})
+})
+
+// Pins the /32768 scaling at zero, half scale and the most negative sample.
+var _ = Describe("int16sToFloat32", func() {
+	It("scales like the VAD conversion", func() {
+		out := int16sToFloat32([]int16{0, 16384, -32768})
+		Expect(out).To(HaveLen(3))
+		Expect(out[0]).To(BeNumerically("~", 0.0, 1e-6))
+		Expect(out[1]).To(BeNumerically("~", 0.5, 1e-6))
+		Expect(out[2]).To(BeNumerically("~", -1.0, 1e-6))
+	})
+})
+
+// Specs for liveTurnState, the per-session live-ASR companion: opening a turn,
+// incremental feeding, event folding / EOU bookkeeping, finishing, discarding
+// and live caption streaming. Every spec gets a fresh fake model, transport
+// and state from BeforeEach.
+// NOTE(review): fakeModel and fakeTransport are defined outside this chunk;
+// the liveOpened / liveSession / liveErr / liveCloseEvents semantics are
+// presumed from how the specs use them — confirm against their definitions.
+var _ = Describe("liveTurnState", func() {
+	var (
+		m   *fakeModel
+		lts *liveTurnState
+		ftr *fakeTransport
+	)
+
+	// Minimal session: only the fields liveTurnState.openTurn reads.
+	newSemanticSession := func(m *fakeModel) *Session {
+		return &Session{
+			InputAudioTranscription: &types.AudioTranscription{},
+			ModelInterface:          m,
+		}
+	}
+
+	BeforeEach(func() {
+		m = &fakeModel{}
+		ftr = &fakeTransport{}
+		lts = newLiveTurnState(newSemanticSession(m), ftr)
+	})
+
+	Describe("openTurn", func() {
+		It("opens once per turn and reports open()", func() {
+			Expect(lts.open()).To(BeFalse())
+			Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
+			Expect(lts.open()).To(BeTrue())
+			Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue(), "idempotent while open")
+			Expect(m.liveOpened).To(Equal(1))
+		})
+
+		It("degrades stickily when the backend cannot do live transcription", func() {
+			m.liveErr = errors.New("rpc error: code = Unimplemented desc = live transcription unsupported")
+			Expect(lts.openTurn(context.Background(), "item1")).To(BeFalse())
+			Expect(lts.unavailable).To(BeTrue())
+
+			// Later turns never retry: the failure is per-session sticky.
+			m.liveErr = nil
+			Expect(lts.openTurn(context.Background(), "item1")).To(BeFalse())
+			Expect(m.liveOpened).To(Equal(0))
+		})
+	})
+
+	Describe("feedNewAudio", func() {
+		It("feeds only the unfed tail and holds back the final resampled sample", func() {
+			Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
+
+			lts.feedNewAudio([]int16{1, 2, 3, 4})
+			Expect(m.liveSession.fed).To(HaveLen(1))
+			Expect(m.liveSession.fed[0]).To(HaveLen(3), "last sample held back")
+
+			// Same buffer grown by two samples: only the delta is fed.
+			lts.feedNewAudio([]int16{1, 2, 3, 4, 5, 6})
+			Expect(m.liveSession.fed).To(HaveLen(2))
+			Expect(m.liveSession.fed[1]).To(HaveLen(2))
+
+			// No growth past the holdback: nothing fed.
+			lts.feedNewAudio([]int16{1, 2, 3, 4, 5, 6})
+			Expect(m.liveSession.fed).To(HaveLen(2))
+		})
+
+		It("degrades and closes the turn when a feed fails", func() {
+			Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
+			m.liveSession.feedErr = errors.New("backend gone")
+			// Keep a handle on the session to inspect it after the feed.
+			sess := m.liveSession
+
+			lts.feedNewAudio([]int16{1, 2, 3, 4})
+
+			Expect(lts.open()).To(BeFalse())
+			Expect(lts.unavailable).To(BeTrue())
+			Expect(sess.closed).To(Equal(1))
+		})
+	})
+
+	Describe("event handling and the dynamic threshold", func() {
+		sv := &types.RealtimeSessionSemanticVad{Eagerness: "high"}
+
+		It("uses the eagerness fallback until an EOU is recorded, then commits without an extra window", func() {
+			Expect(lts.thresholdSec(false, sv)).To(Equal(2.0))
+			Expect(lts.thresholdSec(true, sv)).To(Equal(semanticEouSilenceSec))
+
+			Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
+			lts.session.ModelInterface.(*fakeModel).liveSession.onEvent(backend.LiveTranscriptionEvent{Delta: "hello ", Eou: false})
+			lts.session.ModelInterface.(*fakeModel).liveSession.onEvent(backend.LiveTranscriptionEvent{Eou: true})
+			lts.drainEvents(3.3)
+
+			Expect(lts.eouAtSec).To(BeNumerically("~", 3.3, 1e-9))
+			Expect(lts.previewText()).To(Equal("hello"))
+		})
+
+		// The bug this replaces: the (predictive) EOU routinely arrives while
+		// silero is still padding the speech segment open. eouPending must NOT
+		// read that as resumed speech.
+		It("keeps the EOU pending while silero is still closing the same segment", func() {
+			lts.eouAtSec = 3.3
+			Expect(lts.eouPending([]schema.VADSegment{{Start: 0, End: 0}})).To(BeTrue(), "segment began before the EOU and is merely unclosed")
+			Expect(lts.eouPending([]schema.VADSegment{{Start: 0, End: 3.0}})).To(BeTrue(), "and still pending once it closes")
+		})
+
+		It("drops the EOU only when a new utterance starts after it (resumed speech)", func() {
+			lts.eouAtSec = 3.3
+			Expect(lts.eouPending([]schema.VADSegment{{Start: 0, End: 3.0}, {Start: 4.0, End: 0}})).To(BeFalse())
+			Expect(lts.eouPending([]schema.VADSegment{{Start: 0, End: 3.0}, {Start: 4.0, End: 5.0}})).To(BeFalse())
+		})
+
+		It("has no pending EOU before one is recorded", func() {
+			Expect(lts.eouPending([]schema.VADSegment{{Start: 0, End: 3.0}})).To(BeFalse())
+			Expect(lts.eouPending(nil)).To(BeFalse())
+		})
+
+		It("does not arm the commit threshold on an EOB backchannel", func() {
+			Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
+			lts.session.ModelInterface.(*fakeModel).liveSession.onEvent(backend.LiveTranscriptionEvent{Delta: "uh-huh", Eob: true})
+			lts.drainEvents(2.0)
+
+			Expect(lts.eouAtSec).To(BeZero(), "a backchannel is not the user yielding the turn")
+			Expect(lts.eouPending([]schema.VADSegment{{Start: 0, End: 1.8}})).To(BeFalse(), "still on the eagerness fallback")
+			Expect(lts.previewText()).To(Equal("uh-huh"), "the backchannel text still lands in the transcript")
+		})
+
+		It("reports the commit trigger and the EOU token's lag behind speech end", func() {
+			trigger, lag := lts.commitTrigger(false, 3.2)
+			Expect(trigger).To(Equal("timeout"))
+			Expect(lag).To(BeZero())
+
+			lts.eouAtSec = 3.5
+			trigger, lag = lts.commitTrigger(true, 3.2)
+			Expect(trigger).To(Equal("eou"))
+			Expect(lag).To(BeNumerically("~", 0.3, 1e-9))
+		})
+	})
+
+	Describe("finishTurn", func() {
+		It("finalizes the stream, prefers the Final text, and resets for the next turn", func() {
+			// Scripted flush: a trailing delta plus the Final event, to be
+			// delivered when the stream is closed.
+			m.liveCloseEvents = []backend.LiveTranscriptionEvent{
+				{Delta: " world"},
+				{Final: &schema.TranscriptionResult{Text: "hello world", Eou: true}},
+			}
+			Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
+			sess := m.liveSession
+			sess.onEvent(backend.LiveTranscriptionEvent{Delta: "hello", Eou: true})
+			lts.drainEvents(2.0)
+
+			ut := lts.finishTurn(2.5)
+
+			Expect(sess.closed).To(Equal(1))
+			Expect(ut).NotTo(BeNil())
+			Expect(ut.Text).To(Equal("hello world"), "Final event text wins over joined deltas")
+			Expect(lts.open()).To(BeFalse())
+			Expect(lts.eouAtSec).To(BeZero())
+			Expect(lts.parts).To(BeEmpty())
+			Expect(lts.fed16k).To(BeZero())
+		})
+
+		It("returns nil when the stream heard nothing", func() {
+			Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
+			Expect(lts.finishTurn(1.0)).To(BeNil())
+			Expect(m.liveSession.closed).To(Equal(1))
+		})
+
+		It("is a no-op without an open stream", func() {
+			Expect(lts.finishTurn(1.0)).To(BeNil())
+		})
+	})
+
+	Describe("discardTurn", func() {
+		It("closes the stream, drops the transcript and retracts streamed captions", func() {
+			Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
+			sess := m.liveSession
+			sess.onEvent(backend.LiveTranscriptionEvent{Delta: "noise"})
+			lts.drainEvents(1.0)
+
+			lts.discardTurn()
+
+			Expect(sess.closed).To(Equal(1))
+			Expect(lts.open()).To(BeFalse())
+			Expect(lts.parts).To(BeEmpty())
+			Expect(ftr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionFailed)).To(Equal(1),
+				"the client saw caption deltas for this turn — it must be told to drop them")
+		})
+
+		It("sends no failed event when no captions ever reached the client", func() {
+			Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
+			lts.discardTurn()
+			Expect(ftr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionFailed)).To(Equal(0))
+		})
+	})
+
+	Describe("live captions", func() {
+		It("streams each delta to the client under the turn's item id as it drains", func() {
+			Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
+			turnID := lts.itemID
+			Expect(turnID).NotTo(BeEmpty(), "the item id exists from turn open so captions can reference it")
+
+			m.liveSession.onEvent(backend.LiveTranscriptionEvent{Delta: "hel"})
+			m.liveSession.onEvent(backend.LiveTranscriptionEvent{Delta: "lo"})
+			lts.drainEvents(1.0)
+
+			// Collect only the caption delta events, in send order.
+			var got []types.ConversationItemInputAudioTranscriptionDeltaEvent
+			for _, e := range ftr.events {
+				if d, ok := e.(types.ConversationItemInputAudioTranscriptionDeltaEvent); ok {
+					got = append(got, d)
+				}
+			}
+			Expect(got).To(HaveLen(2))
+			Expect(got[0].Delta).To(Equal("hel"))
+			Expect(got[1].Delta).To(Equal("lo"))
+			Expect(got[0].ItemID).To(Equal(turnID))
+			Expect(got[1].ItemID).To(Equal(turnID))
+			Expect(lts.deltasSent).To(BeTrue())
+		})
+
+		It("finishTurn does not retract captions — the commit's completed event supersedes them", func() {
+			Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
+			m.liveSession.onEvent(backend.LiveTranscriptionEvent{Delta: "hello"})
+			lts.drainEvents(1.0)
+
+			Expect(lts.finishTurn(1.5)).NotTo(BeNil())
+			Expect(ftr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionFailed)).To(Equal(0))
+		})
+	})
+})
+
+// commitUtteranceWithTranscript routes the three transcript sources: the
+// retranscribe gate's batch decode, the live stream's accumulated text, and
+// the historical file path.
+// NOTE(review): commitUtteranceWithTranscript itself is defined outside this
+// chunk; the argument roles below are inferred from these call sites only.
+var _ = Describe("commitUtteranceWithTranscript", func() {
+	// Builds a transcription-only session; streamTranscription turns on the
+	// pipeline's Streaming.Transcription flag.
+	newTranscriptionOnlySession := func(m *fakeModel, streamTranscription bool) *Session {
+		cfg := &config.ModelConfig{}
+		if streamTranscription {
+			on := true
+			cfg.Pipeline.Streaming.Transcription = &on
+		}
+		return &Session{
+			TranscriptionOnly:       true, // stop after the transcript: no LLM/TTS in these specs
+			InputAudioTranscription: &types.AudioTranscription{},
+			ModelConfig:             cfg,
+			ModelInterface:          m,
+		}
+	}
+
+	It("uses the gate's batch transcript and never re-runs the backend", func() {
+		// transcribeErr is the tripwire: presumably any backend transcription
+		// call would surface this error.
+		m := &fakeModel{transcribeErr: errors.New("must not be called")}
+		session := newTranscriptionOnlySession(m, true)
+		tr := &fakeTransport{}
+
+		commitUtteranceWithTranscript(context.Background(), []byte{1, 2}, nil,
+			&schema.TranscriptionResult{Text: "batch text", Eou: true}, "item_turn", session, &Conversation{}, tr)
+
+		Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionDelta)).To(Equal(0))
+		Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionCompleted)).To(Equal(1))
+	})
+
+	It("emits only the completed event for a live transcript — captions already streamed during the turn", func() {
+		m := &fakeModel{transcribeErr: errors.New("must not be called")}
+		session := newTranscriptionOnlySession(m, true)
+		tr := &fakeTransport{}
+
+		commitUtteranceWithTranscript(context.Background(), []byte{1, 2},
+			&liveUtterance{Text: "hello"}, nil, "item_turn", session, &Conversation{}, tr)
+
+		Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionDelta)).To(Equal(0))
+		Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionCompleted)).To(Equal(1))
+
+		// Pick the completed event out of everything the transport recorded.
+		var completed types.ConversationItemInputAudioTranscriptionCompletedEvent
+		for _, e := range tr.events {
+			if c, ok := e.(types.ConversationItemInputAudioTranscriptionCompletedEvent); ok {
+				completed = c
+			}
+		}
+		Expect(completed.ItemID).To(Equal("item_turn"),
+			"completed must reuse the caption deltas' item id so the client replaces, not duplicates")
+		Expect(completed.Transcript).To(Equal("hello"))
+	})
+
+	It("falls back to the file path when the live stream heard nothing", func() {
+		m := &fakeModel{transcribeFinal: &schema.TranscriptionResult{Text: "from file"}}
+		session := newTranscriptionOnlySession(m, false)
+		tr := &fakeTransport{}
+
+		// An empty live utterance and no batch result: neither precomputed
+		// source has text.
+		commitUtteranceWithTranscript(context.Background(), []byte{1, 2},
+			&liveUtterance{}, nil, "", session, &Conversation{}, tr)
+
+		Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionCompleted)).To(Equal(1))
+	})
+})
+
+// transcribeUtterance is the retranscribe gate's offline decode of the
+// buffered turn. These specs cover the success path (text and Eou flag are
+// returned) and the error path (the backend error reaches the caller).
+var _ = Describe("transcribeUtterance", func() {
+	It("returns the batch decode with its Eou flag", func() {
+		m := &fakeModel{transcribeFinal: &schema.TranscriptionResult{Text: "confirmed", Eou: true}}
+		session := &Session{
+			InputAudioTranscription: &types.AudioTranscription{},
+			ModelInterface:          m,
+		}
+
+		tr, err := transcribeUtterance(context.Background(), []byte{0, 0, 1, 1}, session)
+		Expect(err).ToNot(HaveOccurred())
+		Expect(tr.Text).To(Equal("confirmed"))
+		Expect(tr.Eou).To(BeTrue())
+	})
+
+	It("propagates backend errors", func() {
+		m := &fakeModel{transcribeErr: errors.New("engine fell over")}
+		session := &Session{
+			InputAudioTranscription: &types.AudioTranscription{},
+			ModelInterface:          m,
+		}
+
+		_, err := transcribeUtterance(context.Background(), []byte{0, 0}, session)
+		Expect(err).To(MatchError(ContainSubstring("engine fell over")))
+	})
+})
+
+// emitPrecomputedTranscription replays an already-produced transcript as the
+// standard delta/completed event sequence. The specs check the event counts,
+// that empty deltas are skipped, and that every event carries the given item
+// id.
+var _ = Describe("emitPrecomputedTranscription", func() {
+	It("emits deltas then completed, sharing the item id", func() {
+		tr := &fakeTransport{}
+		Expect(emitPrecomputedTranscription(tr, "item42", []string{"a", "", "b"}, "ab")).To(Succeed())
+
+		Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionDelta)).To(Equal(2), "empty deltas skipped")
+		Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionCompleted)).To(Equal(1))
+		// Every emitted event, whatever its type, must carry the same item id.
+		for _, e := range tr.events {
+			switch ev := e.(type) {
+			case types.ConversationItemInputAudioTranscriptionDeltaEvent:
+				Expect(ev.ItemID).To(Equal("item42"))
+			case types.ConversationItemInputAudioTranscriptionCompletedEvent:
+				Expect(ev.ItemID).To(Equal("item42"))
+				Expect(ev.Transcript).To(Equal("ab"))
+			}
+		}
+	})
+
+	It("emits only the completed event with no deltas", func() {
+		tr := &fakeTransport{}
+		Expect(emitPrecomputedTranscription(tr, "item1", nil, "hi")).To(Succeed())
+		Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionDelta)).To(Equal(0))
+		Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionCompleted)).To(Equal(1))
+	})
+})
--- a/core/http/endpoints/openai/realtime_stream.go
+++ b/core/http/endpoints/openai/realtime_stream.go
@@ -86,7 +86,8 @@ func (s *transcriptStreamer) content() string {
 // tool calls. It returns true when it has fully handled the response so the
 // caller can return; callers must only invoke it for an audio modality, and with
 // tools only when the model uses its tokenizer template (see triggerResponseAtTurn).
-func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation, t Transport, responseID string, history schema.Messages, images []string, llmCfg *config.ModelConfig, tools []types.ToolUnion, toolChoice *types.ToolChoiceUnion, toolTurn int) bool {
+func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation, t Transport, r *liveResponse, history schema.Messages, images []string, llmCfg *config.ModelConfig, tools []types.ToolUnion, toolChoice *types.ToolChoiceUnion, toolTurn int) bool {
+	responseID := r.id
 	itemID := generateItemID()
 	item := types.MessageItemUnion{
 		Assistant: &types.MessageItemAssistant{
@@ -121,6 +122,8 @@ func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation
 		})
 	}

+	// cancel rolls back the partial item and records the cancelled outcome; the
+	// single terminal is emitted by triggerResponse.
 	cancel := func() {
 		if announced {
 			conv.Lock.Lock()
@@ -132,10 +135,7 @@ func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation
 			}
 			conv.Lock.Unlock()
 		}
-		sendEvent(t, types.ResponseDoneEvent{
-			ServerEventBase: types.ServerEventBase{},
-			Response:        types.Response{ID: responseID, Object: "realtime.response", Status: types.ResponseStatusCancelled},
-		})
+		r.outcome = outcomeCancelled
 	}

 	var template string
@@ -161,24 +161,30 @@ func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation
 	streamer.announce = announce

 	// Clause chunking (opt-in): synthesize each clause as soon as it completes
-	// instead of buffering the whole reply. streamedAudio accumulates the PCM
-	// across clauses for the conversation item record; ttsErr captures the first
-	// synthesis failure so the token callback can stop the prediction. emitSpeech
-	// runs synchronously here — the LLM keeps generating into the gRPC stream
-	// while a clause is synthesized, so audio still starts mid-generation.
+	// instead of buffering the whole reply. Synthesis runs on a worker goroutine
+	// (ttsPipeline) rather than inline in the token callback: emitSpeech blocks
+	// until the whole clause is synthesized (and, for WebRTC, played back at
+	// real time), and the callback runs on the goroutine that drains the LLM
+	// gRPC stream — so speaking inline stalls generation and freezes the
+	// assistant transcript at every clause boundary. The worker lets generation
+	// and the transcript stream keep flowing while audio is produced behind them.
 	var chunker *clauseChunker
+	var ttsPipe *ttsPipeline
 	if session.ModelConfig != nil && session.ModelConfig.Pipeline.ChunkClauses() {
 		chunker = newClauseChunker(defaultClauseMinRunes, defaultClauseMaxRunes)
+		ttsPipe = newTTSPipeline(func(clause string) ([]byte, error) {
+			return emitSpeech(ctx, t, session, responseID, itemID, clause)
+		})
 	}
 	var streamedAudio []byte
 	var ttsErr error
-	speakClause := func(clause string) error {
-		a, err := emitSpeech(ctx, t, session, responseID, itemID, clause)
-		if err != nil {
-			return err
-		}
-		streamedAudio = append(streamedAudio, a...)
-		return nil
+
+	// Backstop: always join the TTS worker, even on an unexpected early return.
+	// wait() is idempotent, so the explicit drain below (which captures the
+	// streamed audio and first error) stays authoritative; this only guarantees
+	// the goroutine can never leak if a new return path is added.
+	if ttsPipe != nil {
+		defer func() { _, _ = ttsPipe.wait() }()
 	}

 	// fail reports a mid-stream failure. A cancelled context means the client
@@ -188,6 +194,7 @@ func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation
 			cancel()
 		} else {
 			sendError(t, code, fmt.Sprintf("%s: %v", msg, err), "", itemID)
+			r.outcome = outcomeFailed
 		}
 		return true
 	}
@@ -207,8 +214,12 @@ func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation
 		delta := streamer.onToken(text)
 		if chunker != nil && delta != "" {
 			for _, clause := range chunker.push(delta) {
-				if ttsErr = speakClause(clause); ttsErr != nil {
-					return false // stop the prediction; reported after predFunc returns
+				// Hand the clause to the worker and keep going — never block the
+				// recv loop on synthesis. A false return means a prior clause
+				// already failed; stop the prediction (the error is collected
+				// from the pipeline after predFunc returns).
+				if !ttsPipe.enqueue(clause) {
+					return false
 				}
 			}
 		}
@@ -217,10 +228,27 @@ func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation

 	predFunc, err := session.ModelInterface.Predict(ctx, history, images, nil, nil, cb, tools, toolChoice, nil, nil, nil)
 	if err != nil {
+		r.outcome = outcomeFailed // match fail(): the terminal must not report completed after this error; the deferred wait() joins the (idle) worker
 		sendError(t, "inference_failed", fmt.Sprintf("backend error: %v", err), "", itemID)
 		return true
 	}
 	pred, err := predFunc()
+
+	// Drain the TTS worker. On a clean finish, enqueue the trailing clause(s) the
+	// chunker was still holding; on an error or barge-in, stop synthesizing.
+	// wait() runs on every path so the worker goroutine never leaks, and it
+	// returns the audio streamed so far plus the first synthesis failure.
+	if ttsPipe != nil {
+		if err == nil && ctx.Err() == nil {
+			for _, clause := range chunker.flush() {
+				if !ttsPipe.enqueue(clause) {
+					break
+				}
+			}
+		}
+		streamedAudio, ttsErr = ttsPipe.wait()
+	}
+
 	// A clause synthesis failed mid-stream (the callback stopped the prediction);
 	// report it as a TTS error rather than a prediction error.
 	if ttsErr != nil {
@@ -233,6 +261,7 @@ func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation
 		cancel()
 		return true
 	}
+	r.addUsage(pred.Usage)

 	content := streamer.content()
 	toolCalls := functions.ToolCallsFromChatDeltas(pred.ChatDeltas)
@@ -244,24 +273,19 @@ func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation
 			announce()
 		}

-		// Synthesize the audio. With clause chunking the completed clauses were
-		// already spoken inside the token callback; flush the trailing clause(s)
-		// the segmenter was still holding. Otherwise buffer the whole message and
-		// synthesize it once. emitSpeech streams the audio chunks when the TTS
-		// backend supports TTSStream, otherwise it sends a single unary delta.
+		// With clause chunking the clauses were synthesized on the worker as the
+		// reply streamed (including the trailing flush drained above), so the
+		// audio is already accumulated. Otherwise buffer the whole message and
+		// synthesize it once now — emitSpeech streams the audio chunks when the
+		// TTS backend supports TTSStream, otherwise it sends a single unary delta.
 		var audio []byte
 		if chunker != nil {
-			for _, clause := range chunker.flush() {
-				if ttsErr = speakClause(clause); ttsErr != nil {
-					break
-				}
-			}
 			audio = streamedAudio
 		} else {
 			audio, ttsErr = emitSpeech(ctx, t, session, responseID, itemID, content)
-		}
-		if ttsErr != nil {
-			return fail("tts_error", "TTS generation failed", ttsErr)
+			if ttsErr != nil {
+				return fail("tts_error", "TTS generation failed", ttsErr)
+			}
 		}

 		_, isWebRTC := t.(*WebRTCTransport)
@@ -306,10 +330,12 @@ func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation
 			OutputIndex:     0,
 			Item:            item,
 		})
+		r.addItem(item)
 	}

-	// Emit any tool calls, the terminal response.done, and (for server-side
-	// assistant tools) the follow-up turn — shared with the buffered path.
-	emitToolCallItems(ctx, session, conv, t, responseID, toolCalls, content != "", toolTurn)
+	// Emit any tool calls and (for server-side assistant tools) the follow-up
+	// turn — shared with the buffered path. The single terminal is emitted by
+	// triggerResponse.
+	emitToolCallItems(ctx, session, conv, t, r, toolCalls, content != "", toolTurn)
 	return true
 }
--- a/core/http/endpoints/openai/realtime_stream_test.go
+++ b/core/http/endpoints/openai/realtime_stream_test.go
@@ -102,7 +102,8 @@ var _ = Describe("streamLLMResponse", func() {
 		t := &fakeTransport{}
 		llmCfg := &config.ModelConfig{}

-		handled := streamLLMResponse(context.Background(), session, conv, t, "resp1", nil, nil, llmCfg, nil, nil, 0)
+		r := &liveResponse{id: "resp1"}
+		handled := streamLLMResponse(context.Background(), session, conv, t, r, nil, nil, llmCfg, nil, nil, 0)

 		Expect(handled).To(BeTrue())
 		// One live transcript delta per streamed token.
@@ -132,7 +133,8 @@ var _ = Describe("streamLLMResponse", func() {
 		t := &fakeTransport{}
 		llmCfg := &config.ModelConfig{}

-		handled := streamLLMResponse(context.Background(), session, conv, t, "resp1", nil, nil, llmCfg, nil, nil, 0)
+		r := &liveResponse{id: "resp1"}
+		handled := streamLLMResponse(context.Background(), session, conv, t, r, nil, nil, llmCfg, nil, nil, 0)

 		Expect(handled).To(BeTrue())
 		// Two clauses ("Hello world." mid-stream, "How are you?" on flush) → two
@@ -140,8 +142,10 @@ var _ = Describe("streamLLMResponse", func() {
 		Expect(t.countEvents(types.ServerEventTypeResponseOutputAudioDelta)).To(Equal(2))
 		// The full transcript still streams verbatim.
 		Expect(t.transcriptDeltaText()).To(Equal("Hello world. How are you?"))
-		// Exactly one terminal response.done.
-		Expect(t.countEvents(types.ServerEventTypeResponseDone)).To(Equal(1))
+		// The terminal response.done is emitted by triggerResponse, not by
+		// streamLLMResponse — so at this layer there are none.
+		Expect(t.countEvents(types.ServerEventTypeResponseDone)).To(Equal(0))
+		Expect(r.outcome).To(Equal(outcomeCompleted))
 	})

 	It("streams content deltas and emits tool-call items (autoparser tool turn)", func() {
@@ -169,15 +173,18 @@ var _ = Describe("streamLLMResponse", func() {
 		llmCfg := &config.ModelConfig{}
 		llmCfg.TemplateConfig.UseTokenizerTemplate = true

-		handled := streamLLMResponse(context.Background(), session, conv, t, "resp1", nil, nil, llmCfg, nil, nil, 0)
+		r := &liveResponse{id: "resp1"}
+		handled := streamLLMResponse(context.Background(), session, conv, t, r, nil, nil, llmCfg, nil, nil, 0)

 		Expect(handled).To(BeTrue())
 		// The spoken content was streamed live.
 		Expect(t.transcriptDeltaText()).To(Equal("Let me check."))
 		// The tool call is emitted as a function_call item.
 		Expect(t.countEvents(types.ServerEventTypeResponseFunctionCallArgumentsDone)).To(Equal(1))
-		// Exactly one terminal response.done.
-		Expect(t.countEvents(types.ServerEventTypeResponseDone)).To(Equal(1))
+		// The terminal response.done is emitted by triggerResponse, not by
+		// streamLLMResponse — so at this layer there are none.
+		Expect(t.countEvents(types.ServerEventTypeResponseDone)).To(Equal(0))
+		Expect(r.outcome).To(Equal(outcomeCompleted))
 	})

 	It("emits only tool-call items for a content-less tool turn (no empty assistant item)", func() {
@@ -200,7 +207,8 @@ var _ = Describe("streamLLMResponse", func() {
 		llmCfg := &config.ModelConfig{}
 		llmCfg.TemplateConfig.UseTokenizerTemplate = true

-		handled := streamLLMResponse(context.Background(), session, conv, t, "resp1", nil, nil, llmCfg, nil, nil, 0)
+		r := &liveResponse{id: "resp1"}
+		handled := streamLLMResponse(context.Background(), session, conv, t, r, nil, nil, llmCfg, nil, nil, 0)

 		Expect(handled).To(BeTrue())
 		// No content → no transcript deltas and no spurious assistant content item.
@@ -208,6 +216,51 @@ var _ = Describe("streamLLMResponse", func() {
 		Expect(t.countEvents(types.ServerEventTypeResponseOutputAudioTranscriptDelta)).To(Equal(0))
 		// The tool call is still emitted.
 		Expect(t.countEvents(types.ServerEventTypeResponseFunctionCallArgumentsDone)).To(Equal(1))
-		Expect(t.countEvents(types.ServerEventTypeResponseDone)).To(Equal(1))
+		Expect(t.countEvents(types.ServerEventTypeResponseDone)).To(Equal(0))
+		Expect(r.outcome).To(Equal(outcomeCompleted))
+	})
+})
+
+var _ = Describe("triggerResponse", func() {
+	It("emits exactly one response.created and one response.done with output and usage", func() {
+		m := &fakeModel{
+			cfg: &config.ModelConfig{},
+			predictResp: backend.LLMResponse{
+				Response: "Hi there.",
+				Usage:    backend.TokenUsage{Prompt: 5, Completion: 3},
+			},
+		}
+		session := &Session{
+			OutputSampleRate: 24000,
+			ModelInterface:   m,
+			ModelConfig:      &config.ModelConfig{},
+			// Text-only so the buffered path skips TTS and the assertion focuses
+			// on the terminal's Output + Usage.
+			OutputModalities: []types.Modality{types.ModalityText},
+		}
+		conv := &Conversation{}
+		t := &fakeTransport{}
+
+		triggerResponse(context.Background(), session, conv, t, nil)
+
+		// Exactly one of each lifecycle event for the whole response.create.
+		Expect(t.countEvents(types.ServerEventTypeResponseCreated)).To(Equal(1))
+		Expect(t.countEvents(types.ServerEventTypeResponseDone)).To(Equal(1))
+
+		// The single terminal carries the produced output item and the usage —
+		// both empty in the legacy code.
+		var done *types.ResponseDoneEvent
+		for i := range t.events {
+			if d, ok := t.events[i].(types.ResponseDoneEvent); ok {
+				done = &d
+			}
+		}
+		Expect(done).NotTo(BeNil())
+		Expect(done.Response.Status).To(Equal(types.ResponseStatusCompleted))
+		Expect(done.Response.Output).To(HaveLen(1))
+		Expect(done.Response.Usage).NotTo(BeNil())
+		Expect(done.Response.Usage.InputTokens).To(Equal(5))
+		Expect(done.Response.Usage.OutputTokens).To(Equal(3))
+		Expect(done.Response.Usage.TotalTokens).To(Equal(8))
 	})
 })
--- a/core/http/endpoints/openai/realtime_transcription.go
+++ b/core/http/endpoints/openai/realtime_transcription.go
@@ -7,6 +7,33 @@ import (
 	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
 )

+// emitPrecomputedTranscription emits the transcription events for a turn
+// whose transcript already exists (semantic_vad's live stream, or the
+// retranscribe gate's batch decode): optional delta replays followed by the
+// completed event — the same contract emitTranscription produces, sharing
+// one itemID — without running the backend again.
+func emitPrecomputedTranscription(t Transport, itemID string, deltas []string, transcript string) error {
+	for _, d := range deltas {
+		if d == "" {
+			continue
+		}
+		if err := t.SendEvent(types.ConversationItemInputAudioTranscriptionDeltaEvent{
+			ServerEventBase: types.ServerEventBase{EventID: "event_TODO"},
+			ItemID:          itemID,
+			ContentIndex:    0,
+			Delta:           d,
+		}); err != nil {
+			return err
+		}
+	}
+	return t.SendEvent(types.ConversationItemInputAudioTranscriptionCompletedEvent{
+		ServerEventBase: types.ServerEventBase{EventID: "event_TODO"},
+		ItemID:          itemID,
+		ContentIndex:    0,
+		Transcript:      transcript,
+	})
+}
+
 // emitTranscription transcribes a committed utterance and emits the transcription
 // events for it, returning the final transcript text. With
 // pipeline.streaming.transcription enabled it streams each transcript fragment as
--- a/core/http/endpoints/openai/realtime_tts_pipeline.go
+++ b/core/http/endpoints/openai/realtime_tts_pipeline.go
@@ -0,0 +1,153 @@
+package openai
+
+import (
+	"sync"
+	"sync/atomic"
+
+	"github.com/mudler/LocalAI/core/http/endpoints/openai/ttscoord"
+)
+
+// ttsPipeline decouples speech synthesis from LLM token generation.
+//
+// The LLM token callback runs on the same goroutine that drains the model's
+// gRPC stream, so anything it does serially — including a blocking TTS call —
+// stops the stream from being read and stalls generation (and, since the same
+// goroutine also sends the assistant transcript, freezes the transcript the
+// client sees). ttsPipeline lets the callback hand each completed clause to a
+// single worker goroutine that synthesizes them in order, concurrently with
+// continued generation. One worker preserves clause — and therefore audio —
+// ordering.
+//
+// The clause queue is intentionally unbounded: clauses are short strings and a
+// reply has a bounded number of them, while the expensive product (audio) is
+// paced by the TTS backend regardless. So enqueue never blocks the callback,
+// and the transcript streams to the client at generation speed while audio is
+// produced behind it.
+type ttsPipeline struct {
+	speak func(clause string) ([]byte, error)
+
+	mu    sync.Mutex
+	queue []string
+	wake  chan struct{} // buffered(1) wakeup signal for the worker
+
+	// coord owns the open->closing->closed lifecycle (machine M5). It replaces the
+	// legacy `closed bool`: the producer raises Close (wait()), the worker raises
+	// WorkerExited. See ttscoord/ and realtime-state-machines.md.
+	coord *ttscoord.Coordinator
+
+	done   chan struct{}
+	failed atomic.Bool
+
+	// audio and firstErr are owned by the worker goroutine and only safe to
+	// read after wait() has returned (it joins on the worker via done).
+	audio    []byte
+	firstErr error
+}
+
+// newTTSPipeline starts the worker. speak performs the actual synthesis and
+// returns the PCM accumulated for the conversation-item record (empty for
+// transports that stream audio out-of-band, e.g. WebRTC).
+func newTTSPipeline(speak func(clause string) ([]byte, error)) *ttsPipeline {
+	p := &ttsPipeline{
+		speak: speak,
+		wake:  make(chan struct{}, 1),
+		done:  make(chan struct{}),
+	}
+	p.coord = ttscoord.New(p)
+	go p.run()
+	return p
+}
+
+// closing reports whether wait() has been called (lifecycle past Open). Read
+// under p.mu in the worker so the queue-empty check and the close check are
+// consistent.
+func (p *ttsPipeline) closing() bool {
+	_, open := p.coord.State().(ttscoord.Open)
+	return !open
+}
+
+// Perform executes a coordinator effect: Wake nudges the worker without blocking; any other effect is ignored.
+func (p *ttsPipeline) Perform(e ttscoord.Effect) {
+	if _, ok := e.(ttscoord.Wake); ok {
+		p.signal()
+	}
+}
+
+func (p *ttsPipeline) run() {
+	defer close(p.done)
+	for {
+		p.mu.Lock()
+		for len(p.queue) == 0 && !p.closing() {
+			p.mu.Unlock()
+			<-p.wake
+			p.mu.Lock()
+		}
+		if len(p.queue) == 0 && p.closing() {
+			p.mu.Unlock()
+			// Drained and closed: advance the lifecycle to Closed, then exit
+			// (the deferred close(p.done) joins the producer's wait()).
+			_ = p.coord.Apply(ttscoord.WorkerExited{})
+			return
+		}
+		clause := p.queue[0]
+		p.queue = p.queue[1:]
+		p.mu.Unlock()
+
+		// Once a clause has failed, keep draining the queue without speaking so
+		// the producer's wait() returns promptly and the first error is kept.
+		if p.failed.Load() {
+			continue
+		}
+		a, err := p.speak(clause)
+		if err != nil {
+			p.firstErr = err
+			p.failed.Store(true)
+			continue
+		}
+		p.audio = append(p.audio, a...)
+	}
+}
+
+// enqueue offers a clause for synthesis. It never blocks; it returns false once
+// synthesis has failed or the queue is closing (wait() was called), so the caller stops.
+func (p *ttsPipeline) enqueue(clause string) bool {
+	if p.failed.Load() {
+		return false
+	}
+	p.mu.Lock()
+	// Reject once closing/closed: the worker may have already drained and exited,
+	// so a clause queued now would be silently dropped. The lifecycle (Open) and
+	// the append are checked under the same lock, so the worker cannot exit between
+	// the gate and the enqueue (it takes p.mu to observe the empty queue).
+	if p.closing() {
+		p.mu.Unlock()
+		return false
+	}
+	p.queue = append(p.queue, clause)
+	p.mu.Unlock()
+	p.signal()
+	return true
+}
+
+// signal wakes the worker without blocking; the buffered channel coalesces
+// signals, which is safe because the worker drains the whole queue per wake.
+func (p *ttsPipeline) signal() {
+	select {
+	case p.wake <- struct{}{}:
+	default:
+	}
+}
+
+// wait closes the queue and blocks until the worker has drained every enqueued
+// clause (after a synthesis failure the rest are skipped, not spoken), then
+// returns the accumulated audio and the first synthesis error. It is idempotent:
+// a repeat call returns the same result without blocking, so callers can drain
+// explicitly and still defer a wait() as a backstop. enqueue after wait() returns false.
+func (p *ttsPipeline) wait() ([]byte, error) {
+	// Close the lifecycle (Open->Closing) and wake the worker. Idempotent: a
+	// second Close is absorbed (no second wake), and <-p.done returns immediately
+	// once the worker has exited.
+	_ = p.coord.Apply(ttscoord.Close{})
+	<-p.done
+	return p.audio, p.firstErr
+}
--- a/core/http/endpoints/openai/realtime_tts_pipeline_test.go
+++ b/core/http/endpoints/openai/realtime_tts_pipeline_test.go
@@ -0,0 +1,114 @@
+package openai
+
+import (
+	"errors"
+	"sync"
+	"time"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("ttsPipeline", func() {
+	It("synthesizes clauses in order and accumulates their audio", func() {
+		p := newTTSPipeline(func(clause string) ([]byte, error) {
+			return []byte(clause), nil
+		})
+		Expect(p.enqueue("a")).To(BeTrue())
+		Expect(p.enqueue("b")).To(BeTrue())
+		Expect(p.enqueue("c")).To(BeTrue())
+
+		audio, err := p.wait()
+		Expect(err).NotTo(HaveOccurred())
+		Expect(string(audio)).To(Equal("abc"))
+	})
+
+	It("never blocks the producer even when synthesis is slow", func() {
+		var started sync.WaitGroup
+		started.Add(1)
+		release := make(chan struct{})
+		first := true
+		p := newTTSPipeline(func(clause string) ([]byte, error) {
+			if first {
+				first = false
+				started.Done()
+				<-release // hold the worker on the first clause
+			}
+			return []byte(clause), nil
+		})
+
+		Expect(p.enqueue("1")).To(BeTrue())
+		started.Wait() // worker is now blocked synthesizing the first clause
+
+		// Enqueuing many more clauses must return immediately, not block on the
+		// stalled worker — this is what keeps the LLM recv loop flowing.
+		done := make(chan struct{})
+		go func() {
+			defer close(done)
+			for _, c := range []string{"2", "3", "4", "5"} {
+				p.enqueue(c)
+			}
+		}()
+		Eventually(done, time.Second).Should(BeClosed())
+
+		close(release)
+		audio, err := p.wait()
+		Expect(err).NotTo(HaveOccurred())
+		Expect(string(audio)).To(Equal("12345"))
+	})
+
+	It("keeps the first error, stops speaking, and signals the producer to stop", func() {
+		boom := errors.New("backend gone")
+		var spoken []string
+		var mu sync.Mutex
+		p := newTTSPipeline(func(clause string) ([]byte, error) {
+			mu.Lock()
+			spoken = append(spoken, clause)
+			mu.Unlock()
+			if clause == "b" {
+				return nil, boom
+			}
+			return []byte(clause), nil
+		})
+
+		Expect(p.enqueue("a")).To(BeTrue())
+		Expect(p.enqueue("b")).To(BeTrue())
+
+		// Once the failure is observed, enqueue reports it so the caller stops
+		// the prediction; any further clauses are dropped, not spoken.
+		Eventually(func() bool { return !p.enqueue("c") }, time.Second).Should(BeTrue())
+
+		_, err := p.wait()
+		Expect(err).To(MatchError(boom))
+
+		mu.Lock()
+		defer mu.Unlock()
+		Expect(spoken).NotTo(ContainElement("c"), "clauses after the failure are not synthesized")
+	})
+
+	It("is idempotent: a second wait returns the same result without blocking", func() {
+		p := newTTSPipeline(func(clause string) ([]byte, error) {
+			return []byte(clause), nil
+		})
+		Expect(p.enqueue("x")).To(BeTrue())
+
+		audio1, err1 := p.wait()
+		// A deferred backstop wait() in the caller runs after the explicit one;
+		// it must not block or change the result.
+		audio2, err2 := p.wait()
+
+		Expect(err1).NotTo(HaveOccurred())
+		Expect(err2).NotTo(HaveOccurred())
+		Expect(string(audio1)).To(Equal("x"))
+		Expect(string(audio2)).To(Equal("x"))
+	})
+
+	It("returns cleanly when no clause was ever enqueued", func() {
+		p := newTTSPipeline(func(clause string) ([]byte, error) {
+			return []byte(clause), nil
+		})
+		audio, err := p.wait()
+		Expect(err).NotTo(HaveOccurred())
+		Expect(audio).To(BeEmpty())
+	})
+})
--- a/core/http/endpoints/openai/realtime_turncoord.go
+++ b/core/http/endpoints/openai/realtime_turncoord.go
@@ -0,0 +1,127 @@
+package openai
+
+import (
+	"context"
+	"time"
+
+	"github.com/mudler/LocalAI/core/http/endpoints/openai/respcoord"
+	"github.com/mudler/LocalAI/core/http/endpoints/openai/turncoord"
+	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
+	"github.com/mudler/LocalAI/core/schema"
+)
+
+// turnSink wires the explicit turn-detection state machine (turncoord.Coordinator
+// — machine "M2" in docs/design/realtime-state-machines.md) into handleVAD.
+//
+// In the legacy code the turn lifecycle was split across two variables that could
+// disagree: handleVAD's goroutine-local speechStarted bool and the semantic_vad
+// liveTurnState's "is the live stream open" flag (lts.open()). A discardTurn (the
+// no-speech clear, or teardown) closed the live stream but left speechStarted
+// true, so the next speech onset was suppressed by `if !speechStarted` — no
+// speech_started, no barge-in, no commit (Part 2, failure mode 4). Here "speech
+// started" and "a turn is open" are ONE coordinator state, so they cannot desync.
+//
+// Unlike responseSink (M3), which is a genuine dual-writer race, the turn machine
+// is owned by the single handleVAD goroutine; this sink and its coordinator are
+// loop-local. The coordinator's lock only matters for the teardown-time Abort and
+// for keeping State() readable — there is no second writer.
+//
+// The effects map onto the existing turn I/O:
+//   - OpenTurn:          open the live ASR stream (semantic_vad) + feed the onset
+//     audio. A failed open degrades the turn to silence-only — the turn still
+//     proceeds (server_vad-like), matching the legacy behaviour.
+//   - BargeIn:           cancel any in-flight response (non-blocking).
+//   - EmitSpeechStarted: input_audio_buffer.speech_started.
+//   - EmitSpeechStopped: input_audio_buffer.speech_stopped.
+//   - CommitTurn:        committed event + finalize the live stream + issue the
+//     response (via responseSink/respcoord).
+//   - DiscardTurn:       close the live stream and retract any captions.
+//
+// The data-heavy effects (OpenTurn, CommitTurn) need the current tick's audio and
+// transcription context. Because Apply performs effects synchronously on the same
+// (handleVAD) goroutine, the loop sets the relevant scratch fields immediately
+// before each Apply; there is no cross-goroutine sharing.
+type turnSink struct {
+	session    *Session
+	conv       *Conversation
+	transport  Transport
+	lts        *liveTurnState
+	vadContext context.Context
+	startTime  time.Time
+
+	coord *turncoord.Coordinator
+
+	// per-tick context, set by handleVAD before each Apply (single goroutine).
+	sv                 *types.RealtimeSessionSemanticVad // nil = server_vad
+	onsetAudio         []int16                           // OpenTurn feeds this
+	commitAudio        []byte                            // CommitTurn issues this
+	commitAudioLength  float64                           // for finishTurn (flush tail)
+	commitRetranscribe bool                              // gated batch is authoritative
+	commitGated        *schema.TranscriptionResult       // retranscribe batch decode
+}
+
+func newTurnSink(session *Session, conv *Conversation, t Transport, lts *liveTurnState, vadContext context.Context, startTime time.Time) *turnSink {
+	s := &turnSink{
+		session:    session,
+		conv:       conv,
+		transport:  t,
+		lts:        lts,
+		vadContext: vadContext,
+		startTime:  startTime,
+	}
+	s.coord = turncoord.New(s)
+	return s
+}
+
+// Perform executes one effect. It is called by Coordinator.Apply while the
+// coordinator lock is held. The turn coordinator is single-writer (handleVAD), so
+// the synchronous network writes / lts operations here are the same ones the
+// legacy loop did inline on this goroutine; they never contend for the lock.
+func (s *turnSink) Perform(e turncoord.Effect) {
+	switch eff := e.(type) {
+	case turncoord.OpenTurn:
+		if s.sv != nil && s.lts.openTurn(s.vadContext, string(eff.Turn)) {
+			s.lts.feedNewAudio(s.onsetAudio)
+		}
+	case turncoord.BargeIn:
+		s.session.respSink.cancel(respcoord.SourceVAD)
+	case turncoord.EmitSpeechStarted:
+		sendEvent(s.transport, types.InputAudioBufferSpeechStartedEvent{
+			ServerEventBase: types.ServerEventBase{EventID: "event_TODO"},
+			AudioStartMs:    time.Since(s.startTime).Milliseconds(),
+		})
+	case turncoord.EmitSpeechStopped:
+		sendEvent(s.transport, types.InputAudioBufferSpeechStoppedEvent{
+			ServerEventBase: types.ServerEventBase{EventID: "event_TODO"},
+			AudioEndMs:      time.Since(s.startTime).Milliseconds(),
+		})
+	case turncoord.CommitTurn:
+		// The committed item id is the coordinator's turn id (== the live caption
+		// id), so the client's completed event replaces the partial text.
+		itemID := string(eff.Turn)
+		sendEvent(s.transport, types.InputAudioBufferCommittedEvent{
+			ServerEventBase: types.ServerEventBase{EventID: "event_TODO"},
+			ItemID:          itemID,
+			PreviousItemID:  "TODO",
+		})
+		// Finalize the turn's live stream (flushes the decode tail). In
+		// retranscribe mode the batch decode is authoritative, so the streamed
+		// transcript is dropped.
+		var live *liveUtterance
+		if s.sv != nil {
+			ut := s.lts.finishTurn(s.commitAudioLength)
+			if !s.commitRetranscribe {
+				live = ut
+			}
+		}
+		audio := s.commitAudio
+		gated := s.commitGated
+		conv := s.conv
+		s.session.respSink.issue(s.vadContext, respcoord.SourceVAD, func(ctx context.Context) {
+			commitUtteranceWithTranscript(ctx, audio, live, gated, itemID, s.session, conv, s.transport)
+		})
+	case turncoord.DiscardTurn:
+		// No-op if the stream was never open (server_vad / already idle).
+		s.lts.discardTurn()
+	}
+}
--- a/core/http/endpoints/openai/realtime_vad_buffer_test.go
+++ b/core/http/endpoints/openai/realtime_vad_buffer_test.go
@@ -0,0 +1,54 @@
+package openai
+
+import (
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// dropInspectedPrefix is what stands between the VAD loop's buffer clears and
+// cutting the first word off an utterance: the no-speech clear must keep the
+// holdback tail (silero hasn't crossed its onset threshold yet) and both
+// clears must keep audio appended while the tick ran (the VAD never saw it).
+var _ = Describe("dropInspectedPrefix", func() {
+	It("keeps the holdback tail of the inspected window and everything appended mid-tick", func() {
+		inspected := []byte{1, 2, 3, 4, 5, 6}
+		appended := []byte{7, 8}
+		buf := append(append([]byte(nil), inspected...), appended...)
+
+		out := dropInspectedPrefix(buf, len(inspected), 2)
+
+		Expect(out).To(Equal([]byte{5, 6, 7, 8}), "older confirmed-silent head dropped, possible onset + fresh audio kept")
+	})
+
+	It("returns the buffer unchanged when the inspected window fits in the holdback", func() {
+		buf := []byte{1, 2, 3}
+
+		Expect(dropInspectedPrefix(buf, len(buf), 4)).To(Equal(buf))
+		Expect(dropInspectedPrefix(buf, len(buf), len(buf))).To(Equal(buf))
+	})
+
+	It("drops the whole inspected window with zero holdback, keeping only mid-tick appends", func() {
+		// The commit-time clear: the inspected audio was committed, audio
+		// appended while the tick ran belongs to the next turn.
+		buf := []byte{1, 2, 3, 4}
+
+		Expect(dropInspectedPrefix(buf, 4, 0)).To(BeEmpty())
+		Expect(dropInspectedPrefix(append(buf, 9), 4, 0)).To(Equal([]byte{9}))
+	})
+
+	It("clamps when told more was inspected than the buffer holds", func() {
+		buf := []byte{1, 2}
+
+		Expect(dropInspectedPrefix(buf, 10, 0)).To(BeEmpty())
+	})
+
+	It("returns a copy, not a sub-slice, when bytes are dropped", func() {
+		buf := []byte{1, 2, 3, 4}
+
+		out := dropInspectedPrefix(buf, 4, 2)
+
+		Expect(out).To(Equal([]byte{3, 4}))
+		buf[2] = 99
+		Expect(out).To(Equal([]byte{3, 4}), "mutating the old backing array must not leak into the published buffer")
+	})
+})
--- a/core/http/endpoints/openai/respcoord/respcoord.go
+++ b/core/http/endpoints/openai/respcoord/respcoord.go
@@ -0,0 +1,267 @@
+// Package respcoord is the explicit state machine for the realtime API's
+// response-coordination concern (machine "M3" in
+// docs/design/realtime-state-machines.md).
+//
+// In the legacy code this machine is implicit: a response is "active" iff
+// Session.activeResponseDone is a non-nil, unclosed channel, and the lifecycle
+// is driven from TWO goroutines (the client read-loop and the VAD goroutine)
+// that both call startResponse/cancelActiveResponse. responseMu guards only the
+// field swap, while the <-done wait happens outside the lock, so two concurrent
+// starts can briefly leave two live response goroutines both appending to the
+// conversation. See docs/design/realtime-state-machines.md, Part 2 (failure
+// mode 2) and the ResponseLifecycle spec under formal-verification/.
+//
+// This package replaces that with:
+//   - a sealed sum type for State (illegal states are unrepresentable),
+//   - a total, pure transition function Next(state, event) -> (state, effects),
+//   - a single-writer Coordinator that serializes every transition.
+//
+// The design guarantees the invariants the specs check:
+//   - at most one live response at any instant,
+//   - exactly one terminal (response.done) per started response,
+//   - no response is started after its terminal (no resurrection).
+package respcoord
+
+import (
+	"fmt"
+
+	"github.com/mudler/LocalAI/core/http/endpoints/openai/coordinator"
+)
+
+// ResponseID identifies a single response attempt. The caller mints a fresh,
+// monotonically increasing id for every Start; ids are never reused. Next
+// matches a Finished against the active id by equality, so that uniqueness is
+// what lets it ignore "stale" Finished events from a superseded/cancelled response.
+type ResponseID uint64
+
+// Source names the goroutine that raised an event. It exists purely for
+// logging/observability: Next never reads it, so both sources carry equal
+// authority. Carrying it on the event keeps the two-writer nature of the
+// machine visible instead of implicit.
+type Source int
+
+const (
+	// SourceClient is the client read-loop (response.create, or a manual
+	// input_audio_buffer.commit).
+	SourceClient Source = iota
+	// SourceVAD is the turn-detection goroutine (end-of-speech commit, or a
+	// barge-in cancel).
+	SourceVAD
+)
+
+// String renders "client"/"vad"; any other value prints as "Source(n)".
+func (s Source) String() string {
+	if s == SourceClient {
+		return "client"
+	}
+	if s == SourceVAD {
+		return "vad"
+	}
+	return fmt.Sprintf("Source(%d)", int(s))
+}
+
+// Status is the terminal status carried by an EmitTerminal (response.done).
+type Status int
+
+const (
+	// StatusCompleted marks a response that ran to its own Finished.
+	StatusCompleted Status = iota
+	// StatusCancelled marks a response cut short: a Cancel (barge-in or explicit
+	// response.cancel), a superseding Start, or a Shutdown at teardown.
+	StatusCancelled
+)
+
+// String renders "completed"/"cancelled"; any other value prints as "Status(n)".
+func (s Status) String() string {
+	if s == StatusCompleted {
+		return "completed"
+	}
+	if s == StatusCancelled {
+		return "cancelled"
+	}
+	return fmt.Sprintf("Status(%d)", int(s))
+}
+
+// State is the sealed sum type of coordinator states. The only implementations
+// are the unexported-method-bearing structs in this file, so callers outside
+// the package cannot fabricate an out-of-band state. Exhaustively:
+// Idle | Active | Terminated.
+type State interface {
+	isState()       // unexported marker: only this package can implement State
+	String() string // rendered via %s in Next's unhandled-transition error
+}
+
+// Idle: no response is in flight. New starts the Coordinator in this state.
+type Idle struct{}
+
+// Active: exactly one response (ID) is in flight. The struct holds a single id,
+// so "two active responses" is not representable.
+type Active struct{ ID ResponseID }
+
+// Terminated: the session is torn down. Absorbing — no response can start from
+// here, so the M1 (connection) parent's teardown can guarantee no response
+// outlives the session (see formal-verification/session_lifecycle.fizz).
+type Terminated struct{}
+
+func (Idle) isState()       {}
+func (Active) isState()     {}
+func (Terminated) isState() {}
+
+func (Idle) String() string       { return "Idle" }
+func (a Active) String() string   { return fmt.Sprintf("Active(%d)", a.ID) }
+func (Terminated) String() string { return "Terminated" }
+
+// Event is the sealed sum type of inputs. Exhaustively:
+// Start | Finished | Cancel | Shutdown.
+type Event interface {
+	isEvent()
+	String() string
+}
+
+// Start requests a new response, superseding any active one. ID must be fresh.
+type Start struct {
+	ID     ResponseID
+	Source Source // logging only; Next never reads it
+}
+
+// Finished reports that the response goroutine for ID reached its own terminal.
+// If ID is not the currently-active response it is "stale" (the response was
+// already superseded/cancelled) and is ignored.
+type Finished struct{ ID ResponseID }
+
+// Cancel requests cancellation of the in-flight response (barge-in or explicit
+// response.cancel). It is a no-op when Idle or Terminated.
+type Cancel struct{ Source Source }
+
+// Shutdown terminates the coordinator at session teardown: it cancels any
+// in-flight response and moves to the absorbing Terminated state, after which no
+// response can start. Raised by the connection (M1) parent's teardown.
+type Shutdown struct{}
+
+func (Start) isEvent()    {}
+func (Finished) isEvent() {}
+func (Cancel) isEvent()   {}
+func (Shutdown) isEvent() {}
+
+func (e Start) String() string    { return fmt.Sprintf("Start(%d,%s)", e.ID, e.Source) }
+func (e Finished) String() string { return fmt.Sprintf("Finished(%d)", e.ID) }
+func (e Cancel) String() string   { return fmt.Sprintf("Cancel(%s)", e.Source) }
+func (Shutdown) String() string   { return "Shutdown" }
+
+// Effect is a side effect returned by Next as data for the caller to perform.
+// Returning effects as data (rather than firing callbacks inside the
+// transition) keeps Next pure and exhaustively testable, and lets the
+// Coordinator decide how/when to perform them. Exhaustively:
+// CancelResponse | StartResponse | EmitTerminal.
+type Effect interface {
+	isEffect()
+	String() string
+}
+
+// CancelResponse: cancel the context of the running response ID. Next always
+// follows it directly with EmitTerminal{ID, StatusCancelled}.
+
+// StartResponse: spawn the response goroutine for ID.
+type StartResponse struct{ ID ResponseID }
+
+// EmitTerminal: send response.done for ID with Status.
+type EmitTerminal struct {
+	ID     ResponseID
+	Status Status
+}
+
+func (CancelResponse) isEffect() {}
+func (StartResponse) isEffect()  {}
+func (EmitTerminal) isEffect()   {}
+
+func (e CancelResponse) String() string { return fmt.Sprintf("CancelResponse(%d)", e.ID) }
+func (e StartResponse) String() string  { return fmt.Sprintf("StartResponse(%d)", e.ID) }
+func (e EmitTerminal) String() string {
+	return fmt.Sprintf("EmitTerminal(%d,%s)", e.ID, e.Status)
+}
+
+// Next is the pure, total transition function of the machine: given a state
+// and an event it yields the successor state plus the effects to perform, in
+// order. The error is non-nil only when s or e is an implementation this
+// function does not know (a programmer error, e.g. a type added without
+// updating Next); in that case s is returned unchanged with no effects, and
+// callers must surface the error rather than drop it. No in-domain pair is
+// forbidden: stale or idle inputs are simply no-ops.
+//
+// Active + Start is the supersede rule and the heart of the fix. One
+// serialized transition cancels the old response, emits its cancelled
+// terminal, and only then starts the replacement. The old goroutine's later
+// Finished carries an id that no longer matches the active one, so it is
+// ignored: every id gets exactly one terminal and at most one response is
+// live at a time.
+func Next(s State, e Event) (State, []Effect, error) {
+	// Guard: anything outside the sealed Event set is a programmer error,
+	// whatever the state. After this, every branch below sees a known event.
+	switch e.(type) {
+	case Start, Finished, Cancel, Shutdown:
+	default:
+		return s, nil, fmt.Errorf("respcoord: unhandled transition %s <- %s", s, e)
+	}
+
+	switch cur := s.(type) {
+	case Idle:
+		switch in := e.(type) {
+		case Start:
+			return Active{ID: in.ID}, []Effect{StartResponse{ID: in.ID}}, nil
+		case Shutdown:
+			// Teardown with nothing in flight: go terminal.
+			return Terminated{}, nil, nil
+		case Cancel, Finished:
+			// Nothing in flight (Cancel) or a stale terminal from an
+			// already-superseded/cancelled response (Finished): no-op.
+			return Idle{}, nil, nil
+		}
+	case Active:
+		// Every way of cutting the live response short shares this pair, in
+		// this order: cancel its context, then emit its cancelled terminal.
+		stop := CancelResponse{ID: cur.ID}
+		gone := EmitTerminal{ID: cur.ID, Status: StatusCancelled}
+		switch in := e.(type) {
+		case Start:
+			// Supersede: retire the old response before spawning the new one.
+			return Active{ID: in.ID}, []Effect{stop, gone, StartResponse{ID: in.ID}}, nil
+		case Finished:
+			if in.ID != cur.ID {
+				// Stale finish from a superseded response; its terminal was
+				// already emitted.
+				return Active{ID: cur.ID}, nil, nil
+			}
+			return Idle{}, []Effect{EmitTerminal{ID: cur.ID, Status: StatusCompleted}}, nil
+		case Cancel:
+			return Idle{}, []Effect{stop, gone}, nil
+		case Shutdown:
+			// Teardown while a response is live: retire it and go terminal so
+			// nothing can start afterwards.
+			return Terminated{}, []Effect{stop, gone}, nil
+		}
+	case Terminated:
+		// Absorbing: every event is a no-op. A Start after teardown is rejected
+		// (no StartResponse), so no response can outlive the session.
+		return Terminated{}, nil, nil
+	}
+	// Unknown State implementation.
+	return s, nil, fmt.Errorf("respcoord: unhandled transition %s <- %s", s, e)
+}
+
+// EffectSink performs the effects produced by a transition. See coordinator.Sink
+// for the non-blocking contract: Perform runs under the coordinator lock, so it
+// must not block and must not re-enter Apply (the spawned response goroutine's
+// Finished apply happens only after the sink returns).
+type EffectSink = coordinator.Sink[Effect]
+
+// Coordinator serializes every Start/Finished/Cancel/Shutdown transition behind
+// one lock, so the two driving goroutines (read-loop and VAD) can call Apply
+// concurrently without the legacy dual-writer race. Effects are performed in
+// order under the lock — preserving the (cancel old, emit old terminal, start
+// new) supersede ordering. See coordinator.Coordinator.
+type Coordinator = coordinator.Coordinator[State, Event, Effect]
+
+// New returns a Coordinator that starts in Idle, steps with Next, and performs effects via sink.
+func New(sink EffectSink) *Coordinator {
+	return coordinator.New[State, Event, Effect](Idle{}, Next, sink)
+}
--- a/core/http/endpoints/openai/respcoord/respcoord_suite_test.go
+++ b/core/http/endpoints/openai/respcoord/respcoord_suite_test.go
@@ -0,0 +1,13 @@
+package respcoord
+
+import (
+	"testing"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+func TestRespcoord(t *testing.T) {
+	RegisterFailHandler(Fail)                    // route gomega assertion failures to ginkgo's Fail
+	RunSpecs(t, "respcoord (realtime M3) Suite") // run the specs registered in this package
+}
--- a/core/http/endpoints/openai/respcoord/respcoord_test.go
+++ b/core/http/endpoints/openai/respcoord/respcoord_test.go
@@ -0,0 +1,370 @@
+package respcoord
+
+import (
+	"math/rand/v2"
+	"sync"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// recordingSink is the test double for EffectSink: it appends every effect it
+// is handed to an ordered log, so specs can verify invariants from the outside
+// rather than by peeking into Next. Coordinator.Apply calls Perform under the
+// coordinator lock; the mutex here covers reads from the spec goroutine.
+type recordingSink struct {
+	mu  sync.Mutex
+	log []Effect
+}
+
+func (s *recordingSink) Perform(e Effect) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	s.log = append(s.log, e)
+}
+
+func (s *recordingSink) snapshot() []Effect {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	// Copy so the caller can range over the log without holding the lock.
+	dup := make([]Effect, 0, len(s.log))
+	return append(dup, s.log...)
+}
+
+// checkInvariants walks an effect log front to back and asserts, after each
+// effect, the safety properties from docs/design/realtime-state-machines.md,
+// Part 4, as far as an effect log can show them:
+//
+//	(1) at most one live response at any instant
+//	    -- the set of started-but-not-yet-terminated ids never exceeds one;
+//	(2) no duplicate start or terminal
+//	    -- each id is started at most once and terminated at most once;
+//	(3) no resurrection -- an id is never started after it has been terminated.
+func checkInvariants(log []Effect) {
+	var (
+		starts    = make(map[ResponseID]int)
+		terminals = make(map[ResponseID]int)
+		inFlight  = make(map[ResponseID]struct{})
+	)
+	for step, effect := range log {
+		switch eff := effect.(type) {
+		case StartResponse:
+			Expect(terminals[eff.ID]).To(Equal(0), "invariant (3): StartResponse(%d) after it was terminated (effect #%d)\nlog=%v", eff.ID, step, log)
+			starts[eff.ID]++
+			Expect(starts[eff.ID]).To(Equal(1), "invariant (2): id %d started %d times (effect #%d)\nlog=%v", eff.ID, starts[eff.ID], step, log)
+			inFlight[eff.ID] = struct{}{}
+		case EmitTerminal:
+			terminals[eff.ID]++
+			Expect(terminals[eff.ID]).To(Equal(1), "invariant (2): id %d terminated %d times (effect #%d)\nlog=%v", eff.ID, terminals[eff.ID], step, log)
+			delete(inFlight, eff.ID)
+			// CancelResponse gets no count assertion: cancellation is paired with a terminal.
+		}
+		Expect(len(inFlight)).To(BeNumerically("<=", 1), "invariant (1): %d live responses after effect #%d (%s)\nlog=%v", len(inFlight), step, effect, log)
+	}
+}
+
+// unknownEvent satisfies Event but has no case in Next's type switches, so
+// passing it to Next exercises the unhandled-transition error path.
+type unknownEvent struct{}
+
+func (unknownEvent) isEvent()       {}
+func (unknownEvent) String() string { return "unknownEvent" }
+
+var _ = Describe("respcoord.Next", func() {
+	// DescribeTable exhaustively pins every (state, event) cell of the pure
+	// transition function, including the stale / idle no-op cells. This is the
+	// practical stand-in for "no transition leads to an inconsistent state": if a
+	// cell changes, this table must change with it.
+	DescribeTable("transitions",
+		func(state State, event Event, wantState State, wantEff []Effect) {
+			gotState, gotEff, err := Next(state, event)
+			Expect(err).NotTo(HaveOccurred())
+			Expect(gotState).To(Equal(wantState))
+			Expect(gotEff).To(Equal(wantEff)) // no-op cells must return a nil slice, hence []Effect(nil) below
+		},
+		Entry("idle+start -> active, spawns response",
+			Idle{}, Start{ID: 1, Source: SourceClient},
+			Active{ID: 1}, []Effect{StartResponse{ID: 1}}),
+		Entry("idle+cancel -> idle, no-op",
+			Idle{}, Cancel{Source: SourceVAD},
+			Idle{}, []Effect(nil)),
+		Entry("idle+finished(stale) -> idle, no-op",
+			Idle{}, Finished{ID: 7},
+			Idle{}, []Effect(nil)),
+		Entry("active+start -> supersede: cancel+terminal(old)+start(new)",
+			Active{ID: 1}, Start{ID: 2, Source: SourceVAD},
+			Active{ID: 2},
+			[]Effect{
+				CancelResponse{ID: 1},
+				EmitTerminal{ID: 1, Status: StatusCancelled},
+				StartResponse{ID: 2},
+			}),
+		Entry("active+finished(current) -> idle, completed terminal",
+			Active{ID: 3}, Finished{ID: 3},
+			Idle{}, []Effect{EmitTerminal{ID: 3, Status: StatusCompleted}}),
+		Entry("active+finished(stale) -> stay active, no-op",
+			Active{ID: 3}, Finished{ID: 2},
+			Active{ID: 3}, []Effect(nil)),
+		Entry("active+cancel -> idle, cancel+cancelled terminal",
+			Active{ID: 5}, Cancel{Source: SourceClient},
+			Idle{},
+			[]Effect{
+				CancelResponse{ID: 5},
+				EmitTerminal{ID: 5, Status: StatusCancelled},
+			}),
+		Entry("idle+shutdown -> terminated, no-op",
+			Idle{}, Shutdown{},
+			Terminated{}, []Effect(nil)),
+		Entry("active+shutdown -> terminated: cancel+cancelled terminal",
+			Active{ID: 6}, Shutdown{},
+			Terminated{},
+			[]Effect{
+				CancelResponse{ID: 6},
+				EmitTerminal{ID: 6, Status: StatusCancelled},
+			}),
+		Entry("terminated+start -> terminated, REJECTED (no resurrection)",
+			Terminated{}, Start{ID: 9, Source: SourceClient},
+			Terminated{}, []Effect(nil)),
+		Entry("terminated+finished -> terminated, no-op (stale)",
+			Terminated{}, Finished{ID: 9},
+			Terminated{}, []Effect(nil)),
+		Entry("terminated+cancel -> terminated, no-op",
+			Terminated{}, Cancel{Source: SourceVAD},
+			Terminated{}, []Effect(nil)),
+		Entry("terminated+shutdown -> terminated, idempotent",
+			Terminated{}, Shutdown{},
+			Terminated{}, []Effect(nil)),
+	)
+
+	It("is total: every defined (state, event) pair is handled without error", func() {
+		states := []State{Idle{}, Active{ID: 1}, Terminated{}}
+		events := []Event{
+			Start{ID: 2, Source: SourceClient},
+			Finished{ID: 1},  // matches Active{ID: 1}: the "current" finish
+			Finished{ID: 99}, // matches no state above: the stale finish
+			Cancel{Source: SourceVAD},
+			Shutdown{},
+		}
+		for _, s := range states {
+			for _, e := range events {
+				_, _, err := Next(s, e)
+				Expect(err).NotTo(HaveOccurred(), "Next(%s, %s)", s, e)
+			}
+		}
+	})
+
+	It("errors on an unknown event type", func() {
+		_, _, err := Next(Active{ID: 1}, unknownEvent{})
+		Expect(err).To(HaveOccurred())
+	})
+})
+
+var _ = Describe("respcoord.Coordinator", func() {
+	// This replaces the previous rapid stateful test: a seeded random walk over
+	// the event space, with the invariants checked over the full effect log once
+	// per seed. Seeds are fixed so any failure reproduces deterministically.
+	It("upholds the safety invariants over random event sequences", func() {
+		seeds := []uint64{1, 2, 3, 42, 1337, 0xC0FFEE}
+		for _, seed := range seeds {
+			r := rand.New(rand.NewPCG(seed, 0xA5A5A5A5)) // only the first PCG seed word varies per run
+			sink := &recordingSink{}
+			c := New(sink)
+			var nextID uint64
+
+			for range 3000 {
+				switch r.IntN(4) {
+				case 0: // start from client
+					nextID++
+					Expect(c.Apply(Start{ID: ResponseID(nextID), Source: SourceClient})).To(Succeed())
+				case 1: // start from VAD
+					nextID++
+					Expect(c.Apply(Start{ID: ResponseID(nextID), Source: SourceVAD})).To(Succeed())
+				case 2: // possibly-stale finish from any plausible id (incl. future)
+					id := r.Uint64N(nextID + 3) // uniform in [0, nextID+2]; 0 and ids above nextID were never started
+					Expect(c.Apply(Finished{ID: ResponseID(id)})).To(Succeed())
+				case 3: // explicit cancel
+					Expect(c.Apply(Cancel{Source: SourceClient})).To(Succeed())
+				}
+			}
+			// One full-log replay per seed: it iterates the whole sequence, so
+			// it catches a violation at any step without the O(n^2) cost of
+			// re-replaying after every Apply.
+			checkInvariants(sink.snapshot())
+		}
+	})
+
+	// Hammer Apply from two goroutines -- the read-loop and the VAD goroutine,
+	// the exact dual-writer scenario that races in the legacy code -- and assert
+	// the invariants still hold. Run under -race to also catch any data race in
+	// the coordinator itself.
+	It("upholds the invariants under concurrent dual-writer Apply", func() {
+		const perGoroutine = 2000
+		sink := &recordingSink{}
+		c := New(sink)
+
+		var idCounter uint64
+		var idMu sync.Mutex
+		nextID := func() ResponseID {
+			idMu.Lock()
+			defer idMu.Unlock()
+			idCounter++
+			return ResponseID(idCounter)
+		}
+
+		var wg sync.WaitGroup
+		drive := func(src Source) {
+			defer wg.Done()
+			for i := range perGoroutine {
+				switch i % 3 {
+				case 0:
+					_ = c.Apply(Start{ID: nextID(), Source: src})
+				case 1:
+					if a, ok := c.State().(Active); ok { // State() and Apply are separate calls, so this id may already be stale
+						_ = c.Apply(Finished{ID: a.ID})
+					}
+				case 2:
+					_ = c.Apply(Cancel{Source: src})
+				}
+			}
+		}
+
+		wg.Add(2)
+		go drive(SourceClient)
+		go drive(SourceVAD)
+		wg.Wait()
+
+		checkInvariants(sink.snapshot())
+	})
+
+	It("rejects the dual-writer interleaving the legacy mechanism allowed", func() {
+		// Equivalent sequence to the legacy double-start race: start id1, then two
+		// superseding starts (id2, id3) such as the read-loop and VAD would each
+		// issue. Each Start is serialized by the coordinator, so each supersede
+		// cancels+terminates the previous -- never two live at once.
+		sink := &recordingSink{}
+		c := New(sink)
+
+		Expect(c.Apply(Start{ID: 1, Source: SourceClient})).To(Succeed())
+		Expect(c.Apply(Start{ID: 2, Source: SourceVAD})).To(Succeed())
+		Expect(c.Apply(Start{ID: 3, Source: SourceClient})).To(Succeed())
+
+		checkInvariants(sink.snapshot())
+
+		got, ok := c.State().(Active)
+		Expect(ok).To(BeTrue(), "state = %s, want Active(3)", c.State())
+		Expect(got.ID).To(Equal(ResponseID(3)))
+	})
+
+	It("terminates on shutdown and rejects any later response (no resurrection)", func() {
+		sink := &recordingSink{}
+		c := New(sink)
+
+		Expect(c.Apply(Start{ID: 1, Source: SourceClient})).To(Succeed())
+		Expect(c.Apply(Shutdown{})).To(Succeed()) // cancels id 1 + goes terminal
+		Expect(c.State()).To(Equal(State(Terminated{})))
+
+		// A late response.create after teardown is structurally rejected: Apply
+		// still succeeds, but the state stays Terminated and nothing is started.
+		Expect(c.Apply(Start{ID: 2, Source: SourceClient})).To(Succeed())
+		Expect(c.State()).To(Equal(State(Terminated{})))
+		Expect(c.Apply(Finished{ID: 1})).To(Succeed()) // stale Finished from the cancelled response is absorbed
+
+		checkInvariants(sink.snapshot())
+		starts := 0
+		for _, e := range sink.snapshot() {
+			if _, ok := e.(StartResponse); ok {
+				starts++
+			}
+		}
+		Expect(starts).To(Equal(1), "only id 1 ever started; the post-shutdown Start was rejected")
+	})
+})
+
+// legacyCoord is a deliberately faithful model of the LEGACY
+// startResponse/cancelActiveResponse mechanism, where the snapshot ("lock"
+// read), the cancel-and-wait and the spawn are three separate steps that the
+// two driving goroutines can interleave. Its only purpose is to reproduce the
+// dual-writer race (Part 2, failure mode 2) that respcoord.Coordinator removes;
+// production code never uses it.
+//
+// How the steps map onto the legacy code:
+//   - startStep1 = snapshot Session.activeResponse* under responseMu
+//   - startStep2 = cancelActiveResponse: cancel() then <-done, outside the lock;
+//     only the caller whose snapshot is still registered decrements live
+//   - startStep3 = store the new cancel/done pair and spawn the goroutine
+type legacyCoord struct {
+	live       int    // count of live response goroutines; the race drives it past 1
+	registered uint64 // id of the currently-registered response (0 = none)
+	nextID     uint64 // last id handed out by startStep3
+}
+
+func (l *legacyCoord) startStep1() uint64 { return l.registered } // snapshot
+
+func (l *legacyCoord) startStep2(snap uint64) { // cancel-and-wait
+	if snap == 0 || snap != l.registered {
+		return // nothing was registered at snapshot time, or it is no longer the registered one
+	}
+	l.live, l.registered = l.live-1, 0
+}
+
+func (l *legacyCoord) startStep3() { // spawn + register
+	id := l.nextID + 1
+	l.nextID, l.registered = id, id
+	l.live++
+}
+
+var _ = DescribeTable("respcoord stringers",
+	func(got, want string) { Expect(got).To(Equal(want)) },
+	Entry(nil, SourceClient.String(), "client"), // Source; 99 below hits the default branch
+	Entry(nil, SourceVAD.String(), "vad"),
+	Entry(nil, Source(99).String(), "Source(99)"),
+
+	Entry(nil, StatusCompleted.String(), "completed"), // Status; 99 below hits the default branch
+	Entry(nil, StatusCancelled.String(), "cancelled"),
+	Entry(nil, Status(99).String(), "Status(99)"),
+
+	Entry(nil, Idle{}.String(), "Idle"), // State
+	Entry(nil, Active{ID: 7}.String(), "Active(7)"),
+	Entry(nil, Terminated{}.String(), "Terminated"),
+
+	Entry(nil, Start{ID: 1, Source: SourceVAD}.String(), "Start(1,vad)"), // Event
+	Entry(nil, Finished{ID: 2}.String(), "Finished(2)"),
+	Entry(nil, Cancel{Source: SourceClient}.String(), "Cancel(client)"),
+	Entry(nil, Shutdown{}.String(), "Shutdown"),
+
+	Entry(nil, CancelResponse{ID: 3}.String(), "CancelResponse(3)"), // Effect
+	Entry(nil, StartResponse{ID: 4}.String(), "StartResponse(4)"),
+	Entry(nil, EmitTerminal{ID: 5, Status: StatusCompleted}.String(), "EmitTerminal(5,completed)"),
+)
+
+var _ = Describe("legacy dual-writer characterization", func() {
+	// Pins the exact interleaving in which the read-loop and the VAD goroutine
+	// both start a response and the machine ends up with TWO live responses. This
+	// is a characterization test for the bug: if a future change to the legacy
+	// model accidentally fixes it, this spec flips and we delete the legacy model.
+	// The production path uses respcoord.Coordinator, proven safe above.
+	It("can reach two live responses (the bug respcoord eliminates)", func() {
+		l := &legacyCoord{}
+
+		// First response established normally.
+		s := l.startStep1() // 0: nothing registered yet
+		l.startStep2(s)     // no-op for a zero snapshot
+		l.startStep3() // live=1, registered=1
+		Expect(l.live).To(Equal(1), "setup")
+
+		// The race: both goroutines snapshot the SAME active response (id 1)...
+		snapVAD := l.startStep1()    // 1
+		snapClient := l.startStep1() // 1
+
+		// ...both "cancel-and-wait" it. The first decrements; the second finds it
+		// already gone and does nothing.
+		l.startStep2(snapVAD)    // live=0, registered=0
+		l.startStep2(snapClient) // no-op: snapshot 1 no longer equals registered 0
+
+		// ...then both spawn their replacement.
+		l.startStep3() // live=1
+		l.startStep3() // live=2  <-- two live responses
+
+		Expect(l.live).To(Equal(2), "expected the legacy race to reach 2 live responses")
+	})
+})
--- a/core/http/endpoints/openai/ttscoord/ttscoord.go
+++ b/core/http/endpoints/openai/ttscoord/ttscoord.go
@@ -0,0 +1,150 @@
+// Package ttscoord is the explicit state machine for the realtime API's
+// TTS-pipeline lifecycle (machine "M5" in docs/design/realtime-state-machines.md).
+//
+// The realtime TTS pipeline (realtime_tts_pipeline.go) decouples synthesis from
+// LLM token generation: the token callback enqueues clauses, a single worker
+// goroutine synthesizes them in order, and wait() closes the queue and joins the
+// worker. In the legacy code the lifecycle is an implicit `closed bool` (guarded
+// by the pipeline mutex) plus a `done` channel closed once by the worker. Two
+// gaps: enqueue does NOT check `closed`, so a clause offered after wait() is
+// silently appended to a worker that may have already exited (dropped); and the
+// open/closed lifecycle is inferred from a bool rather than stored.
+//
+// This package makes the lifecycle explicit:
+//   - a sealed sum type for State (Open | Closing | Closed) — monotonic; illegal
+//     reversals are unrepresentable,
+//   - a total, pure transition function Next(state, event) -> (state, effects),
+//   - a single-writer Coordinator that serializes every transition.
+//
+// It is a genuine two-writer machine: the producer goroutine raises Close (from
+// wait()), and the worker goroutine raises WorkerExited when it has drained the
+// queue and seen the close — so serializing the transition matters. The poison
+// `failed` latch stays a lock-free atomic.Bool in the pipeline (it is read per
+// clause on the worker's hot path and is orthogonal to open/closed); this machine
+// owns only the open->closing->closed lifecycle.
+//
+// Guarantees the spec checks:
+//   - Close wakes the worker to exit exactly once (idempotent wait(); invariant
+//     #10),
+//   - the lifecycle is monotonic and Closed is terminal — so a clause is never
+//     accepted after close (enqueue is gated on Open) and the worker is joined
+//     exactly once (no leak; invariant #8).
+package ttscoord
+
+import (
+	"fmt"
+
+	"github.com/mudler/LocalAI/core/http/endpoints/openai/coordinator"
+)
+
+// State is the sealed sum type of TTS-pipeline lifecycle states. Exhaustively:
+// Open | Closing | Closed.
+type State interface {
+	isState()       // unexported marker method
+	String() string // rendered via %s in Next's unhandled-transition error
+}
+
+// Open: the worker is running and accepting clauses. New starts here.
+type Open struct{}
+
+// Closing: wait() has been called; the worker is draining the remaining queue and
+// will exit. No new clause is accepted.
+type Closing struct{}
+
+// Closed: the worker has exited (its done channel is closed). Terminal: Next maps every event back to Closed.
+type Closed struct{}
+
+func (Open) isState()    {}
+func (Closing) isState() {}
+func (Closed) isState()  {}
+
+func (Open) String() string    { return "Open" }
+func (Closing) String() string { return "Closing" }
+func (Closed) String() string  { return "Closed" }
+
+// Event is the sealed sum type of inputs. Exhaustively: Close | WorkerExited.
+type Event interface {
+	isEvent()
+	String() string
+}
+
+// Close is raised by the producer goroutine (wait()): close the queue and ask
+// the worker to finish. Idempotent: only the first Close (from Open) emits Wake.
+type Close struct{}
+
+// WorkerExited is raised by the worker goroutine once it has drained the queue
+// and seen the close, just before closing its done channel; Next answers Closed.
+type WorkerExited struct{}
+
+func (Close) isEvent()        {}
+func (WorkerExited) isEvent() {}
+
+func (Close) String() string        { return "Close" }
+func (WorkerExited) String() string { return "WorkerExited" }
+
+// Effect is a side effect Next returns as data for the sink to perform. Exhaustively: Wake.
+type Effect interface {
+	isEffect()
+	String() string
+}
+
+// Wake: signal the worker (via the buffered wake channel) so it re-checks the
+// lifecycle and exits. Next emits it only for Open + Close, never again afterwards.
+type Wake struct{}
+
+func (Wake) isEffect() {}
+
+func (Wake) String() string { return "Wake" }
+
+// Next is the pure, total transition function: for each (state, event) it
+// yields the successor state and the ordered effects. The error is non-nil
+// only for a State/Event implementation it does not know, in which case s is
+// returned unchanged. Every in-domain pair is defined; none is forbidden.
+//
+// The lifecycle only moves forward, Open -> Closing -> Closed. Wake is emitted
+// solely on Open -> Closing, so a repeated Close (idempotent wait()) is
+// absorbed. WorkerExited moves Open or Closing to Closed; Closed absorbs all.
+func Next(s State, e Event) (State, []Effect, error) {
+	// Classify the event first; one outside the sealed set fails in any state.
+	var exited bool
+	switch e.(type) {
+	case Close:
+	case WorkerExited:
+		exited = true
+	default:
+		return s, nil, fmt.Errorf("ttscoord: unhandled transition %s <- %s", s, e)
+	}
+
+	switch s.(type) {
+	case Open:
+		if exited {
+			// Defensive: worker gone while still Open (early close race); fully closed.
+			return Closed{}, nil, nil
+		}
+		return Closing{}, []Effect{Wake{}}, nil
+	case Closing:
+		if exited {
+			return Closed{}, nil, nil
+		}
+		// Idempotent wait(): already closing, no second wake.
+		return Closing{}, nil, nil
+	case Closed:
+		return Closed{}, nil, nil
+	}
+	return s, nil, fmt.Errorf("ttscoord: unhandled transition %s <- %s", s, e)
+}
+
+// EffectSink performs the effects produced by a transition. See coordinator.Sink:
+// Wake does a non-blocking send on a buffered channel, so Perform does not block
+// under the lock.
+type EffectSink = coordinator.Sink[Effect]
+
+// Coordinator serializes the TTS-pipeline transitions. The producer (Close) and
+// worker (WorkerExited) goroutines both call Apply, so the lock serializes the
+// two writers. See coordinator.Coordinator.
+type Coordinator = coordinator.Coordinator[State, Event, Effect]
+
+// New returns a Coordinator that starts in Open, steps with Next, and performs effects via sink.
+func New(sink EffectSink) *Coordinator {
+	return coordinator.New[State, Event, Effect](Open{}, Next, sink)
+}
--- a/core/http/endpoints/openai/ttscoord/ttscoord_suite_test.go
+++ b/core/http/endpoints/openai/ttscoord/ttscoord_suite_test.go
@@ -0,0 +1,13 @@
+package ttscoord
+
+import (
+	"testing"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+func TestTtscoord(t *testing.T) {
+	RegisterFailHandler(Fail)                   // route gomega assertion failures to ginkgo's Fail
+	RunSpecs(t, "ttscoord (realtime M5) Suite") // run the specs registered in this package
+}
--- a/core/http/endpoints/openai/ttscoord/ttscoord_test.go
+++ b/core/http/endpoints/openai/ttscoord/ttscoord_test.go
@@ -0,0 +1,165 @@
+package ttscoord
+
+import (
+	"math/rand/v2"
+	"sync"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// recordingSink captures the ordered stream of effects.
+type recordingSink struct {
+	mu  sync.Mutex
+	log []Effect
+}
+
+func (s *recordingSink) Perform(e Effect) {
+	s.mu.Lock()
+	s.log = append(s.log, e)
+	s.mu.Unlock()
+}
+
+func (s *recordingSink) wakes() int {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	n := 0
+	for _, e := range s.log {
+		if _, ok := e.(Wake); ok {
+			n++
+		}
+	}
+	return n
+}
+
+type unknownEvent struct{}
+
+func (unknownEvent) isEvent()       {}
+func (unknownEvent) String() string { return "unknownEvent" }
+
+type unknownState struct{}
+
+func (unknownState) isState()       {}
+func (unknownState) String() string { return "unknownState" }
+
+var _ = Describe("ttscoord.Next", func() {
+	DescribeTable("transitions",
+		func(state State, event Event, wantState State, wantEff []Effect) {
+			gotState, gotEff, err := Next(state, event)
+			Expect(err).NotTo(HaveOccurred())
+			Expect(gotState).To(Equal(wantState))
+			Expect(gotEff).To(Equal(wantEff))
+		},
+		Entry("open+close -> closing: wake",
+			Open{}, Close{}, Closing{}, []Effect{Wake{}}),
+		Entry("open+workerexited -> closed (defensive)",
+			Open{}, WorkerExited{}, Closed{}, []Effect(nil)),
+		Entry("closing+close -> closing, no-op (idempotent wait)",
+			Closing{}, Close{}, Closing{}, []Effect(nil)),
+		Entry("closing+workerexited -> closed",
+			Closing{}, WorkerExited{}, Closed{}, []Effect(nil)),
+		Entry("closed+close -> closed, no-op",
+			Closed{}, Close{}, Closed{}, []Effect(nil)),
+		Entry("closed+workerexited -> closed, no-op",
+			Closed{}, WorkerExited{}, Closed{}, []Effect(nil)),
+	)
+
+	It("is total over the defined (state, event) pairs", func() {
+		for _, s := range []State{Open{}, Closing{}, Closed{}} {
+			for _, e := range []Event{Close{}, WorkerExited{}} {
+				_, _, err := Next(s, e)
+				Expect(err).NotTo(HaveOccurred(), "Next(%s, %s)", s, e)
+			}
+		}
+	})
+
+	It("errors on an unknown event type", func() {
+		_, _, err := Next(Open{}, unknownEvent{})
+		Expect(err).To(HaveOccurred())
+	})
+
+	It("errors on an unknown state type", func() {
+		_, _, err := Next(unknownState{}, Close{})
+		Expect(err).To(HaveOccurred())
+	})
+})
+
+// phaseOf ranks a state by lifecycle order (Open < Closing < Closed; unknown = -1).
+func phaseOf(s State) int {
+	rank := -1
+	switch s.(type) {
+	case Open:
+		rank = 0
+	case Closing:
+		rank = 1
+	case Closed:
+		rank = 2
+	}
+	return rank
+}
+
+var _ = Describe("ttscoord.Coordinator", func() {
+	It("keeps the lifecycle monotonic and wakes at most once over random sequences", func() {
+		seeds := []uint64{1, 2, 3, 42, 1337, 0xC0FFEE}
+		for _, seed := range seeds {
+			r := rand.New(rand.NewPCG(seed, 0xA5A5A5A5))
+			sink := &recordingSink{}
+			c := New(sink)
+			prev := 0
+
+			for range 5000 {
+				if r.IntN(2) == 0 {
+					Expect(c.Apply(Close{})).To(Succeed())
+				} else {
+					Expect(c.Apply(WorkerExited{})).To(Succeed())
+				}
+				cur := phaseOf(c.State())
+				Expect(cur).To(BeNumerically(">=", prev), "seed=%d: lifecycle went backwards", seed)
+				prev = cur
+			}
+			Expect(sink.wakes()).To(BeNumerically("<=", 1), "seed=%d: woke more than once", seed)
+		}
+	})
+
+	// Two-writer test: a producer raises Close while the "worker" raises
+	// WorkerExited, the real concurrency. The lifecycle must stay monotonic and
+	// Wake must fire at most once. Run under -race.
+	It("is two-writer safe (producer Close vs worker WorkerExited)", func() {
+		const iterations = 200
+		for range iterations {
+			sink := &recordingSink{}
+			c := New(sink)
+			var wg sync.WaitGroup
+			wg.Add(2)
+			go func() { defer wg.Done(); _ = c.Apply(Close{}) }()
+			go func() { defer wg.Done(); _ = c.Apply(WorkerExited{}) }()
+			wg.Wait()
+			// After both, drive to terminal and assert idempotence.
+			_ = c.Apply(Close{})
+			_ = c.Apply(WorkerExited{})
+			Expect(c.State()).To(Equal(State(Closed{})))
+			Expect(sink.wakes()).To(BeNumerically("<=", 1))
+		}
+	})
+
+	It("only Open accepts (a gate query never panics across states)", func() {
+		// Mirrors the pipeline's enqueue gate: accepted iff Open.
+		sink := &recordingSink{}
+		c := New(sink)
+		_, open := c.State().(Open)
+		Expect(open).To(BeTrue())
+		Expect(c.Apply(Close{})).To(Succeed())
+		_, open = c.State().(Open)
+		Expect(open).To(BeFalse())
+	})
+})
+
+var _ = DescribeTable("ttscoord stringers",
+	func(got, want string) { Expect(got).To(Equal(want)) },
+	Entry(nil, Open{}.String(), "Open"),
+	Entry(nil, Closing{}.String(), "Closing"),
+	Entry(nil, Closed{}.String(), "Closed"),
+	Entry(nil, Close{}.String(), "Close"),
+	Entry(nil, WorkerExited{}.String(), "WorkerExited"),
+	Entry(nil, Wake{}.String(), "Wake"),
+)
--- a/core/http/endpoints/openai/turncoord/turncoord.go
+++ b/core/http/endpoints/openai/turncoord/turncoord.go
@@ -0,0 +1,255 @@
+// Package turncoord is the explicit state machine for the realtime API's
+// turn-detection concern (machine "M2" in
+// docs/design/realtime-state-machines.md).
+//
+// In the legacy code this machine is implicit and, worse, split across TWO
+// variables that can disagree: handleVAD's goroutine-local speechStarted bool
+// and the semantic_vad liveTurnState's "is the live stream open" flag
+// (lts.open()). They are set and cleared at separate points, so a discardTurn
+// (no-speech clear, a semantic->server mode switch mid-turn, or teardown)
+// closes the live stream but leaves speechStarted true. The two then disagree,
+// and the next speech onset is suppressed because `if !speechStarted` is false
+// — the user's next utterance silently produces no speech_started, no barge-in,
+// and no commit. See docs/design/realtime-state-machines.md, Part 2 (failure
+// mode 4) and the turn_lifecycle spec under formal-verification/.
+//
+// This package replaces that with:
+//   - a sealed sum type for State (illegal states are unrepresentable),
+//   - a total, pure transition function Next(state, event) -> (state, effects),
+//   - a single-writer Coordinator that serializes every transition.
+//
+// "Speech detected" and "a turn is open" become ONE state (Speaking), so they
+// can no longer fall out of sync: every path that ends a turn returns to Idle
+// and necessarily clears both. The design guarantees the invariants the specs
+// check:
+//   - speechStarted ⟺ a turn is open (Part 4, invariant #4) — structural here,
+//   - a barge-in cancel precedes the next turn's commit (you must pass through
+//     Speaking, which barges in on entry, before a Silence can commit),
+//   - every opened turn is finished (commit) or discarded (abort) exactly once.
+//
+// Unlike M3 (respcoord), which is a genuine dual-writer race, M2's turn
+// lifecycle is driven by the single handleVAD goroutine: the value here is
+// making the speechStarted/turn-open desync unrepresentable, not serializing
+// concurrent writers. The Coordinator still serializes transitions so that
+// State() is race-free and a teardown-time Abort from another goroutine (or a
+// future second writer) stays safe.
+//
+// Mode note: in server_vad mode there is no live ASR stream, so OpenTurn /
+// DiscardTurn have nothing to open or close — the sink performs them as no-ops
+// and "turn open" is satisfied vacuously. The state coupling (Speaking ⟺ turn
+// open) still holds; it is only semantic_vad that had two real variables to
+// desync.
+package turncoord
+
+import (
+	"fmt"
+
+	"github.com/mudler/LocalAI/core/http/endpoints/openai/coordinator"
+)
+
+// TurnID identifies one user turn. The caller mints it when speech begins (it
+// is the conversation item id the live caption deltas stream under, reused by
+// the committed event so the client replaces the partial text). Carrying it in
+// the state makes "commit/discard refer to the turn that was opened" explicit.
+type TurnID string
+
+// AbortReason records why a turn was dropped without committing. Like
+// respcoord.Source it is observability only — every reason aborts the same way;
+// keeping it in the event makes the distinct legacy discardTurn sites explicit
+// rather than collapsed into one anonymous code path.
+type AbortReason int
+
+const (
+	// AbortNoSpeech: the no-speech clear — the VAD found no segments and the
+	// buffer is past the holdback, so the inspected audio was not speech.
+	AbortNoSpeech AbortReason = iota
+	// AbortTeardown: the session is closing.
+	AbortTeardown
+)
+
+// NOTE: a semantic->server turn-detection switch mid-turn is deliberately NOT an
+// Abort: it only drops the orphaned live ASR stream and lets the turn continue
+// under server_vad (so a config change can't cut off a mid-utterance speaker).
+// That orphan cleanup stays inline in handleVAD; only the two reasons above end
+// a turn (return to Idle).
+
+func (r AbortReason) String() string {
+	if r == AbortNoSpeech {
+		return "no_speech"
+	}
+	if r == AbortTeardown {
+		return "teardown"
+	}
+	// Values outside the declared reasons render numerically.
+	return fmt.Sprintf("AbortReason(%d)", int(r))
+}
+
+// State is the sealed sum type of turn-detection states. The only
+// implementations are the marker-method structs in this file, so callers
+// outside the package cannot fabricate an out-of-band state. Exhaustively:
+// Idle | Speaking.
+type State interface {
+	isState()
+	String() string
+}
+
+// Idle: no turn is open and no speech is in progress (legacy: speechStarted ==
+// false AND the live stream is closed — here a single state, so they cannot
+// disagree).
+type Idle struct{}
+
+// Speaking: a turn is open and speech is in progress (legacy: speechStarted ==
+// true AND, in semantic mode, the live stream open). Turn is the open turn's id.
+type Speaking struct{ Turn TurnID }
+
+func (Idle) isState()     {}
+func (Speaking) isState() {}
+
+func (Idle) String() string       { return "Idle" }
+func (s Speaking) String() string { return fmt.Sprintf("Speaking(%s)", s.Turn) }
+
+// Event is the sealed sum type of inputs. Exhaustively: Onset | Silence | Abort.
+type Event interface {
+	isEvent()
+	String() string
+}
+
+// Onset reports that the VAD found speech this tick. Turn is the id to open the
+// turn under (allocated by the caller so caption deltas can stream immediately).
+// While already Speaking it is a no-op: re-detection of ongoing speech does not
+// reopen a turn (legacy `if !speechStarted`).
+type Onset struct{ Turn TurnID }
+
+// Silence reports VAD-confirmed silence past the dynamic commit threshold (the
+// end-of-speech commit trigger). The threshold itself — semantic_vad's EOU vs
+// eagerness fallback — is computed by the caller before raising this event; the
+// machine only sequences the commit. It is a no-op while Idle (nothing to
+// commit).
+type Silence struct{}
+
+// Abort drops the open turn without committing (no-speech clear or teardown; a
+// mid-turn mode switch is not an Abort). It is a no-op while Idle (nothing open).
+type Abort struct{ Reason AbortReason }
+
+func (Onset) isEvent()   {}
+func (Silence) isEvent() {}
+func (Abort) isEvent()   {}
+
+func (e Onset) String() string { return fmt.Sprintf("Onset(%s)", e.Turn) }
+func (Silence) String() string { return "Silence" }
+func (e Abort) String() string { return fmt.Sprintf("Abort(%s)", e.Reason) }
+
+// Effect is a side effect returned by Next as data for the caller to perform.
+// Returning effects as data (rather than firing callbacks inside the
+// transition) keeps Next pure and exhaustively testable. Exhaustively:
+// BargeIn | OpenTurn | EmitSpeechStarted | EmitSpeechStopped | CommitTurn |
+// DiscardTurn.
+type Effect interface {
+	isEffect()
+	String() string
+}
+
+// BargeIn: cancel any in-flight response (the M2->M3 edge). Emitted on the
+// Idle->Speaking onset, before the new turn can ever commit — so a barge-in
+// always precedes the next commit.
+type BargeIn struct{}
+
+// OpenTurn: open the live ASR stream for Turn (semantic_vad). No-op in
+// server_vad mode.
+type OpenTurn struct{ Turn TurnID }
+
+// EmitSpeechStarted: send input_audio_buffer.speech_started.
+type EmitSpeechStarted struct{}
+
+// EmitSpeechStopped: send input_audio_buffer.speech_stopped.
+type EmitSpeechStopped struct{}
+
+// CommitTurn: finalize the turn's live stream, emit input_audio_buffer.committed
+// for Turn, and issue the response (via respcoord). The completion of one turn.
+type CommitTurn struct{ Turn TurnID }
+
+// DiscardTurn: close the turn's live stream and retract any caption deltas
+// already shown for Turn (the failed transcription event). No commit, no
+// response.
+type DiscardTurn struct{ Turn TurnID }
+
+func (BargeIn) isEffect()           {}
+func (OpenTurn) isEffect()          {}
+func (EmitSpeechStarted) isEffect() {}
+func (EmitSpeechStopped) isEffect() {}
+func (CommitTurn) isEffect()        {}
+func (DiscardTurn) isEffect()       {}
+
+func (BargeIn) String() string           { return "BargeIn" }
+func (e OpenTurn) String() string        { return fmt.Sprintf("OpenTurn(%s)", e.Turn) }
+func (EmitSpeechStarted) String() string { return "EmitSpeechStarted" }
+func (EmitSpeechStopped) String() string { return "EmitSpeechStopped" }
+func (e CommitTurn) String() string      { return fmt.Sprintf("CommitTurn(%s)", e.Turn) }
+func (e DiscardTurn) String() string     { return fmt.Sprintf("DiscardTurn(%s)", e.Turn) }
+
+// Next is the total, pure transition function. For every (state, event) it
+// returns the next state and the ordered effects to perform. It returns a
+// non-nil error only for an unknown State/Event implementation (a programmer
+// error / future type added without updating this function) — callers must
+// surface that, never silently ignore it. Every in-domain (state, event) pair
+// is defined; there are no "forbidden" transitions, only no-ops for events that
+// don't apply to the current state.
+//
+// The crux of the fix is that both turn-ending transitions (Silence commit and
+// Abort) go to Idle, which carries no turn data: there is no way to clear "turn
+// open" while leaving "speech started" set, because they are the same state.
+// The legacy desync (discardTurn closed the live stream but left speechStarted
+// true) is therefore unrepresentable.
+//
+// Effect ordering on onset mirrors the live handleVAD: OpenTurn (start the live
+// stream), then BargeIn (cancel the prior response), then EmitSpeechStarted.
+func Next(s State, e Event) (State, []Effect, error) {
+	switch st := s.(type) {
+	case Idle:
+		switch ev := e.(type) {
+		case Onset: // Idle -> Speaking: the new turn takes the id carried by the event
+			return Speaking{Turn: ev.Turn}, []Effect{
+				OpenTurn{Turn: ev.Turn},
+				BargeIn{},
+				EmitSpeechStarted{},
+			}, nil
+		case Silence:
+			// Nothing in flight to commit: idempotent no-op.
+			return Idle{}, nil, nil
+		case Abort:
+			// No open turn: idempotent no-op (discardTurn on a closed stream).
+			return Idle{}, nil, nil
+		}
+	case Speaking:
+		switch e.(type) {
+		case Onset:
+			// Speech already in progress: re-detection does not reopen a turn
+			// or re-emit speech_started (legacy `if !speechStarted`). The turn
+			// id stays the one allocated at onset.
+			return Speaking{Turn: st.Turn}, nil, nil
+		case Silence: // end of speech: speech_stopped first, then commit the open turn
+			return Idle{}, []Effect{
+				EmitSpeechStopped{},
+				CommitTurn{Turn: st.Turn},
+			}, nil
+		case Abort: // Reason is not inspected here: every abort discards the same way
+			return Idle{}, []Effect{DiscardTurn{Turn: st.Turn}}, nil
+		}
+	}
+	return s, nil, fmt.Errorf("turncoord: unhandled transition %s <- %s", s, e) // unknown State or Event implementation
+}
+
+// EffectSink performs the effects produced by a transition. See coordinator.Sink
+// for the non-blocking contract: Perform runs under the coordinator lock, so it
+// must not block and must not re-enter Apply.
+type EffectSink = coordinator.Sink[Effect]
+
+// Coordinator serializes turn transitions. In practice the handleVAD goroutine is
+// the only writer, but serializing keeps State() race-free and a teardown-time
+// Abort from another goroutine safe. See coordinator.Coordinator.
+type Coordinator = coordinator.Coordinator[State, Event, Effect]
+
+// New returns an idle Coordinator that performs effects via sink.
+func New(sink EffectSink) *Coordinator {
+	return coordinator.New[State, Event, Effect](Idle{}, Next, sink)
+}
--- a/core/http/endpoints/openai/turncoord/turncoord_suite_test.go
+++ b/core/http/endpoints/openai/turncoord/turncoord_suite_test.go
@@ -0,0 +1,13 @@
+package turncoord
+
+import (
+	"testing"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+func TestTurncoord(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "turncoord (realtime M2) Suite")
+}
--- a/core/http/endpoints/openai/turncoord/turncoord_test.go
+++ b/core/http/endpoints/openai/turncoord/turncoord_test.go
@@ -0,0 +1,242 @@
+package turncoord
+
+import (
+	"fmt"
+	"math/rand/v2"
+	"sync"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// recordingSink captures the ordered stream of effects so the invariants can be
+// checked independently of the transition function's internals. Perform is
+// called by Coordinator.Apply under the coordinator lock, so it is already
+// serialized; the mutex here only guards reads from the spec goroutine.
+type recordingSink struct {
+	mu  sync.Mutex
+	log []Effect
+}
+
+func (s *recordingSink) Perform(e Effect) {
+	s.mu.Lock()
+	s.log = append(s.log, e)
+	s.mu.Unlock()
+}
+
+func (s *recordingSink) snapshot() []Effect {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	out := make([]Effect, len(s.log))
+	copy(out, s.log)
+	return out
+}
+
+// checkLog replays the effect log and asserts the turn-lifecycle safety
+// properties from docs/design/realtime-state-machines.md, Part 4 (invariant #4
+// and the discardTurn/speechStarted desync, failure mode 4):
+//
+//	(1) at most one turn open at any instant -- OpenTurn never fires while a
+//	    turn is already open;
+//	(2) every turn id is opened at most once;
+//	(3) no orphan close -- CommitTurn/DiscardTurn only fire on an open turn.
+//
+// The wire pairing of speech_started/speech_stopped is intentionally NOT
+// reconstructed here: like the legacy no-speech clear, an Abort discards the
+// turn without a speech_stopped (the failed-transcription event is its closure
+// signal). The guarantee this package adds is the *state* coupling (Speaking
+// <=> a turn is open), checked inline in the property spec below.
+func checkLog(log []Effect) {
+	open := false
+	opens := map[TurnID]int{}
+	for i, eff := range log {
+		switch e := eff.(type) {
+		case OpenTurn:
+			Expect(open).To(BeFalse(), "invariant (1): OpenTurn(%s) while a turn is already open (effect #%d)\nlog=%v", e.Turn, i, log)
+			open = true
+			opens[e.Turn]++
+			Expect(opens[e.Turn]).To(Equal(1), "invariant (2): turn %s opened %d times (effect #%d)\nlog=%v", e.Turn, opens[e.Turn], i, log)
+		case CommitTurn:
+			Expect(open).To(BeTrue(), "invariant (3): CommitTurn(%s) with no open turn (effect #%d)\nlog=%v", e.Turn, i, log)
+			open = false
+		case DiscardTurn:
+			Expect(open).To(BeTrue(), "invariant (3): DiscardTurn(%s) with no open turn (effect #%d)\nlog=%v", e.Turn, i, log)
+			open = false
+		}
+	}
+}
+
+// unknownEvent / unknownState exercise the defensive error path for a type that
+// Next does not know about (a future variant added without updating Next).
+type unknownEvent struct{}
+
+func (unknownEvent) isEvent()       {}
+func (unknownEvent) String() string { return "unknownEvent" }
+
+type unknownState struct{}
+
+func (unknownState) isState()       {}
+func (unknownState) String() string { return "unknownState" }
+
+var _ = Describe("turncoord.Next", func() {
+	// DescribeTable exhaustively pins every (state, event) cell of the pure
+	// transition function, including the idle no-op cells. This is the practical
+	// stand-in for "no transition leads to an inconsistent state": if a cell
+	// changes, this table must change with it.
+	DescribeTable("transitions",
+		func(state State, event Event, wantState State, wantEff []Effect) {
+			gotState, gotEff, err := Next(state, event)
+			Expect(err).NotTo(HaveOccurred())
+			Expect(gotState).To(Equal(wantState))
+			Expect(gotEff).To(Equal(wantEff))
+		},
+		Entry("idle+onset -> speaking: open, barge-in, speech_started",
+			Idle{}, Onset{Turn: "t1"},
+			Speaking{Turn: "t1"},
+			[]Effect{OpenTurn{Turn: "t1"}, BargeIn{}, EmitSpeechStarted{}}),
+		Entry("idle+silence -> idle, no-op (nothing to commit)",
+			Idle{}, Silence{},
+			Idle{}, []Effect(nil)),
+		Entry("idle+abort -> idle, no-op (nothing open)",
+			Idle{}, Abort{Reason: AbortNoSpeech},
+			Idle{}, []Effect(nil)),
+		Entry("speaking+onset -> stay speaking, no-op (already speaking)",
+			Speaking{Turn: "t1"}, Onset{Turn: "t2"}, // a fresh id is ignored mid-turn
+			Speaking{Turn: "t1"}, []Effect(nil)),
+		Entry("speaking+silence -> idle: speech_stopped + commit",
+			Speaking{Turn: "t1"}, Silence{},
+			Idle{}, []Effect{EmitSpeechStopped{}, CommitTurn{Turn: "t1"}}),
+		Entry("speaking+abort(no_speech) -> idle: discard",
+			Speaking{Turn: "t1"}, Abort{Reason: AbortNoSpeech},
+			Idle{}, []Effect{DiscardTurn{Turn: "t1"}}),
+		Entry("speaking+abort(teardown) -> idle: discard",
+			Speaking{Turn: "t9"}, Abort{Reason: AbortTeardown},
+			Idle{}, []Effect{DiscardTurn{Turn: "t9"}}),
+	)
+
+	It("is total: every defined (state, event) pair is handled without error", func() {
+		states := []State{Idle{}, Speaking{Turn: "t1"}}
+		events := []Event{
+			Onset{Turn: "t2"},
+			Silence{},
+			Abort{Reason: AbortNoSpeech},
+			Abort{Reason: AbortTeardown},
+		}
+		for _, s := range states {
+			for _, e := range events {
+				_, _, err := Next(s, e)
+				Expect(err).NotTo(HaveOccurred(), "Next(%s, %s)", s, e)
+			}
+		}
+	})
+
+	It("errors on an unknown event type", func() {
+		_, _, err := Next(Speaking{Turn: "t1"}, unknownEvent{})
+		Expect(err).To(HaveOccurred())
+	})
+
+	It("errors on an unknown state type", func() {
+		_, _, err := Next(unknownState{}, Onset{Turn: "t1"})
+		Expect(err).To(HaveOccurred())
+	})
+})
+
+var _ = Describe("turncoord.Coordinator", func() {
+	// This replaces the previous rapid stateful test: a seeded random walk over
+	// the event space, asserting after every step both the log invariants and
+	// the core state coupling -- the machine is in Speaking IFF a turn is
+	// currently open. That coupling is the whole point of M2: in the legacy code
+	// speechStarted and the live-stream-open flag were separate variables a
+	// discard could desync; here they are one state and cannot. Seeds are fixed
+	// so any failure reproduces deterministically (the failing seed/step is in
+	// the assertion message).
+	It("keeps state coupled to turn-open over random event sequences", func() {
+		seeds := []uint64{1, 2, 3, 42, 1337, 0xC0FFEE}
+		for _, seed := range seeds {
+			r := rand.New(rand.NewPCG(seed, 0xA5A5A5A5))
+			sink := &recordingSink{}
+			c := New(sink)
+			var nextTurn uint64
+			open := false // independent model of "is a turn open"
+
+			for step := range 5000 {
+				switch r.IntN(3) {
+				case 0:
+					nextTurn++
+					Expect(c.Apply(Onset{Turn: TurnID(fmt.Sprintf("t%d", nextTurn))})).To(Succeed())
+					open = true // onset opens a turn (or is a no-op if already open)
+				case 1:
+					Expect(c.Apply(Silence{})).To(Succeed())
+					open = false // commit (or no-op if already idle)
+				case 2:
+					Expect(c.Apply(Abort{Reason: AbortReason(r.IntN(2))})).To(Succeed())
+					open = false // discard (or no-op if already idle)
+				}
+				_, speaking := c.State().(Speaking)
+				Expect(speaking).To(Equal(open), "coupling: seed=%d step=%d state=%s", seed, step, c.State())
+			}
+			checkLog(sink.snapshot())
+		}
+	})
+
+	// M2 is single-writer in practice (handleVAD), but teardown can Abort from
+	// another goroutine, so the Coordinator must be race-safe. Run under -race;
+	// the log invariants must hold regardless of interleaving.
+	It("is race-safe under concurrent Apply from two goroutines", func() {
+		const perGoroutine = 2000
+		sink := &recordingSink{}
+		c := New(sink)
+
+		var idCounter uint64
+		var idMu sync.Mutex
+		nextTurn := func() TurnID {
+			idMu.Lock()
+			defer idMu.Unlock()
+			idCounter++
+			return TurnID(fmt.Sprintf("t%d", idCounter))
+		}
+
+		var wg sync.WaitGroup
+		drive := func(reason AbortReason) {
+			defer wg.Done()
+			for i := range perGoroutine {
+				switch i % 3 {
+				case 0:
+					_ = c.Apply(Onset{Turn: nextTurn()})
+				case 1:
+					_ = c.Apply(Silence{})
+				case 2:
+					_ = c.Apply(Abort{Reason: reason})
+				}
+			}
+		}
+
+		wg.Add(2)
+		go drive(AbortNoSpeech)
+		go drive(AbortTeardown)
+		wg.Wait()
+
+		checkLog(sink.snapshot())
+	})
+})
+
+var _ = DescribeTable("turncoord stringers",
+	func(got, want string) { Expect(got).To(Equal(want)) },
+	Entry(nil, AbortNoSpeech.String(), "no_speech"),
+	Entry(nil, AbortTeardown.String(), "teardown"),
+	Entry(nil, AbortReason(99).String(), "AbortReason(99)"),
+
+	Entry(nil, Idle{}.String(), "Idle"),
+	Entry(nil, Speaking{Turn: "t7"}.String(), "Speaking(t7)"),
+
+	Entry(nil, Onset{Turn: "t1"}.String(), "Onset(t1)"),
+	Entry(nil, Silence{}.String(), "Silence"),
+	Entry(nil, Abort{Reason: AbortTeardown}.String(), "Abort(teardown)"),
+
+	Entry(nil, BargeIn{}.String(), "BargeIn"),
+	Entry(nil, OpenTurn{Turn: "t2"}.String(), "OpenTurn(t2)"),
+	Entry(nil, EmitSpeechStarted{}.String(), "EmitSpeechStarted"),
+	Entry(nil, EmitSpeechStopped{}.String(), "EmitSpeechStopped"),
+	Entry(nil, CommitTurn{Turn: "t3"}.String(), "CommitTurn(t3)"),
+	Entry(nil, DiscardTurn{Turn: "t4"}.String(), "DiscardTurn(t4)"),
+)
--- a/core/http/react-ui/e2e/traces-audio.spec.js
+++ b/core/http/react-ui/e2e/traces-audio.spec.js
@@ -0,0 +1,87 @@
+import { test, expect } from './coverage-fixtures.js'
+
+// Audio snippets on the Traces page must play through a blob: object URL —
+// the CSP's connect-src allows blob: but not data:, and the waveform peaks
+// renderer fetch()es the player src — and must degrade to a readable note
+// (not a broken player) when the stored payload is the "<truncated: N bytes>"
+// marker an older server stamped into oversized fields.
+
+// Minimal valid 16 kHz mono 16-bit PCM WAV (0.1s 440 Hz sine), base64-encoded.
+function wavBase64(samples = 1600, rate = 16000) {
+  const dataSize = samples * 2
+  const buf = Buffer.alloc(44 + dataSize)
+  buf.write('RIFF', 0)
+  buf.writeUInt32LE(36 + dataSize, 4)
+  buf.write('WAVE', 8)
+  buf.write('fmt ', 12)
+  buf.writeUInt32LE(16, 16)
+  buf.writeUInt16LE(1, 20) // PCM
+  buf.writeUInt16LE(1, 22) // mono
+  buf.writeUInt32LE(rate, 24)
+  buf.writeUInt32LE(rate * 2, 28)
+  buf.writeUInt16LE(2, 32)
+  buf.writeUInt16LE(16, 34)
+  buf.write('data', 36)
+  buf.writeUInt32LE(dataSize, 40)
+  for (let i = 0; i < samples; i++) {
+    buf.writeInt16LE(Math.round(8000 * Math.sin((2 * Math.PI * 440 * i) / rate)), 44 + i * 2)
+  }
+  return buf.toString('base64')
+}
+
+function transcriptionTrace(audioWavBase64) {
+  return {
+    type: 'transcription',
+    timestamp: Date.now() * 1_000_000,
+    model_name: 'parakeet-test',
+    summary: 'transcribed utterance',
+    duration: 500_000_000,
+    error: null,
+    data: {
+      audio_wav_base64: audioWavBase64,
+      audio_duration_s: 0.1,
+      audio_snippet_s: 0.1,
+      audio_sample_rate: 16000,
+      audio_samples: 1600,
+      audio_rms_dbfs: -12.0,
+      audio_peak_dbfs: -6.0,
+      audio_dc_offset: 0,
+    },
+  }
+}
+
+async function openBackendTraceRow(page, traces) {
+  await page.route('**/api/traces', (route) => {
+    route.fulfill({ contentType: 'application/json', body: JSON.stringify([]) })
+  })
+  await page.route('**/api/backend-traces', (route) => {
+    route.fulfill({ contentType: 'application/json', body: JSON.stringify(traces) })
+  })
+  await page.goto('/app/traces')
+  await expect(page.locator('text=Tracing is')).toBeVisible({ timeout: 10_000 })
+  await page.locator('button', { hasText: 'Backend Traces' }).click()
+  await page.locator('td', { hasText: 'parakeet-test' }).first().click()
+}
+
+test.describe('Traces - Audio Snippets', () => {
+  test('plays a clip through a blob: URL, not a CSP-blocked data: URL', async ({ page }) => {
+    await openBackendTraceRow(page, [transcriptionTrace(wavBase64())])
+
+    // The expanded row carries the snippet metrics and a player whose source
+    // is an object URL (connect-src allows blob:, so the peaks fetch works).
+    await expect(page.locator('text=Audio Snippet')).toBeVisible()
+    const audio = page.locator('audio')
+    await expect(audio).toHaveCount(1)
+    const src = await audio.getAttribute('src')
+    expect(src).toMatch(/^blob:/)
+    await expect(page.getByTestId('audio-snippet-unavailable')).toHaveCount(0)
+  })
+
+  test('shows a readable note instead of a broken player for truncated payloads', async ({ page }) => {
+    await openBackendTraceRow(page, [transcriptionTrace('<truncated: 281660 bytes>')])
+
+    await expect(page.locator('text=Audio Snippet')).toBeVisible()
+    await expect(page.getByTestId('audio-snippet-unavailable')).toBeVisible()
+    await expect(page.locator('audio')).toHaveCount(0)
+  })
+})
--- a/core/http/react-ui/src/pages/Talk.jsx
+++ b/core/http/react-ui/src/pages/Talk.jsx
@@ -19,24 +19,31 @@ const STATUS_STYLES = {
  error:        { icon: 'fa-solid fa-circle', color: 'var(--color-error)', bg: 'var(--color-error-light)' },
 }

-// upsertAssistant merges a streamed transcript fragment into the assistant entry
-// identified by the server's item_id, or appends a new entry if none exists yet.
-// Keying by item_id (not a mutable index tracked across handler/updater
-// boundaries) makes streamed deltas idempotent and order-independent, so React's
-// batching of non-React data-channel events cannot produce a duplicate bubble.
-// mode 'append' adds to the running text; 'replace' sets the final transcript.
-function upsertAssistant(prev, itemId, text, mode) {
-  // Only assistant entries carry an id, and the streaming entry is almost
-  // always the newest — search from the tail so per-delta cost stays constant.
+// upsertEntry merges a streamed transcript fragment into the entry identified
+// by the server's item_id, or appends a new entry (with the given role) if
+// none exists yet. Keying by item_id (not a mutable index tracked across
+// handler/updater boundaries) makes streamed deltas idempotent and
+// order-independent, so React's batching of non-React data-channel events
+// cannot produce a duplicate bubble. mode 'append' adds to the running text;
+// 'replace' sets the final transcript — the server sends a completed event
+// whose authoritative text supersedes any live captions (e.g. the
+// semantic_vad retranscribe gate's batch decode).
+function upsertEntry(prev, itemId, role, text, mode) {
+  // The streaming entry is almost always the newest — search from the tail
+  // so per-delta cost stays constant.
  const i = prev.findLastIndex(e => e.id === itemId)
  if (i === -1) {
-    return [...prev, { role: 'assistant', id: itemId, text }]
+    return [...prev, { role, id: itemId, text }]
  }
  const next = [...prev]
  next[i] = { ...next[i], text: mode === 'append' ? next[i].text + text : text }
  return next
 }

+function upsertAssistant(prev, itemId, text, mode) {
+  return upsertEntry(prev, itemId, 'assistant', text, mode)
+}
+
 export default function Talk() {
  const { addToast } = useOutletContext()
  const navigate = useNavigate()
@@ -252,12 +259,33 @@ export default function Talk() {
      case 'input_audio_buffer.speech_stopped':
        updateStatus('thinking', 'Processing...')
        break
+      case 'conversation.item.input_audio_transcription.delta':
+        // Live captions: semantic_vad streams the user's words while they
+        // are still speaking, keyed by the item id the commit will reuse.
+        if (event.delta && event.item_id) {
+          setTranscript(prev => upsertEntry(prev, event.item_id, 'user', event.delta, 'append'))
+        }
+        break
      case 'conversation.item.input_audio_transcription.completed':
        if (event.transcript) {
-          setTranscript(prev => [...prev, { role: 'user', text: event.transcript }])
+          if (event.item_id) {
+            // Replaces any live captions with the authoritative transcript
+            // (which may differ, e.g. the retranscribe gate's batch decode);
+            // creates the entry when there were none (server_vad).
+            setTranscript(prev => upsertEntry(prev, event.item_id, 'user', event.transcript, 'replace'))
+          } else {
+            setTranscript(prev => [...prev, { role: 'user', text: event.transcript }])
+          }
        }
        updateStatus('thinking', 'Generating response...')
        break
+      case 'conversation.item.input_audio_transcription.failed':
+        // The turn was discarded after captions were shown (e.g. the buffer
+        // was cleared as silence) — retract the partial entry.
+        if (event.item_id) {
+          setTranscript(prev => prev.filter(e => e.id !== event.item_id))
+        }
+        break
      case 'response.output_audio_transcript.delta':
        if (event.delta) {
          inProgressIdRef.current = event.item_id
@@ -712,7 +740,7 @@ export default function Talk() {
          )}
          {selectedModelInfo && !selectedModelInfo.self_contained && (
            <div style={{
-              display: 'grid', gridTemplateColumns: 'repeat(4, minmax(0, 1fr))', gap: 'var(--spacing-xs)',
+              display: 'flex', flexDirection: 'column', gap: 'var(--spacing-xs)',
              marginBottom: 'var(--spacing-xs)', fontSize: '0.75rem',
            }}>
              {[
@@ -724,9 +752,12 @@ export default function Talk() {
                <div key={item.label} style={{
                  background: 'var(--color-bg-secondary)', borderRadius: 'var(--radius-sm)',
                  padding: 'var(--spacing-xs)', border: '1px solid var(--color-border)',
+                  display: 'flex', alignItems: 'baseline', gap: 'var(--spacing-sm)',
                }}>
-                  <div style={{ color: 'var(--color-text-secondary)', marginBottom: 2 }}>{item.label}</div>
-                  <div style={{ fontFamily: 'var(--font-mono)', overflow: 'hidden', textOverflow: 'ellipsis', whiteSpace: 'nowrap' }}>{item.value}</div>
+                  <div style={{ color: 'var(--color-text-secondary)', whiteSpace: 'nowrap' }}>{item.label}</div>
+                  {/* full width for the value; wrap rather than overflow when the
+                      model name is long (minWidth:0 lets the flex item shrink) */}
+                  <div style={{ fontFamily: 'var(--font-mono)', minWidth: 0, marginLeft: 'auto', textAlign: 'right', overflowWrap: 'anywhere' }}>{item.value || '—'}</div>
                </div>
              ))}
            </div>
--- a/core/http/react-ui/src/pages/Traces.jsx
+++ b/core/http/react-ui/src/pages/Traces.jsx
@@ -86,8 +86,40 @@ function typeBadgeStyle(type) {
  return { background: c.bg, color: c.color, padding: '2px 8px', borderRadius: 'var(--radius-sm)', fontSize: '0.75rem', fontWeight: 500 }
 }

+// useWavObjectURL — decode a base64 WAV payload into a blob: object URL for
+// the waveform player. A data: URL would render in <audio> (media-src allows
+// data:) but the peaks renderer fetch()es the src and the CSP's connect-src
+// only allows blob:, so playback broke with a CSP violation. Decoding to a
+// Blob also tolerates payloads that aren't valid base64 — e.g. the
+// "<truncated: N bytes>" marker older servers stamped into oversized fields —
+// by yielding null instead of a broken player.
+function useWavObjectURL(b64) {
+  const [url, setUrl] = useState(null)
+  useEffect(() => {
+    if (!b64) {
+      setUrl(null)
+      return undefined
+    }
+    let objectUrl = null
+    try {
+      const bin = atob(b64)
+      const bytes = new Uint8Array(bin.length)
+      for (let i = 0; i < bin.length; i++) bytes[i] = bin.charCodeAt(i)
+      objectUrl = URL.createObjectURL(new Blob([bytes], { type: 'audio/wav' }))
+      setUrl(objectUrl)
+    } catch {
+      setUrl(null)
+    }
+    return () => {
+      if (objectUrl) URL.revokeObjectURL(objectUrl)
+    }
+  }, [b64])
+  return url
+}
+
 // Audio player + metrics for transcription traces
 function AudioSnippet({ data }) {
+  const audioUrl = useWavObjectURL(data?.audio_wav_base64)
  if (!data?.audio_wav_base64) return null
  const metrics = [
    { label: 'Duration', value: data.audio_duration_s + 's' },
@@ -104,7 +136,11 @@ function AudioSnippet({ data }) {
        <i className="fas fa-headphones" style={{ color: 'var(--color-primary)' }} /> Audio Snippet
      </h4>
      <div style={{ background: 'var(--color-bg-primary)', border: '1px solid var(--color-border)', borderRadius: 'var(--radius-md)', padding: 'var(--spacing-sm)' }}>
-        <WaveformPlayer src={`data:audio/wav;base64,${data.audio_wav_base64}`} height={64} />
+        {audioUrl
+          ? <WaveformPlayer src={audioUrl} height={64} />
+          : <div data-testid="audio-snippet-unavailable" style={{ fontSize: '0.75rem', color: 'var(--color-text-secondary)', padding: 'var(--spacing-xs)' }}>
+              <i className="fas fa-triangle-exclamation" /> Audio clip not playable — it was truncated when recorded (raise Max Body Bytes in the tracing settings).
+            </div>}
        <div style={{ display: 'grid', gridTemplateColumns: 'repeat(auto-fill, minmax(120px, 1fr))', gap: 'var(--spacing-xs)', fontSize: '0.75rem', marginTop: 'var(--spacing-sm)' }}>
          {metrics.map(m => (
            <div key={m.label} style={{ background: 'var(--color-bg-secondary)', borderRadius: 'var(--radius-sm)', padding: 'var(--spacing-xs)' }}>
--- a/core/schema/transcription.go
+++ b/core/schema/transcription.go
@@ -24,6 +24,11 @@ type TranscriptionResult struct {
 	Text     string                 `json:"text"`
 	Language string                 `json:"language,omitempty"`
 	Duration float64                `json:"duration,omitempty"`
+	// Eou reports that the decode ended on the model's end-of-utterance
+	// special token (emitted by streaming-EOU models such as
+	// parakeet_realtime_eou_120m-v1; always false elsewhere). The marker
+	// itself never appears in Text.
+	Eou bool `json:"eou,omitempty"`
 }

 type TranscriptionSegmentSeconds struct {
--- a/core/services/nodes/health_mock_test.go
+++ b/core/services/nodes/health_mock_test.go
@@ -241,6 +241,9 @@ func (c *fakeBackendClient) AudioTransformStream(_ context.Context, _ ...ggrpc.C
 func (c *fakeBackendClient) AudioToAudioStream(_ context.Context, _ ...ggrpc.CallOption) (grpc.AudioToAudioStreamClient, error) {
 	return nil, nil
 }
+func (c *fakeBackendClient) AudioTranscriptionLive(_ context.Context, _ ...ggrpc.CallOption) (grpc.AudioTranscriptionLiveClient, error) {
+	return nil, nil
+}
 func (c *fakeBackendClient) Forward(_ context.Context, _ ...ggrpc.CallOption) (grpc.ForwardClient, error) {
 	return nil, nil
 }
--- a/core/services/nodes/inflight_test.go
+++ b/core/services/nodes/inflight_test.go
@@ -195,6 +195,10 @@ func (f *fakeGRPCBackend) AudioToAudioStream(_ context.Context, _ ...ggrpc.CallO
 	return nil, nil
 }

+func (f *fakeGRPCBackend) AudioTranscriptionLive(_ context.Context, _ ...ggrpc.CallOption) (grpc.AudioTranscriptionLiveClient, error) {
+	return nil, nil
+}
+
 func (f *fakeGRPCBackend) Forward(_ context.Context, _ ...ggrpc.CallOption) (grpc.ForwardClient, error) {
 	return nil, nil
 }
--- a/core/trace/backend_trace.go
+++ b/core/trace/backend_trace.go
@@ -75,8 +75,8 @@ var (
 // trace) or any TTS run (~1.3 MiB of audio_wav_base64 per trace) blows the
 // payload past tens of MiB and locks the Traces page in a loading state.
 //
-// 0 disables the cap. Set on the first InitBackendTracingIfEnabled call only,
-// matching the sync.Once-guarded maxItems semantics.
+// 0 disables the cap. Guarded by backendMu; refreshed on EVERY
+// InitBackendTracingIfEnabled call — see below.
 var backendMaxBodyBytes int

 func InitBackendTracingIfEnabled(maxItems, maxBodyBytes int) {
@@ -86,7 +86,6 @@ func InitBackendTracingIfEnabled(maxItems, maxBodyBytes int) {
 		}
 		backendMu.Lock()
 		backendTraceBuffer = circularbuffer.New[*BackendTrace](maxItems)
-		backendMaxBodyBytes = maxBodyBytes
 		backendMu.Unlock()

 		go func() {
@@ -99,11 +98,26 @@ func InitBackendTracingIfEnabled(maxItems, maxBodyBytes int) {
 			}
 		}()
 	})
+
+	// The body cap tracks the LATEST call, not the first: tracing_max_body_bytes
+	// is runtime-mutable via the settings API (ApplyRuntimeSettings), and every
+	// recording path calls this right before RecordBackendTrace with the current
+	// appConfig value. Freezing the cap on first init meant a raised setting let
+	// producers (e.g. trace.AudioSnippet, which reads the live value) embed
+	// payloads that this recorder then stomped with the "<truncated: N bytes>"
+	// marker — corrupting audio_wav_base64 into an unplayable string. maxItems
+	// keeps first-call semantics: resizing the ring buffer would drop entries.
+	backendMu.Lock()
+	backendMaxBodyBytes = maxBodyBytes
+	backendMu.Unlock()
 }

 func RecordBackendTrace(t BackendTrace) {
-	if t.Data != nil && backendMaxBodyBytes > 0 {
-		t.Data = capDataStrings(t.Data, backendMaxBodyBytes)
+	backendMu.Lock()
+	maxBody := backendMaxBodyBytes
+	backendMu.Unlock()
+	if t.Data != nil && maxBody > 0 {
+		t.Data = capDataStrings(t.Data, maxBody)
 	}
 	select {
 	case backendLogChan <- &t:
--- a/core/trace/backend_trace_cap_test.go
+++ b/core/trace/backend_trace_cap_test.go
@@ -28,8 +28,9 @@ const (

 var _ = Describe("RecordBackendTrace Data capping", func() {
 	BeforeEach(func() {
-		// Init is sync.Once so the first test wins; subsequent tests just
-		// clear the buffer. The cap value below has to match the first call.
+		// The ring buffer is allocated once (sync.Once) but the body cap
+		// follows the latest call, so each spec re-establishes smallCap here
+		// regardless of what a previous spec set.
 		trace.InitBackendTracingIfEnabled(64, smallCap)
 		trace.ClearBackendTraces()
 	})
@@ -131,6 +132,30 @@ var _ = Describe("RecordBackendTrace Data capping", func() {
 		got := trace.GetBackendTraces()[0]
 		Expect(got.Data["messages"]).To(Equal(preTruncated))
 	})
+
+	It("applies a runtime-raised cap without a restart", func() {
+		// tracing_max_body_bytes is runtime-mutable via the settings API.
+		// Producers like AudioSnippet read the live value, so the recorder
+		// must too — under the old first-call-wins behaviour a raised cap
+		// kept truncating audio_wav_base64 payloads the producer had already
+		// let through, corrupting them into "<truncated: N bytes>" markers.
+		oversizedForOldCap := strings.Repeat("w", smallCap*4)
+
+		trace.InitBackendTracingIfEnabled(64, smallCap*8) // simulate the settings raise
+		trace.RecordBackendTrace(trace.BackendTrace{
+			Timestamp: time.Now(),
+			Type:      trace.BackendTraceTranscription,
+			ModelName: "m",
+			Data: map[string]any{
+				"audio_wav_base64": oversizedForOldCap,
+			},
+		})
+
+		Eventually(trace.GetBackendTraces).Should(HaveLen(1))
+		got := trace.GetBackendTraces()[0]
+		Expect(got.Data["audio_wav_base64"]).To(Equal(oversizedForOldCap),
+			"a payload under the raised cap must survive intact")
+	})
 })

 var _ = Describe("TruncateToBytes", func() {
--- a/coverage-baseline.txt
+++ b/coverage-baseline.txt
@@ -1 +1 @@
-45.0
+48.5
--- a/docs/content/features/openai-realtime.md
+++ b/docs/content/features/openai-realtime.md
@@ -56,6 +56,41 @@ pipeline:

 All streaming flags are off by default, so existing pipelines are unaffected.

+### Turn detection
+
+Turn detection decides when the user has finished speaking and the pipeline should respond. Two modes are supported, matching the OpenAI session schema:
+
+- **`server_vad`** (default): silence-based. The VAD model watches the audio and the turn commits after `silence_duration_ms` (default 500 ms) of silence. Simple and model-agnostic, but a fixed silence window must trade interrupting mid-sentence pauses against sluggish responses.
+- **`semantic_vad`**: model-driven. The transcription model itself signals end-of-utterance and the silence window becomes dynamic: short right after the model emits its end-of-utterance token, much longer when it does not — so pausing to think no longer gets cut off, while finished sentences get a fast response.
+
+`semantic_vad` requires a transcription model that emits an end-of-utterance token over a cache-aware streaming decode — currently `parakeet-cpp-realtime_eou_120m-v1` (the model is trained to distinguish "paused, expecting a reply" from "paused mid-thought"). The realtime pipeline feeds it the microphone audio live while the user speaks. With any other transcription backend the session degrades gracefully to silence-only detection using the eagerness timeout below (a warning is logged once). The model also emits a distinct end-of-backchannel token (`<EOB>`) for short acknowledgments like "uh-huh": those are transcribed but never treated as the user yielding the turn.
+
+Sessions can opt in via `session.update` (`turn_detection: {"type": "semantic_vad", "eagerness": "medium"}`), or the pipeline can set a server-side default so clients need no changes:
+
+```yaml
+name: gpt-realtime
+pipeline:
+  vad: silero-vad-ggml
+  transcription: parakeet-cpp-realtime_eou_120m-v1
+  llm: qwen3-4b
+  tts: tts-1
+  turn_detection:
+    type: semantic_vad   # default for sessions on this model (server_vad if unset)
+    eagerness: medium    # low | medium | high | auto (auto == medium)
+    retranscribe: false  # see below
+```
+
+A client `session.update` still overrides `type` and `eagerness` per session.
+
+**Eagerness** sets the fallback silence window used when no end-of-utterance token was seen (the model missed it, or the user genuinely trails off): `low` waits 8 s, `medium`/`auto` 4 s, `high` 2 s — the same max-timeout semantics OpenAI documents. After the token is seen, the turn commits on the next VAD tick (~300 ms).
+
+**Live captions**: while the user speaks, `semantic_vad` streams `conversation.item.input_audio_transcription.delta` events under the item id the commit will later reuse, so clients can render the words as they are recognized. The `completed` event at commit carries the authoritative transcript and replaces the partial text (with `retranscribe: true` it may differ from the captions); a turn discarded before commit emits `conversation.item.input_audio_transcription.failed` so clients can retract its captions.
+
+**`retranscribe`** (server-side only, semantic_vad only) cross-checks the streaming decode against a batch decode at commit time:
+
+- `false` (default): the transcript accumulated from the live stream is used as-is — the model runs once per utterance and the LLM starts immediately at commit.
+- `true`: the committed audio is re-transcribed offline. If the batch decode also ends with the end-of-utterance token the turn proceeds (using the batch transcript); if it does **not**, the commit is cancelled and the session keeps listening — treating the streaming token as a false positive. Both transcripts are compared and logged, which makes this mode a useful diagnostic for how well the streaming and batch decodes align, at the cost of one extra decode per turn.
+
 ### Disabling thinking

 For reasoning models, you can force the pipeline LLM's thinking off without editing the LLM model config:
--- a/docs/design/realtime-state-machines.md
+++ b/docs/design/realtime-state-machines.md
@@ -0,0 +1,603 @@
+# Realtime API state machines — map & re-architecture research
+
+Status: research / design (compaction phase). No code changes implied yet.
+
+The realtime API (`core/http/endpoints/openai/realtime*.go`) grew feature-by-feature
+(server_vad → semantic_vad/EOU, streaming pipeline, tool turns, compaction, voice
+gate, sound detection, WebRTC). The result is several **implicit** state machines
+whose states and transitions are scattered across goroutine-local variables, shared
+`Session`/`Conversation` fields under five different mutexes, raw channels, and
+`context` cancellation. State is *inferred* from variable combinations rather than
+*stored*; several illegal/inconsistent states are reachable.
+
+This document (1) inventories the implicit machines, (2) catalogues the cross-cutting
+failure modes, (3) researches how to re-implement them explicitly and verifiably, and
+(4) lists the invariants a correct implementation must guarantee.
+
+All line numbers are against the current `feat/realtime-semantic-vad-eou` branch and
+will drift; treat them as anchors.
+
+---
+
+## Part 1 — Inventory of the implicit state machines
+
+There is **no `state`/`status` field anywhere** in `Session` or `Conversation`. Every
+machine below is reconstructed from variable combinations.
+
+### M1. Connection / transport lifecycle
+
+Two transports implement one `Transport` interface; their lifecycles differ sharply.
+
+- **WebSocket** (`realtime_transport_ws.go`): essentially stateless — a `*websocket.Conn`
+  plus a write `sync.Mutex`. No send queue, no send goroutine, no closed flag. "Closed"
+  = `ReadEvent` returns an error.
+- **WebRTC** (`realtime_transport_webrtc.go`): an explicit-ish machine built from raw
+  channels — `dcReady` (closed by `dcDone sync.OnceFunc`), `closed` (closed by
+  `closeDone sync.OnceFunc` from *either* `OnConnectionStateChange` or `Close()`),
+  `flushed`, `sessionCh` (cap 1), `inEvents`/`outEvents` (cap 256), plus a `sendLoop`
+  goroutine and RTP counters under `rtpMu`.
+
+Conceptual states (`connecting → data-channel-open → session-created → active →
+closing → closed`) are **not stored**; the only persisted membership state is the
+`sessions[sessionID]` map entry (present from `realtime.go:631` until `:1009`). `session-created`
+and `session-updated` are *events*, not states.
+
+Teardown order (`realtime.go:989-1010`): `cancelActiveResponse` → `close(decodeDone)`
+→ `close(done)` (if VAD running) → `close(soundWindowDone)` → `wg.Wait()` →
+`delete(sessions,…)`. Then, WebRTC only, `defer transport.Close()` → `closeDone()` →
+`<-flushed` → `pc.Close()`.
+
+### M2. Audio-input / turn-detection (server_vad + semantic_vad + EOU)
+
+One `handleVAD` goroutine (`realtime.go:1322`) on a 300 ms ticker. Mode is
+**re-evaluated every tick** under `sessionLock` (`:1350-1357`) so it can flip mid-turn.
+
+- **server_vad** states are encoded by the goroutine-local `speechStarted bool`
+  (`:1337`) plus silence *measured* (not timed) as `audioLength - segEndTime >
+  silenceThreshold` recomputed each tick (`:1461`). States: idle → inspecting →
+  speech-detected → awaiting-commit → committing → transcribing/responding.
+  "Holdback" is a byte count (`noSpeechHoldbackSec*rate*2`), not a timer.
+- **semantic_vad** adds the `liveTurnState` struct (`realtime_semantic_vad.go`):
+  `live` (nil = closed), `unavailable` (sticky degrade → behaves as server_vad),
+  `eouAtSec`, `parts`, `itemID` (allocated at turn open so captions can stream),
+  `deltasSent`. Extra states: closed, open/streaming-ASR, EOU-pending, EOU-fallback
+  (dynamic silence threshold 0 s when EOU pending, else eagerness 8/4/2 s),
+  retranscribe-gate, EOU-rejected, finished, discarded.
+  The one cross-goroutine edge: the backend recv callback pushes onto `events`
+  (buffered 64, **non-blocking — drops on overflow**, `:116-117`); `drainEvents`
+  reads it on the tick.
+- **Voice gate** (`realtime_voicegate.go`) runs *inside* the commit goroutine:
+  resolving → authorized/rejected, with a sticky `voiceVerified` (under `gateMu`) for
+  `when:first`.
+
+### M3. Response lifecycle (+ synchronous tool-turn recursion)
+
+A response is "active" iff `Session.activeResponseDone` is non-nil and unclosed
+(`responseMu`, `:172`). One goroutine owns it; its lifetime == that channel's. State
+is observable only through the `response.*` event stream and `ItemStatus*` on the
+assistant item. Logical states: idle → starting → generating-text →
+generating-audio → tool-call-pending → tool-executing → awaiting-next-tool-turn →
+cancelling → done(completed|cancelled) | failed.
+
+- Cancellation is **cooperative at discrete checkpoints** (`ctx.Err()` at
+  `:2172,2364,2394`, `realtime_stream.go:193,202,241,259`).
+- The tool loop is **synchronous recursion on the same goroutine**, bounded by
+  `maxAssistantToolTurns = 10`; each level mints a fresh `responseID` and emits a full
+  `response.created … response.done{Completed}` cycle — so one user turn can emit
+  *several* `response.done{Completed}` events under different IDs.
+- Terminal events are **not exactly-once**: failed paths `return` with no
+  `response.done`; cancelled paths emit `done{Cancelled}`; the completed terminal is
+  unconditional at the tail of `emitToolCallItems`.
+
+### M4. Conversation / compaction
+
+`Conversation`: `Items` + `Memory` (rolling summary) under `Lock`; `compacting
+atomic.Bool`. States: normal ↔ compacting. Compaction (`realtime_compaction.go`)
+snapshots overflow under `Lock`, summarizes **unlocked**, re-locks and commits guarded
+by an optimistic head-`prefixMatches` check. It is launched **only by turn-0
+`triggerResponse`** (`:1963`), off the response path — so a long agentic turn
+(recursion calls `triggerResponseAtTurn` directly) can append many tool items and
+**never compact** until the next user turn (compaction starvation).
+
+### M5. Streaming sub-machines (transcription, chunker, TTS)
+
+Backend LLM/TTS/transcription streams are **synchronous callback recv loops on the
+caller's goroutine** — no internal goroutines/channels. The only true concurrent FSM is:
+
+- **TTS pipeline** (`realtime_tts_pipeline.go`): one worker goroutine, an **unbounded**
+  mutex-guarded `queue`, a coalesced `wake` chan (cap 1), a `closed` flag, a `done`
+  chan closed once by the worker's `defer`, a lock-free `failed atomic.Bool`, and
+  worker-owned `audio`/`firstErr` that are safe to read only after `wait()` joins via
+  `done`. Idempotent `wait()`; deferred `wait()` backstop guarantees no worker leak.
+- **Chunker** (`realtime_chunker.go`): a pure single-buffer FSM (buffering ↔ emitting,
+  `flush` = hard boundary). **No concurrency guard** — correctness depends entirely on
+  `push`/`flush` being called from one goroutine (the LLM recv loop). On cancel the
+  flush is skipped, so the buffered partial clause is intentionally dropped.
+- **Transcription** (`realtime_transcription.go`): stateless straight-line function;
+  "streaming" is just repeated synchronous callbacks.
+
+---
+
+## Part 2 — Cross-cutting failure modes (why it's a mess)
+
+1. **Shared mutable `Session` config with inconsistent locking (the core problem).**
+   `updateSession`/`updateTransSession` mutate `Voice`, `Instructions`, `Tools`,
+   `OutputModalities`, `ModelConfig`, **`ModelInterface`**, sample rates, and the
+   shared `InputAudioTranscription` pointer under `sessionLock`. But in-flight
+   response/speech/transcription goroutines read those same fields **without any
+   lock** (`realtime_speech.go:72-79`, `realtime_stream.go:228`, semantic_vad
+   `:110`). Reloading `ModelInterface` mid-response is a data race against a running
+   Predict/TTS/Transcribe, and the swapped-out model is dropped without Close.
+   `sessionLock` actually guards the *global `sessions` map*; it only mutually excludes
+   the handful of other sites that happen to also take it (handleVAD tick, the commit
+   branch). Response goroutines never take it.
+
+2. **Two writers of the active-response pair.** `startResponse`/`cancelActiveResponse`
+   are called from both the main read loop (`:836,973,981,990`) **and** the VAD
+   goroutine (barge-in `:1429`, end-of-speech `:1543`). `responseMu` guards only the
+   field swap; the `<-done` wait is outside the lock. A read-loop `ResponseCreate`
+   racing a VAD `speech_stopped` can have both read the same prior pair, both
+   overwrite, and briefly leave **two live response goroutines** both appending to
+   `conv.Items`. The "never overlapping" guarantee holds only under the unstated
+   assumption that responses are driven from a single goroutine — which is false.
+
+3. **State is inferred, not stored.** Whether a response is active, whether a turn is
+   open, whether audio is being buffered — all are derived from combinations of
+   booleans, nil-checks, channel state, and `context` error. No single source of truth;
+   no place to assert an invariant.
+
+4. **Reachable inconsistent states.** e.g. after a semantic-VAD `discardTurn`,
+   `speechStarted` stays true while `lts` is closed, so they disagree and the next
+   onset suppresses `SpeechStarted`. Mid-stream cancel leaves the client having seen
+   `output_item.added`/`content_part.added` with no matching `…done`. `events`-channel
+   overflow silently drops an EOU, degrading EOU-pending to the 2–8 s fallback.
+
+5. **Lifecycle/ownership gaps.** `decodeOpusLoop` is a bare `go` (not in `wg`) and can
+   run after `delete(sessions,…)`. `handleIncomingAudioTrack` (pion `OnTrack`
+   goroutine) has **no shutdown signal** — it appends to `OpusFrames` until `ReadRTP`
+   errors, unjoined by `wg`. WebRTC `outEvents` enqueued before the DC opens are lost
+   on early failure.
+
+6. **The `done`-channel/`vadServerStarted` toggle dance.** A single `done` local
+   (`:655`) is reassigned to a fresh channel on each VAD start (`:662`) and closed at
+   toggle-off (`:670`) and teardown (`:999`). Safe today only because one goroutine
+   owns it — one variable name meaning different channels over time is a structural
+   fragility, not an explicit lifecycle.
+
+---
+
+## Part 3 — Research: explicit, verifiable re-implementation
+
+The stated goal: **transitions cannot lead to an inconsistent state, and we
+can verify that.** Four layered techniques follow, from architecture down to runtime.
+
+### 3.1 Architecture: single-writer session actor (share by communicating)
+
+The root cause of (1) and (2) is *shared mutable state across goroutines*. The most
+effective, idiomatic-Go fix is to give each session **one owning goroutine** that holds
+all session state with **no locks**, and have every other goroutine communicate with it
+over channels:
+
+```
+            ┌────────── inbound events ──────────┐
+ transport ─┤  client events (ReadEvent)         │
+   VAD     ─┤  vad: speech_started/stopped, EOU   ├─►  session actor  ──► outbound
+ model I/O ─┤  llm/tts/asr results, errors        │   (owns ALL state,    events
+ timers    ─┤  ticks, deadlines                   │    single goroutine)
+            └────────────────────────────────────┘
+```
+
+- All state mutation happens in one place; `sessionLock`, `responseMu`, `gateMu`,
+  `AudioBufferLock`, `OpusFramesLock`, `Conversation.Lock` collapse into "the actor owns
+  it." Worker goroutines (Predict/TTS/ASR, opus decode, RTP read) become **stateless
+  effects** that take an immutable snapshot in and send results back as events.
+- `ModelInterface` reload becomes an event the actor sequences relative to responses
+  (e.g. drain/cancel the active response first), eliminating the mid-call swap race.
+- Cancellation stays `context`-based but the actor is the only thing that starts/stops
+  responses, killing the dual-writer race (2).
+
+This is the actor / CSP model. It does not by itself prove correctness — that's what
+3.2–3.4 add — but it makes the state *centralized and explicit*, which is the
+precondition for verification.
+
+### 3.2 Make illegal states unrepresentable (type-level)
+
+Inside the actor, model each machine as an explicit state with a **pure transition
+function** `next(state, event) (state, []effect, error)`:
+
+- Represent states as a Go **sealed sum type** (interface with an unexported marker
+  method, one struct per state carrying only that state's data) so e.g. `EOU-pending`
+  data cannot be accessed while `Closed`. This is the Go equivalent of an ADT and is the
+  single biggest lever for "inconsistent state unrepresentable."
+- The transition function is **total and pure** (no I/O, no goroutines): it returns the
+  next state plus a list of *effects* (send event, start Predict, arm timer) that the
+  actor executes. Pure transition functions are trivially unit-testable and
+  property-testable.
+- An unexpected `(state, event)` pair returns an explicit error / stays put and logs —
+  never a silent half-transition.
+
+The machines (M1–M5) are **hierarchical** (a statechart): Connection ⊃ Turn(M2) and
+Response(M3) ⊃ Tool-turn; Conversation(M4) and the TTS sub-machine(M5) are largely
+orthogonal regions. Model them as nested states rather than one flat enum.
+
+Library options (all guard *logic*, none give concurrency safety — that's 3.1's job):
+- `qmuntal/stateless` — declarative, hierarchical, guard/entry/exit actions; closest fit.
+- `looplab/fsm` — simpler, flat, event-callback based.
+- Hand-rolled transition tables — most control, no dep; recommended here given the
+  hierarchy and the desire to keep transitions auditable. `go.mod` currently pulls no
+  FSM lib.
+
+### 3.3 Design-time formal verification (prove the protocol)
+
+Before/while coding, model the *protocol* (not the Go) in a model checker to prove the
+hard concurrency properties exhaustively:
+
+- Use **FizzBee** (the adopted tool) to specify the actor's event/state space and check: no
+  two concurrent active responses; barge-in + ResponseCancel + speech_stopped
+  interleavings never deadlock or drop a turn; every `response.created` is eventually
+  followed by exactly one terminal; teardown joins all goroutines. The
+  cancel/startResponse/barge-in interplay (failure mode 2) is exactly the kind of
+  liveness/safety property model checkers exist for.
+- Keep the spec small and focused on the M2↔M3 boundary (turn detection ↔ response),
+  which is where the real races live.
+
+### 3.4 Implementation-time & runtime verification
+
+- **Exhaustive table-driven transition tests**: since transitions are a pure function,
+  enumerate `(state × event)` and assert the result for every cell, including the
+  illegal cells (assert they error / no-op). This is the practical stand-in for a proof
+  that "no transition leads to inconsistent state."
+- **Property-based testing**: feed random event sequences into the actor and assert
+  global invariants hold after every step (Part 4). This catches reachable-bad-state
+  bugs the example tests miss. (Implemented as Ginkgo/Gomega seeded random-walk specs
+  — see Part 6.2 for why not `rapid`.)
+- **Race detector under load**: run the property tests with `-race`; with 3.1 there
+  should be *zero* shared mutable state, so `-race` cleanliness becomes a meaningful
+  signal rather than noise.
+- **Runtime invariant assertions + structured transition logging**: log every
+  `state --event--> state` with the session ID; assert invariants in dev builds.
+  Replace today's silent degradations (dropped EOU, suppressed SpeechStarted) with
+  explicit, observable transitions.
+
+### 3.5 Recommended path for LocalAI
+
+1. Specify the M2↔M3 protocol in FizzBee; nail the cancel/barge-in invariants.
+2. Introduce a per-session actor (3.1) that owns existing state behind the current
+   `Transport` interface — incremental, keeps the event types.
+3. Replace each implicit machine with an explicit sealed-state transition function
+   (3.2), one at a time: Response first (highest-risk dual-writer), then Turn/VAD, then
+   Connection, then leave TTS/Chunker/Compaction (already mostly self-contained) for
+   last.
+4. Land the table-driven + property-based test suites alongside each machine; gate on
+   `-race`.
+
+---
+
+## Part 4 — Invariants a correct implementation must guarantee
+
+These are the "cannot reach inconsistent state" properties to encode as assertions,
+property-test oracles, and FizzBee invariants:
+
+1. **At most one active response per session** at any instant (no overlapping response
+   goroutines; no two appenders to `conv.Items` from response logic).
+2. **Exactly one terminal per `response.created`**: every emitted `response.created` is
+   followed by exactly one of `response.done{completed|cancelled}` or a defined failure
+   terminal — never zero, never two. (Decide whether agentic tool turns are one
+   response or many; make it explicit either way.)
+3. **No `response.*` content events after that response's terminal.** No
+   `output_item.added`/`content_part.added` without a matching `…done` (even on cancel).
+4. **Turn/response coupling**: `speechStarted` ⟺ a live turn is open; barge-in cancels
+   the active response *before* a new turn's commit starts.
+5. **No config field is read by a worker while being mutated** (reload is sequenced
+   against in-flight work; a response uses an immutable snapshot of model/voice/tools).
+6. **Audio buffer monotonic & consistent**: commit/clear/append/VAD-drop never lose or
+   double-consume bytes; `clear` resets *all* turn state (including `lts`).
+7. **No dropped control events**: an EOU/Final is never silently lost (no overflow-drop
+   on a bounded channel that changes turn outcome).
+8. **Clean teardown**: every spawned goroutine (incl. `decodeOpusLoop`,
+   `handleIncomingAudioTrack`) is signalled and joined before the session is deleted; no
+   sends after transport close.
+9. **Compaction safety & liveness**: compaction never races a reader into a torn
+   `Items`; and it actually runs when the trigger is exceeded, including inside long
+   agentic turns.
+10. **Idempotent close**: every channel/resource closed exactly once on every path.
+
+---
+
+## Implementation status
+
+- **M3 (response coordination) — first vertical slice landed.** Explicit machine in
+  `core/http/endpoints/openai/respcoord/` (sealed `State`/`Event`/`Effect` sum types, a
+  total pure `Next`, a single-writer `Coordinator`); transition-table + Ginkgo/Gomega
+  seeded-property + concurrent conformance tests (green under `-race`); a deterministic
+  characterization test pinning the legacy dual-writer race. Authoritative spec:
+  `formal-verification/response_lifecycle.fizz`. Gate:
+  `scripts/realtime-conformance.sh` (Go layer always; FizzBee when pinned) wired as
+  `make test-realtime-conformance` and `.github/workflows/realtime-conformance.yml`. See
+  `formal-verification/README.md`.
+- **Gate is fail-closed and pinned (done).** `fizzbee.sha256` pins all four platforms;
+  the gate hard-fails without FizzBee; CI installs+caches the verified binary with no skip;
+  pre-commit runs the gate on `respcoord/**` or `formal-verification/**` changes.
+- **M3 wired into the live session (done).** `realtime_respcoord.go` adds `responseSink`
+  (the `respcoord.Coordinator` + a goroutine-spawning effect sink) to `Session`. The legacy
+  `startResponse`/`cancelActiveResponse` and the dual-writer `activeResponse*`/`responseMu`
+  fields are gone; all six call sites (manual commit, `response.create`, VAD speech-stopped,
+  `response.cancel`, barge-in, teardown) route through it. Barge-in/cancel are now
+  non-blocking (removes the legacy ~300 ms VAD stall); teardown stops input goroutines, then
+  cancels + `wait()`s all response goroutines before deleting the session. `EmitTerminal` is
+  a no-op for now (the response body still emits its own `response.done`) — coordination is
+  fixed without changing wire behavior. Verified: builds, `go vet` clean, all 300 openai
+  specs pass under `-race`, and `make test-realtime` (the mock-backend realtime e2e suite,
+  12 specs over WS + WebRTC) passes.
+- **Single authoritative terminal + populated Output/Usage (done).** One
+  `response.created` and one `response.done` per `response.create`, even across the
+  server-side agentic tool loop (which is now internal turns of one response, not one
+  terminal each). A `liveResponse` accumulator threads through
+  `triggerResponse`→`triggerResponseAtTurn`→`emitToolCallItems`/`streamLLMResponse`,
+  collecting output items as they complete and summing token usage; `triggerResponse`
+  emits the one terminal (completed/cancelled; failed still emits none, matching legacy)
+  with `Output` + `Usage` filled in (both were always empty before). Verified: 301 openai
+  specs under `-race` (incl. a new `triggerResponse` terminal test) + `make test-realtime`.
+  Design note: emission is hoisted to `triggerResponse` (the body owns it) rather than the
+  coordinator's `EmitTerminal` effect — at cancel/supersede time the coordinator doesn't
+  yet have the body's partial Output, so the body, which does, is the natural emitter. The
+  coordinator still guarantees one body run per `response.create`, so "exactly one terminal"
+  holds transitively; `EmitTerminal` remains the spec's logical marker (no-op in the sink).
+- **M2 (turn detection) — model + spec landed AND wired into the live session.**
+  Explicit machine in `core/http/endpoints/openai/turncoord/` (sealed `State` =
+  `Idle | Speaking{Turn}`, `Event` = `Onset | Silence | Abort{Reason}`, `Effect` =
+  `BargeIn | OpenTurn | EmitSpeechStarted | EmitSpeechStopped | CommitTurn |
+  DiscardTurn`, a total pure `Next`, a single-writer `Coordinator`);
+  transition-table + Ginkgo/Gomega seeded-property + concurrent conformance tests
+  (green under `-race`). The fix it encodes: "speech detected" and "a turn is open"
+  — the two legacy variables (`speechStarted` and `lts.open()`) that a `discardTurn`
+  could desync (failure mode 4) — become ONE state, so the next-onset suppression
+  bug is unrepresentable. Authoritative spec:
+  `formal-verification/turn_lifecycle.fizz`, with an `always assertion Coupled`
+  (speech ⟺ turn-open), verified non-vacuous (deleting `self.speech = 0` in `Abort`
+  makes the checker report `Coupled` violated). The gate
+  (`scripts/realtime-conformance.sh`, pre-commit, CI) covers `turncoord` and the
+  spec. **Wired (done):** `realtime_turncoord.go` adds `turnSink` (the
+  `turncoord.Coordinator` + a loop-local effect sink) to `handleVAD`. The legacy
+  `speechStarted` bool is gone; onset/no-speech-clear/commit/teardown route through
+  `coord.Apply(Onset|Abort{NoSpeech}|Silence|Abort{Teardown})`. The turn id is
+  minted at onset and carried by the coordinator to the committed event (so it
+  matches the live captions); `liveTurnState.openTurn` now takes that id instead of
+  minting its own. A semantic→server mode switch mid-turn is deliberately NOT an
+  abort (it only drops the orphaned live stream and lets the turn continue under
+  server_vad), so it stays inline. Verified: builds, `go vet`/`gofmt`/golangci-lint
+  clean, all openai specs under `-race`, and `make test-realtime` (12 e2e specs over
+  WS + WebRTC) pass.
+- **M1 (connection lifecycle) — model + spec landed AND wired.** Explicit machine
+  in `core/http/endpoints/openai/conncoord/` (sealed `State` = `Live{VADRunning} |
+  Torn`, `Event` = `SetVAD | Close`, `Effect` = `StartVAD | StopVAD | Teardown`, a
+  total pure `Next`, a single-writer `Coordinator`); transition-table +
+  Ginkgo/Gomega seeded-property + concurrent conformance tests (green under
+  `-race`). It replaces the legacy `vadServerStarted` bool + the `done` channel
+  reassigned on every turn-detection toggle and closed from two sites (failure
+  mode 6): the coordinator owns whether the VAD goroutine runs, so its done channel
+  is closed exactly once and never resurrected after teardown; `Close` moves to
+  `Torn`, which absorbs every later event so teardown runs exactly once even from
+  multiple exit paths (invariants #8, #10). Spec:
+  `formal-verification/conn_lifecycle.fizz` (`always assertion TeardownOnce` +
+  `NoRunAfterTorn`), verified non-vacuous (deleting `self.torn = 1` in `Close`
+  fails `TeardownOnce`). **Wired (done):** `realtime_conncoord.go` adds `connSink`;
+  the handler's setup/`toggleVAD`/teardown now route through
+  `conn.setVAD(...)`/`conn.close()`; the `done`/`vadServerStarted` locals and the
+  manual ordered-teardown block are gone (the Teardown effect performs that
+  sequence). Verified: builds, vet/gofmt/golangci-lint clean, openai specs under
+  `-race`, `make test-realtime` (12 e2e WS+WebRTC), full conformance gate green
+  (3 Go packages + 3 fizz specs PASSED).
+- **M4 (conversation compaction) — model + spec landed AND wired.** Explicit
+  machine in `core/http/endpoints/openai/compactcoord/` (sealed `State` =
+  `Idle | Running`, `Event` = `Trigger | Finished`, `Effect` = `StartCompaction`,
+  a total pure `Next`, a single-writer `Coordinator`); transition-table +
+  Ginkgo/Gomega seeded-property + concurrent (effect-spawns-work-reports-Finished)
+  conformance tests (green under `-race`). It makes the legacy `compacting
+  atomic.Bool` single-flight guard explicit: a `Trigger` while `Running` is dropped
+  (not superseded — compaction is idempotent work on the same overflow), so at most
+  one summarize+evict runs per conversation (invariant #9). Spec:
+  `formal-verification/compaction.fizz` (`always assertion SingleFlight`), verified
+  non-vacuous (deleting the `if self.active == 0` guard fails `SingleFlight`).
+  **Wired (done):** `realtime_compactcoord.go` adds `compactionSink`; the
+  `Conversation.compacting atomic.Bool` is replaced by `Conversation.compaction
+  *compactionSink` (built at conversation creation with the summarize+evict run
+  closure); `maybeCompact` now calls `conv.compaction.trigger()`. The summarizer
+  resolution + `compact()` stay in the sink's spawned goroutine (off the response
+  path); `compact()` itself (snapshot/summarize-unlocked/optimistic-commit) is
+  unchanged. Verified: builds, vet/gofmt/golangci-lint clean, openai specs under
+  `-race`, `make test-realtime` (12 e2e), full conformance gate green (4 Go
+  packages + 4 fizz specs PASSED).
+- **M5 (TTS pipeline lifecycle) — model + spec landed AND wired.** Explicit
+  machine in `core/http/endpoints/openai/ttscoord/` (sealed `State` =
+  `Open | Closing | Closed`, `Event` = `Close | WorkerExited`, `Effect` = `Wake`, a
+  total pure `Next`, a single-writer `Coordinator`); transition-table +
+  Ginkgo/Gomega seeded-property + two-writer conformance tests (green under
+  `-race`). It is a genuine two-writer machine (producer `Close` from `wait()` vs
+  worker `WorkerExited`); it makes the legacy `closed bool` lifecycle explicit and
+  monotonic, fixes the latent enqueue-after-close silent drop (enqueue is now gated
+  on `Open`), and guarantees idempotent `wait()` (one wake / one worker join). The
+  poison `failed` latch stays a lock-free `atomic.Bool` (orthogonal, read per
+  clause on the worker's hot path). Spec: `formal-verification/tts_pipeline.fizz`
+  (`always assertion WakeOnce` + `Monotonic`), verified non-vacuous (deleting the
+  `if self.phase == 0` guard in `Close` fails `WakeOnce`). **Wired (done):**
+  `realtime_tts_pipeline.go`'s `ttsPipeline` embeds the coordinator (and is its
+  effect sink — `Wake` → `signal()`); `closed bool` is gone; the worker checks
+  `closing()` and raises `WorkerExited` on drain, `enqueue` rejects once not
+  `Open`, `wait()` raises `Close`. The wake/done channel mechanics are unchanged.
+  Verified: builds, vet/gofmt/golangci-lint clean, openai specs under `-race`,
+  `make test-realtime` (12 e2e), full conformance gate green (5 Go packages + 5
+  fizz specs PASSED).
+- **All five mapped machines (M1–M5) are now explicit, wired, and verified.** The
+  realtime-conformance gate model-checks all `.fizz` specs and runs all five Go
+  conformance suites under `-race`, fail-closed.
+- **The machines form a hierarchy, and that relationship is now modeled and
+  enforced.** M1 (connection) is the parent region; when it tears down, every child
+  must be terminal. Previously this was only an imperative side effect of
+  `conncoord`'s teardown ordering, with a real gap (M4 compaction was
+  fire-and-forget and could outlive the torn session). Now:
+  - `formal-verification/session_lifecycle.fizz` is a **composition spec** that
+    models conn + its direct children (vad/M2, resp/M3, compaction/M4) as one
+    statechart and asserts `ChildrenDieWithParent` (conn torn ⟹ all children
+    terminal) plus "no child starts after teardown". Its non-vacuity reproduces the
+    exact M4 gap (drop the compaction-terminate line → assertion fails).
+  - `respcoord` (M3) and `compactcoord` (M4) gained an absorbing **`Terminated`**
+    state + a `Shutdown` event, so a response/compaction cannot start after
+    teardown (structural "no resurrection").
+  - `conncoord`'s `Teardown` effect now explicitly drives the children terminal:
+    stop+join the VAD goroutine (M2), `respSink.shutdown()` (M3 → Terminated, joins
+    response goroutines and their M5 pipelines), and `compaction.shutdown()` for
+    every conversation (M4: cancel the in-flight summary via a session-scoped
+    context, then join — **closing the gap**). `compact` now takes a `context` so
+    teardown can bound the join. M2's terminal is realized by the goroutine join and
+    M5's by its existing `Closed`; the persistent coordinators (M3/M4) carry the
+    explicit `Terminated` state.
+
+## Part 5 — Library vs hand-rolled (Go ecosystem, verified 2026-06)
+
+Researched against live GitHub/pkg.go.dev data. **Verdict: hand-roll a typed transition
+table over sealed sum-type states for the per-connection machines.** No Go library gives
+the two properties we most want — *compile-time-illegal states* and a *pure
+`next(state,event)->(state,[]effect,error)`*; every library models states as
+`string`/`int`/`any` and fires side-effecting callbacks mid-transition. And since the
+actor (Part 3.1) drives everything from one goroutine, the libraries' main value-add —
+internal locking — is dead weight.
+
+Library landscape:
+
+| Option | Stars / status | Hierarchy | Typed states | Illegal-transition | Viz | Fit |
+|---|---|---|---|---|---|---|
+| **hand-rolled table + sealed sum types** | — | DIY (parent field / nested switch) | **yes** (sealed iface) | explicit `default:` | ~30 LOC Mermaid emitter | **best** |
+| **qmuntal/stateless** (port of .NET Stateless) | 1.36k, v1.8.0 2026-02, maintained | yes (substates, guards, entry/exit, internal/ignored) | `any` | `error` + `OnUnhandledTrigger` + `PermittedTriggers` | DOT | best library fallback if hierarchy grows |
+| **looplab/fsm** | 3.4k, v1.0.3 2025-05, maintained | flat | strings | typed errors | **DOT+Mermaid** | only for flat machines wanting free diagrams |
+| cocoonspace/fsm | 89, dormant 2021 | flat | int | `bool` no-op | — | lock-free but dead; DIY beats it |
+| true Harel statecharts (gstate, statechartx) | ≤10, <1yr, single-author | parallel+history | varies | varies | varies | only if we truly need parallel regions; unproven |
+| Temporal / Cadence | large, maintained | n/a | n/a | n/a | n/a | **overkill** — external cluster+DB, durable replay, wrong latency class |
+
+Decision: hand-roll; keep **qmuntal/stateless** as the fallback if one machine grows deep
+hierarchy/guards faster than we want to hand-maintain (its `error`-on-illegal-trigger and
+`PermittedTriggers()` are the most useful library features for our "reject illegal
+transitions" requirement, at the cost of `any`-typed states). Add a tiny Mermaid emitter
+over the hand-rolled table so we keep the visualization the libraries advertise.
+
+## Part 6 — Formal design tied to code, and making it authoritative
+
+The user requirement: the formal design is **authoritative** — a coding agent should be
+unable to silently change implementation behavior without it being caught against the
+spec; the default path is "update the spec and re-verify," not "edit the code and ignore
+the spec." This is a *conformance + enforcement* problem, in three layers.
+
+### 6.1 The source of truth & design-time check
+
+Write the concurrency-critical core — the **M2↔M3 boundary** (turn detection ↔ response:
+barge-in, ResponseCancel, speech_stopped, the dual-writer race) — as a **FizzBee** spec
+and **model-check it in CI**. Keep the spec small and focused on M2↔M3; that is where the
+real safety/liveness properties (Part 4 invariants 1–4) live. (FizzBee is the adopted
+model checker — see Part 6.4.)
+
+### 6.2 The conformance bridge (code ↔ spec)
+
+The honest finding: design-time model checking is well-supported; the *Go conformance
+bridge is thin everywhere* and needs per-spec glue. Two layers, adopted together:
+
+1. **FizzBee MBT** — the authoritative layer. The `.fizz` spec is model-checked, and
+   `fizz mbt-scaffold --lang go` generates Go interfaces + a `go test` harness; you
+   implement adapters mapping model actions→code and `StateGetter`→state. Conformance
+   runs as plain `go test` — the cleanest CI fit. Risk: pre-1.0, essentially one
+   maintainer (pin a version + sha256, vendor examples).
+2. **Ginkgo/Gomega seeded property tests** — the Go-native floor. A small Go model
+   (the test's `open`/`registered` shadow) is the oracle; a fixed-seed random walk
+   drives random event sequences against the `Coordinator`, asserting the Part-4
+   invariants after each step / per seed. It checks the *implementation* against a Go
+   oracle — it complements, but does not replace, the FizzBee check of the *design*.
+   (We originally specced `pgregory.net/rapid` here for its `(*T).Repeat` driver and
+   automatic shrinking, but LocalAI mandates Ginkgo/Gomega for all tests — its
+   `forbidigo` lint forbids stdlib `testing` assertions — and `rapid.Check` needs a
+   concrete `*testing.T`/`*rapid.T` that cannot run inside a Ginkgo `It`. Rather than
+   weaken the lint gate with an exclusion, the property layer is hand-rolled seeded
+   walks: fixed seeds make every failure reproducible, at the cost of `rapid`'s
+   automatic shrinking. `rapid` is consequently not a direct dependency.)
+
+These compose: model-check the design (6.1) for "the design is right"; conformance-test
+the code (6.2) for "the code matches the design." Add `go test -race` (with `-cpu=1,2,4`,
+repeated runs) over the stateful tests for interleaving-bug discovery, and Go native
+fuzzing over the *same* harness for coverage-guided sequence exploration + a committable
+regression corpus. (`testing/quick` is frozen — do not use.)
+
+There is no viable single-source-of-truth codegen (one spec compiled into both the runtime
+Go and the model) for retrofitting existing Go — the candidates are research-grade and
+greenfield-only. Our practical substitute is the CI gate below plus a single Go transition
+table that emits both the diagram and the test action set.
+
+### 6.3 Enforcement — making the design un-ignorable for agents
+
+Structural enforcement, leveraging this repo's existing non-bypassable gate culture
+(pre-commit + monotonic ratchets; `--no-verify` is forbidden, baselines never lowered):
+
+1. **Add a `realtime-conformance` gate** to the pre-commit/CI pipeline that runs (a) the
+   model check (6.1) and (b) the conformance bridge (6.2). A behavior change that does not
+   conform turns the gate **red**; the only green paths are *make the code conform* or
+   *update the spec* — and updating the spec re-triggers the model check, so an illegal
+   design is rejected too. This is the actual mechanism that makes "update the design and
+   verify" the default rather than optional.
+2. **Treat the spec as a ratchet artifact** like coverage: the gate must not be weakened,
+   the spec not deleted, the build tag not silently disabled.
+3. **Write an `.agents/realtime-state-machines.md` guide** (indexed from `CLAUDE.md`)
+   stating the spec is the source of truth: change the spec first, re-run the gate, then
+   implement. The doc is secondary; the gate is what enforces it.
+
+### 6.4 Decided stack
+
+- **Implementation:** hand-rolled sealed-state transition functions + single-writer actor
+  (Parts 3.1–3.2).
+- **Design-time + conformance:** **FizzBee** (decided). `.fizz` spec is model-checked, and
+  `fizz`'s Go MBT generator (`mbt/generator/templates/go` → interfaces/adapters/test;
+  driven via a gRPC plugin in `mbt/lib/go`) produces a `go test` conformance harness
+  whose adapters map model actions → our actor and `StateGetter` → our state. Go is a
+  first-class MBT target (Go + Rust are the only two). Verified 2026-06: Apache-2.0,
+  v0.5.2, prebuilt linux/macos×x86/arm binaries, ships Claude Code skills
+  (`/fizz-spec|check|debug|mbt`) for the spec-authoring loop.
+- **Go-native layer:** **Ginkgo/Gomega seeded property tests** run alongside — they
+  check the *implementation*, complementing (not substituting for) the FizzBee check
+  of the *design*. Skipping FizzBee is NOT "degrading to the Go layer": the design
+  authority would be gone. The gate is therefore **fail-closed** (see Enforcement).
+  (Originally specced as `rapid`; switched to Ginkgo/Gomega to satisfy LocalAI's
+  Ginkgo-only `forbidigo` lint without weakening that gate — see Part 6.2.)
+- **Enforcement:** the `realtime-conformance` pre-commit/CI gate + `.agents/` guide
+  (Part 6.3).
+
+FizzBee risk mitigations (decided):
+- The gate is **fail-closed**: a missing FizzBee is a hard failure, never a silent skip.
+  The only bypass is the explicit, loud `REALTIME_CONFORMANCE_SKIP_FIZZBEE=1` (local
+  only; CI never sets it; pre-commit runs the gate on any `respcoord/**`, `turncoord/**`,
+  `conncoord/**`, `compactcoord/**`, `ttscoord/**`, or `formal-verification/**` change so
+  a pure `.fizz` edit still re-verifies).
+- CI **pins the FizzBee release binary by version + sha256** (`formal-verification/fizzbee.sha256`,
+  all four platforms, digests from the GitHub release; installer verifies before extract,
+  CI caches it). Not go-gettable: `pkg/modelchecker` imports the Bazel-internal `fizz/proto`
+  with no committed `.pb.go`, so a plain `go get` won't build — hence the pinned binary.
+- Keep the `.fizz` model **portable** (no exotic features) so it stays re-expressible in
+  another model checker if FizzBee is ever abandoned — lock-in is at the tooling layer
+  only, not the design.
+
+## Open questions (decide before implementing)
+
+- **Scope of the actor refactor**: full single-writer per session, or incrementally
+  migrate one machine at a time behind the existing locks? (Suggest: M3 response
+  coordination first — it has the load-bearing dual-writer bug.)
+
+Resolved: **FSM library vs hand-rolled** → hand-rolled sealed-state tables,
+qmuntal/stateless fallback (Part 5). **Conformance bridge** → FizzBee (model-check + Go
+MBT) with a Ginkgo/Gomega seeded-property Go-native floor as hedge (Part 6.4). **Single-source-of-truth codegen**
+(PGo/MPCal) → not viable (research-grade, greenfield-only); substitute is the CI
+conformance gate (Part 6.3).
+
+**Agentic turn semantics** → invariant #2 is **one `response.done` per `response.create`**
+(OpenAI-faithful); the server-side `AssistantExecutor` tool loop becomes internal
+sub-states of a single response rather than emitting one terminal per turn. Verified safe
+in-tree: the current `response.done` carries only `{id, object, status}` (`Output`/`Usage`
+never populated), the React UI (`Talk.jsx:330`) reads only `status`, every unit test
+already asserts `ResponseDone == 1` for tool turns, no test expects multiplicity, and the
+server-side recursion is untested. Collapsing also fixes a latent "Listening…" flicker
+mid-agentic-loop. The client-driven tool loop (fresh `response.create` per round-trip)
+legitimately keeps one terminal each — unaffected. Follow-up (since landed — see
+"Implementation status"): populate `Output` + `Usage` in the single terminal (both were
+always empty when this decision was made).
--- a/formal-verification/README.md
+++ b/formal-verification/README.md
@@ -0,0 +1,142 @@
+# Formal verification — realtime state machines
+
+Formal designs (FizzBee specs) for the realtime API state machines and the harness
+that keeps the Go implementation provably in step with them. Background and
+rationale: [../docs/design/realtime-state-machines.md](../docs/design/realtime-state-machines.md) (Part 6).
+
+The design is **authoritative**: behaviour changes go through the spec first, then
+the implementation is checked against it. The `realtime-conformance` gate makes
+that the path of least resistance — you cannot land a non-conforming change green.
+
+## What's here
+
+| File | Role |
+|------|------|
+| `response_lifecycle.fizz` | **Authoritative** FizzBee model of machine M3 (response coordination). Model-checked + drives the Go MBT conformance harness. |
+| `turn_lifecycle.fizz` | **Authoritative** FizzBee model of machine M2 (turn detection): the speechStarted / turn-open coupling. |
+| `conn_lifecycle.fizz` | **Authoritative** FizzBee model of machine M1 (connection lifecycle): VAD toggle + once-only teardown. |
+| `compaction.fizz` | **Authoritative** FizzBee model of machine M4 (conversation compaction): single-flight. |
+| `tts_pipeline.fizz` | **Authoritative** FizzBee model of machine M5 (TTS pipeline): open->closing->closed, idempotent close. |
+| `session_lifecycle.fizz` | **Composition** spec: the M1–M5 hierarchy — conn (M1) is the parent; when it is torn down, every child (vad/M2, resp/M3, compaction/M4) is terminal. Models the relationship the per-machine specs can't express. |
+| `fizzbee.sha256` | Pinned sha256 digests of the FizzBee release assets the gate uses; `install-fizzbee.sh` verifies the download against them before extracting. |
+
+The implementations under test live in
+[`core/http/endpoints/openai/respcoord`](../core/http/endpoints/openai/respcoord) (M3),
+[`core/http/endpoints/openai/turncoord`](../core/http/endpoints/openai/turncoord) (M2),
+[`core/http/endpoints/openai/conncoord`](../core/http/endpoints/openai/conncoord) (M1),
+[`core/http/endpoints/openai/compactcoord`](../core/http/endpoints/openai/compactcoord) (M4),
+and [`core/http/endpoints/openai/ttscoord`](../core/http/endpoints/openai/ttscoord) (M5).
+
+## Running the gate
+
+```sh
+make test-realtime-conformance
+# or directly:
+./scripts/realtime-conformance.sh
+```
+
+Two layers, **both required — the gate is fail-closed**:
+
+1. **Go-native conformance** — the `respcoord` + `turncoord` + `conncoord` + `compactcoord` + `ttscoord` transition-table
+   tests + Ginkgo/Gomega seeded property (random-walk) tests under `-race`
+   (checks the implementation), plus the shared `coordinator` runtime they all
+   build on. Also run as part of `make test` (they're ordinary Go packages with a
+   Ginkgo suite each). The five machines reduce to their sealed State/Event/Effect
+   types + a pure `Next`; the single-writer Coordinator/Sink plumbing lives once in
+   `core/http/endpoints/openai/coordinator` (a generic `Coordinator[S,E,F]`).
+2. **FizzBee model check** — model-checks the authoritative `.fizz` specs (checks
+   the design). **A missing FizzBee is a hard failure, not a skip** — otherwise
+   the design verification silently disappears whenever the tool is inconvenient,
+   which is the whole thing we're trying to prevent.
+
+FizzBee is pinned and checksum-verified (`fizzbee.sha256`), so "couldn't install"
+is not a reason to skip — run `make install-fizzbee`. The **only** way to skip is
+the explicit, loud `REALTIME_CONFORMANCE_SKIP_FIZZBEE=1` opt-out, intended for
+local work on unrelated code. CI never sets it, and `pre-commit` runs the full
+gate whenever `respcoord/**`, `turncoord/**`, `conncoord/**`, `compactcoord/**`, `ttscoord/**`, or `formal-verification/**` is
+staged (so a pure `.fizz` edit still re-verifies).
+
+## Installing FizzBee (pinned)
+
+FizzBee is pre-1.0 and single-maintainer, so we pin a version + sha256 and use the
+prebuilt release tarball (its primary build is Bazel — it is **not** go-gettable:
+the `pkg/modelchecker` library imports the Bazel-internal `fizz/proto` with no
+committed `.pb.go`, so a plain `go get` won't build it).
+
+```sh
+make install-fizzbee                  # = scripts/install-fizzbee.sh (default v0.5.2)
+```
+
+The four platform assets are pinned by sha256 in `fizzbee.sha256` (digests taken
+from the GitHub release); the installer verifies before extracting. Heads-up: the
+Linux bundles are large (~290–350 MB, because `parser_bin` embeds a full runtime),
+macOS ~36 MB. CI caches `.tools/fizzbee` keyed on the pin so it downloads once.
+
+This unpacks a **self-contained** directory under `.tools/fizzbee/` (gitignored):
+
+```
+.tools/fizzbee/
+  fizz                              -> stable symlink the gate auto-detects
+  fizzbee-v0.5.2-linux_x86/
+    fizz            # CLI wrapper (entrypoint)
+    parser/parser_bin # the .fizz frontend, BUNDLED (no system Python needed)
+    fizzbee         # Go model-checker binary
+    fizz.env        # resolves the above paths relative to `fizz`
+    mbt_gen.zip     # MBT generator (this one DOES need system python)
+```
+
+Keep the directory intact — `fizz.env` resolves its siblings relative to the
+`fizz` wrapper. The gate auto-detects `.tools/fizzbee/fizz`; override with
+`FIZZBEE_BIN` only if you installed elsewhere (point it at the `fizz` wrapper,
+not the raw `fizzbee` binary).
+
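+If `fizzbee.sha256` has no entry for an asset yet (for example after bumping the pinned
+version), the first `install-fizzbee.sh` run prints the computed sha256; record it in
+`fizzbee.sha256` as `<sha256>  <asset>` and commit so later runs verify the pin.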
+
+> CLI facts (validate against the pinned version — FizzBee is pre-1.0): the CLI
+> is `fizz [flags] <spec.fizz>` (default = exhaustive BFS); there is **no `run`
+> subcommand**. The checker can print `FAILED`/`DEADLOCK` while still exiting 0,
+> so the gate scans output for those markers in addition to the exit code.
+> Model-checking needs only the bundled `parser_bin` (no Python); only
+> `mbt-scaffold` shells out to system `python`.
+
+## Reproducing the bug the spec catches
+
+Each spec models the **correct** design, so it passes; each documents how to
+reproduce the legacy bug it guards against:
+
+- `response_lifecycle.fizz` (M3): change `atomic func start()` to
+  `serial func start()` — the checker reports `AtMostOneLive` violated (the
+  dual-writer race). Pinned deterministically in Go by the respcoord
+  "legacy dual-writer characterization" spec.
+- `turn_lifecycle.fizz` (M2): in `Abort`, delete `self.speech = 0` (clear only
+  the turn, as the legacy `discardTurn` did) — the checker reports `Coupled`
+  violated (the speechStarted/turn-open desync that suppressed the next onset).
+- `conn_lifecycle.fizz` (M1): in `Close`, delete `self.torn = 1` — the checker
+  reports `TeardownOnce` violated (the legacy double-teardown / double-close
+  hazard when a session reaches teardown from more than one exit path).
+- `compaction.fizz` (M4): in `Trigger`, drop the `self.active == 0` condition from the guard —
+  the checker reports `SingleFlight` violated (two goroutines compacting the same
+  overflow concurrently, the race the `compacting` CAS prevents).
+- `tts_pipeline.fizz` (M5): in `Close`, delete the `if self.phase == 0` guard —
+  the checker reports `WakeOnce` violated (a non-idempotent wait() that wakes /
+  joins the worker more than once).
+- `session_lifecycle.fizz` (hierarchy): in `Teardown`, delete `self.compaction = 2`
+  — the checker reports `ChildrenDieWithParent` violated. This is the real M4 gap:
+  a fire-and-forget compaction outliving the torn session. The fix is `conncoord`'s
+  teardown cancelling + joining each conversation's compaction (and respcoord/
+  compactcoord gained an absorbing `Terminated` state so no child can start after
+  teardown).
+
+## Adding another machine
+
+All five mapped machines (M1–M5) have landed. To add a new sealed-state machine:
+
+1. Add `<machine>.fizz` here (with an `always assertion`; verify non-vacuity by
+   breaking one guard and confirming the checker fails).
+2. Implement it as a sealed-state package under `core/http/endpoints/openai/`.
+3. Add transition-table + Ginkgo/Gomega seeded property conformance tests
+   (one `*_suite_test.go` bootstrap per package; LocalAI mandates Ginkgo/Gomega).
+4. The gate picks up new `*.fizz` specs automatically; add the new Go package to
+   the `-race` test list in `scripts/realtime-conformance.sh` (and the path
+   filters in `.githooks/pre-commit` + `.github/workflows/realtime-conformance.yml`).
--- a/formal-verification/compaction.fizz
+++ b/formal-verification/compaction.fizz
@@ -0,0 +1,57 @@
+---
+# Authoritative formal design for realtime machine M4: conversation compaction.
+#
+# Companion to:
+#   - docs/design/realtime-state-machines.md  (the map + invariants)
+#   - core/http/endpoints/openai/compactcoord (the Go implementation)
+#
+# The Go MBT adapter maps each action below onto compactcoord.Coordinator.Apply
+# and the StateGetter onto compactcoord.Coordinator.State, so this spec is the
+# source of truth the implementation is checked against.
+#
+# The property: at most one background compaction runs per conversation at a time,
+# so two goroutines never summarize+evict the same overflow concurrently (Part 4,
+# invariant #9). The legacy guard is a `compacting atomic.Bool` CAS; here `active`
+# is the number of in-flight compactions, started only from Idle.
+#
+# NOTE: FizzBee is pre-1.0. Validate the exact syntax/CLI against the version
+# pinned in formal-verification/README.md before trusting the gate.
+deadlock_detection: false
+---
+
+role Compactor:
+    action Init:
+        self.active = 0   # compactions in flight -- MUST stay in {0,1}
+        self.torn = 0     # session torn down (Terminated) -- absorbing
+
+    # maybeCompact wants to start a compaction. THE FIX: it starts one only when
+    # none is running (single-flight) and not after teardown. To reproduce the
+    # legacy race where two goroutines could both compact the same overflow,
+    # drop the `self.active == 0` condition from the guard below (keeping the
+    # `self.torn == 0` check): the checker then reports SingleFlight violated.
+    atomic action Trigger:
+        if self.active == 0 and self.torn == 0:
+            self.active += 1   # StartCompaction
+
+    # The background compaction goroutine finished (success, error, or timeout).
+    atomic action Finished:
+        if self.active > 0:
+            self.active -= 1   # guarded, so a late Finished after Shutdown is a no-op
+
+    # Teardown: the connection (M1) parent cancels + joins the in-flight
+    # compaction, then terminates the coordinator so none can start afterwards.
+    atomic action Shutdown:
+        self.active = 0    # cancelled + joined
+        self.torn = 1      # never reset: Trigger stays disabled from here on
+
+action Init:
+    c = Compactor()   # the one instance both assertions below inspect
+
+# SAFETY: at most one compaction is ever in flight (Part 4, invariant #9).
+always assertion SingleFlight:
+    return c.active >= 0 and c.active <= 1
+
+# SAFETY: no compaction is in flight once torn (it was cancelled + joined at
+# teardown), so none outlives the session.
+always assertion NoneAfterTeardown:
+    return c.torn == 0 or c.active == 0
--- a/formal-verification/conn_lifecycle.fizz
+++ b/formal-verification/conn_lifecycle.fizz
@@ -0,0 +1,60 @@
+---
+# Authoritative formal design for realtime machine M1: connection lifecycle.
+#
+# Companion to:
+#   - docs/design/realtime-state-machines.md  (the map + invariants)
+#   - core/http/endpoints/openai/conncoord    (the Go implementation)
+#
+# The Go MBT adapter maps each action below onto conncoord.Coordinator.Apply and
+# the StateGetter onto conncoord.Coordinator.State, so this spec is the source of
+# truth the implementation is checked against.
+#
+# The legacy hazard (Part 2, failure mode 6 / invariants #8, #10): a single `done`
+# channel reassigned on every VAD toggle and closed from two sites (toggle-off and
+# teardown) guarded only by a vadServerStarted bool. Modeled here as `running`
+# (the VAD goroutine's done channel is live) and `torn` (teardown happened).
+#
+# NOTE: FizzBee is pre-1.0. Validate the exact syntax/CLI against the version
+# pinned in formal-verification/README.md before trusting the gate.
+deadlock_detection: false
+---
+
+role Conn:
+    action Init:
+        self.running = 0     # VAD goroutine running (its done channel is live)
+        self.torn = 0        # teardown has happened
+        self.teardowns = 0   # how many times teardown ran -- MUST stay <= 1
+
+    # session.update toggled turn detection on. No-op after teardown (the legacy
+    # reassign-and-spawn must never resurrect a torn session).
+    atomic action VadOn:
+        if self.torn == 0:
+            self.running = 1    # a flag, so a repeated VadOn is idempotent here
+
+    # session.update toggled turn detection off (close the running done channel).
+    atomic action VadOff:
+        if self.torn == 0:
+            self.running = 0    # skipped once torn; Close already cleared it
+
+    # Transport read loop ended / session closing. THE FIX: setting torn absorbs
+    # every later Close, so teardown's channel closes happen exactly once. To
+    # reproduce the legacy double-teardown hazard, delete `self.torn = 1` below:
+    # the checker then reports TeardownOnce violated (Close runs teardown again).
+    atomic action Close:
+        if self.torn == 0:
+            self.running = 0    # StopVAD if it was running (close-once)
+            self.teardowns += 1 # Teardown
+            self.torn = 1       # absorbing: VadOn, VadOff and Close are no-ops afterwards
+
+action Init:
+    c = Conn()   # the one instance both assertions below inspect
+
+# SAFETY: teardown runs at most once -- the done/decode/sound channels are closed
+# at most once, never double-closed (Part 4, invariant #10).
+always assertion TeardownOnce:
+    return c.teardowns <= 1
+
+# SAFETY: the VAD goroutine is never (re)started after teardown -- no
+# send-after-close / no goroutine outliving the session (Part 4, invariant #8).
+always assertion NoRunAfterTorn:
+    return not (c.torn == 1 and c.running == 1)
--- a/formal-verification/fizzbee.sha256
+++ b/formal-verification/fizzbee.sha256
@@ -0,0 +1,4 @@
+00011bbfe9bf4c7bcb03a5bf1f5b7fe7390111ad6f0611c6be71e8692504da4e  fizzbee-v0.5.2-linux_arm.tar.gz
+f494b7b2afcc7ce24575ed91a389b46bbbbe5976f9e4b5cd717327012f5e0395  fizzbee-v0.5.2-linux_x86.tar.gz
+aab223e0bac8f0c052cf774dc25872f72c138da30f4079b914bb9c8921910904  fizzbee-v0.5.2-macos_arm.tar.gz
+6293bd7ab90c79b8607dc9fb2f09407fde0e11ac6596e884bef7f660178597fa  fizzbee-v0.5.2-macos_x86.tar.gz
--- a/formal-verification/response_lifecycle.fizz
+++ b/formal-verification/response_lifecycle.fizz
@@ -0,0 +1,83 @@
+---
+# Authoritative formal design for realtime machine M3: response coordination.
+#
+# Companion to:
+#   - docs/design/realtime-state-machines.md  (the map + invariants)
+#   - core/http/endpoints/openai/respcoord    (the Go implementation)
+#
+# The Go MBT adapter maps each action below onto respcoord.Coordinator.Apply
+# and the StateGetter onto respcoord.Coordinator.State, so this spec is the
+# source of truth the implementation is checked against.
+#
+# NOTE: FizzBee is pre-1.0. Validate the exact syntax/CLI against the version
+# pinned in formal-verification/README.md before trusting the gate.
+deadlock_detection: false
+---
+
+# Cap on responses ever started (next_id), so the state space is finite.
+MAX_RESPONSES = 4
+
+role Session:
+    action Init:
+        self.live = 0          # number of live responses -- MUST stay in {0,1}
+        self.registered = 0    # id of the active response (0 = none)
+        self.next_id = 0       # last id handed out; starts stop at MAX_RESPONSES
+        self.torn = 0          # session torn down (Terminated) -- absorbing
+
+    # startResponse as ONE indivisible transition -- this is the single-writer
+    # actor guarantee. Superseding an active response emits its cancelled
+    # terminal (live -= 1) BEFORE spawning the replacement (live += 1), so the
+    # net live count never exceeds 1.
+    #
+    # To reproduce the LEGACY dual-writer race from Part 2 of the design doc,
+    # change `atomic func` to `serial func`: the checker then interleaves two
+    # callers between the cancel and the spawn and reports AtMostOneLive
+    # violated -- exactly the bug TestLegacyMechanismCanDoubleStart pins in Go.
+    atomic func start():
+        if self.registered != 0:
+            self.live -= 1         # cancel + cancelled-terminal for the old
+            self.registered = 0
+        self.next_id += 1
+        self.live += 1             # spawn + register the replacement
+        self.registered = self.next_id
+
+    # client read-loop path: response.create / manual input_audio_buffer.commit.
+    # Rejected once torn (no response starts after teardown).
+    atomic action StartFromClient:
+        require self.next_id < MAX_RESPONSES
+        require self.torn == 0
+        self.start()
+
+    # VAD goroutine path: end-of-speech commit / barge-in. Rejected once torn.
+    atomic action StartFromVad:
+        require self.next_id < MAX_RESPONSES
+        require self.torn == 0
+        self.start()
+
+    # A response reaches its own terminal (response.done completed); no-op if none.
+    atomic action FinishCurrent:
+        if self.registered != 0:
+            self.live -= 1
+            self.registered = 0
+
+    # Explicit response.cancel with nothing newer queued; no-op if none is active.
+    atomic action CancelReq:
+        if self.registered != 0:
+            self.live -= 1
+            self.registered = 0
+
+    # session teardown (M1 parent): cancel any in-flight response and go to the
+    # absorbing Terminated state, after which no response can start. This is what
+    # lets the connection's teardown guarantee no response outlives the session.
+    atomic action Shutdown:
+        if self.registered != 0:
+            self.live -= 1
+            self.registered = 0
+        self.torn = 1   # NOTE(review): no `always assertion` in this spec reads torn
+
+action Init:
+    s = Session()   # the one instance the assertion below inspects
+
+# SAFETY: at most one live response at any instant (Part 4, invariant #1).
+always assertion AtMostOneLive:
+    return s.live >= 0 and s.live <= 1
--- a/Show More
+++ b/Show More
@@ -1 +1 @@
 .0
 .5