chore(deps): bump the pip group across 3 directories with 1 update

Bumps the pip group with 1 update in the /backend/python/sglang directory: torch.
Bumps the pip group with 1 update in the /backend/python/trl directory: torch.
Bumps the pip group with 1 update in the /backend/python/vllm-omni directory: torch.

Updates `torch` from 2.9.0 to 2.12.0+cpu
Updates `torch` from 2.10.0 to 2.12.0+cpu
Updates `torch` from 2.7.0 to 2.12.0+cu130

---
updated-dependencies:
- dependency-name: torch
  dependency-version: 2.12.0+cpu
  dependency-type: direct:production
  dependency-group: pip
- dependency-name: torch
  dependency-version: 2.12.0+cpu
  dependency-type: direct:production
  dependency-group: pip
- dependency-name: torch
  dependency-version: 2.12.0+cu130
  dependency-type: direct:production
  dependency-group: pip
...

Signed-off-by: dependabot[bot] <support@github.com>
2026-06-30 19:37:00 -04:00 · 2026-06-28 09:54:23 +00:00
146 changed files with 997 additions and 10467 deletions
--- a/.githooks/pre-commit
+++ b/.githooks/pre-commit
@@ -7,11 +7,8 @@
 # Runs only the checks relevant to what's staged:
 #   - Go files          -> make lint + make test-coverage-check
 #   - core/http/react-ui -> make test-ui-coverage-check (Playwright e2e + gate)
-#   - realtime state machines / specs -> make test-realtime-conformance
-#       (respcoord/**, turncoord/**, or formal-verification/** -- a pure .fizz
-#        spec edit must still re-verify the design, detected separately from Go)
-# A commit touching none of these is skipped entirely (other docs/YAML can't
-# change lint findings, Go coverage, the UI, or the realtime conformance gate).
+# A commit touching neither is skipped entirely (docs/YAML/etc. can't change
+# lint findings, Go coverage, or the UI).
 #
 # To bypass for a single commit (e.g. a WIP checkpoint): git commit --no-verify
 set -eu
@@ -23,13 +20,11 @@ staged="$(git diff --cached --name-only --diff-filter=ACMRD)"

 go_changed=0
 ui_changed=0
-rt_changed=0
 if echo "$staged" | grep -qE '\.go$'; then go_changed=1; fi
 if echo "$staged" | grep -qE '^core/http/react-ui/'; then ui_changed=1; fi
-if echo "$staged" | grep -qE '^(core/http/endpoints/openai/(coordinator|respcoord|turncoord|conncoord|compactcoord|ttscoord)/|formal-verification/)'; then rt_changed=1; fi

-if [ "$go_changed" -eq 0 ] && [ "$ui_changed" -eq 0 ] && [ "$rt_changed" -eq 0 ]; then
-	echo "pre-commit: no Go, React UI, or realtime-spec changes staged — skipping."
+if [ "$go_changed" -eq 0 ] && [ "$ui_changed" -eq 0 ]; then
+	echo "pre-commit: no Go or React UI changes staged — skipping."
 	exit 0
 fi

@@ -62,11 +57,4 @@ if [ "$ui_changed" -eq 1 ]; then
 	make test-ui-coverage-check
 fi

-if [ "$rt_changed" -eq 1 ]; then
-	echo "pre-commit ▶ realtime state-machine conformance (make test-realtime-conformance) —"
-	echo "             Go transition/rapid tests under -race + FizzBee model check of the"
-	echo "             authoritative specs. Fail-closed: needs FizzBee (make install-fizzbee)."
-	make test-realtime-conformance
-fi
-
 echo "pre-commit ✓ all relevant checks passed"
--- a/.github/workflows/backend_build_darwin.yml
+++ b/.github/workflows/backend_build_darwin.yml
@@ -82,7 +82,7 @@ jobs:
      # as the Linux registry cache.
      - name: Restore Homebrew cache
        id: brew-cache
-        uses: actions/cache/restore@v6
+        uses: actions/cache/restore@v4
        with:
          path: |
            ~/Library/Caches/Homebrew/downloads
@@ -142,7 +142,7 @@ jobs:

      - name: Save Homebrew cache
        if: github.event_name != 'pull_request' && steps.brew-cache.outputs.cache-hit != 'true'
-        uses: actions/cache/save@v6
+        uses: actions/cache/save@v4
        with:
          path: |
            ~/Library/Caches/Homebrew/downloads
@@ -178,7 +178,7 @@ jobs:
      - name: Restore ccache
        if: inputs.backend == 'llama-cpp'
        id: ccache-cache
-        uses: actions/cache/restore@v6
+        uses: actions/cache/restore@v4
        with:
          path: ~/Library/Caches/ccache
          key: ccache-llama-${{ runner.arch }}-${{ steps.llama-version.outputs.version }}-${{ github.run_id }}
@@ -211,7 +211,7 @@ jobs:
      - name: Restore Python wheel cache
        if: inputs.lang == 'python'
        id: pyenv-cache
-        uses: actions/cache/restore@v6
+        uses: actions/cache/restore@v4
        with:
          path: |
            ~/Library/Caches/pip
@@ -256,14 +256,14 @@ jobs:

      - name: Save ccache
        if: inputs.backend == 'llama-cpp' && github.event_name != 'pull_request'
-        uses: actions/cache/save@v6
+        uses: actions/cache/save@v4
        with:
          path: ~/Library/Caches/ccache
          key: ccache-llama-${{ runner.arch }}-${{ steps.llama-version.outputs.version }}-${{ github.run_id }}

      - name: Save Python wheel cache
        if: inputs.lang == 'python' && github.event_name != 'pull_request' && steps.pyenv-cache.outputs.cache-hit != 'true'
-        uses: actions/cache/save@v6
+        uses: actions/cache/save@v4
        with:
          path: |
            ~/Library/Caches/pip
--- a/.github/workflows/realtime-conformance.yml
+++ b/.github/workflows/realtime-conformance.yml
@@ -1,69 +0,0 @@
---
-name: 'realtime-conformance'
-
-# Verifies the realtime state-machine implementations conform to their formal
-# designs (docs/design/realtime-state-machines.md, formal-verification/). BOTH
-# layers are enforced and the gate is fail-closed: the Go conformance layer
-# (respcoord + turncoord transition/rapid tests under -race) AND the FizzBee model check of
-# the authoritative specs. FizzBee is pinned + checksum-verified
-# (formal-verification/fizzbee.sha256), so a failed install fails the job rather
-# than silently skipping verification.
-
-on:
-  pull_request:
-    paths:
-      - 'core/http/endpoints/openai/coordinator/**'
-      - 'core/http/endpoints/openai/respcoord/**'
-      - 'core/http/endpoints/openai/turncoord/**'
-      - 'core/http/endpoints/openai/conncoord/**'
-      - 'core/http/endpoints/openai/compactcoord/**'
-      - 'core/http/endpoints/openai/ttscoord/**'
-      - 'formal-verification/**'
-      - 'scripts/realtime-conformance.sh'
-      - 'scripts/install-fizzbee.sh'
-      - '.github/workflows/realtime-conformance.yml'
-  push:
-    branches:
-      - master
-    paths:
-      - 'core/http/endpoints/openai/coordinator/**'
-      - 'core/http/endpoints/openai/respcoord/**'
-      - 'core/http/endpoints/openai/turncoord/**'
-      - 'core/http/endpoints/openai/conncoord/**'
-      - 'core/http/endpoints/openai/compactcoord/**'
-      - 'core/http/endpoints/openai/ttscoord/**'
-      - 'formal-verification/**'
-      - 'scripts/realtime-conformance.sh'
-
-concurrency:
-  group: realtime-conformance-${{ github.event.pull_request.number || github.sha }}-${{ github.repository }}
-  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
-
-jobs:
-  conformance:
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        go-version: ['1.26.x']
-    steps:
-      - name: Clone
-        uses: actions/checkout@v7
-      - name: Setup Go ${{ matrix.go-version }}
-        uses: actions/setup-go@v5
-        with:
-          go-version: ${{ matrix.go-version }}
-          cache: false
-      - name: Cache FizzBee
-        uses: actions/cache@v4
-        with:
-          path: .tools/fizzbee
-          key: fizzbee-v0.5.2-${{ runner.os }}-${{ hashFiles('formal-verification/fizzbee.sha256') }}
-      - name: Install FizzBee (pinned, checksum-verified)
-        # No `|| true`: a failed/forged download must fail the job, not silently
-        # drop the design verification. install-fizzbee.sh is a no-op if the
-        # cached binary is already present and valid.
-        run: ./scripts/install-fizzbee.sh
-      - name: Run conformance gate (fail-closed)
-        # No skip env: both the Go conformance and the FizzBee model check are
-        # required. The gate auto-detects .tools/fizzbee/fizz.
-        run: make test-realtime-conformance
--- a/.gitignore
+++ b/.gitignore
@@ -97,12 +97,3 @@ core/http/react-ui/test-results/

 # Local Apple signing material (never commit)
 .certs/
-
-# Pinned dev tools (e.g. FizzBee for the realtime-conformance gate)
-.tools/
-
-# FizzBee model-check artifacts: the parser emits <spec>.json next to each
-# .fizz and the checker writes run dirs under out/. Both are regenerated by
-# the realtime-conformance gate; only the .fizz sources are authoritative.
-formal-verification/*.json
-formal-verification/out/
--- a/Makefile
+++ b/Makefile
@@ -405,18 +405,6 @@ test-realtime: build-mock-backend
 	@echo 'Running realtime e2e tests (mock backend)'
 	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="Realtime && !real-models" --flake-attempts $(TEST_FLAKES) -v -r ./tests/e2e

-# Verify the realtime state-machine implementations conform to their formal
-# designs (Go transition/rapid tests under -race + FizzBee model check of the
-# authoritative specs). See docs/design/realtime-state-machines.md (Part 6) and
-# docs/design/specs/README.md.
-test-realtime-conformance:
-	GOCMD=$(GOCMD) ./scripts/realtime-conformance.sh
-
-# Install the pinned, checksum-verified FizzBee model checker (into .tools/,
-# gitignored) used by test-realtime-conformance. Idempotent; no-op if present.
-install-fizzbee:
-	./scripts/install-fizzbee.sh
-
 # Container-based real-model realtime testing. Build env vars / pipeline
 # definition kept here so test-realtime-models-docker can drive a fully wired
 # pipeline (VAD + STT + LLM + TTS) from inside a containerised runner.
@@ -1039,7 +1027,7 @@ test-extra-backend-whisper-transcription: docker-build-whisper
 ## is reachable.
 test-extra-backend-parakeet-cpp-transcription: docker-build-parakeet-cpp
 	BACKEND_IMAGE=local-ai-backend:parakeet-cpp \
-	BACKEND_TEST_MODEL_URL=https://huggingface.co/mudler/parakeet-cpp-gguf/resolve/main/realtime_eou_120m-v1-f16.gguf \
+	BACKEND_TEST_MODEL_URL=https://huggingface.co/mudler/parakeet-cpp-gguf/resolve/main/tdt_ctc-110m-f16.gguf \
 	BACKEND_TEST_AUDIO_URL=https://github.com/ggml-org/whisper.cpp/raw/master/samples/jfk.wav \
 	BACKEND_TEST_CAPS=health,load,transcription \
 	$(MAKE) test-extra-backend
@@ -1482,13 +1470,8 @@ build-launcher-darwin:
 	mv cmd/launcher/LocalAI.app dist/LocalAI.app
 	bash contrib/macos/sign-and-notarize.sh sign dist/LocalAI.app

-# Notarize + staple the .app itself, then wrap it into a drag-to-Applications
-# DMG via hdiutil and sign the DMG. The app is stapled BEFORE packaging so the
-# bundle carries its own ticket and verifies offline (a dmg-only staple leaves
-# the app relying on an online Gatekeeper check, which fails offline / once the
-# app is copied out of the dmg). No-op without notary secrets.
+# Wrap the (signed) app into a drag-to-Applications DMG via hdiutil, then sign the DMG.
 dmg-launcher-darwin: build-launcher-darwin
-	bash contrib/macos/sign-and-notarize.sh notarize-app dist/LocalAI.app
 	rm -rf dist/dmg dist/LocalAI.dmg
 	mkdir -p dist/dmg
 	cp -R dist/LocalAI.app dist/dmg/LocalAI.app
@@ -1500,7 +1483,7 @@ dmg-launcher-darwin: build-launcher-darwin
 notarize-launcher-darwin: dmg-launcher-darwin
 	bash contrib/macos/sign-and-notarize.sh notarize dist/LocalAI.dmg

-# Single entrypoint for CI: build -> sign app -> notarize+staple app -> dmg -> sign dmg -> notarize+staple dmg.
+# Single entrypoint for CI: build -> sign app -> dmg -> sign dmg -> notarize -> staple.
 release-launcher-darwin: notarize-launcher-darwin
 	@echo "dist/LocalAI.dmg is ready"

--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -18,18 +18,6 @@ service Backend {
  rpc GenerateVideo(GenerateVideoRequest) returns (Result) {}
  rpc AudioTranscription(TranscriptRequest) returns (TranscriptResult) {}
  rpc AudioTranscriptionStream(TranscriptRequest) returns (stream TranscriptStreamResponse) {}
-  // AudioTranscriptionLive is the bidirectional live-microphone ASR RPC. The
-  // first message MUST carry a Config; subsequent messages carry Audio frames
-  // (mono float PCM at config.sample_rate, 16 kHz default). After a
-  // successful open the backend replies with a single ready ack
-  // (TranscriptLiveResponse{ready:true}); backends or models without
-  // cache-aware streaming support return UNIMPLEMENTED instead. Newly
-  // finalized text streams back as deltas; eou=true marks the model's
-  // end-of-utterance token. One stream spans many utterances (the decoder
-  // resets itself after each EOU). Closing the send side finalizes: the
-  // backend flushes the decoder tail and emits a terminal message carrying
-  // final_result. A second Config mid-stream resets the decode session.
-  rpc AudioTranscriptionLive(stream TranscriptLiveRequest) returns (stream TranscriptLiveResponse) {}
  rpc TTS(TTSRequest) returns (Result) {}
  rpc TTSStream(TTSRequest) returns (stream Reply) {}
  rpc SoundGeneration(SoundGenerationRequest) returns (Result) {}
@@ -491,10 +479,6 @@ message TranscriptResult {
  string text = 2;
  string language = 3;
  float duration = 4;
-  // True when the decode ended on the model's end-of-utterance special token
-  // (<EOU>/<EOB>, emitted by cache-aware streaming models such as
-  // parakeet_realtime_eou_120m-v1). The marker itself is stripped from text.
-  bool eou = 5;
 }

 message TranscriptStreamResponse {
@@ -502,34 +486,6 @@ message TranscriptStreamResponse {
  TranscriptResult final_result = 2;
 }

-// === AudioTranscriptionLive messages =====================================
-
-message TranscriptLiveRequest {
-  oneof payload {
-    TranscriptLiveConfig config = 1;
-    TranscriptLiveAudio  audio  = 2;
-  }
-}
-
-message TranscriptLiveConfig {
-  string language = 1;             // "" => model default
-  int32 sample_rate = 2;           // 0 => 16000; backends may reject others
-  map<string, string> params = 3;  // backend-specific tuning
-}
-
-message TranscriptLiveAudio {
-  repeated float pcm = 1;          // mono PCM in [-1,1] at config.sample_rate
-}
-
-message TranscriptLiveResponse {
-  bool ready = 1;                       // open ack: sent once, before any delta
-  string delta = 2;                     // newly-finalized text since previous response
-  bool eou = 3;                         // <EOU> fired during this feed (the user yielded the turn)
-  repeated TranscriptWord words = 4;    // words finalized by this feed (stream-relative ns)
-  TranscriptResult final_result = 5;    // terminal message only, after the send side closes
-  bool eob = 6;                         // <EOB> fired: a backchannel ("uh-huh") ended — NOT a turn boundary
-}
-
 message TranscriptWord {
  int64 start = 1;
  int64 end = 2;
--- a/backend/cpp/ik-llama-cpp/Makefile
+++ b/backend/cpp/ik-llama-cpp/Makefile
@@ -1,5 +1,5 @@

-IK_LLAMA_VERSION?=f74a6fb87b315b2c3154166e075360e15021a61d
+IK_LLAMA_VERSION?=f96eaddba8bed6a9a5e628bbf6a566775c70b49c
 LLAMA_REPO?=https://github.com/ikawrakow/ik_llama.cpp

 CMAKE_ARGS?=
--- a/backend/cpp/llama-cpp/Makefile
+++ b/backend/cpp/llama-cpp/Makefile
@@ -1,5 +1,5 @@

-LLAMA_VERSION?=6f4f53f2b7da54fcdbbecaaa734337c337ad6176
+LLAMA_VERSION?=0ed235ea2c17a19fc8238668653946721ed136fd
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp

 CMAKE_ARGS?=
--- a/backend/cpp/privacy-filter/Makefile
+++ b/backend/cpp/privacy-filter/Makefile
@@ -8,7 +8,7 @@
 # Local development: point at a working checkout instead of cloning, e.g.
 #   make PRIVACY_FILTER_SRC=$HOME/c/privacy-filter.cpp grpc-server

-PRIVACY_FILTER_VERSION?=595f59630c69d361b5196f2aba2c71c873d0c13c
+PRIVACY_FILTER_VERSION?=98f52c5ef2250f207cc6b9a6aef05393a120cb7c
 PRIVACY_FILTER_REPO?=https://github.com/localai-org/privacy-filter.cpp
 PRIVACY_FILTER_SRC?=

--- a/backend/go/crispasr/Makefile
+++ b/backend/go/crispasr/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # CrispASR version (release tag)
 CRISPASR_REPO?=https://github.com/CrispStrobe/CrispASR
-CRISPASR_VERSION?=3b93758f9725d400eca82976f895e4cec3f31260
+CRISPASR_VERSION?=6514c9da00b03a2f0f1b49a43fae4f3a01a41844
 SO_TARGET?=libgocrispasr.so

 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
--- a/backend/go/face-detect/Makefile
+++ b/backend/go/face-detect/Makefile
@@ -1,6 +1,6 @@
 # face-detect backend Makefile.
 #
-# Upstream pin lives below as FACEDETECT_VERSION?=e22260d5d5490b37b021b7f795079f386d553afd
+# Upstream pin lives below as FACEDETECT_VERSION?=06914b0... (.github/bump_deps.sh
 # can find and update it - matches the voice-detect / parakeet.cpp / whisper.cpp
 # convention).
 #
@@ -14,7 +14,7 @@
 # The default target below does the proper clone-at-pin + cmake build so CI does
 # not need a side-checkout.

-FACEDETECT_VERSION?=e22260d5d5490b37b021b7f795079f386d553afd
+FACEDETECT_VERSION?=06914b077d52f90d5421299138e7be6bdd06b5e8
 FACEDETECT_REPO?=https://github.com/mudler/face-detect.cpp

 GOCMD?=go
--- a/backend/go/parakeet-cpp/boundary.go
+++ b/backend/go/parakeet-cpp/boundary.go
@@ -1,81 +0,0 @@
-package main
-
-// utteranceBoundary is the single definition of a small state machine that was
-// previously open-coded three times — as a bare `finalEou` bool with an ad-hoc
-// toggle — in the live feed (live.go), the file-stream text path, and the
-// file-stream JSON path (goparakeetcpp.go).
-//
-// It answers one running question: does the decode currently rest on an
-// end-of-utterance boundary? That is the value a closing FinalResult reports as
-// .Eou and the realtime turn detector treats as a commit point.
-//
-// parakeet auto-resets its decoder after every <EOU>/<EOB>, so one streaming
-// session is a sequence of utterances and this is a LATCH, not a monotonic
-// flag: it closes on an <EOU> and reopens as soon as the next utterance starts.
-// (Contrast the realtime API's per-turn `eouSeen`, which only ever goes
-// false->true because each turn gets a fresh stream. Here the stream outlives
-// the turn, so the boundary status must be able to reopen.)
-//
-// The only transitions, over the events one streamFeedResult carries — an
-// <EOU>, an <EOB> (backchannel), or plain speech output (text and/or words):
-//
-//	            <EOU>
-//	   open ───────────► closed
-//	    ▲ ▲ │             │ │
-//	    │ └─┘ <EOB>|speech │ │ <EOU>
-//	    │   (stay open)    │ └─┘ (stay closed)
-//	    └──────────────────┘
-//	         <EOB>|speech
-//
-//	open   = NOT on an utterance boundary: mid-utterance, the last boundary was
-//	         a backchannel <EOB>, or the stream just began (the initial state).
-//	closed = the last meaningful event was an <EOU> with no later speech: a real
-//	         turn boundary.
-//
-// A feed that carries nothing (no eou/eob/text/words — e.g. a finalize flush
-// that produced no tail) is a no-op and leaves the state unchanged, matching
-// the legacy "leave finalEou as it was" behaviour.
-//
-// The state carries no data, so it is modelled as a two-valued type (a named
-// bool) rather than an int enum: every inhabitant is legal, so illegal states
-// are unrepresentable — the payload-free analog of the sealed sum types the
-// realtime machines use (those need interfaces because their states carry data,
-// e.g. Active{ID}, where "Active with no ID" is the illegal combination a scalar
-// cannot even express).
-type utteranceBoundary bool
-
-const (
-	// boundaryOpen is the zero value (false), so a fresh decode starts open —
-	// exactly the legacy `var finalEou bool` (false) initial condition.
-	boundaryOpen   utteranceBoundary = false
-	boundaryClosed utteranceBoundary = true
-)
-
-// observe folds one decode increment into the latch and returns the new state.
-//
-// <EOU> takes priority when a single feed carries both an <EOU> and speech
-// (e.g. {"text":"hello","eou":1}): the utterance both produced that text AND
-// ended, so the decode rests on the boundary. This matches the legacy
-// eou-checked-first ordering at every call site.
-func (b utteranceBoundary) observe(r streamFeedResult) utteranceBoundary {
-	switch {
-	case r.Eou:
-		return boundaryClosed
-	case r.Eob || r.Delta != "" || len(r.Words) > 0:
-		return boundaryOpen
-	default:
-		return b
-	}
-}
-
-// ended reports whether the decode currently rests on an end-of-utterance
-// boundary (a real <EOU>, not a backchannel <EOB>). This is what a closing
-// FinalResult carries as .Eou.
-func (b utteranceBoundary) ended() bool { return b == boundaryClosed }
-
-func (b utteranceBoundary) String() string {
-	if b == boundaryClosed {
-		return "closed"
-	}
-	return "open"
-}
--- a/backend/go/parakeet-cpp/boundary_test.go
+++ b/backend/go/parakeet-cpp/boundary_test.go
@@ -1,92 +0,0 @@
-package main
-
-import (
-	"math/rand/v2"
-
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-var _ = Describe("utteranceBoundary (decode end-of-utterance latch)", func() {
-	It("starts open: a fresh decode is not on a boundary", func() {
-		var b utteranceBoundary
-		Expect(b).To(Equal(boundaryOpen))
-		Expect(b.ended()).To(BeFalse())
-	})
-
-	DescribeTable("single feed transition from the open state",
-		func(r streamFeedResult, wantEnded bool) {
-			Expect(boundaryOpen.observe(r).ended()).To(Equal(wantEnded))
-		},
-		Entry("<EOU> closes it", streamFeedResult{Eou: true}, true),
-		Entry("<EOU> with text closes it (eou wins)", streamFeedResult{Delta: "hi", Eou: true}, true),
-		Entry("<EOB> stays open (backchannel is not a turn boundary)", streamFeedResult{Eob: true}, false),
-		Entry("plain text stays open", streamFeedResult{Delta: "hello"}, false),
-		Entry("words-only stays open", streamFeedResult{Words: []transcriptWord{{W: "x"}}}, false),
-		Entry("empty feed is a no-op (stays open)", streamFeedResult{}, false),
-	)
-
-	DescribeTable("single feed transition from the closed state",
-		func(r streamFeedResult, wantEnded bool) {
-			Expect(boundaryClosed.observe(r).ended()).To(Equal(wantEnded))
-		},
-		Entry("another <EOU> stays closed", streamFeedResult{Eou: true}, true),
-		Entry("trailing speech reopens it", streamFeedResult{Delta: "and more"}, false),
-		Entry("words reopen it", streamFeedResult{Words: []transcriptWord{{W: "x"}}}, false),
-		Entry("a backchannel <EOB> reopens it", streamFeedResult{Eob: true}, false),
-		Entry("empty feed is a no-op (stays closed)", streamFeedResult{}, true),
-	)
-
-	It("is a latch: <EOU> then trailing speech reopens, then <EOU> closes again", func() {
-		b := boundaryOpen
-		b = b.observe(streamFeedResult{Delta: "turn one", Eou: true})
-		Expect(b.ended()).To(BeTrue())
-		b = b.observe(streamFeedResult{Delta: " and more"})
-		Expect(b.ended()).To(BeFalse(), "trailing speech without an EOU is an open utterance")
-		b = b.observe(streamFeedResult{Eou: true})
-		Expect(b.ended()).To(BeTrue())
-	})
-
-	It("treats a backchannel before a real EOU correctly", func() {
-		b := boundaryOpen
-		b = b.observe(streamFeedResult{Delta: "uh huh", Eob: true})
-		Expect(b.ended()).To(BeFalse(), "a backchannel must not masquerade as a turn boundary")
-		b = b.observe(streamFeedResult{Delta: "done", Eou: true})
-		Expect(b.ended()).To(BeTrue())
-	})
-
-	It("matches the reference fold over seeded random feed sequences", func() {
-		// The invariant: after any sequence of feeds, ended() is true iff the
-		// last feed that carried ANY event was an <EOU>. <EOU> takes priority
-		// when a feed carries both an EOU and speech; empty feeds are ignored.
-		for seed := uint64(1); seed <= 200; seed++ {
-			rng := rand.New(rand.NewPCG(seed, seed*2654435761))
-			b := boundaryOpen
-			lastWasEou := false // reference: did the last meaningful feed end on EOU?
-			steps := rng.IntN(30)
-			for i := 0; i < steps; i++ {
-				var r streamFeedResult
-				switch rng.IntN(5) {
-				case 0:
-					r = streamFeedResult{Eou: true}
-				case 1:
-					r = streamFeedResult{Eob: true}
-				case 2:
-					r = streamFeedResult{Delta: "w"}
-				case 3:
-					r = streamFeedResult{Delta: "w", Eou: true} // eou + speech, eou wins
-				case 4:
-					r = streamFeedResult{} // empty: no-op
-				}
-				b = b.observe(r)
-				if r.Eou {
-					lastWasEou = true
-				} else if r.Eob || r.Delta != "" || len(r.Words) > 0 {
-					lastWasEou = false
-				}
-			}
-			Expect(b.ended()).To(Equal(lastWasEou),
-				"seed %d: latch disagreed with the reference fold", seed)
-		}
-	})
-})
--- a/backend/go/parakeet-cpp/driver.go
+++ b/backend/go/parakeet-cpp/driver.go
@@ -1,82 +0,0 @@
-package main
-
-import (
-	"context"
-
-	"google.golang.org/grpc/codes"
-	"google.golang.org/grpc/status"
-)
-
-// streamFeedResult is one decode increment from a cache-aware streaming session:
-// the newly-finalized text plus the model's own per-feed boundary tokens
-// (<EOU>/<EOB>) and word timings. It is the single event type both the live
-// (bidi) and file (server-stream) paths fold over, hiding the ABI v4 JSON vs
-// older text-only entry-point split behind one shape.
-type streamFeedResult struct {
-	Delta string
-	Eou   bool
-	Eob   bool
-	Words []transcriptWord
-}
-
-// feedChunk feeds one PCM chunk to the streaming session (or finalizes it, when
-// finalize is true) and returns the unified decode increment. It prefers the
-// ABI v4 JSON entry points (which also carry per-word timestamps) and falls
-// back to the older text-only entry points against an older libparakeet.so.
-//
-// This is the one place the JSON-vs-text choice is made; every consumer works
-// in terms of streamFeedResult.
-func (p *ParakeetCpp) feedChunk(stream uintptr, pcm []float32, finalize bool) (streamFeedResult, error) {
-	if CppStreamFeedJSON != nil {
-		doc, err := p.streamFeedDoc(stream, pcm, finalize)
-		if err != nil {
-			return streamFeedResult{}, err
-		}
-		return streamFeedResult{Delta: doc.Text, Eou: doc.Eou != 0, Eob: doc.Eob != 0, Words: doc.Words}, nil
-	}
-	delta, eou, eob, err := p.streamFeedText(stream, pcm, finalize)
-	if err != nil {
-		return streamFeedResult{}, err
-	}
-	return streamFeedResult{Delta: delta, Eou: eou, Eob: eob}, nil
-}
-
-// feedSlices feeds pcm through the session in streamChunkSamples slices,
-// invoking onFeed for each decode increment. It does NOT finalize: callers
-// decide when the send side is done. The file path finalizes after the whole
-// file; the live path finalizes only when its request channel closes, never
-// between audio messages. Slicing keeps each per-call engineMu hold short so
-// concurrent unary transcription interleaves fairly (the C session buffers
-// internally).
-//
-// If ctx is non-nil it is checked before each slice so a cancelled file
-// transcription stops promptly; the live path passes nil (it is bounded by its
-// request channel instead of a ctx).
-func (p *ParakeetCpp) feedSlices(ctx context.Context, stream uintptr, pcm []float32, onFeed func(streamFeedResult) error) error {
-	for off := 0; off < len(pcm); off += streamChunkSamples {
-		if ctx != nil {
-			if err := ctx.Err(); err != nil {
-				return status.Error(codes.Canceled, "transcription cancelled")
-			}
-		}
-		end := min(off+streamChunkSamples, len(pcm))
-		res, err := p.feedChunk(stream, pcm[off:end], false)
-		if err != nil {
-			return err
-		}
-		if err := onFeed(res); err != nil {
-			return err
-		}
-	}
-	return nil
-}
-
-// flushTail finalizes the session once and folds the flushed tail (the last
-// ~2 encoder frames of text, which only appear on finalize) through onFeed.
-func (p *ParakeetCpp) flushTail(stream uintptr, onFeed func(streamFeedResult) error) error {
-	res, err := p.feedChunk(stream, nil, true)
-	if err != nil {
-		return err
-	}
-	return onFeed(res)
-}
--- a/backend/go/parakeet-cpp/goparakeetcpp.go
+++ b/backend/go/parakeet-cpp/goparakeetcpp.go
@@ -103,13 +103,12 @@ type transcriptJSON struct {
 //	{"text":"...","eou":0,"eob":0,"frame_sec":0.080000,
 //	 "words":[{"w":"...","start":0.480,"end":0.640,"conf":0.9100}, ...]}
 //
-// "text" is the newly-finalized text since the last call. Under ABI v5 "eou"
-// is 1 iff an <EOU> fired this feed (the user yielded the turn) and "eob" 1
-// iff an <EOB> fired (a backchannel like "uh-huh" ended — NOT a turn
-// boundary). A v4 library has no "eob" field and its "eou" conflates both
-// tokens: Eob stays 0 and Eou keeps the old any-event meaning. "words" are
-// the words finalized this call with absolute (stream-relative) start/end
-// seconds.
+// "text" is the newly-finalized text since the last call; "eou" is 1 when an
+// <EOU> (end of utterance) fired this feed and "eob" is 1 when an <EOB>
+// (backchannel) fired. ABI v4 conflated the two into "eou"; v5 split them, so
+// we read both and treat either as an utterance boundary for segmentation.
+// "words" are the words finalized this call with absolute (stream-relative)
+// start/end seconds.
 type streamFeedJSON struct {
 	Text     string           `json:"text"`
 	Eou      int              `json:"eou"`
@@ -365,7 +364,7 @@ var segmentSeparators = []rune{'.', '?', '!'}
 // the caller requested word granularity; token ids populate each segment's
 // Tokens by time-window membership. Shared by the batched and direct paths.
 func transcriptResultFromDoc(doc transcriptJSON, opts *pb.TranscriptRequest, gapFrames int) pb.TranscriptResult {
-	text, eou := stripEouMarker(strings.TrimSpace(doc.Text))
+	text := strings.TrimSpace(doc.Text)

 	// Frame-unit gap threshold -> seconds (NeMo segment_gap_threshold). 0 = off.
 	gapSeconds := 0.0
@@ -384,7 +383,6 @@ func transcriptResultFromDoc(doc transcriptJSON, opts *pb.TranscriptRequest, gap
 		return pb.TranscriptResult{
 			Text:     text,
 			Segments: []*pb.TranscriptSegment{{Id: 0, Text: text}},
-			Eou:      eou,
 		}
 	}

@@ -411,25 +409,7 @@ func transcriptResultFromDoc(doc transcriptJSON, opts *pb.TranscriptRequest, gap
 		}
 		segments = append(segments, seg)
 	}
-	return pb.TranscriptResult{Text: text, Segments: segments, Eou: eou}
-}
-
-// stripEouMarker removes a trailing literal <EOU>/<EOB> from offline-decode
-// text and reports whether the decode ended on an end-of-UTTERANCE token. The
-// realtime EOU model's offline decode keeps the special token in the
-// detokenized text (the streaming path strips it and surfaces it as flags
-// instead); user-visible transcripts must never carry either marker, but only
-// <EOU> may confirm the semantic_vad retranscribe cross-check — a decode
-// ending on <EOB> means the last thing heard was a backchannel, not the user
-// yielding the turn.
-func stripEouMarker(text string) (string, bool) {
-	if strings.HasSuffix(text, "<EOU>") {
-		return strings.TrimSpace(strings.TrimSuffix(text, "<EOU>")), true
-	}
-	if strings.HasSuffix(text, "<EOB>") {
-		return strings.TrimSpace(strings.TrimSuffix(text, "<EOB>")), false
-	}
-	return text, false
+	return pb.TranscriptResult{Text: text, Segments: segments}
 }

 // splitWordsIntoSegments groups words into segments exactly as NeMo's
@@ -496,55 +476,41 @@ func tokensInWindow(tokens []transcriptToken, start, end float64) []int32 {
 	return ids
 }

-// streamSegmenter accumulates streaming decode increments into per-utterance
-// segments. <EOU>/<EOB> are the model's own utterance boundaries; each closes a
-// segment. When the feed carries per-word timings (ABI v4 JSON), a closed
-// segment takes its start/end from its first/last word; against an older
-// text-only library (no words) it falls back to segmenting the delta text, so
-// the same assembler serves both paths.
+// streamSegmenter accumulates streaming words into per-utterance segments. An
+// <EOU> or <EOB> reported by a feed closes the open segment; each closed segment
+// takes its start/end from its first/last accumulated word.
 type streamSegmenter struct {
-	segs    []*pb.TranscriptSegment
-	cur     []transcriptWord // words for the open segment (ABI v4 JSON path)
-	curText []string         // delta text for the open segment (text-only path)
-	nextID  int32
+	segs   []*pb.TranscriptSegment
+	cur    []transcriptWord
+	nextID int32
 }

-func (s *streamSegmenter) add(r streamFeedResult) {
-	s.cur = append(s.cur, r.Words...)
-	if len(r.Words) == 0 && r.Delta != "" {
-		// Older libparakeet.so with no per-word timing: segment from the text.
-		s.curText = append(s.curText, r.Delta)
-	}
-	// Both <EOU> and <EOB> reset the decoder, so both close a segment.
-	if r.Eou || r.Eob {
+func (s *streamSegmenter) add(doc streamFeedJSON) {
+	s.cur = append(s.cur, doc.Words...)
+	// Close the segment on either turn signal: <EOU> (end of utterance) or
+	// <EOB> (backchannel). ABI v4 reported both via "eou"; v5 split them, so we
+	// OR them here to keep the v4 segmentation boundaries.
+	if doc.Eou != 0 || doc.Eob != 0 {
 		s.flush()
 	}
 }

 func (s *streamSegmenter) flush() {
-	switch {
-	case len(s.cur) > 0:
-		parts := make([]string, len(s.cur))
-		for i, w := range s.cur {
-			parts[i] = w.W
-		}
-		s.segs = append(s.segs, &pb.TranscriptSegment{
-			Id:    s.nextID,
-			Start: secondsToNanos(s.cur[0].Start),
-			End:   secondsToNanos(s.cur[len(s.cur)-1].End),
-			Text:  strings.TrimSpace(strings.Join(parts, " ")),
-		})
-		s.nextID++
-	case len(s.curText) > 0:
-		// No words this segment: emit a text-only segment (no timestamps),
-		// skipping a purely-whitespace one as the legacy text path did.
-		if t := strings.TrimSpace(strings.Join(s.curText, "")); t != "" {
-			s.segs = append(s.segs, &pb.TranscriptSegment{Id: s.nextID, Text: t})
-			s.nextID++
-		}
+	if len(s.cur) == 0 {
+		return
 	}
+	parts := make([]string, len(s.cur))
+	for i, w := range s.cur {
+		parts[i] = w.W
+	}
+	s.segs = append(s.segs, &pb.TranscriptSegment{
+		Id:    s.nextID,
+		Start: secondsToNanos(s.cur[0].Start),
+		End:   secondsToNanos(s.cur[len(s.cur)-1].End),
+		Text:  strings.TrimSpace(strings.Join(parts, " ")),
+	})
+	s.nextID++
 	s.cur = nil
-	s.curText = nil
 }

 func (s *streamSegmenter) segments() []*pb.TranscriptSegment { return s.segs }
@@ -569,119 +535,18 @@ func secondsToNanos(sec float64) int64 {
 	return int64(sec * 1e9)
 }

-// Per-C-call engine serialization for the streaming paths.
-//
-// Every individual C call (begin / feed / finalize / free) takes engineMu and
-// re-checks ctxPtr under the lock; the lock is NEVER held across a stream's
-// lifetime. This is safe because each parakeet.cpp call builds its own ggml
-// graph and all streaming caches live in the session object, not the ctx —
-// the only ctx-shared mutable state is last_error, which is why it is read
-// under the same lock as the failing call. Holding the lock per call (rather
-// than per stream, as this file previously did) keeps a long-lived live
-// session from starving batched unary transcription and vice versa.
-//
-// A stream must not outlive its ctx (C-API contract). Free() takes engineMu
-// and zeroes ctxPtr, so a racing per-call helper returns ModelNotLoaded
-// instead of feeding a freed engine; streamFree of an orphaned session only
-// runs the session destructor, which does not touch the ctx.
-
-// streamBegin opens a cache-aware streaming session. A 0 stream with nil
-// error means the loaded model is not a streaming model.
-func (p *ParakeetCpp) streamBegin(lang string) (uintptr, error) {
-	p.engineMu.Lock()
-	defer p.engineMu.Unlock()
-	if p.ctxPtr == 0 {
-		return 0, grpcerrors.ModelNotLoaded("parakeet-cpp")
-	}
-	if CppStreamBeginLang != nil {
-		return CppStreamBeginLang(p.ctxPtr, lang), nil
-	}
-	return CppStreamBegin(p.ctxPtr), nil
-}
-
-func (p *ParakeetCpp) streamFree(stream uintptr) {
-	if stream == 0 {
-		return
-	}
-	p.engineMu.Lock()
-	defer p.engineMu.Unlock()
-	CppStreamFree(stream)
-}
-
-// streamFeedText runs one text-mode feed (or the finalize flush when
-// finalize is true) under engineMu, returning the newly-finalized delta and
-// whether an <EOU>/<EOB> fired during the call.
-func (p *ParakeetCpp) streamFeedText(stream uintptr, pcm []float32, finalize bool) (delta string, eou, eob bool, err error) {
-	p.engineMu.Lock()
-	defer p.engineMu.Unlock()
-	if p.ctxPtr == 0 {
-		return "", false, false, grpcerrors.ModelNotLoaded("parakeet-cpp")
-	}
-	var ret uintptr
-	var events int32
-	if finalize {
-		ret = CppStreamFinalize(stream)
-	} else {
-		ret = CppStreamFeed(stream, pcm, int32(len(pcm)), unsafe.Pointer(&events))
-	}
-	if ret == 0 {
-		// last_error is ctx-shared: read it under the same lock as the call.
-		msg := CppLastError(p.ctxPtr)
-		if msg == "" {
-			msg = "unknown error"
-		}
-		return "", false, false, fmt.Errorf("parakeet-cpp: stream feed/finalize failed: %s", msg)
-	}
-	delta = goStringFromCPtr(ret)
-	CppFreeString(ret)
-	// ABI v5: eou_out is a bitmask (bit 0 = <EOU>, bit 1 = <EOB>). A v4
-	// library sets 0/1 for either token, which the bit-0 test reads as the
-	// old conflated eou — the EOB distinction simply isn't available there.
-	return delta, events&1 != 0, events&2 != 0, nil
-}
-
-// streamFeedDoc runs one ABI v4 JSON feed (or finalize) under engineMu and
-// returns the parsed {text,eou,frame_sec,words} document.
-func (p *ParakeetCpp) streamFeedDoc(stream uintptr, pcm []float32, finalize bool) (streamFeedJSON, error) {
-	p.engineMu.Lock()
-	defer p.engineMu.Unlock()
-	if p.ctxPtr == 0 {
-		return streamFeedJSON{}, grpcerrors.ModelNotLoaded("parakeet-cpp")
-	}
-	var ret uintptr
-	if finalize {
-		ret = CppStreamFinalizeJSON(stream)
-	} else {
-		ret = CppStreamFeedJSON(stream, pcm, int32(len(pcm)))
-	}
-	if ret == 0 {
-		msg := CppLastError(p.ctxPtr)
-		if msg == "" {
-			msg = "unknown error"
-		}
-		return streamFeedJSON{}, fmt.Errorf("parakeet-cpp: stream feed/finalize failed: %s", msg)
-	}
-	raw := goStringFromCPtr(ret)
-	CppFreeString(ret)
-	var doc streamFeedJSON
-	if err := json.Unmarshal([]byte(raw), &doc); err != nil {
-		return streamFeedJSON{}, fmt.Errorf("parakeet-cpp: decode stream json: %w", err)
-	}
-	return doc, nil
-}
-
 // AudioTranscriptionStream drives the cache-aware streaming RNN-T over the
-// audio at opts.Dst: it decodes the file to 16 kHz mono PCM, feeds it through
-// the shared decode driver (feedSlices/flushTail), and emits each
-// newly-finalized text run as a TranscriptStreamResponse delta. <EOU>/<EOB>
-// events close the current segment; a closing FinalResult carries the full
-// transcript, the per-utterance segments, and whether the file ended on an
-// utterance boundary.
+// audio at opts.Dst: it decodes the file to 16 kHz mono PCM, feeds it in
+// chunks to parakeet_capi_stream_feed, and emits each newly-finalized text
+// run as a TranscriptStreamResponse delta. <EOU>/<EOB> events close the
+// current segment; a closing FinalResult carries the full transcript and the
+// per-utterance segments.
 //
 // stream_begin returns 0 for models that are not cache-aware streaming models
-// (only e.g. nvidia/parakeet_realtime_eou_120m-v1 qualifies). For those this
-// returns codes.Unimplemented rather than faking a stream from an offline
-// decode — see the stream==0 branch and grpcerrors.StreamTranscriptionUnsupported.
+// (only e.g. nvidia/parakeet_realtime_eou_120m-v1 qualifies). For those we fall
+// back to a single offline transcription emitted as one delta plus a closing
+// FinalResult, the same contract LocalAI uses for backends without native
+// streaming (e.g. whisper), so the streaming endpoint works for every model.
 func (p *ParakeetCpp) AudioTranscriptionStream(ctx context.Context, opts *pb.TranscriptRequest, results chan *pb.TranscriptStreamResponse) error {
 	defer close(results)

@@ -695,73 +560,185 @@ func (p *ParakeetCpp) AudioTranscriptionStream(ctx context.Context, opts *pb.Tra
 		return status.Error(codes.Canceled, "transcription cancelled")
 	}

-	stream, err := p.streamBegin(opts.GetLanguage())
-	if err != nil {
-		return err
+	var stream uintptr
+	if CppStreamBeginLang != nil {
+		stream = CppStreamBeginLang(p.ctxPtr, opts.GetLanguage())
+	} else {
+		stream = CppStreamBegin(p.ctxPtr)
 	}
 	if stream == 0 {
-		// Not a cache-aware streaming model. Report the missing capability
-		// honestly instead of decoding offline and emitting it as one "delta"
-		// + final: a client that asked for streaming must learn the model
-		// cannot stream, not receive a batch result dressed as a stream (which
-		// is indistinguishable except qualitatively, and silently breaks any
-		// feature that genuinely needs incremental output). Callers wanting a
-		// plain transcript use the unary AudioTranscription path. This mirrors
-		// AudioTranscriptionLive, which already returns Unimplemented here.
-		return grpcerrors.StreamTranscriptionUnsupported("parakeet-cpp",
-			"loaded model is not a cache-aware streaming model")
+		// Not a cache-aware streaming model: run a normal offline
+		// transcription and emit it as one delta + a closing final result.
+		res, err := p.AudioTranscription(ctx, opts)
+		if err != nil {
+			return err
+		}
+		if t := strings.TrimSpace(res.Text); t != "" {
+			results <- &pb.TranscriptStreamResponse{Delta: t}
+		}
+		results <- &pb.TranscriptStreamResponse{FinalResult: &res}
+		return nil
 	}
-	defer p.streamFree(stream)
+	defer CppStreamFree(stream)
+	// The C engine is a single shared context: a streaming session and a batched
+	// unary dispatch must never touch it at once, so hold engineMu for the whole
+	// stream. This lock is intentionally taken AFTER the non-streaming fallback
+	// above returns: that fallback goes through AudioTranscription -> the batcher
+	// -> runBatch, which itself acquires engineMu, so locking here first would
+	// deadlock. Do not hoist this lock above the fallback.
+	p.engineMu.Lock()
+	defer p.engineMu.Unlock()

 	data, duration, err := decodeWavMono16k(opts.Dst)
 	if err != nil {
 		return err
 	}

-	// Fold the shared decode driver's per-feed increments into the streamed
-	// deltas and the closing batch result: words/text accumulate into
-	// per-utterance segments (streamSegmenter), and the utterance-boundary
-	// latch (boundary.go) records whether the file ended on an <EOU>. These
-	// are the offline path's concern — the live RPC carries none of them.
+	// ABI v4: when the streaming JSON entry points are present, drive them so the
+	// per-utterance segments carry per-word start/end timestamps. Falls through to
+	// the text-only loop below against an older libparakeet.so. Runs under the
+	// engineMu already held above.
+	if CppStreamFeedJSON != nil {
+		return p.streamJSON(ctx, stream, data, duration, results)
+	}
+
 	var (
 		full     strings.Builder
-		seg      streamSegmenter
-		boundary utteranceBoundary
+		segText  strings.Builder
+		segments []*pb.TranscriptSegment
+		segID    int32
 	)
-	emit := func(r streamFeedResult) error {
-		if r.Delta != "" {
-			full.WriteString(r.Delta)
-			results <- &pb.TranscriptStreamResponse{Delta: r.Delta}
+
+	flushSegment := func() {
+		t := strings.TrimSpace(segText.String())
+		segText.Reset()
+		if t == "" {
+			return
 		}
-		seg.add(r)
-		boundary = boundary.observe(r)
+		segments = append(segments, &pb.TranscriptSegment{Id: segID, Text: t})
+		segID++
+	}
+
+	// emitDelta consumes the malloc'd char* returned by feed/finalize: frees
+	// it, accumulates the text, and sends a delta when non-empty. A 0 return
+	// is an error (vs the "" empty-but-non-NULL no-new-text case).
+	emitDelta := func(ret uintptr) error {
+		if ret == 0 {
+			msg := CppLastError(p.ctxPtr)
+			if msg == "" {
+				msg = "unknown error"
+			}
+			return fmt.Errorf("parakeet-cpp: stream feed/finalize failed: %s", msg)
+		}
+		delta := goStringFromCPtr(ret)
+		CppFreeString(ret)
+		if delta == "" {
+			return nil
+		}
+		full.WriteString(delta)
+		segText.WriteString(delta)
+		results <- &pb.TranscriptStreamResponse{Delta: delta}
 		return nil
 	}

-	if err := p.feedSlices(ctx, stream, data, emit); err != nil {
-		return err
-	}
-	if err := p.flushTail(stream, emit); err != nil {
-		return err
-	}
-	seg.flush() // close a trailing utterance that never saw an <EOU>
+	for off := 0; off < len(data); off += streamChunkSamples {
+		if err := ctx.Err(); err != nil {
+			return status.Error(codes.Canceled, "transcription cancelled")
+		}
+		end := min(off+streamChunkSamples, len(data))
+		chunk := data[off:end]

-	// final.Text is the exact concatenation of the streamed deltas (full is
-	// their accumulation), so concat(deltas) == FinalResult.Text holds even
-	// when the model prepends a leading space to the first word (SentencePiece
-	// detokenization). This matches the whisper backend's streaming contract.
-	// The single-segment fallback stays trimmed.
-	fullText := full.String()
-	segments := seg.segments()
-	if trimmed := strings.TrimSpace(fullText); len(segments) == 0 && trimmed != "" {
-		segments = append(segments, &pb.TranscriptSegment{Id: 0, Text: trimmed})
+		var eou int32
+		ret := CppStreamFeed(stream, chunk, int32(len(chunk)), unsafe.Pointer(&eou))
+		if err := emitDelta(ret); err != nil {
+			return err
+		}
+		if eou != 0 {
+			flushSegment()
+		}
+	}
+
+	// Flush the streaming tail (final encoder chunk).
+	if err := emitDelta(CppStreamFinalize(stream)); err != nil {
+		return err
+	}
+	flushSegment()
+
+	text := strings.TrimSpace(full.String())
+	if len(segments) == 0 && text != "" {
+		segments = append(segments, &pb.TranscriptSegment{Id: 0, Text: text})
 	}
 	results <- &pb.TranscriptStreamResponse{
 		FinalResult: &pb.TranscriptResult{
-			Text:     fullText,
+			Text:     text,
+			Segments: segments,
+			Duration: duration,
+		},
+	}
+	return nil
+}
+
+// streamJSON drives the streaming JSON entry points (present since ABI v4): each
+// feed/finalize returns a {text,eou,eob,frame_sec,words} document. The
+// newly-finalized text is emitted as a delta (unchanged streaming contract)
+// while words are accumulated into per-utterance segments (closed on <EOU> or
+// <EOB>) so the closing FinalResult carries timestamped segments. Runs under
+// engineMu (already held by the caller).
+func (p *ParakeetCpp) streamJSON(ctx context.Context, stream uintptr, data []float32,
+	duration float32, results chan *pb.TranscriptStreamResponse) error {
+	var (
+		full strings.Builder
+		seg  streamSegmenter
+	)
+	// consume frees the malloc'd char* (a 0 return is an error), parses the JSON,
+	// emits the delta, and routes words through the segmenter.
+	consume := func(ret uintptr) error {
+		if ret == 0 {
+			msg := CppLastError(p.ctxPtr)
+			if msg == "" {
+				msg = "unknown error"
+			}
+			return fmt.Errorf("parakeet-cpp: stream feed/finalize failed: %s", msg)
+		}
+		raw := goStringFromCPtr(ret)
+		CppFreeString(ret)
+		var doc streamFeedJSON
+		if err := json.Unmarshal([]byte(raw), &doc); err != nil {
+			return fmt.Errorf("parakeet-cpp: decode stream json: %w", err)
+		}
+		if doc.Text != "" {
+			full.WriteString(doc.Text)
+			results <- &pb.TranscriptStreamResponse{Delta: doc.Text}
+		}
+		seg.add(doc)
+		return nil
+	}
+
+	for off := 0; off < len(data); off += streamChunkSamples {
+		if err := ctx.Err(); err != nil {
+			return status.Error(codes.Canceled, "transcription cancelled")
+		}
+		end := min(off+streamChunkSamples, len(data))
+		chunk := data[off:end]
+		if err := consume(CppStreamFeedJSON(stream, chunk, int32(len(chunk)))); err != nil {
+			return err
+		}
+	}
+	if err := consume(CppStreamFinalizeJSON(stream)); err != nil {
+		return err
+	}
+	seg.flush() // close any trailing utterance that never saw an EOU
+
+	text := strings.TrimSpace(full.String())
+	segments := seg.segments()
+	if len(segments) == 0 && text != "" {
+		segments = append(segments, &pb.TranscriptSegment{Id: 0, Text: text})
+	}
+	results <- &pb.TranscriptStreamResponse{
+		FinalResult: &pb.TranscriptResult{
+			Text:     text,
 			Segments: segments,
 			Duration: duration,
-			Eou:      boundary.ended(),
 		},
 	}
 	return nil
@@ -826,10 +803,6 @@ func (p *ParakeetCpp) Free() error {
 		close(p.batStop)
 		p.batStop = nil
 	}
-	// engineMu so an in-flight streaming call (which locks per C call and
-	// re-checks ctxPtr under the lock) can never feed into a freed ctx.
-	p.engineMu.Lock()
-	defer p.engineMu.Unlock()
 	if p.ctxPtr != 0 {
 		CppFree(p.ctxPtr)
 		p.ctxPtr = 0
--- a/backend/go/parakeet-cpp/goparakeetcpp_test.go
+++ b/backend/go/parakeet-cpp/goparakeetcpp_test.go
@@ -14,8 +14,6 @@ import (
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
-	"google.golang.org/grpc/codes"
-	"google.golang.org/grpc/status"
 )

 func TestParakeetCpp(t *testing.T) {
@@ -203,29 +201,6 @@ var _ = Describe("ParakeetCpp", func() {
 	})

 	Context("AudioTranscriptionStream", func() {
-		It("returns the typed Unimplemented signal for non-streaming models (no offline fallback)", func() {
-			// stream_begin == 0 means the loaded model is not a cache-aware
-			// streaming model. The backend must surface that, not silently
-			// decode offline and fake a one-shot "stream".
-			savedBegin, savedBeginLang := CppStreamBegin, CppStreamBeginLang
-			defer func() { CppStreamBegin, CppStreamBeginLang = savedBegin, savedBeginLang }()
-			CppStreamBeginLang = nil
-			CppStreamBegin = func(ctx uintptr) uintptr { return 0 }
-
-			p := &ParakeetCpp{ctxPtr: 1}
-			results := make(chan *pb.TranscriptStreamResponse, 8)
-			err := p.AudioTranscriptionStream(context.Background(),
-				&pb.TranscriptRequest{Dst: "ignored.wav"}, results)
-			Expect(status.Code(err)).To(Equal(codes.Unimplemented))
-
-			// Honest signal: nothing was emitted — no faked batch result.
-			var emitted []*pb.TranscriptStreamResponse
-			for r := range results {
-				emitted = append(emitted, r)
-			}
-			Expect(emitted).To(BeEmpty())
-		})
-
 		It("streams deltas and a closing FinalResult from a cache-aware model", func() {
 			// Streaming needs a cache-aware streaming model (e.g.
 			// realtime_eou); the offline test model would fail stream_begin.
--- a/backend/go/parakeet-cpp/live.go
+++ b/backend/go/parakeet-cpp/live.go
@@ -1,186 +0,0 @@
-package main
-
-import (
-	"strings"
-	"time"
-
-	"github.com/mudler/LocalAI/pkg/grpc/grpcerrors"
-	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
-	"github.com/mudler/xlog"
-	"google.golang.org/grpc/codes"
-	"google.golang.org/grpc/status"
-)
-
-// liveSampleRate is the only PCM rate the parakeet C streaming API accepts.
-const liveSampleRate = 16000
-
-// AudioTranscriptionLive drives one cache-aware streaming session over audio
-// fed incrementally by the caller (the realtime API's semantic_vad turn
-// detection). Contract:
-//
-//   - the first request must carry a Config; a Config mid-stream resets the
-//     decode session (free + begin) and drops accumulated transcript state;
-//   - a Ready ack is sent right after a successful stream_begin so callers
-//     can degrade synchronously when the model has no streaming support
-//     (LiveTranscriptionUnsupported, codes.Unimplemented);
-//   - every feed that produced output is forwarded as {delta, eou, words};
-//     the <EOU>/<EOB> flag is the model's own utterance boundary and the
-//     decoder auto-resets after it, so one session spans many utterances;
-//   - closing the send side finalizes: the held-back tail chunk is flushed
-//     (the last ~2 encoder frames of words only appear here) and a terminal
-//     FinalResult carries the full transcript Text only. Per-utterance
-//     segments, duration, and the terminal <EOU> flag are NOT produced here —
-//     the realtime core consumes the streamed per-feed tokens and the final
-//     Text; those batch fields are the file path's concern (see
-//     AudioTranscriptionStream).
-//
-// Engine access is serialized per C call (streamBegin/streamFeed*/streamFree
-// take engineMu internally), never for the session lifetime — unary
-// transcription keeps flowing between feeds.
-func (p *ParakeetCpp) AudioTranscriptionLive(in <-chan *pb.TranscriptLiveRequest, out chan<- *pb.TranscriptLiveResponse) error {
-	defer close(out)
-
-	if p.ctxPtr == 0 {
-		return grpcerrors.ModelNotLoaded("parakeet-cpp")
-	}
-
-	first, ok := <-in
-	if !ok {
-		return nil // caller closed without sending anything
-	}
-	cfg := first.GetConfig()
-	if cfg == nil {
-		return status.Error(codes.InvalidArgument, "parakeet-cpp: first live message must carry a config")
-	}
-	if err := validateLiveConfig(cfg); err != nil {
-		return err
-	}
-
-	stream, err := p.streamBegin(cfg.GetLanguage())
-	if err != nil {
-		return err
-	}
-	if stream == 0 {
-		return grpcerrors.LiveTranscriptionUnsupported("parakeet-cpp",
-			"loaded model is not a cache-aware streaming model")
-	}
-	// stream is reassigned on a mid-stream Config reset; free whatever is
-	// current when the RPC unwinds.
-	defer func() { p.streamFree(stream) }()
-
-	out <- &pb.TranscriptLiveResponse{Ready: true}
-
-	var (
-		full    strings.Builder
-		fedSecs float64
-
-		// behindSec accumulates how far decode wall time has fallen behind
-		// the audio it was fed. A live caller feeds in real time, so a
-		// persistent positive backlog means every downstream signal —
-		// including the <EOU> the turn detector waits on — arrives that many
-		// seconds late. Warned once per session; reset by a Config reset.
-		behindSec    float64
-		behindWarned bool
-	)
-
-	// emit forwards one decode increment: it streams the per-feed tokens the
-	// realtime turn detector consumes (delta/eou/eob/words) and accumulates the
-	// running transcript for the closing FinalResult. No segmentation or
-	// boundary latch here — the live consumer reads only the streamed tokens
-	// and the final Text; per-utterance segments and the terminal <EOU> flag
-	// are an offline-path concern (see AudioTranscriptionStream / boundary.go).
-	emit := func(r streamFeedResult) error {
-		if r.Delta != "" {
-			full.WriteString(r.Delta)
-		}
-		if r.Delta != "" || r.Eou || r.Eob || len(r.Words) > 0 {
-			out <- &pb.TranscriptLiveResponse{
-				Delta: r.Delta,
-				Eou:   r.Eou,
-				Eob:   r.Eob,
-				Words: liveWordsToProto(r.Words),
-			}
-		}
-		return nil
-	}
-
-	for req := range in {
-		switch payload := req.GetPayload().(type) {
-		case *pb.TranscriptLiveRequest_Config:
-			if err := validateLiveConfig(payload.Config); err != nil {
-				return err
-			}
-			// Reset: a fresh decode session, dropping accumulated state.
-			p.streamFree(stream)
-			stream, err = p.streamBegin(payload.Config.GetLanguage())
-			if err != nil {
-				return err
-			}
-			if stream == 0 {
-				return grpcerrors.LiveTranscriptionUnsupported("parakeet-cpp",
-					"loaded model is not a cache-aware streaming model")
-			}
-			full.Reset()
-			fedSecs = 0
-		case *pb.TranscriptLiveRequest_Audio:
-			pcm := payload.Audio.GetPcm()
-			audioSec := float64(len(pcm)) / liveSampleRate
-			fedSecs += audioSec
-			start := time.Now()
-			// nil ctx: a live session is bounded by this request channel, not a
-			// context — cancellation is the caller closing the stream.
-			if err := p.feedSlices(nil, stream, pcm, emit); err != nil {
-				return err
-			}
-			wallSec := time.Since(start).Seconds()
-			behindSec += wallSec - audioSec
-			if behindSec < 0 {
-				behindSec = 0
-			}
-			xlog.Debug("parakeet-cpp: live feed",
-				"audio_ms", int(audioSec*1000), "wall_ms", int(wallSec*1000),
-				"behind_ms", int(behindSec*1000), "fed_s", fedSecs)
-			if behindSec > 1 && !behindWarned {
-				behindWarned = true
-				xlog.Warn("parakeet-cpp: live decode is falling behind real time; "+
-					"end-of-utterance signals will arrive late",
-					"behind_s", behindSec, "fed_s", fedSecs)
-			}
-		}
-	}
-
-	// Send side closed: flush the streaming tail and emit the final transcript.
-	// The live FinalResult carries only Text — the authoritative full-turn
-	// transcript the realtime core commits. Per-utterance segments, duration,
-	// and the terminal <EOU> flag are not produced on the live path.
-	if err := p.flushTail(stream, emit); err != nil {
-		return err
-	}
-	out <- &pb.TranscriptLiveResponse{
-		FinalResult: &pb.TranscriptResult{Text: strings.TrimSpace(full.String())},
-	}
-	return nil
-}
-
-func validateLiveConfig(cfg *pb.TranscriptLiveConfig) error {
-	if sr := cfg.GetSampleRate(); sr != 0 && sr != liveSampleRate {
-		return status.Errorf(codes.InvalidArgument,
-			"parakeet-cpp: unsupported live sample_rate %d (only %d)", sr, liveSampleRate)
-	}
-	return nil
-}
-
-func liveWordsToProto(words []transcriptWord) []*pb.TranscriptWord {
-	if len(words) == 0 {
-		return nil
-	}
-	out := make([]*pb.TranscriptWord, len(words))
-	for i, w := range words {
-		out[i] = &pb.TranscriptWord{
-			Start: secondsToNanos(w.Start),
-			End:   secondsToNanos(w.End),
-			Text:  w.W,
-		}
-	}
-	return out
-}
--- a/backend/go/parakeet-cpp/live_test.go
+++ b/backend/go/parakeet-cpp/live_test.go
@@ -1,417 +0,0 @@
-package main
-
-import (
-	"sync"
-	"time"
-	"unsafe"
-
-	"github.com/mudler/LocalAI/pkg/grpc/grpcerrors"
-	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-	"google.golang.org/grpc/codes"
-	"google.golang.org/grpc/status"
-)
-
-// The live-RPC specs drive AudioTranscriptionLive entirely against stubbed
-// Cpp* package vars (the same seam batcher_test.go uses), so they run
-// without libparakeet.so.
-
-// liveCstrPool hands out NUL-terminated C-style strings backed by Go memory
-// and keeps them alive for the duration of a spec (goStringFromCPtr reads
-// through the raw pointer; Go's GC must not collect the backing array while
-// a stub's return value is in flight).
-type liveCstrPool struct {
-	mu   sync.Mutex
-	bufs [][]byte
-}
-
-func (p *liveCstrPool) cstr(s string) uintptr {
-	p.mu.Lock()
-	defer p.mu.Unlock()
-	b := append([]byte(s), 0)
-	p.bufs = append(p.bufs, b)
-	return uintptr(unsafe.Pointer(&b[0]))
-}
-
-// liveStubs swaps every C entry point the live path touches and returns a
-// restore func for AfterEach.
-func liveStubs() (restore func()) {
-	savedBegin, savedBeginLang := CppStreamBegin, CppStreamBeginLang
-	savedFeed, savedFeedJSON := CppStreamFeed, CppStreamFeedJSON
-	savedFinalize, savedFinalizeJSON := CppStreamFinalize, CppStreamFinalizeJSON
-	savedFree, savedLastError := CppStreamFree, CppLastError
-	savedFreeString := CppFreeString
-	return func() {
-		CppStreamBegin, CppStreamBeginLang = savedBegin, savedBeginLang
-		CppStreamFeed, CppStreamFeedJSON = savedFeed, savedFeedJSON
-		CppStreamFinalize, CppStreamFinalizeJSON = savedFinalize, savedFinalizeJSON
-		CppStreamFree, CppLastError = savedFree, savedLastError
-		CppFreeString = savedFreeString
-	}
-}
-
-// runLive starts the RPC on its own goroutine and returns the request
-// channel plus a collector for everything the backend emitted.
-func runLive(p *ParakeetCpp) (chan *pb.TranscriptLiveRequest, chan *pb.TranscriptLiveResponse, chan error) {
-	in := make(chan *pb.TranscriptLiveRequest)
-	out := make(chan *pb.TranscriptLiveResponse, 32)
-	errCh := make(chan error, 1)
-	go func() { errCh <- p.AudioTranscriptionLive(in, out) }()
-	return in, out, errCh
-}
-
-func liveConfig(lang string) *pb.TranscriptLiveRequest {
-	return &pb.TranscriptLiveRequest{
-		Payload: &pb.TranscriptLiveRequest_Config{Config: &pb.TranscriptLiveConfig{Language: lang}},
-	}
-}
-
-func liveAudio(pcm []float32) *pb.TranscriptLiveRequest {
-	return &pb.TranscriptLiveRequest{
-		Payload: &pb.TranscriptLiveRequest_Audio{Audio: &pb.TranscriptLiveAudio{Pcm: pcm}},
-	}
-}
-
-func collectLive(out chan *pb.TranscriptLiveResponse) []*pb.TranscriptLiveResponse {
-	var got []*pb.TranscriptLiveResponse
-	for r := range out {
-		got = append(got, r)
-	}
-	return got
-}
-
-var _ = Describe("AudioTranscriptionLive (stubbed C API)", func() {
-	var (
-		pool    *liveCstrPool
-		restore func()
-		p       *ParakeetCpp
-	)
-
-	BeforeEach(func() {
-		pool = &liveCstrPool{}
-		restore = liveStubs()
-		p = &ParakeetCpp{ctxPtr: 1}
-
-		CppStreamBeginLang = nil
-		CppStreamBegin = func(ctx uintptr) uintptr { return 7 }
-		CppStreamFree = func(s uintptr) {}
-		CppFreeString = func(s uintptr) {}
-		CppLastError = func(ctx uintptr) string { return "stub error" }
-		CppStreamFeed = nil
-		CppStreamFeedJSON = nil
-		CppStreamFinalize = nil
-		CppStreamFinalizeJSON = nil
-	})
-
-	AfterEach(func() { restore() })
-
-	It("rejects a stream whose first message is not a config", func() {
-		in, out, errCh := runLive(p)
-		in <- liveAudio([]float32{0.1})
-		close(in)
-
-		err := <-errCh
-		Expect(status.Code(err)).To(Equal(codes.InvalidArgument))
-		Expect(collectLive(out)).To(BeEmpty())
-	})
-
-	It("rejects a non-16k sample rate", func() {
-		in, _, errCh := runLive(p)
-		in <- &pb.TranscriptLiveRequest{
-			Payload: &pb.TranscriptLiveRequest_Config{Config: &pb.TranscriptLiveConfig{SampleRate: 8000}},
-		}
-		close(in)
-		Expect(status.Code(<-errCh)).To(Equal(codes.InvalidArgument))
-	})
-
-	It("returns the typed Unimplemented signal for non-streaming models, before any ack", func() {
-		CppStreamBegin = func(ctx uintptr) uintptr { return 0 }
-
-		in, out, errCh := runLive(p)
-		in <- liveConfig("")
-		close(in)
-
-		err := <-errCh
-		Expect(grpcerrors.IsLiveTranscriptionUnsupported(err)).To(BeTrue())
-		Expect(collectLive(out)).To(BeEmpty())
-	})
-
-	It("streams deltas, eou flags and words on the JSON path and finalizes on close", func() {
-		var freed []uintptr
-		CppStreamFree = func(s uintptr) { freed = append(freed, s) }
-		feeds := 0
-		CppStreamFeedJSON = func(s uintptr, pcm []float32, n int32) uintptr {
-			feeds++
-			switch feeds {
-			case 1:
-				return pool.cstr(`{"text":"hello ","eou":0,"frame_sec":0.08,` +
-					`"words":[{"w":"hello","start":0.1,"end":0.4,"conf":0.9}]}`)
-			default:
-				return pool.cstr(`{"text":"world","eou":1,"frame_sec":0.08,` +
-					`"words":[{"w":"world","start":0.5,"end":0.8,"conf":0.9}]}`)
-			}
-		}
-		CppStreamFinalizeJSON = func(s uintptr) uintptr {
-			return pool.cstr(`{"text":"","eou":0,"frame_sec":0.08,"words":[]}`)
-		}
-
-		in, out, errCh := runLive(p)
-		in <- liveConfig("en")
-		in <- liveAudio(make([]float32, 100))
-		in <- liveAudio(make([]float32, 200))
-		close(in)
-		Expect(<-errCh).NotTo(HaveOccurred())
-
-		got := collectLive(out)
-		Expect(got).To(HaveLen(4)) // ready, two deltas, final
-
-		Expect(got[0].Ready).To(BeTrue())
-
-		Expect(got[1].Delta).To(Equal("hello "))
-		Expect(got[1].Eou).To(BeFalse())
-		Expect(got[1].Words).To(HaveLen(1))
-		Expect(got[1].Words[0].Text).To(Equal("hello"))
-
-		Expect(got[2].Delta).To(Equal("world"))
-		Expect(got[2].Eou).To(BeTrue())
-
-		final := got[3].FinalResult
-		Expect(final).NotTo(BeNil())
-		Expect(final.Text).To(Equal("hello world"))
-		// The live FinalResult carries only Text. Per-utterance segments,
-		// duration and the terminal eou flag are an offline-path concern (see
-		// boundary.go / AudioTranscriptionStream); the realtime core reads the
-		// streamed per-feed tokens above plus this Text.
-		Expect(final.Eou).To(BeFalse())
-		Expect(final.Segments).To(BeEmpty())
-		Expect(final.Duration).To(BeZero())
-
-		Expect(freed).To(Equal([]uintptr{7}))
-	})
-
-	It("falls back to the text feed (eou out-param) when the JSON entry points are absent", func() {
-		feeds := 0
-		CppStreamFeed = func(s uintptr, pcm []float32, n int32, eouOut unsafe.Pointer) uintptr {
-			feeds++
-			if feeds == 2 {
-				*(*int32)(eouOut) = 1
-				return pool.cstr("done")
-			}
-			return pool.cstr("first ")
-		}
-		CppStreamFinalize = func(s uintptr) uintptr { return pool.cstr("") }
-
-		in, out, errCh := runLive(p)
-		in <- liveConfig("")
-		in <- liveAudio(make([]float32, 10))
-		in <- liveAudio(make([]float32, 10))
-		close(in)
-		Expect(<-errCh).NotTo(HaveOccurred())
-
-		got := collectLive(out)
-		Expect(got).To(HaveLen(4))
-		Expect(got[1].Delta).To(Equal("first "))
-		Expect(got[1].Eou).To(BeFalse())
-		Expect(got[2].Delta).To(Equal("done"))
-		Expect(got[2].Eou).To(BeTrue())
-		Expect(got[3].FinalResult.Text).To(Equal("first done"))
-	})
-
-	It("forwards <EOB> as eob — a backchannel, never an eou (ABI v5 JSON)", func() {
-		feeds := 0
-		CppStreamFeedJSON = func(s uintptr, pcm []float32, n int32) uintptr {
-			feeds++
-			if feeds == 1 {
-				return pool.cstr(`{"text":"uh-huh","eou":0,"eob":1,"frame_sec":0.08,` +
-					`"words":[{"w":"uh-huh","start":0.1,"end":0.3,"conf":0.9}]}`)
-			}
-			return pool.cstr(`{"text":"the turn","eou":1,"eob":0,"frame_sec":0.08,` +
-				`"words":[{"w":"the","start":0.5,"end":0.6,"conf":0.9},{"w":"turn","start":0.6,"end":0.8,"conf":0.9}]}`)
-		}
-		CppStreamFinalizeJSON = func(s uintptr) uintptr {
-			return pool.cstr(`{"text":"","eou":0,"eob":0,"frame_sec":0.08,"words":[]}`)
-		}
-
-		in, out, errCh := runLive(p)
-		in <- liveConfig("")
-		in <- liveAudio(make([]float32, 10))
-		in <- liveAudio(make([]float32, 10))
-		close(in)
-		Expect(<-errCh).NotTo(HaveOccurred())
-
-		got := collectLive(out)
-		Expect(got).To(HaveLen(4))
-		Expect(got[1].Eob).To(BeTrue())
-		Expect(got[1].Eou).To(BeFalse(), "a backchannel must not masquerade as a turn boundary")
-		Expect(got[2].Eou).To(BeTrue())
-	})
-
-	It("maps the v5 eou_out bitmask on the text path (bit0 <EOU>, bit1 <EOB>)", func() {
-		feeds := 0
-		CppStreamFeed = func(s uintptr, pcm []float32, n int32, eouOut unsafe.Pointer) uintptr {
-			feeds++
-			if feeds == 1 {
-				*(*int32)(eouOut) = 2 // <EOB> only
-				return pool.cstr("uh-huh")
-			}
-			*(*int32)(eouOut) = 1 // <EOU>
-			return pool.cstr(" done")
-		}
-		CppStreamFinalize = func(s uintptr) uintptr { return pool.cstr("") }
-
-		in, out, errCh := runLive(p)
-		in <- liveConfig("")
-		in <- liveAudio(make([]float32, 10))
-		in <- liveAudio(make([]float32, 10))
-		close(in)
-		Expect(<-errCh).NotTo(HaveOccurred())
-
-		got := collectLive(out)
-		Expect(got).To(HaveLen(4))
-		Expect(got[1].Eob).To(BeTrue())
-		Expect(got[1].Eou).To(BeFalse())
-		Expect(got[2].Eou).To(BeTrue())
-		Expect(got[2].Eob).To(BeFalse())
-	})
-
-	It("accumulates trailing text after an EOU into the final transcript", func() {
-		feeds := 0
-		CppStreamFeedJSON = func(s uintptr, pcm []float32, n int32) uintptr {
-			feeds++
-			if feeds == 1 {
-				return pool.cstr(`{"text":"turn one","eou":1,"frame_sec":0.08,"words":[]}`)
-			}
-			return pool.cstr(`{"text":" and more","eou":0,"frame_sec":0.08,"words":[]}`)
-		}
-		CppStreamFinalizeJSON = func(s uintptr) uintptr {
-			return pool.cstr(`{"text":"","eou":0,"frame_sec":0.08,"words":[]}`)
-		}
-
-		in, out, errCh := runLive(p)
-		in <- liveConfig("")
-		in <- liveAudio(make([]float32, 10))
-		in <- liveAudio(make([]float32, 10))
-		close(in)
-		Expect(<-errCh).NotTo(HaveOccurred())
-
-		got := collectLive(out)
-		final := got[len(got)-1].FinalResult
-		Expect(final.Text).To(Equal("turn one and more"))
-	})
-
-	It("resets the decode session on a mid-stream config", func() {
-		var begun, freed int
-		CppStreamBegin = func(ctx uintptr) uintptr { begun++; return uintptr(10 + begun) }
-		CppStreamFree = func(s uintptr) { freed++ }
-		CppStreamFeedJSON = func(s uintptr, pcm []float32, n int32) uintptr {
-			return pool.cstr(`{"text":"x","eou":0,"frame_sec":0.08,"words":[]}`)
-		}
-		CppStreamFinalizeJSON = func(s uintptr) uintptr {
-			return pool.cstr(`{"text":"","eou":0,"frame_sec":0.08,"words":[]}`)
-		}
-
-		in, out, errCh := runLive(p)
-		in <- liveConfig("")
-		in <- liveAudio(make([]float32, 10))
-		in <- liveConfig("") // reset
-		in <- liveAudio(make([]float32, 10))
-		close(in)
-		Expect(<-errCh).NotTo(HaveOccurred())
-
-		got := collectLive(out)
-		final := got[len(got)-1].FinalResult
-		Expect(final.Text).To(Equal("x"), "pre-reset transcript dropped")
-		Expect(begun).To(Equal(2))
-		Expect(freed).To(Equal(2), "old session freed on reset, new one on unwind")
-	})
-
-	It("does not hold engineMu between feeds (unary work interleaves with a live session)", func() {
-		CppStreamFeedJSON = func(s uintptr, pcm []float32, n int32) uintptr {
-			return pool.cstr(`{"text":"","eou":0,"frame_sec":0.08,"words":[]}`)
-		}
-		CppStreamFinalizeJSON = func(s uintptr) uintptr {
-			return pool.cstr(`{"text":"","eou":0,"frame_sec":0.08,"words":[]}`)
-		}
-
-		in, out, errCh := runLive(p)
-		in <- liveConfig("")
-		in <- liveAudio(make([]float32, 10))
-
-		// The session is open and idle between feeds: the engine lock must be
-		// acquirable, which is what lets batched unary transcription proceed
-		// mid-session. Under stream-lifetime locking this probe would block
-		// until the stream ended and the Eventually would time out.
-		locked := make(chan struct{})
-		go func() {
-			p.engineMu.Lock()
-			p.engineMu.Unlock() //nolint:staticcheck // probe: acquire-release proves availability
-			close(locked)
-		}()
-		Eventually(locked, time.Second).Should(BeClosed())
-
-		close(in)
-		Expect(<-errCh).NotTo(HaveOccurred())
-		collectLive(out)
-	})
-
-	It("errors out and reads last_error under the lock when a feed fails", func() {
-		CppStreamFeedJSON = func(s uintptr, pcm []float32, n int32) uintptr { return 0 }
-
-		in, out, errCh := runLive(p)
-		in <- liveConfig("")
-		in <- liveAudio(make([]float32, 10))
-
-		err := <-errCh
-		Expect(err).To(MatchError(ContainSubstring("stub error")))
-		got := collectLive(out)
-		Expect(got).To(HaveLen(1)) // just the ready ack
-		close(in)
-	})
-})
-
-var _ = Describe("stripEouMarker", func() {
-	It("strips a trailing <EOU> and reports it", func() {
-		text, eou := stripEouMarker("it is certainly very like the old portrait<EOU>")
-		Expect(text).To(Equal("it is certainly very like the old portrait"))
-		Expect(eou).To(BeTrue())
-	})
-
-	It("strips a trailing <EOB> WITHOUT reporting an utterance end", func() {
-		// A decode ending on a backchannel must not confirm the
-		// retranscribe gate — the user was acknowledging, not yielding.
-		text, eou := stripEouMarker("uh-huh<EOB>")
-		Expect(text).To(Equal("uh-huh"))
-		Expect(eou).To(BeFalse())
-	})
-
-	It("leaves marker-free text alone", func() {
-		text, eou := stripEouMarker("plain transcript")
-		Expect(text).To(Equal("plain transcript"))
-		Expect(eou).To(BeFalse())
-	})
-
-	It("does not strip a marker in the middle of the text", func() {
-		text, eou := stripEouMarker("a<EOU>b")
-		Expect(text).To(Equal("a<EOU>b"))
-		Expect(eou).To(BeFalse())
-	})
-})
-
-var _ = Describe("transcriptResultFromDoc EOU handling", func() {
-	It("strips the offline marker from text and sets the result flag", func() {
-		doc := transcriptJSON{Text: "the old portrait<EOU>"}
-		res := transcriptResultFromDoc(doc, &pb.TranscriptRequest{}, 0)
-		Expect(res.Text).To(Equal("the old portrait"))
-		Expect(res.Eou).To(BeTrue())
-		Expect(res.Segments).To(HaveLen(1))
-		Expect(res.Segments[0].Text).To(Equal("the old portrait"))
-	})
-
-	It("reports eou=false for marker-free decodes", func() {
-		doc := transcriptJSON{Text: "no marker here"}
-		res := transcriptResultFromDoc(doc, &pb.TranscriptRequest{}, 0)
-		Expect(res.Text).To(Equal("no marker here"))
-		Expect(res.Eou).To(BeFalse())
-	})
-})
--- a/backend/go/parakeet-cpp/segments_test.go
+++ b/backend/go/parakeet-cpp/segments_test.go
@@ -106,7 +106,7 @@ var _ = Describe("transcriptResultFromDoc (multi-segment)", func() {
 var _ = Describe("streaming segment assembly", func() {
 	It("closes a segment with start/end from its words on EOU", func() {
 		acc := &streamSegmenter{}
-		acc.add(streamFeedResult{Delta: "hello world", Eou: true, Words: []transcriptWord{
+		acc.add(streamFeedJSON{Text: "hello world", Eou: 1, Words: []transcriptWord{
 			{W: "hello", Start: 0.0, End: 0.4}, {W: "world", Start: 0.4, End: 0.9},
 		}})
 		segs := acc.segments()
@@ -118,9 +118,9 @@ var _ = Describe("streaming segment assembly", func() {

 	It("buffers words across feeds until EOU", func() {
 		acc := &streamSegmenter{}
-		acc.add(streamFeedResult{Delta: "hi", Words: []transcriptWord{{W: "hi", Start: 0, End: 0.3}}})
+		acc.add(streamFeedJSON{Text: "hi", Eou: 0, Words: []transcriptWord{{W: "hi", Start: 0, End: 0.3}}})
 		Expect(acc.segments()).To(BeEmpty())
-		acc.add(streamFeedResult{Delta: "there", Eou: true, Words: []transcriptWord{{W: "there", Start: 0.3, End: 0.7}}})
+		acc.add(streamFeedJSON{Text: "there", Eou: 1, Words: []transcriptWord{{W: "there", Start: 0.3, End: 0.7}}})
 		Expect(acc.segments()).To(HaveLen(1))
 		Expect(acc.segments()[0].Text).To(Equal("hi there"))
 	})
@@ -129,7 +129,7 @@ var _ = Describe("streaming segment assembly", func() {
 	// field; a backchannel must still close the segment as it did in v4.
 	It("closes a segment on EOB (backchannel) too", func() {
 		acc := &streamSegmenter{}
-		acc.add(streamFeedResult{Delta: "uh huh", Eob: true, Words: []transcriptWord{
+		acc.add(streamFeedJSON{Text: "uh huh", Eou: 0, Eob: 1, Words: []transcriptWord{
 			{W: "uh", Start: 0.0, End: 0.2}, {W: "huh", Start: 0.2, End: 0.5},
 		}})
 		segs := acc.segments()
@@ -137,18 +137,4 @@ var _ = Describe("streaming segment assembly", func() {
 		Expect(segs[0].Text).To(Equal("uh huh"))
 		Expect(segs[0].End).To(Equal(secondsToNanos(0.5)))
 	})
-
-	// Older text-only libparakeet.so: no per-word timings, so a segment is cut
-	// from the delta text on each <EOU>/<EOB> (no timestamps), one per utterance.
-	It("falls back to text segments when the feed carries no words", func() {
-		acc := &streamSegmenter{}
-		acc.add(streamFeedResult{Delta: "first turn", Eou: true})
-		acc.add(streamFeedResult{Delta: "second turn", Eou: true})
-		segs := acc.segments()
-		Expect(segs).To(HaveLen(2))
-		Expect(segs[0].Text).To(Equal("first turn"))
-		Expect(segs[1].Text).To(Equal("second turn"))
-		Expect(segs[0].Start).To(Equal(int64(0)), "no per-word timing on the text path")
-		Expect(segs[0].End).To(Equal(int64(0)))
-	})
 })
--- a/backend/go/stablediffusion-ggml/Makefile
+++ b/backend/go/stablediffusion-ggml/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # stablediffusion.cpp (ggml)
 STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
-STABLEDIFFUSION_GGML_VERSION?=3b6c9ca97cfcda8e68e719e6670d06379fcbe943
+STABLEDIFFUSION_GGML_VERSION?=9956436c925a367daeab097598b1ea1f32d3503f

 CMAKE_ARGS+=-DGGML_MAX_NAME=128

--- a/backend/go/voice-detect/Makefile
+++ b/backend/go/voice-detect/Makefile
@@ -1,6 +1,6 @@
 # voice-detect backend Makefile.
 #
-# Upstream pin lives below as VOICEDETECT_VERSION?=1db1759572c90faef6f3a78c36b5941a096a9f89
+# Upstream pin lives below as VOICEDETECT_VERSION (.github/bump_deps.sh
 # can find and update it - matches the parakeet.cpp / whisper.cpp / ds4 convention).
 #
 # Local dev shortcut: if you already have an out-of-tree voice-detect.cpp build,
@@ -13,7 +13,7 @@
 # The default target below does the proper clone-at-pin + cmake build so CI does
 # not need a side-checkout.

-VOICEDETECT_VERSION?=1db1759572c90faef6f3a78c36b5941a096a9f89
+VOICEDETECT_VERSION?=3d510772357538c5182808ac7de2278b84824e24
 VOICEDETECT_REPO?=https://github.com/mudler/voice-detect.cpp

 GOCMD?=go
--- a/backend/python/fish-speech/install.sh
+++ b/backend/python/fish-speech/install.sh
@@ -13,17 +13,6 @@ fi
 # fish-speech uses pyrootutils which requires a .project-root marker
 touch "${backend_dir}/.project-root"

-# On darwin arm64 the transitive `tokenizers` dep compiles its Rust extension
-# from source (Linux uses prebuilt manylinux wheels, so it never compiles
-# there). The pinned tokenizers crate that fish-speech's stack resolves to
-# contains a `&T` -> `&mut T` cast that trips the now-deny-by-default
-# `invalid_reference_casting` lint in the macOS runner's newer Rust toolchain,
-# breaking the build (seen in the v4.5.5 release CI fish-speech darwin/metal
-# job). Allow that lint so the unchanged third-party crate compiles as before.
-# Append rather than clobber any pre-existing RUSTFLAGS; harmless on Linux
-# where no Rust compile happens.
-export RUSTFLAGS="${RUSTFLAGS:-} -A invalid_reference_casting"
-
 installRequirements

 # Clone fish-speech source (the pip package doesn't include inference modules)
--- a/backend/python/sglang/backend.py
+++ b/backend/python/sglang/backend.py
@@ -147,25 +147,9 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                d["reasoning_content"] = msg.reasoning_content
            if msg.tool_calls:
                try:
-                    tool_calls = json.loads(msg.tool_calls)
+                    d["tool_calls"] = json.loads(msg.tool_calls)
                except json.JSONDecodeError:
                    pass
-                else:
-                    # OpenAI wire format carries function.arguments as a
-                    # JSON-encoded string, but chat templates (e.g. Qwen3)
-                    # iterate over it as a mapping. The vllm backend
-                    # already parses arguments before applying the chat
-                    # template (PR #10256); mirror that here so the
-                    # sglang backend works with the same wire format.
-                    if isinstance(tool_calls, list):
-                        for tc in tool_calls:
-                            func = tc.get("function") if isinstance(tc, dict) else None
-                            if isinstance(func, dict) and isinstance(func.get("arguments"), str):
-                                try:
-                                    func["arguments"] = json.loads(func["arguments"])
-                                except json.JSONDecodeError:
-                                    pass
-                    d["tool_calls"] = tool_calls
            result.append(d)
        return result

--- a/backend/python/sglang/requirements-cpu.txt
+++ b/backend/python/sglang/requirements-cpu.txt
@@ -1,6 +1,6 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
 accelerate
-torch==2.9.0
+torch==2.12.0+cpu
 torchvision
 torchaudio
 transformers
--- a/backend/python/sglang/requirements-cublas12.txt
+++ b/backend/python/sglang/requirements-cublas12.txt
@@ -6,7 +6,7 @@
 # for cublas12 so uv consults this index alongside PyPI.
 --extra-index-url https://download.pytorch.org/whl/cu128
 accelerate
-torch==2.9.1
+torch==2.12.0
 torchvision
 torchaudio
 transformers
--- a/backend/python/trl/requirements-cpu.txt
+++ b/backend/python/trl/requirements-cpu.txt
@@ -1,9 +1,9 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
-torch==2.10.0
+torch==2.12.0+cpu
 trl
 peft
 datasets>=3.0.0
-transformers>=4.56.2
+transformers>=5.12.1
 accelerate>=1.4.0
 huggingface-hub>=1.3.0
 sentencepiece
--- a/backend/python/trl/requirements-cublas12.txt
+++ b/backend/python/trl/requirements-cublas12.txt
@@ -1,8 +1,8 @@
-torch==2.10.0
+torch==2.12.0
 trl
 peft
 datasets>=3.0.0
-transformers>=4.56.2
+transformers>=5.12.1
 accelerate>=1.4.0
 huggingface-hub>=1.3.0
 sentencepiece
--- a/backend/python/trl/requirements-cublas13.txt
+++ b/backend/python/trl/requirements-cublas13.txt
@@ -1,8 +1,8 @@
-torch==2.10.0
+torch==2.12.0
 trl
 peft
 datasets>=3.0.0
-transformers>=4.56.2
+transformers>=5.12.1
 accelerate>=1.4.0
 huggingface-hub>=1.3.0
 sentencepiece
--- a/backend/python/trl/requirements-mps.txt
+++ b/backend/python/trl/requirements-mps.txt
@@ -1,8 +1,8 @@
-torch==2.10.0
+torch==2.12.0
 trl
 peft
 datasets>=3.0.0
-transformers>=4.56.2
+transformers>=5.12.1
 accelerate>=1.4.0
 huggingface-hub>=1.3.0
 sentencepiece
--- a/backend/python/vllm-omni/requirements-cublas12.txt
+++ b/backend/python/vllm-omni/requirements-cublas12.txt
@@ -1,4 +1,4 @@
 accelerate
-torch==2.7.0
+torch==2.12.0
 transformers
 bitsandbytes
--- a/backend/python/vllm/install.sh
+++ b/backend/python/vllm/install.sh
@@ -104,7 +104,7 @@ if [ "$(uname -s)" = "Darwin" ]; then
    # can rewrite it. Darwin therefore follows vllm-metal and can lag the Linux
    # vllm pin (requirements-cublas13-after.txt, bumped independently against
    # vllm/vllm) until vllm-metal supports a newer vLLM.
-    VLLM_METAL_VERSION="v0.3.0.dev20260628073537"
+    VLLM_METAL_VERSION="v0.3.0.dev20260622062346"

    # The coupled vLLM source version is whatever this vllm-metal release builds
    # against -- it declares it in its own installer as `vllm_v=`. Derive it from
--- a/cmd/launcher/internal/launcher.go
+++ b/cmd/launcher/internal/launcher.go
@@ -429,7 +429,7 @@ func (l *Launcher) CheckForUpdates() (bool, string, error) {
 }

 // DownloadUpdate downloads the latest version
-func (l *Launcher) DownloadUpdate(version string, progressCallback func(downloaded, total int64)) error {
+func (l *Launcher) DownloadUpdate(version string, progressCallback func(float64)) error {
 	return l.releaseManager.DownloadRelease(version, progressCallback)
 }

@@ -486,6 +486,7 @@ func (l *Launcher) showDownloadLocalAIDialog() {
 	fyne.DoAndWait(func() {
 		// Create a standalone window for the download dialog
 		dialogWindow := l.app.NewWindow("LocalAI Installation Required")
+		dialogWindow.Resize(fyne.NewSize(500, 350))
 		dialogWindow.CenterOnScreen()
 		dialogWindow.SetCloseIntercept(func() {
 			dialogWindow.Close()
@@ -547,7 +548,6 @@ func (l *Launcher) showDownloadLocalAIDialog() {
 		)

 		dialogWindow.SetContent(content)
-		resizeToContent(dialogWindow, content)
 		dialogWindow.Show()
 	})
 }
@@ -621,134 +621,88 @@ func (l *Launcher) showDownloadError(title, message string) {
 }

 // showDownloadProgress shows a standalone progress window for downloading LocalAI
-// after a fresh install (no LocalAI binary present yet).
 func (l *Launcher) showDownloadProgress(version, title string) {
-	l.showDownloadProgressWindow(version, title, func(win fyne.Window) {
-		dialog.ShowConfirm("Installation Complete",
-			"LocalAI has been downloaded and installed successfully. You can now start LocalAI from the launcher.",
-			func(bool) {
-				win.Close()
-				l.updateStatus("LocalAI installed successfully")
-				if l.systray != nil {
-					l.systray.recreateMenu()
-				}
-			}, win)
-	})
-}
-
-// showDownloadProgressWindow renders the download progress popup shared by every
-// "download/upgrade LocalAI" entry point. It owns the progress bar, the
-// human-readable byte readout, resume-aware retry, and content-fit window
-// sizing so the behaviour stays identical everywhere. onSuccess runs (on the UI
-// goroutine) once the download verifies, and is responsible for the success
-// dialog and any follow-up; the window is passed in so it can be parented/closed.
-func (l *Launcher) showDownloadProgressWindow(version, title string, onSuccess func(win fyne.Window)) {
 	fyne.DoAndWait(func() {
+		// Create progress window
 		progressWindow := l.app.NewWindow("Downloading LocalAI")
+		progressWindow.Resize(fyne.NewSize(400, 250))
 		progressWindow.CenterOnScreen()
 		progressWindow.SetCloseIntercept(func() {
 			progressWindow.Close()
 		})

+		// Progress bar
 		progressBar := widget.NewProgressBar()
 		progressBar.SetValue(0)

 		// Status label. Truncate with an ellipsis so a long "Download failed:
 		// <url>" message can't stretch the window (and progress bar) to fit the
-		// whole error on one line.
+		// whole error on one line; the full error is shown in the dialog below.
 		statusLabel := widget.NewLabel("Preparing download...")
 		statusLabel.Truncation = fyne.TextTruncateEllipsis

+		// Release notes button
 		releaseNotesButton := widget.NewButton("View Release Notes", func() {
 			releaseNotesURL, err := l.githubReleaseNotesURL(version)
 			if err != nil {
 				log.Printf("Failed to parse URL: %v", err)
 				return
 			}
+
 			l.app.OpenURL(releaseNotesURL)
 		})

-		// Retry button: hidden until a download fails. GitHub downloads are
-		// flaky, and the underlying download resumes from the partial file, so
-		// a retry continues where it left off rather than starting over.
-		retryButton := widget.NewButton("Retry", nil)
-		retryButton.Importance = widget.HighImportance
-		retryButton.Hide()
-
-		buttonRow := container.NewHBox(releaseNotesButton, retryButton)
-		content := container.NewVBox(
+		// Progress container
+		progressContainer := container.NewVBox(
 			widget.NewLabel(title),
 			progressBar,
 			statusLabel,
 			widget.NewSeparator(),
-			buttonRow,
+			releaseNotesButton,
 		)
-		progressWindow.SetContent(content)
-		resizeToContent(progressWindow, content)
-
-		var startDownload func()
-		startDownload = func() {
-			retryButton.Hide()
-			progressBar.SetValue(0)
-			statusLabel.SetText("Preparing download...")
-			resizeToContent(progressWindow, content)
-
-			go func() {
-				err := l.DownloadUpdate(version, func(downloaded, total int64) {
-					fyne.Do(func() {
-						if total > 0 {
-							progressBar.SetValue(float64(downloaded) / float64(total))
-							statusLabel.SetText(fmt.Sprintf("Downloading… %s / %s", formatBytes(downloaded), formatBytes(total)))
-						} else {
-							statusLabel.SetText(fmt.Sprintf("Downloading… %s", formatBytes(downloaded)))
-						}
-					})
-				})
-
-				fyne.Do(func() {
-					if err != nil {
-						statusLabel.SetText(fmt.Sprintf("Download failed: %v", err))
-						retryButton.Show()
-						resizeToContent(progressWindow, content)
-						return
-					}
-					progressBar.SetValue(1.0)
-					statusLabel.SetText("Download complete")
-					onSuccess(progressWindow)
-				})
-			}()
-		}
-		retryButton.OnTapped = startDownload

+		progressWindow.SetContent(progressContainer)
 		progressWindow.Show()
-		startDownload()
+
+		// Start download in background
+		go func() {
+			err := l.DownloadUpdate(version, func(progress float64) {
+				// Update progress bar
+				fyne.Do(func() {
+					progressBar.SetValue(progress)
+					percentage := int(progress * 100)
+					statusLabel.SetText(fmt.Sprintf("Downloading... %d%%", percentage))
+				})
+			})
+
+			// Handle completion
+			fyne.Do(func() {
+				if err != nil {
+					statusLabel.SetText(fmt.Sprintf("Download failed: %v", err))
+					// Show error dialog
+					dialog.ShowError(err, progressWindow)
+				} else {
+					statusLabel.SetText("Download completed successfully!")
+					progressBar.SetValue(1.0)
+
+					// Show success dialog
+					dialog.ShowConfirm("Installation Complete",
+						"LocalAI has been downloaded and installed successfully. You can now start LocalAI from the launcher.",
+						func(close bool) {
+							progressWindow.Close()
+							// Update status and refresh systray menu
+							l.updateStatus("LocalAI installed successfully")
+
+							if l.systray != nil {
+								l.systray.recreateMenu()
+							}
+						}, progressWindow)
+				}
+			})
+		}()
 	})
 }

-// resizeToContent sizes a window to fit its content (with a sane minimum width)
-// so the dialog doesn't show a large blank gap below the last widget.
-func resizeToContent(w fyne.Window, content fyne.CanvasObject) {
-	size := content.MinSize()
-	if size.Width < 400 {
-		size.Width = 400
-	}
-	w.Resize(size)
-}
-
-// formatBytes renders a byte count as a human-readable size (e.g. "12.3 MB").
-func formatBytes(b int64) string {
-	const unit = 1024
-	if b < unit {
-		return fmt.Sprintf("%d B", b)
-	}
-	div, exp := int64(unit), 0
-	for n := b / unit; n >= unit; n /= unit {
-		div *= unit
-		exp++
-	}
-	return fmt.Sprintf("%.1f %cB", float64(b)/float64(div), "KMGTPE"[exp])
-}
-
 // monitorLogs monitors the output of LocalAI and adds it to the log buffer
 func (l *Launcher) monitorLogs(reader io.Reader, prefix string) {
 	scanner := bufio.NewScanner(reader)
--- a/cmd/launcher/internal/release_manager.go
+++ b/cmd/launcher/internal/release_manager.go
@@ -11,7 +11,6 @@ import (
 	"net/http"
 	"os"
 	"os/exec"
-	"path"
 	"path/filepath"
 	"runtime"
 	"strings"
@@ -51,12 +50,6 @@ type ReleaseManager struct {
 	ChecksumsPath string
 	// MetadataPath is where version metadata is stored
 	MetadataPath string
-	// BaseDownloadURL is the base URL release assets are downloaded from
-	// (defaults to https://github.com; overridable for testing)
-	BaseDownloadURL string
-	// RetryBackoff is the base wait between download attempts; the Nth retry
-	// waits N*RetryBackoff (defaults to 1s; lowered in tests)
-	RetryBackoff time.Duration
 	// HTTPClient is the HTTP client used for downloads
 	HTTPClient *http.Client
 }
@@ -69,94 +62,28 @@ func NewReleaseManager() *ReleaseManager {
 	metadataPath := filepath.Join(homeDir, ".localai", "metadata")

 	return &ReleaseManager{
-		GitHubOwner:     "mudler",
-		GitHubRepo:      "LocalAI",
-		BinaryPath:      binaryPath,
-		CurrentVersion:  internal.PrintableVersion(),
-		ChecksumsPath:   checksumsPath,
-		MetadataPath:    metadataPath,
-		BaseDownloadURL: "https://github.com",
-		RetryBackoff:    1 * time.Second,
-		HTTPClient:      httpclient.NewWithTimeout(30*time.Second, httpclient.WithFollowRedirects()),
+		GitHubOwner:    "mudler",
+		GitHubRepo:     "LocalAI",
+		BinaryPath:     binaryPath,
+		CurrentVersion: internal.PrintableVersion(),
+		ChecksumsPath:  checksumsPath,
+		MetadataPath:   metadataPath,
+		HTTPClient:     httpclient.NewWithTimeout(30*time.Second, httpclient.WithFollowRedirects()),
 	}
 }

-// GetLatestRelease resolves the latest LocalAI release.
-//
-// It first follows the github.com "releases/latest" redirect, which reveals the
-// latest tag in the final URL and—crucially—is NOT subject to the
-// 60-requests/hour unauthenticated rate limit of api.github.com. That limit is
-// per-IP, so on shared/NAT/CGNAT/cloud addresses the API returns 403 almost
-// immediately (e.g. on a fresh install with no LocalAI present yet). The
-// redirect avoids that entirely. The richer JSON API is kept only as a fallback.
-//
-// Only the version is consumed by callers, so the redirect's tag is sufficient.
+// GetLatestRelease fetches the latest release information from GitHub
 func (rm *ReleaseManager) GetLatestRelease() (*Release, error) {
-	version, redirectErr := rm.latestVersionFromRedirect()
-	if redirectErr == nil {
-		return &Release{Version: version}, nil
-	}
-	log.Printf("Could not resolve latest version via release redirect (%v); falling back to GitHub API", redirectErr)
-
-	release, apiErr := rm.latestReleaseFromAPI()
-	if apiErr != nil {
-		// Surface both failures so a rate-limited API doesn't mask the (usually
-		// more relevant) redirect error.
-		return nil, fmt.Errorf("failed to fetch latest release: %v (redirect: %v)", apiErr, redirectErr)
-	}
-	return release, nil
-}
-
-// latestVersionFromRedirect returns the latest tag by following the github.com
-// "releases/latest" redirect to ".../releases/tag/<tag>".
-func (rm *ReleaseManager) latestVersionFromRedirect() (string, error) {
-	url := fmt.Sprintf("%s/%s/%s/releases/latest", rm.BaseDownloadURL, rm.GitHubOwner, rm.GitHubRepo)
-
-	resp, err := rm.HTTPClient.Get(url)
-	if err != nil {
-		return "", err
-	}
-	defer resp.Body.Close()
-
-	if resp.StatusCode != http.StatusOK {
-		return "", fmt.Errorf("unexpected status %s", resp.Status)
-	}
-
-	// After the redirect is followed, the final request URL is the tag page.
-	version := path.Base(resp.Request.URL.Path)
-	if version == "" || version == "." || version == "latest" {
-		return "", fmt.Errorf("could not determine version from %s", resp.Request.URL.String())
-	}
-	return version, nil
-}
-
-// latestReleaseFromAPI fetches the latest release JSON from api.github.com. This
-// is the fallback path; it is rate-limited unless GITHUB_TOKEN is set.
-func (rm *ReleaseManager) latestReleaseFromAPI() (*Release, error) {
 	url := fmt.Sprintf("https://api.github.com/repos/%s/%s/releases/latest", rm.GitHubOwner, rm.GitHubRepo)

-	req, err := http.NewRequest(http.MethodGet, url, nil)
-	if err != nil {
-		return nil, err
-	}
-	req.Header.Set("Accept", "application/vnd.github+json")
-	// An optional token lifts the unauthenticated 60/hour limit to 5000/hour.
-	if token := os.Getenv("GITHUB_TOKEN"); token != "" {
-		req.Header.Set("Authorization", "Bearer "+token)
-	}
-
-	resp, err := rm.HTTPClient.Do(req)
+	resp, err := rm.HTTPClient.Get(url)
 	if err != nil {
 		return nil, fmt.Errorf("failed to fetch latest release: %w", err)
 	}
 	defer resp.Body.Close()

 	if resp.StatusCode != http.StatusOK {
-		if (resp.StatusCode == http.StatusForbidden || resp.StatusCode == http.StatusTooManyRequests) &&
-			resp.Header.Get("X-RateLimit-Remaining") == "0" {
-			return nil, fmt.Errorf("GitHub API rate limit exceeded (status %d); retry later or set GITHUB_TOKEN to raise the limit", resp.StatusCode)
-		}
-		return nil, fmt.Errorf("status %d", resp.StatusCode)
+		return nil, fmt.Errorf("failed to fetch latest release: status %d", resp.StatusCode)
 	}

 	// Parse the JSON response properly
@@ -179,7 +106,7 @@ func (rm *ReleaseManager) latestReleaseFromAPI() (*Release, error) {
 }

 // DownloadRelease downloads a specific version of LocalAI
-func (rm *ReleaseManager) DownloadRelease(version string, progressCallback func(downloaded, total int64)) error {
+func (rm *ReleaseManager) DownloadRelease(version string, progressCallback func(float64)) error {
 	// Ensure the binary directory exists
 	if err := os.MkdirAll(rm.BinaryPath, 0755); err != nil {
 		return fmt.Errorf("failed to create binary directory: %w", err)
@@ -190,16 +117,16 @@ func (rm *ReleaseManager) DownloadRelease(version string, progressCallback func(
 	localPath := filepath.Join(rm.BinaryPath, "local-ai")

 	// Download the binary
-	downloadURL := fmt.Sprintf("%s/%s/%s/releases/download/%s/%s",
-		rm.BaseDownloadURL, rm.GitHubOwner, rm.GitHubRepo, version, binaryName)
+	downloadURL := fmt.Sprintf("https://github.com/%s/%s/releases/download/%s/%s",
+		rm.GitHubOwner, rm.GitHubRepo, version, binaryName)

 	if err := rm.downloadFile(downloadURL, localPath, progressCallback); err != nil {
 		return fmt.Errorf("failed to download binary: %w", err)
 	}

 	// Download and verify checksums
-	checksumURL := fmt.Sprintf("%s/%s/%s/releases/download/%s/LocalAI-%s-checksums.txt",
-		rm.BaseDownloadURL, rm.GitHubOwner, rm.GitHubRepo, version, version)
+	checksumURL := fmt.Sprintf("https://github.com/%s/%s/releases/download/%s/LocalAI-%s-checksums.txt",
+		rm.GitHubOwner, rm.GitHubRepo, version, version)

 	checksumPath := filepath.Join(rm.BinaryPath, "checksums.txt")
 	manualChecksumPath := filepath.Join(rm.ChecksumsPath, fmt.Sprintf("checksums-%s.txt", version))
@@ -227,10 +154,6 @@ func (rm *ReleaseManager) DownloadRelease(version string, progressCallback func(
 	// Verify the checksum if we have a checksum file
 	if _, err := os.Stat(checksumPath); err == nil {
 		if err := rm.VerifyChecksum(localPath, checksumPath, binaryName); err != nil {
-			// Discard the corrupt binary (and any leftover partial) so the next
-			// retry starts from a clean slate rather than resuming corruption.
-			os.Remove(localPath)
-			os.Remove(localPath + ".part")
 			return fmt.Errorf("checksum verification failed: %w", err)
 		}
 		log.Printf("Checksum verification successful")
@@ -273,88 +196,44 @@ func (rm *ReleaseManager) GetBinaryName(version string) string {
 }

 // downloadFile downloads a file from a URL to a local path with optional progress callback
-func (rm *ReleaseManager) downloadFile(url, filepath string, progressCallback func(downloaded, total int64)) error {
+func (rm *ReleaseManager) downloadFile(url, filepath string, progressCallback func(float64)) error {
 	return rm.downloadFileWithRetry(url, filepath, progressCallback, 3)
 }

-// downloadFileWithRetry downloads a file with retry and HTTP Range resume.
-//
-// The body is streamed to "<dest>.part" and only renamed to dest on success, so
-// a dropped connection leaves a partial file that the next attempt continues via
-// a "Range: bytes=N-" request instead of restarting from zero. This matters for
-// GitHub release downloads, which are large and flaky.
-func (rm *ReleaseManager) downloadFileWithRetry(url, dest string, progressCallback func(downloaded, total int64), maxRetries int) error {
-	partPath := dest + ".part"
+// downloadFileWithRetry downloads a file from a URL with retry logic
+func (rm *ReleaseManager) downloadFileWithRetry(url, filepath string, progressCallback func(float64), maxRetries int) error {
 	var lastErr error

 	for attempt := 1; attempt <= maxRetries; attempt++ {
 		if attempt > 1 {
 			log.Printf("Retrying download (attempt %d/%d): %s", attempt, maxRetries, url)
-			time.Sleep(time.Duration(attempt) * rm.RetryBackoff)
+			time.Sleep(time.Duration(attempt) * time.Second)
 		}

-		// Resume from however much we already have on disk.
-		var offset int64
-		if fi, err := os.Stat(partPath); err == nil {
-			offset = fi.Size()
-		}
-
-		req, err := http.NewRequest(http.MethodGet, url, nil)
-		if err != nil {
-			return err
-		}
-		if offset > 0 {
-			req.Header.Set("Range", fmt.Sprintf("bytes=%d-", offset))
-		}
-
-		resp, err := rm.HTTPClient.Do(req)
+		resp, err := rm.HTTPClient.Get(url)
 		if err != nil {
 			lastErr = err
 			continue
 		}

-		switch resp.StatusCode {
-		case http.StatusOK:
-			// Server ignored the Range (or we had nothing): start fresh.
-			offset = 0
-		case http.StatusPartialContent:
-			// Resume: append to the existing partial file.
-		case http.StatusRequestedRangeNotSatisfiable:
-			// Stale or already-complete partial: discard and restart fresh.
-			resp.Body.Close()
-			os.Remove(partPath)
-			lastErr = fmt.Errorf("partial download no longer valid (status %s), restarting", resp.Status)
-			continue
-		default:
+		if resp.StatusCode != http.StatusOK {
 			resp.Body.Close()
 			lastErr = fmt.Errorf("bad status: %s", resp.Status)
 			continue
 		}

-		var out *os.File
-		if offset > 0 {
-			out, err = os.OpenFile(partPath, os.O_WRONLY|os.O_APPEND, 0644)
-		} else {
-			out, err = os.Create(partPath)
-		}
+		out, err := os.Create(filepath)
 		if err != nil {
 			resp.Body.Close()
 			return err
 		}

-		// On a 206 the Content-Length is the remaining bytes, so the full size
-		// is what we already have plus what's still to come.
-		total := resp.ContentLength
-		if offset > 0 && total > 0 {
-			total += offset
-		}
-
+		// Create a progress reader if callback is provided
 		var reader io.Reader = resp.Body
-		if progressCallback != nil && total > 0 {
+		if progressCallback != nil && resp.ContentLength > 0 {
 			reader = &progressReader{
 				Reader:   resp.Body,
-				Total:    total,
-				Current:  offset,
+				Total:    resp.ContentLength,
 				Callback: progressCallback,
 			}
 		}
@@ -364,14 +243,11 @@ func (rm *ReleaseManager) downloadFileWithRetry(url, dest string, progressCallba
 		out.Close()

 		if err != nil {
-			// Keep the partial file so the next attempt can resume from it.
 			lastErr = err
+			os.Remove(filepath)
 			continue
 		}

-		if err := os.Rename(partPath, dest); err != nil {
-			return err
-		}
 		return nil
 	}

@@ -446,21 +322,20 @@ func (rm *ReleaseManager) saveVersionMetadata(version string) error {
 	return nil
 }

-// progressReader wraps an io.Reader to provide download progress as a
-// (downloaded, total) byte count so callers can render both a progress bar and
-// a human-readable size.
+// progressReader wraps an io.Reader to provide download progress
 type progressReader struct {
 	io.Reader
 	Total    int64
 	Current  int64
-	Callback func(downloaded, total int64)
+	Callback func(float64)
 }

 func (pr *progressReader) Read(p []byte) (int, error) {
 	n, err := pr.Reader.Read(p)
 	pr.Current += int64(n)
 	if pr.Callback != nil {
-		pr.Callback(pr.Current, pr.Total)
+		progress := float64(pr.Current) / float64(pr.Total)
+		pr.Callback(progress)
 	}
 	return n, err
 }
--- a/cmd/launcher/internal/release_manager_test.go
+++ b/cmd/launcher/internal/release_manager_test.go
@@ -1,17 +1,9 @@
 package launcher_test

 import (
-	"crypto/sha256"
-	"encoding/hex"
-	"fmt"
-	"net/http"
-	"net/http/httptest"
 	"os"
 	"path/filepath"
 	"runtime"
-	"strconv"
-	"strings"
-	"sync"
 	"time"

 	. "github.com/onsi/ginkgo/v2"
@@ -186,221 +178,4 @@ var _ = Describe("ReleaseManager", func() {
 			Expect(err.Error()).To(ContainSubstring("checksum not found"))
 		})
 	})
-
-	Describe("DownloadRelease resume and retry", func() {
-		var (
-			version    string
-			binaryName string
-			content    []byte
-			checksums  string
-			finalPath  string
-			partPath   string
-		)
-
-		BeforeEach(func() {
-			version = "v9.9.9"
-			binaryName = rm.GetBinaryName(version)
-
-			// Deterministic, non-trivial content so resume/append bugs surface.
-			content = make([]byte, 4096)
-			for i := range content {
-				content[i] = byte(i % 251)
-			}
-			sum := sha256.Sum256(content)
-			checksums = fmt.Sprintf("%s  %s\n", hex.EncodeToString(sum[:]), binaryName)
-
-			finalPath = filepath.Join(tempDir, "local-ai")
-			partPath = finalPath + ".part"
-
-			// Isolate the persistent checksum/metadata dirs to the temp dir so
-			// the test never touches the real ~/.localai and existing checksum
-			// files don't short-circuit the download.
-			rm.ChecksumsPath = filepath.Join(tempDir, "checksums")
-			rm.MetadataPath = filepath.Join(tempDir, "metadata")
-			rm.GitHubOwner = "owner"
-			rm.GitHubRepo = "repo"
-			rm.RetryBackoff = time.Millisecond
-
-			Expect(os.MkdirAll(tempDir, 0755)).To(Succeed())
-		})
-
-		It("resumes from a partial .part file using a Range request", func() {
-			Expect(os.WriteFile(partPath, content[:1024], 0644)).To(Succeed())
-
-			var mu sync.Mutex
-			sawRange := false
-			binBytesServed := 0
-
-			srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-				if strings.HasSuffix(r.URL.Path, "checksums.txt") {
-					_, _ = w.Write([]byte(checksums))
-					return
-				}
-				if rangeHdr := r.Header.Get("Range"); rangeHdr != "" {
-					var start int
-					_, _ = fmt.Sscanf(rangeHdr, "bytes=%d-", &start)
-					mu.Lock()
-					sawRange = true
-					mu.Unlock()
-					w.Header().Set("Content-Range", fmt.Sprintf("bytes %d-%d/%d", start, len(content)-1, len(content)))
-					w.WriteHeader(http.StatusPartialContent)
-					n, _ := w.Write(content[start:])
-					mu.Lock()
-					binBytesServed += n
-					mu.Unlock()
-					return
-				}
-				w.WriteHeader(http.StatusOK)
-				n, _ := w.Write(content)
-				mu.Lock()
-				binBytesServed += n
-				mu.Unlock()
-			}))
-			defer srv.Close()
-			rm.BaseDownloadURL = srv.URL
-
-			err := rm.DownloadRelease(version, nil)
-			Expect(err).ToNot(HaveOccurred())
-
-			got, err := os.ReadFile(finalPath)
-			Expect(err).ToNot(HaveOccurred())
-			Expect(got).To(Equal(content))
-			Expect(sawRange).To(BeTrue(), "expected the download to resume with a Range request")
-			Expect(binBytesServed).To(Equal(len(content)-1024), "expected only the remaining bytes to be served")
-			Expect(partPath).ToNot(BeAnExistingFile())
-		})
-
-		It("starts fresh when the server ignores the Range header (200)", func() {
-			// A stale/garbage partial that must NOT be appended to.
-			Expect(os.WriteFile(partPath, []byte("garbage-garbage-garbage"), 0644)).To(Succeed())
-
-			srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-				if strings.HasSuffix(r.URL.Path, "checksums.txt") {
-					_, _ = w.Write([]byte(checksums))
-					return
-				}
-				// Ignore any Range and always serve the full body.
-				w.WriteHeader(http.StatusOK)
-				_, _ = w.Write(content)
-			}))
-			defer srv.Close()
-			rm.BaseDownloadURL = srv.URL
-
-			err := rm.DownloadRelease(version, nil)
-			Expect(err).ToNot(HaveOccurred())
-
-			got, err := os.ReadFile(finalPath)
-			Expect(err).ToNot(HaveOccurred())
-			Expect(got).To(Equal(content))
-		})
-
-		It("restarts the download when the partial is stale (416)", func() {
-			// Oversized partial -> requested Range start is beyond the content.
-			Expect(os.WriteFile(partPath, make([]byte, len(content)+10), 0644)).To(Succeed())
-
-			srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-				if strings.HasSuffix(r.URL.Path, "checksums.txt") {
-					_, _ = w.Write([]byte(checksums))
-					return
-				}
-				if rangeHdr := r.Header.Get("Range"); rangeHdr != "" {
-					var start int
-					_, _ = fmt.Sscanf(rangeHdr, "bytes=%d-", &start)
-					if start >= len(content) {
-						w.WriteHeader(http.StatusRequestedRangeNotSatisfiable)
-						return
-					}
-					w.Header().Set("Content-Range", fmt.Sprintf("bytes %d-%d/%d", start, len(content)-1, len(content)))
-					w.WriteHeader(http.StatusPartialContent)
-					_, _ = w.Write(content[start:])
-					return
-				}
-				w.WriteHeader(http.StatusOK)
-				_, _ = w.Write(content)
-			}))
-			defer srv.Close()
-			rm.BaseDownloadURL = srv.URL
-
-			err := rm.DownloadRelease(version, nil)
-			Expect(err).ToNot(HaveOccurred())
-
-			got, err := os.ReadFile(finalPath)
-			Expect(err).ToNot(HaveOccurred())
-			Expect(got).To(Equal(content))
-		})
-
-		It("removes the downloaded file when checksum verification fails", func() {
-			bad := []byte("this is definitely not the expected binary content")
-
-			srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-				if strings.HasSuffix(r.URL.Path, "checksums.txt") {
-					// Checksums are for `content`, but we serve `bad`.
-					_, _ = w.Write([]byte(checksums))
-					return
-				}
-				w.WriteHeader(http.StatusOK)
-				_, _ = w.Write(bad)
-			}))
-			defer srv.Close()
-			rm.BaseDownloadURL = srv.URL
-
-			err := rm.DownloadRelease(version, nil)
-			Expect(err).To(HaveOccurred())
-			Expect(err.Error()).To(ContainSubstring("checksum"))
-			Expect(finalPath).ToNot(BeAnExistingFile())
-			Expect(partPath).ToNot(BeAnExistingFile())
-		})
-
-		It("reports progress as downloaded and total byte counts", func() {
-			srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-				if strings.HasSuffix(r.URL.Path, "checksums.txt") {
-					_, _ = w.Write([]byte(checksums))
-					return
-				}
-				w.Header().Set("Content-Length", strconv.Itoa(len(content)))
-				w.WriteHeader(http.StatusOK)
-				_, _ = w.Write(content)
-			}))
-			defer srv.Close()
-			rm.BaseDownloadURL = srv.URL
-
-			var mu sync.Mutex
-			var lastDownloaded, lastTotal int64
-			err := rm.DownloadRelease(version, func(downloaded, total int64) {
-				mu.Lock()
-				lastDownloaded = downloaded
-				lastTotal = total
-				mu.Unlock()
-			})
-			Expect(err).ToNot(HaveOccurred())
-			Expect(lastTotal).To(Equal(int64(len(content))))
-			Expect(lastDownloaded).To(Equal(int64(len(content))))
-		})
-	})
-
-	Describe("GetLatestRelease", func() {
-		It("resolves the latest version from the releases/latest redirect", func() {
-			// The github.com redirect path must be preferred over the
-			// rate-limited api.github.com, so a working redirect yields the tag
-			// without ever needing the API.
-			srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-				switch {
-				case strings.HasSuffix(r.URL.Path, "/releases/latest"):
-					http.Redirect(w, r, "/owner/repo/releases/tag/v9.9.9", http.StatusFound)
-				case strings.HasSuffix(r.URL.Path, "/releases/tag/v9.9.9"):
-					w.WriteHeader(http.StatusOK)
-				default:
-					w.WriteHeader(http.StatusNotFound)
-				}
-			}))
-			defer srv.Close()
-			rm.BaseDownloadURL = srv.URL
-			rm.GitHubOwner = "owner"
-			rm.GitHubRepo = "repo"
-
-			release, err := rm.GetLatestRelease()
-			Expect(err).ToNot(HaveOccurred())
-			Expect(release.Version).To(Equal("v9.9.9"))
-		})
-	})
 })
--- a/cmd/launcher/internal/systray_manager.go
+++ b/cmd/launcher/internal/systray_manager.go
@@ -443,23 +443,84 @@ func (sm *SystrayManager) showStartupErrorDialog(err error) {
 	})
 }

-// showDownloadProgress shows a progress window for downloading updates. The
-// progress UI (byte readout, resume-aware retry, sizing) is shared with the
-// other download entry points via the launcher; only the post-success behaviour
-// (restart prompt + systray refresh) is specific to the update flow.
+// showDownloadProgress shows a progress window for downloading updates
 func (sm *SystrayManager) showDownloadProgress(version string) {
-	sm.launcher.showDownloadProgressWindow(version, fmt.Sprintf("Downloading LocalAI version %s", version), func(win fyne.Window) {
-		dialog.ShowConfirm("Update Downloaded",
-			"LocalAI has been updated successfully. Please restart the launcher to use the new version.",
-			func(restart bool) {
-				if restart {
-					sm.app.Quit()
-				}
-				win.Close()
-			}, win)
+	// Create a new window for download progress
+	progressWindow := sm.app.NewWindow("Downloading LocalAI Update")
+	progressWindow.Resize(fyne.NewSize(400, 250))
+	progressWindow.CenterOnScreen()

-		sm.hasUpdateAvailable = false
-		sm.latestVersion = ""
-		sm.recreateMenu()
+	// Progress bar
+	progressBar := widget.NewProgressBar()
+	progressBar.SetValue(0)
+
+	// Status label. Truncate with an ellipsis so a long "Download failed:
+	// <url>" message can't stretch the window (and progress bar) to fit the
+	// whole error on one line; the full error is shown in the dialog below.
+	statusLabel := widget.NewLabel("Preparing download...")
+	statusLabel.Truncation = fyne.TextTruncateEllipsis
+
+	// Release notes button
+	releaseNotesButton := widget.NewButton("View Release Notes", func() {
+		releaseNotesURL, err := sm.launcher.githubReleaseNotesURL(version)
+		if err != nil {
+			log.Printf("Failed to parse URL: %v", err)
+			return
+		}
+
+		sm.app.OpenURL(releaseNotesURL)
 	})
+
+	// Progress container
+	progressContainer := container.NewVBox(
+		widget.NewLabel(fmt.Sprintf("Downloading LocalAI version %s", version)),
+		progressBar,
+		statusLabel,
+		widget.NewSeparator(),
+		releaseNotesButton,
+	)
+
+	progressWindow.SetContent(progressContainer)
+	progressWindow.Show()
+
+	// Start download in background
+	go func() {
+		err := sm.launcher.DownloadUpdate(version, func(progress float64) {
+			// Update progress bar
+			fyne.Do(func() {
+				progressBar.SetValue(progress)
+				percentage := int(progress * 100)
+				statusLabel.SetText(fmt.Sprintf("Downloading... %d%%", percentage))
+			})
+		})
+
+		// Handle completion
+		fyne.Do(func() {
+			if err != nil {
+				statusLabel.SetText(fmt.Sprintf("Download failed: %v", err))
+				// Show error dialog
+				dialog.ShowError(err, progressWindow)
+			} else {
+				statusLabel.SetText("Download completed successfully!")
+				progressBar.SetValue(1.0)
+
+				// Show restart dialog
+				dialog.ShowConfirm("Update Downloaded",
+					"LocalAI has been updated successfully. Please restart the launcher to use the new version.",
+					func(restart bool) {
+						if restart {
+							sm.app.Quit()
+						}
+						progressWindow.Close()
+					}, progressWindow)
+			}
+		})
+
+		// Update systray menu
+		if err == nil {
+			sm.hasUpdateAvailable = false
+			sm.latestVersion = ""
+			sm.recreateMenu()
+		}
+	}()
 }
--- a/cmd/launcher/internal/ui.go
+++ b/cmd/launcher/internal/ui.go
@@ -490,19 +490,14 @@ func (ui *LauncherUI) downloadUpdate() {
 	ui.UpdateStatus("Downloading update " + version + "...")

 	go func() {
-		err := ui.launcher.DownloadUpdate(version, func(downloaded, total int64) {
+		err := ui.launcher.DownloadUpdate(version, func(progress float64) {
+			// Update progress bar
 			fyne.Do(func() {
-				if total > 0 {
-					ui.progressBar.SetValue(float64(downloaded) / float64(total))
-				}
+				ui.progressBar.SetValue(progress)
 			})
-			// The progress bar already shows the percentage, so report the
-			// human-readable size here instead of repeating the percent.
-			if total > 0 {
-				ui.UpdateStatus(fmt.Sprintf("Downloading update %s… %s / %s", version, formatBytes(downloaded), formatBytes(total)))
-			} else {
-				ui.UpdateStatus(fmt.Sprintf("Downloading update %s… %s", version, formatBytes(downloaded)))
-			}
+			// Update status with percentage
+			percentage := int(progress * 100)
+			ui.UpdateStatus(fmt.Sprintf("Downloading update %s... %d%%", version, percentage))
 		})

 		fyne.Do(func() {
@@ -603,6 +598,82 @@ func (ui *LauncherUI) LoadConfiguration() {
 	log.Printf("UI LoadConfiguration: configuration loaded successfully")
 }

+// showDownloadProgress shows a progress window for downloading LocalAI
+func (ui *LauncherUI) showDownloadProgress(version, title string) {
+	fyne.DoAndWait(func() {
+		// Create progress window using the launcher's app
+		progressWindow := ui.launcher.app.NewWindow("Downloading LocalAI")
+		progressWindow.Resize(fyne.NewSize(400, 250))
+		progressWindow.CenterOnScreen()
+
+		// Progress bar
+		progressBar := widget.NewProgressBar()
+		progressBar.SetValue(0)
+
+		// Status label. Truncate with an ellipsis so a long "Download failed:
+		// <url>" message can't stretch the window (and progress bar) to fit the
+		// whole error on one line; the full error is shown in the dialog below.
+		statusLabel := widget.NewLabel("Preparing download...")
+		statusLabel.Truncation = fyne.TextTruncateEllipsis
+
+		// Release notes button
+		releaseNotesButton := widget.NewButton("View Release Notes", func() {
+			releaseNotesURL, err := ui.launcher.githubReleaseNotesURL(version)
+			if err != nil {
+				log.Printf("Failed to parse URL: %v", err)
+				return
+			}
+
+			ui.launcher.app.OpenURL(releaseNotesURL)
+		})
+
+		// Progress container
+		progressContainer := container.NewVBox(
+			widget.NewLabel(title),
+			progressBar,
+			statusLabel,
+			widget.NewSeparator(),
+			releaseNotesButton,
+		)
+
+		progressWindow.SetContent(progressContainer)
+		progressWindow.Show()
+
+		// Start download in background
+		go func() {
+			err := ui.launcher.DownloadUpdate(version, func(progress float64) {
+				// Update progress bar
+				fyne.Do(func() {
+					progressBar.SetValue(progress)
+					percentage := int(progress * 100)
+					statusLabel.SetText(fmt.Sprintf("Downloading... %d%%", percentage))
+				})
+			})
+
+			// Handle completion
+			fyne.Do(func() {
+				if err != nil {
+					statusLabel.SetText(fmt.Sprintf("Download failed: %v", err))
+					// Show error dialog
+					dialog.ShowError(err, progressWindow)
+				} else {
+					statusLabel.SetText("Download completed successfully!")
+					progressBar.SetValue(1.0)
+
+					// Show success dialog
+					dialog.ShowConfirm("Installation Complete",
+						"LocalAI has been downloaded and installed successfully. You can now start LocalAI from the launcher.",
+						func(close bool) {
+							progressWindow.Close()
+							// Update status
+							ui.UpdateStatus("LocalAI installed successfully")
+						}, progressWindow)
+				}
+			})
+		}()
+	})
+}
+
 // UpdateRunningState updates UI based on LocalAI running state
 func (ui *LauncherUI) UpdateRunningState(isRunning bool) {
 	fyne.Do(func() {
--- a/contrib/macos/sign-and-notarize.sh
+++ b/contrib/macos/sign-and-notarize.sh
@@ -71,42 +71,13 @@ cmd_notarize() {
  echo "[notarize] notarized and stapled $dmg"
 }

-# Notarize and staple the .app bundle itself. Stapling the dmg alone is not
-# enough: an app with no embedded ticket has no local proof of notarization, so
-# Gatekeeper falls back to an online check — and the app then fails to launch on
-# a machine that is offline / behind a firewall, or once it has been copied out
-# of the dmg. Stapling the bundle makes it verify offline. notarytool needs an
-# archive for a bundle, so we zip it first.
-cmd_notarize_app() {
-  local app="$1"
-  if [ -z "${MACOS_NOTARY_KEY:-}" ]; then
-    echo "[notarize] MACOS_NOTARY_KEY unset: skipping notarization of $app"
-    return 0
-  fi
-  local keyfile zip
-  keyfile="$(mktemp).p8"
-  zip="$(mktemp).zip"
-  echo "$MACOS_NOTARY_KEY" | base64 --decode > "$keyfile"
-  ditto -c -k --keepParent "$app" "$zip"
-  xcrun notarytool submit "$zip" \
-    --key "$keyfile" \
-    --key-id "${MACOS_NOTARY_KEY_ID:?}" \
-    --issuer "${MACOS_NOTARY_ISSUER_ID:?}" \
-    --wait
-  rm -f "$keyfile" "$zip"
-  xcrun stapler staple "$app"
-  xcrun stapler validate "$app"
-  echo "[notarize] notarized and stapled $app"
-}
-
 main() {
  local sub="${1:-}"; shift || true
  case "$sub" in
-    import-cert)  cmd_import_cert ;;
-    sign)         cmd_sign "$@" ;;
-    notarize)     cmd_notarize "$@" ;;
-    notarize-app) cmd_notarize_app "$@" ;;
-    *) echo "usage: $0 {import-cert|sign <path>|notarize <dmg>|notarize-app <app>}" >&2; exit 2 ;;
+    import-cert) cmd_import_cert ;;
+    sign)        cmd_sign "$@" ;;
+    notarize)    cmd_notarize "$@" ;;
+    *) echo "usage: $0 {import-cert|sign <path>|notarize <dmg>}" >&2; exit 2 ;;
  esac
 }

--- a/core/application/application.go
+++ b/core/application/application.go
@@ -103,11 +103,6 @@ func newApplication(appConfig *config.ApplicationConfig) *Application {
 		mcpTools.CloseMCPSessions(modelName)
 	})

-	// Record a model_load backend trace for every real backend load, so the
-	// Traces UI shows which backend runtime served each model and how long
-	// the load took. Load failures are traced by the modality wrappers.
-	ml.SetLoadObserver(corebackend.ModelLoadTraceObserver(appConfig))
-
 	app := &Application{
 		backendLoader:      config.NewModelConfigLoader(appConfig.SystemState.Model.ModelsPath),
 		modelLoader:        ml,
--- a/core/application/config_file_watcher.go
+++ b/core/application/config_file_watcher.go
@@ -197,7 +197,6 @@ func readRuntimeSettingsJson(startupAppConfig config.ApplicationConfig) fileHand
 		envWatchdogBusy := appConfig.WatchDogBusy == startupAppConfig.WatchDogBusy
 		envWatchdogIdleTimeout := appConfig.WatchDogIdleTimeout == startupAppConfig.WatchDogIdleTimeout
 		envWatchdogBusyTimeout := appConfig.WatchDogBusyTimeout == startupAppConfig.WatchDogBusyTimeout
-		envWatchdogInterval := appConfig.WatchDogInterval == startupAppConfig.WatchDogInterval
 		envSingleBackend := appConfig.SingleBackend == startupAppConfig.SingleBackend
 		envMaxActiveBackends := appConfig.MaxActiveBackends == startupAppConfig.MaxActiveBackends
 		envMemoryReclaimerEnabled := appConfig.MemoryReclaimerEnabled == startupAppConfig.MemoryReclaimerEnabled
@@ -258,14 +257,6 @@ func readRuntimeSettingsJson(startupAppConfig config.ApplicationConfig) fileHand
 					xlog.Warn("invalid watchdog busy timeout in runtime_settings.json", "error", err, "timeout", *settings.WatchdogBusyTimeout)
 				}
 			}
-			if settings.WatchdogInterval != nil && !envWatchdogInterval {
-				dur, err := time.ParseDuration(*settings.WatchdogInterval)
-				if err == nil {
-					appConfig.WatchDogInterval = dur
-				} else {
-					xlog.Warn("invalid watchdog interval in runtime_settings.json", "error", err, "interval", *settings.WatchdogInterval)
-				}
-			}
 			// Handle MaxActiveBackends (new) and SingleBackend (deprecated)
 			if settings.MaxActiveBackends != nil && !envMaxActiveBackends {
 				appConfig.MaxActiveBackends = *settings.MaxActiveBackends
--- a/core/application/runtime_settings_branding_test.go
+++ b/core/application/runtime_settings_branding_test.go
@@ -87,31 +87,6 @@ var _ = Describe("loadRuntimeSettingsFromFile", func() {
 		})
 	})

-	// Watchdog check interval (issue #10601). Unlike the idle/busy timeouts
-	// (which default to 0), NewApplicationConfig baseline-defaults the
-	// interval to 500ms. The loader's "apply file value only if still at the
-	// zero default" env-detection therefore never fired for the interval, so
-	// a UI-saved Check Interval silently reverted to 500ms on every restart
-	// while the idle/busy timeouts persisted. These specs construct the
-	// config the same way boot does (NewApplicationConfig) so they observe
-	// the real default the loader sees.
-	Describe("watchdog interval", func() {
-		It("loads a UI-saved watchdog_interval on the next startup", func() {
-			cfg := config.NewApplicationConfig()
-			cfg.DynamicConfigsDir = seedSettings(`{"watchdog_interval": "2s"}`)
-			loadRuntimeSettingsFromFile(cfg)
-			Expect(cfg.WatchDogInterval).To(Equal(2 * time.Second))
-		})
-
-		It("does not override an explicit env/CLI interval", func() {
-			cfg := config.NewApplicationConfig()
-			cfg.DynamicConfigsDir = seedSettings(`{"watchdog_interval": "2s"}`)
-			cfg.WatchDogInterval = 1 * time.Second // simulate SetWatchDogInterval from env
-			loadRuntimeSettingsFromFile(cfg)
-			Expect(cfg.WatchDogInterval).To(Equal(1*time.Second), "env/CLI interval must win over the persisted file value")
-		})
-	})
-
 	// MITM listener address. The file is the only source — no env var
 	// exists — so a regression here means an admin who configured the
 	// listener via /api/settings loses it after a reboot, even though
--- a/core/backend/model_load_trace_test.go
+++ b/core/backend/model_load_trace_test.go
@@ -1,72 +0,0 @@
-package backend_test
-
-import (
-	"errors"
-	"time"
-
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-
-	"github.com/mudler/LocalAI/core/backend"
-	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/core/trace"
-	"github.com/mudler/LocalAI/pkg/model"
-)
-
-// ModelLoadTraceObserver is what makes successful loads visible on the
-// Traces page: one model_load row per real backend load, carrying the
-// resolved backend runtime. Failures must NOT be recorded here — the
-// modality wrappers own those — and the observer must respect the runtime
-// tracing toggle.
-var _ = Describe("ModelLoadTraceObserver", func() {
-	var appConfig *config.ApplicationConfig
-
-	successEvent := model.BackendLoadEvent{
-		ModelID:    "parakeet-cpp-realtime_eou_120m-v1",
-		ModelName:  "realtime_eou_120m.gguf",
-		Backend:    "parakeet-cpp",
-		BackendURI: "/backends/intel-sycl-f16-parakeet-cpp-development/run.sh",
-		Duration:   1500 * time.Millisecond,
-	}
-
-	BeforeEach(func() {
-		appConfig = &config.ApplicationConfig{
-			EnableTracing:   true,
-			TracingMaxItems: 64,
-		}
-		trace.InitBackendTracingIfEnabled(appConfig.TracingMaxItems, appConfig.TracingMaxBodyBytes)
-		trace.ClearBackendTraces()
-	})
-
-	It("records a model_load trace with the backend runtime on success", func() {
-		backend.ModelLoadTraceObserver(appConfig)(successEvent)
-
-		Eventually(trace.GetBackendTraces).Should(HaveLen(1))
-		got := trace.GetBackendTraces()[0]
-		Expect(got.Type).To(Equal(trace.BackendTraceModelLoad))
-		Expect(got.Summary).To(Equal("Model loaded"))
-		Expect(got.ModelName).To(Equal("parakeet-cpp-realtime_eou_120m-v1"))
-		Expect(got.Backend).To(Equal("parakeet-cpp"))
-		Expect(got.Duration).To(Equal(1500 * time.Millisecond))
-		Expect(got.Data["backend_runtime"]).To(Equal("/backends/intel-sycl-f16-parakeet-cpp-development/run.sh"))
-		Expect(got.Data["model_file"]).To(Equal("realtime_eou_120m.gguf"))
-		Expect(got.Error).To(BeEmpty())
-	})
-
-	It("skips failed loads — the modality wrappers trace those with request context", func() {
-		failed := successEvent
-		failed.Err = errors.New("grpc service not ready")
-
-		backend.ModelLoadTraceObserver(appConfig)(failed)
-
-		Consistently(trace.GetBackendTraces, "100ms", "20ms").Should(BeEmpty())
-	})
-
-	It("records nothing when tracing is disabled", func() {
-		appConfig.EnableTracing = false
-
-		backend.ModelLoadTraceObserver(appConfig)(successEvent)
-
-		Consistently(trace.GetBackendTraces, "100ms", "20ms").Should(BeEmpty())
-	})
-})
--- a/core/backend/options.go
+++ b/core/backend/options.go
@@ -19,39 +19,6 @@ import (
 	"github.com/mudler/xlog"
 )

-// ModelLoadTraceObserver returns the ModelLoader load observer that records
-// a model_load backend trace for every successful real load (backend process
-// spawn + LoadModel RPC; cache hits never reach the observer). Failures are
-// deliberately skipped here: the modality wrappers already record them via
-// recordModelLoadFailure with request context, and the backend auto-discovery
-// scan probes several backends before one succeeds — tracing every probe
-// failure would bury the buffer in noise.
-//
-// The traced data includes the resolved backend runtime (the installed
-// backend's launcher path, which names the variant directory) — that is what
-// identifies WHICH build served the load. A stale installed backend is
-// invisible in the model config but obvious here.
-func ModelLoadTraceObserver(appConfig *config.ApplicationConfig) func(model.BackendLoadEvent) {
-	return func(ev model.BackendLoadEvent) {
-		if ev.Err != nil || !appConfig.EnableTracing {
-			return
-		}
-		trace.InitBackendTracingIfEnabled(appConfig.TracingMaxItems, appConfig.TracingMaxBodyBytes)
-		trace.RecordBackendTrace(trace.BackendTrace{
-			Timestamp: time.Now(),
-			Duration:  ev.Duration,
-			Type:      trace.BackendTraceModelLoad,
-			ModelName: ev.ModelID,
-			Backend:   ev.Backend,
-			Summary:   "Model loaded",
-			Data: map[string]any{
-				"model_file":      ev.ModelName,
-				"backend_runtime": ev.BackendURI,
-			},
-		})
-	}
-}
-
 // recordModelLoadFailure records a backend trace when model loading fails.
 func recordModelLoadFailure(appConfig *config.ApplicationConfig, modelName, backend string, err error, data map[string]any) {
 	if !appConfig.EnableTracing {
--- a/core/backend/transcript.go
+++ b/core/backend/transcript.go
@@ -181,7 +181,6 @@ func transcriptResultFromProto(r *proto.TranscriptResult) *schema.TranscriptionR
 		Text:     r.Text,
 		Language: r.Language,
 		Duration: float64(r.Duration),
-		Eou:      r.Eou,
 	}

 	for _, s := range r.Segments {
--- a/core/backend/transcript_live.go
+++ b/core/backend/transcript_live.go
@@ -1,297 +0,0 @@
-package backend
-
-import (
-	"context"
-	"errors"
-	"fmt"
-	"io"
-	"maps"
-	"sync"
-	"time"
-
-	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/core/schema"
-	"github.com/mudler/LocalAI/core/trace"
-	grpcPkg "github.com/mudler/LocalAI/pkg/grpc"
-	"github.com/mudler/LocalAI/pkg/grpc/proto"
-	"github.com/mudler/LocalAI/pkg/model"
-	"github.com/mudler/LocalAI/pkg/sound"
-	"github.com/mudler/xlog"
-)
-
-// LiveTranscriptionEvent is one streamed event from a live (bidirectional)
-// transcription session. Delta/Eou/Eob/Words arrive as the user speaks; Final
-// is set exactly once, on the terminal event after Close flushes the decode
-// tail. Eou means the model judged the user yielded the turn; Eob means a
-// backchannel ("uh-huh") ended — callers must NOT treat Eob as a turn
-// boundary.
-type LiveTranscriptionEvent struct {
-	Delta string
-	Eou   bool
-	Eob   bool
-	Words []schema.TranscriptionWord
-	Final *schema.TranscriptionResult
-}
-
-// LiveTranscriptionSession is a handle on an open live transcription stream.
-// Feed pushes 16 kHz mono float PCM; Close signals end-of-audio, waits for
-// the backend's terminal Final event to be delivered, and releases the
-// stream.
-type LiveTranscriptionSession interface {
-	Feed(pcm []float32) error
-	Close() error
-}
-
-// liveCloseDrainTimeout bounds how long Close waits for the backend to flush
-// the decode tail before force-cancelling the stream. Finalize is one short
-// engine call; seconds here means the backend is wedged.
-const liveCloseDrainTimeout = 10 * time.Second
-
-type liveTranscriptionSession struct {
-	stream    grpcPkg.AudioTranscriptionLiveClient
-	cancel    context.CancelFunc
-	recvDone  chan struct{}
-	recvErr   error // written by the recv goroutine before recvDone closes
-	closeOnce sync.Once
-	closeErr  error
-	trace     *liveTraceState // nil when tracing was disabled at open
-}
-
-func (s *liveTranscriptionSession) Feed(pcm []float32) error {
-	s.trace.addPCM(pcm)
-	return s.stream.Send(&proto.TranscriptLiveRequest{
-		Payload: &proto.TranscriptLiveRequest_Audio{Audio: &proto.TranscriptLiveAudio{Pcm: pcm}},
-	})
-}
-
-func (s *liveTranscriptionSession) Close() error {
-	s.closeOnce.Do(func() {
-		err := s.stream.CloseSend()
-		select {
-		case <-s.recvDone:
-		case <-time.After(liveCloseDrainTimeout):
-			xlog.Warn("live transcription: backend did not finalize in time; cancelling stream")
-			s.cancel()
-			<-s.recvDone
-		}
-		s.cancel()
-		if err == nil {
-			err = s.recvErr
-		}
-		s.closeErr = err
-		s.trace.record(err)
-	})
-	return s.closeErr
-}
-
-// liveSampleRate is the PCM rate of a live transcription session, fixed by
-// the session config sent in ModelTranscriptionLive.
-const liveSampleRate = 16000
-
-// liveTraceState accumulates what the per-turn backend trace needs while a
-// live session runs: a bounded copy of the fed PCM for the audio snippet,
-// the decode outputs, and timing. One trace is recorded at Close — the live
-// path never touches the unary transcription wrapper, so without this a
-// streaming-only pipeline produced no transcription traces at all. Feed and
-// the recv goroutine run concurrently; mu guards the accumulators.
-type liveTraceState struct {
-	appConfig *config.ApplicationConfig
-	modelName string
-	backend   string
-	language  string
-	started   time.Time
-
-	mu          sync.Mutex
-	pcm         []byte // first trace.MaxSnippetSeconds of fed audio, int16 LE
-	fedSamples  int    // ALL samples fed, beyond the snippet cap
-	deltaEvents int
-	eouEvents   int
-	eobEvents   int
-	finalText   string
-}
-
-func newLiveTraceState(modelConfig config.ModelConfig, appConfig *config.ApplicationConfig, language string) *liveTraceState {
-	if !appConfig.EnableTracing {
-		return nil
-	}
-	trace.InitBackendTracingIfEnabled(appConfig.TracingMaxItems, appConfig.TracingMaxBodyBytes)
-	return &liveTraceState{
-		appConfig: appConfig,
-		modelName: modelConfig.Name,
-		backend:   modelConfig.Backend,
-		language:  language,
-		started:   time.Now(),
-	}
-}
-
-func (ts *liveTraceState) addPCM(pcm []float32) {
-	if ts == nil {
-		return
-	}
-	ts.mu.Lock()
-	defer ts.mu.Unlock()
-	ts.fedSamples += len(pcm)
-	maxBytes := trace.MaxSnippetSeconds * liveSampleRate * 2
-	if room := (maxBytes - len(ts.pcm)) / 2; room > 0 {
-		if len(pcm) > room {
-			pcm = pcm[:room]
-		}
-		ts.pcm = append(ts.pcm, sound.Float32sToInt16LEBytes(pcm)...)
-	}
-}
-
-func (ts *liveTraceState) observe(ev LiveTranscriptionEvent) {
-	if ts == nil {
-		return
-	}
-	ts.mu.Lock()
-	defer ts.mu.Unlock()
-	if ev.Delta != "" {
-		ts.deltaEvents++
-	}
-	if ev.Eou {
-		ts.eouEvents++
-	}
-	if ev.Eob {
-		ts.eobEvents++
-	}
-	if ev.Final != nil {
-		ts.finalText = ev.Final.Text
-	}
-}
-
-func (ts *liveTraceState) record(closeErr error) {
-	if ts == nil || !ts.appConfig.EnableTracing {
-		return
-	}
-	ts.mu.Lock()
-	data := map[string]any{
-		"source":       "live_stream",
-		"language":     ts.language,
-		"result_text":  ts.finalText,
-		"eou_events":   ts.eouEvents,
-		"eob_events":   ts.eobEvents,
-		"delta_events": ts.deltaEvents,
-	}
-	if snippet := trace.AudioSnippetFromPCM(ts.pcm, liveSampleRate, ts.fedSamples*2, ts.appConfig.TracingMaxBodyBytes); snippet != nil {
-		maps.Copy(data, snippet)
-	}
-	summary := "live -> " + ts.finalText
-	ts.mu.Unlock()
-
-	bt := trace.BackendTrace{
-		Timestamp: ts.started,
-		Duration:  time.Since(ts.started),
-		Type:      trace.BackendTraceTranscription,
-		ModelName: ts.modelName,
-		Backend:   ts.backend,
-		Summary:   trace.TruncateString(summary, 200),
-		Data:      data,
-	}
-	if closeErr != nil {
-		bt.Error = closeErr.Error()
-	}
-	trace.RecordBackendTrace(bt)
-}
-
-// ModelTranscriptionLive loads the transcription backend, opens the
-// bidirectional AudioTranscriptionLive RPC, sends the session config, and
-// BLOCKS until the backend's ready ack. A grpcerrors.
-// IsLiveTranscriptionUnsupported error means the backend (or the loaded
-// model) cannot do live transcription and the caller should degrade to the
-// unary/file path. After a successful return, onEvent is invoked from a
-// background goroutine — in order, one event at a time — for every response
-// the backend streams, ending with the Final event triggered by Close.
-func ModelTranscriptionLive(ctx context.Context, language string,
-	ml *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig,
-	onEvent func(LiveTranscriptionEvent)) (LiveTranscriptionSession, error) {
-
-	transcriptionModel, err := loadTranscriptionModel(ctx, ml, modelConfig, appConfig)
-	if err != nil {
-		return nil, err
-	}
-
-	// The derived cancel out-lives this call inside the session: Close uses
-	// it to unwind the stream (and, in embed mode, the server-side recv
-	// pump, which only stops on send-close or context cancellation).
-	streamCtx, cancel := context.WithCancel(ctx)
-	stream, err := transcriptionModel.AudioTranscriptionLive(streamCtx)
-	if err != nil {
-		cancel()
-		return nil, err
-	}
-
-	fail := func(err error) (LiveTranscriptionSession, error) {
-		_ = stream.CloseSend()
-		cancel()
-		return nil, err
-	}
-
-	if err := stream.Send(&proto.TranscriptLiveRequest{
-		Payload: &proto.TranscriptLiveRequest_Config{Config: &proto.TranscriptLiveConfig{
-			Language:   language,
-			SampleRate: liveSampleRate,
-		}},
-	}); err != nil {
-		return fail(err)
-	}
-
-	// Ready-ack contract: the backend answers a successful open with a
-	// {ready:true} response before any transcript data; unsupported
-	// backends surface Unimplemented here instead.
-	ack, err := stream.Recv()
-	if err != nil {
-		return fail(err)
-	}
-	if !ack.GetReady() {
-		return fail(fmt.Errorf("live transcription: backend %q broke the ready-ack contract (first response carried data)", modelConfig.Backend))
-	}
-
-	s := &liveTranscriptionSession{
-		stream:   stream,
-		cancel:   cancel,
-		recvDone: make(chan struct{}),
-		trace:    newLiveTraceState(modelConfig, appConfig, language),
-	}
-
-	go func() {
-		defer close(s.recvDone)
-		for {
-			resp, err := stream.Recv()
-			if err != nil {
-				if !errors.Is(err, io.EOF) && streamCtx.Err() == nil {
-					xlog.Warn("live transcription stream ended unexpectedly", "error", err)
-					s.recvErr = err
-				}
-				return
-			}
-			ev := liveEventFromProto(resp)
-			if ev.Delta == "" && !ev.Eou && !ev.Eob && len(ev.Words) == 0 && ev.Final == nil {
-				continue // duplicate ready ack / keep-alive: nothing to deliver
-			}
-			s.trace.observe(ev)
-			onEvent(ev)
-		}
-	}()
-
-	return s, nil
-}
-
-func liveEventFromProto(r *proto.TranscriptLiveResponse) LiveTranscriptionEvent {
-	ev := LiveTranscriptionEvent{
-		Delta: r.GetDelta(),
-		Eou:   r.GetEou(),
-		Eob:   r.GetEob(),
-	}
-	for _, w := range r.GetWords() {
-		ev.Words = append(ev.Words, schema.TranscriptionWord{
-			Start: time.Duration(w.Start),
-			End:   time.Duration(w.End),
-			Text:  w.Text,
-		})
-	}
-	if r.GetFinalResult() != nil {
-		ev.Final = transcriptResultFromProto(r.GetFinalResult())
-	}
-	return ev
-}
--- a/core/backend/transcript_live_internal_test.go
+++ b/core/backend/transcript_live_internal_test.go
@@ -1,162 +0,0 @@
-package backend
-
-import (
-	"errors"
-	"time"
-
-	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/core/schema"
-	"github.com/mudler/LocalAI/core/trace"
-	"github.com/mudler/LocalAI/pkg/grpc/proto"
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-var _ = Describe("liveEventFromProto", func() {
-	It("maps deltas, eou flags and words (ns -> duration)", func() {
-		ev := liveEventFromProto(&proto.TranscriptLiveResponse{
-			Delta: "hello ",
-			Eou:   true,
-			Words: []*proto.TranscriptWord{
-				{Start: int64(100 * time.Millisecond), End: int64(400 * time.Millisecond), Text: "hello"},
-			},
-		})
-		Expect(ev.Delta).To(Equal("hello "))
-		Expect(ev.Eou).To(BeTrue())
-		Expect(ev.Words).To(HaveLen(1))
-		Expect(ev.Words[0].Text).To(Equal("hello"))
-		Expect(ev.Words[0].Start).To(Equal(100 * time.Millisecond))
-		Expect(ev.Words[0].End).To(Equal(400 * time.Millisecond))
-		Expect(ev.Final).To(BeNil())
-	})
-
-	It("maps the terminal final result including the eou flag", func() {
-		ev := liveEventFromProto(&proto.TranscriptLiveResponse{
-			FinalResult: &proto.TranscriptResult{
-				Text:     "hello world",
-				Duration: 1.5,
-				Eou:      true,
-				Segments: []*proto.TranscriptSegment{{Id: 0, Text: "hello world"}},
-			},
-		})
-		Expect(ev.Final).NotTo(BeNil())
-		Expect(ev.Final.Text).To(Equal("hello world"))
-		Expect(ev.Final.Duration).To(BeNumerically("~", 1.5, 1e-6))
-		Expect(ev.Final.Eou).To(BeTrue())
-		Expect(ev.Final.Segments).To(HaveLen(1))
-	})
-
-	It("yields an empty event for a bare ready ack (filtered by the recv loop)", func() {
-		ev := liveEventFromProto(&proto.TranscriptLiveResponse{Ready: true})
-		Expect(ev.Delta).To(BeEmpty())
-		Expect(ev.Eou).To(BeFalse())
-		Expect(ev.Words).To(BeEmpty())
-		Expect(ev.Final).To(BeNil())
-	})
-
-	It("maps the eob backchannel flag separately from eou", func() {
-		ev := liveEventFromProto(&proto.TranscriptLiveResponse{Delta: "uh-huh", Eob: true})
-		Expect(ev.Eob).To(BeTrue())
-		Expect(ev.Eou).To(BeFalse())
-	})
-})
-
-// liveTraceState is what makes streaming-only pipelines visible on the
-// Traces page: without it a semantic_vad session with retranscribe off
-// produced no transcription trace at all. One trace per session (= one per
-// realtime turn), recorded at Close.
-var _ = Describe("liveTraceState", func() {
-	var appConfig *config.ApplicationConfig
-
-	BeforeEach(func() {
-		appConfig = &config.ApplicationConfig{
-			EnableTracing:   true,
-			TracingMaxItems: 64,
-		}
-		trace.InitBackendTracingIfEnabled(appConfig.TracingMaxItems, appConfig.TracingMaxBodyBytes)
-		trace.ClearBackendTraces()
-	})
-
-	modelCfg := func() config.ModelConfig {
-		cfg := config.ModelConfig{Backend: "parakeet-cpp"}
-		cfg.Name = "parakeet-live"
-		return cfg
-	}
-
-	It("is disabled (nil) when tracing is off, and nil receivers are no-ops", func() {
-		appConfig.EnableTracing = false
-		ts := newLiveTraceState(modelCfg(), appConfig, "en")
-		Expect(ts).To(BeNil())
-
-		// The session calls these unconditionally; nil must be safe.
-		ts.addPCM([]float32{0.5})
-		ts.observe(LiveTranscriptionEvent{Eou: true})
-		ts.record(nil)
-		Consistently(trace.GetBackendTraces, "100ms", "20ms").Should(BeEmpty())
-	})
-
-	It("records one transcription trace with text, eou event counts and audio snippet at Close", func() {
-		ts := newLiveTraceState(modelCfg(), appConfig, "en")
-		Expect(ts).NotTo(BeNil())
-
-		// One second of a loud-ish constant tone so the snippet has signal.
-		pcm := make([]float32, liveSampleRate)
-		for i := range pcm {
-			pcm[i] = 0.25
-		}
-		ts.addPCM(pcm)
-		ts.observe(LiveTranscriptionEvent{Delta: "hello "})
-		ts.observe(LiveTranscriptionEvent{Delta: "world", Eou: true})
-		ts.observe(LiveTranscriptionEvent{Final: &schema.TranscriptionResult{Text: "hello world", Eou: true}})
-
-		ts.record(nil)
-
-		Eventually(trace.GetBackendTraces).Should(HaveLen(1))
-		got := trace.GetBackendTraces()[0]
-		Expect(got.Type).To(Equal(trace.BackendTraceTranscription))
-		Expect(got.ModelName).To(Equal("parakeet-live"))
-		Expect(got.Backend).To(Equal("parakeet-cpp"))
-		Expect(got.Summary).To(ContainSubstring("hello world"))
-		Expect(got.Data["source"]).To(Equal("live_stream"))
-		Expect(got.Data["result_text"]).To(Equal("hello world"))
-		// The live FinalResult no longer carries a terminal eou flag; the
-		// per-feed eou_events count is what the trace records instead.
-		Expect(got.Data).NotTo(HaveKey("eou"))
-		Expect(got.Data["eou_events"]).To(Equal(1))
-		Expect(got.Data["delta_events"]).To(Equal(2))
-		Expect(got.Data["audio_duration_s"]).To(BeNumerically("~", 1.0, 0.01))
-		Expect(got.Data["audio_wav_base64"]).NotTo(BeEmpty())
-		Expect(got.Error).To(BeEmpty())
-	})
-
-	It("caps the stored snippet but keeps counting the full fed duration", func() {
-		ts := newLiveTraceState(modelCfg(), appConfig, "")
-
-		// Feed past the snippet cap in two chunks (cap + one extra second).
-		ts.addPCM(make([]float32, trace.MaxSnippetSeconds*liveSampleRate))
-		ts.addPCM(make([]float32, liveSampleRate))
-
-		Expect(len(ts.pcm)).To(Equal(trace.MaxSnippetSeconds * liveSampleRate * 2))
-		Expect(ts.fedSamples).To(Equal((trace.MaxSnippetSeconds + 1) * liveSampleRate))
-
-		ts.record(nil)
-		Eventually(trace.GetBackendTraces).Should(HaveLen(1))
-		got := trace.GetBackendTraces()[0]
-		Expect(got.Data["audio_duration_s"]).To(BeNumerically("~", float64(trace.MaxSnippetSeconds+1), 0.01))
-		Expect(got.Data["audio_snippet_s"]).To(BeNumerically("~", float64(trace.MaxSnippetSeconds), 0.01))
-	})
-
-	It("clamps out-of-range float samples instead of wrapping", func() {
-		ts := newLiveTraceState(modelCfg(), appConfig, "")
-		ts.addPCM([]float32{2.0, -2.0})
-		Expect(ts.pcm).To(Equal([]byte{0xff, 0x7f, 0x00, 0x80})) // 32767, -32768
-	})
-
-	It("stamps the close error on the trace", func() {
-		ts := newLiveTraceState(modelCfg(), appConfig, "")
-		ts.record(errors.New("stream torn down"))
-
-		Eventually(trace.GetBackendTraces).Should(HaveLen(1))
-		Expect(trace.GetBackendTraces()[0].Error).To(Equal("stream torn down"))
-	})
-})
--- a/core/config/application_config.go
+++ b/core/config/application_config.go
@@ -6,7 +6,6 @@ import (
 	"regexp"
 	"time"

-	"github.com/mudler/LocalAI/pkg/model"
 	"github.com/mudler/LocalAI/pkg/system"
 	"github.com/mudler/LocalAI/pkg/xsysinfo"
 	"github.com/mudler/xlog"
@@ -242,19 +241,12 @@ func NewApplicationConfig(o ...AppOption) *ApplicationConfig {
 		Context:                  context.Background(),
 		UploadLimitMB:            15,
 		Debug:                    true,
-		AgentJobRetentionDays:    30,              // Default: 30 days
-		LRUEvictionMaxRetries:    30,              // Default: 30 retries
-		LRUEvictionRetryInterval: 1 * time.Second, // Default: 1 second
-		// WatchDogInterval is intentionally left at the zero value here.
-		// The startup loader applies a persisted runtime_settings.json value
-		// only when the interval is still 0 (its "not set by env var"
-		// heuristic, matching the idle/busy timeouts); a non-zero baseline
-		// default would defeat that and silently revert a UI-saved Check
-		// Interval to the default on every restart (#10601). The effective
-		// 500ms default is supplied at the watchdog layer (DefaultWatchdogInterval)
-		// when the value is still 0.
-		TracingMaxItems:     1024,
-		TracingMaxBodyBytes: 64 * 1024, // 64 KiB - caps each request/response body in the trace buffer
+		AgentJobRetentionDays:    30,                     // Default: 30 days
+		LRUEvictionMaxRetries:    30,                     // Default: 30 retries
+		LRUEvictionRetryInterval: 1 * time.Second,        // Default: 1 second
+		WatchDogInterval:         500 * time.Millisecond, // Default: 500ms
+		TracingMaxItems:          1024,
+		TracingMaxBodyBytes:      64 * 1024, // 64 KiB - caps each request/response body in the trace buffer
 		AgentPool: AgentPoolConfig{
 			Enabled:         true,
 			Timeout:         "5m",
@@ -1105,7 +1097,7 @@ func (o *ApplicationConfig) ToRuntimeSettings() RuntimeSettings {
 	if o.WatchDogInterval > 0 {
 		watchdogInterval = o.WatchDogInterval.String()
 	} else {
-		watchdogInterval = model.DefaultWatchdogInterval.String() // default: 500ms
+		watchdogInterval = "2s" // default
 	}
 	var lruEvictionRetryInterval string
 	if o.LRUEvictionRetryInterval > 0 {
--- a/core/config/gguf.go
+++ b/core/config/gguf.go
@@ -67,16 +67,6 @@ func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) {
 		ApplyMTPDefaults(cfg, n)
 	}

-	// Sliding-window-attention models (Gemma 2/3, Cohere2, Llama 4, ...) ship
-	// with a reduced SWA KV cache by default, which cannot reuse a prompt
-	// prefix across requests and so defeats the cross-request prefix cache
-	// (cache_reuse) we enable in serving_defaults.go. Enable the full SWA cache
-	// for these models so the prefix survives; skipped for dense models and
-	// when the user already pinned an SWA cache option.
-	if w, ok := HasSlidingWindowAttention(f); ok {
-		ApplySWAFullDefault(cfg, w)
-	}
-
 	// Thinking support detection is done after model load via DetectThinkingSupportFromBackend

 	// template estimations
--- a/core/config/meta/registry.go
+++ b/core/config/meta/registry.go
@@ -567,38 +567,6 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Advanced:    true,
 			Order:       83,
 		},
-		"pipeline.turn_detection.type": {
-			Section:     "pipeline",
-			Label:       "Turn Detection",
-			Description: "Default turn-detection mode for realtime sessions on this pipeline. server_vad commits after a fixed silence window; semantic_vad lets the transcription model's end-of-utterance token drive a dynamic window (fast commit after the token, long eagerness fallback without it). semantic_vad requires a streaming-EOU transcription model (e.g. parakeet-cpp-realtime_eou_120m-v1) and degrades to silence-only otherwise. Clients can override per session via session.update.",
-			Component:   "select",
-			Options: []FieldOption{
-				{Value: "", Label: "Default (server_vad)"},
-				{Value: "server_vad", Label: "server_vad (silence-based)"},
-				{Value: "semantic_vad", Label: "semantic_vad (end-of-utterance token)"},
-			},
-			Order: 87,
-		},
-		"pipeline.turn_detection.eagerness": {
-			Section:     "pipeline",
-			Label:       "Eagerness",
-			Description: "semantic_vad fallback silence window used when no end-of-utterance token was seen: low waits 8s, medium/auto 4s, high 2s.",
-			Component:   "select",
-			Options: []FieldOption{
-				{Value: "", Label: "Default (auto)"},
-				{Value: "low", Label: "low (8s)"},
-				{Value: "medium", Label: "medium (4s)"},
-				{Value: "high", Label: "high (2s)"},
-			},
-			Order: 88,
-		},
-		"pipeline.turn_detection.retranscribe": {
-			Section:     "pipeline",
-			Label:       "Retranscribe on Commit",
-			Description: "Cross-check every semantic_vad commit with an offline decode of the buffered turn: commit only proceeds when the batch decode also ends in the end-of-utterance token, and its transcript is used. Logs a streamed-vs-batch comparison — useful to gauge streaming/batch alignment — at the cost of one extra decode per turn.",
-			Component:   "toggle",
-			Order:       89,
-		},

 		// --- Functions ---
 		"function.grammar.parallel_calls": {
--- a/core/config/model_config.go
+++ b/core/config/model_config.go
@@ -650,12 +650,6 @@ type Pipeline struct {
 	// VoiceRecognition gates the pipeline behind speaker verification. Nil
 	// (block absent) means no gate, preserving existing behavior.
 	VoiceRecognition *PipelineVoiceRecognition `yaml:"voice_recognition,omitempty" json:"voice_recognition,omitempty"`
-
-	// TurnDetection sets the server-side default turn-detection mode for
-	// realtime sessions on this pipeline, so clients need no session.update
-	// to benefit. A client session.update still overrides type and eagerness
-	// per session; retranscribe is server-side only. Unset keeps server_vad.
-	TurnDetection PipelineTurnDetection `yaml:"turn_detection,omitempty" json:"turn_detection,omitempty"`
 }

 // PipelineCompaction configures summarize-then-drop for a realtime pipeline.
@@ -940,38 +934,6 @@ func (v PipelineVoiceRecognition) Validate(registryAvailable bool) error {
 	return nil
 }

-// @Description PipelineTurnDetection sets realtime turn-detection defaults.
-type PipelineTurnDetection struct {
-	// Type selects the default turn_detection mode for sessions on this
-	// pipeline: "server_vad" (silence-based) or "semantic_vad" (the
-	// transcription model's end-of-utterance token drives a dynamic silence
-	// window; needs a streaming-EOU transcription model such as
-	// parakeet_realtime_eou_120m-v1, degrades to silence-only otherwise).
-	Type string `yaml:"type,omitempty" json:"type,omitempty"`
-	// Eagerness is the semantic_vad fallback when no end-of-utterance token
-	// was seen: low waits 8s of silence, medium/auto 4s, high 2s.
-	Eagerness string `yaml:"eagerness,omitempty" json:"eagerness,omitempty"`
-	// Retranscribe (semantic_vad only) cross-checks every EOU-triggered
-	// commit with an offline decode of the buffered turn: the commit only
-	// proceeds when the batch decode also ends in the end-of-utterance token,
-	// and its transcript is the one used. The streamed and batch transcripts
-	// are compared in the logs — a diagnostic for streaming/batch alignment
-	// at the cost of one extra decode per turn.
-	Retranscribe *bool `yaml:"retranscribe,omitempty" json:"retranscribe,omitempty"`
-}
-
-// TurnDetectionSemantic reports whether this pipeline defaults sessions to
-// semantic (EOU-driven) turn detection.
-func (p Pipeline) TurnDetectionSemantic() bool {
-	return strings.EqualFold(strings.TrimSpace(p.TurnDetection.Type), "semantic_vad")
-}
-
-// TurnDetectionRetranscribe reports whether semantic_vad commits should be
-// cross-checked (and transcribed) by an offline decode of the buffered turn.
-func (p Pipeline) TurnDetectionRetranscribe() bool {
-	return p.TurnDetection.Retranscribe != nil && *p.TurnDetection.Retranscribe
-}
-
 // @Description File configuration for model downloads
 type File struct {
 	Filename string         `yaml:"filename,omitempty" json:"filename,omitempty"`
--- a/core/config/pipeline_turn_detection_test.go
+++ b/core/config/pipeline_turn_detection_test.go
@@ -1,61 +0,0 @@
-package config
-
-import (
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-	"gopkg.in/yaml.v3"
-)
-
-// pipeline.turn_detection sets the server-side default turn-detection mode
-// for realtime sessions. Unset keeps server_vad, so existing configs are
-// unaffected; retranscribe is opt-in.
-var _ = Describe("Pipeline turn_detection config", func() {
-	It("defaults to non-semantic with retranscribe off when unset", func() {
-		var p Pipeline
-		Expect(p.TurnDetectionSemantic()).To(BeFalse())
-		Expect(p.TurnDetectionRetranscribe()).To(BeFalse())
-	})
-
-	It("parses the nested turn_detection block from YAML", func() {
-		var c ModelConfig
-		err := yaml.Unmarshal([]byte(`
-name: gpt-realtime
-pipeline:
-  transcription: parakeet-cpp-realtime_eou_120m-v1
-  turn_detection:
-    type: semantic_vad
-    eagerness: high
-    retranscribe: true
-`), &c)
-		Expect(err).ToNot(HaveOccurred())
-		Expect(c.Pipeline.TurnDetectionSemantic()).To(BeTrue())
-		Expect(c.Pipeline.TurnDetection.Eagerness).To(Equal("high"))
-		Expect(c.Pipeline.TurnDetectionRetranscribe()).To(BeTrue())
-	})
-
-	It("treats server_vad and unknown types as non-semantic", func() {
-		var p Pipeline
-		p.TurnDetection.Type = "server_vad"
-		Expect(p.TurnDetectionSemantic()).To(BeFalse())
-		p.TurnDetection.Type = "something_else"
-		Expect(p.TurnDetectionSemantic()).To(BeFalse())
-	})
-
-	It("matches semantic_vad case-insensitively with surrounding space", func() {
-		var p Pipeline
-		p.TurnDetection.Type = " Semantic_VAD "
-		Expect(p.TurnDetectionSemantic()).To(BeTrue())
-	})
-
-	It("treats an explicit retranscribe false as off", func() {
-		var c ModelConfig
-		err := yaml.Unmarshal([]byte(`
-pipeline:
-  turn_detection:
-    type: semantic_vad
-    retranscribe: false
-`), &c)
-		Expect(err).ToNot(HaveOccurred())
-		Expect(c.Pipeline.TurnDetectionRetranscribe()).To(BeFalse())
-	})
-})
--- a/core/config/swa.go
+++ b/core/config/swa.go
@@ -1,56 +0,0 @@
-package config
-
-import (
-	gguf "github.com/gpustack/gguf-parser-go"
-	"github.com/mudler/xlog"
-)
-
-// swaCacheOptionNames lists the backend option keys that control the
-// sliding-window-attention KV cache. If the user pinned any of these we leave
-// the SWA cache alone instead of forcing swa_full.
-var swaCacheOptionNames = []string{"swa_full", "n_swa"}
-
-// HasSlidingWindowAttention reports whether the parsed GGUF describes a
-// sliding-window-attention (SWA) model — Gemma 2/3, Cohere2, Llama 4 and the
-// like. The gguf-parser library normalizes the per-architecture
-// `<arch>.attention.sliding_window` metadata key into
-// GGUFArchitecture.AttentionSlidingWindow, applying the same family-specific
-// rules llama.cpp uses (e.g. Phi-3 carries the key but does not actually run
-// SWA, and is normalized to 0). A non-zero window means the model interleaves
-// SWA layers, so the returned size is also the diagnostic value we log.
-func HasSlidingWindowAttention(f *gguf.GGUFFile) (uint64, bool) {
-	if f == nil {
-		return 0, false
-	}
-	w := f.Architecture().AttentionSlidingWindow
-	return w, w > 0
-}
-
-// ApplySWAFullDefault enables the full-size SWA KV cache (swa_full:true) for a
-// sliding-window model, unless the user already pinned an SWA cache option.
-//
-// Why: llama.cpp defaults to a reduced SWA KV cache sized to the sliding window
-// (memory-light), but that reduced cache cannot preserve a prompt prefix across
-// requests. So for SWA models the cross-request prefix cache we enable in
-// serving_defaults.go (cache_reuse) is silently defeated — every turn
-// reprocesses the entire prompt. Setting swa_full:true makes llama.cpp keep the
-// full KV cache so the shared prefix is actually reused.
-//
-// The tradeoff is memory: the full SWA cache scales with context_size, so this
-// is gated to models that are genuinely SWA (never applied to dense models,
-// where it would only waste memory) and never overrides an explicit user
-// choice. `slidingWindow` is the value read from the GGUF and is used only for
-// the diagnostic log line.
-func ApplySWAFullDefault(cfg *ModelConfig, slidingWindow uint64) {
-	if cfg == nil || slidingWindow == 0 {
-		return
-	}
-	if backendOptionSet(cfg.Options, swaCacheOptionNames...) {
-		xlog.Debug("[swa] sliding-window model but an SWA cache option is already set; leaving user choice intact",
-			"name", cfg.Name, "sliding_window", slidingWindow)
-		return
-	}
-	cfg.Options = append(cfg.Options, "swa_full:true")
-	xlog.Debug("[swa] enabling swa_full for sliding-window model so the cross-request prompt-prefix cache survives (reduced SWA cache cannot reuse a prefix across requests)",
-		"name", cfg.Name, "sliding_window", slidingWindow)
-}
--- a/core/config/swa_test.go
+++ b/core/config/swa_test.go
@@ -1,120 +0,0 @@
-package config_test
-
-import (
-	. "github.com/mudler/LocalAI/core/config"
-
-	gguf "github.com/gpustack/gguf-parser-go"
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-// ggufWithSlidingWindow fabricates a minimal in-memory GGUF carrying the given
-// `general.architecture` and `<arch>.attention.sliding_window` so the SWA
-// detection can be exercised without a real model file. A window of 0 omits the
-// key, modelling a dense (non-SWA) model.
-func ggufWithSlidingWindow(arch string, window uint32) *gguf.GGUFFile {
-	kvs := gguf.GGUFMetadataKVs{
-		{
-			Key:       "general.architecture",
-			ValueType: gguf.GGUFMetadataValueTypeString,
-			Value:     arch,
-		},
-	}
-	if window > 0 {
-		kvs = append(kvs, gguf.GGUFMetadataKV{
-			Key:       arch + ".attention.sliding_window",
-			ValueType: gguf.GGUFMetadataValueTypeUint32,
-			Value:     window,
-		})
-	}
-	return &gguf.GGUFFile{
-		Header: gguf.GGUFHeader{MetadataKV: kvs},
-	}
-}
-
-var _ = Describe("SWA full-cache auto-default", func() {
-	Context("HasSlidingWindowAttention", func() {
-		It("returns false on a nil GGUF file", func() {
-			w, ok := HasSlidingWindowAttention(nil)
-			Expect(ok).To(BeFalse())
-			Expect(w).To(BeZero())
-		})
-
-		It("detects a sliding-window model (Gemma 3 style)", func() {
-			w, ok := HasSlidingWindowAttention(ggufWithSlidingWindow("gemma3", 1024))
-			Expect(ok).To(BeTrue())
-			Expect(w).To(Equal(uint64(1024)))
-		})
-
-		It("detects Gemma 2 even without an explicit key (family default window)", func() {
-			// gguf-parser applies llama.cpp's family rules: gemma2 defaults the
-			// sliding window to 4096 when the metadata key is absent.
-			w, ok := HasSlidingWindowAttention(ggufWithSlidingWindow("gemma2", 0))
-			Expect(ok).To(BeTrue())
-			Expect(w).To(Equal(uint64(4096)))
-		})
-
-		It("reports a dense model as non-SWA", func() {
-			w, ok := HasSlidingWindowAttention(ggufWithSlidingWindow("llama", 0))
-			Expect(ok).To(BeFalse())
-			Expect(w).To(BeZero())
-		})
-
-		It("treats Phi-3 as non-SWA even when the key is present", func() {
-			// Phi-3 carries attention.sliding_window but does not actually run
-			// SWA; gguf-parser normalizes it to 0 to match llama.cpp.
-			w, ok := HasSlidingWindowAttention(ggufWithSlidingWindow("phi3", 2048))
-			Expect(ok).To(BeFalse())
-			Expect(w).To(BeZero())
-		})
-	})
-
-	Context("ApplySWAFullDefault", func() {
-		It("enables swa_full for a sliding-window model when unset", func() {
-			cfg := &ModelConfig{Name: "gemma3"}
-			ApplySWAFullDefault(cfg, 1024)
-			Expect(cfg.Options).To(ContainElement("swa_full:true"))
-		})
-
-		It("is a no-op for a dense model (window 0)", func() {
-			cfg := &ModelConfig{Name: "llama"}
-			ApplySWAFullDefault(cfg, 0)
-			Expect(cfg.Options).To(BeEmpty())
-		})
-
-		It("preserves an explicit swa_full:false", func() {
-			cfg := &ModelConfig{Name: "gemma3", Options: []string{"swa_full:false"}}
-			ApplySWAFullDefault(cfg, 1024)
-			Expect(cfg.Options).To(Equal([]string{"swa_full:false"}))
-		})
-
-		It("preserves an explicit swa_full:true without duplicating it", func() {
-			cfg := &ModelConfig{Name: "gemma3", Options: []string{"swa_full:true"}}
-			ApplySWAFullDefault(cfg, 1024)
-			Expect(cfg.Options).To(Equal([]string{"swa_full:true"}))
-		})
-
-		It("respects the n_swa alias", func() {
-			cfg := &ModelConfig{Name: "gemma3", Options: []string{"n_swa:512"}}
-			ApplySWAFullDefault(cfg, 1024)
-			Expect(cfg.Options).To(Equal([]string{"n_swa:512"}))
-		})
-
-		It("preserves unrelated options already on the config", func() {
-			cfg := &ModelConfig{
-				Name:    "gemma3",
-				Options: []string{"use_jinja:true", "cache_reuse:256"},
-			}
-			ApplySWAFullDefault(cfg, 1024)
-			Expect(cfg.Options).To(Equal([]string{
-				"use_jinja:true",
-				"cache_reuse:256",
-				"swa_full:true",
-			}))
-		})
-
-		It("tolerates a nil config", func() {
-			Expect(func() { ApplySWAFullDefault(nil, 1024) }).ToNot(Panic())
-		})
-	})
-})
--- a/core/gallery/importers/diffuser.go
+++ b/core/gallery/importers/diffuser.go
@@ -101,7 +101,7 @@ func (i *DiffuserImporter) Import(details Details) (gallery.ModelConfig, error)
 		Backend:             backend,
 		PredictionOptions: schema.PredictionOptions{
 			BasicModelRequest: schema.BasicModelRequest{
-				Model: LocalModelPath(details.URI),
+				Model: details.URI,
 			},
 		},
 		Diffusers: config.Diffusers{
--- a/core/gallery/importers/helpers.go
+++ b/core/gallery/importers/helpers.go
@@ -4,24 +4,9 @@ import (
 	"path/filepath"
 	"strings"

-	"github.com/mudler/LocalAI/pkg/downloader"
 	hfapi "github.com/mudler/LocalAI/pkg/huggingface-api"
 )

-// LocalModelPath normalizes a model URI for backends that treat the model
-// field as a HuggingFace repo id or local filesystem path (mlx, mlx-vlm,
-// vllm, transformers, diffusers). A "file://" import URI is reduced to the
-// bare path it points at: mlx-lm and vLLM otherwise mis-read the "file://"
-// scheme as a repo id and fail with "Repo id must be in the form
-// 'repo_name' or 'namespace/repo_name'" (issue #7461). HuggingFace and HTTP
-// URIs are returned unchanged so the existing remote-load path is untouched.
-func LocalModelPath(uri string) string {
-	if path, ok := strings.CutPrefix(uri, downloader.LocalPrefix); ok {
-		return path
-	}
-	return uri
-}
-
 // HasFile returns true when any file in files has exactly the given basename.
 // Directory components in file.Path are ignored — a nested
 // "sub/dir/config.json" is considered a match for name = "config.json".
--- a/core/gallery/importers/helpers_test.go
+++ b/core/gallery/importers/helpers_test.go
@@ -86,21 +86,4 @@ var _ = Describe("importer helpers", func() {
 			Expect(importers.HasGGMLFile(files, "ggml-")).To(BeFalse())
 		})
 	})
-
-	Describe("LocalModelPath", func() {
-		It("strips the file:// scheme from an absolute local path", func() {
-			Expect(importers.LocalModelPath("file:///Users/u/.lmstudio/models/mlx-community/Qwen3-4bit")).
-				To(Equal("/Users/u/.lmstudio/models/mlx-community/Qwen3-4bit"))
-		})
-		It("strips the file:// scheme from a relative local path", func() {
-			Expect(importers.LocalModelPath("file://my-models/nvidia/Qwen3-30B-A3B-FP4")).
-				To(Equal("my-models/nvidia/Qwen3-30B-A3B-FP4"))
-		})
-		It("leaves HuggingFace and HTTP URIs unchanged", func() {
-			Expect(importers.LocalModelPath("https://huggingface.co/mlx-community/test-model")).
-				To(Equal("https://huggingface.co/mlx-community/test-model"))
-			Expect(importers.LocalModelPath("mlx-community/test-model")).
-				To(Equal("mlx-community/test-model"))
-		})
-	})
 })
--- a/core/gallery/importers/importers_test.go
+++ b/core/gallery/importers/importers_test.go
@@ -22,13 +22,11 @@ var _ = Describe("DiscoverModelConfig", func() {
 			modelConfig, err := importers.DiscoverModelConfig(uri, preferences)

 			Expect(err).ToNot(HaveOccurred(), fmt.Sprintf("Error: %v", err))
-			// No name preference + repo-root URI: the name follows the selected
-			// GGUF file, not the repo (issue #10587).
-			Expect(modelConfig.Name).To(Equal("localai-functioncall-qwen2.5-7b-v0.5-q4_k_m"), fmt.Sprintf("Model config: %+v", modelConfig))
+			Expect(modelConfig.Name).To(Equal("LocalAI-functioncall-qwen2.5-7b-v0.5-Q4_K_M-GGUF"), fmt.Sprintf("Model config: %+v", modelConfig))
 			Expect(modelConfig.Description).To(Equal("Imported from https://huggingface.co/mudler/LocalAI-functioncall-qwen2.5-7b-v0.5-Q4_K_M-GGUF"), fmt.Sprintf("Model config: %+v", modelConfig))
 			Expect(modelConfig.ConfigFile).To(ContainSubstring("backend: llama-cpp"), fmt.Sprintf("Model config: %+v", modelConfig))
 			Expect(len(modelConfig.Files)).To(Equal(1), fmt.Sprintf("Model config: %+v", modelConfig))
-			Expect(modelConfig.Files[0].Filename).To(Equal("llama-cpp/models/localai-functioncall-qwen2.5-7b-v0.5-q4_k_m/localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
+			Expect(modelConfig.Files[0].Filename).To(Equal("llama-cpp/models/LocalAI-functioncall-qwen2.5-7b-v0.5-Q4_K_M-GGUF/localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
 			Expect(modelConfig.Files[0].URI).To(Equal("https://huggingface.co/mudler/LocalAI-functioncall-qwen2.5-7b-v0.5-Q4_K_M-GGUF/resolve/main/localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
 			Expect(modelConfig.Files[0].SHA256).To(Equal("4e7b7fe1d54b881f1ef90799219dc6cc285d29db24f559c8998d1addb35713d4"), fmt.Sprintf("Model config: %+v", modelConfig))
 		})
@@ -40,17 +38,16 @@ var _ = Describe("DiscoverModelConfig", func() {
 			modelConfig, err := importers.DiscoverModelConfig(uri, preferences)

 			Expect(err).ToNot(HaveOccurred(), fmt.Sprintf("Error: %v", err))
-			// No name preference: name follows the selected model GGUF (issue #10587).
-			Expect(modelConfig.Name).To(Equal("Qwen3VL-2B-Instruct-Q4_K_M"), fmt.Sprintf("Model config: %+v", modelConfig))
+			Expect(modelConfig.Name).To(Equal("Qwen3-VL-2B-Instruct-GGUF"), fmt.Sprintf("Model config: %+v", modelConfig))
 			Expect(modelConfig.Description).To(Equal("Imported from https://huggingface.co/Qwen/Qwen3-VL-2B-Instruct-GGUF"), fmt.Sprintf("Model config: %+v", modelConfig))
 			Expect(modelConfig.ConfigFile).To(ContainSubstring("backend: llama-cpp"), fmt.Sprintf("Model config: %+v", modelConfig))
-			Expect(modelConfig.ConfigFile).To(ContainSubstring("mmproj: llama-cpp/mmproj/Qwen3VL-2B-Instruct-Q4_K_M/mmproj-Qwen3VL-2B-Instruct-Q8_0.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
-			Expect(modelConfig.ConfigFile).To(ContainSubstring("model: llama-cpp/models/Qwen3VL-2B-Instruct-Q4_K_M/Qwen3VL-2B-Instruct-Q4_K_M.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
+			Expect(modelConfig.ConfigFile).To(ContainSubstring("mmproj: llama-cpp/mmproj/Qwen3-VL-2B-Instruct-GGUF/mmproj-Qwen3VL-2B-Instruct-Q8_0.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
+			Expect(modelConfig.ConfigFile).To(ContainSubstring("model: llama-cpp/models/Qwen3-VL-2B-Instruct-GGUF/Qwen3VL-2B-Instruct-Q4_K_M.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
 			Expect(len(modelConfig.Files)).To(Equal(2), fmt.Sprintf("Model config: %+v", modelConfig))
-			Expect(modelConfig.Files[0].Filename).To(Equal("llama-cpp/models/Qwen3VL-2B-Instruct-Q4_K_M/Qwen3VL-2B-Instruct-Q4_K_M.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
+			Expect(modelConfig.Files[0].Filename).To(Equal("llama-cpp/models/Qwen3-VL-2B-Instruct-GGUF/Qwen3VL-2B-Instruct-Q4_K_M.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
 			Expect(modelConfig.Files[0].URI).To(Equal("https://huggingface.co/Qwen/Qwen3-VL-2B-Instruct-GGUF/resolve/main/Qwen3VL-2B-Instruct-Q4_K_M.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
 			Expect(modelConfig.Files[0].SHA256).ToNot(BeEmpty(), fmt.Sprintf("Model config: %+v", modelConfig))
-			Expect(modelConfig.Files[1].Filename).To(Equal("llama-cpp/mmproj/Qwen3VL-2B-Instruct-Q4_K_M/mmproj-Qwen3VL-2B-Instruct-Q8_0.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
+			Expect(modelConfig.Files[1].Filename).To(Equal("llama-cpp/mmproj/Qwen3-VL-2B-Instruct-GGUF/mmproj-Qwen3VL-2B-Instruct-Q8_0.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
 			Expect(modelConfig.Files[1].URI).To(Equal("https://huggingface.co/Qwen/Qwen3-VL-2B-Instruct-GGUF/resolve/main/mmproj-Qwen3VL-2B-Instruct-Q8_0.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
 			Expect(modelConfig.Files[1].SHA256).ToNot(BeEmpty(), fmt.Sprintf("Model config: %+v", modelConfig))
 		})
@@ -62,17 +59,16 @@ var _ = Describe("DiscoverModelConfig", func() {
 			modelConfig, err := importers.DiscoverModelConfig(uri, preferences)

 			Expect(err).ToNot(HaveOccurred(), fmt.Sprintf("Error: %v", err))
-			// No name preference: name follows the selected Q8_0 model GGUF (issue #10587).
-			Expect(modelConfig.Name).To(Equal("Qwen3VL-2B-Instruct-Q8_0"), fmt.Sprintf("Model config: %+v", modelConfig))
+			Expect(modelConfig.Name).To(Equal("Qwen3-VL-2B-Instruct-GGUF"), fmt.Sprintf("Model config: %+v", modelConfig))
 			Expect(modelConfig.Description).To(Equal("Imported from https://huggingface.co/Qwen/Qwen3-VL-2B-Instruct-GGUF"), fmt.Sprintf("Model config: %+v", modelConfig))
 			Expect(modelConfig.ConfigFile).To(ContainSubstring("backend: llama-cpp"), fmt.Sprintf("Model config: %+v", modelConfig))
-			Expect(modelConfig.ConfigFile).To(ContainSubstring("mmproj: llama-cpp/mmproj/Qwen3VL-2B-Instruct-Q8_0/mmproj-Qwen3VL-2B-Instruct-F16.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
-			Expect(modelConfig.ConfigFile).To(ContainSubstring("model: llama-cpp/models/Qwen3VL-2B-Instruct-Q8_0/Qwen3VL-2B-Instruct-Q8_0.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
+			Expect(modelConfig.ConfigFile).To(ContainSubstring("mmproj: llama-cpp/mmproj/Qwen3-VL-2B-Instruct-GGUF/mmproj-Qwen3VL-2B-Instruct-F16.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
+			Expect(modelConfig.ConfigFile).To(ContainSubstring("model: llama-cpp/models/Qwen3-VL-2B-Instruct-GGUF/Qwen3VL-2B-Instruct-Q8_0.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
 			Expect(len(modelConfig.Files)).To(Equal(2), fmt.Sprintf("Model config: %+v", modelConfig))
-			Expect(modelConfig.Files[0].Filename).To(Equal("llama-cpp/models/Qwen3VL-2B-Instruct-Q8_0/Qwen3VL-2B-Instruct-Q8_0.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
+			Expect(modelConfig.Files[0].Filename).To(Equal("llama-cpp/models/Qwen3-VL-2B-Instruct-GGUF/Qwen3VL-2B-Instruct-Q8_0.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
 			Expect(modelConfig.Files[0].URI).To(Equal("https://huggingface.co/Qwen/Qwen3-VL-2B-Instruct-GGUF/resolve/main/Qwen3VL-2B-Instruct-Q8_0.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
 			Expect(modelConfig.Files[0].SHA256).ToNot(BeEmpty(), fmt.Sprintf("Model config: %+v", modelConfig))
-			Expect(modelConfig.Files[1].Filename).To(Equal("llama-cpp/mmproj/Qwen3VL-2B-Instruct-Q8_0/mmproj-Qwen3VL-2B-Instruct-F16.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
+			Expect(modelConfig.Files[1].Filename).To(Equal("llama-cpp/mmproj/Qwen3-VL-2B-Instruct-GGUF/mmproj-Qwen3VL-2B-Instruct-F16.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
 			Expect(modelConfig.Files[1].URI).To(Equal("https://huggingface.co/Qwen/Qwen3-VL-2B-Instruct-GGUF/resolve/main/mmproj-Qwen3VL-2B-Instruct-F16.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
 			Expect(modelConfig.Files[1].SHA256).ToNot(BeEmpty(), fmt.Sprintf("Model config: %+v", modelConfig))
 		})
--- a/core/gallery/importers/llama-cpp.go
+++ b/core/gallery/importers/llama-cpp.go
@@ -98,13 +98,8 @@ func (i *LlamaCPPImporter) Import(details Details) (gallery.ModelConfig, error)
 		}
 	}

-	// nameProvided tracks whether the user supplied an explicit model name.
-	// When they didn't, the URI base is only a fallback: for a HuggingFace
-	// repo-root URI (no file component) it would be the repo name, so the HF
-	// branch below re-derives the name from the selected GGUF file instead
-	// (issue #10587).
-	name, nameProvided := preferencesMap["name"].(string)
-	if !nameProvided {
+	name, ok := preferencesMap["name"].(string)
+	if !ok {
 		name = filepath.Base(details.URI)
 	}

@@ -232,23 +227,10 @@ func (i *LlamaCPPImporter) Import(details Details) (gallery.ModelConfig, error)
 		mmprojGroups := hfapi.GroupShards(mmprojFiles)
 		ggufGroups := hfapi.GroupShards(ggufFiles)

-		modelGroup := pickPreferredGroup(ggufGroups, quants)
-
-		// A repo-root URI has no file component, so the URI-base fallback
-		// above produced the repo name. When the user left the name blank,
-		// derive it from the GGUF file actually selected from the listing so
-		// the gallery entry and `model:` directory reflect the model, not the
-		// repository (issue #10587). An explicit name preference always wins.
-		if !nameProvided && modelGroup != nil {
-			name = modelNameFromShardGroup(*modelGroup)
-			modelConfig.Name = name
-			cfg.Name = name
-		}
-
 		// Emit the model group first so cfg.Files[0] is the model — callers
 		// and tests rely on the model file preceding any mmproj companion.
-		if modelGroup != nil {
-			appendShardGroup(&cfg, *modelGroup, filepath.Join("llama-cpp", "models", name))
+		if group := pickPreferredGroup(ggufGroups, quants); group != nil {
+			appendShardGroup(&cfg, *group, filepath.Join("llama-cpp", "models", name))
 		}
 		if group := pickPreferredGroup(mmprojGroups, mmprojQuantsList); group != nil {
 			appendShardGroup(&cfg, *group, filepath.Join("llama-cpp", "mmproj", name))
@@ -299,20 +281,6 @@ func (i *LlamaCPPImporter) Import(details Details) (gallery.ModelConfig, error)
 	return cfg, nil
 }

-// modelNameFromShardGroup derives a human-facing model name from the picked
-// GGUF group: the logical base filename with its .gguf extension stripped.
-// ShardGroup.Base is the common prefix for sharded sets (without the
-// -NNNNN-of-MMMMM suffix) and the sole basename for single-file models, so
-// this yields a clean name like "model-Q4_K_M" rather than an individual
-// shard filename or the repo-root URI base.
-func modelNameFromShardGroup(group hfapi.ShardGroup) string {
-	base := group.Base
-	if ext := filepath.Ext(base); strings.EqualFold(ext, ".gguf") {
-		base = strings.TrimSuffix(base, ext)
-	}
-	return base
-}
-
 // pickPreferredGroup walks the preference list in priority order and returns
 // the first group whose base filename contains any preference. When nothing
 // matches, the last group wins — this preserves the historical "if the user
--- a/core/gallery/importers/llama-cpp_test.go
+++ b/core/gallery/importers/llama-cpp_test.go
@@ -372,62 +372,6 @@ var _ = Describe("LlamaCPPImporter", func() {
 			Expect(err).ToNot(HaveOccurred())
 			Expect(modelConfig.Files).To(BeEmpty())
 		})
-
-		It("derives the model name from the selected GGUF when no name is given", func() {
-			// Regression for #10587: a repo-root URI has no file component, so
-			// the URI base ("example-GGUF") is just the repo name. With the
-			// name field left blank, the emitted name and model directory must
-			// follow the GGUF file actually selected, not the repository.
-			details := withHF(`{"quantizations":"Q4_K_M"}`,
-				hfFile("Meta-Llama-3-8B-Instruct.Q4_K_M.gguf", "aaa"),
-				hfFile("Meta-Llama-3-8B-Instruct.Q3_K_M.gguf", "bbb"),
-			)
-
-			modelConfig, err := importer.Import(details)
-
-			Expect(err).ToNot(HaveOccurred())
-			Expect(modelConfig.Name).To(Equal("Meta-Llama-3-8B-Instruct.Q4_K_M"))
-			Expect(modelConfig.Files).To(HaveLen(1), fmt.Sprintf("%+v", modelConfig))
-			Expect(modelConfig.Files[0].Filename).To(Equal(
-				"llama-cpp/models/Meta-Llama-3-8B-Instruct.Q4_K_M/Meta-Llama-3-8B-Instruct.Q4_K_M.gguf"))
-			Expect(modelConfig.ConfigFile).To(ContainSubstring("name: Meta-Llama-3-8B-Instruct.Q4_K_M"))
-			Expect(modelConfig.ConfigFile).To(ContainSubstring(
-				"model: llama-cpp/models/Meta-Llama-3-8B-Instruct.Q4_K_M/Meta-Llama-3-8B-Instruct.Q4_K_M.gguf"))
-		})
-
-		It("derives a clean name from the shard base for split GGUFs when no name is given", func() {
-			// The selected primary file is shard 1; using its raw basename
-			// would leak the -00001-of-00002 suffix into the name. The shard
-			// base must be used so the name is the logical model.
-			details := withHF(``,
-				hfFile("Qwen3-30B-A3B-Q4_K_M-00001-of-00002.gguf", "p1"),
-				hfFile("Qwen3-30B-A3B-Q4_K_M-00002-of-00002.gguf", "p2"),
-			)
-
-			modelConfig, err := importer.Import(details)
-
-			Expect(err).ToNot(HaveOccurred())
-			Expect(modelConfig.Name).To(Equal("Qwen3-30B-A3B-Q4_K_M"))
-			Expect(modelConfig.Files).To(HaveLen(2), fmt.Sprintf("%+v", modelConfig))
-			Expect(modelConfig.Files[0].Filename).To(Equal(
-				"llama-cpp/models/Qwen3-30B-A3B-Q4_K_M/Qwen3-30B-A3B-Q4_K_M-00001-of-00002.gguf"))
-			Expect(modelConfig.ConfigFile).To(ContainSubstring(
-				"model: llama-cpp/models/Qwen3-30B-A3B-Q4_K_M/Qwen3-30B-A3B-Q4_K_M-00001-of-00002.gguf"))
-		})
-
-		It("keeps an explicit name over the selected GGUF filename", func() {
-			// Precedence guard: when the user supplies a name it always wins,
-			// even though a GGUF file was selected from the listing.
-			details := withHF(`{"name":"my-custom-name","quantizations":"Q4_K_M"}`,
-				hfFile("model-Q4_K_M.gguf", "aaa"),
-			)
-
-			modelConfig, err := importer.Import(details)
-
-			Expect(err).ToNot(HaveOccurred())
-			Expect(modelConfig.Name).To(Equal("my-custom-name"))
-			Expect(modelConfig.Files[0].Filename).To(Equal("llama-cpp/models/my-custom-name/model-Q4_K_M.gguf"))
-		})
 	})

 	Context("quant token boundary matching", func() {
--- a/core/gallery/importers/mlx.go
+++ b/core/gallery/importers/mlx.go
@@ -87,7 +87,7 @@ func (i *MLXImporter) Import(details Details) (gallery.ModelConfig, error) {
 		Backend:             backend,
 		PredictionOptions: schema.PredictionOptions{
 			BasicModelRequest: schema.BasicModelRequest{
-				Model: LocalModelPath(details.URI),
+				Model: details.URI,
 			},
 		},
 		TemplateConfig: config.TemplateConfig{
--- a/core/gallery/importers/mlx_test.go
+++ b/core/gallery/importers/mlx_test.go
@@ -198,24 +198,5 @@ var _ = Describe("MLXImporter", func() {
 			Expect(err).ToNot(HaveOccurred())
 			Expect(modelConfig.Name).To(Equal("model"))
 		})
-
-		It("should emit a bare filesystem path for a file:// local import", func() {
-			// Regression for #7461: a model imported from a local directory
-			// (e.g. LM Studio's store) must not carry the file:// scheme into
-			// the model field — mlx-lm rejects it as an invalid repo id.
-			preferences := json.RawMessage(`{"backend": "mlx"}`)
-			details := importers.Details{
-				URI:         "file:///Users/u/.lmstudio/models/mlx-community/Qwen3-Coder-30B-A3B-Instruct-4bit",
-				Preferences: preferences,
-			}
-
-			modelConfig, err := importer.Import(details)
-
-			Expect(err).ToNot(HaveOccurred())
-			Expect(modelConfig.Name).To(Equal("Qwen3-Coder-30B-A3B-Instruct-4bit"))
-			Expect(modelConfig.ConfigFile).To(ContainSubstring(
-				"model: /Users/u/.lmstudio/models/mlx-community/Qwen3-Coder-30B-A3B-Instruct-4bit"))
-			Expect(modelConfig.ConfigFile).ToNot(ContainSubstring("model: file://"))
-		})
 	})
 })
--- a/core/gallery/importers/transformers.go
+++ b/core/gallery/importers/transformers.go
@@ -91,7 +91,7 @@ func (i *TransformersImporter) Import(details Details) (gallery.ModelConfig, err
 		Backend:             backend,
 		PredictionOptions: schema.PredictionOptions{
 			BasicModelRequest: schema.BasicModelRequest{
-				Model: LocalModelPath(details.URI),
+				Model: details.URI,
 			},
 		},
 		TemplateConfig: config.TemplateConfig{
--- a/core/gallery/importers/vllm.go
+++ b/core/gallery/importers/vllm.go
@@ -81,7 +81,7 @@ func (i *VLLMImporter) Import(details Details) (gallery.ModelConfig, error) {
 		Backend:             backend,
 		PredictionOptions: schema.PredictionOptions{
 			BasicModelRequest: schema.BasicModelRequest{
-				Model: LocalModelPath(details.URI),
+				Model: details.URI,
 			},
 		},
 		TemplateConfig: config.TemplateConfig{
--- a/core/gallery/importers/vllm_test.go
+++ b/core/gallery/importers/vllm_test.go
@@ -177,22 +177,5 @@ var _ = Describe("VLLMImporter", func() {
 			Expect(modelConfig.ConfigFile).To(ContainSubstring("known_usecases:"))
 			Expect(modelConfig.ConfigFile).To(ContainSubstring("- chat"))
 		})
-
-		It("should emit a bare filesystem path for a file:// local import", func() {
-			// Regression for #7461: vLLM rejects a file:// model field as an
-			// invalid repo id, so a locally-imported model must carry the bare
-			// path instead.
-			preferences := json.RawMessage(`{"backend": "vllm"}`)
-			details := Details{
-				URI:         "file://my-models/nvidia/Qwen3-30B-A3B-FP4",
-				Preferences: preferences,
-			}
-
-			modelConfig, err := importer.Import(details)
-
-			Expect(err).ToNot(HaveOccurred())
-			Expect(modelConfig.ConfigFile).To(ContainSubstring("model: my-models/nvidia/Qwen3-30B-A3B-FP4"))
-			Expect(modelConfig.ConfigFile).ToNot(ContainSubstring("model: file://"))
-		})
 	})
 })
--- a/core/http/endpoints/openai/chat.go
+++ b/core/http/endpoints/openai/chat.go
@@ -618,10 +618,6 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 					finishReason = FinishReasonToolCalls
 				} else if toolsCalled {
 					finishReason = FinishReasonFunctionCall
-				} else if reachedTokenBudget(finalUsage.Completion, config.Maxtokens) {
-					// Generation stopped because it hit the max_tokens ceiling
-					// rather than a natural stop — report "length" (issue #9716).
-					finishReason = FinishReasonLength
 				}

 				// Final delta chunk: empty delta with finish_reason set. Per
@@ -988,18 +984,6 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 					}
 				}

-				// If generation hit the max_tokens ceiling, report "length"
-				// instead of a natural "stop" (issue #9716). Mirrors the
-				// streaming path; tool/function finish reasons are untouched.
-				if reachedTokenBudget(tokenUsage.Completion, config.Maxtokens) {
-					for i := range result {
-						if result[i].FinishReason != nil && *result[i].FinishReason == FinishReasonStop {
-							lengthReason := FinishReasonLength
-							result[i].FinishReason = &lengthReason
-						}
-					}
-				}
-
 				// No MCP tools to execute (or no MCP tools configured), return response
 				usage := schema.OpenAIUsage{
 					PromptTokens:     tokenUsage.Prompt,
--- a/core/http/endpoints/openai/compactcoord/compactcoord.go
+++ b/core/http/endpoints/openai/compactcoord/compactcoord.go
@@ -1,149 +0,0 @@
-// Package compactcoord is the explicit state machine for the realtime API's
-// conversation-compaction concern (machine "M4" in
-// docs/design/realtime-state-machines.md).
-//
-// In the legacy code this machine is an implicit single-flight guard: a
-// per-conversation `compacting atomic.Bool` that maybeCompact CAS-flips to start
-// a background summarize+evict and a deferred Store(false) clears. The intent —
-// at most one compaction running per conversation at a time, so two goroutines
-// never summarize and evict the same overflow concurrently (Part 4, invariant
-// #9) — is correct but implicit in a bare atomic.
-//
-// This package makes it explicit:
-//   - a sealed sum type for State (Idle | Running) — "two compactions running" is
-//     unrepresentable,
-//   - a total, pure transition function Next(state, event) -> (state, effects),
-//   - a single-writer Coordinator that serializes every transition.
-//
-// Unlike respcoord (M3), a Trigger while Running is NOT a supersede: compaction
-// is idempotent work on the same overflow, so a concurrent trigger is simply
-// dropped (matching the legacy CAS-fails-so-skip), not queued or restarted.
-package compactcoord
-
-import (
-	"fmt"
-
-	"github.com/mudler/LocalAI/core/http/endpoints/openai/coordinator"
-)
-
-// State is the sealed sum type of compaction states. Exhaustively:
-// Idle | Running | Terminated.
-type State interface {
-	isState()
-	String() string
-}
-
-// Idle: no compaction is running.
-type Idle struct{}
-
-// Running: exactly one compaction is in flight.
-type Running struct{}
-
-// Terminated: the conversation/session is torn down. Absorbing — no compaction
-// can start from here, so the M1 (connection) parent's teardown can cancel +
-// join the in-flight compaction and guarantee none outlives the session (see
-// formal-verification/session_lifecycle.fizz). This closes the legacy gap where
-// the fire-and-forget compaction goroutine could outlive the session.
-type Terminated struct{}
-
-func (Idle) isState()       {}
-func (Running) isState()    {}
-func (Terminated) isState() {}
-
-func (Idle) String() string       { return "Idle" }
-func (Running) String() string    { return "Running" }
-func (Terminated) String() string { return "Terminated" }
-
-// Event is the sealed sum type of inputs. Exhaustively:
-// Trigger | Finished | Shutdown.
-type Event interface {
-	isEvent()
-	String() string
-}
-
-// Trigger requests a compaction (the live buffer grew past the trigger). It
-// starts one only when Idle; while Running it is a no-op (single-flight).
-type Trigger struct{}
-
-// Finished reports that the running compaction goroutine finished (success, error, or
-// timeout — it always reports Finished so the flag can never stick).
-type Finished struct{}
-
-// Shutdown terminates the coordinator at teardown: the in-flight compaction is
-// cancelled + joined by the sink, and no compaction can start afterwards.
-type Shutdown struct{}
-
-func (Trigger) isEvent()  {}
-func (Finished) isEvent() {}
-func (Shutdown) isEvent() {}
-
-func (Trigger) String() string  { return "Trigger" }
-func (Finished) String() string { return "Finished" }
-func (Shutdown) String() string { return "Shutdown" }
-
-// Effect is a side effect returned by Next as data. Exhaustively: StartCompaction.
-type Effect interface {
-	isEffect()
-	String() string
-}
-
-// StartCompaction: spawn the background summarize+evict goroutine.
-type StartCompaction struct{}
-
-func (StartCompaction) isEffect() {}
-
-func (StartCompaction) String() string { return "StartCompaction" }
-
-// Next is the total, pure transition function. For every (state, event) it
-// returns the next state and the ordered effects. It returns a non-nil error
-// only for an unknown State/Event implementation. Every in-domain pair is
-// defined; there are no forbidden transitions, only no-ops.
-//
-// Single-flight crux: StartCompaction is emitted only on Idle+Trigger, and a
-// Trigger while Running is a no-op — so at most one compaction ever runs.
-func Next(s State, e Event) (State, []Effect, error) {
-	switch s.(type) {
-	case Idle:
-		switch e.(type) {
-		case Trigger:
-			return Running{}, []Effect{StartCompaction{}}, nil
-		case Finished:
-			// No compaction to finish: stale/idempotent no-op.
-			return Idle{}, nil, nil
-		case Shutdown:
-			return Terminated{}, nil, nil
-		}
-	case Running:
-		switch e.(type) {
-		case Trigger:
-			// Already compacting: drop (single-flight).
-			return Running{}, nil, nil
-		case Finished:
-			return Idle{}, nil, nil
-		case Shutdown:
-			// Teardown while compacting: the sink cancels + joins the goroutine,
-			// so its later Finished is absorbed here in Terminated.
-			return Terminated{}, nil, nil
-		}
-	case Terminated:
-		// Absorbing: a Trigger after teardown is rejected (no StartCompaction), so
-		// no compaction outlives the session.
-		switch e.(type) {
-		case Trigger, Finished, Shutdown:
-			return Terminated{}, nil, nil
-		}
-	}
-	return s, nil, fmt.Errorf("compactcoord: unhandled transition %s <- %s", s, e)
-}
-
-// EffectSink performs the effects produced by a transition. See coordinator.Sink:
-// StartCompaction spawns a goroutine, so Perform does not block under the lock.
-type EffectSink = coordinator.Sink[Effect]
-
-// Coordinator serializes the compaction transitions. See coordinator.Coordinator.
-type Coordinator = coordinator.Coordinator[State, Event, Effect]
-
-// New returns an idle Coordinator that performs effects via sink.
-func New(sink EffectSink) *Coordinator {
-	return coordinator.New[State, Event, Effect](Idle{}, Next, sink)
-}
--- a/core/http/endpoints/openai/compactcoord/compactcoord_suite_test.go
+++ b/core/http/endpoints/openai/compactcoord/compactcoord_suite_test.go
@@ -1,13 +0,0 @@
-package compactcoord
-
-import (
-	"testing"
-
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-func TestCompactcoord(t *testing.T) {
-	RegisterFailHandler(Fail)
-	RunSpecs(t, "compactcoord (realtime M4) Suite")
-}
--- a/core/http/endpoints/openai/compactcoord/compactcoord_test.go
+++ b/core/http/endpoints/openai/compactcoord/compactcoord_test.go
@@ -1,202 +0,0 @@
-package compactcoord
-
-import (
-	"math/rand/v2"
-	"sync"
-	"sync/atomic"
-
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-// recordingSink captures the ordered stream of effects. Perform is called under
-// the coordinator lock; the mutex here guards reads from the spec goroutine.
-type recordingSink struct {
-	mu  sync.Mutex
-	log []Effect
-}
-
-func (s *recordingSink) Perform(e Effect) {
-	s.mu.Lock()
-	s.log = append(s.log, e)
-	s.mu.Unlock()
-}
-
-func (s *recordingSink) count() int {
-	s.mu.Lock()
-	defer s.mu.Unlock()
-	return len(s.log)
-}
-
-type unknownEvent struct{}
-
-func (unknownEvent) isEvent()       {}
-func (unknownEvent) String() string { return "unknownEvent" }
-
-type unknownState struct{}
-
-func (unknownState) isState()       {}
-func (unknownState) String() string { return "unknownState" }
-
-var _ = Describe("compactcoord.Next", func() {
-	DescribeTable("transitions",
-		func(state State, event Event, wantState State, wantEff []Effect) {
-			gotState, gotEff, err := Next(state, event)
-			Expect(err).NotTo(HaveOccurred())
-			Expect(gotState).To(Equal(wantState))
-			Expect(gotEff).To(Equal(wantEff))
-		},
-		Entry("idle+trigger -> running: start",
-			Idle{}, Trigger{}, Running{}, []Effect{StartCompaction{}}),
-		Entry("idle+finished -> idle, no-op (stale)",
-			Idle{}, Finished{}, Idle{}, []Effect(nil)),
-		Entry("running+trigger -> running, no-op (single-flight)",
-			Running{}, Trigger{}, Running{}, []Effect(nil)),
-		Entry("running+finished -> idle",
-			Running{}, Finished{}, Idle{}, []Effect(nil)),
-		Entry("idle+shutdown -> terminated",
-			Idle{}, Shutdown{}, Terminated{}, []Effect(nil)),
-		Entry("running+shutdown -> terminated",
-			Running{}, Shutdown{}, Terminated{}, []Effect(nil)),
-		Entry("terminated+trigger -> terminated, REJECTED",
-			Terminated{}, Trigger{}, Terminated{}, []Effect(nil)),
-		Entry("terminated+finished -> terminated, no-op (stale)",
-			Terminated{}, Finished{}, Terminated{}, []Effect(nil)),
-		Entry("terminated+shutdown -> terminated, idempotent",
-			Terminated{}, Shutdown{}, Terminated{}, []Effect(nil)),
-	)
-
-	It("is total over the defined (state, event) pairs", func() {
-		for _, s := range []State{Idle{}, Running{}, Terminated{}} {
-			for _, e := range []Event{Trigger{}, Finished{}, Shutdown{}} {
-				_, _, err := Next(s, e)
-				Expect(err).NotTo(HaveOccurred(), "Next(%s, %s)", s, e)
-			}
-		}
-	})
-
-	It("errors on an unknown event type", func() {
-		_, _, err := Next(Idle{}, unknownEvent{})
-		Expect(err).To(HaveOccurred())
-	})
-
-	It("errors on an unknown state type", func() {
-		_, _, err := Next(unknownState{}, Trigger{})
-		Expect(err).To(HaveOccurred())
-	})
-})
-
-var _ = Describe("compactcoord.Coordinator", func() {
-	// A StartCompaction is only ever produced while Idle (verified by checking the
-	// effect count grows exactly when the model transitions Idle->Running), so at
-	// most one compaction is ever in flight.
-	It("starts at most one compaction at a time over random sequences", func() {
-		seeds := []uint64{1, 2, 3, 42, 1337, 0xC0FFEE}
-		for _, seed := range seeds {
-			r := rand.New(rand.NewPCG(seed, 0xA5A5A5A5))
-			sink := &recordingSink{}
-			c := New(sink)
-			running := false
-			starts := 0
-
-			for range 5000 {
-				if r.IntN(2) == 0 {
-					before := sink.count()
-					Expect(c.Apply(Trigger{})).To(Succeed())
-					if sink.count() > before {
-						// A StartCompaction was produced: must have been Idle.
-						Expect(running).To(BeFalse(), "seed=%d: started while already running", seed)
-						running = true
-						starts++
-					}
-				} else {
-					Expect(c.Apply(Finished{})).To(Succeed())
-					running = false
-				}
-				if running {
-					Expect(c.State()).To(Equal(State(Running{})), "seed=%d", seed)
-				} else {
-					Expect(c.State()).To(Equal(State(Idle{})), "seed=%d", seed)
-				}
-			}
-			Expect(starts).To(BeNumerically(">", 0), "seed=%d: walk should have started at least one", seed)
-		}
-	})
-
-	// Faithful concurrent test: StartCompaction spawns "work" that bumps an active
-	// counter, runs, and reports Finished back to the coordinator (exactly how the
-	// real sink behaves). Single-flight must hold even under many concurrent
-	// Triggers: the active counter never exceeds 1. Run under -race.
-	It("never runs two compactions concurrently", func() {
-		var active, maxActive int32
-		var c *Coordinator
-		var work sync.WaitGroup
-		sink := &spawnSink{onStart: func() {
-			work.Add(1)
-			go func() {
-				defer work.Done()
-				n := atomic.AddInt32(&active, 1)
-				for {
-					m := atomic.LoadInt32(&maxActive)
-					if n <= m || atomic.CompareAndSwapInt32(&maxActive, m, n) {
-						break
-					}
-				}
-				atomic.AddInt32(&active, -1)
-				_ = c.Apply(Finished{})
-			}()
-		}}
-		c = New(sink)
-
-		var wg sync.WaitGroup
-		for g := 0; g < 8; g++ {
-			wg.Add(1)
-			go func() {
-				defer wg.Done()
-				for range 1000 {
-					_ = c.Apply(Trigger{})
-				}
-			}()
-		}
-		wg.Wait()
-		work.Wait() // let any in-flight compaction report Finished
-
-		Expect(atomic.LoadInt32(&maxActive)).To(BeNumerically("<=", 1))
-		Expect(c.State()).To(Equal(State(Idle{})))
-	})
-
-	It("terminates on shutdown and rejects later triggers", func() {
-		sink := &recordingSink{}
-		c := New(sink)
-		Expect(c.Apply(Trigger{})).To(Succeed()) // Idle -> Running (StartCompaction)
-		Expect(c.Apply(Shutdown{})).To(Succeed())
-		Expect(c.State()).To(Equal(State(Terminated{})))
-
-		before := sink.count()
-		Expect(c.Apply(Trigger{})).To(Succeed()) // rejected
-		Expect(sink.count()).To(Equal(before), "no StartCompaction after shutdown")
-		Expect(c.Apply(Finished{})).To(Succeed()) // stale, absorbed
-		Expect(c.State()).To(Equal(State(Terminated{})))
-	})
-})
-
-// spawnSink invokes onStart for each StartCompaction (called under the coord lock;
-// onStart must be non-blocking — it spawns the work goroutine).
-type spawnSink struct{ onStart func() }
-
-func (s *spawnSink) Perform(e Effect) {
-	if _, ok := e.(StartCompaction); ok {
-		s.onStart()
-	}
-}
-
-var _ = DescribeTable("compactcoord stringers",
-	func(got, want string) { Expect(got).To(Equal(want)) },
-	Entry(nil, Idle{}.String(), "Idle"),
-	Entry(nil, Running{}.String(), "Running"),
-	Entry(nil, Terminated{}.String(), "Terminated"),
-	Entry(nil, Trigger{}.String(), "Trigger"),
-	Entry(nil, Finished{}.String(), "Finished"),
-	Entry(nil, Shutdown{}.String(), "Shutdown"),
-	Entry(nil, StartCompaction{}.String(), "StartCompaction"),
-)
--- a/core/http/endpoints/openai/conncoord/conncoord.go
+++ b/core/http/endpoints/openai/conncoord/conncoord.go
@@ -1,164 +0,0 @@
-// Package conncoord is the explicit state machine for the realtime API's
-// connection lifecycle (machine "M1" in docs/design/realtime-state-machines.md).
-//
-// In the legacy code this machine is implicit and fragile. The session handler
-// keeps a `vadServerStarted` bool plus a `done` channel that is REASSIGNED to a
-// fresh channel every time turn detection is toggled on (session.update) and
-// closed both at toggle-off and at teardown (Part 2, failure mode 6). It is
-// correct today only because one goroutine owns it; "one variable name meaning
-// different channels over time, closed from two sites guarded by a bool" is a
-// structural hazard, not an explicit lifecycle. Teardown likewise depends on the
-// bool to avoid closing an already-closed channel.
-//
-// This package makes the lifecycle explicit:
-//   - a sealed sum type for State (Live{VADRunning} | Torn) — illegal states
-//     such as "running after teardown" are unrepresentable,
-//   - a total, pure transition function Next(state, event) -> (state, effects),
-//   - a single-writer Coordinator that serializes every transition.
-//
-// The guarantees the spec checks:
-//   - the VAD goroutine's done channel is closed exactly once per start (StopVAD
-//     is emitted only while running, so never a double close / close of nil),
-//   - teardown runs exactly once (Close from Live; any later Close is a no-op),
-//   - nothing is started after teardown (no resurrection / no send-after-close).
-//
-// Like turncoord (M2), the connection machine is driven by the single session
-// goroutine; the Coordinator's lock keeps State() race-free and guards against a
-// future second writer. The effects are performed by a sink that owns the actual
-// channels/goroutines (see realtime_conncoord.go).
-package conncoord
-
-import (
-	"fmt"
-
-	"github.com/mudler/LocalAI/core/http/endpoints/openai/coordinator"
-)
-
-// State is the sealed sum type of connection states. The only implementations
-// are the marker-method structs in this file. Exhaustively: Live | Torn.
-type State interface {
-	isState()
-	String() string
-}
-
-// Live: the session is active. VADRunning records whether the turn-detection
-// (handleVAD) goroutine is currently running — the single source of truth that
-// replaces the legacy vadServerStarted bool, so the per-run done channel is
-// closed exactly once.
-type Live struct{ VADRunning bool }
-
-// Torn: the session has been torn down. Terminal — no effect is ever produced
-// from here again.
-type Torn struct{}
-
-func (Live) isState() {}
-func (Torn) isState() {}
-
-func (s Live) String() string { return fmt.Sprintf("Live(vad=%t)", s.VADRunning) }
-func (Torn) String() string   { return "Torn" }
-
-// Event is the sealed sum type of inputs. Exhaustively: SetVAD | Close.
-type Event interface {
-	isEvent()
-	String() string
-}
-
-// SetVAD requests the turn-detection goroutine be running (Active) or not. It is
-// raised whenever session.update changes whether turn detection is active. It is
-// idempotent: setting the state it is already in is a no-op.
-type SetVAD struct{ Active bool }
-
-// Close requests teardown (the transport read loop ended, or the session is
-// closing). It is idempotent — only the first Close from Live tears down.
-type Close struct{}
-
-func (SetVAD) isEvent() {}
-func (Close) isEvent()  {}
-
-func (e SetVAD) String() string { return fmt.Sprintf("SetVAD(%t)", e.Active) }
-func (Close) String() string    { return "Close" }
-
-// Effect is a side effect returned by Next as data for the caller to perform.
-// Exhaustively: StartVAD | StopVAD | Teardown.
-type Effect interface {
-	isEffect()
-	String() string
-}
-
-// StartVAD: create a fresh done channel and spawn the handleVAD goroutine on it.
-type StartVAD struct{}
-
-// StopVAD: close the running VAD goroutine's done channel (signal it to exit).
-type StopVAD struct{}
-
-// Teardown: the once-only teardown — stop the remaining input goroutines (opus
-// decode, sound window), join them, cancel in-flight responses, and remove the
-// session from the registry. Emitted exactly once.
-type Teardown struct{}
-
-func (StartVAD) isEffect() {}
-func (StopVAD) isEffect()  {}
-func (Teardown) isEffect() {}
-
-func (StartVAD) String() string { return "StartVAD" }
-func (StopVAD) String() string  { return "StopVAD" }
-func (Teardown) String() string { return "Teardown" }
-
-// Next is the total, pure transition function. For every (state, event) it
-// returns the next state and the ordered effects to perform. It returns a
-// non-nil error only for an unknown State/Event implementation. Every in-domain
-// pair is defined; there are no forbidden transitions, only no-ops.
-//
-// The crux: Close moves to Torn, which absorbs every later event with no
-// effects. So teardown's channel closes happen exactly once even if Close is
-// raised again (e.g. an error path and the normal return both reaching it), and
-// no StartVAD can resurrect a torn session.
-func Next(s State, e Event) (State, []Effect, error) {
-	switch st := s.(type) {
-	case Live:
-		switch ev := e.(type) {
-		case SetVAD:
-			switch {
-			case ev.Active && !st.VADRunning:
-				return Live{VADRunning: true}, []Effect{StartVAD{}}, nil
-			case !ev.Active && st.VADRunning:
-				return Live{VADRunning: false}, []Effect{StopVAD{}}, nil
-			default:
-				// Already in the requested state: idempotent no-op.
-				return Live{VADRunning: st.VADRunning}, nil, nil
-			}
-		case Close:
-			if st.VADRunning {
-				return Torn{}, []Effect{StopVAD{}, Teardown{}}, nil
-			}
-			return Torn{}, []Effect{Teardown{}}, nil
-		}
-	case Torn:
-		switch e.(type) {
-		case SetVAD:
-			// No resurrection: a toggle after teardown is ignored.
-			return Torn{}, nil, nil
-		case Close:
-			// Idempotent: teardown already ran.
-			return Torn{}, nil, nil
-		}
-	}
-	return s, nil, fmt.Errorf("conncoord: unhandled transition %s <- %s", s, e)
-}
-
-// EffectSink performs the effects produced by a transition. See coordinator.Sink:
-// Perform runs under the coordinator lock. The Teardown effect does join
-// goroutines (which can block) — acceptable here because the connection
-// coordinator is single-writer and torn down exactly once at the end of the
-// session goroutine, so no other Apply is contending the lock.
-type EffectSink = coordinator.Sink[Effect]
-
-// Coordinator serializes the connection-lifecycle transitions.
-// See coordinator.Coordinator.
-type Coordinator = coordinator.Coordinator[State, Event, Effect]
-
-// New returns a Coordinator in Live{VADRunning:false} that performs effects via
-// sink.
-func New(sink EffectSink) *Coordinator {
-	return coordinator.New[State, Event, Effect](Live{VADRunning: false}, Next, sink)
-}
--- a/core/http/endpoints/openai/conncoord/conncoord_suite_test.go
+++ b/core/http/endpoints/openai/conncoord/conncoord_suite_test.go
@@ -1,13 +0,0 @@
-package conncoord
-
-import (
-	"testing"
-
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-func TestConncoord(t *testing.T) {
-	RegisterFailHandler(Fail)
-	RunSpecs(t, "conncoord (realtime M1) Suite")
-}
--- a/core/http/endpoints/openai/conncoord/conncoord_test.go
+++ b/core/http/endpoints/openai/conncoord/conncoord_test.go
@@ -1,212 +0,0 @@
-package conncoord
-
-import (
-	"math/rand/v2"
-	"sync"
-
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-// recordingSink captures the ordered stream of effects so the invariants can be
-// checked independently of the transition function. Perform is called by
-// Coordinator.Apply under the coordinator lock; the mutex here only guards reads
-// from the spec goroutine.
-type recordingSink struct {
-	mu  sync.Mutex
-	log []Effect
-}
-
-func (s *recordingSink) Perform(e Effect) {
-	s.mu.Lock()
-	s.log = append(s.log, e)
-	s.mu.Unlock()
-}
-
-func (s *recordingSink) snapshot() []Effect {
-	s.mu.Lock()
-	defer s.mu.Unlock()
-	out := make([]Effect, len(s.log))
-	copy(out, s.log)
-	return out
-}
-
-// checkLog replays the effect log and asserts the lifecycle safety properties
-// from docs/design/realtime-state-machines.md, Part 4 (invariants #8, #10 and
-// failure mode 6):
-//
-//	(1) the VAD done channel is closed exactly once per start -- StartVAD only
-//	    while stopped, StopVAD only while running (no double close / close-of-nil);
-//	(2) teardown runs at most once;
-//	(3) no resurrection -- no StartVAD after Teardown.
-func checkLog(log []Effect) {
-	running := false
-	torn := false
-	teardowns := 0
-	for i, eff := range log {
-		switch eff.(type) {
-		case StartVAD:
-			Expect(torn).To(BeFalse(), "invariant (3): StartVAD after teardown (effect #%d)\nlog=%v", i, log)
-			Expect(running).To(BeFalse(), "invariant (1): StartVAD while already running (effect #%d)\nlog=%v", i, log)
-			running = true
-		case StopVAD:
-			Expect(running).To(BeTrue(), "invariant (1): StopVAD while not running (effect #%d)\nlog=%v", i, log)
-			running = false
-		case Teardown:
-			Expect(torn).To(BeFalse(), "invariant (2): Teardown twice (effect #%d)\nlog=%v", i, log)
-			torn = true
-			teardowns++
-		}
-	}
-	Expect(teardowns).To(BeNumerically("<=", 1), "invariant (2): teardown ran %d times\nlog=%v", teardowns, log)
-}
-
-type unknownEvent struct{}
-
-func (unknownEvent) isEvent()       {}
-func (unknownEvent) String() string { return "unknownEvent" }
-
-type unknownState struct{}
-
-func (unknownState) isState()       {}
-func (unknownState) String() string { return "unknownState" }
-
-var _ = Describe("conncoord.Next", func() {
-	DescribeTable("transitions",
-		func(state State, event Event, wantState State, wantEff []Effect) {
-			gotState, gotEff, err := Next(state, event)
-			Expect(err).NotTo(HaveOccurred())
-			Expect(gotState).To(Equal(wantState))
-			Expect(gotEff).To(Equal(wantEff))
-		},
-		Entry("stopped+setvad(on) -> running: start",
-			Live{VADRunning: false}, SetVAD{Active: true},
-			Live{VADRunning: true}, []Effect{StartVAD{}}),
-		Entry("running+setvad(on) -> running, no-op",
-			Live{VADRunning: true}, SetVAD{Active: true},
-			Live{VADRunning: true}, []Effect(nil)),
-		Entry("stopped+setvad(off) -> stopped, no-op",
-			Live{VADRunning: false}, SetVAD{Active: false},
-			Live{VADRunning: false}, []Effect(nil)),
-		Entry("running+setvad(off) -> stopped: stop",
-			Live{VADRunning: true}, SetVAD{Active: false},
-			Live{VADRunning: false}, []Effect{StopVAD{}}),
-		Entry("stopped+close -> torn: teardown",
-			Live{VADRunning: false}, Close{},
-			Torn{}, []Effect{Teardown{}}),
-		Entry("running+close -> torn: stop + teardown",
-			Live{VADRunning: true}, Close{},
-			Torn{}, []Effect{StopVAD{}, Teardown{}}),
-		Entry("torn+setvad(on) -> torn, no-op (no resurrection)",
-			Torn{}, SetVAD{Active: true},
-			Torn{}, []Effect(nil)),
-		Entry("torn+close -> torn, no-op (idempotent)",
-			Torn{}, Close{},
-			Torn{}, []Effect(nil)),
-	)
-
-	It("is total over the defined (state, event) pairs", func() {
-		states := []State{Live{VADRunning: false}, Live{VADRunning: true}, Torn{}}
-		events := []Event{SetVAD{Active: true}, SetVAD{Active: false}, Close{}}
-		for _, s := range states {
-			for _, e := range events {
-				_, _, err := Next(s, e)
-				Expect(err).NotTo(HaveOccurred(), "Next(%s, %s)", s, e)
-			}
-		}
-	})
-
-	It("errors on an unknown event type", func() {
-		_, _, err := Next(Live{}, unknownEvent{})
-		Expect(err).To(HaveOccurred())
-	})
-
-	It("errors on an unknown state type", func() {
-		_, _, err := Next(unknownState{}, Close{})
-		Expect(err).To(HaveOccurred())
-	})
-})
-
-var _ = Describe("conncoord.Coordinator", func() {
-	It("upholds the lifecycle invariants over random event sequences", func() {
-		seeds := []uint64{1, 2, 3, 42, 1337, 0xC0FFEE}
-		for _, seed := range seeds {
-			r := rand.New(rand.NewPCG(seed, 0xA5A5A5A5))
-			sink := &recordingSink{}
-			c := New(sink)
-			running := false
-			torn := false
-
-			for range 5000 {
-				switch r.IntN(3) {
-				case 0:
-					Expect(c.Apply(SetVAD{Active: true})).To(Succeed())
-					if !torn {
-						running = true
-					}
-				case 1:
-					Expect(c.Apply(SetVAD{Active: false})).To(Succeed())
-					if !torn {
-						running = false
-					}
-				case 2:
-					Expect(c.Apply(Close{})).To(Succeed())
-					torn = true
-					running = false
-				}
-				if torn {
-					Expect(c.State()).To(Equal(State(Torn{})), "seed=%d", seed)
-				} else {
-					Expect(c.State()).To(Equal(State(Live{VADRunning: running})), "seed=%d", seed)
-				}
-			}
-			checkLog(sink.snapshot())
-		}
-	})
-
-	It("tears down at most once under concurrent SetVAD/Close from two goroutines", func() {
-		const perGoroutine = 2000
-		sink := &recordingSink{}
-		c := New(sink)
-
-		var wg sync.WaitGroup
-		drive := func(active bool) {
-			defer wg.Done()
-			for i := range perGoroutine {
-				switch i % 3 {
-				case 0:
-					_ = c.Apply(SetVAD{Active: active})
-				case 1:
-					_ = c.Apply(SetVAD{Active: !active})
-				case 2:
-					if i > perGoroutine/2 {
-						_ = c.Apply(Close{})
-					}
-				}
-			}
-		}
-
-		wg.Add(2)
-		go drive(true)
-		go drive(false)
-		wg.Wait()
-		_ = c.Apply(Close{})
-
-		checkLog(sink.snapshot())
-		Expect(c.State()).To(Equal(State(Torn{})))
-	})
-})
-
-var _ = DescribeTable("conncoord stringers",
-	func(got, want string) { Expect(got).To(Equal(want)) },
-	Entry(nil, Live{VADRunning: true}.String(), "Live(vad=true)"),
-	Entry(nil, Live{VADRunning: false}.String(), "Live(vad=false)"),
-	Entry(nil, Torn{}.String(), "Torn"),
-
-	Entry(nil, SetVAD{Active: true}.String(), "SetVAD(true)"),
-	Entry(nil, Close{}.String(), "Close"),
-
-	Entry(nil, StartVAD{}.String(), "StartVAD"),
-	Entry(nil, StopVAD{}.String(), "StopVAD"),
-	Entry(nil, Teardown{}.String(), "Teardown"),
-)
--- a/core/http/endpoints/openai/constants.go
+++ b/core/http/endpoints/openai/constants.go
@@ -5,7 +5,4 @@ const (
 	FinishReasonStop         = "stop"
 	FinishReasonToolCalls    = "tool_calls"
 	FinishReasonFunctionCall = "function_call"
-	// FinishReasonLength is reported when generation stopped because it
-	// reached the max_tokens budget rather than a natural stop (issue #9716).
-	FinishReasonLength = "length"
 )
--- a/core/http/endpoints/openai/coordinator/coordinator.go
+++ b/core/http/endpoints/openai/coordinator/coordinator.go
@@ -1,82 +0,0 @@
-// Package coordinator is the shared single-writer state-machine runtime for the
-// realtime API's explicit coordinators (machines M1–M5 in
-// docs/design/realtime-state-machines.md).
-//
-// Each machine package (respcoord, turncoord, conncoord, compactcoord, ttscoord)
-// defines its OWN sealed sum types for State/Event/Effect and a total, pure
-// transition function Next(state, event) -> (state, []effect, error). The
-// plumbing around that — a single-writer Coordinator that serializes every
-// transition behind one lock and performs the returned effects in order — is
-// identical across all five, so it lives here once instead of being copied.
-//
-// A machine package wires itself up with three lines:
-//
-//	type EffectSink = coordinator.Sink[Effect]
-//	type Coordinator = coordinator.Coordinator[State, Event, Effect]
-//	func New(sink EffectSink) *Coordinator { return coordinator.New[State, Event, Effect](Idle{}, Next, sink) }
-//
-// The aliases keep each package's public API (Coordinator, New, EffectSink,
-// Apply, State) unchanged. The single-writer serialization — the load-bearing
-// concurrency guarantee the FizzBee specs check — is therefore implemented and
-// reasoned about in exactly one place.
-package coordinator
-
-import "sync"
-
-// TransitionFunc is a machine's total, pure transition: given the current state
-// and an event it returns the next state, the ordered effects to perform, and a
-// non-nil error ONLY for an unhandled (programmer-error) state/event pair. It
-// must not perform I/O or block; side effects are returned as data (F) for the
-// Coordinator to hand to the Sink.
-type TransitionFunc[S, E, F any] func(state S, event E) (S, []F, error)
-
-// Sink performs the effects a transition produces. Implementations MUST be
-// non-blocking: Perform is called while the Coordinator holds its lock, so it
-// must not block (it should spawn a goroutine, call a cancel func, or do a
-// non-blocking channel send) and MUST NOT call back into the same Coordinator's
-// Apply.
-type Sink[F any] interface {
-	Perform(F)
-}
-
-// Coordinator is the single-writer wrapper around a pure transition function.
-// Every Apply is serialized by mu, so multiple goroutines can drive the machine
-// without racing, and a transition's effects are performed in order under the
-// lock (before any subsequent Apply can observe the new state).
-type Coordinator[S, E, F any] struct {
-	mu    sync.Mutex
-	state S
-	next  TransitionFunc[S, E, F]
-	sink  Sink[F]
-}
-
-// New returns a Coordinator in the given initial state that transitions via next
-// and performs effects via sink.
-func New[S, E, F any](initial S, next TransitionFunc[S, E, F], sink Sink[F]) *Coordinator[S, E, F] {
-	return &Coordinator[S, E, F]{state: initial, next: next, sink: sink}
-}
-
-// Apply runs one transition under the lock and performs its effects in order. If
-// the transition function returns an error (an unhandled state/event), the state
-// is left unchanged and the error is returned to the caller — never silently
-// swallowed.
-func (c *Coordinator[S, E, F]) Apply(e E) error {
-	c.mu.Lock()
-	defer c.mu.Unlock()
-	ns, effects, err := c.next(c.state, e)
-	if err != nil {
-		return err
-	}
-	c.state = ns
-	for _, eff := range effects {
-		c.sink.Perform(eff)
-	}
-	return nil
-}
-
-// State returns the current state (a value; safe to call concurrently).
-func (c *Coordinator[S, E, F]) State() S {
-	c.mu.Lock()
-	defer c.mu.Unlock()
-	return c.state
-}
--- a/core/http/endpoints/openai/coordinator/coordinator_suite_test.go
+++ b/core/http/endpoints/openai/coordinator/coordinator_suite_test.go
@@ -1,13 +0,0 @@
-package coordinator
-
-import (
-	"testing"
-
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-func TestCoordinator(t *testing.T) {
-	RegisterFailHandler(Fail)
-	RunSpecs(t, "coordinator (shared runtime) Suite")
-}
--- a/core/http/endpoints/openai/coordinator/coordinator_test.go
+++ b/core/http/endpoints/openai/coordinator/coordinator_test.go
@@ -1,124 +0,0 @@
-package coordinator
-
-import (
-	"errors"
-	"fmt"
-	"sync"
-
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-// A tiny toy machine exercises the generic runtime directly (the five real
-// machines exercise it via their aliases, but the gate measures this package's
-// own coverage). off <-toggle-> on; burst emits three ordered effects; boom is
-// the unhandled/error path.
-type tstate int
-
-const (
-	off tstate = iota
-	on
-)
-
-type tevent int
-
-const (
-	toggle tevent = iota
-	burst
-	boom
-)
-
-type teffect string
-
-func tnext(s tstate, e tevent) (tstate, []teffect, error) {
-	switch e {
-	case toggle:
-		if s == off {
-			return on, []teffect{"on"}, nil
-		}
-		return off, []teffect{"off"}, nil
-	case burst:
-		return s, []teffect{"a", "b", "c"}, nil
-	case boom:
-		return s, nil, errors.New("boom: unhandled")
-	}
-	return s, nil, fmt.Errorf("unknown event %d", int(e))
-}
-
-type recordingSink struct {
-	mu  sync.Mutex
-	log []teffect
-}
-
-func (s *recordingSink) Perform(e teffect) {
-	s.mu.Lock()
-	s.log = append(s.log, e)
-	s.mu.Unlock()
-}
-
-func (s *recordingSink) snapshot() []teffect {
-	s.mu.Lock()
-	defer s.mu.Unlock()
-	out := make([]teffect, len(s.log))
-	copy(out, s.log)
-	return out
-}
-
-var _ = Describe("coordinator.Coordinator", func() {
-	It("starts in the initial state", func() {
-		c := New[tstate, tevent, teffect](off, tnext, &recordingSink{})
-		Expect(c.State()).To(Equal(off))
-	})
-
-	It("advances state and performs the transition's effects", func() {
-		sink := &recordingSink{}
-		c := New[tstate, tevent, teffect](off, tnext, sink)
-
-		Expect(c.Apply(toggle)).To(Succeed())
-		Expect(c.State()).To(Equal(on))
-		Expect(c.Apply(toggle)).To(Succeed())
-		Expect(c.State()).To(Equal(off))
-
-		Expect(sink.snapshot()).To(Equal([]teffect{"on", "off"}))
-	})
-
-	It("performs multiple effects in order", func() {
-		sink := &recordingSink{}
-		c := New[tstate, tevent, teffect](off, tnext, sink)
-		Expect(c.Apply(burst)).To(Succeed())
-		Expect(sink.snapshot()).To(Equal([]teffect{"a", "b", "c"}))
-	})
-
-	It("returns the transition error and leaves state unchanged", func() {
-		sink := &recordingSink{}
-		c := New[tstate, tevent, teffect](on, tnext, sink)
-		err := c.Apply(boom)
-		Expect(err).To(HaveOccurred())
-		Expect(c.State()).To(Equal(on), "state unchanged on error")
-		Expect(sink.snapshot()).To(BeEmpty(), "no effects performed on error")
-	})
-
-	It("serializes concurrent Apply from many goroutines (run with -race)", func() {
-		const goroutines = 8
-		const each = 1000
-		sink := &recordingSink{}
-		c := New[tstate, tevent, teffect](off, tnext, sink)
-
-		var wg sync.WaitGroup
-		wg.Add(goroutines)
-		for range goroutines {
-			go func() {
-				defer wg.Done()
-				for range each {
-					_ = c.Apply(toggle)
-				}
-			}()
-		}
-		wg.Wait()
-
-		// goroutines*each toggles from off; an even total returns to off. The
-		// point is race-freedom + a consistent final state, not the value itself.
-		Expect(c.State()).To(Equal(off))
-		Expect(sink.snapshot()).To(HaveLen(goroutines * each))
-	})
-})
--- a/core/http/endpoints/openai/inference.go
+++ b/core/http/endpoints/openai/inference.go
@@ -13,14 +13,6 @@ import (
 	"github.com/mudler/xlog"
 )

-// reachedTokenBudget reports whether generation stopped because it reached the
-// configured max_tokens ceiling. A maxTokens of nil or <= 0 means "no limit".
-// Used to suppress regeneration retries (which would just hit the same ceiling
-// again) and to report finish_reason "length" instead of "stop" (issue #9716).
-func reachedTokenBudget(completion int, maxTokens *int) bool {
-	return maxTokens != nil && *maxTokens > 0 && completion >= *maxTokens
-}
-
 func ComputeChoices(
 	req *schema.OpenAIRequest,
 	predInput string,
@@ -121,21 +113,11 @@ func ComputeChoices(
 			}
 			prediction = p

-			// budgetExhausted is true when the model stopped because it reached
-			// the configured max_tokens ceiling. None of the retry paths below
-			// should fire in that case: regenerating would just hit the same
-			// ceiling again and multiply token consumption (issue #9716). A
-			// thinking model that spends its whole budget on the reasoning block
-			// produces an empty content / reasoning-only response, which would
-			// otherwise look like a failed generation worth retrying. This is a
-			// "length" finish, not an empty one.
-			budgetExhausted := reachedTokenBudget(prediction.Usage.Completion, config.Maxtokens)
-
 			// Built-in: retry on truly empty response (no tokens at all).
 			// However, when the C++ autoparser is active, it clears the raw
 			// message and delivers content via ChatDeltas instead. Do NOT
 			// retry if ChatDeltas contain tool calls or content.
-			if strings.TrimSpace(prediction.Response) == "" && attempt < maxRetries && !budgetExhausted {
+			if strings.TrimSpace(prediction.Response) == "" && attempt < maxRetries {
 				hasChatDeltaData := false
 				for _, d := range prediction.ChatDeltas {
 					if d.Content != "" || len(d.ToolCalls) > 0 {
@@ -177,7 +159,7 @@ func ComputeChoices(
 					}
 				}
 			}
-			if shouldRetryFn != nil && !skipCallerRetry && !budgetExhausted && shouldRetryFn(attempt) && attempt < maxRetries {
+			if shouldRetryFn != nil && !skipCallerRetry && shouldRetryFn(attempt) && attempt < maxRetries {
 				// Caller has already reset its state inside shouldRetry
 				result = result[:0]
 				allChatDeltas = nil
--- a/core/http/endpoints/openai/inference_test.go
+++ b/core/http/endpoints/openai/inference_test.go
@@ -393,73 +393,6 @@ var _ = Describe("ComputeChoices", func() {
 		})
 	})

-	Context("reachedTokenBudget", func() {
-		ptr := func(i int) *int { return &i }
-		It("is false when no limit is configured", func() {
-			Expect(reachedTokenBudget(1000, nil)).To(BeFalse())
-			Expect(reachedTokenBudget(1000, ptr(0))).To(BeFalse())
-			Expect(reachedTokenBudget(1000, ptr(-1))).To(BeFalse())
-		})
-		It("is false when generation stopped below the limit", func() {
-			Expect(reachedTokenBudget(99, ptr(100))).To(BeFalse())
-		})
-		It("is true when generation reached or exceeded the limit", func() {
-			Expect(reachedTokenBudget(100, ptr(100))).To(BeTrue())
-			Expect(reachedTokenBudget(101, ptr(100))).To(BeTrue())
-		})
-	})
-
-	Context("max_tokens budget exhausted on reasoning (issue #9716)", func() {
-		// Reproduces the streaming retry loop: when a thinking model spends its
-		// entire max_tokens budget on the reasoning block, the C++ autoparser
-		// clears the raw Response and delivers reasoning-only ChatDeltas (no
-		// content, no tool calls). The built-in empty-response retry then fires
-		// and regenerates from scratch up to maxRetries times, each re-consuming
-		// the whole budget — instead of terminating with finish_reason "length".
-		It("should NOT retry when the token budget was exhausted", func() {
-			maxTokens := 100
-			cfg.Maxtokens = &maxTokens
-
-			calls := 0
-			backend.ModelInferenceFunc = func(
-				ctx context.Context, s string, messages schema.Messages,
-				images, videos, audios []string,
-				loader *model.ModelLoader, c *config.ModelConfig, cl *config.ModelConfigLoader,
-				o *config.ApplicationConfig,
-				tokenCallback func(string, backend.TokenUsage) bool,
-				tools, toolChoice string,
-				logprobs, topLogprobs *int,
-				logitBias map[string]float64,
-				metadata map[string]string,
-			) (func() (backend.LLMResponse, error), error) {
-				predFunc := func() (backend.LLMResponse, error) {
-					calls++
-					// Autoparser cleared Response; only reasoning was produced,
-					// and the completion count reached the max_tokens budget.
-					return backend.LLMResponse{
-						Response:   "",
-						ChatDeltas: []*pb.ChatDelta{{ReasoningContent: "thinking..."}},
-						Usage:      backend.TokenUsage{Prompt: 5, Completion: maxTokens},
-					}, nil
-				}
-				return predFunc, nil
-			}
-
-			_, usage, _, err := ComputeChoices(
-				makeReq(), "test", cfg, nil, appCfg, nil,
-				func(s string, c *[]schema.Choice) {
-					*c = append(*c, schema.Choice{Text: s})
-				},
-				nil,
-			)
-			Expect(err).ToNot(HaveOccurred())
-			// The model hit its token ceiling; regenerating would just hit it
-			// again and multiply token consumption. Exactly one call expected.
-			Expect(calls).To(Equal(1), "budget-exhausted generation must not be retried")
-			Expect(usage.Completion).To(Equal(maxTokens))
-		})
-	})
-
 	Context("with streaming token callback", func() {
 		It("should call tokenCallback for streaming responses", func() {
 			var streamedTokens []string
--- a/core/http/endpoints/openai/realtime.go
+++ b/core/http/endpoints/openai/realtime.go
@@ -12,6 +12,7 @@ import (
 	"os"
 	"strconv"
 	"sync"
+	"sync/atomic"
 	"time"

 	"net/http"
@@ -25,8 +26,6 @@ import (
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/http/auth"
 	mcpTools "github.com/mudler/LocalAI/core/http/endpoints/mcp"
-	"github.com/mudler/LocalAI/core/http/endpoints/openai/respcoord"
-	"github.com/mudler/LocalAI/core/http/endpoints/openai/turncoord"
 	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/core/templates"
@@ -169,12 +168,44 @@ type Session struct {
 	gateMu        sync.Mutex
 	voiceVerified bool

-	// respSink is the explicit response-coordination state machine (respcoord,
-	// machine M3). It replaces the legacy startResponse/cancelActiveResponse
-	// pair and its dual-writer activeResponse* fields: every start/cancel/finish
-	// decision is serialized through respcoord.Coordinator, guaranteeing at most
-	// one live response. See realtime_respcoord.go.
-	respSink *responseSink
+	// Response cancellation: protects activeResponseCancel/activeResponseDone
+	responseMu           sync.Mutex
+	activeResponseCancel context.CancelFunc
+	activeResponseDone   chan struct{}
+}
+
+// cancelActiveResponse cancels any in-flight response and waits for its
+// goroutine to exit. This ensures we never have overlapping responses and
+// that interrupted responses are fully cleaned up before starting a new one.
+func (s *Session) cancelActiveResponse() {
+	s.responseMu.Lock()
+	cancel := s.activeResponseCancel
+	done := s.activeResponseDone
+	s.responseMu.Unlock()
+
+	if cancel != nil {
+		cancel()
+	}
+	if done != nil {
+		<-done
+	}
+}
+
+// startResponse cancels any active response and returns a new context for
+// the replacement response. The caller MUST close the returned done channel
+// when the response goroutine exits.
+func (s *Session) startResponse(parent context.Context) (context.Context, chan struct{}) {
+	s.cancelActiveResponse()
+
+	ctx, cancel := context.WithCancel(parent)
+	done := make(chan struct{})
+
+	s.responseMu.Lock()
+	s.activeResponseCancel = cancel
+	s.activeResponseDone = done
+	s.responseMu.Unlock()
+
+	return ctx, done
 }

 func (s *Session) FromClient(session *types.SessionUnion) {
@@ -227,10 +258,8 @@ type Conversation struct {
 	// is kept out of Items (so trimRealtimeItems never drops it) and rendered
 	// as a system message right after the session instructions.
 	Memory string
-	// compaction is the explicit single-flight compaction coordinator (M4): at
-	// most one background summarize+evict runs per conversation at a time. It
-	// replaces the legacy `compacting atomic.Bool`. See realtime_compactcoord.go.
-	compaction *compactionSink
+	// compacting ensures at most one background compaction runs per conversation.
+	compacting atomic.Bool
 }

 func (c *Conversation) ToServer() types.Conversation {
@@ -259,12 +288,6 @@ type Model interface {
 	// sound-event tags. topK caps the number of returned tags (0 = backend
 	// default), threshold drops tags below the given score (0 = keep all).
 	SoundDetection(ctx context.Context, audio string, topK int, threshold float32) (*schema.SoundClassificationResult, error)
-	// TranscribeLive opens a live (bidirectional) transcription session on the
-	// pipeline's transcription backend, used by semantic_vad turn detection;
-	// onEvent fires from a background goroutine for every delta/EOU/final
-	// event. Backends without live support fail with an error satisfying
-	// grpcerrors.IsLiveTranscriptionUnsupported.
-	TranscribeLive(ctx context.Context, language string, onEvent func(backend.LiveTranscriptionEvent)) (backend.LiveTranscriptionSession, error)
 	PredictConfig() *config.ModelConfig
 }

@@ -490,10 +513,14 @@ func runRealtimeSession(application *application.Application, t Transport, model
 	// input_audio_buffer.commit. There is no transcription stage in that case.
 	soundOnly := cfg.Pipeline.SoundDetection != "" && cfg.Pipeline.Transcription == "" && cfg.Pipeline.LLM == ""

-	// defaultTurnDetection seeds server_vad by default, or semantic_vad when the
-	// pipeline opts in (turn_detection.type: semantic_vad); clients can still
-	// override per session via session.update.
-	turnDetection := defaultTurnDetection(cfg)
+	turnDetection := &types.TurnDetectionUnion{
+		ServerVad: &types.ServerVad{
+			Threshold:         0.5,
+			PrefixPaddingMs:   300,
+			SilenceDurationMs: 500,
+			CreateResponse:    true,
+		},
+	}
 	inputAudioTranscription := &types.AudioTranscription{Model: sttModel}
 	if soundOnly {
 		turnDetection = nil           // turn_detection none: no VAD
@@ -534,27 +561,12 @@ func runRealtimeSession(application *application.Application, t Transport, model
 	}
 	session.CompactionEnabled, session.CompactionTrigger, session.MaxSummaryTokens, session.SummaryModel = resolveCompaction(cfg, session.MaxHistoryItems)

-	// Single-writer response coordinator (machine M3). All response starts and
-	// cancels go through this, so the read-loop and VAD goroutine can never race
-	// into two overlapping responses (see realtime_respcoord.go).
-	session.respSink = newResponseSink()
-
 	// Create a default conversation
 	conversationID := generateConversationID()
 	conversation := &Conversation{
 		ID:    conversationID,
 		Items: []*types.MessageItemUnion{},
 	}
-	// The compaction coordinator's work closure resolves the summarizer (lazily
-	// loading a configured summary_model) and runs the summarize+evict off the
-	// response path — only when a compaction actually starts.
-	conversation.compaction = newCompactionSink(func(ctx context.Context) {
-		model := session.summarizerModel()
-		if model == nil {
-			return
-		}
-		session.compact(ctx, conversation, model)
-	})
 	session.Conversations[conversationID] = conversation
 	session.DefaultConversationID = conversationID

@@ -636,22 +648,34 @@ func runRealtimeSession(application *application.Application, t Transport, model
 	})

 	var (
-		msg []byte
-		wg  sync.WaitGroup
+		msg  []byte
+		wg   sync.WaitGroup
+		done = make(chan struct{})
 	)

-	// M1 connection lifecycle. The VAD goroutine's run/stop (and its done channel)
-	// and the once-only teardown are owned by this coordinator, so the channel is
-	// closed exactly once and never resurrected after teardown (Part 2, failure
-	// mode 6; invariants #8, #10). See realtime_conncoord.go and conncoord/.
-	conn := newConnSink(session, sessionID, t, &wg)
-	toggleVAD := func() { conn.setVAD(turnDetectionActive(session.TurnDetection)) }
+	vadServerStarted := false
+	toggleVAD := func() {
+		if session.TurnDetection != nil && session.TurnDetection.ServerVad != nil && !vadServerStarted {
+			xlog.Debug("Starting VAD goroutine...")
+			done = make(chan struct{})
+			wg.Go(func() {
+				conversation := session.Conversations[session.DefaultConversationID]
+				handleVAD(session, conversation, t, done)
+			})
+			vadServerStarted = true
+		} else if (session.TurnDetection == nil || session.TurnDetection.ServerVad == nil) && vadServerStarted {
+			xlog.Debug("Stopping VAD goroutine...")
+			close(done)
+			vadServerStarted = false
+		}
+	}

 	// For WebRTC sessions, start the Opus decode loop before VAD so that
 	// decoded PCM is already flowing when VAD's first tick fires.
+	var decodeDone chan struct{}
 	if wt, ok := t.(*WebRTCTransport); ok {
-		conn.decodeDone = make(chan struct{})
-		go decodeOpusLoop(session, wt.opusBackend, conn.decodeDone)
+		decodeDone = make(chan struct{})
+		go decodeOpusLoop(session, wt.opusBackend, decodeDone)
 	}

 	toggleVAD()
@@ -660,9 +684,9 @@ func runRealtimeSession(application *application.Application, t Transport, model
 	// with window/hop configured, the server classifies the last window of
 	// streamed audio on a timer, so the client only has to stream (no commits).
 	// This runs independent of VAD (sound events are not speech).
+	var soundWindowDone chan struct{}
 	if soundOnly && session.SoundDetectionWindowMs > 0 && session.SoundDetectionHopMs > 0 {
-		conn.soundWindowDone = make(chan struct{})
-		soundWindowDone := conn.soundWindowDone
+		soundWindowDone = make(chan struct{})
 		wg.Go(func() {
 			handleSoundWindow(session, t, soundWindowDone)
 		})
@@ -787,11 +811,11 @@ func runRealtimeSession(application *application.Application, t Transport, model
 			xlog.Debug("recv", "message", string(msg))

 			sessionLock.Lock()
-			autoTurnDetection := turnDetectionActive(session.TurnDetection)
+			isServerVAD := session.TurnDetection != nil && session.TurnDetection.ServerVad != nil
 			sessionLock.Unlock()

 			// TODO: At the least need to check locking and timer state in the VAD Go routine before allowing this
-			if autoTurnDetection {
+			if isServerVAD {
 				sendNotImplemented(t, "input_audio_buffer.commit in conjunction with VAD")
 				continue
 			}
@@ -807,9 +831,11 @@ func runRealtimeSession(application *application.Application, t Transport, model
 				ItemID:          generateItemID(),
 			})

-			session.respSink.issue(context.Background(), respcoord.SourceClient, func(ctx context.Context) {
-				commitUtterance(ctx, allAudio, session, conversation, t)
-			})
+			respCtx, respDone := session.startResponse(context.Background())
+			go func() {
+				defer close(respDone)
+				commitUtterance(respCtx, allAudio, session, conversation, t)
+			}()

 		case types.InputAudioBufferClearEvent:
 			xlog.Debug("recv", "message", string(msg))
@@ -942,14 +968,15 @@ func runRealtimeSession(application *application.Application, t Transport, model
 				conversation.Lock.Unlock()
 			}

-			resp := e.Response
-			session.respSink.issue(context.Background(), respcoord.SourceClient, func(ctx context.Context) {
-				triggerResponse(ctx, session, conversation, t, &resp)
-			})
+			respCtx, respDone := session.startResponse(context.Background())
+			go func() {
+				defer close(respDone)
+				triggerResponse(respCtx, session, conversation, t, &e.Response)
+			}()

 		case types.ResponseCancelEvent:
 			xlog.Debug("recv", "message", string(msg))
-			session.respSink.cancel(respcoord.SourceClient)
+			session.cancelActiveResponse()

 		default:
 			xlog.Error("unknown message type")
@@ -957,11 +984,28 @@ func runRealtimeSession(application *application.Application, t Transport, model
 		}
 	}

-	// Tear down through the connection coordinator (once). It stops any running
-	// VAD goroutine, then the opus-decode and sound-window goroutines, joins them,
-	// cancels the in-flight response and drains all response goroutines, and
-	// finally removes the session — all in dependency order, exactly once.
-	conn.close()
+	// Cancel any in-flight response before tearing down
+	session.cancelActiveResponse()
+
+	// Stop the Opus decode goroutine (if running)
+	if decodeDone != nil {
+		close(decodeDone)
+	}
+
+	// Signal any running VAD goroutine to exit.
+	if vadServerStarted {
+		close(done)
+	}
+	// Stop the server-side sound-detection windowing goroutine (if running).
+	if soundWindowDone != nil {
+		close(soundWindowDone)
+	}
+	wg.Wait()
+
+	// Remove the session from the sessions map
+	sessionLock.Lock()
+	delete(sessions, sessionID)
+	sessionLock.Unlock()
 }

 // sendEvent sends a server event via the transport, logging any errors.
@@ -1241,38 +1285,8 @@ func decodeOpusLoop(session *Session, opusBackend grpc.Backend, done chan struct
 	}
 }

-// noSpeechHoldbackSec is how much of the tail of an inspected, segment-free
-// buffer survives the periodic no-speech clear. It must cover the VAD's
-// onset-detection latency: a word can already be underway in the newest part
-// of the window without silero having crossed its threshold yet, and clearing
-// it cuts the start of the utterance the next tick will detect.
-const noSpeechHoldbackSec = 0.5
-
-// dropInspectedPrefix removes the head of the audio buffer that a VAD tick
-// inspected (the first inspected bytes), keeping the newest holdbackBytes of
-// that window plus everything appended while the tick ran — audio the VAD
-// never saw. When something is dropped the result is a fresh copy, never a
-// sub-slice, so later appends can't scribble on memory shared with the old
-// backing array; when nothing is dropped buf is returned unchanged.
-func dropInspectedPrefix(buf []byte, inspected, holdbackBytes int) []byte {
-	cut := inspected - holdbackBytes
-	if cut <= 0 {
-		return buf
-	}
-	if cut > len(buf) {
-		cut = len(buf)
-	}
-	return append([]byte(nil), buf[cut:]...)
-}
-
 // handleVAD is a goroutine that listens for audio data from the client,
-// runs VAD on the audio data, and commits utterances to the conversation.
-//
-// With turn_detection.type == "semantic_vad" (sv != nil below) the silero
-// loop is augmented by a live transcription stream: the buffer's new audio
-// is fed to the transcription model every tick and its end-of-utterance
-// token switches the commit threshold between a short post-EOU window and
-// the long eagerness fallback. The server_vad path is untouched.
+// runs VAD on the audio data, and commits utterances to the conversation.
 func handleVAD(session *Session, conv *Conversation, t Transport, done chan struct{}) {
 	vadContext, cancel := context.WithCancel(context.Background())
 	go func() {
@@ -1285,22 +1299,9 @@ func handleVAD(session *Session, conv *Conversation, t Transport, done chan stru
 		silenceThreshold = float64(session.TurnDetection.ServerVad.SilenceDurationMs) / 1000
 	}

-	lts := newLiveTurnState(session, t)
+	speechStarted := false
 	startTime := time.Now()

-	// M2 turn-detection state machine. "Speech started" and "a turn's live ASR
-	// stream is open" are ONE coordinator state (Idle/Speaking), so they cannot
-	// desync the way the legacy speechStarted bool and lts.open() could (Part 2,
-	// failure mode 4). See realtime_turncoord.go and turncoord/.
-	sink := newTurnSink(session, conv, t, lts, vadContext, startTime)
-	// Teardown: end any open turn through the coordinator (DiscardTurn closes the
-	// live stream; no-op if already idle). Replaces the bare lts.discardTurn().
-	defer func() {
-		if err := sink.coord.Apply(turncoord.Abort{Reason: turncoord.AbortTeardown}); err != nil {
-			xlog.Error("turncoord: abort(teardown) failed", "error", err)
-		}
-	}()
-
 	ticker := time.NewTicker(300 * time.Millisecond)
 	defer ticker.Stop()

@@ -1309,30 +1310,6 @@ func handleVAD(session *Session, conv *Conversation, t Transport, done chan stru
 		case <-done:
 			return
 		case <-ticker.C:
-			// Semantic mode is re-read each tick: session.update can switch
-			// turn-detection modes (and the retranscribe gate) mid-session.
-			sessionLock.Lock()
-			var sv *types.RealtimeSessionSemanticVad
-			if session.TurnDetection != nil {
-				sv = session.TurnDetection.SemanticVad
-			}
-			retranscribe := sv != nil && session.ModelConfig != nil &&
-				session.ModelConfig.Pipeline.TurnDetectionRetranscribe()
-			sessionLock.Unlock()
-
-			// The turn coordinator's data-heavy effects (OpenTurn/CommitTurn)
-			// need this tick's mode; set it before any Apply below.
-			sink.sv = sv
-
-			// session.update switched semantic -> server mid-turn: drop the
-			// orphaned live stream. This is NOT a turn abort — the turn continues
-			// under server_vad (a config change must not cut off a mid-utterance
-			// speaker), so the coordinator stays Speaking; only the orphaned live
-			// stream is closed.
-			if sv == nil && lts.open() {
-				lts.discardTurn()
-			}
-
 			session.AudioBufferLock.Lock()
 			allAudio := make([]byte, len(session.InputAudioBuffer))
 			copy(allAudio, session.InputAudioBuffer)
@@ -1346,13 +1323,6 @@ func handleVAD(session *Session, conv *Conversation, t Transport, done chan stru
 			// Resample from InputSampleRate to 16kHz
 			aints = sound.ResampleInt16(aints, session.InputSampleRate, localSampleRate)

-			audioLength := float64(len(aints)) / localSampleRate
-
-			if sv != nil && lts.open() {
-				lts.feedNewAudio(aints)
-				lts.drainEvents(audioLength)
-			}
-
 			segments, err := runVAD(vadContext, session, aints)
 			if err != nil {
 				if err.Error() == "unexpected speech end" {
@@ -1364,52 +1334,31 @@ func handleVAD(session *Session, conv *Conversation, t Transport, done chan stru
 				continue
 			}

-			// NOTE: the no-speech clear and the min-buffer gate above stay on
-			// the short silenceThreshold even in semantic mode — the eagerness
-			// fallback applies only to the end-of-speech commit decision, or a
-			// low eagerness would delay speech_started/barge-in by seconds.
+			audioLength := float64(len(aints)) / localSampleRate
+
+			// TODO: When resetting the buffer we should retain a small suffix (the newest audio) so the onset of the next utterance is not cut off
 			if len(segments) == 0 && audioLength > silenceThreshold {
-				// "No segments" is not "no speech": silero (threshold 0.5)
-				// crosses up to a few hundred ms into a soft word onset, so
-				// the newest audio in the inspected window may be the start
-				// of a word the next tick will recognize — and more audio
-				// arrived while this tick ran. Keep both; drop only the
-				// older, confirmed-silent head, or utterance onsets get cut.
-				holdback := int(noSpeechHoldbackSec*float64(session.InputSampleRate)) * 2
 				session.AudioBufferLock.Lock()
-				session.InputAudioBuffer = dropInspectedPrefix(session.InputAudioBuffer, len(allAudio), holdback)
+				session.InputAudioBuffer = nil
 				session.AudioBufferLock.Unlock()

-				// No-speech clear: end any open turn (Speaking -> Idle, discarding
-				// the partial). Returning to Idle is the fix for failure mode 4 —
-				// the legacy discardTurn left speechStarted true, suppressing the
-				// next onset. Idle while not speaking is a no-op.
-				if err := sink.coord.Apply(turncoord.Abort{Reason: turncoord.AbortNoSpeech}); err != nil {
-					xlog.Error("turncoord: abort(no_speech) failed", "error", err)
-				}
 				continue
 			} else if len(segments) == 0 {
 				continue
 			}

-			// Speech detected this tick: open the turn (Idle -> Speaking) through
-			// the coordinator. On that transition it opens the turn's live ASR
-			// stream + feeds the buffered prefix (OpenTurn), cancels any in-flight
-			// response (BargeIn, non-blocking — the VAD tick is never stalled), and
-			// emits speech_started. While already Speaking it is a no-op, so "turn
-			// open" and "speech started" can never disagree. The turn id is minted
-			// here and carried by the coordinator through to the committed event.
-			sink.onsetAudio = aints
-			if err := sink.coord.Apply(turncoord.Onset{Turn: turncoord.TurnID(generateItemID())}); err != nil {
-				xlog.Error("turncoord: onset failed", "error", err)
-			}
+			if !speechStarted {
+				// Barge-in: cancel any in-flight response so we stop
+				// sending audio and don't keep the interrupted reply in history.
+				session.cancelActiveResponse()

-			if sv != nil {
-				// Drain again: events produced by THIS tick's feed have
-				// usually arrived by the time runVAD returns, and leaving
-				// them for the next tick adds 300ms to every EOU-triggered
-				// commit.
-				lts.drainEvents(audioLength)
+				sendEvent(t, types.InputAudioBufferSpeechStartedEvent{
+					ServerEventBase: types.ServerEventBase{
+						EventID: "event_TODO",
+					},
+					AudioStartMs: time.Since(startTime).Milliseconds(),
+				})
+				speechStarted = true
 			}

 			// Segment still in progress when audio ended
@@ -1418,90 +1367,41 @@ func handleVAD(session *Session, conv *Conversation, t Transport, done chan stru
 				continue
 			}

-			threshold := silenceThreshold
-			eouPending := false
-			if sv != nil {
-				eouPending = lts.eouPending(segments)
-				threshold = lts.thresholdSec(eouPending, sv)
-			}
-
-			if float32(audioLength)-segEndTime > float32(threshold) {
-				if sv != nil {
-					trigger, eouLag := lts.commitTrigger(eouPending, float64(segEndTime))
-					xlog.Info("semantic_vad: committing turn",
-						"trigger", trigger,
-						"speech_end_s", segEndTime,
-						"eou_lag_s", eouLag,
-						"silence_s", audioLength-float64(segEndTime),
-						"audio_s", audioLength)
-				}
-				// Retranscribe gate (semantic mode, EOU-triggered commits
-				// only): cross-check the streamed EOU with an offline decode
-				// of the buffered turn before committing. Runs synchronously
-				// on the tick — the engine would serialize a concurrent feed
-				// against it anyway. Timeout-triggered commits skip the gate.
-				var gated *schema.TranscriptionResult
-				if retranscribe && eouPending {
-					batch, gerr := transcribeUtterance(vadContext, sound.Int16toBytesLE(aints), session)
-					switch {
-					case gerr != nil:
-						xlog.Warn("semantic_vad: retranscribe gate failed; committing via the file path", "error", gerr)
-					case !batch.Eou:
-						xlog.Info("semantic_vad: batch decode did not confirm the streamed EOU; continuing to listen",
-							"streamed", lts.previewText(), "batch", batch.Text)
-						// The batch decode rejected the streamed EOU as a false
-						// positive: consume the recorded EOU so the next tick
-						// falls back to the eagerness window instead of
-						// re-triggering on the same token.
-						lts.eouAtSec = 0
-						continue
-					default:
-						xlog.Info("semantic_vad: batch decode confirmed the streamed EOU",
-							"streamed", lts.previewText(), "batch", batch.Text)
-						gated = batch
-					}
-				}
-
+			if float32(audioLength)-segEndTime > float32(silenceThreshold) {
 				xlog.Debug("Detected end of speech segment")
 				session.AudioBufferLock.Lock()
-				// Keep audio appended while this tick ran — it belongs to
-				// the next turn (in any mode: nil-ing it dropped the onset
-				// of an utterance started right after a commit).
-				session.InputAudioBuffer = dropInspectedPrefix(session.InputAudioBuffer, len(allAudio), 0)
+				session.InputAudioBuffer = nil
 				session.AudioBufferLock.Unlock()

-				// Commit the turn through the coordinator: it emits speech_stopped
-				// (EmitSpeechStopped) then the committed event, finalizes the live
-				// stream, and issues the response (CommitTurn). The committed item
-				// id is the coordinator's turn id (== the id the live captions
-				// streamed under), so the client replaces the partial text.
-				sink.commitAudio = sound.Int16toBytesLE(aints)
-				sink.commitAudioLength = audioLength
-				sink.commitRetranscribe = retranscribe
-				sink.commitGated = gated
-				// TODO: Remove prefix silence that is over TurnDetectionParams.PrefixPaddingMs
-				if err := sink.coord.Apply(turncoord.Silence{}); err != nil {
-					xlog.Error("turncoord: commit failed", "error", err)
-				}
+				sendEvent(t, types.InputAudioBufferSpeechStoppedEvent{
+					ServerEventBase: types.ServerEventBase{
+						EventID: "event_TODO",
+					},
+					AudioEndMs: time.Since(startTime).Milliseconds(),
+				})
+				speechStarted = false
+
+				sendEvent(t, types.InputAudioBufferCommittedEvent{
+					ServerEventBase: types.ServerEventBase{
+						EventID: "event_TODO",
+					},
+					ItemID:         generateItemID(),
+					PreviousItemID: "TODO",
+				})
+
+				abytes := sound.Int16toBytesLE(aints)
+				// TODO: Remove prefix silence that is over TurnDetectionParams.PrefixPaddingMs
+				respCtx, respDone := session.startResponse(vadContext)
+				go func() {
+					defer close(respDone)
+					commitUtterance(respCtx, abytes, session, conv, t)
+				}()
 			}
 		}
 	}
 }

 func commitUtterance(ctx context.Context, utt []byte, session *Session, conv *Conversation, t Transport) {
-	commitUtteranceWithTranscript(ctx, utt, nil, nil, "", session, conv, t)
-}
-
-// commitUtteranceWithTranscript commits one user turn. live carries the
-// transcript semantic_vad's live stream already produced (its caption deltas
-// were streamed to the client during the turn, so only the completed event
-// is emitted here); gated carries the retranscribe gate's batch decode (the
-// authoritative transcript in that mode). With neither — server_vad, manual
-// commits, semantic degrade, or a live stream that heard nothing — the audio
-// is written to a temp WAV and transcribed via the file path as before.
-// itemID is the turn's conversation item id ("" mints a fresh one); it must
-// match the id any live deltas were sent under.
-func commitUtteranceWithTranscript(ctx context.Context, utt []byte, live *liveUtterance, gated *schema.TranscriptionResult, itemID string, session *Session, conv *Conversation, t Transport) {
 	if len(utt) == 0 {
 		return
 	}
@@ -1566,37 +1466,14 @@ func commitUtteranceWithTranscript(ctx context.Context, utt []byte, live *liveUt
 	}

 	// TODO: If we have a real any-to-any model then transcription is optional
-
-	// The turn's live captions (semantic_vad) already streamed under this
-	// itemID; the completed event below reuses it so the client replaces the
-	// partial text. server_vad / manual commits arrive with no itemID, so mint
-	// one here.
-	if itemID == "" {
-		itemID = generateItemID()
-	}
-
 	var transcript string
 	switch {
-	case gated != nil:
-		// semantic_vad retranscribe gate: the batch decode is authoritative.
-		transcript = gated.Text
-		if err := emitPrecomputedTranscription(t, itemID, nil, transcript); err != nil {
-			sendError(t, "transcription_failed", err.Error(), "", "event_TODO")
-			return
-		}
-	case live != nil && live.Text != "":
-		// The caption deltas already streamed during the turn under this
-		// itemID; the completed event replaces the partial text client-side.
-		transcript = live.Text
-		if err := emitPrecomputedTranscription(t, itemID, nil, transcript); err != nil {
-			sendError(t, "transcription_failed", err.Error(), "", "event_TODO")
-			return
-		}
 	case session.InputAudioTranscription != nil:
 		// emitTranscription streams transcript deltas when
 		// pipeline.streaming.transcription is set, otherwise emits a single
 		// completed event; either way it returns the final transcript text.
-		transcript, err = emitTranscription(ctx, t, session, itemID, f.Name())
+		var err error
+		transcript, err = emitTranscription(ctx, t, session, generateItemID(), f.Name())
 		if err != nil {
 			// Drain the gate goroutine before returning so its in-flight read of
 			// the temp WAV finishes before the deferred os.Remove fires.
@@ -1765,56 +1642,6 @@ func writeWindowWAV(pcm []byte, sampleRate int) (string, error) {
 	return f.Name(), nil
 }

-// writeUtteranceWAV persists raw 16 kHz mono PCM to a temp WAV for the
-// file-based transcription paths. The caller must invoke cleanup.
-func writeUtteranceWAV(utt []byte) (string, func(), error) {
-	f, err := os.CreateTemp("", "realtime-audio-chunk-*.wav")
-	if err != nil {
-		return "", nil, err
-	}
-	cleanup := func() {
-		_ = f.Close()
-		_ = os.Remove(f.Name())
-	}
-	xlog.Debug("Writing to file", "file", f.Name())
-
-	hdr := laudio.NewWAVHeader(uint32(len(utt)))
-	if err := hdr.Write(f); err != nil {
-		cleanup()
-		return "", nil, err
-	}
-	if _, err := f.Write(utt); err != nil {
-		cleanup()
-		return "", nil, err
-	}
-	_ = f.Sync()
-	return f.Name(), cleanup, nil
-}
-
-// transcribeUtterance runs one offline (unary) decode of the buffered turn —
-// the semantic_vad retranscribe gate. The result's Eou flag reports whether
-// the batch decode also ended on the end-of-utterance token.
-func transcribeUtterance(ctx context.Context, utt []byte, session *Session) (*schema.TranscriptionResult, error) {
-	path, cleanup, err := writeUtteranceWAV(utt)
-	if err != nil {
-		return nil, err
-	}
-	defer cleanup()
-
-	language, prompt := "", ""
-	if cfg := session.InputAudioTranscription; cfg != nil {
-		language, prompt = cfg.Language, cfg.Prompt
-	}
-	tr, err := session.ModelInterface.Transcribe(ctx, path, language, false, false, prompt)
-	if err != nil {
-		return nil, err
-	}
-	if tr == nil {
-		return nil, fmt.Errorf("transcribe result is nil")
-	}
-	return tr, nil
-}
-
 func runVAD(ctx context.Context, session *Session, adata []int16) ([]schema.VADSegment, error) {
 	soundIntBuffer := &audio.IntBuffer{
 		Format:         &audio.Format{SampleRate: localSampleRate, NumChannels: 1},
@@ -1894,100 +1721,14 @@ func generateResponse(ctx context.Context, session *Session, utt []byte, transcr
 // without another response cycle.
 const maxAssistantToolTurns = 10

-// responseOutcome is how a response ended, decided by the response body and
-// read once by triggerResponse to emit the single terminal event.
-type responseOutcome int
-
-const (
-	outcomeCompleted responseOutcome = iota
-	outcomeCancelled
-	outcomeFailed // an error event was already sent; emit no terminal (legacy behavior)
-)
-
-// liveResponse accumulates the wire-visible result of ONE response.create across
-// the whole agentic tool-turn recursion: a single id, the output items as they
-// complete, the summed token usage, and the final outcome. triggerResponse owns
-// it; triggerResponseAtTurn / streamLLMResponse / emitToolCallItems fill it in.
-// This is what makes "exactly one response.done per response.create, with Output
-// and Usage populated" true — the body no longer emits per-turn terminals.
-type liveResponse struct {
-	id      string
-	output  []types.MessageItemUnion
-	usage   backend.TokenUsage
-	outcome responseOutcome
-}
-
-func (r *liveResponse) addItem(it types.MessageItemUnion) { r.output = append(r.output, it) }
-
-func (r *liveResponse) addUsage(u backend.TokenUsage) {
-	r.usage.Prompt += u.Prompt
-	r.usage.Completion += u.Completion
-}
-
-// responseUsage maps the backend's token counts onto the OpenAI Realtime
-// response.usage shape. Returns nil when there is nothing to report so the
-// field is omitted rather than sent as zeros.
-func responseUsage(u backend.TokenUsage) *types.TokenUsage {
-	if u.Prompt == 0 && u.Completion == 0 {
-		return nil
-	}
-	return &types.TokenUsage{
-		InputTokens:  u.Prompt,
-		OutputTokens: u.Completion,
-		TotalTokens:  u.Prompt + u.Completion,
-	}
-}
-
 func triggerResponse(ctx context.Context, session *Session, conv *Conversation, t Transport, overrides *types.ResponseCreateParams) {
-	// One response.created and one response.done per response.create — even when
-	// the server-side tool loop runs several inference turns. The per-turn
-	// terminals the legacy code emitted (one response.done per turn, with empty
-	// Output/Usage) are gone; tool turns are now internal to this single response.
-	r := &liveResponse{id: generateUniqueID()}
-	sendEvent(t, types.ResponseCreatedEvent{
-		ServerEventBase: types.ServerEventBase{},
-		Response: types.Response{
-			ID:     r.id,
-			Object: "realtime.response",
-			Status: types.ResponseStatusInProgress,
-		},
-	})
-
-	triggerResponseAtTurn(ctx, session, conv, t, overrides, 0, r)
-
-	switch r.outcome {
-	case outcomeCancelled:
-		sendEvent(t, types.ResponseDoneEvent{
-			ServerEventBase: types.ServerEventBase{},
-			Response: types.Response{
-				ID:     r.id,
-				Object: "realtime.response",
-				Status: types.ResponseStatusCancelled,
-				Output: r.output,
-			},
-		})
-	case outcomeFailed:
-		// A specific error event was already sent; emit no terminal (matches the
-		// legacy behavior where failed responses had no response.done).
-	default:
-		sendEvent(t, types.ResponseDoneEvent{
-			ServerEventBase: types.ServerEventBase{},
-			Response: types.Response{
-				ID:     r.id,
-				Object: "realtime.response",
-				Status: types.ResponseStatusCompleted,
-				Output: r.output,
-				Usage:  responseUsage(r.usage),
-			},
-		})
-	}
-
+	triggerResponseAtTurn(ctx, session, conv, t, overrides, 0)
 	// Fold aged-out turns into the rolling memory off the critical path; the
 	// next turn reaps the smaller buffer.
 	session.maybeCompact(conv)
 }

-func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversation, t Transport, overrides *types.ResponseCreateParams, toolTurn int, r *liveResponse) {
+func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversation, t Transport, overrides *types.ResponseCreateParams, toolTurn int) {
 	config := session.ModelInterface.PredictConfig()

 	// Default values
@@ -2150,9 +1891,15 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
 		images = append(images, m.StringImages...)
 	}

-	// response.created/done are emitted once per response.create by triggerResponse;
-	// every turn (including agentic recursion) shares this id.
-	responseID := r.id
+	responseID := generateUniqueID()
+	sendEvent(t, types.ResponseCreatedEvent{
+		ServerEventBase: types.ServerEventBase{},
+		Response: types.Response{
+			ID:     responseID,
+			Object: "realtime.response",
+			Status: types.ResponseStatusInProgress,
+		},
+	})

 	// Streamed LLM path: when the pipeline opts into LLM streaming, stream the
 	// transcript to the client as it is generated and synthesize the buffered
@@ -2168,7 +1915,7 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
 			respMods = overrides.OutputModalities
 		}
 		if canStream && modalitiesContainAudio(resolveOutputModalities(session.OutputModalities, respMods)) {
-			if streamLLMResponse(ctx, session, conv, t, r, conversationHistory, images, config, tools, toolChoice, toolTurn) {
+			if streamLLMResponse(ctx, session, conv, t, responseID, conversationHistory, images, config, tools, toolChoice, toolTurn) {
 				return
 			}
 		}
@@ -2177,22 +1924,26 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
 	predFunc, err := session.ModelInterface.Predict(ctx, conversationHistory, images, nil, nil, nil, tools, toolChoice, nil, nil, nil)
 	if err != nil {
 		sendError(t, "inference_failed", fmt.Sprintf("backend error: %v", err), "", "") // item.Assistant.ID is unknown here
-		r.outcome = outcomeFailed
 		return
 	}

 	pred, err := predFunc()
 	if err != nil {
 		sendError(t, "prediction_failed", fmt.Sprintf("backend error: %v", err), "", "")
-		r.outcome = outcomeFailed
 		return
 	}
-	r.addUsage(pred.Usage)

 	// Check for cancellation after LLM inference (barge-in may have fired)
 	if ctx.Err() != nil {
 		xlog.Debug("Response cancelled after LLM inference (barge-in)")
-		r.outcome = outcomeCancelled
+		sendEvent(t, types.ResponseDoneEvent{
+			ServerEventBase: types.ServerEventBase{},
+			Response: types.Response{
+				ID:     responseID,
+				Object: "realtime.response",
+				Status: types.ResponseStatusCancelled,
+			},
+		})
 		return
 	}

@@ -2352,12 +2103,18 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
 			conv.Lock.Unlock()
 		}

-		// sendCancelledResponse records the cancelled outcome (triggerResponse
-		// emits the single terminal) and cleans up the partial assistant item so
-		// the interrupted reply is not in chat history.
+		// sendCancelledResponse emits the cancelled status and cleans up the
+		// assistant item so the interrupted reply is not in chat history.
 		sendCancelledResponse := func() {
 			removeItemFromConv(item.Assistant.ID)
-			r.outcome = outcomeCancelled
+			sendEvent(t, types.ResponseDoneEvent{
+				ServerEventBase: types.ServerEventBase{},
+				Response: types.Response{
+					ID:     responseID,
+					Object: "realtime.response",
+					Status: types.ResponseStatusCancelled,
+				},
+			})
 		}

 		var audioString string
@@ -2406,7 +2163,6 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
 				}
 				xlog.Error("TTS failed", "error", err)
 				sendError(t, "tts_error", fmt.Sprintf("TTS generation failed: %v", err), "", item.Assistant.ID)
-				r.outcome = outcomeFailed
 				return
 			}
 			if !isWebRTC {
@@ -2464,13 +2220,12 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
 			OutputIndex:     0,
 			Item:            item,
 		})
-		r.addItem(item)
 	}

-	// Emit the parsed tool calls and (for server-side assistant tools) the
-	// follow-up turn. Shared with the streamed path so both finalize tool calls
-	// identically. The single terminal is emitted by triggerResponse.
-	emitToolCallItems(ctx, session, conv, t, r, finalToolCalls, finalSpeech != "", toolTurn)
+	// Emit the parsed tool calls, the terminal response.done, and (for
+	// server-side assistant tools) the follow-up response. Shared with the
+	// streamed path so both finalize tool calls identically.
+	emitToolCallItems(ctx, session, conv, t, responseID, finalToolCalls, finalSpeech != "", toolTurn)
 }

 // emitToolCallItems emits the realtime function_call items for the parsed tool
@@ -2484,8 +2239,7 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
 //   - All other tools follow the standard OpenAI flow: emit
 //     function_call_arguments.done and wait for the client to send
 //     conversation.item.create back.
-func emitToolCallItems(ctx context.Context, session *Session, conv *Conversation, t Transport, r *liveResponse, toolCalls []functions.FuncCallResults, hasContent bool, toolTurn int) {
-	responseID := r.id
+func emitToolCallItems(ctx context.Context, session *Session, conv *Conversation, t Transport, responseID string, toolCalls []functions.FuncCallResults, hasContent bool, toolTurn int) {
 	xlog.Debug("About to handle tool calls", "finalToolCallsCount", len(toolCalls))
 	executedAssistantTool := false
 	for i, tc := range toolCalls {
@@ -2548,7 +2302,6 @@ func emitToolCallItems(ctx context.Context, session *Session, conv *Conversation
 				OutputIndex:     outputIndex,
 				Item:            fcItem,
 			})
-			r.addItem(fcItem)
 			sendEvent(t, types.ResponseOutputItemAddedEvent{
 				ServerEventBase: types.ServerEventBase{},
 				ResponseID:      responseID,
@@ -2561,7 +2314,6 @@ func emitToolCallItems(ctx context.Context, session *Session, conv *Conversation
 				OutputIndex:     outputIndex,
 				Item:            foItem,
 			})
-			r.addItem(foItem)
 			executedAssistantTool = true
 			continue
 		}
@@ -2591,25 +2343,28 @@ func emitToolCallItems(ctx context.Context, session *Session, conv *Conversation
 			OutputIndex:     outputIndex,
 			Item:            fcItem,
 		})
-		r.addItem(fcItem)
 	}

-	// No terminal here: triggerResponse emits the single response.done once the
-	// whole turn (including the agentic recursion below) completes.
+	sendEvent(t, types.ResponseDoneEvent{
+		ServerEventBase: types.ServerEventBase{},
+		Response: types.Response{
+			ID:     responseID,
+			Object: "realtime.response",
+			Status: types.ResponseStatusCompleted,
+		},
+	})

 	// If we executed any assistant tools inproc, run another response cycle
 	// so the model can speak the result. Mirrors the chat-side agentic loop
 	// but driven server-side rather than by client round-trip. Bounded so a
-	// degenerate "model keeps calling tools" doesn't blow the stack. The
-	// follow-up turn shares the same liveResponse, so its output accumulates
-	// into the one response.done.
+	// degenerate "model keeps calling tools" doesn't blow the stack.
 	if executedAssistantTool {
 		if toolTurn+1 >= maxAssistantToolTurns {
 			xlog.Warn("realtime: assistant tool-turn limit reached, stopping the agentic loop",
 				"limit", maxAssistantToolTurns, "model", session.Model)
 			return
 		}
-		triggerResponseAtTurn(ctx, session, conv, t, nil, toolTurn+1, r)
+		triggerResponseAtTurn(ctx, session, conv, t, nil, toolTurn+1)
 	}
 }

--- a/core/http/endpoints/openai/realtime_compactcoord.go
+++ b/core/http/endpoints/openai/realtime_compactcoord.go
@@ -1,79 +0,0 @@
-package openai
-
-import (
-	"context"
-	"sync"
-
-	"github.com/mudler/LocalAI/core/http/endpoints/openai/compactcoord"
-	"github.com/mudler/xlog"
-)
-
-// compactionSink wires the explicit compaction state machine
-// (compactcoord.Coordinator — machine "M4" in docs/design/realtime-state-machines.md)
-// into a conversation.
-//
-// It replaces the legacy `compacting atomic.Bool` single-flight guard: the
-// coordinator owns whether a compaction is running, so a Trigger while one is
-// already in flight is dropped (single-flight) and the background goroutine
-// always reports Finished — the flag can never stick (invariant #9).
-//
-// run is the summarize+evict work for this conversation (captured at
-// construction); StartCompaction spawns it and reports Finished when it returns.
-// It takes a context derived from the sink's session-scoped ctx, so shutdown()
-// can cancel an in-flight compaction.
-type compactionSink struct {
-	coord  *compactcoord.Coordinator
-	run    func(ctx context.Context)
-	ctx    context.Context
-	cancel context.CancelFunc
-	wg     sync.WaitGroup
-}
-
-func newCompactionSink(run func(ctx context.Context)) *compactionSink {
-	s := &compactionSink{run: run}
-	s.ctx, s.cancel = context.WithCancel(context.Background())
-	s.coord = compactcoord.New(s)
-	return s
-}
-
-// trigger asks the coordinator to start a compaction; a no-op while one is
-// already running or after shutdown. Non-blocking.
-func (s *compactionSink) trigger() {
-	if err := s.coord.Apply(compactcoord.Trigger{}); err != nil {
-		xlog.Error("compactcoord: trigger failed", "error", err)
-	}
-}
-
-// shutdown is called by the connection (M1) parent's teardown: cancel any
-// in-flight compaction, join it, then move the coordinator to Terminated so no
-// compaction can start afterwards. This closes the legacy gap where the
-// fire-and-forget compaction goroutine could outlive the session. Cancelling the
-// context first makes the in-flight summarizer Predict return promptly, so the
-// join is bounded.
-func (s *compactionSink) shutdown() {
-	s.cancel()
-	s.wg.Wait()
-	if err := s.coord.Apply(compactcoord.Shutdown{}); err != nil {
-		xlog.Error("compactcoord: shutdown apply failed", "error", err)
-	}
-}
-
-// Perform executes one effect. Called under the coordinator lock; StartCompaction
-// only spawns a goroutine, so it does not block.
-func (s *compactionSink) Perform(e compactcoord.Effect) {
-	switch e.(type) {
-	case compactcoord.StartCompaction:
-		s.wg.Add(1)
-		go func() {
-			defer s.wg.Done()
-			defer func() {
-				if err := s.coord.Apply(compactcoord.Finished{}); err != nil {
-					xlog.Error("compactcoord: finished apply failed", "error", err)
-				}
-			}()
-			if s.run != nil {
-				s.run(s.ctx)
-			}
-		}()
-	}
-}
--- a/core/http/endpoints/openai/realtime_compaction.go
+++ b/core/http/endpoints/openai/realtime_compaction.go
@@ -222,7 +222,7 @@ func prefixMatches(items, snapshot []*types.MessageItemUnion) bool {
 // conv.Lock across the summarizer call: snapshot under lock, summarize unlocked,
 // commit under lock (re-validating the head is unchanged). On any error it
 // leaves the conversation untouched — items are never dropped without a summary.
-func (s *Session) compact(ctx context.Context, conv *Conversation, model Model) {
+func (s *Session) compact(conv *Conversation, model Model) {
 	if model == nil {
 		return
 	}
@@ -241,10 +241,9 @@ func (s *Session) compact(ctx context.Context, conv *Conversation, model Model)
 	prior := conv.Memory
 	conv.Lock.Unlock()

-	// Summarize (unlocked). The timeout is derived from the caller's ctx so the
-	// connection teardown can cancel an in-flight summary (bounding the join).
+	// Summarize (unlocked).
 	msgs := buildSummaryMessages(prior, renderItemsTranscript(overflow), s.MaxSummaryTokens)
-	ctx, cancel := context.WithTimeout(ctx, compactionTimeout)
+	ctx, cancel := context.WithTimeout(context.Background(), compactionTimeout)
 	defer cancel()
 	predFunc, err := model.Predict(ctx, msgs, nil, nil, nil, nil, nil, nil, nil, nil, nil)
 	if err != nil {
@@ -299,13 +298,9 @@ func (s *Session) summarizerModel() Model {
 }

 // maybeCompact schedules a background compaction when the live buffer has grown
-// past the trigger and none is already running. Returns immediately. The
-// single-flight guarantee (at most one compaction per conversation) is owned by
-// the compaction coordinator (M4); see realtime_compactcoord.go. The actual
-// summarize+evict work (and the lazy summary_model load) is the conversation's
-// compaction-sink run closure, so it stays off the response path.
+// past the trigger and none is already running. Returns immediately.
 func (s *Session) maybeCompact(conv *Conversation) {
-	if !s.CompactionEnabled || conv.compaction == nil {
+	if !s.CompactionEnabled {
 		return
 	}
 	conv.Lock.Lock()
@@ -314,5 +309,18 @@ func (s *Session) maybeCompact(conv *Conversation) {
 	if !over {
 		return
 	}
-	conv.compaction.trigger()
+	if !conv.compacting.CompareAndSwap(false, true) {
+		return
+	}
+	go func() {
+		defer conv.compacting.Store(false)
+		// Resolve (and, for a configured summary_model, lazily load) the
+		// summarizer only when a compaction actually runs, off the response
+		// path — so the model load never blocks a user turn.
+		model := s.summarizerModel()
+		if model == nil {
+			return
+		}
+		s.compact(conv, model)
+	}()
 }
--- a/core/http/endpoints/openai/realtime_compaction_test.go
+++ b/core/http/endpoints/openai/realtime_compaction_test.go
@@ -1,7 +1,6 @@
 package openai

 import (
-	"context"
 	"errors"

 	. "github.com/onsi/ginkgo/v2"
@@ -199,7 +198,7 @@ var _ = Describe("compact", func() {
 		s := &Session{CompactionEnabled: true, CompactionTrigger: 7, MaxHistoryItems: 4, MaxSummaryTokens: 512}
 		m := &fakeModel{predictResp: backend.LLMResponse{Response: "ROLLED UP"}}

-		s.compact(context.Background(), conv, m)
+		s.compact(conv, m)

 		Expect(conv.Memory).To(Equal("ROLLED UP"))
 		Expect(len(conv.Items)).To(Equal(4))
@@ -214,7 +213,7 @@ var _ = Describe("compact", func() {
 		s := &Session{CompactionEnabled: true, CompactionTrigger: 2, MaxHistoryItems: 1, MaxSummaryTokens: 512}
 		m := &fakeModel{predictErr: errors.New("boom")}

-		s.compact(context.Background(), conv, m)
+		s.compact(conv, m)

 		Expect(conv.Memory).To(Equal(""))
 		Expect(len(conv.Items)).To(Equal(3))
@@ -228,7 +227,7 @@ var _ = Describe("compact", func() {
 		s := &Session{CompactionEnabled: true, CompactionTrigger: 7, MaxHistoryItems: 4, MaxSummaryTokens: 512}
 		m := &fakeModel{predictResp: backend.LLMResponse{Response: "<think>planning the summary</think>CLEAN SUMMARY"}}

-		s.compact(context.Background(), conv, m)
+		s.compact(conv, m)

 		Expect(conv.Memory).To(Equal("CLEAN SUMMARY"))
 		Expect(conv.Memory).ToNot(ContainSubstring("planning"))
@@ -237,7 +236,7 @@ var _ = Describe("compact", func() {
 	It("does nothing when items are at or below the trigger", func() {
 		conv := &Conversation{Items: []*types.MessageItemUnion{user("1", "a")}}
 		s := &Session{CompactionEnabled: true, CompactionTrigger: 7, MaxHistoryItems: 4}
-		s.compact(context.Background(), conv, &fakeModel{predictResp: backend.LLMResponse{Response: "x"}})
+		s.compact(conv, &fakeModel{predictResp: backend.LLMResponse{Response: "x"}})
 		Expect(conv.Memory).To(Equal(""))
 		Expect(len(conv.Items)).To(Equal(1))
 	})
--- a/core/http/endpoints/openai/realtime_conncoord.go
+++ b/core/http/endpoints/openai/realtime_conncoord.go
@@ -1,122 +0,0 @@
-package openai
-
-import (
-	"sync"
-
-	"github.com/mudler/LocalAI/core/http/endpoints/openai/conncoord"
-	"github.com/mudler/xlog"
-)
-
-// connSink wires the explicit connection-lifecycle state machine
-// (conncoord.Coordinator — machine "M1" in docs/design/realtime-state-machines.md)
-// into the realtime session handler.
-//
-// It replaces the legacy vadServerStarted bool + the `done` channel that was
-// reassigned on every turn-detection toggle and closed from two sites (Part 2,
-// failure mode 6). The coordinator owns whether the VAD goroutine is running, so
-// the per-run done channel is created and closed in lockstep with that one state
-// — closed exactly once, never resurrected after teardown.
-//
-// The connection machine is driven by the single session goroutine (the handler
-// loop and its teardown), so this sink and its coordinator are loop-local; the
-// Coordinator's lock only keeps State() race-free.
-//
-// Effects:
-//   - StartVAD: create a fresh done channel and spawn handleVAD on it (joined via wg).
-//   - StopVAD:  close that done channel.
-//   - Teardown: stop the remaining input goroutines (opus decode, sound window),
-//     join everything, cancel in-flight responses, and remove the session — once.
-type connSink struct {
-	session   *Session
-	sessionID string
-	transport Transport
-	wg        *sync.WaitGroup
-
-	coord *conncoord.Coordinator
-
-	// vadDone is the current VAD run's stop signal — recreated on each StartVAD,
-	// closed by StopVAD / Teardown. Owned solely by Perform (single goroutine).
-	vadDone chan struct{}
-
-	// One-shot stop signals for the other input goroutines, registered by the
-	// handler when it starts them; closed once by Teardown.
-	decodeDone      chan struct{}
-	soundWindowDone chan struct{}
-}
-
-func newConnSink(session *Session, sessionID string, t Transport, wg *sync.WaitGroup) *connSink {
-	s := &connSink{
-		session:   session,
-		sessionID: sessionID,
-		transport: t,
-		wg:        wg,
-	}
-	s.coord = conncoord.New(s)
-	return s
-}
-
-// setVAD requests the turn-detection goroutine match active. Idempotent.
-func (s *connSink) setVAD(active bool) {
-	if err := s.coord.Apply(conncoord.SetVAD{Active: active}); err != nil {
-		xlog.Error("conncoord: setVAD failed", "error", err)
-	}
-}
-
-// close tears the session down (once). Safe to call from multiple exit paths.
-func (s *connSink) close() {
-	if err := s.coord.Apply(conncoord.Close{}); err != nil {
-		xlog.Error("conncoord: close failed", "error", err)
-	}
-}
-
-// Perform executes one effect. Called by Coordinator.Apply under the coordinator
-// lock; the connection coordinator is single-writer and torn down exactly once at
-// the end of the session goroutine, so the blocking joins in Teardown never
-// contend the lock.
-func (s *connSink) Perform(e conncoord.Effect) {
-	switch e.(type) {
-	case conncoord.StartVAD:
-		xlog.Debug("Starting VAD goroutine...")
-		s.vadDone = make(chan struct{})
-		done := s.vadDone
-		s.wg.Go(func() {
-			conversation := s.session.Conversations[s.session.DefaultConversationID]
-			handleVAD(s.session, conversation, s.transport, done)
-		})
-	case conncoord.StopVAD:
-		xlog.Debug("Stopping VAD goroutine...")
-		close(s.vadDone)
-		s.vadDone = nil
-	case conncoord.Teardown:
-		// Tear down in dependency order, driving every child machine to its
-		// terminal state so none outlives the session (the hierarchy invariant in
-		// formal-verification/session_lifecycle.fizz: conn Torn => children terminal).
-		//
-		// 1. Stop the remaining input goroutines and join them (this joins the VAD
-		//    goroutine, M2, via the StopVAD above + wg).
-		if s.decodeDone != nil {
-			close(s.decodeDone)
-		}
-		if s.soundWindowDone != nil {
-			close(s.soundWindowDone)
-		}
-		s.wg.Wait()
-
-		// 2. Terminate the response coordinator (M3): cancel the in-flight response
-		//    and join all response goroutines (which also closes their TTS
-		//    pipelines, M5). After this no response can start.
-		s.session.respSink.shutdown()
-
-		// 3. Terminate every conversation's compaction coordinator (M4): cancel +
-		//    join any in-flight summarize+evict so it cannot outlive the session.
-		for _, conv := range s.session.Conversations {
-			if conv.compaction != nil {
-				conv.compaction.shutdown()
-			}
-		}
-
-		sessionLock.Lock()
-		delete(sessions, s.sessionID)
-		sessionLock.Unlock()
-	}
-}
--- a/core/http/endpoints/openai/realtime_doubles_test.go
+++ b/core/http/endpoints/openai/realtime_doubles_test.go
@@ -74,16 +74,6 @@ type fakeModel struct {

 	transcribeDeltas []string
 	transcribeFinal  *schema.TranscriptionResult
-	transcribeErr    error
-
-	// TranscribeLive scripting: liveErr makes the open fail (degrade path);
-	// liveEvents are delivered to onEvent synchronously at open;
-	// liveCloseEvents are delivered during Close (the finalize flush).
-	liveErr         error
-	liveEvents      []backend.LiveTranscriptionEvent
-	liveCloseEvents []backend.LiveTranscriptionEvent
-	liveOpened      int
-	liveSession     *fakeLiveSession

 	// soundDetectionResult/soundDetectionErr drive the SoundDetection double so
 	// the sound-event path can be exercised deterministically.
@@ -107,7 +97,7 @@ func (m *fakeModel) VAD(context.Context, *schema.VADRequest) (*schema.VADRespons
 }

 func (m *fakeModel) Transcribe(context.Context, string, string, bool, bool, string) (*schema.TranscriptionResult, error) {
-	return m.transcribeFinal, m.transcribeErr
+	return m.transcribeFinal, nil
 }

 func (m *fakeModel) SoundDetection(context.Context, string, int, float32) (*schema.SoundClassificationResult, error) {
@@ -160,43 +150,4 @@ func (m *fakeModel) TranscribeStream(_ context.Context, _, _ string, _, _ bool,
 	return m.transcribeFinal, nil
 }

-func (m *fakeModel) TranscribeLive(_ context.Context, _ string, onEvent func(backend.LiveTranscriptionEvent)) (backend.LiveTranscriptionSession, error) {
-	if m.liveErr != nil {
-		return nil, m.liveErr
-	}
-	m.liveOpened++
-	for _, ev := range m.liveEvents {
-		onEvent(ev)
-	}
-	m.liveSession = &fakeLiveSession{onEvent: onEvent, closeEvents: m.liveCloseEvents}
-	return m.liveSession, nil
-}
-
 func (m *fakeModel) PredictConfig() *config.ModelConfig { return m.cfg }
-
-// fakeLiveSession records what semantic_vad fed and closed; closeEvents are
-// replayed through onEvent during Close, mimicking the backend's finalize
-// flush (trailing delta + Final) landing before Close returns.
-type fakeLiveSession struct {
-	onEvent     func(backend.LiveTranscriptionEvent)
-	closeEvents []backend.LiveTranscriptionEvent
-	fed         [][]float32
-	feedErr     error
-	closed      int
-}
-
-func (s *fakeLiveSession) Feed(pcm []float32) error {
-	if s.feedErr != nil {
-		return s.feedErr
-	}
-	s.fed = append(s.fed, append([]float32(nil), pcm...))
-	return nil
-}
-
-func (s *fakeLiveSession) Close() error {
-	s.closed++
-	for _, ev := range s.closeEvents {
-		s.onEvent(ev)
-	}
-	return nil
-}
--- a/core/http/endpoints/openai/realtime_model.go
+++ b/core/http/endpoints/openai/realtime_model.go
@@ -102,10 +102,6 @@ func (m *transcriptOnlyModel) TranscribeStream(ctx context.Context, audio, langu
 	return transcribeStream(ctx, m.modelLoader, *m.TranscriptionConfig, m.appConfig, audio, language, translate, diarize, prompt, onDelta)
 }

-func (m *transcriptOnlyModel) TranscribeLive(ctx context.Context, language string, onEvent func(backend.LiveTranscriptionEvent)) (backend.LiveTranscriptionSession, error) {
-	return backend.ModelTranscriptionLive(ctx, language, m.modelLoader, *m.TranscriptionConfig, m.appConfig, onEvent)
-}
-
 func (m *transcriptOnlyModel) PredictConfig() *config.ModelConfig {
 	return nil
 }
@@ -352,10 +348,6 @@ func (m *wrappedModel) TranscribeStream(ctx context.Context, audio, language str
 	return transcribeStream(ctx, m.modelLoader, *m.TranscriptionConfig, m.appConfig, audio, language, translate, diarize, prompt, onDelta)
 }

-func (m *wrappedModel) TranscribeLive(ctx context.Context, language string, onEvent func(backend.LiveTranscriptionEvent)) (backend.LiveTranscriptionSession, error) {
-	return backend.ModelTranscriptionLive(ctx, language, m.modelLoader, *m.TranscriptionConfig, m.appConfig, onEvent)
-}
-
 func (m *wrappedModel) PredictConfig() *config.ModelConfig {
 	return m.LLMConfig
 }
--- a/core/http/endpoints/openai/realtime_respcoord.go
+++ b/core/http/endpoints/openai/realtime_respcoord.go
@@ -1,143 +0,0 @@
-package openai
-
-import (
-	"context"
-	"sync"
-	"sync/atomic"
-
-	"github.com/mudler/LocalAI/core/http/endpoints/openai/respcoord"
-	"github.com/mudler/xlog"
-)
-
-// responseSink wires the explicit response-coordination state machine
-// (respcoord.Coordinator — machine "M3" in docs/design/realtime-state-machines.md)
-// into a realtime session.
-//
-// It replaces the legacy startResponse/cancelActiveResponse pair, whose
-// activeResponse* fields were written from two goroutines (the client read-loop
-// and the VAD goroutine) with the <-done wait performed outside the lock — the
-// dual-writer race documented in Part 2 (failure mode 2). The coordinator
-// serializes every start/cancel/finish decision behind one lock and guarantees
-// at most one live response, so the two callers can no longer interleave into
-// two overlapping responses.
-//
-// Each response runs as a goroutine spawned here. The effects map as:
-//   - StartResponse:  spawn the registered body with a fresh cancelable context.
-//   - CancelResponse: cancel that context (cooperative — the body stops at its
-//     next ctx checkpoint and emits its own response.done{cancelled}).
-//   - EmitTerminal:   currently a no-op. response.done is still emitted by the
-//     response body itself; making this the single authoritative terminal (one
-//     response.done per response.create, with Output+Usage populated) is the
-//     next step and does not change the coordination guarantees here.
-type responseSink struct {
-	mu      sync.Mutex
-	coord   *respcoord.Coordinator
-	cancels map[respcoord.ResponseID]context.CancelFunc
-	bodies  map[respcoord.ResponseID]responseBody
-	seq     atomic.Uint64
-	wg      sync.WaitGroup
-}
-
-type responseBody struct {
-	parent context.Context
-	run    func(ctx context.Context)
-}
-
-func newResponseSink() *responseSink {
-	s := &responseSink{
-		cancels: map[respcoord.ResponseID]context.CancelFunc{},
-		bodies:  map[respcoord.ResponseID]responseBody{},
-	}
-	s.coord = respcoord.New(s)
-	return s
-}
-
-// issue registers a response body and asks the coordinator to start it. Any
-// in-flight response is superseded (cancelled, with its own terminal) first,
-// atomically inside the coordinator — no caller-side locking, no dual-writer
-// race. Non-blocking: the superseded response drains concurrently and its later
-// Finished is ignored as stale.
-func (s *responseSink) issue(parent context.Context, source respcoord.Source, run func(ctx context.Context)) {
-	id := respcoord.ResponseID(s.seq.Add(1))
-	s.mu.Lock()
-	s.bodies[id] = responseBody{parent: parent, run: run}
-	s.mu.Unlock()
-	if err := s.coord.Apply(respcoord.Start{ID: id, Source: source}); err != nil {
-		xlog.Error("respcoord: start failed", "error", err)
-	}
-}
-
-// cancel cancels the in-flight response, if any. Non-blocking (barge-in must not
-// stall the VAD tick).
-func (s *responseSink) cancel(source respcoord.Source) {
-	if err := s.coord.Apply(respcoord.Cancel{Source: source}); err != nil {
-		xlog.Error("respcoord: cancel failed", "error", err)
-	}
-}
-
-// wait blocks until every response goroutine (the active one plus any draining
-// superseded ones) has exited. Used at teardown so the session is never deleted
-// out from under a running response.
-func (s *responseSink) wait() {
-	s.wg.Wait()
-}
-
-// shutdown terminates the coordinator (cancelling any in-flight response) and
-// then joins all response goroutines. After this the coordinator is in its
-// absorbing Terminated state, so no further response can be issued — the
-// connection (M1) parent's teardown uses this to guarantee no response outlives
-// the session (see formal-verification/session_lifecycle.fizz).
-func (s *responseSink) shutdown() {
-	if err := s.coord.Apply(respcoord.Shutdown{}); err != nil {
-		xlog.Error("respcoord: shutdown failed", "error", err)
-	}
-	s.wait()
-}
-
-// Perform executes one effect. It is called by Coordinator.Apply while the
-// coordinator lock is held, so it must not block. It briefly takes s.mu but
-// never acquires the coordinator lock while holding s.mu; the spawned
-// goroutine's Finished apply takes the coordinator lock only AFTER releasing
-// s.mu, so there is no lock cycle.
-func (s *responseSink) Perform(e respcoord.Effect) {
-	switch eff := e.(type) {
-	case respcoord.StartResponse:
-		s.mu.Lock()
-		body := s.bodies[eff.ID]
-		delete(s.bodies, eff.ID)
-		parent := body.parent
-		if parent == nil {
-			parent = context.Background()
-		}
-		ctx, cancel := context.WithCancel(parent)
-		s.cancels[eff.ID] = cancel
-		s.mu.Unlock()
-
-		s.wg.Go(func() {
-			defer func() {
-				s.mu.Lock()
-				delete(s.cancels, eff.ID)
-				s.mu.Unlock()
-				// Report completion. If this response was superseded/cancelled
-				// the id is stale and the coordinator ignores it (so the
-				// terminal is never emitted twice).
-				if err := s.coord.Apply(respcoord.Finished{ID: eff.ID}); err != nil {
-					xlog.Error("respcoord: finished apply failed", "error", err)
-				}
-			}()
-			if body.run != nil {
-				body.run(ctx)
-			}
-		})
-	case respcoord.CancelResponse:
-		s.mu.Lock()
-		cancel := s.cancels[eff.ID]
-		s.mu.Unlock()
-		if cancel != nil {
-			cancel()
-		}
-	case respcoord.EmitTerminal:
-		// No-op for now: the response body still emits its own response.done.
-		// Wiring the authoritative single terminal here is the next step.
-	}
-}
--- a/core/http/endpoints/openai/realtime_semantic_vad.go
+++ b/core/http/endpoints/openai/realtime_semantic_vad.go
@@ -1,350 +0,0 @@
-package openai
-
-import (
-	"context"
-	"strings"
-
-	"github.com/mudler/LocalAI/core/backend"
-	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
-	"github.com/mudler/LocalAI/core/schema"
-	"github.com/mudler/xlog"
-)
-
-// Semantic (EOU-driven) turn detection.
-//
-// With turn_detection.type == "semantic_vad", the transcription model is fed
-// the microphone audio live while the user speaks and its end-of-utterance
-// token turns the silence window dynamic: an immediate commit once the
-// token fires (the model judged the user finished and expects a reply), the
-// much longer eagerness fallback when it does not (mid-thought pause). The
-// silero VAD stays in charge of speech_started/barge-in and the actual
-// silence measurement, so a spurious EOU mid-speech cannot cut the user off
-// — the commit still requires real silence.
-
-const (
-	// semanticEouSilenceSec is the extra silence required to commit once the
-	// end-of-utterance token has fired. Zero: the token already trails the
-	// audio by the encoder chunk schedule plus a VAD tick (~0.3-0.9s), and
-	// the commit check only runs after silero closes the speech segment —
-	// which itself takes real silence — so any window on top is pure added
-	// response delay.
-	semanticEouSilenceSec = 0.0
-
-	// liveEventsBuffer sizes the recv-callback → VAD-tick handoff channel.
-	// Events arrive at a few per second and the ticker drains every 300ms;
-	// a full channel means the loop is wedged, and dropping (with a warning)
-	// beats blocking the backend's recv goroutine.
-	liveEventsBuffer = 64
-)
-
-// eagernessMaxSilenceSec maps the OpenAI semantic_vad eagerness to the
-// fallback silence window used when no end-of-utterance token was seen:
-// low waits longest, high responds fastest, auto/empty equals medium —
-// the same 8s/4s/2s max timeouts OpenAI documents.
-func eagernessMaxSilenceSec(eagerness string) float64 {
-	switch strings.ToLower(strings.TrimSpace(eagerness)) {
-	case "low":
-		return 8
-	case "high":
-		return 2
-	default: // "medium", "auto", ""
-		return 4
-	}
-}
-
-// liveUtterance is one committed turn's transcript as produced by the live
-// stream. Its delta events were already streamed to the client as they
-// arrived (keyed by the turn's item id), so only the final text travels here.
-type liveUtterance struct {
-	Text string
-}
-
-// liveTurnState is handleVAD's per-session live-ASR companion for
-// semantic_vad. One live stream is opened per user turn (begun when the VAD
-// first reports speech, finalized at commit) — the underlying decode session
-// grows with fed audio, so per-turn streams keep it bounded. All fields are
-// owned by the handleVAD goroutine; the backend's recv callback only writes
-// into the buffered events channel.
-type liveTurnState struct {
-	session   *Session
-	transport Transport // live caption deltas are sent here as they drain
-	events    chan backend.LiveTranscriptionEvent
-
-	live        backend.LiveTranscriptionSession // nil between turns
-	unavailable bool                             // sticky: backend can't do live ASR, degrade for the session
-
-	fed16k int // 16k samples of the current buffer already fed
-	// eouAtSec is the audio time of the most recent EOU this turn (0 = none).
-	// It is a recorded fact: set when an EOU drains and never toggled off
-	// mid-turn. Whether it still governs the trailing silence is derived
-	// purely by eouPending() from this plus the live VAD segments.
-	eouAtSec   float64
-	parts      []string // deltas accumulated for the current turn
-	finalText  string   // authoritative full-turn text from the Final event
-	itemID     string   // the turn's conversation item id, allocated at openTurn
-	deltasSent bool     // at least one caption delta reached the client this turn
-}
-
-func newLiveTurnState(session *Session, transport Transport) *liveTurnState {
-	return &liveTurnState{
-		session:   session,
-		transport: transport,
-		events:    make(chan backend.LiveTranscriptionEvent, liveEventsBuffer),
-	}
-}
-
-func (l *liveTurnState) open() bool { return l.live != nil }
-
-// openTurn starts the turn's live stream under the caller-supplied item id. A
-// failure (most commonly the backend's typed "live transcription unsupported"
-// signal) degrades the whole session to silence-only detection — warned once,
-// then sticky.
-//
-// The item id is supplied by the turn coordinator (turncoord) rather than minted
-// here: it is allocated when the turn STARTS so caption deltas can stream to the
-// client while the user is still speaking, and the committed event and final
-// transcript reuse it (replacing the partial text). The coordinator carries the
-// same id on its CommitTurn/DiscardTurn effects, so the committed event always
-// matches the captions.
-func (l *liveTurnState) openTurn(ctx context.Context, itemID string) bool {
-	if l.live != nil {
-		return true
-	}
-	if l.unavailable {
-		return false
-	}
-	language := ""
-	if l.session.InputAudioTranscription != nil {
-		language = l.session.InputAudioTranscription.Language
-	}
-	live, err := l.session.ModelInterface.TranscribeLive(ctx, language, func(ev backend.LiveTranscriptionEvent) {
-		select {
-		case l.events <- ev:
-		default:
-			xlog.Warn("semantic_vad: live transcription event dropped (event channel full)")
-		}
-	})
-	if err != nil {
-		l.unavailable = true
-		xlog.Warn("semantic_vad: live transcription unavailable; degrading to silence-only turn detection",
-			"error", err)
-		return false
-	}
-	l.resetTurn()
-	l.live = live
-	l.itemID = itemID
-	return true
-}
-
-// feedNewAudio pushes the not-yet-fed tail of the resampled buffer to the
-// live stream. The final sample is held back: ResampleInt16 is prefix-stable
-// except for its last output sample, so excluding it keeps successive
-// whole-buffer resamples bit-identical over the fed range.
-func (l *liveTurnState) feedNewAudio(aints16k []int16) {
-	if l.live == nil {
-		return
-	}
-	end := len(aints16k) - 1
-	if end <= l.fed16k {
-		return
-	}
-	if err := l.live.Feed(int16sToFloat32(aints16k[l.fed16k:end])); err != nil {
-		xlog.Warn("semantic_vad: live feed failed; degrading to silence-only turn detection", "error", err)
-		l.discardTurn()
-		l.unavailable = true
-		return
-	}
-	l.fed16k = end
-}
-
-// drainEvents folds everything the live stream produced since the last tick
-// into the turn state. audioSec (the current buffer length in seconds) marks
-// WHEN an EOU was observed, so later VAD segments can distinguish speech
-// that resumed after it.
-func (l *liveTurnState) drainEvents(audioSec float64) {
-	for {
-		select {
-		case ev := <-l.events:
-			if ev.Delta != "" {
-				l.parts = append(l.parts, ev.Delta)
-				// Live captions: forward the delta immediately under the
-				// turn's item id — the browser shows text while the user
-				// is still speaking; the completed event at commit
-				// replaces it with the authoritative transcript.
-				if l.transport != nil && l.itemID != "" {
-					sendEvent(l.transport, types.ConversationItemInputAudioTranscriptionDeltaEvent{
-						ServerEventBase: types.ServerEventBase{EventID: "event_TODO"},
-						ItemID:          l.itemID,
-						ContentIndex:    0,
-						Delta:           ev.Delta,
-					})
-					l.deltasSent = true
-				}
-			}
-			if ev.Eou {
-				// Record the position; do not flip a flag. Whether this EOU
-				// still applies to the trailing silence is decided later by
-				// eouPending(), purely from this and the live VAD segments.
-				l.eouAtSec = audioSec
-				xlog.Debug("semantic_vad: EOU token observed", "audio_s", audioSec)
-			}
-			if ev.Eob {
-				// A backchannel ended ("uh-huh") — the user is still
-				// listening, not yielding the turn. Deliberately NOT a
-				// commit trigger.
-				xlog.Debug("semantic_vad: EOB (backchannel) observed", "audio_s", audioSec)
-			}
-			if ev.Final != nil && strings.TrimSpace(ev.Final.Text) != "" {
-				l.finalText = ev.Final.Text
-			}
-		default:
-			return
-		}
-	}
-}
-
-// eouPending reports whether the recorded EOU still applies to the current
-// trailing silence. It is a pure function of the recorded EOU position and the
-// VAD's live view — there is no stored boolean that can fall out of sync.
-//
-// An EOU stops applying only once the user has STARTED a new utterance after
-// it (a segment whose start is past the EOU): that is genuine resumed speech,
-// so the earlier yield no longer holds. An in-progress segment whose speech
-// began BEFORE the EOU is NOT resumed speech — it is just silero still padding
-// before it closes the segment, which is the normal state at the instant the
-// (predictive) EOU fires. Treating that as resumed speech was the bug that
-// cleared the flag on the very tick the token arrived, dropping almost every
-// EOU to the eagerness timeout.
-func (l *liveTurnState) eouPending(segments []schema.VADSegment) bool {
-	if l.eouAtSec == 0 || len(segments) == 0 {
-		return false
-	}
-	last := segments[len(segments)-1]
-	return float64(last.Start) <= l.eouAtSec
-}
-
-// thresholdSec is the dynamic commit threshold: zero once the model said
-// the utterance is over (any VAD-confirmed silence commits), the eagerness
-// fallback otherwise.
-func (l *liveTurnState) thresholdSec(eouPending bool, sv *types.RealtimeSessionSemanticVad) float64 {
-	if eouPending {
-		return semanticEouSilenceSec
-	}
-	return eagernessMaxSilenceSec(sv.Eagerness)
-}
-
-// commitTrigger describes how a commit decision was reached, for the per-turn
-// timing log: "eou" with the token's lag behind the VAD's speech end, or
-// "timeout" when the eagerness fallback elapsed without one. The lag is the
-// number the user needs to tell a slow EOU emission apart from loop overhead.
-func (l *liveTurnState) commitTrigger(eouPending bool, speechEndSec float64) (trigger string, eouLagSec float64) {
-	if !eouPending {
-		return "timeout", 0
-	}
-	return "eou", l.eouAtSec - speechEndSec
-}
-
-// finishTurn finalizes the live stream (flushing the decode tail — the last
-// ~2 encoder frames of text only appear here), folds the terminal events in,
-// and returns the turn's transcript. Returns nil when the stream never
-// produced text (the VAD triggered on something the model heard nothing in).
-func (l *liveTurnState) finishTurn(audioSec float64) *liveUtterance {
-	if l.live == nil {
-		return nil
-	}
-	if err := l.live.Close(); err != nil {
-		xlog.Warn("semantic_vad: live transcription finalize failed", "error", err)
-	}
-	l.live = nil
-	l.drainEvents(audioSec)
-
-	text := strings.TrimSpace(l.finalText)
-	if text == "" {
-		text = l.previewText()
-	}
-	ut := &liveUtterance{Text: text}
-	l.resetTurn()
-	if ut.Text == "" {
-		return nil
-	}
-	return ut
-}
-
-// discardTurn drops the current turn (no-speech buffer clear, feed failure,
-// session teardown): the stream is closed and its transcript thrown away.
-// Any caption deltas already shown for it are retracted via the failed
-// event, so the client doesn't keep a stuck partial entry.
-func (l *liveTurnState) discardTurn() {
-	if l.live != nil {
-		_ = l.live.Close()
-		l.live = nil
-	}
-	l.drainEvents(0)
-	if l.deltasSent && l.transport != nil && l.itemID != "" {
-		sendEvent(l.transport, types.ConversationItemInputAudioTranscriptionFailedEvent{
-			ServerEventBase: types.ServerEventBase{EventID: "event_TODO"},
-			ItemID:          l.itemID,
-			ContentIndex:    0,
-			Error: types.Error{
-				Type:    "transcription_discarded",
-				Message: "turn discarded before commit",
-			},
-		})
-	}
-	l.resetTurn()
-}
-
-func (l *liveTurnState) resetTurn() {
-	l.fed16k = 0
-	l.eouAtSec = 0
-	l.parts = nil
-	l.finalText = ""
-	l.itemID = ""
-	l.deltasSent = false
-}
-
-// previewText is the turn's transcript so far (for the retranscribe
-// comparison log and as the fallback when no Final event arrived).
-func (l *liveTurnState) previewText() string {
-	return strings.TrimSpace(strings.Join(l.parts, ""))
-}
-
-// int16sToFloat32 converts PCM to the [-1,1] float form the live stream
-// feeds the model (the same scaling runVAD's go-audio conversion applies).
-func int16sToFloat32(samples []int16) []float32 {
-	out := make([]float32, len(samples))
-	for i, s := range samples {
-		out[i] = float32(s) / 32768.0
-	}
-	return out
-}
-
-// turnDetectionActive reports whether the session has any automatic turn
-// detection (server or semantic VAD) that should run the handleVAD loop.
-func turnDetectionActive(td *types.TurnDetectionUnion) bool {
-	return td != nil && (td.ServerVad != nil || td.SemanticVad != nil)
-}
-
-// defaultTurnDetection seeds a new session's turn detection from the
-// pipeline's server-side default: semantic_vad pipelines start sessions in
-// semantic mode (clients can still override via session.update); everything
-// else keeps the historical server_vad defaults.
-func defaultTurnDetection(cfg *config.ModelConfig) *types.TurnDetectionUnion {
-	if cfg != nil && cfg.Pipeline.TurnDetectionSemantic() {
-		return &types.TurnDetectionUnion{
-			SemanticVad: &types.RealtimeSessionSemanticVad{
-				CreateResponse: true,
-				Eagerness:      cfg.Pipeline.TurnDetection.Eagerness,
-			},
-		}
-	}
-	return &types.TurnDetectionUnion{
-		ServerVad: &types.ServerVad{
-			Threshold:         0.5,
-			PrefixPaddingMs:   300,
-			SilenceDurationMs: 500,
-			CreateResponse:    true,
-		},
-	}
-}
--- a/core/http/endpoints/openai/realtime_semantic_vad_test.go
+++ b/core/http/endpoints/openai/realtime_semantic_vad_test.go
@@ -1,414 +0,0 @@
-package openai
-
-import (
-	"context"
-	"errors"
-
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-
-	"github.com/mudler/LocalAI/core/backend"
-	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
-	"github.com/mudler/LocalAI/core/schema"
-)
-
-var _ = Describe("eagernessMaxSilenceSec", func() {
-	DescribeTable("maps eagerness to the no-EOU fallback window",
-		func(eagerness string, want float64) {
-			Expect(eagernessMaxSilenceSec(eagerness)).To(Equal(want))
-		},
-		Entry("low", "low", 8.0),
-		Entry("medium", "medium", 4.0),
-		Entry("high", "high", 2.0),
-		Entry("auto equals medium", "auto", 4.0),
-		Entry("empty equals medium", "", 4.0),
-		Entry("case and space insensitive", " High ", 2.0),
-		Entry("unknown equals medium", "frantic", 4.0),
-	)
-})
-
-var _ = Describe("turnDetectionActive", func() {
-	It("is active for server and semantic VAD, inactive otherwise", func() {
-		Expect(turnDetectionActive(nil)).To(BeFalse())
-		Expect(turnDetectionActive(&types.TurnDetectionUnion{})).To(BeFalse())
-		Expect(turnDetectionActive(&types.TurnDetectionUnion{ServerVad: &types.ServerVad{}})).To(BeTrue())
-		Expect(turnDetectionActive(&types.TurnDetectionUnion{SemanticVad: &types.RealtimeSessionSemanticVad{}})).To(BeTrue())
-	})
-})
-
-var _ = Describe("defaultTurnDetection", func() {
-	It("keeps the historical server_vad defaults for non-semantic pipelines", func() {
-		td := defaultTurnDetection(&config.ModelConfig{})
-		Expect(td.ServerVad).NotTo(BeNil())
-		Expect(td.SemanticVad).To(BeNil())
-		Expect(td.ServerVad.SilenceDurationMs).To(Equal(int64(500)))
-		Expect(td.ServerVad.CreateResponse).To(BeTrue())
-	})
-
-	It("seeds semantic_vad with the pipeline's eagerness", func() {
-		cfg := &config.ModelConfig{}
-		cfg.Pipeline.TurnDetection.Type = "semantic_vad"
-		cfg.Pipeline.TurnDetection.Eagerness = "high"
-		td := defaultTurnDetection(cfg)
-		Expect(td.SemanticVad).NotTo(BeNil())
-		Expect(td.ServerVad).To(BeNil())
-		Expect(td.SemanticVad.Eagerness).To(Equal("high"))
-		Expect(td.SemanticVad.CreateResponse).To(BeTrue())
-	})
-
-	It("treats a nil config as server_vad", func() {
-		Expect(defaultTurnDetection(nil).ServerVad).NotTo(BeNil())
-	})
-})
-
-var _ = Describe("int16sToFloat32", func() {
-	It("scales like the VAD conversion", func() {
-		out := int16sToFloat32([]int16{0, 16384, -32768})
-		Expect(out).To(HaveLen(3))
-		Expect(out[0]).To(BeNumerically("~", 0.0, 1e-6))
-		Expect(out[1]).To(BeNumerically("~", 0.5, 1e-6))
-		Expect(out[2]).To(BeNumerically("~", -1.0, 1e-6))
-	})
-})
-
-var _ = Describe("liveTurnState", func() {
-	var (
-		m   *fakeModel
-		lts *liveTurnState
-		ftr *fakeTransport
-	)
-
-	newSemanticSession := func(m *fakeModel) *Session {
-		return &Session{
-			InputAudioTranscription: &types.AudioTranscription{},
-			ModelInterface:          m,
-		}
-	}
-
-	BeforeEach(func() {
-		m = &fakeModel{}
-		ftr = &fakeTransport{}
-		lts = newLiveTurnState(newSemanticSession(m), ftr)
-	})
-
-	Describe("openTurn", func() {
-		It("opens once per turn and reports open()", func() {
-			Expect(lts.open()).To(BeFalse())
-			Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
-			Expect(lts.open()).To(BeTrue())
-			Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue(), "idempotent while open")
-			Expect(m.liveOpened).To(Equal(1))
-		})
-
-		It("degrades stickily when the backend cannot do live transcription", func() {
-			m.liveErr = errors.New("rpc error: code = Unimplemented desc = live transcription unsupported")
-			Expect(lts.openTurn(context.Background(), "item1")).To(BeFalse())
-			Expect(lts.unavailable).To(BeTrue())
-
-			// Later turns never retry: the failure is per-session sticky.
-			m.liveErr = nil
-			Expect(lts.openTurn(context.Background(), "item1")).To(BeFalse())
-			Expect(m.liveOpened).To(Equal(0))
-		})
-	})
-
-	Describe("feedNewAudio", func() {
-		It("feeds only the unfed tail and holds back the final resampled sample", func() {
-			Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
-
-			lts.feedNewAudio([]int16{1, 2, 3, 4})
-			Expect(m.liveSession.fed).To(HaveLen(1))
-			Expect(m.liveSession.fed[0]).To(HaveLen(3), "last sample held back")
-
-			// Same buffer grown by two samples: only the delta is fed.
-			lts.feedNewAudio([]int16{1, 2, 3, 4, 5, 6})
-			Expect(m.liveSession.fed).To(HaveLen(2))
-			Expect(m.liveSession.fed[1]).To(HaveLen(2))
-
-			// No growth past the holdback: nothing fed.
-			lts.feedNewAudio([]int16{1, 2, 3, 4, 5, 6})
-			Expect(m.liveSession.fed).To(HaveLen(2))
-		})
-
-		It("degrades and closes the turn when a feed fails", func() {
-			Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
-			m.liveSession.feedErr = errors.New("backend gone")
-			sess := m.liveSession
-
-			lts.feedNewAudio([]int16{1, 2, 3, 4})
-
-			Expect(lts.open()).To(BeFalse())
-			Expect(lts.unavailable).To(BeTrue())
-			Expect(sess.closed).To(Equal(1))
-		})
-	})
-
-	Describe("event handling and the dynamic threshold", func() {
-		sv := &types.RealtimeSessionSemanticVad{Eagerness: "high"}
-
-		It("uses the eagerness fallback until an EOU is recorded, then commits without an extra window", func() {
-			Expect(lts.thresholdSec(false, sv)).To(Equal(2.0))
-			Expect(lts.thresholdSec(true, sv)).To(Equal(semanticEouSilenceSec))
-
-			Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
-			lts.session.ModelInterface.(*fakeModel).liveSession.onEvent(backend.LiveTranscriptionEvent{Delta: "hello ", Eou: false})
-			lts.session.ModelInterface.(*fakeModel).liveSession.onEvent(backend.LiveTranscriptionEvent{Eou: true})
-			lts.drainEvents(3.3)
-
-			Expect(lts.eouAtSec).To(BeNumerically("~", 3.3, 1e-9))
-			Expect(lts.previewText()).To(Equal("hello"))
-		})
-
-		// The bug this replaces: the (predictive) EOU routinely arrives while
-		// silero is still padding the speech segment open. eouPending must NOT
-		// read that as resumed speech.
-		It("keeps the EOU pending while silero is still closing the same segment", func() {
-			lts.eouAtSec = 3.3
-			Expect(lts.eouPending([]schema.VADSegment{{Start: 0, End: 0}})).To(BeTrue(), "segment began before the EOU and is merely unclosed")
-			Expect(lts.eouPending([]schema.VADSegment{{Start: 0, End: 3.0}})).To(BeTrue(), "and still pending once it closes")
-		})
-
-		It("drops the EOU only when a new utterance starts after it (resumed speech)", func() {
-			lts.eouAtSec = 3.3
-			Expect(lts.eouPending([]schema.VADSegment{{Start: 0, End: 3.0}, {Start: 4.0, End: 0}})).To(BeFalse())
-			Expect(lts.eouPending([]schema.VADSegment{{Start: 0, End: 3.0}, {Start: 4.0, End: 5.0}})).To(BeFalse())
-		})
-
-		It("has no pending EOU before one is recorded", func() {
-			Expect(lts.eouPending([]schema.VADSegment{{Start: 0, End: 3.0}})).To(BeFalse())
-			Expect(lts.eouPending(nil)).To(BeFalse())
-		})
-
-		It("does not arm the commit threshold on an EOB backchannel", func() {
-			Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
-			lts.session.ModelInterface.(*fakeModel).liveSession.onEvent(backend.LiveTranscriptionEvent{Delta: "uh-huh", Eob: true})
-			lts.drainEvents(2.0)
-
-			Expect(lts.eouAtSec).To(BeZero(), "a backchannel is not the user yielding the turn")
-			Expect(lts.eouPending([]schema.VADSegment{{Start: 0, End: 1.8}})).To(BeFalse(), "still on the eagerness fallback")
-			Expect(lts.previewText()).To(Equal("uh-huh"), "the backchannel text still lands in the transcript")
-		})
-
-		It("reports the commit trigger and the EOU token's lag behind speech end", func() {
-			trigger, lag := lts.commitTrigger(false, 3.2)
-			Expect(trigger).To(Equal("timeout"))
-			Expect(lag).To(BeZero())
-
-			lts.eouAtSec = 3.5
-			trigger, lag = lts.commitTrigger(true, 3.2)
-			Expect(trigger).To(Equal("eou"))
-			Expect(lag).To(BeNumerically("~", 0.3, 1e-9))
-		})
-	})
-
-	Describe("finishTurn", func() {
-		It("finalizes the stream, prefers the Final text, and resets for the next turn", func() {
-			m.liveCloseEvents = []backend.LiveTranscriptionEvent{
-				{Delta: " world"},
-				{Final: &schema.TranscriptionResult{Text: "hello world", Eou: true}},
-			}
-			Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
-			sess := m.liveSession
-			sess.onEvent(backend.LiveTranscriptionEvent{Delta: "hello", Eou: true})
-			lts.drainEvents(2.0)
-
-			ut := lts.finishTurn(2.5)
-
-			Expect(sess.closed).To(Equal(1))
-			Expect(ut).NotTo(BeNil())
-			Expect(ut.Text).To(Equal("hello world"), "Final event text wins over joined deltas")
-			Expect(lts.open()).To(BeFalse())
-			Expect(lts.eouAtSec).To(BeZero())
-			Expect(lts.parts).To(BeEmpty())
-			Expect(lts.fed16k).To(BeZero())
-		})
-
-		It("returns nil when the stream heard nothing", func() {
-			Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
-			Expect(lts.finishTurn(1.0)).To(BeNil())
-			Expect(m.liveSession.closed).To(Equal(1))
-		})
-
-		It("is a no-op without an open stream", func() {
-			Expect(lts.finishTurn(1.0)).To(BeNil())
-		})
-	})
-
-	Describe("discardTurn", func() {
-		It("closes the stream, drops the transcript and retracts streamed captions", func() {
-			Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
-			sess := m.liveSession
-			sess.onEvent(backend.LiveTranscriptionEvent{Delta: "noise"})
-			lts.drainEvents(1.0)
-
-			lts.discardTurn()
-
-			Expect(sess.closed).To(Equal(1))
-			Expect(lts.open()).To(BeFalse())
-			Expect(lts.parts).To(BeEmpty())
-			Expect(ftr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionFailed)).To(Equal(1),
-				"the client saw caption deltas for this turn — it must be told to drop them")
-		})
-
-		It("sends no failed event when no captions ever reached the client", func() {
-			Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
-			lts.discardTurn()
-			Expect(ftr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionFailed)).To(Equal(0))
-		})
-	})
-
-	Describe("live captions", func() {
-		It("streams each delta to the client under the turn's item id as it drains", func() {
-			Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
-			turnID := lts.itemID
-			Expect(turnID).NotTo(BeEmpty(), "the item id exists from turn open so captions can reference it")
-
-			m.liveSession.onEvent(backend.LiveTranscriptionEvent{Delta: "hel"})
-			m.liveSession.onEvent(backend.LiveTranscriptionEvent{Delta: "lo"})
-			lts.drainEvents(1.0)
-
-			var got []types.ConversationItemInputAudioTranscriptionDeltaEvent
-			for _, e := range ftr.events {
-				if d, ok := e.(types.ConversationItemInputAudioTranscriptionDeltaEvent); ok {
-					got = append(got, d)
-				}
-			}
-			Expect(got).To(HaveLen(2))
-			Expect(got[0].Delta).To(Equal("hel"))
-			Expect(got[1].Delta).To(Equal("lo"))
-			Expect(got[0].ItemID).To(Equal(turnID))
-			Expect(got[1].ItemID).To(Equal(turnID))
-			Expect(lts.deltasSent).To(BeTrue())
-		})
-
-		It("finishTurn does not retract captions — the commit's completed event supersedes them", func() {
-			Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
-			m.liveSession.onEvent(backend.LiveTranscriptionEvent{Delta: "hello"})
-			lts.drainEvents(1.0)
-
-			Expect(lts.finishTurn(1.5)).NotTo(BeNil())
-			Expect(ftr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionFailed)).To(Equal(0))
-		})
-	})
-})
-
-// commitUtteranceWithTranscript routes the three transcript sources: the
-// retranscribe gate's batch decode, the live stream's accumulated text, and
-// the historical file path.
-var _ = Describe("commitUtteranceWithTranscript", func() {
-	newTranscriptionOnlySession := func(m *fakeModel, streamTranscription bool) *Session {
-		cfg := &config.ModelConfig{}
-		if streamTranscription {
-			on := true
-			cfg.Pipeline.Streaming.Transcription = &on
-		}
-		return &Session{
-			TranscriptionOnly:       true, // stop after the transcript: no LLM/TTS in these specs
-			InputAudioTranscription: &types.AudioTranscription{},
-			ModelConfig:             cfg,
-			ModelInterface:          m,
-		}
-	}
-
-	It("uses the gate's batch transcript and never re-runs the backend", func() {
-		m := &fakeModel{transcribeErr: errors.New("must not be called")}
-		session := newTranscriptionOnlySession(m, true)
-		tr := &fakeTransport{}
-
-		commitUtteranceWithTranscript(context.Background(), []byte{1, 2}, nil,
-			&schema.TranscriptionResult{Text: "batch text", Eou: true}, "item_turn", session, &Conversation{}, tr)
-
-		Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionDelta)).To(Equal(0))
-		Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionCompleted)).To(Equal(1))
-	})
-
-	It("emits only the completed event for a live transcript — captions already streamed during the turn", func() {
-		m := &fakeModel{transcribeErr: errors.New("must not be called")}
-		session := newTranscriptionOnlySession(m, true)
-		tr := &fakeTransport{}
-
-		commitUtteranceWithTranscript(context.Background(), []byte{1, 2},
-			&liveUtterance{Text: "hello"}, nil, "item_turn", session, &Conversation{}, tr)
-
-		Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionDelta)).To(Equal(0))
-		Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionCompleted)).To(Equal(1))
-
-		var completed types.ConversationItemInputAudioTranscriptionCompletedEvent
-		for _, e := range tr.events {
-			if c, ok := e.(types.ConversationItemInputAudioTranscriptionCompletedEvent); ok {
-				completed = c
-			}
-		}
-		Expect(completed.ItemID).To(Equal("item_turn"),
-			"completed must reuse the caption deltas' item id so the client replaces, not duplicates")
-		Expect(completed.Transcript).To(Equal("hello"))
-	})
-
-	It("falls back to the file path when the live stream heard nothing", func() {
-		m := &fakeModel{transcribeFinal: &schema.TranscriptionResult{Text: "from file"}}
-		session := newTranscriptionOnlySession(m, false)
-		tr := &fakeTransport{}
-
-		commitUtteranceWithTranscript(context.Background(), []byte{1, 2},
-			&liveUtterance{}, nil, "", session, &Conversation{}, tr)
-
-		Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionCompleted)).To(Equal(1))
-	})
-})
-
-// transcribeUtterance is the retranscribe gate's offline decode of the
-// buffered turn.
-var _ = Describe("transcribeUtterance", func() {
-	It("returns the batch decode with its Eou flag", func() {
-		m := &fakeModel{transcribeFinal: &schema.TranscriptionResult{Text: "confirmed", Eou: true}}
-		session := &Session{
-			InputAudioTranscription: &types.AudioTranscription{},
-			ModelInterface:          m,
-		}
-
-		tr, err := transcribeUtterance(context.Background(), []byte{0, 0, 1, 1}, session)
-		Expect(err).ToNot(HaveOccurred())
-		Expect(tr.Text).To(Equal("confirmed"))
-		Expect(tr.Eou).To(BeTrue())
-	})
-
-	It("propagates backend errors", func() {
-		m := &fakeModel{transcribeErr: errors.New("engine fell over")}
-		session := &Session{
-			InputAudioTranscription: &types.AudioTranscription{},
-			ModelInterface:          m,
-		}
-
-		_, err := transcribeUtterance(context.Background(), []byte{0, 0}, session)
-		Expect(err).To(MatchError(ContainSubstring("engine fell over")))
-	})
-})
-
-// emitPrecomputedTranscription replays an already-produced transcript as the
-// standard delta/completed event sequence.
-var _ = Describe("emitPrecomputedTranscription", func() {
-	It("emits deltas then completed, sharing the item id", func() {
-		tr := &fakeTransport{}
-		Expect(emitPrecomputedTranscription(tr, "item42", []string{"a", "", "b"}, "ab")).To(Succeed())
-
-		Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionDelta)).To(Equal(2), "empty deltas skipped")
-		Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionCompleted)).To(Equal(1))
-		for _, e := range tr.events {
-			switch ev := e.(type) {
-			case types.ConversationItemInputAudioTranscriptionDeltaEvent:
-				Expect(ev.ItemID).To(Equal("item42"))
-			case types.ConversationItemInputAudioTranscriptionCompletedEvent:
-				Expect(ev.ItemID).To(Equal("item42"))
-				Expect(ev.Transcript).To(Equal("ab"))
-			}
-		}
-	})
-
-	It("emits only the completed event with no deltas", func() {
-		tr := &fakeTransport{}
-		Expect(emitPrecomputedTranscription(tr, "item1", nil, "hi")).To(Succeed())
-		Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionDelta)).To(Equal(0))
-		Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionCompleted)).To(Equal(1))
-	})
-})
--- a/core/http/endpoints/openai/realtime_stream.go
+++ b/core/http/endpoints/openai/realtime_stream.go
@@ -86,8 +86,7 @@ func (s *transcriptStreamer) content() string {
 // tool calls. It returns true when it has fully handled the response so the
 // caller can return; callers must only invoke it for an audio modality, and with
 // tools only when the model uses its tokenizer template (see triggerResponseAtTurn).
-func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation, t Transport, r *liveResponse, history schema.Messages, images []string, llmCfg *config.ModelConfig, tools []types.ToolUnion, toolChoice *types.ToolChoiceUnion, toolTurn int) bool {
-	responseID := r.id
+func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation, t Transport, responseID string, history schema.Messages, images []string, llmCfg *config.ModelConfig, tools []types.ToolUnion, toolChoice *types.ToolChoiceUnion, toolTurn int) bool {
 	itemID := generateItemID()
 	item := types.MessageItemUnion{
 		Assistant: &types.MessageItemAssistant{
@@ -122,8 +121,6 @@ func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation
 		})
 	}

-	// cancel rolls back the partial item and records the cancelled outcome; the
-	// single terminal is emitted by triggerResponse.
 	cancel := func() {
 		if announced {
 			conv.Lock.Lock()
@@ -135,7 +132,10 @@ func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation
 			}
 			conv.Lock.Unlock()
 		}
-		r.outcome = outcomeCancelled
+		sendEvent(t, types.ResponseDoneEvent{
+			ServerEventBase: types.ServerEventBase{},
+			Response:        types.Response{ID: responseID, Object: "realtime.response", Status: types.ResponseStatusCancelled},
+		})
 	}

 	var template string
@@ -161,30 +161,24 @@ func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation
 	streamer.announce = announce

 	// Clause chunking (opt-in): synthesize each clause as soon as it completes
-	// instead of buffering the whole reply. Synthesis runs on a worker goroutine
-	// (ttsPipeline) rather than inline in the token callback: emitSpeech blocks
-	// until the whole clause is synthesized (and, for WebRTC, played back at
-	// real time), and the callback runs on the goroutine that drains the LLM
-	// gRPC stream — so speaking inline stalls generation and freezes the
-	// assistant transcript at every clause boundary. The worker lets generation
-	// and the transcript stream keep flowing while audio is produced behind them.
+	// instead of buffering the whole reply. streamedAudio accumulates the PCM
+	// across clauses for the conversation item record; ttsErr captures the first
+	// synthesis failure so the token callback can stop the prediction. emitSpeech
+	// runs synchronously here — the LLM keeps generating into the gRPC stream
+	// while a clause is synthesized, so audio still starts mid-generation.
 	var chunker *clauseChunker
-	var ttsPipe *ttsPipeline
 	if session.ModelConfig != nil && session.ModelConfig.Pipeline.ChunkClauses() {
 		chunker = newClauseChunker(defaultClauseMinRunes, defaultClauseMaxRunes)
-		ttsPipe = newTTSPipeline(func(clause string) ([]byte, error) {
-			return emitSpeech(ctx, t, session, responseID, itemID, clause)
-		})
 	}
 	var streamedAudio []byte
 	var ttsErr error
-
-	// Backstop: always join the TTS worker, even on an unexpected early return.
-	// wait() is idempotent, so the explicit drain below (which captures the
-	// streamed audio and first error) stays authoritative; this only guarantees
-	// the goroutine can never leak if a new return path is added.
-	if ttsPipe != nil {
-		defer func() { _, _ = ttsPipe.wait() }()
+	speakClause := func(clause string) error {
+		a, err := emitSpeech(ctx, t, session, responseID, itemID, clause)
+		if err != nil {
+			return err
+		}
+		streamedAudio = append(streamedAudio, a...)
+		return nil
 	}

 	// fail reports a mid-stream failure. A cancelled context means the client
@@ -194,7 +188,6 @@ func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation
 			cancel()
 		} else {
 			sendError(t, code, fmt.Sprintf("%s: %v", msg, err), "", itemID)
-			r.outcome = outcomeFailed
 		}
 		return true
 	}
@@ -214,12 +207,8 @@ func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation
 		delta := streamer.onToken(text)
 		if chunker != nil && delta != "" {
 			for _, clause := range chunker.push(delta) {
-				// Hand the clause to the worker and keep going — never block the
-				// recv loop on synthesis. A false return means a prior clause
-				// already failed; stop the prediction (the error is collected
-				// from the pipeline after predFunc returns).
-				if !ttsPipe.enqueue(clause) {
-					return false
+				if ttsErr = speakClause(clause); ttsErr != nil {
+					return false // stop the prediction; reported after predFunc returns
 				}
 			}
 		}
@@ -228,27 +217,10 @@ func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation

 	predFunc, err := session.ModelInterface.Predict(ctx, history, images, nil, nil, cb, tools, toolChoice, nil, nil, nil)
 	if err != nil {
-		// The deferred wait() joins the (idle) worker.
 		sendError(t, "inference_failed", fmt.Sprintf("backend error: %v", err), "", itemID)
 		return true
 	}
 	pred, err := predFunc()
-
-	// Drain the TTS worker. On a clean finish, enqueue the trailing clause(s) the
-	// chunker was still holding; on an error or barge-in, stop synthesizing.
-	// wait() runs on every path so the worker goroutine never leaks, and it
-	// returns the audio streamed so far plus the first synthesis failure.
-	if ttsPipe != nil {
-		if err == nil && ctx.Err() == nil {
-			for _, clause := range chunker.flush() {
-				if !ttsPipe.enqueue(clause) {
-					break
-				}
-			}
-		}
-		streamedAudio, ttsErr = ttsPipe.wait()
-	}
-
 	// A clause synthesis failed mid-stream (the callback stopped the prediction);
 	// report it as a TTS error rather than a prediction error.
 	if ttsErr != nil {
@@ -261,7 +233,6 @@ func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation
 		cancel()
 		return true
 	}
-	r.addUsage(pred.Usage)

 	content := streamer.content()
 	toolCalls := functions.ToolCallsFromChatDeltas(pred.ChatDeltas)
@@ -273,19 +244,24 @@ func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation
 			announce()
 		}

-		// With clause chunking the clauses were synthesized on the worker as the
-		// reply streamed (including the trailing flush drained above), so the
-		// audio is already accumulated. Otherwise buffer the whole message and
-		// synthesize it once now — emitSpeech streams the audio chunks when the
-		// TTS backend supports TTSStream, otherwise it sends a single unary delta.
+		// Synthesize the audio. With clause chunking the completed clauses were
+		// already spoken inside the token callback; flush the trailing clause(s)
+		// the segmenter was still holding. Otherwise buffer the whole message and
+		// synthesize it once. emitSpeech streams the audio chunks when the TTS
+		// backend supports TTSStream, otherwise it sends a single unary delta.
 		var audio []byte
 		if chunker != nil {
+			for _, clause := range chunker.flush() {
+				if ttsErr = speakClause(clause); ttsErr != nil {
+					break
+				}
+			}
 			audio = streamedAudio
 		} else {
 			audio, ttsErr = emitSpeech(ctx, t, session, responseID, itemID, content)
-			if ttsErr != nil {
-				return fail("tts_error", "TTS generation failed", ttsErr)
-			}
+		}
+		if ttsErr != nil {
+			return fail("tts_error", "TTS generation failed", ttsErr)
 		}

 		_, isWebRTC := t.(*WebRTCTransport)
@@ -330,12 +306,10 @@ func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation
 			OutputIndex:     0,
 			Item:            item,
 		})
-		r.addItem(item)
 	}

-	// Emit any tool calls and (for server-side assistant tools) the follow-up
-	// turn — shared with the buffered path. The single terminal is emitted by
-	// triggerResponse.
-	emitToolCallItems(ctx, session, conv, t, r, toolCalls, content != "", toolTurn)
+	// Emit any tool calls, the terminal response.done, and (for server-side
+	// assistant tools) the follow-up turn — shared with the buffered path.
+	emitToolCallItems(ctx, session, conv, t, responseID, toolCalls, content != "", toolTurn)
 	return true
 }
--- a/core/http/endpoints/openai/realtime_stream_test.go
+++ b/core/http/endpoints/openai/realtime_stream_test.go
@@ -102,8 +102,7 @@ var _ = Describe("streamLLMResponse", func() {
 		t := &fakeTransport{}
 		llmCfg := &config.ModelConfig{}

-		r := &liveResponse{id: "resp1"}
-		handled := streamLLMResponse(context.Background(), session, conv, t, r, nil, nil, llmCfg, nil, nil, 0)
+		handled := streamLLMResponse(context.Background(), session, conv, t, "resp1", nil, nil, llmCfg, nil, nil, 0)

 		Expect(handled).To(BeTrue())
 		// One live transcript delta per streamed token.
@@ -133,8 +132,7 @@ var _ = Describe("streamLLMResponse", func() {
 		t := &fakeTransport{}
 		llmCfg := &config.ModelConfig{}

-		r := &liveResponse{id: "resp1"}
-		handled := streamLLMResponse(context.Background(), session, conv, t, r, nil, nil, llmCfg, nil, nil, 0)
+		handled := streamLLMResponse(context.Background(), session, conv, t, "resp1", nil, nil, llmCfg, nil, nil, 0)

 		Expect(handled).To(BeTrue())
 		// Two clauses ("Hello world." mid-stream, "How are you?" on flush) → two
@@ -142,10 +140,8 @@ var _ = Describe("streamLLMResponse", func() {
 		Expect(t.countEvents(types.ServerEventTypeResponseOutputAudioDelta)).To(Equal(2))
 		// The full transcript still streams verbatim.
 		Expect(t.transcriptDeltaText()).To(Equal("Hello world. How are you?"))
-		// The terminal response.done is emitted by triggerResponse, not by
-		// streamLLMResponse — so at this layer there are none.
-		Expect(t.countEvents(types.ServerEventTypeResponseDone)).To(Equal(0))
-		Expect(r.outcome).To(Equal(outcomeCompleted))
+		// Exactly one terminal response.done.
+		Expect(t.countEvents(types.ServerEventTypeResponseDone)).To(Equal(1))
 	})

 	It("streams content deltas and emits tool-call items (autoparser tool turn)", func() {
@@ -173,18 +169,15 @@ var _ = Describe("streamLLMResponse", func() {
 		llmCfg := &config.ModelConfig{}
 		llmCfg.TemplateConfig.UseTokenizerTemplate = true

-		r := &liveResponse{id: "resp1"}
-		handled := streamLLMResponse(context.Background(), session, conv, t, r, nil, nil, llmCfg, nil, nil, 0)
+		handled := streamLLMResponse(context.Background(), session, conv, t, "resp1", nil, nil, llmCfg, nil, nil, 0)

 		Expect(handled).To(BeTrue())
 		// The spoken content was streamed live.
 		Expect(t.transcriptDeltaText()).To(Equal("Let me check."))
 		// The tool call is emitted as a function_call item.
 		Expect(t.countEvents(types.ServerEventTypeResponseFunctionCallArgumentsDone)).To(Equal(1))
-		// The terminal response.done is emitted by triggerResponse, not by
-		// streamLLMResponse — so at this layer there are none.
-		Expect(t.countEvents(types.ServerEventTypeResponseDone)).To(Equal(0))
-		Expect(r.outcome).To(Equal(outcomeCompleted))
+		// Exactly one terminal response.done.
+		Expect(t.countEvents(types.ServerEventTypeResponseDone)).To(Equal(1))
 	})

 	It("emits only tool-call items for a content-less tool turn (no empty assistant item)", func() {
@@ -207,8 +200,7 @@ var _ = Describe("streamLLMResponse", func() {
 		llmCfg := &config.ModelConfig{}
 		llmCfg.TemplateConfig.UseTokenizerTemplate = true

-		r := &liveResponse{id: "resp1"}
-		handled := streamLLMResponse(context.Background(), session, conv, t, r, nil, nil, llmCfg, nil, nil, 0)
+		handled := streamLLMResponse(context.Background(), session, conv, t, "resp1", nil, nil, llmCfg, nil, nil, 0)

 		Expect(handled).To(BeTrue())
 		// No content → no transcript deltas and no spurious assistant content item.
@@ -216,51 +208,6 @@ var _ = Describe("streamLLMResponse", func() {
 		Expect(t.countEvents(types.ServerEventTypeResponseOutputAudioTranscriptDelta)).To(Equal(0))
 		// The tool call is still emitted.
 		Expect(t.countEvents(types.ServerEventTypeResponseFunctionCallArgumentsDone)).To(Equal(1))
-		Expect(t.countEvents(types.ServerEventTypeResponseDone)).To(Equal(0))
-		Expect(r.outcome).To(Equal(outcomeCompleted))
-	})
-})
-
-var _ = Describe("triggerResponse", func() {
-	It("emits exactly one response.created and one response.done with output and usage", func() {
-		m := &fakeModel{
-			cfg: &config.ModelConfig{},
-			predictResp: backend.LLMResponse{
-				Response: "Hi there.",
-				Usage:    backend.TokenUsage{Prompt: 5, Completion: 3},
-			},
-		}
-		session := &Session{
-			OutputSampleRate: 24000,
-			ModelInterface:   m,
-			ModelConfig:      &config.ModelConfig{},
-			// Text-only so the buffered path skips TTS and the assertion focuses
-			// on the terminal's Output + Usage.
-			OutputModalities: []types.Modality{types.ModalityText},
-		}
-		conv := &Conversation{}
-		t := &fakeTransport{}
-
-		triggerResponse(context.Background(), session, conv, t, nil)
-
-		// Exactly one of each lifecycle event for the whole response.create.
-		Expect(t.countEvents(types.ServerEventTypeResponseCreated)).To(Equal(1))
 		Expect(t.countEvents(types.ServerEventTypeResponseDone)).To(Equal(1))
-
-		// The single terminal carries the produced output item and the usage —
-		// both empty in the legacy code.
-		var done *types.ResponseDoneEvent
-		for i := range t.events {
-			if d, ok := t.events[i].(types.ResponseDoneEvent); ok {
-				done = &d
-			}
-		}
-		Expect(done).NotTo(BeNil())
-		Expect(done.Response.Status).To(Equal(types.ResponseStatusCompleted))
-		Expect(done.Response.Output).To(HaveLen(1))
-		Expect(done.Response.Usage).NotTo(BeNil())
-		Expect(done.Response.Usage.InputTokens).To(Equal(5))
-		Expect(done.Response.Usage.OutputTokens).To(Equal(3))
-		Expect(done.Response.Usage.TotalTokens).To(Equal(8))
 	})
 })
--- a/core/http/endpoints/openai/realtime_transcription.go
+++ b/core/http/endpoints/openai/realtime_transcription.go
@@ -7,33 +7,6 @@ import (
 	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
 )

-// emitPrecomputedTranscription emits the transcription events for a turn
-// whose transcript already exists (semantic_vad's live stream, or the
-// retranscribe gate's batch decode): optional delta replays followed by the
-// completed event — the same contract emitTranscription produces, sharing
-// one itemID — without running the backend again.
-func emitPrecomputedTranscription(t Transport, itemID string, deltas []string, transcript string) error {
-	for _, d := range deltas {
-		if d == "" {
-			continue
-		}
-		if err := t.SendEvent(types.ConversationItemInputAudioTranscriptionDeltaEvent{
-			ServerEventBase: types.ServerEventBase{EventID: "event_TODO"},
-			ItemID:          itemID,
-			ContentIndex:    0,
-			Delta:           d,
-		}); err != nil {
-			return err
-		}
-	}
-	return t.SendEvent(types.ConversationItemInputAudioTranscriptionCompletedEvent{
-		ServerEventBase: types.ServerEventBase{EventID: "event_TODO"},
-		ItemID:          itemID,
-		ContentIndex:    0,
-		Transcript:      transcript,
-	})
-}
-
 // emitTranscription transcribes a committed utterance and emits the transcription
 // events for it, returning the final transcript text. With
 // pipeline.streaming.transcription enabled it streams each transcript fragment as
--- a/core/http/endpoints/openai/realtime_tts_pipeline.go
+++ b/core/http/endpoints/openai/realtime_tts_pipeline.go
@@ -1,153 +0,0 @@
-package openai
-
-import (
-	"sync"
-	"sync/atomic"
-
-	"github.com/mudler/LocalAI/core/http/endpoints/openai/ttscoord"
-)
-
-// ttsPipeline decouples speech synthesis from LLM token generation.
-//
-// The LLM token callback runs on the same goroutine that drains the model's
-// gRPC stream, so anything it does serially — including a blocking TTS call —
-// stops the stream from being read and stalls generation (and, since the same
-// goroutine also sends the assistant transcript, freezes the transcript the
-// client sees). ttsPipeline lets the callback hand each completed clause to a
-// single worker goroutine that synthesizes them in order, concurrently with
-// continued generation. One worker preserves clause — and therefore audio —
-// ordering.
-//
-// The clause queue is intentionally unbounded: clauses are short strings and a
-// reply has a bounded number of them, while the expensive product (audio) is
-// paced by the TTS backend regardless. So enqueue never blocks the callback,
-// and the transcript streams to the client at generation speed while audio is
-// produced behind it.
-type ttsPipeline struct {
-	speak func(clause string) ([]byte, error)
-
-	mu    sync.Mutex
-	queue []string
-	wake  chan struct{} // buffered(1) wakeup signal for the worker
-
-	// coord owns the open->closing->closed lifecycle (machine M5). It replaces the
-	// legacy `closed bool`: the producer raises Close (wait()), the worker raises
-	// WorkerExited. See ttscoord/ and realtime-state-machines.md.
-	coord *ttscoord.Coordinator
-
-	done   chan struct{}
-	failed atomic.Bool
-
-	// audio and firstErr are owned by the worker goroutine and only safe to
-	// read after wait() has returned (it joins on the worker via done).
-	audio    []byte
-	firstErr error
-}
-
-// newTTSPipeline starts the worker. speak performs the actual synthesis and
-// returns the PCM accumulated for the conversation-item record (empty for
-// transports that stream audio out-of-band, e.g. WebRTC).
-func newTTSPipeline(speak func(clause string) ([]byte, error)) *ttsPipeline {
-	p := &ttsPipeline{
-		speak: speak,
-		wake:  make(chan struct{}, 1),
-		done:  make(chan struct{}),
-	}
-	p.coord = ttscoord.New(p)
-	go p.run()
-	return p
-}
-
-// closing reports whether wait() has been called (lifecycle past Open). Read
-// under p.mu in the worker so the queue-empty check and the close check are
-// consistent.
-func (p *ttsPipeline) closing() bool {
-	_, open := p.coord.State().(ttscoord.Open)
-	return !open
-}
-
-// Perform executes a coordinator effect. Wake nudges the worker (non-blocking).
-func (p *ttsPipeline) Perform(e ttscoord.Effect) {
-	if _, ok := e.(ttscoord.Wake); ok {
-		p.signal()
-	}
-}
-
-func (p *ttsPipeline) run() {
-	defer close(p.done)
-	for {
-		p.mu.Lock()
-		for len(p.queue) == 0 && !p.closing() {
-			p.mu.Unlock()
-			<-p.wake
-			p.mu.Lock()
-		}
-		if len(p.queue) == 0 && p.closing() {
-			p.mu.Unlock()
-			// Drained and closed: advance the lifecycle to Closed, then exit
-			// (the deferred close(p.done) joins the producer's wait()).
-			_ = p.coord.Apply(ttscoord.WorkerExited{})
-			return
-		}
-		clause := p.queue[0]
-		p.queue = p.queue[1:]
-		p.mu.Unlock()
-
-		// Once a clause has failed, keep draining the queue without speaking so
-		// the producer's wait() returns promptly and the first error is kept.
-		if p.failed.Load() {
-			continue
-		}
-		a, err := p.speak(clause)
-		if err != nil {
-			p.firstErr = err
-			p.failed.Store(true)
-			continue
-		}
-		p.audio = append(p.audio, a...)
-	}
-}
-
-// enqueue offers a clause for synthesis. It never blocks; it returns false once
-// synthesis has failed, signalling the caller to stop the prediction.
-func (p *ttsPipeline) enqueue(clause string) bool {
-	if p.failed.Load() {
-		return false
-	}
-	p.mu.Lock()
-	// Reject once closing/closed: the worker may have already drained and exited,
-	// so a clause queued now would be silently dropped. The lifecycle (Open) and
-	// the append are checked under the same lock, so the worker cannot exit between
-	// the gate and the enqueue (it takes p.mu to observe the empty queue).
-	if p.closing() {
-		p.mu.Unlock()
-		return false
-	}
-	p.queue = append(p.queue, clause)
-	p.mu.Unlock()
-	p.signal()
-	return true
-}
-
-// signal wakes the worker without blocking; the buffered channel coalesces
-// signals, which is safe because the worker drains the whole queue per wake.
-func (p *ttsPipeline) signal() {
-	select {
-	case p.wake <- struct{}{}:
-	default:
-	}
-}
-
-// wait closes the queue and blocks until the worker has spoken every enqueued
-// clause, then returns the accumulated audio and the first synthesis error. It
-// is idempotent: calling it again returns the same result without blocking, so
-// callers can drain it explicitly to read the audio and still defer a wait() as
-// a leak-proof backstop. No clause may be enqueued after the first wait().
-func (p *ttsPipeline) wait() ([]byte, error) {
-	// Close the lifecycle (Open->Closing) and wake the worker. Idempotent: a
-	// second Close is absorbed (no second wake), and <-p.done returns immediately
-	// once the worker has exited.
-	_ = p.coord.Apply(ttscoord.Close{})
-	<-p.done
-	return p.audio, p.firstErr
-}
--- a/core/http/endpoints/openai/realtime_tts_pipeline_test.go
+++ b/core/http/endpoints/openai/realtime_tts_pipeline_test.go
@@ -1,114 +0,0 @@
-package openai
-
-import (
-	"errors"
-	"sync"
-	"time"
-
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-var _ = Describe("ttsPipeline", func() {
-	It("synthesizes clauses in order and accumulates their audio", func() {
-		p := newTTSPipeline(func(clause string) ([]byte, error) {
-			return []byte(clause), nil
-		})
-		Expect(p.enqueue("a")).To(BeTrue())
-		Expect(p.enqueue("b")).To(BeTrue())
-		Expect(p.enqueue("c")).To(BeTrue())
-
-		audio, err := p.wait()
-		Expect(err).NotTo(HaveOccurred())
-		Expect(string(audio)).To(Equal("abc"))
-	})
-
-	It("never blocks the producer even when synthesis is slow", func() {
-		var started sync.WaitGroup
-		started.Add(1)
-		release := make(chan struct{})
-		first := true
-		p := newTTSPipeline(func(clause string) ([]byte, error) {
-			if first {
-				first = false
-				started.Done()
-				<-release // hold the worker on the first clause
-			}
-			return []byte(clause), nil
-		})
-
-		Expect(p.enqueue("1")).To(BeTrue())
-		started.Wait() // worker is now blocked synthesizing the first clause
-
-		// Enqueuing many more clauses must return immediately, not block on the
-		// stalled worker — this is what keeps the LLM recv loop flowing.
-		done := make(chan struct{})
-		go func() {
-			defer close(done)
-			for _, c := range []string{"2", "3", "4", "5"} {
-				p.enqueue(c)
-			}
-		}()
-		Eventually(done, time.Second).Should(BeClosed())
-
-		close(release)
-		audio, err := p.wait()
-		Expect(err).NotTo(HaveOccurred())
-		Expect(string(audio)).To(Equal("12345"))
-	})
-
-	It("keeps the first error, stops speaking, and signals the producer to stop", func() {
-		boom := errors.New("backend gone")
-		var spoken []string
-		var mu sync.Mutex
-		p := newTTSPipeline(func(clause string) ([]byte, error) {
-			mu.Lock()
-			spoken = append(spoken, clause)
-			mu.Unlock()
-			if clause == "b" {
-				return nil, boom
-			}
-			return []byte(clause), nil
-		})
-
-		Expect(p.enqueue("a")).To(BeTrue())
-		Expect(p.enqueue("b")).To(BeTrue())
-
-		// Once the failure is observed, enqueue reports it so the caller stops
-		// the prediction; any further clauses are dropped, not spoken.
-		Eventually(func() bool { return !p.enqueue("c") }, time.Second).Should(BeTrue())
-
-		_, err := p.wait()
-		Expect(err).To(MatchError(boom))
-
-		mu.Lock()
-		defer mu.Unlock()
-		Expect(spoken).NotTo(ContainElement("c"), "clauses after the failure are not synthesized")
-	})
-
-	It("is idempotent: a second wait returns the same result without blocking", func() {
-		p := newTTSPipeline(func(clause string) ([]byte, error) {
-			return []byte(clause), nil
-		})
-		Expect(p.enqueue("x")).To(BeTrue())
-
-		audio1, err1 := p.wait()
-		// A deferred backstop wait() in the caller runs after the explicit one;
-		// it must not block or change the result.
-		audio2, err2 := p.wait()
-
-		Expect(err1).NotTo(HaveOccurred())
-		Expect(err2).NotTo(HaveOccurred())
-		Expect(string(audio1)).To(Equal("x"))
-		Expect(string(audio2)).To(Equal("x"))
-	})
-
-	It("returns cleanly when no clause was ever enqueued", func() {
-		p := newTTSPipeline(func(clause string) ([]byte, error) {
-			return []byte(clause), nil
-		})
-		audio, err := p.wait()
-		Expect(err).NotTo(HaveOccurred())
-		Expect(audio).To(BeEmpty())
-	})
-})
--- a/core/http/endpoints/openai/realtime_turncoord.go
+++ b/core/http/endpoints/openai/realtime_turncoord.go
@@ -1,127 +0,0 @@
-package openai
-
-import (
-	"context"
-	"time"
-
-	"github.com/mudler/LocalAI/core/http/endpoints/openai/respcoord"
-	"github.com/mudler/LocalAI/core/http/endpoints/openai/turncoord"
-	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
-	"github.com/mudler/LocalAI/core/schema"
-)
-
-// turnSink wires the explicit turn-detection state machine (turncoord.Coordinator
-// — machine "M2" in docs/design/realtime-state-machines.md) into handleVAD.
-//
-// In the legacy code the turn lifecycle was split across two variables that could
-// disagree: handleVAD's goroutine-local speechStarted bool and the semantic_vad
-// liveTurnState's "is the live stream open" flag (lts.open()). A discardTurn (the
-// no-speech clear, or teardown) closed the live stream but left speechStarted
-// true, so the next speech onset was suppressed by `if !speechStarted` — no
-// speech_started, no barge-in, no commit (Part 2, failure mode 4). Here "speech
-// started" and "a turn is open" are ONE coordinator state, so they cannot desync.
-//
-// Unlike responseSink (M3), which is a genuine dual-writer race, the turn machine
-// is owned by the single handleVAD goroutine; this sink and its coordinator are
-// loop-local. The coordinator's lock only matters for the teardown-time Abort and
-// for keeping State() readable — there is no second writer.
-//
-// The effects map onto the existing turn I/O:
-//   - OpenTurn:          open the live ASR stream (semantic_vad) + feed the onset
-//     audio. A failed open degrades the turn to silence-only — the turn still
-//     proceeds (server_vad-like), matching the legacy behaviour.
-//   - BargeIn:           cancel any in-flight response (non-blocking).
-//   - EmitSpeechStarted: input_audio_buffer.speech_started.
-//   - EmitSpeechStopped: input_audio_buffer.speech_stopped.
-//   - CommitTurn:        committed event + finalize the live stream + issue the
-//     response (via responseSink/respcoord).
-//   - DiscardTurn:       close the live stream and retract any captions.
-//
-// The data-heavy effects (OpenTurn, CommitTurn) need the current tick's audio and
-// transcription context. Because Apply performs effects synchronously on the same
-// (handleVAD) goroutine, the loop sets the relevant scratch fields immediately
-// before each Apply; there is no cross-goroutine sharing.
-type turnSink struct {
-	session    *Session
-	conv       *Conversation
-	transport  Transport
-	lts        *liveTurnState
-	vadContext context.Context
-	startTime  time.Time
-
-	coord *turncoord.Coordinator
-
-	// per-tick context, set by handleVAD before each Apply (single goroutine).
-	sv                 *types.RealtimeSessionSemanticVad // nil = server_vad
-	onsetAudio         []int16                           // OpenTurn feeds this
-	commitAudio        []byte                            // CommitTurn issues this
-	commitAudioLength  float64                           // for finishTurn (flush tail)
-	commitRetranscribe bool                              // gated batch is authoritative
-	commitGated        *schema.TranscriptionResult       // retranscribe batch decode
-}
-
-func newTurnSink(session *Session, conv *Conversation, t Transport, lts *liveTurnState, vadContext context.Context, startTime time.Time) *turnSink {
-	s := &turnSink{
-		session:    session,
-		conv:       conv,
-		transport:  t,
-		lts:        lts,
-		vadContext: vadContext,
-		startTime:  startTime,
-	}
-	s.coord = turncoord.New(s)
-	return s
-}
-
-// Perform executes one effect. It is called by Coordinator.Apply while the
-// coordinator lock is held. The turn coordinator is single-writer (handleVAD), so
-// the synchronous network writes / lts operations here are the same ones the
-// legacy loop did inline on this goroutine; they never contend the lock.
-func (s *turnSink) Perform(e turncoord.Effect) {
-	switch eff := e.(type) {
-	case turncoord.OpenTurn:
-		if s.sv != nil && s.lts.openTurn(s.vadContext, string(eff.Turn)) {
-			s.lts.feedNewAudio(s.onsetAudio)
-		}
-	case turncoord.BargeIn:
-		s.session.respSink.cancel(respcoord.SourceVAD)
-	case turncoord.EmitSpeechStarted:
-		sendEvent(s.transport, types.InputAudioBufferSpeechStartedEvent{
-			ServerEventBase: types.ServerEventBase{EventID: "event_TODO"},
-			AudioStartMs:    time.Since(s.startTime).Milliseconds(),
-		})
-	case turncoord.EmitSpeechStopped:
-		sendEvent(s.transport, types.InputAudioBufferSpeechStoppedEvent{
-			ServerEventBase: types.ServerEventBase{EventID: "event_TODO"},
-			AudioEndMs:      time.Since(s.startTime).Milliseconds(),
-		})
-	case turncoord.CommitTurn:
-		// The committed item id is the coordinator's turn id (== the live caption
-		// id), so the client's completed event replaces the partial text.
-		itemID := string(eff.Turn)
-		sendEvent(s.transport, types.InputAudioBufferCommittedEvent{
-			ServerEventBase: types.ServerEventBase{EventID: "event_TODO"},
-			ItemID:          itemID,
-			PreviousItemID:  "TODO",
-		})
-		// Finalize the turn's live stream (flushes the decode tail). In
-		// retranscribe mode the batch decode is authoritative, so the streamed
-		// transcript is dropped.
-		var live *liveUtterance
-		if s.sv != nil {
-			ut := s.lts.finishTurn(s.commitAudioLength)
-			if !s.commitRetranscribe {
-				live = ut
-			}
-		}
-		audio := s.commitAudio
-		gated := s.commitGated
-		conv := s.conv
-		s.session.respSink.issue(s.vadContext, respcoord.SourceVAD, func(ctx context.Context) {
-			commitUtteranceWithTranscript(ctx, audio, live, gated, itemID, s.session, conv, s.transport)
-		})
-	case turncoord.DiscardTurn:
-		// No-op if the stream was never open (server_vad / already idle).
-		s.lts.discardTurn()
-	}
-}
--- a/core/http/endpoints/openai/realtime_vad_buffer_test.go
+++ b/core/http/endpoints/openai/realtime_vad_buffer_test.go
@@ -1,54 +0,0 @@
-package openai
-
-import (
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-// dropInspectedPrefix is what stands between the VAD loop's buffer clears and
-// cutting the first word off an utterance: the no-speech clear must keep the
-// holdback tail (silero hasn't crossed its onset threshold yet) and both
-// clears must keep audio appended while the tick ran (the VAD never saw it).
-var _ = Describe("dropInspectedPrefix", func() {
-	It("keeps the holdback tail of the inspected window and everything appended mid-tick", func() {
-		inspected := []byte{1, 2, 3, 4, 5, 6}
-		appended := []byte{7, 8}
-		buf := append(append([]byte(nil), inspected...), appended...)
-
-		out := dropInspectedPrefix(buf, len(inspected), 2)
-
-		Expect(out).To(Equal([]byte{5, 6, 7, 8}), "older confirmed-silent head dropped, possible onset + fresh audio kept")
-	})
-
-	It("returns the buffer unchanged when the inspected window fits in the holdback", func() {
-		buf := []byte{1, 2, 3}
-
-		Expect(dropInspectedPrefix(buf, len(buf), 4)).To(Equal(buf))
-		Expect(dropInspectedPrefix(buf, len(buf), len(buf))).To(Equal(buf))
-	})
-
-	It("drops the whole inspected window with zero holdback, keeping only mid-tick appends", func() {
-		// The commit-time clear: the inspected audio was committed, audio
-		// appended while the tick ran belongs to the next turn.
-		buf := []byte{1, 2, 3, 4}
-
-		Expect(dropInspectedPrefix(buf, 4, 0)).To(BeEmpty())
-		Expect(dropInspectedPrefix(append(buf, 9), 4, 0)).To(Equal([]byte{9}))
-	})
-
-	It("clamps when told more was inspected than the buffer holds", func() {
-		buf := []byte{1, 2}
-
-		Expect(dropInspectedPrefix(buf, 10, 0)).To(BeEmpty())
-	})
-
-	It("returns a copy, not a sub-slice, when bytes are dropped", func() {
-		buf := []byte{1, 2, 3, 4}
-
-		out := dropInspectedPrefix(buf, 4, 2)
-
-		Expect(out).To(Equal([]byte{3, 4}))
-		buf[2] = 99
-		Expect(out).To(Equal([]byte{3, 4}), "mutating the old backing array must not leak into the published buffer")
-	})
-})
--- a/core/http/endpoints/openai/respcoord/respcoord.go
+++ b/core/http/endpoints/openai/respcoord/respcoord.go
@@ -1,267 +0,0 @@
-// Package respcoord is the explicit state machine for the realtime API's
-// response-coordination concern (machine "M3" in
-// docs/design/realtime-state-machines.md).
-//
-// In the legacy code this machine is implicit: a response is "active" iff
-// Session.activeResponseDone is a non-nil, unclosed channel, and the lifecycle
-// is driven from TWO goroutines (the client read-loop and the VAD goroutine)
-// that both call startResponse/cancelActiveResponse. responseMu guards only the
-// field swap, while the <-done wait happens outside the lock, so two concurrent
-// starts can briefly leave two live response goroutines both appending to the
-// conversation. See docs/design/realtime-state-machines.md, Part 2 (failure
-// mode 2) and the ResponseLifecycle spec under formal-verification/.
-//
-// This package replaces that with:
-//   - a sealed sum type for State (illegal states are unrepresentable),
-//   - a total, pure transition function Next(state, event) -> (state, effects),
-//   - a single-writer Coordinator that serializes every transition.
-//
-// The design guarantees the invariants the specs check:
-//   - at most one live response at any instant,
-//   - exactly one terminal (response.done) per started response,
-//   - no response is started after its terminal (no resurrection).
-package respcoord
-
-import (
-	"fmt"
-
-	"github.com/mudler/LocalAI/core/http/endpoints/openai/coordinator"
-)
-
-// ResponseID identifies a single response attempt. The caller mints a fresh,
-// monotonically increasing id for every Start; ids are never reused. The
-// monotonic id is what lets the machine ignore "stale" Finished events from a
-// response that was already superseded or cancelled.
-type ResponseID uint64
-
-// Source records which goroutine drove an event. It is carried for
-// observability/logging only; it never affects a transition (both sources are
-// equal authority). Keeping it in the event type makes the dual-writer reality
-// explicit rather than hidden.
-type Source int
-
-const (
-	// SourceClient is the read-loop: response.create or a manual
-	// input_audio_buffer.commit.
-	SourceClient Source = iota
-	// SourceVAD is the turn-detection goroutine: end-of-speech commit or a
-	// barge-in cancel.
-	SourceVAD
-)
-
-func (s Source) String() string {
-	switch s {
-	case SourceClient:
-		return "client"
-	case SourceVAD:
-		return "vad"
-	default:
-		return fmt.Sprintf("Source(%d)", int(s))
-	}
-}
-
-// Status is the terminal status reported on response.done.
-type Status int
-
-const (
-	// StatusCompleted is a response that finished on its own.
-	StatusCompleted Status = iota
-	// StatusCancelled is a response cut short by a barge-in, an explicit
-	// response.cancel, or by being superseded by a newer response.
-	StatusCancelled
-)
-
-func (s Status) String() string {
-	switch s {
-	case StatusCompleted:
-		return "completed"
-	case StatusCancelled:
-		return "cancelled"
-	default:
-		return fmt.Sprintf("Status(%d)", int(s))
-	}
-}
-
-// State is the sealed sum type of coordinator states. The only implementations
-// are the unexported-method-bearing structs in this file, so callers outside
-// the package cannot fabricate an out-of-band state. Exhaustively:
-// Idle | Active | Terminated.
-type State interface {
-	isState()
-	String() string
-}
-
-// Idle: no response is in flight.
-type Idle struct{}
-
-// Active: exactly one response (ID) is in flight. The struct holds a single id,
-// so "two active responses" is not representable.
-type Active struct{ ID ResponseID }
-
-// Terminated: the session is torn down. Absorbing — no response can start from
-// here, so the M1 (connection) parent's teardown can guarantee no response
-// outlives the session (see formal-verification/session_lifecycle.fizz).
-type Terminated struct{}
-
-func (Idle) isState()       {}
-func (Active) isState()     {}
-func (Terminated) isState() {}
-
-func (Idle) String() string       { return "Idle" }
-func (a Active) String() string   { return fmt.Sprintf("Active(%d)", a.ID) }
-func (Terminated) String() string { return "Terminated" }
-
-// Event is the sealed sum type of inputs. Exhaustively:
-// Start | Finished | Cancel | Shutdown.
-type Event interface {
-	isEvent()
-	String() string
-}
-
-// Start requests a new response. ID must be a fresh, never-before-used id.
-type Start struct {
-	ID     ResponseID
-	Source Source
-}
-
-// Finished reports that the response goroutine for ID reached its own terminal.
-// If ID is not the currently-active response it is "stale" (the response was
-// already superseded/cancelled) and is ignored.
-type Finished struct{ ID ResponseID }
-
-// Cancel requests cancellation of the in-flight response (barge-in or explicit
-// response.cancel). It is a no-op when idle.
-type Cancel struct{ Source Source }
-
-// Shutdown terminates the coordinator at session teardown: it cancels any
-// in-flight response and moves to the absorbing Terminated state, after which no
-// response can start. Raised by the connection (M1) parent's teardown.
-type Shutdown struct{}
-
-func (Start) isEvent()    {}
-func (Finished) isEvent() {}
-func (Cancel) isEvent()   {}
-func (Shutdown) isEvent() {}
-
-func (e Start) String() string    { return fmt.Sprintf("Start(%d,%s)", e.ID, e.Source) }
-func (e Finished) String() string { return fmt.Sprintf("Finished(%d)", e.ID) }
-func (e Cancel) String() string   { return fmt.Sprintf("Cancel(%s)", e.Source) }
-func (Shutdown) String() string   { return "Shutdown" }
-
-// Effect is a side effect returned by Next as data for the caller to perform.
-// Returning effects as data (rather than firing callbacks inside the
-// transition) keeps Next pure and exhaustively testable, and lets the
-// Coordinator decide how/when to perform them. Exhaustively:
-// CancelResponse | StartResponse | EmitTerminal.
-type Effect interface {
-	isEffect()
-	String() string
-}
-
-// CancelResponse: cancel the context of the running response ID.
-type CancelResponse struct{ ID ResponseID }
-
-// StartResponse: spawn the response goroutine for ID.
-type StartResponse struct{ ID ResponseID }
-
-// EmitTerminal: send response.done for ID with Status.
-type EmitTerminal struct {
-	ID     ResponseID
-	Status Status
-}
-
-func (CancelResponse) isEffect() {}
-func (StartResponse) isEffect()  {}
-func (EmitTerminal) isEffect()   {}
-
-func (e CancelResponse) String() string { return fmt.Sprintf("CancelResponse(%d)", e.ID) }
-func (e StartResponse) String() string  { return fmt.Sprintf("StartResponse(%d)", e.ID) }
-func (e EmitTerminal) String() string {
-	return fmt.Sprintf("EmitTerminal(%d,%s)", e.ID, e.Status)
-}
-
-// Next is the total, pure transition function. For every (state, event) it
-// returns the next state and the ordered effects to perform. It returns a
-// non-nil error only for an unknown State/Event implementation (a programmer
-// error / future type added without updating this function) — callers must
-// surface that, never silently ignore it. Every in-domain (state, event) pair
-// is defined; there are no "forbidden" transitions, only no-ops for stale or
-// idle inputs.
-//
-// The supersede rule (Active + Start) is the crux of the fix: starting a new
-// response while one is active emits the old response's cancelled terminal and
-// cancels it BEFORE the replacement starts, all within one serialized
-// transition. The old goroutine's later Finished is therefore stale and
-// ignored — so each id gets exactly one terminal and there is never more than
-// one live response.
-func Next(s State, e Event) (State, []Effect, error) {
-	switch st := s.(type) {
-	case Idle:
-		switch ev := e.(type) {
-		case Start:
-			return Active{ID: ev.ID}, []Effect{StartResponse{ID: ev.ID}}, nil
-		case Cancel:
-			// Nothing in flight: idempotent no-op.
-			return Idle{}, nil, nil
-		case Finished:
-			// Stale terminal from an already-superseded/cancelled response.
-			return Idle{}, nil, nil
-		case Shutdown:
-			// Teardown with nothing in flight: go terminal.
-			return Terminated{}, nil, nil
-		}
-	case Active:
-		switch ev := e.(type) {
-		case Start:
-			return Active{ID: ev.ID}, []Effect{
-				CancelResponse{ID: st.ID},
-				EmitTerminal{ID: st.ID, Status: StatusCancelled},
-				StartResponse{ID: ev.ID},
-			}, nil
-		case Finished:
-			if ev.ID == st.ID {
-				return Idle{}, []Effect{EmitTerminal{ID: st.ID, Status: StatusCompleted}}, nil
-			}
-			// Stale finish from a superseded response — already terminal-ed.
-			return Active{ID: st.ID}, nil, nil
-		case Cancel:
-			return Idle{}, []Effect{
-				CancelResponse{ID: st.ID},
-				EmitTerminal{ID: st.ID, Status: StatusCancelled},
-			}, nil
-		case Shutdown:
-			// Teardown while a response is live: cancel it (with its terminal) and
-			// go terminal so nothing can start afterwards.
-			return Terminated{}, []Effect{
-				CancelResponse{ID: st.ID},
-				EmitTerminal{ID: st.ID, Status: StatusCancelled},
-			}, nil
-		}
-	case Terminated:
-		// Absorbing: every event is a no-op. A Start after teardown is rejected
-		// (no StartResponse), so no response can outlive the session.
-		switch e.(type) {
-		case Start, Finished, Cancel, Shutdown:
-			return Terminated{}, nil, nil
-		}
-	}
-	return s, nil, fmt.Errorf("respcoord: unhandled transition %s <- %s", s, e)
-}
-
-// EffectSink performs the effects produced by a transition. See coordinator.Sink
-// for the non-blocking contract: Perform runs under the coordinator lock, so it
-// must not block and must not re-enter Apply (the spawned response goroutine's
-// Finished apply happens only after the sink returns).
-type EffectSink = coordinator.Sink[Effect]
-
-// Coordinator serializes every Start/Finished/Cancel/Shutdown transition behind
-// one lock, so the two driving goroutines (read-loop and VAD) can call Apply
-// concurrently without the legacy dual-writer race. Effects are performed in
-// order under the lock — preserving the (cancel old, emit old terminal, start
-// new) supersede ordering. See coordinator.Coordinator.
-type Coordinator = coordinator.Coordinator[State, Event, Effect]
-
-// New returns an idle Coordinator that performs effects via sink.
-func New(sink EffectSink) *Coordinator {
-	return coordinator.New[State, Event, Effect](Idle{}, Next, sink)
-}
--- a/core/http/endpoints/openai/respcoord/respcoord_suite_test.go
+++ b/core/http/endpoints/openai/respcoord/respcoord_suite_test.go
@@ -1,13 +0,0 @@
-package respcoord
-
-import (
-	"testing"
-
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-func TestRespcoord(t *testing.T) {
-	RegisterFailHandler(Fail)
-	RunSpecs(t, "respcoord (realtime M3) Suite")
-}
--- a/core/http/endpoints/openai/respcoord/respcoord_test.go
+++ b/core/http/endpoints/openai/respcoord/respcoord_test.go
@@ -1,370 +0,0 @@
-package respcoord
-
-import (
-	"math/rand/v2"
-	"sync"
-
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-// recordingSink captures the ordered stream of effects so the invariants can be
-// checked independently of the transition function's internals. Perform is
-// called by Coordinator.Apply under the coordinator lock, so it is already
-// serialized; the mutex here only guards reads from the spec goroutine.
-type recordingSink struct {
-	mu  sync.Mutex
-	log []Effect
-}
-
-func (s *recordingSink) Perform(e Effect) {
-	s.mu.Lock()
-	s.log = append(s.log, e)
-	s.mu.Unlock()
-}
-
-func (s *recordingSink) snapshot() []Effect {
-	s.mu.Lock()
-	defer s.mu.Unlock()
-	out := make([]Effect, len(s.log))
-	copy(out, s.log)
-	return out
-}
-
-// checkInvariants replays the effect log and asserts the three core safety
-// properties from docs/design/realtime-state-machines.md, Part 4:
-//
-//	(1) at most one live response at any instant
-//	    -- after every effect, the number of started-but-not-terminated ids <= 1;
-//	(2) exactly one terminal per started response
-//	    -- each id is started at most once and terminated at most once;
-//	(3) no resurrection
-//	    -- an id is never started after it has been terminated.
-func checkInvariants(log []Effect) {
-	started := map[ResponseID]int{}
-	terminated := map[ResponseID]int{}
-	live := map[ResponseID]bool{}
-
-	for i, eff := range log {
-		switch e := eff.(type) {
-		case StartResponse:
-			Expect(terminated[e.ID]).To(Equal(0), "invariant (3): StartResponse(%d) after it was terminated (effect #%d)\nlog=%v", e.ID, i, log)
-			started[e.ID]++
-			Expect(started[e.ID]).To(Equal(1), "invariant (2): id %d started %d times (effect #%d)\nlog=%v", e.ID, started[e.ID], i, log)
-			live[e.ID] = true
-		case EmitTerminal:
-			terminated[e.ID]++
-			Expect(terminated[e.ID]).To(Equal(1), "invariant (2): id %d terminated %d times (effect #%d)\nlog=%v", e.ID, terminated[e.ID], i, log)
-			delete(live, e.ID)
-		case CancelResponse:
-			// no count assertion; cancellation is paired with a terminal
-		}
-		Expect(len(live)).To(BeNumerically("<=", 1), "invariant (1): %d live responses after effect #%d (%s)\nlog=%v", len(live), i, eff, log)
-	}
-}
-
-// unknownEvent is an Event implementation Next does not know about, to exercise
-// the defensive error path.
-type unknownEvent struct{}
-
-func (unknownEvent) isEvent()       {}
-func (unknownEvent) String() string { return "unknownEvent" }
-
-var _ = Describe("respcoord.Next", func() {
-	// DescribeTable exhaustively pins every (state, event) cell of the pure
-	// transition function, including the stale / idle no-op cells. This is the
-	// practical stand-in for "no transition leads to an inconsistent state": if a
-	// cell changes, this table must change with it.
-	DescribeTable("transitions",
-		func(state State, event Event, wantState State, wantEff []Effect) {
-			gotState, gotEff, err := Next(state, event)
-			Expect(err).NotTo(HaveOccurred())
-			Expect(gotState).To(Equal(wantState))
-			Expect(gotEff).To(Equal(wantEff))
-		},
-		Entry("idle+start -> active, spawns response",
-			Idle{}, Start{ID: 1, Source: SourceClient},
-			Active{ID: 1}, []Effect{StartResponse{ID: 1}}),
-		Entry("idle+cancel -> idle, no-op",
-			Idle{}, Cancel{Source: SourceVAD},
-			Idle{}, []Effect(nil)),
-		Entry("idle+finished(stale) -> idle, no-op",
-			Idle{}, Finished{ID: 7},
-			Idle{}, []Effect(nil)),
-		Entry("active+start -> supersede: cancel+terminal(old)+start(new)",
-			Active{ID: 1}, Start{ID: 2, Source: SourceVAD},
-			Active{ID: 2},
-			[]Effect{
-				CancelResponse{ID: 1},
-				EmitTerminal{ID: 1, Status: StatusCancelled},
-				StartResponse{ID: 2},
-			}),
-		Entry("active+finished(current) -> idle, completed terminal",
-			Active{ID: 3}, Finished{ID: 3},
-			Idle{}, []Effect{EmitTerminal{ID: 3, Status: StatusCompleted}}),
-		Entry("active+finished(stale) -> stay active, no-op",
-			Active{ID: 3}, Finished{ID: 2},
-			Active{ID: 3}, []Effect(nil)),
-		Entry("active+cancel -> idle, cancel+cancelled terminal",
-			Active{ID: 5}, Cancel{Source: SourceClient},
-			Idle{},
-			[]Effect{
-				CancelResponse{ID: 5},
-				EmitTerminal{ID: 5, Status: StatusCancelled},
-			}),
-		Entry("idle+shutdown -> terminated, no-op",
-			Idle{}, Shutdown{},
-			Terminated{}, []Effect(nil)),
-		Entry("active+shutdown -> terminated: cancel+cancelled terminal",
-			Active{ID: 6}, Shutdown{},
-			Terminated{},
-			[]Effect{
-				CancelResponse{ID: 6},
-				EmitTerminal{ID: 6, Status: StatusCancelled},
-			}),
-		Entry("terminated+start -> terminated, REJECTED (no resurrection)",
-			Terminated{}, Start{ID: 9, Source: SourceClient},
-			Terminated{}, []Effect(nil)),
-		Entry("terminated+finished -> terminated, no-op (stale)",
-			Terminated{}, Finished{ID: 9},
-			Terminated{}, []Effect(nil)),
-		Entry("terminated+cancel -> terminated, no-op",
-			Terminated{}, Cancel{Source: SourceVAD},
-			Terminated{}, []Effect(nil)),
-		Entry("terminated+shutdown -> terminated, idempotent",
-			Terminated{}, Shutdown{},
-			Terminated{}, []Effect(nil)),
-	)
-
-	It("is total: every defined (state, event) pair is handled without error", func() {
-		states := []State{Idle{}, Active{ID: 1}, Terminated{}}
-		events := []Event{
-			Start{ID: 2, Source: SourceClient},
-			Finished{ID: 1},
-			Finished{ID: 99},
-			Cancel{Source: SourceVAD},
-			Shutdown{},
-		}
-		for _, s := range states {
-			for _, e := range events {
-				_, _, err := Next(s, e)
-				Expect(err).NotTo(HaveOccurred(), "Next(%s, %s)", s, e)
-			}
-		}
-	})
-
-	It("errors on an unknown event type", func() {
-		_, _, err := Next(Active{ID: 1}, unknownEvent{})
-		Expect(err).To(HaveOccurred())
-	})
-})
-
-var _ = Describe("respcoord.Coordinator", func() {
-	// This replaces the previous rapid stateful test: a seeded random walk over
-	// the event space, asserting the invariants hold after every step. Seeds are
-	// fixed so any failure reproduces deterministically.
-	It("upholds the safety invariants over random event sequences", func() {
-		seeds := []uint64{1, 2, 3, 42, 1337, 0xC0FFEE}
-		for _, seed := range seeds {
-			r := rand.New(rand.NewPCG(seed, 0xA5A5A5A5))
-			sink := &recordingSink{}
-			c := New(sink)
-			var nextID uint64
-
-			for range 3000 {
-				switch r.IntN(4) {
-				case 0: // start from client
-					nextID++
-					Expect(c.Apply(Start{ID: ResponseID(nextID), Source: SourceClient})).To(Succeed())
-				case 1: // start from VAD
-					nextID++
-					Expect(c.Apply(Start{ID: ResponseID(nextID), Source: SourceVAD})).To(Succeed())
-				case 2: // possibly-stale finish from any plausible id (incl. future)
-					id := r.Uint64N(nextID + 3)
-					Expect(c.Apply(Finished{ID: ResponseID(id)})).To(Succeed())
-				case 3: // explicit cancel
-					Expect(c.Apply(Cancel{Source: SourceClient})).To(Succeed())
-				}
-			}
-			// One full-log replay per seed: it iterates the whole sequence, so
-			// it catches a violation at any step without the O(n^2) cost of
-			// re-replaying after every Apply.
-			checkInvariants(sink.snapshot())
-		}
-	})
-
-	// Hammer Apply from two goroutines -- the read-loop and the VAD goroutine,
-	// the exact dual-writer scenario that races in the legacy code -- and assert
-	// the invariants still hold. Run under -race to also catch any data race in
-	// the coordinator itself.
-	It("upholds the invariants under concurrent dual-writer Apply", func() {
-		const perGoroutine = 2000
-		sink := &recordingSink{}
-		c := New(sink)
-
-		var idCounter uint64
-		var idMu sync.Mutex
-		nextID := func() ResponseID {
-			idMu.Lock()
-			defer idMu.Unlock()
-			idCounter++
-			return ResponseID(idCounter)
-		}
-
-		var wg sync.WaitGroup
-		drive := func(src Source) {
-			defer wg.Done()
-			for i := range perGoroutine {
-				switch i % 3 {
-				case 0:
-					_ = c.Apply(Start{ID: nextID(), Source: src})
-				case 1:
-					if a, ok := c.State().(Active); ok {
-						_ = c.Apply(Finished{ID: a.ID})
-					}
-				case 2:
-					_ = c.Apply(Cancel{Source: src})
-				}
-			}
-		}
-
-		wg.Add(2)
-		go drive(SourceClient)
-		go drive(SourceVAD)
-		wg.Wait()
-
-		checkInvariants(sink.snapshot())
-	})
-
-	It("rejects the dual-writer interleaving the legacy mechanism allowed", func() {
-		// Equivalent sequence to the legacy double-start race: start id1, then two
-		// superseding starts (id2, id3) such as the read-loop and VAD would each
-		// issue. Each Start is serialized by the coordinator, so each supersede
-		// cancels+terminates the previous -- never two live at once.
-		sink := &recordingSink{}
-		c := New(sink)
-
-		Expect(c.Apply(Start{ID: 1, Source: SourceClient})).To(Succeed())
-		Expect(c.Apply(Start{ID: 2, Source: SourceVAD})).To(Succeed())
-		Expect(c.Apply(Start{ID: 3, Source: SourceClient})).To(Succeed())
-
-		checkInvariants(sink.snapshot())
-
-		got, ok := c.State().(Active)
-		Expect(ok).To(BeTrue(), "state = %s, want Active(3)", c.State())
-		Expect(got.ID).To(Equal(ResponseID(3)))
-	})
-
-	It("terminates on shutdown and rejects any later response (no resurrection)", func() {
-		sink := &recordingSink{}
-		c := New(sink)
-
-		Expect(c.Apply(Start{ID: 1, Source: SourceClient})).To(Succeed())
-		Expect(c.Apply(Shutdown{})).To(Succeed()) // cancels id 1 + goes terminal
-		Expect(c.State()).To(Equal(State(Terminated{})))
-
-		// A late response.create after teardown is structurally rejected.
-		Expect(c.Apply(Start{ID: 2, Source: SourceClient})).To(Succeed())
-		Expect(c.State()).To(Equal(State(Terminated{})))
-		// And a stale Finished from the cancelled response is absorbed.
-		Expect(c.Apply(Finished{ID: 1})).To(Succeed())
-
-		checkInvariants(sink.snapshot())
-		starts := 0
-		for _, e := range sink.snapshot() {
-			if _, ok := e.(StartResponse); ok {
-				starts++
-			}
-		}
-		Expect(starts).To(Equal(1), "only id 1 ever started; the post-shutdown Start was rejected")
-	})
-})
-
-// legacyCoord models the LEGACY startResponse/cancelActiveResponse mechanism, in
-// which the snapshot ("lock" read), the cancel-and-wait, and the spawn are NOT
-// atomic with respect to each other across the two driving goroutines. It exists
-// only to demonstrate the dual-writer race (Part 2, failure mode 2) that
-// respcoord.Coordinator eliminates. It is not used in production.
-//
-// Mapping to the legacy code:
-//   - startStep1  = snapshot Session.activeResponse* under responseMu
-//   - startStep2  = cancelActiveResponse: cancel() then <-done (outside the lock);
-//     a second waiter on an already-closed done returns immediately and does NOT
-//     decrement again (modeled by the snap==registered guard)
-//   - startStep3  = store the new cancel/done pair and spawn the goroutine
-type legacyCoord struct {
-	live       int    // # of live response goroutines (the bug: can exceed 1)
-	registered uint64 // id of the currently-registered response (0 = none)
-	nextID     uint64
-}
-
-func (l *legacyCoord) startStep1() uint64 { return l.registered } // snapshot
-
-func (l *legacyCoord) startStep2(snap uint64) { // cancel-and-wait
-	if snap != 0 && snap == l.registered {
-		l.live--
-		l.registered = 0
-	}
-}
-
-func (l *legacyCoord) startStep3() { // spawn + register
-	l.nextID++
-	l.live++
-	l.registered = l.nextID
-}
-
-var _ = DescribeTable("respcoord stringers",
-	func(got, want string) { Expect(got).To(Equal(want)) },
-	Entry(nil, SourceClient.String(), "client"),
-	Entry(nil, SourceVAD.String(), "vad"),
-	Entry(nil, Source(99).String(), "Source(99)"),
-
-	Entry(nil, StatusCompleted.String(), "completed"),
-	Entry(nil, StatusCancelled.String(), "cancelled"),
-	Entry(nil, Status(99).String(), "Status(99)"),
-
-	Entry(nil, Idle{}.String(), "Idle"),
-	Entry(nil, Active{ID: 7}.String(), "Active(7)"),
-	Entry(nil, Terminated{}.String(), "Terminated"),
-
-	Entry(nil, Start{ID: 1, Source: SourceVAD}.String(), "Start(1,vad)"),
-	Entry(nil, Finished{ID: 2}.String(), "Finished(2)"),
-	Entry(nil, Cancel{Source: SourceClient}.String(), "Cancel(client)"),
-	Entry(nil, Shutdown{}.String(), "Shutdown"),
-
-	Entry(nil, CancelResponse{ID: 3}.String(), "CancelResponse(3)"),
-	Entry(nil, StartResponse{ID: 4}.String(), "StartResponse(4)"),
-	Entry(nil, EmitTerminal{ID: 5, Status: StatusCompleted}.String(), "EmitTerminal(5,completed)"),
-)
-
-var _ = Describe("legacy dual-writer characterization", func() {
-	// Pins the exact interleaving in which the read-loop and the VAD goroutine
-	// both start a response and the machine ends up with TWO live responses. This
-	// is a characterization test for the bug: if a future change to the legacy
-	// model accidentally fixes it, this spec flips and we delete the legacy model.
-	// The production path uses respcoord.Coordinator, proven safe above.
-	It("can reach two live responses (the bug respcoord eliminates)", func() {
-		l := &legacyCoord{}
-
-		// First response established normally.
-		s := l.startStep1()
-		l.startStep2(s)
-		l.startStep3() // live=1, registered=1
-		Expect(l.live).To(Equal(1), "setup")
-
-		// The race: both goroutines snapshot the SAME active response (id 1)...
-		snapVAD := l.startStep1()    // 1
-		snapClient := l.startStep1() // 1
-
-		// ...both "cancel-and-wait" it. The first decrements; the second finds it
-		// already gone and does nothing.
-		l.startStep2(snapVAD)    // live=0, registered=0
-		l.startStep2(snapClient) // no-op (already 0)
-
-		// ...then both spawn their replacement.
-		l.startStep3() // live=1
-		l.startStep3() // live=2  <-- two live responses
-
-		Expect(l.live).To(Equal(2), "expected the legacy race to reach 2 live responses")
-	})
-})
--- a/core/http/endpoints/openai/ttscoord/ttscoord.go
+++ b/core/http/endpoints/openai/ttscoord/ttscoord.go
@@ -1,150 +0,0 @@
-// Package ttscoord is the explicit state machine for the realtime API's
-// TTS-pipeline lifecycle (machine "M5" in docs/design/realtime-state-machines.md).
-//
-// The realtime TTS pipeline (realtime_tts_pipeline.go) decouples synthesis from
-// LLM token generation: the token callback enqueues clauses, a single worker
-// goroutine synthesizes them in order, and wait() closes the queue and joins the
-// worker. In the legacy code the lifecycle is an implicit `closed bool` (guarded
-// by the pipeline mutex) plus a `done` channel closed once by the worker. Two
-// gaps: enqueue does NOT check `closed`, so a clause offered after wait() is
-// silently appended to a worker that may have already exited (dropped); and the
-// open/closed lifecycle is inferred from a bool rather than stored.
-//
-// This package makes the lifecycle explicit:
-//   - a sealed sum type for State (Open | Closing | Closed) — monotonic; illegal
-//     reversals are unrepresentable,
-//   - a total, pure transition function Next(state, event) -> (state, effects),
-//   - a single-writer Coordinator that serializes every transition.
-//
-// It is a genuine two-writer machine: the producer goroutine raises Close (from
-// wait()), and the worker goroutine raises WorkerExited when it has drained the
-// queue and seen the close — so serializing the transition matters. The poison
-// `failed` latch stays a lock-free atomic.Bool in the pipeline (it is read per
-// clause on the worker's hot path and is orthogonal to open/closed); this machine
-// owns only the open->closing->closed lifecycle.
-//
-// Guarantees the spec checks:
-//   - Close wakes the worker to exit exactly once (idempotent wait(); invariant
-//     #10),
-//   - the lifecycle is monotonic and Closed is terminal — so a clause is never
-//     accepted after close (enqueue is gated on Open) and the worker is joined
-//     exactly once (no leak; invariant #8).
-package ttscoord
-
-import (
-	"fmt"
-
-	"github.com/mudler/LocalAI/core/http/endpoints/openai/coordinator"
-)
-
-// State is the sealed sum type of TTS-pipeline lifecycle states. Exhaustively:
-// Open | Closing | Closed.
-type State interface {
-	isState()
-	String() string
-}
-
-// Open: the worker is running and accepting clauses.
-type Open struct{}
-
-// Closing: wait() has been called; the worker is draining the remaining queue and
-// will exit. No new clause is accepted.
-type Closing struct{}
-
-// Closed: the worker has exited (its done channel is closed). Terminal.
-type Closed struct{}
-
-func (Open) isState()    {}
-func (Closing) isState() {}
-func (Closed) isState()  {}
-
-func (Open) String() string    { return "Open" }
-func (Closing) String() string { return "Closing" }
-func (Closed) String() string  { return "Closed" }
-
-// Event is the sealed sum type of inputs. Exhaustively: Close | WorkerExited.
-type Event interface {
-	isEvent()
-	String() string
-}
-
-// Close is raised by the producer goroutine (wait()): close the queue and ask
-// the worker to finish. Idempotent.
-type Close struct{}
-
-// WorkerExited is raised by the worker goroutine when it has drained the queue
-// and observed the close, just before it closes its done channel.
-type WorkerExited struct{}
-
-func (Close) isEvent()        {}
-func (WorkerExited) isEvent() {}
-
-func (Close) String() string        { return "Close" }
-func (WorkerExited) String() string { return "WorkerExited" }
-
-// Effect is a side effect returned by Next as data. Exhaustively: Wake.
-type Effect interface {
-	isEffect()
-	String() string
-}
-
-// Wake: signal the worker (via the buffered wake channel) so it re-checks the
-// lifecycle and exits. Emitted once, on the Open->Closing transition.
-type Wake struct{}
-
-func (Wake) isEffect() {}
-
-func (Wake) String() string { return "Wake" }
-
-// Next is the total, pure transition function. For every (state, event) it
-// returns the next state and the ordered effects. It returns a non-nil error
-// only for an unknown State/Event implementation. Every in-domain pair is
-// defined; there are no forbidden transitions, only no-ops.
-//
-// The lifecycle is monotonic Open -> Closing -> Closed. Close wakes the worker
-// only on the first Open->Closing transition (idempotent wait()); a later Close
-// is absorbed. WorkerExited only advances Closing -> Closed.
-func Next(s State, e Event) (State, []Effect, error) {
-	switch s.(type) {
-	case Open:
-		switch e.(type) {
-		case Close:
-			return Closing{}, []Effect{Wake{}}, nil
-		case WorkerExited:
-			// Worker exited while still Open (e.g. never any clause and an early
-			// close race) -- treat as fully closed; defensive, keeps Next total.
-			return Closed{}, nil, nil
-		}
-	case Closing:
-		switch e.(type) {
-		case Close:
-			// Idempotent wait(): already closing, no second wake.
-			return Closing{}, nil, nil
-		case WorkerExited:
-			return Closed{}, nil, nil
-		}
-	case Closed:
-		switch e.(type) {
-		case Close:
-			return Closed{}, nil, nil
-		case WorkerExited:
-			return Closed{}, nil, nil
-		}
-	}
-	return s, nil, fmt.Errorf("ttscoord: unhandled transition %s <- %s", s, e)
-}
-
-// EffectSink performs the effects produced by a transition. See coordinator.Sink:
-// Wake does a non-blocking send on a buffered channel, so Perform does not block
-// under the lock.
-type EffectSink = coordinator.Sink[Effect]
-
-// Coordinator serializes the TTS-pipeline transitions. The producer (Close) and
-// worker (WorkerExited) goroutines both call Apply, so the lock serializes the
-// two writers. See coordinator.Coordinator.
-type Coordinator = coordinator.Coordinator[State, Event, Effect]
-
-// New returns an Open Coordinator that performs effects via sink.
-func New(sink EffectSink) *Coordinator {
-	return coordinator.New[State, Event, Effect](Open{}, Next, sink)
-}
--- a/core/http/endpoints/openai/ttscoord/ttscoord_suite_test.go
+++ b/core/http/endpoints/openai/ttscoord/ttscoord_suite_test.go
@@ -1,13 +0,0 @@
-package ttscoord
-
-import (
-	"testing"
-
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-func TestTtscoord(t *testing.T) {
-	RegisterFailHandler(Fail)
-	RunSpecs(t, "ttscoord (realtime M5) Suite")
-}
--- a/core/http/endpoints/openai/ttscoord/ttscoord_test.go
+++ b/core/http/endpoints/openai/ttscoord/ttscoord_test.go
@@ -1,165 +0,0 @@
-package ttscoord
-
-import (
-	"math/rand/v2"
-	"sync"
-
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-// recordingSink captures the ordered stream of effects.
-type recordingSink struct {
-	mu  sync.Mutex
-	log []Effect
-}
-
-func (s *recordingSink) Perform(e Effect) {
-	s.mu.Lock()
-	s.log = append(s.log, e)
-	s.mu.Unlock()
-}
-
-func (s *recordingSink) wakes() int {
-	s.mu.Lock()
-	defer s.mu.Unlock()
-	n := 0
-	for _, e := range s.log {
-		if _, ok := e.(Wake); ok {
-			n++
-		}
-	}
-	return n
-}
-
-type unknownEvent struct{}
-
-func (unknownEvent) isEvent()       {}
-func (unknownEvent) String() string { return "unknownEvent" }
-
-type unknownState struct{}
-
-func (unknownState) isState()       {}
-func (unknownState) String() string { return "unknownState" }
-
-var _ = Describe("ttscoord.Next", func() {
-	DescribeTable("transitions",
-		func(state State, event Event, wantState State, wantEff []Effect) {
-			gotState, gotEff, err := Next(state, event)
-			Expect(err).NotTo(HaveOccurred())
-			Expect(gotState).To(Equal(wantState))
-			Expect(gotEff).To(Equal(wantEff))
-		},
-		Entry("open+close -> closing: wake",
-			Open{}, Close{}, Closing{}, []Effect{Wake{}}),
-		Entry("open+workerexited -> closed (defensive)",
-			Open{}, WorkerExited{}, Closed{}, []Effect(nil)),
-		Entry("closing+close -> closing, no-op (idempotent wait)",
-			Closing{}, Close{}, Closing{}, []Effect(nil)),
-		Entry("closing+workerexited -> closed",
-			Closing{}, WorkerExited{}, Closed{}, []Effect(nil)),
-		Entry("closed+close -> closed, no-op",
-			Closed{}, Close{}, Closed{}, []Effect(nil)),
-		Entry("closed+workerexited -> closed, no-op",
-			Closed{}, WorkerExited{}, Closed{}, []Effect(nil)),
-	)
-
-	It("is total over the defined (state, event) pairs", func() {
-		for _, s := range []State{Open{}, Closing{}, Closed{}} {
-			for _, e := range []Event{Close{}, WorkerExited{}} {
-				_, _, err := Next(s, e)
-				Expect(err).NotTo(HaveOccurred(), "Next(%s, %s)", s, e)
-			}
-		}
-	})
-
-	It("errors on an unknown event type", func() {
-		_, _, err := Next(Open{}, unknownEvent{})
-		Expect(err).To(HaveOccurred())
-	})
-
-	It("errors on an unknown state type", func() {
-		_, _, err := Next(unknownState{}, Close{})
-		Expect(err).To(HaveOccurred())
-	})
-})
-
-// phaseOf maps a state to a monotonic rank for the "never goes backwards" check.
-func phaseOf(s State) int {
-	switch s.(type) {
-	case Open:
-		return 0
-	case Closing:
-		return 1
-	case Closed:
-		return 2
-	default:
-		return -1
-	}
-}
-
-var _ = Describe("ttscoord.Coordinator", func() {
-	It("keeps the lifecycle monotonic and wakes at most once over random sequences", func() {
-		seeds := []uint64{1, 2, 3, 42, 1337, 0xC0FFEE}
-		for _, seed := range seeds {
-			r := rand.New(rand.NewPCG(seed, 0xA5A5A5A5))
-			sink := &recordingSink{}
-			c := New(sink)
-			prev := 0
-
-			for range 5000 {
-				if r.IntN(2) == 0 {
-					Expect(c.Apply(Close{})).To(Succeed())
-				} else {
-					Expect(c.Apply(WorkerExited{})).To(Succeed())
-				}
-				cur := phaseOf(c.State())
-				Expect(cur).To(BeNumerically(">=", prev), "seed=%d: lifecycle went backwards", seed)
-				prev = cur
-			}
-			Expect(sink.wakes()).To(BeNumerically("<=", 1), "seed=%d: woke more than once", seed)
-		}
-	})
-
-	// Two-writer test: a producer raises Close while the "worker" raises
-	// WorkerExited, the real concurrency. The lifecycle must stay monotonic and
-	// Wake must fire at most once. Run under -race.
-	It("is two-writer safe (producer Close vs worker WorkerExited)", func() {
-		const iterations = 200
-		for range iterations {
-			sink := &recordingSink{}
-			c := New(sink)
-			var wg sync.WaitGroup
-			wg.Add(2)
-			go func() { defer wg.Done(); _ = c.Apply(Close{}) }()
-			go func() { defer wg.Done(); _ = c.Apply(WorkerExited{}) }()
-			wg.Wait()
-			// After both, drive to terminal and assert idempotence.
-			_ = c.Apply(Close{})
-			_ = c.Apply(WorkerExited{})
-			Expect(c.State()).To(Equal(State(Closed{})))
-			Expect(sink.wakes()).To(BeNumerically("<=", 1))
-		}
-	})
-
-	It("only Open accepts (a gate query never panics across states)", func() {
-		// Mirrors the pipeline's enqueue gate: accepted iff Open.
-		sink := &recordingSink{}
-		c := New(sink)
-		_, open := c.State().(Open)
-		Expect(open).To(BeTrue())
-		Expect(c.Apply(Close{})).To(Succeed())
-		_, open = c.State().(Open)
-		Expect(open).To(BeFalse())
-	})
-})
-
-var _ = DescribeTable("ttscoord stringers",
-	func(got, want string) { Expect(got).To(Equal(want)) },
-	Entry(nil, Open{}.String(), "Open"),
-	Entry(nil, Closing{}.String(), "Closing"),
-	Entry(nil, Closed{}.String(), "Closed"),
-	Entry(nil, Close{}.String(), "Close"),
-	Entry(nil, WorkerExited{}.String(), "WorkerExited"),
-	Entry(nil, Wake{}.String(), "Wake"),
-)
--- a/core/http/endpoints/openai/turncoord/turncoord.go
+++ b/core/http/endpoints/openai/turncoord/turncoord.go
@@ -1,255 +0,0 @@
-// Package turncoord is the explicit state machine for the realtime API's
-// turn-detection concern (machine "M2" in
-// docs/design/realtime-state-machines.md).
-//
-// In the legacy code this machine is implicit and, worse, split across TWO
-// variables that can disagree: handleVAD's goroutine-local speechStarted bool
-// and the semantic_vad liveTurnState's "is the live stream open" flag
-// (lts.open()). They are set and cleared at separate points, so a discardTurn
-// (no-speech clear, a semantic->server mode switch mid-turn, or teardown)
-// closes the live stream but leaves speechStarted true. The two then disagree,
-// and the next speech onset is suppressed because `if !speechStarted` is false
-// — the user's next utterance silently produces no speech_started, no barge-in,
-// and no commit. See docs/design/realtime-state-machines.md, Part 2 (failure
-// mode 4) and the turn_lifecycle spec under formal-verification/.
-//
-// This package replaces that with:
-//   - a sealed sum type for State (illegal states are unrepresentable),
-//   - a total, pure transition function Next(state, event) -> (state, effects),
-//   - a single-writer Coordinator that serializes every transition.
-//
-// "Speech detected" and "a turn is open" become ONE state (Speaking), so they
-// can no longer fall out of sync: every path that ends a turn returns to Idle
-// and necessarily clears both. The design guarantees the invariants the specs
-// check:
-//   - speechStarted ⟺ a turn is open (Part 4, invariant #4) — structural here,
-//   - a barge-in cancel precedes the next turn's commit (you must pass through
-//     Speaking, which barges in on entry, before a Silence can commit),
-//   - every opened turn is finished (commit) or discarded (abort) exactly once.
-//
-// Unlike M3 (respcoord), which is a genuine dual-writer race, M2's turn
-// lifecycle is driven by the single handleVAD goroutine: the value here is
-// making the speechStarted/turn-open desync unrepresentable, not serializing
-// concurrent writers. The Coordinator still serializes transitions so that
-// State() is race-free and a teardown-time Abort from another goroutine (or a
-// future second writer) stays safe.
-//
-// Mode note: in server_vad mode there is no live ASR stream, so OpenTurn /
-// DiscardTurn have nothing to open or close — the sink performs them as no-ops
-// and "turn open" is satisfied vacuously. The state coupling (Speaking ⟺ turn
-// open) still holds; it is only semantic_vad that had two real variables to
-// desync.
-package turncoord
-
-import (
-	"fmt"
-
-	"github.com/mudler/LocalAI/core/http/endpoints/openai/coordinator"
-)
-
-// TurnID identifies one user turn. The caller mints it when speech begins (it
-// is the conversation item id the live caption deltas stream under, reused by
-// the committed event so the client replaces the partial text). Carrying it in
-// the state makes "commit/discard refer to the turn that was opened" explicit.
-type TurnID string
-
-// AbortReason records why a turn was dropped without committing. Like
-// respcoord.Source it is observability only — every reason aborts the same way;
-// keeping it in the event makes the distinct legacy discardTurn sites explicit
-// rather than collapsed into one anonymous code path.
-type AbortReason int
-
-const (
-	// AbortNoSpeech: the no-speech clear — the VAD found no segments and the
-	// buffer is past the holdback, so the inspected audio was not speech.
-	AbortNoSpeech AbortReason = iota
-	// AbortTeardown: the session is closing.
-	AbortTeardown
-)
-
-// NOTE: a semantic->server turn-detection switch mid-turn is deliberately NOT an
-// Abort: it only drops the orphaned live ASR stream and lets the turn continue
-// under server_vad (so a config change can't cut off a mid-utterance speaker).
-// That orphan cleanup stays inline in handleVAD; only the two reasons above end
-// a turn (return to Idle).
-
-func (r AbortReason) String() string {
-	switch r {
-	case AbortNoSpeech:
-		return "no_speech"
-	case AbortTeardown:
-		return "teardown"
-	default:
-		return fmt.Sprintf("AbortReason(%d)", int(r))
-	}
-}
-
-// State is the sealed sum type of turn-detection states. The only
-// implementations are the marker-method structs in this file, so callers
-// outside the package cannot fabricate an out-of-band state. Exhaustively:
-// Idle | Speaking.
-type State interface {
-	isState()
-	String() string
-}
-
-// Idle: no turn is open and no speech is in progress (legacy: speechStarted ==
-// false AND the live stream is closed — here a single state, so they cannot
-// disagree).
-type Idle struct{}
-
-// Speaking: a turn is open and speech is in progress (legacy: speechStarted ==
-// true AND, in semantic mode, the live stream open). Turn is the open turn's id.
-type Speaking struct{ Turn TurnID }
-
-func (Idle) isState()     {}
-func (Speaking) isState() {}
-
-func (Idle) String() string       { return "Idle" }
-func (s Speaking) String() string { return fmt.Sprintf("Speaking(%s)", s.Turn) }
-
-// Event is the sealed sum type of inputs. Exhaustively: Onset | Silence | Abort.
-type Event interface {
-	isEvent()
-	String() string
-}
-
-// Onset reports that the VAD found speech this tick. Turn is the id to open the
-// turn under (allocated by the caller so caption deltas can stream immediately).
-// While already Speaking it is a no-op: re-detection of ongoing speech does not
-// reopen a turn (legacy `if !speechStarted`).
-type Onset struct{ Turn TurnID }
-
-// Silence reports VAD-confirmed silence past the dynamic commit threshold (the
-// end-of-speech commit trigger). The threshold itself — semantic_vad's EOU vs
-// eagerness fallback — is computed by the caller before raising this event; the
-// machine only sequences the commit. It is a no-op while Idle (nothing to
-// commit).
-type Silence struct{}
-
-// Abort drops the open turn without committing (no-speech clear, mode switch,
-// teardown). It is a no-op while Idle (nothing open).
-type Abort struct{ Reason AbortReason }
-
-func (Onset) isEvent()   {}
-func (Silence) isEvent() {}
-func (Abort) isEvent()   {}
-
-func (e Onset) String() string { return fmt.Sprintf("Onset(%s)", e.Turn) }
-func (Silence) String() string { return "Silence" }
-func (e Abort) String() string { return fmt.Sprintf("Abort(%s)", e.Reason) }
-
-// Effect is a side effect returned by Next as data for the caller to perform.
-// Returning effects as data (rather than firing callbacks inside the
-// transition) keeps Next pure and exhaustively testable. Exhaustively:
-// BargeIn | OpenTurn | EmitSpeechStarted | EmitSpeechStopped | CommitTurn |
-// DiscardTurn.
-type Effect interface {
-	isEffect()
-	String() string
-}
-
-// BargeIn: cancel any in-flight response (the M2->M3 edge). Emitted on the
-// Idle->Speaking onset, before the new turn can ever commit — so a barge-in
-// always precedes the next commit.
-type BargeIn struct{}
-
-// OpenTurn: open the live ASR stream for Turn (semantic_vad). No-op in
-// server_vad mode.
-type OpenTurn struct{ Turn TurnID }
-
-// EmitSpeechStarted: send input_audio_buffer.speech_started.
-type EmitSpeechStarted struct{}
-
-// EmitSpeechStopped: send input_audio_buffer.speech_stopped.
-type EmitSpeechStopped struct{}
-
-// CommitTurn: finalize the turn's live stream, emit input_audio_buffer.committed
-// for Turn, and issue the response (via respcoord). The completion of one turn.
-type CommitTurn struct{ Turn TurnID }
-
-// DiscardTurn: close the turn's live stream and retract any caption deltas
-// already shown for Turn (the failed transcription event). No commit, no
-// response.
-type DiscardTurn struct{ Turn TurnID }
-
-func (BargeIn) isEffect()           {}
-func (OpenTurn) isEffect()          {}
-func (EmitSpeechStarted) isEffect() {}
-func (EmitSpeechStopped) isEffect() {}
-func (CommitTurn) isEffect()        {}
-func (DiscardTurn) isEffect()       {}
-
-func (BargeIn) String() string           { return "BargeIn" }
-func (e OpenTurn) String() string        { return fmt.Sprintf("OpenTurn(%s)", e.Turn) }
-func (EmitSpeechStarted) String() string { return "EmitSpeechStarted" }
-func (EmitSpeechStopped) String() string { return "EmitSpeechStopped" }
-func (e CommitTurn) String() string      { return fmt.Sprintf("CommitTurn(%s)", e.Turn) }
-func (e DiscardTurn) String() string     { return fmt.Sprintf("DiscardTurn(%s)", e.Turn) }
-
-// Next is the total, pure transition function. For every (state, event) it
-// returns the next state and the ordered effects to perform. It returns a
-// non-nil error only for an unknown State/Event implementation (a programmer
-// error / future type added without updating this function) — callers must
-// surface that, never silently ignore it. Every in-domain (state, event) pair
-// is defined; there are no "forbidden" transitions, only no-ops for events that
-// don't apply to the current state.
-//
-// The crux of the fix is that both turn-ending transitions (Silence commit and
-// Abort) go to Idle, which carries no turn data: there is no way to clear "turn
-// open" while leaving "speech started" set, because they are the same state.
-// The legacy desync (discardTurn closed the live stream but left speechStarted
-// true) is therefore unrepresentable.
-//
-// Effect ordering on onset mirrors the live handleVAD: OpenTurn (start the live
-// stream), then BargeIn (cancel the prior response), then EmitSpeechStarted.
-func Next(s State, e Event) (State, []Effect, error) {
-	switch st := s.(type) {
-	case Idle:
-		switch ev := e.(type) {
-		case Onset:
-			return Speaking{Turn: ev.Turn}, []Effect{
-				OpenTurn{Turn: ev.Turn},
-				BargeIn{},
-				EmitSpeechStarted{},
-			}, nil
-		case Silence:
-			// Nothing in flight to commit: idempotent no-op.
-			return Idle{}, nil, nil
-		case Abort:
-			// No open turn: idempotent no-op (discardTurn on a closed stream).
-			return Idle{}, nil, nil
-		}
-	case Speaking:
-		switch e.(type) {
-		case Onset:
-			// Speech already in progress: re-detection does not reopen a turn
-			// or re-emit speech_started (legacy `if !speechStarted`). The turn
-			// id stays the one allocated at onset.
-			return Speaking{Turn: st.Turn}, nil, nil
-		case Silence:
-			return Idle{}, []Effect{
-				EmitSpeechStopped{},
-				CommitTurn{Turn: st.Turn},
-			}, nil
-		case Abort:
-			return Idle{}, []Effect{DiscardTurn{Turn: st.Turn}}, nil
-		}
-	}
-	return s, nil, fmt.Errorf("turncoord: unhandled transition %s <- %s", s, e)
-}
-
-// EffectSink performs the effects produced by a transition. See coordinator.Sink
-// for the non-blocking contract: Perform runs under the coordinator lock, so it
-// must not block and must not re-enter Apply.
-type EffectSink = coordinator.Sink[Effect]
-
-// Coordinator serializes turn transitions. In practice the handleVAD goroutine is
-// the only writer, but serializing keeps State() race-free and a teardown-time
-// Abort from another goroutine safe. See coordinator.Coordinator.
-type Coordinator = coordinator.Coordinator[State, Event, Effect]
-
-// New returns an idle Coordinator that performs effects via sink.
-func New(sink EffectSink) *Coordinator {
-	return coordinator.New[State, Event, Effect](Idle{}, Next, sink)
-}
--- a/Show More
+++ b/Show More