fix(realtime): keep transcription model on a language-only session.update

A transcription session.update that carries only a language (no model) — e.g. a client forcing the STT input language — has an empty Transcription.Model. updateSession unconditionally copied that empty value into session.ModelConfig.Pipeline.Transcription, blanking the pipeline's configured transcription backend. The next utterance was then transcribed against an empty model and the backend RPC failed with "unimplemented" (surfaced to the client as transcription_failed), so transcription silently stopped whenever a language was selected.

Only adopt the incoming transcription model when it is non-empty, and preserve the existing model otherwise (mirroring updateTransSession).

Signed-off-by: mudler <mudler@localai.io>
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-12 18:58:49 -04:00 · 2026-06-12 22:57:55 +00:00
158 changed files with 564 additions and 7376 deletions
--- a/.github/backend-matrix.yml
+++ b/.github/backend-matrix.yml
@@ -703,19 +703,6 @@ include:
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
-  - build-type: 'cublas'
-    cuda-major-version: "12"
-    cuda-minor-version: "8"
-    platforms: 'linux/amd64'
-    tag-latest: 'auto'
-    tag-suffix: '-gpu-nvidia-cuda-12-locate-anything-cpp'
-    runs-on: 'ubuntu-latest'
-    base-image: "ubuntu:24.04"
-    skip-drivers: 'false'
-    backend: "locate-anything-cpp"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2404'
  - build-type: 'cublas'
    cuda-major-version: "12"
    cuda-minor-version: "8"
@@ -1556,19 +1543,6 @@ include:
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
-  - build-type: 'cublas'
-    cuda-major-version: "13"
-    cuda-minor-version: "0"
-    platforms: 'linux/amd64'
-    tag-latest: 'auto'
-    tag-suffix: '-gpu-nvidia-cuda-13-locate-anything-cpp'
-    runs-on: 'ubuntu-latest'
-    base-image: "ubuntu:24.04"
-    skip-drivers: 'false'
-    backend: "locate-anything-cpp"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2404'
  - build-type: 'cublas'
    cuda-major-version: "13"
    cuda-minor-version: "0"
@@ -1595,19 +1569,6 @@ include:
    backend: "rfdetr-cpp"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
-  - build-type: 'cublas'
-    cuda-major-version: "13"
-    cuda-minor-version: "0"
-    platforms: 'linux/arm64'
-    skip-drivers: 'false'
-    tag-latest: 'auto'
-    tag-suffix: '-nvidia-l4t-cuda-13-arm64-locate-anything-cpp'
-    base-image: "ubuntu:24.04"
-    ubuntu-version: '2404'
-    runs-on: 'ubuntu-24.04-arm'
-    backend: "locate-anything-cpp"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
  - build-type: 'cublas'
    cuda-major-version: "13"
    cuda-minor-version: "0"
@@ -2845,74 +2806,6 @@ include:
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
-  # locate-anything-cpp
-  - build-type: ''
-    cuda-major-version: ""
-    cuda-minor-version: ""
-    platforms: 'linux/amd64'
-    tag-latest: 'auto'
-    tag-suffix: '-cpu-locate-anything-cpp'
-    runs-on: 'ubuntu-latest'
-    base-image: "ubuntu:24.04"
-    skip-drivers: 'false'
-    backend: "locate-anything-cpp"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2404'
-  - build-type: 'sycl_f32'
-    cuda-major-version: ""
-    cuda-minor-version: ""
-    platforms: 'linux/amd64'
-    tag-latest: 'auto'
-    tag-suffix: '-gpu-intel-sycl-f32-locate-anything-cpp'
-    runs-on: 'ubuntu-latest'
-    base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
-    skip-drivers: 'false'
-    backend: "locate-anything-cpp"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2404'
-  - build-type: 'sycl_f16'
-    cuda-major-version: ""
-    cuda-minor-version: ""
-    platforms: 'linux/amd64'
-    tag-latest: 'auto'
-    tag-suffix: '-gpu-intel-sycl-f16-locate-anything-cpp'
-    runs-on: 'ubuntu-latest'
-    base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
-    skip-drivers: 'false'
-    backend: "locate-anything-cpp"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2404'
-  - build-type: 'vulkan'
-    cuda-major-version: ""
-    cuda-minor-version: ""
-    platforms: 'linux/amd64'
-    platform-tag: 'amd64'
-    tag-latest: 'auto'
-    tag-suffix: '-gpu-vulkan-locate-anything-cpp'
-    runs-on: 'ubuntu-latest'
-    base-image: "ubuntu:24.04"
-    skip-drivers: 'false'
-    backend: "locate-anything-cpp"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2404'
-  - build-type: 'vulkan'
-    cuda-major-version: ""
-    cuda-minor-version: ""
-    platforms: 'linux/arm64'
-    platform-tag: 'arm64'
-    tag-latest: 'auto'
-    tag-suffix: '-gpu-vulkan-locate-anything-cpp'
-    runs-on: 'ubuntu-24.04-arm'
-    base-image: "ubuntu:24.04"
-    skip-drivers: 'false'
-    backend: "locate-anything-cpp"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2404'
  - build-type: 'sycl_f32'
    cuda-major-version: ""
    cuda-minor-version: ""
@@ -3006,19 +2899,6 @@ include:
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2204'
-  - build-type: 'cublas'
-    cuda-major-version: "12"
-    cuda-minor-version: "0"
-    platforms: 'linux/arm64'
-    skip-drivers: 'false'
-    tag-latest: 'auto'
-    tag-suffix: '-nvidia-l4t-arm64-locate-anything-cpp'
-    base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
-    runs-on: 'ubuntu-24.04-arm'
-    backend: "locate-anything-cpp"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2204'
  # whisper
  - build-type: ''
    cuda-major-version: ""
@@ -4461,10 +4341,6 @@ includeDarwin:
    tag-suffix: "-metal-darwin-arm64-silero-vad"
    build-type: "metal"
    lang: "go"
-  - backend: "sherpa-onnx"
-    tag-suffix: "-metal-darwin-arm64-sherpa-onnx"
-    build-type: "metal"
-    lang: "go"
  - backend: "local-store"
    tag-suffix: "-metal-darwin-arm64-local-store"
    build-type: "metal"
@@ -4472,6 +4348,3 @@ includeDarwin:
  - backend: "llama-cpp-quantization"
    tag-suffix: "-metal-darwin-arm64-llama-cpp-quantization"
    build-type: "mps"
-  - backend: "speaker-recognition"
-    tag-suffix: "-metal-darwin-arm64-speaker-recognition"
-    build-type: "mps"
--- a/.github/workflows/bump_deps.yaml
+++ b/.github/workflows/bump_deps.yaml
@@ -62,10 +62,6 @@ jobs:
            variable: "RFDETR_VERSION"
            branch: "main"
            file: "backend/go/rfdetr-cpp/Makefile"
-          - repository: "mudler/locate-anything.cpp"
-            variable: "LOCATEANYTHING_VERSION"
-            branch: "master"
-            file: "backend/go/locate-anything-cpp/Makefile"
          - repository: "predict-woo/qwen3-tts.cpp"
            variable: "QWEN3TTS_CPP_VERSION"
            branch: "main"
--- a/.github/workflows/test-extra.yml
+++ b/.github/workflows/test-extra.yml
@@ -38,7 +38,6 @@ jobs:
      acestep-cpp: ${{ steps.detect.outputs.acestep-cpp }}
      qwen3-tts-cpp: ${{ steps.detect.outputs.qwen3-tts-cpp }}
      rfdetr-cpp: ${{ steps.detect.outputs.rfdetr-cpp }}
-      locate-anything-cpp: ${{ steps.detect.outputs.locate-anything-cpp }}
      vibevoice-cpp: ${{ steps.detect.outputs.vibevoice-cpp }}
      localvqe: ${{ steps.detect.outputs.localvqe }}
      voxtral: ${{ steps.detect.outputs.voxtral }}
@@ -564,7 +563,7 @@ jobs:
      - name: Run e2e-backends smoke
        env:
          BACKEND_IMAGE: quay.io/go-skynet/local-ai-backends:master-cpu-llama-cpp
-          BACKEND_TEST_CAPS: health,load,predict,stream,logprobs,logit_bias,tokenize
+          BACKEND_TEST_CAPS: health,load,predict,stream,logprobs,logit_bias
        run: |
          make test-extra-backend
  # Realtime e2e with sherpa-onnx driving VAD + STT + TTS against a mocked LLM.
@@ -902,45 +901,6 @@ jobs:
      - name: Test rfdetr-cpp
        run: |
          make --jobs=5 --output-sync=target -C backend/go/rfdetr-cpp test
-  # Per-backend e2e for locate-anything-cpp: builds the .so + Go binary and
-  # runs `make -C backend/go/locate-anything-cpp test`. test.sh fetches the
-  # locate-anything-q8_0 GGUF (~6.3 GB, NVIDIA LocateAnything-3B) from the
-  # published mudler/locate-anything.cpp-gguf HF repo + a COCO image, then the
-  # Go wire test loads the model and runs an open-vocabulary Detect, asserting
-  # at least one labeled box. Heavier than the other Go backends (it is a 3B),
-  # so it is gated to changes under backend/go/locate-anything-cpp/.
-  tests-locate-anything-cpp:
-    needs: detect-changes
-    if: needs.detect-changes.outputs.locate-anything-cpp == 'true' || needs.detect-changes.outputs.run-all == 'true'
-    runs-on: ubuntu-latest
-    steps:
-      - name: Clone
-        uses: actions/checkout@v6
-        with:
-          submodules: true
-      - name: Dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y build-essential cmake curl libopenblas-dev
-      - name: Setup Go
-        uses: actions/setup-go@v5
-      - name: Display Go version
-        run: go version
-      - name: Proto Dependencies
-        run: |
-          # Install protoc
-          curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v26.1/protoc-26.1-linux-x86_64.zip -o protoc.zip && \
-          unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
-          rm protoc.zip
-          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
-          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
-          PATH="$PATH:$HOME/go/bin" make protogen-go
-      - name: Build locate-anything-cpp
-        run: |
-          make --jobs=5 --output-sync=target -C backend/go/locate-anything-cpp
-      - name: Test locate-anything-cpp
-        run: |
-          make --jobs=5 --output-sync=target -C backend/go/locate-anything-cpp test
  # Per-backend smoke for vibevoice-cpp: builds the .so + Go binary and
  # runs `make -C backend/go/vibevoice-cpp test`. test.sh auto-downloads
  # the published mudler/vibevoice.cpp-models bundle (TTS Q8_0 + ASR Q4_K
--- a/1
+++ b/1
@@ -108,7 +108,6 @@ RUN <<EOT bash
        apt-get update && \
        apt-get install -y --no-install-recommends \
            cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
-            cuda-nvrtc-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
            libcufft-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
            libcurand-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
            libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
--- a/2
+++ b/2
@@ -566,7 +566,6 @@ prepare-test-extra: protogen-python
 	$(MAKE) -C backend/python/speaker-recognition
 	$(MAKE) -C backend/rust/kokoros kokoros-grpc
 	$(MAKE) -C backend/go/rfdetr-cpp
-	$(MAKE) -C backend/go/locate-anything-cpp

 test-extra: prepare-test-extra
 	$(MAKE) -C backend/python/transformers test
@@ -594,7 +593,6 @@ test-extra: prepare-test-extra
 	$(MAKE) -C backend/python/speaker-recognition test
 	$(MAKE) -C backend/rust/kokoros test
 	$(MAKE) -C backend/go/rfdetr-cpp test
-	$(MAKE) -C backend/go/locate-anything-cpp test

 ##
 ## End-to-end gRPC tests that exercise a built backend container image.
--- a/backend/Dockerfile.golang
+++ b/backend/Dockerfile.golang
@@ -206,16 +206,6 @@ RUN if [ "${BACKEND}" = "opus" ]; then \
    apt-get clean && rm -rf /var/lib/apt/lists/*; \
 fi

-# CrispASR's piper TTS backend dlopens libespeak-ng at runtime to phonemize
-# non-English text (the MIT-clean path; English uses a built-in G2P). Install
-# the espeak-ng runtime + its libpcaudio/libsonic deps + voice data so
-# package.sh can bundle them into the FROM scratch image.
-RUN if [ "${BACKEND}" = "crispasr" ]; then \
-    apt-get update && apt-get install -y --no-install-recommends \
-        espeak-ng-data libespeak-ng1 libpcaudio0 libsonic0 && \
-    apt-get clean && rm -rf /var/lib/apt/lists/*; \
-fi
-
 COPY . /LocalAI

 RUN git config --global --add safe.directory /LocalAI
--- a/backend/Dockerfile.python
+++ b/backend/Dockerfile.python
@@ -126,7 +126,6 @@ RUN <<EOT bash
        apt-get update && \
        apt-get install -y --no-install-recommends \
            cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
-            cuda-nvrtc-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
            libcufft-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
            libcurand-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
            libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
--- a/backend/cpp/ds4/Makefile
+++ b/backend/cpp/ds4/Makefile
@@ -1,10 +1,10 @@
 # ds4 backend Makefile.
 #
-# Upstream pin lives below as DS4_VERSION?=d881f2a05e8ff6bec001315a36b794b4aa310173
+# Upstream pin lives below as DS4_VERSION?=91bafb5acd5a6cf00b1e55ef68bf40ddd207bee7
 # (.github/bump_deps.sh) can find and update it - matches the
 # llama-cpp / ik-llama-cpp / turboquant convention.

-DS4_VERSION?=d881f2a05e8ff6bec001315a36b794b4aa310173
+DS4_VERSION?=91bafb5acd5a6cf00b1e55ef68bf40ddd207bee7
 DS4_REPO?=https://github.com/antirez/ds4

 CURRENT_MAKEFILE_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
--- a/backend/cpp/llama-cpp/Makefile
+++ b/backend/cpp/llama-cpp/Makefile
@@ -1,5 +1,5 @@

-LLAMA_VERSION?=4c6595503fe45d5a39f88d194e270f64c7424677
+LLAMA_VERSION?=039e20a2db9e87b2477c76cc04905f3e1acad77f
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp

 CMAKE_ARGS?=
--- a/backend/cpp/llama-cpp/grpc-server.cpp
+++ b/backend/cpp/llama-cpp/grpc-server.cpp
@@ -3486,7 +3486,7 @@ public:
        if (body.count("prompt") != 0) {
            const bool add_special = json_value(body, "add_special", false);

-            llama_tokens tokens = tokenize_mixed(ctx_server.impl->vocab, body.at("prompt"), add_special, true);
+            llama_tokens tokens = tokenize_mixed(ctx_server.impl->vocab, body.at("content"), add_special, true);


            for (const auto& token : tokens) {
--- a/backend/go/crispasr/Makefile
+++ b/backend/go/crispasr/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # CrispASR version (release tag)
 CRISPASR_REPO?=https://github.com/CrispStrobe/CrispASR
-CRISPASR_VERSION?=d745bda4386ae0f9d1d2f23fff8ec95d76428221
+CRISPASR_VERSION?=c29f6653a516a3001d923944dad8892072cc7334
 SO_TARGET?=libgocrispasr.so

 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
--- a/backend/go/crispasr/gocrispasr.go
+++ b/backend/go/crispasr/gocrispasr.go
@@ -11,7 +11,6 @@ import (

 	"github.com/go-audio/audio"
 	"github.com/go-audio/wav"
-	gguf "github.com/gpustack/gguf-parser-go"
 	"github.com/mudler/LocalAI/pkg/grpc/base"
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 	"github.com/mudler/LocalAI/pkg/utils"
@@ -38,39 +37,6 @@ var (

 type CrispASR struct {
 	base.SingleThread
-	// sampleRate is the output rate (Hz) of the loaded TTS engine's PCM, used to
-	// write a correct WAV header. Most CrispASR TTS backends emit 24 kHz, but
-	// piper returns its model's native rate (16 kHz for x_low/low voices,
-	// 22.05 kHz for medium/high), so it is read from the GGUF metadata at Load.
-	sampleRate int
-}
-
-// defaultTTSSampleRate is the output rate assumed for CrispASR TTS engines that
-// don't advertise one in GGUF metadata (vibevoice/orpheus/chatterbox/qwen3-tts
-// all emit 24 kHz). piper is the exception and carries piper.sample_rate.
-const defaultTTSSampleRate = 24000
-
-// piperSampleRate reads the piper.sample_rate metadata key from a GGUF model.
-// CrispASR's piper backend returns PCM at the model's native rate without
-// resampling, so the WAV header must match it. Returns ok=false for non-piper
-// models (key absent) or an unreadable file, letting the caller fall back to
-// defaultTTSSampleRate.
-func piperSampleRate(modelPath string) (int, bool) {
-	// Only scalar architecture keys are read, so skip the large array metadata
-	// (phoneme map) and mmap the header - same rationale as pkg/vram's reader.
-	f, err := gguf.ParseGGUFFile(modelPath, gguf.UseMMap(), gguf.SkipLargeMetadata())
-	if err != nil {
-		return 0, false
-	}
-	kv, ok := f.Header.MetadataKV.Get("piper.sample_rate")
-	if !ok || kv.ValueType != gguf.GGUFMetadataValueTypeUint32 {
-		return 0, false
-	}
-	rate := int(kv.ValueUint32())
-	if rate <= 0 {
-		return 0, false
-	}
-	return rate, true
 }

 // splitOption splits a "prefix:value" model option into its key and value,
@@ -137,14 +103,6 @@ func (w *CrispASR) Load(opts *pb.ModelOptions) error {
 		return fmt.Errorf("Failed to load CrispASR transcription model")
 	}

-	// Determine the TTS output sample rate for the WAV header. piper voices
-	// carry their native rate in GGUF metadata and CrispASR does not resample;
-	// every other engine emits the 24 kHz default.
-	w.sampleRate = defaultTTSSampleRate
-	if rate, ok := piperSampleRate(opts.ModelFile); ok {
-		w.sampleRate = rate
-	}
-
 	// Load the companion file (codec/tokenizer/s3gen) after the session is open.
 	// rc==0 means success or "not applicable" for the active backend; only a
 	// negative code is fatal.
@@ -432,7 +390,7 @@ func (w *CrispASR) synthesize(text string) ([]float32, error) {
 	}
 	defer CppTTSFree(ptr)
 	src := unsafe.Slice((*float32)(unsafe.Pointer(ptr)), int(n)) //nolint:govet // ptr addresses C-allocated PCM returned across the purego boundary; copied out immediately below, before tts_free.
-	out := make([]float32, int(n))                               // copy out of C memory before free
+	out := make([]float32, int(n)) // copy out of C memory before free
 	copy(out, src)
 	return out, nil
 }
@@ -459,7 +417,7 @@ func (w *CrispASR) TTS(req *pb.TTSRequest) error {
 	if err != nil {
 		return err
 	}
-	return writeWAV(req.Dst, pcm, w.sampleRate)
+	return writeWAV24k(req.Dst, pcm)
 }

 // TTSStream is the streaming counterpart to TTS. CrispASR has no progressive
@@ -489,7 +447,7 @@ func (w *CrispASR) TTSStream(req *pb.TTSRequest, results chan []byte) error {
 	}
 	defer func() { _ = os.Remove(dst) }()

-	if err := writeWAV(dst, pcm, w.sampleRate); err != nil {
+	if err := writeWAV24k(dst, pcm); err != nil {
 		return err
 	}

@@ -501,14 +459,14 @@ func (w *CrispASR) TTSStream(req *pb.TTSRequest, results chan []byte) error {
 	return nil
 }

-// writeWAV writes pcm as a sampleRate Hz, mono, 16-bit PCM WAV at dst.
-func writeWAV(dst string, pcm []float32, sampleRate int) error {
+// writeWAV24k writes pcm as a 24000 Hz, mono, 16-bit PCM WAV at dst.
+func writeWAV24k(dst string, pcm []float32) error {
 	f, err := os.Create(dst)
 	if err != nil {
 		return fmt.Errorf("crispasr: create %q: %w", dst, err)
 	}

-	enc := wav.NewEncoder(f, sampleRate, 16, 1, 1)
+	enc := wav.NewEncoder(f, 24000, 16, 1, 1)
 	ints := make([]int, len(pcm))
 	for i, s := range pcm {
 		if s > 1 {
@@ -519,7 +477,7 @@ func writeWAV(dst string, pcm []float32, sampleRate int) error {
 		ints[i] = int(s * 32767)
 	}
 	buf := &audio.IntBuffer{
-		Format:         &audio.Format{NumChannels: 1, SampleRate: sampleRate},
+		Format:         &audio.Format{NumChannels: 1, SampleRate: 24000},
 		Data:           ints,
 		SourceBitDepth: 16,
 	}
--- a/backend/go/crispasr/gocrispasr_samplerate_test.go
+++ b/backend/go/crispasr/gocrispasr_samplerate_test.go
@@ -1,164 +0,0 @@
-package main
-
-import (
-	"bytes"
-	"encoding/binary"
-	"os"
-	"path/filepath"
-
-	"github.com/go-audio/wav"
-	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-// GGUF metadata value type tags (subset) from the GGUF spec.
-const (
-	ggufTypeUint32 uint32 = 4
-	ggufTypeString uint32 = 8
-)
-
-type ggufKV struct {
-	key   string
-	vtype uint32
-	val   any
-}
-
-// writeMinimalGGUF emits a valid, tensor-less GGUF file carrying only the given
-// metadata key-values. Enough for the header-only parse path piperSampleRate
-// uses; avoids pulling a real multi-MB voice into the test.
-func writeMinimalGGUF(path string, kvs []ggufKV) error {
-	var b bytes.Buffer
-	b.WriteString("GGUF")                                // magic
-	_ = binary.Write(&b, binary.LittleEndian, uint32(3)) // version
-	_ = binary.Write(&b, binary.LittleEndian, uint64(0)) // tensor count
-	_ = binary.Write(&b, binary.LittleEndian, uint64(len(kvs)))
-	for _, kv := range kvs {
-		_ = binary.Write(&b, binary.LittleEndian, uint64(len(kv.key)))
-		b.WriteString(kv.key)
-		_ = binary.Write(&b, binary.LittleEndian, kv.vtype)
-		switch v := kv.val.(type) {
-		case uint32:
-			_ = binary.Write(&b, binary.LittleEndian, v)
-		case string:
-			_ = binary.Write(&b, binary.LittleEndian, uint64(len(v)))
-			b.WriteString(v)
-		}
-	}
-	return os.WriteFile(path, b.Bytes(), 0o644)
-}
-
-// wavSampleRate decodes the WAV header at path and returns its sample rate.
-func wavSampleRate(path string) (int, error) {
-	f, err := os.Open(path)
-	if err != nil {
-		return 0, err
-	}
-	defer func() { _ = f.Close() }()
-	dec := wav.NewDecoder(f)
-	dec.ReadInfo()
-	return int(dec.SampleRate), nil
-}
-
-var _ = Describe("piper sample rate", func() {
-	Context("piperSampleRate", func() {
-		It("reads piper.sample_rate from a piper GGUF (medium = 22050)", func() {
-			p := filepath.Join(GinkgoT().TempDir(), "voice.gguf")
-			Expect(writeMinimalGGUF(p, []ggufKV{
-				{key: "general.architecture", vtype: ggufTypeString, val: "piper"},
-				{key: "piper.sample_rate", vtype: ggufTypeUint32, val: uint32(22050)},
-			})).To(Succeed())
-
-			rate, ok := piperSampleRate(p)
-			Expect(ok).To(BeTrue(), "piper.sample_rate should be found")
-			Expect(rate).To(Equal(22050))
-		})
-
-		It("reads the low-quality rate (16000)", func() {
-			p := filepath.Join(GinkgoT().TempDir(), "voice.gguf")
-			Expect(writeMinimalGGUF(p, []ggufKV{
-				{key: "piper.sample_rate", vtype: ggufTypeUint32, val: uint32(16000)},
-			})).To(Succeed())
-
-			rate, ok := piperSampleRate(p)
-			Expect(ok).To(BeTrue())
-			Expect(rate).To(Equal(16000))
-		})
-
-		It("returns ok=false for a non-piper GGUF (no piper.sample_rate key)", func() {
-			p := filepath.Join(GinkgoT().TempDir(), "other.gguf")
-			Expect(writeMinimalGGUF(p, []ggufKV{
-				{key: "general.architecture", vtype: ggufTypeString, val: "vibevoice"},
-			})).To(Succeed())
-
-			_, ok := piperSampleRate(p)
-			Expect(ok).To(BeFalse())
-		})
-
-		It("returns ok=false for an unreadable/non-GGUF file", func() {
-			p := filepath.Join(GinkgoT().TempDir(), "garbage.gguf")
-			Expect(os.WriteFile(p, []byte("not a gguf"), 0o644)).To(Succeed())
-
-			_, ok := piperSampleRate(p)
-			Expect(ok).To(BeFalse())
-		})
-	})
-
-	// End-to-end through the built .so. Gated on CRISPASR_PIPER_MODEL_PATH (a
-	// real piper voice GGUF) like the other model-backed specs; never runs in
-	// default CI. Proves CrispASR's piper backend output rate flows into the
-	// WAV header instead of the hardcoded 24 kHz default.
-	Context("piper TTS end-to-end", func() {
-		It("writes the WAV at the model's native piper.sample_rate", func() {
-			model := os.Getenv("CRISPASR_PIPER_MODEL_PATH")
-			if model == "" {
-				Skip("set CRISPASR_PIPER_MODEL_PATH to run the piper e2e spec")
-			}
-			ensureLibLoaded()
-
-			expected, ok := piperSampleRate(model)
-			Expect(ok).To(BeTrue(), "model should carry piper.sample_rate metadata")
-
-			w := &CrispASR{}
-			Expect(w.Load(&pb.ModelOptions{
-				ModelFile: model,
-				Options:   []string{"backend:piper"},
-				Threads:   4,
-			})).To(Succeed())
-
-			dst := filepath.Join(GinkgoT().TempDir(), "piper.wav")
-			Expect(w.TTS(&pb.TTSRequest{Text: "Hello from CrispASR piper.", Dst: dst})).To(Succeed())
-
-			info, err := os.Stat(dst)
-			Expect(err).ToNot(HaveOccurred())
-			Expect(info.Size()).To(BeNumerically(">", 1024), "expected a non-trivial WAV")
-
-			rate, err := wavSampleRate(dst)
-			Expect(err).ToNot(HaveOccurred())
-			Expect(rate).To(Equal(expected),
-				"WAV header rate must equal the model's native piper.sample_rate, not the 24k default")
-		})
-	})
-
-	Context("writeWAV", func() {
-		It("writes the WAV header at the given sample rate (22050 for piper, not the 24k default)", func() {
-			dst := filepath.Join(GinkgoT().TempDir(), "out.wav")
-			pcm := make([]float32, 220) // 10 ms of silence is enough for a header
-			Expect(writeWAV(dst, pcm, 22050)).To(Succeed())
-
-			rate, err := wavSampleRate(dst)
-			Expect(err).ToNot(HaveOccurred())
-			Expect(rate).To(Equal(22050))
-		})
-
-		It("writes a 16000 Hz header for low-quality piper voices", func() {
-			dst := filepath.Join(GinkgoT().TempDir(), "out.wav")
-			pcm := make([]float32, 160)
-			Expect(writeWAV(dst, pcm, 16000)).To(Succeed())
-
-			rate, err := wavSampleRate(dst)
-			Expect(err).ToNot(HaveOccurred())
-			Expect(rate).To(Equal(16000))
-		})
-	})
-})
--- a/backend/go/crispasr/package.sh
+++ b/backend/go/crispasr/package.sh
@@ -51,32 +51,6 @@ else
    exit 1
 fi

-# Bundle espeak-ng (+ its libpcaudio/libsonic runtime deps) and its voice data so
-# the piper TTS backend can phonemize non-English text. CrispASR dlopens
-# libespeak-ng.so.1 at runtime (the MIT-clean path); the dlopen succeeds loading
-# libespeak-ng but FAILS if libpcaudio/libsonic are absent, so all three .so are
-# required. run.sh points CRISPASR_ESPEAK_DATA_PATH at the bundled data dir.
-# Best-effort: only copied when present, so a local dev build without espeak-ng
-# installed still packages the rest (English voices keep working).
-ESPEAK_LIBDIR=""
-for d in /usr/lib/x86_64-linux-gnu /usr/lib/aarch64-linux-gnu; do
-    if [ -f "$d/libespeak-ng.so.1" ]; then
-        ESPEAK_LIBDIR="$d"
-        break
-    fi
-done
-if [ -n "$ESPEAK_LIBDIR" ]; then
-    echo "Bundling espeak-ng from $ESPEAK_LIBDIR ..."
-    cp -arfLv "$ESPEAK_LIBDIR/libespeak-ng.so.1" $CURDIR/package/lib/
-    cp -arfLv "$ESPEAK_LIBDIR/libpcaudio.so.0" $CURDIR/package/lib/
-    cp -arfLv "$ESPEAK_LIBDIR/libsonic.so.0" $CURDIR/package/lib/
-    if [ -d "$ESPEAK_LIBDIR/espeak-ng-data" ]; then
-        cp -arfLv "$ESPEAK_LIBDIR/espeak-ng-data" $CURDIR/package/
-    fi
-else
-    echo "espeak-ng not found; non-English piper voices will not phonemize"
-fi
-
 # Package GPU libraries based on BUILD_TYPE
 # The GPU library packaging script will detect BUILD_TYPE and copy appropriate GPU libraries
 GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
--- a/backend/go/crispasr/run.sh
+++ b/backend/go/crispasr/run.sh
@@ -41,11 +41,6 @@ fi
 export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
 export CRISPASR_LIBRARY=$LIBRARY

-# Point piper's espeak-ng phonemizer at the bundled voice data. The variable
-# names the directory CONTAINING espeak-ng-data (package.sh drops it next to
-# this script). Harmless when espeak-ng wasn't bundled.
-export CRISPASR_ESPEAK_DATA_PATH=$CURDIR
-
 # If there is a lib/ld.so, use it
 if [ -f $CURDIR/lib/ld.so ]; then
 	echo "Using lib/ld.so"
--- a/backend/go/locate-anything-cpp/.gitignore
+++ b/backend/go/locate-anything-cpp/.gitignore
@@ -1,7 +0,0 @@
-sources/
-build*/
-package/
-liblocateanythingcpp*.so
-locate-anything-cpp
-test-models/
-test-data/
--- a/backend/go/locate-anything-cpp/CMakeLists.txt
+++ b/backend/go/locate-anything-cpp/CMakeLists.txt
@@ -1,57 +0,0 @@
-cmake_minimum_required(VERSION 3.18)
-project(liblocateanythingcpp LANGUAGES C CXX)
-
-set(CMAKE_POSITION_INDEPENDENT_CODE ON)
-set(CMAKE_CXX_STANDARD 17)
-set(CMAKE_CXX_STANDARD_REQUIRED ON)
-
-# Static-link ggml + locate_anything so the resulting .so has no runtime
-# dependency on extra ggml/locate_anything shared libraries — only on
-# libc/libstdc++/libgomp, which the LocalAI package step bundles into the
-# docker image.
-set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build static libraries" FORCE)
-
-# locate-anything.cpp build switches: skip CLI/tests, keep static lib.
-set(LA_BUILD_CLI OFF CACHE BOOL "Disable locate-anything CLI" FORCE)
-set(LA_BUILD_TESTS OFF CACHE BOOL "Disable locate-anything tests" FORCE)
-set(LA_SHARED OFF CACHE BOOL "Build locate_anything as static lib" FORCE)
-
-# Unlike rt-detr.cpp, locate-anything.cpp ships no in-tree ggml patches, so
-# there is no apply_ggml_patches.sh hook to shim here.
-add_subdirectory(./sources/locate-anything.cpp)
-
-# locate-anything.cpp's top-level CMakeLists points its own target's include
-# dirs at ${CMAKE_SOURCE_DIR}/{include,src,third_party,...}. CMAKE_SOURCE_DIR
-# is the *top-level* source dir of the whole CMake tree, so when we pull it in
-# via add_subdirectory it resolves to OUR directory, not theirs, and the
-# locate_anything target fails to find its own headers (la_capi.h, stb_image.h,
-# la_gguf_keys.h). Re-add the correct, subdir-relative include paths to the
-# already-defined target so it compiles regardless of where it's nested.
-set(LA_SRC ${CMAKE_CURRENT_SOURCE_DIR}/sources/locate-anything.cpp)
-target_include_directories(locate_anything PRIVATE
-    ${LA_SRC}/include
-    ${LA_SRC}/src
-    ${LA_SRC}/third_party
-    ${LA_SRC}/third_party/stb)
-
-# locate-anything.cpp's C-API symbols already live inside liblocate_anything
-# (src/la_capi.cpp is compiled into the lib). We re-export them via a MODULE
-# library that links locate_anything so the symbols are visible at dlopen time.
-add_library(locateanythingcpp MODULE
-    sources/locate-anything.cpp/src/la_capi.cpp)
-
-target_include_directories(locateanythingcpp PRIVATE
-    sources/locate-anything.cpp/include
-    sources/locate-anything.cpp/src
-    sources/locate-anything.cpp/third_party
-    sources/locate-anything.cpp/third_party/stb
-)
-
-target_link_libraries(locateanythingcpp PRIVATE locate_anything ggml)
-
-if(CMAKE_CXX_COMPILER_ID MATCHES "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9.0)
-    target_link_libraries(locateanythingcpp PRIVATE stdc++fs)
-endif()
-
-set_property(TARGET locateanythingcpp PROPERTY CXX_STANDARD 17)
-set_target_properties(locateanythingcpp PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
--- a/backend/go/locate-anything-cpp/Makefile
+++ b/backend/go/locate-anything-cpp/Makefile
@@ -1,134 +0,0 @@
-CMAKE_ARGS?=
-BUILD_TYPE?=
-NATIVE?=false
-
-GOCMD?=go
-GO_TAGS?=
-JOBS?=$(shell nproc --ignore=1)
-
-# locate-anything.cpp. Pin to a specific commit for a stable build; leaving
-# this on `master` always picks up the latest C-API surface (incl. the
-# per-detection accessor functions used by golocateanythingcpp.go).
-LOCATEANYTHING_REPO?=https://github.com/mudler/locate-anything.cpp.git
-LOCATEANYTHING_VERSION?=60e450945476d5e97e0754a8c0e71a9ea81690e0
-
-ifeq ($(NATIVE),false)
-	CMAKE_ARGS+=-DGGML_NATIVE=OFF
-endif
-
-# Forward LocalAI's BUILD_TYPE to the matching ggml backend switch.
-ifeq ($(BUILD_TYPE),cublas)
-	CMAKE_ARGS+=-DGGML_CUDA=ON -DLA_GGML_CUDA=ON
-else ifeq ($(BUILD_TYPE),openblas)
-	CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
-else ifeq ($(BUILD_TYPE),clblas)
-	CMAKE_ARGS+=-DGGML_CLBLAST=ON
-else ifeq ($(BUILD_TYPE),hipblas)
-	ROCM_HOME ?= /opt/rocm
-	ROCM_PATH ?= /opt/rocm
-	export CXX=$(ROCM_HOME)/llvm/bin/clang++
-	export CC=$(ROCM_HOME)/llvm/bin/clang
-	AMDGPU_TARGETS?=gfx908,gfx90a,gfx942,gfx950,gfx1030,gfx1100,gfx1101,gfx1102,gfx1200,gfx1201
-	CMAKE_ARGS+=-DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=$(AMDGPU_TARGETS)
-else ifeq ($(BUILD_TYPE),vulkan)
-	CMAKE_ARGS+=-DGGML_VULKAN=ON -DLA_GGML_VULKAN=ON
-else ifeq ($(OS),Darwin)
-	ifneq ($(BUILD_TYPE),metal)
-		CMAKE_ARGS+=-DGGML_METAL=OFF
-	else
-		CMAKE_ARGS+=-DGGML_METAL=ON
-		CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
-		CMAKE_ARGS+=-DLA_GGML_METAL=ON
-	endif
-endif
-
-ifeq ($(BUILD_TYPE),sycl_f16)
-	CMAKE_ARGS+=-DGGML_SYCL=ON \
-		-DCMAKE_C_COMPILER=icx \
-		-DCMAKE_CXX_COMPILER=icpx \
-		-DGGML_SYCL_F16=ON
-endif
-
-ifeq ($(BUILD_TYPE),sycl_f32)
-	CMAKE_ARGS+=-DGGML_SYCL=ON \
-		-DCMAKE_C_COMPILER=icx \
-		-DCMAKE_CXX_COMPILER=icpx
-endif
-
-sources/locate-anything.cpp:
-	mkdir -p sources && \
-	git clone --recursive $(LOCATEANYTHING_REPO) sources/locate-anything.cpp && \
-	cd sources/locate-anything.cpp && \
-	git checkout $(LOCATEANYTHING_VERSION) && \
-	git submodule update --init --recursive --depth 1 --single-branch
-
-# Detect OS
-UNAME_S := $(shell uname -s)
-
-# Only build CPU variants on Linux
-ifeq ($(UNAME_S),Linux)
-	VARIANT_TARGETS = liblocateanythingcpp-avx.so liblocateanythingcpp-avx2.so liblocateanythingcpp-avx512.so liblocateanythingcpp-fallback.so
-else
-	# On non-Linux (e.g., Darwin), build only fallback variant
-	VARIANT_TARGETS = liblocateanythingcpp-fallback.so
-endif
-
-locate-anything-cpp: main.go golocateanythingcpp.go $(VARIANT_TARGETS)
-	CGO_ENABLED=0 $(GOCMD) build -tags "$(GO_TAGS)" -o locate-anything-cpp ./
-
-package: locate-anything-cpp
-	bash package.sh
-
-build: package
-
-clean: purge
-	rm -rf liblocateanythingcpp*.so locate-anything-cpp package sources
-
-purge:
-	rm -rf build*
-
-# Build all variants (Linux only)
-ifeq ($(UNAME_S),Linux)
-liblocateanythingcpp-avx.so: sources/locate-anything.cpp
-	rm -rfv build-$@
-	$(info ${GREEN}I locate-anything-cpp build info:avx${RESET})
-	SO_TARGET=$@ CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) liblocateanythingcpp-custom
-	rm -rfv build-$@
-
-liblocateanythingcpp-avx2.so: sources/locate-anything.cpp
-	rm -rfv build-$@
-	$(info ${GREEN}I locate-anything-cpp build info:avx2${RESET})
-	SO_TARGET=$@ CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on -DGGML_BMI2=on" $(MAKE) liblocateanythingcpp-custom
-	rm -rfv build-$@
-
-liblocateanythingcpp-avx512.so: sources/locate-anything.cpp
-	rm -rfv build-$@
-	$(info ${GREEN}I locate-anything-cpp build info:avx512${RESET})
-	SO_TARGET=$@ CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on -DGGML_BMI2=on" $(MAKE) liblocateanythingcpp-custom
-	rm -rfv build-$@
-endif
-
-# Build fallback variant (all platforms)
-liblocateanythingcpp-fallback.so: sources/locate-anything.cpp
-	rm -rfv build-$@
-	$(info ${GREEN}I locate-anything-cpp build info:fallback${RESET})
-	SO_TARGET=$@ CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) liblocateanythingcpp-custom
-	rm -rfv build-$@
-
-liblocateanythingcpp-custom: CMakeLists.txt
-	mkdir -p build-$(SO_TARGET) && \
-	cd build-$(SO_TARGET) && \
-	cmake .. $(CMAKE_ARGS) && \
-	cmake --build . --config Release -j$(JOBS) && \
-	cd .. && \
-	mv build-$(SO_TARGET)/liblocateanythingcpp.so ./$(SO_TARGET)
-
-all: locate-anything-cpp package
-
-# `test` is invoked by the top-level Makefile's `test-extra` target. It builds
-# the backend binary + the fallback shared library (needed for dlopen at
-# runtime), then runs test.sh which downloads the q8_0 GGUF + COCO image and
-# exercises the gRPC Load/Detect wire path via the Go smoke test in
-# main_test.go.
-test: locate-anything-cpp liblocateanythingcpp-fallback.so
-	bash test.sh
--- a/backend/go/locate-anything-cpp/golocateanythingcpp.go
+++ b/backend/go/locate-anything-cpp/golocateanythingcpp.go
@@ -1,174 +0,0 @@
-package main
-
-// golocateanythingcpp.go - gRPC handlers (Load, Detect) for the
-// locate-anything-cpp backend.
-//
-// Embeds base.SingleThread to default unimplemented RPCs to "not supported"
-// while we only implement open-vocabulary object detection (Detect).
-
-import (
-	"encoding/base64"
-	"fmt"
-	"os"
-	"path/filepath"
-	"unsafe"
-
-	"github.com/mudler/LocalAI/pkg/grpc/base"
-	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
-)
-
-// la_ctx* is an opaque handle. la_capi_load returns it directly (0 == failure),
-// unlike rfdetr's out-parameter convention.
-var (
-	// la_capi_load(const char* gguf_path, int n_threads) -> la_ctx* (0 = fail)
-	CapiLoad func(gguf string, nThreads int32) uintptr
-	// la_capi_free(la_ctx* ctx)
-	CapiFree func(handle uintptr)
-	// la_capi_locate_path(ctx, image_path, prompt, mode) -> char* json (0 = err)
-	CapiLocatePath func(handle uintptr, imagePath string, prompt string, mode int32) uintptr
-	// la_capi_locate_buffer(ctx, bytes, len, prompt, mode) -> char* json (0 = err)
-	CapiLocateBuffer func(handle uintptr, bytes uintptr, length uintptr, prompt string, mode int32) uintptr
-	// la_capi_get_n_detections(ctx) -> int
-	CapiGetNDetections func(handle uintptr) int32
-	// la_capi_get_detection_box(ctx, i, out_xyxy[4]) -> int (0 on success)
-	CapiGetDetectionBox func(handle uintptr, i int32, outXYXY uintptr) int32
-	// la_capi_get_detection_label(ctx, i, buf, buf_size) -> int (required size incl NUL; two-call sizing)
-	CapiGetDetectionLabel func(handle uintptr, i int32, buf uintptr, bufSize int32) int32
-	// la_capi_free_string(char* s)
-	CapiFreeString func(s uintptr)
-	// la_capi_last_error(ctx) -> const char* (owned by ctx, "" if none / null ctx).
-	// purego marshals the returned C string into a Go string (a copy), so we
-	// never free it and avoid raw pointer arithmetic.
-	CapiLastError func(handle uintptr) string
-)
-
-type LocateAnythingCpp struct {
-	base.SingleThread
-	handle uintptr
-}
-
-// Load loads the GGUF model at opts.ModelFile (joined with opts.ModelPath if
-// relative) and stores the la_ctx handle for later Detect calls.
-func (r *LocateAnythingCpp) Load(opts *pb.ModelOptions) error {
-	modelFile := opts.ModelFile
-	if modelFile == "" {
-		modelFile = opts.Model
-	}
-	if modelFile == "" {
-		return fmt.Errorf("locate-anything-cpp: ModelFile is empty")
-	}
-
-	var modelPath string
-	if filepath.IsAbs(modelFile) {
-		modelPath = modelFile
-	} else {
-		modelPath = filepath.Join(opts.ModelPath, modelFile)
-	}
-
-	if _, err := os.Stat(modelPath); err != nil {
-		return fmt.Errorf("locate-anything-cpp: model file not found: %s: %w", modelPath, err)
-	}
-
-	threads := opts.Threads
-	if threads <= 0 {
-		threads = 4
-	}
-
-	// Release previous model if any (re-Load).
-	if r.handle != 0 {
-		CapiFree(r.handle)
-		r.handle = 0
-	}
-
-	h := CapiLoad(modelPath, threads)
-	if h == 0 {
-		// la_capi_last_error needs a ctx; on a failed load we have none (it
-		// returns "" for a null ctx), so the text is best-effort. Surface it
-		// when present.
-		if msg := CapiLastError(0); msg != "" {
-			return fmt.Errorf("locate-anything-cpp: la_capi_load failed for %s: %s", modelPath, msg)
-		}
-		return fmt.Errorf("locate-anything-cpp: la_capi_load failed for %s", modelPath)
-	}
-	r.handle = h
-	return nil
-}
-
-// Detect runs open-vocabulary detection on the base64-encoded image in opts.Src
-// using the required text prompt in opts.Prompt, returning one pb.Detection per
-// located object with its predicted label as ClassName.
-func (r *LocateAnythingCpp) Detect(opts *pb.DetectOptions) (pb.DetectResponse, error) {
-	if r.handle == 0 {
-		return pb.DetectResponse{}, fmt.Errorf("locate-anything-cpp: model not loaded")
-	}
-
-	// Open-vocabulary detection is prompt-driven; without a prompt there is
-	// nothing to locate.
-	prompt := opts.Prompt
-	if prompt == "" {
-		return pb.DetectResponse{}, fmt.Errorf("locate-anything-cpp: a text prompt is required (open-vocabulary detection)")
-	}
-
-	// Decode base64 image and write to temp file.
-	imgData, err := base64.StdEncoding.DecodeString(opts.Src)
-	if err != nil {
-		return pb.DetectResponse{}, fmt.Errorf("locate-anything-cpp: failed to decode base64 image: %w", err)
-	}
-
-	tmpFile, err := os.CreateTemp("", "locate-anything-*.img")
-	if err != nil {
-		return pb.DetectResponse{}, fmt.Errorf("locate-anything-cpp: failed to create temp file: %w", err)
-	}
-	defer func() { _ = os.Remove(tmpFile.Name()) }()
-
-	if _, err := tmpFile.Write(imgData); err != nil {
-		_ = tmpFile.Close()
-		return pb.DetectResponse{}, fmt.Errorf("locate-anything-cpp: failed to write temp file: %w", err)
-	}
-	if err := tmpFile.Close(); err != nil {
-		return pb.DetectResponse{}, fmt.Errorf("locate-anything-cpp: failed to close temp file: %w", err)
-	}
-
-	// mode 0 = hybrid (Parallel Box Decoding). The JSON return value is unused:
-	// structured detections are read via the accessor functions. Still must
-	// free the returned string.
-	jsonPtr := CapiLocatePath(r.handle, tmpFile.Name(), prompt, 0)
-	if jsonPtr != 0 {
-		CapiFreeString(jsonPtr)
-	}
-
-	n := CapiGetNDetections(r.handle)
-	if n < 0 {
-		return pb.DetectResponse{}, fmt.Errorf("locate-anything-cpp: invalid n_detections=%d", n)
-	}
-
-	detections := make([]*pb.Detection, 0, n)
-	for i := int32(0); i < n; i++ {
-		var xyxy [4]float32 // x1, y1, x2, y2
-		if CapiGetDetectionBox(r.handle, i, uintptr(unsafe.Pointer(&xyxy[0]))) != 0 {
-			continue
-		}
-
-		// Two-call sizing for the label string.
-		label := ""
-		need := CapiGetDetectionLabel(r.handle, i, 0, 0)
-		if need > 0 {
-			buf := make([]byte, need)
-			CapiGetDetectionLabel(r.handle, i, uintptr(unsafe.Pointer(&buf[0])), need)
-			label = string(buf[:need-1])
-		}
-
-		detections = append(detections, &pb.Detection{
-			X:          xyxy[0],
-			Y:          xyxy[1],
-			Width:      xyxy[2] - xyxy[0],
-			Height:     xyxy[3] - xyxy[1],
-			Confidence: 1.0,
-			ClassName:  label,
-		})
-	}
-
-	return pb.DetectResponse{
-		Detections: detections,
-	}, nil
-}
--- a/backend/go/locate-anything-cpp/main.go
+++ b/backend/go/locate-anything-cpp/main.go
@@ -1,59 +0,0 @@
-package main
-
-// main.go - entry point for the locate-anything-cpp gRPC backend.
-//
-// Dlopens liblocateanythingcpp-<variant>.so via purego at the path in
-// LOCATEANYTHING_LIBRARY (set by run.sh based on /proc/cpuinfo), registers
-// the la_capi_* C ABI symbols, then starts the gRPC server.
-
-import (
-	"flag"
-	"os"
-
-	"github.com/ebitengine/purego"
-	grpc "github.com/mudler/LocalAI/pkg/grpc"
-)
-
-var (
-	addr = flag.String("addr", "localhost:50051", "the address to connect to")
-)
-
-type LibFuncs struct {
-	FuncPtr any
-	Name    string
-}
-
-func main() {
-	// Get library name from environment variable, default to fallback
-	libName := os.Getenv("LOCATEANYTHING_LIBRARY")
-	if libName == "" {
-		libName = "./liblocateanythingcpp-fallback.so"
-	}
-
-	lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
-	if err != nil {
-		panic(err)
-	}
-
-	libFuncs := []LibFuncs{
-		{&CapiLoad, "la_capi_load"},
-		{&CapiFree, "la_capi_free"},
-		{&CapiLocatePath, "la_capi_locate_path"},
-		{&CapiLocateBuffer, "la_capi_locate_buffer"},
-		{&CapiGetNDetections, "la_capi_get_n_detections"},
-		{&CapiGetDetectionBox, "la_capi_get_detection_box"},
-		{&CapiGetDetectionLabel, "la_capi_get_detection_label"},
-		{&CapiFreeString, "la_capi_free_string"},
-		{&CapiLastError, "la_capi_last_error"},
-	}
-
-	for _, lf := range libFuncs {
-		purego.RegisterLibFunc(lf.FuncPtr, lib, lf.Name)
-	}
-
-	flag.Parse()
-
-	if err := grpc.StartServer(*addr, &LocateAnythingCpp{}); err != nil {
-		panic(err)
-	}
-}
--- a/backend/go/locate-anything-cpp/main_test.go
+++ b/backend/go/locate-anything-cpp/main_test.go
@@ -1,176 +0,0 @@
-package main
-
-// main_test.go - end-to-end smoke test for the locate-anything-cpp gRPC backend.
-//
-// Spawns the compiled locate-anything-cpp binary on a free local port, dials it
-// via gRPC, and exercises LoadModel + Detect against the test fixtures
-// downloaded by test.sh: the q8_0 GGUF of nvidia/LocateAnything-3B and a real
-// COCO image with people + cars. Asserts that open-vocabulary detection driven
-// by a text prompt returns at least one detection, each carrying a non-empty
-// class name and a bounding box of non-zero size.
-//
-// The spec Skip()s cleanly if its fixtures (the ~6.3 GB model, the test image,
-// the built binary, or the fallback .so) are missing, so the test target stays
-// usable on a fresh checkout / on CI runners where the large model hasn't been
-// downloaded.
-
-import (
-	"context"
-	"encoding/base64"
-	"fmt"
-	"net"
-	"os"
-	"os/exec"
-	"path/filepath"
-	"testing"
-	"time"
-
-	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-	"google.golang.org/grpc"
-	"google.golang.org/grpc/credentials/insecure"
-)
-
-func TestDetect(t *testing.T) {
-	RegisterFailHandler(Fail)
-	RunSpecs(t, "locate-anything-cpp backend smoke suite")
-}
-
-// freePort grabs an ephemeral TCP port and immediately releases it so the
-// spawned backend can bind to it. There is a tiny TOCTOU window here but in
-// practice it's adequate for a smoke test on a quiet runner.
-func freePort() int {
-	l, err := net.Listen("tcp", "127.0.0.1:0")
-	Expect(err).ToNot(HaveOccurred(), "freePort listen")
-	port := l.Addr().(*net.TCPAddr).Port
-	Expect(l.Close()).To(Succeed())
-	return port
-}
-
-// startBackend spawns the locate-anything-cpp binary on the given port and
-// waits until it accepts TCP connections (up to 10s). It mirrors how main.go
-// resolves the purego library: the LOCATEANYTHING_LIBRARY env var points the
-// dlopen at the freshly built fallback .so, and the la_capi_* symbols are
-// registered there. The returned cleanup func kills the process and reaps it.
-func startBackend(port int) func() {
-	binary, err := filepath.Abs("./locate-anything-cpp")
-	Expect(err).ToNot(HaveOccurred())
-	if _, err := os.Stat(binary); err != nil {
-		Skip(fmt.Sprintf("backend binary not built: %s (run `make locate-anything-cpp` first)", binary))
-	}
-
-	libPath, err := filepath.Abs("./liblocateanythingcpp-fallback.so")
-	Expect(err).ToNot(HaveOccurred())
-	if _, err := os.Stat(libPath); err != nil {
-		Skip(fmt.Sprintf("fallback library not built: %s (run `make liblocateanythingcpp-fallback.so` first)", libPath))
-	}
-
-	addr := fmt.Sprintf("127.0.0.1:%d", port)
-	cmd := exec.Command(binary, "--addr", addr)
-	cmd.Env = append(os.Environ(), "LOCATEANYTHING_LIBRARY="+libPath)
-	cmd.Stdout = os.Stderr
-	cmd.Stderr = os.Stderr
-	Expect(cmd.Start()).To(Succeed())
-
-	cleanup := func() {
-		if cmd.Process != nil {
-			_ = cmd.Process.Kill()
-			_, _ = cmd.Process.Wait()
-		}
-	}
-
-	deadline := time.Now().Add(10 * time.Second)
-	for time.Now().Before(deadline) {
-		c, err := net.DialTimeout("tcp", addr, 200*time.Millisecond)
-		if err == nil {
-			_ = c.Close()
-			return cleanup
-		}
-		time.Sleep(200 * time.Millisecond)
-	}
-
-	cleanup()
-	Fail(fmt.Sprintf("backend did not become ready on %s within 10s", addr))
-	return func() {}
-}
-
-// loadTestImage reads the COCO test image downloaded by test.sh and returns its
-// base64-encoded content (the wire format accepted by the Detect RPC).
-func loadTestImage() string {
-	imgPath, err := filepath.Abs("test-data/test.jpg")
-	Expect(err).ToNot(HaveOccurred())
-	imgBytes, err := os.ReadFile(imgPath)
-	if err != nil {
-		Skip(fmt.Sprintf("test image not present: %s (run test.sh first)", imgPath))
-	}
-	return base64.StdEncoding.EncodeToString(imgBytes)
-}
-
-// dialBackend opens a gRPC client connection to the spawned backend.
-func dialBackend(port int) (pb.BackendClient, func()) {
-	addr := fmt.Sprintf("127.0.0.1:%d", port)
-	conn, err := grpc.NewClient(addr, grpc.WithTransportCredentials(insecure.NewCredentials()))
-	Expect(err).ToNot(HaveOccurred())
-	return pb.NewBackendClient(conn), func() { _ = conn.Close() }
-}
-
-// modelPathOrSkip resolves the model file under ./test-models/ and Skip()s the
-// current spec if it's missing (the ~6.3 GB GGUF is not present on a fresh
-// checkout / on CI runners without the download).
-func modelPathOrSkip(name string) string {
-	modelDir, err := filepath.Abs("test-models")
-	Expect(err).ToNot(HaveOccurred())
-	modelPath := filepath.Join(modelDir, name)
-	if _, err := os.Stat(modelPath); err != nil {
-		Skip(fmt.Sprintf("model not present: %s (run test.sh first)", modelPath))
-	}
-	return modelPath
-}
-
-var _ = Describe("locate-anything-cpp backend", func() {
-	It("runs open-vocabulary detection against a known-good COCO image", func() {
-		modelPath := modelPathOrSkip("locate-anything-q8_0.gguf")
-		imgB64 := loadTestImage()
-
-		port := freePort()
-		cleanup := startBackend(port)
-		defer cleanup()
-
-		client, closeConn := dialBackend(port)
-		defer closeConn()
-
-		// The q8_0 model is ~6.3 GB and hybrid Parallel Box Decoding on CPU is
-		// not cheap, so give LoadModel + Detect a generous deadline.
-		ctx, cancel := context.WithTimeout(context.Background(), 20*time.Minute)
-		defer cancel()
-
-		loadResp, err := client.LoadModel(ctx, &pb.ModelOptions{
-			Model:     "locate-anything-q8_0.gguf",
-			ModelFile: modelPath,
-			Threads:   4,
-		})
-		Expect(err).ToNot(HaveOccurred(), "LoadModel")
-		Expect(loadResp.GetSuccess()).To(BeTrue(), "LoadModel reported failure: %s", loadResp.GetMessage())
-
-		// Open-vocabulary detection is prompt-driven; the prompt names the
-		// classes to locate (people + cars), separated by the </c> control token.
-		detResp, err := client.Detect(ctx, &pb.DetectOptions{
-			Src:    imgB64,
-			Prompt: "Locate all the instances that matches the following description: person</c>car.",
-		})
-		Expect(err).ToNot(HaveOccurred(), "Detect")
-		Expect(detResp.GetDetections()).ToNot(BeEmpty(), "no detections returned on a known-good COCO image")
-
-		_, _ = fmt.Fprintf(GinkgoWriter, "detection OK: %d detections\n", len(detResp.GetDetections()))
-		for i, d := range detResp.GetDetections() {
-			Expect(d.GetClassName()).ToNot(BeEmpty(), "detection %d has empty class_name", i)
-			Expect(d.GetWidth()).To(BeNumerically(">", float32(0)),
-				"detection %d has non-positive width", i)
-			Expect(d.GetHeight()).To(BeNumerically(">", float32(0)),
-				"detection %d has non-positive height", i)
-			_, _ = fmt.Fprintf(GinkgoWriter, "  [%d] %s box=(%.1f,%.1f,%.1fx%.1f)\n",
-				i, d.GetClassName(), d.GetX(), d.GetY(), d.GetWidth(), d.GetHeight())
-		}
-	})
-})
--- a/backend/go/locate-anything-cpp/package.sh
+++ b/backend/go/locate-anything-cpp/package.sh
@@ -1,59 +0,0 @@
-#!/bin/bash
-
-# Script to copy the appropriate libraries based on architecture
-
-set -e
-
-CURDIR=$(dirname "$(realpath $0)")
-REPO_ROOT="${CURDIR}/../../.."
-
-# Create lib directory
-mkdir -p $CURDIR/package/lib
-
-cp -avf $CURDIR/liblocateanythingcpp-*.so $CURDIR/package/
-cp -avf $CURDIR/locate-anything-cpp $CURDIR/package/
-cp -fv $CURDIR/run.sh $CURDIR/package/
-
-# Detect architecture and copy appropriate libraries
-if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
-    # x86_64 architecture
-    echo "Detected x86_64 architecture, copying x86_64 libraries..."
-    cp -arfLv /lib64/ld-linux-x86-64.so.2 $CURDIR/package/lib/ld.so
-    cp -arfLv /lib/x86_64-linux-gnu/libc.so.6 $CURDIR/package/lib/libc.so.6
-    cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1
-    cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6
-    cp -arfLv /lib/x86_64-linux-gnu/libm.so.6 $CURDIR/package/lib/libm.so.6
-    cp -arfLv /lib/x86_64-linux-gnu/libgomp.so.1 $CURDIR/package/lib/libgomp.so.1
-    cp -arfLv /lib/x86_64-linux-gnu/libdl.so.2 $CURDIR/package/lib/libdl.so.2
-    cp -arfLv /lib/x86_64-linux-gnu/librt.so.1 $CURDIR/package/lib/librt.so.1
-    cp -arfLv /lib/x86_64-linux-gnu/libpthread.so.0 $CURDIR/package/lib/libpthread.so.0
-elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
-    # ARM64 architecture
-    echo "Detected ARM64 architecture, copying ARM64 libraries..."
-    cp -arfLv /lib/ld-linux-aarch64.so.1 $CURDIR/package/lib/ld.so
-    cp -arfLv /lib/aarch64-linux-gnu/libc.so.6 $CURDIR/package/lib/libc.so.6
-    cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1
-    cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6
-    cp -arfLv /lib/aarch64-linux-gnu/libm.so.6 $CURDIR/package/lib/libm.so.6
-    cp -arfLv /lib/aarch64-linux-gnu/libgomp.so.1 $CURDIR/package/lib/libgomp.so.1
-    cp -arfLv /lib/aarch64-linux-gnu/libdl.so.2 $CURDIR/package/lib/libdl.so.2
-    cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 $CURDIR/package/lib/librt.so.1
-    cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 $CURDIR/package/lib/libpthread.so.0
-elif [ $(uname -s) = "Darwin" ]; then
-    echo "Detected Darwin"
-else
-    echo "Error: Could not detect architecture"
-    exit 1
-fi
-
-# Package GPU libraries based on BUILD_TYPE
-GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
-if [ -f "$GPU_LIB_SCRIPT" ]; then
-    echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
-    source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib"
-    package_gpu_libs
-fi
-
-echo "Packaging completed successfully"
-ls -liah $CURDIR/package/
-ls -liah $CURDIR/package/lib/
--- a/backend/go/locate-anything-cpp/run.sh
+++ b/backend/go/locate-anything-cpp/run.sh
@@ -1,52 +0,0 @@
-#!/bin/bash
-set -ex
-
-# Get the absolute current dir where the script is located
-CURDIR=$(dirname "$(realpath $0)")
-
-cd /
-
-echo "CPU info:"
-if [ "$(uname)" != "Darwin" ]; then
-	grep -e "model\sname" /proc/cpuinfo | head -1
-	grep -e "flags" /proc/cpuinfo | head -1
-fi
-
-LIBRARY="$CURDIR/liblocateanythingcpp-fallback.so"
-
-if [ "$(uname)" != "Darwin" ]; then
-	if grep -q -e "\savx\s" /proc/cpuinfo ; then
-		echo "CPU:    AVX    found OK"
-		if [ -e $CURDIR/liblocateanythingcpp-avx.so ]; then
-			LIBRARY="$CURDIR/liblocateanythingcpp-avx.so"
-		fi
-	fi
-
-	if grep -q -e "\savx2\s" /proc/cpuinfo ; then
-		echo "CPU:    AVX2   found OK"
-		if [ -e $CURDIR/liblocateanythingcpp-avx2.so ]; then
-			LIBRARY="$CURDIR/liblocateanythingcpp-avx2.so"
-		fi
-	fi
-
-	# Check avx 512
-	if grep -q -e "\savx512f\s" /proc/cpuinfo ; then
-		echo "CPU:    AVX512F found OK"
-		if [ -e $CURDIR/liblocateanythingcpp-avx512.so ]; then
-			LIBRARY="$CURDIR/liblocateanythingcpp-avx512.so"
-		fi
-	fi
-fi
-
-export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
-export LOCATEANYTHING_LIBRARY=$LIBRARY
-
-# If there is a lib/ld.so, use it
-if [ -f $CURDIR/lib/ld.so ]; then
-	echo "Using lib/ld.so"
-	echo "Using library: $LIBRARY"
-	exec $CURDIR/lib/ld.so $CURDIR/locate-anything-cpp "$@"
-fi
-
-echo "Using library: $LIBRARY"
-exec $CURDIR/locate-anything-cpp "$@"
--- a/backend/go/locate-anything-cpp/test.sh
+++ b/backend/go/locate-anything-cpp/test.sh
@@ -1,47 +0,0 @@
-#!/bin/bash
-set -e
-
-CURDIR=$(dirname "$(realpath $0)")
-
-echo "Running locate-anything-cpp backend tests..."
-
-# Test model from the mudler/locate-anything.cpp-gguf HuggingFace repo. This is
-# the q8_0 quantization of nvidia/LocateAnything-3B (~6.3 GB), so the download
-# is the slow step. It is resumed with `curl -C -` and skipped entirely if the
-# file is already present.
-LOCATEANYTHING_MODEL_DIR="${LOCATEANYTHING_MODEL_DIR:-$CURDIR/test-models}"
-
-LOCATEANYTHING_MODEL_FILE="${LOCATEANYTHING_MODEL_FILE:-locate-anything-q8_0.gguf}"
-LOCATEANYTHING_MODEL_URL="${LOCATEANYTHING_MODEL_URL:-https://huggingface.co/mudler/locate-anything.cpp-gguf/resolve/main/locate-anything-q8_0.gguf}"
-
-mkdir -p "$LOCATEANYTHING_MODEL_DIR"
-
-if [ ! -f "$LOCATEANYTHING_MODEL_DIR/$LOCATEANYTHING_MODEL_FILE" ]; then
-    echo "Downloading locate-anything q8_0 model (~6.3 GB, this is slow)..."
-    # -C - resumes a partial download so an interrupted run doesn't restart from 0.
-    curl -L -C - -o "$LOCATEANYTHING_MODEL_DIR/$LOCATEANYTHING_MODEL_FILE" "$LOCATEANYTHING_MODEL_URL" --progress-bar
-fi
-
-# Use a real COCO test image (people + cars) from the upstream rf-detr.cpp repo
-# (~46 KB). Open-vocabulary detection needs real content to locate, so a
-# synthetic image would trivially yield zero detections.
-TEST_IMAGE_DIR="$CURDIR/test-data"
-TEST_IMAGE_FILE="$TEST_IMAGE_DIR/test.jpg"
-TEST_IMAGE_URL="${TEST_IMAGE_URL:-https://raw.githubusercontent.com/mudler/rf-detr.cpp/main/tests/fixtures/ci/test_image.jpg}"
-
-mkdir -p "$TEST_IMAGE_DIR"
-if [ ! -f "$TEST_IMAGE_FILE" ]; then
-    echo "Downloading COCO test image..."
-    curl -L -o "$TEST_IMAGE_FILE" "$TEST_IMAGE_URL" --progress-bar
-fi
-
-echo "locate-anything-cpp test setup complete."
-echo "  model:      $LOCATEANYTHING_MODEL_DIR/$LOCATEANYTHING_MODEL_FILE"
-echo "  test image: $TEST_IMAGE_FILE"
-
-# Run the Go smoke test: spawns the backend binary on a free port, calls
-# LoadModel + Detect via gRPC against the downloaded GGUF + COCO image.
-echo ""
-echo "Running Go smoke test..."
-cd "$CURDIR"
-go test -v -timeout 30m ./...
--- a/backend/go/parakeet-cpp/Makefile
+++ b/backend/go/parakeet-cpp/Makefile
@@ -1,6 +1,6 @@
 # parakeet-cpp backend Makefile.
 #
-# Upstream pin lives below as PARAKEET_VERSION?=b8012f11e5269126eddb7f4fd02f891a2ccc29b0
+# Upstream pin lives below in PARAKEET_VERSION so the bump script
 # (.github/bump_deps.sh) can find and update it - matches the
 # whisper.cpp / ds4 / vibevoice-cpp convention.
 #
@@ -15,7 +15,7 @@
 # That's what the L0 smoke test uses. The default target below does the
 # proper clone-at-pin + cmake build so CI doesn't need a side-checkout.

-PARAKEET_VERSION?=b8012f11e5269126eddb7f4fd02f891a2ccc29b0
+PARAKEET_VERSION?=e270af73b94c9a5c37ec516230219ed4580e1db6
 PARAKEET_REPO?=https://github.com/mudler/parakeet.cpp

 GOCMD?=go
@@ -39,10 +39,7 @@ endif
 # is overwritten back to OFF and the build silently falls back to CPU. Forward the
 # PARAKEET_GGML_* options instead. (openblas is not gated, so -DGGML_BLAS passes through.)
 ifeq ($(BUILD_TYPE),cublas)
-	# GGML_CUDA_GRAPHS is OFF by ggml default; enabling it gives a small free
-	# speedup (~1% measured on GB10, never negative) by capturing/replaying the
-	# CUDA graph. Not gated by parakeet.cpp, so it passes straight through to ggml.
-	CMAKE_ARGS+=-DPARAKEET_GGML_CUDA=ON -DGGML_CUDA_GRAPHS=ON
+	CMAKE_ARGS+=-DPARAKEET_GGML_CUDA=ON
 else ifeq ($(BUILD_TYPE),openblas)
 	CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
 else ifeq ($(BUILD_TYPE),hipblas)
--- a/backend/go/parakeet-cpp/goparakeetcpp.go
+++ b/backend/go/parakeet-cpp/goparakeetcpp.go
@@ -98,21 +98,17 @@ type transcriptJSON struct {
 }

 // streamFeedJSON mirrors the document returned by
-// parakeet_capi_stream_feed_json / parakeet_capi_stream_finalize_json (ABI v5):
+// parakeet_capi_stream_feed_json / parakeet_capi_stream_finalize_json (ABI v4):
 //
-//	{"text":"...","eou":0,"eob":0,"frame_sec":0.080000,
+//	{"text":"...","eou":0,"frame_sec":0.080000,
 //	 "words":[{"w":"...","start":0.480,"end":0.640,"conf":0.9100}, ...]}
 //
 // "text" is the newly-finalized text since the last call; "eou" is 1 when an
-// <EOU> (end of utterance) fired this feed and "eob" is 1 when an <EOB>
-// (backchannel) fired. ABI v4 conflated the two into "eou"; v5 split them, so
-// we read both and treat either as an utterance boundary for segmentation.
-// "words" are the words finalized this call with absolute (stream-relative)
-// start/end seconds.
+// <EOU>/<EOB> fired this feed; "words" are the words finalized this call with
+// absolute (stream-relative) start/end seconds.
 type streamFeedJSON struct {
 	Text     string           `json:"text"`
 	Eou      int              `json:"eou"`
-	Eob      int              `json:"eob"`
 	FrameSec float64          `json:"frame_sec"`
 	Words    []transcriptWord `json:"words"`
 }
@@ -487,10 +483,7 @@ type streamSegmenter struct {

 func (s *streamSegmenter) add(doc streamFeedJSON) {
 	s.cur = append(s.cur, doc.Words...)
-	// Close the segment on either turn signal: <EOU> (end of utterance) or
-	// <EOB> (backchannel). ABI v4 reported both via "eou"; v5 split them, so we
-	// OR them here to keep the v4 segmentation boundaries.
-	if doc.Eou != 0 || doc.Eob != 0 {
+	if doc.Eou != 0 {
 		s.flush()
 	}
 }
@@ -678,12 +671,11 @@ func (p *ParakeetCpp) AudioTranscriptionStream(ctx context.Context, opts *pb.Tra
 	return nil
 }

-// streamJSON drives the streaming JSON entry points (present since ABI v4): each
-// feed/finalize returns a {text,eou,eob,frame_sec,words} document. The
-// newly-finalized text is emitted as a delta (unchanged streaming contract)
-// while words are accumulated into per-utterance segments (closed on <EOU> or
-// <EOB>) so the closing FinalResult carries timestamped segments. Runs under
-// engineMu (already held by the caller).
+// streamJSON drives the ABI v4 streaming JSON entry points: each feed/finalize
+// returns a {text,eou,frame_sec,words} document. The newly-finalized text is
+// emitted as a delta (unchanged streaming contract) while words are accumulated
+// into per-utterance segments (closed on EOU) so the closing FinalResult carries
+// timestamped segments. Runs under engineMu (already held by the caller).
 func (p *ParakeetCpp) streamJSON(ctx context.Context, stream uintptr, data []float32,
 	duration float32, results chan *pb.TranscriptStreamResponse) error {
 	var (
--- a/backend/go/parakeet-cpp/segments_test.go
+++ b/backend/go/parakeet-cpp/segments_test.go
@@ -124,17 +124,4 @@ var _ = Describe("streaming segment assembly", func() {
 		Expect(acc.segments()).To(HaveLen(1))
 		Expect(acc.segments()[0].Text).To(Equal("hi there"))
 	})
-
-	// ABI v5 split <EOB> (backchannel) out of the "eou" flag into its own "eob"
-	// field; a backchannel must still close the segment as it did in v4.
-	It("closes a segment on EOB (backchannel) too", func() {
-		acc := &streamSegmenter{}
-		acc.add(streamFeedJSON{Text: "uh huh", Eou: 0, Eob: 1, Words: []transcriptWord{
-			{W: "uh", Start: 0.0, End: 0.2}, {W: "huh", Start: 0.2, End: 0.5},
-		}})
-		segs := acc.segments()
-		Expect(segs).To(HaveLen(1))
-		Expect(segs[0].Text).To(Equal("uh huh"))
-		Expect(segs[0].End).To(Equal(secondsToNanos(0.5)))
-	})
 })
--- a/backend/go/vibevoice-cpp/CMakeLists.txt
+++ b/backend/go/vibevoice-cpp/CMakeLists.txt
@@ -26,16 +26,8 @@ add_library(govibevoicecpp MODULE cpp/govibevoicecpp.cpp)
 # vv_capi_* symbols (purego dlopens them by name, nothing in our
 # translation unit references them). Force the static archive's
 # entire contents into the MODULE so dlsym finds vv_capi_load etc.
-#
-# Link the `vibevoice` TARGET (not a bare archive path) so CMake builds
-# libvibevoice.a first and tracks the dependency: the upstream project is added
-# with EXCLUDE_FROM_ALL, so without a target-level link there is no rule to
-# build it. Passing only $<TARGET_FILE:vibevoice> as a path on Apple left the
-# build with "No rule to make target 'vibevoice/libvibevoice.a'" (issue #10267).
-# force_load is then applied as a separate link option.
 if(APPLE)
-    target_link_libraries(govibevoicecpp PRIVATE vibevoice)
-    target_link_options(govibevoicecpp PRIVATE "-Wl,-force_load,$<TARGET_FILE:vibevoice>")
+    target_link_libraries(govibevoicecpp PRIVATE -Wl,-force_load $<TARGET_FILE:vibevoice>)
 elseif(MSVC)
    target_link_libraries(govibevoicecpp PRIVATE vibevoice)
    set_property(TARGET govibevoicecpp APPEND PROPERTY LINK_FLAGS "/WHOLEARCHIVE:vibevoice")
--- a/backend/go/vibevoice-cpp/Makefile
+++ b/backend/go/vibevoice-cpp/Makefile
@@ -94,30 +94,26 @@ purge:
 # Build all variants (Linux only)
 ifeq ($(UNAME_S),Linux)
 libgovibevoicecpp-avx.so: sources/vibevoice.cpp
-	$(MAKE) purge
 	$(info ${GREEN}I vibevoice-cpp build info:avx${RESET})
 	SO_TARGET=libgovibevoicecpp-avx.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) libgovibevoicecpp-custom
-	rm -rfv build*
+	rm -rf build-libgovibevoicecpp-avx.so

 libgovibevoicecpp-avx2.so: sources/vibevoice.cpp
-	$(MAKE) purge
 	$(info ${GREEN}I vibevoice-cpp build info:avx2${RESET})
 	SO_TARGET=libgovibevoicecpp-avx2.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on -DGGML_BMI2=on" $(MAKE) libgovibevoicecpp-custom
-	rm -rfv build*
+	rm -rf build-libgovibevoicecpp-avx2.so

 libgovibevoicecpp-avx512.so: sources/vibevoice.cpp
-	$(MAKE) purge
 	$(info ${GREEN}I vibevoice-cpp build info:avx512${RESET})
 	SO_TARGET=libgovibevoicecpp-avx512.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on -DGGML_BMI2=on" $(MAKE) libgovibevoicecpp-custom
-	rm -rfv build*
+	rm -rf build-libgovibevoicecpp-avx512.so
 endif

 # Build fallback variant (all platforms)
 libgovibevoicecpp-fallback.so: sources/vibevoice.cpp
-	$(MAKE) purge
 	$(info ${GREEN}I vibevoice-cpp build info:fallback${RESET})
 	SO_TARGET=libgovibevoicecpp-fallback.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) libgovibevoicecpp-custom
-	rm -rfv build*
+	rm -rf build-libgovibevoicecpp-fallback.so

 libgovibevoicecpp-custom: CMakeLists.txt cpp/govibevoicecpp.cpp cpp/govibevoicecpp.h
 	mkdir -p build-$(SO_TARGET) && \
--- a/backend/index.yaml
+++ b/backend/index.yaml
@@ -337,35 +337,6 @@
    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-rfdetr-cpp"
    intel: "intel-sycl-f32-rfdetr-cpp"
    vulkan: "vulkan-rfdetr-cpp"
- &locateanything
-  name: "locate-anything"
-  alias: "locate-anything"
-  license: apache-2.0
-  description: |
-    Open-vocabulary object detection and visual grounding (NVIDIA
-    LocateAnything-3B) in C/C++ using GGML. Loads pre-built GGUF weights
-    and, given an image and a free-form text prompt, returns bounding
-    boxes, class labels, and confidence scores for the referred objects.
-  urls:
-    - https://github.com/mudler/locate-anything.cpp
-    - https://huggingface.co/nvidia/LocateAnything-3B
-  tags:
-    - object-detection
-    - visual-grounding
-    - open-vocabulary
-    - locate-anything
-    - gpu
-    - cpu
-  capabilities:
-    default: "cpu-locate-anything-cpp"
-    nvidia: "cuda12-locate-anything-cpp"
-    nvidia-cuda-12: "cuda12-locate-anything-cpp"
-    nvidia-cuda-13: "cuda13-locate-anything-cpp"
-    nvidia-l4t: "nvidia-l4t-arm64-locate-anything-cpp"
-    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-locate-anything-cpp"
-    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-locate-anything-cpp"
-    intel: "intel-sycl-f32-locate-anything-cpp"
-    vulkan: "vulkan-locate-anything-cpp"
 - &vllm
  name: "vllm"
  license: apache-2.0
@@ -1254,7 +1225,6 @@
    default: "cpu-sherpa-onnx"
    nvidia: "cuda12-sherpa-onnx"
    nvidia-cuda-12: "cuda12-sherpa-onnx"
-    metal: "metal-sherpa-onnx"
 - !!merge <<: *neutts
  name: "neutts-development"
  capabilities:
@@ -1587,7 +1557,6 @@
    - localai/localai-backends:master-metal-darwin-arm64-kitten-tts
 - !!merge <<: *local-store
  name: "local-store-development"
-  alias: "local-store"
  uri: "quay.io/go-skynet/local-ai-backends:master-cpu-local-store"
  mirrors:
    - localai/localai-backends:master-cpu-local-store
@@ -1598,7 +1567,6 @@
    - localai/localai-backends:latest-metal-darwin-arm64-local-store
 - !!merge <<: *local-store
  name: "metal-local-store-development"
-  alias: "local-store"
  uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-local-store"
  mirrors:
    - localai/localai-backends:master-metal-darwin-arm64-local-store
@@ -4717,14 +4685,12 @@
    default: "cpu-speaker-recognition"
    nvidia: "cuda12-speaker-recognition"
    nvidia-cuda-12: "cuda12-speaker-recognition"
-    metal: "metal-speaker-recognition"
 - !!merge <<: *speakerrecognition
  name: "speaker-recognition-development"
  capabilities:
    default: "cpu-speaker-recognition-development"
    nvidia: "cuda12-speaker-recognition-development"
    nvidia-cuda-12: "cuda12-speaker-recognition-development"
-    metal: "metal-speaker-recognition-development"
 - !!merge <<: *speakerrecognition
  name: "cpu-speaker-recognition"
  uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-speaker-recognition"
@@ -4745,16 +4711,6 @@
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-speaker-recognition"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-12-speaker-recognition
- !!merge <<: *speakerrecognition
-  name: "metal-speaker-recognition"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-speaker-recognition"
-  mirrors:
-    - localai/localai-backends:latest-metal-darwin-arm64-speaker-recognition
- !!merge <<: *speakerrecognition
-  name: "metal-speaker-recognition-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-speaker-recognition"
-  mirrors:
-    - localai/localai-backends:master-metal-darwin-arm64-speaker-recognition
 ## sherpa-onnx
 - !!merge <<: *sherpa-onnx
  name: "sherpa-onnx-development"
@@ -4762,7 +4718,6 @@
    default: "cpu-sherpa-onnx-development"
    nvidia: "cuda12-sherpa-onnx-development"
    nvidia-cuda-12: "cuda12-sherpa-onnx-development"
-    metal: "metal-sherpa-onnx-development"
 - !!merge <<: *sherpa-onnx
  name: "cpu-sherpa-onnx"
  uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-sherpa-onnx"
@@ -4783,13 +4738,3 @@
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-sherpa-onnx"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-12-sherpa-onnx
- !!merge <<: *sherpa-onnx
-  name: "metal-sherpa-onnx"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-sherpa-onnx"
-  mirrors:
-    - localai/localai-backends:latest-metal-darwin-arm64-sherpa-onnx
- !!merge <<: *sherpa-onnx
-  name: "metal-sherpa-onnx-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-sherpa-onnx"
-  mirrors:
-    - localai/localai-backends:master-metal-darwin-arm64-sherpa-onnx
--- a/backend/python/mlx/backend.py
+++ b/backend/python/mlx/backend.py
@@ -407,24 +407,6 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        if not request.Prompt and request.UseTokenizerTemplate and request.Messages:
            messages = messages_to_dicts(request.Messages)

-            # The mlx-lm tokenizer only carries a text-LM chat template. A
-            # vision-language checkpoint (e.g. gemma-4 E4B) loaded here has no
-            # usable template, so apply_chat_template silently passes the raw
-            # text through and the model just echoes/loops (issue #10269).
-            # Warn loudly so the misroute is visible; such models belong on the
-            # mlx-vlm backend.
-            chat_template = getattr(self.tokenizer, "chat_template", None)
-            if not chat_template:
-                underlying = getattr(self.tokenizer, "_tokenizer", None)
-                chat_template = getattr(underlying, "chat_template", None)
-            if not chat_template:
-                print(
-                    "WARNING: this model has no chat template; output may be "
-                    "degenerate. Vision-language models (e.g. gemma-4 E4B) must "
-                    "use the 'mlx-vlm' backend instead of 'mlx'.",
-                    file=sys.stderr,
-                )
-
            kwargs = {"tokenize": False, "add_generation_prompt": True}
            if request.Tools:
                try:
--- a/backend/python/speaker-recognition/requirements-mps.txt
+++ b/backend/python/speaker-recognition/requirements-mps.txt
@@ -1,5 +0,0 @@
-torch
-torchaudio
-speechbrain
-transformers
-onnxruntime
--- a/backend/python/vllm/backend.py
+++ b/backend/python/vllm/backend.py
@@ -26,10 +26,7 @@ from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.sampling_params import SamplingParams
 from vllm.utils import random_uuid
-try:
-    from vllm.tokenizers import get_tokenizer  # vLLM >= 0.22
-except ImportError:
-    from vllm.transformers_utils.tokenizer import get_tokenizer  # vLLM < 0.22
+from vllm.transformers_utils.tokenizer import get_tokenizer
 from vllm.multimodal.utils import fetch_image
 from vllm.assets.video import VideoAsset
 import base64
@@ -150,24 +147,9 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                d["reasoning_content"] = msg.reasoning_content
            if msg.tool_calls:
                try:
-                    tool_calls = json.loads(msg.tool_calls)
+                    d["tool_calls"] = json.loads(msg.tool_calls)
                except json.JSONDecodeError:
                    pass
-                else:
-                    # OpenAI wire format carries function.arguments as a
-                    # JSON-encoded string, but chat templates (e.g. Qwen3)
-                    # iterate over it as a mapping. vLLM's own OpenAI server
-                    # parses arguments before applying the template, so do
-                    # the same here.
-                    if isinstance(tool_calls, list):
-                        for tc in tool_calls:
-                            func = tc.get("function") if isinstance(tc, dict) else None
-                            if isinstance(func, dict) and isinstance(func.get("arguments"), str):
-                                try:
-                                    func["arguments"] = json.loads(func["arguments"])
-                                except json.JSONDecodeError:
-                                    pass
-                    d["tool_calls"] = tool_calls
            result.append(d)
        return result

--- a/core/application/mitm.go
+++ b/core/application/mitm.go
@@ -11,29 +11,6 @@ import (
 	"github.com/mudler/xlog"
 )

-// startMITMIfConfigured brings up the cloudproxy MITM listener when an
-// address is configured, treating any startup failure as non-fatal.
-//
-// The listener is opt-in middleware whose address is persisted in runtime
-// settings (/api/settings → runtime_settings.json) and replayed on every
-// boot. A bad value — e.g. a host the process can't bind, like a LAN IP
-// inside a container — must NOT abort the whole server: doing so crash-loops
-// with no way out, because the Settings UI used to correct the address can't
-// load if startup never completes. So on failure we log loudly and carry on;
-// the admin fixes the address via /api/settings, which calls RestartMITM.
-func startMITMIfConfigured(app *Application, options *config.ApplicationConfig) {
-	if options.MITMListen == "" {
-		return
-	}
-	if err := startMITMProxy(app, options); err != nil {
-		xlog.Error("mitm: cloudproxy listener failed to start — continuing without it",
-			"listen", options.MITMListen,
-			"error", err,
-			"hint", "fix the address via Settings (e.g. \":8082\" to bind all interfaces) and the listener will restart",
-		)
-	}
-}
-
 func startMITMProxy(app *Application, options *config.ApplicationConfig) error {
 	app.mitmMutex.Lock()
 	defer app.mitmMutex.Unlock()
--- a/core/application/mitm_test.go
+++ b/core/application/mitm_test.go
@@ -1,58 +0,0 @@
-package application
-
-import (
-	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/pkg/system"
-
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-// minimal Application wired enough for startMITMProxy: an empty model
-// config loader (no host claims), CA written under a temp DataPath.
-func newMITMTestApp(dataPath string) (*Application, *config.ApplicationConfig) {
-	state, err := system.GetSystemState()
-	Expect(err).NotTo(HaveOccurred())
-	state.Model.ModelsPath = dataPath
-	opts := config.NewApplicationConfig(
-		config.WithSystemState(state),
-		config.WithDataPath(dataPath),
-	)
-	return newApplication(opts), opts
-}
-
-var _ = Describe("startMITMIfConfigured", func() {
-	It("does nothing when no listen address is configured", func() {
-		app, opts := newMITMTestApp(GinkgoT().TempDir())
-		opts.MITMListen = ""
-
-		Expect(func() { startMITMIfConfigured(app, opts) }).NotTo(Panic())
-		Expect(app.mitmServer.Load()).To(BeNil(), "no listener should be stored when disabled")
-	})
-
-	// Regression: a persisted-but-unbindable MITM address (e.g. a LAN host
-	// inside a container) must not abort startup. startMITMIfConfigured
-	// swallows the bind error so the rest of LocalAI still comes up and the
-	// admin can fix the address via the Settings UI.
-	It("logs and continues when the listen address cannot be bound", func() {
-		app, opts := newMITMTestApp(GinkgoT().TempDir())
-		// 192.0.2.1 is TEST-NET-1 (RFC 5737): guaranteed not assigned to any
-		// local interface, so bind fails deterministically without DNS.
-		opts.MITMListen = "192.0.2.1:8082"
-
-		Expect(func() { startMITMIfConfigured(app, opts) }).NotTo(Panic())
-		Expect(app.mitmServer.Load()).To(BeNil(), "failed listener must not be stored")
-	})
-
-	It("starts and stores the listener on a bindable address", func() {
-		app, opts := newMITMTestApp(GinkgoT().TempDir())
-		opts.MITMListen = "127.0.0.1:0" // OS-assigned free port
-
-		startMITMIfConfigured(app, opts)
-
-		srv := app.mitmServer.Load()
-		Expect(srv).NotTo(BeNil(), "listener should be stored on success")
-		DeferCleanup(srv.Stop)
-		Expect(srv.Addr()).NotTo(BeEmpty())
-	})
-})
--- a/core/application/router_factories.go
+++ b/core/application/router_factories.go
@@ -1,120 +1,63 @@
 package application

 import (
-	"context"
-	"fmt"
-
 	"github.com/mudler/LocalAI/core/backend"
 	"github.com/mudler/LocalAI/core/config"
 )

-// adapterConfig resolves a model name to its runtime ModelConfig, or nil when
-// unknown. LoadModelConfigFileByNameDefaultOptions never returns nil — for an
-// unknown name it returns a defaults-filled stub with an empty Name (the YAML
-// `name:` field is required by Validate), which is how we tell the two apart.
+// adapterConfig resolves a model name to its runtime ModelConfig, or
+// nil when the name is unknown. Shared by the router-facing factories
+// below and by ModelConfigLookup.
 func (a *Application) adapterConfig(modelName string) *config.ModelConfig {
 	cfg, err := a.backendLoader.LoadModelConfigFileByNameDefaultOptions(modelName, a.applicationConfig)
-	if err != nil || cfg == nil || cfg.Name == "" {
+	if err != nil || cfg == nil {
 		return nil
 	}
 	return cfg
 }

-// ModelConfigLookup is the lookup the router middleware's classifier validator
-// uses to confirm classifier_model declares FLAG_SCORE before binding it.
+// ModelConfigLookup is the lookup function the router middleware's
+// classifier validator uses to confirm classifier_model declares
+// FLAG_SCORE before binding it.
 func (a *Application) ModelConfigLookup() func(modelName string) *config.ModelConfig {
 	return a.adapterConfig
 }

-// The router-facing factories below (Scorer, Embedder, Reranker, TokenCounter)
-// bind a model NAME at construction and re-resolve the CONFIG on every call.
-// Capturing the config at construction would bake in whatever state
-// adapterConfig saw first — including a stub returned before the YAML reached
-// bcl.configs (e.g. /import-model or gallery install racing startup). The
-// classifier registry caches factories by router-config fingerprint, so a
-// once-stale capture stays stale until the router config is edited.
-
+// Scorer returns a backend.Scorer bound to the named model, or nil
+// when the model is unknown. Used as a method value (app.Scorer) by
+// router.ClassifierDeps — no factory-of-factory wrapper needed.
 func (a *Application) Scorer(modelName string) backend.Scorer {
-	if a.adapterConfig(modelName) == nil {
-		return nil
-	}
-	return &lazyScorer{app: a, modelName: modelName}
-}
-
-type lazyScorer struct {
-	app       *Application
-	modelName string
-}
-
-func (l *lazyScorer) Score(ctx context.Context, prompt string, candidates []string) ([]backend.CandidateScore, error) {
-	cfg := l.app.adapterConfig(l.modelName)
+	cfg := a.adapterConfig(modelName)
 	if cfg == nil {
-		return nil, fmt.Errorf("scorer: model %q no longer available", l.modelName)
-	}
-	return backend.NewScorer(l.app.modelLoader, *cfg, l.app.applicationConfig).Score(ctx, prompt, candidates)
-}
-
-// TokenCounter returns a func so the middleware's literal field type accepts
-// it as a method value without importing core/http/middleware from here.
-func (a *Application) TokenCounter(modelName string) func(string) (int, error) {
-	if a.adapterConfig(modelName) == nil {
 		return nil
 	}
-	return func(text string) (int, error) {
-		cfg := a.adapterConfig(modelName)
-		if cfg == nil {
-			return 0, fmt.Errorf("token counter: model %q no longer available", modelName)
-		}
-		resp, err := backend.ModelTokenize(text, a.modelLoader, *cfg, a.applicationConfig)
-		if err != nil {
-			return 0, err
-		}
-		return len(resp.Tokens), nil
-	}
+	return backend.NewScorer(a.modelLoader, *cfg, a.applicationConfig)
 }

+// Reranker returns a backend.Reranker bound to the named model, or
+// nil when unknown. The reranker model's `type:` (e.g. "colbert")
+// selects the scoring head inside the rerankers backend.
 func (a *Application) Reranker(modelName string) backend.Reranker {
-	if a.adapterConfig(modelName) == nil {
+	cfg := a.adapterConfig(modelName)
+	if cfg == nil {
 		return nil
 	}
-	return &lazyReranker{app: a, modelName: modelName}
-}
-
-type lazyReranker struct {
-	app       *Application
-	modelName string
-}
-
-func (l *lazyReranker) Rerank(ctx context.Context, query string, documents []string) ([]backend.RerankResult, error) {
-	cfg := l.app.adapterConfig(l.modelName)
-	if cfg == nil {
-		return nil, fmt.Errorf("reranker: model %q no longer available", l.modelName)
-	}
-	return backend.NewReranker(l.app.modelLoader, *cfg, l.app.applicationConfig).Rerank(ctx, query, documents)
+	return backend.NewReranker(a.modelLoader, *cfg, a.applicationConfig)
 }

+// Embedder returns a backend.Embedder bound to the named model, or
+// nil when unknown. Used by the router's L2 embedding cache.
 func (a *Application) Embedder(modelName string) backend.Embedder {
-	if a.adapterConfig(modelName) == nil {
+	cfg := a.adapterConfig(modelName)
+	if cfg == nil {
 		return nil
 	}
-	return &lazyEmbedder{app: a, modelName: modelName}
+	return backend.NewEmbedder(a.modelLoader, *cfg, a.applicationConfig)
 }

-type lazyEmbedder struct {
-	app       *Application
-	modelName string
-}
-
-func (l *lazyEmbedder) Embed(ctx context.Context, text string) ([]float32, error) {
-	cfg := l.app.adapterConfig(l.modelName)
-	if cfg == nil {
-		return nil, fmt.Errorf("embedder: model %q no longer available", l.modelName)
-	}
-	return backend.NewEmbedder(l.app.modelLoader, *cfg, l.app.applicationConfig).Embed(ctx, text)
-}
-
-// VectorStore takes a store name, not a model name — no adapterConfig, no
-// staleness to avoid.
+// VectorStore returns a backend.VectorStore for the named collection,
+// or nil when the name is empty. Each router model gets its own
+// backend process via the model loader's cache keyed by storeName.
 func (a *Application) VectorStore(storeName string) backend.VectorStore {
 	return backend.NewVectorStore(a.modelLoader, a.applicationConfig, storeName)
 }
--- a/core/application/router_factories_test.go
+++ b/core/application/router_factories_test.go
@@ -1,155 +0,0 @@
-package application
-
-import (
-	"context"
-	"os"
-	"path/filepath"
-
-	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/pkg/model"
-	"github.com/mudler/LocalAI/pkg/system"
-
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-// Regression: the router-facing factories used to capture
-// *config.ModelConfig at construction. A gallery install that raced
-// startup left a stub (Backend="") bound for the lifetime of the
-// classifier registry's cache entry, bypassing the user's `backend:`
-// config. These specs pin the lazy re-resolve.
-var _ = Describe("router_factories lazy config resolution", func() {
-	var (
-		tmpDir string
-		app    *Application
-	)
-
-	BeforeEach(func() {
-		var err error
-		tmpDir, err = os.MkdirTemp("", "router-factories-*")
-		Expect(err).NotTo(HaveOccurred())
-
-		appCfg := &config.ApplicationConfig{
-			Context:     context.Background(),
-			SystemState: &system.SystemState{Model: system.Model{ModelsPath: tmpDir}},
-		}
-		app = &Application{
-			backendLoader:     config.NewModelConfigLoader(tmpDir),
-			modelLoader:       model.NewModelLoader(appCfg.SystemState),
-			applicationConfig: appCfg,
-		}
-	})
-
-	AfterEach(func() {
-		_ = os.RemoveAll(tmpDir)
-	})
-
-	// writeCfg seeds both the on-disk YAML and the in-memory cache —
-	// removing only the cache would fall through to file-read.
-	writeCfg := func(name, backend string) {
-		yaml := "name: " + name + "\nbackend: " + backend + "\nparameters:\n  model: " + name + ".bin\n"
-		Expect(os.WriteFile(filepath.Join(tmpDir, name+".yaml"), []byte(yaml), 0644)).To(Succeed())
-		Expect(app.backendLoader.LoadModelConfigsFromPath(tmpDir)).To(Succeed())
-		cfg, ok := app.backendLoader.GetModelConfig(name)
-		Expect(ok).To(BeTrue(), "config must be loaded before the spec runs")
-		Expect(cfg.Backend).To(Equal(backend))
-	}
-
-	// removeCfg purges both the cache and the YAML so LoadModelConfigFileByName
-	// returns the empty-stub case and adapterConfig returns nil.
-	removeCfg := func(name string) {
-		app.backendLoader.RemoveModelConfig(name)
-		Expect(os.Remove(filepath.Join(tmpDir, name+".yaml"))).To(Succeed())
-	}
-
-	Context("Embedder", func() {
-		It("returns nil at construction for an unknown model", func() {
-			Expect(app.Embedder("missing")).To(BeNil())
-		})
-
-		It("re-resolves the model config on each Embed call", func() {
-			writeCfg("emb-test", "llama-cpp")
-			emb := app.Embedder("emb-test")
-			Expect(emb).NotTo(BeNil())
-
-			// The factory must hold the NAME, not a captured config —
-			// otherwise stale captures survive cache invalidation.
-			lazy, ok := emb.(*lazyEmbedder)
-			Expect(ok).To(BeTrue(), "Embedder must return *lazyEmbedder")
-			Expect(lazy.modelName).To(Equal("emb-test"))
-
-			// Mutate the cached config. A lazy implementation sees the
-			// update on the next adapterConfig call; a captured-at-
-			// construction implementation would still see "llama-cpp".
-			app.backendLoader.UpdateModelConfig("emb-test", func(c *config.ModelConfig) {
-				c.Backend = "rerankers"
-			})
-			Expect(lazy.app.adapterConfig("emb-test").Backend).To(Equal("rerankers"))
-
-			// Remove the config entirely → Embed must surface the disappearance.
-			removeCfg("emb-test")
-			_, err := emb.Embed(context.Background(), "anything")
-			Expect(err).To(HaveOccurred())
-			Expect(err.Error()).To(ContainSubstring("no longer available"))
-		})
-	})
-
-	Context("Scorer", func() {
-		It("returns nil at construction for an unknown model", func() {
-			Expect(app.Scorer("missing")).To(BeNil())
-		})
-
-		It("re-resolves the model config on each Score call", func() {
-			writeCfg("score-test", "llama-cpp")
-			sc := app.Scorer("score-test")
-			Expect(sc).NotTo(BeNil())
-
-			lazy, ok := sc.(*lazyScorer)
-			Expect(ok).To(BeTrue(), "Scorer must return *lazyScorer")
-			Expect(lazy.modelName).To(Equal("score-test"))
-
-			removeCfg("score-test")
-			_, err := sc.Score(context.Background(), "prompt", []string{"a"})
-			Expect(err).To(HaveOccurred())
-			Expect(err.Error()).To(ContainSubstring("no longer available"))
-		})
-	})
-
-	Context("Reranker", func() {
-		It("returns nil at construction for an unknown model", func() {
-			Expect(app.Reranker("missing")).To(BeNil())
-		})
-
-		It("re-resolves the model config on each Rerank call", func() {
-			writeCfg("rerank-test", "rerankers")
-			rr := app.Reranker("rerank-test")
-			Expect(rr).NotTo(BeNil())
-
-			lazy, ok := rr.(*lazyReranker)
-			Expect(ok).To(BeTrue(), "Reranker must return *lazyReranker")
-			Expect(lazy.modelName).To(Equal("rerank-test"))
-
-			removeCfg("rerank-test")
-			_, err := rr.Rerank(context.Background(), "q", []string{"d"})
-			Expect(err).To(HaveOccurred())
-			Expect(err.Error()).To(ContainSubstring("no longer available"))
-		})
-	})
-
-	Context("TokenCounter", func() {
-		It("returns nil at construction for an unknown model", func() {
-			Expect(app.TokenCounter("missing")).To(BeNil())
-		})
-
-		It("re-resolves the model config on each call", func() {
-			writeCfg("tok-test", "llama-cpp")
-			tc := app.TokenCounter("tok-test")
-			Expect(tc).NotTo(BeNil())
-
-			removeCfg("tok-test")
-			_, err := tc("anything")
-			Expect(err).To(HaveOccurred())
-			Expect(err.Error()).To(ContainSubstring("no longer available"))
-		})
-	})
-})
--- a/core/application/startup.go
+++ b/core/application/startup.go
@@ -462,7 +462,11 @@ func New(opts ...config.AppOption) (*Application, error) {
 	// traffic doesn't need a parallel config for MITM traffic.
 	// Runs after loadRuntimeSettingsFromFile so a listener configured
 	// via /api/settings is brought back up across restarts.
-	startMITMIfConfigured(application, options)
+	if options.MITMListen != "" {
+		if err := startMITMProxy(application, options); err != nil {
+			return nil, fmt.Errorf("mitm: startup: %w", err)
+		}
+	}

 	application.ModelLoader().SetBackendLoggingEnabled(options.EnableBackendLogging)

--- a/core/backend/embeddings.go
+++ b/core/backend/embeddings.go
@@ -100,13 +100,8 @@ func ModelEmbedding(ctx context.Context, s string, tokens []int, loader *model.M
 		trace.InitBackendTracingIfEnabled(appConfig.TracingMaxItems, appConfig.TracingMaxBodyBytes)

 		traceData := map[string]any{
-			"input_text": trace.TruncateString(s, 1000),
-		}
-		// Only present for token-mode callers (pre-tokenized override);
-		// emitting "0" alongside input_text would read as "consumed zero
-		// tokens", which is wrong.
-		if len(tokens) > 0 {
-			traceData["input_tokens_count"] = len(tokens)
+			"input_text":         trace.TruncateString(s, 1000),
+			"input_tokens_count": len(tokens),
 		}

 		startTime := time.Now()
--- a/core/backend/options.go
+++ b/core/backend/options.go
@@ -87,47 +87,11 @@ func getSeed(c config.ModelConfig) int32 {
 	return seed
 }

-// DefaultContextSize and DefaultBatchSize are the backend's fallbacks when a
-// model config leaves them unset. Exported so callers that must respect the
-// effective decode window — notably the router's prompt trimmer — resolve the
-// same numbers grpcModelOpts does instead of guessing.
-const (
-	DefaultContextSize = 4096
-	DefaultBatchSize   = 512
-)
-
-// EffectiveContextSize is the context window the backend will run with: the
-// configured value, or DefaultContextSize when unset.
-func EffectiveContextSize(c config.ModelConfig) int {
-	if c.ContextSize != nil {
-		return *c.ContextSize
-	}
-	return DefaultContextSize
-}
-
-// EffectiveBatchSize is the single-decode batch the backend will run with.
-// Score, embedding and rerank all process the whole input in one pass: score
-// decodes prompt+candidate (asserts n_tokens <= n_batch), and embedding/rerank
-// pool over the full sequence in one physical batch (n_ubatch). So the batch
-// is sized to the context — anything that fits the context fits one pass,
-// avoiding both the GGML_ASSERT crash and the "input is too large to process"
-// error. Explicit `batch:` always wins.
-func EffectiveBatchSize(c config.ModelConfig) int {
-	if c.Batch != 0 {
-		return c.Batch
-	}
-	singlePass := c.HasUsecases(config.FLAG_SCORE) ||
-		c.HasUsecases(config.FLAG_EMBEDDINGS) ||
-		c.HasUsecases(config.FLAG_RERANK)
-	if ctx := EffectiveContextSize(c); singlePass && ctx > DefaultBatchSize {
-		return ctx
-	}
-	return DefaultBatchSize
-}
-
 func grpcModelOpts(c config.ModelConfig, modelPath string) *pb.ModelOptions {
-	ctxSize := EffectiveContextSize(c)
-	b := EffectiveBatchSize(c)
+	b := 512
+	if c.Batch != 0 {
+		b = c.Batch
+	}

 	flashAttention := "auto"

@@ -170,6 +134,11 @@ func grpcModelOpts(c config.ModelConfig, modelPath string) *pb.ModelOptions {
 		}
 	}

+	ctxSize := 4096
+	if c.ContextSize != nil {
+		ctxSize = *c.ContextSize
+	}
+
 	mmlock := false
 	if c.MMlock != nil {
 		mmlock = *c.MMlock
--- a/core/backend/options_internal_test.go
+++ b/core/backend/options_internal_test.go
@@ -97,67 +97,3 @@ var _ = Describe("gRPCPredictOpts reasoning_effort metadata", func() {
 		Expect(opts.Metadata).ToNot(HaveKey("reasoning_effort"))
 	})
 })
-
-var _ = Describe("grpcModelOpts NBatch", func() {
-	scoreUsecase := config.FLAG_SCORE
-	threads := 1
-	ctx := 4096
-
-	It("defaults to 512 for an ordinary model", func() {
-		cfg := config.ModelConfig{Threads: &threads, LLMConfig: config.LLMConfig{ContextSize: &ctx}}
-		opts := grpcModelOpts(cfg, "/tmp/models")
-		Expect(opts.NBatch).To(BeEquivalentTo(512))
-	})
-
-	It("sizes the batch to the context window for score models", func() {
-		// Score models decode the whole prompt+candidate in one
-		// llama_decode; n_batch must cover it or the backend aborts.
-		cfg := config.ModelConfig{Threads: &threads, LLMConfig: config.LLMConfig{ContextSize: &ctx}, KnownUsecases: &scoreUsecase}
-		opts := grpcModelOpts(cfg, "/tmp/models")
-		Expect(opts.NBatch).To(BeEquivalentTo(4096))
-	})
-
-	It("keeps an explicit batch over the score default", func() {
-		cfg := config.ModelConfig{Threads: &threads, LLMConfig: config.LLMConfig{ContextSize: &ctx}, KnownUsecases: &scoreUsecase}
-		cfg.Batch = 1024
-		opts := grpcModelOpts(cfg, "/tmp/models")
-		Expect(opts.NBatch).To(BeEquivalentTo(1024))
-	})
-
-	It("sizes the batch to the context window for embedding models", func() {
-		// Embedding/rerank pool over the whole sequence in one physical batch
-		// (n_ubatch); without this the input is capped at the 512 default and
-		// the backend returns "input is too large to process".
-		embeddings := true
-		cfg := config.ModelConfig{Threads: &threads, LLMConfig: config.LLMConfig{ContextSize: &ctx}}
-		cfg.Embeddings = &embeddings
-		opts := grpcModelOpts(cfg, "/tmp/models")
-		Expect(opts.NBatch).To(BeEquivalentTo(4096))
-	})
-
-	It("sizes the batch to the context window for rerank models", func() {
-		reranking := true
-		cfg := config.ModelConfig{Threads: &threads, LLMConfig: config.LLMConfig{ContextSize: &ctx}}
-		cfg.Reranking = &reranking
-		opts := grpcModelOpts(cfg, "/tmp/models")
-		Expect(opts.NBatch).To(BeEquivalentTo(4096))
-	})
-
-	It("does not raise the batch when a score model's context is below the default", func() {
-		small := 256
-		cfg := config.ModelConfig{Threads: &threads, LLMConfig: config.LLMConfig{ContextSize: &small}, KnownUsecases: &scoreUsecase}
-		opts := grpcModelOpts(cfg, "/tmp/models")
-		Expect(opts.NBatch).To(BeEquivalentTo(512))
-	})
-
-	It("sizes the batch to the effective 4096 default for a score model with no explicit context_size", func() {
-		// The crash case: the backend defaults n_ctx to 4096, so n_batch must
-		// follow even when context_size is unset — otherwise n_batch stays 512
-		// against a 4096 window and the score decode hits the GGML_ASSERT.
-		cfg := config.ModelConfig{Threads: &threads, KnownUsecases: &scoreUsecase}
-		Expect(cfg.ContextSize).To(BeNil())
-		opts := grpcModelOpts(cfg, "/tmp/models")
-		Expect(opts.NBatch).To(BeEquivalentTo(4096))
-		Expect(opts.ContextSize).To(BeEquivalentTo(4096), "n_batch must match the effective n_ctx the backend receives")
-	})
-})
--- a/core/backend/stores.go
+++ b/core/backend/stores.go
@@ -3,10 +3,9 @@ package backend
 import (
 	"context"
 	"fmt"
-	"time"
+	"strings"

 	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/core/trace"

 	"github.com/mudler/LocalAI/pkg/grpc"
 	"github.com/mudler/LocalAI/pkg/model"
@@ -40,85 +39,34 @@ func (s *localVectorStore) backend(_ context.Context) (grpc.Backend, error) {
 	return StoreBackend(s.loader, s.appConfig, s.storeName, "")
 }

-func (s *localVectorStore) Search(ctx context.Context, vec []float32) (sim float64, payload []byte, ok bool, err error) {
-	start := time.Now()
-	outcome := "hit"
-	defer func() {
-		s.recordTrace(start, "search", len(vec), sim, outcome, err)
-	}()
-	be, berr := s.backend(ctx)
-	if berr != nil {
-		outcome = "backend_load_error"
-		return 0, nil, false, fmt.Errorf("vector store load: %w", berr)
+func (s *localVectorStore) Search(ctx context.Context, vec []float32) (float64, []byte, bool, error) {
+	be, err := s.backend(ctx)
+	if err != nil {
+		return 0, nil, false, fmt.Errorf("vector store load: %w", err)
 	}
-	_, values, similarities, ferr := store.Find(ctx, be, vec, 1)
-	if ferr != nil {
-		outcome = "find_error"
-		return 0, nil, false, fmt.Errorf("vector store find: %w", ferr)
+	_, values, similarities, err := store.Find(ctx, be, vec, 1)
+	if err != nil {
+		// local-store's Find returns "existing length is -1" before
+		// any keys are inserted. Surface that as a clean miss so the
+		// cache layer treats it as an empty store and proceeds to
+		// Insert rather than skipping.
+		if strings.Contains(err.Error(), "existing length is -1") {
+			return 0, nil, false, nil
+		}
+		return 0, nil, false, fmt.Errorf("vector store find: %w", err)
 	}
 	if len(values) == 0 || len(similarities) == 0 {
-		outcome = "miss"
 		return 0, nil, false, nil
 	}
 	return float64(similarities[0]), values[0], true, nil
 }

-func (s *localVectorStore) Insert(ctx context.Context, vec []float32, payload []byte) (err error) {
-	start := time.Now()
-	outcome := "ok"
-	defer func() {
-		s.recordTrace(start, "insert", len(vec), 0, outcome, err)
-	}()
-	be, berr := s.backend(ctx)
-	if berr != nil {
-		outcome = "backend_load_error"
-		return fmt.Errorf("vector store load: %w", berr)
-	}
-	if serr := store.SetSingle(ctx, be, vec, payload); serr != nil {
-		outcome = "insert_error"
-		return serr
-	}
-	return nil
-}
-
-// recordTrace surfaces vector-store calls in /api/backend-traces, including
-// the backend-load-failure path that otherwise vanishes into an xlog.Warn.
-// modelName uses the store namespace (e.g. "router-cache-smart-router") so
-// admins can tell which router's cache misbehaved; the backend is always
-// "local-store" and can't disambiguate.
-func (s *localVectorStore) recordTrace(start time.Time, op string, vecDim int, sim float64, outcome string, err error) {
-	if s.appConfig == nil || !s.appConfig.EnableTracing {
-		return
-	}
-	trace.InitBackendTracingIfEnabled(s.appConfig.TracingMaxItems, s.appConfig.TracingMaxBodyBytes)
-	errStr := ""
+func (s *localVectorStore) Insert(ctx context.Context, vec []float32, payload []byte) error {
+	be, err := s.backend(ctx)
 	if err != nil {
-		errStr = err.Error()
+		return fmt.Errorf("vector store load: %w", err)
 	}
-	summary := op + " " + outcome
-	if op == "search" && outcome == "hit" {
-		summary = fmt.Sprintf("search hit (sim=%.3f)", sim)
-	}
-	data := map[string]any{
-		"op":         op,
-		"outcome":    outcome,
-		"vector_dim": vecDim,
-	}
-	// Only include similarity for a real neighbor — miss/empty_store would
-	// otherwise render "similarity: 0" and read as a measured value.
-	if op == "search" && outcome == "hit" {
-		data["similarity"] = sim
-	}
-	trace.RecordBackendTrace(trace.BackendTrace{
-		Timestamp: start,
-		Duration:  time.Since(start),
-		Type:      trace.BackendTraceVectorStore,
-		ModelName: s.storeName,
-		Backend:   model.LocalStoreBackend,
-		Summary:   summary,
-		Error:     errStr,
-		Data:      data,
-	})
+	return store.SetSingle(ctx, be, vec, payload)
 }

 func StoreBackend(sl *model.ModelLoader, appConfig *config.ApplicationConfig, storeName string, backend string) (grpc.Backend, error) {
--- a/core/backend/stores_test.go
+++ b/core/backend/stores_test.go
@@ -1,88 +0,0 @@
-package backend
-
-import (
-	"context"
-
-	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/core/trace"
-	"github.com/mudler/LocalAI/pkg/model"
-	"github.com/mudler/LocalAI/pkg/system"
-
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-// findVectorStoreTrace returns the most recent vector_store trace whose
-// model_name matches storeName, or nil if none was recorded. Used by
-// the specs below to assert the trace landed without relying on
-// ring-buffer ordering across other tests in the suite.
-func findVectorStoreTrace(storeName string) *trace.BackendTrace {
-	traces := trace.GetBackendTraces()
-	for i := range traces {
-		bt := &traces[i]
-		if bt.Type == trace.BackendTraceVectorStore && bt.ModelName == storeName {
-			return bt
-		}
-	}
-	return nil
-}
-
-var _ = Describe("localVectorStore tracing", func() {
-	// Pin the trace surface admins read from /api/backend-traces.
-	// The original failure mode that motivated these specs — the
-	// local-store backend not installed — was silent on every surface
-	// except a per-call xlog.Warn. With tracing wired in, the row
-	// appears next to the embedder/score traces for the same request.
-	BeforeEach(func() {
-		trace.ClearBackendTraces()
-	})
-
-	It("records a vector_store trace with outcome=backend_load_error when the backend can't be loaded", func() {
-		// nil ModelLoader → s.backend → StoreBackend → panics on load.
-		// Use a real-but-empty loader so the failure surfaces as an
-		// error instead, exercising the load-failure trace path the
-		// admin would hit when local-store isn't installed.
-		appCfg := &config.ApplicationConfig{
-			EnableTracing:       true,
-			TracingMaxItems:     16,
-			TracingMaxBodyBytes: 1024,
-		}
-		s := &localVectorStore{
-			loader:    model.NewModelLoader(&system.SystemState{}),
-			appConfig: appCfg,
-			storeName: "router-cache-test",
-		}
-
-		// Search must surface the error AND record a trace describing it.
-		_, _, _, err := s.Search(context.Background(), []float32{0.1, 0.2, 0.3})
-		Expect(err).To(HaveOccurred())
-
-		Eventually(func() *trace.BackendTrace {
-			return findVectorStoreTrace("router-cache-test")
-		}).ShouldNot(BeNil())
-
-		bt := findVectorStoreTrace("router-cache-test")
-		Expect(bt.Backend).To(Equal(model.LocalStoreBackend))
-		Expect(bt.Data["op"]).To(Equal("search"))
-		Expect(bt.Data["outcome"]).To(Equal("backend_load_error"))
-		Expect(bt.Data["vector_dim"]).To(Equal(3))
-		// Error is the wrapped "vector store load: …" surfaced to the caller.
-		Expect(bt.Error).To(ContainSubstring("vector store load"))
-	})
-
-	It("does not record a trace when tracing is disabled", func() {
-		// Opt-out path: appConfig.EnableTracing=false must short-circuit
-		// before InitBackendTracingIfEnabled, so a workload with tracing
-		// turned off doesn't pay the channel-send cost per cache call.
-		appCfg := &config.ApplicationConfig{EnableTracing: false}
-		s := &localVectorStore{
-			loader:    model.NewModelLoader(&system.SystemState{}),
-			appConfig: appCfg,
-			storeName: "router-cache-disabled",
-		}
-		_, _, _, _ = s.Search(context.Background(), []float32{1})
-		Consistently(func() *trace.BackendTrace {
-			return findVectorStoreTrace("router-cache-disabled")
-		}).Should(BeNil())
-	})
-})
--- a/core/backend/tokenize.go
+++ b/core/backend/tokenize.go
@@ -7,23 +7,9 @@ import (
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/core/trace"
 	"github.com/mudler/LocalAI/pkg/grpc"
-	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 	"github.com/mudler/LocalAI/pkg/model"
 )

-// tokenizeTokenCount returns the number of tokens in a backend response,
-// treating a nil response as zero. The gRPC client returns (nil, err) on
-// failure, and the tracing block below runs before that error is returned —
-// so the count must be read nil-safely here. Reading resp.Tokens on a nil
-// resp previously panicked the whole HTTP handler when tracing was enabled
-// (e.g. a transient tokenize failure during router probe-budget sizing).
-func tokenizeTokenCount(resp *pb.TokenizationResponse) int {
-	if resp == nil {
-		return 0
-	}
-	return len(resp.Tokens)
-}
-
 func ModelTokenize(s string, loader *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig) (schema.TokenizeResponse, error) {

 	var inferenceModel grpc.Backend
@@ -54,7 +40,10 @@ func ModelTokenize(s string, loader *model.ModelLoader, modelConfig config.Model
 			errStr = err.Error()
 		}

-		tokenCount := tokenizeTokenCount(resp)
+		tokenCount := 0
+		if resp.Tokens != nil {
+			tokenCount = len(resp.Tokens)
+		}

 		trace.RecordBackendTrace(trace.BackendTrace{
 			Timestamp: startTime,
@@ -75,8 +64,8 @@ func ModelTokenize(s string, loader *model.ModelLoader, modelConfig config.Model
 		return schema.TokenizeResponse{}, err
 	}

-	if resp == nil || resp.Tokens == nil {
-		return schema.TokenizeResponse{Tokens: make([]int32, 0)}, nil
+	if resp.Tokens == nil {
+		resp.Tokens = make([]int32, 0)
 	}

 	return schema.TokenizeResponse{
--- a/core/backend/tokenize_test.go
+++ b/core/backend/tokenize_test.go
@@ -1,27 +0,0 @@
-package backend
-
-import (
-	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
-
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-var _ = Describe("tokenizeTokenCount", func() {
-	// Regression: the gRPC client returns (nil, err) when a tokenize call
-	// fails, and ModelTokenize's tracing block reads the token count before
-	// the error is returned. Dereferencing a nil response there panicked the
-	// HTTP handler (nil pointer dereference) — e.g. a transient tokenize
-	// failure while the router sized its probe-token budget.
-	It("returns zero for a nil response instead of panicking", func() {
-		Expect(tokenizeTokenCount(nil)).To(Equal(0))
-	})
-
-	It("returns zero when the response carries no tokens", func() {
-		Expect(tokenizeTokenCount(&pb.TokenizationResponse{})).To(Equal(0))
-	})
-
-	It("counts the tokens present on the response", func() {
-		Expect(tokenizeTokenCount(&pb.TokenizationResponse{Tokens: []int32{1, 2, 3}})).To(Equal(3))
-	})
-})
--- a/core/config/application_config.go
+++ b/core/config/application_config.go
@@ -65,7 +65,7 @@ type ApplicationConfig struct {
 	//
 	//   patterns:
 	//     - id: email
-	//       action: allow            # downgrade default mask -> allow (log only)
+	//       action: route_local      # downgrade default mask -> route_local
 	//     - id: ssn
 	//       action: block            # upgrade default mask -> block
 	//
--- a/core/config/meta/build.go
+++ b/core/config/meta/build.go
@@ -93,9 +93,6 @@ func applyOverride(f *FieldMeta, o FieldMetaOverride) {
 	if o.Component != "" {
 		f.Component = o.Component
 	}
-	if o.Language != "" {
-		f.Language = o.Language
-	}
 	if o.Placeholder != "" {
 		f.Placeholder = o.Placeholder
 	}
--- a/core/config/meta/constants.go
+++ b/core/config/meta/constants.go
@@ -8,7 +8,6 @@ const (
 	ProviderModelsTTS        = "models:tts"
 	ProviderModelsTranscript = "models:transcript"
 	ProviderModelsVAD        = "models:vad"
-	ProviderModelsScore      = "models:score"
 )

 // Static option lists embedded directly in field metadata.
--- a/core/config/meta/registry.go
+++ b/core/config/meta/registry.go
@@ -226,7 +226,6 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Label:       "Chat Template",
 			Description: "Go template for chat completion requests",
 			Component:   "code-editor",
-			Language:    "gotemplate",
 			Order:       40,
 		},
 		"template.chat_message": {
@@ -234,7 +233,6 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Label:       "Chat Message Template",
 			Description: "Go template for individual chat messages",
 			Component:   "code-editor",
-			Language:    "gotemplate",
 			Order:       41,
 		},
 		"template.completion": {
@@ -242,22 +240,13 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Label:       "Completion Template",
 			Description: "Go template for completion requests",
 			Component:   "code-editor",
-			Language:    "gotemplate",
 			Order:       42,
 		},
-		"template.function": {
-			Section:     "templates",
-			Label:       "Functions Template",
-			Description: "Go template applied when tools/functions are present in the request",
-			Component:   "code-editor",
-			Language:    "gotemplate",
-			Order:       43,
-		},
 		"template.use_tokenizer_template": {
 			Section:     "templates",
 			Label:       "Use Tokenizer Template",
 			Description: "Use the chat template from the model's tokenizer config",
-			Order:       44,
+			Order:       43,
 		},
 		// Router section template — kept in the templates UI section
 		// (rather than the router section under "other") so operators
@@ -268,8 +257,7 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Label:       "Router Classifier System Prompt",
 			Description: "Go text/template (with sprig functions) for the routing system prompt the score classifier feeds to its classifier_model. Executed with `.Policies` ([]{Label, Description}). Empty falls back to the built-in Arch-Router-shaped prompt (route-listing block + JSON output schema). Override when the classifier model was trained on a different schema or you need the routing instructions in a different language. The candidate format scored against the model is fixed at `{\"route\": \"<label>\"}` — keep your override's output schema instruction matching that.",
 			Component:   "code-editor",
-			Language:    "gotemplate",
-			Order:       45,
+			Order:       44,
 		},

 		// --- Pipeline ---
@@ -320,41 +308,6 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			},
 			Order: 64,
 		},
-		"pipeline.disable_thinking": {
-			Section:     "pipeline",
-			Label:       "Disable Thinking",
-			Description: "Suppress reasoning/thinking output from the pipeline LLM (sets enable_thinking=false on the underlying model). Use for models that emit <think> blocks you don't want spoken or streamed back to the realtime client.",
-			Component:   "toggle",
-			Order:       65,
-		},
-		"pipeline.streaming.llm": {
-			Section:     "pipeline",
-			Label:       "Stream LLM",
-			Description: "Stream LLM tokens to the realtime client as they are generated instead of waiting for the full response. Emits incremental response.output_audio_transcript.delta / text deltas.",
-			Component:   "toggle",
-			Order:       66,
-		},
-		"pipeline.streaming.tts": {
-			Section:     "pipeline",
-			Label:       "Stream TTS",
-			Description: "Stream synthesized audio chunks to the realtime client as they are produced (requires a TTS backend that implements TTSStream). Falls back to unary synthesis otherwise.",
-			Component:   "toggle",
-			Order:       67,
-		},
-		"pipeline.streaming.transcription": {
-			Section:     "pipeline",
-			Label:       "Stream Transcription",
-			Description: "Stream partial transcription text to the realtime client as the STT backend produces it (requires a transcription backend that implements AudioTranscriptionStream). Falls back to unary transcription otherwise.",
-			Component:   "toggle",
-			Order:       68,
-		},
-		"pipeline.streaming.clause_chunking": {
-			Section:     "pipeline",
-			Label:       "Clause Chunking",
-			Description: "Split the streamed reply into speakable clauses and synthesize each as soon as it completes, instead of buffering the whole message before TTS — lower time-to-first-audio. Script-aware (handles CJK 。！？ and Thai/Lao spaces), so it does not whitespace-split. Requires Stream LLM; off buffers the whole message.",
-			Component:   "toggle",
-			Order:       69,
-		},

 		// --- Functions ---
 		"function.grammar.parallel_calls": {
@@ -412,14 +365,14 @@ func DefaultRegistry() map[string]FieldMetaOverride {

 		// --- PII filtering (per-model) ---
 		"pii.enabled": {
-			Section:     "pii",
+			Section:     "other",
 			Label:       "PII Filtering Enabled",
 			Description: "Enable PII redaction middleware for this model. Unset means use the default (off for local backends, on for proxy-* / cloud-hosted backends).",
 			Component:   "toggle",
 			Order:       200,
 		},
 		"pii.patterns": {
-			Section:     "pii",
+			Section:     "other",
 			Label:       "PII Pattern Overrides",
 			Description: "Override the global default action for specific patterns on this model. Patterns not listed here inherit the global action (Settings → Middleware → Filtering).",
 			Component:   "pii-pattern-list",
@@ -432,7 +385,7 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 		// fails closed — the chat handler does NOT silently fall back
 		// to the local gRPC pipeline.
 		"proxy.mode": {
-			Section:     "proxy",
+			Section:     "other",
 			Label:       "Proxy Mode",
 			Description: "passthrough forwards the client's OpenAI body verbatim — point upstream_url at an OpenAI-compatible endpoint (incl. Anthropic's /v1/chat/completions compat layer). translate converts OpenAI ↔ Anthropic Messages so you can target a native API (/v1/messages); tool_calls and usage tokens survive the round-trip.",
 			Component:   "select",
@@ -444,7 +397,7 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Order:   208,
 		},
 		"proxy.provider": {
-			Section:     "proxy",
+			Section:     "other",
 			Label:       "Proxy Provider",
 			Description: "Upstream API family. Drives auth header shape (Bearer vs x-api-key + anthropic-version) and, in translate mode, which request/response codec is used.",
 			Component:   "select",
@@ -456,28 +409,28 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Order:   209,
 		},
 		"proxy.upstream_url": {
-			Section:     "proxy",
+			Section:     "other",
 			Label:       "Proxy Upstream URL",
 			Description: "Full POST endpoint of the upstream provider (e.g. https://api.openai.com/v1/chat/completions). Only used when Backend is cloud-proxy.",
 			Component:   "input",
 			Order:       210,
 		},
 		"proxy.api_key_env": {
-			Section:     "proxy",
+			Section:     "other",
 			Label:       "Proxy API Key Env Var",
 			Description: "Name of the environment variable holding the upstream API key. Reading from env keeps the secret out of the YAML and the admin UI.",
 			Component:   "input",
 			Order:       211,
 		},
 		"proxy.upstream_model": {
-			Section:     "proxy",
+			Section:     "other",
 			Label:       "Proxy Upstream Model",
 			Description: "Model name sent to the upstream. Leave empty to forward the client's model field unchanged. Useful when the LocalAI alias differs from the upstream's canonical name.",
 			Component:   "input",
 			Order:       212,
 		},
 		"proxy.request_timeout_seconds": {
-			Section:     "proxy",
+			Section:     "other",
 			Label:       "Proxy Request Timeout (seconds)",
 			Description: "Caps the upstream HTTP request duration. 0 disables the deadline; the request still ends when the client disconnects.",
 			Component:   "number",
@@ -492,7 +445,7 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 		// A host claimed by two configs is a critical error — the
 		// listener refuses to start until resolved.
 		"mitm.hosts": {
-			Section:     "mitm",
+			Section:     "other",
 			Label:       "MITM Intercept Hosts",
 			Description: "Hostnames the cloudproxy MITM proxy terminates TLS for on behalf of this model config. PII filtering and pattern overrides flow from this model when the host is intercepted. Each host must be unique across all configs.",
 			Component:   "string-list",
@@ -507,7 +460,7 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 		// the middleware admin page surfaces every model with a router
 		// block.
 		"router.classifier": {
-			Section:     "router",
+			Section:     "other",
 			Label:       "Classifier",
 			Description: "Picks a candidate by scoring every policy label against the prompt. Only \"score\" is shipped today; it asks the classifier_model to rank each label and reads off the softmax. Empty defaults to \"score\".",
 			Component:   "select",
@@ -517,15 +470,15 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Order: 230,
 		},
 		"router.classifier_model": {
-			Section:              "router",
+			Section:              "other",
 			Label:                "Classifier Model",
 			Description:          "Loaded LocalAI model the score classifier asks to rank each policy label as a continuation. Must support the Score gRPC primitive (today: llama-cpp, vLLM) and use the ChatML template. Arch-Router-1.5B Q4_K_M is the canonical choice; any small ChatML instruct model also works at a higher activation_threshold.",
 			Component:            "model-select",
-			AutocompleteProvider: ProviderModelsScore,
+			AutocompleteProvider: ProviderModelsChat,
 			Order:                231,
 		},
 		"router.fallback": {
-			Section:              "router",
+			Section:              "other",
 			Label:                "Fallback Model",
 			Description:          "Model used when no candidate's labels cover the classifier's active label set, or when the classifier errors. Empty means router failures bubble up as HTTP 500 — fail-fast, not silent-bypass.",
 			Component:            "model-select",
@@ -533,7 +486,7 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Order:                232,
 		},
 		"router.activation_threshold": {
-			Section:     "router",
+			Section:     "other",
 			Label:       "Activation Threshold",
 			Description: "Softmax-probability floor a policy must clear to join the active label set for a request. Higher → single-label dominant routes; lower → more multi-label activations. 0 picks the package default (0.15). On Arch-Router-1.5B a value around 0.40 keeps the dominant label clean without losing genuine compound activations.",
 			Component:   "slider",
@@ -543,7 +496,7 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Order:       233,
 		},
 		"router.classifier_cache_size": {
-			Section:     "router",
+			Section:     "other",
 			Label:       "Classifier L1 Cache Size",
 			Description: "Bounded LRU keyed on (case-folded, whitespace-trimmed) prompt — amortises the classifier round-trip across verbatim repeats common in agent loops. 0 here means \"use the default\" (1024); the cache cannot be disabled from YAML.",
 			Component:   "number",
@@ -551,21 +504,21 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Order:       234,
 		},
 		"router.policies": {
-			Section:     "router",
+			Section:     "other",
 			Label:       "Policies",
 			Description: "Label vocabulary the classifier scores over. Each policy has a label and a short natural-language description fed verbatim to the classifier model. Short action-oriented sentences work best (\"writing or debugging code\"; \"small talk\").",
 			Component:   "router-policies",
 			Order:       235,
 		},
 		"router.candidates": {
-			Section:     "router",
+			Section:     "other",
 			Label:       "Candidates",
 			Description: "Routing table: each entry binds a downstream model to a set of policy labels it can serve. Order matters — the middleware picks the FIRST candidate whose labels are a superset of the active set, so list candidates smallest → largest.",
 			Component:   "router-candidates",
 			Order:       236,
 		},
 		"router.score_normalization": {
-			Section:     "router",
+			Section:     "other",
 			Label:       "Score Normalization",
 			Description: "How the score classifier collapses per-candidate joint log-probs into the softmax input. \"raw\" (default) feeds joint log-prob as-is — on-distribution for Arch-Router (the route the model would actually emit if decoded freely). \"mean\" divides by candidate token count — fairer to long labels but off-distribution for models trained to emit fixed-format outputs.",
 			Component:   "select",
@@ -577,7 +530,7 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Order: 240,
 		},
 		"router.embedding_cache.embedding_model": {
-			Section:              "router",
+			Section:              "other",
 			Label:                "L2 Cache: Embedding Model",
 			Description:          "Embedding model used by the L2 decision cache. Embeds incoming probes and looks them up in the per-router local-store collection. Empty disables the cache entirely. nomic-embed-text-v1.5 is the recommended default.",
 			Component:            "model-select",
@@ -585,7 +538,7 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Order:                237,
 		},
 		"router.embedding_cache.similarity_threshold": {
-			Section:     "router",
+			Section:     "other",
 			Label:       "L2 Cache: Similarity Threshold",
 			Description: "Cosine-similarity floor a cache candidate must clear to count as a hit. 0 picks the package default (0.80). Re-tune per embedding model — the histogram on the Routing tab shows where the cosine distribution actually sits.",
 			Component:   "slider",
@@ -595,7 +548,7 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Order:       238,
 		},
 		"router.embedding_cache.confidence_threshold": {
-			Section:     "router",
+			Section:     "other",
 			Label:       "L2 Cache: Confidence Threshold",
 			Description: "Minimum top-label probability a classifier decision must have to be inserted into the cache. 0 picks the package default (0.60). Uncertain decisions are skipped so they can't poison future paraphrases.",
 			Component:   "slider",
@@ -605,7 +558,7 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Order:       239,
 		},
 		"router.embedding_cache.store_name": {
-			Section:     "router",
+			Section:     "other",
 			Label:       "L2 Cache: Store Name",
 			Description: "Optional override for the local-store collection used by this router's cache. Empty defaults to \"router-cache-<router-model-name>\". Two routers sharing a store_name share their cache (rare).",
 			Component:   "input",
--- a/core/config/meta/registry_coverage_test.go
+++ b/core/config/meta/registry_coverage_test.go
@@ -240,6 +240,7 @@ var grandfatheredUnregistered = []string{
 	"swap_space",
 	"system_prompt",
 	"template.edit",
+	"template.function",
 	"template.join_chat_messages_by_character",
 	"template.multimodal",
 	"template.reply_prefix",
--- a/core/config/meta/types.go
+++ b/core/config/meta/types.go
@@ -11,7 +11,6 @@ type FieldMeta struct {
 	Label       string        `json:"label"`                 // human-readable label
 	Description string        `json:"description,omitempty"` // help text
 	Component   string        `json:"component"`             // "input", "number", "toggle", "select", "slider", etc.
-	Language    string        `json:"language,omitempty"`    // syntax mode for code-editor fields: "yaml" (default), "gotemplate"
 	Placeholder string        `json:"placeholder,omitempty"`
 	Default     any           `json:"default,omitempty"`
 	Min         *float64      `json:"min,omitempty"`
@@ -52,7 +51,6 @@ type FieldMetaOverride struct {
 	Label                string
 	Description          string
 	Component            string
-	Language             string
 	Placeholder          string
 	Default              any
 	Min                  *float64
@@ -80,10 +78,6 @@ func DefaultSections() []Section {
 		{ID: "grpc", Label: "gRPC", Icon: "server", Order: 65},
 		{ID: "agent", Label: "Agent", Icon: "bot", Order: 70},
 		{ID: "mcp", Label: "MCP", Icon: "plug", Order: 75},
-		{ID: "router", Label: "Router", Icon: "git-merge", Order: 78},
-		{ID: "proxy", Label: "Proxy", Icon: "cloud", Order: 80},
-		{ID: "mitm", Label: "MITM Proxy", Icon: "shield", Order: 82},
-		{ID: "pii", Label: "PII", Icon: "shield", Order: 84},
 		{ID: "other", Label: "Other", Icon: "more-horizontal", Order: 100},
 	}
 }
--- a/core/config/model_config.go
+++ b/core/config/model_config.go
@@ -385,7 +385,7 @@ type PIIConfig struct {
 	Enabled *bool `yaml:"enabled,omitempty" json:"enabled,omitempty"`

 	// Patterns lets a model upgrade or downgrade individual pattern
-	// actions (mask | block | allow) relative to the global
+	// actions (mask | block | route_local) relative to the global
 	// defaults loaded from --pii-config / DefaultPatterns. Pattern IDs
 	// not listed inherit the global action. The regex itself stays
 	// global — only the action is settable per-model.
@@ -499,16 +499,6 @@ type Pipeline struct {
 	// the pipeline's LLM without editing the LLM model config. Overrides the LLM's
 	// own reasoning_effort. Unset leaves the LLM model config in charge.
 	ReasoningEffort string `yaml:"reasoning_effort,omitempty" json:"reasoning_effort,omitempty"`
-
-	// Streaming opts each pipeline stage into incremental delivery (LLM tokens,
-	// TTS audio chunks, transcription text). Unset stages keep the blocking
-	// unary path, so existing configs are unaffected.
-	Streaming PipelineStreaming `yaml:"streaming,omitempty" json:"streaming,omitempty"`
-
-	// DisableThinking suppresses reasoning/thinking for the pipeline LLM (maps
-	// to enable_thinking=false backend metadata) without editing the underlying
-	// LLM model config. Unset leaves the LLM model config in charge.
-	DisableThinking *bool `yaml:"disable_thinking,omitempty" json:"disable_thinking,omitempty"`
 }

 // ApplyReasoningEffort resolves the effective reasoning effort — a per-request
@@ -540,41 +530,6 @@ func (c *ModelConfig) ApplyReasoningEffort(requestEffort string) {
 	}
 }

-// @Description PipelineStreaming toggles incremental delivery per realtime stage.
-type PipelineStreaming struct {
-	LLM           *bool `yaml:"llm,omitempty" json:"llm,omitempty"`
-	TTS           *bool `yaml:"tts,omitempty" json:"tts,omitempty"`
-	Transcription *bool `yaml:"transcription,omitempty" json:"transcription,omitempty"`
-	// ClauseChunking splits the streamed LLM reply into speakable clauses and
-	// synthesizes each as soon as it completes, instead of buffering the whole
-	// message before TTS. Script-aware (CJK/Thai), so it does not rely on
-	// whitespace sentence boundaries. Requires LLM streaming; unset buffers the
-	// whole message (today's default).
-	ClauseChunking *bool `yaml:"clause_chunking,omitempty" json:"clause_chunking,omitempty"`
-}
-
-// StreamLLM reports whether LLM tokens should be streamed for this pipeline.
-func (p Pipeline) StreamLLM() bool { return p.Streaming.LLM != nil && *p.Streaming.LLM }
-
-// StreamTTS reports whether TTS audio should be streamed for this pipeline.
-func (p Pipeline) StreamTTS() bool { return p.Streaming.TTS != nil && *p.Streaming.TTS }
-
-// StreamTranscription reports whether transcription text should be streamed.
-func (p Pipeline) StreamTranscription() bool {
-	return p.Streaming.Transcription != nil && *p.Streaming.Transcription
-}
-
-// ChunkClauses reports whether the streamed reply should be split into
-// script-aware clauses and synthesized incrementally rather than buffered whole.
-func (p Pipeline) ChunkClauses() bool {
-	return p.Streaming.ClauseChunking != nil && *p.Streaming.ClauseChunking
-}
-
-// ThinkingDisabled reports whether the pipeline forces the LLM's thinking off.
-func (p Pipeline) ThinkingDisabled() bool {
-	return p.DisableThinking != nil && *p.DisableThinking
-}
-
 // @Description File configuration for model downloads
 type File struct {
 	Filename string         `yaml:"filename,omitempty" json:"filename,omitempty"`
@@ -1274,20 +1229,14 @@ func (c *ModelConfig) GuessUsecases(u ModelConfigUsecase) bool {
 	}

 	if (u & FLAG_CHAT) == FLAG_CHAT {
-		// A router model is a chat dispatcher: it carries no chat
-		// template of its own (those live on the candidates it routes
-		// to) and is invoked through the chat endpoint, so the router
-		// block stands in for chat capability.
-		if !c.HasRouter() {
-			if c.TemplateConfig.Chat == "" && c.TemplateConfig.ChatMessage == "" && !c.TemplateConfig.UseTokenizerTemplate {
-				return false
-			}
-			if slices.Contains(nonTextGenBackends, c.Backend) {
-				return false
-			}
-			if c.Embeddings != nil && *c.Embeddings {
-				return false
-			}
+		if c.TemplateConfig.Chat == "" && c.TemplateConfig.ChatMessage == "" && !c.TemplateConfig.UseTokenizerTemplate {
+			return false
+		}
+		if slices.Contains(nonTextGenBackends, c.Backend) {
+			return false
+		}
+		if c.Embeddings != nil && *c.Embeddings {
+			return false
 		}
 	}
 	if (u & FLAG_COMPLETION) == FLAG_COMPLETION {
--- a/core/config/model_config_test.go
+++ b/core/config/model_config_test.go
@@ -283,18 +283,6 @@ parameters:
 		Expect(e.HasUsecases(FLAG_CHAT)).To(BeFalse())
 		Expect(e.HasUsecases(FLAG_EMBEDDINGS)).To(BeTrue())

-		// Router models are chat dispatchers: no chat template of their
-		// own, but invoked through the chat endpoint, so they default to
-		// chat-capable.
-		r := ModelConfig{
-			Name: "r",
-			Router: RouterConfig{
-				Candidates: []RouterCandidate{{Model: "downstream", Labels: []string{"general"}}},
-			},
-		}
-		Expect(r.HasUsecases(FLAG_ANY)).To(BeTrue())
-		Expect(r.HasUsecases(FLAG_CHAT)).To(BeTrue())
-
 		f := ModelConfig{
 			Name:    "f",
 			Backend: "piper",
--- a/core/config/pipeline_streaming_test.go
+++ b/core/config/pipeline_streaming_test.go
@@ -1,57 +0,0 @@
-package config
-
-import (
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-	"gopkg.in/yaml.v3"
-)
-
-// The realtime pipeline can stream each stage (LLM tokens, TTS audio,
-// transcription text) and can disable model "thinking" for the LLM. These are
-// opt-in per pipeline; everything defaults to off so existing configs keep the
-// unary behaviour.
-var _ = Describe("Pipeline streaming config", func() {
-	It("defaults every streaming + thinking helper to false when unset", func() {
-		var p Pipeline
-		Expect(p.StreamLLM()).To(BeFalse())
-		Expect(p.StreamTTS()).To(BeFalse())
-		Expect(p.StreamTranscription()).To(BeFalse())
-		Expect(p.ChunkClauses()).To(BeFalse())
-		Expect(p.ThinkingDisabled()).To(BeFalse())
-	})
-
-	It("parses the nested streaming block and disable_thinking from YAML", func() {
-		var c ModelConfig
-		err := yaml.Unmarshal([]byte(`
-name: gpt-realtime
-pipeline:
-  llm: my-llm
-  tts: my-tts
-  transcription: my-stt
-  streaming:
-    llm: true
-    tts: true
-    transcription: true
-    clause_chunking: true
-  disable_thinking: true
-`), &c)
-		Expect(err).ToNot(HaveOccurred())
-		Expect(c.Pipeline.StreamLLM()).To(BeTrue())
-		Expect(c.Pipeline.StreamTTS()).To(BeTrue())
-		Expect(c.Pipeline.StreamTranscription()).To(BeTrue())
-		Expect(c.Pipeline.ChunkClauses()).To(BeTrue())
-		Expect(c.Pipeline.ThinkingDisabled()).To(BeTrue())
-	})
-
-	It("treats an explicit false in the streaming block as disabled", func() {
-		var c ModelConfig
-		err := yaml.Unmarshal([]byte(`
-name: gpt-realtime
-pipeline:
-  streaming:
-    tts: false
-`), &c)
-		Expect(err).ToNot(HaveOccurred())
-		Expect(c.Pipeline.StreamTTS()).To(BeFalse())
-	})
-})
--- a/core/gallery/backends_test.go
+++ b/core/gallery/backends_test.go
@@ -50,14 +50,7 @@ var _ = Describe("Runtime capability-based backend selection", func() {
 		must(os.WriteFile(filepath.Join(cudaDir, "metadata.json"), b, 0o644))
 		must(os.WriteFile(filepath.Join(cudaDir, "run.sh"), []byte(""), 0o755))

-		// Default system: alias should point to CPU. Force the capability to
-		// "cpu" so this is hermetic on hosts that actually have a GPU: backend
-		// preference keys off getSystemCapabilities() (env → real nvidia-smi
-		// detection), not GPUVendor, so without this a GPU dev box reports
-		// "nvidia" and the cuda alias wins. The NVIDIA case below overrides it.
-		must(os.Setenv("LOCALAI_FORCE_META_BACKEND_CAPABILITY", "cpu"))
-		defer func() { _ = os.Unsetenv("LOCALAI_FORCE_META_BACKEND_CAPABILITY") }()
-
+		// Default system: alias should point to CPU
 		sysDefault, err := system.GetSystemState(
 			system.WithBackendPath(tempDir),
 		)
--- a/core/gallery/importers/importers.go
+++ b/core/gallery/importers/importers.go
@@ -158,11 +158,6 @@ var defaultImporters = []Importer{
 	// RFDetrImporter must run before TransformersImporter — RF-DETR
 	// checkpoints may carry tokenizer-adjacent artefacts.
 	&RFDetrImporter{},
-	// LocateAnythingImporter (NVIDIA LocateAnything open-vocab detection,
-	// native C++/ggml port) must run before LlamaCPPImporter so its GGUF
-	// bundles aren't claimed by the generic .gguf importer; kept next to
-	// RFDetrImporter as both are detection models.
-	&LocateAnythingImporter{},
 	// Existing
 	// DS4Importer must precede LlamaCPPImporter - ds4 weights are GGUFs and
 	// would otherwise be claimed by the generic .gguf-handling llama-cpp
--- a/core/gallery/importers/locate-anything.go
+++ b/core/gallery/importers/locate-anything.go
@@ -1,137 +0,0 @@
-package importers
-
-import (
-	"encoding/json"
-	"path/filepath"
-	"strings"
-
-	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/core/gallery"
-	"github.com/mudler/LocalAI/core/schema"
-	"go.yaml.in/yaml/v2"
-)
-
-var _ Importer = &LocateAnythingImporter{}
-
-// LocateAnythingImporter routes NVIDIA LocateAnything open-vocabulary
-// object-detection / visual-grounding repositories to the
-// "locate-anything-cpp" backend (a native C++/ggml port). It must be
-// registered BEFORE the generic GGUF matchers (LlamaCPPImporter) so its
-// GGUF bundles aren't swallowed by the generic .gguf-handling importer,
-// and alongside RFDetrImporter since both are detection models that may
-// carry tokenizer-adjacent artefacts.
-//
-// Detection signals:
-//   - preferences.backend="locate-anything-cpp" (explicit override);
-//   - repo name contains "locate-anything" or "locateanything"
-//     (case-insensitive).
-type LocateAnythingImporter struct{}
-
-func (i *LocateAnythingImporter) Name() string      { return "locate-anything-cpp" }
-func (i *LocateAnythingImporter) Modality() string  { return "detection" }
-func (i *LocateAnythingImporter) AutoDetects() bool { return true }
-
-func repoLooksLikeLocateAnything(repo string) bool {
-	lower := strings.ToLower(repo)
-	return strings.Contains(lower, "locate-anything") ||
-		strings.Contains(lower, "locateanything") ||
-		strings.Contains(lower, "locate-anything.cpp") ||
-		strings.Contains(lower, "locate-anything-cpp")
-}
-
-func (i *LocateAnythingImporter) Match(details Details) bool {
-	preferences, err := details.Preferences.MarshalJSON()
-	if err != nil {
-		return false
-	}
-	preferencesMap := make(map[string]any)
-	if len(preferences) > 0 {
-		if err := json.Unmarshal(preferences, &preferencesMap); err != nil {
-			return false
-		}
-	}
-
-	if b, ok := preferencesMap["backend"].(string); ok && b == "locate-anything-cpp" {
-		return true
-	}
-
-	if details.HuggingFace != nil {
-		repoName := details.HuggingFace.ModelID
-		if idx := strings.Index(repoName, "/"); idx >= 0 {
-			repoName = repoName[idx+1:]
-		}
-		if repoLooksLikeLocateAnything(repoName) {
-			return true
-		}
-	}
-
-	// Fallback: hfapi recursion bug may leave HuggingFace nil — decide
-	// from the URI owner/repo.
-	if _, repo, ok := HFOwnerRepoFromURI(details.URI); ok {
-		if repoLooksLikeLocateAnything(repo) {
-			return true
-		}
-	}
-
-	return false
-}
-
-func (i *LocateAnythingImporter) Import(details Details) (gallery.ModelConfig, error) {
-	preferences, err := details.Preferences.MarshalJSON()
-	if err != nil {
-		return gallery.ModelConfig{}, err
-	}
-	preferencesMap := make(map[string]any)
-	if len(preferences) > 0 {
-		if err := json.Unmarshal(preferences, &preferencesMap); err != nil {
-			return gallery.ModelConfig{}, err
-		}
-	}
-
-	name, ok := preferencesMap["name"].(string)
-	if !ok {
-		name = filepath.Base(details.URI)
-	}
-
-	description, ok := preferencesMap["description"].(string)
-	if !ok {
-		description = "Imported from " + details.URI
-	}
-
-	// Prefer the canonical HF "owner/repo" identifier so the emitted
-	// YAML mirrors gallery locate-anything entries.
-	model := details.URI
-	if details.HuggingFace != nil && details.HuggingFace.ModelID != "" {
-		model = details.HuggingFace.ModelID
-	} else if owner, repo, ok := HFOwnerRepoFromURI(details.URI); ok {
-		model = owner + "/" + repo
-	}
-
-	// Always the native C++/ggml backend; explicit preferences.backend
-	// overrides the default.
-	backend := "locate-anything-cpp"
-	if b, ok := preferencesMap["backend"].(string); ok && b != "" {
-		backend = b
-	}
-
-	modelConfig := config.ModelConfig{
-		Name:                name,
-		Description:         description,
-		Backend:             backend,
-		KnownUsecaseStrings: []string{"detection"},
-		PredictionOptions: schema.PredictionOptions{
-			BasicModelRequest: schema.BasicModelRequest{Model: model},
-		},
-	}
-
-	data, err := yaml.Marshal(modelConfig)
-	if err != nil {
-		return gallery.ModelConfig{}, err
-	}
-
-	return gallery.ModelConfig{
-		Name:        name,
-		Description: description,
-		ConfigFile:  string(data),
-	}, nil
-}
--- a/core/gallery/importers/locate-anything_test.go
+++ b/core/gallery/importers/locate-anything_test.go
@@ -1,218 +0,0 @@
-package importers_test
-
-import (
-	"encoding/json"
-	"fmt"
-
-	"github.com/mudler/LocalAI/core/gallery/importers"
-	hfapi "github.com/mudler/LocalAI/pkg/huggingface-api"
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-var _ = Describe("LocateAnythingImporter", func() {
-	Context("Importer interface metadata", func() {
-		It("exposes name/modality/autodetect", func() {
-			imp := &importers.LocateAnythingImporter{}
-			Expect(imp.Name()).To(Equal("locate-anything-cpp"))
-			Expect(imp.Modality()).To(Equal("detection"))
-			Expect(imp.AutoDetects()).To(BeTrue())
-		})
-	})
-
-	Context("Match", func() {
-		It("matches when backend preference is locate-anything-cpp", func() {
-			imp := &importers.LocateAnythingImporter{}
-			preferences := json.RawMessage(`{"backend": "locate-anything-cpp"}`)
-			details := importers.Details{
-				URI:         "https://example.com/some-model",
-				Preferences: preferences,
-			}
-
-			Expect(imp.Match(details)).To(BeTrue())
-		})
-
-		It("matches when the repo name contains 'locate-anything' (case-insensitive)", func() {
-			imp := &importers.LocateAnythingImporter{}
-			details := importers.Details{
-				URI: "https://huggingface.co/mudler/locate-anything-cpp-3b",
-				HuggingFace: &hfapi.ModelDetails{
-					ModelID: "mudler/Locate-Anything-CPP-3B",
-					Author:  "mudler",
-				},
-			}
-
-			Expect(imp.Match(details)).To(BeTrue())
-		})
-
-		It("matches when the repo name contains 'locateanything' (case-insensitive)", func() {
-			imp := &importers.LocateAnythingImporter{}
-			details := importers.Details{
-				URI: "https://huggingface.co/nvidia/LocateAnything-3B",
-				HuggingFace: &hfapi.ModelDetails{
-					ModelID: "nvidia/LocateAnything-3B",
-					Author:  "nvidia",
-				},
-			}
-
-			Expect(imp.Match(details)).To(BeTrue())
-		})
-
-		It("matches via URI fallback when HuggingFace details are missing", func() {
-			imp := &importers.LocateAnythingImporter{}
-			details := importers.Details{
-				URI: "https://huggingface.co/nvidia/LocateAnything-3B",
-			}
-
-			Expect(imp.Match(details)).To(BeTrue())
-		})
-
-		It("does not match unrelated repos without locate-anything signals", func() {
-			imp := &importers.LocateAnythingImporter{}
-			details := importers.Details{
-				URI: "https://huggingface.co/meta-llama/Llama-3-8B",
-				HuggingFace: &hfapi.ModelDetails{
-					ModelID: "meta-llama/Llama-3-8B",
-					Author:  "meta-llama",
-				},
-			}
-
-			Expect(imp.Match(details)).To(BeFalse())
-		})
-
-		It("does not match an rfdetr repo", func() {
-			imp := &importers.LocateAnythingImporter{}
-			details := importers.Details{
-				URI: "https://huggingface.co/mudler/rfdetr-cpp-nano",
-				HuggingFace: &hfapi.ModelDetails{
-					ModelID: "mudler/rfdetr-cpp-nano",
-					Author:  "mudler",
-				},
-			}
-
-			Expect(imp.Match(details)).To(BeFalse())
-		})
-
-		It("returns false for invalid preferences JSON", func() {
-			imp := &importers.LocateAnythingImporter{}
-			preferences := json.RawMessage(`not valid json`)
-			details := importers.Details{
-				URI:         "https://example.com/model",
-				Preferences: preferences,
-			}
-
-			Expect(imp.Match(details)).To(BeFalse())
-		})
-	})
-
-	Context("Import", func() {
-		It("produces a YAML with backend locate-anything-cpp and the repo as the model", func() {
-			imp := &importers.LocateAnythingImporter{}
-			details := importers.Details{
-				URI: "https://huggingface.co/nvidia/LocateAnything-3B",
-				HuggingFace: &hfapi.ModelDetails{
-					ModelID: "nvidia/LocateAnything-3B",
-					Author:  "nvidia",
-				},
-			}
-
-			modelConfig, err := imp.Import(details)
-
-			Expect(err).ToNot(HaveOccurred())
-			Expect(modelConfig.ConfigFile).To(ContainSubstring("backend: locate-anything-cpp"), fmt.Sprintf("Model config: %+v", modelConfig))
-			Expect(modelConfig.ConfigFile).To(ContainSubstring("nvidia/LocateAnything-3B"), fmt.Sprintf("Model config: %+v", modelConfig))
-			Expect(modelConfig.ConfigFile).To(ContainSubstring("detection"), fmt.Sprintf("Model config: %+v", modelConfig))
-		})
-
-		It("respects custom name and description from preferences", func() {
-			imp := &importers.LocateAnythingImporter{}
-			preferences := json.RawMessage(`{"name": "my-locate", "description": "Custom"}`)
-			details := importers.Details{
-				URI:         "https://huggingface.co/nvidia/LocateAnything-3B",
-				Preferences: preferences,
-				HuggingFace: &hfapi.ModelDetails{
-					ModelID: "nvidia/LocateAnything-3B",
-					Author:  "nvidia",
-				},
-			}
-
-			modelConfig, err := imp.Import(details)
-
-			Expect(err).ToNot(HaveOccurred())
-			Expect(modelConfig.Name).To(Equal("my-locate"))
-			Expect(modelConfig.Description).To(Equal("Custom"))
-		})
-	})
-
-	// Table-driven coverage of the backend routing: locate-anything repos
-	// always route to the native locate-anything-cpp backend, with an
-	// explicit preferences.backend override honoured.
-	//
-	// Cases are kept offline-deterministic by injecting Details directly
-	// rather than going through DiscoverModelConfig (which would hit live HF).
-	Context("backend routing (offline)", func() {
-		hfFile := func(path string) hfapi.ModelFile {
-			return hfapi.ModelFile{Path: path}
-		}
-
-		type tc struct {
-			name          string
-			uri           string
-			modelID       string
-			files         []hfapi.ModelFile
-			prefs         string
-			expectBackend string // expected `backend:` line content
-		}
-
-		entries := []tc{
-			{
-				name:          "canonical NVIDIA repo routes to locate-anything-cpp",
-				uri:           "https://huggingface.co/nvidia/LocateAnything-3B",
-				modelID:       "nvidia/LocateAnything-3B",
-				files:         []hfapi.ModelFile{hfFile("locate-anything-3b-q8_0.gguf"), hfFile("README.md")},
-				prefs:         "",
-				expectBackend: "backend: locate-anything-cpp",
-			},
-			{
-				name:          "GGUF bundle with locate-anything name routes to locate-anything-cpp",
-				uri:           "https://huggingface.co/mudler/locate-anything.cpp-3b",
-				modelID:       "mudler/locate-anything.cpp-3b",
-				files:         []hfapi.ModelFile{hfFile("model-f16.gguf")},
-				prefs:         "",
-				expectBackend: "backend: locate-anything-cpp",
-			},
-			{
-				name:          "explicit preferences.backend override is honoured",
-				uri:           "https://huggingface.co/nvidia/LocateAnything-3B",
-				modelID:       "nvidia/LocateAnything-3B",
-				files:         nil,
-				prefs:         `{"backend": "locate-anything-cpp"}`,
-				expectBackend: "backend: locate-anything-cpp",
-			},
-		}
-
-		for _, e := range entries {
-			e := e // capture for closure
-			It(e.name, func() {
-				imp := &importers.LocateAnythingImporter{}
-				details := importers.Details{
-					URI: e.uri,
-					HuggingFace: &hfapi.ModelDetails{
-						ModelID: e.modelID,
-						Files:   e.files,
-					},
-				}
-				if e.prefs != "" {
-					details.Preferences = json.RawMessage(e.prefs)
-				}
-
-				Expect(imp.Match(details)).To(BeTrue(), fmt.Sprintf("Match should fire for %+v", details))
-
-				modelConfig, err := imp.Import(details)
-				Expect(err).ToNot(HaveOccurred(), fmt.Sprintf("Import error: %v", err))
-				Expect(modelConfig.ConfigFile).To(ContainSubstring(e.expectBackend),
-					fmt.Sprintf("Model config: %+v", modelConfig))
-			})
-		}
-	})
-})
--- a/core/gallery/importers/mlx.go
+++ b/core/gallery/importers/mlx.go
@@ -64,17 +64,7 @@ func (i *MLXImporter) Import(details Details) (gallery.ModelConfig, error) {
 		description = "Imported from " + details.URI
 	}

-	// Vision-language checkpoints (e.g. gemma-4 E4B) declare the
-	// "image-text-to-text" pipeline tag on HuggingFace. The text-only mlx-lm
-	// tokenizer does not carry their processor chat template, so routing them
-	// through the plain mlx backend yields degenerate looping output
-	// (issue #10269). Send them to the mlx-vlm backend, which applies the
-	// processor-aware chat template.
 	backend := "mlx"
-	if details.HuggingFace != nil && details.HuggingFace.PipelineTag == "image-text-to-text" {
-		backend = "mlx-vlm"
-	}
-	// An explicit backend preference always wins.
 	b, ok := preferencesMap["backend"].(string)
 	if ok {
 		backend = b
--- a/core/gallery/importers/mlx_test.go
+++ b/core/gallery/importers/mlx_test.go
@@ -4,7 +4,6 @@ import (
 	"encoding/json"

 	"github.com/mudler/LocalAI/core/gallery/importers"
-	hfapi "github.com/mudler/LocalAI/pkg/huggingface-api"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 )
@@ -123,60 +122,6 @@ var _ = Describe("MLXImporter", func() {
 			Expect(modelConfig.ConfigFile).To(ContainSubstring("backend: mlx-vlm"))
 		})

-		It("should auto-route vision-language models to the mlx-vlm backend", func() {
-			// gemma-4 E4B and similar VLMs declare pipeline_tag
-			// "image-text-to-text" on HuggingFace. The text-only mlx-lm
-			// tokenizer does not carry their processor chat template, so
-			// routing them through the plain mlx backend produces degenerate
-			// looping output (issue #10269). They must go to mlx-vlm.
-			details := importers.Details{
-				URI: "https://huggingface.co/mlx-community/gemma-4-E4B-it-qat-4bit",
-				HuggingFace: &hfapi.ModelDetails{
-					ModelID:     "mlx-community/gemma-4-E4B-it-qat-4bit",
-					PipelineTag: "image-text-to-text",
-				},
-			}
-
-			modelConfig, err := importer.Import(details)
-
-			Expect(err).ToNot(HaveOccurred())
-			Expect(modelConfig.ConfigFile).To(ContainSubstring("backend: mlx-vlm"))
-		})
-
-		It("should keep text-only models on the plain mlx backend", func() {
-			details := importers.Details{
-				URI: "https://huggingface.co/mlx-community/Llama-3.2-1B-Instruct-4bit",
-				HuggingFace: &hfapi.ModelDetails{
-					ModelID:     "mlx-community/Llama-3.2-1B-Instruct-4bit",
-					PipelineTag: "text-generation",
-				},
-			}
-
-			modelConfig, err := importer.Import(details)
-
-			Expect(err).ToNot(HaveOccurred())
-			Expect(modelConfig.ConfigFile).To(ContainSubstring("backend: mlx"))
-			Expect(modelConfig.ConfigFile).ToNot(ContainSubstring("backend: mlx-vlm"))
-		})
-
-		It("should honor an explicit backend preference even for a VLM", func() {
-			preferences := json.RawMessage(`{"backend": "mlx"}`)
-			details := importers.Details{
-				URI:         "https://huggingface.co/mlx-community/gemma-4-E4B-it-qat-4bit",
-				Preferences: preferences,
-				HuggingFace: &hfapi.ModelDetails{
-					ModelID:     "mlx-community/gemma-4-E4B-it-qat-4bit",
-					PipelineTag: "image-text-to-text",
-				},
-			}
-
-			modelConfig, err := importer.Import(details)
-
-			Expect(err).ToNot(HaveOccurred())
-			Expect(modelConfig.ConfigFile).To(ContainSubstring("backend: mlx"))
-			Expect(modelConfig.ConfigFile).ToNot(ContainSubstring("backend: mlx-vlm"))
-		})
-
 		It("should handle invalid JSON preferences", func() {
 			preferences := json.RawMessage(`invalid json`)
 			details := importers.Details{
--- a/core/http/endpoints/anthropic/messages.go
+++ b/core/http/endpoints/anthropic/messages.go
@@ -353,7 +353,7 @@ func handleAnthropicStream(c echo.Context, id string, input *schema.AnthropicReq
 			overrides = make(map[string]pii.Action, len(raw))
 			for ovid, action := range raw {
 				switch pii.Action(action) {
-				case pii.ActionMask, pii.ActionBlock, pii.ActionAllow:
+				case pii.ActionMask, pii.ActionBlock, pii.ActionRouteLocal:
 					overrides[ovid] = pii.Action(action)
 				}
 			}
--- a/core/http/endpoints/localai/api_instructions.go
+++ b/core/http/endpoints/localai/api_instructions.go
@@ -102,7 +102,7 @@ var instructionDefs = []instructionDef{
 		Name:        "pii-filtering",
 		Description: "Inspect and tune the regex PII filter applied to chat requests",
 		Tags:        []string{"pii"},
-		Intro:       "GET /api/pii/patterns lists the active pattern set with each one's action (mask, block, allow). GET /api/pii/events returns recent redaction events filtered by correlation_id / user_id / pattern_id (admin or local-user only). POST /api/pii/test dry-runs the redactor against an admin-supplied string. POST /api/pii/decide is the programmatic decision oracle for external routers: send `{text}`, receive `{findings, suggested_action, redacted_preview}` without LocalAI mutating, recording, or acting on the call — caller composes the action with its own policy. Default patterns: email, phone, SSN, credit card (Luhn), IPv4, common API key prefixes (sk-, pk-, ghp_, github_pat_). PII is per-model: by default it is OFF for non-proxy backends and ON for backends starting with proxy-* (cloud passthroughs). Opt in with `pii: { enabled: true }` in a model's YAML; use `pii: { patterns: [{id, action}] }` to upgrade or downgrade individual actions for that model. Override global default actions via --pii-config pii.yaml; --disable-pii turns the filter off entirely.",
+		Intro:       "GET /api/pii/patterns lists the active pattern set with each one's action (mask, block, route_local). GET /api/pii/events returns recent redaction events filtered by correlation_id / user_id / pattern_id (admin or local-user only). POST /api/pii/test dry-runs the redactor against an admin-supplied string. POST /api/pii/decide is the programmatic decision oracle for external routers: send `{text}`, receive `{findings, suggested_action, redacted_preview}` without LocalAI mutating, recording, or acting on the call — caller composes the action with its own policy. Default patterns: email, phone, SSN, credit card (Luhn), IPv4, common API key prefixes (sk-, pk-, ghp_, github_pat_). PII is per-model: by default it is OFF for non-proxy backends and ON for backends starting with proxy-* (cloud passthroughs). Opt in with `pii: { enabled: true }` in a model's YAML; use `pii: { patterns: [{id, action}] }` to upgrade or downgrade individual actions for that model. Override global default actions via --pii-config pii.yaml; --disable-pii turns the filter off entirely.",
 	},
 	{
 		Name:        "middleware-admin",
--- a/core/http/endpoints/localai/config_meta.go
+++ b/core/http/endpoints/localai/config_meta.go
@@ -124,8 +124,6 @@ func AutocompleteEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, a
 				filterFn = config.BuildUsecaseFilterFn(config.FLAG_VAD)
 			case config.UsecaseTranscript:
 				filterFn = config.BuildUsecaseFilterFn(config.FLAG_TRANSCRIPT)
-			case "score": // router classifier usecase (FLAG_SCORE); not in UsecaseInfoMap
-				filterFn = config.BuildUsecaseFilterFn(config.FLAG_SCORE)
 			default:
 				filterFn = config.NoFilterFn
 			}
--- a/core/http/endpoints/localai/pii_decide.go
+++ b/core/http/endpoints/localai/pii_decide.go
@@ -15,9 +15,9 @@ import (
 //
 // External routers (e.g. the localai-org/platform router) call this
 // before dispatching to learn whether to mask the prompt in place,
-// block the request, or pass it through. LocalAI's in-band PII
-// middleware is the alternative path for direct-to-LocalAI clients —
-// same Redactor, different framing.
+// route to a local-only backend, block the request, or pass it
+// through. LocalAI's in-band PII middleware is the alternative path
+// for direct-to-LocalAI clients — same Redactor, different framing.
 //
 // Takes the *pii.Redactor directly rather than the whole
 // *application.Application so the handler stays unit-testable with a
@@ -62,18 +62,24 @@ func PIIDecideEndpoint(redactor *pii.Redactor) echo.HandlerFunc {
 	}
 }

+// actionAllow is the wire-only value for "no findings". The other
+// three map to existing pii.Action* constants; allow has no in-band
+// counterpart because the in-band middleware simply passes through.
+const actionAllow = "allow"
+
 // suggestedAction collapses the Redactor's Result flags onto a single
-// wire-format action using the in-band ordering (block > mask >
-// allow). "allow" covers both "nothing matched" and "matched but every
-// span resolved to the allow action" — in both cases the caller may
-// dispatch unchanged, with the Findings list reporting what was seen.
+// wire-format action using the in-band ordering (block > route_local
+// > mask > allow). Spans-without-Blocked-or-LocalOnly means every
+// match resolved to ActionMask.
 func suggestedAction(res pii.Result) string {
 	switch {
 	case res.Blocked:
 		return string(pii.ActionBlock)
-	case res.Masked:
+	case res.LocalOnly:
+		return string(pii.ActionRouteLocal)
+	case len(res.Spans) > 0:
 		return string(pii.ActionMask)
 	default:
-		return string(pii.ActionAllow)
+		return actionAllow
 	}
 }
--- a/core/http/endpoints/localai/pii_decide_test.go
+++ b/core/http/endpoints/localai/pii_decide_test.go
@@ -16,8 +16,8 @@ import (

 // PIIDecideEndpoint exposes the redactor as a decision oracle. These
 // specs pin the validation surface and the suggested_action mapping
-// across the three actions (allow/mask/block). The redactor itself is
-// covered in core/services/routing/pii/redactor_test.go.
+// across all four actions (allow/mask/route_local/block). The redactor
+// itself is covered in core/services/routing/pii/redactor_test.go.

 var _ = Describe("PIIDecideEndpoint", func() {
 	var redactor *pii.Redactor
@@ -68,17 +68,16 @@ var _ = Describe("PIIDecideEndpoint", func() {
 		Expect(len(body.Findings)).To(BeNumerically(">=", 1))
 	})

-	It("returns allow when a matched pattern's action is allow", func() {
-		// Downgrade the email pattern to allow for this test —
-		// exercises the allow branch of suggestedAction: a match is
-		// found, but the strongest action is allow so the suggestion
-		// is "allow" and the text is left intact.
-		Expect(redactor.SetAction("email", pii.ActionAllow)).To(Succeed())
+	It("returns route_local when an override sets that action", func() {
+		// Promote the email pattern to route_local for this test —
+		// exercises the route_local branch of suggestedAction without
+		// needing a custom pattern set.
+		Expect(redactor.SetAction("email", pii.ActionRouteLocal)).To(Succeed())
 		rec, body := invokePIIDecide(redactor, `{"text":"contact alice@example.com"}`)
 		Expect(rec.Code).To(Equal(http.StatusOK))
-		Expect(body.SuggestedAction).To(Equal("allow"))
-		Expect(body.Findings).To(HaveLen(1), "allow still reports the finding")
-		// allow leaves the original text intact.
+		Expect(body.SuggestedAction).To(Equal("route_local"))
+		// route_local leaves the original text intact — caller decides
+		// whether to forward it to a local-only backend.
 		Expect(body.RedactedPreview).To(ContainSubstring("alice@example.com"))
 	})

--- a/core/http/endpoints/openai/completion.go
+++ b/core/http/endpoints/openai/completion.go
@@ -130,7 +130,7 @@ func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eva
 					overrides = make(map[string]pii.Action, len(raw))
 					for ovid, action := range raw {
 						switch pii.Action(action) {
-						case pii.ActionMask, pii.ActionBlock, pii.ActionAllow:
+						case pii.ActionMask, pii.ActionBlock, pii.ActionRouteLocal:
 							overrides[ovid] = pii.Action(action)
 						}
 					}
--- a/core/http/endpoints/openai/realtime.go
+++ b/core/http/endpoints/openai/realtime.go
@@ -2,10 +2,8 @@ package openai

 import (
 	"context"
-	"crypto/rand"
 	"encoding/base64"
 	"encoding/binary"
-	"encoding/hex"
 	"encoding/json"
 	"fmt"
 	"math"
@@ -237,12 +235,6 @@ type Model interface {
 	Transcribe(ctx context.Context, audio, language string, translate bool, diarize bool, prompt string) (*schema.TranscriptionResult, error)
 	Predict(ctx context.Context, messages schema.Messages, images, videos, audios []string, tokenCallback func(string, backend.TokenUsage) bool, tools []types.ToolUnion, toolChoice *types.ToolChoiceUnion, logprobs *int, topLogprobs *int, logitBias map[string]float64) (func() (backend.LLMResponse, error), error)
 	TTS(ctx context.Context, text, voice, language string) (string, *proto.Result, error)
-	// TTSStream synthesizes speech incrementally, invoking onAudio with raw PCM
-	// chunks (and the backend sample rate) as they are produced.
-	TTSStream(ctx context.Context, text, voice, language string, onAudio func(pcm []byte, sampleRate int) error) error
-	// TranscribeStream transcribes audio incrementally, invoking onDelta for each
-	// transcript text fragment and returning the final aggregated result.
-	TranscribeStream(ctx context.Context, audio, language string, translate, diarize bool, prompt string, onDelta func(text string)) (*schema.TranscriptionResult, error)
 	PredictConfig() *config.ModelConfig
 }

@@ -990,8 +982,18 @@ func updateSession(session *Session, update *types.SessionUnion, cl *config.Mode
 	}

 	if rt.Audio != nil && rt.Audio.Input != nil && rt.Audio.Input.Transcription != nil {
-		session.InputAudioTranscription = rt.Audio.Input.Transcription
-		session.ModelConfig.Pipeline.Transcription = rt.Audio.Input.Transcription.Model
+		// A session.update may carry only a language (e.g. a client pinning the STT
+		// input language) and leave Model empty. Inherit the model already recorded
+		// on the session, and never overwrite the pipeline's transcription backend
+		// with "": a blank model makes the next utterance's RPC fail ("unimplemented").
+		incoming := rt.Audio.Input.Transcription
+		if prev := session.InputAudioTranscription; prev != nil && incoming.Model == "" {
+			incoming.Model = prev.Model
+		}
+		session.InputAudioTranscription = incoming
+		if model := incoming.Model; model != "" {
+			session.ModelConfig.Pipeline.Transcription = model
+		}
 	}

 	if rt.Model != "" || (rt.Audio != nil && rt.Audio.Output != nil && rt.Audio.Output.Voice != "") || (rt.Audio != nil && rt.Audio.Input != nil && rt.Audio.Input.Transcription != nil) {
@@ -1262,15 +1264,27 @@ func commitUtterance(ctx context.Context, utt []byte, session *Session, conv *Co
 	// TODO: If we have a real any-to-any model then transcription is optional
 	var transcript string
 	if session.InputAudioTranscription != nil {
-		// emitTranscription streams transcript deltas when
-		// pipeline.streaming.transcription is set, otherwise emits a single
-		// completed event; either way it returns the final transcript text.
-		var err error
-		transcript, err = emitTranscription(ctx, t, session, generateItemID(), f.Name())
+		tr, err := session.ModelInterface.Transcribe(ctx, f.Name(), session.InputAudioTranscription.Language, false, false, session.InputAudioTranscription.Prompt)
 		if err != nil {
 			sendError(t, "transcription_failed", err.Error(), "", "event_TODO")
 			return
+		} else if tr == nil {
+			sendError(t, "transcription_failed", "trancribe result is nil", "", "event_TODO")
+			return
 		}
+
+		transcript = tr.Text
+		sendEvent(t, types.ConversationItemInputAudioTranscriptionCompletedEvent{
+			ServerEventBase: types.ServerEventBase{
+				EventID: "event_TODO",
+			},
+
+			ItemID: generateItemID(),
+			// ResponseID:   "resp_TODO", // Not needed for transcription completed event
+			// OutputIndex:  0,
+			ContentIndex: 0,
+			Transcript:   transcript,
+		})
 	} else {
 		sendNotImplemented(t, "any-to-any models")
 		return
@@ -1498,26 +1512,6 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
 		},
 	})

-	// Streamed LLM path: when the pipeline opts into LLM streaming, stream the
-	// transcript to the client as it is generated and synthesize the buffered
-	// message once. Tool turns are supported only when the model uses its
-	// tokenizer template: the C++ autoparser then delivers content and tool
-	// calls via ChatDeltas (clearing the text stream), so the spoken transcript
-	// never leaks tool-call tokens. Grammar-based function calling emits the
-	// call as JSON in the token stream, so those turns keep the buffered path.
-	if config != nil && session.ModelConfig != nil && session.ModelConfig.Pipeline.StreamLLM() {
-		canStream := len(tools) == 0 || config.TemplateConfig.UseTokenizerTemplate
-		var respMods []types.Modality
-		if overrides != nil {
-			respMods = overrides.OutputModalities
-		}
-		if canStream && modalitiesContainAudio(resolveOutputModalities(session.OutputModalities, respMods)) {
-			if streamLLMResponse(ctx, session, conv, t, responseID, conversationHistory, images, config, tools, toolChoice, toolTurn) {
-				return
-			}
-		}
-	}
-
 	predFunc, err := session.ModelInterface.Predict(ctx, conversationHistory, images, nil, nil, nil, tools, toolChoice, nil, nil, nil)
 	if err != nil {
 		sendError(t, "inference_failed", fmt.Sprintf("backend error: %v", err), "", "") // item.Assistant.ID is unknown here
@@ -1595,7 +1589,7 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
 		// ExtractReasoningWithConfig is a no-op when no tag pair matches,
 		// so it's safe to apply unconditionally in the no-reasoning branch.
 		if deltaReasoning == "" && deltaContent != "" {
-			deltaReasoning, deltaContent = reasoning.ExtractReasoningComplete(deltaContent, thinkingStartToken, spokenReasoningConfig(config.ReasoningConfig))
+			deltaReasoning, deltaContent = reasoning.ExtractReasoningComplete(deltaContent, thinkingStartToken, config.ReasoningConfig)
 		}
 		reasoningText = deltaReasoning
 		responseWithoutReasoning = deltaContent
@@ -1603,7 +1597,7 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
 		cleanedResponse = deltaContent
 		toolCalls = deltaToolCalls
 	} else {
-		reasoningText, responseWithoutReasoning = reasoning.ExtractReasoningComplete(rawResponse, thinkingStartToken, spokenReasoningConfig(config.ReasoningConfig))
+		reasoningText, responseWithoutReasoning = reasoning.ExtractReasoningComplete(rawResponse, thinkingStartToken, config.ReasoningConfig)
 		textContent = functions.ParseTextContent(responseWithoutReasoning, config.FunctionsConfig)
 		cleanedResponse = functions.CleanupLLMResult(responseWithoutReasoning, config.FunctionsConfig)
 		toolCalls = functions.ParseFunctionCall(cleanedResponse, config.FunctionsConfig)
@@ -1729,7 +1723,64 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
 				return
 			}

-			// Transcript of the spoken reply (the audio's text).
+			audioFilePath, res, err := session.ModelInterface.TTS(ctx, finalSpeech, session.Voice, session.InputAudioTranscription.Language)
+			if err != nil {
+				if ctx.Err() != nil {
+					xlog.Debug("TTS cancelled (barge-in)")
+					sendCancelledResponse()
+					return
+				}
+				xlog.Error("TTS failed", "error", err)
+				sendError(t, "tts_error", fmt.Sprintf("TTS generation failed: %v", err), "", item.Assistant.ID)
+				return
+			}
+			if !res.Success {
+				xlog.Error("TTS failed", "message", res.Message)
+				sendError(t, "tts_error", fmt.Sprintf("TTS generation failed: %s", res.Message), "", item.Assistant.ID)
+				return
+			}
+			defer func() { _ = os.Remove(audioFilePath) }()
+
+			audioBytes, err := os.ReadFile(audioFilePath)
+			if err != nil {
+				xlog.Error("failed to read TTS file", "error", err)
+				sendError(t, "tts_error", fmt.Sprintf("Failed to read TTS audio: %v", err), "", item.Assistant.ID)
+				return
+			}
+
+			// Parse WAV header to get raw PCM and the actual sample rate from the TTS backend.
+			pcmData, ttsSampleRate := laudio.ParseWAV(audioBytes)
+			if ttsSampleRate == 0 {
+				ttsSampleRate = localSampleRate
+			}
+			xlog.Debug("TTS audio parsed", "raw_bytes", len(audioBytes), "pcm_bytes", len(pcmData), "sample_rate", ttsSampleRate)
+
+			// SendAudio (WebRTC) passes PCM at the TTS sample rate directly to the
+			// Opus encoder, which resamples to 48kHz internally. This avoids a
+			// lossy intermediate resample through 16kHz.
+			// XXX: This is a noop in websocket mode; it's included in the JSON instead
+			if err := t.SendAudio(ctx, pcmData, ttsSampleRate); err != nil {
+				if ctx.Err() != nil {
+					xlog.Debug("Audio playback cancelled (barge-in)")
+					sendCancelledResponse()
+					return
+				}
+				xlog.Error("failed to send audio via transport", "error", err)
+			}
+
+			// For WebSocket clients, resample to the session's output rate and
+			// deliver audio as base64 in JSON events. WebRTC clients already
+			// received audio over the RTP track, so skip the base64 payload.
+			if !isWebRTC {
+				wsPCM := pcmData
+				if ttsSampleRate != session.OutputSampleRate {
+					samples := sound.BytesToInt16sLE(pcmData)
+					resampled := sound.ResampleInt16(samples, ttsSampleRate, session.OutputSampleRate)
+					wsPCM = sound.Int16toBytesLE(resampled)
+				}
+				audioString = base64.StdEncoding.EncodeToString(wsPCM)
+			}
+
 			sendEvent(t, types.ResponseOutputAudioTranscriptDeltaEvent{
 				ServerEventBase: types.ServerEventBase{},
 				ResponseID:      responseID,
@@ -1747,26 +1798,15 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
 				Transcript:      finalSpeech,
 			})

-			// Synthesize and send the audio. With pipeline.streaming.tts enabled
-			// emitSpeech forwards a response.output_audio.delta per backend PCM
-			// chunk as it's produced; otherwise it sends the whole utterance as a
-			// single delta. The returned PCM is stored (base64) on the item below.
-			pcmAudio, err := emitSpeech(ctx, t, session, responseID, item.Assistant.ID, finalSpeech)
-			if err != nil {
-				if ctx.Err() != nil {
-					xlog.Debug("TTS cancelled (barge-in)")
-					sendCancelledResponse()
-					return
-				}
-				xlog.Error("TTS failed", "error", err)
-				sendError(t, "tts_error", fmt.Sprintf("TTS generation failed: %v", err), "", item.Assistant.ID)
-				return
-			}
-			if !isWebRTC {
-				audioString = base64.StdEncoding.EncodeToString(pcmAudio)
-			}
-
 			if !isWebRTC {
+				sendEvent(t, types.ResponseOutputAudioDeltaEvent{
+					ServerEventBase: types.ServerEventBase{},
+					ResponseID:      responseID,
+					ItemID:          item.Assistant.ID,
+					OutputIndex:     0,
+					ContentIndex:    0,
+					Delta:           audioString,
+				})
 				sendEvent(t, types.ResponseOutputAudioDoneEvent{
 					ServerEventBase: types.ServerEventBase{},
 					ResponseID:      responseID,
@@ -1819,27 +1859,17 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
 		})
 	}

-	// Emit the parsed tool calls, the terminal response.done, and (for
-	// server-side assistant tools) the follow-up response. Shared with the
-	// streamed path so both finalize tool calls identically.
-	emitToolCallItems(ctx, session, conv, t, responseID, finalToolCalls, finalSpeech != "", toolTurn)
-}
-
-// emitToolCallItems emits the realtime function_call items for the parsed tool
-// calls, the terminal response.done, and — for server-side LocalAI Assistant
-// tools — re-triggers a follow-up response so the model can speak the result.
-// hasContent shifts the tool-call output index past the assistant content item
-// when the same turn also produced spoken/text content. Two tool paths:
-//   - LocalAI Assistant tools (session.AssistantExecutor.IsTool) run server-side;
-//     we append both the call and its output to conv.Items and re-trigger. The
-//     client only sees observability events.
-//   - All other tools follow the standard OpenAI flow: emit
-//     function_call_arguments.done and wait for the client to send
-//     conversation.item.create back.
-func emitToolCallItems(ctx context.Context, session *Session, conv *Conversation, t Transport, responseID string, toolCalls []functions.FuncCallResults, hasContent bool, toolTurn int) {
-	xlog.Debug("About to handle tool calls", "finalToolCallsCount", len(toolCalls))
+	// Handle Tool Calls. Two paths:
+	//   - LocalAI Assistant tools (session.AssistantExecutor.IsTool) run
+	//     server-side; we append both the call and its output to conv.Items
+	//     and re-trigger a follow-up response so the model can speak the
+	//     result. The client only sees observability events.
+	//   - All other tools follow the standard OpenAI flow: emit
+	//     function_call_arguments.done and wait for the client to send
+	//     conversation.item.create back.
+	xlog.Debug("About to handle tool calls", "finalToolCallsCount", len(finalToolCalls))
 	executedAssistantTool := false
-	for i, tc := range toolCalls {
+	for i, tc := range finalToolCalls {
 		toolCallID := generateItemID()
 		callID := "call_" + generateUniqueID() // OpenAI uses call_xyz

@@ -1859,7 +1889,7 @@ func emitToolCallItems(ctx context.Context, session *Session, conv *Conversation
 		conv.Lock.Unlock()

 		outputIndex := i
-		if hasContent {
+		if finalSpeech != "" {
 			outputIndex++
 		}

@@ -1985,11 +2015,8 @@ func generateItemID() string {
 }

 func generateUniqueID() string {
-	// 16 random bytes, hex-encoded. Must be collision-free: session, item,
-	// response and call IDs build on this, and the conversation tracks/removes
-	// items by ID (e.g. cancel() in realtime_stream.go, conversation.item.retrieve).
-	// A constant would make every ID alias and corrupt that bookkeeping.
-	var b [16]byte
-	_, _ = rand.Read(b[:])
-	return hex.EncodeToString(b[:])
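+	// FIXME: placeholder, NOT actually unique: every call returns the same
+	// constant, so all IDs derived from it (e.g. "call_"+id, item IDs) collide.
+	// Replace with a random or monotonic ID (counter or UUID) before relying on it.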
+	return "unique_id"
 }
--- a/core/http/endpoints/openai/realtime_chunker.go
+++ b/core/http/endpoints/openai/realtime_chunker.go
@@ -1,200 +0,0 @@
-package openai
-
-import (
-	"strings"
-	"unicode"
-	"unicode/utf8"
-
-	"github.com/rivo/uniseg"
-)
-
-// Default clause-chunker bounds (in runes). minRunes gates only sub-sentence
-// (clause-mark / Thai-space) cuts so we don't synthesize tiny choppy fragments;
-// full sentences always flush regardless of length. maxRunes caps an
-// unterminated run so a long punctuation-less span doesn't buffer unbounded.
-const (
-	defaultClauseMinRunes = 12
-	defaultClauseMaxRunes = 200
-)
-
-// clauseChunker splits streamed LLM content into speakable clauses for
-// incremental TTS, in a SCRIPT-AWARE way so it works for languages without
-// whitespace word boundaries. It leans on UAX #29 sentence segmentation (which
-// natively terminates on CJK 。！？ as well as Latin .!?), adds CJK clause
-// punctuation (，、；：) and Thai/Lao spaces as finer boundaries, and caps an
-// over-long unterminated run via UAX #14 line-break opportunities.
-//
-// Unlike the old ASCII .!?/newline segmenter (dropped in 076dcdbe), it does not
-// degrade to whole-message buffering for CJK (handled natively) or Thai/Lao
-// (handled via spaces, which Thai uses at clause/sentence boundaries). Scripts
-// that genuinely need a dictionary (Khmer/Myanmar) simply stay buffered until a
-// space or end-of-message — no worse than the buffered default.
-//
-// It is not safe for concurrent use; callers feed it from a single goroutine
-// (the LLM token callback).
-type clauseChunker struct {
-	buf      strings.Builder
-	minRunes int
-	maxRunes int
-}
-
-func newClauseChunker(minRunes, maxRunes int) *clauseChunker {
-	return &clauseChunker{minRunes: minRunes, maxRunes: maxRunes}
-}
-
-// push appends streamed content and returns any clauses that are now complete —
-// "complete" meaning confirmed by following content, so we never speak a clause
-// that the next token might extend. Incomplete trailing text stays buffered.
-func (c *clauseChunker) push(text string) []string {
-	c.buf.WriteString(text)
-	return c.drain(false)
-}
-
-// flush returns the remaining buffered clauses, treating end-of-input as a hard
-// boundary, and clears the buffer.
-func (c *clauseChunker) flush() []string {
-	return c.drain(true)
-}
-
-func (c *clauseChunker) drain(final bool) []string {
-	s := c.buf.String()
-	rest := s
-	var out []string
-	for rest != "" {
-		end, ok := c.nextBoundary(rest, final)
-		if !ok {
-			break
-		}
-		if seg := strings.TrimSpace(rest[:end]); seg != "" {
-			out = append(out, seg)
-		}
-		rest = rest[end:]
-	}
-	// Rewriting the builder reallocates and copies the whole buffer; skip it on
-	// the common per-token call where no boundary was confirmed.
-	if len(rest) != len(s) {
-		c.buf.Reset()
-		c.buf.WriteString(rest)
-	}
-	return out
-}
-
-// nextBoundary returns the byte offset just past the first emittable clause in
-// s, or ok=false when more input is needed (final=false) and no boundary is
-// confirmed yet.
-func (c *clauseChunker) nextBoundary(s string, final bool) (int, bool) {
-	if s == "" {
-		return 0, false
-	}
-
-	// 1) UAX #29 sentence boundary. When the first sentence is followed by more
-	//    text it is a confirmed complete sentence (handles Latin .!? with
-	//    abbreviation/decimal guards, and CJK 。！？ with no whitespace).
-	sentence, rest, _ := uniseg.FirstSentenceInString(s, -1)
-	if rest != "" {
-		// Optionally cut finer inside the sentence at a clause boundary.
-		if cut, ok := c.firstClauseCut(sentence); ok {
-			return cut, true
-		}
-		return len(sentence), true
-	}
-
-	// 2) Unterminated tail: look for a sub-sentence clause boundary (CJK
-	//    punctuation or a Thai/Lao space) confirmed by following content.
-	if cut, ok := c.firstClauseCut(s); ok {
-		return cut, true
-	}
-
-	// 3) Over-long punctuation-less run: force a typographically legal break so
-	//    we don't buffer unbounded (e.g. a long CJK run with no punctuation).
-	if !final && c.maxRunes > 0 && utf8.RuneCountInString(s) > c.maxRunes {
-		if cut, ok := lineBreakCut(s, c.maxRunes); ok {
-			return cut, true
-		}
-	}
-
-	// 4) End of input: emit whatever remains as the final clause.
-	if final {
-		return len(s), true
-	}
-	return 0, false
-}
-
-// firstClauseCut returns the byte offset just past the first sub-sentence clause
-// boundary in s — a CJK clause punctuation mark, or a space following a Thai/Lao
-// letter — provided the prefix is at least minRunes long and non-space content
-// follows. The boundary mark (and any trailing spaces) stay with the left clause.
-func (c *clauseChunker) firstClauseCut(s string) (int, bool) {
-	var prev rune
-	runes := 0
-	for i, r := range s {
-		boundary := isCJKClausePunct(r) || (unicode.IsSpace(r) && isThaiLao(prev))
-		if boundary && runes+1 >= c.minRunes {
-			end := i + utf8.RuneLen(r)
-			for end < len(s) {
-				nr, sz := utf8.DecodeRuneInString(s[end:])
-				if !unicode.IsSpace(nr) {
-					break
-				}
-				end += sz
-			}
-			if end < len(s) { // confirmed: real content follows the boundary
-				return end, true
-			}
-			// Boundary sits at the end of the buffer with nothing after it yet —
-			// wait for the next token to confirm it rather than emit early.
-			return 0, false
-		}
-		prev = r
-		runes++
-	}
-	return 0, false
-}
-
-// lineBreakCut walks UAX #14 line segments and returns the byte offset of the
-// last legal break opportunity at or before maxRunes. Returns ok=false when the
-// run has no internal break opportunity (e.g. a space-less Thai run), leaving it
-// buffered.
-func lineBreakCut(s string, maxRunes int) (int, bool) {
-	state := -1
-	rest := s
-	consumed := 0
-	runes := 0
-	for rest != "" {
-		seg, rem, _, st := uniseg.FirstLineSegmentInString(rest, state)
-		state = st
-		runes += utf8.RuneCountInString(seg)
-		consumed += len(seg)
-		rest = rem
-		if runes >= maxRunes {
-			if consumed < len(s) {
-				return consumed, true
-			}
-			return 0, false
-		}
-	}
-	return 0, false
-}
-
-// isCJKClausePunct reports whether r is a CJK clause-level separator worth a
-// soft TTS break. Sentence terminators (。！？) are intentionally excluded — UAX
-// #29 sentence segmentation already handles those.
-func isCJKClausePunct(r rune) bool {
-	switch r {
-	case '，', // ， fullwidth comma
-		'、', // 、 ideographic comma
-		'；', // ； fullwidth semicolon
-		'：', // ： fullwidth colon
-		'・', // ・ katakana middle dot
-		'･': // ・ halfwidth katakana middle dot
-		return true
-	}
-	return false
-}
-
-// isThaiLao reports whether r is a Thai or Lao letter. Those scripts have no
-// inter-word spaces; an ASCII space inside such a run marks a clause/sentence
-// boundary, which is the only no-dictionary segmentation signal available.
-func isThaiLao(r rune) bool {
-	return unicode.Is(unicode.Thai, r) || unicode.Is(unicode.Lao, r)
-}
--- a/core/http/endpoints/openai/realtime_chunker_test.go
+++ b/core/http/endpoints/openai/realtime_chunker_test.go
@@ -1,103 +0,0 @@
-package openai
-
-import (
-	"strings"
-	"unicode/utf8"
-
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-// clauseChunker splits streamed LLM content into speakable clauses in a
-// script-aware way: UAX#29 sentences (Latin .!? and CJK 。！？), CJK clause
-// punctuation, and Thai/Lao spaces — never whitespace-splitting CJK.
-var _ = Describe("clauseChunker", func() {
-	Context("Latin sentences", func() {
-		It("emits a sentence only once following content confirms it is complete", func() {
-			c := newClauseChunker(12, 200)
-			Expect(c.push("Hello world. How are you?")).To(Equal([]string{"Hello world."}))
-			// The trailing sentence is held until flush (the next token might extend it).
-			Expect(c.flush()).To(Equal([]string{"How are you?"}))
-		})
-
-		It("assembles a sentence across many small tokens", func() {
-			c := newClauseChunker(12, 200)
-			var got []string
-			for _, tok := range []string{"Hello", " world.", " How", " are", " you?"} {
-				got = append(got, c.push(tok)...)
-			}
-			got = append(got, c.flush()...)
-			Expect(got).To(Equal([]string{"Hello world.", "How are you?"}))
-		})
-
-		It("does not split decimals or abbreviations (UAX#29 SB6)", func() {
-			c := newClauseChunker(12, 200)
-			got := c.push("Pi is 3.14 and e is 2.72. Done")
-			Expect(got).To(Equal([]string{"Pi is 3.14 and e is 2.72."}))
-			Expect(c.flush()).To(Equal([]string{"Done"}))
-		})
-	})
-
-	Context("CJK (no whitespace)", func() {
-		It("splits Chinese on the ideographic full stop", func() {
-			c := newClauseChunker(12, 200)
-			Expect(c.push("你好世界。今天天气很好。")).To(Equal([]string{"你好世界。"}))
-			Expect(c.flush()).To(Equal([]string{"今天天气很好。"}))
-		})
-
-		It("splits Japanese on the ideographic full stop", func() {
-			c := newClauseChunker(12, 200)
-			Expect(c.push("こんにちは。元気ですか。")).To(Equal([]string{"こんにちは。"}))
-			Expect(c.flush()).To(Equal([]string{"元気ですか。"}))
-		})
-
-		It("splits on CJK clause punctuation for lower latency", func() {
-			c := newClauseChunker(2, 200) // small min so short test clauses cut
-			Expect(c.push("你好，世界。再见")).To(Equal([]string{"你好，", "世界。"}))
-			Expect(c.flush()).To(Equal([]string{"再见"}))
-		})
-	})
-
-	Context("Thai (spaces mark clauses, not words)", func() {
-		It("splits a Thai run on the inter-clause space", func() {
-			c := newClauseChunker(2, 200)
-			Expect(c.push("สวัสดีครับ กินข้าวไหม")).To(Equal([]string{"สวัสดีครับ"}))
-			Expect(c.flush()).To(Equal([]string{"กินข้าวไหม"}))
-		})
-
-		It("never shatters a space-less Thai run into characters", func() {
-			c := newClauseChunker(2, 200)
-			Expect(c.push("สวัสดีครับ")).To(BeEmpty()) // held, no boundary
-			Expect(c.flush()).To(Equal([]string{"สวัสดีครับ"}))
-		})
-	})
-
-	Context("length cap (UAX#14 fallback)", func() {
-		It("force-breaks an over-long punctuation-less CJK run at legal points", func() {
-			c := newClauseChunker(4, 10) // maxRunes = 10
-			run := strings.Repeat("字", 25)
-			got := c.push(run)
-			got = append(got, c.flush()...)
-			total := 0
-			for _, seg := range got {
-				n := utf8.RuneCountInString(seg)
-				Expect(n).To(BeNumerically("<=", 10)) // never exceeds the cap
-				total += n
-			}
-			Expect(total).To(Equal(25))                       // nothing dropped
-			Expect(len(got)).To(BeNumerically(">=", 3))       // 10 + 10 + 5
-		})
-	})
-
-	Context("buffer lifecycle", func() {
-		It("flush clears the buffer so the chunker is reusable", func() {
-			c := newClauseChunker(12, 200)
-			// "First one." is confirmed by the following "Second", so push drains it;
-			// only the unterminated tail remains for flush.
-			Expect(c.push("First one. Second")).To(Equal([]string{"First one."}))
-			Expect(c.flush()).To(Equal([]string{"Second"}))
-			Expect(c.flush()).To(BeEmpty())
-			Expect(c.push("Again. More")).To(Equal([]string{"Again."}))
-		})
-	})
-})
--- a/core/http/endpoints/openai/realtime_doubles_test.go
+++ b/core/http/endpoints/openai/realtime_doubles_test.go
@@ -1,138 +0,0 @@
-package openai
-
-import (
-	"context"
-	"strings"
-
-	"github.com/mudler/LocalAI/core/backend"
-	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
-	"github.com/mudler/LocalAI/core/schema"
-	"github.com/mudler/LocalAI/pkg/grpc/proto"
-)
-
-// fakeTransport records the server events and audio sent to a realtime client
-// so streaming behaviour can be asserted without a real WebSocket/WebRTC peer.
-// It is not a *WebRTCTransport, so handler code takes the WebSocket path.
-type fakeTransport struct {
-	events []types.ServerEvent
-	audio  []fakeAudioChunk
-}
-
-type fakeAudioChunk struct {
-	pcm        []byte
-	sampleRate int
-}
-
-func (f *fakeTransport) SendEvent(e types.ServerEvent) error {
-	f.events = append(f.events, e)
-	return nil
-}
-
-func (f *fakeTransport) ReadEvent() ([]byte, error) { return nil, nil }
-
-func (f *fakeTransport) SendAudio(_ context.Context, pcm []byte, sampleRate int) error {
-	f.audio = append(f.audio, fakeAudioChunk{pcm: pcm, sampleRate: sampleRate})
-	return nil
-}
-
-func (f *fakeTransport) Close() error { return nil }
-
-// countEvents returns how many recorded events have the given type.
-func (f *fakeTransport) countEvents(et types.ServerEventType) int {
-	n := 0
-	for _, e := range f.events {
-		if e.ServerEventType() == et {
-			n++
-		}
-	}
-	return n
-}
-
-// transcriptDeltaText concatenates the Delta of every recorded transcript
-// delta event — i.e. the text streamed to the client as it is generated.
-func (f *fakeTransport) transcriptDeltaText() string {
-	var b strings.Builder
-	for _, e := range f.events {
-		if d, ok := e.(types.ResponseOutputAudioTranscriptDeltaEvent); ok {
-			b.WriteString(d.Delta)
-		}
-	}
-	return b.String()
-}
-
-// fakeModel is a configurable Model double. TTSStream replays ttsStreamChunks
-// and TranscribeStream replays transcribeDeltas, so the handler's streaming
-// paths can be driven deterministically.
-type fakeModel struct {
-	cfg *config.ModelConfig
-
-	ttsFile         string
-	ttsStreamChunks [][]byte
-	ttsStreamRate   int
-	ttsStreamErr    error
-
-	transcribeDeltas []string
-	transcribeFinal  *schema.TranscriptionResult
-
-	// Predict streaming: predictTokens are replayed through the token callback
-	// (simulating streamed LLM output); predictResp/predictErr are returned by
-	// the deferred predict function. predictChunkDeltas, when set, are delivered
-	// per-token via TokenUsage.ChatDeltas to exercise the autoparser path.
-	predictTokens      []string
-	predictChunkDeltas [][]*proto.ChatDelta
-	predictResp        backend.LLMResponse
-	predictErr         error
-}
-
-func (m *fakeModel) VAD(context.Context, *schema.VADRequest) (*schema.VADResponse, error) {
-	return nil, nil
-}
-
-func (m *fakeModel) Transcribe(context.Context, string, string, bool, bool, string) (*schema.TranscriptionResult, error) {
-	return m.transcribeFinal, nil
-}
-
-func (m *fakeModel) Predict(_ context.Context, _ schema.Messages, _, _, _ []string, cb func(string, backend.TokenUsage) bool, _ []types.ToolUnion, _ *types.ToolChoiceUnion, _, _ *int, _ map[string]float64) (func() (backend.LLMResponse, error), error) {
-	if m.predictErr != nil {
-		return nil, m.predictErr
-	}
-	return func() (backend.LLMResponse, error) {
-		for i, tok := range m.predictTokens {
-			if cb == nil {
-				continue
-			}
-			usage := backend.TokenUsage{}
-			if i < len(m.predictChunkDeltas) {
-				usage.ChatDeltas = m.predictChunkDeltas[i]
-			}
-			cb(tok, usage)
-		}
-		return m.predictResp, nil
-	}, nil
-}
-
-func (m *fakeModel) TTS(context.Context, string, string, string) (string, *proto.Result, error) {
-	return m.ttsFile, &proto.Result{Success: true}, nil
-}
-
-func (m *fakeModel) TTSStream(_ context.Context, _, _, _ string, onAudio func(pcm []byte, sampleRate int) error) error {
-	if m.ttsStreamErr != nil {
-		return m.ttsStreamErr
-	}
-	for _, c := range m.ttsStreamChunks {
-		if err := onAudio(c, m.ttsStreamRate); err != nil {
-			return err
-		}
-	}
-	return nil
-}
-
-func (m *fakeModel) TranscribeStream(_ context.Context, _, _ string, _, _ bool, _ string, onDelta func(text string)) (*schema.TranscriptionResult, error) {
-	for _, d := range m.transcribeDeltas {
-		onDelta(d)
-	}
-	return m.transcribeFinal, nil
-}
-
-func (m *fakeModel) PredictConfig() *config.ModelConfig { return m.cfg }
--- a/core/http/endpoints/openai/realtime_model.go
+++ b/core/http/endpoints/openai/realtime_model.go
@@ -3,7 +3,6 @@ package openai
 import (
 	"context"
 	"crypto/rand"
-	"encoding/binary"
 	"encoding/hex"
 	"encoding/json"
 	"fmt"
@@ -88,14 +87,6 @@ func (m *transcriptOnlyModel) TTS(ctx context.Context, text, voice, language str
 	return "", nil, fmt.Errorf("TTS not supported in transcript-only mode")
 }

-func (m *transcriptOnlyModel) TTSStream(ctx context.Context, text, voice, language string, onAudio func(pcm []byte, sampleRate int) error) error {
-	return fmt.Errorf("TTS not supported in transcript-only mode")
-}
-
-func (m *transcriptOnlyModel) TranscribeStream(ctx context.Context, audio, language string, translate, diarize bool, prompt string, onDelta func(text string)) (*schema.TranscriptionResult, error) {
-	return transcribeStream(ctx, m.modelLoader, *m.TranscriptionConfig, m.appConfig, audio, language, translate, diarize, prompt, onDelta)
-}
-
 func (m *transcriptOnlyModel) PredictConfig() *config.ModelConfig {
 	return nil
 }
@@ -330,75 +321,10 @@ func (m *wrappedModel) TTS(ctx context.Context, text, voice, language string) (s
 	return backend.ModelTTS(ctx, text, voice, language, "", nil, m.modelLoader, m.appConfig, *m.TTSConfig)
 }

-func (m *wrappedModel) TTSStream(ctx context.Context, text, voice, language string, onAudio func(pcm []byte, sampleRate int) error) error {
-	return ttsStream(ctx, m.modelLoader, m.appConfig, *m.TTSConfig, text, voice, language, onAudio)
-}
-
-func (m *wrappedModel) TranscribeStream(ctx context.Context, audio, language string, translate, diarize bool, prompt string, onDelta func(text string)) (*schema.TranscriptionResult, error) {
-	return transcribeStream(ctx, m.modelLoader, *m.TranscriptionConfig, m.appConfig, audio, language, translate, diarize, prompt, onDelta)
-}
-
 func (m *wrappedModel) PredictConfig() *config.ModelConfig {
 	return m.LLMConfig
 }

-// wavStreamHeaderBytes is the size of the WAV header that backend.ModelTTSStream
-// emits as its first audio callback; the sample rate lives at byte offset 24.
-const wavStreamHeaderBytes = 44
-
-// ttsStream adapts backend.ModelTTSStream (which emits a WAV stream: a 44-byte
-// header carrying the sample rate, then raw PCM) to the realtime onAudio
-// callback, which wants raw PCM plus the sample rate. The header is buffered
-// until complete, the sample rate is read from it, and subsequent bytes are
-// forwarded as PCM.
-func ttsStream(ctx context.Context, ml *model.ModelLoader, appConfig *config.ApplicationConfig, ttsConfig config.ModelConfig, text, voice, language string, onAudio func(pcm []byte, sampleRate int) error) error {
-	var header []byte
-	headerDone := false
-	sampleRate := 0
-	return backend.ModelTTSStream(ctx, text, voice, language, "", nil, ml, appConfig, ttsConfig, func(b []byte) error {
-		if headerDone {
-			if len(b) == 0 {
-				return nil
-			}
-			return onAudio(b, sampleRate)
-		}
-		header = append(header, b...)
-		if len(header) < wavStreamHeaderBytes {
-			return nil
-		}
-		sampleRate = int(binary.LittleEndian.Uint32(header[24:28]))
-		headerDone = true
-		if len(header) > wavStreamHeaderBytes {
-			return onAudio(header[wavStreamHeaderBytes:], sampleRate)
-		}
-		return nil
-	})
-}
-
-// transcribeStream adapts backend.ModelTranscriptionStream to the realtime
-// onDelta callback, returning the final aggregated transcription result.
-func transcribeStream(ctx context.Context, ml *model.ModelLoader, transcriptionConfig config.ModelConfig, appConfig *config.ApplicationConfig, audio, language string, translate, diarize bool, prompt string, onDelta func(text string)) (*schema.TranscriptionResult, error) {
-	var final *schema.TranscriptionResult
-	err := backend.ModelTranscriptionStream(ctx, backend.TranscriptionRequest{
-		Audio:     audio,
-		Language:  language,
-		Translate: translate,
-		Diarize:   diarize,
-		Prompt:    prompt,
-	}, ml, transcriptionConfig, appConfig, func(chunk backend.TranscriptionStreamChunk) {
-		if chunk.Delta != "" {
-			onDelta(chunk.Delta)
-		}
-		if chunk.Final != nil {
-			final = chunk.Final
-		}
-	})
-	if err != nil {
-		return nil, err
-	}
-	return final, nil
-}
-
 func newTranscriptionOnlyModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) (Model, *config.ModelConfig, error) {
 	cfgVAD, err := cl.LoadModelConfigFileByName(pipeline.VAD, ml.ModelPath)
 	if err != nil {
@@ -451,14 +377,13 @@ func buildRealtimeRoutingContext(a *application.Application, sessionID string) *
 		return nil
 	}
 	deps := &middleware.ClassifierDeps{
-		Scorer:       a.Scorer,
-		TokenCounter: a.TokenCounter,
-		Embedder:     a.Embedder,
-		VectorStore:  a.VectorStore,
-		Reranker:     a.Reranker,
-		ModelLookup:  a.ModelConfigLookup(),
-		Registry:     a.RouterClassifierRegistry(),
-		Evaluator:    a.TemplatesEvaluator(),
+		Scorer:      a.Scorer,
+		Embedder:    a.Embedder,
+		VectorStore: a.VectorStore,
+		Reranker:    a.Reranker,
+		ModelLookup: a.ModelConfigLookup(),
+		Registry:    a.RouterClassifierRegistry(),
+		Evaluator:   a.TemplatesEvaluator(),
 	}
 	userID := ""
 	if u := a.FallbackUser(); u != nil {
@@ -529,10 +454,8 @@ func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model
 		return nil, fmt.Errorf("failed to validate config: %w", err)
 	}

-	// Let the pipeline set the LLM's reasoning effort and force thinking off
-	// (cfgLLM is a per-session copy). disable_thinking applies after the effort.
+	// Let the pipeline set the LLM's reasoning effort (cfgLLM is a per-session copy).
 	applyPipelineReasoning(cfgLLM, *pipeline)
-	applyPipelineThinking(cfgLLM, *pipeline)

 	cfgTTS, err := cl.LoadModelConfigFileByName(pipeline.TTS, ml.ModelPath)
 	if err != nil {
--- a/core/http/endpoints/openai/realtime_speech.go
+++ b/core/http/endpoints/openai/realtime_speech.go
@@ -1,102 +0,0 @@
-package openai
-
-import (
-	"context"
-	"encoding/base64"
-	"fmt"
-	"os"
-	"path/filepath"
-
-	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
-	laudio "github.com/mudler/LocalAI/pkg/audio"
-	"github.com/mudler/LocalAI/pkg/sound"
-)
-
-// emitSpeech synthesizes text and sends the audio to the client. When the
-// pipeline opts into TTS streaming it forwards each PCM chunk as its own
-// response.output_audio.delta as soon as the backend produces it; otherwise it
-// synthesizes the whole utterance and sends it as a single delta.
-//
-// It deliberately does NOT emit transcript or audio-done events: the caller owns
-// those so a streamed reply can be split into several spoken segments that share
-// one response/item.
-//
-// It returns the PCM audio (at the session output rate) accumulated across all
-// chunks, which the caller base64-encodes onto the conversation item. For WebRTC
-// the audio goes over the RTP track instead, so the returned slice is empty.
-func emitSpeech(ctx context.Context, t Transport, session *Session, responseID, itemID, text string) ([]byte, error) {
-	if text == "" {
-		return nil, nil
-	}
-
-	_, isWebRTC := t.(*WebRTCTransport)
-
-	var wsAudio []byte // PCM at the session output rate, accumulated for the item record
-
-	// sendChunk hands one PCM buffer to the transport: WebRTC consumes the raw
-	// PCM directly (it resamples internally); WebSocket gets base64 PCM at the
-	// session output rate via a JSON delta event.
-	sendChunk := func(pcm []byte, sampleRate int) error {
-		if len(pcm) == 0 {
-			return nil
-		}
-		if err := t.SendAudio(ctx, pcm, sampleRate); err != nil {
-			return err
-		}
-		if isWebRTC {
-			return nil
-		}
-		wsPCM := pcm
-		if sampleRate != 0 && sampleRate != session.OutputSampleRate {
-			samples := sound.BytesToInt16sLE(pcm)
-			resampled := sound.ResampleInt16(samples, sampleRate, session.OutputSampleRate)
-			wsPCM = sound.Int16toBytesLE(resampled)
-		}
-		wsAudio = append(wsAudio, wsPCM...)
-		return t.SendEvent(types.ResponseOutputAudioDeltaEvent{
-			ServerEventBase: types.ServerEventBase{},
-			ResponseID:      responseID,
-			ItemID:          itemID,
-			OutputIndex:     0,
-			ContentIndex:    0,
-			Delta:           base64.StdEncoding.EncodeToString(wsPCM),
-		})
-	}
-
-	language := ""
-	if session.InputAudioTranscription != nil {
-		language = session.InputAudioTranscription.Language
-	}
-
-	if session.ModelConfig != nil && session.ModelConfig.Pipeline.StreamTTS() {
-		if err := session.ModelInterface.TTSStream(ctx, text, session.Voice, language, sendChunk); err != nil {
-			return nil, err
-		}
-		return wsAudio, nil
-	}
-
-	// Unary fallback: synthesize the whole utterance to a file, then emit once.
-	audioFilePath, res, err := session.ModelInterface.TTS(ctx, text, session.Voice, language)
-	if err != nil {
-		return nil, err
-	}
-	if res != nil && !res.Success {
-		return nil, fmt.Errorf("tts generation failed: %s", res.Message)
-	}
-	defer func() { _ = os.Remove(audioFilePath) }()
-
-	// filepath.Clean normalizes the backend-produced temp path before reading
-	// (also keeps gosec G304 quiet — the path is backend-controlled, not user input).
-	audioBytes, err := os.ReadFile(filepath.Clean(audioFilePath))
-	if err != nil {
-		return nil, fmt.Errorf("read tts audio: %w", err)
-	}
-	pcm, sampleRate := laudio.ParseWAV(audioBytes)
-	if sampleRate == 0 {
-		sampleRate = session.OutputSampleRate
-	}
-	if err := sendChunk(pcm, sampleRate); err != nil {
-		return nil, err
-	}
-	return wsAudio, nil
-}
--- a/core/http/endpoints/openai/realtime_speech_test.go
+++ b/core/http/endpoints/openai/realtime_speech_test.go
@@ -1,70 +0,0 @@
-package openai
-
-import (
-	"context"
-	"os"
-
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-
-	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
-	laudio "github.com/mudler/LocalAI/pkg/audio"
-)
-
-// emitSpeech synthesizes a piece of text and forwards the audio to the client,
-// streaming a delta per TTS chunk when the pipeline opts in, or sending the
-// whole utterance as one delta otherwise.
-var _ = Describe("emitSpeech", func() {
-	ttsOn := true
-
-	streamingSession := func(m Model) *Session {
-		return &Session{
-			OutputSampleRate: 24000,
-			ModelInterface:   m,
-			ModelConfig: &config.ModelConfig{
-				Pipeline: config.Pipeline{Streaming: config.PipelineStreaming{TTS: &ttsOn}},
-			},
-		}
-	}
-
-	It("streams one output_audio.delta per TTS chunk when streaming is enabled", func() {
-		m := &fakeModel{
-			ttsStreamChunks: [][]byte{{1, 2}, {3, 4}, {5, 6}},
-			ttsStreamRate:   24000,
-		}
-		t := &fakeTransport{}
-
-		audio, err := emitSpeech(context.Background(), t, streamingSession(m), "resp1", "item1", "Hello there.")
-
-		Expect(err).ToNot(HaveOccurred())
-		Expect(t.countEvents(types.ServerEventTypeResponseOutputAudioDelta)).To(Equal(3))
-		// The returned audio is all chunks concatenated (session output rate).
-		Expect(audio).To(Equal([]byte{1, 2, 3, 4, 5, 6}))
-	})
-
-	It("sends a single output_audio.delta in unary mode", func() {
-		// A minimal real WAV file for the unary TTS path to read + parse.
-		f, err := os.CreateTemp("", "emit-*.wav")
-		Expect(err).ToNot(HaveOccurred())
-		defer func() { _ = os.Remove(f.Name()) }()
-		pcm := make([]byte, 320) // 160 samples of silence
-		hdr := laudio.NewWAVHeader(uint32(len(pcm)))
-		Expect(hdr.Write(f)).To(Succeed())
-		_, err = f.Write(pcm)
-		Expect(err).ToNot(HaveOccurred())
-		Expect(f.Close()).To(Succeed())
-
-		session := &Session{
-			OutputSampleRate: 24000,
-			ModelInterface:   &fakeModel{ttsFile: f.Name()},
-			ModelConfig:      &config.ModelConfig{}, // streaming off
-		}
-		t := &fakeTransport{}
-
-		_, err = emitSpeech(context.Background(), t, session, "resp1", "item1", "Hello there.")
-
-		Expect(err).ToNot(HaveOccurred())
-		Expect(t.countEvents(types.ServerEventTypeResponseOutputAudioDelta)).To(Equal(1))
-	})
-})
--- a/core/http/endpoints/openai/realtime_stream.go
+++ b/core/http/endpoints/openai/realtime_stream.go
@@ -1,315 +0,0 @@
-package openai
-
-import (
-	"context"
-	"encoding/base64"
-	"fmt"
-
-	"github.com/mudler/LocalAI/core/backend"
-	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
-	"github.com/mudler/LocalAI/core/schema"
-	"github.com/mudler/LocalAI/pkg/functions"
-	"github.com/mudler/LocalAI/pkg/reasoning"
-)
-
-// transcriptStreamer turns streamed LLM tokens into the assistant's spoken
-// transcript: it strips reasoning incrementally and sends one
-// response.output_audio_transcript.delta per content fragment. It does NOT
-// synthesize audio — the caller buffers the full message and synthesizes it
-// once (streaming the audio chunks when the TTS backend supports TTSStream),
-// which works uniformly for streaming and non-streaming TTS and for languages
-// without sentence or word boundaries.
-type transcriptStreamer struct {
-	ctx        context.Context
-	t          Transport
-	responseID string
-	itemID     string
-	extractor  *reasoning.ReasoningExtractor
-
-	// announce, if set, is invoked once just before the first transcript delta.
-	// It lets the caller create the assistant item lazily, so a content-less
-	// tool-call turn never emits a spurious empty assistant item.
-	announce  func()
-	announced bool
-}
-
-func newTranscriptStreamer(ctx context.Context, t Transport, responseID, itemID, thinkingStartToken string, reasoningCfg reasoning.Config) *transcriptStreamer {
-	return &transcriptStreamer{
-		ctx:        ctx,
-		t:          t,
-		responseID: responseID,
-		itemID:     itemID,
-		extractor:  reasoning.NewReasoningExtractor(thinkingStartToken, spokenReasoningConfig(reasoningCfg)),
-	}
-}
-
-// onToken handles one streamed unit of model output, sending a transcript delta
-// for the new content (reasoning stripped) and returning that content delta so
-// the caller can also feed it to the clause chunker. For plain-content models
-// the unit is the raw text token; for autoparser tool turns the backend clears
-// the text and delivers content via ChatDeltas, so the caller passes that
-// content here. Returns "" when the token produced no new spoken content.
-func (s *transcriptStreamer) onToken(token string) string {
-	_, content := s.extractor.ProcessToken(token)
-	if content == "" {
-		return ""
-	}
-	if !s.announced {
-		s.announced = true
-		if s.announce != nil {
-			s.announce()
-		}
-	}
-	_ = s.t.SendEvent(types.ResponseOutputAudioTranscriptDeltaEvent{
-		ServerEventBase: types.ServerEventBase{},
-		ResponseID:      s.responseID,
-		ItemID:          s.itemID,
-		OutputIndex:     0,
-		ContentIndex:    0,
-		Delta:           content,
-	})
-	return content
-}
-
-// content returns the full transcript so far with reasoning stripped.
-func (s *transcriptStreamer) content() string {
-	return s.extractor.CleanedContent()
-}
-
-// streamLLMResponse drives a streamed realtime reply. It streams the assistant
-// transcript as the LLM generates, then synthesizes the whole buffered message
-// once (streaming the audio chunks when the TTS backend supports it, otherwise a
-// single unary delta). Tool calls parsed from the autoparser ChatDeltas are
-// emitted after the spoken content. The assistant content item is created lazily
-// on the first content delta, so a content-less tool-call turn emits only the
-// tool calls. It returns true when it has fully handled the response so the
-// caller can return; callers must only invoke it for an audio modality, and with
-// tools only when the model uses its tokenizer template (see triggerResponseAtTurn).
-func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation, t Transport, responseID string, history schema.Messages, images []string, llmCfg *config.ModelConfig, tools []types.ToolUnion, toolChoice *types.ToolChoiceUnion, toolTurn int) bool {
-	itemID := generateItemID()
-	item := types.MessageItemUnion{
-		Assistant: &types.MessageItemAssistant{
-			ID:      itemID,
-			Status:  types.ItemStatusInProgress,
-			Content: []types.MessageContentOutput{{Type: types.MessageContentTypeOutputAudio}},
-		},
-	}
-
-	// announce creates the assistant content item lazily, just before the first
-	// transcript delta — a tool-only turn never produces content, so it stays out
-	// of the conversation and the client sees only the tool calls.
-	announced := false
-	announce := func() {
-		announced = true
-		conv.Lock.Lock()
-		conv.Items = append(conv.Items, &item)
-		conv.Lock.Unlock()
-		sendEvent(t, types.ResponseOutputItemAddedEvent{
-			ServerEventBase: types.ServerEventBase{},
-			ResponseID:      responseID,
-			OutputIndex:     0,
-			Item:            item,
-		})
-		sendEvent(t, types.ResponseContentPartAddedEvent{
-			ServerEventBase: types.ServerEventBase{},
-			ResponseID:      responseID,
-			ItemID:          itemID,
-			OutputIndex:     0,
-			ContentIndex:    0,
-			Part:            item.Assistant.Content[0],
-		})
-	}
-
-	cancel := func() {
-		if announced {
-			conv.Lock.Lock()
-			for i := len(conv.Items) - 1; i >= 0; i-- {
-				if conv.Items[i].Assistant != nil && conv.Items[i].Assistant.ID == itemID {
-					conv.Items = append(conv.Items[:i], conv.Items[i+1:]...)
-					break
-				}
-			}
-			conv.Lock.Unlock()
-		}
-		sendEvent(t, types.ResponseDoneEvent{
-			ServerEventBase: types.ServerEventBase{},
-			Response:        types.Response{ID: responseID, Object: "realtime.response", Status: types.ResponseStatusCancelled},
-		})
-	}
-
-	var template string
-	if llmCfg.TemplateConfig.UseTokenizerTemplate {
-		template = llmCfg.GetModelTemplate()
-	} else {
-		template = llmCfg.TemplateConfig.Chat
-	}
-	thinkingStartToken := reasoning.DetectThinkingStartToken(template, &llmCfg.ReasoningConfig)
-
-	// The autoparser (tokenizer-template path) already delivers reasoning-free
-	// content. Prefilling the thinking start token here would re-tag that clean
-	// content as an unclosed reasoning block, leaving CleanedContent() empty —
-	// no spoken reply, no TTS. Disable the prefill; closed tag pairs are still
-	// stripped (PEG-fallback case, #9985).
-	reasoningCfg := llmCfg.ReasoningConfig
-	if llmCfg.TemplateConfig.UseTokenizerTemplate {
-		disablePrefill := true
-		reasoningCfg.DisableReasoningTagPrefill = &disablePrefill
-	}
-
-	streamer := newTranscriptStreamer(ctx, t, responseID, itemID, thinkingStartToken, reasoningCfg)
-	streamer.announce = announce
-
-	// Clause chunking (opt-in): synthesize each clause as soon as it completes
-	// instead of buffering the whole reply. streamedAudio accumulates the PCM
-	// across clauses for the conversation item record; ttsErr captures the first
-	// synthesis failure so the token callback can stop the prediction. emitSpeech
-	// runs synchronously here — the LLM keeps generating into the gRPC stream
-	// while a clause is synthesized, so audio still starts mid-generation.
-	var chunker *clauseChunker
-	if session.ModelConfig != nil && session.ModelConfig.Pipeline.ChunkClauses() {
-		chunker = newClauseChunker(defaultClauseMinRunes, defaultClauseMaxRunes)
-	}
-	var streamedAudio []byte
-	var ttsErr error
-	speakClause := func(clause string) error {
-		a, err := emitSpeech(ctx, t, session, responseID, itemID, clause)
-		if err != nil {
-			return err
-		}
-		streamedAudio = append(streamedAudio, a...)
-		return nil
-	}
-
-	// fail reports a mid-stream failure. A cancelled context means the client
-	// interrupted (barge-in), so roll the turn back instead of erroring.
-	fail := func(code, msg string, err error) bool {
-		if ctx.Err() != nil {
-			cancel()
-		} else {
-			sendError(t, code, fmt.Sprintf("%s: %v", msg, err), "", itemID)
-		}
-		return true
-	}
-
-	cb := func(token string, usage backend.TokenUsage) bool {
-		if ctx.Err() != nil {
-			return false
-		}
-		// Plain-content models stream text via the token; autoparser tool turns
-		// clear the text and deliver content via ChatDeltas, so prefer the latter
-		// when present. Either way only content reaches the transcript — tool-call
-		// deltas are parsed from the final response below.
-		text := token
-		if len(usage.ChatDeltas) > 0 {
-			text = functions.ContentFromChatDeltas(usage.ChatDeltas)
-		}
-		delta := streamer.onToken(text)
-		if chunker != nil && delta != "" {
-			for _, clause := range chunker.push(delta) {
-				if ttsErr = speakClause(clause); ttsErr != nil {
-					return false // stop the prediction; reported after predFunc returns
-				}
-			}
-		}
-		return true
-	}
-
-	predFunc, err := session.ModelInterface.Predict(ctx, history, images, nil, nil, cb, tools, toolChoice, nil, nil, nil)
-	if err != nil {
-		sendError(t, "inference_failed", fmt.Sprintf("backend error: %v", err), "", itemID)
-		return true
-	}
-	pred, err := predFunc()
-	// A clause synthesis failed mid-stream (the callback stopped the prediction);
-	// report it as a TTS error rather than a prediction error.
-	if ttsErr != nil {
-		return fail("tts_error", "TTS generation failed", ttsErr)
-	}
-	if err != nil {
-		return fail("prediction_failed", "backend error", err)
-	}
-	if ctx.Err() != nil {
-		cancel()
-		return true
-	}
-
-	content := streamer.content()
-	toolCalls := functions.ToolCallsFromChatDeltas(pred.ChatDeltas)
-
-	// Finalize the spoken content item only when the turn produced content. A
-	// tool-only turn skips this entirely (no empty assistant item).
-	if content != "" {
-		if !announced {
-			announce()
-		}
-
-		// Synthesize the audio. With clause chunking the completed clauses were
-		// already spoken inside the token callback; flush the trailing clause(s)
-		// the segmenter was still holding. Otherwise buffer the whole message and
-		// synthesize it once. emitSpeech streams the audio chunks when the TTS
-		// backend supports TTSStream, otherwise it sends a single unary delta.
-		var audio []byte
-		if chunker != nil {
-			for _, clause := range chunker.flush() {
-				if ttsErr = speakClause(clause); ttsErr != nil {
-					break
-				}
-			}
-			audio = streamedAudio
-		} else {
-			audio, ttsErr = emitSpeech(ctx, t, session, responseID, itemID, content)
-		}
-		if ttsErr != nil {
-			return fail("tts_error", "TTS generation failed", ttsErr)
-		}
-
-		_, isWebRTC := t.(*WebRTCTransport)
-
-		sendEvent(t, types.ResponseOutputAudioTranscriptDoneEvent{
-			ServerEventBase: types.ServerEventBase{},
-			ResponseID:      responseID,
-			ItemID:          itemID,
-			OutputIndex:     0,
-			ContentIndex:    0,
-			Transcript:      content,
-		})
-		if !isWebRTC {
-			sendEvent(t, types.ResponseOutputAudioDoneEvent{
-				ServerEventBase: types.ServerEventBase{},
-				ResponseID:      responseID,
-				ItemID:          itemID,
-				OutputIndex:     0,
-				ContentIndex:    0,
-			})
-		}
-
-		conv.Lock.Lock()
-		item.Assistant.Status = types.ItemStatusCompleted
-		item.Assistant.Content[0].Transcript = content
-		if !isWebRTC {
-			item.Assistant.Content[0].Audio = base64.StdEncoding.EncodeToString(audio)
-		}
-		conv.Lock.Unlock()
-
-		sendEvent(t, types.ResponseContentPartDoneEvent{
-			ServerEventBase: types.ServerEventBase{},
-			ResponseID:      responseID,
-			ItemID:          itemID,
-			OutputIndex:     0,
-			ContentIndex:    0,
-			Part:            item.Assistant.Content[0],
-		})
-		sendEvent(t, types.ResponseOutputItemDoneEvent{
-			ServerEventBase: types.ServerEventBase{},
-			ResponseID:      responseID,
-			OutputIndex:     0,
-			Item:            item,
-		})
-	}
-
-	// Emit any tool calls, the terminal response.done, and (for server-side
-	// assistant tools) the follow-up turn — shared with the buffered path.
-	emitToolCallItems(ctx, session, conv, t, responseID, toolCalls, content != "", toolTurn)
-	return true
-}
--- a/core/http/endpoints/openai/realtime_stream_test.go
+++ b/core/http/endpoints/openai/realtime_stream_test.go
@@ -1,213 +0,0 @@
-package openai
-
-import (
-	"context"
-
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-
-	"github.com/mudler/LocalAI/core/backend"
-	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
-	"github.com/mudler/LocalAI/pkg/grpc/proto"
-	"github.com/mudler/LocalAI/pkg/reasoning"
-)
-
-// transcriptStreamer turns streamed LLM tokens into incremental transcript
-// deltas, stripping reasoning. Audio is synthesized once from the full message
-// by the caller, so there is no per-sentence segmentation.
-var _ = Describe("transcriptStreamer", func() {
-	It("emits one transcript delta per content token", func() {
-		t := &fakeTransport{}
-		s := newTranscriptStreamer(context.Background(), t, "resp1", "item1", "", reasoning.Config{})
-
-		for _, tok := range []string{"Hello", " world.", " Bye"} {
-			s.onToken(tok)
-		}
-
-		Expect(s.content()).To(Equal("Hello world. Bye"))
-		Expect(t.countEvents(types.ServerEventTypeResponseOutputAudioTranscriptDelta)).To(Equal(3))
-		Expect(t.transcriptDeltaText()).To(Equal("Hello world. Bye"))
-	})
-
-	It("strips leaked reasoning even when reasoning is disabled (disable_thinking safety net)", func() {
-		// disable_thinking maps to DisableReasoning=true (enable_thinking=false to
-		// the backend). If the model emits thinking anyway, the transcript must
-		// still not leak it: stripping always runs for spoken output.
-		disable := true
-		t := &fakeTransport{}
-		s := newTranscriptStreamer(context.Background(), t, "resp1", "item1", "",
-			reasoning.Config{DisableReasoning: &disable})
-
-		s.onToken("<think>secret plan</think>")
-		s.onToken("The answer is 42.")
-
-		Expect(s.content()).To(Equal("The answer is 42."))
-		Expect(s.content()).ToNot(ContainSubstring("secret plan"))
-		Expect(t.transcriptDeltaText()).ToNot(ContainSubstring("secret plan"))
-	})
-
-	It("does not swallow autoparser content when the template has a thinking start token (tokenizer-template path)", func() {
-		// Regression: with tag prefill on, the detected <think> token is
-		// prepended to the autoparser's already-clean content, swallowing the
-		// whole reply (empty transcript → no TTS). streamLLMResponse disables
-		// the prefill for the tokenizer-template path.
-		disablePrefill := true
-		t := &fakeTransport{}
-		s := newTranscriptStreamer(context.Background(), t, "resp1", "item1", "<think>",
-			reasoning.Config{DisableReasoningTagPrefill: &disablePrefill})
-
-		s.onToken("Hello")
-		s.onToken(" there.")
-
-		Expect(s.content()).To(Equal("Hello there."))
-		Expect(t.transcriptDeltaText()).To(Equal("Hello there."))
-	})
-
-	It("still strips embedded closed reasoning tags with prefill disabled (PEG-fallback safety, #9985)", func() {
-		// Disabling prefill must not stop stripping closed <think>…</think>
-		// pairs the PEG fallback can leave in autoparser content.
-		disablePrefill := true
-		t := &fakeTransport{}
-		s := newTranscriptStreamer(context.Background(), t, "resp1", "item1", "<think>",
-			reasoning.Config{DisableReasoningTagPrefill: &disablePrefill})
-
-		s.onToken("<think>secret</think>")
-		s.onToken("The answer is 42.")
-
-		Expect(s.content()).To(Equal("The answer is 42."))
-		Expect(t.transcriptDeltaText()).ToNot(ContainSubstring("secret"))
-	})
-})
-
-// streamLLMResponse drives a full streamed realtime turn: live transcript
-// deltas while the LLM generates, then the whole message is synthesized once.
-var _ = Describe("streamLLMResponse", func() {
-	It("streams transcript deltas then synthesizes the whole message once", func() {
-		on := true
-		m := &fakeModel{
-			predictTokens:   []string{"Hello", " world.", " How are you?"},
-			predictResp:     backend.LLMResponse{Response: "Hello world. How are you?"},
-			ttsStreamChunks: [][]byte{{9}},
-			ttsStreamRate:   24000,
-		}
-		session := &Session{
-			OutputSampleRate: 24000,
-			ModelInterface:   m,
-			ModelConfig: &config.ModelConfig{
-				Pipeline: config.Pipeline{Streaming: config.PipelineStreaming{LLM: &on, TTS: &on}},
-			},
-		}
-		conv := &Conversation{}
-		t := &fakeTransport{}
-		llmCfg := &config.ModelConfig{}
-
-		handled := streamLLMResponse(context.Background(), session, conv, t, "resp1", nil, nil, llmCfg, nil, nil, 0)
-
-		Expect(handled).To(BeTrue())
-		// One live transcript delta per streamed token.
-		Expect(t.countEvents(types.ServerEventTypeResponseOutputAudioTranscriptDelta)).To(Equal(3))
-		// The whole message is synthesized ONCE (not per sentence): a single
-		// emitSpeech replays the one TTS stream chunk.
-		Expect(t.countEvents(types.ServerEventTypeResponseOutputAudioDelta)).To(Equal(1))
-		Expect(t.transcriptDeltaText()).To(Equal("Hello world. How are you?"))
-	})
-
-	It("synthesizes each clause as it completes when clause chunking is enabled", func() {
-		on := true
-		m := &fakeModel{
-			predictTokens:   []string{"Hello world.", " How are you?"},
-			predictResp:     backend.LLMResponse{Response: "Hello world. How are you?"},
-			ttsStreamChunks: [][]byte{{9}},
-			ttsStreamRate:   24000,
-		}
-		session := &Session{
-			OutputSampleRate: 24000,
-			ModelInterface:   m,
-			ModelConfig: &config.ModelConfig{
-				Pipeline: config.Pipeline{Streaming: config.PipelineStreaming{LLM: &on, TTS: &on, ClauseChunking: &on}},
-			},
-		}
-		conv := &Conversation{}
-		t := &fakeTransport{}
-		llmCfg := &config.ModelConfig{}
-
-		handled := streamLLMResponse(context.Background(), session, conv, t, "resp1", nil, nil, llmCfg, nil, nil, 0)
-
-		Expect(handled).To(BeTrue())
-		// Two clauses ("Hello world." mid-stream, "How are you?" on flush) → two
-		// emitSpeech calls → two audio deltas, vs one for whole-message buffering.
-		Expect(t.countEvents(types.ServerEventTypeResponseOutputAudioDelta)).To(Equal(2))
-		// The full transcript still streams verbatim.
-		Expect(t.transcriptDeltaText()).To(Equal("Hello world. How are you?"))
-		// Exactly one terminal response.done.
-		Expect(t.countEvents(types.ServerEventTypeResponseDone)).To(Equal(1))
-	})
-
-	It("streams content deltas and emits tool-call items (autoparser tool turn)", func() {
-		on := true
-		// Autoparser path: reply.Message is empty; content + tool calls arrive via
-		// ChatDeltas. Chunk 1 carries content, chunk 2 carries the tool call.
-		contentDelta := []*proto.ChatDelta{{Content: "Let me check."}}
-		toolDelta := []*proto.ChatDelta{{ToolCalls: []*proto.ToolCallDelta{{Index: 0, Name: "get_weather", Arguments: `{"city":"Paris"}`}}}}
-		m := &fakeModel{
-			predictTokens:      []string{"", ""},
-			predictChunkDeltas: [][]*proto.ChatDelta{contentDelta, toolDelta},
-			predictResp:        backend.LLMResponse{ChatDeltas: append(append([]*proto.ChatDelta{}, contentDelta...), toolDelta...)},
-			ttsStreamChunks:    [][]byte{{9}},
-			ttsStreamRate:      24000,
-		}
-		session := &Session{
-			OutputSampleRate: 24000,
-			ModelInterface:   m,
-			ModelConfig: &config.ModelConfig{
-				Pipeline: config.Pipeline{Streaming: config.PipelineStreaming{LLM: &on, TTS: &on}},
-			},
-		}
-		conv := &Conversation{}
-		t := &fakeTransport{}
-		llmCfg := &config.ModelConfig{}
-		llmCfg.TemplateConfig.UseTokenizerTemplate = true
-
-		handled := streamLLMResponse(context.Background(), session, conv, t, "resp1", nil, nil, llmCfg, nil, nil, 0)
-
-		Expect(handled).To(BeTrue())
-		// The spoken content was streamed live.
-		Expect(t.transcriptDeltaText()).To(Equal("Let me check."))
-		// The tool call is emitted as a function_call item.
-		Expect(t.countEvents(types.ServerEventTypeResponseFunctionCallArgumentsDone)).To(Equal(1))
-		// Exactly one terminal response.done.
-		Expect(t.countEvents(types.ServerEventTypeResponseDone)).To(Equal(1))
-	})
-
-	It("emits only tool-call items for a content-less tool turn (no empty assistant item)", func() {
-		on := true
-		toolDelta := []*proto.ChatDelta{{ToolCalls: []*proto.ToolCallDelta{{Index: 0, Name: "get_weather", Arguments: `{"city":"Rome"}`}}}}
-		m := &fakeModel{
-			predictTokens:      []string{""},
-			predictChunkDeltas: [][]*proto.ChatDelta{toolDelta},
-			predictResp:        backend.LLMResponse{ChatDeltas: toolDelta},
-		}
-		session := &Session{
-			OutputSampleRate: 24000,
-			ModelInterface:   m,
-			ModelConfig: &config.ModelConfig{
-				Pipeline: config.Pipeline{Streaming: config.PipelineStreaming{LLM: &on, TTS: &on}},
-			},
-		}
-		conv := &Conversation{}
-		t := &fakeTransport{}
-		llmCfg := &config.ModelConfig{}
-		llmCfg.TemplateConfig.UseTokenizerTemplate = true
-
-		handled := streamLLMResponse(context.Background(), session, conv, t, "resp1", nil, nil, llmCfg, nil, nil, 0)
-
-		Expect(handled).To(BeTrue())
-		// No content → no transcript deltas and no spurious assistant content item.
-		Expect(t.transcriptDeltaText()).To(Equal(""))
-		Expect(t.countEvents(types.ServerEventTypeResponseOutputAudioTranscriptDelta)).To(Equal(0))
-		// The tool call is still emitted.
-		Expect(t.countEvents(types.ServerEventTypeResponseFunctionCallArgumentsDone)).To(Equal(1))
-		Expect(t.countEvents(types.ServerEventTypeResponseDone)).To(Equal(1))
-	})
-})
--- a/core/http/endpoints/openai/realtime_thinking.go
+++ b/core/http/endpoints/openai/realtime_thinking.go
@@ -1,33 +0,0 @@
-package openai
-
-import (
-	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/pkg/reasoning"
-)
-
-// applyPipelineThinking forces the LLM's reasoning/thinking off when the realtime
-// pipeline sets disable_thinking, mapping to the enable_thinking=false backend
-// metadata via ReasoningConfig.DisableReasoning. The LLM config passed in is the
-// per-session copy returned by the config loader, so this does not affect other
-// users of the same model. When the pipeline does not set disable_thinking the
-// LLM config is left untouched.
-func applyPipelineThinking(llm *config.ModelConfig, pipeline config.Pipeline) {
-	if llm == nil || !pipeline.ThinkingDisabled() {
-		return
-	}
-	disable := true
-	llm.ReasoningConfig.DisableReasoning = &disable
-}
-
-// spokenReasoningConfig adapts a model's reasoning config for stripping reasoning
-// OUT of realtime spoken output. ReasoningConfig.DisableReasoning is overloaded:
-// the backend reads it as the "enable_thinking=false" hint (which pipeline
-// disable_thinking sets via applyPipelineThinking), but the reasoning extractor
-// reads it as "skip stripping, assume there is no reasoning". Honouring the latter
-// when extracting for speech would leak raw <think>…</think> whenever the model
-// ignores the suppression hint. Spoken output must never contain reasoning, so we
-// always strip: clear DisableReasoning while keeping custom tokens/tag pairs.
-func spokenReasoningConfig(cfg reasoning.Config) reasoning.Config {
-	cfg.DisableReasoning = nil
-	return cfg
-}
--- a/core/http/endpoints/openai/realtime_thinking_test.go
+++ b/core/http/endpoints/openai/realtime_thinking_test.go
@@ -1,50 +0,0 @@
-package openai
-
-import (
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-
-	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/pkg/reasoning"
-)
-
-// applyPipelineThinking lets a realtime pipeline force the LLM's thinking off
-// (enable_thinking=false metadata) without editing the LLM model config.
-var _ = Describe("applyPipelineThinking", func() {
-	It("disables reasoning on the LLM config when the pipeline disables thinking", func() {
-		disable := true
-		llm := &config.ModelConfig{}
-		applyPipelineThinking(llm, config.Pipeline{DisableThinking: &disable})
-		Expect(llm.ReasoningConfig.DisableReasoning).ToNot(BeNil())
-		Expect(*llm.ReasoningConfig.DisableReasoning).To(BeTrue())
-	})
-
-	It("leaves the LLM config untouched when the pipeline does not set disable_thinking", func() {
-		llm := &config.ModelConfig{}
-		applyPipelineThinking(llm, config.Pipeline{})
-		Expect(llm.ReasoningConfig.DisableReasoning).To(BeNil())
-	})
-})
-
-// spokenReasoningConfig clears DisableReasoning so realtime spoken output always
-// strips reasoning, even though disable_thinking sets DisableReasoning=true on the
-// LLM config (which the backend reads as enable_thinking=false).
-var _ = Describe("spokenReasoningConfig", func() {
-	It("clears DisableReasoning so the extractor still strips leaked reasoning", func() {
-		disable := true
-		out := spokenReasoningConfig(reasoning.Config{DisableReasoning: &disable})
-		Expect(out.DisableReasoning).To(BeNil())
-	})
-
-	It("preserves the other reasoning settings", func() {
-		disable := true
-		out := spokenReasoningConfig(reasoning.Config{
-			DisableReasoning:    &disable,
-			ThinkingStartTokens: []string{"<reason>"},
-			TagPairs:            []reasoning.TagPair{{Start: "<reason>", End: "</reason>"}},
-		})
-		Expect(out.ThinkingStartTokens).To(Equal([]string{"<reason>"}))
-		Expect(out.TagPairs).To(HaveLen(1))
-		Expect(out.TagPairs[0].Start).To(Equal("<reason>"))
-	})
-})
--- a/core/http/endpoints/openai/realtime_transcription.go
+++ b/core/http/endpoints/openai/realtime_transcription.go
@@ -1,63 +0,0 @@
-package openai
-
-import (
-	"context"
-	"fmt"
-
-	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
-)
-
-// emitTranscription transcribes a committed utterance and emits the transcription
-// events for it, returning the final transcript text. With
-// pipeline.streaming.transcription enabled it streams each transcript fragment as
-// a conversation.item.input_audio_transcription.delta as the backend produces it,
-// then a completed event; otherwise it transcribes the whole utterance and emits
-// a single completed event. delta and completed events share itemID.
-func emitTranscription(ctx context.Context, t Transport, session *Session, itemID, audioPath string) (string, error) {
-	cfg := session.InputAudioTranscription
-
-	if session.ModelConfig != nil && session.ModelConfig.Pipeline.StreamTranscription() {
-		final, err := session.ModelInterface.TranscribeStream(ctx, audioPath, cfg.Language, false, false, cfg.Prompt, func(delta string) {
-			_ = t.SendEvent(types.ConversationItemInputAudioTranscriptionDeltaEvent{
-				ServerEventBase: types.ServerEventBase{EventID: "event_TODO"},
-				ItemID:          itemID,
-				ContentIndex:    0,
-				Delta:           delta,
-			})
-		})
-		if err != nil {
-			return "", err
-		}
-		transcript := ""
-		if final != nil {
-			transcript = final.Text
-		}
-		if err := t.SendEvent(types.ConversationItemInputAudioTranscriptionCompletedEvent{
-			ServerEventBase: types.ServerEventBase{EventID: "event_TODO"},
-			ItemID:          itemID,
-			ContentIndex:    0,
-			Transcript:      transcript,
-		}); err != nil {
-			return "", err
-		}
-		return transcript, nil
-	}
-
-	// Unary fallback: transcribe the whole utterance, emit one completed event.
-	tr, err := session.ModelInterface.Transcribe(ctx, audioPath, cfg.Language, false, false, cfg.Prompt)
-	if err != nil {
-		return "", err
-	}
-	if tr == nil {
-		return "", fmt.Errorf("transcribe result is nil")
-	}
-	if err := t.SendEvent(types.ConversationItemInputAudioTranscriptionCompletedEvent{
-		ServerEventBase: types.ServerEventBase{EventID: "event_TODO"},
-		ItemID:          itemID,
-		ContentIndex:    0,
-		Transcript:      tr.Text,
-	}); err != nil {
-		return "", err
-	}
-	return tr.Text, nil
-}
--- a/core/http/endpoints/openai/realtime_transcription_test.go
+++ b/core/http/endpoints/openai/realtime_transcription_test.go
@@ -1,54 +0,0 @@
-package openai
-
-import (
-	"context"
-
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-
-	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
-	"github.com/mudler/LocalAI/core/schema"
-)
-
-// emitTranscription transcribes a committed utterance, streaming transcript text
-// deltas when the pipeline opts in, and returns the final transcript text.
-var _ = Describe("emitTranscription", func() {
-	It("streams transcription deltas then a completed event when streaming is enabled", func() {
-		on := true
-		session := &Session{
-			InputAudioTranscription: &types.AudioTranscription{},
-			ModelConfig: &config.ModelConfig{
-				Pipeline: config.Pipeline{Streaming: config.PipelineStreaming{Transcription: &on}},
-			},
-			ModelInterface: &fakeModel{
-				transcribeDeltas: []string{"Hel", "lo", " world"},
-				transcribeFinal:  &schema.TranscriptionResult{Text: "Hello world"},
-			},
-		}
-		t := &fakeTransport{}
-
-		transcript, err := emitTranscription(context.Background(), t, session, "item1", "/tmp/x.wav")
-
-		Expect(err).ToNot(HaveOccurred())
-		Expect(transcript).To(Equal("Hello world"))
-		Expect(t.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionDelta)).To(Equal(3))
-		Expect(t.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionCompleted)).To(Equal(1))
-	})
-
-	It("emits a single completed event with no deltas in unary mode", func() {
-		session := &Session{
-			InputAudioTranscription: &types.AudioTranscription{},
-			ModelConfig:             &config.ModelConfig{}, // streaming off
-			ModelInterface:          &fakeModel{transcribeFinal: &schema.TranscriptionResult{Text: "Hi"}},
-		}
-		t := &fakeTransport{}
-
-		transcript, err := emitTranscription(context.Background(), t, session, "item1", "/tmp/x.wav")
-
-		Expect(err).ToNot(HaveOccurred())
-		Expect(transcript).To(Equal("Hi"))
-		Expect(t.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionDelta)).To(Equal(0))
-		Expect(t.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionCompleted)).To(Equal(1))
-	})
-})
--- a/core/http/middleware/probe_trim_test.go
+++ b/core/http/middleware/probe_trim_test.go
@@ -1,139 +0,0 @@
-package middleware
-
-import (
-	"strings"
-
-	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/core/schema"
-
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-var _ = Describe("routerConfigFingerprint", func() {
-	rc := config.RouterConfig{Classifier: "score", ClassifierModel: "arch-router"}
-	ctx4096 := 4096
-	ctx8192 := 8192
-
-	// Regression: the score classifier bakes context_size into its token
-	// budget at build time, and the built classifier is cached by this
-	// fingerprint. If context_size weren't hashed, editing it and reloading
-	// would return a classifier carrying the stale budget.
-	It("changes when the classifier model's context_size changes", func() {
-		cfgA := &config.ModelConfig{LLMConfig: config.LLMConfig{ContextSize: &ctx4096}}
-		cfgB := &config.ModelConfig{LLMConfig: config.LLMConfig{ContextSize: &ctx8192}}
-		Expect(routerConfigFingerprint(rc, cfgA)).NotTo(Equal(routerConfigFingerprint(rc, cfgB)))
-	})
-
-	It("is stable for identical classifier configs", func() {
-		cfgA := &config.ModelConfig{LLMConfig: config.LLMConfig{ContextSize: &ctx4096}}
-		cfgB := &config.ModelConfig{LLMConfig: config.LLMConfig{ContextSize: &ctx4096}}
-		Expect(routerConfigFingerprint(rc, cfgA)).To(Equal(routerConfigFingerprint(rc, cfgB)))
-	})
-})
-
-var _ = Describe("routing probe extraction and trimming", func() {
-	Describe("OpenAIProbeFromRequest", func() {
-		It("keeps a short conversation intact, newline-terminated per message", func() {
-			req := &schema.OpenAIRequest{Messages: []schema.Message{
-				{Role: "user", Content: "first"},
-				{Role: "assistant", Content: "second"},
-				{Role: "user", Content: "third"},
-			}}
-			Expect(OpenAIProbeFromRequest(req).Prompt).To(Equal("first\nsecond\nthird\n"))
-		})
-
-		It("flattens text blocks and skips image-only messages", func() {
-			req := &schema.OpenAIRequest{Messages: []schema.Message{
-				{Role: "user", Content: []any{
-					map[string]any{"type": "text", "text": "describe this"},
-					map[string]any{"type": "image_url", "image_url": map[string]any{"url": "data:..."}},
-				}},
-				{Role: "user", Content: []any{
-					map[string]any{"type": "image_url", "image_url": map[string]any{"url": "data:..."}},
-				}},
-			}}
-			// Second message contributes no text, so it neither adds a blank
-			// line nor a stray newline.
-			Expect(OpenAIProbeFromRequest(req).Prompt).To(Equal("describe this\n"))
-		})
-
-		It("carries the full conversation untrimmed — trimming is each classifier's job", func() {
-			// The middleware no longer caps the probe by a fixed rune budget;
-			// every turn reaches the Probe and each classifier trims to its own
-			// model's context (see modelTokenTrim / promptTrimmer).
-			block := strings.Repeat("x", 999)
-			msgs := make([]schema.Message, 0, 20)
-			msgs = append(msgs, schema.Message{Role: "user", Content: "OLDEST" + strings.Repeat("o", 994)})
-			for range 18 {
-				msgs = append(msgs, schema.Message{Role: "user", Content: block})
-			}
-			msgs = append(msgs, schema.Message{Role: "user", Content: "NEWEST" + strings.Repeat("n", 994)})
-
-			probe := OpenAIProbeFromRequest(&schema.OpenAIRequest{Messages: msgs})
-			Expect(probe.Prompt).To(ContainSubstring("OLDEST"), "no turn is dropped at probe-build time")
-			Expect(probe.Prompt).To(ContainSubstring("NEWEST"))
-			// Messages preserves the per-turn split the classifier trims from.
-			Expect(probe.Messages).To(HaveLen(20))
-			Expect(probe.Messages[0]).To(ContainSubstring("OLDEST"))
-			Expect(probe.Messages[19]).To(ContainSubstring("NEWEST"))
-		})
-	})
-
-	Describe("AnthropicProbe", func() {
-		It("extracts and trims the same way as the OpenAI path", func() {
-			req := &schema.AnthropicRequest{Messages: []schema.AnthropicMessage{
-				{Role: "user", Content: "alpha"},
-				{Role: "assistant", Content: []any{
-					map[string]any{"type": "text", "text": "beta"},
-				}},
-			}}
-			probe, ok := AnthropicProbe(req)
-			Expect(ok).To(BeTrue())
-			Expect(probe.Prompt).To(Equal("alpha\nbeta\n"))
-		})
-
-		It("returns ok=false for a non-Anthropic payload", func() {
-			_, ok := AnthropicProbe(&schema.OpenAIRequest{})
-			Expect(ok).To(BeFalse())
-		})
-	})
-
-	Describe("modelTokenTrim", func() {
-		tok := func(string) (int, error) { return 1, nil }
-		depsFor := func(cfg *config.ModelConfig) ClassifierDeps {
-			return ClassifierDeps{
-				ModelLookup:  func(string) *config.ModelConfig { return cfg },
-				TokenCounter: func(string) func(string) (int, error) { return tok },
-			}
-		}
-
-		It("still trims to the backend default when context_size is unset", func() {
-			// Regression: with the fixed middleware rune cap gone, an unset
-			// context_size must NOT disable trimming — otherwise a non-trivial
-			// prompt overflows the default 4096 window and every score fails.
-			score := config.FLAG_SCORE
-			cfg := &config.ModelConfig{KnownUsecases: &score} // FLAG_SCORE → batch follows context
-			count, ceiling := modelTokenTrim("classifier", depsFor(cfg))
-			Expect(count).NotTo(BeNil())
-			Expect(ceiling).To(Equal(4096), "unset context_size falls back to the backend default, not 0")
-		})
-
-		It("is bounded by the batch when the batch is smaller than the context", func() {
-			// The probe is one decode (n_tokens <= n_batch). A model with a
-			// large context but a small batch can only process the batch — the
-			// ceiling must follow it, not the context.
-			ctx8k := 8192
-			cfg := &config.ModelConfig{LLMConfig: config.LLMConfig{ContextSize: &ctx8k}}
-			cfg.Batch = 512
-			_, ceiling := modelTokenTrim("embedder", depsFor(cfg))
-			Expect(ceiling).To(Equal(512), "batch is the binding single-decode limit")
-		})
-
-		It("disables trimming only when no tokenizer is available", func() {
-			count, ceiling := modelTokenTrim("x", ClassifierDeps{ModelLookup: func(string) *config.ModelConfig { return &config.ModelConfig{} }})
-			Expect(count).To(BeNil())
-			Expect(ceiling).To(Equal(0))
-		})
-	})
-})
--- a/core/http/middleware/route_model.go
+++ b/core/http/middleware/route_model.go
@@ -6,7 +6,6 @@ import (
 	"encoding/hex"
 	"fmt"
 	"hash/fnv"
-	"strconv"
 	"strings"
 	"time"

@@ -87,12 +86,6 @@ type ClassifierDeps struct {
 	// templates.Evaluator so any model the operator points at gets
 	// its own chat template applied.
 	Evaluator *templates.Evaluator
-
-	// TokenCounter binds the classifier model's tokenizer for the score
-	// classifier's token-trim path. Optional; nil falls back to the
-	// backend's n_ctx guard. Plain func type so core/application supplies
-	// it as a method value without importing this package.
-	TokenCounter func(modelName string) func(text string) (int, error)
 }

 // ProbeExtractor pulls the prompt content out of a parsed request so
@@ -219,6 +212,7 @@ func recordHTTPDecision(c echo.Context, store router.DecisionStore, result *rout
 	_ = store.Record(context.Background(), result.ToDecisionRecord(newDecisionID(), correlationID, userID, source))
 }

+
 // GetOrBuildClassifier looks up a built Classifier for the named router
 // model in the registry and builds it on miss. Exported so the
 // /api/router/decide decision-oracle endpoint can share the same
@@ -268,10 +262,9 @@ func routerConfigFingerprint(rc config.RouterConfig, classifierCfg *config.Model
 	h := fnv.New64a()
 	h.Write(bytes)
 	if classifierCfg != nil {
-		// Narrow projection: only the fields buildClassifier reads (renderer,
-		// stop tokens, context_size → MaxContextTokens). Hashing the whole
-		// ModelConfig would invalidate the cache on irrelevant changes;
-		// omitting context_size would let a reload leave a stale token budget.
+		// Narrow projection: only the fields newTemplateRenderer and
+		// firstStopWord actually read. Hashing the whole ModelConfig
+		// would invalidate the cache on irrelevant parameter changes.
 		h.Write([]byte{0}) // separator so empty fields don't collide
 		h.Write([]byte(classifierCfg.TemplateConfig.Chat))
 		h.Write([]byte{0})
@@ -281,10 +274,6 @@ func routerConfigFingerprint(rc config.RouterConfig, classifierCfg *config.Model
 			h.Write([]byte(sw))
 			h.Write([]byte{0})
 		}
-		h.Write([]byte{0})
-		if classifierCfg.ContextSize != nil {
-			h.Write([]byte(strconv.Itoa(*classifierCfg.ContextSize)))
-		}
 	}
 	return h.Sum64()
 }
@@ -330,30 +319,11 @@ func buildClassifier(cfg *config.ModelConfig, deps ClassifierDeps) (router.Class
 		if deps.ModelLookup != nil {
 			if classifierCfg := deps.ModelLookup(rc.ClassifierModel); classifierCfg != nil {
 				if deps.Evaluator != nil {
-					// The router renders the scoring prompt client-side, so the
-					// classifier model MUST carry a chat template — refusing
-					// here beats silently falling back to a generic ChatML
-					// envelope the model may not have been trained on.
-					renderer := newTemplateRenderer(deps.Evaluator, classifierCfg)
-					if renderer == nil {
-						return nil, fmt.Errorf(
-							"router classifier score: classifier_model %q has no chat template "+
-								"(set template.chat and template.chat_message in its config). The router "+
-								"renders the scoring prompt with the classifier model's own template; "+
-								"without it the prompt format would not match the model",
-							rc.ClassifierModel)
-					}
-					opts.PromptRenderer = renderer
+					opts.PromptRenderer = newTemplateRenderer(deps.Evaluator, classifierCfg)
 				}
 				if st := pickAssistantTurnEnd(classifierCfg.StopWords, classifierCfg.TemplateConfig.ChatMessage); st != "" {
 					opts.StopToken = st
 				}
-				// Token-exact conversation trim — score classifier drops the
-				// oldest turns using the model's own tokenizer.
-				if count, ctxTokens := modelTokenTrim(rc.ClassifierModel, deps); count != nil {
-					opts.TokenCounter = count
-					opts.MaxContextTokens = ctxTokens
-				}
 			}
 		}
 		inner = router.NewScoreClassifier(policies, scorer, opts)
@@ -365,11 +335,7 @@ func buildClassifier(cfg *config.ModelConfig, deps ClassifierDeps) (router.Class
 		if reranker == nil {
 			return nil, fmt.Errorf("router classifier colbert: classifier_model %q not loadable", rc.ClassifierModel)
 		}
-		rerankClassifier := router.NewRerankClassifier(policies, reranker, cacheCap, rc.ActivationThreshold)
-		if count, ctxTokens := modelTokenTrim(rc.ClassifierModel, deps); count != nil {
-			rerankClassifier = rerankClassifier.WithTokenTrim(count, ctxTokens)
-		}
-		inner = rerankClassifier
+		inner = router.NewRerankClassifier(policies, reranker, cacheCap, rc.ActivationThreshold)
 	default:
 		return nil, fmt.Errorf("router: unknown classifier %q (supported: %s)", name, strings.Join([]string{router.ClassifierScore, router.ClassifierColbert}, ", "))
 	}
@@ -557,41 +523,7 @@ func wrapWithEmbeddingCache(cfg *config.ModelConfig, inner router.Classifier, de
 	if vstore == nil {
 		return nil, fmt.Errorf("vector store %q not loadable", storeName)
 	}
-	cache := router.NewEmbeddingCacheClassifier(inner, embedder, vstore, ec.SimilarityThreshold, ec.ConfidenceThreshold)
-	// Trim the probe to the embedder model's own context (e.g. nomic-embed at
-	// 8k) rather than a fixed guess — otherwise the cache key is an embedding
-	// of a silently-truncated conversation.
-	if count, ctxTokens := modelTokenTrim(ec.EmbeddingModel, deps); count != nil {
-		cache = cache.WithTokenTrim(count, ctxTokens)
-	}
-	return cache, nil
-}
-
-// modelTokenTrim returns a model's own tokenizer and the token ceiling its
-// probe must fit, or (nil, 0) when no tokenizer is available (only then can we
-// not trim exactly). The ceiling is min(effective context, effective batch):
-// score/embed/rerank all decode the whole prompt in one pass, so it must fit
-// both the context window and a single batch. Using the backend's *effective*
-// values — not the raw config fields — means trimming still works when
-// context_size and batch are unset; otherwise a non-trivial prompt overflows
-// the default window and every classification fails.
-func modelTokenTrim(modelName string, deps ClassifierDeps) (func(string) (int, error), int) {
-	if deps.TokenCounter == nil || deps.ModelLookup == nil {
-		return nil, 0
-	}
-	cfg := deps.ModelLookup(modelName)
-	if cfg == nil {
-		return nil, 0
-	}
-	count := deps.TokenCounter(modelName)
-	if count == nil {
-		return nil, 0
-	}
-	ceiling := backend.EffectiveContextSize(*cfg)
-	if b := backend.EffectiveBatchSize(*cfg); b < ceiling {
-		ceiling = b
-	}
-	return count, ceiling
+	return router.NewEmbeddingCacheClassifier(inner, embedder, vstore, ec.SimilarityThreshold, ec.ConfidenceThreshold), nil
 }

 func newDecisionID() string {
@@ -613,41 +545,6 @@ func OpenAIProbe(parsed any) (router.Probe, bool) {
 	return OpenAIProbeFromRequest(req), true
 }

-// messageText flattens a chat message's Content to plain text: string content
-// verbatim; []any structured content contributes only its "text" blocks.
-func messageText(content any) string {
-	switch ct := content.(type) {
-	case string:
-		return ct
-	case []any:
-		var b strings.Builder
-		for _, block := range ct {
-			if bm, ok := block.(map[string]any); ok && bm["type"] == "text" {
-				if t, ok := bm["text"].(string); ok {
-					if b.Len() > 0 {
-						b.WriteByte('\n')
-					}
-					b.WriteString(t)
-				}
-			}
-		}
-		return b.String()
-	}
-	return ""
-}
-
-// messageProbeParts drops empty (e.g. image-only) messages so they don't
-// consume budget or emit blank lines.
-func messageProbeParts(texts []string) []string {
-	parts := make([]string, 0, len(texts))
-	for _, t := range texts {
-		if t != "" {
-			parts = append(parts, t)
-		}
-	}
-	return parts
-}
-
 // OpenAIProbeFromRequest is the typed counterpart of OpenAIProbe — same
 // extraction logic, but takes the request struct directly. Realtime and
 // other non-HTTP callers use it to feed a probe to router.Resolve
@@ -656,15 +553,24 @@ func OpenAIProbeFromRequest(req *schema.OpenAIRequest) router.Probe {
 	if req == nil {
 		return router.Probe{}
 	}
-	texts := make([]string, len(req.Messages))
+	var b strings.Builder
 	for i := range req.Messages {
-		texts[i] = messageText(req.Messages[i].Content)
+		switch ct := req.Messages[i].Content.(type) {
+		case string:
+			b.WriteString(ct)
+			b.WriteByte('\n')
+		case []any:
+			for _, block := range ct {
+				if bm, ok := block.(map[string]any); ok && bm["type"] == "text" {
+					if t, ok := bm["text"].(string); ok {
+						b.WriteString(t)
+						b.WriteByte('\n')
+					}
+				}
+			}
+		}
 	}
-	parts := messageProbeParts(texts)
-	// Prompt carries the full conversation; each classifier trims it to its own
-	// model's context (see modelTokenTrim). Messages preserves the per-turn
-	// split the trimmer drops oldest-first.
-	return router.Probe{Prompt: router.JoinTurns(parts), Messages: parts}
+	return router.Probe{Prompt: b.String()}
 }

 // AnthropicProbe is the AnthropicRequest analogue of OpenAIProbe.
@@ -673,10 +579,25 @@ func AnthropicProbe(parsed any) (router.Probe, bool) {
 	if !ok || req == nil {
 		return router.Probe{}, false
 	}
-	texts := make([]string, len(req.Messages))
+	var b strings.Builder
 	for i := range req.Messages {
-		texts[i] = messageText(req.Messages[i].Content)
+		switch ct := req.Messages[i].Content.(type) {
+		case string:
+			b.WriteString(ct)
+			b.WriteByte('\n')
+		case []any:
+			for _, block := range ct {
+				if bm, ok := block.(map[string]any); ok && bm["type"] == "text" {
+					if t, ok := bm["text"].(string); ok {
+						b.WriteString(t)
+						b.WriteByte('\n')
+					}
+				}
+			}
+		}
 	}
-	parts := messageProbeParts(texts)
-	return router.Probe{Prompt: router.JoinTurns(parts), Messages: parts}, true
+	return router.Probe{
+		Prompt: b.String(),
+	}, true
 }
+
--- a/core/http/middleware/route_model_test.go
+++ b/core/http/middleware/route_model_test.go
@@ -246,12 +246,11 @@ var _ = Describe("RouteModel rendered classifier prompt", func() {
 			"rendered prompt must end at assistant-open marker. got: %q", s.lastPrompt)
 	})

-	It("refuses to build the router when the classifier model has no chat_message template", func() {
-		// Partial template config: only the outer Chat, no per-role piece.
-		// The router renders the scoring prompt client-side from the
-		// classifier model's own template, so a missing template is a hard
-		// error rather than a silent fall back to a generic ChatML envelope
-		// the model may not have been trained on.
+	It("falls back to chatMLRenderer when the classifier model has no chat_message template", func() {
+		// Partial template config: only outer Chat, no per-role
+		// piece. The renderer must refuse rather than emit a prompt
+		// that drops the system turn, so the score classifier's
+		// built-in ChatML default takes over.
 		writePartialClassifierModel(modelDir, "arch-router")
 		routerCfg := newScoreRouterModel(modelDir, "smart-router")

@@ -267,9 +266,19 @@ var _ = Describe("RouteModel rendered classifier prompt", func() {
 				ModelLookup: loaderLookup(loader, appConfig),
 				Evaluator:   eval,
 			})
-		Expect(err).To(HaveOccurred())
-		Expect(err.Error()).To(ContainSubstring("no chat template"),
-			"missing classifier template must surface as a clear config error. got: %v", err)
+		Expect(err).NotTo(HaveOccurred())
+
+		// chatMLRenderer fallback emits its own envelope and still
+		// embeds the routing system prompt. OpenAIProbeFromRequest
+		// appends "\n" after each message body, so the user content
+		// reaches the renderer as "hello world\n" — the substring
+		// match accounts for that.
+		Expect(s.lastPrompt).To(ContainSubstring("<routes>"),
+			"fallback renderer also dropped the system prompt")
+		Expect(s.lastPrompt).To(ContainSubstring("<|im_start|>system\n"))
+		Expect(s.lastPrompt).To(ContainSubstring("<|im_start|>user\nhello world\n<|im_end|>"))
+		Expect(strings.HasSuffix(s.lastPrompt, "<|im_start|>assistant\n")).To(BeTrue(),
+			"chatMLRenderer fallback must end at assistant-open marker. got: %q", s.lastPrompt)
 	})

 	It("uses the classifier model's first stopword as the candidate suffix", func() {
@@ -524,8 +533,8 @@ template:

 // writePartialClassifierModel writes a classifier model that has the
 // outer Chat template but no ChatMessage — exercises the
-// newTemplateRenderer "refuse partial templating" branch, which makes
-// buildClassifier reject the router with a missing-template error.
+// newTemplateRenderer "refuse partial templating" branch that hands
+// off to chatMLRenderer.
 func writePartialClassifierModel(modelDir, name string) {
 	body := `name: ` + name + `
 backend: llama-cpp
--- a/core/http/react-ui/e2e/model-config.spec.js
+++ b/core/http/react-ui/e2e/model-config.spec.js
@@ -224,38 +224,4 @@ test.describe('Model Editor - Interactive Tab', () => {
    expect(estimateCalled).toBe(true)
  })

-  test('interactive tab scrolls at body height (no inner overflow pane) and tracks the active section', async ({ page }) => {
-    // Regression: the form sections used to live inside an overflow:auto pane
-    // with maxHeight: calc(100vh - 340px), which kept the global footer in
-    // view on every screen and ate ~50px of editing room on short windows.
-    // Pin two pieces of the fix:
-    //  1. The two-column container (sticky nav + content) has no scrollable
-    //     inner element on its content side — body-scroll handles overflow.
-    //  2. The active-section tracker now listens to window scroll. Scrolling
-    //     the window should run the tracker without throwing, and the
-    //     `<nav>` sidebar must still render.
-    const contentOverflowY = await page.evaluate(() => {
-      const sidebar = document.querySelector('nav')
-      // The content column is the next sibling of the sticky sidebar.
-      const content = sidebar?.nextElementSibling
-      return content ? getComputedStyle(content).overflowY : 'no-content'
-    })
-    expect(['visible', 'normal', 'auto', 'scroll', 'no-content']).toContain(contentOverflowY)
-    expect(contentOverflowY).not.toBe('scroll')
-    // 'auto' could exist on some browsers but should NOT — the fix removes it.
-    // We assert the strong invariant separately.
-    expect(['auto']).not.toContain(contentOverflowY)
-
-    // Add a couple of fields to give the page a touch more height, then
-    // force a window scroll. The tracker should run; the sidebar should
-    // remain visible.
-    const searchInput = page.locator('input[placeholder="Search fields to add..."]')
-    await searchInput.fill('Temperature')
-    const dropdown = searchInput.locator('..').locator('..')
-    await dropdown.locator('div', { hasText: 'Temperature' }).first().click()
-    await page.evaluate(() => window.scrollTo(0, 200))
-    await page.waitForTimeout(50)
-    await expect(page.locator('nav').first()).toBeVisible()
-  })
-
 })
--- a/core/http/react-ui/e2e/model-editor-back-nav.spec.js
+++ b/core/http/react-ui/e2e/model-editor-back-nav.spec.js
@@ -1,94 +0,0 @@
-import { test, expect } from './coverage-fixtures.js'
-
-// Exercises the "Back to <page>" navigation convention: whichever page links
-// into the Model Editor stamps its origin as react-router location state, and
-// the editor's Back button returns there (captioned with the origin) instead
-// of a hardcoded route. Also covers the Middleware page's ?tab= persistence,
-// which is what lets the editor return you to the exact tab you came from.
-
-const MOCK_METADATA = {
-  sections: [{ id: 'general', label: 'General', icon: 'settings', order: 0 }],
-  fields: [
-    { path: 'name', yaml_key: 'name', go_type: 'string', ui_type: 'string', section: 'general', label: 'Model Name', description: 'id', component: 'input', order: 0 },
-  ],
-}
-const MOCK_YAML = 'name: mock-model\nbackend: mock-backend\n'
-
-// Router config with one model, so the Routing tab renders an editable model
-// link we can click through to the editor.
-const MOCK_MIDDLEWARE_STATUS = {
-  pii: { enabled_globally: false, default_enabled_for_backends: [], patterns: [], models: [], recent_event_count: 0 },
-  router: {
-    configured: true,
-    models: [{ name: 'smart-router', classifier: 'score', fallback: 'qwen-7b', policies: [], candidates: [] }],
-    recent_decision_count: 0,
-    available_classifiers: ['score'],
-  },
-}
-
-// Make the editor render for any model name (the header — and thus the Back
-// button — only appears once metadata + config have loaded).
-async function mockEditorEndpoints(page) {
-  await page.route('**/api/models/config-metadata*', (route) =>
-    route.fulfill({ contentType: 'application/json', body: JSON.stringify(MOCK_METADATA) }))
-  await page.route('**/api/models/edit/**', (route) =>
-    route.fulfill({ contentType: 'application/json', body: JSON.stringify({ config: MOCK_YAML, name: 'mock-model' }) }))
-  await page.route('**/api/models/config-json/**', (route) =>
-    route.fulfill({ contentType: 'application/json', body: '{}' }))
-}
-
-test.describe('Model Editor — Back navigation', () => {
-  test.beforeEach(async ({ page }) => {
-    await page.route('**/api/auth/status', (route) =>
-      route.fulfill({ contentType: 'application/json', body: JSON.stringify({ authEnabled: false, staticApiKeyRequired: false, providers: [] }) }))
-    await mockEditorEndpoints(page)
-  })
-
-  test('Back returns to Manage with a "Back to Manage" caption', async ({ page }) => {
-    await page.goto('/app/manage')
-    await expect(page.locator('.table')).toBeVisible({ timeout: 10_000 })
-
-    // Open the first row's action menu and pick "Edit configuration".
-    const trigger = page.locator('button.action-menu__trigger').first()
-    await expect(trigger).toBeVisible()
-    await trigger.click()
-    await page.getByRole('menuitem', { name: 'Edit configuration' }).click()
-
-    await expect(page).toHaveURL(/\/app\/model-editor\//)
-    const back = page.getByRole('button', { name: /Back to Manage/ })
-    await expect(back).toBeVisible({ timeout: 10_000 })
-
-    await back.click()
-    await expect(page).toHaveURL(/\/app\/manage/)
-  })
-
-  test('returns to the originating Middleware tab (?tab=routing) it was opened from', async ({ page }) => {
-    await page.route('**/api/middleware/status', (route) =>
-      route.fulfill({ contentType: 'application/json', body: JSON.stringify(MOCK_MIDDLEWARE_STATUS) }))
-    await page.route('**/api/pii/events?**', (route) =>
-      route.fulfill({ contentType: 'application/json', body: JSON.stringify({ events: [] }) }))
-    await page.route('**/api/router/decisions?**', (route) =>
-      route.fulfill({ contentType: 'application/json', body: JSON.stringify({ decisions: [] }) }))
-
-    await page.goto('/app/middleware')
-    // Switching to Routing must push the tab into the URL.
-    await page.getByRole('button', { name: /Routing/i }).click()
-    await expect(page).toHaveURL(/[?&]tab=routing/)
-
-    // Click through to the router model's config, then back.
-    await page.getByRole('link', { name: 'smart-router' }).click()
-    await expect(page).toHaveURL(/\/app\/model-editor\/smart-router/)
-    const back = page.getByRole('button', { name: /Back to Middleware/ })
-    await expect(back).toBeVisible({ timeout: 10_000 })
-
-    await back.click()
-    // Returns to the exact tab, not the default Filtering tab.
-    await expect(page).toHaveURL(/\/app\/middleware\?tab=routing/)
-    await expect(page.getByText('smart-router').first()).toBeVisible()
-  })
-
-  test('falls back to "Back to Manage" on a direct visit with no origin state', async ({ page }) => {
-    await page.goto('/app/model-editor/mock-model')
-    await expect(page.getByRole('button', { name: /Back to Manage/ })).toBeVisible({ timeout: 10_000 })
-  })
-})
--- a/core/http/react-ui/e2e/traces-errors.spec.js
+++ b/core/http/react-ui/e2e/traces-errors.spec.js
@@ -48,77 +48,3 @@ test.describe('Traces - Error Display', () => {
    await expect(page.locator('th', { hasText: 'Type' })).toBeVisible()
  })
 })
-
-// Pin the BackendTraceDetail expansion path for a vector_store trace —
-// the type that surfaces the router's embedding-cache plumbing. The
-// row click triggers the detail render, which exercises typeBadgeStyle
-// (with the new vector_store badge color), the DataFields component
-// (op / outcome / vector_dim / similarity), and the "View backend
-// logs" link that resolves to the store namespace. Without this spec
-// the new color entry plus the data-field render branches stay
-// uncovered, dragging UI line coverage below the regression gate.
-test.describe('Traces - vector_store backend trace detail', () => {
-  test.beforeEach(async ({ page }) => {
-    await page.route('**/api/traces', (route) => {
-      route.fulfill({ contentType: 'application/json', body: '[]' })
-    })
-    await page.route('**/api/backend-traces', (route) => {
-      route.fulfill({
-        contentType: 'application/json',
-        body: JSON.stringify([
-          {
-            type: 'vector_store',
-            timestamp: '2026-05-28T13:56:25.558Z',
-            model_name: 'router-cache-smart-router',
-            backend: 'local-store',
-            summary: 'search hit (sim=0.989)',
-            duration: 160_000_000,
-            error: '',
-            data: {
-              op: 'search',
-              outcome: 'hit',
-              vector_dim: 768,
-              similarity: 0.9899752140045166,
-            },
-          },
-          {
-            type: 'vector_store',
-            timestamp: '2026-05-28T13:49:07.545Z',
-            model_name: 'router-cache-smart-router',
-            backend: 'local-store',
-            summary: 'search miss',
-            duration: 100_000_000,
-            error: '',
-            data: {
-              op: 'search',
-              outcome: 'miss',
-              vector_dim: 768,
-            },
-          },
-        ]),
-      })
-    })
-    await page.goto('/app/traces')
-    await expect(page.locator('text=Tracing is')).toBeVisible({ timeout: 10_000 })
-    await page.locator('button', { hasText: 'Backend Traces' }).click()
-  })
-
-  test('renders type badge and expands data fields on row click', async ({ page }) => {
-    // The vector_store badge appears in the type column.
-    await expect(page.locator('td span', { hasText: 'vector_store' }).first()).toBeVisible()
-
-    // Clicking the first row expands BackendTraceDetail, which renders
-    // the four data fields. Use the first row's "search hit" summary
-    // as the anchor to disambiguate from the miss row below.
-    await page.locator('tr', { hasText: 'search hit' }).first().click()
-
-    // DataFields renders op/outcome/vector_dim/similarity as label/value pairs.
-    // 'hit' appears as the rendered outcome value.
-    await expect(page.locator('text=outcome').first()).toBeVisible()
-    await expect(page.locator('text=hit').first()).toBeVisible()
-
-    // The model_name → /app/backend-logs link is the BackendTraceDetail
-    // affordance for jumping to logs for the store namespace.
-    await expect(page.locator('a', { hasText: 'View backend logs' })).toBeVisible()
-  })
-})
--- a/core/http/react-ui/i18next-parser.config.js
+++ b/core/http/react-ui/i18next-parser.config.js
@@ -1,5 +1,5 @@
 export default {
-  locales: ['en', 'it', 'es', 'de', 'zh-CN', 'id'],
+  locales: ['en', 'it', 'es', 'de', 'zh-CN'],
  defaultNamespace: 'common',
  output: 'public/locales/$LOCALE/$NAMESPACE.json',
  input: ['src/**/*.{js,jsx}'],
--- a/core/http/react-ui/playwright.config.js
+++ b/core/http/react-ui/playwright.config.js
@@ -4,12 +4,6 @@ export default defineConfig({
  testDir: './e2e',
  timeout: 30_000,
  retries: process.env.CI ? 2 : 0,
-  // TEMPORARY: cap parallelism. Playwright's default (cores/2) oversubscribes
-  // high-core dev machines and intermittently starves the page-teardown
-  // coverage harvest past the 30s test timeout (flaky "Tearing down page"
-  // failures, different specs each run). Capped at 8 pending a proper
-  // root-cause fix; override with PW_WORKERS.
-  workers: process.env.PW_WORKERS ? Number(process.env.PW_WORKERS) : 8,
  reporter: process.env.CI ? 'html' : 'list',
  use: {
    baseURL: 'http://127.0.0.1:8089',
--- a/core/http/react-ui/public/locales/id/admin.json
+++ b/core/http/react-ui/public/locales/id/admin.json
@@ -1,85 +0,0 @@
-{
-  "manage": {
-    "title": "Sistem",
-    "subtitle": "Kelola model dan backend yang terinstal"
-  },
-  "settings": {
-    "title": "Pengaturan",
-    "subtitle": "Konfigurasi pengaturan runtime LocalAI",
-    "saved": "Pengaturan berhasil disimpan",
-    "saveFailed": "Gagal menyimpan: {{message}}",
-    "loadFailed": "Gagal memuat pengaturan: {{message}}",
-    "sections": {
-      "branding": "Branding",
-      "watchdog": "Watchdog",
-      "memory": "Memori",
-      "backends": "Backend",
-      "performance": "Performa",
-      "tracing": "Tracing",
-      "api": "API & CORS",
-      "p2p": "P2P",
-      "galleries": "Galeri",
-      "apikeys": "API Key",
-      "agents": "Agent Job",
-      "agentpool": "Agent Pool",
-      "assistant": "Asisten LocalAI",
-      "responses": "Respons"
-    }
-  },
-  "backends": {
-    "title": "Manajemen Backend",
-    "subtitle": "Temukan dan instal backend AI untuk mendukung model Anda"
-  },
-  "backendLogs": {
-    "title": "Log Backend",
-    "subtitle": "Lihat log dari backend yang sedang berjalan",
-    "empty": "Tidak ada log yang tersedia"
-  },
-  "traces": {
-    "title": "Trace",
-    "subtitle": "Lihat log permintaan API, respons, dan operasi backend"
-  },
-  "nodes": {
-    "title": "Node Terdistribusi",
-    "subtitle": "Kelola node backend dan node worker"
-  },
-  "p2p": {
-    "title": "Komputasi AI Terdistribusi",
-    "subtitle": "Skalakan beban kerja AI Anda ke beberapa perangkat dengan distribusi peer-to-peer"
-  },
-  "users": {
-    "title": "Pengguna",
-    "subtitle": "Kelola pengguna terdaftar, peran, dan undangan"
-  },
-  "usage": {
-    "title": "Penggunaan",
-    "subtitle": "Statistik penggunaan token API",
-    "sources": {
-      "tab": "Sumber",
-      "mixTitle": "Campuran sumber",
-      "ribbonAria": "{{apikey}}% API Key, {{web}}% Web UI, {{legacy}}% Legasi",
-      "topSources": "Sumber teratas dari waktu ke waktu",
-      "searchPlaceholder": "Cari berdasarkan nama atau awalan",
-      "sortBy": "Urutkan",
-      "sortTokens": "Token",
-      "sortRequests": "Permintaan",
-      "sortLastUsed": "Terakhir digunakan",
-      "sortName": "Nama",
-      "sortUser": "Pengguna",
-      "webUI": "Web UI",
-      "legacy": "Legasi",
-      "revoked": " dicabut",
-      "filteredTo": "Difilter ke: {{name}}",
-      "clearFilter": "Hapus filter",
-      "other": "Lainnya ({{count}})",
-      "noTrafficShort": "Tidak ada permintaan dalam periode ini.",
-      "noKeysYet": "Setelah permintaan masuk, Anda akan melihat rinciannya di sini.",
-      "createKey": "Buat API Key pertama Anda",
-      "truncatedWarning": "Menampilkan 200 key teratas. Terapkan filter untuk mempersempit pencarian."
-    }
-  },
-  "explorer": {
-    "title": "Penjelajah",
-    "subtitle": "Jelajahi file dan konfigurasi"
-  }
-}
--- a/core/http/react-ui/public/locales/id/agents.json
+++ b/core/http/react-ui/public/locales/id/agents.json
@@ -1,55 +0,0 @@
-{
-  "title": "Agen",
-  "subtitle": "Kelola agen AI otonom",
-  "actions": {
-    "agentHub": "Pusat Agen",
-    "import": "Impor",
-    "createAgent": "Buat Agen",
-    "edit": "Edit",
-    "chat": "Obrolan",
-    "export": "Ekspor",
-    "delete": "Hapus",
-    "pause": "Jeda",
-    "resume": "Lanjutkan"
-  },
-  "table": {
-    "name": "Nama",
-    "status": "Status",
-    "events": "Event",
-    "actions": "Aksi",
-    "eventsTooltip": "{{count}} event - Klik untuk melihat"
-  },
-  "search": {
-    "placeholder": "Cari agen...",
-    "summary_one": "{{shown}} dari {{total}} agen",
-    "summary_other": "{{shown}} dari {{total}} agen"
-  },
-  "empty": {
-    "noConfigured": "Belum ada agen yang dikonfigurasi",
-    "noConfiguredText": "Buat agen untuk memulai alur kerja AI otonom.",
-    "browseHub": "Tidak tahu harus mulai dari mana? Jelajahi <1>Pusat Agen</1> untuk menemukan konfigurasi agen siap pakai yang bisa Anda impor.",
-    "noMatching": "Tidak ada agen yang cocok",
-    "noMatchingText": "Tidak ada agen yang cocok dengan \"{{query}}\""
-  },
-  "sections": {
-    "yourAgents": "Agent Anda",
-    "otherUsersAgents": "Agent Pengguna Lain"
-  },
-  "deleteDialog": {
-    "title": "Hapus Agen",
-    "message": "Hapus agen \"{{name}}\"? Tindakan ini tidak dapat dibatalkan.",
-    "confirm": "Hapus"
-  },
-  "toasts": {
-    "loadFailed": "Gagal memuat agen: {{message}}",
-    "deleted": "Agen \"{{name}}\" berhasil dihapus",
-    "deleteFailed": "Gagal menghapus agen: {{message}}",
-    "paused": "Agen \"{{name}}\" dijeda",
-    "resumed": "Agen \"{{name}}\" dilanjutkan",
-    "pauseFailed": "Gagal menjeda agen: {{message}}",
-    "resumeFailed": "Gagal melanjutkan agen: {{message}}",
-    "exported": "Agen \"{{name}}\" berhasil diekspor",
-    "exportFailed": "Gagal mengekspor agen: {{message}}",
-    "parseFailed": "Gagal melakukan parse file agen: {{message}}"
-  }
-}
--- a/core/http/react-ui/public/locales/id/auth.json
+++ b/core/http/react-ui/public/locales/id/auth.json
@@ -1,112 +0,0 @@
-{
-  "login": {
-    "subtitle": "Masuk untuk melanjutkan",
-    "registerSubtitle": "Buat akun",
-    "createAdminSubtitle": "Buat akun admin Anda",
-    "tokenSubtitle": "Masukkan API key Anda untuk melanjutkan",
-    "email": "Email",
-    "emailPlaceholder": "anda@example.com",
-    "name": "Nama",
-    "namePlaceholder": "Nama Anda (opsional)",
-    "password": "Kata Sandi",
-    "passwordPlaceholder": "Masukkan kata sandi...",
-    "newPasswordPlaceholder": "Minimal 12 karakter",
-    "confirmPassword": "Konfirmasi Kata Sandi",
-    "confirmPasswordPlaceholder": "Ulangi kata sandi",
-    "inviteCodeLabel": "Kode Undangan",
-    "inviteCodeOptional": " (opsional — lewati waktu tunggu persetujuan)",
-    "inviteCodePlaceholder": "Tempel kode undangan Anda...",
-    "tokenPlaceholder": "Masukkan API key...",
-    "tokenAltPlaceholder": "Masukkan token API...",
-    "signIn": "Masuk",
-    "signingIn": "Sedang masuk...",
-    "register": "Daftar",
-    "creatingAccount": "Membuat akun...",
-    "createAdminAccount": "Buat Akun Admin",
-    "signInWithGitHub": "Masuk dengan GitHub",
-    "signInWithSSO": "Masuk dengan SSO",
-    "loginWithToken": "Masuk dengan Token",
-    "showTokenLogin": "Masuk dengan Token API",
-    "hideTokenLogin": "Sembunyikan Token API",
-    "noAccount": "Belum punya akun?",
-    "hasAccount": "Sudah punya akun?",
-    "or": "atau",
-    "errors": {
-      "loginFailed": "Gagal masuk",
-      "registrationFailed": "Gagal mendaftar",
-      "invalidToken": "Token tidak valid",
-      "passwordsDoNotMatch": "Kata sandi tidak cocok",
-      "enterToken": "Silahkan masukkan token",
-      "networkError": "Eror jaringan",
-      "inviteRequired": "Kode undangan yang valid diperlukan untuk mendaftar"
-    },
-    "messages": {
-      "registrationPending": "Pendaftaran berhasil, menunggu persetujuan."
-    }
-  },
-  "account": {
-    "title": "Akun",
-    "subtitle": "Profil, kredensial, dan API key",
-    "unavailable": "Akun tidak tersedia",
-    "unavailableText": "Autentikasi harus diaktifkan untuk mengelola akun Anda.",
-    "tabs": {
-      "profile": "Profil",
-      "security": "Keamanan",
-      "apiKeys": "API Key"
-    },
-    "profile": {
-      "displayName": "Nama tampilan",
-      "displayNameDescription": "Nama tampilan publik Anda",
-      "avatarUrl": "URL Avatar",
-      "avatarUrlDescription": "URL ke gambar profil Anda",
-      "avatarUrlPlaceholder": "https://example.com/avatar.png",
-      "save": "Simpan",
-      "saving": "Menyimpan...",
-      "updated": "Profil berhasil diperbarui",
-      "updateFailed": "Gagal memperbarui profil: {{message}}"
-    },
-    "security": {
-      "currentPassword": "Kata sandi saat ini",
-      "currentPasswordDescription": "Masukkan kata sandi Anda saat ini untuk memverifikasi identitas Anda",
-      "currentPasswordPlaceholder": "Kata sandi saat ini",
-      "newPassword": "Kata sandi baru",
-      "newPasswordDescription": "Minimal harus 12 karakter",
-      "newPasswordPlaceholder": "Kata sandi baru",
-      "confirmPassword": "Konfirmasi kata sandi",
-      "confirmPasswordDescription": "Masukkan kembali kata sandi baru Anda",
-      "confirmPasswordPlaceholder": "Konfirmasi kata sandi baru",
-      "changePassword": "Ubah kata sandi",
-      "changing": "Mengubah...",
-      "changed": "Kata sandi berhasil diubah",
-      "passwordsDoNotMatch": "Kata sandi tidak cocok",
-      "tooShort": "Kata sandi baru minimal harus 12 karakter",
-      "oauthOnly": "Manajemen kata sandi tidak tersedia untuk akun {{provider}}."
-    },
-    "apiKeys": {
-      "create": "Buat API key",
-      "createDescription": "Buat key untuk akses terprogram",
-      "namePlaceholder": "Nama key (misal: my-app)",
-      "createButton": "Buat",
-      "creating": "Membuat...",
-      "createdToast": "API key berhasil dibuat",
-      "createFailed": "Gagal membuat API key: {{message}}",
-      "loadFailed": "Failed to load API keys: {{message}}",
-      "revoke": "Cabut",
-      "revokeKey": "Cabut key",
-      "revokeTitle": "Cabut API Key",
-      "revokeMessage": "Cabut API key \"{{name}}\"? Tindakan ini tidak dapat dibatalkan.",
-      "revoked": "API key dicabut",
-      "revokeFailed": "Gagal mencabut API key: {{message}}",
-      "copyNow": "Salin sekarang — key ini tidak akan ditampilkan lagi",
-      "copiedToast": "Berhasil disalin ke papan klip",
-      "copyFailed": "Gagal menyalin",
-      "empty": "Belum ada API key. Buat satu di atas untuk akses terprogram.",
-      "lastUsed": "terakhir digunakan {{date}}"
-    }
-  },
-  "notFound": {
-    "title": "Halaman Tidak Ditemukan",
-    "text": "Sepertinya halaman yang Anda cari tidak ditemukan. Mari kembalikan ke halaman sebelumnya.",
-    "goHome": "Kembali ke Beranda"
-  }
-}
--- a/core/http/react-ui/public/locales/id/chat.json
+++ b/core/http/react-ui/public/locales/id/chat.json
@@ -1,117 +0,0 @@
-{
-  "activity": {
-    "thought": "Penalaran",
-    "tool": "Alat",
-    "result": "Hasil",
-    "toolResult": "Hasil {{name}}",
-    "thinking": "Berpikir..."
-  },
-  "header": {
-    "manageModeTooltip": "Obrolan ini dapat menginstal model, mengedit konfigurasi, dan mengelola backend dengan berbicara melalui LocalAI.",
-    "modelInfo": "Info model",
-    "chatSettings": "Pengaturan Obrolan",
-    "modelInfoTitle": "Info model: {{model}}",
-    "editConfig": "Edit konfigurasi",
-    "close": "Tutup"
-  },
-  "modelInfo": {
-    "backend": "Backend",
-    "modelFile": "File model",
-    "contextSize": "Ukuran konteks",
-    "threads": "Thread",
-    "mcp": "MCP",
-    "configured": "Dikonfigurasi",
-    "chatTemplate": "Templat Obrolan",
-    "yes": "Ya",
-    "gpuLayers": "Layer GPU"
-  },
-  "context": {
-    "label": "Konteks: {{percent}}%",
-    "labelWithTokens": "Konteks: {{percent}}% ({{tokens}} tokens)"
-  },
-  "settings": {
-    "title": "Pengaturan Obrolan",
-    "manageMode": "Mode Manajemen",
-    "manageModeDesc": "Izinkan obrolan ini menginstal model, mengganti backend, dan mengedit konfigurasi dengan berbicara melalui LocalAI.",
-    "systemPrompt": "System Prompt",
-    "systemPromptPlaceholder": "Anda adalah asisten yang membantu...",
-    "temperature": "Temperatur",
-    "topP": "Top P",
-    "topK": "Top K",
-    "contextSize": "Ukuran Konteks",
-    "contextSizePlaceholder": "2048",
-    "clearHistory": "Hapus riwayat obrolan"
-  },
-  "empty": {
-    "manageTitle": "Kelola LocalAI dengan obrolan",
-    "manageText": "Izinkan untuk menginstal model, mengganti backend, mengedit konfigurasi, atau memeriksa status. Asisten akan merangkum tindakan dan menunggu konfirmasi Anda sebelum mengubah apa pun.",
-    "startTitle": "Mulai percakapan",
-    "readyText": "Siap untuk mengobrol dengan {{model}}",
-    "selectModelText": "Pilih model di atas untuk memulai",
-    "suggestionsManage": [
-      "Apa saja yang terinstal?",
-      "Instal model obrolan",
-      "Tampilkan status sistem",
-      "Perbarui backend"
-    ],
-    "suggestionsChat": [
-      "Jelaskan cara kerjanya",
-      "Bantu saya menulis kode",
-      "Rangkum dokumen",
-      "Gali ide"
-    ],
-    "recent": "Terbaru",
-    "noMessages": "Belum ada pesan",
-    "hintEnter": "Enter untuk mengirim",
-    "hintShiftEnter": "Shift+Enter untuk baris baru",
-    "hintAttach": "Lampirkan file"
-  },
-  "errors": {
-    "viewTraces": "Lihat trace untuk detailnya"
-  },
-  "actions": {
-    "copy": "Salin",
-    "regenerate": "Hasilkan ulang"
-  },
-  "streaming": {
-    "transferring": "Mentransfer model...",
-    "transferringTo": "Mentransfer model ke {{node}}..."
-  },
-  "tokens": {
-    "perSec": "{{count}} tok/s",
-    "peak": "Puncak: {{count}} tok/s",
-    "usage": "{{prompt}}p + {{completion}}c = {{total}}"
-  },
-  "input": {
-    "placeholder": "Pesan...",
-    "attachFile": "Lampirkan file",
-    "stopGenerating": "Hentikan pembuatan",
-    "canvasTitle": "Canvas — ekstrak blok kode dan media ke panel samping untuk pratinjau, salin, dan unduh",
-    "canvasLabel": "Canvas",
-    "openCanvas": "Buka panel canvas"
-  },
-  "deleteAllDialog": {
-    "title": "Hapus Semua Obrolan",
-    "message": "Hapus semua obrolan? Tindakan ini tidak dapat dibatalkan.",
-    "confirm": "Hapus semua"
-  },
-  "toasts": {
-    "selectModel": "Silahkan pilih model",
-    "copied": "Berhasil disalin ke papan klip",
-    "copyFailed": "Gagal menyalin ke papan klip"
-  },
-  "menu": {
-    "trigger": "Obrolan",
-    "triggerTitle": "Percakapan (Ctrl/Cmd+K)",
-    "search": "Cari percakapan...",
-    "clearSearch": "Hapus pencarian",
-    "noMatch": "Tidak ada percakapan yang cocok dengan pencarian Anda",
-    "noConversations": "Belum ada percakapan",
-    "rename": "Ubah nama",
-    "exportMarkdown": "Ekspor sebagai Markdown",
-    "deleteChat": "Hapus obrolan",
-    "newChat": "Obrolan baru",
-    "clearAll": "Hapus semua",
-    "deleteAllTitle": "Hapus semua percakapan"
-  }
-}
--- a/core/http/react-ui/public/locales/id/collections.json
+++ b/core/http/react-ui/public/locales/id/collections.json
@@ -1,43 +0,0 @@
-{
-  "title": "Basis Pengetahuan",
-  "subtitle": "Kelola koleksi dokumen untuk agen RAG",
-  "newPlaceholder": "Nama koleksi baru...",
-  "actions": {
-    "create": "Buat",
-    "creating": "Membuat...",
-    "details": "Detail",
-    "reset": "Reset",
-    "delete": "Hapus",
-    "viewDetails": "Lihat detail",
-    "resetCollection": "Reset koleksi",
-    "deleteCollection": "Hapus koleksi"
-  },
-  "sections": {
-    "yourCollections": "Koleksi Anda",
-    "otherUsersCollections": "Koleksi Pengguna Lain"
-  },
-  "empty": {
-    "title": "Belum ada koleksi",
-    "text": "Koleksi memungkinkan Anda mengatur dokumen ke dalam basis pengetahuan yang dapat dicari oleh agen menggunakan RAG (Retrieval-Augmented Generation). Buat koleksi di atas untuk memulai.",
-    "noPersonal": "Anda belum memiliki koleksi."
-  },
-  "deleteDialog": {
-    "title": "Hapus koleksi",
-    "message": "Hapus koleksi \"{{name}}\"? Tindakan ini akan menghapus semua entri dan tidak dapat dibatalkan.",
-    "confirm": "Hapus"
-  },
-  "resetDialog": {
-    "title": "Reset koleksi",
-    "message": "Reset koleksi \"{{name}}\"? Tindakan ini akan menghapus semua entri tetapi mempertahankan koleksinya.",
-    "confirm": "Reset"
-  },
-  "toasts": {
-    "loadFailed": "Gagal memuat koleksi: {{message}}",
-    "created": "Koleksi \"{{name}}\" berhasil dibuat",
-    "createFailed": "Gagal membuat koleksi: {{message}}",
-    "deleted": "Koleksi \"{{name}}\" berhasil dihapus",
-    "deleteFailed": "Gagal menghapus koleksi: {{message}}",
-    "reset": "Koleksi \"{{name}}\" berhasil direset",
-    "resetFailed": "Gagal mereset koleksi: {{message}}"
-  }
-}
--- a/core/http/react-ui/public/locales/id/common.json
+++ b/core/http/react-ui/public/locales/id/common.json
@@ -1,109 +0,0 @@
-{
-  "actions": {
-    "save": "Simpan",
-    "saving": "Menyimpan...",
-    "cancel": "Batal",
-    "close": "Tutup",
-    "confirm": "Konfirmasi",
-    "delete": "Hapus",
-    "edit": "Edit",
-    "add": "Tambah",
-    "remove": "Hapus",
-    "create": "Buat",
-    "update": "Perbarui",
-    "refresh": "Segarkan",
-    "reload": "Muat Ulang",
-    "retry": "Coba Lagi",
-    "search": "Cari",
-    "filter": "Filter",
-    "clear": "Hapus",
-    "reset": "Reset",
-    "apply": "Terapkan",
-    "back": "Kembali",
-    "next": "Berikutnya",
-    "previous": "Sebelumnya",
-    "open": "Buka",
-    "submit": "Kirim",
-    "select": "Pilih",
-    "selectAll": "Pilih semua",
-    "copy": "Salin",
-    "copied": "Disalin",
-    "download": "Unduh",
-    "upload": "Unggah",
-    "import": "Impor",
-    "export": "Ekspor",
-    "view": "Lihat",
-    "details": "Detail",
-    "settings": "Pengaturan",
-    "help": "Bantuan",
-    "yes": "Ya",
-    "no": "Tidak",
-    "loading": "Memuat..."
-  },
-  "status": {
-    "loading": "Memuat...",
-    "saving": "Menyimpan...",
-    "saved": "Tersimpan",
-    "ready": "Siap",
-    "running": "Berjalan",
-    "stopped": "Berhenti",
-    "starting": "Memulai...",
-    "stopping": "Menghentikan...",
-    "pending": "Pending",
-    "active": "Aktif",
-    "inactive": "Tidak aktif",
-    "enabled": "Diaktifkan",
-    "disabled": "Dinonaktifkan",
-    "online": "Online",
-    "offline": "Offline",
-    "error": "Eror",
-    "success": "Sukses",
-    "warning": "Peringatan",
-    "info": "Info",
-    "empty": "Tidak ada item",
-    "none": "Tidak ada",
-    "unknown": "Tidak diketahui"
-  },
-  "dialogs": {
-    "confirmDelete": {
-      "title": "Konfirmasi penghapusan",
-      "message": "Apakah Anda yakin ingin menghapus ini? Tindakan ini tidak dapat dibatalkan.",
-      "confirm": "Hapus",
-      "cancel": "Batal"
-    },
-    "unsavedChanges": {
-      "title": "Perubahan belum disimpan",
-      "message": "Anda memiliki perubahan yang belum disimpan. Apakah Anda ingin membuangnya?",
-      "discard": "Buang",
-      "keepEditing": "Tetap mengedit"
-    }
-  },
-  "forms": {
-    "required": "Wajib",
-    "optional": "Opsional",
-    "name": "Nama",
-    "description": "Deskripsi",
-    "type": "Tipe",
-    "value": "Nilai",
-    "search": "Cari...",
-    "selectPlaceholder": "Pilih opsi..."
-  },
-  "time": {
-    "now": "baru saja",
-    "secondsAgo_one": "{{count}} detik yang lalu",
-    "secondsAgo_other": "{{count}} detik yang lalu",
-    "minutesAgo_one": "{{count}} menit yang lalu",
-    "minutesAgo_other": "{{count}} menit yang lalu",
-    "hoursAgo_one": "{{count}} jam yang lalu",
-    "hoursAgo_other": "{{count}} jam yang lalu",
-    "daysAgo_one": "{{count}} hari yang lalu",
-    "daysAgo_other": "{{count}} hari yang lalu"
-  },
-  "units": {
-    "bytes": "B",
-    "kilobytes": "KB",
-    "megabytes": "MB",
-    "gigabytes": "GB",
-    "terabytes": "TB"
-  }
-}
--- a/core/http/react-ui/public/locales/id/errors.json
+++ b/core/http/react-ui/public/locales/id/errors.json
@@ -1,17 +0,0 @@
-{
-  "generic": "Terjadi kesalahan",
-  "network": "Eror jaringan. Periksa koneksi Anda dan coba lagi.",
-  "unauthorized": "Anda tidak memiliki izin untuk melakukan tindakan ini.",
-  "forbidden": "Akses ditolak.",
-  "notFound": "Sumber daya yang diminta tidak ditemukan.",
-  "serverError": "Eror server. Silahkan coba lagi nanti.",
-  "loadFailed": "Gagal memuat: {{message}}",
-  "saveFailed": "Gagal menyimpan: {{message}}",
-  "deleteFailed": "Gagal menghapus: {{message}}",
-  "updateFailed": "Gagal memperbarui: {{message}}",
-  "createFailed": "Gagal membuat: {{message}}",
-  "operationFailed": "Operasi gagal: {{message}}",
-  "invalidInput": "Input tidak valid. Silahkan periksa formulir dan coba lagi.",
-  "tryAgain": "Silahkan coba lagi.",
-  "contactAdmin": "Jika masalah terus berlanjut, hubungi administrator Anda."
-}
--- a/core/http/react-ui/public/locales/id/home.json
+++ b/core/http/react-ui/public/locales/id/home.json
@@ -1,66 +0,0 @@
-{
-  "cluster": {
-    "vram": "VRAM Kluster",
-    "ram": "RAM Kluster",
-    "nodesOnline": "{{healthy}}/{{total}} node online"
-  },
-  "resourceGpu": "GPU",
-  "resourceRam": "RAM",
-  "assistant": {
-    "title": "Kelola LocalAI melalui obrolan",
-    "description": "Instal model, ganti backend, edit konfigurasi dan periksa status dengan berbicara pada LocalAI.",
-    "open": "Buka asisten",
-    "tooltip": "Kelola LocalAI melalui obrolan"
-  },
-  "input": {
-    "placeholder": "Pesan...",
-    "attachImage": "Lampirkan gambar",
-    "attachAudio": "Lampirkan audio",
-    "attachFile": "Lampirkan file",
-    "enterToSend": "Enter untuk mengirim",
-    "selectModelFirst": "Pilih model terlebih dahulu",
-    "sendMessage": "Kirim pesan",
-    "selectModelToast": "Silahkan pilih model terlebih dahulu"
-  },
-  "quickLinks": {
-    "manageByChat": "Kelola melalui obrolan",
-    "installedModels": "Model Terinstal",
-    "browseGallery": "Jelajahi Galeri",
-    "importModel": "Impor Model",
-    "documentation": "Dokumentasi"
-  },
-  "loadedModels": {
-    "count_one": "{{count}} model dimuat",
-    "count_other": "{{count}} model dimuat",
-    "stop": "Hentikan model",
-    "stopAll": "Hentikan semua"
-  },
-  "stopDialog": {
-    "title": "Hentikan Model",
-    "message": "Hentikan model {{model}}?",
-    "confirm": "Hentikan {{model}}",
-    "stopAllTitle": "Hentikan Semua model",
-    "stopAllMessage": "Hentikan semua {{count}} model yang dimuat?",
-    "stopAllConfirm": "Hentikan semua",
-    "stoppedToast": "{{model}} berhasil dihentikan",
-    "allStoppedToast": "Semua model berhasil dihentikan",
-    "stopFailed": "Gagal menghentikan: {{message}}"
-  },
-  "wizard": {
-    "getStarted": "Memulai dengan {{name}}",
-    "intro": "Instal model pertama Anda untuk memulai. Jelajahi galeri atau impor punya Anda sendiri.",
-    "steps": {
-      "step1Title": "Jelajahi Galeri Model",
-      "step1Body": "Temukan model yang tepat untuk kebutuhan Anda dari koleksi pilihan kami.",
-      "step2Title": "Instal Model",
-      "step2Body": "Klik instal untuk mengunduh dan mengonfigurasinya secara otomatis.",
-      "step3Title": "Mulai Mengobrol",
-      "step3Body": "Mengobrol dengan model Anda langsung dari peramban atau gunakan API."
-    },
-    "browseGallery": "Jelajahi Galeri Model",
-    "importModel": "Impor Model",
-    "docs": "Dokumentasi",
-    "noModelsTitle": "Tidak Ada Model yang Tersedia",
-    "noModelsBody": "Belum ada model yang terinstal. Hubungi administrator Anda untuk menyiapkan model agar Anda dapat mulai mengobrol."
-  }
-}
--- a/core/http/react-ui/public/locales/id/importModel.json
+++ b/core/http/react-ui/public/locales/id/importModel.json
@@ -1,142 +0,0 @@
-{
-  "title": "Impor Model Baru",
-  "subtitle": {
-    "simple": "Import model dari URI — deteksi otomatis memilih backend.",
-    "powerYaml": "Tulis konfigurasi YAML lengkap untuk model.",
-    "powerPrefs": "Preferensi impor tingkat lanjut."
-  },
-  "actions": {
-    "import": "Impor Model",
-    "importing": "Mengimpor...",
-    "create": "Buat",
-    "saving": "Menyimpan...",
-    "browseHF": "Jelajahi model di HF",
-    "addCustom": "Tambah Tersuai",
-    "copy": "Salin"
-  },
-  "form": {
-    "modelUri": "URI Model",
-    "uriPlaceholder": "huggingface://TheBloke/Llama-2-7B-Chat-GGUF atau https://example.com/model.gguf",
-    "uriHint": "Masukkan URI atau path ke file model yang ingin Anda impor",
-    "supportedFormats": "Format URI yang Didukung",
-    "options": "Opsi",
-    "preferences": "Preferensi (Opsional)",
-    "commonPreferences": "Preferensi Umum",
-    "customPreferences": "Preferensi Tersuai",
-    "customKeyValueHint": "Tambahkan pasangan key-value tersuai untuk konfigurasi tingkat lanjutan.",
-    "preferenceKey": "Key preferensi untuk baris {{index}}",
-    "preferenceValue": "Nilai preferensi untuk baris {{index}}",
-    "removePref": "Hapus preferensi ini",
-    "key": "Key",
-    "value": "Value",
-    "backend": "Backend",
-    "backendAuto": "Deteksi otomatis (berdasarkan URI)",
-    "backendLoading": "Memuat backend…",
-    "backendSearch": "Cari backend...",
-    "backendHint": "Paksa backend tertentu. Biarkan kosong untuk deteksi otomatis dari URI. Item yang ditandai \"pilih manual\" tidak dapat dideteksi otomatis — pilih sendiri jika Anda tahu apa yang dibutuhkan model.",
-    "backendErrorHint": "Tidak dapat memuat daftar backend — hanya menggunakan deteksi otomatis.",
-    "backendNotInstalled": "Backend ini belum terinstal. Proses impor akan mengunduhnya terlebih dahulu.",
-    "modelName": "Nama Model",
-    "modelNamePlaceholder": "Kosongkan untuk menggunakan nama file",
-    "modelNameHint": "Nama tersuai untuk model. Jika kosong, nama file akan digunakan.",
-    "description": "Deskripsi",
-    "descriptionPlaceholder": "Kosongkan untuk menggunakan deskripsi bawaan",
-    "descriptionHint": "Deskripsi tersuai untuk model.",
-    "quantizations": "Kuantisasi",
-    "quantizationsPlaceholder": "q4_k_m,q4_k_s,q3_k_m (dipisahkan koma)",
-    "quantizationsHint": "Kuantisasi pilihan (dipisahkan koma). Kosongkan untuk nilai bawaan (q4_k_m).",
-    "mmprojQuantizations": "Kuantisasi MMProj",
-    "mmprojQuantizationsPlaceholder": "fp16,fp32 (dipisahkan koma)",
-    "mmprojQuantizationsHint": "Kuantisasi MMProj pilihan. Kosongkan untuk nilai bawaan (fp16).",
-    "embeddings": "Embedding",
-    "embeddingsHint": "Aktifkan dukungan embedding untuk model ini.",
-    "modelType": "Tipe Model",
-    "modelTypePlaceholder": "AutoModelForCausalLM (untuk backend transformers)",
-    "modelTypeHint": "Tipe model untuk backend transformers. Contoh: AutoModelForCausalLM, SentenceTransformer, Mamba.",
-    "pipelineType": "Tipe Pipeline",
-    "pipelineTypeHint": "Tipe pipeline untuk backend diffusers.",
-    "schedulerType": "Tipe Scheduler",
-    "schedulerTypePlaceholder": "k_dpmpp_2m (optional)",
-    "schedulerTypeHint": "Tipe scheduler untuk backend diffusers. Contoh: k_dpmpp_2m, euler_a, ddim.",
-    "enableParameters": "Aktifkan parameter",
-    "enableParametersPlaceholder": "negative_prompt,num_inference_steps (dipisahkan koma)",
-    "enableParametersHint": "Parameter aktif untuk backend diffusers (dipisahkan koma).",
-    "cuda": "CUDA",
-    "cudaHint": "Aktifkan dukungan CUDA untuk akselerasi GPU.",
-    "yamlEditor": "Editor Konfigurasi YAML",
-    "manualPick": "pilih manual",
-    "manualPickTooltip": "Deteksi otomatis tidak akan merutekan ke backend ini. Pilih di sini jika Anda tahu bahwa ini yang Anda inginkan."
-  },
-  "modality": {
-    "text": "LLM Teks",
-    "asr": "Pengenalan suara",
-    "tts": "Text-to-speech",
-    "image": "Gambar / Video",
-    "embeddings": "Embedding",
-    "reranker": "Reranker",
-    "detection": "Deteksi object",
-    "vad": "Deteksi aktivitas suara",
-    "other": "Lainnya"
-  },
-  "powerTabs": {
-    "ariaLabel": "Tab mode tingkat lanjutan",
-    "preferences": "Preferensi",
-    "yaml": "YAML"
-  },
-  "switchDialog": {
-    "title": "Pertahankan preferensi tersuai Anda?",
-    "body": "Beralih ke mode Sederhana akan menyembunyikan preferensi selain backend, nama, dan deskripsi. Preferensi tersebut tetap akan dikirim saat mengimpor.",
-    "cancel": "Batal",
-    "discard": "Buang & beralih",
-    "keep": "Pertahankan & beralih"
-  },
-  "estimate": {
-    "title": "Estimasi kebutuhan",
-    "download": "Unduhan: {{size}}",
-    "vram": "VRAM: {{vram}}"
-  },
-  "toasts": {
-    "noUri": "Silahkan masukkan URI model",
-    "noYaml": "Silahkan masukkan konfigurasi YAML",
-    "started": "Impor dimulai! Melacak progress...",
-    "startedWithMeta": "Impor dimulai! Melacak progress... ({{meta}})",
-    "imported": "Model berhasil diimpor!",
-    "importedYaml": "Konfigurasi model berhasil diimpor!",
-    "importFailed": "Gagal mengimpor: {{message}}",
-    "startImportFailed": "Gagal memulai impor: {{message}}",
-    "backendsLoadFailed": "Tidak dapat memuat daftar backend — hanya menggunakan deteksi otomatis.",
-    "modalityClearedBackend": "Pilihan backend dikosongkan — tidak termasuk dalam grup {{label}}.",
-    "copied": "Disalin ke papan klip"
-  },
-  "uriFormats": {
-    "huggingface": {
-      "title": "HuggingFace",
-      "standard": "Format standar HuggingFace",
-      "short": "Format ringkas HuggingFace",
-      "fullUrl": "URL lengkap HuggingFace"
-    },
-    "http": {
-      "title": "URL HTTP/HTTPS",
-      "direct": "Unduh langsung dari URL HTTPS mana pun"
-    },
-    "local": {
-      "title": "File Lokal",
-      "filePath": "Path file lokal (absolut)",
-      "directYaml": "File konfigurasi YAML lokal langsung"
-    },
-    "oci": {
-      "title": "Registri OCI",
-      "registry": "Registri kontainer OCI",
-      "tarball": "File tarball OCI lokal"
-    },
-    "ollama": {
-      "title": "Ollama",
-      "model": "Format model Ollama"
-    },
-    "yaml": {
-      "title": "File Konfigurasi YAML",
-      "remote": "File konfigurasi YAML jarak jauh",
-      "local": "File konfigurasi YAML lokal"
-    }
-  }
-}
--- a/core/http/react-ui/public/locales/id/media.json
+++ b/core/http/react-ui/public/locales/id/media.json
@@ -1,154 +0,0 @@
-{
-  "studio": {
-    "tabs": {
-      "images": "Gambar",
-      "video": "Video",
-      "tts": "TTS",
-      "sound": "Suara"
-    }
-  },
-  "image": {
-    "title": "Pembuatan Gambar",
-    "labels": {
-      "model": "Model",
-      "prompt": "Prompt",
-      "promptPlaceholder": "Deskripsikan gambar yang ingin Anda buat...",
-      "negativePrompt": "Prompt Negatif",
-      "negativePromptPlaceholder": "Apa yang harus dihindari...",
-      "size": "Ukuran",
-      "count": "Jumlah (1-4)",
-      "advanced": "Pengaturan Tingkat Lanjutan",
-      "imageInputs": "Input Gambar",
-      "steps": "Steps",
-      "stepsPlaceholder": "20",
-      "seed": "Seed",
-      "seedPlaceholder": "Acak",
-      "sourceImage": "Gambar Sumber (img2img)",
-      "refImages": "Gambar Referensi",
-      "refImagesAdded_one": "{{count}} gambar ditambahkan",
-      "refImagesAdded_other": "{{count}} gambar ditambahkan"
-    },
-    "actions": {
-      "generate": "Hasilkan",
-      "generating": "Menghasilkan..."
-    },
-    "empty": "Gambar yang dihasilkan akan muncul di sini",
-    "toasts": {
-      "noPrompt": "Silahkan masukkan prompt",
-      "noModel": "Silahkan pilih model",
-      "noResults": "Tidak ada gambar yang dihasilkan"
-    }
-  },
-  "video": {
-    "title": "Pembuatan Video",
-    "labels": {
-      "model": "Model",
-      "prompt": "Prompt",
-      "promptPlaceholder": "Deskripsikan video yang ingin Anda buat...",
-      "duration": "Durasi (detik)",
-      "fps": "FPS",
-      "size": "Ukuran",
-      "advanced": "Pengaturan Tingkat Lanjutan",
-      "seed": "Seed",
-      "seedPlaceholder": "Acak"
-    },
-    "actions": {
-      "generate": "Hasilkan",
-      "generating": "Menghasilkan..."
-    },
-    "empty": "Video yang dihasilkan akan muncul di sini",
-    "toasts": {
-      "noPrompt": "Silahkan masukkan prompt",
-      "noModel": "Silahkan pilih model",
-      "noResults": "Tidak ada video yang dihasilkan"
-    }
-  },
-  "tts": {
-    "title": "Teks ke Suara",
-    "labels": {
-      "model": "Model",
-      "voice": "Suara",
-      "voicePlaceholder": "ID suara opsional",
-      "input": "Teks",
-      "inputPlaceholder": "Masukkan teks untuk disintesis..."
-    },
-    "actions": {
-      "generate": "Hasilkan",
-      "generating": "Menghasilkan..."
-    },
-    "empty": "Audio yang dihasilkan akan muncul di sini",
-    "toasts": {
-      "noText": "Silahkan masukkan teks",
-      "noModel": "Silahkan pilih model",
-      "generateFailed": "Gagal menghasilkan"
-    }
-  },
-  "sound": {
-    "title": "Pembuatan Suara",
-    "labels": {
-      "model": "Model",
-      "prompt": "Prompt",
-      "promptPlaceholder": "Deskripsikan suara yang ingin Anda buat...",
-      "duration": "Durasi (detik)",
-      "language": "Bahasa",
-      "vocalLanguage": "Bahasa vokal",
-      "lyrics": "Lirik (opsional)",
-      "lyricsPlaceholder": "Lirik untuk generasi vokal",
-      "advanced": "Pengaturan Tingkat Lanjutan",
-      "seed": "Seed",
-      "seedPlaceholder": "Acak"
-    },
-    "actions": {
-      "generate": "Hasilkan",
-      "generating": "Menghasilkan..."
-    },
-    "empty": "Audio yang dihasilkan akan muncul di sini",
-    "toasts": {
-      "noPrompt": "Silahkan masukkan prompt",
-      "noModel": "Silahkan pilih model",
-      "generateFailed": "Gagal menghasilkan"
-    }
-  },
-  "talk": {
-    "title": "Percakapan",
-    "subtitle": "Percakapan suara realtime",
-    "actions": {
-      "start": "Mulai sesi",
-      "stop": "Akhiri sesi",
-      "connecting": "Menghubungkan...",
-      "muted": "Disenyapkan",
-      "mute": "Senyapkan",
-      "unmute": "Aktifkan Suara"
-    },
-    "labels": {
-      "model": "Model",
-      "voice": "Suara",
-      "voicePlaceholder": "alloy",
-      "language": "Bahasa",
-      "languagePlaceholder": "en",
-      "instructions": "Instruksi",
-      "instructionsPlaceholder": "Atur persona asisten..."
-    },
-    "status": {
-      "idle": "Idle",
-      "connecting": "Menghubungkan...",
-      "listening": "Mendengarkan...",
-      "speaking": "Berbicara...",
-      "ended": "Sesi berakhir"
-    },
-    "toasts": {
-      "noModel": "Silahkan pilih model terlebih dahulu",
-      "connectFailed": "Gagal terhubung: {{message}}"
-    }
-  },
-  "history": {
-    "title": "Riwayat",
-    "empty": "Tidak ada riwayat",
-    "deleteEntry": "Hapus entri",
-    "clear": "Hapus riwayat",
-    "clearTitle": "Hapus semua riwayat",
-    "clearMessage": "Hapus semua entri riwayat? Tindakan ini tidak dapat dibatalkan.",
-    "clearConfirm": "Hapus",
-    "cleared": "Riwayat dihapus"
-  }
-}
--- a/core/http/react-ui/public/locales/id/models.json
+++ b/core/http/react-ui/public/locales/id/models.json
@@ -1,93 +0,0 @@
-{
-  "title": "Instal Model",
-  "subtitle": "Telusuri dan instal model AI dari galeri",
-  "stats": {
-    "available": "Tersedia",
-    "installed": "Terinstal"
-  },
-  "actions": {
-    "addModel": "Tambah Model",
-    "importModel": "Impor Model",
-    "install": "Instal",
-    "reinstall": "Instal Ulang",
-    "delete": "Hapus"
-  },
-  "filters": {
-    "all": "Semua",
-    "llm": "Obrolan",
-    "image": "Gambar",
-    "video": "Video",
-    "multimodal": "Multimodal",
-    "vision": "Vision",
-    "tts": "TTS",
-    "stt": "STT",
-    "diarization": "Diarization",
-    "soundGen": "Suara",
-    "audioTransform": "Efek Audio",
-    "realtimeAudio": "Audio Realtime",
-    "embedding": "Embedding",
-    "rerank": "Rerank",
-    "detection": "Deteksi",
-    "vad": "VAD",
-    "fitsGpu": "Muat di GPU",
-    "allBackends": "Semua Backend",
-    "searchBackends": "Cari backends..."
-  },
-  "search": {
-    "placeholder": "Cari model...",
-    "clearFilters": "Hapus filter"
-  },
-  "table": {
-    "modelName": "Nama Model",
-    "description": "Deskripsi",
-    "backend": "Backend",
-    "sizeVram": "Ukuran / VRAM",
-    "status": "Status",
-    "actions": "Aksi",
-    "size": "Ukuran: {{size}}",
-    "vram": "VRAM: {{vram}}",
-    "fits": "Muat",
-    "mayNotFit": "Mungkin tidak muat",
-    "trustRemoteCode": "Trust Remote Code",
-    "installing": "Menginstal",
-    "installingPct": "Menginstal · {{percent}}%",
-    "installed": "Terinstal",
-    "notInstalled": "Belum Terinstal"
-  },
-  "detail": {
-    "description": "Deskripsi",
-    "gallery": "Galeri",
-    "backend": "Backend",
-    "size": "Ukuran",
-    "vram": "VRAM",
-    "license": "Lisensi",
-    "tags": "Tag",
-    "links": "Tautan",
-    "warning": "Peringatan",
-    "files": "File",
-    "fitsGpu": "Muat di GPU",
-    "mayNotFitGpu": "Mungkin tidak muat di GPU",
-    "requiresTrustRemoteCode": "Memerlukan Trust Remote Code",
-    "fileCount_one": "{{count}} file",
-    "fileCount_other": "{{count}} file",
-    "filename": "Nama file",
-    "uri": "URI",
-    "sha256": "SHA256"
-  },
-  "empty": {
-    "title": "Model tidak ditemukan",
-    "withFilters": "Tidak ada model yang cocok dengan pencarian atau filter Anda.",
-    "noFilters": "Galeri model kosong."
-  },
-  "deleteDialog": {
-    "title": "Hapus Model",
-    "message": "Hapus model {{model}}?",
-    "confirm": "Hapus {{model}}",
-    "deletingToast": "Menghapus {{model}}..."
-  },
-  "errors": {
-    "loadFailed": "Gagal memuat model: {{message}}",
-    "installFailed": "Gagal menginstal: {{message}}",
-    "deleteFailed": "Gagal menghapus: {{message}}"
-  }
-}
--- a/core/http/react-ui/public/locales/id/nav.json
+++ b/core/http/react-ui/public/locales/id/nav.json
@@ -1,54 +0,0 @@
-{
-  "appName": "LocalAI",
-  "openMenu": "Buka menu",
-  "closeMenu": "Tutup menu",
-  "primaryNavigation": "Navigasi utama",
-  "switchToLightMode": "Ganti ke mode terang",
-  "switchToDarkMode": "Ganti ke mode gelap",
-  "expandSidebar": "Perluas sidebar",
-  "collapseSidebar": "Ciutkan sidebar",
-  "changeLanguage": "Ubah bahasa",
-  "logout": "Keluar",
-  "accountSettings": "Pengaturan akun",
-  "account": "Akun",
-  "accountFor": "Akun: {{name}}",
-  "sections": {
-    "tools": "Peralatan",
-    "enhance": "Peningkatan",
-    "biometrics": "Biometrik",
-    "agents": "Agen",
-    "system": "Sistem"
-  },
-  "items": {
-    "home": "Beranda",
-    "installModels": "Instal Model",
-    "chat": "Obrolan",
-    "studio": "Studio",
-    "talk": "Bicara",
-    "fineTune": "Fine-Tune (Eksperimental)",
-    "quantize": "Kuantisasi (Eksperimental)",
-    "audioTransform": "Transformasi Audio",
-    "faceRecognition": "Pengenalan Wajah",
-    "voiceRecognition": "Pengenalan Suara",
-    "agents": "Agen",
-    "skills": "Skill",
-    "memory": "Memori",
-    "mcpJobs": "MCP CI Jobs",
-    "usage": "Penggunaan",
-    "users": "Pengguna",
-    "middleware": "Middleware",
-    "backends": "Backend",
-    "traces": "Trace",
-    "nodes": "Node",
-    "swarm": "Swarm",
-    "system": "Sistem",
-    "settings": "Pengaturan",
-    "api": "API"
-  },
-  "footer": {
-    "github": "GitHub",
-    "documentation": "Dokumentasi",
-    "author": "Penulis",
-    "copyright": "© 2023-{{year}} {{author}}"
-  }
-}
--- a/core/http/react-ui/public/locales/id/skills.json
+++ b/core/http/react-ui/public/locales/id/skills.json
@@ -1,79 +0,0 @@
-{
-  "title": "Skill",
-  "subtitle": "Kelola skill agen (instruksi dan sumber daya yang dapat digunakan kembali)",
-  "unavailable": {
-    "subtitle": "Layanan Skill tidak tersedia atau indeks sedang dibangun ulang. Coba beberapa saat lagi.",
-    "retry": "Coba lagi"
-  },
-  "actions": {
-    "newSkill": "Skill baru",
-    "createSkill": "Buat skill",
-    "import": "Impor",
-    "importing": "Mengimpor...",
-    "gitRepos": "Repo Git",
-    "edit": "Edit",
-    "delete": "Hapus",
-    "export": "Ekspor",
-    "sync": "Sinkronisasi",
-    "addRepo": "Tambah repo",
-    "adding": "Menambahkan...",
-    "remove": "Hapus",
-    "enable": "Aktifkan",
-    "disable": "Nonaktifkan"
-  },
-  "search": {
-    "placeholder": "Cari skill..."
-  },
-  "git": {
-    "title": "Repositori git",
-    "description": "Tambah repositori Git untuk mengambil skill. Skill akan muncul di daftar setelah sinkronisasi.",
-    "urlPlaceholder": "https://github.com/user/repo atau git@github.com:user/repo.git",
-    "noRepos": "Tidak ada repositori Git yang dikonfigurasi. Tambahkan satu di atas.",
-    "disabled": "Dinonaktifkan",
-    "removeRepo": "Hapus repo"
-  },
-  "card": {
-    "noDescription": "Tidak ada deskripsi",
-    "readOnly": "Hanya baca",
-    "editTitle": "Edit skill",
-    "deleteTitle": "Hapus skill",
-    "exportTitle": "Ekspor sebagai .tar.gz"
-  },
-  "empty": {
-    "title": "Tidak ada skill yang ditemukan",
-    "text": "Buat atau impor skill untuk memulai.",
-    "noPersonal": "Anda belum memiliki skill."
-  },
-  "sections": {
-    "yourSkills": "Skill Anda",
-    "otherUsersSkills": "Skill Pengguna Lain"
-  },
-  "deleteDialog": {
-    "title": "Hapus Skill",
-    "message": "Hapus skill \"{{name}}\"? Tindakan ini tidak dapat dibatalkan.",
-    "confirm": "Hapus"
-  },
-  "removeRepoDialog": {
-    "title": "Hapus Repositori Git",
-    "message": "Hapus repositori Git ini? Skill dari repo ini tidak akan tersedia lagi.",
-    "confirm": "Hapus"
-  },
-  "toasts": {
-    "loadFailed": "Gagal memuat skill",
-    "deleted": "Skill \"{{name}}\" berhasil dihapus",
-    "deleteFailed": "Gagal menghapus skill",
-    "exported": "Skill \"{{name}}\" berhasil diekspor",
-    "exportFailed": "Gagal mengekspor",
-    "imported": "Skill berhasil diimpor dari \"{{file}}\" ",
-    "importFailed": "Gagal mengimpor",
-    "loadReposFailed": "Gagal memuat repositori Git",
-    "repoAdded": "Repo Git berhasil ditambahkan dan sedang sinkronisasi",
-    "addRepoFailed": "Gagal menambahkan repo",
-    "synced": "Repo berhasil disinkronisasi",
-    "syncFailed": "Gagal sinkronisasi",
-    "toggled": "Berhasil mengubah status repo",
-    "toggleFailed": "Gagal mengubah status repo",
-    "removed": "Repo berhasil dihapus",
-    "removeFailed": "Gagal menghapus repo"
-  }
-}
--- a/Show More
+++ b/Show More