fix(gallery/ltx-2.3): add vae_decode_only:false for i2v / flf2v

LTX-2.3 i2v inference fails inside generate_video with: "[ERROR] LTXAV image conditioning requires VAE encoder weights; create the context with vae_decode_only=false". Without vae_decode_only:false in the options block, gosd.cpp creates the sd_ctx with the VAE encoder weights freed, so latent encoding of the init_image is impossible. Adding the option mirrors what we already do for the Wan i2v entries. Affects all six LTX-2.3 entries (dev/distilled × UD-Q4_K_M, Q4_K_M, Q8_0). T2V wasn't impacted by the missing option since it has no init image to encode, which is why the earlier T2V smoke test passed. Assisted-by: Claude:claude-opus-4-7
fix(gallery/ltx-2.3): add diffusion_model flag to all variants
2026-05-27 10:07:00 -04:00 · 2026-05-25 19:33:55 +00:00 · 2026-05-25 16:49:37 +00:00
42 changed files with 247 additions and 2721 deletions
--- a/.github/backend-matrix.yml
+++ b/.github/backend-matrix.yml
@@ -690,19 +690,6 @@ include:
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
-  - build-type: 'cublas'
-    cuda-major-version: "12"
-    cuda-minor-version: "8"
-    platforms: 'linux/amd64'
-    tag-latest: 'auto'
-    tag-suffix: '-gpu-nvidia-cuda-12-rfdetr-cpp'
-    runs-on: 'ubuntu-latest'
-    base-image: "ubuntu:24.04"
-    skip-drivers: 'false'
-    backend: "rfdetr-cpp"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2404'
  - build-type: 'cublas'
    cuda-major-version: "12"
    cuda-minor-version: "8"
@@ -1504,19 +1491,6 @@ include:
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
-  - build-type: 'cublas'
-    cuda-major-version: "13"
-    cuda-minor-version: "0"
-    platforms: 'linux/amd64'
-    tag-latest: 'auto'
-    tag-suffix: '-gpu-nvidia-cuda-13-rfdetr-cpp'
-    runs-on: 'ubuntu-latest'
-    base-image: "ubuntu:24.04"
-    skip-drivers: 'false'
-    backend: "rfdetr-cpp"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2404'
  - build-type: 'cublas'
    cuda-major-version: "13"
    cuda-minor-version: "0"
@@ -1530,19 +1504,6 @@ include:
    backend: "sam3-cpp"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
-  - build-type: 'cublas'
-    cuda-major-version: "13"
-    cuda-minor-version: "0"
-    platforms: 'linux/arm64'
-    skip-drivers: 'false'
-    tag-latest: 'auto'
-    tag-suffix: '-nvidia-l4t-cuda-13-arm64-rfdetr-cpp'
-    base-image: "ubuntu:24.04"
-    ubuntu-version: '2404'
-    runs-on: 'ubuntu-24.04-arm'
-    backend: "rfdetr-cpp"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
  - build-type: 'cublas'
    cuda-major-version: "13"
    cuda-minor-version: "0"
@@ -2674,74 +2635,6 @@ include:
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
-  # rfdetr-cpp
-  - build-type: ''
-    cuda-major-version: ""
-    cuda-minor-version: ""
-    platforms: 'linux/amd64'
-    tag-latest: 'auto'
-    tag-suffix: '-cpu-rfdetr-cpp'
-    runs-on: 'ubuntu-latest'
-    base-image: "ubuntu:24.04"
-    skip-drivers: 'false'
-    backend: "rfdetr-cpp"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2404'
-  - build-type: 'sycl_f32'
-    cuda-major-version: ""
-    cuda-minor-version: ""
-    platforms: 'linux/amd64'
-    tag-latest: 'auto'
-    tag-suffix: '-gpu-intel-sycl-f32-rfdetr-cpp'
-    runs-on: 'ubuntu-latest'
-    base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
-    skip-drivers: 'false'
-    backend: "rfdetr-cpp"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2404'
-  - build-type: 'sycl_f16'
-    cuda-major-version: ""
-    cuda-minor-version: ""
-    platforms: 'linux/amd64'
-    tag-latest: 'auto'
-    tag-suffix: '-gpu-intel-sycl-f16-rfdetr-cpp'
-    runs-on: 'ubuntu-latest'
-    base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
-    skip-drivers: 'false'
-    backend: "rfdetr-cpp"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2404'
-  - build-type: 'vulkan'
-    cuda-major-version: ""
-    cuda-minor-version: ""
-    platforms: 'linux/amd64'
-    platform-tag: 'amd64'
-    tag-latest: 'auto'
-    tag-suffix: '-gpu-vulkan-rfdetr-cpp'
-    runs-on: 'ubuntu-latest'
-    base-image: "ubuntu:24.04"
-    skip-drivers: 'false'
-    backend: "rfdetr-cpp"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2404'
-  - build-type: 'vulkan'
-    cuda-major-version: ""
-    cuda-minor-version: ""
-    platforms: 'linux/arm64'
-    platform-tag: 'arm64'
-    tag-latest: 'auto'
-    tag-suffix: '-gpu-vulkan-rfdetr-cpp'
-    runs-on: 'ubuntu-24.04-arm'
-    base-image: "ubuntu:24.04"
-    skip-drivers: 'false'
-    backend: "rfdetr-cpp"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2404'
  - build-type: 'sycl_f32'
    cuda-major-version: ""
    cuda-minor-version: ""
@@ -2822,19 +2715,6 @@ include:
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2204'
-  - build-type: 'cublas'
-    cuda-major-version: "12"
-    cuda-minor-version: "0"
-    platforms: 'linux/arm64'
-    skip-drivers: 'false'
-    tag-latest: 'auto'
-    tag-suffix: '-nvidia-l4t-arm64-rfdetr-cpp'
-    base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
-    runs-on: 'ubuntu-24.04-arm'
-    backend: "rfdetr-cpp"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2204'
  # whisper
  - build-type: ''
    cuda-major-version: ""
--- a/.github/workflows/bump_deps.yaml
+++ b/.github/workflows/bump_deps.yaml
@@ -50,10 +50,6 @@ jobs:
            variable: "SAM3_VERSION"
            branch: "main"
            file: "backend/go/sam3-cpp/Makefile"
-          - repository: "mudler/rf-detr.cpp"
-            variable: "RFDETR_VERSION"
-            branch: "main"
-            file: "backend/go/rfdetr-cpp/Makefile"
          - repository: "predict-woo/qwen3-tts.cpp"
            variable: "QWEN3TTS_CPP_VERSION"
            branch: "main"
--- a/.github/workflows/stalebot.yml
+++ b/.github/workflows/stalebot.yml
@@ -11,7 +11,7 @@ jobs:
    if: github.repository == 'mudler/LocalAI'
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/stale@eb5cf3af3ac0a1aa4c9c45633dd1ae542a27a899 # v9
+      - uses: actions/stale@b5d41d4e1d5dceea10e7104786b73624c18a190f # v9
        with:
          stale-issue-message: 'This issue is stale because it has been open 90 days with no activity. Remove stale label or comment or this will be closed in 5 days.'
          stale-pr-message: 'This PR is stale because it has been open 90 days with no activity. Remove stale label or comment or this will be closed in 10 days.'
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
 # Disable parallel execution for backend builds
-.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/turboquant backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/rfdetr-cpp backends/insightface backends/speaker-recognition backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/sglang backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/acestep-cpp backends/fish-speech backends/voxtral backends/opus backends/trl backends/llama-cpp-quantization backends/kokoros backends/sam3-cpp backends/qwen3-tts-cpp backends/vibevoice-cpp backends/localvqe backends/tinygrad backends/sherpa-onnx backends/ds4 backends/ds4-darwin backends/liquid-audio
+.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/turboquant backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/insightface backends/speaker-recognition backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/sglang backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/acestep-cpp backends/fish-speech backends/voxtral backends/opus backends/trl backends/llama-cpp-quantization backends/kokoros backends/sam3-cpp backends/qwen3-tts-cpp backends/vibevoice-cpp backends/localvqe backends/tinygrad backends/sherpa-onnx backends/ds4 backends/ds4-darwin backends/liquid-audio

 GOCMD=go
 GOTEST=$(GOCMD) test
@@ -481,7 +481,6 @@ prepare-test-extra: protogen-python
 	$(MAKE) -C backend/python/insightface
 	$(MAKE) -C backend/python/speaker-recognition
 	$(MAKE) -C backend/rust/kokoros kokoros-grpc
-	$(MAKE) -C backend/go/rfdetr-cpp

 test-extra: prepare-test-extra
 	$(MAKE) -C backend/python/transformers test
@@ -508,7 +507,6 @@ test-extra: prepare-test-extra
 	$(MAKE) -C backend/python/insightface test
 	$(MAKE) -C backend/python/speaker-recognition test
 	$(MAKE) -C backend/rust/kokoros test
-	$(MAKE) -C backend/go/rfdetr-cpp test

 ##
 ## End-to-end gRPC tests that exercise a built backend container image.
@@ -1121,7 +1119,6 @@ BACKEND_KOKOROS = kokoros|rust|.|false|true

 # C++ backends (Go wrapper with purego)
 BACKEND_SAM3_CPP = sam3-cpp|golang|.|false|true
-BACKEND_RFDETR_CPP = rfdetr-cpp|golang|.|false|true

 # Helper function to build docker image for a backend
 # Usage: $(call docker-build-backend,BACKEND_NAME,DOCKERFILE_TYPE,BUILD_CONTEXT,PROGRESS_FLAG,NEEDS_BACKEND_ARG)
@@ -1201,14 +1198,13 @@ $(eval $(call generate-docker-build-target,$(BACKEND_LLAMA_CPP_QUANTIZATION)))
 $(eval $(call generate-docker-build-target,$(BACKEND_TINYGRAD)))
 $(eval $(call generate-docker-build-target,$(BACKEND_KOKOROS)))
 $(eval $(call generate-docker-build-target,$(BACKEND_SAM3_CPP)))
-$(eval $(call generate-docker-build-target,$(BACKEND_RFDETR_CPP)))
 $(eval $(call generate-docker-build-target,$(BACKEND_SHERPA_ONNX)))

 # Pattern rule for docker-save targets
 docker-save-%: backend-images
 	docker save local-ai-backend:$* -o backend-images/$*.tar

-docker-build-backends: docker-build-llama-cpp docker-build-ik-llama-cpp docker-build-turboquant docker-build-ds4 docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-sglang docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-liquid-audio docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-fish-speech docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-acestep-cpp docker-build-voxtral docker-build-mlx-distributed docker-build-trl docker-build-llama-cpp-quantization docker-build-tinygrad docker-build-kokoros docker-build-sam3-cpp docker-build-rfdetr-cpp docker-build-qwen3-tts-cpp docker-build-vibevoice-cpp docker-build-localvqe docker-build-insightface docker-build-speaker-recognition docker-build-sherpa-onnx docker-build-cloud-proxy
+docker-build-backends: docker-build-llama-cpp docker-build-ik-llama-cpp docker-build-turboquant docker-build-ds4 docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-sglang docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-liquid-audio docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-fish-speech docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-acestep-cpp docker-build-voxtral docker-build-mlx-distributed docker-build-trl docker-build-llama-cpp-quantization docker-build-tinygrad docker-build-kokoros docker-build-sam3-cpp docker-build-qwen3-tts-cpp docker-build-vibevoice-cpp docker-build-localvqe docker-build-insightface docker-build-speaker-recognition docker-build-sherpa-onnx docker-build-cloud-proxy

 ########################################################
 ### Mock Backend for E2E Tests
--- a/README.md
+++ b/README.md
@@ -149,10 +149,8 @@ For more details, see the [Getting Started guide](https://localai.io/basics/gett

 ## Latest News

- **May 2026**: **LocalAI 4.3.0** - `llama.cpp` [prompt cache on by default](https://github.com/mudler/LocalAI/pull/9925) (repeated system prompts collapse from minutes to seconds), [keyless cosign signing of backend OCI images](https://github.com/mudler/LocalAI/pull/9823), [per-API-key + per-user usage attribution](https://github.com/mudler/LocalAI/pull/9920), Distributed v3 with [per-request replica routing](https://github.com/mudler/LocalAI/pull/9968). [Release notes](https://github.com/mudler/LocalAI/releases/tag/v4.3.0)
- **May 2026**: **LocalAI 4.2.0** - LocalAI sees and hears: [voice recognition](https://github.com/mudler/LocalAI/pull/9500), [face recognition + antispoofing liveness](https://github.com/mudler/LocalAI/pull/9480), speaker diarization. Plus [drop-in Ollama API](https://github.com/mudler/LocalAI/pull/9284), [video generation](https://github.com/mudler/LocalAI/pull/9420), redesigned UI with i18n + admin-configurable branding, vLLM at feature parity with llama.cpp, and 11 new backends. [Release notes](https://github.com/mudler/LocalAI/releases/tag/v4.2.0)
- **April 2026**: **LocalAI 4.1.0** - LocalAI becomes a control tower: distributed cluster mode with VRAM-aware smart routing + autoscaling, multi-user platform with OIDC and API keys, per-user quotas with predictive analytics, in-UI fine-tuning with TRL (auto-export to GGUF), on-the-fly quantization backend, visual pipeline editor. [Release notes](https://github.com/mudler/LocalAI/releases/tag/v4.1.0)
- **March 2026**: **LocalAI 4.0.0** - native agentic orchestration with the new [Agenthub](https://agenthub.localai.io) community hub, full React UI rewrite with Canvas mode, [MCP Apps + client-side](https://github.com/mudler/LocalAI/pull/8947) with tool streaming, [WebRTC realtime audio](https://github.com/mudler/LocalAI/pull/8790), [MLX-distributed](https://github.com/mudler/LocalAI/pull/8801). [Release notes](https://github.com/mudler/LocalAI/releases/tag/v4.0.0)
+- **April 2026**: [Voice recognition](https://github.com/mudler/LocalAI/pull/9500), [Face recognition, identification & liveness detection](https://github.com/mudler/LocalAI/pull/9480), [Ollama API compatibility](https://github.com/mudler/LocalAI/pull/9284), [Video generation in stable-diffusion.ggml](https://github.com/mudler/LocalAI/pull/9420), [Backend versioning with auto-upgrade](https://github.com/mudler/LocalAI/pull/9315), [Pin models & load-on-demand toggle](https://github.com/mudler/LocalAI/pull/9309), [Universal model importer](https://github.com/mudler/LocalAI/pull/9466), new backends: [sglang](https://github.com/mudler/LocalAI/pull/9359), [ik-llama-cpp](https://github.com/mudler/LocalAI/pull/9326), [TurboQuant](https://github.com/mudler/LocalAI/pull/9355), [sam.cpp](https://github.com/mudler/LocalAI/pull/9288), [Kokoros](https://github.com/mudler/LocalAI/pull/9212), [qwen3tts.cpp](https://github.com/mudler/LocalAI/pull/9316), [tinygrad multimodal](https://github.com/mudler/LocalAI/pull/9364)
+- **March 2026**: [Agent management](https://github.com/mudler/LocalAI/pull/8820), [New React UI](https://github.com/mudler/LocalAI/pull/8772), [WebRTC](https://github.com/mudler/LocalAI/pull/8790), [MLX-distributed via P2P and RDMA](https://github.com/mudler/LocalAI/pull/8801), [MCP Apps, MCP Client-side](https://github.com/mudler/LocalAI/pull/8947)
 - **February 2026**: [Realtime API for audio-to-audio with tool calling](https://github.com/mudler/LocalAI/pull/6245), [ACE-Step 1.5 support](https://github.com/mudler/LocalAI/pull/8396)
 - **January 2026**: **LocalAI 3.10.0** — Anthropic API support, Open Responses API, video & image generation (LTX-2), unified GPU backends, tool streaming, Moonshine, Pocket-TTS. [Release notes](https://github.com/mudler/LocalAI/releases/tag/v3.10.0)
 - **December 2025**: [Dynamic Memory Resource reclaimer](https://github.com/mudler/LocalAI/pull/7583), [Automatic multi-GPU model fitting (llama.cpp)](https://github.com/mudler/LocalAI/pull/7584), [Vibevoice backend](https://github.com/mudler/LocalAI/pull/7494)
@@ -238,22 +236,11 @@ A huge thank you to our generous sponsors who support this project covering CI e
  <a href="https://www.spectrocloud.com/" target="blank">
    <img height="200" src="https://github.com/user-attachments/assets/72eab1dd-8b93-4fc0-9ade-84db49f24962">
  </a>
-</p>
-
-<details>
-
-<summary>
-Past sponsors
-</summary>
-
-<p align="center">
  <a href="https://www.premai.io/" target="blank">
    <img height="200" src="https://github.com/mudler/LocalAI/assets/2420543/42e4ca83-661e-4f79-8e46-ae43689683d6"> <br>
  </a>
 </p>

-</details>
-
 ### Individual sponsors

 A special thanks to individual sponsors, a full list is on [GitHub](https://github.com/sponsors/mudler) and [buymeacoffee](https://buymeacoffee.com/mudler). Special shout out to [drikster80](https://github.com/drikster80) for being generous. Thank you everyone!
--- a/backend/cpp/ds4/Makefile
+++ b/backend/cpp/ds4/Makefile
@@ -1,10 +1,10 @@
 # ds4 backend Makefile.
 #
-# Upstream pin lives below as DS4_VERSION?=ad0209f6a4b067574d2b4afe896c08c177156b31
+# Upstream pin lives below as DS4_VERSION?=f91c12b50a1448527c435c028bfc70d1b00f6c33
 # (.github/bump_deps.sh) can find and update it - matches the
 # llama-cpp / ik-llama-cpp / turboquant convention.

-DS4_VERSION?=ad0209f6a4b067574d2b4afe896c08c177156b31
+DS4_VERSION?=f91c12b50a1448527c435c028bfc70d1b00f6c33
 DS4_REPO?=https://github.com/antirez/ds4

 CURRENT_MAKEFILE_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
--- a/backend/cpp/ik-llama-cpp/Makefile
+++ b/backend/cpp/ik-llama-cpp/Makefile
@@ -1,5 +1,5 @@

-IK_LLAMA_VERSION?=b4e1d916c5ec7e75ea3c124dd090425a99fc613f
+IK_LLAMA_VERSION?=9f7ba245ab41e118f03aa8dd5134d18a81159d02
 LLAMA_REPO?=https://github.com/ikawrakow/ik_llama.cpp

 CMAKE_ARGS?=
--- a/backend/cpp/llama-cpp/Makefile
+++ b/backend/cpp/llama-cpp/Makefile
@@ -1,5 +1,5 @@

-LLAMA_VERSION?=35c9b1f39ebe5a7bb83986d64415a079218be78d
+LLAMA_VERSION?=549b9d84330c327e6791fa812a7d60c0cf63572e
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp

 CMAKE_ARGS?=
--- a/backend/cpp/llama-cpp/grpc-server.cpp
+++ b/backend/cpp/llama-cpp/grpc-server.cpp
@@ -570,11 +570,9 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
    // kv_unified=false or cache_ram_mib=0, so flipping kv_unified above is
    // what actually unlocks it.
    params.cache_idle_slots = true;
-    // checkpoint_min_step: minimum spacing between context checkpoints in
-    // tokens (0 disables the minimum). Match upstream's default (256). This
-    // field was renamed from `checkpoint_every_nt` in llama.cpp; the semantics
-    // also shifted from a fixed cadence to a minimum spacing.
-    params.checkpoint_min_step = 256;
+    // checkpoint_every_nt: create a context checkpoint every N tokens during
+    // prefill (-1 disables). Match upstream's default (8192).
+    params.checkpoint_every_nt = 8192;

     // decode options. Options are in form optname:optvale, or if booleans only optname.
    for (int i = 0; i < request->options_size(); i++) {
@@ -748,18 +746,14 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
                params.cache_idle_slots = false;
            }

-        // --- minimum context-checkpoint spacing (upstream -cms / --checkpoint-min-step) ---
-        // 0 disables the minimum-spacing gate. Old option names (`checkpoint_every_nt`,
-        // `checkpoint_every_n_tokens`) are kept as aliases for backward compatibility
-        // with existing user configs: upstream renamed the field and shifted its
-        // semantics from a fixed cadence to a minimum spacing.
-        } else if (!strcmp(optname, "checkpoint_min_step") || !strcmp(optname, "checkpoint_min_spacing") ||
-                   !strcmp(optname, "checkpoint_every_nt") || !strcmp(optname, "checkpoint_every_n_tokens")) {
+        // --- prefill checkpoint cadence (upstream -cpent / --checkpoint-every-n-tokens) ---
+        // -1 disables checkpointing during prefill.
+        } else if (!strcmp(optname, "checkpoint_every_nt") || !strcmp(optname, "checkpoint_every_n_tokens")) {
            if (optval != NULL) {
                try {
-                    params.checkpoint_min_step = std::stoi(optval_str);
+                    params.checkpoint_every_nt = std::stoi(optval_str);
                } catch (const std::exception& e) {
-                    // If conversion fails, keep default value (256)
+                    // If conversion fails, keep default value (8192)
                }
            }

--- a/backend/go/rfdetr-cpp/.gitignore
+++ b/backend/go/rfdetr-cpp/.gitignore
@@ -1,7 +0,0 @@
-sources/
-build*/
-package/
-librfdetrcpp*.so
-rfdetr-cpp
-test-models/
-test-data/
--- a/backend/go/rfdetr-cpp/CMakeLists.txt
+++ b/backend/go/rfdetr-cpp/CMakeLists.txt
@@ -1,79 +0,0 @@
-cmake_minimum_required(VERSION 3.18)
-project(librfdetrcpp LANGUAGES C CXX)
-
-set(CMAKE_POSITION_INDEPENDENT_CODE ON)
-set(CMAKE_CXX_STANDARD 17)
-set(CMAKE_CXX_STANDARD_REQUIRED ON)
-
-# Static-link ggml + rfdetr so the resulting .so has no runtime dependency on
-# extra ggml/rfdetr shared libraries — only on libc/libstdc++/libgomp, which
-# the LocalAI package step bundles into the docker image.
-set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build static libraries" FORCE)
-
-# rfdetr.cpp build switches: skip CLI/tests, keep static lib.
-set(RFDETR_BUILD_CLI OFF CACHE BOOL "Disable rfdetr CLI" FORCE)
-set(RFDETR_BUILD_TESTS OFF CACHE BOOL "Disable rfdetr tests" FORCE)
-set(RFDETR_SHARED OFF CACHE BOOL "Build rfdetr as static lib" FORCE)
-
-# rt-detr.cpp's top-level CMakeLists invokes
-# `bash ${CMAKE_SOURCE_DIR}/scripts/apply_ggml_patches.sh` to apply its
-# in-tree ggml patches before descending into the submodule. When we
-# `add_subdirectory` it from a parent project, `CMAKE_SOURCE_DIR` points
-# at *our* directory, not theirs, so the script path resolves wrong.
-#
-# Run the patches script ourselves up front (it's idempotent — re-running
-# is a no-op once patches are applied) so the rt-detr.cpp configure step
-# is essentially a no-op for the patch hook.
-set(RFDETR_CPP_SRC ${CMAKE_CURRENT_SOURCE_DIR}/sources/rt-detr.cpp)
-if(EXISTS ${RFDETR_CPP_SRC}/scripts/apply_ggml_patches.sh)
-    execute_process(
-        COMMAND bash ${RFDETR_CPP_SRC}/scripts/apply_ggml_patches.sh
-        RESULT_VARIABLE _rfdetr_patch_result
-        OUTPUT_VARIABLE _rfdetr_patch_output
-        ERROR_VARIABLE  _rfdetr_patch_error
-        OUTPUT_STRIP_TRAILING_WHITESPACE
-        ERROR_STRIP_TRAILING_WHITESPACE)
-    if(NOT _rfdetr_patch_result EQUAL 0)
-        message(FATAL_ERROR
-            "Failed to apply ggml patches (exit ${_rfdetr_patch_result}):\n"
-            "stdout:\n${_rfdetr_patch_output}\n"
-            "stderr:\n${_rfdetr_patch_error}")
-    endif()
-    message(STATUS "${_rfdetr_patch_output}")
-endif()
-
-# Stage a shim 'scripts/apply_ggml_patches.sh' under our source dir so that
-# rt-detr.cpp's CMakeLists — which calls
-#   bash ${CMAKE_SOURCE_DIR}/scripts/apply_ggml_patches.sh
-# — finds an idempotent no-op there. The real patches have already been
-# applied above; this just satisfies the path lookup.
-file(MAKE_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/scripts)
-file(WRITE ${CMAKE_CURRENT_SOURCE_DIR}/scripts/apply_ggml_patches.sh
-"#!/usr/bin/env bash
-# Shim - patches were already applied by the parent CMakeLists.
-exit 0
-")
-execute_process(COMMAND chmod +x ${CMAKE_CURRENT_SOURCE_DIR}/scripts/apply_ggml_patches.sh)
-
-add_subdirectory(./sources/rt-detr.cpp)
-
-# rfdetr.cpp's C-API symbols already live inside librfdetr (src/rfdetr_capi.cpp
-# is compiled into the lib). We re-export them via a MODULE library that
-# whole-archive-links rfdetr so the symbols are visible at dlopen time.
-add_library(rfdetrcpp MODULE
-    sources/rt-detr.cpp/src/rfdetr_capi.cpp)
-
-target_include_directories(rfdetrcpp PRIVATE
-    sources/rt-detr.cpp/include
-    sources/rt-detr.cpp/src
-    sources/rt-detr.cpp/third_party/stb
-)
-
-target_link_libraries(rfdetrcpp PRIVATE rfdetr ggml)
-
-if(CMAKE_CXX_COMPILER_ID MATCHES "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9.0)
-    target_link_libraries(rfdetrcpp PRIVATE stdc++fs)
-endif()
-
-set_property(TARGET rfdetrcpp PROPERTY CXX_STANDARD 17)
-set_target_properties(rfdetrcpp PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
--- a/backend/go/rfdetr-cpp/Makefile
+++ b/backend/go/rfdetr-cpp/Makefile
@@ -1,135 +0,0 @@
-CMAKE_ARGS?=
-BUILD_TYPE?=
-NATIVE?=false
-
-GOCMD?=go
-GO_TAGS?=
-JOBS?=$(shell nproc --ignore=1)
-
-# rt-detr.cpp (GitHub redirects the historical mudler/rt-detr.cpp to the new
-# mudler/rf-detr.cpp slug). Pin to a specific commit if you need a stable
-# build; leaving this on `master` always picks up the latest C-API surface
-# (incl. the per-detection accessor functions used by gorfdetrcpp.go).
-RFDETR_REPO?=https://github.com/mudler/rf-detr.cpp.git
-RFDETR_VERSION?=main
-
-ifeq ($(NATIVE),false)
-	CMAKE_ARGS+=-DGGML_NATIVE=OFF
-endif
-
-# Forward LocalAI's BUILD_TYPE to the matching ggml backend switch.
-ifeq ($(BUILD_TYPE),cublas)
-	CMAKE_ARGS+=-DGGML_CUDA=ON -DRFDETR_GGML_CUDA=ON
-else ifeq ($(BUILD_TYPE),openblas)
-	CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
-else ifeq ($(BUILD_TYPE),clblas)
-	CMAKE_ARGS+=-DGGML_CLBLAST=ON
-else ifeq ($(BUILD_TYPE),hipblas)
-	ROCM_HOME ?= /opt/rocm
-	ROCM_PATH ?= /opt/rocm
-	export CXX=$(ROCM_HOME)/llvm/bin/clang++
-	export CC=$(ROCM_HOME)/llvm/bin/clang
-	AMDGPU_TARGETS?=gfx908,gfx90a,gfx942,gfx950,gfx1030,gfx1100,gfx1101,gfx1102,gfx1200,gfx1201
-	CMAKE_ARGS+=-DGGML_HIPBLAS=ON -DRFDETR_GGML_HIPBLAS=ON -DAMDGPU_TARGETS=$(AMDGPU_TARGETS)
-else ifeq ($(BUILD_TYPE),vulkan)
-	CMAKE_ARGS+=-DGGML_VULKAN=ON -DRFDETR_GGML_VULKAN=ON
-else ifeq ($(OS),Darwin)
-	ifneq ($(BUILD_TYPE),metal)
-		CMAKE_ARGS+=-DGGML_METAL=OFF
-	else
-		CMAKE_ARGS+=-DGGML_METAL=ON
-		CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
-		CMAKE_ARGS+=-DRFDETR_GGML_METAL=ON
-	endif
-endif
-
-ifeq ($(BUILD_TYPE),sycl_f16)
-	CMAKE_ARGS+=-DGGML_SYCL=ON \
-		-DCMAKE_C_COMPILER=icx \
-		-DCMAKE_CXX_COMPILER=icpx \
-		-DGGML_SYCL_F16=ON
-endif
-
-ifeq ($(BUILD_TYPE),sycl_f32)
-	CMAKE_ARGS+=-DGGML_SYCL=ON \
-		-DCMAKE_C_COMPILER=icx \
-		-DCMAKE_CXX_COMPILER=icpx
-endif
-
-sources/rt-detr.cpp:
-	mkdir -p sources && \
-	git clone --recursive $(RFDETR_REPO) sources/rt-detr.cpp && \
-	cd sources/rt-detr.cpp && \
-	git checkout $(RFDETR_VERSION) && \
-	git submodule update --init --recursive --depth 1 --single-branch
-
-# Detect OS
-UNAME_S := $(shell uname -s)
-
-# Only build CPU variants on Linux
-ifeq ($(UNAME_S),Linux)
-	VARIANT_TARGETS = librfdetrcpp-avx.so librfdetrcpp-avx2.so librfdetrcpp-avx512.so librfdetrcpp-fallback.so
-else
-	# On non-Linux (e.g., Darwin), build only fallback variant
-	VARIANT_TARGETS = librfdetrcpp-fallback.so
-endif
-
-rfdetr-cpp: main.go gorfdetrcpp.go $(VARIANT_TARGETS)
-	CGO_ENABLED=0 $(GOCMD) build -tags "$(GO_TAGS)" -o rfdetr-cpp ./
-
-package: rfdetr-cpp
-	bash package.sh
-
-build: package
-
-clean: purge
-	rm -rf librfdetrcpp*.so rfdetr-cpp package sources
-
-purge:
-	rm -rf build*
-
-# Build all variants (Linux only)
-ifeq ($(UNAME_S),Linux)
-librfdetrcpp-avx.so: sources/rt-detr.cpp
-	$(MAKE) purge
-	$(info ${GREEN}I rfdetr-cpp build info:avx${RESET})
-	SO_TARGET=librfdetrcpp-avx.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) librfdetrcpp-custom
-	rm -rfv build*
-
-librfdetrcpp-avx2.so: sources/rt-detr.cpp
-	$(MAKE) purge
-	$(info ${GREEN}I rfdetr-cpp build info:avx2${RESET})
-	SO_TARGET=librfdetrcpp-avx2.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on -DGGML_BMI2=on" $(MAKE) librfdetrcpp-custom
-	rm -rfv build*
-
-librfdetrcpp-avx512.so: sources/rt-detr.cpp
-	$(MAKE) purge
-	$(info ${GREEN}I rfdetr-cpp build info:avx512${RESET})
-	SO_TARGET=librfdetrcpp-avx512.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on -DGGML_BMI2=on" $(MAKE) librfdetrcpp-custom
-	rm -rfv build*
-endif
-
-# Build fallback variant (all platforms)
-librfdetrcpp-fallback.so: sources/rt-detr.cpp
-	$(MAKE) purge
-	$(info ${GREEN}I rfdetr-cpp build info:fallback${RESET})
-	SO_TARGET=librfdetrcpp-fallback.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) librfdetrcpp-custom
-	rm -rfv build*
-
-librfdetrcpp-custom: CMakeLists.txt
-	mkdir -p build-$(SO_TARGET) && \
-	cd build-$(SO_TARGET) && \
-	cmake .. $(CMAKE_ARGS) && \
-	cmake --build . --config Release -j$(JOBS) && \
-	cd .. && \
-	mv build-$(SO_TARGET)/librfdetrcpp.so ./$(SO_TARGET)
-
-all: rfdetr-cpp package
-
-# `test` is invoked by the top-level Makefile's `test-extra` target. It builds
-# the backend binary and then runs the lightweight setup in test.sh (which
-# downloads a small GGUF for smoke validation and synthesises a test image).
-# Full end-to-end RPC exercise lives in tests/e2e-backends and runs against
-# the built image via `make test-extra-backend`.
-test: rfdetr-cpp
-	bash test.sh
--- a/backend/go/rfdetr-cpp/gorfdetrcpp.go
+++ b/backend/go/rfdetr-cpp/gorfdetrcpp.go
@@ -1,195 +0,0 @@
-package main
-
-// gorfdetrcpp.go - gRPC handlers (Load, Detect) for the rfdetr-cpp backend.
-//
-// Embeds base.SingleThread to default unimplemented RPCs to "not supported"
-// while we only implement object detection.
-
-import (
-	"encoding/base64"
-	"fmt"
-	"os"
-	"path/filepath"
-	"strconv"
-	"unsafe"
-
-	"github.com/mudler/LocalAI/pkg/grpc/base"
-	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
-)
-
-// Default upper bound on detections returned per image. RF-DETR's decoder
-// queries are limited to a few hundred; 300 is a safe ceiling.
-const defaultTopK = 300
-
-// rfdetr_handle_t is a uintptr-typed opaque handle (see include/rfdetr_capi.h).
-var (
-	// rfdetr_capi_load(const char* model_path, int n_threads, rfdetr_handle_t* out_handle) -> int
-	CapiLoad func(modelPath string, nThreads int32, outHandle *uintptr) int32
-	// rfdetr_capi_unload(rfdetr_handle_t handle) -> int
-	CapiUnload func(handle uintptr) int32
-	// rfdetr_capi_detect_path(handle, image_path, threshold, top_k, out_json) -> int
-	CapiDetectPath func(handle uintptr, imagePath string, threshold float32, topK uint32, outJSON *uintptr) int32
-	// rfdetr_capi_detect_buffer(handle, bytes, len, threshold, top_k, out_json) -> int
-	CapiDetectBuffer func(handle uintptr, bytes uintptr, length uintptr, threshold float32, topK uint32, outJSON *uintptr) int32
-	// rfdetr_capi_free_string(char* s)
-	CapiFreeString func(s uintptr)
-	// rfdetr_capi_get_n_detections(handle) -> int
-	CapiGetNDetections func(handle uintptr) int32
-	// rfdetr_capi_get_detection_class_id(handle, i) -> int
-	CapiGetDetectionClassID func(handle uintptr, i int32) int32
-	// rfdetr_capi_get_detection_box(handle, i, out_xyxy[4]) -> int (0 on success)
-	CapiGetDetectionBox func(handle uintptr, i int32, outXYXY uintptr) int32
-	// rfdetr_capi_get_detection_score(handle, i) -> float
-	CapiGetDetectionScore func(handle uintptr, i int32) float32
-	// rfdetr_capi_get_detection_class_name(handle, i, buf, buf_size) -> int (needed/written; two-call sizing)
-	CapiGetDetectionClassName func(handle uintptr, i int32, buf uintptr, bufSize int32) int32
-	// rfdetr_capi_get_detection_mask_png(handle, i, buf, buf_size) -> int (needed/written; 0 means no mask)
-	CapiGetDetectionMaskPNG func(handle uintptr, i int32, buf uintptr, bufSize int32) int32
-)
-
-type RFDetrCpp struct {
-	base.SingleThread
-	handle uintptr
-}
-
-// Load loads the GGUF model at opts.ModelFile (joined with opts.ModelPath if relative)
-// and stores the handle for later Detect calls.
-func (r *RFDetrCpp) Load(opts *pb.ModelOptions) error {
-	modelFile := opts.ModelFile
-	if modelFile == "" {
-		modelFile = opts.Model
-	}
-	if modelFile == "" {
-		return fmt.Errorf("rfdetr-cpp: ModelFile is empty")
-	}
-
-	var modelPath string
-	if filepath.IsAbs(modelFile) {
-		modelPath = modelFile
-	} else {
-		modelPath = filepath.Join(opts.ModelPath, modelFile)
-	}
-
-	if _, err := os.Stat(modelPath); err != nil {
-		return fmt.Errorf("rfdetr-cpp: model file not found: %s: %w", modelPath, err)
-	}
-
-	threads := opts.Threads
-	if threads <= 0 {
-		threads = 4
-	}
-
-	// Release previous model if any (re-Load).
-	if r.handle != 0 {
-		CapiUnload(r.handle)
-		r.handle = 0
-	}
-
-	var h uintptr
-	rc := CapiLoad(modelPath, threads, &h)
-	if rc != 0 || h == 0 {
-		return fmt.Errorf("rfdetr-cpp: rfdetr_capi_load failed with rc=%d for %s", rc, modelPath)
-	}
-	r.handle = h
-	return nil
-}
-
-// Detect runs object detection on the base64-encoded image in opts.Src at
-// opts.Threshold, returning one pb.Detection per result. Seg models also
-// populate Detection.Mask with PNG-encoded mask bytes.
-func (r *RFDetrCpp) Detect(opts *pb.DetectOptions) (pb.DetectResponse, error) {
-	if r.handle == 0 {
-		return pb.DetectResponse{}, fmt.Errorf("rfdetr-cpp: model not loaded")
-	}
-
-	// Decode base64 image and write to temp file.
-	imgData, err := base64.StdEncoding.DecodeString(opts.Src)
-	if err != nil {
-		return pb.DetectResponse{}, fmt.Errorf("rfdetr-cpp: failed to decode base64 image: %w", err)
-	}
-
-	tmpFile, err := os.CreateTemp("", "rfdetr-*.img")
-	if err != nil {
-		return pb.DetectResponse{}, fmt.Errorf("rfdetr-cpp: failed to create temp file: %w", err)
-	}
-	defer func() { _ = os.Remove(tmpFile.Name()) }()
-
-	if _, err := tmpFile.Write(imgData); err != nil {
-		_ = tmpFile.Close()
-		return pb.DetectResponse{}, fmt.Errorf("rfdetr-cpp: failed to write temp file: %w", err)
-	}
-	if err := tmpFile.Close(); err != nil {
-		return pb.DetectResponse{}, fmt.Errorf("rfdetr-cpp: failed to close temp file: %w", err)
-	}
-
-	threshold := opts.Threshold
-	if threshold <= 0 {
-		threshold = 0.5
-	}
-
-	// JSON output from detect_path is unused: we read structured detections via
-	// the accessor functions. Still must free the returned string.
-	var jsonPtr uintptr
-	rc := CapiDetectPath(r.handle, tmpFile.Name(), threshold, uint32(defaultTopK), &jsonPtr)
-	if jsonPtr != 0 {
-		CapiFreeString(jsonPtr)
-	}
-	if rc != 0 {
-		return pb.DetectResponse{}, fmt.Errorf("rfdetr-cpp: detect failed with rc=%d", rc)
-	}
-
-	n := CapiGetNDetections(r.handle)
-	if n < 0 {
-		return pb.DetectResponse{}, fmt.Errorf("rfdetr-cpp: invalid n_detections=%d", n)
-	}
-
-	detections := make([]*pb.Detection, 0, n)
-	for i := int32(0); i < n; i++ {
-		var bbox [4]float32 // x1, y1, x2, y2
-		if rc := CapiGetDetectionBox(r.handle, i, uintptr(unsafe.Pointer(&bbox[0]))); rc != 0 {
-			continue
-		}
-		cid := CapiGetDetectionClassID(r.handle, i)
-		score := CapiGetDetectionScore(r.handle, i)
-
-		// Two-call sizing for class_name.
-		var className string
-		nameSize := CapiGetDetectionClassName(r.handle, i, 0, 0)
-		if nameSize > 1 {
-			buf := make([]byte, nameSize)
-			written := CapiGetDetectionClassName(r.handle, i, uintptr(unsafe.Pointer(&buf[0])), nameSize)
-			// `written` is the same number (needed bytes including NUL); strip NUL.
-			if written > 0 && int(written) <= len(buf) {
-				className = string(buf[:written-1])
-			} else {
-				className = string(buf[:len(buf)-1])
-			}
-		}
-		if className == "" {
-			className = strconv.Itoa(int(cid))
-		}
-
-		// Two-call sizing for mask PNG (returns 0 when no mask).
-		var mask []byte
-		maskSize := CapiGetDetectionMaskPNG(r.handle, i, 0, 0)
-		if maskSize > 0 {
-			maskBuf := make([]byte, maskSize)
-			CapiGetDetectionMaskPNG(r.handle, i, uintptr(unsafe.Pointer(&maskBuf[0])), maskSize)
-			mask = maskBuf
-		}
-
-		detections = append(detections, &pb.Detection{
-			X:          bbox[0],
-			Y:          bbox[1],
-			Width:      bbox[2] - bbox[0],
-			Height:     bbox[3] - bbox[1],
-			Confidence: score,
-			ClassName:  className,
-			Mask:       mask,
-		})
-	}
-
-	return pb.DetectResponse{
-		Detections: detections,
-	}, nil
-}
--- a/backend/go/rfdetr-cpp/main.go
+++ b/backend/go/rfdetr-cpp/main.go
@@ -1,61 +0,0 @@
-package main
-
-// main.go - entry point for the rfdetr-cpp gRPC backend.
-//
-// Dlopens librfdetrcpp-<variant>.so via purego at the path in
-// RFDETR_LIBRARY (set by run.sh based on /proc/cpuinfo), registers the
-// rfdetr_capi_* C ABI symbols, then starts the gRPC server.
-
-import (
-	"flag"
-	"os"
-
-	"github.com/ebitengine/purego"
-	grpc "github.com/mudler/LocalAI/pkg/grpc"
-)
-
-var (
-	addr = flag.String("addr", "localhost:50051", "the address to connect to")
-)
-
-type LibFuncs struct {
-	FuncPtr any
-	Name    string
-}
-
-func main() {
-	// Get library name from environment variable, default to fallback
-	libName := os.Getenv("RFDETR_LIBRARY")
-	if libName == "" {
-		libName = "./librfdetrcpp-fallback.so"
-	}
-
-	rfdetrLib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
-	if err != nil {
-		panic(err)
-	}
-
-	libFuncs := []LibFuncs{
-		{&CapiLoad, "rfdetr_capi_load"},
-		{&CapiUnload, "rfdetr_capi_unload"},
-		{&CapiDetectPath, "rfdetr_capi_detect_path"},
-		{&CapiDetectBuffer, "rfdetr_capi_detect_buffer"},
-		{&CapiFreeString, "rfdetr_capi_free_string"},
-		{&CapiGetNDetections, "rfdetr_capi_get_n_detections"},
-		{&CapiGetDetectionClassID, "rfdetr_capi_get_detection_class_id"},
-		{&CapiGetDetectionBox, "rfdetr_capi_get_detection_box"},
-		{&CapiGetDetectionScore, "rfdetr_capi_get_detection_score"},
-		{&CapiGetDetectionClassName, "rfdetr_capi_get_detection_class_name"},
-		{&CapiGetDetectionMaskPNG, "rfdetr_capi_get_detection_mask_png"},
-	}
-
-	for _, lf := range libFuncs {
-		purego.RegisterLibFunc(lf.FuncPtr, rfdetrLib, lf.Name)
-	}
-
-	flag.Parse()
-
-	if err := grpc.StartServer(*addr, &RFDetrCpp{}); err != nil {
-		panic(err)
-	}
-}
--- a/backend/go/rfdetr-cpp/package.sh
+++ b/backend/go/rfdetr-cpp/package.sh
@@ -1,59 +0,0 @@
-#!/bin/bash
-
-# Script to copy the appropriate libraries based on architecture
-
-set -e
-
-CURDIR=$(dirname "$(realpath $0)")
-REPO_ROOT="${CURDIR}/../../.."
-
-# Create lib directory
-mkdir -p $CURDIR/package/lib
-
-cp -avf $CURDIR/librfdetrcpp-*.so $CURDIR/package/
-cp -avf $CURDIR/rfdetr-cpp $CURDIR/package/
-cp -fv $CURDIR/run.sh $CURDIR/package/
-
-# Detect architecture and copy appropriate libraries
-if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
-    # x86_64 architecture
-    echo "Detected x86_64 architecture, copying x86_64 libraries..."
-    cp -arfLv /lib64/ld-linux-x86-64.so.2 $CURDIR/package/lib/ld.so
-    cp -arfLv /lib/x86_64-linux-gnu/libc.so.6 $CURDIR/package/lib/libc.so.6
-    cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1
-    cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6
-    cp -arfLv /lib/x86_64-linux-gnu/libm.so.6 $CURDIR/package/lib/libm.so.6
-    cp -arfLv /lib/x86_64-linux-gnu/libgomp.so.1 $CURDIR/package/lib/libgomp.so.1
-    cp -arfLv /lib/x86_64-linux-gnu/libdl.so.2 $CURDIR/package/lib/libdl.so.2
-    cp -arfLv /lib/x86_64-linux-gnu/librt.so.1 $CURDIR/package/lib/librt.so.1
-    cp -arfLv /lib/x86_64-linux-gnu/libpthread.so.0 $CURDIR/package/lib/libpthread.so.0
-elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
-    # ARM64 architecture
-    echo "Detected ARM64 architecture, copying ARM64 libraries..."
-    cp -arfLv /lib/ld-linux-aarch64.so.1 $CURDIR/package/lib/ld.so
-    cp -arfLv /lib/aarch64-linux-gnu/libc.so.6 $CURDIR/package/lib/libc.so.6
-    cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1
-    cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6
-    cp -arfLv /lib/aarch64-linux-gnu/libm.so.6 $CURDIR/package/lib/libm.so.6
-    cp -arfLv /lib/aarch64-linux-gnu/libgomp.so.1 $CURDIR/package/lib/libgomp.so.1
-    cp -arfLv /lib/aarch64-linux-gnu/libdl.so.2 $CURDIR/package/lib/libdl.so.2
-    cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 $CURDIR/package/lib/librt.so.1
-    cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 $CURDIR/package/lib/libpthread.so.0
-elif [ $(uname -s) = "Darwin" ]; then
-    echo "Detected Darwin"
-else
-    echo "Error: Could not detect architecture"
-    exit 1
-fi
-
-# Package GPU libraries based on BUILD_TYPE
-GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
-if [ -f "$GPU_LIB_SCRIPT" ]; then
-    echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
-    source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib"
-    package_gpu_libs
-fi
-
-echo "Packaging completed successfully"
-ls -liah $CURDIR/package/
-ls -liah $CURDIR/package/lib/
--- a/backend/go/rfdetr-cpp/run.sh
+++ b/backend/go/rfdetr-cpp/run.sh
@@ -1,52 +0,0 @@
-#!/bin/bash
-set -ex
-
-# Get the absolute current dir where the script is located
-CURDIR=$(dirname "$(realpath $0)")
-
-cd /
-
-echo "CPU info:"
-if [ "$(uname)" != "Darwin" ]; then
-	grep -e "model\sname" /proc/cpuinfo | head -1
-	grep -e "flags" /proc/cpuinfo | head -1
-fi
-
-LIBRARY="$CURDIR/librfdetrcpp-fallback.so"
-
-if [ "$(uname)" != "Darwin" ]; then
-	if grep -q -e "\savx\s" /proc/cpuinfo ; then
-		echo "CPU:    AVX    found OK"
-		if [ -e $CURDIR/librfdetrcpp-avx.so ]; then
-			LIBRARY="$CURDIR/librfdetrcpp-avx.so"
-		fi
-	fi
-
-	if grep -q -e "\savx2\s" /proc/cpuinfo ; then
-		echo "CPU:    AVX2   found OK"
-		if [ -e $CURDIR/librfdetrcpp-avx2.so ]; then
-			LIBRARY="$CURDIR/librfdetrcpp-avx2.so"
-		fi
-	fi
-
-	# Check avx 512
-	if grep -q -e "\savx512f\s" /proc/cpuinfo ; then
-		echo "CPU:    AVX512F found OK"
-		if [ -e $CURDIR/librfdetrcpp-avx512.so ]; then
-			LIBRARY="$CURDIR/librfdetrcpp-avx512.so"
-		fi
-	fi
-fi
-
-export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
-export RFDETR_LIBRARY=$LIBRARY
-
-# If there is a lib/ld.so, use it
-if [ -f $CURDIR/lib/ld.so ]; then
-	echo "Using lib/ld.so"
-	echo "Using library: $LIBRARY"
-	exec $CURDIR/lib/ld.so $CURDIR/rfdetr-cpp "$@"
-fi
-
-echo "Using library: $LIBRARY"
-exec $CURDIR/rfdetr-cpp "$@"
--- a/backend/go/rfdetr-cpp/test.sh
+++ b/backend/go/rfdetr-cpp/test.sh
@@ -1,51 +0,0 @@
-#!/bin/bash
-set -e
-
-CURDIR=$(dirname "$(realpath $0)")
-
-echo "Running rfdetr-cpp backend tests..."
-
-# The test requires an RF-DETR model in GGUF format from one of the
-# mudler/rfdetr-cpp-* HuggingFace repos. Defaults to the nano-q8_0 variant
-# (~20MB) for fast CI testing.
-RFDETR_MODEL_DIR="${RFDETR_MODEL_DIR:-$CURDIR/test-models}"
-RFDETR_MODEL_FILE="${RFDETR_MODEL_FILE:-rfdetr-nano-q8_0.gguf}"
-RFDETR_MODEL_URL="${RFDETR_MODEL_URL:-https://huggingface.co/mudler/rfdetr-cpp-nano/resolve/main/rfdetr-nano-q8_0.gguf}"
-
-# Download model if not present
-if [ ! -f "$RFDETR_MODEL_DIR/$RFDETR_MODEL_FILE" ]; then
-    echo "Downloading rfdetr nano-q8_0 model for testing..."
-    mkdir -p "$RFDETR_MODEL_DIR"
-    curl -L -o "$RFDETR_MODEL_DIR/$RFDETR_MODEL_FILE" "$RFDETR_MODEL_URL" --progress-bar
-    echo "Model downloaded."
-fi
-
-# Create a test image (64x64 red pixel PNG) using Python if available.
-# This exercises the wire protocol; meaningful detection results require a
-# realistic image which the integration test in test-extra provides.
-TEST_IMAGE_DIR="$CURDIR/test-data"
-mkdir -p "$TEST_IMAGE_DIR"
-
-if command -v python3 &> /dev/null; then
-    python3 -c "
-import struct, zlib
-def create_png(width, height, r, g, b):
-    raw = b''
-    for y in range(height):
-        raw += b'\x00'  # filter byte
-        for x in range(width):
-            raw += bytes([r, g, b])
-    def chunk(ctype, data):
-        c = ctype + data
-        return struct.pack('>I', len(data)) + c + struct.pack('>I', zlib.crc32(c) & 0xffffffff)
-    ihdr = struct.pack('>IIBBBBB', width, height, 8, 2, 0, 0, 0)
-    return b'\x89PNG\r\n\x1a\n' + chunk(b'IHDR', ihdr) + chunk(b'IDAT', zlib.compress(raw)) + chunk(b'IEND', b'')
-with open('$TEST_IMAGE_DIR/test.png', 'wb') as f:
-    f.write(create_png(64, 64, 255, 0, 0))
-"
-    echo "Test image created."
-fi
-
-echo "rfdetr-cpp test setup complete."
-echo "Model: $RFDETR_MODEL_DIR/$RFDETR_MODEL_FILE"
-echo "Note: Full integration tests run via the LocalAI test-extra target."
--- a/backend/go/stablediffusion-ggml/Makefile
+++ b/backend/go/stablediffusion-ggml/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # stablediffusion.cpp (ggml)
 STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
-STABLEDIFFUSION_GGML_VERSION?=1ceb5bd9df7784bcdf67dd9ed8bf0198b542ebc9
+STABLEDIFFUSION_GGML_VERSION?=a397e03488cc27e1a42da646b82dfce9f50741c0

 CMAKE_ARGS+=-DGGML_MAX_NAME=128

--- a/backend/go/stablediffusion-ggml/cpp/gosd.cpp
+++ b/backend/go/stablediffusion-ggml/cpp/gosd.cpp
@@ -27,7 +27,6 @@
 #include <stdlib.h>
 #include <regex>
 #include <errno.h>
-#include <inttypes.h>
 #include <signal.h>
 #include <unistd.h>
 #include <sys/wait.h>
@@ -1076,71 +1075,9 @@ static uint8_t* load_and_resize_image(const char* path, int target_width, int ta
    return buf;
 }

-// Write sd.cpp's audio buffer to a temp WAV file (IEEE float, interleaved).
-// sd_audio_t.data is planar (all channel 0 samples, then channel 1, etc.) — we
-// interleave on the fly so ffmpeg's standard wav demuxer can read it directly.
-// Returns 0 on success and fills wav_path (must be at least 64 bytes).
-static int write_planar_float_wav(const sd_audio_t* a, char* wav_path, size_t wav_path_sz) {
-    if (!a || !a->data || a->sample_count == 0 || a->channels == 0 || a->sample_rate == 0) {
-        return -1;
-    }
-
-    snprintf(wav_path, wav_path_sz, "/tmp/gosd-audio-XXXXXX.wav");
-    int fd = mkstemps(wav_path, 4);
-    if (fd < 0) { perror("mkstemps wav"); return -1; }
-    FILE* f = fdopen(fd, "wb");
-    if (!f) { perror("fdopen wav"); close(fd); return -1; }
-
-    uint64_t frames = a->sample_count;
-    uint32_t channels = a->channels;
-    uint32_t sample_rate = a->sample_rate;
-    uint64_t total_samples64 = frames * (uint64_t)channels;
-    uint64_t data_bytes64 = total_samples64 * sizeof(float);
-    if (data_bytes64 > 0xFFFFFFFFull - 44) {
-        fprintf(stderr, "audio too large for 32-bit WAV (%" PRIu64 " bytes)\n", data_bytes64);
-        fclose(f);
-        unlink(wav_path);
-        return -1;
-    }
-    uint32_t data_bytes = (uint32_t)data_bytes64;
-    uint32_t riff_size = 36 + data_bytes;
-    uint16_t fmt_code = 3;                // WAVE_FORMAT_IEEE_FLOAT
-    uint16_t bits_per_sample = 32;
-    uint16_t block_align = (uint16_t)(channels * sizeof(float));
-    uint32_t byte_rate = sample_rate * block_align;
-    uint16_t ch16 = (uint16_t)channels;
-    uint32_t fmt_size = 16;
-
-    fwrite("RIFF", 1, 4, f);
-    fwrite(&riff_size, 4, 1, f);
-    fwrite("WAVEfmt ", 1, 8, f);
-    fwrite(&fmt_size, 4, 1, f);
-    fwrite(&fmt_code, 2, 1, f);
-    fwrite(&ch16, 2, 1, f);
-    fwrite(&sample_rate, 4, 1, f);
-    fwrite(&byte_rate, 4, 1, f);
-    fwrite(&block_align, 2, 1, f);
-    fwrite(&bits_per_sample, 2, 1, f);
-    fwrite("data", 1, 4, f);
-    fwrite(&data_bytes, 4, 1, f);
-
-    // Interleave planar [ch0_samples..., ch1_samples...] → [ch0_s0, ch1_s0, ...]
-    for (uint64_t s = 0; s < frames; s++) {
-        for (uint32_t c = 0; c < channels; c++) {
-            float v = a->data[(size_t)c * frames + s];
-            fwrite(&v, sizeof(float), 1, f);
-        }
-    }
-    fclose(f);
-    return 0;
-}
-
 // Pipe raw RGB/RGBA frames to ffmpeg stdin and let it produce an MP4 at dst.
-// Uses fork+execvp to avoid shell interpretation of dst. When `audio` is
-// non-null, the audio waveform is staged to a temp WAV and added as a second
-// ffmpeg input so the final MP4 contains both video and AAC audio.
-static int ffmpeg_mux_raw_to_mp4(sd_image_t* frames, int num_frames, int fps,
-                                  const sd_audio_t* audio, const char* dst) {
+// Uses fork+execvp to avoid shell interpretation of dst.
+static int ffmpeg_mux_raw_to_mp4(sd_image_t* frames, int num_frames, int fps, const char* dst) {
    if (num_frames <= 0 || !frames || !frames[0].data) {
        fprintf(stderr, "ffmpeg_mux: empty frames\n");
        return 1;
@@ -1155,87 +1092,38 @@ static int ffmpeg_mux_raw_to_mp4(sd_image_t* frames, int num_frames, int fps,
    snprintf(size_str, sizeof(size_str), "%dx%d", width, height);
    snprintf(fps_str, sizeof(fps_str), "%d", fps);

-    // Optional audio: write a temp WAV file if the model produced audio.
-    char wav_path[64] = {0};
-    bool have_audio = false;
-    if (audio && audio->data && audio->sample_count > 0 && audio->channels > 0 && audio->sample_rate > 0) {
-        if (write_planar_float_wav(audio, wav_path, sizeof(wav_path)) == 0) {
-            have_audio = true;
-            fprintf(stderr, "ffmpeg_mux: audio %u Hz × %u ch × %" PRIu64 " frames → %s\n",
-                    audio->sample_rate, audio->channels, audio->sample_count, wav_path);
-        } else {
-            fprintf(stderr, "ffmpeg_mux: failed to stage audio; producing silent video\n");
-        }
-    }
-
    int pipefd[2];
-    if (pipe(pipefd) != 0) {
-        perror("pipe");
-        if (have_audio) unlink(wav_path);
-        return 1;
-    }
+    if (pipe(pipefd) != 0) { perror("pipe"); return 1; }

    pid_t pid = fork();
-    if (pid < 0) {
-        perror("fork");
-        close(pipefd[0]); close(pipefd[1]);
-        if (have_audio) unlink(wav_path);
-        return 1;
-    }
+    if (pid < 0) { perror("fork"); close(pipefd[0]); close(pipefd[1]); return 1; }

    if (pid == 0) {
        // child
        close(pipefd[1]);
        if (dup2(pipefd[0], STDIN_FILENO) < 0) { perror("dup2"); _exit(127); }
        close(pipefd[0]);
-        std::vector<char*> argv;
-        argv.push_back(const_cast<char*>("ffmpeg"));
-        argv.push_back(const_cast<char*>("-y"));
-        argv.push_back(const_cast<char*>("-hide_banner"));
-        argv.push_back(const_cast<char*>("-loglevel"));
-        argv.push_back(const_cast<char*>("warning"));
-        // Input 0: raw video from stdin
-        argv.push_back(const_cast<char*>("-f"));
-        argv.push_back(const_cast<char*>("rawvideo"));
-        argv.push_back(const_cast<char*>("-pix_fmt"));
-        argv.push_back(const_cast<char*>(pix_fmt_in));
-        argv.push_back(const_cast<char*>("-s"));
-        argv.push_back(size_str);
-        argv.push_back(const_cast<char*>("-framerate"));
-        argv.push_back(fps_str);
-        argv.push_back(const_cast<char*>("-i"));
-        argv.push_back(const_cast<char*>("-"));
-        // Input 1: optional audio WAV
-        if (have_audio) {
-            argv.push_back(const_cast<char*>("-i"));
-            argv.push_back(wav_path);
-            argv.push_back(const_cast<char*>("-map"));
-            argv.push_back(const_cast<char*>("0:v:0"));
-            argv.push_back(const_cast<char*>("-map"));
-            argv.push_back(const_cast<char*>("1:a:0"));
-            argv.push_back(const_cast<char*>("-c:a"));
-            argv.push_back(const_cast<char*>("aac"));
-            argv.push_back(const_cast<char*>("-b:a"));
-            argv.push_back(const_cast<char*>("192k"));
-            // -shortest so the final clip ends with the shorter of the two
-            // streams — guards against an audio buffer that overshoots the
-            // video duration (or vice versa) on certain LTX variants.
-            argv.push_back(const_cast<char*>("-shortest"));
-        }
-        argv.push_back(const_cast<char*>("-c:v"));
-        argv.push_back(const_cast<char*>("libx264"));
-        argv.push_back(const_cast<char*>("-pix_fmt"));
-        argv.push_back(const_cast<char*>("yuv420p"));
-        argv.push_back(const_cast<char*>("-movflags"));
-        argv.push_back(const_cast<char*>("+faststart"));
-        // Force MP4 container. Distributed LocalAI hands us a staging
-        // path (e.g. /staging/localai-output-NNN.tmp) with a non-standard
-        // extension; relying on filename suffix makes ffmpeg bail with
-        // "Unable to choose an output format".
-        argv.push_back(const_cast<char*>("-f"));
-        argv.push_back(const_cast<char*>("mp4"));
-        argv.push_back(const_cast<char*>(dst));
-        argv.push_back(nullptr);
+        std::vector<char*> argv = {
+            const_cast<char*>("ffmpeg"),
+            const_cast<char*>("-y"),
+            const_cast<char*>("-hide_banner"),
+            const_cast<char*>("-loglevel"), const_cast<char*>("warning"),
+            const_cast<char*>("-f"), const_cast<char*>("rawvideo"),
+            const_cast<char*>("-pix_fmt"), const_cast<char*>(pix_fmt_in),
+            const_cast<char*>("-s"), size_str,
+            const_cast<char*>("-framerate"), fps_str,
+            const_cast<char*>("-i"), const_cast<char*>("-"),
+            const_cast<char*>("-c:v"), const_cast<char*>("libx264"),
+            const_cast<char*>("-pix_fmt"), const_cast<char*>("yuv420p"),
+            const_cast<char*>("-movflags"), const_cast<char*>("+faststart"),
+            // Force MP4 container. Distributed LocalAI hands us a staging
+            // path (e.g. /staging/localai-output-NNN.tmp) with a non-standard
+            // extension; relying on filename suffix makes ffmpeg bail with
+            // "Unable to choose an output format".
+            const_cast<char*>("-f"), const_cast<char*>("mp4"),
+            const_cast<char*>(dst),
+            nullptr
+        };
        execvp(argv[0], argv.data());
        perror("execvp ffmpeg");
        _exit(127);
@@ -1260,7 +1148,6 @@ static int ffmpeg_mux_raw_to_mp4(sd_image_t* frames, int num_frames, int fps,
                close(pipefd[1]);
                int status;
                waitpid(pid, &status, 0);
-                if (have_audio) unlink(wav_path);
                return 1;
            }
            p += n;
@@ -1271,13 +1158,8 @@ static int ffmpeg_mux_raw_to_mp4(sd_image_t* frames, int num_frames, int fps,

    int status = 0;
    while (waitpid(pid, &status, 0) < 0) {
-        if (errno != EINTR) {
-            perror("waitpid");
-            if (have_audio) unlink(wav_path);
-            return 1;
-        }
+        if (errno != EINTR) { perror("waitpid"); return 1; }
    }
-    if (have_audio) unlink(wav_path);
    if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
        fprintf(stderr, "ffmpeg exited with status %d\n", status);
        return 1;
@@ -1352,7 +1234,7 @@ int gen_video(sd_vid_gen_params_t *p, int steps, char *dst, float cfg_scale, int

    fprintf(stderr, "Generated %d frames, muxing to %s via ffmpeg\n", num_frames_out, dst);

-    int rc = ffmpeg_mux_raw_to_mp4(frames, num_frames_out, fps, audio, dst);
+    int rc = ffmpeg_mux_raw_to_mp4(frames, num_frames_out, fps, dst);

    for (int i = 0; i < num_frames_out; i++) {
        if (frames[i].data) free(frames[i].data);
--- a/backend/go/whisper/Makefile
+++ b/backend/go/whisper/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp
-WHISPER_CPP_VERSION?=e0fd1f6787a5bd4a4957dd97c5b64df882ee7b0c
+WHISPER_CPP_VERSION?=0ccd896f5b882628e1c077f9769735ef4ce52860
 SO_TARGET?=libgowhisper.so

 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
--- a/backend/index.yaml
+++ b/backend/index.yaml
@@ -253,34 +253,6 @@
    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-sam3-cpp"
    intel: "intel-sycl-f32-sam3-cpp"
    vulkan: "vulkan-sam3-cpp"
- &rfdetrcpp
-  name: "rfdetr-cpp"
-  alias: "rfdetr-cpp"
-  license: apache-2.0
-  description: |
-    Native RF-DETR object detection and instance segmentation in C/C++
-    using GGML. Loads pre-built GGUF weights from the mudler/rfdetr-cpp-*
-    family (Nano/Small/Base/Medium/Large + SegNano/SegSmall/SegMedium)
-    and returns bounding boxes, class labels, confidence scores, and
-    (for segmentation variants) PNG-encoded per-detection masks.
-  urls:
-    - https://github.com/mudler/rf-detr.cpp
-  tags:
-    - object-detection
-    - image-segmentation
-    - rfdetr
-    - gpu
-    - cpu
-  capabilities:
-    default: "cpu-rfdetr-cpp"
-    nvidia: "cuda12-rfdetr-cpp"
-    nvidia-cuda-12: "cuda12-rfdetr-cpp"
-    nvidia-cuda-13: "cuda13-rfdetr-cpp"
-    nvidia-l4t: "nvidia-l4t-arm64-rfdetr-cpp"
-    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-rfdetr-cpp"
-    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-rfdetr-cpp"
-    intel: "intel-sycl-f32-rfdetr-cpp"
-    vulkan: "vulkan-rfdetr-cpp"
 - &vllm
  name: "vllm"
  license: apache-2.0
@@ -2377,99 +2349,6 @@
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-vulkan-sam3-cpp"
  mirrors:
    - localai/localai-backends:master-gpu-vulkan-sam3-cpp
-## rfdetr-cpp
- !!merge <<: *rfdetrcpp
-  name: "rfdetr-cpp-development"
-  capabilities:
-    default: "cpu-rfdetr-cpp-development"
-    nvidia: "cuda12-rfdetr-cpp-development"
-    nvidia-cuda-12: "cuda12-rfdetr-cpp-development"
-    nvidia-cuda-13: "cuda13-rfdetr-cpp-development"
-    nvidia-l4t: "nvidia-l4t-arm64-rfdetr-cpp-development"
-    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-rfdetr-cpp-development"
-    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-rfdetr-cpp-development"
-    intel: "intel-sycl-f32-rfdetr-cpp-development"
-    vulkan: "vulkan-rfdetr-cpp-development"
- !!merge <<: *rfdetrcpp
-  name: "cpu-rfdetr-cpp"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-rfdetr-cpp"
-  mirrors:
-    - localai/localai-backends:latest-cpu-rfdetr-cpp
- !!merge <<: *rfdetrcpp
-  name: "cpu-rfdetr-cpp-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-cpu-rfdetr-cpp"
-  mirrors:
-    - localai/localai-backends:master-cpu-rfdetr-cpp
- !!merge <<: *rfdetrcpp
-  name: "cuda12-rfdetr-cpp"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-rfdetr-cpp"
-  mirrors:
-    - localai/localai-backends:latest-gpu-nvidia-cuda-12-rfdetr-cpp
- !!merge <<: *rfdetrcpp
-  name: "cuda12-rfdetr-cpp-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-rfdetr-cpp"
-  mirrors:
-    - localai/localai-backends:master-gpu-nvidia-cuda-12-rfdetr-cpp
- !!merge <<: *rfdetrcpp
-  name: "cuda13-rfdetr-cpp"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-rfdetr-cpp"
-  mirrors:
-    - localai/localai-backends:latest-gpu-nvidia-cuda-13-rfdetr-cpp
- !!merge <<: *rfdetrcpp
-  name: "cuda13-rfdetr-cpp-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-rfdetr-cpp"
-  mirrors:
-    - localai/localai-backends:master-gpu-nvidia-cuda-13-rfdetr-cpp
- !!merge <<: *rfdetrcpp
-  name: "nvidia-l4t-arm64-rfdetr-cpp"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-arm64-rfdetr-cpp"
-  mirrors:
-    - localai/localai-backends:latest-nvidia-l4t-arm64-rfdetr-cpp
- !!merge <<: *rfdetrcpp
-  name: "nvidia-l4t-arm64-rfdetr-cpp-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-arm64-rfdetr-cpp"
-  mirrors:
-    - localai/localai-backends:master-nvidia-l4t-arm64-rfdetr-cpp
- !!merge <<: *rfdetrcpp
-  name: "cuda13-nvidia-l4t-arm64-rfdetr-cpp"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-rfdetr-cpp"
-  mirrors:
-    - localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-rfdetr-cpp
- !!merge <<: *rfdetrcpp
-  name: "cuda13-nvidia-l4t-arm64-rfdetr-cpp-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-rfdetr-cpp"
-  mirrors:
-    - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-rfdetr-cpp
- !!merge <<: *rfdetrcpp
-  name: "intel-sycl-f32-rfdetr-cpp"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-rfdetr-cpp"
-  mirrors:
-    - localai/localai-backends:latest-gpu-intel-sycl-f32-rfdetr-cpp
- !!merge <<: *rfdetrcpp
-  name: "intel-sycl-f32-rfdetr-cpp-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f32-rfdetr-cpp"
-  mirrors:
-    - localai/localai-backends:master-gpu-intel-sycl-f32-rfdetr-cpp
- !!merge <<: *rfdetrcpp
-  name: "intel-sycl-f16-rfdetr-cpp"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-rfdetr-cpp"
-  mirrors:
-    - localai/localai-backends:latest-gpu-intel-sycl-f16-rfdetr-cpp
- !!merge <<: *rfdetrcpp
-  name: "intel-sycl-f16-rfdetr-cpp-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f16-rfdetr-cpp"
-  mirrors:
-    - localai/localai-backends:master-gpu-intel-sycl-f16-rfdetr-cpp
- !!merge <<: *rfdetrcpp
-  name: "vulkan-rfdetr-cpp"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-vulkan-rfdetr-cpp"
-  mirrors:
-    - localai/localai-backends:latest-gpu-vulkan-rfdetr-cpp
- !!merge <<: *rfdetrcpp
-  name: "vulkan-rfdetr-cpp-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-vulkan-rfdetr-cpp"
-  mirrors:
-    - localai/localai-backends:master-gpu-vulkan-rfdetr-cpp
 ## Rerankers
 - !!merge <<: *rerankers
  name: "rerankers-development"
--- a/core/gallery/importers/rfdetr.go
+++ b/core/gallery/importers/rfdetr.go
@@ -31,29 +31,6 @@ func repoLooksLikeRFDetr(repo string) bool {
 	return strings.Contains(lower, "rf-detr") || strings.Contains(lower, "rfdetr")
 }

-// repoHasGGUF inspects the HuggingFace file list (when available) to decide
-// whether the repo ships RF-DETR weights in ggml/GGUF form — the native
-// rfdetr-cpp backend's input format. Mudler's rfdetr-cpp-* repos
-// (mudler/rfdetr-cpp-nano, mudler/rfdetr-cpp-base, ...) match.
-func repoHasGGUF(details Details) bool {
-	if details.HuggingFace == nil {
-		return false
-	}
-	for _, f := range details.HuggingFace.Files {
-		if strings.HasSuffix(strings.ToLower(f.Path), ".gguf") {
-			return true
-		}
-	}
-	return false
-}
-
-func repoLooksLikeRFDetrCpp(repo string) bool {
-	lower := strings.ToLower(repo)
-	return strings.Contains(lower, "rfdetr-cpp") || strings.Contains(lower, "rf-detr-cpp") ||
-		strings.Contains(lower, "rfdetr.cpp") || strings.Contains(lower, "rt-detr.cpp") ||
-		strings.Contains(lower, "rf-detr.cpp")
-}
-
 func (i *RFDetrImporter) Match(details Details) bool {
 	preferences, err := details.Preferences.MarshalJSON()
 	if err != nil {
@@ -66,7 +43,7 @@ func (i *RFDetrImporter) Match(details Details) bool {
 		}
 	}

-	if b, ok := preferencesMap["backend"].(string); ok && (b == "rfdetr" || b == "rfdetr-cpp") {
+	if b, ok := preferencesMap["backend"].(string); ok && b == "rfdetr" {
 		return true
 	}

@@ -122,28 +99,10 @@ func (i *RFDetrImporter) Import(details Details) (gallery.ModelConfig, error) {
 		model = owner + "/" + repo
 	}

-	// Route GGUF-bearing repos (mudler/rfdetr-cpp-*) to the native
-	// rfdetr-cpp backend; HF transformer repos keep the Python rfdetr
-	// backend. Explicit preferences.backend overrides the heuristic.
-	backend := "rfdetr"
-	if b, ok := preferencesMap["backend"].(string); ok && b != "" {
-		backend = b
-	} else if repoHasGGUF(details) {
-		backend = "rfdetr-cpp"
-	} else if details.HuggingFace != nil {
-		repoName := details.HuggingFace.ModelID
-		if idx := strings.Index(repoName, "/"); idx >= 0 {
-			repoName = repoName[idx+1:]
-		}
-		if repoLooksLikeRFDetrCpp(repoName) {
-			backend = "rfdetr-cpp"
-		}
-	}
-
 	modelConfig := config.ModelConfig{
 		Name:                name,
 		Description:         description,
-		Backend:             backend,
+		Backend:             "rfdetr",
 		KnownUsecaseStrings: []string{"detection"},
 		PredictionOptions: schema.PredictionOptions{
 			BasicModelRequest: schema.BasicModelRequest{Model: model},
--- a/core/gallery/importers/rfdetr_test.go
+++ b/core/gallery/importers/rfdetr_test.go
@@ -129,125 +129,4 @@ var _ = Describe("RFDetrImporter", func() {
 			Expect(modelConfig.Description).To(Equal("Custom"))
 		})
 	})
-
-	// Table-driven coverage of the GGUF auto-routing path between the
-	// Python rfdetr backend (HF transformer repos) and the native
-	// rfdetr-cpp backend (GGUF repos like mudler/rfdetr-cpp-*).
-	//
-	// Cases are kept offline-deterministic by injecting Details directly
-	// rather than going through DiscoverModelConfig (which would hit live HF).
-	// The live HF cross-check lives in its own Context below.
-	Context("GGUF auto-routing (offline)", func() {
-		hfFile := func(path string) hfapi.ModelFile {
-			return hfapi.ModelFile{Path: path}
-		}
-
-		type tc struct {
-			name           string
-			uri            string
-			modelID        string
-			files          []hfapi.ModelFile
-			prefs          string
-			expectBackend  string // expected `backend:` line content
-			rejectBackends []string
-		}
-
-		entries := []tc{
-			{
-				name:          "GGUF repo with rfdetr-cpp prefix routes to rfdetr-cpp",
-				uri:           "https://huggingface.co/mudler/rfdetr-cpp-nano",
-				modelID:       "mudler/rfdetr-cpp-nano",
-				files:         []hfapi.ModelFile{hfFile("rfdetr-nano-q8_0.gguf"), hfFile("README.md")},
-				prefs:         "",
-				expectBackend: "backend: rfdetr-cpp",
-			},
-			{
-				name:          "GGUF presence alone routes to rfdetr-cpp even when repo name lacks -cpp",
-				uri:           "https://huggingface.co/some/rf-detr-ggml",
-				modelID:       "some/rf-detr-ggml",
-				files:         []hfapi.ModelFile{hfFile("rfdetr-base-f16.gguf")},
-				prefs:         "",
-				expectBackend: "backend: rfdetr-cpp",
-			},
-			{
-				name:           "transformer repo without GGUF stays on the Python rfdetr backend",
-				uri:            "https://huggingface.co/roboflow/rf-detr-base",
-				modelID:        "roboflow/rf-detr-base",
-				files:          []hfapi.ModelFile{hfFile("config.json"), hfFile("pytorch_model.bin")},
-				prefs:          "",
-				expectBackend:  "backend: rfdetr\n",
-				rejectBackends: []string{"backend: rfdetr-cpp"},
-			},
-			{
-				name:           "explicit preferences.backend=rfdetr overrides GGUF auto-detect",
-				uri:            "https://huggingface.co/mudler/rfdetr-cpp-nano",
-				modelID:        "mudler/rfdetr-cpp-nano",
-				files:          []hfapi.ModelFile{hfFile("rfdetr-nano-q8_0.gguf")},
-				prefs:          `{"backend": "rfdetr"}`,
-				expectBackend:  "backend: rfdetr\n",
-				rejectBackends: []string{"backend: rfdetr-cpp"},
-			},
-			{
-				name:          "explicit preferences.backend=rfdetr-cpp wins on non-GGUF transformer repo",
-				uri:           "https://huggingface.co/roboflow/rf-detr-base",
-				modelID:       "roboflow/rf-detr-base",
-				files:         []hfapi.ModelFile{hfFile("config.json")},
-				prefs:         `{"backend": "rfdetr-cpp"}`,
-				expectBackend: "backend: rfdetr-cpp",
-			},
-			{
-				name:          "repo name with rfdetr.cpp pattern routes to rfdetr-cpp even without HF file list",
-				uri:           "https://huggingface.co/some/rfdetr.cpp-bundle",
-				modelID:       "some/rfdetr.cpp-bundle",
-				files:         nil,
-				prefs:         "",
-				expectBackend: "backend: rfdetr-cpp",
-			},
-		}
-
-		for _, e := range entries {
-			e := e // capture for closure
-			It(e.name, func() {
-				imp := &importers.RFDetrImporter{}
-				details := importers.Details{
-					URI: e.uri,
-					HuggingFace: &hfapi.ModelDetails{
-						ModelID: e.modelID,
-						Files:   e.files,
-					},
-				}
-				if e.prefs != "" {
-					details.Preferences = json.RawMessage(e.prefs)
-				}
-
-				// Match must always be true for these fixtures — they're
-				// either preference-driven or have an rfdetr/rf-detr token.
-				Expect(imp.Match(details)).To(BeTrue(), fmt.Sprintf("Match should fire for %+v", details))
-
-				modelConfig, err := imp.Import(details)
-				Expect(err).ToNot(HaveOccurred(), fmt.Sprintf("Import error: %v", err))
-				Expect(modelConfig.ConfigFile).To(ContainSubstring(e.expectBackend),
-					fmt.Sprintf("Model config: %+v", modelConfig))
-				for _, rej := range e.rejectBackends {
-					Expect(modelConfig.ConfigFile).ToNot(ContainSubstring(rej),
-						fmt.Sprintf("did not expect %q in: %+v", rej, modelConfig))
-				}
-			})
-		}
-	})
-
-	// Live HF cross-check: the canonical native GGUF repo for the
-	// rfdetr-cpp backend. Marked broad — we only assert the routing
-	// decision, not file lists (upstream may add quants over time).
-	Context("detection from HuggingFace: mudler/rfdetr-cpp-nano", func() {
-		It("auto-routes to the native rfdetr-cpp backend without preferences", func() {
-			uri := "https://huggingface.co/mudler/rfdetr-cpp-nano"
-			modelConfig, err := importers.DiscoverModelConfig(uri, json.RawMessage(`{}`))
-
-			Expect(err).ToNot(HaveOccurred(), fmt.Sprintf("Error: %v", err))
-			Expect(modelConfig.ConfigFile).To(ContainSubstring("backend: rfdetr-cpp"),
-				fmt.Sprintf("Model config: %+v", modelConfig))
-			Expect(modelConfig.ConfigFile).To(ContainSubstring("mudler/rfdetr-cpp-nano"))
-		})
-	})
 })
--- a/core/http/endpoints/openai/chat.go
+++ b/core/http/endpoints/openai/chat.go
@@ -68,57 +68,6 @@ func mergeToolCallDeltas(existing []schema.ToolCall, deltas []schema.ToolCall) [
 	return existing
 }

-// applyAutoparserOverride replaces the Go-side reasoning-extraction result with
-// the C++ autoparser's classified ChatDeltas when those deltas contain
-// actionable content or reasoning. It preserves the original logprobs.
-//
-// When the autoparser did not classify any reasoning (deltaReasoning == "") but
-// deltaContent still carries an unparsed reasoning tag pair (e.g. the
-// non-jinja "pure content" fallback path on a <think> model — issue #9985),
-// the Go-side reasoning extractor is run on deltaContent as a defensive
-// fallback so <think>…</think> blocks do not leak into the OpenAI `content`
-// field.
-func applyAutoparserOverride(
-	chatDeltas []*pb.ChatDelta,
-	thinkingStartToken string,
-	reasoningConfig reason.Config,
-	existing []schema.Choice,
-) []schema.Choice {
-	if len(chatDeltas) == 0 {
-		return existing
-	}
-	deltaContent := functions.ContentFromChatDeltas(chatDeltas)
-	deltaReasoning := functions.ReasoningFromChatDeltas(chatDeltas)
-	if deltaContent == "" && deltaReasoning == "" {
-		return existing
-	}
-	// Fallback for non-jinja models (issue #9985): when the C++ autoparser
-	// did not classify reasoning but the raw content still contains a known
-	// reasoning tag pair, run Go-side extraction on the content so that the
-	// <think>…</think> block does not leak into the OpenAI `content` field.
-	// When the autoparser DID populate ReasoningContent, leave its
-	// content/reasoning split alone — trust the parser. We replace
-	// deltaContent unconditionally because ExtractReasoningWithConfig is a
-	// no-op when no tag pair matches; this also strips empty thinking
-	// blocks like "<think></think>" that some models emit when reasoning
-	// is disabled.
-	if deltaReasoning == "" && deltaContent != "" {
-		deltaReasoning, deltaContent = reason.ExtractReasoningWithConfig(deltaContent, thinkingStartToken, reasoningConfig)
-	}
-	xlog.Debug("[ChatDeltas] non-SSE no-tools: overriding result with C++ autoparser deltas",
-		"content_len", len(deltaContent), "reasoning_len", len(deltaReasoning))
-	stopReason := FinishReasonStop
-	message := &schema.Message{Role: "assistant", Content: &deltaContent}
-	if deltaReasoning != "" {
-		message.Reasoning = &deltaReasoning
-	}
-	newChoice := schema.Choice{FinishReason: &stopReason, Index: 0, Message: message}
-	if len(existing) > 0 && existing[0].Logprobs != nil {
-		newChoice.Logprobs = existing[0].Logprobs
-	}
-	return []schema.Choice{newChoice}
-}
-
 // ChatEndpoint is the OpenAI Completion API endpoint https://platform.openai.com/docs/api-reference/chat/create
 // @Summary Generate a chat completions for a given prompt and model.
 // @Tags inference
@@ -808,8 +757,24 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 				// For non-tool requests: prefer C++ autoparser chat deltas over
 				// Go-side tag extraction (which can mangle output when thinkingStartToken
 				// differs from the model's actual reasoning tags, e.g. Gemma 4).
-				if !shouldUseFn {
-					result = applyAutoparserOverride(chatDeltas, thinkingStartToken, config.ReasoningConfig, result)
+				if !shouldUseFn && len(chatDeltas) > 0 {
+					deltaContent := functions.ContentFromChatDeltas(chatDeltas)
+					deltaReasoning := functions.ReasoningFromChatDeltas(chatDeltas)
+					if deltaContent != "" || deltaReasoning != "" {
+						xlog.Debug("[ChatDeltas] non-SSE no-tools: overriding result with C++ autoparser deltas",
+							"content_len", len(deltaContent), "reasoning_len", len(deltaReasoning))
+						stopReason := FinishReasonStop
+						message := &schema.Message{Role: "assistant", Content: &deltaContent}
+						if deltaReasoning != "" {
+							message.Reasoning = &deltaReasoning
+						}
+						newChoice := schema.Choice{FinishReason: &stopReason, Index: 0, Message: message}
+						// Preserve logprobs from the original result
+						if len(result) > 0 && result[0].Logprobs != nil {
+							newChoice.Logprobs = result[0].Logprobs
+						}
+						result = []schema.Choice{newChoice}
+					}
 				}

 				// Tool parsing is deferred here (only when shouldUseFn) so chat deltas are available
--- a/core/http/endpoints/openai/chat_stream_reasoning_test.go
+++ b/core/http/endpoints/openai/chat_stream_reasoning_test.go
@@ -1,99 +0,0 @@
-package openai
-
-import (
-	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
-	reason "github.com/mudler/LocalAI/pkg/reasoning"
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-// Regression test for the prefill-misclassification artifact surfaced in
-// the review of #9991: when LocalAI templates qwen3 with
-// use_tokenizer_template (the post-#9985 gallery shape),
-// DetectThinkingStartToken finds <think> in the model's jinja chat
-// template — without evaluating the surrounding {% if enable_thinking %}
-// guard — and the Go-side extractor's PrependThinkingTokenIfNeeded then
-// treats every non-thinking output token as reasoning. The autoparser does
-// not classify qwen3's tool calls into ChatDelta.ToolCalls (qwen3's tool
-// format isn't on llama.cpp's recognized-tool list), so all tokens land in
-// ChatDelta.Content while the Go-side extractor silently accumulates a
-// "reasoning" string equal to the raw tool-call JSON. End-of-stream this
-// is flushed as a trailing `delta.reasoning` chunk to the client.
-//
-// chooseDeferredReasoning is the gate: when the autoparser was active for
-// any chunk (preferAutoparser sticky), we trust its reasoning_content
-// classification (usually empty) instead of the polluted Go-side state.
-var _ = Describe("chooseDeferredReasoning", func() {
-	// Simulate the qwen3-after-#9985 misclassification: build a real
-	// extractor with a <think> thinking-start token, then feed it
-	// non-thinking content. The extractor will (correctly per its own
-	// contract) treat the content as reasoning because
-	// PrependThinkingTokenIfNeeded synthesizes a leading <think>.
-	pollutedExtractor := func(content string) *reason.ReasoningExtractor {
-		e := reason.NewReasoningExtractor("<think>", reason.Config{})
-		e.ProcessToken(content)
-		Expect(e.Reasoning()).To(Equal(content),
-			"sanity: when the thinking-start token is set and content has no real <think>...</think>, "+
-				"the extractor classifies all content as reasoning — this is exactly the prefill pollution "+
-				"we want chooseDeferredReasoning to guard against")
-		return e
-	}
-
-	Context("autoparser was active (preferAutoparser=true)", func() {
-		It("returns the autoparser's reasoning classification, ignoring the polluted Go-side state", func() {
-			toolCallJSON := `{"arguments": {"cmd": "echo hello"}, "name": "exec"}`
-			extractor := pollutedExtractor(toolCallJSON)
-			// What the C++ autoparser sent: content chunks but no
-			// reasoning_content (qwen3 tool calls aren't classified by
-			// the upstream PEG parser).
-			chatDeltas := []*pb.ChatDelta{
-				{Content: toolCallJSON, ReasoningContent: ""},
-			}
-
-			got := chooseDeferredReasoning(true, chatDeltas, extractor)
-
-			Expect(got).To(BeEmpty(),
-				"chooseDeferredReasoning must NOT return the polluted extractor state "+
-					"when the autoparser was active — the autoparser correctly classified zero reasoning")
-		})
-
-		It("returns the autoparser's reasoning when it actually did classify reasoning", func() {
-			// The other side of the contract: when the autoparser was
-			// in jinja-with-recognized-format mode and DID classify
-			// reasoning, pass that through verbatim.
-			actualReasoning := "Okay, the user asked X. I should call exec."
-			extractor := pollutedExtractor("ignored polluted state")
-			chatDeltas := []*pb.ChatDelta{
-				{Content: "", ReasoningContent: actualReasoning},
-			}
-
-			got := chooseDeferredReasoning(true, chatDeltas, extractor)
-
-			Expect(got).To(Equal(actualReasoning))
-		})
-	})
-
-	Context("autoparser was NOT active (preferAutoparser=false)", func() {
-		It("falls back to the Go-side extractor — the right source for vLLM and other autoparser-less backends", func() {
-			realReasoning := "Genuine reasoning from a backend without an autoparser"
-			extractor := reason.NewReasoningExtractor("<think>", reason.Config{})
-			extractor.ProcessToken("<think>" + realReasoning + "</think>final answer")
-
-			got := chooseDeferredReasoning(false, nil, extractor)
-
-			Expect(got).To(Equal(realReasoning))
-		})
-
-		It("falls back even when ChatDeltas are present but the autoparser never classified anything", func() {
-			// Defensive: chatDeltas could carry vestigial data; if
-			// preferAutoparser wasn't flipped, we still use the
-			// extractor.
-			extractor := reason.NewReasoningExtractor("", reason.Config{})
-			extractor.ProcessToken("<think>some thoughts</think>answer")
-
-			got := chooseDeferredReasoning(false, []*pb.ChatDelta{{Content: "answer"}}, extractor)
-
-			Expect(got).To(Equal("some thoughts"))
-		})
-	})
-})
--- a/core/http/endpoints/openai/chat_stream_workers.go
+++ b/core/http/endpoints/openai/chat_stream_workers.go
@@ -7,111 +7,11 @@ import (
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/pkg/functions"
-	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 	"github.com/mudler/LocalAI/pkg/model"
 	reason "github.com/mudler/LocalAI/pkg/reasoning"
 	"github.com/mudler/xlog"
 )

-// emitJSONToolCallDeltas iterates the JSON tool-call objects produced by the
-// streaming tool-call detector and emits SSE chunks for the ones the caller
-// hasn't already emitted. It returns the new lastEmittedCount.
-//
-// Semantics:
-//   - Skips entries before lastEmittedCount (already emitted).
-//   - Emits one tool_call chunk per consecutive entry that has a usable
-//     `name` string.
-//   - Stops at the first entry without a name (typically the partial-JSON
-//     tail or a healing-marker stub — see issue #9988) so the caller doesn't
-//     advance past it. Bumping lastEmittedCount past an unparsed stub
-//     permanently gates off content emission for the rest of the stream.
-//   - When jsonResults is empty (the autoparser-working case, where the raw
-//     text result is cleared and only ChatDeltas carry tool calls), this is
-//     a no-op and lastEmittedCount is returned unchanged.
-//
-// The autoparser-correctly-classifying-tool-calls path is unaffected: it
-// delivers tool calls via TokenUsage.ChatDeltas, and the deferred
-// end-of-stream block (ToolCallsFromChatDeltas → buildDeferredToolCallChunks)
-// emits them; this helper sees an empty jsonResults and emits nothing.
-func emitJSONToolCallDeltas(
-	jsonResults []map[string]any,
-	lastEmittedCount int,
-	id, model string,
-	created int,
-	responses chan<- schema.OpenAIResponse,
-) int {
-	for i := lastEmittedCount; i < len(jsonResults); i++ {
-		jsonObj := jsonResults[i]
-		name, ok := jsonObj["name"].(string)
-		if !ok || name == "" {
-			break
-		}
-		args := "{}"
-		if argsVal, ok := jsonObj["arguments"]; ok {
-			if argsStr, ok := argsVal.(string); ok {
-				args = argsStr
-			} else {
-				argsBytes, _ := json.Marshal(argsVal)
-				args = string(argsBytes)
-			}
-		}
-		responses <- schema.OpenAIResponse{
-			ID:      id,
-			Created: created,
-			Model:   model,
-			Choices: []schema.Choice{{
-				Delta: &schema.Message{
-					Role: "assistant",
-					ToolCalls: []schema.ToolCall{
-						{
-							Index: i,
-							ID:    id,
-							Type:  "function",
-							FunctionCall: schema.FunctionCall{
-								Name:      name,
-								Arguments: args,
-							},
-						},
-					},
-				},
-				Index:        0,
-				FinishReason: nil,
-			}},
-			Object: "chat.completion.chunk",
-		}
-		lastEmittedCount = i + 1
-	}
-	return lastEmittedCount
-}
-
-// chooseDeferredReasoning picks the source of truth for the end-of-stream
-// reasoning flush in processStreamWithTools. When the C++ autoparser was
-// active during the stream (preferAutoparser), it returns the autoparser's
-// own classified reasoning_content from ChatDeltas — usually empty when the
-// autoparser is in pure-content fallback mode. Otherwise it falls back to
-// the Go-side streaming extractor, which is the right source for backends
-// without an autoparser (vLLM, etc.).
-//
-// Why: the Go-side extractor's accumulated Reasoning() can be polluted by
-// PrependThinkingTokenIfNeeded — when the tokenizer template contains a
-// thinking start token (qwen3's jinja template has <think> inside an
-// {% if enable_thinking %} block, and DetectThinkingStartToken does not
-// evaluate jinja conditionals), prefill detection treats every chunk's
-// content as reasoning, even when the model emitted a raw tool-call JSON
-// in non-thinking mode. Without this guard, qwen3-4b with streaming + tools
-// (after #9985 flipped the gallery to use_tokenizer_template) emits a
-// trailing SSE chunk where `reasoning` carries the tool-call JSON.
-func chooseDeferredReasoning(
-	preferAutoparser bool,
-	chatDeltas []*pb.ChatDelta,
-	extractor *reason.ReasoningExtractor,
-) string {
-	if preferAutoparser {
-		return functions.ReasoningFromChatDeltas(chatDeltas)
-	}
-	return extractor.Reasoning()
-}
-
 // processStream is the streaming worker for chat completions with no
 // tool/function calling involved. It pushes SSE-shaped chunks onto
 // `responses` and returns the authoritative cumulative TokenUsage from
@@ -152,13 +52,6 @@ func processStream(
 	thinkingStartToken := reason.DetectThinkingStartToken(template, &cfg.ReasoningConfig)
 	extractor := reason.NewReasoningExtractor(thinkingStartToken, cfg.ReasoningConfig)

-	// preferAutoparser is sticky: once the C++ autoparser has ever classified
-	// reasoning_content, we trust it for the rest of the stream. Until then we
-	// fall back to Go-side extraction so that a "pure content" autoparser
-	// (non-jinja path, issue #9985) does not leak <think>…</think> tokens
-	// straight into the OpenAI `content` field.
-	preferAutoparser := false
-
 	_, finalUsage, _, err := ComputeChoices(req, s, cfg, cl, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, tokenUsage backend.TokenUsage) bool {
 		var reasoningDelta, contentDelta string

@@ -171,16 +64,8 @@ func processStream(
 		// Otherwise fall back to Go-side extraction.
 		if tokenUsage.HasChatDeltaContent() {
 			rawReasoning, cd := tokenUsage.ChatDeltaReasoningAndContent()
-			if rawReasoning != "" {
-				preferAutoparser = true
-			}
-			if preferAutoparser {
-				contentDelta = cd
-				reasoningDelta = extractor.ProcessChatDeltaReasoning(rawReasoning)
-			} else {
-				reasoningDelta = goReasoning
-				contentDelta = goContent
-			}
+			contentDelta = cd
+			reasoningDelta = extractor.ProcessChatDeltaReasoning(rawReasoning)
 		} else {
 			reasoningDelta = goReasoning
 			contentDelta = goContent
@@ -257,17 +142,6 @@ func processStreamWithTools(
 	hasChatDeltaToolCalls := false
 	hasChatDeltaContent := false

-	// preferAutoparser is sticky: once the C++ autoparser has ever delivered
-	// content or reasoning via ChatDeltas, we trust its classification for the
-	// rest of the stream — including for the end-of-stream reasoning flush in
-	// buildDeferredToolCallChunks. Otherwise the Go-side extractor's
-	// accumulated Reasoning() can be polluted by prefill detection
-	// misclassifying content as reasoning (this happens when <think> appears
-	// in the tokenizer template and the model emits non-reasoning content
-	// like a raw tool-call JSON — qwen3-4b after #9985 enabled
-	// use_tokenizer_template). Mirrors the analogous flag in processStream.
-	preferAutoparser := false
-
 	// X-LocalAI-Node attribution is handled by middleware.ExposeNodeHeader
 	// at the wrapper layer; no in-band signalling from this worker.

@@ -291,17 +165,12 @@ func processStreamWithTools(

 		if usage.HasChatDeltaContent() {
 			rawReasoning, cd := usage.ChatDeltaReasoningAndContent()
-			preferAutoparser = true
 			contentDelta = cd
 			reasoningDelta = extractor.ProcessChatDeltaReasoning(rawReasoning)
-		} else if !preferAutoparser {
+		} else {
 			reasoningDelta = goReasoning
 			contentDelta = goContent
 		}
-		// If preferAutoparser is already true but this chunk carried no
-		// autoparser data, leave both deltas empty — the next autoparser
-		// chunk will pick things up. Falling back to Go-side here would
-		// re-introduce the prefill-misclassification leak.

 		// Emit reasoning deltas in their own SSE chunks before any tool-call chunks
 		// (OpenAI spec: reasoning and tool_calls never share a delta)
@@ -395,10 +264,49 @@ func processStreamWithTools(
 			// Try JSON tool call parsing for streaming.
 			// Only emit NEW tool calls (same guard as XML parser above).
 			jsonResults, jsonErr := functions.ParseJSONIterative(cleanedResult, true)
-			if jsonErr == nil {
-				lastEmittedCount = emitJSONToolCallDeltas(
-					jsonResults, lastEmittedCount, id, req.Model, created, responses,
-				)
+			if jsonErr == nil && len(jsonResults) > lastEmittedCount {
+				for i := lastEmittedCount; i < len(jsonResults); i++ {
+					jsonObj := jsonResults[i]
+					name, ok := jsonObj["name"].(string)
+					if !ok || name == "" {
+						continue
+					}
+					args := "{}"
+					if argsVal, ok := jsonObj["arguments"]; ok {
+						if argsStr, ok := argsVal.(string); ok {
+							args = argsStr
+						} else {
+							argsBytes, _ := json.Marshal(argsVal)
+							args = string(argsBytes)
+						}
+					}
+					initialMessage := schema.OpenAIResponse{
+						ID:      id,
+						Created: created,
+						Model:   req.Model,
+						Choices: []schema.Choice{{
+							Delta: &schema.Message{
+								Role: "assistant",
+								ToolCalls: []schema.ToolCall{
+									{
+										Index: i,
+										ID:    id,
+										Type:  "function",
+										FunctionCall: schema.FunctionCall{
+											Name:      name,
+											Arguments: args,
+										},
+									},
+								},
+							},
+							Index:        0,
+							FinishReason: nil,
+						}},
+						Object: "chat.completion.chunk",
+					}
+					responses <- initialMessage
+				}
+				lastEmittedCount = len(jsonResults)
 			}
 		}
 		return true
@@ -444,14 +352,7 @@ func processStreamWithTools(
 	} else {
 		// Fallback: parse tool calls from raw text (no chat deltas from backend)
 		xlog.Debug("[ChatDeltas] no pre-parsed tool calls, falling back to Go-side text parsing")
-		// When the autoparser was active during streaming (preferAutoparser),
-		// trust its reasoning classification rather than the Go-side
-		// extractor's accumulated state — the latter may have misclassified
-		// content as reasoning due to prefill detection on a tokenizer
-		// template that contains <think>. This was visible on qwen3-4b after
-		// #9985 enabled use_tokenizer_template: a streaming tool-call JSON
-		// would leak as a trailing reasoning chunk via the deferred flush.
-		reasoning = chooseDeferredReasoning(preferAutoparser, chatDeltas, extractor)
+		reasoning = extractor.Reasoning()
 		cleanedResult := extractor.CleanedContent()
 		*textContentToReturn = functions.ParseTextContent(cleanedResult, cfg.FunctionsConfig)
 		cleanedResult = functions.CleanupLLMResult(cleanedResult, cfg.FunctionsConfig)
--- a/core/http/endpoints/openai/chat_stream_workers_test.go
+++ b/core/http/endpoints/openai/chat_stream_workers_test.go
@@ -1,197 +0,0 @@
-package openai
-
-import (
-	"github.com/mudler/LocalAI/core/schema"
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-// drainChannel reads everything currently buffered on a channel without
-// blocking on close. The helper test channels are sized for the assertions.
-func drainChannel(ch <-chan schema.OpenAIResponse) []schema.OpenAIResponse {
-	var out []schema.OpenAIResponse
-	for {
-		select {
-		case r, ok := <-ch:
-			if !ok {
-				return out
-			}
-			out = append(out, r)
-		default:
-			return out
-		}
-	}
-}
-
-// nameOf returns the name of the first tool call carried on the choice's
-// delta, or "" if none.
-func nameOf(r schema.OpenAIResponse) string {
-	if len(r.Choices) == 0 || r.Choices[0].Delta == nil {
-		return ""
-	}
-	if len(r.Choices[0].Delta.ToolCalls) == 0 {
-		return ""
-	}
-	return r.Choices[0].Delta.ToolCalls[0].FunctionCall.Name
-}
-
-var _ = Describe("emitJSONToolCallDeltas", func() {
-	const (
-		id      = "test-stream"
-		model   = "test-model"
-		created = 1700000000
-	)
-
-	// The case that motivated this helper. With the previous version of
-	// the streaming worker, ParseJSONIterative would hand back a stub
-	// object like `{"4310046988783340008":1}` after the model had only
-	// emitted `{`. The worker bumped lastEmittedCount unconditionally,
-	// which permanently gated off content emission for the rest of the
-	// stream (qwen3-4b with stream:true + tools dribbled only `{"` to
-	// the client and then nothing). See issue #9988.
-	Context("partial stub without a usable name", func() {
-		It("does NOT bump lastEmittedCount and emits nothing", func() {
-			responses := make(chan schema.OpenAIResponse, 4)
-			// What ParseJSONIterative used to return for `{`:
-			stubResults := []map[string]any{
-				{"4310046988783340008": float64(1)},
-			}
-
-			next := emitJSONToolCallDeltas(stubResults, 0, id, model, created, responses)
-
-			Expect(next).To(Equal(0),
-				"lastEmittedCount must NOT advance past a stub without a name "+
-					"— otherwise content emission gets permanently gated off")
-			Expect(drainChannel(responses)).To(BeEmpty(),
-				"no tool_call chunk should be emitted for a stub without a name")
-		})
-	})
-
-	// No-regression #1: the autoparser-correctly-working path. When the
-	// C++ autoparser classifies tool calls itself, the raw text result is
-	// cleared and ParseJSONIterative on it returns no results — this
-	// helper must be a no-op so the deferred end-of-stream code can emit
-	// the tool calls from TokenUsage.ChatDeltas.
-	Context("empty jsonResults (autoparser-correctly-working path)", func() {
-		It("is a no-op and leaves lastEmittedCount unchanged", func() {
-			responses := make(chan schema.OpenAIResponse, 4)
-			next := emitJSONToolCallDeltas(nil, 0, id, model, created, responses)
-			Expect(next).To(Equal(0))
-			Expect(drainChannel(responses)).To(BeEmpty())
-		})
-
-		It("leaves a non-zero lastEmittedCount unchanged when later called with the same length", func() {
-			responses := make(chan schema.OpenAIResponse, 4)
-			results := []map[string]any{
-				{"name": "search", "arguments": map[string]any{"q": "hi"}},
-			}
-			// First call emits the one available tool call.
-			next := emitJSONToolCallDeltas(results, 0, id, model, created, responses)
-			Expect(next).To(Equal(1))
-			Expect(drainChannel(responses)).To(HaveLen(1))
-
-			// Subsequent chunks haven't grown the slice — must be a no-op.
-			next = emitJSONToolCallDeltas(results, next, id, model, created, responses)
-			Expect(next).To(Equal(1))
-			Expect(drainChannel(responses)).To(BeEmpty())
-		})
-	})
-
-	// No-regression #2: the normal completed-JSON path. When the model
-	// emits a real, complete tool call as JSON in raw content (e.g. qwen3
-	// without jinja but with tools), we should emit exactly one tool_call
-	// SSE chunk on the first call and become a no-op on later calls.
-	Context("single complete tool call", func() {
-		It("emits one tool_call chunk and bumps lastEmittedCount to 1", func() {
-			responses := make(chan schema.OpenAIResponse, 4)
-			results := []map[string]any{
-				{
-					"name": "search",
-					"arguments": map[string]any{
-						"q": "hello",
-					},
-				},
-			}
-
-			next := emitJSONToolCallDeltas(results, 0, id, model, created, responses)
-
-			Expect(next).To(Equal(1))
-			out := drainChannel(responses)
-			Expect(out).To(HaveLen(1))
-			Expect(nameOf(out[0])).To(Equal("search"))
-			Expect(out[0].Choices[0].Delta.ToolCalls[0].FunctionCall.Arguments).
-				To(ContainSubstring(`"q":"hello"`))
-		})
-
-		It("accepts arguments already serialized as a string", func() {
-			responses := make(chan schema.OpenAIResponse, 4)
-			results := []map[string]any{
-				{
-					"name":      "search",
-					"arguments": `{"q":"hello"}`,
-				},
-			}
-
-			emitJSONToolCallDeltas(results, 0, id, model, created, responses)
-
-			out := drainChannel(responses)
-			Expect(out).To(HaveLen(1))
-			Expect(out[0].Choices[0].Delta.ToolCalls[0].FunctionCall.Arguments).
-				To(Equal(`{"q":"hello"}`))
-		})
-	})
-
-	// No-regression #3: multiple tool calls (parallel tool calling).
-	// Both must be emitted, lastEmittedCount must end at 2.
-	Context("multiple complete tool calls", func() {
-		It("emits one chunk per tool call and bumps lastEmittedCount to len(results)", func() {
-			responses := make(chan schema.OpenAIResponse, 8)
-			results := []map[string]any{
-				{"name": "search", "arguments": map[string]any{"q": "a"}},
-				{"name": "browse", "arguments": map[string]any{"url": "b"}},
-			}
-
-			next := emitJSONToolCallDeltas(results, 0, id, model, created, responses)
-
-			Expect(next).To(Equal(2))
-			out := drainChannel(responses)
-			Expect(out).To(HaveLen(2))
-			Expect(nameOf(out[0])).To(Equal("search"))
-			Expect(nameOf(out[1])).To(Equal("browse"))
-		})
-	})
-
-	// The streaming-tail case: incremental chunks. First parse returns
-	// one complete tool call followed by a partial stub; later chunks
-	// complete the second tool call. We must emit the first immediately
-	// and the second on the later call — without ever bumping past the
-	// stub mid-stream.
-	Context("partial tail behind a real tool call", func() {
-		It("emits the complete entry, stops at the stub, and resumes once the tail completes", func() {
-			responses := make(chan schema.OpenAIResponse, 8)
-
-			// Chunk 1: one real call + a partial stub for the next.
-			chunk1 := []map[string]any{
-				{"name": "search", "arguments": map[string]any{"q": "a"}},
-				{"4310046988783340008": float64(1)},
-			}
-			next := emitJSONToolCallDeltas(chunk1, 0, id, model, created, responses)
-			Expect(next).To(Equal(1),
-				"must NOT advance to 2 — the stub at index 1 has no usable name")
-			out := drainChannel(responses)
-			Expect(out).To(HaveLen(1))
-			Expect(nameOf(out[0])).To(Equal("search"))
-
-			// Chunk 2: the stub completes into a real call.
-			chunk2 := []map[string]any{
-				{"name": "search", "arguments": map[string]any{"q": "a"}},
-				{"name": "browse", "arguments": map[string]any{"url": "b"}},
-			}
-			next = emitJSONToolCallDeltas(chunk2, next, id, model, created, responses)
-			Expect(next).To(Equal(2))
-			out = drainChannel(responses)
-			Expect(out).To(HaveLen(1))
-			Expect(nameOf(out[0])).To(Equal("browse"))
-		})
-	})
-})
--- a/core/http/endpoints/openai/chat_test.go
+++ b/core/http/endpoints/openai/chat_test.go
@@ -3,8 +3,6 @@ package openai
 import (
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/pkg/functions"
-	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
-	reason "github.com/mudler/LocalAI/pkg/reasoning"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"

@@ -96,98 +94,6 @@ var _ = Describe("handleQuestion", func() {
 	})
 })

-var _ = Describe("applyAutoparserOverride", func() {
-	// Regression test for https://github.com/mudler/LocalAI/issues/9985.
-	// When LocalAI templates a <think>-style reasoning model outside of jinja
-	// (e.g. the gallery qwen3 entry), the llama.cpp autoparser falls back to
-	// the "pure content" PEG parser which dumps the entire raw response,
-	// including <think>…</think>, into ChatDelta.Content and leaves
-	// ChatDelta.ReasoningContent empty. The Go side previously trusted that
-	// content verbatim and clobbered the tokenCallback's correctly-split
-	// reasoning, so <think> blocks leaked into the OpenAI `content` field.
-	Context("autoparser delivered content with embedded <think> tags and empty reasoning (issue #9985)", func() {
-		It("splits <think>…</think> out of content into the reasoning field", func() {
-			raw := "<think>\nOkay, the user said \"Hello\". I should reply warmly.\n</think>\n\nHello! How can I assist you today? 😊"
-			chatDeltas := []*pb.ChatDelta{
-				{Content: raw, ReasoningContent: ""},
-			}
-
-			result := applyAutoparserOverride(chatDeltas, "", reason.Config{}, nil)
-
-			Expect(result).To(HaveLen(1))
-			Expect(result[0].Message).ToNot(BeNil())
-			Expect(result[0].Message.Content).ToNot(BeNil())
-
-			content := *(result[0].Message.Content.(*string))
-			Expect(content).ToNot(ContainSubstring("<think>"),
-				"raw <think> tag must not leak into OpenAI content field")
-			Expect(content).ToNot(ContainSubstring("</think>"),
-				"raw </think> tag must not leak into OpenAI content field")
-			Expect(content).To(ContainSubstring("Hello! How can I assist you today?"),
-				"the model's actual answer must still be in content")
-
-			Expect(result[0].Message.Reasoning).ToNot(BeNil(),
-				"reasoning extracted from <think>…</think> must populate Reasoning")
-			Expect(*result[0].Message.Reasoning).To(ContainSubstring("Okay, the user said"))
-		})
-
-		It("does not run extraction when the autoparser already populated reasoning", func() {
-			// When the autoparser actually classified reasoning, leave its
-			// content/reasoning split untouched.
-			content := "Hello! How can I assist you today?"
-			reasoning := "Already split by the C++ autoparser."
-			chatDeltas := []*pb.ChatDelta{
-				{Content: content, ReasoningContent: reasoning},
-			}
-
-			result := applyAutoparserOverride(chatDeltas, "", reason.Config{}, nil)
-
-			Expect(result).To(HaveLen(1))
-			Expect(*(result[0].Message.Content.(*string))).To(Equal(content))
-			Expect(result[0].Message.Reasoning).ToNot(BeNil())
-			Expect(*result[0].Message.Reasoning).To(Equal(reasoning))
-		})
-
-		It("passes plain content through unchanged when no reasoning tags are present", func() {
-			content := "Just a normal answer with no reasoning at all."
-			chatDeltas := []*pb.ChatDelta{
-				{Content: content, ReasoningContent: ""},
-			}
-
-			result := applyAutoparserOverride(chatDeltas, "", reason.Config{}, nil)
-
-			Expect(result).To(HaveLen(1))
-			Expect(*(result[0].Message.Content.(*string))).To(Equal(content))
-			Expect(result[0].Message.Reasoning).To(BeNil())
-		})
-
-		It("strips an empty <think></think> block (qwen3 /no_think mode)", func() {
-			// qwen3 with the /no_think directive still emits an empty thinking
-			// block. The Go-side fallback must strip it from content rather than
-			// pass <think></think> through verbatim. No reasoning is set because
-			// the block has no body.
-			raw := "<think>\n\n</think>\n\nHello! How can I assist you today?"
-			chatDeltas := []*pb.ChatDelta{
-				{Content: raw, ReasoningContent: ""},
-			}
-
-			result := applyAutoparserOverride(chatDeltas, "", reason.Config{}, nil)
-
-			Expect(result).To(HaveLen(1))
-			content := *(result[0].Message.Content.(*string))
-			Expect(content).ToNot(ContainSubstring("<think>"))
-			Expect(content).ToNot(ContainSubstring("</think>"))
-			Expect(content).To(ContainSubstring("Hello! How can I assist you today?"))
-		})
-
-		It("returns the existing result when chatDeltas is empty", func() {
-			existing := []schema.Choice{{Index: 7}}
-			result := applyAutoparserOverride(nil, "", reason.Config{}, existing)
-			Expect(result).To(Equal(existing))
-		})
-	})
-})
-
 var _ = Describe("mergeToolCallDeltas", func() {
 	Context("with new tool calls", func() {
 		It("should append new tool calls", func() {
--- a/core/http/endpoints/openai/realtime.go
+++ b/core/http/endpoints/openai/realtime.go
@@ -1572,15 +1572,6 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
 			"tool_calls", len(deltaToolCalls),
 			"content_len", len(deltaContent),
 			"reasoning_len", len(deltaReasoning))
-		// Issue #9985: when the autoparser only delivered content (no
-		// reasoning_content), it may be running in the "pure content"
-		// PEG fallback (non-jinja path) which leaves <think>…</think>
-		// embedded in the content. Run Go-side extraction defensively.
-		// ExtractReasoningWithConfig is a no-op when no tag pair matches,
-		// so it's safe to apply unconditionally in the no-reasoning branch.
-		if deltaReasoning == "" && deltaContent != "" {
-			deltaReasoning, deltaContent = reasoning.ExtractReasoningWithConfig(deltaContent, thinkingStartToken, config.ReasoningConfig)
-		}
 		reasoningText = deltaReasoning
 		responseWithoutReasoning = deltaContent
 		textContent = deltaContent
--- a/core/http/endpoints/openresponses/responses.go
+++ b/core/http/endpoints/openresponses/responses.go
@@ -1971,10 +1971,6 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6

 			// Source reasoning from: (1) ChatDeltas from C++ autoparser, (2) extractor's
 			// streaming state, (3) final extraction from the finetuned result.
-			// Issue #9985: when the autoparser delivered Content but no
-			// ReasoningContent, it was running in the "pure content" PEG fallback
-			// (non-jinja path) which leaves reasoning tags embedded in content.
-			// Fall back to the streaming Go-side extractor's split in that case.
 			if chatDeltaReasoning := functions.ReasoningFromChatDeltas(chatDeltas); chatDeltaReasoning != "" {
 				finalReasoning = chatDeltaReasoning
 				finalCleanedResult = functions.ContentFromChatDeltas(chatDeltas)
--- a/docs/content/features/text-generation.md
+++ b/docs/content/features/text-generation.md
@@ -515,7 +515,7 @@ The `llama.cpp` backend supports additional configuration options that can be sp
 | `kv_unified` or `unified_kv` | boolean | Use a single unified KV buffer shared across all sequences. Default: `true` (LocalAI override; upstream defaults to `false` but auto-enables it when slot count is auto). **Required for `cache_idle_slots` to work**: without it the server force-disables idle-slot saving at init, and the prompt cache is never written across requests. | `kv_unified:false` |
 | `cache_idle_slots` or `idle_slots_cache` | boolean | On a new task, save the previous slot's KV state into the prompt cache (and clear the slot) so a later request with the same prefix can warm-load it. Default: `true`. Auto-disabled by the server if `kv_unified=false` or `cache_ram=0`. | `cache_idle_slots:false` |
 | `n_ctx_checkpoints` or `ctx_checkpoints` | integer | Maximum number of context checkpoints per slot (used for partial-prefix recovery, e.g. SWA). Default: `32`. | `ctx_checkpoints:16` |
-| `checkpoint_min_step` or `checkpoint_min_spacing` (aliases: `checkpoint_every_nt`, `checkpoint_every_n_tokens`) | integer | Minimum spacing in tokens between context checkpoints. `0` disables the minimum-spacing gate. Default: `256`. (Renamed upstream from `checkpoint_every_nt`; semantics shifted from a fixed cadence to a minimum spacing.) | `checkpoint_min_step:1024` |
+| `checkpoint_every_nt` or `checkpoint_every_n_tokens` | integer | Create a context checkpoint every N tokens during prefill. `-1` disables checkpointing. Default: `8192`. | `checkpoint_every_nt:4096` |
 | `split_mode` or `sm` | string | How to split the model across multiple GPUs: `none` (single GPU only), `layer` (default — split layers and KV across GPUs), `row` (split rows across GPUs), `tensor` (experimental tensor parallelism — requires `flash_attention: true`, no KV-cache quantization, manually set `context_size`, and a llama.cpp build that includes [#19378](https://github.com/ggml-org/llama.cpp/pull/19378)). | `split_mode:tensor` |

 **Example configuration with options:**
--- a/docs/data/version.json
+++ b/docs/data/version.json
@@ -1,3 +1,3 @@
 {
-  "version": "v4.3.1"
+  "version": "v4.2.6"
 }
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -1,54 +1,4 @@
 ---
- name: "qwopus3.6-27b-v2-mtp"
-  url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
-  urls:
-    - https://huggingface.co/Jackrong/Qwopus3.6-27B-v2-MTP-GGUF
-  description: |
-    🪐 Qwopus3.6-27B-v2-MTP
-    MTP Release
-
-    Multi-Token Prediction reasoning model fine-tuned from Qwen3.6-27B
-
-    🧬 Trace Inversion & Negentropy
-    🧠 27B Parameters
-    ⚡ Speculative Decoding
-    🛠️ Coding / DevOps / Math
-
-    💡 What is Qwopus3.6-27B-v2-MTP?
-    🪐 Qwopus3.6-27B-v2-MTP is a speed-oriented reasoning release built on top of Qwen3.6-27B. It keeps the Qwopus line's focus on reconstructed reasoning traces, coding discipline, DevOps procedures, and mathematical derivations, while adding Multi-Token Prediction for faster generation. The goal is simple: preserve the depth and structure of a 27B reasoning model while making real interactive use noticeably faster.
-
-    ⚡ MTP DecodingAuxiliary future-token prediction improves throughput on long reasoning, code, math, and strict-format prompts.
-    🧩 Structured ReasoningInherits the Qwopus training recipe built around reconstructed step-by-step reasoning trajectories.
-    🧪 GB10 TestedValidated on a 30-question local benchmark across Logic, Coding, DevOps, Math, and Edge tasks.
-    🚀 Practical SpeedDesigned for workflows where strong answers matter, but waiting several extra minutes per task does not.
-
-    ...
-  license: "apache-2.0"
-  tags:
-    - llm
-    - gguf
-    - reasoning
-  overrides:
-    backend: llama-cpp
-    function:
-      automatic_tool_parsing_fallback: true
-      grammar:
-        disable: true
-    known_usecases:
-      - chat
-    options:
-      - use_jinja:true
-      - spec_type:draft-mtp
-      - spec_n_max:6
-      - spec_p_min:0.75
-    parameters:
-      model: llama-cpp/models/Qwopus3.6-27B-v2-MTP-GGUF/Qwopus3.6-27B-v2-MTP-Q4_K_M.gguf
-    template:
-      use_tokenizer_template: true
-  files:
-    - filename: llama-cpp/models/Qwopus3.6-27B-v2-MTP-GGUF/Qwopus3.6-27B-v2-MTP-Q4_K_M.gguf
-      sha256: 818d68223be4d8518dac0b3b5604dde633cbbcbae1f491d842a3e26711c6606d
-      uri: https://huggingface.co/Jackrong/Qwopus3.6-27B-v2-MTP-GGUF/resolve/main/Qwopus3.6-27B-v2-MTP-Q4_K_M.gguf
 - name: "qwen3.6-40b-claude-4.6-opus-deckard-heretic-uncensored-thinking-neo-code-di-imatrix-max"
  url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
  urls:
@@ -6118,57 +6068,6 @@
      - detection
    parameters:
      model: rfdetr-base
- name: rfdetr-cpp-nano
-  url: github:mudler/LocalAI/gallery/virtual.yaml@master
-  urls:
-    - https://github.com/mudler/rf-detr.cpp
-    - https://huggingface.co/mudler/rfdetr-cpp-nano
-  description: |
-    RF-DETR Nano object detection model, served via the native rfdetr.cpp backend (ggml + purego, no Python).
-    Q8_0 quantization is the recommended default for CPU: same accuracy as F16/F32, ~20MB on disk, fastest CPU latency.
-    Pure C++/ggml runtime; no Python dependencies. Drop-in for the /v1/detection endpoint.
-  license: apache-2.0
-  icon: https://avatars.githubusercontent.com/u/53104118?s=200&v=4
-  tags:
-    - object-detection
-    - rfdetr
-    - native
-    - cpp
-    - cpu
-  overrides:
-    backend: rfdetr-cpp
-    known_usecases:
-      - detection
-    parameters:
-      model: rfdetr-nano-q8_0.gguf
-  files:
-    - filename: rfdetr-nano-q8_0.gguf
-      uri: huggingface://mudler/rfdetr-cpp-nano/rfdetr-nano-q8_0.gguf
- name: rfdetr-cpp-base
-  url: github:mudler/LocalAI/gallery/virtual.yaml@master
-  urls:
-    - https://github.com/mudler/rf-detr.cpp
-    - https://huggingface.co/mudler/rfdetr-cpp-base
-  description: |
-    RF-DETR Base object detection model, served via the native rfdetr.cpp backend.
-    F16 quantization is recommended on CPU: identical accuracy to F32, half the size, fastest.
-  license: apache-2.0
-  icon: https://avatars.githubusercontent.com/u/53104118?s=200&v=4
-  tags:
-    - object-detection
-    - rfdetr
-    - native
-    - cpp
-    - cpu
-  overrides:
-    backend: rfdetr-cpp
-    known_usecases:
-      - detection
-    parameters:
-      model: rfdetr-base-f16.gguf
-  files:
-    - filename: rfdetr-base-f16.gguf
-      uri: huggingface://mudler/rfdetr-cpp-base/rfdetr-base-f16.gguf
 - name: edgetam
  url: github:mudler/LocalAI/gallery/virtual.yaml@master
  urls:
--- a/gallery/qwen3.yaml
+++ b/gallery/qwen3.yaml
@@ -11,12 +11,36 @@ config_file: |
        - <dummy32000>
        - </s>
        - <|endoftext|>
-    # Delegate templating to llama.cpp's jinja runtime so the C++ autoparser
-    # can classify <think>…</think> blocks into reasoning_content natively
-    # (issue #9985). Without use_jinja the autoparser falls back to a
-    # "pure content" PEG parser that leaks reasoning tags into content.
-    options:
-        - use_jinja:true
    template:
-        use_tokenizer_template: true
+        chat: |
+            {{.Input -}}
+            <|im_start|>assistant
+        chat_message: |
+            <|im_start|>{{if eq .RoleName "tool" }}user{{else}}{{ .RoleName }}{{end}}
+            {{ if eq .RoleName "tool" -}}
+            <tool_response>
+            {{ end -}}
+            {{ if .Content -}}
+            {{.Content }}
+            {{ end -}}
+            {{ if eq .RoleName "tool" -}}
+            </tool_response>
+            {{ end -}}
+            {{ if .FunctionCall -}}
+            <tool_call>
+            {{toJson .FunctionCall}}
+            </tool_call>
+            {{ end -}}<|im_end|>
+        completion: |
+            {{.Input}}
+        function: |
+            <|im_start|>system
+            You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
+            {{range .Functions}}
+            {"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}
+            {{end}}
+            For each function call return a json object with function name and arguments: {"name": <function-name>, "arguments": <json-arguments-object>}
+            <|im_end|>
+            {{.Input -}}
+            <|im_start|>assistant
 name: qwen3
--- a/go.mod
+++ b/go.mod
@@ -10,7 +10,7 @@ require (
 	github.com/anthropics/anthropic-sdk-go v1.42.0
 	github.com/aws/aws-sdk-go-v2 v1.41.7
 	github.com/aws/aws-sdk-go-v2/config v1.32.16
-	github.com/aws/aws-sdk-go-v2/credentials v1.19.17
+	github.com/aws/aws-sdk-go-v2/credentials v1.19.15
 	github.com/aws/aws-sdk-go-v2/service/s3 v1.99.1
 	github.com/charmbracelet/glamour v1.0.0
 	github.com/containerd/containerd v1.7.31
@@ -41,7 +41,7 @@ require (
 	github.com/mudler/go-processmanager v0.1.1
 	github.com/mudler/memory v0.0.0-20260406210934-424c1ecf2cf8
 	github.com/mudler/xlog v0.0.6
-	github.com/nats-io/nats.go v1.52.0
+	github.com/nats-io/nats.go v1.50.0
 	github.com/ollama/ollama v0.20.4
 	github.com/onsi/ginkgo/v2 v2.29.0
 	github.com/onsi/gomega v1.40.0
@@ -81,18 +81,18 @@ require (
 	filippo.io/keygen v0.0.0-20260114151900-8e2790ea4c5b // indirect
 	github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 // indirect
 	github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.9 // indirect
-	github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.23 // indirect
-	github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.23 // indirect
-	github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.23 // indirect
-	github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.24 // indirect
-	github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.9 // indirect
+	github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.22 // indirect
+	github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.22 // indirect
+	github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.22 // indirect
+	github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.23 // indirect
+	github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.8 // indirect
 	github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.14 // indirect
-	github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.23 // indirect
+	github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.22 // indirect
 	github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.22 // indirect
-	github.com/aws/aws-sdk-go-v2/service/signin v1.0.11 // indirect
-	github.com/aws/aws-sdk-go-v2/service/sso v1.30.17 // indirect
-	github.com/aws/aws-sdk-go-v2/service/ssooidc v1.36.0 // indirect
-	github.com/aws/aws-sdk-go-v2/service/sts v1.42.1 // indirect
+	github.com/aws/aws-sdk-go-v2/service/signin v1.0.10 // indirect
+	github.com/aws/aws-sdk-go-v2/service/sso v1.30.16 // indirect
+	github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.20 // indirect
+	github.com/aws/aws-sdk-go-v2/service/sts v1.42.0 // indirect
 	github.com/aws/smithy-go v1.25.1 // indirect
 	github.com/bahlo/generic-list-go v0.2.0 // indirect
 	github.com/blang/semver v3.5.1+incompatible // indirect
@@ -498,7 +498,7 @@ require (
 	golang.org/x/mod v0.35.0 // indirect
 	golang.org/x/sync v0.20.0
 	golang.org/x/sys v0.44.0 // indirect
-	golang.org/x/term v0.43.0
+	golang.org/x/term v0.43.0 // indirect
 	golang.org/x/text v0.37.0 // indirect
 	golang.org/x/tools v0.44.0 // indirect
 	golang.zx2c4.com/wintun v0.0.0-20230126152724-0fa3db229ce2 // indirect
--- a/go.sum
+++ b/go.sum
@@ -150,36 +150,36 @@ github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.9 h1:adBsCIIpLbLmYnkQ
 github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.9/go.mod h1:uOYhgfgThm/ZyAuJGNQ5YgNyOlYfqnGpTHXvk3cpykg=
 github.com/aws/aws-sdk-go-v2/config v1.32.16 h1:Q0iQ7quUgJP0F/SCRTieScnaMdXr9h/2+wze1u3cNeM=
 github.com/aws/aws-sdk-go-v2/config v1.32.16/go.mod h1:duCCnJEFqpt2RC6no1iK6q+8HpwOAkiUua0pY507dQc=
-github.com/aws/aws-sdk-go-v2/credentials v1.19.17 h1:gP2nkGsS+KMvF/jfFz2Vv2qiiOqWKyPACSzPsqHgoW8=
-github.com/aws/aws-sdk-go-v2/credentials v1.19.17/go.mod h1:Bsew3S/moG5iT77giPj1q8wb/s0RE5/QfH+ASjYtuQc=
-github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.23 h1:UuSfcORqNSz/ey3VPRS8TcVH2Ikf0/sC+Hdj400QI6U=
-github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.23/go.mod h1:+G/OSGiOFnSOkYloKj/9M35s74LgVAdJBSD5lsFfqKg=
-github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.23 h1:GpT/TrnBYuE5gan2cZbTtvP+JlHsutdmlV2YfEyNde0=
-github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.23/go.mod h1:xYWD6BS9ywC5bS3sz9Xh04whO/hzK2plt2Zkyrp4JuA=
-github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.23 h1:bpd8vxhlQi2r1hiueOw02f/duEPTMK59Q4QMAoTTtTo=
-github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.23/go.mod h1:15DfR2nw+CRHIk0tqNyifu3G1YdAOy68RftkhMDDwYk=
-github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.24 h1:OQqn11BtaYv1WLUowvcA30MpzIu8Ti4pcLPIIyoKZrA=
-github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.24/go.mod h1:X5ZJyfwVrWA96GzPmUCWFQaEARPR7gCrpq2E92PJwAE=
-github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.9 h1:FLudkZLt5ci0ozzgkVo8BJGwvqNaZbTWb3UcucAateA=
-github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.9/go.mod h1:w7wZ/s9qK7c8g4al+UyoF1Sp/Z45UwMGcqIzLWVQHWk=
+github.com/aws/aws-sdk-go-v2/credentials v1.19.15 h1:fyvgWTszojq8hEnMi8PPBTvZdTtEVmAVyo+NFLHBhH4=
+github.com/aws/aws-sdk-go-v2/credentials v1.19.15/go.mod h1:gJiYyMOjNg8OEdRWOf3CrFQxM2a98qmrtjx1zuiQfB8=
+github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.22 h1:IOGsJ1xVWhsi+ZO7/NW8OuZZBtMJLZbk4P5HDjJO0jQ=
+github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.22/go.mod h1:b+hYdbU+jGKfXE8kKM6g1+h+L/Go3vMvzlxBsiuGsxg=
+github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.22 h1:GmLa5Kw1ESqtFpXsx5MmC84QWa/ZrLZvlJGa2y+4kcQ=
+github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.22/go.mod h1:6sW9iWm9DK9YRpRGga/qzrzNLgKpT2cIxb7Vo2eNOp0=
+github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.22 h1:dY4kWZiSaXIzxnKlj17nHnBcXXBfac6UlsAx2qL6XrU=
+github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.22/go.mod h1:KIpEUx0JuRZLO7U6cbV204cWAEco2iC3l061IxlwLtI=
+github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.23 h1:FPXsW9+gMuIeKmz7j6ENWcWtBGTe1kH8r9thNt5Uxx4=
+github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.23/go.mod h1:7J8iGMdRKk6lw2C+cMIphgAnT8uTwBwNOsGkyOCm80U=
+github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.8 h1:HtOTYcbVcGABLOVuPYaIihj6IlkqubBwFj10K5fxRek=
+github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.8/go.mod h1:VsK9abqQeGlzPgUr+isNWzPlK2vKe9INMLWnY65f5Xs=
 github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.14 h1:xnvDEnw+pnj5mctWiYuFbigrEzSm35x7k4KS/ZkCANg=
 github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.14/go.mod h1:yS5rNogD8e0Wu9+l3MUwr6eENBzEeGejvINpN5PAYfY=
-github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.23 h1:pbrxO/kuIwgEsOPLkaHu0O+m4fNgLU8B3vxQ+72jTPw=
-github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.23/go.mod h1:/CMNUqoj46HpS3MNRDEDIwcgEnrtZlKRaHNaHxIFpNA=
+github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.22 h1:PUmZeJU6Y1Lbvt9WFuJ0ugUK2xn6hIWUBBbKuOWF30s=
+github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.22/go.mod h1:nO6egFBoAaoXze24a2C0NjQCvdpk8OueRoYimvEB9jo=
 github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.22 h1:SE+aQ4DEqG53RRCAIHlCf//B2ycxGH7jFkpnAh/kKPM=
 github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.22/go.mod h1:ES3ynECd7fYeJIL6+oax+uIEljmfps0S70BaQzbMd/o=
 github.com/aws/aws-sdk-go-v2/service/kms v1.48.2 h1:aL8Y/AbB6I+uw0MjLbdo68NQ8t5lNs3CY3S848HpETk=
 github.com/aws/aws-sdk-go-v2/service/kms v1.48.2/go.mod h1:VJcNH6BLr+3VJwinRKdotLOMglHO8mIKlD3ea5c7hbw=
 github.com/aws/aws-sdk-go-v2/service/s3 v1.99.1 h1:kU/eBN5+MWNo/LcbNa4hWDdN76hdcd7hocU5kvu7IsU=
 github.com/aws/aws-sdk-go-v2/service/s3 v1.99.1/go.mod h1:Fw9aqhJicIVee1VytBBjH+l+5ov6/PhbtIK/u3rt/ls=
-github.com/aws/aws-sdk-go-v2/service/signin v1.0.11 h1:TdJ+HdzOBhU8+iVAOGUTU63VXopcumCOF1paFulHWZc=
-github.com/aws/aws-sdk-go-v2/service/signin v1.0.11/go.mod h1:R82ZRExE/nheo0N+T8zHPcLRTcH8MGsnR3BiVGX0TwI=
-github.com/aws/aws-sdk-go-v2/service/sso v1.30.17 h1:7byT8HUWrgoRp6sXjxtZwgOKfhss5fW6SkLBtqzgRoE=
-github.com/aws/aws-sdk-go-v2/service/sso v1.30.17/go.mod h1:xNWknVi4Ezm1vg1QsB/5EWpAJURq22uqd38U8qKvOJc=
-github.com/aws/aws-sdk-go-v2/service/ssooidc v1.36.0 h1:nDARhv/oF55bcxF7rCI/4PDxOKnVXVWwDuDwCs2I2SQ=
-github.com/aws/aws-sdk-go-v2/service/ssooidc v1.36.0/go.mod h1:4vIRDq+CJB2xFAXZ+YgGUTiEft7oAQlhIs71xcSeuVg=
-github.com/aws/aws-sdk-go-v2/service/sts v1.42.1 h1:F/M5Y9I3nwr2IEpshZgh1GeHpOItExNM9L1euNuh/fk=
-github.com/aws/aws-sdk-go-v2/service/sts v1.42.1/go.mod h1:mTNxImtovCOEEuD65mKW7DCsL+2gjEH+RPEAexAzAio=
+github.com/aws/aws-sdk-go-v2/service/signin v1.0.10 h1:a1Fq/KXn75wSzoJaPQTgZO0wHGqE9mjFnylnqEPTchA=
+github.com/aws/aws-sdk-go-v2/service/signin v1.0.10/go.mod h1:p6+MXNxW7IA6dMgHfTAzljuwSKD0NCm/4lbS4t6+7vI=
+github.com/aws/aws-sdk-go-v2/service/sso v1.30.16 h1:x6bKbmDhsgSZwv6q19wY/u3rLk/3FGjJWyqKcIRufpE=
+github.com/aws/aws-sdk-go-v2/service/sso v1.30.16/go.mod h1:CudnEVKRtLn0+3uMV0yEXZ+YZOKnAtUJ5DmDhilVnIw=
+github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.20 h1:oK/njaL8GtyEihkWMD4k3VgHCT64RQKkZwh0DG5j8ak=
+github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.20/go.mod h1:JHs8/y1f3zY7U5WcuzoJ/yAYGYtNIVPKLIbp61euvmg=
+github.com/aws/aws-sdk-go-v2/service/sts v1.42.0 h1:ks8KBcZPh3PYISr5dAiXCM5/Thcuxk8l+PG4+A0exds=
+github.com/aws/aws-sdk-go-v2/service/sts v1.42.0/go.mod h1:pFw33T0WLvXU3rw1WBkpMlkgIn54eCB5FYLhjDc9Foo=
 github.com/aws/smithy-go v1.25.1 h1:J8ERsGSU7d+aCmdQur5Txg6bVoYelvQJgtZehD12GkI=
 github.com/aws/smithy-go v1.25.1/go.mod h1:YE2RhdIuDbA5E5bTdciG9KrW3+TiEONeUWCqxX9i1Fc=
 github.com/aymanbagabas/go-osc52/v2 v2.0.1 h1:HwpRHbFMcZLEVr42D4p7XBqjyuxQH5SMiErDT4WkJ2k=
@@ -982,6 +982,10 @@ github.com/mudler/localrecall v0.6.1-0.20260507074622-a7724fef6f81 h1:8D9NJ/ikhs
 github.com/mudler/localrecall v0.6.1-0.20260507074622-a7724fef6f81/go.mod h1:28k5n19raUrkuwXkacdNsBlj8yuSnGhpT16tu+2+4dU=
 github.com/mudler/memory v0.0.0-20260406210934-424c1ecf2cf8 h1:Ry8RiWy8fZ6Ff4E7dPmjRsBrnHOnPeOOj2LhCgyjQu0=
 github.com/mudler/memory v0.0.0-20260406210934-424c1ecf2cf8/go.mod h1:EA8Ashhd56o32qN7ouPKFSRUs/Z+LrRCF4v6R2Oarm8=
+github.com/mudler/skillserver v0.0.6 h1:ixz6wUekLdTmbnpAavCkTydDF6UdXAG3ncYufSPK9G0=
+github.com/mudler/skillserver v0.0.6/go.mod h1:z3yFhcL9bSykmmh6xgGu0hyoItd4CnxgtWMEWw8uFJU=
+github.com/mudler/skillserver v0.0.7-0.20260520212528-3dae7f041b1e h1:ryXE1UEzGhLkDFYuaxJ0fZ6fg4l++TWfMCTJ1E7bYS8=
+github.com/mudler/skillserver v0.0.7-0.20260520212528-3dae7f041b1e/go.mod h1:z3yFhcL9bSykmmh6xgGu0hyoItd4CnxgtWMEWw8uFJU=
 github.com/mudler/skillserver v0.0.7-0.20260520220837-a7317cbf9145 h1:z59tA3IDYPt71nzH1jpxeaA1LuDw8aZfpTQFNU43Zb8=
 github.com/mudler/skillserver v0.0.7-0.20260520220837-a7317cbf9145/go.mod h1:z3yFhcL9bSykmmh6xgGu0hyoItd4CnxgtWMEWw8uFJU=
 github.com/mudler/water v0.0.0-20250808092830-dd90dcf09025 h1:WFLP5FHInarYGXi6B/Ze204x7Xy6q/I4nCZnWEyPHK0=
@@ -1018,8 +1022,8 @@ github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq
 github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
 github.com/natefinch/atomic v1.0.1 h1:ZPYKxkqQOx3KZ+RsbnP/YsgvxWQPGxjC0oBt2AhwV0A=
 github.com/natefinch/atomic v1.0.1/go.mod h1:N/D/ELrljoqDyT3rZrsUmtsuzvHkeB/wWjHV22AZRbM=
-github.com/nats-io/nats.go v1.52.0 h1:n3avV4VBsCgsdwh71TppsTwtv+QdPs7ntSKM8qJLGsc=
-github.com/nats-io/nats.go v1.52.0/go.mod h1:26HypzazeOkyO3/mqd1zZd53STJN0EjCYF9Uy2ZOBno=
+github.com/nats-io/nats.go v1.50.0 h1:5zAeQrTvyrKrWLJ0fu02W3br8ym57qf7csDzgLOpcds=
+github.com/nats-io/nats.go v1.50.0/go.mod h1:26HypzazeOkyO3/mqd1zZd53STJN0EjCYF9Uy2ZOBno=
 github.com/nats-io/nkeys v0.4.15 h1:JACV5jRVO9V856KOapQ7x+EY8Jo3qw1vJt/9Jpwzkk4=
 github.com/nats-io/nkeys v0.4.15/go.mod h1:CpMchTXC9fxA5zrMo4KpySxNjiDVvr8ANOSZdiNfUrs=
 github.com/nats-io/nuid v1.0.1 h1:5iA8DT8V7q8WK2EScv2padNa/rTESc1KdnPw4TC2paw=
--- a/pkg/functions/iterative_parser.go
+++ b/pkg/functions/iterative_parser.go
@@ -577,21 +577,6 @@ func trimPotentialPartialWord(content string, format *XMLToolCallFormat, startTh
 func removeHealingMarkerFromJSON(value map[string]any, marker string) map[string]any {
 	result := make(map[string]any)
 	for k, v := range value {
-		// Strip the healing marker from KEYS. parseJSONWithStack appends the
-		// marker to close a partial key (e.g. `{ "code` heals into
-		// `{"code<marker>":1}`); we want to preserve the prefix the model
-		// actually emitted. If the entire key was the marker (i.e. the input
-		// was just `{` heals into `{"<marker>":1}`), the truncated key is
-		// empty — drop the entry. Without this, downstream callers see a
-		// stub object with a random integer-looking key and treat it as a
-		// complete result, the shape that trips chat_stream_workers.go's
-		// streaming tool-call detector in issue #9988.
-		if idx := strings.Index(k, marker); idx != -1 {
-			k = k[:idx]
-			if k == "" {
-				continue
-			}
-		}
 		if str, ok := v.(string); ok {
 			if idx := strings.Index(str, marker); idx != -1 {
 				v = str[:idx]
--- a/pkg/functions/parse.go
+++ b/pkg/functions/parse.go
@@ -325,17 +325,7 @@ func ParseJSONIterative(s string, isPartial bool) ([]map[string]any, error) {
 		if jsonValue != nil {
 			// Convert to map[string]any if it's an object, or handle arrays
 			if obj, ok := jsonValue.(map[string]any); ok {
-				// Skip stub objects that healed away to nothing. Partial inputs
-				// like `{`, `{"`, or `{"n` go through parseJSONWithStack and
-				// come back as `{"<marker>":1}`; after removeHealingMarkerFromJSON
-				// drops the marker key the map is empty. Returning it as a
-				// real result trips the streaming tool-call detector
-				// (chat_stream_workers.go) into thinking a tool call landed,
-				// gating off content emission for the rest of the stream
-				// (issue #9988).
-				if !(isPartialJSON && len(obj) == 0) {
-					results = append(results, obj)
-				}
+				results = append(results, obj)
 			} else if arr, ok := jsonValue.([]any); ok {
 				// Handle arrays: extract objects from array
 				for _, item := range arr {
--- a/pkg/functions/parse_test.go
+++ b/pkg/functions/parse_test.go
@@ -1782,101 +1782,6 @@ value
 				// Results may be empty or contain partial data
 				Expect(len(results)).To(BeNumerically(">=", 0))
 			})
-
-			// Regression: https://github.com/mudler/LocalAI/issues/9988.
-			// The streaming tool-call detector calls ParseJSONIterative on each
-			// new content chunk. If the parser returns a stub object whose only
-			// key is the synthetic healing marker, the caller treats it as
-			// "tool call detected" and gates content emission — qwen3 with
-			// streaming + tools used to leak only the first two characters of
-			// the JSON ("{\"") to clients as a result.
-			// Regression: https://github.com/mudler/LocalAI/issues/9988.
-			// parseJSONWithStack inserts a random-integer healing marker into
-			// keys (and sometimes values) to make a partial input parseable.
-			// Those marker characters must never reach the caller — keys made
-			// entirely of the marker must be dropped, and a marker suffix on a
-			// partial key must be stripped down to the prefix the model
-			// actually typed. Without this the streaming worker sees garbage
-			// keys like `"4310046988783340008"` and mistakes the stub for a
-			// completed tool call, then gates off content emission.
-			DescribeTable("partial JSON starts must not surface healing markers in keys",
-				func(input string) {
-					parser := NewChatMsgParser(input, true)
-					marker := parser.HealingMarker()
-					results, err := ParseJSONIterative(input, true)
-					if err != nil {
-						return
-					}
-					for _, obj := range results {
-						for k := range obj {
-							Expect(k).NotTo(ContainSubstring(marker),
-								"healing marker leaked into key %q for input=%q (full=%+v)", k, input, obj)
-							Expect(k).NotTo(MatchRegexp(`^[A-Za-z]?\d{6,}$`),
-								"key %q looks like a synthetic numeric marker for input=%q (full=%+v)",
-								k, input, obj)
-						}
-					}
-				},
-				Entry("just an opening brace", `{`),
-				Entry("brace + quote", `{"`),
-				Entry("brace + partial key", `{"n`),
-				Entry("brace + quoted partial key", `{"na`),
-				Entry("brace + complete key, no value yet", `{"name"`),
-				Entry("brace + key + colon", `{"name":`),
-				Entry("brace + key + opening quote of value", `{"name":"`),
-				Entry("brace + partial value", `{"name":"ans`),
-			)
-
-			DescribeTable("partial JSON that has not yet committed a tool name must not surface a stub object",
-				// The streaming tool-call detector treats every entry returned
-				// by ParseJSONIterative as a potential new tool call. For very
-				// early partial inputs like `{` or `{"` there is nothing the
-				// caller can act on yet — returning a stub object bumps
-				// lastEmittedCount and gates off content emission.
-				// (Partial-key results like `{"n` → `{"n": 1}` are OK at the
-				// parser level — the streaming caller filters them by
-				// requiring a usable `name` field. See the streaming
-				// defense in chat_stream_workers.go.)
-				func(input string) {
-					results, err := ParseJSONIterative(input, true)
-					if err != nil {
-						return
-					}
-					Expect(results).To(BeEmpty(),
-						"ParseJSONIterative(%q) should return no results — the partial input has no anchor", input)
-				},
-				Entry("just an opening brace", `{`),
-				Entry("brace + quote", `{"`),
-			)
-
-			It("returns a clean tool call once the JSON has a real name (issue #9988)", func() {
-				results, err := ParseJSONIterative(`{"name":"answer","arguments":{"message":"Hi"}}`, true)
-				Expect(err).NotTo(HaveOccurred())
-				Expect(results).To(HaveLen(1))
-				Expect(results[0]).To(HaveKeyWithValue("name", "answer"))
-				for k := range results[0] {
-					Expect(k).NotTo(MatchRegexp(`^[A-Za-z]?\d{6,}$`),
-						"healing marker leaked as key %q", k)
-				}
-			})
-
-			It("strips healing-marker keys even when a real name is present (issue #9988)", func() {
-				// `{"name":"answer"` with no closing brace healed into a stub
-				// with both `name:"answer"` AND a marker-only key. The marker
-				// key must not surface.
-				parser := NewChatMsgParser(`{"name":"answer"`, true)
-				parser.SetHealingMarker("$marker$")
-				jsonValue, isPartial, _, err := parser.TryConsumeJSON()
-				Expect(err).NotTo(HaveOccurred())
-				Expect(isPartial).To(BeTrue())
-				obj, ok := jsonValue.(map[string]any)
-				Expect(ok).To(BeTrue())
-				Expect(obj).To(HaveKeyWithValue("name", "answer"))
-				for k := range obj {
-					Expect(k).NotTo(ContainSubstring("$marker$"),
-						"healing marker leaked into key %q", k)
-				}
-			})
 		})

 		Describe("Comprehensive JSON partial parsing tests (matching llama.cpp)", func() {
--- a/swagger/docs.go
+++ b/swagger/docs.go
@@ -1121,117 +1121,6 @@ const docTemplate = `{
                }
            }
        },
-        "/api/pii/decide": {
-            "post": {
-                "consumes": [
-                    "application/json"
-                ],
-                "produces": [
-                    "application/json"
-                ],
-                "tags": [
-                    "pii"
-                ],
-                "summary": "Scan text for PII and return findings + suggested action (decision oracle)",
-                "parameters": [
-                    {
-                        "description": "decide params",
-                        "name": "request",
-                        "in": "body",
-                        "required": true,
-                        "schema": {
-                            "$ref": "#/definitions/schema.PIIDecideRequest"
-                        }
-                    }
-                ],
-                "responses": {
-                    "200": {
-                        "description": "OK",
-                        "schema": {
-                            "$ref": "#/definitions/schema.PIIDecideResponse"
-                        }
-                    },
-                    "400": {
-                        "description": "Bad Request",
-                        "schema": {
-                            "type": "object",
-                            "additionalProperties": {
-                                "type": "string"
-                            }
-                        }
-                    }
-                }
-            }
-        },
-        "/api/router/decide": {
-            "post": {
-                "consumes": [
-                    "application/json"
-                ],
-                "produces": [
-                    "application/json"
-                ],
-                "tags": [
-                    "router"
-                ],
-                "summary": "Classify a prompt against a router model's policies (decision oracle)",
-                "parameters": [
-                    {
-                        "description": "decide params",
-                        "name": "request",
-                        "in": "body",
-                        "required": true,
-                        "schema": {
-                            "$ref": "#/definitions/schema.RouterDecideRequest"
-                        }
-                    }
-                ],
-                "responses": {
-                    "200": {
-                        "description": "OK",
-                        "schema": {
-                            "$ref": "#/definitions/schema.RouterDecideResponse"
-                        }
-                    },
-                    "400": {
-                        "description": "Bad Request",
-                        "schema": {
-                            "type": "object",
-                            "additionalProperties": {
-                                "type": "string"
-                            }
-                        }
-                    },
-                    "404": {
-                        "description": "Not Found",
-                        "schema": {
-                            "type": "object",
-                            "additionalProperties": {
-                                "type": "string"
-                            }
-                        }
-                    },
-                    "500": {
-                        "description": "Internal Server Error",
-                        "schema": {
-                            "type": "object",
-                            "additionalProperties": {
-                                "type": "string"
-                            }
-                        }
-                    },
-                    "503": {
-                        "description": "Service Unavailable",
-                        "schema": {
-                            "type": "object",
-                            "additionalProperties": {
-                                "type": "string"
-                            }
-                        }
-                    }
-                }
-            }
-        },
        "/api/traces": {
            "get": {
                "description": "Returns captured API exchange traces (request/response pairs) in reverse chronological order",
@@ -3397,6 +3286,7 @@ const docTemplate = `{
                "downloaded_size": {
                    "type": "string"
                },
+                "error": {},
                "file_name": {
                    "type": "string"
                },
@@ -4819,6 +4709,27 @@ const docTemplate = `{
                    "description": "The message role",
                    "type": "string"
                },
+                "string_audios": {
+                    "type": "array",
+                    "items": {
+                        "type": "string"
+                    }
+                },
+                "string_content": {
+                    "type": "string"
+                },
+                "string_images": {
+                    "type": "array",
+                    "items": {
+                        "type": "string"
+                    }
+                },
+                "string_videos": {
+                    "type": "array",
+                    "items": {
+                        "type": "string"
+                    }
+                },
                "tool_call_id": {
                    "type": "string"
                },
@@ -5412,10 +5323,6 @@ const docTemplate = `{
                        }
                    ]
                },
-                "max_completion_tokens": {
-                    "description": "MaxCompletionTokens is the modern alias for max_tokens\n(OpenAI deprecated max_tokens; gpt-5 / o-series reject it).\nAccepted on the wire so up-to-date clients can use the new\nname; the request middleware collapses it into Maxtokens so\ninternal code reads exactly one field.",
-                    "type": "integer"
-                },
                "max_tokens": {
                    "type": "integer"
                },
@@ -5747,109 +5654,6 @@ const docTemplate = `{
                }
            }
        },
-        "schema.PIIDecideRequest": {
-            "type": "object",
-            "properties": {
-                "text": {
-                    "description": "Text is the user-visible content to inspect. Required.",
-                    "type": "string"
-                }
-            }
-        },
-        "schema.PIIDecideResponse": {
-            "type": "object",
-            "properties": {
-                "findings": {
-                    "description": "Findings is one entry per matched span — pattern id, byte\nrange, and audit-safe hash prefix (never the matched value).",
-                    "type": "array",
-                    "items": {
-                        "$ref": "#/definitions/schema.PIIFinding"
-                    }
-                },
-                "redacted_preview": {
-                    "description": "RedactedPreview is the input with mask-action spans replaced\nby their placeholders. Identical to Text when no findings or\nwhen the strongest action is block/route_local (which don't\nrewrite content).",
-                    "type": "string"
-                },
-                "suggested_action": {
-                    "description": "SuggestedAction is the strongest action across all findings:\n\"block\", \"route_local\", \"mask\", or \"allow\" (no findings).",
-                    "type": "string"
-                }
-            }
-        },
-        "schema.PIIFinding": {
-            "type": "object",
-            "properties": {
-                "end": {
-                    "type": "integer"
-                },
-                "hash_prefix": {
-                    "type": "string"
-                },
-                "pattern": {
-                    "type": "string"
-                },
-                "start": {
-                    "type": "integer"
-                }
-            }
-        },
-        "schema.RouterDecideRequest": {
-            "type": "object",
-            "properties": {
-                "input": {
-                    "description": "Input is the user-visible prompt text to classify. Required.\nSchema-shape extraction (chat-message concatenation, etc.) is\nthe caller's responsibility — matches the Probe contract used\nby the in-band middleware.",
-                    "type": "string"
-                },
-                "router": {
-                    "description": "Router is the name of the router model (a ModelConfig with a\n` + "`" + `router:` + "`" + ` block). Required.",
-                    "type": "string"
-                }
-            }
-        },
-        "schema.RouterDecideResponse": {
-            "type": "object",
-            "properties": {
-                "cache_similarity": {
-                    "description": "CacheSimilarity carries the cosine similarity of the cache hit\n(0 when not cached).",
-                    "type": "number"
-                },
-                "cached": {
-                    "description": "Cached is true when the decision came from the L2 embedding\ncache rather than a fresh classifier run.",
-                    "type": "boolean"
-                },
-                "candidate": {
-                    "description": "Candidate is the model that would be routed to. Empty when no\ncandidate covers Labels AND no fallback is configured.",
-                    "type": "string"
-                },
-                "classifier": {
-                    "description": "Classifier is the classifier name that produced the decision\n(e.g. \"score\").",
-                    "type": "string"
-                },
-                "fallback": {
-                    "description": "Fallback is true when Candidate is the router's configured\nfallback because no candidate covered Labels. Lets callers\ndistinguish \"matched\" from \"fell back\" without comparing names.",
-                    "type": "boolean"
-                },
-                "labels": {
-                    "description": "Labels is the set of active policy labels.",
-                    "type": "array",
-                    "items": {
-                        "type": "string"
-                    }
-                },
-                "latency_ms": {
-                    "description": "LatencyMs is the classifier's wall-clock cost.",
-                    "type": "integer"
-                },
-                "router": {
-                    "description": "Router echoes the requested router model.",
-                    "type": "string"
-                },
-                "score": {
-                    "description": "Score is the top label's softmax probability (the\nclassifier-side confidence signal).",
-                    "type": "number"
-                }
-            }
-        },
        "schema.StreamOptions": {
            "type": "object",
            "properties": {
--- a/swagger/swagger.json
+++ b/swagger/swagger.json
@@ -1118,117 +1118,6 @@
                }
            }
        },
-        "/api/pii/decide": {
-            "post": {
-                "consumes": [
-                    "application/json"
-                ],
-                "produces": [
-                    "application/json"
-                ],
-                "tags": [
-                    "pii"
-                ],
-                "summary": "Scan text for PII and return findings + suggested action (decision oracle)",
-                "parameters": [
-                    {
-                        "description": "decide params",
-                        "name": "request",
-                        "in": "body",
-                        "required": true,
-                        "schema": {
-                            "$ref": "#/definitions/schema.PIIDecideRequest"
-                        }
-                    }
-                ],
-                "responses": {
-                    "200": {
-                        "description": "OK",
-                        "schema": {
-                            "$ref": "#/definitions/schema.PIIDecideResponse"
-                        }
-                    },
-                    "400": {
-                        "description": "Bad Request",
-                        "schema": {
-                            "type": "object",
-                            "additionalProperties": {
-                                "type": "string"
-                            }
-                        }
-                    }
-                }
-            }
-        },
-        "/api/router/decide": {
-            "post": {
-                "consumes": [
-                    "application/json"
-                ],
-                "produces": [
-                    "application/json"
-                ],
-                "tags": [
-                    "router"
-                ],
-                "summary": "Classify a prompt against a router model's policies (decision oracle)",
-                "parameters": [
-                    {
-                        "description": "decide params",
-                        "name": "request",
-                        "in": "body",
-                        "required": true,
-                        "schema": {
-                            "$ref": "#/definitions/schema.RouterDecideRequest"
-                        }
-                    }
-                ],
-                "responses": {
-                    "200": {
-                        "description": "OK",
-                        "schema": {
-                            "$ref": "#/definitions/schema.RouterDecideResponse"
-                        }
-                    },
-                    "400": {
-                        "description": "Bad Request",
-                        "schema": {
-                            "type": "object",
-                            "additionalProperties": {
-                                "type": "string"
-                            }
-                        }
-                    },
-                    "404": {
-                        "description": "Not Found",
-                        "schema": {
-                            "type": "object",
-                            "additionalProperties": {
-                                "type": "string"
-                            }
-                        }
-                    },
-                    "500": {
-                        "description": "Internal Server Error",
-                        "schema": {
-                            "type": "object",
-                            "additionalProperties": {
-                                "type": "string"
-                            }
-                        }
-                    },
-                    "503": {
-                        "description": "Service Unavailable",
-                        "schema": {
-                            "type": "object",
-                            "additionalProperties": {
-                                "type": "string"
-                            }
-                        }
-                    }
-                }
-            }
-        },
        "/api/traces": {
            "get": {
                "description": "Returns captured API exchange traces (request/response pairs) in reverse chronological order",
@@ -3394,6 +3283,7 @@
                "downloaded_size": {
                    "type": "string"
                },
+                "error": {},
                "file_name": {
                    "type": "string"
                },
@@ -4816,6 +4706,27 @@
                    "description": "The message role",
                    "type": "string"
                },
+                "string_audios": {
+                    "type": "array",
+                    "items": {
+                        "type": "string"
+                    }
+                },
+                "string_content": {
+                    "type": "string"
+                },
+                "string_images": {
+                    "type": "array",
+                    "items": {
+                        "type": "string"
+                    }
+                },
+                "string_videos": {
+                    "type": "array",
+                    "items": {
+                        "type": "string"
+                    }
+                },
                "tool_call_id": {
                    "type": "string"
                },
@@ -5409,10 +5320,6 @@
                        }
                    ]
                },
-                "max_completion_tokens": {
-                    "description": "MaxCompletionTokens is the modern alias for max_tokens\n(OpenAI deprecated max_tokens; gpt-5 / o-series reject it).\nAccepted on the wire so up-to-date clients can use the new\nname; the request middleware collapses it into Maxtokens so\ninternal code reads exactly one field.",
-                    "type": "integer"
-                },
                "max_tokens": {
                    "type": "integer"
                },
@@ -5744,109 +5651,6 @@
                }
            }
        },
-        "schema.PIIDecideRequest": {
-            "type": "object",
-            "properties": {
-                "text": {
-                    "description": "Text is the user-visible content to inspect. Required.",
-                    "type": "string"
-                }
-            }
-        },
-        "schema.PIIDecideResponse": {
-            "type": "object",
-            "properties": {
-                "findings": {
-                    "description": "Findings is one entry per matched span — pattern id, byte\nrange, and audit-safe hash prefix (never the matched value).",
-                    "type": "array",
-                    "items": {
-                        "$ref": "#/definitions/schema.PIIFinding"
-                    }
-                },
-                "redacted_preview": {
-                    "description": "RedactedPreview is the input with mask-action spans replaced\nby their placeholders. Identical to Text when no findings or\nwhen the strongest action is block/route_local (which don't\nrewrite content).",
-                    "type": "string"
-                },
-                "suggested_action": {
-                    "description": "SuggestedAction is the strongest action across all findings:\n\"block\", \"route_local\", \"mask\", or \"allow\" (no findings).",
-                    "type": "string"
-                }
-            }
-        },
-        "schema.PIIFinding": {
-            "type": "object",
-            "properties": {
-                "end": {
-                    "type": "integer"
-                },
-                "hash_prefix": {
-                    "type": "string"
-                },
-                "pattern": {
-                    "type": "string"
-                },
-                "start": {
-                    "type": "integer"
-                }
-            }
-        },
-        "schema.RouterDecideRequest": {
-            "type": "object",
-            "properties": {
-                "input": {
-                    "description": "Input is the user-visible prompt text to classify. Required.\nSchema-shape extraction (chat-message concatenation, etc.) is\nthe caller's responsibility — matches the Probe contract used\nby the in-band middleware.",
-                    "type": "string"
-                },
-                "router": {
-                    "description": "Router is the name of the router model (a ModelConfig with a\n`router:` block). Required.",
-                    "type": "string"
-                }
-            }
-        },
-        "schema.RouterDecideResponse": {
-            "type": "object",
-            "properties": {
-                "cache_similarity": {
-                    "description": "CacheSimilarity carries the cosine similarity of the cache hit\n(0 when not cached).",
-                    "type": "number"
-                },
-                "cached": {
-                    "description": "Cached is true when the decision came from the L2 embedding\ncache rather than a fresh classifier run.",
-                    "type": "boolean"
-                },
-                "candidate": {
-                    "description": "Candidate is the model that would be routed to. Empty when no\ncandidate covers Labels AND no fallback is configured.",
-                    "type": "string"
-                },
-                "classifier": {
-                    "description": "Classifier is the classifier name that produced the decision\n(e.g. \"score\").",
-                    "type": "string"
-                },
-                "fallback": {
-                    "description": "Fallback is true when Candidate is the router's configured\nfallback because no candidate covered Labels. Lets callers\ndistinguish \"matched\" from \"fell back\" without comparing names.",
-                    "type": "boolean"
-                },
-                "labels": {
-                    "description": "Labels is the set of active policy labels.",
-                    "type": "array",
-                    "items": {
-                        "type": "string"
-                    }
-                },
-                "latency_ms": {
-                    "description": "LatencyMs is the classifier's wall-clock cost.",
-                    "type": "integer"
-                },
-                "router": {
-                    "description": "Router echoes the requested router model.",
-                    "type": "string"
-                },
-                "score": {
-                    "description": "Score is the top label's softmax probability (the\nclassifier-side confidence signal).",
-                    "type": "number"
-                }
-            }
-        },
        "schema.StreamOptions": {
            "type": "object",
            "properties": {
--- a/swagger/swagger.yaml
+++ b/swagger/swagger.yaml
@@ -244,6 +244,7 @@ definitions:
        type: boolean
      downloaded_size:
        type: string
+      error: {}
      file_name:
        type: string
      file_size:
@@ -1225,6 +1226,20 @@ definitions:
      role:
        description: The message role
        type: string
+      string_audios:
+        items:
+          type: string
+        type: array
+      string_content:
+        type: string
+      string_images:
+        items:
+          type: string
+        type: array
+      string_videos:
+        items:
+          type: string
+        type: array
      tool_call_id:
        type: string
      tool_calls:
@@ -1621,14 +1636,6 @@ definitions:
          OpenAI API logprobs parameters
          logprobs: boolean - if true, returns log probabilities of each output token
          top_logprobs: integer 0-20 - number of most likely tokens to return at each token position
-      max_completion_tokens:
-        description: |-
-          MaxCompletionTokens is the modern alias for max_tokens
-          (OpenAI deprecated max_tokens; gpt-5 / o-series reject it).
-          Accepted on the wire so up-to-date clients can use the new
-          name; the request middleware collapses it into Maxtokens so
-          internal code reads exactly one field.
-        type: integer
      max_tokens:
        type: integer
      messages:
@@ -1865,105 +1872,6 @@ definitions:
          $ref: '#/definitions/schema.NodeData'
        type: array
    type: object
-  schema.PIIDecideRequest:
-    properties:
-      text:
-        description: Text is the user-visible content to inspect. Required.
-        type: string
-    type: object
-  schema.PIIDecideResponse:
-    properties:
-      findings:
-        description: |-
-          Findings is one entry per matched span — pattern id, byte
-          range, and audit-safe hash prefix (never the matched value).
-        items:
-          $ref: '#/definitions/schema.PIIFinding'
-        type: array
-      redacted_preview:
-        description: |-
-          RedactedPreview is the input with mask-action spans replaced
-          by their placeholders. Identical to Text when no findings or
-          when the strongest action is block/route_local (which don't
-          rewrite content).
-        type: string
-      suggested_action:
-        description: |-
-          SuggestedAction is the strongest action across all findings:
-          "block", "route_local", "mask", or "allow" (no findings).
-        type: string
-    type: object
-  schema.PIIFinding:
-    properties:
-      end:
-        type: integer
-      hash_prefix:
-        type: string
-      pattern:
-        type: string
-      start:
-        type: integer
-    type: object
-  schema.RouterDecideRequest:
-    properties:
-      input:
-        description: |-
-          Input is the user-visible prompt text to classify. Required.
-          Schema-shape extraction (chat-message concatenation, etc.) is
-          the caller's responsibility — matches the Probe contract used
-          by the in-band middleware.
-        type: string
-      router:
-        description: |-
-          Router is the name of the router model (a ModelConfig with a
-          `router:` block). Required.
-        type: string
-    type: object
-  schema.RouterDecideResponse:
-    properties:
-      cache_similarity:
-        description: |-
-          CacheSimilarity carries the cosine similarity of the cache hit
-          (0 when not cached).
-        type: number
-      cached:
-        description: |-
-          Cached is true when the decision came from the L2 embedding
-          cache rather than a fresh classifier run.
-        type: boolean
-      candidate:
-        description: |-
-          Candidate is the model that would be routed to. Empty when no
-          candidate covers Labels AND no fallback is configured.
-        type: string
-      classifier:
-        description: |-
-          Classifier is the classifier name that produced the decision
-          (e.g. "score").
-        type: string
-      fallback:
-        description: |-
-          Fallback is true when Candidate is the router's configured
-          fallback because no candidate covered Labels. Lets callers
-          distinguish "matched" from "fell back" without comparing names.
-        type: boolean
-      labels:
-        description: Labels is the set of active policy labels.
-        items:
-          type: string
-        type: array
-      latency_ms:
-        description: LatencyMs is the classifier's wall-clock cost.
-        type: integer
-      router:
-        description: Router echoes the requested router model.
-        type: string
-      score:
-        description: |-
-          Score is the top label's softmax probability (the
-          classifier-side confidence signal).
-        type: number
-    type: object
  schema.StreamOptions:
    properties:
      include_usage:
@@ -3076,79 +2984,6 @@ paths:
      summary: Show the P2P token
      tags:
      - p2p
-  /api/pii/decide:
-    post:
-      consumes:
-      - application/json
-      parameters:
-      - description: decide params
-        in: body
-        name: request
-        required: true
-        schema:
-          $ref: '#/definitions/schema.PIIDecideRequest'
-      produces:
-      - application/json
-      responses:
-        "200":
-          description: OK
-          schema:
-            $ref: '#/definitions/schema.PIIDecideResponse'
-        "400":
-          description: Bad Request
-          schema:
-            additionalProperties:
-              type: string
-            type: object
-      summary: Scan text for PII and return findings + suggested action (decision
-        oracle)
-      tags:
-      - pii
-  /api/router/decide:
-    post:
-      consumes:
-      - application/json
-      parameters:
-      - description: decide params
-        in: body
-        name: request
-        required: true
-        schema:
-          $ref: '#/definitions/schema.RouterDecideRequest'
-      produces:
-      - application/json
-      responses:
-        "200":
-          description: OK
-          schema:
-            $ref: '#/definitions/schema.RouterDecideResponse'
-        "400":
-          description: Bad Request
-          schema:
-            additionalProperties:
-              type: string
-            type: object
-        "404":
-          description: Not Found
-          schema:
-            additionalProperties:
-              type: string
-            type: object
-        "500":
-          description: Internal Server Error
-          schema:
-            additionalProperties:
-              type: string
-            type: object
-        "503":
-          description: Service Unavailable
-          schema:
-            additionalProperties:
-              type: string
-            type: object
-      summary: Classify a prompt against a router model's policies (decision oracle)
-      tags:
-      - router
  /api/traces:
    get:
      description: Returns captured API exchange traces (request/response pairs) in