fix(gallery/ltx-2.3): add vae_decode_only:false for i2v / flf2v

LTX-2.3 i2v inference fails inside generate_video with:

    [ERROR] LTXAV image conditioning requires VAE encoder weights; create the context with vae_decode_only=false

Without vae_decode_only:false in the options block, gosd.cpp creates the sd_ctx with the VAE encoder weights freed, so latent encoding of the init_image is impossible. Adding the option mirrors what we already do for the Wan i2v entries.

Affects all six LTX-2.3 entries (dev/distilled × UD-Q4_K_M, Q4_K_M, Q8_0). T2V was not impacted by the missing option because it has no init image to encode, which is why the earlier T2V smoke test passed.

Assisted-by: Claude:claude-opus-4-7
fix(gallery/ltx-2.3): add diffusion_model flag to all variants
2026-05-25 17:18:18 -04:00 · 2026-05-25 19:33:55 +00:00 · 2026-05-25 16:49:37 +00:00
11 changed files with 138 additions and 946 deletions
--- a/backend/go/stablediffusion-ggml/cpp/gosd.cpp
+++ b/backend/go/stablediffusion-ggml/cpp/gosd.cpp
@@ -27,7 +27,6 @@
 #include <stdlib.h>
 #include <regex>
 #include <errno.h>
-#include <inttypes.h>
 #include <signal.h>
 #include <unistd.h>
 #include <sys/wait.h>
@@ -1076,71 +1075,9 @@ static uint8_t* load_and_resize_image(const char* path, int target_width, int ta
    return buf;
 }

-// Write sd.cpp's audio buffer to a temp WAV file (IEEE float, interleaved).
-// sd_audio_t.data is planar (all channel 0 samples, then channel 1, etc.) — we
-// interleave on the fly so ffmpeg's standard wav demuxer can read it directly.
-// Returns 0 on success and fills wav_path (must be at least 64 bytes).
-static int write_planar_float_wav(const sd_audio_t* a, char* wav_path, size_t wav_path_sz) {
-    if (!a || !a->data || a->sample_count == 0 || a->channels == 0 || a->sample_rate == 0) {
-        return -1;
-    }
-
-    snprintf(wav_path, wav_path_sz, "/tmp/gosd-audio-XXXXXX.wav");
-    int fd = mkstemps(wav_path, 4);
-    if (fd < 0) { perror("mkstemps wav"); return -1; }
-    FILE* f = fdopen(fd, "wb");
-    if (!f) { perror("fdopen wav"); close(fd); return -1; }
-
-    uint64_t frames = a->sample_count;
-    uint32_t channels = a->channels;
-    uint32_t sample_rate = a->sample_rate;
-    uint64_t total_samples64 = frames * (uint64_t)channels;
-    uint64_t data_bytes64 = total_samples64 * sizeof(float);
-    if (data_bytes64 > 0xFFFFFFFFull - 44) {
-        fprintf(stderr, "audio too large for 32-bit WAV (%" PRIu64 " bytes)\n", data_bytes64);
-        fclose(f);
-        unlink(wav_path);
-        return -1;
-    }
-    uint32_t data_bytes = (uint32_t)data_bytes64;
-    uint32_t riff_size = 36 + data_bytes;
-    uint16_t fmt_code = 3;                // WAVE_FORMAT_IEEE_FLOAT
-    uint16_t bits_per_sample = 32;
-    uint16_t block_align = (uint16_t)(channels * sizeof(float));
-    uint32_t byte_rate = sample_rate * block_align;
-    uint16_t ch16 = (uint16_t)channels;
-    uint32_t fmt_size = 16;
-
-    fwrite("RIFF", 1, 4, f);
-    fwrite(&riff_size, 4, 1, f);
-    fwrite("WAVEfmt ", 1, 8, f);
-    fwrite(&fmt_size, 4, 1, f);
-    fwrite(&fmt_code, 2, 1, f);
-    fwrite(&ch16, 2, 1, f);
-    fwrite(&sample_rate, 4, 1, f);
-    fwrite(&byte_rate, 4, 1, f);
-    fwrite(&block_align, 2, 1, f);
-    fwrite(&bits_per_sample, 2, 1, f);
-    fwrite("data", 1, 4, f);
-    fwrite(&data_bytes, 4, 1, f);
-
-    // Interleave planar [ch0_samples..., ch1_samples...] → [ch0_s0, ch1_s0, ...]
-    for (uint64_t s = 0; s < frames; s++) {
-        for (uint32_t c = 0; c < channels; c++) {
-            float v = a->data[(size_t)c * frames + s];
-            fwrite(&v, sizeof(float), 1, f);
-        }
-    }
-    fclose(f);
-    return 0;
-}
-
 // Pipe raw RGB/RGBA frames to ffmpeg stdin and let it produce an MP4 at dst.
-// Uses fork+execvp to avoid shell interpretation of dst. When `audio` is
-// non-null, the audio waveform is staged to a temp WAV and added as a second
-// ffmpeg input so the final MP4 contains both video and AAC audio.
-static int ffmpeg_mux_raw_to_mp4(sd_image_t* frames, int num_frames, int fps,
-                                  const sd_audio_t* audio, const char* dst) {
+// Uses fork+execvp to avoid shell interpretation of dst.
+static int ffmpeg_mux_raw_to_mp4(sd_image_t* frames, int num_frames, int fps, const char* dst) {
    if (num_frames <= 0 || !frames || !frames[0].data) {
        fprintf(stderr, "ffmpeg_mux: empty frames\n");
        return 1;
@@ -1155,87 +1092,38 @@ static int ffmpeg_mux_raw_to_mp4(sd_image_t* frames, int num_frames, int fps,
    snprintf(size_str, sizeof(size_str), "%dx%d", width, height);
    snprintf(fps_str, sizeof(fps_str), "%d", fps);

-    // Optional audio: write a temp WAV file if the model produced audio.
-    char wav_path[64] = {0};
-    bool have_audio = false;
-    if (audio && audio->data && audio->sample_count > 0 && audio->channels > 0 && audio->sample_rate > 0) {
-        if (write_planar_float_wav(audio, wav_path, sizeof(wav_path)) == 0) {
-            have_audio = true;
-            fprintf(stderr, "ffmpeg_mux: audio %u Hz × %u ch × %" PRIu64 " frames → %s\n",
-                    audio->sample_rate, audio->channels, audio->sample_count, wav_path);
-        } else {
-            fprintf(stderr, "ffmpeg_mux: failed to stage audio; producing silent video\n");
-        }
-    }
-
    int pipefd[2];
-    if (pipe(pipefd) != 0) {
-        perror("pipe");
-        if (have_audio) unlink(wav_path);
-        return 1;
-    }
+    if (pipe(pipefd) != 0) { perror("pipe"); return 1; }

    pid_t pid = fork();
-    if (pid < 0) {
-        perror("fork");
-        close(pipefd[0]); close(pipefd[1]);
-        if (have_audio) unlink(wav_path);
-        return 1;
-    }
+    if (pid < 0) { perror("fork"); close(pipefd[0]); close(pipefd[1]); return 1; }

    if (pid == 0) {
        // child
        close(pipefd[1]);
        if (dup2(pipefd[0], STDIN_FILENO) < 0) { perror("dup2"); _exit(127); }
        close(pipefd[0]);
-        std::vector<char*> argv;
-        argv.push_back(const_cast<char*>("ffmpeg"));
-        argv.push_back(const_cast<char*>("-y"));
-        argv.push_back(const_cast<char*>("-hide_banner"));
-        argv.push_back(const_cast<char*>("-loglevel"));
-        argv.push_back(const_cast<char*>("warning"));
-        // Input 0: raw video from stdin
-        argv.push_back(const_cast<char*>("-f"));
-        argv.push_back(const_cast<char*>("rawvideo"));
-        argv.push_back(const_cast<char*>("-pix_fmt"));
-        argv.push_back(const_cast<char*>(pix_fmt_in));
-        argv.push_back(const_cast<char*>("-s"));
-        argv.push_back(size_str);
-        argv.push_back(const_cast<char*>("-framerate"));
-        argv.push_back(fps_str);
-        argv.push_back(const_cast<char*>("-i"));
-        argv.push_back(const_cast<char*>("-"));
-        // Input 1: optional audio WAV
-        if (have_audio) {
-            argv.push_back(const_cast<char*>("-i"));
-            argv.push_back(wav_path);
-            argv.push_back(const_cast<char*>("-map"));
-            argv.push_back(const_cast<char*>("0:v:0"));
-            argv.push_back(const_cast<char*>("-map"));
-            argv.push_back(const_cast<char*>("1:a:0"));
-            argv.push_back(const_cast<char*>("-c:a"));
-            argv.push_back(const_cast<char*>("aac"));
-            argv.push_back(const_cast<char*>("-b:a"));
-            argv.push_back(const_cast<char*>("192k"));
-            // -shortest so the final clip ends with the shorter of the two
-            // streams — guards against an audio buffer that overshoots the
-            // video duration (or vice versa) on certain LTX variants.
-            argv.push_back(const_cast<char*>("-shortest"));
-        }
-        argv.push_back(const_cast<char*>("-c:v"));
-        argv.push_back(const_cast<char*>("libx264"));
-        argv.push_back(const_cast<char*>("-pix_fmt"));
-        argv.push_back(const_cast<char*>("yuv420p"));
-        argv.push_back(const_cast<char*>("-movflags"));
-        argv.push_back(const_cast<char*>("+faststart"));
-        // Force MP4 container. Distributed LocalAI hands us a staging
-        // path (e.g. /staging/localai-output-NNN.tmp) with a non-standard
-        // extension; relying on filename suffix makes ffmpeg bail with
-        // "Unable to choose an output format".
-        argv.push_back(const_cast<char*>("-f"));
-        argv.push_back(const_cast<char*>("mp4"));
-        argv.push_back(const_cast<char*>(dst));
-        argv.push_back(nullptr);
+        std::vector<char*> argv = {
+            const_cast<char*>("ffmpeg"),
+            const_cast<char*>("-y"),
+            const_cast<char*>("-hide_banner"),
+            const_cast<char*>("-loglevel"), const_cast<char*>("warning"),
+            const_cast<char*>("-f"), const_cast<char*>("rawvideo"),
+            const_cast<char*>("-pix_fmt"), const_cast<char*>(pix_fmt_in),
+            const_cast<char*>("-s"), size_str,
+            const_cast<char*>("-framerate"), fps_str,
+            const_cast<char*>("-i"), const_cast<char*>("-"),
+            const_cast<char*>("-c:v"), const_cast<char*>("libx264"),
+            const_cast<char*>("-pix_fmt"), const_cast<char*>("yuv420p"),
+            const_cast<char*>("-movflags"), const_cast<char*>("+faststart"),
+            // Force MP4 container. Distributed LocalAI hands us a staging
+            // path (e.g. /staging/localai-output-NNN.tmp) with a non-standard
+            // extension; relying on filename suffix makes ffmpeg bail with
+            // "Unable to choose an output format".
+            const_cast<char*>("-f"), const_cast<char*>("mp4"),
+            const_cast<char*>(dst),
+            nullptr
+        };
        execvp(argv[0], argv.data());
        perror("execvp ffmpeg");
        _exit(127);
@@ -1260,7 +1148,6 @@ static int ffmpeg_mux_raw_to_mp4(sd_image_t* frames, int num_frames, int fps,
                close(pipefd[1]);
                int status;
                waitpid(pid, &status, 0);
-                if (have_audio) unlink(wav_path);
                return 1;
            }
            p += n;
@@ -1271,13 +1158,8 @@ static int ffmpeg_mux_raw_to_mp4(sd_image_t* frames, int num_frames, int fps,

    int status = 0;
    while (waitpid(pid, &status, 0) < 0) {
-        if (errno != EINTR) {
-            perror("waitpid");
-            if (have_audio) unlink(wav_path);
-            return 1;
-        }
+        if (errno != EINTR) { perror("waitpid"); return 1; }
    }
-    if (have_audio) unlink(wav_path);
    if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
        fprintf(stderr, "ffmpeg exited with status %d\n", status);
        return 1;
@@ -1352,7 +1234,7 @@ int gen_video(sd_vid_gen_params_t *p, int steps, char *dst, float cfg_scale, int

    fprintf(stderr, "Generated %d frames, muxing to %s via ffmpeg\n", num_frames_out, dst);

-    int rc = ffmpeg_mux_raw_to_mp4(frames, num_frames_out, fps, audio, dst);
+    int rc = ffmpeg_mux_raw_to_mp4(frames, num_frames_out, fps, dst);

    for (int i = 0; i < num_frames_out; i++) {
        if (frames[i].data) free(frames[i].data);
--- a/core/http/endpoints/openai/chat.go
+++ b/core/http/endpoints/openai/chat.go
@@ -68,57 +68,6 @@ func mergeToolCallDeltas(existing []schema.ToolCall, deltas []schema.ToolCall) [
 	return existing
 }

-// applyAutoparserOverride replaces the Go-side reasoning-extraction result with
-// the C++ autoparser's classified ChatDeltas when those deltas contain
-// actionable content or reasoning. It preserves the original logprobs.
-//
-// When the autoparser did not classify any reasoning (deltaReasoning == "") but
-// deltaContent still carries an unparsed reasoning tag pair (e.g. the
-// non-jinja "pure content" fallback path on a <think> model — issue #9985),
-// the Go-side reasoning extractor is run on deltaContent as a defensive
-// fallback so <think>…</think> blocks do not leak into the OpenAI `content`
-// field.
-func applyAutoparserOverride(
-	chatDeltas []*pb.ChatDelta,
-	thinkingStartToken string,
-	reasoningConfig reason.Config,
-	existing []schema.Choice,
-) []schema.Choice {
-	if len(chatDeltas) == 0 {
-		return existing
-	}
-	deltaContent := functions.ContentFromChatDeltas(chatDeltas)
-	deltaReasoning := functions.ReasoningFromChatDeltas(chatDeltas)
-	if deltaContent == "" && deltaReasoning == "" {
-		return existing
-	}
-	// Fallback for non-jinja models (issue #9985): when the C++ autoparser
-	// did not classify reasoning but the raw content still contains a known
-	// reasoning tag pair, run Go-side extraction on the content so that the
-	// <think>…</think> block does not leak into the OpenAI `content` field.
-	// When the autoparser DID populate ReasoningContent, leave its
-	// content/reasoning split alone — trust the parser. We replace
-	// deltaContent unconditionally because ExtractReasoningWithConfig is a
-	// no-op when no tag pair matches; this also strips empty thinking
-	// blocks like "<think></think>" that some models emit when reasoning
-	// is disabled.
-	if deltaReasoning == "" && deltaContent != "" {
-		deltaReasoning, deltaContent = reason.ExtractReasoningWithConfig(deltaContent, thinkingStartToken, reasoningConfig)
-	}
-	xlog.Debug("[ChatDeltas] non-SSE no-tools: overriding result with C++ autoparser deltas",
-		"content_len", len(deltaContent), "reasoning_len", len(deltaReasoning))
-	stopReason := FinishReasonStop
-	message := &schema.Message{Role: "assistant", Content: &deltaContent}
-	if deltaReasoning != "" {
-		message.Reasoning = &deltaReasoning
-	}
-	newChoice := schema.Choice{FinishReason: &stopReason, Index: 0, Message: message}
-	if len(existing) > 0 && existing[0].Logprobs != nil {
-		newChoice.Logprobs = existing[0].Logprobs
-	}
-	return []schema.Choice{newChoice}
-}
-
 // ChatEndpoint is the OpenAI Completion API endpoint https://platform.openai.com/docs/api-reference/chat/create
 // @Summary Generate a chat completions for a given prompt and model.
 // @Tags inference
@@ -808,8 +757,24 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 				// For non-tool requests: prefer C++ autoparser chat deltas over
 				// Go-side tag extraction (which can mangle output when thinkingStartToken
 				// differs from the model's actual reasoning tags, e.g. Gemma 4).
-				if !shouldUseFn {
-					result = applyAutoparserOverride(chatDeltas, thinkingStartToken, config.ReasoningConfig, result)
+				if !shouldUseFn && len(chatDeltas) > 0 {
+					deltaContent := functions.ContentFromChatDeltas(chatDeltas)
+					deltaReasoning := functions.ReasoningFromChatDeltas(chatDeltas)
+					if deltaContent != "" || deltaReasoning != "" {
+						xlog.Debug("[ChatDeltas] non-SSE no-tools: overriding result with C++ autoparser deltas",
+							"content_len", len(deltaContent), "reasoning_len", len(deltaReasoning))
+						stopReason := FinishReasonStop
+						message := &schema.Message{Role: "assistant", Content: &deltaContent}
+						if deltaReasoning != "" {
+							message.Reasoning = &deltaReasoning
+						}
+						newChoice := schema.Choice{FinishReason: &stopReason, Index: 0, Message: message}
+						// Preserve logprobs from the original result
+						if len(result) > 0 && result[0].Logprobs != nil {
+							newChoice.Logprobs = result[0].Logprobs
+						}
+						result = []schema.Choice{newChoice}
+					}
 				}

 				// Tool parsing is deferred here (only when shouldUseFn) so chat deltas are available
--- a/core/http/endpoints/openai/chat_stream_workers.go
+++ b/core/http/endpoints/openai/chat_stream_workers.go
@@ -52,13 +52,6 @@ func processStream(
 	thinkingStartToken := reason.DetectThinkingStartToken(template, &cfg.ReasoningConfig)
 	extractor := reason.NewReasoningExtractor(thinkingStartToken, cfg.ReasoningConfig)

-	// preferAutoparser is sticky: once the C++ autoparser has ever classified
-	// reasoning_content, we trust it for the rest of the stream. Until then we
-	// fall back to Go-side extraction so that a "pure content" autoparser
-	// (non-jinja path, issue #9985) does not leak <think>…</think> tokens
-	// straight into the OpenAI `content` field.
-	preferAutoparser := false
-
 	_, finalUsage, _, err := ComputeChoices(req, s, cfg, cl, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, tokenUsage backend.TokenUsage) bool {
 		var reasoningDelta, contentDelta string

@@ -71,16 +64,8 @@ func processStream(
 		// Otherwise fall back to Go-side extraction.
 		if tokenUsage.HasChatDeltaContent() {
 			rawReasoning, cd := tokenUsage.ChatDeltaReasoningAndContent()
-			if rawReasoning != "" {
-				preferAutoparser = true
-			}
-			if preferAutoparser {
-				contentDelta = cd
-				reasoningDelta = extractor.ProcessChatDeltaReasoning(rawReasoning)
-			} else {
-				reasoningDelta = goReasoning
-				contentDelta = goContent
-			}
+			contentDelta = cd
+			reasoningDelta = extractor.ProcessChatDeltaReasoning(rawReasoning)
 		} else {
 			reasoningDelta = goReasoning
 			contentDelta = goContent
--- a/core/http/endpoints/openai/chat_test.go
+++ b/core/http/endpoints/openai/chat_test.go
@@ -3,8 +3,6 @@ package openai
 import (
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/pkg/functions"
-	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
-	reason "github.com/mudler/LocalAI/pkg/reasoning"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"

@@ -96,98 +94,6 @@ var _ = Describe("handleQuestion", func() {
 	})
 })

-var _ = Describe("applyAutoparserOverride", func() {
-	// Regression test for https://github.com/mudler/LocalAI/issues/9985.
-	// When LocalAI templates a <think>-style reasoning model outside of jinja
-	// (e.g. the gallery qwen3 entry), the llama.cpp autoparser falls back to
-	// the "pure content" PEG parser which dumps the entire raw response,
-	// including <think>…</think>, into ChatDelta.Content and leaves
-	// ChatDelta.ReasoningContent empty. The Go side previously trusted that
-	// content verbatim and clobbered the tokenCallback's correctly-split
-	// reasoning, so <think> blocks leaked into the OpenAI `content` field.
-	Context("autoparser delivered content with embedded <think> tags and empty reasoning (issue #9985)", func() {
-		It("splits <think>…</think> out of content into the reasoning field", func() {
-			raw := "<think>\nOkay, the user said \"Hello\". I should reply warmly.\n</think>\n\nHello! How can I assist you today? 😊"
-			chatDeltas := []*pb.ChatDelta{
-				{Content: raw, ReasoningContent: ""},
-			}
-
-			result := applyAutoparserOverride(chatDeltas, "", reason.Config{}, nil)
-
-			Expect(result).To(HaveLen(1))
-			Expect(result[0].Message).ToNot(BeNil())
-			Expect(result[0].Message.Content).ToNot(BeNil())
-
-			content := *(result[0].Message.Content.(*string))
-			Expect(content).ToNot(ContainSubstring("<think>"),
-				"raw <think> tag must not leak into OpenAI content field")
-			Expect(content).ToNot(ContainSubstring("</think>"),
-				"raw </think> tag must not leak into OpenAI content field")
-			Expect(content).To(ContainSubstring("Hello! How can I assist you today?"),
-				"the model's actual answer must still be in content")
-
-			Expect(result[0].Message.Reasoning).ToNot(BeNil(),
-				"reasoning extracted from <think>…</think> must populate Reasoning")
-			Expect(*result[0].Message.Reasoning).To(ContainSubstring("Okay, the user said"))
-		})
-
-		It("does not run extraction when the autoparser already populated reasoning", func() {
-			// When the autoparser actually classified reasoning, leave its
-			// content/reasoning split untouched.
-			content := "Hello! How can I assist you today?"
-			reasoning := "Already split by the C++ autoparser."
-			chatDeltas := []*pb.ChatDelta{
-				{Content: content, ReasoningContent: reasoning},
-			}
-
-			result := applyAutoparserOverride(chatDeltas, "", reason.Config{}, nil)
-
-			Expect(result).To(HaveLen(1))
-			Expect(*(result[0].Message.Content.(*string))).To(Equal(content))
-			Expect(result[0].Message.Reasoning).ToNot(BeNil())
-			Expect(*result[0].Message.Reasoning).To(Equal(reasoning))
-		})
-
-		It("passes plain content through unchanged when no reasoning tags are present", func() {
-			content := "Just a normal answer with no reasoning at all."
-			chatDeltas := []*pb.ChatDelta{
-				{Content: content, ReasoningContent: ""},
-			}
-
-			result := applyAutoparserOverride(chatDeltas, "", reason.Config{}, nil)
-
-			Expect(result).To(HaveLen(1))
-			Expect(*(result[0].Message.Content.(*string))).To(Equal(content))
-			Expect(result[0].Message.Reasoning).To(BeNil())
-		})
-
-		It("strips an empty <think></think> block (qwen3 /no_think mode)", func() {
-			// qwen3 with the /no_think directive still emits an empty thinking
-			// block. The Go-side fallback must strip it from content rather than
-			// pass <think></think> through verbatim. No reasoning is set because
-			// the block has no body.
-			raw := "<think>\n\n</think>\n\nHello! How can I assist you today?"
-			chatDeltas := []*pb.ChatDelta{
-				{Content: raw, ReasoningContent: ""},
-			}
-
-			result := applyAutoparserOverride(chatDeltas, "", reason.Config{}, nil)
-
-			Expect(result).To(HaveLen(1))
-			content := *(result[0].Message.Content.(*string))
-			Expect(content).ToNot(ContainSubstring("<think>"))
-			Expect(content).ToNot(ContainSubstring("</think>"))
-			Expect(content).To(ContainSubstring("Hello! How can I assist you today?"))
-		})
-
-		It("returns the existing result when chatDeltas is empty", func() {
-			existing := []schema.Choice{{Index: 7}}
-			result := applyAutoparserOverride(nil, "", reason.Config{}, existing)
-			Expect(result).To(Equal(existing))
-		})
-	})
-})
-
 var _ = Describe("mergeToolCallDeltas", func() {
 	Context("with new tool calls", func() {
 		It("should append new tool calls", func() {
--- a/core/http/endpoints/openai/realtime.go
+++ b/core/http/endpoints/openai/realtime.go
@@ -1572,15 +1572,6 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
 			"tool_calls", len(deltaToolCalls),
 			"content_len", len(deltaContent),
 			"reasoning_len", len(deltaReasoning))
-		// Issue #9985: when the autoparser only delivered content (no
-		// reasoning_content), it may be running in the "pure content"
-		// PEG fallback (non-jinja path) which leaves <think>…</think>
-		// embedded in the content. Run Go-side extraction defensively.
-		// ExtractReasoningWithConfig is a no-op when no tag pair matches,
-		// so it's safe to apply unconditionally in the no-reasoning branch.
-		if deltaReasoning == "" && deltaContent != "" {
-			deltaReasoning, deltaContent = reasoning.ExtractReasoningWithConfig(deltaContent, thinkingStartToken, config.ReasoningConfig)
-		}
 		reasoningText = deltaReasoning
 		responseWithoutReasoning = deltaContent
 		textContent = deltaContent
--- a/core/http/endpoints/openresponses/responses.go
+++ b/core/http/endpoints/openresponses/responses.go
@@ -1971,10 +1971,6 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6

 			// Source reasoning from: (1) ChatDeltas from C++ autoparser, (2) extractor's
 			// streaming state, (3) final extraction from the finetuned result.
-			// Issue #9985: when the autoparser delivered Content but no
-			// ReasoningContent, it was running in the "pure content" PEG fallback
-			// (non-jinja path) which leaves reasoning tags embedded in content.
-			// Fall back to the streaming Go-side extractor's split in that case.
 			if chatDeltaReasoning := functions.ReasoningFromChatDeltas(chatDeltas); chatDeltaReasoning != "" {
 				finalReasoning = chatDeltaReasoning
 				finalCleanedResult = functions.ContentFromChatDeltas(chatDeltas)
--- a/docs/data/version.json
+++ b/docs/data/version.json
@@ -1,3 +1,3 @@
 {
-  "version": "v4.3.1"
+  "version": "v4.2.6"
 }
--- a/gallery/qwen3.yaml
+++ b/gallery/qwen3.yaml
@@ -11,12 +11,36 @@ config_file: |
        - <dummy32000>
        - </s>
        - <|endoftext|>
-    # Delegate templating to llama.cpp's jinja runtime so the C++ autoparser
-    # can classify <think>…</think> blocks into reasoning_content natively
-    # (issue #9985). Without use_jinja the autoparser falls back to a
-    # "pure content" PEG parser that leaks reasoning tags into content.
-    options:
-        - use_jinja:true
    template:
-        use_tokenizer_template: true
+        chat: |
+            {{.Input -}}
+            <|im_start|>assistant
+        chat_message: |
+            <|im_start|>{{if eq .RoleName "tool" }}user{{else}}{{ .RoleName }}{{end}}
+            {{ if eq .RoleName "tool" -}}
+            <tool_response>
+            {{ end -}}
+            {{ if .Content -}}
+            {{.Content }}
+            {{ end -}}
+            {{ if eq .RoleName "tool" -}}
+            </tool_response>
+            {{ end -}}
+            {{ if .FunctionCall -}}
+            <tool_call>
+            {{toJson .FunctionCall}}
+            </tool_call>
+            {{ end -}}<|im_end|>
+        completion: |
+            {{.Input}}
+        function: |
+            <|im_start|>system
+            You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
+            {{range .Functions}}
+            {"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}
+            {{end}}
+            For each function call return a json object with function name and arguments: {"name": <function-name>, "arguments": <json-arguments-object>}
+            <|im_end|>
+            {{.Input -}}
+            <|im_start|>assistant
 name: qwen3
--- a/swagger/docs.go
+++ b/swagger/docs.go
@@ -1121,117 +1121,6 @@ const docTemplate = `{
                }
            }
        },
-        "/api/pii/decide": {
-            "post": {
-                "consumes": [
-                    "application/json"
-                ],
-                "produces": [
-                    "application/json"
-                ],
-                "tags": [
-                    "pii"
-                ],
-                "summary": "Scan text for PII and return findings + suggested action (decision oracle)",
-                "parameters": [
-                    {
-                        "description": "decide params",
-                        "name": "request",
-                        "in": "body",
-                        "required": true,
-                        "schema": {
-                            "$ref": "#/definitions/schema.PIIDecideRequest"
-                        }
-                    }
-                ],
-                "responses": {
-                    "200": {
-                        "description": "OK",
-                        "schema": {
-                            "$ref": "#/definitions/schema.PIIDecideResponse"
-                        }
-                    },
-                    "400": {
-                        "description": "Bad Request",
-                        "schema": {
-                            "type": "object",
-                            "additionalProperties": {
-                                "type": "string"
-                            }
-                        }
-                    }
-                }
-            }
-        },
-        "/api/router/decide": {
-            "post": {
-                "consumes": [
-                    "application/json"
-                ],
-                "produces": [
-                    "application/json"
-                ],
-                "tags": [
-                    "router"
-                ],
-                "summary": "Classify a prompt against a router model's policies (decision oracle)",
-                "parameters": [
-                    {
-                        "description": "decide params",
-                        "name": "request",
-                        "in": "body",
-                        "required": true,
-                        "schema": {
-                            "$ref": "#/definitions/schema.RouterDecideRequest"
-                        }
-                    }
-                ],
-                "responses": {
-                    "200": {
-                        "description": "OK",
-                        "schema": {
-                            "$ref": "#/definitions/schema.RouterDecideResponse"
-                        }
-                    },
-                    "400": {
-                        "description": "Bad Request",
-                        "schema": {
-                            "type": "object",
-                            "additionalProperties": {
-                                "type": "string"
-                            }
-                        }
-                    },
-                    "404": {
-                        "description": "Not Found",
-                        "schema": {
-                            "type": "object",
-                            "additionalProperties": {
-                                "type": "string"
-                            }
-                        }
-                    },
-                    "500": {
-                        "description": "Internal Server Error",
-                        "schema": {
-                            "type": "object",
-                            "additionalProperties": {
-                                "type": "string"
-                            }
-                        }
-                    },
-                    "503": {
-                        "description": "Service Unavailable",
-                        "schema": {
-                            "type": "object",
-                            "additionalProperties": {
-                                "type": "string"
-                            }
-                        }
-                    }
-                }
-            }
-        },
        "/api/traces": {
            "get": {
                "description": "Returns captured API exchange traces (request/response pairs) in reverse chronological order",
@@ -3397,6 +3286,7 @@ const docTemplate = `{
                "downloaded_size": {
                    "type": "string"
                },
+                "error": {},
                "file_name": {
                    "type": "string"
                },
@@ -4819,6 +4709,27 @@ const docTemplate = `{
                    "description": "The message role",
                    "type": "string"
                },
+                "string_audios": {
+                    "type": "array",
+                    "items": {
+                        "type": "string"
+                    }
+                },
+                "string_content": {
+                    "type": "string"
+                },
+                "string_images": {
+                    "type": "array",
+                    "items": {
+                        "type": "string"
+                    }
+                },
+                "string_videos": {
+                    "type": "array",
+                    "items": {
+                        "type": "string"
+                    }
+                },
                "tool_call_id": {
                    "type": "string"
                },
@@ -5412,10 +5323,6 @@ const docTemplate = `{
                        }
                    ]
                },
-                "max_completion_tokens": {
-                    "description": "MaxCompletionTokens is the modern alias for max_tokens\n(OpenAI deprecated max_tokens; gpt-5 / o-series reject it).\nAccepted on the wire so up-to-date clients can use the new\nname; the request middleware collapses it into Maxtokens so\ninternal code reads exactly one field.",
-                    "type": "integer"
-                },
                "max_tokens": {
                    "type": "integer"
                },
@@ -5747,109 +5654,6 @@ const docTemplate = `{
                }
            }
        },
-        "schema.PIIDecideRequest": {
-            "type": "object",
-            "properties": {
-                "text": {
-                    "description": "Text is the user-visible content to inspect. Required.",
-                    "type": "string"
-                }
-            }
-        },
-        "schema.PIIDecideResponse": {
-            "type": "object",
-            "properties": {
-                "findings": {
-                    "description": "Findings is one entry per matched span — pattern id, byte\nrange, and audit-safe hash prefix (never the matched value).",
-                    "type": "array",
-                    "items": {
-                        "$ref": "#/definitions/schema.PIIFinding"
-                    }
-                },
-                "redacted_preview": {
-                    "description": "RedactedPreview is the input with mask-action spans replaced\nby their placeholders. Identical to Text when no findings or\nwhen the strongest action is block/route_local (which don't\nrewrite content).",
-                    "type": "string"
-                },
-                "suggested_action": {
-                    "description": "SuggestedAction is the strongest action across all findings:\n\"block\", \"route_local\", \"mask\", or \"allow\" (no findings).",
-                    "type": "string"
-                }
-            }
-        },
-        "schema.PIIFinding": {
-            "type": "object",
-            "properties": {
-                "end": {
-                    "type": "integer"
-                },
-                "hash_prefix": {
-                    "type": "string"
-                },
-                "pattern": {
-                    "type": "string"
-                },
-                "start": {
-                    "type": "integer"
-                }
-            }
-        },
-        "schema.RouterDecideRequest": {
-            "type": "object",
-            "properties": {
-                "input": {
-                    "description": "Input is the user-visible prompt text to classify. Required.\nSchema-shape extraction (chat-message concatenation, etc.) is\nthe caller's responsibility — matches the Probe contract used\nby the in-band middleware.",
-                    "type": "string"
-                },
-                "router": {
-                    "description": "Router is the name of the router model (a ModelConfig with a\n` + "`" + `router:` + "`" + ` block). Required.",
-                    "type": "string"
-                }
-            }
-        },
-        "schema.RouterDecideResponse": {
-            "type": "object",
-            "properties": {
-                "cache_similarity": {
-                    "description": "CacheSimilarity carries the cosine similarity of the cache hit\n(0 when not cached).",
-                    "type": "number"
-                },
-                "cached": {
-                    "description": "Cached is true when the decision came from the L2 embedding\ncache rather than a fresh classifier run.",
-                    "type": "boolean"
-                },
-                "candidate": {
-                    "description": "Candidate is the model that would be routed to. Empty when no\ncandidate covers Labels AND no fallback is configured.",
-                    "type": "string"
-                },
-                "classifier": {
-                    "description": "Classifier is the classifier name that produced the decision\n(e.g. \"score\").",
-                    "type": "string"
-                },
-                "fallback": {
-                    "description": "Fallback is true when Candidate is the router's configured\nfallback because no candidate covered Labels. Lets callers\ndistinguish \"matched\" from \"fell back\" without comparing names.",
-                    "type": "boolean"
-                },
-                "labels": {
-                    "description": "Labels is the set of active policy labels.",
-                    "type": "array",
-                    "items": {
-                        "type": "string"
-                    }
-                },
-                "latency_ms": {
-                    "description": "LatencyMs is the classifier's wall-clock cost.",
-                    "type": "integer"
-                },
-                "router": {
-                    "description": "Router echoes the requested router model.",
-                    "type": "string"
-                },
-                "score": {
-                    "description": "Score is the top label's softmax probability (the\nclassifier-side confidence signal).",
-                    "type": "number"
-                }
-            }
-        },
        "schema.StreamOptions": {
            "type": "object",
            "properties": {
--- a/swagger/swagger.json
+++ b/swagger/swagger.json
@@ -1118,117 +1118,6 @@
                }
            }
        },
-        "/api/pii/decide": {
-            "post": {
-                "consumes": [
-                    "application/json"
-                ],
-                "produces": [
-                    "application/json"
-                ],
-                "tags": [
-                    "pii"
-                ],
-                "summary": "Scan text for PII and return findings + suggested action (decision oracle)",
-                "parameters": [
-                    {
-                        "description": "decide params",
-                        "name": "request",
-                        "in": "body",
-                        "required": true,
-                        "schema": {
-                            "$ref": "#/definitions/schema.PIIDecideRequest"
-                        }
-                    }
-                ],
-                "responses": {
-                    "200": {
-                        "description": "OK",
-                        "schema": {
-                            "$ref": "#/definitions/schema.PIIDecideResponse"
-                        }
-                    },
-                    "400": {
-                        "description": "Bad Request",
-                        "schema": {
-                            "type": "object",
-                            "additionalProperties": {
-                                "type": "string"
-                            }
-                        }
-                    }
-                }
-            }
-        },
-        "/api/router/decide": {
-            "post": {
-                "consumes": [
-                    "application/json"
-                ],
-                "produces": [
-                    "application/json"
-                ],
-                "tags": [
-                    "router"
-                ],
-                "summary": "Classify a prompt against a router model's policies (decision oracle)",
-                "parameters": [
-                    {
-                        "description": "decide params",
-                        "name": "request",
-                        "in": "body",
-                        "required": true,
-                        "schema": {
-                            "$ref": "#/definitions/schema.RouterDecideRequest"
-                        }
-                    }
-                ],
-                "responses": {
-                    "200": {
-                        "description": "OK",
-                        "schema": {
-                            "$ref": "#/definitions/schema.RouterDecideResponse"
-                        }
-                    },
-                    "400": {
-                        "description": "Bad Request",
-                        "schema": {
-                            "type": "object",
-                            "additionalProperties": {
-                                "type": "string"
-                            }
-                        }
-                    },
-                    "404": {
-                        "description": "Not Found",
-                        "schema": {
-                            "type": "object",
-                            "additionalProperties": {
-                                "type": "string"
-                            }
-                        }
-                    },
-                    "500": {
-                        "description": "Internal Server Error",
-                        "schema": {
-                            "type": "object",
-                            "additionalProperties": {
-                                "type": "string"
-                            }
-                        }
-                    },
-                    "503": {
-                        "description": "Service Unavailable",
-                        "schema": {
-                            "type": "object",
-                            "additionalProperties": {
-                                "type": "string"
-                            }
-                        }
-                    }
-                }
-            }
-        },
        "/api/traces": {
            "get": {
                "description": "Returns captured API exchange traces (request/response pairs) in reverse chronological order",
@@ -3394,6 +3283,7 @@
                "downloaded_size": {
                    "type": "string"
                },
+                "error": {},
                "file_name": {
                    "type": "string"
                },
@@ -4816,6 +4706,27 @@
                    "description": "The message role",
                    "type": "string"
                },
+                "string_audios": {
+                    "type": "array",
+                    "items": {
+                        "type": "string"
+                    }
+                },
+                "string_content": {
+                    "type": "string"
+                },
+                "string_images": {
+                    "type": "array",
+                    "items": {
+                        "type": "string"
+                    }
+                },
+                "string_videos": {
+                    "type": "array",
+                    "items": {
+                        "type": "string"
+                    }
+                },
                "tool_call_id": {
                    "type": "string"
                },
@@ -5409,10 +5320,6 @@
                        }
                    ]
                },
-                "max_completion_tokens": {
-                    "description": "MaxCompletionTokens is the modern alias for max_tokens\n(OpenAI deprecated max_tokens; gpt-5 / o-series reject it).\nAccepted on the wire so up-to-date clients can use the new\nname; the request middleware collapses it into Maxtokens so\ninternal code reads exactly one field.",
-                    "type": "integer"
-                },
                "max_tokens": {
                    "type": "integer"
                },
@@ -5744,109 +5651,6 @@
                }
            }
        },
-        "schema.PIIDecideRequest": {
-            "type": "object",
-            "properties": {
-                "text": {
-                    "description": "Text is the user-visible content to inspect. Required.",
-                    "type": "string"
-                }
-            }
-        },
-        "schema.PIIDecideResponse": {
-            "type": "object",
-            "properties": {
-                "findings": {
-                    "description": "Findings is one entry per matched span — pattern id, byte\nrange, and audit-safe hash prefix (never the matched value).",
-                    "type": "array",
-                    "items": {
-                        "$ref": "#/definitions/schema.PIIFinding"
-                    }
-                },
-                "redacted_preview": {
-                    "description": "RedactedPreview is the input with mask-action spans replaced\nby their placeholders. Identical to Text when no findings or\nwhen the strongest action is block/route_local (which don't\nrewrite content).",
-                    "type": "string"
-                },
-                "suggested_action": {
-                    "description": "SuggestedAction is the strongest action across all findings:\n\"block\", \"route_local\", \"mask\", or \"allow\" (no findings).",
-                    "type": "string"
-                }
-            }
-        },
-        "schema.PIIFinding": {
-            "type": "object",
-            "properties": {
-                "end": {
-                    "type": "integer"
-                },
-                "hash_prefix": {
-                    "type": "string"
-                },
-                "pattern": {
-                    "type": "string"
-                },
-                "start": {
-                    "type": "integer"
-                }
-            }
-        },
-        "schema.RouterDecideRequest": {
-            "type": "object",
-            "properties": {
-                "input": {
-                    "description": "Input is the user-visible prompt text to classify. Required.\nSchema-shape extraction (chat-message concatenation, etc.) is\nthe caller's responsibility — matches the Probe contract used\nby the in-band middleware.",
-                    "type": "string"
-                },
-                "router": {
-                    "description": "Router is the name of the router model (a ModelConfig with a\n`router:` block). Required.",
-                    "type": "string"
-                }
-            }
-        },
-        "schema.RouterDecideResponse": {
-            "type": "object",
-            "properties": {
-                "cache_similarity": {
-                    "description": "CacheSimilarity carries the cosine similarity of the cache hit\n(0 when not cached).",
-                    "type": "number"
-                },
-                "cached": {
-                    "description": "Cached is true when the decision came from the L2 embedding\ncache rather than a fresh classifier run.",
-                    "type": "boolean"
-                },
-                "candidate": {
-                    "description": "Candidate is the model that would be routed to. Empty when no\ncandidate covers Labels AND no fallback is configured.",
-                    "type": "string"
-                },
-                "classifier": {
-                    "description": "Classifier is the classifier name that produced the decision\n(e.g. \"score\").",
-                    "type": "string"
-                },
-                "fallback": {
-                    "description": "Fallback is true when Candidate is the router's configured\nfallback because no candidate covered Labels. Lets callers\ndistinguish \"matched\" from \"fell back\" without comparing names.",
-                    "type": "boolean"
-                },
-                "labels": {
-                    "description": "Labels is the set of active policy labels.",
-                    "type": "array",
-                    "items": {
-                        "type": "string"
-                    }
-                },
-                "latency_ms": {
-                    "description": "LatencyMs is the classifier's wall-clock cost.",
-                    "type": "integer"
-                },
-                "router": {
-                    "description": "Router echoes the requested router model.",
-                    "type": "string"
-                },
-                "score": {
-                    "description": "Score is the top label's softmax probability (the\nclassifier-side confidence signal).",
-                    "type": "number"
-                }
-            }
-        },
        "schema.StreamOptions": {
            "type": "object",
            "properties": {
--- a/swagger/swagger.yaml
+++ b/swagger/swagger.yaml
@@ -244,6 +244,7 @@ definitions:
        type: boolean
      downloaded_size:
        type: string
+      error: {}
      file_name:
        type: string
      file_size:
@@ -1225,6 +1226,20 @@ definitions:
      role:
        description: The message role
        type: string
+      string_audios:
+        items:
+          type: string
+        type: array
+      string_content:
+        type: string
+      string_images:
+        items:
+          type: string
+        type: array
+      string_videos:
+        items:
+          type: string
+        type: array
      tool_call_id:
        type: string
      tool_calls:
@@ -1621,14 +1636,6 @@ definitions:
          OpenAI API logprobs parameters
          logprobs: boolean - if true, returns log probabilities of each output token
          top_logprobs: integer 0-20 - number of most likely tokens to return at each token position
-      max_completion_tokens:
-        description: |-
-          MaxCompletionTokens is the modern alias for max_tokens
-          (OpenAI deprecated max_tokens; gpt-5 / o-series reject it).
-          Accepted on the wire so up-to-date clients can use the new
-          name; the request middleware collapses it into Maxtokens so
-          internal code reads exactly one field.
-        type: integer
      max_tokens:
        type: integer
      messages:
@@ -1865,105 +1872,6 @@ definitions:
          $ref: '#/definitions/schema.NodeData'
        type: array
    type: object
-  schema.PIIDecideRequest:
-    properties:
-      text:
-        description: Text is the user-visible content to inspect. Required.
-        type: string
-    type: object
-  schema.PIIDecideResponse:
-    properties:
-      findings:
-        description: |-
-          Findings is one entry per matched span — pattern id, byte
-          range, and audit-safe hash prefix (never the matched value).
-        items:
-          $ref: '#/definitions/schema.PIIFinding'
-        type: array
-      redacted_preview:
-        description: |-
-          RedactedPreview is the input with mask-action spans replaced
-          by their placeholders. Identical to Text when no findings or
-          when the strongest action is block/route_local (which don't
-          rewrite content).
-        type: string
-      suggested_action:
-        description: |-
-          SuggestedAction is the strongest action across all findings:
-          "block", "route_local", "mask", or "allow" (no findings).
-        type: string
-    type: object
-  schema.PIIFinding:
-    properties:
-      end:
-        type: integer
-      hash_prefix:
-        type: string
-      pattern:
-        type: string
-      start:
-        type: integer
-    type: object
-  schema.RouterDecideRequest:
-    properties:
-      input:
-        description: |-
-          Input is the user-visible prompt text to classify. Required.
-          Schema-shape extraction (chat-message concatenation, etc.) is
-          the caller's responsibility — matches the Probe contract used
-          by the in-band middleware.
-        type: string
-      router:
-        description: |-
-          Router is the name of the router model (a ModelConfig with a
-          `router:` block). Required.
-        type: string
-    type: object
-  schema.RouterDecideResponse:
-    properties:
-      cache_similarity:
-        description: |-
-          CacheSimilarity carries the cosine similarity of the cache hit
-          (0 when not cached).
-        type: number
-      cached:
-        description: |-
-          Cached is true when the decision came from the L2 embedding
-          cache rather than a fresh classifier run.
-        type: boolean
-      candidate:
-        description: |-
-          Candidate is the model that would be routed to. Empty when no
-          candidate covers Labels AND no fallback is configured.
-        type: string
-      classifier:
-        description: |-
-          Classifier is the classifier name that produced the decision
-          (e.g. "score").
-        type: string
-      fallback:
-        description: |-
-          Fallback is true when Candidate is the router's configured
-          fallback because no candidate covered Labels. Lets callers
-          distinguish "matched" from "fell back" without comparing names.
-        type: boolean
-      labels:
-        description: Labels is the set of active policy labels.
-        items:
-          type: string
-        type: array
-      latency_ms:
-        description: LatencyMs is the classifier's wall-clock cost.
-        type: integer
-      router:
-        description: Router echoes the requested router model.
-        type: string
-      score:
-        description: |-
-          Score is the top label's softmax probability (the
-          classifier-side confidence signal).
-        type: number
-    type: object
  schema.StreamOptions:
    properties:
      include_usage:
@@ -3076,79 +2984,6 @@ paths:
      summary: Show the P2P token
      tags:
      - p2p
-  /api/pii/decide:
-    post:
-      consumes:
-      - application/json
-      parameters:
-      - description: decide params
-        in: body
-        name: request
-        required: true
-        schema:
-          $ref: '#/definitions/schema.PIIDecideRequest'
-      produces:
-      - application/json
-      responses:
-        "200":
-          description: OK
-          schema:
-            $ref: '#/definitions/schema.PIIDecideResponse'
-        "400":
-          description: Bad Request
-          schema:
-            additionalProperties:
-              type: string
-            type: object
-      summary: Scan text for PII and return findings + suggested action (decision
-        oracle)
-      tags:
-      - pii
-  /api/router/decide:
-    post:
-      consumes:
-      - application/json
-      parameters:
-      - description: decide params
-        in: body
-        name: request
-        required: true
-        schema:
-          $ref: '#/definitions/schema.RouterDecideRequest'
-      produces:
-      - application/json
-      responses:
-        "200":
-          description: OK
-          schema:
-            $ref: '#/definitions/schema.RouterDecideResponse'
-        "400":
-          description: Bad Request
-          schema:
-            additionalProperties:
-              type: string
-            type: object
-        "404":
-          description: Not Found
-          schema:
-            additionalProperties:
-              type: string
-            type: object
-        "500":
-          description: Internal Server Error
-          schema:
-            additionalProperties:
-              type: string
-            type: object
-        "503":
-          description: Service Unavailable
-          schema:
-            additionalProperties:
-              type: string
-            type: object
-      summary: Classify a prompt against a router model's policies (decision oracle)
-      tags:
-      - router
  /api/traces:
    get:
      description: Returns captured API exchange traces (request/response pairs) in