test(reasoning): cover the enable_thinking=false non-thinking-mode regression

Adds an end-to-end test for the case that actually broke session summaries and auto-titles and was not covered before: a request with enable_thinking=false against a <think>-capable model.

In non-thinking mode the model emits no reasoning block, so llama.cpp's autoparser returns ChatDeltas with content set and reasoning_content empty (verified against stock llama-server: the same model with chat_template_kwargs.enable_thinking=false returns reasoning_content=null, content="hello"). thinkingStartToken is still "<think>" because it is detected per-model from the enable_thinking=true render, so the old code prepended it and swallowed the answer. The test fails without the ExtractReasoningComplete gate.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
fix(reasoning): stop prefilled <think> from swallowing tag-less answers
2026-06-09 01:07:09 -04:00 · 2026-06-09 00:06:00 +00:00 · 2026-06-08 23:20:27 +00:00 · 2026-06-08 23:17:50 +02:00
16 changed files with 336 additions and 38 deletions
--- a/backend/cpp/llama-cpp/Makefile
+++ b/backend/cpp/llama-cpp/Makefile
@@ -1,5 +1,5 @@

-LLAMA_VERSION?=9e3b928fd8c9d14dbf15a8768b9fdd7e5c721d66
+LLAMA_VERSION?=28ca1e600c5dac1854fb7e09611914013430b037
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp

 CMAKE_ARGS?=
--- a/backend/cpp/llama-cpp/grpc-server.cpp
+++ b/backend/cpp/llama-cpp/grpc-server.cpp
@@ -381,6 +381,15 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, const
            });
    }

+    // for each video in the request, add the video data
+    for (int i = 0; i < predict->videos_size(); i++) {
+        data["video_data"].push_back(json
+            {
+                {"id", i},
+                {"data",    predict->videos(i)},
+            });
+    }
+
    data["stop"] = predict->stopprompts();
    // data["n_probs"] = predict->nprobs();
    //TODO: images,
@@ -1503,7 +1512,7 @@ public:
                    msg_json["role"] = msg.role();

                    bool is_last_user_msg = (i == last_user_msg_idx);
-                    bool has_images_or_audio = (request->images_size() > 0 || request->audios_size() > 0);
+                    bool has_images_or_audio = (request->images_size() > 0 || request->audios_size() > 0 || request->videos_size() > 0);

                    // Handle content - can be string, null, or array
                    // For multimodal content, we'll embed images/audio from separate fields
@@ -1554,6 +1563,16 @@ public:
                                    content_array.push_back(audio_chunk);
                                }
                            }
+                            if (request->videos_size() > 0) {
+                                for (int j = 0; j < request->videos_size(); j++) {
+                                    json video_chunk;
+                                    video_chunk["type"] = "input_video";
+                                    json input_video;
+                                    input_video["data"] = request->videos(j);
+                                    video_chunk["input_video"] = input_video;
+                                    content_array.push_back(video_chunk);
+                                }
+                            }
                            msg_json["content"] = content_array;
                        } else {
                            // Use content as-is (already array or not last user message)
@@ -1588,6 +1607,16 @@ public:
                                content_array.push_back(audio_chunk);
                            }
                        }
+                        if (request->videos_size() > 0) {
+                            for (int j = 0; j < request->videos_size(); j++) {
+                                json video_chunk;
+                                video_chunk["type"] = "input_video";
+                                json input_video;
+                                input_video["data"] = request->videos(j);
+                                video_chunk["input_video"] = input_video;
+                                content_array.push_back(video_chunk);
+                            }
+                        }
                        msg_json["content"] = content_array;
                    } else if (msg.role() == "tool") {
                        // Tool role messages must have content field set, even if empty
@@ -2039,6 +2068,16 @@ public:
                        files.push_back(decoded_data);
                    }
                }
+
+                const auto &video_data = data.find("video_data");
+                if (video_data != data.end() && video_data->is_array())
+                {
+                    for (const auto &video : *video_data)
+                    {
+                        auto decoded_data = base64_decode(video["data"].get<std::string>());
+                        files.push_back(decoded_data);
+                    }
+                }
            }

            const bool has_mtmd = ctx_server.impl->mctx != nullptr;
@@ -2291,7 +2330,7 @@ public:
                    }

                    bool is_last_user_msg = (i == last_user_msg_idx);
-                    bool has_images_or_audio = (request->images_size() > 0 || request->audios_size() > 0);
+                    bool has_images_or_audio = (request->images_size() > 0 || request->audios_size() > 0 || request->videos_size() > 0);

                    // Handle content - can be string, null, or array
                    // For multimodal content, we'll embed images/audio from separate fields
@@ -2344,6 +2383,16 @@ public:
                                    content_array.push_back(audio_chunk);
                                }
                            }
+                            if (request->videos_size() > 0) {
+                                for (int j = 0; j < request->videos_size(); j++) {
+                                    json video_chunk;
+                                    video_chunk["type"] = "input_video";
+                                    json input_video;
+                                    input_video["data"] = request->videos(j);
+                                    video_chunk["input_video"] = input_video;
+                                    content_array.push_back(video_chunk);
+                                }
+                            }
                            msg_json["content"] = content_array;
                        } else {
                            // Use content as-is (already array or not last user message)
@@ -2383,6 +2432,16 @@ public:
                                content_array.push_back(audio_chunk);
                            }
                        }
+                        if (request->videos_size() > 0) {
+                            for (int j = 0; j < request->videos_size(); j++) {
+                                json video_chunk;
+                                video_chunk["type"] = "input_video";
+                                json input_video;
+                                input_video["data"] = request->videos(j);
+                                video_chunk["input_video"] = input_video;
+                                content_array.push_back(video_chunk);
+                            }
+                        }
                        msg_json["content"] = content_array;
                        SRV_INF("[CONTENT DEBUG] Predict: Message %d created content array with media\n", i);
                    } else if (!msg.tool_calls().empty()) {
@@ -2845,6 +2904,16 @@ public:
                        files.push_back(decoded_data);
                    }
                }
+
+                const auto &video_data = data.find("video_data");
+                if (video_data != data.end() && video_data->is_array())
+                {
+                    for (const auto &video : *video_data)
+                    {
+                        auto decoded_data = base64_decode(video["data"].get<std::string>());
+                        files.push_back(decoded_data);
+                    }
+                }
            }

            // process files
--- a/backend/python/transformers/requirements-cpu.txt
+++ b/backend/python/transformers/requirements-cpu.txt
@@ -2,7 +2,7 @@ torch==2.7.1
 llvmlite==0.43.0
 numba==0.60.0
 accelerate
-transformers>=5.10.2
+transformers>=5.9.0
 bitsandbytes
 sentence-transformers==5.5.1
 diffusers
--- a/backend/python/transformers/requirements-cublas12.txt
+++ b/backend/python/transformers/requirements-cublas12.txt
@@ -2,7 +2,7 @@ torch==2.7.1
 accelerate
 llvmlite==0.43.0
 numba==0.60.0
-transformers>=5.10.2
+transformers>=5.9.0
 bitsandbytes
 sentence-transformers==5.5.1
 diffusers
--- a/backend/python/transformers/requirements-cublas13.txt
+++ b/backend/python/transformers/requirements-cublas13.txt
@@ -2,7 +2,7 @@
 torch==2.9.0
 llvmlite==0.43.0
 numba==0.60.0
-transformers>=5.10.2
+transformers>=5.9.0
 bitsandbytes
 sentence-transformers==5.5.1
 diffusers
--- a/backend/python/transformers/requirements-hipblas.txt
+++ b/backend/python/transformers/requirements-hipblas.txt
@@ -1,7 +1,7 @@
 --extra-index-url https://download.pytorch.org/whl/rocm7.0
 torch==2.10.0+rocm7.0
 accelerate
-transformers>=5.10.2
+transformers>=5.9.0
 llvmlite==0.43.0
 numba==0.60.0
 bitsandbytes
--- a/backend/python/transformers/requirements-intel.txt
+++ b/backend/python/transformers/requirements-intel.txt
@@ -3,7 +3,7 @@ torch
 optimum[openvino]
 llvmlite==0.43.0
 numba==0.60.0
-transformers>=5.10.2
+transformers>=5.9.0
 bitsandbytes
 sentence-transformers==5.5.1
 diffusers
--- a/backend/python/transformers/requirements-mps.txt
+++ b/backend/python/transformers/requirements-mps.txt
@@ -2,7 +2,7 @@ torch==2.7.1
 llvmlite==0.43.0
 numba==0.60.0
 accelerate
-transformers>=5.10.2
+transformers>=5.9.0
 bitsandbytes
 sentence-transformers==5.5.1
 diffusers
--- a/core/http/endpoints/openai/chat.go
+++ b/core/http/endpoints/openai/chat.go
@@ -103,7 +103,12 @@ func applyAutoparserOverride(
 	// blocks like "<think></think>" that some models emit when reasoning
 	// is disabled.
 	if deltaReasoning == "" && deltaContent != "" {
-		deltaReasoning, deltaContent = reason.ExtractReasoningWithConfig(deltaContent, thinkingStartToken, reasoningConfig)
+		// Complete-response extraction: only honor a prefilled <think> start
+		// token when deltaContent actually closes the reasoning block. Without
+		// it the model answered directly and the whole answer must stay in
+		// content rather than be swallowed as unclosed reasoning. See
+		// reason.ExtractReasoningComplete.
+		deltaReasoning, deltaContent = reason.ExtractReasoningComplete(deltaContent, thinkingStartToken, reasoningConfig)
 	}
 	xlog.Debug("[ChatDeltas] non-SSE no-tools: overriding result with C++ autoparser deltas",
 		"content_len", len(deltaContent), "reasoning_len", len(deltaReasoning))
--- a/core/http/endpoints/openai/chat_test.go
+++ b/core/http/endpoints/openai/chat_test.go
@@ -186,6 +186,114 @@ var _ = Describe("applyAutoparserOverride", func() {
 			Expect(result).To(Equal(existing))
 		})
 	})
+
+	// Regression tests for the prefilled-thinking-token path (thinkingStartToken
+	// != ""). This is the configuration the gallery qwen3 family runs in: the
+	// chat template injects <think> into the prompt, so DetectThinkingStartToken
+	// returns "<think>" and the model's output begins *inside* a reasoning block
+	// — it emits a closing </think> but no opening tag.
+	//
+	// The defensive Go-side fallback prepends the start token so the standard
+	// extractor can pair it with the model's </think>. But on a *complete*
+	// response that contains NO closing tag (the model answered directly with no
+	// reasoning at all), prepending <think> manufactures an unclosed block that
+	// swallows the entire answer into reasoning, leaving content empty. That is
+	// the bug: short/direct answers (session names, JSON summaries) come back
+	// with an empty content field.
+	Context("autoparser delivered content with empty reasoning and a prefilled thinking token", func() {
+		const startToken = "<think>"
+
+		It("keeps a tag-less direct answer as content instead of swallowing it as reasoning", func() {
+			// Model answered directly: no <think>, no </think> anywhere.
+			chatDeltas := []*pb.ChatDelta{
+				{Content: "hello", ReasoningContent: ""},
+			}
+
+			result := applyAutoparserOverride(chatDeltas, startToken, reason.Config{}, nil)
+
+			Expect(result).To(HaveLen(1))
+			Expect(result[0].Message.Content).ToNot(BeNil())
+			Expect(*(result[0].Message.Content.(*string))).To(Equal("hello"),
+				"a complete answer with no closing reasoning tag must stay in content")
+			Expect(result[0].Message.Reasoning).To(BeNil(),
+				"no reasoning block was emitted, so Reasoning must not be set")
+		})
+
+		It("keeps a tag-less JSON answer as content (the summary case)", func() {
+			raw := `{"short":"Tests pass","long":"go test ./... succeeded."}`
+			chatDeltas := []*pb.ChatDelta{
+				{Content: raw, ReasoningContent: ""},
+			}
+
+			result := applyAutoparserOverride(chatDeltas, startToken, reason.Config{}, nil)
+
+			Expect(result).To(HaveLen(1))
+			Expect(*(result[0].Message.Content.(*string))).To(Equal(raw))
+			Expect(result[0].Message.Reasoning).To(BeNil())
+		})
+
+		It("still splits reasoning when the model emits the closing tag (prefill paired with </think>)", func() {
+			// The legitimate prefill case: <think> was in the prompt, so the
+			// output carries only the closing tag. The closing tag is the proof
+			// that a reasoning block exists, so extraction must run.
+			raw := "The user wants a greeting.\n</think>\n\nHello there!"
+			chatDeltas := []*pb.ChatDelta{
+				{Content: raw, ReasoningContent: ""},
+			}
+
+			result := applyAutoparserOverride(chatDeltas, startToken, reason.Config{}, nil)
+
+			Expect(result).To(HaveLen(1))
+			content := *(result[0].Message.Content.(*string))
+			Expect(content).To(ContainSubstring("Hello there!"))
+			Expect(content).ToNot(ContainSubstring("</think>"))
+			Expect(content).ToNot(ContainSubstring("The user wants a greeting"))
+			Expect(result[0].Message.Reasoning).ToNot(BeNil())
+			Expect(*result[0].Message.Reasoning).To(ContainSubstring("The user wants a greeting"))
+		})
+
+		It("still splits a fully-tagged <think>…</think> block with a prefill token set", func() {
+			raw := "<think>Reasoning here.</think>Final answer."
+			chatDeltas := []*pb.ChatDelta{
+				{Content: raw, ReasoningContent: ""},
+			}
+
+			result := applyAutoparserOverride(chatDeltas, startToken, reason.Config{}, nil)
+
+			Expect(result).To(HaveLen(1))
+			Expect(*(result[0].Message.Content.(*string))).To(Equal("Final answer."))
+			Expect(result[0].Message.Reasoning).ToNot(BeNil())
+			Expect(*result[0].Message.Reasoning).To(ContainSubstring("Reasoning here"))
+		})
+
+		// End-to-end regression for the real production failure: a request with
+		// enable_thinking=false against a <think>-capable model (qwen3 family).
+		//
+		// In non-thinking mode the model emits no reasoning block, so llama.cpp's
+		// autoparser correctly returns ChatDeltas with Content set and
+		// ReasoningContent EMPTY (verified against stock llama-server: the same
+		// model with chat_template_kwargs.enable_thinking=false returns
+		// reasoning_content=null and content="hello"). But thinkingStartToken is
+		// detected per-model from the enable_thinking=TRUE render
+		// (grpc-server renders with enable_thinking=true; DetectThinkingStartToken
+		// does not evaluate the jinja {% if enable_thinking %} conditional), so it
+		// is "<think>" even for this non-thinking request. The old code prepended
+		// it and swallowed the answer. This is the case that broke session
+		// summaries and auto-titles and was NOT covered before.
+		It("preserves content for a non-thinking-mode request (enable_thinking=false, empty reasoning_content)", func() {
+			// What llama.cpp's autoparser actually returns in non-thinking mode.
+			chatDeltas := []*pb.ChatDelta{
+				{Content: `{"short":"Go tests passed for internal/session"}`, ReasoningContent: ""},
+			}
+
+			result := applyAutoparserOverride(chatDeltas, startToken, reason.Config{}, nil)
+
+			Expect(result).To(HaveLen(1))
+			Expect(*(result[0].Message.Content.(*string))).To(Equal(`{"short":"Go tests passed for internal/session"}`),
+				"non-thinking-mode answers must reach the client intact, not be swallowed as reasoning")
+			Expect(result[0].Message.Reasoning).To(BeNil())
+		})
+	})
 })

 var _ = Describe("mergeToolCallDeltas", func() {
--- a/core/http/endpoints/openai/realtime.go
+++ b/core/http/endpoints/openai/realtime.go
@@ -1579,7 +1579,7 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
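-		// ExtractReasoningWithConfig is a no-op when no tag pair matches,
-		// so it's safe to apply unconditionally in the no-reasoning branch.
+		// ExtractReasoningComplete is a no-op when no tag pair matches (a prefilled start token
+		// counts only if its close tag is present), so it's safe to apply unconditionally in the no-reasoning branch.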
 		if deltaReasoning == "" && deltaContent != "" {
-			deltaReasoning, deltaContent = reasoning.ExtractReasoningWithConfig(deltaContent, thinkingStartToken, config.ReasoningConfig)
+			deltaReasoning, deltaContent = reasoning.ExtractReasoningComplete(deltaContent, thinkingStartToken, config.ReasoningConfig)
 		}
 		reasoningText = deltaReasoning
 		responseWithoutReasoning = deltaContent
@@ -1587,7 +1587,7 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
 		cleanedResponse = deltaContent
 		toolCalls = deltaToolCalls
 	} else {
-		reasoningText, responseWithoutReasoning = reasoning.ExtractReasoningWithConfig(rawResponse, thinkingStartToken, config.ReasoningConfig)
+		reasoningText, responseWithoutReasoning = reasoning.ExtractReasoningComplete(rawResponse, thinkingStartToken, config.ReasoningConfig)
 		textContent = functions.ParseTextContent(responseWithoutReasoning, config.FunctionsConfig)
 		cleanedResponse = functions.CleanupLLMResult(responseWithoutReasoning, config.FunctionsConfig)
 		toolCalls = functions.ParseFunctionCall(cleanedResponse, config.FunctionsConfig)
--- a/core/http/endpoints/openresponses/responses.go
+++ b/core/http/endpoints/openresponses/responses.go
@@ -1356,7 +1356,7 @@ func handleOpenResponsesNonStream(c echo.Context, responseID string, createdAt i
 	thinkingStartToken := reason.DetectThinkingStartToken(template, &cfg.ReasoningConfig)

 	// Extract reasoning from result before cleaning
-	reasoningContent, cleanedResult := reason.ExtractReasoningWithConfig(result, thinkingStartToken, cfg.ReasoningConfig)
+	reasoningContent, cleanedResult := reason.ExtractReasoningComplete(result, thinkingStartToken, cfg.ReasoningConfig)

 	// Parse tool calls if using functions
 	var outputItems []schema.ORItemField
@@ -1996,7 +1996,7 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
 				finalCleanedResult = extractor.CleanedContent()
 			}
 			if finalReasoning == "" && finalCleanedResult == "" {
-				finalReasoning, finalCleanedResult = reason.ExtractReasoningWithConfig(result, thinkingStartToken, cfg.ReasoningConfig)
+				finalReasoning, finalCleanedResult = reason.ExtractReasoningComplete(result, thinkingStartToken, cfg.ReasoningConfig)
 			}

 			// Close reasoning item if it exists and wasn't closed yet
@@ -2493,7 +2493,7 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
 		finalCleanedResult = extractor.CleanedContent()
 	}
 	if finalReasoning == "" && finalCleanedResult == "" {
-		finalReasoning, finalCleanedResult = reason.ExtractReasoningWithConfig(result, thinkingStartToken, cfg.ReasoningConfig)
+		finalReasoning, finalCleanedResult = reason.ExtractReasoningComplete(result, thinkingStartToken, cfg.ReasoningConfig)
 	}

 	// Close reasoning item if it exists and wasn't closed yet
--- a/core/http/react-ui/src/hooks/useChat.js
+++ b/core/http/react-ui/src/hooks/useChat.js
@@ -216,6 +216,12 @@ export function useChat(initialModel = '') {
            audio_url: { url: `data:${file.type};base64,${file.base64}` },
          })
          userFiles.push({ name: file.name, type: 'audio' })
+        } else if (file.type?.startsWith('video/')) {
+          messageContent.push({
+            type: 'video_url',
+            video_url: { url: `data:${file.type};base64,${file.base64}` },
+          })
+          userFiles.push({ name: file.name, type: 'video' })
        } else {
 			// Text/PDF files - append to content
 			if (file.textContent) {
--- a/core/http/react-ui/src/pages/Chat.jsx
+++ b/core/http/react-ui/src/pages/Chat.jsx
@@ -265,7 +265,7 @@ function UserMessageContent({ content, files }) {
        <div className="chat-message-files">
          {files.map((f, i) => (
            <span key={i} className="chat-file-inline">
-              <i className={`fas ${f.type === 'image' ? 'fa-image' : f.type === 'audio' ? 'fa-headphones' : 'fa-file'}`} />
+              <i className={`fas ${f.type === 'image' ? 'fa-image' : f.type === 'audio' ? 'fa-headphones' : f.type === 'video' ? 'fa-film' : 'fa-file'}`} />
              {f.name}
            </span>
          ))}
@@ -274,6 +274,9 @@ function UserMessageContent({ content, files }) {
      {Array.isArray(content) && content.filter(c => c.type === 'image_url').map((img, i) => (
        <img key={i} src={img.image_url.url} alt="attached" className="chat-inline-image" />
      ))}
+      {Array.isArray(content) && content.filter(c => c.type === 'video_url').map((vid, i) => (
+        <video key={i} src={vid.video_url.url} controls className="chat-inline-video" />
+      ))}
    </>
  )
 }
@@ -711,7 +714,7 @@ export default function Chat() {
    for (const file of e.target.files) {
      const base64 = await fileToBase64(file)
      const entry = { name: file.name, type: file.type, base64 }
-      if (!file.type.startsWith('image/') && !file.type.startsWith('audio/')) {
+      if (!file.type.startsWith('image/') && !file.type.startsWith('audio/') && !file.type.startsWith('video/')) {
        entry.textContent = await file.text().catch(() => '')
      }
      newFiles.push(entry)
@@ -1244,7 +1247,7 @@ export default function Chat() {
          <div className="chat-files">
            {files.map((f, i) => (
              <span key={i} className="chat-file-badge">
-                <i className={`fas ${f.type?.startsWith('image/') ? 'fa-image' : f.type?.startsWith('audio/') ? 'fa-headphones' : 'fa-file'}`} />
+                <i className={`fas ${f.type?.startsWith('image/') ? 'fa-image' : f.type?.startsWith('audio/') ? 'fa-headphones' : f.type?.startsWith('video/') ? 'fa-film' : 'fa-file'}`} />
                {f.name}
                <button onClick={() => setFiles(prev => prev.filter((_, idx) => idx !== i))}>
                  <i className="fas fa-xmark" />
@@ -1343,7 +1346,7 @@ export default function Chat() {
              ref={fileInputRef}
              type="file"
              multiple
-              accept="image/*,audio/*,application/pdf,.txt,.md,.csv,.json"
+              accept="image/*,audio/*,video/*,application/pdf,.txt,.md,.csv,.json"
              style={{ display: 'none' }}
              onChange={handleFileChange}
            />
--- a/pkg/reasoning/reasoning.go
+++ b/pkg/reasoning/reasoning.go
@@ -89,6 +89,35 @@ func ExtractReasoningWithConfig(content, thinkingStartToken string, config Confi
 	return reasoning, cleanedContent
 }

+// ExtractReasoningComplete extracts reasoning from a COMPLETE (non-streaming)
+// model response. It behaves like ExtractReasoningWithConfig except that it only
+// honors a prefilled thinking start token when the response actually contains
+// the matching closing tag.
+//
+// Rationale: when a chat template injects the start token into the prompt (so
+// DetectThinkingStartToken returns e.g. "<think>"), the model's output begins
+// inside a reasoning block and carries only the closing tag. The defensive
+// fallback prepends the start token so the extractor can pair it with that
+// close tag. But on a COMPLETE response with no closing tag, the model answered
+// directly with no reasoning at all — prepending the start token would
+// manufacture an unclosed block that swallows the entire answer into reasoning,
+// leaving content empty (breaking short/direct answers such as session names or
+// JSON summaries). Genuine reasoning tags already present in the content still
+// extract, because dropping the synthetic prefill does not affect them.
+//
+// Streaming callers must keep using ExtractReasoningWithConfig: mid-stream an
+// as-yet-unclosed block is legitimate and its tokens should surface as
+// reasoning deltas as they arrive.
+func ExtractReasoningComplete(content, thinkingStartToken string, config Config) (reasoning string, cleanedContent string) {
+	startToken := thinkingStartToken
+	if startToken != "" {
+		if end := ClosingTokenForStart(startToken, &config); end == "" || !strings.Contains(content, end) {
+			startToken = ""
+		}
+	}
+	return ExtractReasoningWithConfig(content, startToken, config)
+}
+
 // PrependThinkingTokenIfNeeded prepends the thinking start token to content if it was
 // detected in the prompt. This allows the standard extraction logic to work correctly
 // for models where the thinking token is already in the prompt.
@@ -131,6 +160,48 @@ func PrependThinkingTokenIfNeeded(content string, startToken string) string {
 	return startToken + content
 }

+// defaultReasoningTagPairs are the built-in start/end reasoning tag pairs,
+// matching llama.cpp's chat-parser.cpp. Kept at package scope so that
+// ExtractReasoning and ClosingTokenForStart share a single source of truth.
+var defaultReasoningTagPairs = []TagPair{
+	{Start: "<|START_THINKING|>", End: "<|END_THINKING|>"},            // Command-R models
+	{Start: "<|inner_prefix|>", End: "<|inner_suffix|>"},              // Apertus models
+	{Start: "<seed:think>", End: "</seed:think>"},                     // Seed models
+	{Start: "<think>", End: "</think>"},                               // DeepSeek, Granite, ExaOne models
+	{Start: "<|think|>", End: "<|end|><|begin|>assistant<|content|>"}, // Solar Open models (complex end)
+	{Start: "<|channel>thought", End: "<channel|>"},                   // Gemma 4 models
+	{Start: "<thinking>", End: "</thinking>"},                         // General thinking tag
+	{Start: "[THINK]", End: "[/THINK]"},                               // Magistral models
+}
+
+// ClosingTokenForStart returns the closing reasoning tag that pairs with the
+// given start token, searching custom config TagPairs first then the built-in
+// defaults. Returns "" when startToken is empty or unrecognized.
+//
+// Used by ExtractReasoningComplete to decide whether a complete response
+// that began with a prefilled thinking token actually closed its reasoning
+// block: only then is synthesizing the start token (so the standard extractor
+// can pair it with the model's close tag) safe. A complete response with no
+// closing tag is treated as a direct answer, not unclosed reasoning.
+func ClosingTokenForStart(startToken string, config *Config) string {
+	if startToken == "" {
+		return ""
+	}
+	if config != nil {
+		for _, pair := range config.TagPairs {
+			if pair.Start == startToken {
+				return pair.End
+			}
+		}
+	}
+	for _, pair := range defaultReasoningTagPairs {
+		if pair.Start == startToken {
+			return pair.End
+		}
+	}
+	return ""
+}
+
 // ExtractReasoning extracts reasoning content from thinking tags and returns
 // both the extracted reasoning and the cleaned content (with tags removed).
 // It handles <thinking>...</thinking> and <think>...</think> tags.
@@ -145,22 +216,7 @@ func ExtractReasoning(content string, config *Config) (reasoning string, cleaned
 	var cleanedParts []string
 	remaining := content

-	// Define default tag pairs to look for (matching llama.cpp's chat-parser.cpp)
-	defaultTagPairs := []struct {
-		start string
-		end   string
-	}{
-		{"<|START_THINKING|>", "<|END_THINKING|>"},            // Command-R models
-		{"<|inner_prefix|>", "<|inner_suffix|>"},              // Apertus models
-		{"<seed:think>", "</seed:think>"},                     // Seed models
-		{"<think>", "</think>"},                               // DeepSeek, Granite, ExaOne models
-		{"<|think|>", "<|end|><|begin|>assistant<|content|>"}, // Solar Open models (complex end)
-		{"<|channel>thought", "<channel|>"},                    // Gemma 4 models
-		{"<thinking>", "</thinking>"},                         // General thinking tag
-		{"[THINK]", "[/THINK]"},                               // Magistral models
-	}
-
-	// Merge custom tag pairs with default tag pairs (custom pairs first for priority)
+	// Merge custom tag pairs (highest priority) with the built-in defaults.
 	var tagPairs []struct {
 		start string
 		end   string
@@ -175,9 +231,11 @@ func ExtractReasoning(content string, config *Config) (reasoning string, cleaned
 			}
 		}
 	}
-	// Add default tag pairs
-	for _, pair := range defaultTagPairs {
-		tagPairs = append(tagPairs, pair)
+	for _, pair := range defaultReasoningTagPairs {
+		tagPairs = append(tagPairs, struct {
+			start string
+			end   string
+		}{pair.Start, pair.End})
 	}

 	// Track the last position we've processed
--- a/pkg/reasoning/reasoning_test.go
+++ b/pkg/reasoning/reasoning_test.go
@@ -1175,6 +1175,55 @@ var _ = Describe("Custom Tokens and Tag Pairs Integration", func() {
 	})
 })

+var _ = Describe("ClosingTokenForStart", func() {
+	It("returns the default closing tag for a known start token", func() {
+		Expect(ClosingTokenForStart("<think>", nil)).To(Equal("</think>"))
+		Expect(ClosingTokenForStart("<thinking>", nil)).To(Equal("</thinking>"))
+		Expect(ClosingTokenForStart("[THINK]", nil)).To(Equal("[/THINK]"))
+	})
+
+	It("returns empty for an empty or unknown start token", func() {
+		Expect(ClosingTokenForStart("", nil)).To(BeEmpty())
+		Expect(ClosingTokenForStart("<nope>", nil)).To(BeEmpty())
+	})
+
+	It("prefers custom config tag pairs over the defaults", func() {
+		cfg := &Config{TagPairs: []TagPair{{Start: "<think>", End: "<<END>>"}}}
+		Expect(ClosingTokenForStart("<think>", cfg)).To(Equal("<<END>>"))
+	})
+})
+
+var _ = Describe("ExtractReasoningComplete", func() {
+	const startToken = "<think>"
+
+	It("keeps a tag-less answer as content when a start token is prefilled but no close tag is present", func() {
+		// The bug guard: prompt-prefilled <think>, model answered directly with
+		// no reasoning. The synthetic prefill must not swallow it as reasoning.
+		reasoning, content := ExtractReasoningComplete("hello", startToken, Config{})
+		Expect(reasoning).To(BeEmpty())
+		Expect(content).To(Equal("hello"))
+	})
+
+	It("extracts reasoning when the model emits only the closing tag (legitimate prefill)", func() {
+		reasoning, content := ExtractReasoningComplete("the rationale\n</think>\n\nthe answer", startToken, Config{})
+		Expect(reasoning).To(ContainSubstring("the rationale"))
+		Expect(content).To(ContainSubstring("the answer"))
+		Expect(content).ToNot(ContainSubstring("</think>"))
+	})
+
+	It("extracts a fully-tagged block regardless of the prefill token", func() {
+		reasoning, content := ExtractReasoningComplete("<think>r</think>answer", startToken, Config{})
+		Expect(reasoning).To(Equal("r"))
+		Expect(content).To(Equal("answer"))
+	})
+
+	It("behaves like ExtractReasoningWithConfig when no start token is prefilled", func() {
+		reasoning, content := ExtractReasoningComplete("<think>r</think>answer", "", Config{})
+		Expect(reasoning).To(Equal("r"))
+		Expect(content).To(Equal("answer"))
+	})
+})
+
 // Helper function to create bool pointers for test configs
 func boolPtr(b bool) *bool {
 	return &b