From 184a42547400b1a823bd7aa6160f0cfc7242aecc Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto <mudler@localai.io>
Date: Tue, 9 Jun 2026 00:06:00 +0000
Subject: [PATCH] test(reasoning): cover the enable_thinking=false
 non-thinking-mode regression

Adds the end-to-end case that actually broke session summaries / auto-titles
and was not covered before: a request with enable_thinking=false against a
<think>-capable model. In non-thinking mode the model emits no reasoning block,
so llama.cpp's autoparser returns ChatDeltas with content set and
reasoning_content empty (verified against stock llama-server: same model with
chat_template_kwargs.enable_thinking=false returns reasoning_content=null,
content="hello"). thinkingStartToken is still "<think>" because it is detected
per-model from the enable_thinking=true render, so the old code prepended it and
swallowed the answer. The test fails without the ExtractReasoningComplete gate.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 core/http/endpoints/openai/chat_test.go | 28 +++++++++++++++++++++++++
 1 file changed, 28 insertions(+)
diff --git a/core/http/endpoints/openai/chat_test.go b/core/http/endpoints/openai/chat_test.go
index f5aa35690..ffb3086fb 100644
--- a/core/http/endpoints/openai/chat_test.go
+++ b/core/http/endpoints/openai/chat_test.go
@@ -265,6 +265,34 @@ var _ = Describe("applyAutoparserOverride", func() {
 			Expect(result[0].Message.Reasoning).ToNot(BeNil())
 			Expect(*result[0].Message.Reasoning).To(ContainSubstring("Reasoning here"))
 		})
+
+		// End-to-end regression for the real production failure: a request with
+		// enable_thinking=false against a <think>-capable model (qwen3 family).
+		//
+		// In non-thinking mode the model emits no reasoning block, so llama.cpp's
+		// autoparser correctly returns ChatDeltas with Content set and
+		// ReasoningContent EMPTY (verified against stock llama-server: the same
+		// model with chat_template_kwargs.enable_thinking=false returns
+		// reasoning_content=null and content="hello"). But thinkingStartToken is
+		// detected per-model from the enable_thinking=TRUE render
+		// (grpc-server renders with enable_thinking=true; DetectThinkingStartToken
+		// does not evaluate the jinja {% if enable_thinking %} conditional), so it
+		// is "<think>" even for this non-thinking request. The old code prepended
+		// it and swallowed the answer. This is the case that broke session
+		// summaries and auto-titles and was NOT covered before.
+		It("preserves content for a non-thinking-mode request (enable_thinking=false, empty reasoning_content)", func() {
+			// What llama.cpp's autoparser actually returns in non-thinking mode.
+			chatDeltas := []*pb.ChatDelta{
+				{Content: `{"short":"Go tests passed for internal/session"}`, ReasoningContent: ""},
+			}
+
+			result := applyAutoparserOverride(chatDeltas, startToken, reason.Config{}, nil)
+
+			Expect(result).To(HaveLen(1))
+			Expect(*(result[0].Message.Content.(*string))).To(Equal(`{"short":"Go tests passed for internal/session"}`),
+				"non-thinking-mode answers must reach the client intact, not be swallowed as reasoning")
+			Expect(result[0].Message.Reasoning).To(BeNil())
+		})
 	})
 })