diff --git a/core/http/endpoints/openai/chat.go b/core/http/endpoints/openai/chat.go
index 48e86d42e..5b9b5ed13 100644
--- a/core/http/endpoints/openai/chat.go
+++ b/core/http/endpoints/openai/chat.go
@@ -103,7 +103,12 @@ func applyAutoparserOverride(
// blocks like "" that some models emit when reasoning
// is disabled.
if deltaReasoning == "" && deltaContent != "" {
- deltaReasoning, deltaContent = reason.ExtractReasoningWithConfig(deltaContent, thinkingStartToken, reasoningConfig)
+ // Complete-response extraction: only honor a prefilled start
+ // token when deltaContent actually closes the reasoning block. Without
+ // it the model answered directly and the whole answer must stay in
+ // content rather than be swallowed as unclosed reasoning. See
+ // reason.ExtractReasoningComplete.
+ deltaReasoning, deltaContent = reason.ExtractReasoningComplete(deltaContent, thinkingStartToken, reasoningConfig)
}
xlog.Debug("[ChatDeltas] non-SSE no-tools: overriding result with C++ autoparser deltas",
"content_len", len(deltaContent), "reasoning_len", len(deltaReasoning))
diff --git a/core/http/endpoints/openai/chat_test.go b/core/http/endpoints/openai/chat_test.go
index ccdbe6850..f5aa35690 100644
--- a/core/http/endpoints/openai/chat_test.go
+++ b/core/http/endpoints/openai/chat_test.go
@@ -186,6 +186,86 @@ var _ = Describe("applyAutoparserOverride", func() {
Expect(result).To(Equal(existing))
})
})
+
+ // Regression tests for the prefilled-thinking-token path (thinkingStartToken
+ // != ""). This is the configuration the gallery qwen3 family runs in: the
+ // chat template injects the thinking start tag into the prompt, so
+ // DetectThinkingStartToken returns that tag and the model's output begins
+ // *inside* a reasoning block — it emits a closing tag but no opening tag.
+ //
+ // The defensive Go-side fallback prepends the start token so the standard
+ // extractor can pair it with the model's closing tag. But on a *complete*
+ // response that contains NO closing tag (the model answered directly with no
+ // reasoning at all), prepending manufactures an unclosed reasoning block that
+ // swallows the entire answer into reasoning, leaving content empty. That is
+ // the bug: short/direct answers (session names, JSON summaries) come back
+ // with an empty content field.
+ Context("autoparser delivered content with empty reasoning and a prefilled thinking token", func() {
+ const startToken = "" // NOTE(review): empty as written, so the startToken != "" guard in ExtractReasoningComplete is skipped — the tag literal looks stripped; confirm
+
+ It("keeps a tag-less direct answer as content instead of swallowing it as reasoning", func() {
+ // Model answered directly: no opening tag, no closing tag anywhere.
+ chatDeltas := []*pb.ChatDelta{
+ {Content: "hello", ReasoningContent: ""},
+ }
+
+ result := applyAutoparserOverride(chatDeltas, startToken, reason.Config{}, nil)
+
+ Expect(result).To(HaveLen(1))
+ Expect(result[0].Message.Content).ToNot(BeNil())
+ Expect(*(result[0].Message.Content.(*string))).To(Equal("hello"),
+ "a complete answer with no closing reasoning tag must stay in content")
+ Expect(result[0].Message.Reasoning).To(BeNil(),
+ "no reasoning block was emitted, so Reasoning must not be set")
+ })
+
+ It("keeps a tag-less JSON answer as content (the summary case)", func() {
+ raw := `{"short":"Tests pass","long":"go test ./... succeeded."}`
+ chatDeltas := []*pb.ChatDelta{
+ {Content: raw, ReasoningContent: ""},
+ }
+
+ result := applyAutoparserOverride(chatDeltas, startToken, reason.Config{}, nil)
+
+ Expect(result).To(HaveLen(1))
+ Expect(*(result[0].Message.Content.(*string))).To(Equal(raw))
+ Expect(result[0].Message.Reasoning).To(BeNil())
+ })
+
+ It("still splits reasoning when the model emits the closing tag (prefill paired with )", func() {
+ // The legitimate prefill case: the start tag was in the prompt, so the
+ // output carries only the closing tag. The closing tag is the proof
+ // that a reasoning block exists, so extraction must run.
+ raw := "The user wants a greeting.\n\n\nHello there!"
+ chatDeltas := []*pb.ChatDelta{
+ {Content: raw, ReasoningContent: ""},
+ }
+
+ result := applyAutoparserOverride(chatDeltas, startToken, reason.Config{}, nil)
+
+ Expect(result).To(HaveLen(1))
+ content := *(result[0].Message.Content.(*string))
+ Expect(content).To(ContainSubstring("Hello there!"))
+ Expect(content).ToNot(ContainSubstring("")) // NOTE(review): an empty substring matches any string, so this cannot pass as written — closing-tag literal looks stripped; confirm
+ Expect(content).ToNot(ContainSubstring("The user wants a greeting"))
+ Expect(result[0].Message.Reasoning).ToNot(BeNil())
+ Expect(*result[0].Message.Reasoning).To(ContainSubstring("The user wants a greeting"))
+ })
+
+ It("still splits a fully-tagged … block with a prefill token set", func() {
+ raw := "Reasoning here.Final answer."
+ chatDeltas := []*pb.ChatDelta{
+ {Content: raw, ReasoningContent: ""},
+ }
+
+ result := applyAutoparserOverride(chatDeltas, startToken, reason.Config{}, nil)
+
+ Expect(result).To(HaveLen(1))
+ Expect(*(result[0].Message.Content.(*string))).To(Equal("Final answer."))
+ Expect(result[0].Message.Reasoning).ToNot(BeNil())
+ Expect(*result[0].Message.Reasoning).To(ContainSubstring("Reasoning here"))
+ })
+ })
})
var _ = Describe("mergeToolCallDeltas", func() {
diff --git a/core/http/endpoints/openai/realtime.go b/core/http/endpoints/openai/realtime.go
index 0d638a909..9bd40679c 100644
--- a/core/http/endpoints/openai/realtime.go
+++ b/core/http/endpoints/openai/realtime.go
@@ -1579,7 +1579,7 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
- // ExtractReasoningWithConfig is a no-op when no tag pair matches,
+ // ExtractReasoningComplete is a no-op when no tag pair matches,
// so it's safe to apply unconditionally in the no-reasoning branch.
if deltaReasoning == "" && deltaContent != "" {
- deltaReasoning, deltaContent = reasoning.ExtractReasoningWithConfig(deltaContent, thinkingStartToken, config.ReasoningConfig)
+ deltaReasoning, deltaContent = reasoning.ExtractReasoningComplete(deltaContent, thinkingStartToken, config.ReasoningConfig)
}
reasoningText = deltaReasoning
responseWithoutReasoning = deltaContent
@@ -1587,7 +1587,7 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
cleanedResponse = deltaContent
toolCalls = deltaToolCalls
} else {
- reasoningText, responseWithoutReasoning = reasoning.ExtractReasoningWithConfig(rawResponse, thinkingStartToken, config.ReasoningConfig)
+ reasoningText, responseWithoutReasoning = reasoning.ExtractReasoningComplete(rawResponse, thinkingStartToken, config.ReasoningConfig)
textContent = functions.ParseTextContent(responseWithoutReasoning, config.FunctionsConfig)
cleanedResponse = functions.CleanupLLMResult(responseWithoutReasoning, config.FunctionsConfig)
toolCalls = functions.ParseFunctionCall(cleanedResponse, config.FunctionsConfig)
diff --git a/core/http/endpoints/openresponses/responses.go b/core/http/endpoints/openresponses/responses.go
index 2b986cc61..916380d01 100644
--- a/core/http/endpoints/openresponses/responses.go
+++ b/core/http/endpoints/openresponses/responses.go
@@ -1356,7 +1356,7 @@ func handleOpenResponsesNonStream(c echo.Context, responseID string, createdAt i
thinkingStartToken := reason.DetectThinkingStartToken(template, &cfg.ReasoningConfig)
// Extract reasoning from result before cleaning
- reasoningContent, cleanedResult := reason.ExtractReasoningWithConfig(result, thinkingStartToken, cfg.ReasoningConfig)
+ reasoningContent, cleanedResult := reason.ExtractReasoningComplete(result, thinkingStartToken, cfg.ReasoningConfig)
// Parse tool calls if using functions
var outputItems []schema.ORItemField
@@ -1996,7 +1996,7 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
finalCleanedResult = extractor.CleanedContent()
}
if finalReasoning == "" && finalCleanedResult == "" {
- finalReasoning, finalCleanedResult = reason.ExtractReasoningWithConfig(result, thinkingStartToken, cfg.ReasoningConfig)
+ finalReasoning, finalCleanedResult = reason.ExtractReasoningComplete(result, thinkingStartToken, cfg.ReasoningConfig)
}
// Close reasoning item if it exists and wasn't closed yet
@@ -2493,7 +2493,7 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
finalCleanedResult = extractor.CleanedContent()
}
if finalReasoning == "" && finalCleanedResult == "" {
- finalReasoning, finalCleanedResult = reason.ExtractReasoningWithConfig(result, thinkingStartToken, cfg.ReasoningConfig)
+ finalReasoning, finalCleanedResult = reason.ExtractReasoningComplete(result, thinkingStartToken, cfg.ReasoningConfig)
}
// Close reasoning item if it exists and wasn't closed yet
diff --git a/pkg/reasoning/reasoning.go b/pkg/reasoning/reasoning.go
index e9920af5d..108276c3d 100644
--- a/pkg/reasoning/reasoning.go
+++ b/pkg/reasoning/reasoning.go
@@ -89,6 +89,35 @@ func ExtractReasoningWithConfig(content, thinkingStartToken string, config Confi
return reasoning, cleanedContent
}
+// ExtractReasoningComplete extracts reasoning from a COMPLETE (non-streaming)
+// model response. It delegates to ExtractReasoningWithConfig, but passes the
+// prefilled thinking start token through only when ClosingTokenForStart knows
+// its closing tag and content contains that tag; otherwise it passes "".
+//
+// Rationale: when a chat template injects the start token into the prompt (so
+// DetectThinkingStartToken returns e.g. "[THINK]"), the model's output begins
+// inside a reasoning block and carries only the closing tag. The defensive
+// fallback prepends the start token so the extractor can pair it with that
+// close tag. But on a COMPLETE response with no closing tag, the model answered
+// directly with no reasoning at all — prepending the start token would
+// manufacture an unclosed block that swallows the entire answer into reasoning,
+// leaving content empty (breaking short/direct answers such as session names or
+// JSON summaries). Genuine reasoning tags already present in the content still
+// extract, because dropping the synthetic prefill does not affect them.
+//
+// Streaming callers must keep using ExtractReasoningWithConfig: mid-stream an
+// as-yet-unclosed block is legitimate and its tokens should surface as
+// reasoning deltas as they arrive.
+func ExtractReasoningComplete(content, thinkingStartToken string, config Config) (reasoning string, cleanedContent string) {
+ startToken := thinkingStartToken
+ if startToken != "" {
+ if end := ClosingTokenForStart(startToken, &config); end == "" || !strings.Contains(content, end) {
+ startToken = "" // unknown start token, or no closing tag in content: drop the synthetic prefill
+ }
+ }
+ return ExtractReasoningWithConfig(content, startToken, config)
+}
+
// PrependThinkingTokenIfNeeded prepends the thinking start token to content if it was
// detected in the prompt. This allows the standard extraction logic to work correctly
// for models where the thinking token is already in the prompt.
@@ -131,6 +160,48 @@ func PrependThinkingTokenIfNeeded(content string, startToken string) string {
return startToken + content
}
+// defaultReasoningTagPairs holds the built-in start/end reasoning tag pairs (matching llama.cpp's
+// chat-parser.cpp), at package scope so ExtractReasoning and ClosingTokenForStart share one list.
+// NOTE(review): three pairs below have empty Start and End, and the Gemma 4 End is empty — tag text looks stripped; confirm.
+var defaultReasoningTagPairs = []TagPair{
+ {Start: "<|START_THINKING|>", End: "<|END_THINKING|>"}, // Command-R models
+ {Start: "<|inner_prefix|>", End: "<|inner_suffix|>"}, // Apertus models
+ {Start: "", End: ""}, // Seed models
+ {Start: "", End: ""}, // DeepSeek, Granite, ExaOne models
+ {Start: "<|think|>", End: "<|end|><|begin|>assistant<|content|>"}, // Solar Open models (complex end)
+ {Start: "<|channel>thought", End: ""}, // Gemma 4 models
+ {Start: "", End: ""}, // General thinking tag
+ {Start: "[THINK]", End: "[/THINK]"}, // Magistral models
+}
+
+// ClosingTokenForStart returns the closing reasoning tag paired with
+// startToken. It scans config.TagPairs first (when config is non-nil), then
+// defaultReasoningTagPairs, and returns the End of the first pair whose Start
+// equals startToken exactly. It returns "" when startToken is empty or matches
+// no pair.
+//
+// ExtractReasoningComplete uses it to decide whether a complete response that
+// began with a prefilled thinking token actually closed its reasoning block; a
+// complete response with no closing tag is a direct answer, not unclosed reasoning.
+func ClosingTokenForStart(startToken string, config *Config) string {
+ if startToken == "" {
+ return ""
+ }
+ if config != nil { // custom pairs are consulted before the built-in defaults
+ for _, pair := range config.TagPairs {
+ if pair.Start == startToken {
+ return pair.End
+ }
+ }
+ }
+ for _, pair := range defaultReasoningTagPairs {
+ if pair.Start == startToken {
+ return pair.End
+ }
+ }
+ return ""
+}
+
// ExtractReasoning extracts reasoning content from thinking tags and returns
// both the extracted reasoning and the cleaned content (with tags removed).
// It handles ... and ... tags.
@@ -145,22 +216,7 @@ func ExtractReasoning(content string, config *Config) (reasoning string, cleaned
var cleanedParts []string
remaining := content
- // Define default tag pairs to look for (matching llama.cpp's chat-parser.cpp)
- defaultTagPairs := []struct {
- start string
- end string
- }{
- {"<|START_THINKING|>", "<|END_THINKING|>"}, // Command-R models
- {"<|inner_prefix|>", "<|inner_suffix|>"}, // Apertus models
- {"", ""}, // Seed models
- {"", ""}, // DeepSeek, Granite, ExaOne models
- {"<|think|>", "<|end|><|begin|>assistant<|content|>"}, // Solar Open models (complex end)
- {"<|channel>thought", ""}, // Gemma 4 models
- {"", ""}, // General thinking tag
- {"[THINK]", "[/THINK]"}, // Magistral models
- }
-
- // Merge custom tag pairs with default tag pairs (custom pairs first for priority)
+ // Merge custom tag pairs (highest priority) with the built-in defaults.
var tagPairs []struct {
start string
end string
@@ -175,9 +231,11 @@ func ExtractReasoning(content string, config *Config) (reasoning string, cleaned
}
}
}
- // Add default tag pairs
- for _, pair := range defaultTagPairs {
- tagPairs = append(tagPairs, pair)
+ for _, pair := range defaultReasoningTagPairs {
+ tagPairs = append(tagPairs, struct {
+ start string
+ end string
+ }{pair.Start, pair.End})
}
// Track the last position we've processed
diff --git a/pkg/reasoning/reasoning_test.go b/pkg/reasoning/reasoning_test.go
index 9f3675ff6..5e6151b01 100644
--- a/pkg/reasoning/reasoning_test.go
+++ b/pkg/reasoning/reasoning_test.go
@@ -1175,6 +1175,55 @@ var _ = Describe("Custom Tokens and Tag Pairs Integration", func() {
})
})
+var _ = Describe("ClosingTokenForStart", func() {
+ It("returns the default closing tag for a known start token", func() {
+ Expect(ClosingTokenForStart("", nil)).To(Equal("")) // NOTE(review): start and end literals are empty here and on the next line — tag text looks stripped; confirm
+ Expect(ClosingTokenForStart("", nil)).To(Equal(""))
+ Expect(ClosingTokenForStart("[THINK]", nil)).To(Equal("[/THINK]"))
+ })
+
+ It("returns empty for an empty or unknown start token", func() {
+ Expect(ClosingTokenForStart("", nil)).To(BeEmpty())
+ Expect(ClosingTokenForStart("", nil)).To(BeEmpty()) // NOTE(review): identical to the line above; the "unknown token" literal looks lost — confirm
+ })
+
+ It("prefers custom config tag pairs over the defaults", func() {
+ cfg := &Config{TagPairs: []TagPair{{Start: "", End: "<>"}}}
+ Expect(ClosingTokenForStart("", cfg)).To(Equal("<>")) // NOTE(review): an empty start token returns "" before any lookup, so "<>" cannot match as written — confirm the literals
+ })
+})
+
+var _ = Describe("ExtractReasoningComplete", func() {
+ const startToken = "" // NOTE(review): empty as written, so the startToken != "" guard is never exercised — the tag literal looks stripped; confirm
+
+ It("keeps a tag-less answer as content when a start token is prefilled but no close tag is present", func() {
+ // The bug guard: prompt-prefilled start token, model answered directly with
+ // no reasoning. The synthetic prefill must not swallow it as reasoning.
+ reasoning, content := ExtractReasoningComplete("hello", startToken, Config{})
+ Expect(reasoning).To(BeEmpty())
+ Expect(content).To(Equal("hello"))
+ })
+
+ It("extracts reasoning when the model emits only the closing tag (legitimate prefill)", func() {
+ reasoning, content := ExtractReasoningComplete("the rationale\n\n\nthe answer", startToken, Config{})
+ Expect(reasoning).To(ContainSubstring("the rationale"))
+ Expect(content).To(ContainSubstring("the answer"))
+ Expect(content).ToNot(ContainSubstring("")) // NOTE(review): an empty substring matches any string, so this cannot pass as written — closing-tag literal looks stripped; confirm
+ })
+
+ It("extracts a fully-tagged block regardless of the prefill token", func() {
+ reasoning, content := ExtractReasoningComplete("ranswer", startToken, Config{})
+ Expect(reasoning).To(Equal("r"))
+ Expect(content).To(Equal("answer"))
+ })
+
+ It("behaves like ExtractReasoningWithConfig when no start token is prefilled", func() {
+ reasoning, content := ExtractReasoningComplete("ranswer", "", Config{})
+ Expect(reasoning).To(Equal("r"))
+ Expect(content).To(Equal("answer"))
+ })
+})
+
// Helper function to create bool pointers for test configs
func boolPtr(b bool) *bool {
return &b