diff --git a/core/http/endpoints/openai/chat.go b/core/http/endpoints/openai/chat.go index 48e86d42e..5b9b5ed13 100644 --- a/core/http/endpoints/openai/chat.go +++ b/core/http/endpoints/openai/chat.go @@ -103,7 +103,12 @@ func applyAutoparserOverride( // blocks like "<think></think>" that some models emit when reasoning // is disabled. if deltaReasoning == "" && deltaContent != "" { - deltaReasoning, deltaContent = reason.ExtractReasoningWithConfig(deltaContent, thinkingStartToken, reasoningConfig) + // Complete-response extraction: only honor a prefilled start + // token when deltaContent actually closes the reasoning block. Without + // it the model answered directly and the whole answer must stay in + // content rather than be swallowed as unclosed reasoning. See + // reason.ExtractReasoningComplete. + deltaReasoning, deltaContent = reason.ExtractReasoningComplete(deltaContent, thinkingStartToken, reasoningConfig) } xlog.Debug("[ChatDeltas] non-SSE no-tools: overriding result with C++ autoparser deltas", "content_len", len(deltaContent), "reasoning_len", len(deltaReasoning)) diff --git a/core/http/endpoints/openai/chat_test.go b/core/http/endpoints/openai/chat_test.go index ccdbe6850..f5aa35690 100644 --- a/core/http/endpoints/openai/chat_test.go +++ b/core/http/endpoints/openai/chat_test.go @@ -186,6 +186,86 @@ var _ = Describe("applyAutoparserOverride", func() { Expect(result).To(Equal(existing)) }) }) + + // Regression tests for the prefilled-thinking-token path (thinkingStartToken + // != ""). This is the configuration the gallery qwen3 family runs in: the + // chat template injects <think> into the prompt, so DetectThinkingStartToken + // returns "<think>" and the model's output begins *inside* a reasoning block + // — it emits a closing </think> but no opening tag. + // + // The defensive Go-side fallback prepends the start token so the standard + // extractor can pair it with the model's </think>. 
But on a *complete* + // response that contains NO closing tag (the model answered directly with no + // reasoning at all), prepending manufactures an unclosed block that + // swallows the entire answer into reasoning, leaving content empty. That is + // the bug: short/direct answers (session names, JSON summaries) come back + // with an empty content field. + Context("autoparser delivered content with empty reasoning and a prefilled thinking token", func() { + const startToken = "" + + It("keeps a tag-less direct answer as content instead of swallowing it as reasoning", func() { + // Model answered directly: no , no anywhere. + chatDeltas := []*pb.ChatDelta{ + {Content: "hello", ReasoningContent: ""}, + } + + result := applyAutoparserOverride(chatDeltas, startToken, reason.Config{}, nil) + + Expect(result).To(HaveLen(1)) + Expect(result[0].Message.Content).ToNot(BeNil()) + Expect(*(result[0].Message.Content.(*string))).To(Equal("hello"), + "a complete answer with no closing reasoning tag must stay in content") + Expect(result[0].Message.Reasoning).To(BeNil(), + "no reasoning block was emitted, so Reasoning must not be set") + }) + + It("keeps a tag-less JSON answer as content (the summary case)", func() { + raw := `{"short":"Tests pass","long":"go test ./... succeeded."}` + chatDeltas := []*pb.ChatDelta{ + {Content: raw, ReasoningContent: ""}, + } + + result := applyAutoparserOverride(chatDeltas, startToken, reason.Config{}, nil) + + Expect(result).To(HaveLen(1)) + Expect(*(result[0].Message.Content.(*string))).To(Equal(raw)) + Expect(result[0].Message.Reasoning).To(BeNil()) + }) + + It("still splits reasoning when the model emits the closing tag (prefill paired with )", func() { + // The legitimate prefill case: was in the prompt, so the + // output carries only the closing tag. The closing tag is the proof + // that a reasoning block exists, so extraction must run. + raw := "The user wants a greeting.\n\n\nHello there!" 
+ chatDeltas := []*pb.ChatDelta{ + {Content: raw, ReasoningContent: ""}, + } + + result := applyAutoparserOverride(chatDeltas, startToken, reason.Config{}, nil) + + Expect(result).To(HaveLen(1)) + content := *(result[0].Message.Content.(*string)) + Expect(content).To(ContainSubstring("Hello there!")) + Expect(content).ToNot(ContainSubstring("")) + Expect(content).ToNot(ContainSubstring("The user wants a greeting")) + Expect(result[0].Message.Reasoning).ToNot(BeNil()) + Expect(*result[0].Message.Reasoning).To(ContainSubstring("The user wants a greeting")) + }) + + It("still splits a fully-tagged block with a prefill token set", func() { + raw := "Reasoning here.Final answer." + chatDeltas := []*pb.ChatDelta{ + {Content: raw, ReasoningContent: ""}, + } + + result := applyAutoparserOverride(chatDeltas, startToken, reason.Config{}, nil) + + Expect(result).To(HaveLen(1)) + Expect(*(result[0].Message.Content.(*string))).To(Equal("Final answer.")) + Expect(result[0].Message.Reasoning).ToNot(BeNil()) + Expect(*result[0].Message.Reasoning).To(ContainSubstring("Reasoning here")) + }) + }) }) var _ = Describe("mergeToolCallDeltas", func() { diff --git a/core/http/endpoints/openai/realtime.go b/core/http/endpoints/openai/realtime.go index 0d638a909..9bd40679c 100644 --- a/core/http/endpoints/openai/realtime.go +++ b/core/http/endpoints/openai/realtime.go @@ -1579,7 +1579,7 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa // ExtractReasoningWithConfig is a no-op when no tag pair matches, // so it's safe to apply unconditionally in the no-reasoning branch. 
if deltaReasoning == "" && deltaContent != "" { - deltaReasoning, deltaContent = reasoning.ExtractReasoningWithConfig(deltaContent, thinkingStartToken, config.ReasoningConfig) + deltaReasoning, deltaContent = reasoning.ExtractReasoningComplete(deltaContent, thinkingStartToken, config.ReasoningConfig) } reasoningText = deltaReasoning responseWithoutReasoning = deltaContent @@ -1587,7 +1587,7 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa cleanedResponse = deltaContent toolCalls = deltaToolCalls } else { - reasoningText, responseWithoutReasoning = reasoning.ExtractReasoningWithConfig(rawResponse, thinkingStartToken, config.ReasoningConfig) + reasoningText, responseWithoutReasoning = reasoning.ExtractReasoningComplete(rawResponse, thinkingStartToken, config.ReasoningConfig) textContent = functions.ParseTextContent(responseWithoutReasoning, config.FunctionsConfig) cleanedResponse = functions.CleanupLLMResult(responseWithoutReasoning, config.FunctionsConfig) toolCalls = functions.ParseFunctionCall(cleanedResponse, config.FunctionsConfig) diff --git a/core/http/endpoints/openresponses/responses.go b/core/http/endpoints/openresponses/responses.go index 2b986cc61..916380d01 100644 --- a/core/http/endpoints/openresponses/responses.go +++ b/core/http/endpoints/openresponses/responses.go @@ -1356,7 +1356,7 @@ func handleOpenResponsesNonStream(c echo.Context, responseID string, createdAt i thinkingStartToken := reason.DetectThinkingStartToken(template, &cfg.ReasoningConfig) // Extract reasoning from result before cleaning - reasoningContent, cleanedResult := reason.ExtractReasoningWithConfig(result, thinkingStartToken, cfg.ReasoningConfig) + reasoningContent, cleanedResult := reason.ExtractReasoningComplete(result, thinkingStartToken, cfg.ReasoningConfig) // Parse tool calls if using functions var outputItems []schema.ORItemField @@ -1996,7 +1996,7 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6 
finalCleanedResult = extractor.CleanedContent() } if finalReasoning == "" && finalCleanedResult == "" { - finalReasoning, finalCleanedResult = reason.ExtractReasoningWithConfig(result, thinkingStartToken, cfg.ReasoningConfig) + finalReasoning, finalCleanedResult = reason.ExtractReasoningComplete(result, thinkingStartToken, cfg.ReasoningConfig) } // Close reasoning item if it exists and wasn't closed yet @@ -2493,7 +2493,7 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6 finalCleanedResult = extractor.CleanedContent() } if finalReasoning == "" && finalCleanedResult == "" { - finalReasoning, finalCleanedResult = reason.ExtractReasoningWithConfig(result, thinkingStartToken, cfg.ReasoningConfig) + finalReasoning, finalCleanedResult = reason.ExtractReasoningComplete(result, thinkingStartToken, cfg.ReasoningConfig) } // Close reasoning item if it exists and wasn't closed yet diff --git a/pkg/reasoning/reasoning.go b/pkg/reasoning/reasoning.go index e9920af5d..108276c3d 100644 --- a/pkg/reasoning/reasoning.go +++ b/pkg/reasoning/reasoning.go @@ -89,6 +89,35 @@ func ExtractReasoningWithConfig(content, thinkingStartToken string, config Confi return reasoning, cleanedContent } +// ExtractReasoningComplete extracts reasoning from a COMPLETE (non-streaming) +// model response. It behaves like ExtractReasoningWithConfig except that it only +// honors a prefilled thinking start token when the response actually contains +// the matching closing tag. +// +// Rationale: when a chat template injects the start token into the prompt (so +// DetectThinkingStartToken returns e.g. "<think>"), the model's output begins +// inside a reasoning block and carries only the closing tag. The defensive +// fallback prepends the start token so the extractor can pair it with that +// close tag. 
But on a COMPLETE response with no closing tag, the model answered +// directly with no reasoning at all — prepending the start token would +// manufacture an unclosed block that swallows the entire answer into reasoning, +// leaving content empty (breaking short/direct answers such as session names or +// JSON summaries). Genuine reasoning tags already present in the content still +// extract, because dropping the synthetic prefill does not affect them. +// +// Streaming callers must keep using ExtractReasoningWithConfig: mid-stream an +// as-yet-unclosed block is legitimate and its tokens should surface as +// reasoning deltas as they arrive. +func ExtractReasoningComplete(content, thinkingStartToken string, config Config) (reasoning string, cleanedContent string) { + startToken := thinkingStartToken + if startToken != "" { + if end := ClosingTokenForStart(startToken, &config); end == "" || !strings.Contains(content, end) { + startToken = "" + } + } + return ExtractReasoningWithConfig(content, startToken, config) +} + // PrependThinkingTokenIfNeeded prepends the thinking start token to content if it was // detected in the prompt. This allows the standard extraction logic to work correctly // for models where the thinking token is already in the prompt. @@ -131,6 +160,48 @@ func PrependThinkingTokenIfNeeded(content string, startToken string) string { return startToken + content } +// defaultReasoningTagPairs are the built-in start/end reasoning tag pairs, +// matching llama.cpp's chat-parser.cpp. Kept at package scope so that +// ExtractReasoning and ClosingTokenForStart share a single source of truth. 
+var defaultReasoningTagPairs = []TagPair{ + {Start: "<|START_THINKING|>", End: "<|END_THINKING|>"}, // Command-R models + {Start: "<|inner_prefix|>", End: "<|inner_suffix|>"}, // Apertus models + {Start: "", End: ""}, // Seed models + {Start: "", End: ""}, // DeepSeek, Granite, ExaOne models + {Start: "<|think|>", End: "<|end|><|begin|>assistant<|content|>"}, // Solar Open models (complex end) + {Start: "<|channel>thought", End: ""}, // Gemma 4 models + {Start: "", End: ""}, // General thinking tag + {Start: "[THINK]", End: "[/THINK]"}, // Magistral models +} + +// ClosingTokenForStart returns the closing reasoning tag that pairs with the +// given start token, searching custom config TagPairs first then the built-in +// defaults. Returns "" when startToken is empty or unrecognized. +// +// Used by the non-streaming autoparser fallback to decide whether a complete +// response that began with a prefilled thinking token actually closed its +// reasoning block: only then is synthesizing the start token (so the standard +// extractor can pair it with the model's close tag) safe. A complete response +// with no closing tag is a direct answer, not unclosed reasoning. +func ClosingTokenForStart(startToken string, config *Config) string { + if startToken == "" { + return "" + } + if config != nil { + for _, pair := range config.TagPairs { + if pair.Start == startToken { + return pair.End + } + } + } + for _, pair := range defaultReasoningTagPairs { + if pair.Start == startToken { + return pair.End + } + } + return "" +} + // ExtractReasoning extracts reasoning content from thinking tags and returns // both the extracted reasoning and the cleaned content (with tags removed). // It handles ... and ... tags. 
@@ -145,22 +216,7 @@ func ExtractReasoning(content string, config *Config) (reasoning string, cleaned var cleanedParts []string remaining := content - // Define default tag pairs to look for (matching llama.cpp's chat-parser.cpp) - defaultTagPairs := []struct { - start string - end string - }{ - {"<|START_THINKING|>", "<|END_THINKING|>"}, // Command-R models - {"<|inner_prefix|>", "<|inner_suffix|>"}, // Apertus models - {"", ""}, // Seed models - {"", ""}, // DeepSeek, Granite, ExaOne models - {"<|think|>", "<|end|><|begin|>assistant<|content|>"}, // Solar Open models (complex end) - {"<|channel>thought", ""}, // Gemma 4 models - {"", ""}, // General thinking tag - {"[THINK]", "[/THINK]"}, // Magistral models - } - - // Merge custom tag pairs with default tag pairs (custom pairs first for priority) + // Merge custom tag pairs (highest priority) with the built-in defaults. var tagPairs []struct { start string end string @@ -175,9 +231,11 @@ func ExtractReasoning(content string, config *Config) (reasoning string, cleaned } } } - // Add default tag pairs - for _, pair := range defaultTagPairs { - tagPairs = append(tagPairs, pair) + for _, pair := range defaultReasoningTagPairs { + tagPairs = append(tagPairs, struct { + start string + end string + }{pair.Start, pair.End}) } // Track the last position we've processed diff --git a/pkg/reasoning/reasoning_test.go b/pkg/reasoning/reasoning_test.go index 9f3675ff6..5e6151b01 100644 --- a/pkg/reasoning/reasoning_test.go +++ b/pkg/reasoning/reasoning_test.go @@ -1175,6 +1175,55 @@ var _ = Describe("Custom Tokens and Tag Pairs Integration", func() { }) }) +var _ = Describe("ClosingTokenForStart", func() { + It("returns the default closing tag for a known start token", func() { + Expect(ClosingTokenForStart("", nil)).To(Equal("")) + Expect(ClosingTokenForStart("", nil)).To(Equal("")) + Expect(ClosingTokenForStart("[THINK]", nil)).To(Equal("[/THINK]")) + }) + + It("returns empty for an empty or unknown start token", func() { + 
Expect(ClosingTokenForStart("", nil)).To(BeEmpty()) + Expect(ClosingTokenForStart("", nil)).To(BeEmpty()) + }) + + It("prefers custom config tag pairs over the defaults", func() { + cfg := &Config{TagPairs: []TagPair{{Start: "", End: "<>"}}} + Expect(ClosingTokenForStart("", cfg)).To(Equal("<>")) + }) +}) + +var _ = Describe("ExtractReasoningComplete", func() { + const startToken = "" + + It("keeps a tag-less answer as content when a start token is prefilled but no close tag is present", func() { + // The bug guard: prompt-prefilled , model answered directly with + // no reasoning. The synthetic prefill must not swallow it as reasoning. + reasoning, content := ExtractReasoningComplete("hello", startToken, Config{}) + Expect(reasoning).To(BeEmpty()) + Expect(content).To(Equal("hello")) + }) + + It("extracts reasoning when the model emits only the closing tag (legitimate prefill)", func() { + reasoning, content := ExtractReasoningComplete("the rationale\n\n\nthe answer", startToken, Config{}) + Expect(reasoning).To(ContainSubstring("the rationale")) + Expect(content).To(ContainSubstring("the answer")) + Expect(content).ToNot(ContainSubstring("")) + }) + + It("extracts a fully-tagged block regardless of the prefill token", func() { + reasoning, content := ExtractReasoningComplete("ranswer", startToken, Config{}) + Expect(reasoning).To(Equal("r")) + Expect(content).To(Equal("answer")) + }) + + It("behaves like ExtractReasoningWithConfig when no start token is prefilled", func() { + reasoning, content := ExtractReasoningComplete("ranswer", "", Config{}) + Expect(reasoning).To(Equal("r")) + Expect(content).To(Equal("answer")) + }) +}) + // Helper function to create bool pointers for test configs func boolPtr(b bool) *bool { return &b