Files
LocalAI/pkg/reasoning/reasoning.go
LocalAI [bot] e1ec03d33f fix(reasoning): stop prefilled <think> from swallowing tag-less answers (#10225)
* fix(reasoning): stop prefilled <think> from swallowing tag-less answers

When a chat template injects the thinking start token into the prompt (so
DetectThinkingStartToken returns e.g. "<think>"), the model's output begins
inside a reasoning block and carries only the closing tag. The non-jinja
autoparser fallback (peg-native "pure content" mode, issue #9985) prepends the
start token so the extractor can pair it with the model's </think>.

But on a COMPLETE response that contains no closing tag, the model answered
directly with no reasoning at all. Prepending the start token there manufactures
an unclosed block that swallows the entire answer into reasoning, leaving the
OpenAI `content` field empty. This breaks short/direct answers — session names,
JSON summaries, any terse completion where the model skips the think block —
which come back with empty content. Regression surfaced by #9991, which added
the defensive prefill extraction to the complete-response paths.

Add reasoning.ExtractReasoningComplete: it only honors a prefilled start token
when the response actually contains the matching closing tag (proof a reasoning
block exists). Genuine reasoning tags already in the content still extract;
tag-less content stays content. Apply it at every complete-response site
(applyAutoparserOverride, realtime, openresponses). The streaming per-token
extractor is intentionally left on ExtractReasoningWithConfig — mid-stream an
as-yet-unclosed block is legitimate and must surface as reasoning deltas.

Also adds reasoning.ClosingTokenForStart and hoists the default reasoning tag
pairs to package scope so both helpers share one source of truth.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

* test(reasoning): cover the enable_thinking=false non-thinking-mode regression

Adds the end-to-end case that actually broke session summaries / auto-titles
and was not covered before: a request with enable_thinking=false against a
<think>-capable model. In non-thinking mode the model emits no reasoning block,
so llama.cpp's autoparser returns ChatDeltas with content set and
reasoning_content empty (verified against stock llama-server: same model with
chat_template_kwargs.enable_thinking=false returns reasoning_content=null,
content="hello"). thinkingStartToken is still "<think>" because it is detected
per-model from the enable_thinking=true render, so the old code prepended it and
swallowed the answer. The test fails without the ExtractReasoningComplete gate.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-09 09:02:04 +02:00

327 lines
12 KiB
Go

package reasoning
import (
"strings"
)
// DetectThinkingStartToken reports which thinking start token, if any, occurs
// in prompt (a rendered prompt or chat template).
//
// A match indicates that the template already emits the opening thinking tag,
// so the model output is expected to start inside a reasoning block without an
// explicit opening tag of its own.
//
// Candidates are probed in order and the first one found anywhere in the prompt
// wins. Custom tokens from config.ThinkingStartTokens come first (config may be
// nil), followed by the built-in tokens, which are based on llama.cpp's
// chat-parser.cpp implementations:
//   - <|START_THINKING|> (Command-R models)
//   - <|channel>thought (Gemma 4 models)
//   - <|inner_prefix|> (Apertus models)
//   - <seed:think> (Seed models)
//   - <think> (DeepSeek, Granite, ExaOne models)
//   - <|think|> (Solar Open models)
//   - <thinking> (General thinking tag)
//   - [THINK] (Magistral models)
//
// Empty custom tokens are ignored: an empty string is contained in every
// prompt, so honoring it would return "" immediately and hide every token
// listed after it.
//
// Returns the detected token, or "" when none is present.
func DetectThinkingStartToken(prompt string, config *Config) string {
	// Built-in tokens. The order is significant because the first hit is
	// returned.
	defaultTokens := []string{
		"<|START_THINKING|>", // Command-R models
		"<|channel>thought",  // Gemma 4 models (before <|think|> — Gemma 4 templates contain both)
		"<|inner_prefix|>",   // Apertus models
		"<seed:think>",       // Seed models
		"<think>",            // DeepSeek, Granite, ExaOne models
		"<|think|>",          // Solar Open models
		"<thinking>",         // General thinking tag
		"[THINK]",            // Magistral models
	}

	// Custom tokens go first so that they take priority over the defaults.
	var thinkingStartTokens []string
	if config != nil && len(config.ThinkingStartTokens) > 0 {
		thinkingStartTokens = append(thinkingStartTokens, config.ThinkingStartTokens...)
	}
	thinkingStartTokens = append(thinkingStartTokens, defaultTokens...)

	// Trailing whitespace/newlines are stripped before matching; a token is
	// then accepted wherever it appears in the remaining text.
	trimmedPrompt := strings.TrimRight(prompt, " \t\n\r")
	for _, token := range thinkingStartTokens {
		if token == "" {
			continue
		}
		if strings.Contains(trimmedPrompt, token) {
			return token
		}
	}
	return ""
}
// ExtractReasoningWithConfig splits content into reasoning and the remaining
// answer text, driven by the switches in config. A nil switch counts as false.
//
//   - DisableReasoning: when true nothing is extracted; the reasoning is ""
//     and content is returned untouched.
//   - DisableReasoningTagPrefill: unless true, thinkingStartToken is inserted
//     through PrependThinkingTokenIfNeeded before extraction.
//   - StripReasoningOnly: when true the reasoning is still removed from the
//     content, but it is reported as "".
//
// It returns the extracted reasoning and the cleaned content.
func ExtractReasoningWithConfig(content, thinkingStartToken string, config Config) (reasoning string, cleanedContent string) {
	enabled := func(flag *bool) bool { return flag != nil && *flag }

	// Reasoning handling switched off entirely: hand the input straight back.
	if enabled(config.DisableReasoning) {
		return "", content
	}

	tagged := content
	if !enabled(config.DisableReasoningTagPrefill) {
		tagged = PrependThinkingTokenIfNeeded(tagged, thinkingStartToken)
	}

	extracted, remainder := ExtractReasoning(tagged, &config)
	if enabled(config.StripReasoningOnly) {
		// The caller wants the tags and their body gone but no reasoning back.
		extracted = ""
	}
	return extracted, remainder
}
// ExtractReasoningComplete extracts reasoning from a COMPLETE (non-streaming)
// model response. It defers to ExtractReasoningWithConfig, but passes the
// prefilled thinking start token on only when the response contains the closing
// tag that pairs with it; otherwise extraction runs with no prefill token.
//
// Why: a chat template may inject the start token into the prompt (so
// DetectThinkingStartToken returns e.g. "<think>"). The model's output then
// begins inside a reasoning block and carries just the closing tag, and
// prepending the start token lets the extractor pair the two. A complete
// response with no closing tag, however, is a direct answer with no reasoning
// at all. Prepending the start token there would fabricate an unclosed block
// that swallows the whole answer into reasoning and leaves content empty,
// which breaks short/direct answers such as session names or JSON summaries.
// Reasoning tags that are really present in the content are still extracted,
// because withholding the synthetic prefill does not touch them.
//
// Streaming callers must keep using ExtractReasoningWithConfig: mid-stream, a
// block that has not been closed yet is legitimate and its tokens should be
// surfaced as reasoning deltas as they arrive.
func ExtractReasoningComplete(content, thinkingStartToken string, config Config) (reasoning string, cleanedContent string) {
	// Start from "no prefill" and opt in only once the closing tag is proven
	// to be present. ClosingTokenForStart yields "" for an empty or unknown
	// start token, which keeps the prefill disabled.
	prefill := ""
	closing := ClosingTokenForStart(thinkingStartToken, &config)
	if closing != "" && strings.Contains(content, closing) {
		prefill = thinkingStartToken
	}
	return ExtractReasoningWithConfig(content, prefill, config)
}
// PrependThinkingTokenIfNeeded inserts startToken in front of content so that
// output from a model whose prompt already contained the thinking token looks
// like ordinary tagged content to the extraction logic.
//
// The token is placed after any leading whitespace (space, tab, newline,
// carriage return). content is returned unchanged when:
//   - startToken is empty;
//   - content already contains startToken;
//   - content, ignoring leading whitespace, is a non-empty prefix of startToken
//     (e.g. "<|channel>" accumulating toward "<|channel>thought"), meaning the
//     tag is still arriving token-by-token during streaming.
func PrependThinkingTokenIfNeeded(content string, startToken string) string {
	if startToken == "" {
		return content
	}

	body := strings.TrimLeft(content, " \t\n\r")
	switch {
	case strings.Contains(body, startToken):
		// Token already present somewhere in the content.
		return content
	case body != "" && strings.HasPrefix(startToken, body):
		// Partial tag still being streamed.
		return content
	}

	// Everything TrimLeft removed is the leading whitespace; keep it ahead of
	// the inserted token.
	indent := content[:len(content)-len(body)]
	return indent + startToken + body
}
// defaultReasoningTagPairs are the built-in start/end reasoning tag pairs,
// matching llama.cpp's chat-parser.cpp. Each entry maps the tag that opens a
// reasoning block (Start) to the tag that closes it (End).
//
// Kept at package scope so that ExtractReasoning and ClosingTokenForStart share
// a single source of truth. Both consult this slice after any custom pairs
// supplied through Config, so a custom pair takes precedence over a built-in
// one.
var defaultReasoningTagPairs = []TagPair{
{Start: "<|START_THINKING|>", End: "<|END_THINKING|>"}, // Command-R models
{Start: "<|inner_prefix|>", End: "<|inner_suffix|>"}, // Apertus models
{Start: "<seed:think>", End: "</seed:think>"}, // Seed models
{Start: "<think>", End: "</think>"}, // DeepSeek, Granite, ExaOne models
{Start: "<|think|>", End: "<|end|><|begin|>assistant<|content|>"}, // Solar Open models (complex end)
{Start: "<|channel>thought", End: "<channel|>"}, // Gemma 4 models
{Start: "<thinking>", End: "</thinking>"}, // General thinking tag
{Start: "[THINK]", End: "[/THINK]"}, // Magistral models
}
// ClosingTokenForStart returns the closing reasoning tag that pairs with the
// given start token. Custom pairs in config.TagPairs are searched first (config
// may be nil), then the built-in defaults. It returns "" when startToken is
// empty or unrecognized.
//
// Custom pairs with an empty End are skipped, so the lookup falls through to a
// later custom pair or to the built-in default for the same start token. This
// mirrors ExtractReasoning, which ignores custom pairs that lack a Start or an
// End; without it an incomplete custom entry would hide a usable default and
// the caller would wrongly conclude that the start token has no closing tag.
//
// Used by the non-streaming autoparser fallback to decide whether a complete
// response that began with a prefilled thinking token actually closed its
// reasoning block: only then is synthesizing the start token (so the standard
// extractor can pair it with the model's close tag) safe. A complete response
// with no closing tag is a direct answer, not unclosed reasoning.
func ClosingTokenForStart(startToken string, config *Config) string {
	if startToken == "" {
		return ""
	}
	if config != nil {
		for _, pair := range config.TagPairs {
			if pair.Start == startToken && pair.End != "" {
				return pair.End
			}
		}
	}
	for _, pair := range defaultReasoningTagPairs {
		if pair.Start == startToken {
			return pair.End
		}
	}
	return ""
}
// ExtractReasoning separates reasoning blocks from the rest of content.
//
// A reasoning block is the text between a start tag and its end tag. Custom
// pairs from config.TagPairs are considered first (config may be nil; pairs
// with an empty Start or End are ignored), followed by the built-in
// defaultReasoningTagPairs. Scanning runs left to right: at each step the pair
// whose start tag occurs earliest wins, and on a tie the pair listed first is
// used. A start tag with no matching end tag opens a block that runs to the
// end of content.
//
// It returns:
//   - reasoning: the whitespace-trimmed bodies of all non-empty blocks, joined
//     with a blank line ("\n\n");
//   - cleanedContent: content with every block, including its tags, removed.
//     Text outside the blocks is kept verbatim.
func ExtractReasoning(content string, config *Config) (reasoning string, cleanedContent string) {
	if content == "" {
		return "", content
	}

	// Candidate pairs in priority order: usable custom pairs, then defaults.
	var pairs []TagPair
	if config != nil {
		for _, custom := range config.TagPairs {
			if custom.Start == "" || custom.End == "" {
				continue
			}
			pairs = append(pairs, TagPair{Start: custom.Start, End: custom.End})
		}
	}
	pairs = append(pairs, defaultReasoningTagPairs...)

	var (
		thoughts []string
		visible  strings.Builder
	)

	// cursor marks how much of content has been consumed so far.
	cursor := 0
	for {
		// Locate the earliest opening tag at or after cursor, remembering the
		// span of the block body and where scanning should resume afterwards.
		openAt := -1
		bodyFrom, bodyTo, resumeAt := 0, 0, 0
		for _, pair := range pairs {
			rel := strings.Index(content[cursor:], pair.Start)
			if rel < 0 {
				continue
			}
			at := cursor + rel
			if openAt != -1 && at >= openAt {
				// An earlier (or equally early but higher-priority) tag is
				// already selected.
				continue
			}

			from := at + len(pair.Start)
			openAt, bodyFrom = at, from
			if closeRel := strings.Index(content[from:], pair.End); closeRel < 0 {
				// Unclosed tag: the block runs to the end of the input.
				bodyTo, resumeAt = len(content), len(content)
			} else {
				bodyTo = from + closeRel
				resumeAt = bodyTo + len(pair.End)
			}
		}

		if openAt == -1 {
			// No further tags: whatever is left is plain content.
			visible.WriteString(content[cursor:])
			break
		}

		// Keep the text that precedes the tag, then collect the block body.
		visible.WriteString(content[cursor:openAt])
		if thought := strings.TrimSpace(content[bodyFrom:bodyTo]); thought != "" {
			thoughts = append(thoughts, thought)
		}
		cursor = resumeAt
	}

	return strings.Join(thoughts, "\n\n"), visible.String()
}