mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-10 17:56:49 -04:00
* fix(reasoning): stop prefilled <think> from swallowing tag-less answers When a chat template injects the thinking start token into the prompt (so DetectThinkingStartToken returns e.g. "<think>"), the model's output begins inside a reasoning block and carries only the closing tag. The non-jinja autoparser fallback (peg-native "pure content" mode, issue #9985) prepends the start token so the extractor can pair it with the model's </think>. But on a COMPLETE response that contains no closing tag, the model answered directly with no reasoning at all. Prepending the start token there manufactures an unclosed block that swallows the entire answer into reasoning, leaving the OpenAI `content` field empty. This breaks short/direct answers — session names, JSON summaries, any terse completion where the model skips the think block — which come back with empty content. Regression surfaced by #9991, which added the defensive prefill extraction to the complete-response paths. Add reasoning.ExtractReasoningComplete: it only honors a prefilled start token when the response actually contains the matching closing tag (proof a reasoning block exists). Genuine reasoning tags already in the content still extract; tag-less content stays content. Apply it at every complete-response site (applyAutoparserOverride, realtime, openresponses). The streaming per-token extractor is intentionally left on ExtractReasoningWithConfig — mid-stream an as-yet-unclosed block is legitimate and must surface as reasoning deltas. Also adds reasoning.ClosingTokenForStart and hoists the default reasoning tag pairs to package scope so both helpers share one source of truth. 
Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> * test(reasoning): cover the enable_thinking=false non-thinking-mode regression Adds the end-to-end case that actually broke session summaries / auto-titles and was not covered before: a request with enable_thinking=false against a <think>-capable model. In non-thinking mode the model emits no reasoning block, so llama.cpp's autoparser returns ChatDeltas with content set and reasoning_content empty (verified against stock llama-server: same model with chat_template_kwargs.enable_thinking=false returns reasoning_content=null, content="hello"). thinkingStartToken is still "<think>" because it is detected per-model from the enable_thinking=true render, so the old code prepended it and swallowed the answer. The test fails without the ExtractReasoningComplete gate. Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Co-authored-by: Ettore Di Giacinto <mudler@localai.io> Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
327 lines
12 KiB
Go
327 lines
12 KiB
Go
package reasoning
|
|
|
|
import (
|
|
"strings"
|
|
)
|
|
|
|
// DetectThinkingStartToken checks if the prompt or template contains a thinking start token
|
|
// and returns the detected token. This indicates that the model's prompt template
|
|
// already includes the thinking token, so the model output will start with reasoning
|
|
// content without an explicit opening tag.
|
|
// Returns the detected token if found, empty string otherwise.
|
|
// Common tokens checked (in order of specificity - longer first):
|
|
// Based on llama.cpp's chat-parser.cpp implementations:
|
|
// - <|START_THINKING|> (Command-R models)
|
|
// - <|inner_prefix|> (Apertus models)
|
|
// - <seed:think> (Seed models)
|
|
// - <think> (DeepSeek, Granite, ExaOne models)
|
|
// - <|channel>thought (Gemma 4 models)
|
|
// - <|think|> (Solar Open models)
|
|
// - <thinking> (General thinking tag)
|
|
// - [THINK] (Magistral models)
|
|
// Custom tokens from config are checked first, then default tokens.
|
|
func DetectThinkingStartToken(prompt string, config *Config) string {
|
|
// Common thinking start tokens (in order of specificity - longer first)
|
|
// Based on llama.cpp's chat-parser.cpp implementations
|
|
defaultTokens := []string{
|
|
"<|START_THINKING|>", // Command-R models
|
|
"<|channel>thought", // Gemma 4 models (before <|think|> — Gemma 4 templates contain both)
|
|
"<|inner_prefix|>", // Apertus models
|
|
"<seed:think>", // Seed models
|
|
"<think>", // DeepSeek, Granite, ExaOne models
|
|
"<|think|>", // Solar Open models
|
|
"<thinking>", // General thinking tag
|
|
"[THINK]", // Magistral models
|
|
}
|
|
|
|
// Merge custom tokens with default tokens (custom tokens first for priority)
|
|
var thinkingStartTokens []string
|
|
if config != nil && len(config.ThinkingStartTokens) > 0 {
|
|
thinkingStartTokens = append(thinkingStartTokens, config.ThinkingStartTokens...)
|
|
}
|
|
thinkingStartTokens = append(thinkingStartTokens, defaultTokens...)
|
|
|
|
// Check if prompt ends with any of these tokens (allowing for trailing whitespace/newlines)
|
|
trimmedPrompt := strings.TrimRight(prompt, " \t\n\r")
|
|
for _, token := range thinkingStartTokens {
|
|
if strings.Contains(trimmedPrompt, token) {
|
|
return token
|
|
}
|
|
}
|
|
|
|
// Also check if any of these tokens appear near the end (within last 100 chars)
|
|
// This handles cases where there might be stop tokens or other content after
|
|
if len(trimmedPrompt) > 100 {
|
|
lastPart := trimmedPrompt[len(trimmedPrompt)-100:]
|
|
for _, token := range thinkingStartTokens {
|
|
if idx := strings.LastIndex(lastPart, token); idx != -1 {
|
|
// Check if this is the last meaningful content (only whitespace after)
|
|
afterToken := lastPart[idx+len(token):]
|
|
if strings.TrimSpace(afterToken) == "" {
|
|
return token
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return ""
|
|
}
|
|
|
|
// ExtractReasoningWithConfig extracts reasoning from content with the given config.
|
|
// If reasoning is disabled, it returns the original content.
|
|
// If thinking start token prefill is enabled, it prepends the thinking start token to the content.
|
|
// It returns the extracted reasoning and the cleaned content.
|
|
func ExtractReasoningWithConfig(content, thinkingStartToken string, config Config) (reasoning string, cleanedContent string) {
|
|
cleanedContent = content
|
|
// If reasoning is not disabled, prepend the thinking start token if needed and extract reasoning
|
|
if config.DisableReasoning == nil || !*config.DisableReasoning {
|
|
// If thinking start token prefill is not disabled, prepend the thinking start token
|
|
if config.DisableReasoningTagPrefill == nil || !*config.DisableReasoningTagPrefill {
|
|
cleanedContent = PrependThinkingTokenIfNeeded(cleanedContent, thinkingStartToken)
|
|
}
|
|
// Extract reasoning from the cleaned content
|
|
reasoning, cleanedContent = ExtractReasoning(cleanedContent, &config)
|
|
if config.StripReasoningOnly != nil && *config.StripReasoningOnly {
|
|
reasoning = ""
|
|
}
|
|
}
|
|
|
|
return reasoning, cleanedContent
|
|
}
|
|
|
|
// ExtractReasoningComplete extracts reasoning from a COMPLETE (non-streaming)
|
|
// model response. It behaves like ExtractReasoningWithConfig except that it only
|
|
// honors a prefilled thinking start token when the response actually contains
|
|
// the matching closing tag.
|
|
//
|
|
// Rationale: when a chat template injects the start token into the prompt (so
|
|
// DetectThinkingStartToken returns e.g. "<think>"), the model's output begins
|
|
// inside a reasoning block and carries only the closing tag. The defensive
|
|
// fallback prepends the start token so the extractor can pair it with that
|
|
// close tag. But on a COMPLETE response with no closing tag, the model answered
|
|
// directly with no reasoning at all — prepending the start token would
|
|
// manufacture an unclosed block that swallows the entire answer into reasoning,
|
|
// leaving content empty (breaking short/direct answers such as session names or
|
|
// JSON summaries). Genuine reasoning tags already present in the content still
|
|
// extract, because dropping the synthetic prefill does not affect them.
|
|
//
|
|
// Streaming callers must keep using ExtractReasoningWithConfig: mid-stream an
|
|
// as-yet-unclosed block is legitimate and its tokens should surface as
|
|
// reasoning deltas as they arrive.
|
|
func ExtractReasoningComplete(content, thinkingStartToken string, config Config) (reasoning string, cleanedContent string) {
|
|
startToken := thinkingStartToken
|
|
if startToken != "" {
|
|
if end := ClosingTokenForStart(startToken, &config); end == "" || !strings.Contains(content, end) {
|
|
startToken = ""
|
|
}
|
|
}
|
|
return ExtractReasoningWithConfig(content, startToken, config)
|
|
}
|
|
|
|
// PrependThinkingTokenIfNeeded inserts startToken in front of the first
// non-whitespace character of content, so that output from a model whose
// prompt already contained the thinking token looks like ordinary tagged
// content to the standard extraction logic.
//
// The content is returned unchanged when:
//   - startToken is empty;
//   - the content (ignoring leading whitespace) already contains startToken;
//   - the content (ignoring leading whitespace) is a non-empty prefix of
//     startToken, e.g. "<|channel>" accumulating toward "<|channel>thought",
//     meaning the tag itself is still arriving token-by-token during streaming.
//
// Leading spaces, tabs, newlines and carriage returns are preserved ahead of
// the inserted token.
func PrependThinkingTokenIfNeeded(content string, startToken string) string {
	if startToken == "" {
		return content
	}

	// body is content without its leading whitespace; it is a suffix of content.
	body := strings.TrimLeft(content, " \t\n\r")

	switch {
	case strings.Contains(body, startToken):
		// Token already present somewhere in the content.
		return content
	case body != "" && strings.HasPrefix(startToken, body):
		// Partial tag still being streamed; do not prepend a second one.
		return content
	}

	// Keep the leading whitespace in place and put the token right after it.
	lead := len(content) - len(body)
	return content[:lead] + startToken + body
}
|
|
|
|
// defaultReasoningTagPairs are the built-in start/end reasoning tag pairs,
|
|
// matching llama.cpp's chat-parser.cpp. Kept at package scope so that
|
|
// ExtractReasoning and ClosingTokenForStart share a single source of truth.
|
|
var defaultReasoningTagPairs = []TagPair{
|
|
{Start: "<|START_THINKING|>", End: "<|END_THINKING|>"}, // Command-R models
|
|
{Start: "<|inner_prefix|>", End: "<|inner_suffix|>"}, // Apertus models
|
|
{Start: "<seed:think>", End: "</seed:think>"}, // Seed models
|
|
{Start: "<think>", End: "</think>"}, // DeepSeek, Granite, ExaOne models
|
|
{Start: "<|think|>", End: "<|end|><|begin|>assistant<|content|>"}, // Solar Open models (complex end)
|
|
{Start: "<|channel>thought", End: "<channel|>"}, // Gemma 4 models
|
|
{Start: "<thinking>", End: "</thinking>"}, // General thinking tag
|
|
{Start: "[THINK]", End: "[/THINK]"}, // Magistral models
|
|
}
|
|
|
|
// ClosingTokenForStart returns the closing reasoning tag that pairs with the
|
|
// given start token, searching custom config TagPairs first then the built-in
|
|
// defaults. Returns "" when startToken is empty or unrecognized.
|
|
//
|
|
// Used by the non-streaming autoparser fallback to decide whether a complete
|
|
// response that began with a prefilled thinking token actually closed its
|
|
// reasoning block: only then is synthesizing the start token (so the standard
|
|
// extractor can pair it with the model's close tag) safe. A complete response
|
|
// with no closing tag is a direct answer, not unclosed reasoning.
|
|
func ClosingTokenForStart(startToken string, config *Config) string {
|
|
if startToken == "" {
|
|
return ""
|
|
}
|
|
if config != nil {
|
|
for _, pair := range config.TagPairs {
|
|
if pair.Start == startToken {
|
|
return pair.End
|
|
}
|
|
}
|
|
}
|
|
for _, pair := range defaultReasoningTagPairs {
|
|
if pair.Start == startToken {
|
|
return pair.End
|
|
}
|
|
}
|
|
return ""
|
|
}
|
|
|
|
// ExtractReasoning extracts reasoning content from thinking tags and returns
|
|
// both the extracted reasoning and the cleaned content (with tags removed).
|
|
// It handles <thinking>...</thinking> and <think>...</think> tags.
|
|
// Multiple reasoning blocks are concatenated with newlines.
|
|
// Custom tag pairs from config are checked first, then default tag pairs.
|
|
func ExtractReasoning(content string, config *Config) (reasoning string, cleanedContent string) {
|
|
if content == "" {
|
|
return "", content
|
|
}
|
|
|
|
var reasoningParts []string
|
|
var cleanedParts []string
|
|
remaining := content
|
|
|
|
// Merge custom tag pairs (highest priority) with the built-in defaults.
|
|
var tagPairs []struct {
|
|
start string
|
|
end string
|
|
}
|
|
if config != nil && len(config.TagPairs) > 0 {
|
|
for _, pair := range config.TagPairs {
|
|
if pair.Start != "" && pair.End != "" {
|
|
tagPairs = append(tagPairs, struct {
|
|
start string
|
|
end string
|
|
}{pair.Start, pair.End})
|
|
}
|
|
}
|
|
}
|
|
for _, pair := range defaultReasoningTagPairs {
|
|
tagPairs = append(tagPairs, struct {
|
|
start string
|
|
end string
|
|
}{pair.Start, pair.End})
|
|
}
|
|
|
|
// Track the last position we've processed
|
|
lastPos := 0
|
|
|
|
for {
|
|
// Find the earliest tag start
|
|
earliestStart := -1
|
|
earliestEnd := -1
|
|
isUnclosed := false
|
|
var matchedTag struct {
|
|
start string
|
|
end string
|
|
}
|
|
|
|
for _, tagPair := range tagPairs {
|
|
startIdx := strings.Index(remaining[lastPos:], tagPair.start)
|
|
if startIdx == -1 {
|
|
continue
|
|
}
|
|
startIdx += lastPos
|
|
|
|
// Find the corresponding end tag
|
|
endIdx := strings.Index(remaining[startIdx+len(tagPair.start):], tagPair.end)
|
|
if endIdx == -1 {
|
|
// Unclosed tag - extract what we have
|
|
if earliestStart == -1 || startIdx < earliestStart {
|
|
earliestStart = startIdx
|
|
earliestEnd = len(remaining)
|
|
isUnclosed = true
|
|
matchedTag = tagPair
|
|
}
|
|
continue
|
|
}
|
|
endIdx += startIdx + len(tagPair.start)
|
|
|
|
// Found a complete tag pair
|
|
if earliestStart == -1 || startIdx < earliestStart {
|
|
earliestStart = startIdx
|
|
earliestEnd = endIdx + len(tagPair.end)
|
|
isUnclosed = false
|
|
matchedTag = tagPair
|
|
}
|
|
}
|
|
|
|
if earliestStart == -1 {
|
|
// No more tags found, add remaining content
|
|
if lastPos < len(remaining) {
|
|
cleanedParts = append(cleanedParts, remaining[lastPos:])
|
|
}
|
|
break
|
|
}
|
|
|
|
// Add content before the tag
|
|
if earliestStart > lastPos {
|
|
cleanedParts = append(cleanedParts, remaining[lastPos:earliestStart])
|
|
}
|
|
|
|
// Extract reasoning content
|
|
reasoningStart := earliestStart + len(matchedTag.start)
|
|
// For unclosed tags, earliestEnd is already at the end of the string
|
|
// For closed tags, earliestEnd points to after the closing tag, so we subtract the end tag length
|
|
var reasoningEnd int
|
|
if isUnclosed {
|
|
// Unclosed tag - extract everything to the end
|
|
reasoningEnd = len(remaining)
|
|
} else {
|
|
// Closed tag - exclude the end tag
|
|
reasoningEnd = earliestEnd - len(matchedTag.end)
|
|
}
|
|
if reasoningEnd > reasoningStart {
|
|
reasoningContent := strings.TrimSpace(remaining[reasoningStart:reasoningEnd])
|
|
if reasoningContent != "" {
|
|
reasoningParts = append(reasoningParts, reasoningContent)
|
|
}
|
|
}
|
|
|
|
// Move past this tag
|
|
lastPos = earliestEnd
|
|
}
|
|
|
|
// Combine reasoning parts
|
|
reasoning = strings.Join(reasoningParts, "\n\n")
|
|
// Combine cleaned content parts
|
|
cleanedContent = strings.Join(cleanedParts, "")
|
|
|
|
return reasoning, cleanedContent
|
|
}
|