fix(reasoning): support models with reasoning without starting thinking tag (#8132)

* chore: extract reasoning to its own package

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* make sure we detect thinking tokens from template

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Allow overriding via config, add tests

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
Ettore Di Giacinto
2026-01-20 21:07:59 +01:00
committed by GitHub
parent e886bb291a
commit 34e054f607
10 changed files with 542 additions and 127 deletions

View File

@@ -1,114 +0,0 @@
package functions
import (
"strings"
)
// ExtractReasoning extracts reasoning content from thinking tags and returns
// both the extracted reasoning and the cleaned content (with tags removed).
// It handles <thinking>...</thinking> and <think>...</think> tags.
// Multiple reasoning blocks are concatenated with newlines.
// NOTE(review): this commit deletes this file; an extended version of the
// same algorithm (with more tag dialects) now lives in pkg/reasoning.
func ExtractReasoning(content string) (reasoning string, cleanedContent string) {
// Empty input: nothing to scan, return it unchanged.
if content == "" {
return "", content
}
var reasoningParts []string
var cleanedParts []string
remaining := content
// Define tag pairs to look for
// "<thinking>" is listed before its prefix "<think>": on a tie at the same
// start position the first candidate recorded wins (see the strict "<"
// comparisons below), so the longer tag takes precedence.
tagPairs := []struct {
start string
end string
}{
{"<thinking>", "</thinking>"},
{"<think>", "</think>"},
}
// Track the last position we've processed
lastPos := 0
for {
// Find the earliest tag start
earliestStart := -1
earliestEnd := -1
isUnclosed := false
var matchedTag struct {
start string
end string
}
for _, tagPair := range tagPairs {
startIdx := strings.Index(remaining[lastPos:], tagPair.start)
if startIdx == -1 {
continue
}
startIdx += lastPos
// Find the corresponding end tag
endIdx := strings.Index(remaining[startIdx+len(tagPair.start):], tagPair.end)
if endIdx == -1 {
// Unclosed tag - extract what we have
// (everything up to the end of the input counts as reasoning)
if earliestStart == -1 || startIdx < earliestStart {
earliestStart = startIdx
earliestEnd = len(remaining)
isUnclosed = true
matchedTag = tagPair
}
continue
}
endIdx += startIdx + len(tagPair.start)
// Found a complete tag pair
if earliestStart == -1 || startIdx < earliestStart {
earliestStart = startIdx
earliestEnd = endIdx + len(tagPair.end)
isUnclosed = false
matchedTag = tagPair
}
}
if earliestStart == -1 {
// No more tags found, add remaining content
if lastPos < len(remaining) {
cleanedParts = append(cleanedParts, remaining[lastPos:])
}
break
}
// Add content before the tag
if earliestStart > lastPos {
cleanedParts = append(cleanedParts, remaining[lastPos:earliestStart])
}
// Extract reasoning content
reasoningStart := earliestStart + len(matchedTag.start)
// For unclosed tags, earliestEnd is already at the end of the string
// For closed tags, earliestEnd points to after the closing tag, so we subtract the end tag length
var reasoningEnd int
if isUnclosed {
// Unclosed tag - extract everything to the end
reasoningEnd = len(remaining)
} else {
// Closed tag - exclude the end tag
reasoningEnd = earliestEnd - len(matchedTag.end)
}
if reasoningEnd > reasoningStart {
// Whitespace around a reasoning block is trimmed; blocks that are
// empty after trimming are dropped entirely.
reasoningContent := strings.TrimSpace(remaining[reasoningStart:reasoningEnd])
if reasoningContent != "" {
reasoningParts = append(reasoningParts, reasoningContent)
}
}
// Move past this tag
lastPos = earliestEnd
}
// Combine reasoning parts
// (separate blocks are joined by a blank line)
reasoning = strings.Join(reasoningParts, "\n\n")
// Combine cleaned content parts
// (joined verbatim with no separator, preserving original spacing)
cleanedContent = strings.Join(cleanedParts, "")
return reasoning, cleanedContent
}

View File

@@ -393,7 +393,7 @@ func (wd *WatchDog) checkMemory() {
memoryType = "RAM"
}
xlog.Debug("[WatchDog] Memory check", "type", memoryType, "usage_percent", aggregate.UsagePercent, "threshold_percent", thresholdPercent, "loaded_models", modelCount)
//xlog.Debug("[WatchDog] Memory check", "type", memoryType, "usage_percent", aggregate.UsagePercent, "threshold_percent", thresholdPercent, "loaded_models", modelCount)
// Check if usage exceeds threshold
if aggregate.UsagePercent > thresholdPercent {

5
pkg/reasoning/config.go Normal file
View File

@@ -0,0 +1,5 @@
package reasoning
// Config holds reasoning-related overrides for a model configuration.
type Config struct {
// DisableReasoningTagPrefill uses *bool so an unset value (nil) can be
// distinguished from an explicit false in YAML/JSON.
// NOTE(review): the exact semantics (presumably: skip auto-prefilling a
// detected thinking start token) are inferred from the field name and the
// commit message — confirm against the code that consumes this config.
DisableReasoningTagPrefill *bool `yaml:"disable_reasoning_tag_prefill,omitempty" json:"disable_reasoning_tag_prefill,omitempty"`
}

208
pkg/reasoning/reasoning.go Normal file
View File

@@ -0,0 +1,208 @@
package reasoning
import (
"strings"
)
// DetectThinkingStartToken checks if the prompt or template contains a
// thinking start token and returns the detected token. This indicates that
// the model's prompt template already includes the thinking token, so the
// model output will start with reasoning content without an explicit
// opening tag.
// Returns the detected token if found, empty string otherwise.
//
// Detection is a plain substring match (anywhere in the prompt), which the
// package tests pin as the intended behavior: templates may place stop
// tokens or other scaffolding after the thinking token.
//
// Tokens checked (in order of specificity - longer first), based on
// llama.cpp's chat-parser.cpp implementations:
//   - <|START_THINKING|> (Command-R models)
//   - <|inner_prefix|> (Apertus models)
//   - <seed:think> (Seed models)
//   - <think> (DeepSeek, Granite, ExaOne, GLM models)
//   - <|think|> (Solar Open models)
//   - <thinking> (General thinking tag)
//   - [THINK] (Magistral models)
func DetectThinkingStartToken(prompt string) string {
	// Common thinking start tokens (in order of specificity - longer first)
	// Based on llama.cpp's chat-parser.cpp implementations
	thinkingStartTokens := []string{
		"<|START_THINKING|>", // Command-R models
		"<|inner_prefix|>",   // Apertus models
		"<seed:think>",       // Seed models
		"<think>",            // DeepSeek, Granite, ExaOne, GLM models
		"<|think|>",          // Solar Open models
		"<thinking>",         // General thinking tag
		"[THINK]",            // Magistral models
	}
	// Trailing whitespace never affects a substring match; trimming simply
	// mirrors how rendered prompts are typically terminated.
	trimmedPrompt := strings.TrimRight(prompt, " \t\n\r")
	for _, token := range thinkingStartTokens {
		if strings.Contains(trimmedPrompt, token) {
			return token
		}
	}
	// NOTE: a second "scan the last 100 chars" pass was removed here — the
	// Contains scan above already finds a token anywhere in the prompt, so
	// that pass was unreachable dead code.
	return ""
}
// PrependThinkingTokenIfNeeded prepends the thinking start token to content
// when the token was detected in the prompt template. Re-inserting the token
// lets the standard extraction logic treat the output as normally tagged
// reasoning. The token goes right after any leading whitespace; content that
// already carries the token (after trimming leading whitespace) is returned
// untouched, and an empty startToken is a no-op.
func PrependThinkingTokenIfNeeded(content string, startToken string) string {
	if startToken == "" {
		return content
	}
	// Strip leading whitespace once: it tells us both whether the token is
	// already present and where the insertion point sits.
	trimmed := strings.TrimLeft(content, " \t\n\r")
	if strings.Contains(trimmed, startToken) {
		// Token already present - nothing to do.
		return content
	}
	// Insert the token immediately after the leading whitespace run.
	wsLen := len(content) - len(trimmed)
	return content[:wsLen] + startToken + content[wsLen:]
}
// ExtractReasoning extracts reasoning content from thinking tags and returns
// both the extracted reasoning and the cleaned content (with tags removed).
// It handles the tag dialects listed in tagPairs below (matching llama.cpp's
// chat-parser.cpp): <|START_THINKING|>, <|inner_prefix|>, <seed:think>,
// <thinking>, <think>, <|think|> and [THINK].
// Multiple reasoning blocks are concatenated with blank lines. An unclosed
// start tag consumes everything up to the end of the content.
func ExtractReasoning(content string) (reasoning string, cleanedContent string) {
	if content == "" {
		return "", content
	}
	var reasoningParts []string
	var cleanedParts []string
	remaining := content
	// Define tag pairs to look for (matching llama.cpp's chat-parser.cpp).
	// "<thinking>" is deliberately listed before "<think>": "<think>" is a
	// prefix of "<thinking>", and together with the closed-over-unclosed
	// tie-break below this keeps "<thinking>...</thinking>" from being
	// misparsed as an unclosed "<think>" block.
	tagPairs := []struct {
		start string
		end   string
	}{
		{"<|START_THINKING|>", "<|END_THINKING|>"}, // Command-R models
		{"<|inner_prefix|>", "<|inner_suffix|>"},   // Apertus models
		{"<seed:think>", "</seed:think>"},          // Seed models
		{"<thinking>", "</thinking>"},              // General thinking tag
		{"<think>", "</think>"},                    // DeepSeek, Granite, ExaOne models
		{"<|think|>", "<|end|><|begin|>assistant<|content|>"}, // Solar Open models (complex end)
		{"[THINK]", "[/THINK]"}, // Magistral models
	}
	// Position up to which the input has already been classified.
	lastPos := 0
	for {
		// Scan all tag pairs and keep the best candidate: the earliest start,
		// with a complete (closed) match preferred over an unclosed one when
		// both begin at the same position.
		earliestStart := -1
		earliestEnd := -1
		isUnclosed := false
		var matchedTag struct {
			start string
			end   string
		}
		for _, tagPair := range tagPairs {
			startIdx := strings.Index(remaining[lastPos:], tagPair.start)
			if startIdx == -1 {
				continue
			}
			startIdx += lastPos
			// Find the corresponding end tag after the start tag.
			endIdx := strings.Index(remaining[startIdx+len(tagPair.start):], tagPair.end)
			if endIdx == -1 {
				// Unclosed tag: only a strictly earlier start may replace the
				// current best, so a closed match at the same position wins.
				if earliestStart == -1 || startIdx < earliestStart {
					earliestStart = startIdx
					earliestEnd = len(remaining)
					isUnclosed = true
					matchedTag = tagPair
				}
				continue
			}
			endIdx += startIdx + len(tagPair.start)
			// Found a complete tag pair. It also replaces an unclosed
			// candidate starting at the same position (bug fix: previously a
			// first-listed unclosed prefix match such as "<think>" would win
			// the tie against a complete "<thinking>...</thinking>" pair).
			if earliestStart == -1 || startIdx < earliestStart ||
				(startIdx == earliestStart && isUnclosed) {
				earliestStart = startIdx
				earliestEnd = endIdx + len(tagPair.end)
				isUnclosed = false
				matchedTag = tagPair
			}
		}
		if earliestStart == -1 {
			// No more tags found: the rest of the input is plain content.
			if lastPos < len(remaining) {
				cleanedParts = append(cleanedParts, remaining[lastPos:])
			}
			break
		}
		// Keep plain content that precedes the tag.
		if earliestStart > lastPos {
			cleanedParts = append(cleanedParts, remaining[lastPos:earliestStart])
		}
		// Slice out the reasoning between the tags.
		reasoningStart := earliestStart + len(matchedTag.start)
		var reasoningEnd int
		if isUnclosed {
			// Unclosed tag - extract everything to the end.
			reasoningEnd = len(remaining)
		} else {
			// Closed tag - exclude the end tag itself.
			reasoningEnd = earliestEnd - len(matchedTag.end)
		}
		if reasoningEnd > reasoningStart {
			// Trim whitespace around the block; drop blocks that end up empty.
			reasoningContent := strings.TrimSpace(remaining[reasoningStart:reasoningEnd])
			if reasoningContent != "" {
				reasoningParts = append(reasoningParts, reasoningContent)
			}
		}
		// Continue scanning after this tag.
		lastPos = earliestEnd
	}
	// Reasoning blocks are joined by a blank line; cleaned content verbatim.
	reasoning = strings.Join(reasoningParts, "\n\n")
	cleanedContent = strings.Join(cleanedParts, "")
	return reasoning, cleanedContent
}

View File

@@ -0,0 +1,13 @@
package reasoning_test
import (
"testing"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)
// TestReasoning is the standard `go test` entry point for this package: it
// wires Gomega assertion failures into Ginkgo and runs every spec registered
// in the package.
func TestReasoning(t *testing.T) {
RegisterFailHandler(Fail)
RunSpecs(t, "Reasoning test suite")
}

View File

@@ -1,9 +1,9 @@
package functions_test
package reasoning_test
import (
"strings"
. "github.com/mudler/LocalAI/pkg/functions"
. "github.com/mudler/LocalAI/pkg/reasoning"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)
@@ -258,4 +258,249 @@ var _ = Describe("ExtractReasoning", func() {
Expect(cleaned).To(Equal("Text More"))
})
})
Context("when content has <|START_THINKING|> tags (Command-R)", func() {
It("should extract reasoning from START_THINKING block", func() {
content := "Text <|START_THINKING|>Command-R reasoning<|END_THINKING|> More"
reasoning, cleaned := ExtractReasoning(content)
Expect(reasoning).To(Equal("Command-R reasoning"))
Expect(cleaned).To(Equal("Text More"))
})
It("should handle unclosed START_THINKING block", func() {
content := "Before <|START_THINKING|>Incomplete reasoning"
reasoning, cleaned := ExtractReasoning(content)
Expect(reasoning).To(Equal("Incomplete reasoning"))
Expect(cleaned).To(Equal("Before "))
})
})
Context("when content has <|inner_prefix|> tags (Apertus)", func() {
It("should extract reasoning from inner_prefix block", func() {
content := "Text <|inner_prefix|>Apertus reasoning<|inner_suffix|> More"
reasoning, cleaned := ExtractReasoning(content)
Expect(reasoning).To(Equal("Apertus reasoning"))
Expect(cleaned).To(Equal("Text More"))
})
})
Context("when content has <seed:think> tags (Seed)", func() {
It("should extract reasoning from seed:think block", func() {
content := "Text <seed:think>Seed reasoning</seed:think> More"
reasoning, cleaned := ExtractReasoning(content)
Expect(reasoning).To(Equal("Seed reasoning"))
Expect(cleaned).To(Equal("Text More"))
})
})
Context("when content has <|think|> tags (Solar Open)", func() {
It("should extract reasoning from Solar Open think block", func() {
content := "Text <|think|>Solar reasoning<|end|><|begin|>assistant<|content|> More"
reasoning, cleaned := ExtractReasoning(content)
Expect(reasoning).To(Equal("Solar reasoning"))
Expect(cleaned).To(Equal("Text More"))
})
})
Context("when content has [THINK] tags (Magistral)", func() {
It("should extract reasoning from THINK block", func() {
content := "Text [THINK]Magistral reasoning[/THINK] More"
reasoning, cleaned := ExtractReasoning(content)
Expect(reasoning).To(Equal("Magistral reasoning"))
Expect(cleaned).To(Equal("Text More"))
})
It("should handle unclosed THINK block", func() {
content := "Before [THINK]Incomplete reasoning"
reasoning, cleaned := ExtractReasoning(content)
Expect(reasoning).To(Equal("Incomplete reasoning"))
Expect(cleaned).To(Equal("Before "))
})
})
})
// Specs for DetectThinkingStartToken: detection of thinking start tokens in
// rendered prompts/templates.
// NOTE(review): several specs below deliberately pin the Contains-based
// behavior — tokens are detected anywhere in the prompt, not only at the end.
var _ = Describe("DetectThinkingStartToken", func() {
Context("when prompt contains thinking start tokens", func() {
It("should detect <|START_THINKING|> at the end", func() {
prompt := "Some prompt text <|START_THINKING|>"
token := DetectThinkingStartToken(prompt)
Expect(token).To(Equal("<|START_THINKING|>"))
})
It("should detect <think> at the end", func() {
prompt := "Prompt with <think>"
token := DetectThinkingStartToken(prompt)
Expect(token).To(Equal("<think>"))
})
It("should detect <thinking> at the end", func() {
prompt := "Some text <thinking>"
token := DetectThinkingStartToken(prompt)
Expect(token).To(Equal("<thinking>"))
})
It("should detect <|inner_prefix|> at the end", func() {
prompt := "Prompt <|inner_prefix|>"
token := DetectThinkingStartToken(prompt)
Expect(token).To(Equal("<|inner_prefix|>"))
})
It("should detect <seed:think> at the end", func() {
prompt := "Text <seed:think>"
token := DetectThinkingStartToken(prompt)
Expect(token).To(Equal("<seed:think>"))
})
It("should detect <|think|> at the end", func() {
prompt := "Prompt <|think|>"
token := DetectThinkingStartToken(prompt)
Expect(token).To(Equal("<|think|>"))
})
It("should detect [THINK] at the end", func() {
prompt := "Text [THINK]"
token := DetectThinkingStartToken(prompt)
Expect(token).To(Equal("[THINK]"))
})
It("should handle trailing whitespace", func() {
prompt := "Prompt <|START_THINKING|> \n\t "
token := DetectThinkingStartToken(prompt)
Expect(token).To(Equal("<|START_THINKING|>"))
})
It("should detect token near the end (within last 100 chars)", func() {
prefix := strings.Repeat("x", 50)
prompt := prefix + "<|START_THINKING|>"
token := DetectThinkingStartToken(prompt)
Expect(token).To(Equal("<|START_THINKING|>"))
})
It("should detect token when followed by only whitespace", func() {
prompt := "Text <think> \n "
token := DetectThinkingStartToken(prompt)
Expect(token).To(Equal("<think>"))
})
})
// These specs document that detection is NOT anchored to the prompt's end.
Context("when prompt does not contain thinking tokens", func() {
It("should return empty string for regular prompt", func() {
prompt := "This is a regular prompt without thinking tokens"
token := DetectThinkingStartToken(prompt)
Expect(token).To(BeEmpty())
})
It("should return empty string for empty prompt", func() {
prompt := ""
token := DetectThinkingStartToken(prompt)
Expect(token).To(BeEmpty())
})
It("should detect token even when far from end (Contains check)", func() {
prefix := strings.Repeat("x", 150)
prompt := prefix + "<|START_THINKING|>"
token := DetectThinkingStartToken(prompt)
// Current implementation uses Contains, so it finds tokens anywhere
Expect(token).To(Equal("<|START_THINKING|>"))
})
It("should detect token even when followed by non-whitespace (Contains check)", func() {
prompt := "Text <|START_THINKING|>more text"
token := DetectThinkingStartToken(prompt)
// Current implementation uses Contains, so it finds tokens anywhere
Expect(token).To(Equal("<|START_THINKING|>"))
})
})
Context("when multiple tokens are present", func() {
It("should return the first matching token (most specific)", func() {
prompt := "Text <|START_THINKING|> <thinking>"
token := DetectThinkingStartToken(prompt)
// Should return the first one found (order matters)
Expect(token).To(Equal("<|START_THINKING|>"))
})
})
})
// Specs for PrependThinkingTokenIfNeeded: re-inserting a detected thinking
// start token so downstream extraction sees normally tagged output.
var _ = Describe("PrependThinkingTokenIfNeeded", func() {
Context("when startToken is empty", func() {
It("should return content unchanged", func() {
content := "Some content"
result := PrependThinkingTokenIfNeeded(content, "")
Expect(result).To(Equal(content))
})
})
// An already-present token (anywhere in the content) must not be duplicated.
Context("when content already starts with token", func() {
It("should not prepend if content starts with token", func() {
content := "<|START_THINKING|>Reasoning content"
result := PrependThinkingTokenIfNeeded(content, "<|START_THINKING|>")
Expect(result).To(Equal(content))
})
It("should not prepend if content starts with token after whitespace", func() {
content := " <think>Reasoning"
result := PrependThinkingTokenIfNeeded(content, "<think>")
Expect(result).To(Equal(content))
})
It("should not prepend if token appears anywhere in content", func() {
content := "Some text <thinking>Reasoning</thinking>"
result := PrependThinkingTokenIfNeeded(content, "<thinking>")
// With Contains check, it should not prepend
Expect(result).To(Equal(content))
})
})
// Insertion point is after any leading whitespace run.
Context("when content does not contain token", func() {
It("should prepend token to content", func() {
content := "Reasoning content"
result := PrependThinkingTokenIfNeeded(content, "<|START_THINKING|>")
Expect(result).To(Equal("<|START_THINKING|>Reasoning content"))
})
It("should prepend token after leading whitespace", func() {
content := " \n Reasoning content"
result := PrependThinkingTokenIfNeeded(content, "<think>")
Expect(result).To(Equal(" \n <think>Reasoning content"))
})
It("should handle empty content", func() {
content := ""
result := PrependThinkingTokenIfNeeded(content, "<thinking>")
Expect(result).To(Equal("<thinking>"))
})
It("should handle content with only whitespace", func() {
content := " \n\t "
result := PrependThinkingTokenIfNeeded(content, "<|START_THINKING|>")
Expect(result).To(Equal(" \n\t <|START_THINKING|>"))
})
})
Context("with different token types", func() {
It("should prepend <|START_THINKING|>", func() {
content := "Reasoning"
result := PrependThinkingTokenIfNeeded(content, "<|START_THINKING|>")
Expect(result).To(Equal("<|START_THINKING|>Reasoning"))
})
It("should prepend <think>", func() {
content := "Reasoning"
result := PrependThinkingTokenIfNeeded(content, "<think>")
Expect(result).To(Equal("<think>Reasoning"))
})
It("should prepend <thinking>", func() {
content := "Reasoning"
result := PrependThinkingTokenIfNeeded(content, "<thinking>")
Expect(result).To(Equal("<thinking>Reasoning"))
})
It("should prepend [THINK]", func() {
content := "Reasoning"
result := PrependThinkingTokenIfNeeded(content, "[THINK]")
Expect(result).To(Equal("[THINK]Reasoning"))
})
})
})

View File

@@ -569,7 +569,7 @@ func getIntelGPUTop() []GPUMemoryInfo {
cmd.Stderr = &stderr
if err := cmd.Run(); err != nil {
xlog.Debug("intel_gpu_top failed", "error", err, "stderr", stderr.String())
xlog.Debug("intel_gpu_top failed", "error", err, "stderr", stderr.String(), "stdout", stdout.String())
return nil
}