fix(reasoning): support models with reasoning without starting thinking tag (#8132)

* chore: extract reasoning to its own package

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* make sure we detect thinking tokens from template

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Allow to override via config, add tests

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-03-08 18:08:41 -04:00 · 2026-01-20 21:07:59 +01:00
parent e886bb291a
commit 34e054f607
10 changed files with 542 additions and 127 deletions
--- a/core/config/gguf.go
+++ b/core/config/gguf.go
@@ -62,16 +62,23 @@ func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) {
 		cfg.NGPULayers = &defaultHigh
 	}

-	xlog.Debug("guessDefaultsFromFile: NGPULayers set", "NGPULayers", cfg.NGPULayers)
+	xlog.Debug("[gguf] guessDefaultsFromFile: NGPULayers set", "NGPULayers", cfg.NGPULayers, "modelName", f.Metadata().Name)
+
+	// identify from well known templates first, otherwise use the raw jinja template
+	chatTemplate, found := f.Header.MetadataKV.Get("tokenizer.chat_template")
+	if found {
+		// fill jinja template
+		cfg.modelTemplate = chatTemplate.ValueString()
+	}

 	// template estimations
 	if cfg.HasTemplate() {
 		// nothing to guess here
-		xlog.Debug("guessDefaultsFromFile: template already set", "name", cfg.Name)
+		xlog.Debug("[gguf] guessDefaultsFromFile: template already set", "name", cfg.Name, "modelName", f.Metadata().Name)
 		return
 	}

-	xlog.Debug("Model file loaded", "file", cfg.ModelFileName(), "eosTokenID", f.Tokenizer().EOSTokenID, "bosTokenID", f.Tokenizer().BOSTokenID, "modelName", f.Metadata().Name, "architecture", f.Architecture().Architecture)
+	xlog.Debug("[gguf] Model file loaded", "file", cfg.ModelFileName(), "eosTokenID", f.Tokenizer().EOSTokenID, "bosTokenID", f.Tokenizer().BOSTokenID, "modelName", f.Metadata().Name, "architecture", f.Architecture().Architecture)

 	// guess the name
 	if cfg.Name == "" {
@@ -83,4 +90,5 @@ func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) {
 	cfg.FunctionsConfig.GrammarConfig.NoGrammar = true
 	cfg.Options = append(cfg.Options, "use_jinja:true")
 	cfg.KnownUsecaseStrings = append(cfg.KnownUsecaseStrings, "FLAG_CHAT")
+
 }
--- a/core/config/model_config.go
+++ b/core/config/model_config.go
@@ -10,6 +10,7 @@ import (
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/pkg/downloader"
 	"github.com/mudler/LocalAI/pkg/functions"
+	"github.com/mudler/LocalAI/pkg/reasoning"
 	"github.com/mudler/cogito"
 	"gopkg.in/yaml.v3"
 )
@@ -30,6 +31,7 @@ type TTSConfig struct {
 // @Description ModelConfig represents a model configuration
 type ModelConfig struct {
 	modelConfigFile          string `yaml:"-" json:"-"`
+	modelTemplate            string `yaml:"-" json:"-"`
 	schema.PredictionOptions `yaml:"parameters,omitempty" json:"parameters,omitempty"`
 	Name                     string `yaml:"name,omitempty" json:"name,omitempty"`

@@ -51,6 +53,7 @@ type ModelConfig struct {
 	ResponseFormatMap                          map[string]interface{} `yaml:"-" json:"-"`

 	FunctionsConfig functions.FunctionsConfig `yaml:"function,omitempty" json:"function,omitempty"`
+	ReasoningConfig reasoning.Config          `yaml:"reasoning,omitempty" json:"reasoning,omitempty"`

 	FeatureFlag FeatureFlag `yaml:"feature_flags,omitempty" json:"feature_flags,omitempty"` // Feature Flag registry. We move fast, and features may break on a per model/backend basis. Registry for (usually temporary) flags that indicate aborting something early.
 	// LLM configs (GPT4ALL, Llama.cpp, ...)
@@ -521,6 +524,11 @@ func (c *ModelConfig) GetModelConfigFile() string {
 	return c.modelConfigFile
 }

+// GetModelTemplate returns the chat template stored on the config (for GGUF models, the tokenizer.chat_template metadata value), or "" if none was set.
+func (c *ModelConfig) GetModelTemplate() string {
+	return c.modelTemplate
+}
+
 type ModelConfigUsecase int

 const (
--- a/core/http/endpoints/openai/chat.go
+++ b/core/http/endpoints/openai/chat.go
@@ -13,6 +13,7 @@ import (
 	"github.com/mudler/LocalAI/core/http/middleware"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/pkg/functions"
+	reason "github.com/mudler/LocalAI/pkg/reasoning"

 	"github.com/mudler/LocalAI/core/templates"
 	"github.com/mudler/LocalAI/pkg/model"
@@ -38,6 +39,16 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 		}
 		responses <- initialMessage

+		// Detect if thinking token is already in prompt or template
+		// When UseTokenizerTemplate is enabled, predInput is empty, so we check the template
+		var template string
+		if config.TemplateConfig.UseTokenizerTemplate {
+			template = config.GetModelTemplate()
+		} else {
+			template = s
+		}
+		thinkingStartToken := reason.DetectThinkingStartToken(template)
+
 		// Track accumulated content for reasoning extraction
 		accumulatedContent := ""
 		lastEmittedReasoning := ""
@@ -45,8 +56,12 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator

 		_, _, err := ComputeChoices(req, s, config, cl, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, tokenUsage backend.TokenUsage) bool {
 			accumulatedContent += s
-			// Extract reasoning from accumulated content
-			currentReasoning, cleanedContent := functions.ExtractReasoning(accumulatedContent)
+			content := accumulatedContent
+			// Prepend thinking token if needed, then extract reasoning
+			if config.ReasoningConfig.DisableReasoningTagPrefill == nil || !*config.ReasoningConfig.DisableReasoningTagPrefill {
+				content = reason.PrependThinkingTokenIfNeeded(content, thinkingStartToken)
+			}
+			currentReasoning, cleanedContent := reason.ExtractReasoning(content)

 			// Calculate new reasoning delta (what we haven't emitted yet)
 			var reasoningDelta *string
@@ -118,6 +133,15 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 		return err
 	}
 	processTools := func(noAction string, prompt string, req *schema.OpenAIRequest, config *config.ModelConfig, loader *model.ModelLoader, responses chan schema.OpenAIResponse, extraUsage bool) error {
+		// Detect if thinking token is already in prompt or template
+		var template string
+		if config.TemplateConfig.UseTokenizerTemplate {
+			template = config.GetModelTemplate()
+		} else {
+			template = prompt
+		}
+		thinkingStartToken := reason.DetectThinkingStartToken(template)
+
 		result := ""
 		lastEmittedCount := 0
 		_, tokenUsage, err := ComputeChoices(req, prompt, config, cl, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool {
@@ -229,8 +253,12 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 		if err != nil {
 			return err
 		}
-		// Extract reasoning before processing tool calls
-		reasoning, cleanedResult := functions.ExtractReasoning(result)
+		// Prepend thinking token if needed, then extract reasoning before processing tool calls
+		resultWithToken := result
+		if config.ReasoningConfig.DisableReasoningTagPrefill == nil || !*config.ReasoningConfig.DisableReasoningTagPrefill {
+			resultWithToken = reason.PrependThinkingTokenIfNeeded(result, thinkingStartToken)
+		}
+		reasoning, cleanedResult := reason.ExtractReasoning(resultWithToken)
 		result = cleanedResult

 		textContentToReturn = functions.ParseTextContent(result, config.FunctionsConfig)
@@ -617,10 +645,24 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator

 		// no streaming mode
 		default:
+			// Detect if thinking token is already in prompt or template
+			var template string
+			if config.TemplateConfig.UseTokenizerTemplate {
+				template = config.GetModelTemplate() // TODO: this should be the parsed jinja template. But for now this is the best we can do.
+			} else {
+				template = predInput
+			}
+			thinkingStartToken := reason.DetectThinkingStartToken(template)
+
+			xlog.Debug("Thinking start token", "thinkingStartToken", thinkingStartToken, "template", template)

 			tokenCallback := func(s string, c *[]schema.Choice) {
-				// Extract reasoning from the response
-				reasoning, cleanedS := functions.ExtractReasoning(s)
+				// Prepend thinking token if needed, then extract reasoning from the response
+				sWithToken := s
+				if config.ReasoningConfig.DisableReasoningTagPrefill == nil || !*config.ReasoningConfig.DisableReasoningTagPrefill {
+					sWithToken = reason.PrependThinkingTokenIfNeeded(s, thinkingStartToken)
+				}
+				reasoning, cleanedS := reason.ExtractReasoning(sWithToken)
 				s = cleanedS

 				if !shouldUseFn {
--- a/pkg/functions/reasoning.go
+++ b/pkg/functions/reasoning.go
@@ -1,114 +0,0 @@
-package functions
-
-import (
-	"strings"
-)
-
-// ExtractReasoning extracts reasoning content from thinking tags and returns
-// both the extracted reasoning and the cleaned content (with tags removed).
-// It handles <thinking>...</thinking> and <think>...</think> tags.
-// Multiple reasoning blocks are concatenated with newlines.
-func ExtractReasoning(content string) (reasoning string, cleanedContent string) {
-	if content == "" {
-		return "", content
-	}
-
-	var reasoningParts []string
-	var cleanedParts []string
-	remaining := content
-
-	// Define tag pairs to look for
-	tagPairs := []struct {
-		start string
-		end   string
-	}{
-		{"<thinking>", "</thinking>"},
-		{"<think>", "</think>"},
-	}
-
-	// Track the last position we've processed
-	lastPos := 0
-
-	for {
-		// Find the earliest tag start
-		earliestStart := -1
-		earliestEnd := -1
-		isUnclosed := false
-		var matchedTag struct {
-			start string
-			end   string
-		}
-
-		for _, tagPair := range tagPairs {
-			startIdx := strings.Index(remaining[lastPos:], tagPair.start)
-			if startIdx == -1 {
-				continue
-			}
-			startIdx += lastPos
-
-			// Find the corresponding end tag
-			endIdx := strings.Index(remaining[startIdx+len(tagPair.start):], tagPair.end)
-			if endIdx == -1 {
-				// Unclosed tag - extract what we have
-				if earliestStart == -1 || startIdx < earliestStart {
-					earliestStart = startIdx
-					earliestEnd = len(remaining)
-					isUnclosed = true
-					matchedTag = tagPair
-				}
-				continue
-			}
-			endIdx += startIdx + len(tagPair.start)
-
-			// Found a complete tag pair
-			if earliestStart == -1 || startIdx < earliestStart {
-				earliestStart = startIdx
-				earliestEnd = endIdx + len(tagPair.end)
-				isUnclosed = false
-				matchedTag = tagPair
-			}
-		}
-
-		if earliestStart == -1 {
-			// No more tags found, add remaining content
-			if lastPos < len(remaining) {
-				cleanedParts = append(cleanedParts, remaining[lastPos:])
-			}
-			break
-		}
-
-		// Add content before the tag
-		if earliestStart > lastPos {
-			cleanedParts = append(cleanedParts, remaining[lastPos:earliestStart])
-		}
-
-		// Extract reasoning content
-		reasoningStart := earliestStart + len(matchedTag.start)
-		// For unclosed tags, earliestEnd is already at the end of the string
-		// For closed tags, earliestEnd points to after the closing tag, so we subtract the end tag length
-		var reasoningEnd int
-		if isUnclosed {
-			// Unclosed tag - extract everything to the end
-			reasoningEnd = len(remaining)
-		} else {
-			// Closed tag - exclude the end tag
-			reasoningEnd = earliestEnd - len(matchedTag.end)
-		}
-		if reasoningEnd > reasoningStart {
-			reasoningContent := strings.TrimSpace(remaining[reasoningStart:reasoningEnd])
-			if reasoningContent != "" {
-				reasoningParts = append(reasoningParts, reasoningContent)
-			}
-		}
-
-		// Move past this tag
-		lastPos = earliestEnd
-	}
-
-	// Combine reasoning parts
-	reasoning = strings.Join(reasoningParts, "\n\n")
-	// Combine cleaned content parts
-	cleanedContent = strings.Join(cleanedParts, "")
-
-	return reasoning, cleanedContent
-}
--- a/pkg/model/watchdog.go
+++ b/pkg/model/watchdog.go
@@ -393,7 +393,7 @@ func (wd *WatchDog) checkMemory() {
 		memoryType = "RAM"
 	}

-	xlog.Debug("[WatchDog] Memory check", "type", memoryType, "usage_percent", aggregate.UsagePercent, "threshold_percent", thresholdPercent, "loaded_models", modelCount)
+	//xlog.Debug("[WatchDog] Memory check", "type", memoryType, "usage_percent", aggregate.UsagePercent, "threshold_percent", thresholdPercent, "loaded_models", modelCount)

 	// Check if usage exceeds threshold
 	if aggregate.UsagePercent > thresholdPercent {
--- a/pkg/reasoning/config.go
+++ b/pkg/reasoning/config.go
@@ -0,0 +1,5 @@
+package reasoning
+
+// Config holds per-model options for reasoning ("thinking") extraction.
+type Config struct {
+	// DisableReasoningTagPrefill, when set to true, stops the chat endpoint from
+	// prepending the thinking start token detected in the prompt/template to the
+	// model output before reasoning is extracted. Nil or false keeps the prefill.
+	DisableReasoningTagPrefill *bool `yaml:"disable_reasoning_tag_prefill,omitempty" json:"disable_reasoning_tag_prefill,omitempty"`
+}
--- a/pkg/reasoning/reasoning.go
+++ b/pkg/reasoning/reasoning.go
@@ -0,0 +1,208 @@
+package reasoning
+
+import (
+	"strings"
+)
+
+// DetectThinkingStartToken reports which known thinking start token, if any,
+// appears in the given prompt or chat template.
+//
+// A match suggests that the template itself opens the reasoning block, so the
+// model output may begin with reasoning content that has no explicit opening
+// tag. The token may appear anywhere in the input, not only at the end:
+// callers may pass a raw chat template, where the token is usually followed
+// by more template text.
+//
+// Tokens are checked in the order below and the first one found is returned;
+// an empty string is returned when none is present. The list is based on
+// llama.cpp's chat-parser.cpp implementations:
+//   - <|START_THINKING|> (Command-R models)
+//   - <|inner_prefix|>   (Apertus models)
+//   - <seed:think>       (Seed models)
+//   - <think>            (DeepSeek, Granite, ExaOne, GLM models)
+//   - <|think|>          (Solar Open models)
+//   - <thinking>         (General thinking tag)
+//   - [THINK]            (Magistral models)
+func DetectThinkingStartToken(prompt string) string {
+	// None of these tokens is a substring of another, so the order only decides
+	// which token wins when the input contains more than one of them.
+	thinkingStartTokens := []string{
+		"<|START_THINKING|>", // Command-R models
+		"<|inner_prefix|>",   // Apertus models
+		"<seed:think>",       // Seed models
+		"<think>",            // DeepSeek, Granite, ExaOne, GLM models
+		"<|think|>",          // Solar Open models
+		"<thinking>",         // General thinking tag
+		"[THINK]",            // Magistral models
+	}
+
+	// A plain substring match over the whole input. No trimming is needed:
+	// the tokens contain no whitespace, so surrounding whitespace cannot
+	// change the outcome.
+	for _, token := range thinkingStartTokens {
+		if strings.Contains(prompt, token) {
+			return token
+		}
+	}
+
+	return ""
+}
+
+// PrependThinkingTokenIfNeeded inserts startToken in front of the first
+// non-whitespace character of content, so that output from models whose prompt
+// template already opened the thinking block can be handled by the standard
+// tag-based extraction.
+//
+// content is returned unchanged when startToken is empty or when the token
+// already occurs anywhere in content (not only at its start).
+func PrependThinkingTokenIfNeeded(content string, startToken string) string {
+	if startToken == "" {
+		return content
+	}
+
+	// Split content into its leading whitespace and the remainder.
+	body := strings.TrimLeft(content, " \t\n\r")
+	leading := content[:len(content)-len(body)]
+
+	// The model emitted the token itself; nothing to add.
+	if strings.Contains(body, startToken) {
+		return content
+	}
+
+	// Keep the leading whitespace where it is and place the token right before
+	// the first real character, so the result looks like normal tagged output.
+	return leading + startToken + body
+}
+
+// ExtractReasoning splits content into reasoning text and cleaned content.
+// Reasoning is whatever sits between a known start tag and its end tag; the
+// supported pairs are listed in tagPairs below (<think>, <thinking>,
+// <|START_THINKING|>, <|inner_prefix|>, <seed:think>, <|think|>, [THINK]).
+// A start tag with no matching end tag captures everything up to the end of
+// content. Each reasoning block is whitespace-trimmed, empty blocks are
+// dropped, and the remaining blocks are joined with a blank line.
+// cleanedContent is the text outside the blocks, concatenated as-is with the
+// tags removed.
+func ExtractReasoning(content string) (reasoning string, cleanedContent string) {
+	if content == "" {
+		return "", content
+	}
+
+	var reasoningParts []string
+	var cleanedParts []string
+	remaining := content
+
+	// Define tag pairs to look for (matching llama.cpp's chat-parser.cpp)
+	tagPairs := []struct {
+		start string
+		end   string
+	}{
+		{"<|START_THINKING|>", "<|END_THINKING|>"},            // Command-R models
+		{"<|inner_prefix|>", "<|inner_suffix|>"},              // Apertus models
+		{"<seed:think>", "</seed:think>"},                     // Seed models
+		{"<think>", "</think>"},                               // DeepSeek, Granite, ExaOne models
+		{"<|think|>", "<|end|><|begin|>assistant<|content|>"}, // Solar Open models (complex end)
+		{"<thinking>", "</thinking>"},                         // General thinking tag
+		{"[THINK]", "[/THINK]"},                               // Magistral models
+	}
+
+	// Track the last position we've processed
+	lastPos := 0
+
+	for {
+		// Find the start tag that occurs earliest at or after lastPos, across all pairs
+		earliestStart := -1
+		earliestEnd := -1
+		isUnclosed := false
+		var matchedTag struct {
+			start string
+			end   string
+		}
+
+		for _, tagPair := range tagPairs {
+			startIdx := strings.Index(remaining[lastPos:], tagPair.start)
+			if startIdx == -1 {
+				continue
+			}
+			startIdx += lastPos
+
+			// Find the corresponding end tag
+			endIdx := strings.Index(remaining[startIdx+len(tagPair.start):], tagPair.end)
+			if endIdx == -1 {
+				// Unclosed tag - extract what we have
+				if earliestStart == -1 || startIdx < earliestStart {
+					earliestStart = startIdx
+					earliestEnd = len(remaining)
+					isUnclosed = true
+					matchedTag = tagPair
+				}
+				continue
+			}
+			endIdx += startIdx + len(tagPair.start)
+
+			// Found a complete tag pair
+			if earliestStart == -1 || startIdx < earliestStart {
+				earliestStart = startIdx
+				earliestEnd = endIdx + len(tagPair.end)
+				isUnclosed = false
+				matchedTag = tagPair
+			}
+		}
+
+		if earliestStart == -1 {
+			// No more tags found, add remaining content
+			if lastPos < len(remaining) {
+				cleanedParts = append(cleanedParts, remaining[lastPos:])
+			}
+			break
+		}
+
+		// Add content before the tag
+		if earliestStart > lastPos {
+			cleanedParts = append(cleanedParts, remaining[lastPos:earliestStart])
+		}
+
+		// Extract reasoning content
+		reasoningStart := earliestStart + len(matchedTag.start)
+		// For unclosed tags, earliestEnd is already at the end of the string
+		// For closed tags, earliestEnd points to after the closing tag, so we subtract the end tag length
+		var reasoningEnd int
+		if isUnclosed {
+			// Unclosed tag - extract everything to the end
+			reasoningEnd = len(remaining)
+		} else {
+			// Closed tag - exclude the end tag
+			reasoningEnd = earliestEnd - len(matchedTag.end)
+		}
+		if reasoningEnd > reasoningStart {
+			reasoningContent := strings.TrimSpace(remaining[reasoningStart:reasoningEnd])
+			if reasoningContent != "" {
+				reasoningParts = append(reasoningParts, reasoningContent)
+			}
+		}
+
+		// Move past this tag
+		lastPos = earliestEnd
+	}
+
+	// Combine reasoning parts, separated by a blank line
+	reasoning = strings.Join(reasoningParts, "\n\n")
+	// Combine cleaned content parts
+	cleanedContent = strings.Join(cleanedParts, "")
+
+	return reasoning, cleanedContent
+}
--- a/pkg/reasoning/reasoning_suite_test.go
+++ b/pkg/reasoning/reasoning_suite_test.go
@@ -0,0 +1,13 @@
+package reasoning_test
+
+import (
+	"testing"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// TestReasoning is the `go test` entry point that runs the Ginkgo specs of this package.
+func TestReasoning(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "Reasoning test suite")
+}
--- a/pkg/reasoning/reasoning_test.go
+++ b/pkg/reasoning/reasoning_test.go
@@ -1,9 +1,9 @@
-package functions_test
+package reasoning_test

 import (
 	"strings"

-	. "github.com/mudler/LocalAI/pkg/functions"
+	. "github.com/mudler/LocalAI/pkg/reasoning"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 )
@@ -258,4 +258,249 @@ var _ = Describe("ExtractReasoning", func() {
 			Expect(cleaned).To(Equal("Text  More"))
 		})
 	})
+
+	Context("when content has <|START_THINKING|> tags (Command-R)", func() {
+		It("should extract reasoning from START_THINKING block", func() {
+			content := "Text <|START_THINKING|>Command-R reasoning<|END_THINKING|> More"
+			reasoning, cleaned := ExtractReasoning(content)
+			Expect(reasoning).To(Equal("Command-R reasoning"))
+			Expect(cleaned).To(Equal("Text  More"))
+		})
+
+		It("should handle unclosed START_THINKING block", func() {
+			content := "Before <|START_THINKING|>Incomplete reasoning"
+			reasoning, cleaned := ExtractReasoning(content)
+			Expect(reasoning).To(Equal("Incomplete reasoning"))
+			Expect(cleaned).To(Equal("Before "))
+		})
+	})
+
+	Context("when content has <|inner_prefix|> tags (Apertus)", func() {
+		It("should extract reasoning from inner_prefix block", func() {
+			content := "Text <|inner_prefix|>Apertus reasoning<|inner_suffix|> More"
+			reasoning, cleaned := ExtractReasoning(content)
+			Expect(reasoning).To(Equal("Apertus reasoning"))
+			Expect(cleaned).To(Equal("Text  More"))
+		})
+	})
+
+	Context("when content has <seed:think> tags (Seed)", func() {
+		It("should extract reasoning from seed:think block", func() {
+			content := "Text <seed:think>Seed reasoning</seed:think> More"
+			reasoning, cleaned := ExtractReasoning(content)
+			Expect(reasoning).To(Equal("Seed reasoning"))
+			Expect(cleaned).To(Equal("Text  More"))
+		})
+	})
+
+	Context("when content has <|think|> tags (Solar Open)", func() {
+		It("should extract reasoning from Solar Open think block", func() {
+			content := "Text <|think|>Solar reasoning<|end|><|begin|>assistant<|content|> More"
+			reasoning, cleaned := ExtractReasoning(content)
+			Expect(reasoning).To(Equal("Solar reasoning"))
+			Expect(cleaned).To(Equal("Text  More"))
+		})
+	})
+
+	Context("when content has [THINK] tags (Magistral)", func() {
+		It("should extract reasoning from THINK block", func() {
+			content := "Text [THINK]Magistral reasoning[/THINK] More"
+			reasoning, cleaned := ExtractReasoning(content)
+			Expect(reasoning).To(Equal("Magistral reasoning"))
+			Expect(cleaned).To(Equal("Text  More"))
+		})
+
+		It("should handle unclosed THINK block", func() {
+			content := "Before [THINK]Incomplete reasoning"
+			reasoning, cleaned := ExtractReasoning(content)
+			Expect(reasoning).To(Equal("Incomplete reasoning"))
+			Expect(cleaned).To(Equal("Before "))
+		})
+	})
+})
+
+// Specs for DetectThinkingStartToken: detection is a substring match, so a
+// known token is reported wherever it appears in the prompt.
+var _ = Describe("DetectThinkingStartToken", func() {
+	Context("when prompt contains thinking start tokens", func() {
+		It("should detect <|START_THINKING|> at the end", func() {
+			prompt := "Some prompt text <|START_THINKING|>"
+			token := DetectThinkingStartToken(prompt)
+			Expect(token).To(Equal("<|START_THINKING|>"))
+		})
+
+		It("should detect <think> at the end", func() {
+			prompt := "Prompt with <think>"
+			token := DetectThinkingStartToken(prompt)
+			Expect(token).To(Equal("<think>"))
+		})
+
+		It("should detect <thinking> at the end", func() {
+			prompt := "Some text <thinking>"
+			token := DetectThinkingStartToken(prompt)
+			Expect(token).To(Equal("<thinking>"))
+		})
+
+		It("should detect <|inner_prefix|> at the end", func() {
+			prompt := "Prompt <|inner_prefix|>"
+			token := DetectThinkingStartToken(prompt)
+			Expect(token).To(Equal("<|inner_prefix|>"))
+		})
+
+		It("should detect <seed:think> at the end", func() {
+			prompt := "Text <seed:think>"
+			token := DetectThinkingStartToken(prompt)
+			Expect(token).To(Equal("<seed:think>"))
+		})
+
+		It("should detect <|think|> at the end", func() {
+			prompt := "Prompt <|think|>"
+			token := DetectThinkingStartToken(prompt)
+			Expect(token).To(Equal("<|think|>"))
+		})
+
+		It("should detect [THINK] at the end", func() {
+			prompt := "Text [THINK]"
+			token := DetectThinkingStartToken(prompt)
+			Expect(token).To(Equal("[THINK]"))
+		})
+
+		It("should handle trailing whitespace", func() {
+			prompt := "Prompt <|START_THINKING|>   \n\t  "
+			token := DetectThinkingStartToken(prompt)
+			Expect(token).To(Equal("<|START_THINKING|>"))
+		})
+
+		It("should detect token near the end (within last 100 chars)", func() {
+			prefix := strings.Repeat("x", 50)
+			prompt := prefix + "<|START_THINKING|>"
+			token := DetectThinkingStartToken(prompt)
+			Expect(token).To(Equal("<|START_THINKING|>"))
+		})
+
+		It("should detect token when followed by only whitespace", func() {
+			prompt := "Text <think>   \n  "
+			token := DetectThinkingStartToken(prompt)
+			Expect(token).To(Equal("<think>"))
+		})
+	})
+
+	Context("when prompt does not contain thinking tokens", func() {
+		It("should return empty string for regular prompt", func() {
+			prompt := "This is a regular prompt without thinking tokens"
+			token := DetectThinkingStartToken(prompt)
+			Expect(token).To(BeEmpty())
+		})
+
+		It("should return empty string for empty prompt", func() {
+			prompt := ""
+			token := DetectThinkingStartToken(prompt)
+			Expect(token).To(BeEmpty())
+		})
+	})
+
+	Context("when the token is not the last thing in the prompt", func() {
+		It("should detect token even when far from end (Contains check)", func() {
+			prefix := strings.Repeat("x", 150)
+			prompt := prefix + "<|START_THINKING|>"
+			token := DetectThinkingStartToken(prompt)
+			// Detection is a substring match, so prompt length is irrelevant
+			Expect(token).To(Equal("<|START_THINKING|>"))
+		})
+
+		It("should detect token even when followed by non-whitespace (Contains check)", func() {
+			prompt := "Text <|START_THINKING|>more text"
+			token := DetectThinkingStartToken(prompt)
+			// Detection is a substring match, so text after the token is irrelevant
+			Expect(token).To(Equal("<|START_THINKING|>"))
+		})
+	})
+
+	Context("when multiple tokens are present", func() {
+		It("should return the first matching token (most specific)", func() {
+			prompt := "Text <|START_THINKING|> <thinking>"
+			token := DetectThinkingStartToken(prompt)
+			// The token listed first in the function's token list wins
+			Expect(token).To(Equal("<|START_THINKING|>"))
+		})
+	})
+})
+
+var _ = Describe("PrependThinkingTokenIfNeeded", func() {
+	Context("when startToken is empty", func() {
+		It("should return content unchanged", func() {
+			content := "Some content"
+			result := PrependThinkingTokenIfNeeded(content, "")
+			Expect(result).To(Equal(content))
+		})
+	})
+
+	Context("when content already starts with token", func() {
+		It("should not prepend if content starts with token", func() {
+			content := "<|START_THINKING|>Reasoning content"
+			result := PrependThinkingTokenIfNeeded(content, "<|START_THINKING|>")
+			Expect(result).To(Equal(content))
+		})
+
+		It("should not prepend if content starts with token after whitespace", func() {
+			content := "   <think>Reasoning"
+			result := PrependThinkingTokenIfNeeded(content, "<think>")
+			Expect(result).To(Equal(content))
+		})
+
+		It("should not prepend if token appears anywhere in content", func() {
+			content := "Some text <thinking>Reasoning</thinking>"
+			result := PrependThinkingTokenIfNeeded(content, "<thinking>")
+			// The function does a substring match, so a token later in the content also suppresses the prepend
+			Expect(result).To(Equal(content))
+		})
+	})
+
+	Context("when content does not contain token", func() {
+		It("should prepend token to content", func() {
+			content := "Reasoning content"
+			result := PrependThinkingTokenIfNeeded(content, "<|START_THINKING|>")
+			Expect(result).To(Equal("<|START_THINKING|>Reasoning content"))
+		})
+
+		It("should prepend token after leading whitespace", func() {
+			content := "   \n  Reasoning content"
+			result := PrependThinkingTokenIfNeeded(content, "<think>")
+			Expect(result).To(Equal("   \n  <think>Reasoning content"))
+		})
+
+		It("should handle empty content", func() {
+			content := ""
+			result := PrependThinkingTokenIfNeeded(content, "<thinking>")
+			Expect(result).To(Equal("<thinking>"))
+		})
+
+		It("should handle content with only whitespace", func() {
+			content := "   \n\t  "
+			result := PrependThinkingTokenIfNeeded(content, "<|START_THINKING|>")
+			Expect(result).To(Equal("   \n\t  <|START_THINKING|>"))
+		})
+	})
+
+	Context("with different token types", func() {
+		It("should prepend <|START_THINKING|>", func() {
+			content := "Reasoning"
+			result := PrependThinkingTokenIfNeeded(content, "<|START_THINKING|>")
+			Expect(result).To(Equal("<|START_THINKING|>Reasoning"))
+		})
+
+		It("should prepend <think>", func() {
+			content := "Reasoning"
+			result := PrependThinkingTokenIfNeeded(content, "<think>")
+			Expect(result).To(Equal("<think>Reasoning"))
+		})
+
+		It("should prepend <thinking>", func() {
+			content := "Reasoning"
+			result := PrependThinkingTokenIfNeeded(content, "<thinking>")
+			Expect(result).To(Equal("<thinking>Reasoning"))
+		})
+
+		It("should prepend [THINK]", func() {
+			content := "Reasoning"
+			result := PrependThinkingTokenIfNeeded(content, "[THINK]")
+			Expect(result).To(Equal("[THINK]Reasoning"))
+		})
+	})
+})
--- a/pkg/xsysinfo/gpu.go
+++ b/pkg/xsysinfo/gpu.go
@@ -569,7 +569,7 @@ func getIntelGPUTop() []GPUMemoryInfo {
 	cmd.Stderr = &stderr

 	if err := cmd.Run(); err != nil {
-		xlog.Debug("intel_gpu_top failed", "error", err, "stderr", stderr.String())
+		xlog.Debug("intel_gpu_top failed", "error", err, "stderr", stderr.String(), "stdout", stdout.String())
 		return nil
 	}