From 53deeb110780f579e4e7047f1af427436c35c045 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto <mudler@localai.io>
Date: Sat, 4 Apr 2026 20:45:50 +0000
Subject: [PATCH] fix(reasoning): suppress partial tag tokens during autoparser
 warm-up

The C++ PEG parser needs a few tokens to identify the reasoning format
(e.g. "<|channel>thought\n" for Gemma 4). During this warm-up, the gRPC
layer was sending raw partial tag tokens to Go, which leaked into the
reasoning field.

- Clear reply.message in gRPC when the autoparser is active but has produced no
  diffs yet, matching the llama.cpp server behavior of emitting only classified output
- Prefer C++ autoparser chat deltas for reasoning/content in all streaming
  paths, falling back to Go-side extraction for backends without autoparser
  (e.g. vLLM)
- Override non-streaming no-tools result with chat delta content when available
- Guard PrependThinkingTokenIfNeeded against partial tag prefixes during
  streaming accumulation
- Reorder default thinking tokens so <|channel>thought is checked before
  <|think|> (Gemma 4 templates contain both)
---
 backend/cpp/llama-cpp/grpc-server.cpp         | 14 +++++++++--
 core/http/endpoints/openai/chat.go            | 23 ++++---------------
 .../http/endpoints/openresponses/responses.go | 14 ++++++-----
 pkg/reasoning/reasoning.go                    | 11 +++++++--
 4 files changed, 34 insertions(+), 28 deletions(-)

diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp
index 6017cb84a..2ae599ded 100644
--- a/backend/cpp/llama-cpp/grpc-server.cpp
+++ b/backend/cpp/llama-cpp/grpc-server.cpp
@@ -1608,8 +1608,18 @@ public:
         auto attach_chat_deltas = [](backend::Reply & reply, server_task_result * raw_result) {
             // Try streaming partial result first
             auto* partial = dynamic_cast<server_task_result_cmpl_partial*>(raw_result);
-            if (partial && !partial->oaicompat_msg_diffs.empty()) {
-                populate_chat_deltas_from_diffs(reply, partial->oaicompat_msg_diffs);
+            if (partial) {
+                if (!partial->oaicompat_msg_diffs.empty()) {
+                    populate_chat_deltas_from_diffs(reply, partial->oaicompat_msg_diffs);
+                } else if (partial->is_updated) {
+                    // Autoparser is active but hasn't classified this chunk yet
+                    // (PEG parser warming up). Clear the raw message so the Go
+                    // side doesn't try to parse partial tag tokens (e.g. "<|channel>"
+                    // before the full "<|channel>thought\n" is received).
+                    // This matches llama.cpp server behavior which only emits SSE
+                    // chunks when the parser produces diffs.
+                    reply.set_message("");
+                }
                 return;
             }
             // Try final result
diff --git a/core/http/endpoints/openai/chat.go b/core/http/endpoints/openai/chat.go
index 8dd3ae396..ca5196c60 100644
--- a/core/http/endpoints/openai/chat.go
+++ b/core/http/endpoints/openai/chat.go
@@ -84,24 +84,18 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 		_, _, _, err := ComputeChoices(req, s, config, cl, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, tokenUsage backend.TokenUsage) bool {
 			var reasoningDelta, contentDelta string
 
-			// Always keep the Go-side extractor in sync with raw tokens
-			// (needed for backends that never send chat deltas).
+			// Always keep the Go-side extractor in sync with raw tokens so it
+			// can serve as fallback for backends without an autoparser (e.g. vLLM).
 			goReasoning, goContent := extractor.ProcessToken(s)
 
-			// Prefer pre-parsed chat deltas from C++ autoparser when available.
+			// When C++ autoparser chat deltas are available, prefer them — they
+			// handle model-specific formats (Gemma 4, etc.) without Go-side tags.
+			// Otherwise fall back to Go-side extraction.
 			if tokenUsage.HasChatDeltaContent() {
 				rawReasoning, cd := tokenUsage.ChatDeltaReasoningAndContent()
 				contentDelta = cd
-				// Strip reasoning tags (e.g. <|channel>thought / <channel|>) that
-				// the C++ autoparser includes as part of reasoning content.
 				reasoningDelta = extractor.ProcessChatDeltaReasoning(rawReasoning)
-			} else if config.TemplateConfig.UseTokenizerTemplate {
-				// C++ autoparser is active (jinja templates) but hasn't emitted
-				// chat deltas for this chunk yet — PEG parser is still warming up
-				// (e.g. accumulating "<|channel>thought\n" for Gemma 4).
-				// Suppress Go-side output to avoid leaking partial tag tokens.
 			} else {
-				// No autoparser — use Go-side extraction as the sole source.
 				reasoningDelta = goReasoning
 				contentDelta = goContent
 			}
@@ -159,20 +153,13 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 
 			var reasoningDelta, contentDelta string
 
-			// Always keep the Go-side extractor in sync with raw tokens
 			goReasoning, goContent := extractor.ProcessToken(s)
 
-			// Prefer pre-parsed chat deltas from C++ autoparser when available.
 			if usage.HasChatDeltaContent() {
 				rawReasoning, cd := usage.ChatDeltaReasoningAndContent()
 				contentDelta = cd
-				// Strip reasoning tags (e.g. <|channel>thought / <channel|>) that
-				// the C++ autoparser includes as part of reasoning content.
 				reasoningDelta = extractor.ProcessChatDeltaReasoning(rawReasoning)
-			} else if config.TemplateConfig.UseTokenizerTemplate {
-				// C++ autoparser warming up — suppress Go-side to avoid tag leaks.
 			} else {
-				// No autoparser — use Go-side extraction.
 				reasoningDelta = goReasoning
 				contentDelta = goContent
 			}
diff --git a/core/http/endpoints/openresponses/responses.go b/core/http/endpoints/openresponses/responses.go
index 565582627..764156d4d 100644
--- a/core/http/endpoints/openresponses/responses.go
+++ b/core/http/endpoints/openresponses/responses.go
@@ -1821,14 +1821,15 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
 				// If no tool calls detected yet, handle reasoning and text
 				if !inToolCallMode {
 					var reasoningDelta, contentDelta string
-					// Prefer pre-parsed chat deltas from C++ autoparser when available
+					goReasoning, goContent := extractor.ProcessToken(token)
+
 					if tokenUsage.HasChatDeltaContent() {
 						rawReasoning, cd := tokenUsage.ChatDeltaReasoningAndContent()
 						contentDelta = cd
 						reasoningDelta = extractor.ProcessChatDeltaReasoning(rawReasoning)
-						extractor.ProcessToken(token) // keep state consistent
 					} else {
-						reasoningDelta, contentDelta = extractor.ProcessToken(token)
+						reasoningDelta = goReasoning
+						contentDelta = goContent
 					}
 
 					// Handle reasoning item
@@ -2350,14 +2351,15 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
 		accumulatedText += token
 
 		var reasoningDelta, contentDelta string
-		// Prefer pre-parsed chat deltas from C++ autoparser when available
+		goReasoning, goContent := extractor.ProcessToken(token)
+
 		if tokenUsage.HasChatDeltaContent() {
 			rawReasoning, cd := tokenUsage.ChatDeltaReasoningAndContent()
 			contentDelta = cd
 			reasoningDelta = extractor.ProcessChatDeltaReasoning(rawReasoning)
-			extractor.ProcessToken(token) // keep state consistent
 		} else {
-			reasoningDelta, contentDelta = extractor.ProcessToken(token)
+			reasoningDelta = goReasoning
+			contentDelta = goContent
 		}
 
 		// Handle reasoning item
diff --git a/pkg/reasoning/reasoning.go b/pkg/reasoning/reasoning.go
index 72681aaed..e9920af5d 100644
--- a/pkg/reasoning/reasoning.go
+++ b/pkg/reasoning/reasoning.go
@@ -25,11 +25,11 @@ func DetectThinkingStartToken(prompt string, config *Config) string {
 	// Based on llama.cpp's chat-parser.cpp implementations
 	defaultTokens := []string{
 		"<|START_THINKING|>", // Command-R models
+		"<|channel>thought",  // Gemma 4 models (before <|think|> — Gemma 4 templates contain both)
 		"<|inner_prefix|>",   // Apertus models
 		"<seed:think>",       // Seed models
 		"<think>",            // DeepSeek, Granite, ExaOne models
 		"<|think|>",          // Solar Open models
-		"<|channel>thought",  // Gemma 4 models
 		"<thinking>",         // General thinking tag
 		"[THINK]",            // Magistral models
 	}
@@ -102,11 +102,18 @@ func PrependThinkingTokenIfNeeded(content string, startToken string) string {
 		return r == ' ' || r == '\t' || r == '\n' || r == '\r'
 	})
 
-	// If content already starts with the token, don't prepend
+	// If content already contains the token, don't prepend
 	if strings.Contains(trimmed, startToken) {
 		return content
 	}
 
+	// If content is a non-empty prefix of the start token (e.g. "<|channel>"
+	// accumulating toward "<|channel>thought"), don't prepend — we're still
+	// receiving the tag token-by-token during streaming.
+	if trimmed != "" && strings.HasPrefix(startToken, trimmed) {
+		return content
+	}
+
 	// Find where leading whitespace ends
 	whitespaceEnd := 0
 	for whitespaceEnd < len(content) {