From 53deeb110780f579e4e7047f1af427436c35c045 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sat, 4 Apr 2026 20:45:50 +0000 Subject: [PATCH] fix(reasoning): suppress partial tag tokens during autoparser warm-up The C++ PEG parser needs a few tokens to identify the reasoning format (e.g. "<|channel>thought\n" for Gemma 4). During this warm-up, the gRPC layer was sending raw partial tag tokens to Go, which leaked into the reasoning field. - Clear reply.message in gRPC when autoparser is active but has no diffs yet, matching llama.cpp server behavior of only emitting classified output - Prefer C++ autoparser chat deltas for reasoning/content in all streaming paths, falling back to Go-side extraction for backends without autoparser (e.g. vLLM) - Override non-streaming no-tools result with chat delta content when available - Guard PrependThinkingTokenIfNeeded against partial tag prefixes during streaming accumulation - Reorder default thinking tokens so <|channel>thought is checked before <|think|> (Gemma 4 templates contain both) --- backend/cpp/llama-cpp/grpc-server.cpp | 14 +++++++++-- core/http/endpoints/openai/chat.go | 23 ++++--------------- .../http/endpoints/openresponses/responses.go | 14 ++++++----- pkg/reasoning/reasoning.go | 11 +++++++-- 4 files changed, 34 insertions(+), 28 deletions(-) diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp index 6017cb84a..2ae599ded 100644 --- a/backend/cpp/llama-cpp/grpc-server.cpp +++ b/backend/cpp/llama-cpp/grpc-server.cpp @@ -1608,8 +1608,18 @@ public: auto attach_chat_deltas = [](backend::Reply & reply, server_task_result * raw_result) { // Try streaming partial result first auto* partial = dynamic_cast(raw_result); - if (partial && !partial->oaicompat_msg_diffs.empty()) { - populate_chat_deltas_from_diffs(reply, partial->oaicompat_msg_diffs); + if (partial) { + if (!partial->oaicompat_msg_diffs.empty()) { + populate_chat_deltas_from_diffs(reply, partial->oaicompat_msg_diffs); + } else if (partial->is_updated) { + // Autoparser is active but hasn't classified this chunk yet + // (PEG parser warming up). Clear the raw message so the Go + // side doesn't try to parse partial tag tokens (e.g. "<|channel>" + // before the full "<|channel>thought\n" is received). + // This matches llama.cpp server behavior which only emits SSE + // chunks when the parser produces diffs. + reply.set_message(""); + } return; } // Try final result diff --git a/core/http/endpoints/openai/chat.go b/core/http/endpoints/openai/chat.go index 8dd3ae396..ca5196c60 100644 --- a/core/http/endpoints/openai/chat.go +++ b/core/http/endpoints/openai/chat.go @@ -84,24 +84,18 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator _, _, _, err := ComputeChoices(req, s, config, cl, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, tokenUsage backend.TokenUsage) bool { var reasoningDelta, contentDelta string - // Always keep the Go-side extractor in sync with raw tokens - // (needed for backends that never send chat deltas). + // Always keep the Go-side extractor in sync with raw tokens so it + // can serve as fallback for backends without an autoparser (e.g. vLLM). goReasoning, goContent := extractor.ProcessToken(s) - // Prefer pre-parsed chat deltas from C++ autoparser when available. + // When C++ autoparser chat deltas are available, prefer them — they + // handle model-specific formats (Gemma 4, etc.) without Go-side tags. + // Otherwise fall back to Go-side extraction. if tokenUsage.HasChatDeltaContent() { rawReasoning, cd := tokenUsage.ChatDeltaReasoningAndContent() contentDelta = cd - // Strip reasoning tags (e.g. <|channel>thought / ) that - // the C++ autoparser includes as part of reasoning content. reasoningDelta = extractor.ProcessChatDeltaReasoning(rawReasoning) - } else if config.TemplateConfig.UseTokenizerTemplate { - // C++ autoparser is active (jinja templates) but hasn't emitted - // chat deltas for this chunk yet — PEG parser is still warming up - // (e.g. accumulating "<|channel>thought\n" for Gemma 4). - // Suppress Go-side output to avoid leaking partial tag tokens. } else { - // No autoparser — use Go-side extraction as the sole source. reasoningDelta = goReasoning contentDelta = goContent } @@ -159,20 +153,13 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator var reasoningDelta, contentDelta string - // Always keep the Go-side extractor in sync with raw tokens goReasoning, goContent := extractor.ProcessToken(s) - // Prefer pre-parsed chat deltas from C++ autoparser when available. if usage.HasChatDeltaContent() { rawReasoning, cd := usage.ChatDeltaReasoningAndContent() contentDelta = cd - // Strip reasoning tags (e.g. <|channel>thought / ) that - // the C++ autoparser includes as part of reasoning content. reasoningDelta = extractor.ProcessChatDeltaReasoning(rawReasoning) - } else if config.TemplateConfig.UseTokenizerTemplate { - // C++ autoparser warming up — suppress Go-side to avoid tag leaks. } else { - // No autoparser — use Go-side extraction. reasoningDelta = goReasoning contentDelta = goContent } diff --git a/core/http/endpoints/openresponses/responses.go b/core/http/endpoints/openresponses/responses.go index 565582627..764156d4d 100644 --- a/core/http/endpoints/openresponses/responses.go +++ b/core/http/endpoints/openresponses/responses.go @@ -1821,14 +1821,15 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6 // If no tool calls detected yet, handle reasoning and text if !inToolCallMode { var reasoningDelta, contentDelta string - // Prefer pre-parsed chat deltas from C++ autoparser when available + goReasoning, goContent := extractor.ProcessToken(token) + if tokenUsage.HasChatDeltaContent() { rawReasoning, cd := tokenUsage.ChatDeltaReasoningAndContent() contentDelta = cd reasoningDelta = extractor.ProcessChatDeltaReasoning(rawReasoning) - extractor.ProcessToken(token) // keep state consistent } else { - reasoningDelta, contentDelta = extractor.ProcessToken(token) + reasoningDelta = goReasoning + contentDelta = goContent } // Handle reasoning item @@ -2350,14 +2351,15 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6 accumulatedText += token var reasoningDelta, contentDelta string - // Prefer pre-parsed chat deltas from C++ autoparser when available + goReasoning, goContent := extractor.ProcessToken(token) + if tokenUsage.HasChatDeltaContent() { rawReasoning, cd := tokenUsage.ChatDeltaReasoningAndContent() contentDelta = cd reasoningDelta = extractor.ProcessChatDeltaReasoning(rawReasoning) - extractor.ProcessToken(token) // keep state consistent } else { - reasoningDelta, contentDelta = extractor.ProcessToken(token) + reasoningDelta = goReasoning + contentDelta = goContent } // Handle reasoning item diff --git a/pkg/reasoning/reasoning.go b/pkg/reasoning/reasoning.go index 72681aaed..e9920af5d 100644 --- a/pkg/reasoning/reasoning.go +++ b/pkg/reasoning/reasoning.go @@ -25,11 +25,11 @@ func DetectThinkingStartToken(prompt string, config *Config) string { // Based on llama.cpp's chat-parser.cpp implementations defaultTokens := []string{ "<|START_THINKING|>", // Command-R models + "<|channel>thought", // Gemma 4 models (before <|think|> — Gemma 4 templates contain both) "<|inner_prefix|>", // Apertus models "", // Seed models "", // DeepSeek, Granite, ExaOne models "<|think|>", // Solar Open models - "<|channel>thought", // Gemma 4 models "", // General thinking tag "[THINK]", // Magistral models } @@ -102,11 +102,18 @@ func PrependThinkingTokenIfNeeded(content string, startToken string) string { return r == ' ' || r == '\t' || r == '\n' || r == '\r' }) - // If content already starts with the token, don't prepend + // If content already contains the token, don't prepend if strings.Contains(trimmed, startToken) { return content } + // If content is a non-empty prefix of the start token (e.g. "<|channel>" + // accumulating toward "<|channel>thought"), don't prepend — we're still + // receiving the tag token-by-token during streaming. + if trimmed != "" && strings.HasPrefix(startToken, trimmed) { + return content + } + // Find where leading whitespace ends whitespaceEnd := 0 for whitespaceEnd < len(content) {