Mirror of https://github.com/mudler/LocalAI.git, synced 2026-04-16 21:08:16 -04:00
fix(reasoning): suppress partial tag tokens during autoparser warm-up
The C++ PEG parser needs a few tokens to identify the reasoning format (e.g. "<|channel>thought\n" for Gemma 4). During this warm-up, the gRPC layer was sending raw partial tag tokens to Go, which leaked into the reasoning field.

- Clear reply.message in gRPC when the autoparser is active but has no diffs yet, matching llama.cpp server behavior of only emitting classified output
- Prefer C++ autoparser chat deltas for reasoning/content in all streaming paths, falling back to Go-side extraction for backends without an autoparser (e.g. vLLM)
- Override the non-streaming no-tools result with chat delta content when available
- Guard PrependThinkingTokenIfNeeded against partial tag prefixes during streaming accumulation
- Reorder the default thinking tokens so <|channel>thought is checked before <|think|> (Gemma 4 templates contain both)
This commit is contained in:
@@ -1608,8 +1608,18 @@ public:
|
||||
auto attach_chat_deltas = [](backend::Reply & reply, server_task_result * raw_result) {
|
||||
// Try streaming partial result first
|
||||
auto* partial = dynamic_cast<server_task_result_cmpl_partial*>(raw_result);
|
||||
if (partial && !partial->oaicompat_msg_diffs.empty()) {
|
||||
populate_chat_deltas_from_diffs(reply, partial->oaicompat_msg_diffs);
|
||||
if (partial) {
|
||||
if (!partial->oaicompat_msg_diffs.empty()) {
|
||||
populate_chat_deltas_from_diffs(reply, partial->oaicompat_msg_diffs);
|
||||
} else if (partial->is_updated) {
|
||||
// Autoparser is active but hasn't classified this chunk yet
|
||||
// (PEG parser warming up). Clear the raw message so the Go
|
||||
// side doesn't try to parse partial tag tokens (e.g. "<|channel>"
|
||||
// before the full "<|channel>thought\n" is received).
|
||||
// This matches llama.cpp server behavior which only emits SSE
|
||||
// chunks when the parser produces diffs.
|
||||
reply.set_message("");
|
||||
}
|
||||
return;
|
||||
}
|
||||
// Try final result
|
||||
|
||||
@@ -84,24 +84,18 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
|
||||
_, _, _, err := ComputeChoices(req, s, config, cl, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, tokenUsage backend.TokenUsage) bool {
|
||||
var reasoningDelta, contentDelta string
|
||||
|
||||
// Always keep the Go-side extractor in sync with raw tokens
|
||||
// (needed for backends that never send chat deltas).
|
||||
// Always keep the Go-side extractor in sync with raw tokens so it
|
||||
// can serve as fallback for backends without an autoparser (e.g. vLLM).
|
||||
goReasoning, goContent := extractor.ProcessToken(s)
|
||||
|
||||
// Prefer pre-parsed chat deltas from C++ autoparser when available.
|
||||
// When C++ autoparser chat deltas are available, prefer them — they
|
||||
// handle model-specific formats (Gemma 4, etc.) without Go-side tags.
|
||||
// Otherwise fall back to Go-side extraction.
|
||||
if tokenUsage.HasChatDeltaContent() {
|
||||
rawReasoning, cd := tokenUsage.ChatDeltaReasoningAndContent()
|
||||
contentDelta = cd
|
||||
// Strip reasoning tags (e.g. <|channel>thought / <channel|>) that
|
||||
// the C++ autoparser includes as part of reasoning content.
|
||||
reasoningDelta = extractor.ProcessChatDeltaReasoning(rawReasoning)
|
||||
} else if config.TemplateConfig.UseTokenizerTemplate {
|
||||
// C++ autoparser is active (jinja templates) but hasn't emitted
|
||||
// chat deltas for this chunk yet — PEG parser is still warming up
|
||||
// (e.g. accumulating "<|channel>thought\n" for Gemma 4).
|
||||
// Suppress Go-side output to avoid leaking partial tag tokens.
|
||||
} else {
|
||||
// No autoparser — use Go-side extraction as the sole source.
|
||||
reasoningDelta = goReasoning
|
||||
contentDelta = goContent
|
||||
}
|
||||
@@ -159,20 +153,13 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
|
||||
|
||||
var reasoningDelta, contentDelta string
|
||||
|
||||
// Always keep the Go-side extractor in sync with raw tokens
|
||||
goReasoning, goContent := extractor.ProcessToken(s)
|
||||
|
||||
// Prefer pre-parsed chat deltas from C++ autoparser when available.
|
||||
if usage.HasChatDeltaContent() {
|
||||
rawReasoning, cd := usage.ChatDeltaReasoningAndContent()
|
||||
contentDelta = cd
|
||||
// Strip reasoning tags (e.g. <|channel>thought / <channel|>) that
|
||||
// the C++ autoparser includes as part of reasoning content.
|
||||
reasoningDelta = extractor.ProcessChatDeltaReasoning(rawReasoning)
|
||||
} else if config.TemplateConfig.UseTokenizerTemplate {
|
||||
// C++ autoparser warming up — suppress Go-side to avoid tag leaks.
|
||||
} else {
|
||||
// No autoparser — use Go-side extraction.
|
||||
reasoningDelta = goReasoning
|
||||
contentDelta = goContent
|
||||
}
|
||||
|
||||
@@ -1821,14 +1821,15 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
|
||||
// If no tool calls detected yet, handle reasoning and text
|
||||
if !inToolCallMode {
|
||||
var reasoningDelta, contentDelta string
|
||||
// Prefer pre-parsed chat deltas from C++ autoparser when available
|
||||
goReasoning, goContent := extractor.ProcessToken(token)
|
||||
|
||||
if tokenUsage.HasChatDeltaContent() {
|
||||
rawReasoning, cd := tokenUsage.ChatDeltaReasoningAndContent()
|
||||
contentDelta = cd
|
||||
reasoningDelta = extractor.ProcessChatDeltaReasoning(rawReasoning)
|
||||
extractor.ProcessToken(token) // keep state consistent
|
||||
} else {
|
||||
reasoningDelta, contentDelta = extractor.ProcessToken(token)
|
||||
reasoningDelta = goReasoning
|
||||
contentDelta = goContent
|
||||
}
|
||||
|
||||
// Handle reasoning item
|
||||
@@ -2350,14 +2351,15 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
|
||||
accumulatedText += token
|
||||
|
||||
var reasoningDelta, contentDelta string
|
||||
// Prefer pre-parsed chat deltas from C++ autoparser when available
|
||||
goReasoning, goContent := extractor.ProcessToken(token)
|
||||
|
||||
if tokenUsage.HasChatDeltaContent() {
|
||||
rawReasoning, cd := tokenUsage.ChatDeltaReasoningAndContent()
|
||||
contentDelta = cd
|
||||
reasoningDelta = extractor.ProcessChatDeltaReasoning(rawReasoning)
|
||||
extractor.ProcessToken(token) // keep state consistent
|
||||
} else {
|
||||
reasoningDelta, contentDelta = extractor.ProcessToken(token)
|
||||
reasoningDelta = goReasoning
|
||||
contentDelta = goContent
|
||||
}
|
||||
|
||||
// Handle reasoning item
|
||||
|
||||
@@ -25,11 +25,11 @@ func DetectThinkingStartToken(prompt string, config *Config) string {
|
||||
// Based on llama.cpp's chat-parser.cpp implementations
|
||||
defaultTokens := []string{
|
||||
"<|START_THINKING|>", // Command-R models
|
||||
"<|channel>thought", // Gemma 4 models (before <|think|> — Gemma 4 templates contain both)
|
||||
"<|inner_prefix|>", // Apertus models
|
||||
"<seed:think>", // Seed models
|
||||
"<think>", // DeepSeek, Granite, ExaOne models
|
||||
"<|think|>", // Solar Open models
|
||||
"<|channel>thought", // Gemma 4 models
|
||||
"<thinking>", // General thinking tag
|
||||
"[THINK]", // Magistral models
|
||||
}
|
||||
@@ -102,11 +102,18 @@ func PrependThinkingTokenIfNeeded(content string, startToken string) string {
|
||||
return r == ' ' || r == '\t' || r == '\n' || r == '\r'
|
||||
})
|
||||
|
||||
// If content already starts with the token, don't prepend
|
||||
// If content already contains the token, don't prepend
|
||||
if strings.Contains(trimmed, startToken) {
|
||||
return content
|
||||
}
|
||||
|
||||
// If content is a non-empty prefix of the start token (e.g. "<|channel>"
|
||||
// accumulating toward "<|channel>thought"), don't prepend — we're still
|
||||
// receiving the tag token-by-token during streaming.
|
||||
if trimmed != "" && strings.HasPrefix(startToken, trimmed) {
|
||||
return content
|
||||
}
|
||||
|
||||
// Find where leading whitespace ends
|
||||
whitespaceEnd := 0
|
||||
for whitespaceEnd < len(content) {
|
||||
|
||||
Reference in New Issue
Block a user