fix(reasoning): accumulate and strip reasoning tags from autoparser results (#9227)

fix(reasoning): accumulate and strip reasoning tags from autoparser results

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
Ettore Di Giacinto
2026-04-04 18:15:32 +02:00
committed by GitHub
parent 6f304d1201
commit 6d9d77d590
4 changed files with 168 additions and 4 deletions

View File

@@ -86,7 +86,11 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
// Prefer pre-parsed chat deltas from C++ autoparser when available
if tokenUsage.HasChatDeltaContent() {
reasoningDelta, contentDelta = tokenUsage.ChatDeltaReasoningAndContent()
rawReasoning, cd := tokenUsage.ChatDeltaReasoningAndContent()
contentDelta = cd
// Strip reasoning tags (e.g. <|channel>thought / <channel|>) that
// the C++ autoparser includes as part of reasoning content.
reasoningDelta = extractor.ProcessChatDeltaReasoning(rawReasoning)
// Keep extractor state consistent for fallback
extractor.ProcessToken(s)
} else {
@@ -149,7 +153,11 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
// Prefer pre-parsed chat deltas from C++ autoparser when available
if usage.HasChatDeltaContent() {
reasoningDelta, contentDelta = usage.ChatDeltaReasoningAndContent()
rawReasoning, cd := usage.ChatDeltaReasoningAndContent()
contentDelta = cd
// Strip reasoning tags (e.g. <|channel>thought / <channel|>) that
// the C++ autoparser includes as part of reasoning content.
reasoningDelta = extractor.ProcessChatDeltaReasoning(rawReasoning)
// Keep extractor state consistent for fallback
extractor.ProcessToken(s)
} else {

View File

@@ -1823,7 +1823,9 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
var reasoningDelta, contentDelta string
// Prefer pre-parsed chat deltas from C++ autoparser when available
if tokenUsage.HasChatDeltaContent() {
reasoningDelta, contentDelta = tokenUsage.ChatDeltaReasoningAndContent()
rawReasoning, cd := tokenUsage.ChatDeltaReasoningAndContent()
contentDelta = cd
reasoningDelta = extractor.ProcessChatDeltaReasoning(rawReasoning)
extractor.ProcessToken(token) // keep state consistent
} else {
reasoningDelta, contentDelta = extractor.ProcessToken(token)
@@ -2350,7 +2352,9 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
var reasoningDelta, contentDelta string
// Prefer pre-parsed chat deltas from C++ autoparser when available
if tokenUsage.HasChatDeltaContent() {
reasoningDelta, contentDelta = tokenUsage.ChatDeltaReasoningAndContent()
rawReasoning, cd := tokenUsage.ChatDeltaReasoningAndContent()
contentDelta = cd
reasoningDelta = extractor.ProcessChatDeltaReasoning(rawReasoning)
extractor.ProcessToken(token) // keep state consistent
} else {
reasoningDelta, contentDelta = extractor.ProcessToken(token)

View File

@@ -21,6 +21,12 @@ type ReasoningExtractor struct {
lastReasoning string
lastCleaned string
suppressReasoning bool
// ChatDelta reasoning accumulator — used by ProcessChatDeltaReasoning
// to strip reasoning tags (e.g. <|channel>thought, <channel|>) that
// the C++ autoparser includes in reasoning_content deltas.
cdReasoningAccum string
cdLastStrippedReasoning string
}
// NewReasoningExtractor creates a new extractor for the given thinking token and config.
@@ -64,6 +70,61 @@ func (e *ReasoningExtractor) ProcessToken(token string) (reasoningDelta, content
return reasoningDelta, contentDelta
}
// ProcessChatDeltaReasoning feeds one raw reasoning delta (as produced by the
// C++ autoparser's ChatDeltas) into the extractor and returns the portion of
// tag-free reasoning text that is new since the previous call.
//
// The raw deltas are appended to an internal buffer and the whole buffer is
// re-run through ExtractReasoning on every call, so reasoning tags that are
// split across several deltas (e.g. "<|channel>" followed by "thought") can
// still be recognised and kept out of the reasoning field of SSE chunks.
//
// An empty rawDelta returns "" without touching any state. When reasoning is
// suppressed the internal state is still updated, but "" is returned.
func (e *ReasoningExtractor) ProcessChatDeltaReasoning(rawDelta string) string {
	if rawDelta == "" {
		return ""
	}

	e.cdReasoningAccum += rawDelta

	// Re-evaluate the complete buffer so tags spanning multiple deltas are seen.
	stripped, cleaned := ExtractReasoning(e.cdReasoningAccum, &e.config)
	if stripped == "" {
		// No reasoning text was extracted. Decide whether to keep buffering
		// or to treat the buffer as already-clean text.
		trimmedAccum := strings.TrimSpace(e.cdReasoningAccum)
		switch {
		case cleaned == "" && trimmedAccum != "":
			// Non-blank buffer, yet neither reasoning nor cleaned content came
			// back: treated as a start tag with nothing after it yet.
			// Keep buffering (stripped stays empty).
		case e.thinkingStartToken != "" &&
			len(trimmedAccum) < len(e.thinkingStartToken) &&
			strings.HasPrefix(e.thinkingStartToken, trimmedAccum):
			// The buffer is a proper prefix of the start token, i.e. a start
			// tag that is still arriving piecewise. Keep buffering.
		default:
			// Nothing tag-like is pending: use the accumulated raw text as-is
			// (the autoparser is assumed to have removed the tags itself).
			stripped = e.cdReasoningAccum
		}
	}

	// Remember the new stripped text before deciding what to return, so the
	// bookkeeping stays current even while reasoning is suppressed.
	previous := e.cdLastStrippedReasoning
	e.cdLastStrippedReasoning = stripped

	if e.suppressReasoning {
		return ""
	}

	switch {
	case len(stripped) > len(previous) && strings.HasPrefix(stripped, previous):
		// Normal growth: emit only the newly appended suffix.
		return stripped[len(previous):]
	case stripped != "" && stripped != previous:
		// The stripped text is not an extension of the previous one; emit it whole.
		return stripped
	default:
		return ""
	}
}
// Reasoning returns the total accumulated reasoning after streaming.
func (e *ReasoningExtractor) Reasoning() string {
return e.lastReasoning
@@ -84,6 +145,8 @@ func (e *ReasoningExtractor) Reset() {
e.accumulated = ""
e.lastReasoning = ""
e.lastCleaned = ""
e.cdReasoningAccum = ""
e.cdLastStrippedReasoning = ""
}
// ResetAndSuppressReasoning clears state and suppresses future reasoning deltas.
@@ -95,6 +158,8 @@ func (e *ReasoningExtractor) ResetAndSuppressReasoning() {
e.accumulated = ""
e.lastReasoning = ""
e.lastCleaned = ""
e.cdReasoningAccum = ""
e.cdLastStrippedReasoning = ""
e.suppressReasoning = true
}

View File

@@ -195,4 +195,91 @@ var _ = Describe("ReasoningExtractor", func() {
Expect(ext.CleanedContent()).To(Equal("visible content"))
})
})
// These specs drive ProcessChatDeltaReasoning one delta at a time and pin the
// exact string returned by every call. The extractor carries state between
// calls, so the order of the calls inside each spec matters.
Context("ProcessChatDeltaReasoning with Gemma 4 tags", func() {
It("should strip <|channel>thought and <channel|> tags from streaming deltas", func() {
ext := NewReasoningExtractor("<|channel>thought", Config{})
// Simulate C++ autoparser sending tag tokens as reasoning
// The start token arrives split across two deltas; neither half may leak out.
d1 := ext.ProcessChatDeltaReasoning("<|channel>")
Expect(d1).To(BeEmpty(), "start tag prefix should be buffered, not emitted")
d2 := ext.ProcessChatDeltaReasoning("thought")
Expect(d2).To(BeEmpty(), "start tag suffix should be buffered, not emitted")
d3 := ext.ProcessChatDeltaReasoning("\n")
Expect(d3).To(BeEmpty(), "newline after start tag should not emit yet")
// From here on each call is expected to return only the text it added.
d4 := ext.ProcessChatDeltaReasoning("The")
Expect(d4).To(Equal("The"))
d5 := ext.ProcessChatDeltaReasoning(" user")
Expect(d5).To(Equal(" user"))
d6 := ext.ProcessChatDeltaReasoning(" asks")
Expect(d6).To(Equal(" asks"))
// Trailing newline gets TrimSpaced by ExtractReasoning,
// so it appears delayed with the next non-whitespace token
d7 := ext.ProcessChatDeltaReasoning("\n")
Expect(d7).To(BeEmpty(), "trailing newline is buffered by TrimSpace")
d8 := ext.ProcessChatDeltaReasoning("2+2=4")
Expect(d8).To(Equal("\n2+2=4"), "delayed newline emitted with next content")
// The closing tag must not add anything to the reasoning stream.
d9 := ext.ProcessChatDeltaReasoning("<channel|>")
Expect(d9).To(BeEmpty(), "close tag should be consumed, not emitted")
})
It("should handle empty deltas", func() {
ext := NewReasoningExtractor("<|channel>thought", Config{})
// An empty delta is rejected by the method's first guard and returns "".
d := ext.ProcessChatDeltaReasoning("")
Expect(d).To(BeEmpty())
})
It("should pass through reasoning without tags unchanged", func() {
ext := NewReasoningExtractor("<think>", Config{})
// When C++ autoparser already strips tags (e.g. <think> models),
// reasoning arrives clean — just pass it through.
d1 := ext.ProcessChatDeltaReasoning("I need to")
Expect(d1).To(Equal("I need to"))
// The second call must return only its own text, leading space included.
d2 := ext.ProcessChatDeltaReasoning(" think carefully")
Expect(d2).To(Equal(" think carefully"))
})
It("should strip <think> tags if C++ autoparser includes them", func() {
ext := NewReasoningExtractor("<think>", Config{})
// Same flow as the Gemma 4 spec, but the start token arrives in a single delta.
d1 := ext.ProcessChatDeltaReasoning("<think>")
Expect(d1).To(BeEmpty())
d2 := ext.ProcessChatDeltaReasoning("reasoning")
Expect(d2).To(Equal("reasoning"))
d3 := ext.ProcessChatDeltaReasoning("</think>")
Expect(d3).To(BeEmpty())
})
It("should respect suppressReasoning", func() {
ext := NewReasoningExtractor("<|channel>thought", Config{})
// ResetAndSuppressReasoning sets suppressReasoning, which makes
// ProcessChatDeltaReasoning return "" for any input.
ext.ResetAndSuppressReasoning()
d := ext.ProcessChatDeltaReasoning("some reasoning")
Expect(d).To(BeEmpty())
})
It("should reset ChatDelta state on Reset", func() {
ext := NewReasoningExtractor("<|channel>thought", Config{})
// Prime the accumulator with tagged input; the return values are irrelevant here.
ext.ProcessChatDeltaReasoning("<|channel>thought")
ext.ProcessChatDeltaReasoning("\nfirst reasoning")
// Reset clears cdReasoningAccum and cdLastStrippedReasoning, so the input
// above must not influence the next call.
ext.Reset()
// After reset, should start fresh
d := ext.ProcessChatDeltaReasoning("clean reasoning")
Expect(d).To(Equal("clean reasoning"))
})
})
})