mirror of
https://github.com/mudler/LocalAI.git
synced 2026-05-29 11:07:18 -04:00
fix(streaming/tools): don't leak prefill-misclassified content as trailing reasoning chunk (#10000)
When the C++ autoparser is in pure-content fallback mode (qwen3-4b after #9985
flipped the gallery to use_tokenizer_template) and the model emits a tool-call
JSON in non-thinking mode, the streaming worker
ended the SSE stream with a spurious
data: {...,"delta":{"reasoning":"{\"name\":\"exec\",\"arguments\":...}"}}
chunk carrying the same JSON that was already in delta.tool_calls.
The Go-side ReasoningExtractor is configured from
DetectThinkingStartToken, which scans the model's jinja chat template
verbatim and finds <think> inside an {% if enable_thinking %} block
without evaluating the conditional. Every output chunk then runs through
PrependThinkingTokenIfNeeded, which synthesizes a <think> in front and
makes ExtractReasoning treat everything after as reasoning. The autoparser
correctly classifies zero reasoning (qwen3's tool format isn't on
llama.cpp's recognized-tool list, so all tokens land in
ChatDelta.Content), but processStreamWithTools then preferred
extractor.Reasoning() over functions.ReasoningFromChatDeltas at the
end-of-stream flush — handing the polluted Go-side state to
buildDeferredToolCallChunks, which emitted it as a trailing reasoning
chunk.
Two changes:
* Add a sticky preferAutoparser flag to processStreamWithTools, mirroring
the analogous flag in processStream from #9985. Once any ChatDelta
carries content or reasoning, the flag stays on for the rest of the
stream and the worker stops falling back to the Go-side extractor for
per-token deltas. This avoids the per-chunk leak path and the cumulative
pollution.
* Extract chooseDeferredReasoning, a small helper that selects the
end-of-stream reasoning source. When preferAutoparser is set, return
functions.ReasoningFromChatDeltas(chatDeltas); otherwise fall back to
extractor.Reasoning() (the correct source for vLLM and other backends
with no autoparser).
The helper has a focused test suite covering both sides of the contract:
autoparser-active with empty reasoning (the qwen3 case — the fix's
purpose), autoparser-active with real reasoning_content
(jinja-with-recognized-format models), and autoparser-not-active with
genuine Go-side reasoning (vLLM-style backends).
E2E with combined #9988 and this fix on qwen3-4b post-#9985 gallery
shape: 18 content chunks of the tool-call JSON, 1 tool_call chunk with
name='exec' and the right arguments, finish_reason=tool_calls, and zero
reasoning chunks — down from one polluted reasoning chunk before this
fix.
Depends on #9999 (the streaming JSON tool-call gating bug for qwen3) to
make the trailing chunk observable end-to-end; the helper unit tests are
independent.
Assisted-by: Claude:opus-4-7 [Read] [Edit] [Bash] [Write]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
99
core/http/endpoints/openai/chat_stream_reasoning_test.go
Normal file
99
core/http/endpoints/openai/chat_stream_reasoning_test.go
Normal file
@@ -0,0 +1,99 @@
|
||||
package openai
|
||||
|
||||
import (
|
||||
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
|
||||
reason "github.com/mudler/LocalAI/pkg/reasoning"
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
// Regression test for the prefill-misclassification artifact surfaced in
|
||||
// the review of #9991: when LocalAI templates qwen3 with
|
||||
// use_tokenizer_template (the post-#9985 gallery shape),
|
||||
// DetectThinkingStartToken finds <think> in the model's jinja chat
|
||||
// template — without evaluating the surrounding {% if enable_thinking %}
|
||||
// guard — and the Go-side extractor's PrependThinkingTokenIfNeeded then
|
||||
// treats every non-thinking output token as reasoning. The autoparser does
|
||||
// not classify qwen3's tool calls into ChatDelta.ToolCalls (qwen3's tool
|
||||
// format isn't on llama.cpp's recognized-tool list), so all tokens land in
|
||||
// ChatDelta.Content while the Go-side extractor silently accumulates a
|
||||
// "reasoning" string equal to the raw tool-call JSON. End-of-stream this
|
||||
// is flushed as a trailing `delta.reasoning` chunk to the client.
|
||||
//
|
||||
// chooseDeferredReasoning is the gate: when the autoparser was active for
|
||||
// any chunk (preferAutoparser sticky), we trust its reasoning_content
|
||||
// classification (usually empty) instead of the polluted Go-side state.
|
||||
var _ = Describe("chooseDeferredReasoning", func() {
|
||||
// Simulate the qwen3-after-#9985 misclassification: build a real
|
||||
// extractor with a <think> thinking-start token, then feed it
|
||||
// non-thinking content. The extractor will (correctly per its own
|
||||
// contract) treat the content as reasoning because
|
||||
// PrependThinkingTokenIfNeeded synthesizes a leading <think>.
|
||||
pollutedExtractor := func(content string) *reason.ReasoningExtractor {
|
||||
e := reason.NewReasoningExtractor("<think>", reason.Config{})
|
||||
e.ProcessToken(content)
|
||||
Expect(e.Reasoning()).To(Equal(content),
|
||||
"sanity: when the thinking-start token is set and content has no real <think>...</think>, "+
|
||||
"the extractor classifies all content as reasoning — this is exactly the prefill pollution "+
|
||||
"we want chooseDeferredReasoning to guard against")
|
||||
return e
|
||||
}
|
||||
|
||||
Context("autoparser was active (preferAutoparser=true)", func() {
|
||||
It("returns the autoparser's reasoning classification, ignoring the polluted Go-side state", func() {
|
||||
toolCallJSON := `{"arguments": {"cmd": "echo hello"}, "name": "exec"}`
|
||||
extractor := pollutedExtractor(toolCallJSON)
|
||||
// What the C++ autoparser sent: content chunks but no
|
||||
// reasoning_content (qwen3 tool calls aren't classified by
|
||||
// the upstream PEG parser).
|
||||
chatDeltas := []*pb.ChatDelta{
|
||||
{Content: toolCallJSON, ReasoningContent: ""},
|
||||
}
|
||||
|
||||
got := chooseDeferredReasoning(true, chatDeltas, extractor)
|
||||
|
||||
Expect(got).To(BeEmpty(),
|
||||
"chooseDeferredReasoning must NOT return the polluted extractor state "+
|
||||
"when the autoparser was active — the autoparser correctly classified zero reasoning")
|
||||
})
|
||||
|
||||
It("returns the autoparser's reasoning when it actually did classify reasoning", func() {
|
||||
// The other side of the contract: when the autoparser was
|
||||
// in jinja-with-recognized-format mode and DID classify
|
||||
// reasoning, pass that through verbatim.
|
||||
actualReasoning := "Okay, the user asked X. I should call exec."
|
||||
extractor := pollutedExtractor("ignored polluted state")
|
||||
chatDeltas := []*pb.ChatDelta{
|
||||
{Content: "", ReasoningContent: actualReasoning},
|
||||
}
|
||||
|
||||
got := chooseDeferredReasoning(true, chatDeltas, extractor)
|
||||
|
||||
Expect(got).To(Equal(actualReasoning))
|
||||
})
|
||||
})
|
||||
|
||||
Context("autoparser was NOT active (preferAutoparser=false)", func() {
|
||||
It("falls back to the Go-side extractor — the right source for vLLM and other autoparser-less backends", func() {
|
||||
realReasoning := "Genuine reasoning from a backend without an autoparser"
|
||||
extractor := reason.NewReasoningExtractor("<think>", reason.Config{})
|
||||
extractor.ProcessToken("<think>" + realReasoning + "</think>final answer")
|
||||
|
||||
got := chooseDeferredReasoning(false, nil, extractor)
|
||||
|
||||
Expect(got).To(Equal(realReasoning))
|
||||
})
|
||||
|
||||
It("falls back even when ChatDeltas are present but the autoparser never classified anything", func() {
|
||||
// Defensive: chatDeltas could carry vestigial data; if
|
||||
// preferAutoparser wasn't flipped, we still use the
|
||||
// extractor.
|
||||
extractor := reason.NewReasoningExtractor("", reason.Config{})
|
||||
extractor.ProcessToken("<think>some thoughts</think>answer")
|
||||
|
||||
got := chooseDeferredReasoning(false, []*pb.ChatDelta{{Content: "answer"}}, extractor)
|
||||
|
||||
Expect(got).To(Equal("some thoughts"))
|
||||
})
|
||||
})
|
||||
})
|
||||
@@ -7,6 +7,7 @@ import (
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
"github.com/mudler/LocalAI/core/schema"
|
||||
"github.com/mudler/LocalAI/pkg/functions"
|
||||
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
|
||||
"github.com/mudler/LocalAI/pkg/model"
|
||||
reason "github.com/mudler/LocalAI/pkg/reasoning"
|
||||
"github.com/mudler/xlog"
|
||||
@@ -83,6 +84,34 @@ func emitJSONToolCallDeltas(
|
||||
return lastEmittedCount
|
||||
}
|
||||
|
||||
// chooseDeferredReasoning picks the source of truth for the end-of-stream
|
||||
// reasoning flush in processStreamWithTools. When the C++ autoparser was
|
||||
// active during the stream (preferAutoparser), it returns the autoparser's
|
||||
// own classified reasoning_content from ChatDeltas — usually empty when the
|
||||
// autoparser is in pure-content fallback mode. Otherwise it falls back to
|
||||
// the Go-side streaming extractor, which is the right source for backends
|
||||
// without an autoparser (vLLM, etc.).
|
||||
//
|
||||
// Why: the Go-side extractor's accumulated Reasoning() can be polluted by
|
||||
// PrependThinkingTokenIfNeeded — when the tokenizer template contains a
|
||||
// thinking start token (qwen3's jinja template has <think> inside an
|
||||
// {% if enable_thinking %} block, and DetectThinkingStartToken does not
|
||||
// evaluate jinja conditionals), prefill detection treats every chunk's
|
||||
// content as reasoning, even when the model emitted a raw tool-call JSON
|
||||
// in non-thinking mode. Without this guard, qwen3-4b with streaming + tools
|
||||
// (after #9985 flipped the gallery to use_tokenizer_template) emits a
|
||||
// trailing SSE chunk where `reasoning` carries the tool-call JSON.
|
||||
func chooseDeferredReasoning(
|
||||
preferAutoparser bool,
|
||||
chatDeltas []*pb.ChatDelta,
|
||||
extractor *reason.ReasoningExtractor,
|
||||
) string {
|
||||
if preferAutoparser {
|
||||
return functions.ReasoningFromChatDeltas(chatDeltas)
|
||||
}
|
||||
return extractor.Reasoning()
|
||||
}
|
||||
|
||||
// processStream is the streaming worker for chat completions with no
|
||||
// tool/function calling involved. It pushes SSE-shaped chunks onto
|
||||
// `responses` and returns the authoritative cumulative TokenUsage from
|
||||
@@ -228,6 +257,17 @@ func processStreamWithTools(
|
||||
hasChatDeltaToolCalls := false
|
||||
hasChatDeltaContent := false
|
||||
|
||||
// preferAutoparser is sticky: once the C++ autoparser has ever delivered
|
||||
// content or reasoning via ChatDeltas, we trust its classification for the
|
||||
// rest of the stream — including for the end-of-stream reasoning flush in
|
||||
// buildDeferredToolCallChunks. Otherwise the Go-side extractor's
|
||||
// accumulated Reasoning() can be polluted by prefill detection
|
||||
// misclassifying content as reasoning (this happens when <think> appears
|
||||
// in the tokenizer template and the model emits non-reasoning content
|
||||
// like a raw tool-call JSON — qwen3-4b after #9985 enabled
|
||||
// use_tokenizer_template). Mirrors the analogous flag in processStream.
|
||||
preferAutoparser := false
|
||||
|
||||
// X-LocalAI-Node attribution is handled by middleware.ExposeNodeHeader
|
||||
// at the wrapper layer; no in-band signalling from this worker.
|
||||
|
||||
@@ -251,12 +291,17 @@ func processStreamWithTools(
|
||||
|
||||
if usage.HasChatDeltaContent() {
|
||||
rawReasoning, cd := usage.ChatDeltaReasoningAndContent()
|
||||
preferAutoparser = true
|
||||
contentDelta = cd
|
||||
reasoningDelta = extractor.ProcessChatDeltaReasoning(rawReasoning)
|
||||
} else {
|
||||
} else if !preferAutoparser {
|
||||
reasoningDelta = goReasoning
|
||||
contentDelta = goContent
|
||||
}
|
||||
// If preferAutoparser is already true but this chunk carried no
|
||||
// autoparser data, leave both deltas empty — the next autoparser
|
||||
// chunk will pick things up. Falling back to Go-side here would
|
||||
// re-introduce the prefill-misclassification leak.
|
||||
|
||||
// Emit reasoning deltas in their own SSE chunks before any tool-call chunks
|
||||
// (OpenAI spec: reasoning and tool_calls never share a delta)
|
||||
@@ -399,7 +444,14 @@ func processStreamWithTools(
|
||||
} else {
|
||||
// Fallback: parse tool calls from raw text (no chat deltas from backend)
|
||||
xlog.Debug("[ChatDeltas] no pre-parsed tool calls, falling back to Go-side text parsing")
|
||||
reasoning = extractor.Reasoning()
|
||||
// When the autoparser was active during streaming (preferAutoparser),
|
||||
// trust its reasoning classification rather than the Go-side
|
||||
// extractor's accumulated state — the latter may have misclassified
|
||||
// content as reasoning due to prefill detection on a tokenizer
|
||||
// template that contains <think>. This was visible on qwen3-4b after
|
||||
// #9985 enabled use_tokenizer_template: a streaming tool-call JSON
|
||||
// would leak as a trailing reasoning chunk via the deferred flush.
|
||||
reasoning = chooseDeferredReasoning(preferAutoparser, chatDeltas, extractor)
|
||||
cleanedResult := extractor.CleanedContent()
|
||||
*textContentToReturn = functions.ParseTextContent(cleanedResult, cfg.FunctionsConfig)
|
||||
cleanedResult = functions.CleanupLLMResult(cleanedResult, cfg.FunctionsConfig)
|
||||
|
||||
Reference in New Issue
Block a user