diff --git a/core/http/endpoints/openai/chat_stream_reasoning_test.go b/core/http/endpoints/openai/chat_stream_reasoning_test.go
new file mode 100644
index 000000000..8ee7219ac
--- /dev/null
+++ b/core/http/endpoints/openai/chat_stream_reasoning_test.go
@@ -0,0 +1,99 @@
+package openai
+
+import (
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	reason "github.com/mudler/LocalAI/pkg/reasoning"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// Regression test for the prefill-misclassification artifact surfaced in
+// the review of #9991: when LocalAI templates qwen3 with
+// use_tokenizer_template (the post-#9985 gallery shape),
+// DetectThinkingStartToken finds <think> in the model's jinja chat
+// template — without evaluating the surrounding {% if enable_thinking %}
+// guard — and the Go-side extractor's PrependThinkingTokenIfNeeded then
+// treats every non-thinking output token as reasoning. The autoparser does
+// not classify qwen3's tool calls into ChatDelta.ToolCalls (qwen3's tool
+// format isn't on llama.cpp's recognized-tool list), so all tokens land in
+// ChatDelta.Content while the Go-side extractor silently accumulates a
+// "reasoning" string equal to the raw tool-call JSON. End-of-stream this
+// is flushed as a trailing `delta.reasoning` chunk to the client.
+//
+// chooseDeferredReasoning is the gate: when the autoparser was active for
+// any chunk (preferAutoparser sticky), we trust its reasoning_content
+// classification (usually empty) instead of the polluted Go-side state.
+var _ = Describe("chooseDeferredReasoning", func() {
+	// Simulate the qwen3-after-#9985 misclassification: build a real
+	// extractor with a thinking-start token, then feed it
+	// non-thinking content. The extractor will (correctly per its own
+	// contract) treat the content as reasoning because
+	// PrependThinkingTokenIfNeeded synthesizes a leading <think>.
+	pollutedExtractor := func(content string) *reason.ReasoningExtractor {
+		e := reason.NewReasoningExtractor("<think>", reason.Config{})
+		e.ProcessToken(content)
+		Expect(e.Reasoning()).To(Equal(content),
+			"sanity: when the thinking-start token is set and content has no real <think>...</think>, "+
+				"the extractor classifies all content as reasoning — this is exactly the prefill pollution "+
+				"we want chooseDeferredReasoning to guard against")
+		return e
+	}
+
+	Context("autoparser was active (preferAutoparser=true)", func() {
+		It("returns the autoparser's reasoning classification, ignoring the polluted Go-side state", func() {
+			toolCallJSON := `{"arguments": {"cmd": "echo hello"}, "name": "exec"}`
+			extractor := pollutedExtractor(toolCallJSON)
+			// What the C++ autoparser sent: content chunks but no
+			// reasoning_content (qwen3 tool calls aren't classified by
+			// the upstream PEG parser).
+			chatDeltas := []*pb.ChatDelta{
+				{Content: toolCallJSON, ReasoningContent: ""},
+			}
+
+			got := chooseDeferredReasoning(true, chatDeltas, extractor)
+
+			Expect(got).To(BeEmpty(),
+				"chooseDeferredReasoning must NOT return the polluted extractor state "+
+					"when the autoparser was active — the autoparser correctly classified zero reasoning")
+		})
+
+		It("returns the autoparser's reasoning when it actually did classify reasoning", func() {
+			// The other side of the contract: when the autoparser was
+			// in jinja-with-recognized-format mode and DID classify
+			// reasoning, pass that through verbatim.
+			actualReasoning := "Okay, the user asked X. I should call exec."
+			extractor := pollutedExtractor("ignored polluted state")
+			chatDeltas := []*pb.ChatDelta{
+				{Content: "", ReasoningContent: actualReasoning},
+			}
+
+			got := chooseDeferredReasoning(true, chatDeltas, extractor)
+
+			Expect(got).To(Equal(actualReasoning))
+		})
+	})
+
+	Context("autoparser was NOT active (preferAutoparser=false)", func() {
+		It("falls back to the Go-side extractor — the right source for vLLM and other autoparser-less backends", func() {
+			realReasoning := "Genuine reasoning from a backend without an autoparser"
+			extractor := reason.NewReasoningExtractor("<think>", reason.Config{})
+			extractor.ProcessToken("<think>" + realReasoning + "</think>final answer")
+
+			got := chooseDeferredReasoning(false, nil, extractor)
+
+			Expect(got).To(Equal(realReasoning))
+		})
+
+		It("falls back even when ChatDeltas are present but the autoparser never classified anything", func() {
+			// Defensive: chatDeltas could carry vestigial data; if
+			// preferAutoparser wasn't flipped, we still use the
+			// extractor.
+			extractor := reason.NewReasoningExtractor("<think>", reason.Config{})
+			extractor.ProcessToken("<think>some thoughts</think>answer")
+
+			got := chooseDeferredReasoning(false, []*pb.ChatDelta{{Content: "answer"}}, extractor)
+
+			Expect(got).To(Equal("some thoughts"))
+		})
+	})
+})
diff --git a/core/http/endpoints/openai/chat_stream_workers.go b/core/http/endpoints/openai/chat_stream_workers.go
index c871ae59a..87f5ee21e 100644
--- a/core/http/endpoints/openai/chat_stream_workers.go
+++ b/core/http/endpoints/openai/chat_stream_workers.go
@@ -7,6 +7,7 @@ import (
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/pkg/functions"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 	"github.com/mudler/LocalAI/pkg/model"
 	reason "github.com/mudler/LocalAI/pkg/reasoning"
 	"github.com/mudler/xlog"
@@ -83,6 +84,34 @@ func emitJSONToolCallDeltas(
 	return lastEmittedCount
 }
 
+// chooseDeferredReasoning picks the source of truth for the end-of-stream
+// reasoning flush in processStreamWithTools. When the C++ autoparser was
+// active during the stream (preferAutoparser), it returns the autoparser's
+// own classified reasoning_content from ChatDeltas — usually empty when the
+// autoparser is in pure-content fallback mode. Otherwise it falls back to
+// the Go-side streaming extractor, which is the right source for backends
+// without an autoparser (vLLM, etc.).
+//
+// Why: the Go-side extractor's accumulated Reasoning() can be polluted by
+// PrependThinkingTokenIfNeeded — when the tokenizer template contains a
+// thinking start token (qwen3's jinja template has <think> inside an
+// {% if enable_thinking %} block, and DetectThinkingStartToken does not
+// evaluate jinja conditionals), prefill detection treats every chunk's
+// content as reasoning, even when the model emitted a raw tool-call JSON
+// in non-thinking mode. Without this guard, qwen3-4b with streaming + tools
+// (after #9985 flipped the gallery to use_tokenizer_template) emits a
+// trailing SSE chunk where `reasoning` carries the tool-call JSON.
+func chooseDeferredReasoning(
+	preferAutoparser bool,
+	chatDeltas []*pb.ChatDelta,
+	extractor *reason.ReasoningExtractor,
+) string {
+	if preferAutoparser {
+		return functions.ReasoningFromChatDeltas(chatDeltas)
+	}
+	return extractor.Reasoning()
+}
+
 // processStream is the streaming worker for chat completions with no
 // tool/function calling involved. It pushes SSE-shaped chunks onto
 // `responses` and returns the authoritative cumulative TokenUsage from
@@ -228,6 +257,17 @@ func processStreamWithTools(
 	hasChatDeltaToolCalls := false
 	hasChatDeltaContent := false
 
+	// preferAutoparser is sticky: once the C++ autoparser has ever delivered
+	// content or reasoning via ChatDeltas, we trust its classification for the
+	// rest of the stream — including for the end-of-stream reasoning flush in
+	// buildDeferredToolCallChunks. Otherwise the Go-side extractor's
+	// accumulated Reasoning() can be polluted by prefill detection
+	// misclassifying content as reasoning (this happens when <think> appears
+	// in the tokenizer template and the model emits non-reasoning content
+	// like a raw tool-call JSON — qwen3-4b after #9985 enabled
+	// use_tokenizer_template). Mirrors the analogous flag in processStream.
+	preferAutoparser := false
+
 	// X-LocalAI-Node attribution is handled by middleware.ExposeNodeHeader
 	// at the wrapper layer; no in-band signalling from this worker.
 
@@ -251,12 +291,17 @@ func processStreamWithTools(
 
 		if usage.HasChatDeltaContent() {
 			rawReasoning, cd := usage.ChatDeltaReasoningAndContent()
+			preferAutoparser = true
 			contentDelta = cd
 			reasoningDelta = extractor.ProcessChatDeltaReasoning(rawReasoning)
-		} else {
+		} else if !preferAutoparser {
 			reasoningDelta = goReasoning
 			contentDelta = goContent
 		}
+		// If preferAutoparser is already true but this chunk carried no
+		// autoparser data, leave both deltas empty — the next autoparser
+		// chunk will pick things up. Falling back to Go-side here would
+		// re-introduce the prefill-misclassification leak.
 
 		// Emit reasoning deltas in their own SSE chunks before any tool-call chunks
 		// (OpenAI spec: reasoning and tool_calls never share a delta)
@@ -399,7 +444,14 @@ func processStreamWithTools(
 	} else {
 		// Fallback: parse tool calls from raw text (no chat deltas from backend)
 		xlog.Debug("[ChatDeltas] no pre-parsed tool calls, falling back to Go-side text parsing")
-		reasoning = extractor.Reasoning()
+		// When the autoparser was active during streaming (preferAutoparser),
+		// trust its reasoning classification rather than the Go-side
+		// extractor's accumulated state — the latter may have misclassified
+		// content as reasoning due to prefill detection on a tokenizer
+		// template that contains <think>. This was visible on qwen3-4b after
+		// #9985 enabled use_tokenizer_template: a streaming tool-call JSON
+		// would leak as a trailing reasoning chunk via the deferred flush.
+		reasoning = chooseDeferredReasoning(preferAutoparser, chatDeltas, extractor)
 		cleanedResult := extractor.CleanedContent()
 		*textContentToReturn = functions.ParseTextContent(cleanedResult, cfg.FunctionsConfig)
 		cleanedResult = functions.CleanupLLMResult(cleanedResult, cfg.FunctionsConfig)