From 716ddd697b48a26309993baa9cbf58c26f1acd1a Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sat, 4 Apr 2026 12:12:08 +0200 Subject: [PATCH] feat(autoparser): prefer chat deltas from backends when emitted (#9224) Signed-off-by: Ettore Di Giacinto --- core/backend/llm.go | 27 ++++ core/backend/llm_test.go | 109 ++++++++++++++++ core/http/endpoints/openai/chat.go | 25 +++- core/http/endpoints/openai/inference_test.go | 119 ++++++++++++++++++ .../http/endpoints/openresponses/responses.go | 19 ++- 5 files changed, 295 insertions(+), 4 deletions(-) diff --git a/core/backend/llm.go b/core/backend/llm.go index 5b416a44d..d4894e70e 100644 --- a/core/backend/llm.go +++ b/core/backend/llm.go @@ -36,6 +36,27 @@ type TokenUsage struct { Completion int TimingPromptProcessing float64 TimingTokenGeneration float64 + ChatDeltas []*proto.ChatDelta // per-chunk deltas from C++ autoparser (only set during streaming) +} + +// HasChatDeltaContent returns true if any chat delta carries content or reasoning text. +// Used to decide whether to prefer C++ autoparser deltas over Go-side tag extraction. +func (t TokenUsage) HasChatDeltaContent() bool { + for _, d := range t.ChatDeltas { + if d.Content != "" || d.ReasoningContent != "" { + return true + } + } + return false +} + +// ChatDeltaReasoningAndContent extracts accumulated reasoning and content from chat deltas. +func (t TokenUsage) ChatDeltaReasoningAndContent() (reasoning, content string) { + for _, d := range t.ChatDeltas { + content += d.Content + reasoning += d.ReasoningContent + } + return reasoning, content } // ModelInferenceFunc is a test-friendly indirection to call model inference logic. @@ -171,6 +192,9 @@ func ModelInference(ctx context.Context, s string, messages schema.Messages, ima allChatDeltas = append(allChatDeltas, reply.ChatDeltas...) } + // Attach per-chunk chat deltas to tokenUsage so the callback can use them + tokenUsage.ChatDeltas = reply.ChatDeltas + // Parse logprobs from reply if present (collect from last chunk that has them) if len(reply.Logprobs) > 0 { var parsedLogprobs schema.Logprobs @@ -200,6 +224,9 @@ func ModelInference(ctx context.Context, s string, messages schema.Messages, ima if len(msg) == 0 { tokenCallback("", tokenUsage) } + + // Clear per-chunk deltas so they don't leak to the next chunk + tokenUsage.ChatDeltas = nil }) if len(allChatDeltas) > 0 { xlog.Debug("[ChatDeltas] streaming completed, accumulated deltas from C++ autoparser", "total_deltas", len(allChatDeltas)) diff --git a/core/backend/llm_test.go b/core/backend/llm_test.go index ea68a9315..2cee3bfa1 100644 --- a/core/backend/llm_test.go +++ b/core/backend/llm_test.go @@ -4,6 +4,7 @@ import ( . "github.com/mudler/LocalAI/core/backend" "github.com/mudler/LocalAI/core/config" "github.com/mudler/LocalAI/core/schema" + pb "github.com/mudler/LocalAI/pkg/grpc/proto" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" @@ -107,3 +108,111 @@ var _ = Describe("LLM tests", func() { }) }) }) + +var _ = Describe("TokenUsage ChatDelta helpers", func() { + Describe("HasChatDeltaContent", func() { + It("should return false when ChatDeltas is nil", func() { + usage := TokenUsage{} + Expect(usage.HasChatDeltaContent()).To(BeFalse()) + }) + + It("should return false when ChatDeltas is empty", func() { + usage := TokenUsage{ChatDeltas: []*pb.ChatDelta{}} + Expect(usage.HasChatDeltaContent()).To(BeFalse()) + }) + + It("should return false when all deltas have empty content and reasoning", func() { + usage := TokenUsage{ + ChatDeltas: []*pb.ChatDelta{ + {Content: "", ReasoningContent: ""}, + {Content: ""}, + }, + } + Expect(usage.HasChatDeltaContent()).To(BeFalse()) + }) + + It("should return true when a delta has content", func() { + usage := TokenUsage{ + ChatDeltas: []*pb.ChatDelta{ + {Content: "hello"}, + }, + } + Expect(usage.HasChatDeltaContent()).To(BeTrue()) + }) + + It("should return true when a delta has reasoning content", func() { + usage := TokenUsage{ + ChatDeltas: []*pb.ChatDelta{ + {ReasoningContent: "thinking..."}, + }, + } + Expect(usage.HasChatDeltaContent()).To(BeTrue()) + }) + + It("should return true when a delta has both content and reasoning", func() { + usage := TokenUsage{ + ChatDeltas: []*pb.ChatDelta{ + {Content: "hello", ReasoningContent: "thinking..."}, + }, + } + Expect(usage.HasChatDeltaContent()).To(BeTrue()) + }) + }) + + Describe("ChatDeltaReasoningAndContent", func() { + It("should return empty strings when ChatDeltas is nil", func() { + usage := TokenUsage{} + reasoning, content := usage.ChatDeltaReasoningAndContent() + Expect(reasoning).To(BeEmpty()) + Expect(content).To(BeEmpty()) + }) + + It("should concatenate content from multiple deltas", func() { + usage := TokenUsage{ + ChatDeltas: []*pb.ChatDelta{ + {Content: "Hello"}, + {Content: " world"}, + }, + } + reasoning, content := usage.ChatDeltaReasoningAndContent() + Expect(content).To(Equal("Hello world")) + Expect(reasoning).To(BeEmpty()) + }) + + It("should concatenate reasoning from multiple deltas", func() { + usage := TokenUsage{ + ChatDeltas: []*pb.ChatDelta{ + {ReasoningContent: "step 1"}, + {ReasoningContent: " step 2"}, + }, + } + reasoning, content := usage.ChatDeltaReasoningAndContent() + Expect(reasoning).To(Equal("step 1 step 2")) + Expect(content).To(BeEmpty()) + }) + + It("should separate reasoning and content from mixed deltas", func() { + usage := TokenUsage{ + ChatDeltas: []*pb.ChatDelta{ + {ReasoningContent: "thinking"}, + {Content: "answer"}, + }, + } + reasoning, content := usage.ChatDeltaReasoningAndContent() + Expect(reasoning).To(Equal("thinking")) + Expect(content).To(Equal("answer")) + }) + + It("should handle deltas with both fields set", func() { + usage := TokenUsage{ + ChatDeltas: []*pb.ChatDelta{ + {Content: "a", ReasoningContent: "r1"}, + {Content: "b", ReasoningContent: "r2"}, + }, + } + reasoning, content := usage.ChatDeltaReasoningAndContent() + Expect(reasoning).To(Equal("r1r2")) + Expect(content).To(Equal("ab")) + }) + }) +}) diff --git a/core/http/endpoints/openai/chat.go b/core/http/endpoints/openai/chat.go index eb3a92a77..de290b732 100644 --- a/core/http/endpoints/openai/chat.go +++ b/core/http/endpoints/openai/chat.go @@ -81,7 +81,17 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator extractor := reason.NewReasoningExtractor(thinkingStartToken, config.ReasoningConfig) _, _, _, err := ComputeChoices(req, s, config, cl, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, tokenUsage backend.TokenUsage) bool { - reasoningDelta, contentDelta := extractor.ProcessToken(s) + var reasoningDelta, contentDelta string + + // Prefer pre-parsed chat deltas from C++ autoparser when available + if tokenUsage.HasChatDeltaContent() { + reasoningDelta, contentDelta = tokenUsage.ChatDeltaReasoningAndContent() + // Keep extractor state consistent for fallback + extractor.ProcessToken(s) + } else { + // Fallback: Go-side extraction from raw text + reasoningDelta, contentDelta = extractor.ProcessToken(s) + } usage := schema.OpenAIUsage{ PromptTokens: tokenUsage.Prompt, @@ -133,7 +143,18 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator _, tokenUsage, chatDeltas, err := ComputeChoices(req, prompt, config, cl, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool { result += s - reasoningDelta, contentDelta := extractor.ProcessToken(s) + + var reasoningDelta, contentDelta string + + // Prefer pre-parsed chat deltas from C++ autoparser when available + if usage.HasChatDeltaContent() { + reasoningDelta, contentDelta = usage.ChatDeltaReasoningAndContent() + // Keep extractor state consistent for fallback + extractor.ProcessToken(s) + } else { + // Fallback: Go-side extraction from raw text + reasoningDelta, contentDelta = extractor.ProcessToken(s) + } // Emit reasoning deltas in their own SSE chunks before any tool-call chunks // (OpenAI spec: reasoning and tool_calls never share a delta) diff --git a/core/http/endpoints/openai/inference_test.go b/core/http/endpoints/openai/inference_test.go index 7b5ab39dc..f540a4d8f 100644 --- a/core/http/endpoints/openai/inference_test.go +++ b/core/http/endpoints/openai/inference_test.go @@ -398,5 +398,124 @@ var _ = Describe("ComputeChoices", func() { Expect(choices).To(HaveLen(1)) Expect(streamedTokens).To(Equal([]string{"Hello", " world"})) }) + + It("should pass chat deltas through TokenUsage during streaming", func() { + var receivedDeltas [][]*pb.ChatDelta + backend.ModelInferenceFunc = func( + ctx context.Context, s string, messages schema.Messages, + images, videos, audios []string, + loader *model.ModelLoader, c *config.ModelConfig, cl *config.ModelConfigLoader, + o *config.ApplicationConfig, + tokenCallback func(string, backend.TokenUsage) bool, + tools, toolChoice string, + logprobs, topLogprobs *int, + logitBias map[string]float64, + metadata map[string]string, + ) (func() (backend.LLMResponse, error), error) { + predFunc := func() (backend.LLMResponse, error) { + if tokenCallback != nil { + // Simulate C++ autoparser sending reasoning in chat deltas + tokenCallback("<|channel>thought\nthinking\n", backend.TokenUsage{ + Prompt: 5, + ChatDeltas: []*pb.ChatDelta{ + {ReasoningContent: "thinking"}, + }, + }) + tokenCallback("Hello!", backend.TokenUsage{ + Prompt: 5, Completion: 3, + ChatDeltas: []*pb.ChatDelta{ + {Content: "Hello!"}, + }, + }) + } + return backend.LLMResponse{ + Response: "<|channel>thought\nthinking\nHello!", + Usage: backend.TokenUsage{Prompt: 5, Completion: 3}, + ChatDeltas: []*pb.ChatDelta{ + {ReasoningContent: "thinking"}, + {Content: "Hello!"}, + }, + }, nil + } + return predFunc, nil + } + + choices, _, deltas, err := ComputeChoices( + makeReq(), "test", cfg, nil, appCfg, nil, + func(s string, c *[]schema.Choice) { + *c = append(*c, schema.Choice{Text: s}) + }, + func(s string, usage backend.TokenUsage) bool { + // Capture chat deltas received per-chunk + if len(usage.ChatDeltas) > 0 { + receivedDeltas = append(receivedDeltas, usage.ChatDeltas) + } + return true + }, + ) + Expect(err).ToNot(HaveOccurred()) + Expect(choices).To(HaveLen(1)) + + // Verify per-chunk deltas were received during streaming + Expect(receivedDeltas).To(HaveLen(2)) + Expect(receivedDeltas[0][0].ReasoningContent).To(Equal("thinking")) + Expect(receivedDeltas[1][0].Content).To(Equal("Hello!")) + + // Verify final accumulated deltas are also returned + Expect(deltas).To(HaveLen(2)) + Expect(deltas[0].ReasoningContent).To(Equal("thinking")) + Expect(deltas[1].Content).To(Equal("Hello!")) + }) + + It("should prefer chat deltas over raw text when HasChatDeltaContent is true", func() { + // Verify that the callback can distinguish between + // chunks with and without chat deltas + var withDeltas, withoutDeltas int + backend.ModelInferenceFunc = func( + ctx context.Context, s string, messages schema.Messages, + images, videos, audios []string, + loader *model.ModelLoader, c *config.ModelConfig, cl *config.ModelConfigLoader, + o *config.ApplicationConfig, + tokenCallback func(string, backend.TokenUsage) bool, + tools, toolChoice string, + logprobs, topLogprobs *int, + logitBias map[string]float64, + metadata map[string]string, + ) (func() (backend.LLMResponse, error), error) { + predFunc := func() (backend.LLMResponse, error) { + if tokenCallback != nil { + // Chunk with chat deltas (C++ autoparser active) + tokenCallback("raw-text", backend.TokenUsage{ + ChatDeltas: []*pb.ChatDelta{{Content: "parsed-content"}}, + }) + // Chunk without chat deltas (fallback) + tokenCallback("fallback-text", backend.TokenUsage{}) + } + return backend.LLMResponse{Response: "raw-textfallback-text"}, nil + } + return predFunc, nil + } + + _, _, _, err := ComputeChoices( + makeReq(), "test", cfg, nil, appCfg, nil, + func(s string, c *[]schema.Choice) { + *c = append(*c, schema.Choice{Text: s}) + }, + func(s string, usage backend.TokenUsage) bool { + if usage.HasChatDeltaContent() { + withDeltas++ + r, c := usage.ChatDeltaReasoningAndContent() + Expect(c).To(Equal("parsed-content")) + Expect(r).To(BeEmpty()) + } else { + withoutDeltas++ + } + return true + }, + ) + Expect(err).ToNot(HaveOccurred()) + Expect(withDeltas).To(Equal(1)) + Expect(withoutDeltas).To(Equal(1)) + }) }) }) diff --git a/core/http/endpoints/openresponses/responses.go b/core/http/endpoints/openresponses/responses.go index 7ab3efbe7..9798e3bb9 100644 --- a/core/http/endpoints/openresponses/responses.go +++ b/core/http/endpoints/openresponses/responses.go @@ -1819,7 +1819,14 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6 // If no tool calls detected yet, handle reasoning and text if !inToolCallMode { - reasoningDelta, contentDelta := extractor.ProcessToken(token) + var reasoningDelta, contentDelta string + // Prefer pre-parsed chat deltas from C++ autoparser when available + if tokenUsage.HasChatDeltaContent() { + reasoningDelta, contentDelta = tokenUsage.ChatDeltaReasoningAndContent() + extractor.ProcessToken(token) // keep state consistent + } else { + reasoningDelta, contentDelta = extractor.ProcessToken(token) + } // Handle reasoning item if extractor.Reasoning() != "" { @@ -2338,7 +2345,15 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6 // Stream text deltas with reasoning extraction tokenCallback := func(token string, tokenUsage backend.TokenUsage) bool { accumulatedText += token - reasoningDelta, contentDelta := extractor.ProcessToken(token) + + var reasoningDelta, contentDelta string + // Prefer pre-parsed chat deltas from C++ autoparser when available + if tokenUsage.HasChatDeltaContent() { + reasoningDelta, contentDelta = tokenUsage.ChatDeltaReasoningAndContent() + extractor.ProcessToken(token) // keep state consistent + } else { + reasoningDelta, contentDelta = extractor.ProcessToken(token) + } // Handle reasoning item if extractor.Reasoning() != "" {