From 716ddd697b48a26309993baa9cbf58c26f1acd1a Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto <mudler@users.noreply.github.com>
Date: Sat, 4 Apr 2026 12:12:08 +0200
Subject: [PATCH] feat(autoparser): prefer chat deltas from backends when
 emitted (#9224)

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
---
 core/backend/llm.go                           |  27 ++++
 core/backend/llm_test.go                      | 109 ++++++++++++++++
 core/http/endpoints/openai/chat.go            |  25 +++-
 core/http/endpoints/openai/inference_test.go  | 119 ++++++++++++++++++
 .../http/endpoints/openresponses/responses.go |  19 ++-
 5 files changed, 295 insertions(+), 4 deletions(-)

diff --git a/core/backend/llm.go b/core/backend/llm.go
index 5b416a44d..d4894e70e 100644
--- a/core/backend/llm.go
+++ b/core/backend/llm.go
@@ -36,6 +36,27 @@ type TokenUsage struct {
 	Completion             int
 	TimingPromptProcessing float64
 	TimingTokenGeneration  float64
+	ChatDeltas             []*proto.ChatDelta // per-chunk deltas from C++ autoparser (only set during streaming)
+}
+
+// HasChatDeltaContent reports whether any chat delta carries content or reasoning text.
+// Callers use it to prefer C++ autoparser deltas over Go-side tag extraction; nil deltas count as empty.
+func (t TokenUsage) HasChatDeltaContent() bool {
+	for _, d := range t.ChatDeltas {
+		if d.GetContent() != "" || d.GetReasoningContent() != "" {
+			return true
+		}
+	}
+	return false
+}
+
+// ChatDeltaReasoningAndContent concatenates, in delta order, the reasoning and content text of all chat deltas.
+func (t TokenUsage) ChatDeltaReasoningAndContent() (reasoning, content string) {
+	for _, d := range t.ChatDeltas {
+		content += d.GetContent() // generated getters return "" on a nil delta
+		reasoning += d.GetReasoningContent()
+	}
+	return reasoning, content
 }
 
 // ModelInferenceFunc is a test-friendly indirection to call model inference logic.
@@ -171,6 +192,9 @@ func ModelInference(ctx context.Context, s string, messages schema.Messages, ima
 					allChatDeltas = append(allChatDeltas, reply.ChatDeltas...)
 				}
 
+				// Attach per-chunk chat deltas to tokenUsage so the callback can use them
+				tokenUsage.ChatDeltas = reply.ChatDeltas
+
 				// Parse logprobs from reply if present (collect from last chunk that has them)
 				if len(reply.Logprobs) > 0 {
 					var parsedLogprobs schema.Logprobs
@@ -200,6 +224,9 @@ func ModelInference(ctx context.Context, s string, messages schema.Messages, ima
 				if len(msg) == 0 {
 					tokenCallback("", tokenUsage)
 				}
+
+				// Clear per-chunk deltas so they don't leak to the next chunk
+				tokenUsage.ChatDeltas = nil
 			})
 			if len(allChatDeltas) > 0 {
 				xlog.Debug("[ChatDeltas] streaming completed, accumulated deltas from C++ autoparser", "total_deltas", len(allChatDeltas))
diff --git a/core/backend/llm_test.go b/core/backend/llm_test.go
index ea68a9315..2cee3bfa1 100644
--- a/core/backend/llm_test.go
+++ b/core/backend/llm_test.go
@@ -4,6 +4,7 @@ import (
 	. "github.com/mudler/LocalAI/core/backend"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/schema"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
@@ -107,3 +108,111 @@ var _ = Describe("LLM tests", func() {
 		})
 	})
 })
+
+var _ = Describe("TokenUsage ChatDelta helpers", func() { // specs for the two ChatDelta helper methods on TokenUsage
+	Describe("HasChatDeltaContent", func() { // expected true as soon as one delta has non-empty Content or ReasoningContent
+		It("should return false when ChatDeltas is nil", func() {
+			usage := TokenUsage{} // ChatDeltas left at its zero value (nil slice)
+			Expect(usage.HasChatDeltaContent()).To(BeFalse())
+		})
+
+		It("should return false when ChatDeltas is empty", func() {
+			usage := TokenUsage{ChatDeltas: []*pb.ChatDelta{}}
+			Expect(usage.HasChatDeltaContent()).To(BeFalse())
+		})
+
+		It("should return false when all deltas have empty content and reasoning", func() {
+			usage := TokenUsage{
+				ChatDeltas: []*pb.ChatDelta{
+					{Content: "", ReasoningContent: ""},
+					{Content: ""}, // ReasoningContent left at its zero value
+				},
+			}
+			Expect(usage.HasChatDeltaContent()).To(BeFalse())
+		})
+
+		It("should return true when a delta has content", func() {
+			usage := TokenUsage{
+				ChatDeltas: []*pb.ChatDelta{
+					{Content: "hello"},
+				},
+			}
+			Expect(usage.HasChatDeltaContent()).To(BeTrue())
+		})
+
+		It("should return true when a delta has reasoning content", func() {
+			usage := TokenUsage{
+				ChatDeltas: []*pb.ChatDelta{
+					{ReasoningContent: "thinking..."},
+				},
+			}
+			Expect(usage.HasChatDeltaContent()).To(BeTrue())
+		})
+
+		It("should return true when a delta has both content and reasoning", func() {
+			usage := TokenUsage{
+				ChatDeltas: []*pb.ChatDelta{
+					{Content: "hello", ReasoningContent: "thinking..."},
+				},
+			}
+			Expect(usage.HasChatDeltaContent()).To(BeTrue())
+		})
+	})
+
+	Describe("ChatDeltaReasoningAndContent", func() { // expected to return (reasoning, content), each concatenated in delta order
+		It("should return empty strings when ChatDeltas is nil", func() {
+			usage := TokenUsage{}
+			reasoning, content := usage.ChatDeltaReasoningAndContent()
+			Expect(reasoning).To(BeEmpty())
+			Expect(content).To(BeEmpty())
+		})
+
+		It("should concatenate content from multiple deltas", func() {
+			usage := TokenUsage{
+				ChatDeltas: []*pb.ChatDelta{
+					{Content: "Hello"},
+					{Content: " world"},
+				},
+			}
+			reasoning, content := usage.ChatDeltaReasoningAndContent()
+			Expect(content).To(Equal("Hello world")) // no separator is inserted between deltas
+			Expect(reasoning).To(BeEmpty())
+		})
+
+		It("should concatenate reasoning from multiple deltas", func() {
+			usage := TokenUsage{
+				ChatDeltas: []*pb.ChatDelta{
+					{ReasoningContent: "step 1"},
+					{ReasoningContent: " step 2"},
+				},
+			}
+			reasoning, content := usage.ChatDeltaReasoningAndContent()
+			Expect(reasoning).To(Equal("step 1 step 2"))
+			Expect(content).To(BeEmpty())
+		})
+
+		It("should separate reasoning and content from mixed deltas", func() {
+			usage := TokenUsage{
+				ChatDeltas: []*pb.ChatDelta{
+					{ReasoningContent: "thinking"},
+					{Content: "answer"},
+				},
+			}
+			reasoning, content := usage.ChatDeltaReasoningAndContent()
+			Expect(reasoning).To(Equal("thinking"))
+			Expect(content).To(Equal("answer"))
+		})
+
+		It("should handle deltas with both fields set", func() {
+			usage := TokenUsage{
+				ChatDeltas: []*pb.ChatDelta{
+					{Content: "a", ReasoningContent: "r1"},
+					{Content: "b", ReasoningContent: "r2"},
+				},
+			}
+			reasoning, content := usage.ChatDeltaReasoningAndContent()
+			Expect(reasoning).To(Equal("r1r2")) // each field is accumulated independently of the other
+			Expect(content).To(Equal("ab"))
+		})
+	})
+})
diff --git a/core/http/endpoints/openai/chat.go b/core/http/endpoints/openai/chat.go
index eb3a92a77..de290b732 100644
--- a/core/http/endpoints/openai/chat.go
+++ b/core/http/endpoints/openai/chat.go
@@ -81,7 +81,17 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 		extractor := reason.NewReasoningExtractor(thinkingStartToken, config.ReasoningConfig)
 
 		_, _, _, err := ComputeChoices(req, s, config, cl, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, tokenUsage backend.TokenUsage) bool {
-			reasoningDelta, contentDelta := extractor.ProcessToken(s)
+			var reasoningDelta, contentDelta string
+
+			// Prefer pre-parsed chat deltas from C++ autoparser when available
+			if tokenUsage.HasChatDeltaContent() {
+				reasoningDelta, contentDelta = tokenUsage.ChatDeltaReasoningAndContent()
+				// Still feed the raw token to the extractor (result discarded) so its state stays valid if a later chunk falls back
+				extractor.ProcessToken(s)
+			} else {
+				// Fallback: Go-side extraction from raw text
+				reasoningDelta, contentDelta = extractor.ProcessToken(s)
+			}
 
 			usage := schema.OpenAIUsage{
 				PromptTokens:     tokenUsage.Prompt,
@@ -133,7 +143,18 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 
 		_, tokenUsage, chatDeltas, err := ComputeChoices(req, prompt, config, cl, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool {
 			result += s
-			reasoningDelta, contentDelta := extractor.ProcessToken(s)
+
+			var reasoningDelta, contentDelta string
+
+			// Prefer pre-parsed chat deltas from C++ autoparser when available
+			if usage.HasChatDeltaContent() {
+				reasoningDelta, contentDelta = usage.ChatDeltaReasoningAndContent()
+				// Still feed the raw token to the extractor (result discarded) so its state stays valid if a later chunk falls back
+				extractor.ProcessToken(s)
+			} else {
+				// Fallback: Go-side extraction from raw text
+				reasoningDelta, contentDelta = extractor.ProcessToken(s)
+			}
 
 			// Emit reasoning deltas in their own SSE chunks before any tool-call chunks
 			// (OpenAI spec: reasoning and tool_calls never share a delta)
diff --git a/core/http/endpoints/openai/inference_test.go b/core/http/endpoints/openai/inference_test.go
index 7b5ab39dc..f540a4d8f 100644
--- a/core/http/endpoints/openai/inference_test.go
+++ b/core/http/endpoints/openai/inference_test.go
@@ -398,5 +398,124 @@ var _ = Describe("ComputeChoices", func() {
 			Expect(choices).To(HaveLen(1))
 			Expect(streamedTokens).To(Equal([]string{"Hello", " world"}))
 		})
+
+		It("should pass chat deltas through TokenUsage during streaming", func() {
+			var receivedDeltas [][]*pb.ChatDelta // one entry per callback invocation that carried deltas
+			backend.ModelInferenceFunc = func( // swap in a canned fake for the inference indirection
+				ctx context.Context, s string, messages schema.Messages,
+				images, videos, audios []string,
+				loader *model.ModelLoader, c *config.ModelConfig, cl *config.ModelConfigLoader,
+				o *config.ApplicationConfig,
+				tokenCallback func(string, backend.TokenUsage) bool,
+				tools, toolChoice string,
+				logprobs, topLogprobs *int,
+				logitBias map[string]float64,
+				metadata map[string]string,
+			) (func() (backend.LLMResponse, error), error) {
+				predFunc := func() (backend.LLMResponse, error) {
+					if tokenCallback != nil {
+						// Chunk 1: raw text still holds the channel tags; the chat delta carries only the parsed reasoning
+						tokenCallback("<|channel>thought\nthinking\n<channel|>", backend.TokenUsage{
+							Prompt: 5,
+							ChatDeltas: []*pb.ChatDelta{
+								{ReasoningContent: "thinking"},
+							},
+						})
+						tokenCallback("Hello!", backend.TokenUsage{ // chunk 2: plain content delta
+							Prompt: 5, Completion: 3,
+							ChatDeltas: []*pb.ChatDelta{
+								{Content: "Hello!"},
+							},
+						})
+					}
+					return backend.LLMResponse{
+						Response: "<|channel>thought\nthinking\n<channel|>Hello!",
+						Usage:    backend.TokenUsage{Prompt: 5, Completion: 3},
+						ChatDeltas: []*pb.ChatDelta{
+							{ReasoningContent: "thinking"},
+							{Content: "Hello!"},
+						},
+					}, nil
+				}
+				return predFunc, nil
+			}
+
+			choices, _, deltas, err := ComputeChoices(
+				makeReq(), "test", cfg, nil, appCfg, nil,
+				func(s string, c *[]schema.Choice) {
+					*c = append(*c, schema.Choice{Text: s})
+				},
+				func(s string, usage backend.TokenUsage) bool {
+					// Record the deltas attached to this chunk, skipping chunks that carry none
+					if len(usage.ChatDeltas) > 0 {
+						receivedDeltas = append(receivedDeltas, usage.ChatDeltas)
+					}
+					return true
+				},
+			)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(choices).To(HaveLen(1))
+
+			// Each of the two streamed chunks must have reached the callback with its own delta
+			Expect(receivedDeltas).To(HaveLen(2))
+			Expect(receivedDeltas[0][0].ReasoningContent).To(Equal("thinking"))
+			Expect(receivedDeltas[1][0].Content).To(Equal("Hello!"))
+
+			// The deltas returned by ComputeChoices must match the ones on the fake LLMResponse
+			Expect(deltas).To(HaveLen(2))
+			Expect(deltas[0].ReasoningContent).To(Equal("thinking"))
+			Expect(deltas[1].Content).To(Equal("Hello!"))
+		})
+
+		It("should prefer chat deltas over raw text when HasChatDeltaContent is true", func() {
+			// Streams one chunk with chat deltas and one without, then checks that the
+			// callback sees the parsed content for the former and counts one of each
+			var withDeltas, withoutDeltas int
+			backend.ModelInferenceFunc = func( // swap in a canned fake for the inference indirection
+				ctx context.Context, s string, messages schema.Messages,
+				images, videos, audios []string,
+				loader *model.ModelLoader, c *config.ModelConfig, cl *config.ModelConfigLoader,
+				o *config.ApplicationConfig,
+				tokenCallback func(string, backend.TokenUsage) bool,
+				tools, toolChoice string,
+				logprobs, topLogprobs *int,
+				logitBias map[string]float64,
+				metadata map[string]string,
+			) (func() (backend.LLMResponse, error), error) {
+				predFunc := func() (backend.LLMResponse, error) {
+					if tokenCallback != nil {
+						// Chunk with chat deltas: the delta content deliberately differs from the raw token
+						tokenCallback("raw-text", backend.TokenUsage{
+							ChatDeltas: []*pb.ChatDelta{{Content: "parsed-content"}},
+						})
+						// Chunk without chat deltas: the callback is expected to take its fallback branch
+						tokenCallback("fallback-text", backend.TokenUsage{})
+					}
+					return backend.LLMResponse{Response: "raw-textfallback-text"}, nil
+				}
+				return predFunc, nil
+			}
+
+			_, _, _, err := ComputeChoices(
+				makeReq(), "test", cfg, nil, appCfg, nil,
+				func(s string, c *[]schema.Choice) {
+					*c = append(*c, schema.Choice{Text: s})
+				},
+				func(s string, usage backend.TokenUsage) bool {
+					if usage.HasChatDeltaContent() {
+						withDeltas++
+						r, c := usage.ChatDeltaReasoningAndContent()
+						Expect(c).To(Equal("parsed-content")) // parsed delta text, not the raw "raw-text" token
+						Expect(r).To(BeEmpty())
+					} else {
+						withoutDeltas++
+					}
+					return true
+				},
+			)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(withDeltas).To(Equal(1))
+			Expect(withoutDeltas).To(Equal(1))
+		})
 	})
 })
diff --git a/core/http/endpoints/openresponses/responses.go b/core/http/endpoints/openresponses/responses.go
index 7ab3efbe7..9798e3bb9 100644
--- a/core/http/endpoints/openresponses/responses.go
+++ b/core/http/endpoints/openresponses/responses.go
@@ -1819,7 +1819,14 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
 
 				// If no tool calls detected yet, handle reasoning and text
 				if !inToolCallMode {
-					reasoningDelta, contentDelta := extractor.ProcessToken(token)
+					var reasoningDelta, contentDelta string
+					// Prefer pre-parsed chat deltas from C++ autoparser when available
+					if tokenUsage.HasChatDeltaContent() {
+						reasoningDelta, contentDelta = tokenUsage.ChatDeltaReasoningAndContent()
+						extractor.ProcessToken(token) // keep state consistent
+					} else {
+						reasoningDelta, contentDelta = extractor.ProcessToken(token)
+					}
 
 					// Handle reasoning item
 					if extractor.Reasoning() != "" {
@@ -2338,7 +2345,15 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
 	// Stream text deltas with reasoning extraction
 	tokenCallback := func(token string, tokenUsage backend.TokenUsage) bool {
 		accumulatedText += token
-		reasoningDelta, contentDelta := extractor.ProcessToken(token)
+
+		var reasoningDelta, contentDelta string
+		// Prefer pre-parsed chat deltas from C++ autoparser when available
+		if tokenUsage.HasChatDeltaContent() {
+			reasoningDelta, contentDelta = tokenUsage.ChatDeltaReasoningAndContent()
+			extractor.ProcessToken(token) // keep state consistent
+		} else {
+			reasoningDelta, contentDelta = extractor.ProcessToken(token)
+		}
 
 		// Handle reasoning item
 		if extractor.Reasoning() != "" {