feat(autoparser): prefer chat deltas from backends when emitted (#9224)

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
Ettore Di Giacinto
2026-04-04 12:12:08 +02:00
committed by GitHub
parent 223deb908d
commit 716ddd697b
5 changed files with 295 additions and 4 deletions

View File

@@ -36,6 +36,27 @@ type TokenUsage struct {
Completion int
TimingPromptProcessing float64
TimingTokenGeneration float64
ChatDeltas []*proto.ChatDelta // per-chunk deltas from C++ autoparser (only set during streaming)
}
// HasChatDeltaContent reports whether any chat delta carries content or
// reasoning text. It is used to decide whether to prefer the pre-parsed
// C++ autoparser deltas over Go-side tag extraction.
func (t TokenUsage) HasChatDeltaContent() bool {
	for _, d := range t.ChatDeltas {
		// Skip nil entries defensively: ChatDeltas holds pointers, and a
		// direct field access on a nil *proto.ChatDelta would panic
		// (unlike the generated nil-safe getters).
		if d == nil {
			continue
		}
		if d.Content != "" || d.ReasoningContent != "" {
			return true
		}
	}
	return false
}
// ChatDeltaReasoningAndContent extracts the accumulated reasoning and
// content text from the chat deltas, concatenated in stream order.
// Returns empty strings when ChatDeltas is nil or empty.
func (t TokenUsage) ChatDeltaReasoningAndContent() (reasoning, content string) {
	for _, d := range t.ChatDeltas {
		// Skip nil entries defensively to avoid a nil-pointer panic on
		// direct field access.
		if d == nil {
			continue
		}
		content += d.Content
		reasoning += d.ReasoningContent
	}
	return reasoning, content
}
// ModelInferenceFunc is a test-friendly indirection to call model inference logic.
@@ -171,6 +192,9 @@ func ModelInference(ctx context.Context, s string, messages schema.Messages, ima
allChatDeltas = append(allChatDeltas, reply.ChatDeltas...)
}
// Attach per-chunk chat deltas to tokenUsage so the callback can use them
tokenUsage.ChatDeltas = reply.ChatDeltas
// Parse logprobs from reply if present (collect from last chunk that has them)
if len(reply.Logprobs) > 0 {
var parsedLogprobs schema.Logprobs
@@ -200,6 +224,9 @@ func ModelInference(ctx context.Context, s string, messages schema.Messages, ima
if len(msg) == 0 {
tokenCallback("", tokenUsage)
}
// Clear per-chunk deltas so they don't leak to the next chunk
tokenUsage.ChatDeltas = nil
})
if len(allChatDeltas) > 0 {
xlog.Debug("[ChatDeltas] streaming completed, accumulated deltas from C++ autoparser", "total_deltas", len(allChatDeltas))

View File

@@ -4,6 +4,7 @@ import (
. "github.com/mudler/LocalAI/core/backend"
"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/core/schema"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
@@ -107,3 +108,111 @@ var _ = Describe("LLM tests", func() {
})
})
})
// Unit specs for the TokenUsage chat-delta helpers used by the C++
// autoparser streaming path: HasChatDeltaContent (truthiness gate) and
// ChatDeltaReasoningAndContent (accumulation).
var _ = Describe("TokenUsage ChatDelta helpers", func() {
// HasChatDeltaContent must be true iff at least one delta carries a
// non-empty Content or ReasoningContent field.
Describe("HasChatDeltaContent", func() {
It("should return false when ChatDeltas is nil", func() {
// Zero-value TokenUsage: nil slice, ranging over it is a no-op.
usage := TokenUsage{}
Expect(usage.HasChatDeltaContent()).To(BeFalse())
})
It("should return false when ChatDeltas is empty", func() {
usage := TokenUsage{ChatDeltas: []*pb.ChatDelta{}}
Expect(usage.HasChatDeltaContent()).To(BeFalse())
})
It("should return false when all deltas have empty content and reasoning", func() {
usage := TokenUsage{
ChatDeltas: []*pb.ChatDelta{
{Content: "", ReasoningContent: ""},
{Content: ""},
},
}
Expect(usage.HasChatDeltaContent()).To(BeFalse())
})
It("should return true when a delta has content", func() {
usage := TokenUsage{
ChatDeltas: []*pb.ChatDelta{
{Content: "hello"},
},
}
Expect(usage.HasChatDeltaContent()).To(BeTrue())
})
It("should return true when a delta has reasoning content", func() {
usage := TokenUsage{
ChatDeltas: []*pb.ChatDelta{
{ReasoningContent: "thinking..."},
},
}
Expect(usage.HasChatDeltaContent()).To(BeTrue())
})
It("should return true when a delta has both content and reasoning", func() {
usage := TokenUsage{
ChatDeltas: []*pb.ChatDelta{
{Content: "hello", ReasoningContent: "thinking..."},
},
}
Expect(usage.HasChatDeltaContent()).To(BeTrue())
})
})
// ChatDeltaReasoningAndContent must concatenate each field independently,
// preserving delta order; fields never bleed into each other.
Describe("ChatDeltaReasoningAndContent", func() {
It("should return empty strings when ChatDeltas is nil", func() {
usage := TokenUsage{}
reasoning, content := usage.ChatDeltaReasoningAndContent()
Expect(reasoning).To(BeEmpty())
Expect(content).To(BeEmpty())
})
It("should concatenate content from multiple deltas", func() {
usage := TokenUsage{
ChatDeltas: []*pb.ChatDelta{
{Content: "Hello"},
{Content: " world"},
},
}
reasoning, content := usage.ChatDeltaReasoningAndContent()
Expect(content).To(Equal("Hello world"))
Expect(reasoning).To(BeEmpty())
})
It("should concatenate reasoning from multiple deltas", func() {
usage := TokenUsage{
ChatDeltas: []*pb.ChatDelta{
{ReasoningContent: "step 1"},
{ReasoningContent: " step 2"},
},
}
reasoning, content := usage.ChatDeltaReasoningAndContent()
Expect(reasoning).To(Equal("step 1 step 2"))
Expect(content).To(BeEmpty())
})
It("should separate reasoning and content from mixed deltas", func() {
usage := TokenUsage{
ChatDeltas: []*pb.ChatDelta{
{ReasoningContent: "thinking"},
{Content: "answer"},
},
}
reasoning, content := usage.ChatDeltaReasoningAndContent()
Expect(reasoning).To(Equal("thinking"))
Expect(content).To(Equal("answer"))
})
It("should handle deltas with both fields set", func() {
// A single delta may carry both fields; each accumulates into its
// own return value.
usage := TokenUsage{
ChatDeltas: []*pb.ChatDelta{
{Content: "a", ReasoningContent: "r1"},
{Content: "b", ReasoningContent: "r2"},
},
}
reasoning, content := usage.ChatDeltaReasoningAndContent()
Expect(reasoning).To(Equal("r1r2"))
Expect(content).To(Equal("ab"))
})
})
})

View File

@@ -81,7 +81,17 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
extractor := reason.NewReasoningExtractor(thinkingStartToken, config.ReasoningConfig)
_, _, _, err := ComputeChoices(req, s, config, cl, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, tokenUsage backend.TokenUsage) bool {
reasoningDelta, contentDelta := extractor.ProcessToken(s)
var reasoningDelta, contentDelta string
// Prefer pre-parsed chat deltas from C++ autoparser when available
if tokenUsage.HasChatDeltaContent() {
reasoningDelta, contentDelta = tokenUsage.ChatDeltaReasoningAndContent()
// Keep extractor state consistent for fallback
extractor.ProcessToken(s)
} else {
// Fallback: Go-side extraction from raw text
reasoningDelta, contentDelta = extractor.ProcessToken(s)
}
usage := schema.OpenAIUsage{
PromptTokens: tokenUsage.Prompt,
@@ -133,7 +143,18 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
_, tokenUsage, chatDeltas, err := ComputeChoices(req, prompt, config, cl, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool {
result += s
reasoningDelta, contentDelta := extractor.ProcessToken(s)
var reasoningDelta, contentDelta string
// Prefer pre-parsed chat deltas from C++ autoparser when available
if usage.HasChatDeltaContent() {
reasoningDelta, contentDelta = usage.ChatDeltaReasoningAndContent()
// Keep extractor state consistent for fallback
extractor.ProcessToken(s)
} else {
// Fallback: Go-side extraction from raw text
reasoningDelta, contentDelta = extractor.ProcessToken(s)
}
// Emit reasoning deltas in their own SSE chunks before any tool-call chunks
// (OpenAI spec: reasoning and tool_calls never share a delta)

View File

@@ -398,5 +398,124 @@ var _ = Describe("ComputeChoices", func() {
Expect(choices).To(HaveLen(1))
Expect(streamedTokens).To(Equal([]string{"Hello", " world"}))
})
// End-to-end streaming check: ChatDeltas attached to TokenUsage by the
// (mocked) inference layer must surface per-chunk in the token callback
// AND come back as the accumulated slice returned by ComputeChoices.
It("should pass chat deltas through TokenUsage during streaming", func() {
var receivedDeltas [][]*pb.ChatDelta
// backend.ModelInferenceFunc is a test seam; replace it with a stub
// that streams two chunks carrying per-chunk chat deltas.
backend.ModelInferenceFunc = func(
ctx context.Context, s string, messages schema.Messages,
images, videos, audios []string,
loader *model.ModelLoader, c *config.ModelConfig, cl *config.ModelConfigLoader,
o *config.ApplicationConfig,
tokenCallback func(string, backend.TokenUsage) bool,
tools, toolChoice string,
logprobs, topLogprobs *int,
logitBias map[string]float64,
metadata map[string]string,
) (func() (backend.LLMResponse, error), error) {
predFunc := func() (backend.LLMResponse, error) {
if tokenCallback != nil {
// Simulate C++ autoparser sending reasoning in chat deltas
tokenCallback("<|channel>thought\nthinking\n<channel|>", backend.TokenUsage{
Prompt: 5,
ChatDeltas: []*pb.ChatDelta{
{ReasoningContent: "thinking"},
},
})
tokenCallback("Hello!", backend.TokenUsage{
Prompt: 5, Completion: 3,
ChatDeltas: []*pb.ChatDelta{
{Content: "Hello!"},
},
})
}
// Final response carries the full accumulated delta list.
return backend.LLMResponse{
Response: "<|channel>thought\nthinking\n<channel|>Hello!",
Usage: backend.TokenUsage{Prompt: 5, Completion: 3},
ChatDeltas: []*pb.ChatDelta{
{ReasoningContent: "thinking"},
{Content: "Hello!"},
},
}, nil
}
return predFunc, nil
}
choices, _, deltas, err := ComputeChoices(
makeReq(), "test", cfg, nil, appCfg, nil,
func(s string, c *[]schema.Choice) {
*c = append(*c, schema.Choice{Text: s})
},
func(s string, usage backend.TokenUsage) bool {
// Capture chat deltas received per-chunk
if len(usage.ChatDeltas) > 0 {
receivedDeltas = append(receivedDeltas, usage.ChatDeltas)
}
return true
},
)
Expect(err).ToNot(HaveOccurred())
Expect(choices).To(HaveLen(1))
// Verify per-chunk deltas were received during streaming
Expect(receivedDeltas).To(HaveLen(2))
Expect(receivedDeltas[0][0].ReasoningContent).To(Equal("thinking"))
Expect(receivedDeltas[1][0].Content).To(Equal("Hello!"))
// Verify final accumulated deltas are also returned
Expect(deltas).To(HaveLen(2))
Expect(deltas[0].ReasoningContent).To(Equal("thinking"))
Expect(deltas[1].Content).To(Equal("Hello!"))
})
// Verifies the preference contract from the callback's point of view:
// HasChatDeltaContent distinguishes chunks that carry autoparser deltas
// (use the parsed text) from chunks that do not (fall back to raw text).
It("should prefer chat deltas over raw text when HasChatDeltaContent is true", func() {
// Verify that the callback can distinguish between
// chunks with and without chat deltas
var withDeltas, withoutDeltas int
// Stub inference emitting one chunk of each kind.
backend.ModelInferenceFunc = func(
ctx context.Context, s string, messages schema.Messages,
images, videos, audios []string,
loader *model.ModelLoader, c *config.ModelConfig, cl *config.ModelConfigLoader,
o *config.ApplicationConfig,
tokenCallback func(string, backend.TokenUsage) bool,
tools, toolChoice string,
logprobs, topLogprobs *int,
logitBias map[string]float64,
metadata map[string]string,
) (func() (backend.LLMResponse, error), error) {
predFunc := func() (backend.LLMResponse, error) {
if tokenCallback != nil {
// Chunk with chat deltas (C++ autoparser active)
tokenCallback("raw-text", backend.TokenUsage{
ChatDeltas: []*pb.ChatDelta{{Content: "parsed-content"}},
})
// Chunk without chat deltas (fallback)
tokenCallback("fallback-text", backend.TokenUsage{})
}
return backend.LLMResponse{Response: "raw-textfallback-text"}, nil
}
return predFunc, nil
}
_, _, _, err := ComputeChoices(
makeReq(), "test", cfg, nil, appCfg, nil,
func(s string, c *[]schema.Choice) {
*c = append(*c, schema.Choice{Text: s})
},
func(s string, usage backend.TokenUsage) bool {
// Branch exactly as the chat endpoint does: parsed deltas when
// present, raw token text otherwise.
if usage.HasChatDeltaContent() {
withDeltas++
r, c := usage.ChatDeltaReasoningAndContent()
Expect(c).To(Equal("parsed-content"))
Expect(r).To(BeEmpty())
} else {
withoutDeltas++
}
return true
},
)
Expect(err).ToNot(HaveOccurred())
// One chunk of each kind was emitted; both paths must be taken once.
Expect(withDeltas).To(Equal(1))
Expect(withoutDeltas).To(Equal(1))
})
})
})

View File

@@ -1819,7 +1819,14 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
// If no tool calls detected yet, handle reasoning and text
if !inToolCallMode {
reasoningDelta, contentDelta := extractor.ProcessToken(token)
var reasoningDelta, contentDelta string
// Prefer pre-parsed chat deltas from C++ autoparser when available
if tokenUsage.HasChatDeltaContent() {
reasoningDelta, contentDelta = tokenUsage.ChatDeltaReasoningAndContent()
extractor.ProcessToken(token) // keep state consistent
} else {
reasoningDelta, contentDelta = extractor.ProcessToken(token)
}
// Handle reasoning item
if extractor.Reasoning() != "" {
@@ -2338,7 +2345,15 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
// Stream text deltas with reasoning extraction
tokenCallback := func(token string, tokenUsage backend.TokenUsage) bool {
accumulatedText += token
reasoningDelta, contentDelta := extractor.ProcessToken(token)
var reasoningDelta, contentDelta string
// Prefer pre-parsed chat deltas from C++ autoparser when available
if tokenUsage.HasChatDeltaContent() {
reasoningDelta, contentDelta = tokenUsage.ChatDeltaReasoningAndContent()
extractor.ProcessToken(token) // keep state consistent
} else {
reasoningDelta, contentDelta = extractor.ProcessToken(token)
}
// Handle reasoning item
if extractor.Reasoning() != "" {