mirror of
https://github.com/mudler/LocalAI.git
synced 2026-04-20 14:57:35 -04:00
feat(autoparser): prefer chat deltas from backends when emitted (#9224)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
committed by
GitHub
parent
223deb908d
commit
716ddd697b
@@ -36,6 +36,27 @@ type TokenUsage struct {
|
||||
Completion int
|
||||
TimingPromptProcessing float64
|
||||
TimingTokenGeneration float64
|
||||
ChatDeltas []*proto.ChatDelta // per-chunk deltas from C++ autoparser (only set during streaming)
|
||||
}
|
||||
|
||||
// HasChatDeltaContent returns true if any chat delta carries content or reasoning text.
|
||||
// Used to decide whether to prefer C++ autoparser deltas over Go-side tag extraction.
|
||||
func (t TokenUsage) HasChatDeltaContent() bool {
|
||||
for _, d := range t.ChatDeltas {
|
||||
if d.Content != "" || d.ReasoningContent != "" {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// ChatDeltaReasoningAndContent extracts accumulated reasoning and content from chat deltas.
|
||||
func (t TokenUsage) ChatDeltaReasoningAndContent() (reasoning, content string) {
|
||||
for _, d := range t.ChatDeltas {
|
||||
content += d.Content
|
||||
reasoning += d.ReasoningContent
|
||||
}
|
||||
return reasoning, content
|
||||
}
|
||||
|
||||
// ModelInferenceFunc is a test-friendly indirection to call model inference logic.
|
||||
@@ -171,6 +192,9 @@ func ModelInference(ctx context.Context, s string, messages schema.Messages, ima
|
||||
allChatDeltas = append(allChatDeltas, reply.ChatDeltas...)
|
||||
}
|
||||
|
||||
// Attach per-chunk chat deltas to tokenUsage so the callback can use them
|
||||
tokenUsage.ChatDeltas = reply.ChatDeltas
|
||||
|
||||
// Parse logprobs from reply if present (collect from last chunk that has them)
|
||||
if len(reply.Logprobs) > 0 {
|
||||
var parsedLogprobs schema.Logprobs
|
||||
@@ -200,6 +224,9 @@ func ModelInference(ctx context.Context, s string, messages schema.Messages, ima
|
||||
if len(msg) == 0 {
|
||||
tokenCallback("", tokenUsage)
|
||||
}
|
||||
|
||||
// Clear per-chunk deltas so they don't leak to the next chunk
|
||||
tokenUsage.ChatDeltas = nil
|
||||
})
|
||||
if len(allChatDeltas) > 0 {
|
||||
xlog.Debug("[ChatDeltas] streaming completed, accumulated deltas from C++ autoparser", "total_deltas", len(allChatDeltas))
|
||||
|
||||
@@ -4,6 +4,7 @@ import (
|
||||
. "github.com/mudler/LocalAI/core/backend"
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
"github.com/mudler/LocalAI/core/schema"
|
||||
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
|
||||
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
@@ -107,3 +108,111 @@ var _ = Describe("LLM tests", func() {
|
||||
})
|
||||
})
|
||||
})
|
||||
|
||||
var _ = Describe("TokenUsage ChatDelta helpers", func() {
|
||||
Describe("HasChatDeltaContent", func() {
|
||||
It("should return false when ChatDeltas is nil", func() {
|
||||
usage := TokenUsage{}
|
||||
Expect(usage.HasChatDeltaContent()).To(BeFalse())
|
||||
})
|
||||
|
||||
It("should return false when ChatDeltas is empty", func() {
|
||||
usage := TokenUsage{ChatDeltas: []*pb.ChatDelta{}}
|
||||
Expect(usage.HasChatDeltaContent()).To(BeFalse())
|
||||
})
|
||||
|
||||
It("should return false when all deltas have empty content and reasoning", func() {
|
||||
usage := TokenUsage{
|
||||
ChatDeltas: []*pb.ChatDelta{
|
||||
{Content: "", ReasoningContent: ""},
|
||||
{Content: ""},
|
||||
},
|
||||
}
|
||||
Expect(usage.HasChatDeltaContent()).To(BeFalse())
|
||||
})
|
||||
|
||||
It("should return true when a delta has content", func() {
|
||||
usage := TokenUsage{
|
||||
ChatDeltas: []*pb.ChatDelta{
|
||||
{Content: "hello"},
|
||||
},
|
||||
}
|
||||
Expect(usage.HasChatDeltaContent()).To(BeTrue())
|
||||
})
|
||||
|
||||
It("should return true when a delta has reasoning content", func() {
|
||||
usage := TokenUsage{
|
||||
ChatDeltas: []*pb.ChatDelta{
|
||||
{ReasoningContent: "thinking..."},
|
||||
},
|
||||
}
|
||||
Expect(usage.HasChatDeltaContent()).To(BeTrue())
|
||||
})
|
||||
|
||||
It("should return true when a delta has both content and reasoning", func() {
|
||||
usage := TokenUsage{
|
||||
ChatDeltas: []*pb.ChatDelta{
|
||||
{Content: "hello", ReasoningContent: "thinking..."},
|
||||
},
|
||||
}
|
||||
Expect(usage.HasChatDeltaContent()).To(BeTrue())
|
||||
})
|
||||
})
|
||||
|
||||
Describe("ChatDeltaReasoningAndContent", func() {
|
||||
It("should return empty strings when ChatDeltas is nil", func() {
|
||||
usage := TokenUsage{}
|
||||
reasoning, content := usage.ChatDeltaReasoningAndContent()
|
||||
Expect(reasoning).To(BeEmpty())
|
||||
Expect(content).To(BeEmpty())
|
||||
})
|
||||
|
||||
It("should concatenate content from multiple deltas", func() {
|
||||
usage := TokenUsage{
|
||||
ChatDeltas: []*pb.ChatDelta{
|
||||
{Content: "Hello"},
|
||||
{Content: " world"},
|
||||
},
|
||||
}
|
||||
reasoning, content := usage.ChatDeltaReasoningAndContent()
|
||||
Expect(content).To(Equal("Hello world"))
|
||||
Expect(reasoning).To(BeEmpty())
|
||||
})
|
||||
|
||||
It("should concatenate reasoning from multiple deltas", func() {
|
||||
usage := TokenUsage{
|
||||
ChatDeltas: []*pb.ChatDelta{
|
||||
{ReasoningContent: "step 1"},
|
||||
{ReasoningContent: " step 2"},
|
||||
},
|
||||
}
|
||||
reasoning, content := usage.ChatDeltaReasoningAndContent()
|
||||
Expect(reasoning).To(Equal("step 1 step 2"))
|
||||
Expect(content).To(BeEmpty())
|
||||
})
|
||||
|
||||
It("should separate reasoning and content from mixed deltas", func() {
|
||||
usage := TokenUsage{
|
||||
ChatDeltas: []*pb.ChatDelta{
|
||||
{ReasoningContent: "thinking"},
|
||||
{Content: "answer"},
|
||||
},
|
||||
}
|
||||
reasoning, content := usage.ChatDeltaReasoningAndContent()
|
||||
Expect(reasoning).To(Equal("thinking"))
|
||||
Expect(content).To(Equal("answer"))
|
||||
})
|
||||
|
||||
It("should handle deltas with both fields set", func() {
|
||||
usage := TokenUsage{
|
||||
ChatDeltas: []*pb.ChatDelta{
|
||||
{Content: "a", ReasoningContent: "r1"},
|
||||
{Content: "b", ReasoningContent: "r2"},
|
||||
},
|
||||
}
|
||||
reasoning, content := usage.ChatDeltaReasoningAndContent()
|
||||
Expect(reasoning).To(Equal("r1r2"))
|
||||
Expect(content).To(Equal("ab"))
|
||||
})
|
||||
})
|
||||
})
|
||||
|
||||
@@ -81,7 +81,17 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
|
||||
extractor := reason.NewReasoningExtractor(thinkingStartToken, config.ReasoningConfig)
|
||||
|
||||
_, _, _, err := ComputeChoices(req, s, config, cl, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, tokenUsage backend.TokenUsage) bool {
|
||||
reasoningDelta, contentDelta := extractor.ProcessToken(s)
|
||||
var reasoningDelta, contentDelta string
|
||||
|
||||
// Prefer pre-parsed chat deltas from C++ autoparser when available
|
||||
if tokenUsage.HasChatDeltaContent() {
|
||||
reasoningDelta, contentDelta = tokenUsage.ChatDeltaReasoningAndContent()
|
||||
// Keep extractor state consistent for fallback
|
||||
extractor.ProcessToken(s)
|
||||
} else {
|
||||
// Fallback: Go-side extraction from raw text
|
||||
reasoningDelta, contentDelta = extractor.ProcessToken(s)
|
||||
}
|
||||
|
||||
usage := schema.OpenAIUsage{
|
||||
PromptTokens: tokenUsage.Prompt,
|
||||
@@ -133,7 +143,18 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
|
||||
|
||||
_, tokenUsage, chatDeltas, err := ComputeChoices(req, prompt, config, cl, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool {
|
||||
result += s
|
||||
reasoningDelta, contentDelta := extractor.ProcessToken(s)
|
||||
|
||||
var reasoningDelta, contentDelta string
|
||||
|
||||
// Prefer pre-parsed chat deltas from C++ autoparser when available
|
||||
if usage.HasChatDeltaContent() {
|
||||
reasoningDelta, contentDelta = usage.ChatDeltaReasoningAndContent()
|
||||
// Keep extractor state consistent for fallback
|
||||
extractor.ProcessToken(s)
|
||||
} else {
|
||||
// Fallback: Go-side extraction from raw text
|
||||
reasoningDelta, contentDelta = extractor.ProcessToken(s)
|
||||
}
|
||||
|
||||
// Emit reasoning deltas in their own SSE chunks before any tool-call chunks
|
||||
// (OpenAI spec: reasoning and tool_calls never share a delta)
|
||||
|
||||
@@ -398,5 +398,124 @@ var _ = Describe("ComputeChoices", func() {
|
||||
Expect(choices).To(HaveLen(1))
|
||||
Expect(streamedTokens).To(Equal([]string{"Hello", " world"}))
|
||||
})
|
||||
|
||||
It("should pass chat deltas through TokenUsage during streaming", func() {
	// Delta slices seen by the token callback, one element per streamed chunk.
	var perChunk [][]*pb.ChatDelta

	// Stub the inference entry point: only the token callback is used; every
	// other argument is ignored.
	backend.ModelInferenceFunc = func(
		_ context.Context, _ string, _ schema.Messages,
		_, _, _ []string,
		_ *model.ModelLoader, _ *config.ModelConfig, _ *config.ModelConfigLoader,
		_ *config.ApplicationConfig,
		onToken func(string, backend.TokenUsage) bool,
		_, _ string,
		_, _ *int,
		_ map[string]float64,
		_ map[string]string,
	) (func() (backend.LLMResponse, error), error) {
		return func() (backend.LLMResponse, error) {
			if onToken != nil {
				// First chunk: raw reasoning markup plus the pre-parsed reasoning delta,
				// simulating the C++ autoparser.
				onToken("<|channel>thought\nthinking\n<channel|>", backend.TokenUsage{
					Prompt:     5,
					ChatDeltas: []*pb.ChatDelta{{ReasoningContent: "thinking"}},
				})
				// Second chunk: the answer text plus its content delta.
				onToken("Hello!", backend.TokenUsage{
					Prompt:     5,
					Completion: 3,
					ChatDeltas: []*pb.ChatDelta{{Content: "Hello!"}},
				})
			}

			final := backend.LLMResponse{
				Response: "<|channel>thought\nthinking\n<channel|>Hello!",
				Usage:    backend.TokenUsage{Prompt: 5, Completion: 3},
				ChatDeltas: []*pb.ChatDelta{
					{ReasoningContent: "thinking"},
					{Content: "Hello!"},
				},
			}
			return final, nil
		}, nil
	}

	appendChoice := func(text string, out *[]schema.Choice) {
		*out = append(*out, schema.Choice{Text: text})
	}
	// Record the deltas of every chunk that carries any.
	recordDeltas := func(_ string, tu backend.TokenUsage) bool {
		if len(tu.ChatDeltas) > 0 {
			perChunk = append(perChunk, tu.ChatDeltas)
		}
		return true
	}

	choices, _, deltas, err := ComputeChoices(
		makeReq(), "test", cfg, nil, appCfg, nil,
		appendChoice,
		recordDeltas,
	)
	Expect(err).ToNot(HaveOccurred())
	Expect(choices).To(HaveLen(1))

	// Each streamed chunk delivered its own delta slice to the callback.
	Expect(perChunk).To(HaveLen(2))
	Expect(perChunk[0][0].ReasoningContent).To(Equal("thinking"))
	Expect(perChunk[1][0].Content).To(Equal("Hello!"))

	// The accumulated deltas are also returned from ComputeChoices.
	Expect(deltas).To(HaveLen(2))
	Expect(deltas[0].ReasoningContent).To(Equal("thinking"))
	Expect(deltas[1].Content).To(Equal("Hello!"))
})
|
||||
|
||||
It("should prefer chat deltas over raw text when HasChatDeltaContent is true", func() {
	// Counts of callback invocations that did / did not carry chat-delta text;
	// verifies the callback can tell the two kinds of chunk apart.
	var (
		chunksWithDeltas    int
		chunksWithoutDeltas int
	)

	// Stub the inference entry point: only the token callback is used; every
	// other argument is ignored.
	backend.ModelInferenceFunc = func(
		_ context.Context, _ string, _ schema.Messages,
		_, _, _ []string,
		_ *model.ModelLoader, _ *config.ModelConfig, _ *config.ModelConfigLoader,
		_ *config.ApplicationConfig,
		onToken func(string, backend.TokenUsage) bool,
		_, _ string,
		_, _ *int,
		_ map[string]float64,
		_ map[string]string,
	) (func() (backend.LLMResponse, error), error) {
		return func() (backend.LLMResponse, error) {
			if onToken != nil {
				// Chunk that carries a pre-parsed delta (C++ autoparser active).
				parsed := backend.TokenUsage{
					ChatDeltas: []*pb.ChatDelta{{Content: "parsed-content"}},
				}
				onToken("raw-text", parsed)
				// Chunk with no deltas at all (fallback path).
				onToken("fallback-text", backend.TokenUsage{})
			}
			return backend.LLMResponse{Response: "raw-textfallback-text"}, nil
		}, nil
	}

	appendChoice := func(text string, out *[]schema.Choice) {
		*out = append(*out, schema.Choice{Text: text})
	}
	classify := func(_ string, tu backend.TokenUsage) bool {
		if !tu.HasChatDeltaContent() {
			chunksWithoutDeltas++
			return true
		}
		chunksWithDeltas++
		reasoning, content := tu.ChatDeltaReasoningAndContent()
		Expect(content).To(Equal("parsed-content"))
		Expect(reasoning).To(BeEmpty())
		return true
	}

	_, _, _, err := ComputeChoices(
		makeReq(), "test", cfg, nil, appCfg, nil,
		appendChoice,
		classify,
	)
	Expect(err).ToNot(HaveOccurred())
	Expect(chunksWithDeltas).To(Equal(1))
	Expect(chunksWithoutDeltas).To(Equal(1))
})
|
||||
})
|
||||
})
|
||||
|
||||
@@ -1819,7 +1819,14 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
|
||||
|
||||
// If no tool calls detected yet, handle reasoning and text
|
||||
if !inToolCallMode {
|
||||
reasoningDelta, contentDelta := extractor.ProcessToken(token)
|
||||
var reasoningDelta, contentDelta string
|
||||
// Prefer pre-parsed chat deltas from C++ autoparser when available
|
||||
if tokenUsage.HasChatDeltaContent() {
|
||||
reasoningDelta, contentDelta = tokenUsage.ChatDeltaReasoningAndContent()
|
||||
extractor.ProcessToken(token) // keep state consistent
|
||||
} else {
|
||||
reasoningDelta, contentDelta = extractor.ProcessToken(token)
|
||||
}
|
||||
|
||||
// Handle reasoning item
|
||||
if extractor.Reasoning() != "" {
|
||||
@@ -2338,7 +2345,15 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
|
||||
// Stream text deltas with reasoning extraction
|
||||
tokenCallback := func(token string, tokenUsage backend.TokenUsage) bool {
|
||||
accumulatedText += token
|
||||
reasoningDelta, contentDelta := extractor.ProcessToken(token)
|
||||
|
||||
var reasoningDelta, contentDelta string
|
||||
// Prefer pre-parsed chat deltas from C++ autoparser when available
|
||||
if tokenUsage.HasChatDeltaContent() {
|
||||
reasoningDelta, contentDelta = tokenUsage.ChatDeltaReasoningAndContent()
|
||||
extractor.ProcessToken(token) // keep state consistent
|
||||
} else {
|
||||
reasoningDelta, contentDelta = extractor.ProcessToken(token)
|
||||
}
|
||||
|
||||
// Handle reasoning item
|
||||
if extractor.Reasoning() != "" {
|
||||
|
||||
Reference in New Issue
Block a user