mirror of
https://github.com/mudler/LocalAI.git
synced 2026-03-31 21:25:59 -04:00
feat: support streaming mode for tool calls in agent mode, fix interleaved thinking stream (#9023)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
committed by
GitHub
parent
b2030255ca
commit
5fd42399d4
@@ -174,8 +174,78 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
|
||||
|
||||
result := ""
|
||||
lastEmittedCount := 0
|
||||
|
||||
// Track accumulated content for incremental reasoning and content extraction (mirrors process())
|
||||
accumulatedContent := ""
|
||||
lastEmittedReasoning := ""
|
||||
lastEmittedCleanedContent := ""
|
||||
sentInitialRole := false
|
||||
|
||||
_, tokenUsage, chatDeltas, err := ComputeChoices(req, prompt, config, cl, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool {
|
||||
result += s
|
||||
accumulatedContent += s
|
||||
|
||||
// Incremental reasoning extraction — emit reasoning deltas in their own SSE chunks
|
||||
// before any tool-call chunks (OpenAI spec: reasoning and tool_calls never share a delta)
|
||||
currentReasoning, cleanedContent := reason.ExtractReasoningWithConfig(accumulatedContent, thinkingStartToken, config.ReasoningConfig)
|
||||
|
||||
var reasoningDelta *string
|
||||
if currentReasoning != lastEmittedReasoning {
|
||||
if len(currentReasoning) > len(lastEmittedReasoning) && strings.HasPrefix(currentReasoning, lastEmittedReasoning) {
|
||||
newReasoning := currentReasoning[len(lastEmittedReasoning):]
|
||||
reasoningDelta = &newReasoning
|
||||
lastEmittedReasoning = currentReasoning
|
||||
} else if currentReasoning != "" {
|
||||
reasoningDelta = ¤tReasoning
|
||||
lastEmittedReasoning = currentReasoning
|
||||
}
|
||||
}
|
||||
|
||||
if reasoningDelta != nil && *reasoningDelta != "" {
|
||||
responses <- schema.OpenAIResponse{
|
||||
ID: id,
|
||||
Created: created,
|
||||
Model: req.Model,
|
||||
Choices: []schema.Choice{{
|
||||
Delta: &schema.Message{Reasoning: reasoningDelta},
|
||||
Index: 0,
|
||||
}},
|
||||
Object: "chat.completion.chunk",
|
||||
}
|
||||
}
|
||||
|
||||
// Stream content deltas (cleaned of reasoning tags) while no tool calls
|
||||
// have been detected. Once the incremental parser finds tool calls,
|
||||
// content stops — per OpenAI spec, content and tool_calls don't mix.
|
||||
if lastEmittedCount == 0 && cleanedContent != "" {
|
||||
var deltaContent string
|
||||
if len(cleanedContent) > len(lastEmittedCleanedContent) && strings.HasPrefix(cleanedContent, lastEmittedCleanedContent) {
|
||||
deltaContent = cleanedContent[len(lastEmittedCleanedContent):]
|
||||
lastEmittedCleanedContent = cleanedContent
|
||||
} else if cleanedContent != lastEmittedCleanedContent {
|
||||
deltaContent = cleanedContent
|
||||
lastEmittedCleanedContent = cleanedContent
|
||||
}
|
||||
if deltaContent != "" {
|
||||
if !sentInitialRole {
|
||||
responses <- schema.OpenAIResponse{
|
||||
ID: id, Created: created, Model: req.Model,
|
||||
Choices: []schema.Choice{{Delta: &schema.Message{Role: "assistant"}, Index: 0}},
|
||||
Object: "chat.completion.chunk",
|
||||
}
|
||||
sentInitialRole = true
|
||||
}
|
||||
responses <- schema.OpenAIResponse{
|
||||
ID: id, Created: created, Model: req.Model,
|
||||
Choices: []schema.Choice{{
|
||||
Delta: &schema.Message{Content: &deltaContent},
|
||||
Index: 0,
|
||||
}},
|
||||
Object: "chat.completion.chunk",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Try incremental XML parsing for streaming support using iterative parser
|
||||
// This allows emitting partial tool calls as they're being generated
|
||||
cleanedResult := functions.CleanupLLMResult(result, config.FunctionsConfig)
|
||||
@@ -306,20 +376,6 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
|
||||
|
||||
switch {
|
||||
case noActionToRun:
|
||||
initialMessage := schema.OpenAIResponse{
|
||||
ID: id,
|
||||
Created: created,
|
||||
Model: req.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||||
Choices: []schema.Choice{{Delta: &schema.Message{Role: "assistant"}, Index: 0, FinishReason: nil}},
|
||||
Object: "chat.completion.chunk",
|
||||
}
|
||||
responses <- initialMessage
|
||||
|
||||
result, err := handleQuestion(config, functionResults, result, prompt)
|
||||
if err != nil {
|
||||
xlog.Error("error handling question", "error", err)
|
||||
return err
|
||||
}
|
||||
usage := schema.OpenAIUsage{
|
||||
PromptTokens: tokenUsage.Prompt,
|
||||
CompletionTokens: tokenUsage.Completion,
|
||||
@@ -330,25 +386,43 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
|
||||
usage.TimingPromptProcessing = tokenUsage.TimingPromptProcessing
|
||||
}
|
||||
|
||||
var deltaReasoning *string
|
||||
if reasoning != "" {
|
||||
deltaReasoning = &reasoning
|
||||
}
|
||||
delta := &schema.Message{Content: &result}
|
||||
if deltaReasoning != nil {
|
||||
delta.Reasoning = deltaReasoning
|
||||
}
|
||||
if sentInitialRole {
|
||||
// Content was already streamed during the callback — just emit usage.
|
||||
delta := &schema.Message{}
|
||||
if reasoning != "" && lastEmittedReasoning == "" {
|
||||
delta.Reasoning = &reasoning
|
||||
}
|
||||
responses <- schema.OpenAIResponse{
|
||||
ID: id, Created: created, Model: req.Model,
|
||||
Choices: []schema.Choice{{Delta: delta, Index: 0}},
|
||||
Object: "chat.completion.chunk",
|
||||
Usage: usage,
|
||||
}
|
||||
} else {
|
||||
// Content was NOT streamed — send everything at once (fallback).
|
||||
responses <- schema.OpenAIResponse{
|
||||
ID: id, Created: created, Model: req.Model,
|
||||
Choices: []schema.Choice{{Delta: &schema.Message{Role: "assistant"}, Index: 0}},
|
||||
Object: "chat.completion.chunk",
|
||||
}
|
||||
|
||||
resp := schema.OpenAIResponse{
|
||||
ID: id,
|
||||
Created: created,
|
||||
Model: req.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||||
Choices: []schema.Choice{{Delta: delta, Index: 0, FinishReason: nil}},
|
||||
Object: "chat.completion.chunk",
|
||||
Usage: usage,
|
||||
}
|
||||
result, err := handleQuestion(config, functionResults, result, prompt)
|
||||
if err != nil {
|
||||
xlog.Error("error handling question", "error", err)
|
||||
return err
|
||||
}
|
||||
|
||||
responses <- resp
|
||||
delta := &schema.Message{Content: &result}
|
||||
if reasoning != "" {
|
||||
delta.Reasoning = &reasoning
|
||||
}
|
||||
responses <- schema.OpenAIResponse{
|
||||
ID: id, Created: created, Model: req.Model,
|
||||
Choices: []schema.Choice{{Delta: delta, Index: 0}},
|
||||
Object: "chat.completion.chunk",
|
||||
Usage: usage,
|
||||
}
|
||||
}
|
||||
|
||||
default:
|
||||
for i, ss := range functionResults {
|
||||
|
||||
@@ -1737,6 +1737,16 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
|
||||
|
||||
for mcpStreamIter := 0; mcpStreamIter <= mcpStreamMaxIterations; mcpStreamIter++ {
|
||||
if mcpStreamIter > 0 {
|
||||
// Reset reasoning and tool-call state for re-inference so reasoning
|
||||
// extraction runs again on subsequent iterations
|
||||
inToolCallMode = false
|
||||
accumulatedContent = ""
|
||||
lastEmittedReasoning = ""
|
||||
lastEmittedCleanedContent = ""
|
||||
currentMessageID = ""
|
||||
lastEmittedToolCallCount = 0
|
||||
currentReasoningID = ""
|
||||
|
||||
predInput = evaluator.TemplateMessages(*openAIReq, openAIReq.Messages, cfg, funcs, shouldUseFn)
|
||||
xlog.Debug("Open Responses stream MCP re-templating", "iteration", mcpStreamIter)
|
||||
images = images[:0]
|
||||
|
||||
@@ -104,6 +104,9 @@ export default function AgentChat() {
|
||||
const [editingName, setEditingName] = useState(null)
|
||||
const [editName, setEditName] = useState('')
|
||||
const [chatSearch, setChatSearch] = useState('')
|
||||
const [streamContent, setStreamContent] = useState('')
|
||||
const [streamReasoning, setStreamReasoning] = useState('')
|
||||
const [streamToolCalls, setStreamToolCalls] = useState([])
|
||||
const messagesEndRef = useRef(null)
|
||||
const messagesRef = useRef(null)
|
||||
const textareaRef = useRef(null)
|
||||
@@ -150,8 +153,41 @@ export default function AgentChat() {
|
||||
const data = JSON.parse(e.data)
|
||||
if (data.status === 'processing') {
|
||||
setProcessingChatId(activeIdRef.current)
|
||||
setStreamContent('')
|
||||
setStreamReasoning('')
|
||||
setStreamToolCalls([])
|
||||
} else if (data.status === 'completed') {
|
||||
setProcessingChatId(null)
|
||||
setStreamContent('')
|
||||
setStreamReasoning('')
|
||||
setStreamToolCalls([])
|
||||
}
|
||||
} catch (_err) {
|
||||
// ignore
|
||||
}
|
||||
})
|
||||
|
||||
es.addEventListener('stream_event', (e) => {
|
||||
try {
|
||||
const data = JSON.parse(e.data)
|
||||
if (data.type === 'reasoning') {
|
||||
setStreamReasoning(prev => prev + (data.content || ''))
|
||||
} else if (data.type === 'content') {
|
||||
setStreamContent(prev => prev + (data.content || ''))
|
||||
} else if (data.type === 'tool_call') {
|
||||
const name = data.tool_name || ''
|
||||
const args = data.tool_args || ''
|
||||
setStreamToolCalls(prev => {
|
||||
if (name) {
|
||||
return [...prev, { name, args }]
|
||||
}
|
||||
if (prev.length === 0) return prev
|
||||
const updated = [...prev]
|
||||
updated[updated.length - 1] = { ...updated[updated.length - 1], args: updated[updated.length - 1].args + args }
|
||||
return updated
|
||||
})
|
||||
} else if (data.type === 'done') {
|
||||
// Content will be finalized by json_message event
|
||||
}
|
||||
} catch (_err) {
|
||||
// ignore
|
||||
@@ -192,7 +228,7 @@ export default function AgentChat() {
|
||||
// Auto-scroll to bottom
|
||||
useEffect(() => {
|
||||
messagesEndRef.current?.scrollIntoView({ behavior: 'smooth' })
|
||||
}, [messages])
|
||||
}, [messages, streamContent, streamReasoning, streamToolCalls])
|
||||
|
||||
// Highlight code blocks
|
||||
useEffect(() => {
|
||||
@@ -537,7 +573,50 @@ export default function AgentChat() {
|
||||
flushSystem('end')
|
||||
return elements
|
||||
})()}
|
||||
{processing && (
|
||||
{processing && (streamReasoning || streamContent || streamToolCalls.length > 0) && (
|
||||
<div className="chat-message chat-message-assistant">
|
||||
<div className="chat-message-avatar">
|
||||
<i className="fas fa-robot" />
|
||||
</div>
|
||||
<div className="chat-message-bubble">
|
||||
{streamReasoning && (
|
||||
<details className="chat-activity-group" open={!streamContent} style={{ marginBottom: streamContent ? 'var(--spacing-sm)' : 0 }}>
|
||||
<summary className="chat-activity-toggle" style={{ cursor: 'pointer' }}>
|
||||
<span className={`chat-activity-summary${!streamContent ? ' chat-activity-shimmer' : ''}`}>
|
||||
{streamContent ? 'Thinking' : 'Thinking...'}
|
||||
</span>
|
||||
</summary>
|
||||
<div className="chat-activity-details">
|
||||
<div className="chat-activity-item chat-activity-thinking">
|
||||
<div className="chat-activity-item-content chat-activity-live"
|
||||
dangerouslySetInnerHTML={{ __html: renderMarkdown(streamReasoning) }} />
|
||||
</div>
|
||||
</div>
|
||||
</details>
|
||||
)}
|
||||
{streamToolCalls.length > 0 && !streamContent && (
|
||||
<div className="chat-activity-group" style={{ marginBottom: 'var(--spacing-sm)' }}>
|
||||
{streamToolCalls.map((tc, idx) => (
|
||||
<div key={idx} className="chat-activity-item chat-activity-tool-call" style={{ padding: 'var(--spacing-xs) var(--spacing-sm)' }}>
|
||||
<span className="chat-activity-item-label">
|
||||
<i className="fas fa-bolt" style={{ marginRight: 'var(--spacing-xs)' }} />
|
||||
{tc.name}
|
||||
</span>
|
||||
<span style={{ opacity: 0.5, fontSize: '0.85em', marginLeft: 'var(--spacing-xs)' }}>calling...</span>
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
)}
|
||||
{streamContent && (
|
||||
<div className="chat-message-content">
|
||||
<span dangerouslySetInnerHTML={{ __html: renderMarkdown(streamContent) }} />
|
||||
<span className="chat-streaming-cursor" />
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
{processing && !streamReasoning && !streamContent && streamToolCalls.length === 0 && (
|
||||
<div className="chat-message chat-message-assistant">
|
||||
<div className="chat-message-avatar" style={{ background: 'var(--color-bg-tertiary)', color: 'var(--color-text-muted)' }}>
|
||||
<i className="fas fa-cogs" />
|
||||
|
||||
@@ -8,6 +8,9 @@ const traceColors = {
|
||||
tool_call: { bg: 'rgba(139,92,246,0.1)', border: 'rgba(139,92,246,0.3)', icon: 'fa-wrench', color: 'var(--color-accent)' },
|
||||
tool_result: { bg: 'rgba(34,197,94,0.1)', border: 'rgba(34,197,94,0.3)', icon: 'fa-check', color: 'var(--color-success)' },
|
||||
status: { bg: 'rgba(245,158,11,0.1)', border: 'rgba(245,158,11,0.3)', icon: 'fa-info-circle', color: 'var(--color-warning)' },
|
||||
stream_reasoning: { bg: 'rgba(99,102,241,0.06)', border: 'rgba(99,102,241,0.2)', icon: 'fa-lightbulb', color: 'var(--color-primary)' },
|
||||
stream_content: { bg: 'rgba(59,130,246,0.08)', border: 'rgba(59,130,246,0.25)', icon: 'fa-pen-nib', color: 'var(--color-info, #3b82f6)' },
|
||||
stream_tool_call: { bg: 'rgba(139,92,246,0.06)', border: 'rgba(139,92,246,0.2)', icon: 'fa-bolt', color: 'var(--color-accent)' },
|
||||
}
|
||||
|
||||
function TraceCard({ trace, index }) {
|
||||
|
||||
@@ -887,6 +887,35 @@ func (s *AgentJobService) executeJobInternal(job schema.Job, task schema.Task, c
|
||||
job.Traces = append(job.Traces, trace)
|
||||
s.jobs.Set(job.ID, job)
|
||||
}),
|
||||
cogito.WithStreamCallback(func(ev cogito.StreamEvent) {
|
||||
switch ev.Type {
|
||||
case cogito.StreamEventReasoning:
|
||||
trace := schema.JobTrace{
|
||||
Type: "stream_reasoning",
|
||||
Content: ev.Content,
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
job.Traces = append(job.Traces, trace)
|
||||
s.jobs.Set(job.ID, job)
|
||||
case cogito.StreamEventContent:
|
||||
trace := schema.JobTrace{
|
||||
Type: "stream_content",
|
||||
Content: ev.Content,
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
job.Traces = append(job.Traces, trace)
|
||||
s.jobs.Set(job.ID, job)
|
||||
case cogito.StreamEventToolCall:
|
||||
trace := schema.JobTrace{
|
||||
Type: "stream_tool_call",
|
||||
Content: ev.ToolArgs,
|
||||
ToolName: ev.ToolName,
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
job.Traces = append(job.Traces, trace)
|
||||
s.jobs.Set(job.ID, job)
|
||||
}
|
||||
}),
|
||||
)
|
||||
|
||||
// Execute tools
|
||||
|
||||
4
go.mod
4
go.mod
@@ -31,7 +31,7 @@ require (
|
||||
github.com/mholt/archiver/v3 v3.5.1
|
||||
github.com/microcosm-cc/bluemonday v1.0.27
|
||||
github.com/modelcontextprotocol/go-sdk v1.4.0
|
||||
github.com/mudler/cogito v0.9.5-0.20260313170202-42271c7e1a6b
|
||||
github.com/mudler/cogito v0.9.5-0.20260315222927-63abdec7189b
|
||||
github.com/mudler/edgevpn v0.31.1
|
||||
github.com/mudler/go-processmanager v0.1.0
|
||||
github.com/mudler/memory v0.0.0-20251216220809-d1256471a6c2
|
||||
@@ -128,7 +128,7 @@ require (
|
||||
github.com/kevinburke/ssh_config v1.2.0 // indirect
|
||||
github.com/labstack/gommon v0.4.2 // indirect
|
||||
github.com/mschoch/smat v0.2.0 // indirect
|
||||
github.com/mudler/LocalAGI v0.0.0-20260314222828-e38f13ab8cec
|
||||
github.com/mudler/LocalAGI v0.0.0-20260315223407-da286065e126
|
||||
github.com/mudler/localrecall v0.5.9-0.20260314221856-96d63875cc47 // indirect
|
||||
github.com/mudler/skillserver v0.0.5-0.20260221145827-0639a82c8f49
|
||||
github.com/olekukonko/tablewriter v0.0.5 // indirect
|
||||
|
||||
8
go.sum
8
go.sum
@@ -654,10 +654,10 @@ github.com/mr-tron/base58 v1.2.0 h1:T/HDJBh4ZCPbU39/+c3rRvE0uKBQlU27+QI8LJ4t64o=
|
||||
github.com/mr-tron/base58 v1.2.0/go.mod h1:BinMc/sQntlIE1frQmRFPUoPA1Zkr8VRgBdjWI2mNwc=
|
||||
github.com/mschoch/smat v0.2.0 h1:8imxQsjDm8yFEAVBe7azKmKSgzSkZXDuKkSq9374khM=
|
||||
github.com/mschoch/smat v0.2.0/go.mod h1:kc9mz7DoBKqDyiRL7VZN8KvXQMWeTaVnttLRXOlotKw=
|
||||
github.com/mudler/LocalAGI v0.0.0-20260314222828-e38f13ab8cec h1:Y6JYhfJidFktfmQC00SwHtQVh0lr0O52qihgTKddSNU=
|
||||
github.com/mudler/LocalAGI v0.0.0-20260314222828-e38f13ab8cec/go.mod h1:yf+IlZzQCGgKPGFn5yclzA2Dxxhy75K3YDubkjCub04=
|
||||
github.com/mudler/cogito v0.9.5-0.20260313170202-42271c7e1a6b h1:Hs2Byjnukgkwm5Vw7z5aSZEjznPHaxjr2vLc5Uu1fHQ=
|
||||
github.com/mudler/cogito v0.9.5-0.20260313170202-42271c7e1a6b/go.mod h1:6sfja3lcu2nWRzEc0wwqGNu/eCG3EWgij+8s7xyUeQ4=
|
||||
github.com/mudler/LocalAGI v0.0.0-20260315223407-da286065e126 h1:7owcRhvwMP5BDDPsoK8TLnOBY4khcGJXutMY7pe9lhc=
|
||||
github.com/mudler/LocalAGI v0.0.0-20260315223407-da286065e126/go.mod h1:w8kG2r/TlADJ4SnYemPNirW1pdHsqM/RAdCPk9r5Ll0=
|
||||
github.com/mudler/cogito v0.9.5-0.20260315222927-63abdec7189b h1:A74T2Lauvg61KodYqsjTYDY05kPLcW+efVZjd23dghU=
|
||||
github.com/mudler/cogito v0.9.5-0.20260315222927-63abdec7189b/go.mod h1:6sfja3lcu2nWRzEc0wwqGNu/eCG3EWgij+8s7xyUeQ4=
|
||||
github.com/mudler/edgevpn v0.31.1 h1:7qegiDWd0kAg6ljhNHxqvp8hbo/6BbzSdbb7/2WZfiY=
|
||||
github.com/mudler/edgevpn v0.31.1/go.mod h1:ftV5B0nKFzm4R8vR80UYnCb2nf7lxCRgAALxUEEgCf8=
|
||||
github.com/mudler/go-piper v0.0.0-20241023091659-2494246fd9fc h1:RxwneJl1VgvikiX28EkpdAyL4yQVnJMrbquKospjHyA=
|
||||
|
||||
Reference in New Issue
Block a user