diff --git a/core/http/endpoints/openai/chat.go b/core/http/endpoints/openai/chat.go
index cf2f05663..04bd28b36 100644
--- a/core/http/endpoints/openai/chat.go
+++ b/core/http/endpoints/openai/chat.go
@@ -174,8 +174,78 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 
 	result := ""
 	lastEmittedCount := 0
+
+	// Track accumulated content for incremental reasoning and content extraction (mirrors process())
+	accumulatedContent := ""
+	lastEmittedReasoning := ""
+	lastEmittedCleanedContent := ""
+	sentInitialRole := false
+
 	_, tokenUsage, chatDeltas, err := ComputeChoices(req, prompt, config, cl, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool {
 		result += s
+		accumulatedContent += s
+
+		// Incremental reasoning extraction — emit reasoning deltas in their own SSE chunks
+		// before any tool-call chunks (OpenAI spec: reasoning and tool_calls never share a delta)
+		currentReasoning, cleanedContent := reason.ExtractReasoningWithConfig(accumulatedContent, thinkingStartToken, config.ReasoningConfig)
+
+		var reasoningDelta *string
+		if currentReasoning != lastEmittedReasoning {
+			if len(currentReasoning) > len(lastEmittedReasoning) && strings.HasPrefix(currentReasoning, lastEmittedReasoning) {
+				newReasoning := currentReasoning[len(lastEmittedReasoning):]
+				reasoningDelta = &newReasoning
+				lastEmittedReasoning = currentReasoning
+			} else if currentReasoning != "" {
+				reasoningDelta = &currentReasoning
+				lastEmittedReasoning = currentReasoning
+			}
+		}
+
+		if reasoningDelta != nil && *reasoningDelta != "" {
+			responses <- schema.OpenAIResponse{
+				ID:      id,
+				Created: created,
+				Model:   req.Model,
+				Choices: []schema.Choice{{
+					Delta: &schema.Message{Reasoning: reasoningDelta},
+					Index: 0,
+				}},
+				Object: "chat.completion.chunk",
+			}
+		}
+
+		// Stream content deltas (cleaned of reasoning tags) while no tool calls
+		// have been detected. Once the incremental parser finds tool calls,
+		// content stops — per OpenAI spec, content and tool_calls don't mix.
+		if lastEmittedCount == 0 && cleanedContent != "" {
+			var deltaContent string
+			if len(cleanedContent) > len(lastEmittedCleanedContent) && strings.HasPrefix(cleanedContent, lastEmittedCleanedContent) {
+				deltaContent = cleanedContent[len(lastEmittedCleanedContent):]
+				lastEmittedCleanedContent = cleanedContent
+			} else if cleanedContent != lastEmittedCleanedContent {
+				deltaContent = cleanedContent
+				lastEmittedCleanedContent = cleanedContent
+			}
+			if deltaContent != "" {
+				if !sentInitialRole {
+					responses <- schema.OpenAIResponse{
+						ID: id, Created: created, Model: req.Model,
+						Choices: []schema.Choice{{Delta: &schema.Message{Role: "assistant"}, Index: 0}},
+						Object:  "chat.completion.chunk",
+					}
+					sentInitialRole = true
+				}
+				responses <- schema.OpenAIResponse{
+					ID: id, Created: created, Model: req.Model,
+					Choices: []schema.Choice{{
+						Delta: &schema.Message{Content: &deltaContent},
+						Index: 0,
+					}},
+					Object: "chat.completion.chunk",
+				}
+			}
+		}
+
 		// Try incremental XML parsing for streaming support using iterative parser
 		// This allows emitting partial tool calls as they're being generated
 		cleanedResult := functions.CleanupLLMResult(result, config.FunctionsConfig)
@@ -306,20 +376,6 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 
 			switch {
 			case noActionToRun:
-				initialMessage := schema.OpenAIResponse{
-					ID:      id,
-					Created: created,
-					Model:   req.Model, // we have to return what the user sent here, due to OpenAI spec.
-					Choices: []schema.Choice{{Delta: &schema.Message{Role: "assistant"}, Index: 0, FinishReason: nil}},
-					Object:  "chat.completion.chunk",
-				}
-				responses <- initialMessage
-
-				result, err := handleQuestion(config, functionResults, result, prompt)
-				if err != nil {
-					xlog.Error("error handling question", "error", err)
-					return err
-				}
 				usage := schema.OpenAIUsage{
 					PromptTokens:     tokenUsage.Prompt,
 					CompletionTokens: tokenUsage.Completion,
@@ -330,25 +386,43 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 					usage.TimingPromptProcessing = tokenUsage.TimingPromptProcessing
 				}
 
-				var deltaReasoning *string
-				if reasoning != "" {
-					deltaReasoning = &reasoning
-				}
-				delta := &schema.Message{Content: &result}
-				if deltaReasoning != nil {
-					delta.Reasoning = deltaReasoning
-				}
+				if sentInitialRole {
+					// Content was already streamed during the callback — just emit usage.
+					delta := &schema.Message{}
+					if reasoning != "" && lastEmittedReasoning == "" {
+						delta.Reasoning = &reasoning
+					}
+					responses <- schema.OpenAIResponse{
+						ID: id, Created: created, Model: req.Model,
+						Choices: []schema.Choice{{Delta: delta, Index: 0}},
+						Object:  "chat.completion.chunk",
+						Usage:   usage,
+					}
+				} else {
+					// Content was NOT streamed — send everything at once (fallback).
+					responses <- schema.OpenAIResponse{
+						ID: id, Created: created, Model: req.Model,
+						Choices: []schema.Choice{{Delta: &schema.Message{Role: "assistant"}, Index: 0}},
+						Object:  "chat.completion.chunk",
+					}
 
-				resp := schema.OpenAIResponse{
-					ID:      id,
-					Created: created,
-					Model:   req.Model, // we have to return what the user sent here, due to OpenAI spec.
-					Choices: []schema.Choice{{Delta: delta, Index: 0, FinishReason: nil}},
-					Object:  "chat.completion.chunk",
-					Usage:   usage,
-				}
+					result, err := handleQuestion(config, functionResults, result, prompt)
+					if err != nil {
+						xlog.Error("error handling question", "error", err)
+						return err
+					}
 
-				responses <- resp
+					delta := &schema.Message{Content: &result}
+					if reasoning != "" {
+						delta.Reasoning = &reasoning
+					}
+					responses <- schema.OpenAIResponse{
+						ID: id, Created: created, Model: req.Model,
+						Choices: []schema.Choice{{Delta: delta, Index: 0}},
+						Object:  "chat.completion.chunk",
+						Usage:   usage,
+					}
+				}
 
 			default:
 				for i, ss := range functionResults {
diff --git a/core/http/endpoints/openresponses/responses.go b/core/http/endpoints/openresponses/responses.go
index dd51e1a36..9b0ae2a23 100644
--- a/core/http/endpoints/openresponses/responses.go
+++ b/core/http/endpoints/openresponses/responses.go
@@ -1737,6 +1737,16 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
 
 	for mcpStreamIter := 0; mcpStreamIter <= mcpStreamMaxIterations; mcpStreamIter++ {
 		if mcpStreamIter > 0 {
+			// Reset reasoning and tool-call state for re-inference so reasoning
+			// extraction runs again on subsequent iterations
+			inToolCallMode = false
+			accumulatedContent = ""
+			lastEmittedReasoning = ""
+			lastEmittedCleanedContent = ""
+			currentMessageID = ""
+			lastEmittedToolCallCount = 0
+			currentReasoningID = ""
+
 			predInput = evaluator.TemplateMessages(*openAIReq, openAIReq.Messages, cfg, funcs, shouldUseFn)
 			xlog.Debug("Open Responses stream MCP re-templating", "iteration", mcpStreamIter)
 			images = images[:0]
diff --git a/core/http/react-ui/src/pages/AgentChat.jsx b/core/http/react-ui/src/pages/AgentChat.jsx
index 757ae74b9..908e2d9af 100644
--- a/core/http/react-ui/src/pages/AgentChat.jsx
+++ b/core/http/react-ui/src/pages/AgentChat.jsx
@@ -104,6 +104,9 @@ export default function AgentChat() {
   const [editingName, setEditingName] = useState(null)
   const [editName, setEditName] = useState('')
   const [chatSearch, setChatSearch] = useState('')
+  const [streamContent, setStreamContent] = useState('')
+  const [streamReasoning, setStreamReasoning] = useState('')
+  const [streamToolCalls, setStreamToolCalls] = useState([])
   const messagesEndRef = useRef(null)
   const messagesRef = useRef(null)
   const textareaRef = useRef(null)
@@ -150,8 +153,41 @@ export default function AgentChat() {
         const data = JSON.parse(e.data)
         if (data.status === 'processing') {
           setProcessingChatId(activeIdRef.current)
+          setStreamContent('')
+          setStreamReasoning('')
+          setStreamToolCalls([])
         } else if (data.status === 'completed') {
           setProcessingChatId(null)
+          setStreamContent('')
+          setStreamReasoning('')
+          setStreamToolCalls([])
+        }
+      } catch (_err) {
+        // ignore
+      }
+    })
+
+    es.addEventListener('stream_event', (e) => {
+      try {
+        const data = JSON.parse(e.data)
+        if (data.type === 'reasoning') {
+          setStreamReasoning(prev => prev + (data.content || ''))
+        } else if (data.type === 'content') {
+          setStreamContent(prev => prev + (data.content || ''))
+        } else if (data.type === 'tool_call') {
+          const name = data.tool_name || ''
+          const args = data.tool_args || ''
+          setStreamToolCalls(prev => {
+            if (name) {
+              return [...prev, { name, args }]
+            }
+            if (prev.length === 0) return prev
+            const updated = [...prev]
+            updated[updated.length - 1] = { ...updated[updated.length - 1], args: updated[updated.length - 1].args + args }
+            return updated
+          })
+        } else if (data.type === 'done') {
+          // Content will be finalized by json_message event
         }
       } catch (_err) {
         // ignore
@@ -192,7 +228,7 @@ export default function AgentChat() {
   // Auto-scroll to bottom
   useEffect(() => {
     messagesEndRef.current?.scrollIntoView({ behavior: 'smooth' })
-  }, [messages])
+  }, [messages, streamContent, streamReasoning, streamToolCalls])
 
   // Highlight code blocks
   useEffect(() => {
@@ -537,7 +573,50 @@ export default function AgentChat() {
                     flushSystem('end')
                     return elements
                   })()}
-                  {processing && (
+                  {processing && (streamReasoning || streamContent || streamToolCalls.length > 0) && (
+