From 3db60b57e60962bfe7ac768970efca913b4c111f Mon Sep 17 00:00:00 2001 From: Richard Palethorpe Date: Fri, 24 Apr 2026 13:41:38 +0100 Subject: [PATCH] fix(realtime): consume ChatDeltas when C++ autoparser clears Response (#9538) The llama.cpp C++-side chat autoparser clears Reply.Message and delivers parsed content/reasoning/tool-calls via Reply.chat_deltas. chat.go handles this (non-SSE path uses ToolCallsFromChatDeltas/ContentFromChatDeltas/ ReasoningFromChatDeltas), but realtime.go only read pred.Response, so any model routed through the autoparser (Qwen2.5/3 and friends) produced a silent reply: backend emitted N tokens, the session surface saw zero. Mirror the non-SSE chat path in realtime's triggerResponse: when deltas carry tool calls or content, use them directly; otherwise fall back to the existing raw-text parsing. Assisted-by: claude-opus-4-7-1M [Claude Code] Signed-off-by: Richard Palethorpe --- core/http/endpoints/openai/realtime.go | 32 ++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/core/http/endpoints/openai/realtime.go b/core/http/endpoints/openai/realtime.go index 9867233c4..f02aa7fe0 100644 --- a/core/http/endpoints/openai/realtime.go +++ b/core/http/endpoints/openai/realtime.go @@ -1315,13 +1315,35 @@ func triggerResponse(ctx context.Context, session *Session, conv *Conversation, } thinkingStartToken := reasoning.DetectThinkingStartToken(template, &config.ReasoningConfig) - reasoningText, responseWithoutReasoning := reasoning.ExtractReasoningWithConfig(rawResponse, thinkingStartToken, config.ReasoningConfig) + // When the C++ autoparser emitted ChatDeltas with actionable data, + // prefer them — the backend clears Reply.Message in that path and + // delivers parsed content/reasoning/tool-calls via the delta stream + // (see pkg/functions/chat_deltas.go, mirrored from chat.go's non-SSE + // handling).
Without this, Response is empty and realtime would + // synthesize silence for replies that actually produced tokens. + var reasoningText, responseWithoutReasoning, textContent, cleanedResponse string + var toolCalls []functions.FuncCallResults + deltaToolCalls := functions.ToolCallsFromChatDeltas(pred.ChatDeltas) + deltaContent := functions.ContentFromChatDeltas(pred.ChatDeltas) + deltaReasoning := functions.ReasoningFromChatDeltas(pred.ChatDeltas) + if len(deltaToolCalls) > 0 || deltaContent != "" { + xlog.Debug("[ChatDeltas] realtime: using C++ autoparser deltas", + "tool_calls", len(deltaToolCalls), + "content_len", len(deltaContent), + "reasoning_len", len(deltaReasoning)) + reasoningText = deltaReasoning + responseWithoutReasoning = deltaContent + textContent = deltaContent + cleanedResponse = deltaContent + toolCalls = deltaToolCalls + } else { + reasoningText, responseWithoutReasoning = reasoning.ExtractReasoningWithConfig(rawResponse, thinkingStartToken, config.ReasoningConfig) + textContent = functions.ParseTextContent(responseWithoutReasoning, config.FunctionsConfig) + cleanedResponse = functions.CleanupLLMResult(responseWithoutReasoning, config.FunctionsConfig) + toolCalls = functions.ParseFunctionCall(cleanedResponse, config.FunctionsConfig) + } xlog.Debug("LLM Response", "reasoning", reasoningText, "response_without_reasoning", responseWithoutReasoning) - textContent := functions.ParseTextContent(responseWithoutReasoning, config.FunctionsConfig) - cleanedResponse := functions.CleanupLLMResult(responseWithoutReasoning, config.FunctionsConfig) - toolCalls := functions.ParseFunctionCall(cleanedResponse, config.FunctionsConfig) - xlog.Debug("Function call parsing", "textContent", textContent, "cleanedResponse", cleanedResponse, "toolCallsCount", len(toolCalls)) noActionName := "answer"