From 3db60b57e60962bfe7ac768970efca913b4c111f Mon Sep 17 00:00:00 2001 From: Richard Palethorpe Date: Fri, 24 Apr 2026 13:41:38 +0100 Subject: [PATCH] fix(realtime): consume ChatDeltas when C++ autoparser clears Response (#9538) The llama.cpp C++-side chat autoparser clears Reply.Message and delivers parsed content/reasoning/tool-calls via Reply.chat_deltas. chat.go handles this (non-SSE path uses ToolCallsFromChatDeltas/ContentFromChatDeltas/ ReasoningFromChatDeltas), but realtime.go only read pred.Response, so any model routed through the autoparser (Qwen2.5/3 and friends) produced a silent reply: backend emitted N tokens, the session surface saw zero. Mirror the non-SSE chat path in realtime's triggerResponse: when deltas carry tool calls or content, use them directly; otherwise fall back to the existing raw-text parsing. Assisted-by: claude-opus-4-7-1M [Claude Code] Signed-off-by: Richard Palethorpe --- core/http/endpoints/openai/realtime.go | 32 ++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/core/http/endpoints/openai/realtime.go b/core/http/endpoints/openai/realtime.go index 9867233c4..f02aa7fe0 100644 --- a/core/http/endpoints/openai/realtime.go +++ b/core/http/endpoints/openai/realtime.go @@ -1315,13 +1315,35 @@ func triggerResponse(ctx context.Context, session *Session, conv *Conversation, } thinkingStartToken := reasoning.DetectThinkingStartToken(template, &config.ReasoningConfig) - reasoningText, responseWithoutReasoning := reasoning.ExtractReasoningWithConfig(rawResponse, thinkingStartToken, config.ReasoningConfig) + // When the C++ autoparser emitted ChatDeltas with actionable data, + // prefer them — the backend clears Reply.Message in that path and + // delivers parsed content/reasoning/tool-calls via the delta stream + // (see pkg/functions/chat_deltas.go, mirrored from chat.go's non-SSE + // handling).
Without this, Response is empty and realtime would + // synthesize silence for replies that actually produced tokens. + var reasoningText, responseWithoutReasoning, textContent, cleanedResponse string + var toolCalls []functions.FuncCallResults + deltaToolCalls := functions.ToolCallsFromChatDeltas(pred.ChatDeltas) + deltaContent := functions.ContentFromChatDeltas(pred.ChatDeltas) + deltaReasoning := functions.ReasoningFromChatDeltas(pred.ChatDeltas) + if len(deltaToolCalls) > 0 || deltaContent != "" { + xlog.Debug("[ChatDeltas] realtime: using C++ autoparser deltas", + "tool_calls", len(deltaToolCalls), + "content_len", len(deltaContent), + "reasoning_len", len(deltaReasoning)) + reasoningText = deltaReasoning + responseWithoutReasoning = deltaContent + textContent = deltaContent + cleanedResponse = deltaContent + toolCalls = deltaToolCalls + } else { + reasoningText, responseWithoutReasoning = reasoning.ExtractReasoningWithConfig(rawResponse, thinkingStartToken, config.ReasoningConfig) + textContent = functions.ParseTextContent(responseWithoutReasoning, config.FunctionsConfig) + cleanedResponse = functions.CleanupLLMResult(responseWithoutReasoning, config.FunctionsConfig) + toolCalls = functions.ParseFunctionCall(cleanedResponse, config.FunctionsConfig) + } xlog.Debug("LLM Response", "reasoning", reasoningText, "response_without_reasoning", responseWithoutReasoning) - textContent := functions.ParseTextContent(responseWithoutReasoning, config.FunctionsConfig) - cleanedResponse := functions.CleanupLLMResult(responseWithoutReasoning, config.FunctionsConfig) - toolCalls := functions.ParseFunctionCall(cleanedResponse, config.FunctionsConfig) - xlog.Debug("Function call parsing", "textContent", textContent, "cleanedResponse", cleanedResponse, "toolCallsCount", len(toolCalls)) noActionName := "answer"