Mirror of https://github.com/mudler/LocalAI.git, synced 2026-04-29 11:37:40 -04:00
fix(realtime): consume ChatDeltas when C++ autoparser clears Response (#9538)
The llama.cpp C++-side chat autoparser clears Reply.Message and delivers parsed content/reasoning/tool-calls via Reply.chat_deltas. chat.go handles this (the non-SSE path uses ToolCallsFromChatDeltas/ContentFromChatDeltas/ReasoningFromChatDeltas), but realtime.go only read pred.Response, so any model routed through the autoparser (Qwen2.5/3 and friends) produced a silent reply: the backend emitted N tokens, but the session surface saw zero.

Mirror the non-SSE chat path in realtime's triggerResponse: when the deltas carry tool calls or content, use them directly; otherwise fall back to the existing raw-text parsing.

Assisted-by: claude-opus-4-7-1M [Claude Code]
Signed-off-by: Richard Palethorpe <io@richiejp.com>
commit 3db60b57e6
parent 13734ae9fa
committed by GitHub
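
For context, the helpers the fix leans on (ToolCallsFromChatDeltas, ContentFromChatDeltas, ReasoningFromChatDeltas) conceptually fold the per-token deltas emitted by the autoparser back into a single message. The following is a minimal sketch of that folding, assuming a simplified delta shape; the real helpers live in pkg/functions/chat_deltas.go and operate on LocalAI's own types, which are richer than this.

package main

import (
	"fmt"
	"strings"
)

// Simplified stand-ins for the real types; illustrative only.
type ChatDelta struct {
	Content   string
	Reasoning string
	ToolName  string
	ToolArgs  string
}

type FuncCallResults struct {
	Name      string
	Arguments string
}

// contentFromDeltas concatenates the content fragments, mirroring what a
// ContentFromChatDeltas-style helper would return for the non-SSE path.
func contentFromDeltas(deltas []ChatDelta) string {
	var b strings.Builder
	for _, d := range deltas {
		b.WriteString(d.Content)
	}
	return b.String()
}

// toolCallsFromDeltas collects the tool calls carried by the deltas.
func toolCallsFromDeltas(deltas []ChatDelta) []FuncCallResults {
	var calls []FuncCallResults
	for _, d := range deltas {
		if d.ToolName != "" {
			calls = append(calls, FuncCallResults{Name: d.ToolName, Arguments: d.ToolArgs})
		}
	}
	return calls
}

func main() {
	deltas := []ChatDelta{
		{Reasoning: "user asked for weather"},
		{ToolName: "get_weather", ToolArgs: `{"city":"Berlin"}`},
		{Content: "Checking the weather now."},
	}
	fmt.Println(contentFromDeltas(deltas))        // Checking the weather now.
	fmt.Println(len(toolCallsFromDeltas(deltas))) // 1
}
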
@@ -1315,13 +1315,35 @@ func triggerResponse(ctx context.Context, session *Session, conv *Conversation,
 	}
 
 	thinkingStartToken := reasoning.DetectThinkingStartToken(template, &config.ReasoningConfig)
 
-	reasoningText, responseWithoutReasoning := reasoning.ExtractReasoningWithConfig(rawResponse, thinkingStartToken, config.ReasoningConfig)
+	// When the C++ autoparser emitted ChatDeltas with actionable data,
+	// prefer them — the backend clears Reply.Message in that path and
+	// delivers parsed content/reasoning/tool-calls via the delta stream
+	// (see pkg/functions/chat_deltas.go, mirrored from chat.go's non-SSE
+	// handling). Without this, Response is empty and realtime would
+	// synthesize silence for replies that actually produced tokens.
+	var reasoningText, responseWithoutReasoning, textContent, cleanedResponse string
+	var toolCalls []functions.FuncCallResults
+	deltaToolCalls := functions.ToolCallsFromChatDeltas(pred.ChatDeltas)
+	deltaContent := functions.ContentFromChatDeltas(pred.ChatDeltas)
+	deltaReasoning := functions.ReasoningFromChatDeltas(pred.ChatDeltas)
+	if len(deltaToolCalls) > 0 || deltaContent != "" {
+		xlog.Debug("[ChatDeltas] realtime: using C++ autoparser deltas",
+			"tool_calls", len(deltaToolCalls),
+			"content_len", len(deltaContent),
+			"reasoning_len", len(deltaReasoning))
+		reasoningText = deltaReasoning
+		responseWithoutReasoning = deltaContent
+		textContent = deltaContent
+		cleanedResponse = deltaContent
+		toolCalls = deltaToolCalls
+	} else {
+		reasoningText, responseWithoutReasoning = reasoning.ExtractReasoningWithConfig(rawResponse, thinkingStartToken, config.ReasoningConfig)
+		textContent = functions.ParseTextContent(responseWithoutReasoning, config.FunctionsConfig)
+		cleanedResponse = functions.CleanupLLMResult(responseWithoutReasoning, config.FunctionsConfig)
+		toolCalls = functions.ParseFunctionCall(cleanedResponse, config.FunctionsConfig)
+	}
 	xlog.Debug("LLM Response", "reasoning", reasoningText, "response_without_reasoning", responseWithoutReasoning)
-
-	textContent := functions.ParseTextContent(responseWithoutReasoning, config.FunctionsConfig)
-	cleanedResponse := functions.CleanupLLMResult(responseWithoutReasoning, config.FunctionsConfig)
-	toolCalls := functions.ParseFunctionCall(cleanedResponse, config.FunctionsConfig)
 
 	xlog.Debug("Function call parsing", "textContent", textContent, "cleanedResponse", cleanedResponse, "toolCallsCount", len(toolCalls))
 
 	noActionName := "answer"
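
Seen on its own, the new logic reduces to "prefer the autoparser deltas when they carry anything actionable, otherwise parse the raw text as before". Below is a hedged sketch of that decision extracted into a standalone helper; the names and types here are illustrative, not the actual LocalAI API.

package main

import "fmt"

// parsedReply is a hypothetical container for what triggerResponse needs.
type parsedReply struct {
	Reasoning string
	Content   string
	ToolCalls []string
}

// pickReply prefers autoparser deltas when they carry tool calls or content,
// and otherwise falls back to a caller-supplied raw-text parser, the same
// shape as the if/else added to triggerResponse in this commit.
func pickReply(deltaContent, deltaReasoning string, deltaToolCalls []string,
	parseRaw func() parsedReply) parsedReply {
	if len(deltaToolCalls) > 0 || deltaContent != "" {
		return parsedReply{
			Reasoning: deltaReasoning,
			Content:   deltaContent,
			ToolCalls: deltaToolCalls,
		}
	}
	return parseRaw()
}

func main() {
	// Autoparser path: deltas win, raw parsing is never invoked.
	r := pickReply("Hello!", "", nil, func() parsedReply {
		panic("raw parser should not run when deltas carry content")
	})
	fmt.Println(r.Content) // Hello!

	// Legacy path: empty deltas fall back to raw-text parsing.
	r = pickReply("", "", nil, func() parsedReply {
		return parsedReply{Content: "parsed from rawResponse"}
	})
	fmt.Println(r.Content) // parsed from rawResponse
}
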