From 9748a1cbc63178233fca8d170f424e0f38cb5dbf Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 10 Apr 2026 08:45:47 +0200 Subject: [PATCH] fix(streaming): skip chat deltas for role-init elements to prevent first token duplication (#9299) When TASK_RESPONSE_TYPE_OAI_CHAT is used, the first streaming token produces a JSON array with two elements: a role-init chunk and the actual content chunk. The grpc-server loop called attach_chat_deltas for both elements with the same raw_result pointer, stamping the first token's ChatDelta.Content on both replies. The Go side accumulated both, emitting the first content token twice to SSE clients. Fix: in the array iteration loops in PredictStream, detect role-init elements (delta has "role" key) and skip attach_chat_deltas for them. Only content/reasoning elements get chat deltas attached. Reasoning models are unaffected because their first token goes into reasoning_content, not content. --- backend/cpp/llama-cpp/grpc-server.cpp | 21 +++++++++++++--- core/http/app_test.go | 36 +++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 3 deletions(-) diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp index b3f32b575..3ba9fffeb 100644 --- a/backend/cpp/llama-cpp/grpc-server.cpp +++ b/backend/cpp/llama-cpp/grpc-server.cpp @@ -1716,12 +1716,23 @@ public: } }; - // Process first result + // Process first result. + // When TASK_RESPONSE_TYPE_OAI_CHAT is used, the first token may + // produce a JSON array with a role-init element followed by the + // actual content element. We must only attach chat deltas to the + // content element — attaching to both would duplicate the first + // token since oaicompat_msg_diffs is the same for both. json first_res_json = first_result->to_json(); if (first_res_json.is_array()) { for (const auto & res : first_res_json) { auto reply = build_reply_from_json(res, first_result.get()); - attach_chat_deltas(reply, first_result.get()); + // Skip chat deltas for role-init elements (have "role" in + // delta but no content/reasoning diffs of their own). + bool is_role_init = res.contains("choices") && !res["choices"].empty() && + res["choices"][0].value("delta", json::object()).contains("role"); + if (!is_role_init) { + attach_chat_deltas(reply, first_result.get()); + } writer->Write(reply); } } else { @@ -1745,7 +1756,11 @@ public: if (res_json.is_array()) { for (const auto & res : res_json) { auto reply = build_reply_from_json(res, result.get()); - attach_chat_deltas(reply, result.get()); + bool is_role_init = res.contains("choices") && !res["choices"].empty() && + res["choices"][0].value("delta", json::object()).contains("role"); + if (!is_role_init) { + attach_chat_deltas(reply, result.get()); + } writer->Write(reply); } } else { diff --git a/core/http/app_test.go b/core/http/app_test.go index ea7917a13..74220855c 100644 --- a/core/http/app_test.go +++ b/core/http/app_test.go @@ -978,6 +978,42 @@ parameters: Expect(resp.Choices[0].Message.Content).ToNot(BeEmpty()) }) + It("does not duplicate the first content token in streaming chat completions", Label("llama-gguf", "llama-gguf-stream"), func() { + if runtime.GOOS != "linux" { + Skip("test supported only on linux") + } + stream, err := client.CreateChatCompletionStream(context.TODO(), openai.ChatCompletionRequest{ + Model: "testmodel.ggml", + Messages: []openai.ChatCompletionMessage{{Role: "user", Content: testPrompt}}, + }) + Expect(err).ToNot(HaveOccurred()) + defer stream.Close() + + var contentParts []string + for { + chunk, err := stream.Recv() + if err == io.EOF { + break + } + Expect(err).ToNot(HaveOccurred()) + if len(chunk.Choices) > 0 { + delta := chunk.Choices[0].Delta.Content + if delta != "" { + contentParts = append(contentParts, delta) + } + } + } + + Expect(contentParts).ToNot(BeEmpty(), "Expected streaming content tokens") + // The first content token should appear exactly once. + // A bug in grpc-server.cpp caused the role-init array element + // to get the same ChatDelta stamped, duplicating the first token. + if len(contentParts) >= 2 { + Expect(contentParts[0]).ToNot(Equal(contentParts[1]), + "First content token was duplicated: %v", contentParts[:2]) + } + }) + It("returns logprobs in chat completions when requested", func() { if runtime.GOOS != "linux" { Skip("test only on linux")