mirror of
https://github.com/mudler/LocalAI.git
synced 2026-04-16 12:59:33 -04:00
fix(streaming): skip chat deltas for role-init elements to prevent first token duplication (#9299)
When TASK_RESPONSE_TYPE_OAI_CHAT is used, the first streaming token produces a JSON array with two elements: a role-init chunk and the actual content chunk. The grpc-server loop called attach_chat_deltas for both elements with the same raw_result pointer, stamping the first token's ChatDelta.Content on both replies. The Go side accumulated both, emitting the first content token twice to SSE clients. Fix: in the array iteration loops in PredictStream, detect role-init elements (delta has "role" key) and skip attach_chat_deltas for them. Only content/reasoning elements get chat deltas attached. Reasoning models are unaffected because their first token goes into reasoning_content, not content.
This commit (9748a1cbc6, parent 6bc76dda6d) was committed via GitHub.
@@ -1716,12 +1716,23 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
// Process first result
|
||||
// Process first result.
|
||||
// When TASK_RESPONSE_TYPE_OAI_CHAT is used, the first token may
|
||||
// produce a JSON array with a role-init element followed by the
|
||||
// actual content element. We must only attach chat deltas to the
|
||||
// content element — attaching to both would duplicate the first
|
||||
// token since oaicompat_msg_diffs is the same for both.
|
||||
json first_res_json = first_result->to_json();
|
||||
if (first_res_json.is_array()) {
|
||||
for (const auto & res : first_res_json) {
|
||||
auto reply = build_reply_from_json(res, first_result.get());
|
||||
attach_chat_deltas(reply, first_result.get());
|
||||
// Skip chat deltas for role-init elements (have "role" in
|
||||
// delta but no content/reasoning diffs of their own).
|
||||
bool is_role_init = res.contains("choices") && !res["choices"].empty() &&
|
||||
res["choices"][0].value("delta", json::object()).contains("role");
|
||||
if (!is_role_init) {
|
||||
attach_chat_deltas(reply, first_result.get());
|
||||
}
|
||||
writer->Write(reply);
|
||||
}
|
||||
} else {
|
||||
@@ -1745,7 +1756,11 @@ public:
|
||||
if (res_json.is_array()) {
|
||||
for (const auto & res : res_json) {
|
||||
auto reply = build_reply_from_json(res, result.get());
|
||||
attach_chat_deltas(reply, result.get());
|
||||
bool is_role_init = res.contains("choices") && !res["choices"].empty() &&
|
||||
res["choices"][0].value("delta", json::object()).contains("role");
|
||||
if (!is_role_init) {
|
||||
attach_chat_deltas(reply, result.get());
|
||||
}
|
||||
writer->Write(reply);
|
||||
}
|
||||
} else {
|
||||
|
||||
@@ -978,6 +978,42 @@ parameters:
|
||||
Expect(resp.Choices[0].Message.Content).ToNot(BeEmpty())
|
||||
})
|
||||
|
||||
It("does not duplicate the first content token in streaming chat completions", Label("llama-gguf", "llama-gguf-stream"), func() {
	if runtime.GOOS != "linux" {
		Skip("test supported only on linux")
	}
	stream, err := client.CreateChatCompletionStream(context.TODO(), openai.ChatCompletionRequest{
		Model:    "testmodel.ggml",
		Messages: []openai.ChatCompletionMessage{{Role: "user", Content: testPrompt}},
	})
	Expect(err).ToNot(HaveOccurred())
	defer stream.Close()

	// Gather every non-empty content delta the server streams back.
	var contentParts []string
	for {
		chunk, recvErr := stream.Recv()
		if recvErr == io.EOF {
			break
		}
		Expect(recvErr).ToNot(HaveOccurred())
		if len(chunk.Choices) == 0 {
			continue
		}
		if delta := chunk.Choices[0].Delta.Content; delta != "" {
			contentParts = append(contentParts, delta)
		}
	}

	Expect(contentParts).ToNot(BeEmpty(), "Expected streaming content tokens")
	// Regression guard: a grpc-server.cpp bug stamped the same ChatDelta
	// on both the role-init and content elements of the first streamed
	// array, so the first token showed up twice back-to-back.
	if len(contentParts) >= 2 {
		Expect(contentParts[0]).ToNot(Equal(contentParts[1]),
			"First content token was duplicated: %v", contentParts[:2])
	}
})
|
||||
|
||||
It("returns logprobs in chat completions when requested", func() {
|
||||
if runtime.GOOS != "linux" {
|
||||
Skip("test only on linux")
|
||||
|
||||
Reference in New Issue
Block a user