From aa80d4681b944777268223ab976fbade7d3ea6e7 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Sun, 31 May 2026 12:26:03 +0200
Subject: [PATCH] chore: :arrow_up: Update ggml-org/llama.cpp to `d6588daa800058dfa54f1d7ea695b1a810c8ae18` (#10093)

* :arrow_up: Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* fix(llama-cpp): skip begin-of-stream null partial in PredictStream

Upstream llama.cpp (ggml-org/llama.cpp#23884), pulled in by this bump, now
emits an initial "begin" partial whose to_json() returns null. It exists
only to signal the HTTP layer to flush 200 status headers before any token
is produced.

gRPC has no such concept, and PredictStream had no guard: the null result
was fed straight into build_reply_from_json, which threw an uncaught
exception. That surfaced as a generic "Unexpected error in RPC handling"
and the task was cancelled the instant it launched, breaking the
PredictStream e2e spec.

Skip null results in both the first-result handling and the streaming
loop, mirroring upstream's own `if (first_result_json == nullptr)` guard.
Signed-off-by: Ettore Di Giacinto Assisted-by: Claude:claude-opus-4-8 [Claude Code] --------- Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Signed-off-by: Ettore Di Giacinto Co-authored-by: mudler <2420543+mudler@users.noreply.github.com> Co-authored-by: Ettore Di Giacinto --- backend/cpp/llama-cpp/Makefile | 2 +- backend/cpp/llama-cpp/grpc-server.cpp | 15 +++++++++++++-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/backend/cpp/llama-cpp/Makefile b/backend/cpp/llama-cpp/Makefile index 0dd9b0c5e..b80e8b99a 100644 --- a/backend/cpp/llama-cpp/Makefile +++ b/backend/cpp/llama-cpp/Makefile @@ -1,5 +1,5 @@ -LLAMA_VERSION?=22d66b567eef11cf2e9832f04db64ee0323a0fd0 +LLAMA_VERSION?=d6588daa800058dfa54f1d7ea695b1a810c8ae18 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp CMAKE_ARGS?= diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp index f8dd48f5a..2ca329134 100644 --- a/backend/cpp/llama-cpp/grpc-server.cpp +++ b/backend/cpp/llama-cpp/grpc-server.cpp @@ -2204,7 +2204,15 @@ public: // content element — attaching to both would duplicate the first // token since oaicompat_msg_diffs is the same for both. json first_res_json = first_result->to_json(); - if (first_res_json.is_array()) { + // Upstream llama.cpp (ggml-org/llama.cpp#23884) now emits an initial + // "begin" partial whose to_json() returns null, used only to signal the + // HTTP layer to flush 200 status headers before any token. gRPC has no + // such concept, so there is nothing to emit — the real tokens arrive in + // the loop below. Feeding this null into build_reply_from_json would + // throw (uncaught) and surface as a generic RPC error. 
+ if (first_res_json.is_null()) { + // skip the begin-of-stream marker + } else if (first_res_json.is_array()) { for (const auto & res : first_res_json) { auto reply = build_reply_from_json(res, first_result.get()); // Skip chat deltas for role-init elements (have "role" in @@ -2234,7 +2242,10 @@ public: } json res_json = result->to_json(); - if (res_json.is_array()) { + if (res_json.is_null()) { + // begin-of-stream marker (see note above) — nothing to emit + continue; + } else if (res_json.is_array()) { for (const auto & res : res_json) { auto reply = build_reply_from_json(res, result.get()); bool is_role_init = res.contains("choices") && !res["choices"].empty() &&