diff --git a/backend/cpp/llama-cpp/Makefile b/backend/cpp/llama-cpp/Makefile
index 0dd9b0c5e..b80e8b99a 100644
--- a/backend/cpp/llama-cpp/Makefile
+++ b/backend/cpp/llama-cpp/Makefile
@@ -1,5 +1,5 @@
 
-LLAMA_VERSION?=22d66b567eef11cf2e9832f04db64ee0323a0fd0
+LLAMA_VERSION?=d6588daa800058dfa54f1d7ea695b1a810c8ae18
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
 
 CMAKE_ARGS?=
diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp
index f8dd48f5a..2ca329134 100644
--- a/backend/cpp/llama-cpp/grpc-server.cpp
+++ b/backend/cpp/llama-cpp/grpc-server.cpp
@@ -2204,7 +2204,15 @@ public:
         // content element — attaching to both would duplicate the first
         // token since oaicompat_msg_diffs is the same for both.
         json first_res_json = first_result->to_json();
-        if (first_res_json.is_array()) {
+        // Upstream llama.cpp (ggml-org/llama.cpp#23884) now emits an initial
+        // "begin" partial whose to_json() returns null, used only to signal the
+        // HTTP layer to flush 200 status headers before any token. gRPC has no
+        // such concept, so there is nothing to emit — the real tokens arrive in
+        // the loop below. Feeding this null into build_reply_from_json would
+        // throw (uncaught) and surface as a generic RPC error.
+        if (first_res_json.is_null()) {
+            // skip the begin-of-stream marker
+        } else if (first_res_json.is_array()) {
             for (const auto & res : first_res_json) {
                 auto reply = build_reply_from_json(res, first_result.get());
                 // Skip chat deltas for role-init elements (have "role" in
@@ -2234,7 +2242,10 @@ public:
             }
 
             json res_json = result->to_json();
-            if (res_json.is_array()) {
+            if (res_json.is_null()) {
+                // begin-of-stream marker (see note above) — nothing to emit
+                continue;
+            } else if (res_json.is_array()) {
                 for (const auto & res : res_json) {
                     auto reply = build_reply_from_json(res, result.get());
                     bool is_role_init = res.contains("choices") && !res["choices"].empty() &&