From c02a50f2ab7ea32e3226212a53a05c795841bef1 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto <mudler@users.noreply.github.com>
Date: Thu, 30 Apr 2026 08:44:43 +0200
Subject: [PATCH] feat(llama-cpp): bump to d775992 and adapt to spec params
 refactor (#9618)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bumps backend/cpp/llama-cpp/Makefile LLAMA_VERSION from 665abc6 to
d775992, picking up upstream PR ggml-org/llama.cpp#22397 which splits
common_params_speculative into nested draft / ngram_simple / ngram_mod
sub-structs. Renames every grpc-server.cpp reference to match:

  speculative.mparams_dft.path  -> speculative.draft.mparams.path
  speculative.{n_max,n_min}     -> speculative.draft.{n_max,n_min}
  speculative.{p_min,p_split}   -> speculative.draft.{p_min,p_split}
  speculative.{n_gpu_layers,n_ctx} -> speculative.draft.{n_gpu_layers,n_ctx}
  speculative.ngram_size_n      -> speculative.ngram_simple.size_n
  speculative.ngram_size_m      -> speculative.ngram_simple.size_m
  speculative.ngram_min_hits    -> speculative.ngram_simple.min_hits

The "speculative.n_max" JSON key sent to the upstream server stays
unchanged — server-task.cpp still reads it and routes the value into
draft.n_max internally.

The turboquant fork (TheTom/llama-cpp-turboquant @ 11a241d) branched
before #22397 and still exposes the flat layout. Since turboquant
reuses the shared backend/cpp/llama-cpp/grpc-server.cpp, extend
patch-grpc-server.sh with an idempotent sed block that maps the ten
renamed field paths back to their legacy flat names on the build copy
only; the original under backend/cpp/llama-cpp/ keeps compiling
against vanilla upstream. Drop the block once the fork rebases past
that PR.

ik-llama-cpp has its own grpc-server.cpp with no speculative refs
(0/2661 lines), so it is unaffected.

Validated locally with `make docker-build-llama-cpp` (avx, avx2,
avx512, fallback, grpc + rpc-server all built; image exported).


Assisted-by: Claude:claude-opus-4-7 [Bash Read Edit]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
---
 backend/cpp/llama-cpp/Makefile              |  2 +-
 backend/cpp/llama-cpp/grpc-server.cpp       | 24 +++++++--------
 backend/cpp/turboquant/patch-grpc-server.sh | 33 ++++++++++++++++++++-
 3 files changed, 45 insertions(+), 14 deletions(-)

diff --git a/backend/cpp/llama-cpp/Makefile b/backend/cpp/llama-cpp/Makefile
index ee164b9e4..d509ddacd 100644
--- a/backend/cpp/llama-cpp/Makefile
+++ b/backend/cpp/llama-cpp/Makefile
@@ -1,5 +1,5 @@
 
-LLAMA_VERSION?=665abc609740d397d30c0d8ef4157dbf900bd1a3
+LLAMA_VERSION?=d77599234ea6e498775aeadbce665eece5bd98cd
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
 
 CMAKE_ARGS?=
diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp
index 6bf144e91..df3d075e7 100644
--- a/backend/cpp/llama-cpp/grpc-server.cpp
+++ b/backend/cpp/llama-cpp/grpc-server.cpp
@@ -442,7 +442,7 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
 
     // Draft model for speculative decoding
     if (!request->draftmodel().empty()) {
-        params.speculative.mparams_dft.path = request->draftmodel();
+        params.speculative.draft.mparams.path = request->draftmodel();
         // Default to draft type if a draft model is set but no explicit type
         if (params.speculative.type == COMMON_SPECULATIVE_TYPE_NONE) {
             params.speculative.type = COMMON_SPECULATIVE_TYPE_DRAFT;
@@ -679,39 +679,39 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
             }
         } else if (!strcmp(optname, "spec_n_max") || !strcmp(optname, "draft_max")) {
             if (optval != NULL) {
-                try { params.speculative.n_max = std::stoi(optval_str); } catch (...) {}
+                try { params.speculative.draft.n_max = std::stoi(optval_str); } catch (...) {}
             }
         } else if (!strcmp(optname, "spec_n_min") || !strcmp(optname, "draft_min")) {
             if (optval != NULL) {
-                try { params.speculative.n_min = std::stoi(optval_str); } catch (...) {}
+                try { params.speculative.draft.n_min = std::stoi(optval_str); } catch (...) {}
             }
         } else if (!strcmp(optname, "spec_p_min") || !strcmp(optname, "draft_p_min")) {
             if (optval != NULL) {
-                try { params.speculative.p_min = std::stof(optval_str); } catch (...) {}
+                try { params.speculative.draft.p_min = std::stof(optval_str); } catch (...) {}
             }
         } else if (!strcmp(optname, "spec_p_split")) {
             if (optval != NULL) {
-                try { params.speculative.p_split = std::stof(optval_str); } catch (...) {}
+                try { params.speculative.draft.p_split = std::stof(optval_str); } catch (...) {}
             }
         } else if (!strcmp(optname, "spec_ngram_size_n") || !strcmp(optname, "ngram_size_n")) {
             if (optval != NULL) {
-                try { params.speculative.ngram_size_n = (uint16_t)std::stoi(optval_str); } catch (...) {}
+                try { params.speculative.ngram_simple.size_n = (uint16_t)std::stoi(optval_str); } catch (...) {}
             }
         } else if (!strcmp(optname, "spec_ngram_size_m") || !strcmp(optname, "ngram_size_m")) {
             if (optval != NULL) {
-                try { params.speculative.ngram_size_m = (uint16_t)std::stoi(optval_str); } catch (...) {}
+                try { params.speculative.ngram_simple.size_m = (uint16_t)std::stoi(optval_str); } catch (...) {}
             }
         } else if (!strcmp(optname, "spec_ngram_min_hits") || !strcmp(optname, "ngram_min_hits")) {
             if (optval != NULL) {
-                try { params.speculative.ngram_min_hits = (uint16_t)std::stoi(optval_str); } catch (...) {}
+                try { params.speculative.ngram_simple.min_hits = (uint16_t)std::stoi(optval_str); } catch (...) {}
             }
         } else if (!strcmp(optname, "draft_gpu_layers")) {
             if (optval != NULL) {
-                try { params.speculative.n_gpu_layers = std::stoi(optval_str); } catch (...) {}
+                try { params.speculative.draft.n_gpu_layers = std::stoi(optval_str); } catch (...) {}
             }
         } else if (!strcmp(optname, "draft_ctx_size")) {
             if (optval != NULL) {
-                try { params.speculative.n_ctx = std::stoi(optval_str); } catch (...) {}
+                try { params.speculative.draft.n_ctx = std::stoi(optval_str); } catch (...) {}
             }
         }
     }
@@ -933,8 +933,8 @@ public:
             if (!params.mmproj.path.empty()) {
                 error_msg += " (with mmproj: " + params.mmproj.path + ")";
             }
-            if (params.speculative.has_dft() && !params.speculative.mparams_dft.path.empty()) {
-                error_msg += " (with draft model: " + params.speculative.mparams_dft.path + ")";
+            if (params.speculative.has_dft() && !params.speculative.draft.mparams.path.empty()) {
+                error_msg += " (with draft model: " + params.speculative.draft.mparams.path + ")";
             }
             
             // Add captured error details if available
diff --git a/backend/cpp/turboquant/patch-grpc-server.sh b/backend/cpp/turboquant/patch-grpc-server.sh
index c3dd967a0..a4c2df62c 100755
--- a/backend/cpp/turboquant/patch-grpc-server.sh
+++ b/backend/cpp/turboquant/patch-grpc-server.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 # Patch the shared backend/cpp/llama-cpp/grpc-server.cpp *copy* used by the
-# turboquant build to account for two gaps between upstream and the fork:
+# turboquant build to account for the gaps between upstream and the fork:
 #
 #   1. Augment the kv_cache_types[] allow-list so `LoadModel` accepts the
 #      fork-specific `turbo2` / `turbo3` / `turbo4` cache types.
@@ -11,6 +11,14 @@
 #      "<__media__>", and Go-side tooling falls back to that sentinel when the
 #      backend does not expose media_marker, so substituting the literal keeps
 #      behavior identical on the turboquant path.
+#   3. Revert the `common_params_speculative` field references to the
+#      pre-refactor flat layout. Upstream ggml-org/llama.cpp#22397 split the
+#      struct into nested sub-structs (`draft`, `ngram_simple`, `ngram_mod`, ...);
+#      the turboquant fork branched before that PR and still exposes the flat
+#      `n_max`, `mparams_dft`, `ngram_size_n`, ... fields. The substitutions
+#      below map the new nested paths back to the legacy flat names so the
+#      shared grpc-server.cpp keeps compiling against the fork's common.h.
+#      Drop this block once the fork rebases past #22397.
 #
 # We patch the *copy* sitting in turboquant-<flavor>-build/, never the original
 # under backend/cpp/llama-cpp/, so the stock llama-cpp build keeps compiling
@@ -77,4 +85,27 @@ else
     echo "==> $SRC has no get_media_marker() call, skipping media-marker patch"
 fi
 
+if grep -q 'params\.speculative\.draft\.\|params\.speculative\.ngram_simple\.' "$SRC"; then
+    echo "==> patching $SRC to revert common_params_speculative refs to pre-#22397 flat layout"
+    # Each -e rewrites one exact post-refactor path to its legacy flat name.
+    # No pattern is a prefix of another and no replacement matches a later
+    # pattern, so order is irrelevant; mparams.path comes first for readability.
+    sed -E \
+        -e 's/params\.speculative\.draft\.mparams\.path/params.speculative.mparams_dft.path/g' \
+        -e 's/params\.speculative\.draft\.n_max/params.speculative.n_max/g' \
+        -e 's/params\.speculative\.draft\.n_min/params.speculative.n_min/g' \
+        -e 's/params\.speculative\.draft\.p_min/params.speculative.p_min/g' \
+        -e 's/params\.speculative\.draft\.p_split/params.speculative.p_split/g' \
+        -e 's/params\.speculative\.draft\.n_gpu_layers/params.speculative.n_gpu_layers/g' \
+        -e 's/params\.speculative\.draft\.n_ctx/params.speculative.n_ctx/g' \
+        -e 's/params\.speculative\.ngram_simple\.size_n/params.speculative.ngram_size_n/g' \
+        -e 's/params\.speculative\.ngram_simple\.size_m/params.speculative.ngram_size_m/g' \
+        -e 's/params\.speculative\.ngram_simple\.min_hits/params.speculative.ngram_min_hits/g' \
+        "$SRC" > "$SRC.tmp"
+    mv "$SRC.tmp" "$SRC"
+    echo "==> speculative field rename OK"
+else
+    echo "==> $SRC has no post-#22397 speculative field refs, skipping spec rename patch"
+fi
+
 echo "==> all patches applied"