From 53bdb18d1068ba622b9555fc41fa91b51fb44fdb Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Thu, 14 May 2026 08:53:23 +0200
Subject: [PATCH] chore: :arrow_up: Update ggml-org/llama.cpp to
 `7f3f843c31cd32dc4adc10b393342dfee071c332` (#9809)

* :arrow_up: Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* fix(llama-cpp): adapt to upstream COMMON_SPECULATIVE_TYPE_DRAFT rename

ggml-org/llama.cpp#22964 ("spec: update CLI arguments for better
consistency") renamed the speculative type enum values:
  COMMON_SPECULATIVE_TYPE_DRAFT  -> COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE
  COMMON_SPECULATIVE_TYPE_EAGLE3 -> COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3
and the registered name strings flipped from underscore- to dash-
separated form (e.g. ngram_simple -> ngram-simple), with the bare
draft/eagle3 aliases replaced by draft-simple/draft-eagle3.

This broke the build with the new LLAMA_VERSION on every variant
(vulkan/arm64, darwin and likely all the rest) at grpc-server.cpp:461.

Update the upstream branch of the speculative-type fallback to use the
new identifier (the LOCALAI_LEGACY_LLAMA_CPP_SPEC fork branch keeps the
old name), and normalize spec_type option tokens before passing them to
common_speculative_types_from_names so existing model configs that say
spec_type:draft / spec_type:ngram_simple keep working.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: claude-code:claude-opus-4-7

---------

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
---
 backend/cpp/llama-cpp/Makefile        |  2 +-
 backend/cpp/llama-cpp/grpc-server.cpp | 20 +++++++++++++++++---
 2 files changed, 18 insertions(+), 4 deletions(-)
diff --git a/backend/cpp/llama-cpp/Makefile b/backend/cpp/llama-cpp/Makefile
index 865f9481f..b3a27fd4b 100644
--- a/backend/cpp/llama-cpp/Makefile
+++ b/backend/cpp/llama-cpp/Makefile
@@ -1,5 +1,5 @@
 
-LLAMA_VERSION?=a9883db8ee021cf16783016a60996d41820b5195
+LLAMA_VERSION?=7f3f843c31cd32dc4adc10b393342dfee071c332
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
 
 CMAKE_ARGS?=
diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp
index 5c1d07766..6c4fa6946 100644
--- a/backend/cpp/llama-cpp/grpc-server.cpp
+++ b/backend/cpp/llama-cpp/grpc-server.cpp
@@ -32,6 +32,7 @@
 #include <grpcpp/health_check_service_interface.h>
 #include <grpcpp/security/server_credentials.h>
 #include <regex>
+#include <algorithm>
 #include <atomic>
 #include <cstdlib>
 #include <fstream>
@@ -450,6 +451,8 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
         // vector; the turboquant fork still uses the legacy scalar. The
         // LOCALAI_LEGACY_LLAMA_CPP_SPEC macro is injected by
         // backend/cpp/turboquant/patch-grpc-server.sh for fork builds only.
+        // Upstream renamed COMMON_SPECULATIVE_TYPE_DRAFT -> ..._DRAFT_SIMPLE
+        // in ggml-org/llama.cpp#22964; the fork still uses the old name.
 #ifdef LOCALAI_LEGACY_LLAMA_CPP_SPEC
         if (params.speculative.type == COMMON_SPECULATIVE_TYPE_NONE) {
             params.speculative.type = COMMON_SPECULATIVE_TYPE_DRAFT;
@@ -458,7 +461,7 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
         const bool no_spec_type = params.speculative.types.empty() ||
             (params.speculative.types.size() == 1 && params.speculative.types[0] == COMMON_SPECULATIVE_TYPE_NONE);
         if (no_spec_type) {
-            params.speculative.types = { COMMON_SPECULATIVE_TYPE_DRAFT };
+            params.speculative.types = { COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE };
         }
 #endif
     }
@@ -701,16 +704,27 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
             // Upstream switched to a vector of types (comma-separated for multi-type
             // chaining via common_speculative_types_from_names). We keep accepting a
             // single value here, but also tolerate comma-separated lists.
+            //
+            // ggml-org/llama.cpp#22964 also renamed the registered names from
+            // underscore- to dash-separated form, and replaced the bare
+            // `draft`/`eagle3` aliases with `draft-simple`/`draft-eagle3`. We
+            // normalize each token here so existing model configs keep working.
+            auto normalize_spec_name = [](std::string s) -> std::string {
+                std::replace(s.begin(), s.end(), '_', '-');
+                if (s == "draft")  return "draft-simple";
+                if (s == "eagle3") return "draft-eagle3";
+                return s;
+            };
             std::vector<std::string> names;
             std::string item;
             for (char c : optval_str) {
                 if (c == ',') {
-                    if (!item.empty()) { names.push_back(item); item.clear(); }
+                    if (!item.empty()) { names.push_back(normalize_spec_name(item)); item.clear(); }
                 } else {
                     item.push_back(c);
                 }
             }
-            if (!item.empty()) names.push_back(item);
+            if (!item.empty()) names.push_back(normalize_spec_name(item));
             auto parsed = common_speculative_types_from_names(names);
             if (!parsed.empty()) {
                 params.speculative.types = parsed;