From 518381278e362423ab70aaa8ad23c7b44ee13a03 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Sat, 20 Jun 2026 08:22:22 +0200
Subject: [PATCH] chore: :arrow_up: Update ggml-org/llama.cpp to
 `e475fa2b5f9fb50c3d6fc3e7c6fdf1e004465b62` (#10392)

* :arrow_up: Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* fix(llama-cpp): adapt grpc-server to upstream server-schema split

Upstream llama.cpp (e475fa2) extracted the JSON request-schema evaluation
out of the static server_task::params_from_json_cmpl into the new
server_schema::eval_llama_cmpl_schema (tools/server/server-schema.cpp).
The grpc-server unity build still called the old static member, breaking
every llama-cpp backend build with "no member named 'params_from_json_cmpl'
in 'server_task'".

Pull server-schema.cpp into the translation unit and call the new function,
keeping both guarded by __has_include so forks that predate the split (e.g.
llama-cpp-turboquant, which still exposes params_from_json_cmpl) keep
compiling against the old static member.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

---------

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
---
 backend/cpp/llama-cpp/Makefile        |  2 +-
 backend/cpp/llama-cpp/grpc-server.cpp | 24 ++++++++++++++++++++++--
 2 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/backend/cpp/llama-cpp/Makefile b/backend/cpp/llama-cpp/Makefile
index 64414ec30..bf9f4f608 100644
--- a/backend/cpp/llama-cpp/Makefile
+++ b/backend/cpp/llama-cpp/Makefile
@@ -1,5 +1,5 @@
 
-LLAMA_VERSION?=f3e182816421c648188b5eab269853bf1531d950
+LLAMA_VERSION?=e475fa2b5f9fb50c3d6fc3e7c6fdf1e004465b62
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
 
 CMAKE_ARGS?=
diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp
index 8502e9530..c2e7f22e4 100644
--- a/backend/cpp/llama-cpp/grpc-server.cpp
+++ b/backend/cpp/llama-cpp/grpc-server.cpp
@@ -18,6 +18,18 @@
 #if __has_include("server-chat.cpp")
 #include "server-chat.cpp"
 #endif
+// server-schema.cpp exists only in llama.cpp after the upstream refactor that
+// extracted the JSON request-schema evaluation (previously the static
+// server_task::params_from_json_cmpl) into server_schema::eval_llama_cmpl_schema.
+// server-context.cpp and grpc-server.cpp both call into it, so its definitions
+// must be part of this translation unit or the link fails. __has_include keeps
+// the source compatible with older pins/forks (e.g. llama-cpp-turboquant) that
+// predate the split and still expose params_from_json_cmpl (see the guarded
+// call sites below).
+#if __has_include("server-schema.cpp")
+#define LOCALAI_HAS_SERVER_SCHEMA 1
+#include "server-schema.cpp"
+#endif
 #include "server-context.cpp"
 
 // LocalAI
@@ -2102,7 +2114,11 @@ public:
                 task.index = i;
 
                 task.tokens    = std::move(inputs[i]);
+#ifdef LOCALAI_HAS_SERVER_SCHEMA
+                task.params           = server_schema::eval_llama_cmpl_schema(
+#else
                 task.params           = server_task::params_from_json_cmpl(
+#endif
                         ctx_server.impl->vocab,
                         params_base,
                         ctx_server.get_meta().slot_n_ctx,
@@ -2116,7 +2132,7 @@ public:
                 // cannot detect tool calls or separate reasoning from content.
                 task.params.res_type                 = TASK_RESPONSE_TYPE_OAI_CHAT;
                 task.params.oaicompat_cmpl_id         = completion_id;
-                // oaicompat_model is already populated by params_from_json_cmpl
+                // oaicompat_model is already populated by eval_llama_cmpl_schema
 
                 tasks.push_back(std::move(task));
             }
@@ -2940,7 +2956,11 @@ public:
                 task.index = i;
 
                 task.tokens    = std::move(inputs[i]);
+#ifdef LOCALAI_HAS_SERVER_SCHEMA
+                task.params           = server_schema::eval_llama_cmpl_schema(
+#else
                 task.params           = server_task::params_from_json_cmpl(
+#endif
                         ctx_server.impl->vocab,
                         params_base,
                         ctx_server.get_meta().slot_n_ctx,
@@ -2952,7 +2972,7 @@ public:
                 // reasoning, tool calls, and content are classified into ChatDeltas.
                 task.params.res_type                 = TASK_RESPONSE_TYPE_OAI_CHAT;
                 task.params.oaicompat_cmpl_id         = completion_id;
-                // oaicompat_model is already populated by params_from_json_cmpl
+                // oaicompat_model is already populated by eval_llama_cmpl_schema
 
                 tasks.push_back(std::move(task));
             }