From 28beac9a18425dbfcb77f7aa2d6b862aeb665ebd Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto <mudler@localai.io>
Date: Wed, 24 Jun 2026 17:28:22 +0000
Subject: [PATCH] fix(llama-cpp): terminate tensor/kv override vectors after
 passthrough

The tensor_buft_overrides padding and the kv/draft override terminators
ran before the generic option passthrough, so a passthrough flag
(--cpu-moe, --override-tensor, --override-kv, ...) appended a real entry
after the null sentinel - tripping the model loader's
back().pattern == nullptr assertion (crash) or being silently dropped.
Move all three termination/padding blocks to the end of params_parse,
after both the named-option loop and common_params_parse have pushed
their real entries. Also widen the exit()-flag skip list so --version,
--license, --list-devices and --cache-list cannot terminate the backend.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
---
 backend/cpp/llama-cpp/grpc-server.cpp        | 54 ++++++++++++--------
 docs/content/advanced/model-configuration.md |  5 +-
 2 files changed, 36 insertions(+), 23 deletions(-)

diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp
index 5566d5087..6907b9122 100644
--- a/backend/cpp/llama-cpp/grpc-server.cpp
+++ b/backend/cpp/llama-cpp/grpc-server.cpp
@@ -1149,6 +1149,9 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
             // These flags make upstream's parser exit() (printing usage /
             // completion), which would kill the backend process. Skip them.
             if (flag == "-h" || flag == "--help" || flag == "--usage" ||
+                flag == "--version" || flag == "--license" ||
+                flag == "--list-devices" || flag == "-cl" ||
+                flag == "--cache-list" ||
                 flag.rfind("--completion", 0) == 0) {
                 fprintf(stderr,
                     "[llama-cpp] ignoring passthrough flag that would exit: %s\n",
@@ -1197,27 +1200,6 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
         }
     }
 
-    if (!params.kv_overrides.empty()) {
-        params.kv_overrides.emplace_back();
-        params.kv_overrides.back().key[0] = 0;
-    }
-
-    // tensor_buft_overrides sentinel termination (mirrors upstream common/arg.cpp).
-    // Real entries are pushed during option parsing; here we pad/terminate so the
-    // model loader sees back().pattern == nullptr (GGML_ASSERT at common.cpp:1543)
-    // and so llama_params_fit has the placeholder slots it requires.
-    {
-        const size_t ntbo = llama_max_tensor_buft_overrides();
-        while (params.tensor_buft_overrides.size() < ntbo) {
-            params.tensor_buft_overrides.push_back({nullptr, nullptr});
-        }
-    }
-    // Terminate the draft tensor_buft_overrides list with a sentinel, mirroring
-    // the main-model handling above.
-    if (!params.speculative.draft.tensor_buft_overrides.empty()) {
-        params.speculative.draft.tensor_buft_overrides.push_back({nullptr, nullptr});
-    }
-
     // TODO: Add yarn
 
     if (!request->tensorsplit().empty()) {
@@ -1343,6 +1325,36 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
             params.n_parallel = saved_n_parallel;
         }
     }
+
+    // Terminate/pad the override vectors only after BOTH the named-option loop
+    // and the generic passthrough (common_params_parse above) have pushed their
+    // real entries, so back() is the null sentinel the model loader asserts on.
+    // Running these before the passthrough let a passthrough flag (--cpu-moe,
+    // --override-tensor, --override-kv, ...) append a real entry after the
+    // sentinel: a GGML_ASSERT crash for tensor_buft_overrides, a silent drop for
+    // kv_overrides. Double-termination is harmless (the while is a no-op if the
+    // passthrough parse already padded; an extra trailing null is ignored).
+
+    if (!params.kv_overrides.empty()) {
+        params.kv_overrides.emplace_back();
+        params.kv_overrides.back().key[0] = 0;
+    }
+
+    // tensor_buft_overrides sentinel termination (mirrors upstream common/arg.cpp).
+    // Real entries are pushed during option parsing; here we pad/terminate so the
+    // model loader sees back().pattern == nullptr (GGML_ASSERT at common.cpp:1543)
+    // and so llama_params_fit has the placeholder slots it requires.
+    {
+        const size_t ntbo = llama_max_tensor_buft_overrides();
+        while (params.tensor_buft_overrides.size() < ntbo) {
+            params.tensor_buft_overrides.push_back({nullptr, nullptr});
+        }
+    }
+    // Terminate the draft tensor_buft_overrides list with a sentinel, mirroring
+    // the main-model handling above.
+    if (!params.speculative.draft.tensor_buft_overrides.empty()) {
+        params.speculative.draft.tensor_buft_overrides.push_back({nullptr, nullptr});
+    }
 }
 
 
diff --git a/docs/content/advanced/model-configuration.md b/docs/content/advanced/model-configuration.md
index 06b516ac0..8092c162a 100644
--- a/docs/content/advanced/model-configuration.md
+++ b/docs/content/advanced/model-configuration.md
@@ -524,8 +524,9 @@ Notes:
 - **Power-user territory:** an invalid flag or value is rejected by the upstream
   parser exactly as it would be by `llama-server`, which can fail model loading.
   Prefer the named options above when one exists.
-- `--help`, `--usage`, and `--completion*` are ignored (they would terminate the
-  backend process).
+- Flags that would terminate the process (such as `--help`, `--usage`,
+  `--version`, `--license`, `--list-devices`, `--cache-list`, and
+  `--completion*`) are ignored.
 
 ### Prompt Caching