From 28beac9a18425dbfcb77f7aa2d6b862aeb665ebd Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Wed, 24 Jun 2026 17:28:22 +0000 Subject: [PATCH] fix(llama-cpp): terminate tensor/kv override vectors after passthrough The tensor_buft_overrides padding and the kv/draft override terminators ran before the generic option passthrough, so a passthrough flag (--cpu-moe, --override-tensor, --override-kv, ...) appended a real entry after the null sentinel. For tensor_buft_overrides that trips the model loader's back().pattern == nullptr assertion (a crash); for kv_overrides the entry is silently dropped. Move all three termination/padding blocks to the end of params_parse, after both the named-option loop and common_params_parse have pushed their real entries. Also widen the exit()-flag skip list so --version, --license, --list-devices, -cl and --cache-list cannot terminate the backend. Signed-off-by: Ettore Di Giacinto --- backend/cpp/llama-cpp/grpc-server.cpp | 54 ++++++++++++-------- docs/content/advanced/model-configuration.md | 5 +- 2 files changed, 36 insertions(+), 23 deletions(-) diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp index 5566d5087..6907b9122 100644 --- a/backend/cpp/llama-cpp/grpc-server.cpp +++ b/backend/cpp/llama-cpp/grpc-server.cpp @@ -1149,6 +1149,9 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt // These flags make upstream's parser exit() (printing usage / // completion), which would kill the backend process. Skip them. 
if (flag == "-h" || flag == "--help" || flag == "--usage" || + flag == "--version" || flag == "--license" || + flag == "--list-devices" || flag == "-cl" || + flag == "--cache-list" || flag.rfind("--completion", 0) == 0) { fprintf(stderr, "[llama-cpp] ignoring passthrough flag that would exit: %s\n", @@ -1197,27 +1200,6 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt } } - if (!params.kv_overrides.empty()) { - params.kv_overrides.emplace_back(); - params.kv_overrides.back().key[0] = 0; - } - - // tensor_buft_overrides sentinel termination (mirrors upstream common/arg.cpp). - // Real entries are pushed during option parsing; here we pad/terminate so the - // model loader sees back().pattern == nullptr (GGML_ASSERT at common.cpp:1543) - // and so llama_params_fit has the placeholder slots it requires. - { - const size_t ntbo = llama_max_tensor_buft_overrides(); - while (params.tensor_buft_overrides.size() < ntbo) { - params.tensor_buft_overrides.push_back({nullptr, nullptr}); - } - } - // Terminate the draft tensor_buft_overrides list with a sentinel, mirroring - // the main-model handling above. - if (!params.speculative.draft.tensor_buft_overrides.empty()) { - params.speculative.draft.tensor_buft_overrides.push_back({nullptr, nullptr}); - } - // TODO: Add yarn if (!request->tensorsplit().empty()) { @@ -1343,6 +1325,36 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt params.n_parallel = saved_n_parallel; } } + + // Terminate/pad the override vectors only after BOTH the named-option loop + // and the generic passthrough (common_params_parse above) have pushed their + // real entries, so back() is the null sentinel the model loader asserts on. + // Running these before the passthrough let a passthrough flag (--cpu-moe, + // --override-tensor, --override-kv, ...) append a real entry after the + // sentinel: a GGML_ASSERT crash for tensor_buft_overrides, a silent drop for + // kv_overrides. 
Double-termination is harmless (the while is a no-op if the + // passthrough parse already padded; an extra trailing null is ignored). + + if (!params.kv_overrides.empty()) { + params.kv_overrides.emplace_back(); + params.kv_overrides.back().key[0] = 0; + } + + // tensor_buft_overrides sentinel termination (mirrors upstream common/arg.cpp). + // Real entries are pushed during option parsing; here we pad/terminate so the + // model loader sees back().pattern == nullptr (GGML_ASSERT at common.cpp:1543) + // and so llama_params_fit has the placeholder slots it requires. + { + const size_t ntbo = llama_max_tensor_buft_overrides(); + while (params.tensor_buft_overrides.size() < ntbo) { + params.tensor_buft_overrides.push_back({nullptr, nullptr}); + } + } + // Terminate the draft tensor_buft_overrides list with a sentinel, mirroring + // the main-model handling above. + if (!params.speculative.draft.tensor_buft_overrides.empty()) { + params.speculative.draft.tensor_buft_overrides.push_back({nullptr, nullptr}); + } } diff --git a/docs/content/advanced/model-configuration.md b/docs/content/advanced/model-configuration.md index 06b516ac0..8092c162a 100644 --- a/docs/content/advanced/model-configuration.md +++ b/docs/content/advanced/model-configuration.md @@ -524,8 +524,9 @@ Notes: - **Power-user territory:** an invalid flag or value is rejected by the upstream parser exactly as it would be by `llama-server`, which can fail model loading. Prefer the named options above when one exists. -- `--help`, `--usage`, and `--completion*` are ignored (they would terminate the - backend process). +- Flags that would terminate the process (such as `--help`, `--usage`, + `--version`, `--license`, `--list-devices`, `--cache-list`, and + `--completion*`) are ignored. ### Prompt Caching