From 066abf82c08d966dc1ca254c32e64d64e676e4f0 Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Thu, 25 Jun 2026 08:10:08 +0200 Subject: [PATCH] feat(llama-cpp): cpu_moe/n_cpu_moe options + generic upstream-flag passthrough (#10490) * feat(llama-cpp): add main-model cpu_moe/n_cpu_moe options Mirror the existing draft_cpu_moe/draft_n_cpu_moe siblings for the main model, matching upstream --cpu-moe / --n-cpu-moe (common/arg.cpp). Lets users keep MoE expert weights on CPU to manage VRAM on large MoE models. Closes part of #10483 Signed-off-by: Ettore Di Giacinto * feat(llama-cpp): forward unknown '-' options to upstream arg parser Any options: entry starting with '-' is collected and passed verbatim to llama.cpp's own common_params_parse (LLAMA_EXAMPLE_SERVER) at the end of params_parse, so every upstream llama-server flag works without a new hand-wired branch. Passthrough runs last and wins on overlap; n_parallel is snapshotted to survive parser_init's SERVER reset, and help/usage/completion flags are skipped to avoid exiting the backend. Closes #10483 Signed-off-by: Ettore Di Giacinto * docs(llama-cpp): document cpu_moe/n_cpu_moe and option passthrough Signed-off-by: Ettore Di Giacinto * fix(llama-cpp): terminate tensor/kv override vectors after passthrough The tensor_buft_overrides padding and the kv/draft override terminators ran before the generic option passthrough, so a passthrough flag (--cpu-moe, --override-tensor, --override-kv, ...) appended a real entry after the null sentinel - tripping the model loader's back().pattern == nullptr assertion (crash) or being silently dropped. Move all three termination/padding blocks to the end of params_parse, after both the named-option loop and common_params_parse have pushed their real entries. Also widen the exit()-flag skip list so --version, --license, --list-devices and --cache-list cannot terminate the backend. 
Signed-off-by: Ettore Di Giacinto --------- Signed-off-by: Ettore Di Giacinto Co-authored-by: Ettore Di Giacinto --- backend/cpp/llama-cpp/grpc-server.cpp | 138 ++++++++++++++++--- docs/content/advanced/model-configuration.md | 33 +++++ 2 files changed, 150 insertions(+), 21 deletions(-) diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp index c2e7f22e4..6907b9122 100644 --- a/backend/cpp/llama-cpp/grpc-server.cpp +++ b/backend/cpp/llama-cpp/grpc-server.cpp @@ -37,6 +37,7 @@ #include "backend.pb.h" #include "backend.grpc.pb.h" #include "common.h" +#include "arg.h" #include "chat-auto-parser.h" #include #include @@ -592,6 +593,10 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt params.checkpoint_min_step = 256; #endif + // Raw upstream llama-server flags collected from any option entry that + // starts with '-'. Applied once after the loop via common_params_parse. + std::vector extra_argv; + // decode options. Options are in form optname:optvale, or if booleans only optname. for (int i = 0; i < request->options_size(); i++) { std::string opt = request->options(i); @@ -1080,6 +1085,31 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt } catch (...) {} } + // --- main model MoE on CPU (upstream --cpu-moe / --n-cpu-moe) --- + } else if (!strcmp(optname, "cpu_moe")) { + // Bool-style flag: keep all MoE expert weights on CPU. + const bool enable = (optval == NULL) || + optval_str == "true" || optval_str == "1" || optval_str == "yes" || + optval_str == "on" || optval_str == "enabled"; + if (enable) { + params.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override()); + } + } else if (!strcmp(optname, "n_cpu_moe")) { + if (optval != NULL) { + try { + int n = std::stoi(optval_str); + if (n < 0) n = 0; + // Keep override-name storage alive for the lifetime of the + // params struct (mirrors upstream arg.cpp's function-local static). 
+ static std::list buft_overrides_main; + for (int i = 0; i < n; ++i) { + buft_overrides_main.push_back(llm_ffn_exps_block_regex(i)); + params.tensor_buft_overrides.push_back( + {buft_overrides_main.back().c_str(), ggml_backend_cpu_buffer_type()}); + } + } catch (...) {} + } + // --- draft model tensor buffer overrides (upstream --spec-draft-override-tensor) --- } else if (!strcmp(optname, "draft_override_tensor") || !strcmp(optname, "spec_draft_override_tensor")) { // Format: =,=,... @@ -1111,6 +1141,30 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt else { cur.push_back(c); } } if (!cur.empty()) flush(cur); + + // --- generic passthrough: any entry starting with '-' is a raw + // upstream llama-server flag, forwarded verbatim to the parser. --- + } else if (optname[0] == '-') { + std::string flag = optname; + // These flags make upstream's parser exit() (printing usage / + // completion), which would kill the backend process. Skip them. + if (flag == "-h" || flag == "--help" || flag == "--usage" || + flag == "--version" || flag == "--license" || + flag == "--list-devices" || flag == "-cl" || + flag == "--cache-list" || + flag.rfind("--completion", 0) == 0) { + fprintf(stderr, + "[llama-cpp] ignoring passthrough flag that would exit: %s\n", + flag.c_str()); + } else { + extra_argv.push_back(flag); + // Preserve the whole value after the first ':' so embedded + // colons (e.g. host:port) survive strtok's truncation of optval. + auto colon = opt.find(':'); + if (colon != std::string::npos) { + extra_argv.push_back(opt.substr(colon + 1)); + } + } } } @@ -1146,27 +1200,6 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt } } - if (!params.kv_overrides.empty()) { - params.kv_overrides.emplace_back(); - params.kv_overrides.back().key[0] = 0; - } - - // tensor_buft_overrides sentinel termination (mirrors upstream common/arg.cpp). 
- // Real entries are pushed during option parsing; here we pad/terminate so the - // model loader sees back().pattern == nullptr (GGML_ASSERT at common.cpp:1543) - // and so llama_params_fit has the placeholder slots it requires. - { - const size_t ntbo = llama_max_tensor_buft_overrides(); - while (params.tensor_buft_overrides.size() < ntbo) { - params.tensor_buft_overrides.push_back({nullptr, nullptr}); - } - } - // Terminate the draft tensor_buft_overrides list with a sentinel, mirroring - // the main-model handling above. - if (!params.speculative.draft.tensor_buft_overrides.empty()) { - params.speculative.draft.tensor_buft_overrides.push_back({nullptr, nullptr}); - } - // TODO: Add yarn if (!request->tensorsplit().empty()) { @@ -1259,6 +1292,69 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt params.sampling.grammar_triggers.push_back(std::move(trigger)); } } + + // Apply any raw upstream flags last so an explicit passthrough flag wins + // over the LocalAI-resolved field it maps to (e.g. --ctx-size beats + // context_size). This is the same parser llama-server itself uses. + if (!extra_argv.empty()) { + // common_params_parser_init resets a few fields for the SERVER example + // (n_parallel -> -1, use_color). Snapshot n_parallel so an unrelated + // passthrough flag can't silently clobber LocalAI's resolved value. + const int saved_n_parallel = params.n_parallel; + + std::vector argv; + std::string prog = "llama-server"; + argv.push_back(prog.data()); + for (auto & a : extra_argv) { + argv.push_back(a.data()); + } + + // ctx_arg.params is a reference, so this overlays the given flags onto + // `params` in place. Returns false on a recoverable parse error (and + // self-restores params); may exit() on a hard error, exactly as + // passing the same bad flag to llama-server would. 
+ if (!common_params_parse((int)argv.size(), argv.data(), params, + LLAMA_EXAMPLE_SERVER)) { + fprintf(stderr, + "[llama-cpp] failed to parse passthrough options; ignoring them\n"); + } + + // Restore n_parallel unless a passthrough flag explicitly set it + // (parser_init's reset sentinel for SERVER is -1). + if (params.n_parallel == -1) { + params.n_parallel = saved_n_parallel; + } + } + + // Terminate/pad the override vectors only after BOTH the named-option loop + // and the generic passthrough (common_params_parse above) have pushed their + // real entries, so back() is the null sentinel the model loader asserts on. + // Running these before the passthrough let a passthrough flag (--cpu-moe, + // --override-tensor, --override-kv, ...) append a real entry after the + // sentinel: a GGML_ASSERT crash for tensor_buft_overrides, a silent drop for + // kv_overrides. Double-termination is harmless (the while is a no-op if the + // passthrough parse already padded; an extra trailing null is ignored). + + if (!params.kv_overrides.empty()) { + params.kv_overrides.emplace_back(); + params.kv_overrides.back().key[0] = 0; + } + + // tensor_buft_overrides sentinel termination (mirrors upstream common/arg.cpp). + // Real entries are pushed during option parsing; here we pad/terminate so the + // model loader sees back().pattern == nullptr (GGML_ASSERT at common.cpp:1543) + // and so llama_params_fit has the placeholder slots it requires. + { + const size_t ntbo = llama_max_tensor_buft_overrides(); + while (params.tensor_buft_overrides.size() < ntbo) { + params.tensor_buft_overrides.push_back({nullptr, nullptr}); + } + } + // Terminate the draft tensor_buft_overrides list with a sentinel, mirroring + // the main-model handling above. 
+ if (!params.speculative.draft.tensor_buft_overrides.empty()) { + params.speculative.draft.tensor_buft_overrides.push_back({nullptr, nullptr}); + } } diff --git a/docs/content/advanced/model-configuration.md b/docs/content/advanced/model-configuration.md index 55e435b12..8092c162a 100644 --- a/docs/content/advanced/model-configuration.md +++ b/docs/content/advanced/model-configuration.md @@ -494,6 +494,39 @@ These llama.cpp options are passed through the `options:` array. | `direct_io` / `use_direct_io` | bool | `false` | Open the model with `O_DIRECT` (faster cold loads on NVMe; ignored if not supported). | | `verbosity` | int | `3` | llama.cpp internal log verbosity threshold. Higher = more verbose. | | `override_tensor` / `tensor_buft_overrides` | string | "" | Per-tensor buffer-type overrides for the main model. Format: `=,=,...`. Mirrors the existing `draft_override_tensor` syntax for the draft model. | +| `cpu_moe` | bool | false | Keep all MoE expert weights of the main model on CPU (upstream `--cpu-moe`). Frees VRAM on large MoE models (DeepSeek, Qwen3 `*-A3B`). | +| `n_cpu_moe` | int | 0 | Keep MoE expert weights of the first N main-model layers on CPU (upstream `--n-cpu-moe`). | + +#### Generic option passthrough + +Any `options:` entry whose name starts with `-` is forwarded **verbatim** to +upstream llama.cpp's own `llama-server` argument parser. This means any flag the +bundled llama.cpp supports works without LocalAI needing a dedicated option, +even ones added after your LocalAI version was built. See the upstream +[server flags reference](https://github.com/ggml-org/llama.cpp/blob/master/tools/server/README.md). + +Format mirrors the rest of the array - `--flag` for a boolean, or `--flag:value` +for a flag that takes a value. Everything after the first `:` is the value, so +embedded colons (e.g. 
`host:port`) are preserved: + +```yaml +options: + - "--cpu-moe" # boolean flag + - "--n-cpu-moe:4" # flag with a value + - "--override-tensor:exps=CPU" +``` + +Notes: + +- **Precedence:** passthrough flags are applied last, so an explicit flag + overrides the LocalAI option it maps to (e.g. `--ctx-size:8192` overrides + `context_size`). +- **Power-user territory:** flags are validated by the upstream parser exactly as + `llama-server` would: a recoverable parse error makes the backend ignore every passthrough flag (logged to stderr), and a hard error can exit the backend process, failing model loading. + Prefer the named options above when one exists. +- Flags that would terminate the process (such as `--help`, `--usage`, + `--version`, `--license`, `--list-devices`, `--cache-list`, and + `--completion*`) are ignored. ### Prompt Caching