feat(llama-cpp): add main-model cpu_moe/n_cpu_moe options

Mirror the existing draft_cpu_moe/draft_n_cpu_moe siblings for the main
model, matching upstream --cpu-moe / --n-cpu-moe (common/arg.cpp). Lets
users keep MoE expert weights on CPU to manage VRAM on large MoE models.

Closes part of #10483

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
Ettore Di Giacinto
2026-06-24 17:07:14 +00:00
parent e8ae88a2a0
commit 811f0db2e3

View File

@@ -1080,6 +1080,31 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
} catch (...) {}
}
// --- main model MoE on CPU (upstream --cpu-moe / --n-cpu-moe) ---
} else if (!strcmp(optname, "cpu_moe")) {
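// Bool-style flag: a missing value (optval == NULL) or one of "true", "1",
// "yes", "on", "enabled" keeps all MoE expert weights on CPU; any other
// value leaves the overrides untouched.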
const bool enable = (optval == NULL) ||
optval_str == "true" || optval_str == "1" || optval_str == "yes" ||
optval_str == "on" || optval_str == "enabled";
if (enable) {
params.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
}
} else if (!strcmp(optname, "n_cpu_moe")) {
if (optval != NULL) {
try {
int n = std::stoi(optval_str);
if (n < 0) n = 0;
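// Each override entry stores a raw c_str() pointer, so the backing strings
// must stay alive for as long as the params struct is used. A function-local
// static std::list provides that storage and does not relocate existing
// elements on push_back (mirrors upstream arg.cpp's function-local static).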
static std::list<std::string> buft_overrides_main;
for (int i = 0; i < n; ++i) {
buft_overrides_main.push_back(llm_ffn_exps_block_regex(i));
params.tensor_buft_overrides.push_back(
{buft_overrides_main.back().c_str(), ggml_backend_cpu_buffer_type()});
}
} catch (...) {}
}
// --- draft model tensor buffer overrides (upstream --spec-draft-override-tensor) ---
} else if (!strcmp(optname, "draft_override_tensor") || !strcmp(optname, "spec_draft_override_tensor")) {
// Format: <tensor regex>=<buffer type>,<tensor regex>=<buffer type>,...