Mirror of https://github.com/mudler/LocalAI.git (synced 2026-04-29 11:37:40 -04:00)
feat(llama-cpp): expose split_mode option for multi-GPU placement (#9560)
Adds split_mode (alias sm) to the llama.cpp backend options allowlist, accepting none|layer|row|tensor. The tensor value targets the experimental backend-agnostic tensor parallelism from ggml-org/llama.cpp#19378 and requires a llama.cpp build that includes that PR, FlashAttention enabled, KV-cache quantization disabled, and a manually set context size.

Assisted-by: Claude:claude-opus-4-7
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
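For context, the option parsed here maps directly onto the split_mode field of llama.cpp's model parameters. Below is a minimal sketch of what a caller ends up doing with the chosen value, using the public llama.cpp C API; the "model.gguf" path is a placeholder, and LLAMA_SPLIT_MODE_TENSOR is an assumption that only holds on builds including ggml-org/llama.cpp#19378 (stock builds expose only NONE, LAYER, and ROW):

// Sketch: loading a model with an explicit split mode via the llama.cpp C API.
#include "llama.h"
#include <cstdio>

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 999;                  // offload all layers to GPU(s)
    mparams.split_mode   = LLAMA_SPLIT_MODE_ROW; // or _NONE / _LAYER / _TENSOR

    // "model.gguf" is a placeholder path, not a file shipped with this PR.
    llama_model * model = llama_model_load_from_file("model.gguf", mparams);
    if (model == NULL) {
        fprintf(stderr, "failed to load model\n");
        llama_backend_free();
        return 1;
    }

    llama_model_free(model);
    llama_backend_free();
    return 0;
}

In LocalAI the same choice is expressed through the backend options this PR allowlists, presumably passed as a name/value pair (split_mode with a value of none, layer, row, or tensor) and dispatched by the parsing code in the diff below.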
Committed by: GitHub
Parent: 24505e57f5
Commit: 21eace40ec
@@ -642,6 +642,21 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
             } else if (optval_str == "false" || optval_str == "0" || optval_str == "no" || optval_str == "off" || optval_str == "disabled") {
                 params.no_op_offload = false;
             }
+        } else if (!strcmp(optname, "split_mode") || !strcmp(optname, "sm")) {
+            // Accepts: none | layer | row | tensor (the latter requires a llama.cpp build
+            // that includes ggml-org/llama.cpp#19378, FlashAttention enabled, and KV-cache
+            // quantization disabled).
+            if (optval != NULL) {
+                if (optval_str == "none") {
+                    params.split_mode = LLAMA_SPLIT_MODE_NONE;
+                } else if (optval_str == "layer") {
+                    params.split_mode = LLAMA_SPLIT_MODE_LAYER;
+                } else if (optval_str == "row") {
+                    params.split_mode = LLAMA_SPLIT_MODE_ROW;
+                } else if (optval_str == "tensor") {
+                    params.split_mode = LLAMA_SPLIT_MODE_TENSOR;
+                }
+            }
         } else if (!strcmp(optname, "kv_unified") || !strcmp(optname, "unified_kv")) {
             if (optval_str == "true" || optval_str == "1" || optval_str == "yes" || optval_str == "on" || optval_str == "enabled") {
                 params.kv_unified = true;
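The added block is plain string matching from an option value into llama.cpp's llama_split_mode enum, with unknown values silently ignored so the default survives. Here is a self-contained sketch of that technique; the enum mirrors llama.cpp's llama_split_mode, and parse_split_mode is a hypothetical helper standing in for the dispatch inside params_parse, not a function in LocalAI or llama.cpp:

// Self-contained sketch of the split_mode string dispatch.
#include <cstdio>
#include <string>

enum split_mode {
    SPLIT_MODE_NONE,   // keep the model on a single GPU
    SPLIT_MODE_LAYER,  // split layers across GPUs (llama.cpp's default)
    SPLIT_MODE_ROW,    // split tensor rows across GPUs
    SPLIT_MODE_TENSOR, // experimental tensor parallelism (llama.cpp#19378)
};

// Returns true and writes `out` when the value is recognized; unknown
// values leave `out` untouched, matching the diff's silent fall-through.
static bool parse_split_mode(const std::string & optval_str, split_mode & out) {
    if (optval_str == "none")   { out = SPLIT_MODE_NONE;   return true; }
    if (optval_str == "layer")  { out = SPLIT_MODE_LAYER;  return true; }
    if (optval_str == "row")    { out = SPLIT_MODE_ROW;    return true; }
    if (optval_str == "tensor") { out = SPLIT_MODE_TENSOR; return true; }
    return false;
}

int main() {
    split_mode mode = SPLIT_MODE_LAYER;
    for (const char * v : {"row", "tensor", "bogus"}) {
        const bool ok = parse_split_mode(v, mode);
        printf("%-6s -> %s (mode=%d)\n", v, ok ? "ok" : "ignored", (int) mode);
    }
    return 0;
}

Ignoring unrecognized values rather than erroring keeps the allowlist forgiving of typos and of options meant for other backend versions, which matches how the surrounding option handlers in this file behave.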