From 42cb7bda1959b5c65ac7a989242d1ac2c61a14fd Mon Sep 17 00:00:00 2001
From: Austen
Date: Sat, 14 Feb 2026 03:07:37 -0600
Subject: [PATCH] fix(llama-cpp): populate tensor_buft_override buffer so llama-cpp properly performs fit calculations (#8560)

fix auto-fit for llama-cpp
---
 backend/cpp/llama-cpp/grpc-server.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp
index a9870e242..a26d38626 100644
--- a/backend/cpp/llama-cpp/grpc-server.cpp
+++ b/backend/cpp/llama-cpp/grpc-server.cpp
@@ -417,6 +417,12 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
     // n_ctx_checkpoints: max context checkpoints per slot (default: 8)
     params.n_ctx_checkpoints = 8;
 
+    // llama memory fit fails if we don't provide a buffer for tensor overrides
+    const size_t ntbo = llama_max_tensor_buft_overrides();
+    while (params.tensor_buft_overrides.size() < ntbo) {
+        params.tensor_buft_overrides.push_back({nullptr, nullptr});
+    }
+
     // decode options. Options are in form optname:optvale, or if booleans only optname.
     for (int i = 0; i < request->options_size(); i++) {
         std::string opt = request->options(i);