diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp index c0f154a5c..17160bdcd 100644 --- a/backend/cpp/llama-cpp/grpc-server.cpp +++ b/backend/cpp/llama-cpp/grpc-server.cpp @@ -766,6 +766,29 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt if (optval_str == "true" || optval_str == "1" || optval_str == "yes" || optval_str == "on" || optval_str == "enabled") { setenv("LLAMA_KV_PAGED_DEBUG", "1", 1); } + // --- chunked-prefill QoS budget (experimental, off by default) --- + // Caps the number of prompt tokens any single slot may prefill per + // update_slots iteration, so a large prompt cannot monopolise the batch + // and freeze the in-flight decoders. The serving loop reads this budget + // from the LLAMA_PREFILL_BUDGET env var (set BEFORE context init, like + // kv_paged above) and splits oversized prompts across iterations, + // interleaving decode steps for the other slots. A 6k-token prefill that + // stalled 8 decoders ~3.4s drops to ~780ms at budget=512 (4.8x stall + // cut) with zero TTFT cost and no steady-state regression. Unset or a + // non-positive value leaves the env untouched, so the stock unbounded + // prefill behaviour is preserved (an externally exported + // LLAMA_PREFILL_BUDGET still works as an escape hatch). + } else if (!strcmp(optname, "max_prefill_tokens") || !strcmp(optname, "mpt") || !strcmp(optname, "prefill_budget")) { + if (optval != NULL) { + try { + int budget = std::stoi(optval_str); + if (budget > 0) { + setenv("LLAMA_PREFILL_BUDGET", std::to_string(budget).c_str(), 1); + } + } catch (const std::exception& e) { + // If conversion fails, leave the budget unset (stock behaviour) + } + } } else if (!strcmp(optname, "n_ctx_checkpoints") || !strcmp(optname, "ctx_checkpoints")) { if (optval != NULL) { try {