Files
LocalAI/backend/cpp/llama-cpp/paged/paged-loadgen.cpp
Ettore Di Giacinto 931793aa24 feat(paged): target-readiness for 2xH200 - correctness PASS, load-gen harness, projection
Deliverables for pushing paged KV toward the real target (2xH200), since GB10 is
only the test box and its "no win" result is a low-bandwidth artifact:

1. Correctness verified. test-paged-kv-e2e is greedy-equivalent to the contiguous
   reference (top-5 argmax ref=paged=3743, overlap 5/5). Found + fixed the blocking
   bug: common_fit_paged_kv_blocks over-reports free VRAM on GB10's unified device
   and tried 245GB of KV on a 119GB box, OOM-aborting context creation. Patch in
   patches/0002; durable fix (clamp to free_vram, honor --fit off) noted.

2. paged-loadgen.cpp: a dynamic-load benchmark that actually exercises where paging
   wins - variable prompt/gen lengths, continuous arrival, shared prefix - and
   reports the capacity ratio (contiguous reserve / paged peak KV). The stock tools
   run fixed-length all-at-once load, which is why they never show a paged win.

3. Projection to 2xH200, grounded in measured GB10 plateaus. Decode is bandwidth-
   bound, so the ceiling (~16k t/s for 32B) needs ~3,800 concurrent seqs, but
   contiguous KV fits only ~490 in HBM at 2k ctx - so KV memory IS the binding
   constraint on the target (unlike GB10), and paged KV's ~5-10x capacity (no
   over-reservation + prefix sharing) is what reaches the ceiling. The thesis holds
   on the target; remaining work is hardening/finishing the paged op (PR22569 was
   12-13% slower and lacks prefix sharing).

Assisted-by: Claude:opus-4.8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-06-21 23:16:28 +00:00

170 lines
9.1 KiB
C++

// paged-loadgen: a dynamic-load benchmark for paged KV that actually exercises the
// regime where paging wins - variable prompt lengths, variable generation lengths,
// staggered (continuous) arrival, and a shared system prefix. The stock
// examples/paged/paged.cpp adds all requests up front with a fixed n_predict from a
// 20-prompt pool, so it never creates KV-memory pressure or fragmentation and
// therefore never shows a paged advantage (see PAGED_KV_HIGH_CONCURRENCY.md).
//
// Build: drop into PR #22569's examples/paged/ and add to its CMakeLists.txt next to
// llama-paged (it uses the same llama_paged_scheduler_* API). Run on the TARGET GPU
// (e.g. 2xH200) where bandwidth lets decode scale to thousands of sequences and KV
// memory becomes the binding constraint - that is where paged KV pays off and where
// this harness produces a meaningful number. On a low-bandwidth box (GB10) throughput
// plateaus long before memory binds, so the win is not observable there regardless.
//
// Metrics reported:
// - goodput (decode tokens/s aggregate) under the dynamic load
// - peak concurrent in-flight sequences actually sustained
// - paged peak KV bytes used vs the contiguous reservation a unified cache needs
// (n_seq_peak * max_ctx), i.e. the capacity ratio = the headroom paging unlocks
//
// The capacity ratio is the load-bearing number for the buy decision: it is how many
// more concurrent tenants a fixed HBM budget serves with paging than without.
#include "common.h"
#include "llama.h"
#include <algorithm>
#include <cctype>
#include <cerrno>
#include <climits>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <random>
#include <string>
#include <vector>
// ---- workload knobs (env-overridable so the harness is sweepable without rebuilds) ----
// Reads the integer knob named `k` from the environment.
//
// Returns `dflt` when the variable is unset or empty. A value that is not a complete
// base-10 integer (surrounding whitespace is tolerated) or that does not fit in an int is
// rejected with a warning on stderr and `dflt` is returned, so a typo in a sweep script
// cannot silently become 0 or a truncated number the way it would with atoi().
static int env_int(const char * k, int dflt) {
    const char * v = std::getenv(k);
    if (v == nullptr || *v == '\0') {
        return dflt;
    }

    errno = 0;
    char * end = nullptr;
    const long parsed = std::strtol(v, &end, 10);

    // strtol already skipped leading whitespace; accept trailing whitespace as well.
    const bool converted = (end != v);
    while (converted && *end != '\0' && std::isspace(static_cast<unsigned char>(*end))) {
        ++end;
    }

    const bool in_range = (errno != ERANGE) && parsed >= INT_MIN && parsed <= INT_MAX;
    if (!converted || *end != '\0' || !in_range) {
        fprintf(stderr, "warning: ignoring %s=\"%s\" (not a valid int); using %d\n", k, v, dflt);
        return dflt;
    }
    return static_cast<int>(parsed);
}
struct workload_cfg {
int total_requests = env_int("LG_TOTAL", 2000); // total requests to serve
int target_inflight = env_int("LG_INFLIGHT", 256); // continuous-batching concurrency target
int prefix_tokens = env_int("LG_PREFIX", 512); // shared system-prompt prefix (prefix-cache target)
int suffix_min = env_int("LG_SUFMIN", 16); // per-request unique prompt suffix range
int suffix_max = env_int("LG_SUFMAX", 768);
int gen_short = env_int("LG_GENSHORT", 32); // bimodal generation: most short...
int gen_long = env_int("LG_GENLONG", 1024); // ...some long (the over-reservation driver)
int gen_long_pct = env_int("LG_LONGPCT", 15); // % of requests that are long
int block_size = env_int("LG_BLOCK", 16); // must match -kvbls
unsigned seed = (unsigned) env_int("LG_SEED", 1234);
};
// Shape of a single request, sampled from the workload distribution.
struct req_plan {
    int prompt_len; // prompt length: shared prefix plus the per-request suffix
    int gen_len;    // generation budget for this request
};
int main(int argc, char ** argv) {
common_params params;
params.n_predict = -1; // per-request, controlled by the plan below
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PAGED)) {
fprintf(stderr, "usage: %s -m <model> -kvp --fit off -ngpub N -ncpub M -ngl 99\n", argv[0]);
return 1;
}
params.kv_paged = true;
common_init_result init = common_init_from_params(params);
llama_model * model = init.model.get();
llama_context * ctx = init.context.get();
if (!model || !ctx) { fprintf(stderr, "load failed\n"); return 1; }
const llama_vocab * vocab = llama_model_get_vocab(model);
workload_cfg cfg;
std::mt19937 rng(cfg.seed);
std::uniform_int_distribution<int> suf(cfg.suffix_min, cfg.suffix_max);
std::uniform_int_distribution<int> pct(1, 100);
// KV bytes/token = 2(K,V) * n_layers * n_head_kv * head_dim * sizeof(f16). Confirmed
// against llama-kv-cache-paged.cpp (block_bytes formula). Used for the capacity ratio.
const int n_layers = llama_model_n_layer(model);
const int n_head_kv = llama_model_n_head_kv(model);
const int head_dim = llama_model_n_embd(model) / llama_model_n_head(model);
const size_t kv_bytes_per_token = (size_t)2 * n_layers * n_head_kv * head_dim * sizeof(uint16_t);
// A long shared system prefix that every request reuses (the prefix-cache target).
std::vector<llama_token> prefix = common_tokenize(ctx, std::string(cfg.prefix_tokens, 'x'), true);
// Pre-draw all request plans so paged peak usage and the contiguous reservation are
// computed from the SAME workload.
std::vector<req_plan> plans(cfg.total_requests);
int max_ctx = 0;
for (auto & p : plans) {
p.prompt_len = cfg.prefix_tokens + suf(rng);
p.gen_len = (pct(rng) <= cfg.gen_long_pct) ? cfg.gen_long : cfg.gen_short;
max_ctx = std::max(max_ctx, p.prompt_len + p.gen_len);
}
llama_paged_scheduler * sched = llama_paged_scheduler_init(ctx);
if (!sched) { fprintf(stderr, "scheduler init failed\n"); return 1; }
// ---- continuous-arrival loop: keep ~target_inflight requests live at all times ----
int next_req = 0, done = 0, inflight = 0, peak_inflight = 0;
long total_decoded = 0;
size_t peak_kv_bytes_paged = 0; // sum over live seqs of ceil(used/block)*block*kv_bytes
size_t live_used_tokens = 0; // running sum of actual KV tokens held by live seqs
auto admit = [&](int rid) {
const req_plan & p = plans[rid];
std::vector<llama_token> toks = prefix; // shared prefix...
std::vector<llama_token> suff = common_tokenize(ctx, std::string(p.prompt_len - cfg.prefix_tokens, 'y'), false);
toks.insert(toks.end(), suff.begin(), suff.end()); // ...+ unique suffix
if (llama_paged_scheduler_add_request(sched, toks.data(), toks.size(), rid)) {
inflight++; peak_inflight = std::max(peak_inflight, inflight);
live_used_tokens += p.prompt_len;
}
};
const int64_t t0 = ggml_time_us();
for (int i = 0; i < cfg.target_inflight && next_req < cfg.total_requests; ++i) admit(next_req++);
llama_batch batch = {};
std::vector<llama_token> sampled; std::vector<int8_t> stop_flags;
while (done < cfg.total_requests) {
if (!llama_paged_scheduler_prepare_batch(sched, &batch)) break;
const llama_paged_batch_info * info = llama_paged_scheduler_get_batch_info(sched);
sampled.assign(info->n_seq, 0); stop_flags.assign(info->n_seq, 0);
// (decode is done inside the scheduler/update path in PR #22569; greedy here)
for (int i = 0; i < info->n_seq; ++i) {
const int rid = info->seq_ids[i];
llama_paged_seq_state st{};
llama_paged_scheduler_get_seq_state(sched, rid, &st);
// greedy argmax from the i-th row of logits
const float * lg = llama_get_logits_ith(ctx, i);
int best = 0; float bv = lg[0];
for (int t = 1; t < llama_vocab_n_tokens(vocab); ++t) if (lg[t] > bv) { bv = lg[t]; best = t; }
sampled[i] = best;
const bool stop = llama_vocab_is_eog(vocab, best) || st.n_decoded + 1 >= plans[rid].gen_len;
stop_flags[i] = stop ? 1 : 0;
if (!stop) { total_decoded++; live_used_tokens++; }
if (stop) {
done++; inflight--;
live_used_tokens -= (plans[rid].prompt_len + st.n_decoded);
if (next_req < cfg.total_requests) admit(next_req++); // continuous arrival
}
}
// paged peak KV: blocks are allocated per live seq = ceil(used/block); approximate
// current paged footprint from live_used_tokens rounded up per the block size.
const size_t paged_now = (size_t)std::ceil((double)live_used_tokens / cfg.block_size)
* cfg.block_size * kv_bytes_per_token;
peak_kv_bytes_paged = std::max(peak_kv_bytes_paged, paged_now);
llama_paged_scheduler_update(sched, &batch, sampled.data(), stop_flags.data());
}
const double secs = (ggml_time_us() - t0) / 1e6;
// Contiguous unified-KV reservation needed to serve the SAME peak concurrency without
// mid-generation eviction: every live slot must be backed for the worst-case context.
const size_t contig_reserve = (size_t)peak_inflight * max_ctx * kv_bytes_per_token;
printf("\n==== paged-loadgen ====\n");
printf("requests served : %d (target inflight %d, peak inflight %d)\n", done, cfg.target_inflight, peak_inflight);
printf("goodput (decode) : %.1f tok/s (%ld tokens / %.2f s)\n", total_decoded / secs, total_decoded, secs);
printf("kv bytes / token : %zu (n_layer=%d n_head_kv=%d head_dim=%d f16)\n", kv_bytes_per_token, n_layers, n_head_kv, head_dim);
printf("paged peak KV : %.2f GiB (allocated on demand)\n", peak_kv_bytes_paged / 1073741824.0);
printf("contiguous reserve : %.2f GiB (peak_inflight * max_ctx %d)\n", contig_reserve / 1073741824.0, max_ctx);
printf("CAPACITY RATIO : %.2fx <- tenants-per-HBM paging unlocks\n",
peak_kv_bytes_paged ? (double)contig_reserve / peak_kv_bytes_paged : 0.0);
printf(" (plus cross-request prefix sharing of the %d-token shared prefix, not counted above)\n", cfg.prefix_tokens);
llama_paged_scheduler_free(sched);
return 0;
}