// paged-loadgen: a dynamic-load benchmark for paged KV that actually exercises the // regime where paging wins - variable prompt lengths, variable generation lengths, // staggered (continuous) arrival, and a shared system prefix. The stock // examples/paged/paged.cpp adds all requests up front with a fixed n_predict from a // 20-prompt pool, so it never creates KV-memory pressure or fragmentation and // therefore never shows a paged advantage (see PAGED_KV_HIGH_CONCURRENCY.md). // // Build: drop into PR #22569's examples/paged/ and add to its CMakeLists.txt next to // llama-paged (it uses the same llama_paged_scheduler_* API). Run on the TARGET GPU // (e.g. 2xH200) where bandwidth lets decode scale to thousands of sequences and KV // memory becomes the binding constraint - that is where paged KV pays off and where // this harness produces a meaningful number. On a low-bandwidth box (GB10) throughput // plateaus long before memory binds, so the win is not observable there regardless. // // Metrics reported: // - goodput (decode tokens/s aggregate) under the dynamic load // - peak concurrent in-flight sequences actually sustained // - paged peak KV bytes used vs the contiguous reservation a unified cache needs // (n_seq_peak * max_ctx), i.e. the capacity ratio = the headroom paging unlocks // // The capacity ratio is the load-bearing number for the buy decision: it is how many // more concurrent tenants a fixed HBM budget serves with paging than without. #include "common.h" #include "llama.h" #include #include #include #include #include #include // ---- workload knobs (env-overridable so the harness is sweepable without rebuilds) ---- static int env_int(const char * k, int dflt) { const char * v = getenv(k); return v ? atoi(v) : dflt; } struct workload_cfg { int total_requests = env_int("LG_TOTAL", 2000); // total requests to serve int target_inflight = env_int("LG_INFLIGHT", 256); // continuous-batching concurrency target int prefix_tokens = env_int("LG_PREFIX", 512); // shared system-prompt prefix (prefix-cache target) int suffix_min = env_int("LG_SUFMIN", 16); // per-request unique prompt suffix range int suffix_max = env_int("LG_SUFMAX", 768); int gen_short = env_int("LG_GENSHORT", 32); // bimodal generation: most short... int gen_long = env_int("LG_GENLONG", 1024); // ...some long (the over-reservation driver) int gen_long_pct = env_int("LG_LONGPCT", 15); // % of requests that are long int block_size = env_int("LG_BLOCK", 16); // must match -kvbls unsigned seed = (unsigned) env_int("LG_SEED", 1234); }; // Per-request plan drawn from the workload distribution. struct req_plan { int prompt_len; int gen_len; }; int main(int argc, char ** argv) { common_params params; params.n_predict = -1; // per-request, controlled by the plan below if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PAGED)) { fprintf(stderr, "usage: %s -m -kvp --fit off -ngpub N -ncpub M -ngl 99\n", argv[0]); return 1; } params.kv_paged = true; common_init_result init = common_init_from_params(params); llama_model * model = init.model.get(); llama_context * ctx = init.context.get(); if (!model || !ctx) { fprintf(stderr, "load failed\n"); return 1; } const llama_vocab * vocab = llama_model_get_vocab(model); workload_cfg cfg; std::mt19937 rng(cfg.seed); std::uniform_int_distribution suf(cfg.suffix_min, cfg.suffix_max); std::uniform_int_distribution pct(1, 100); // KV bytes/token = 2(K,V) * n_layers * n_head_kv * head_dim * sizeof(f16). Confirmed // against llama-kv-cache-paged.cpp (block_bytes formula). Used for the capacity ratio. const int n_layers = llama_model_n_layer(model); const int n_head_kv = llama_model_n_head_kv(model); const int head_dim = llama_model_n_embd(model) / llama_model_n_head(model); const size_t kv_bytes_per_token = (size_t)2 * n_layers * n_head_kv * head_dim * sizeof(uint16_t); // A long shared system prefix that every request reuses (the prefix-cache target). std::vector prefix = common_tokenize(ctx, std::string(cfg.prefix_tokens, 'x'), true); // Pre-draw all request plans so paged peak usage and the contiguous reservation are // computed from the SAME workload. std::vector plans(cfg.total_requests); int max_ctx = 0; for (auto & p : plans) { p.prompt_len = cfg.prefix_tokens + suf(rng); p.gen_len = (pct(rng) <= cfg.gen_long_pct) ? cfg.gen_long : cfg.gen_short; max_ctx = std::max(max_ctx, p.prompt_len + p.gen_len); } llama_paged_scheduler * sched = llama_paged_scheduler_init(ctx); if (!sched) { fprintf(stderr, "scheduler init failed\n"); return 1; } // ---- continuous-arrival loop: keep ~target_inflight requests live at all times ---- int next_req = 0, done = 0, inflight = 0, peak_inflight = 0; long total_decoded = 0; size_t peak_kv_bytes_paged = 0; // sum over live seqs of ceil(used/block)*block*kv_bytes size_t live_used_tokens = 0; // running sum of actual KV tokens held by live seqs auto admit = [&](int rid) { const req_plan & p = plans[rid]; std::vector toks = prefix; // shared prefix... std::vector suff = common_tokenize(ctx, std::string(p.prompt_len - cfg.prefix_tokens, 'y'), false); toks.insert(toks.end(), suff.begin(), suff.end()); // ...+ unique suffix if (llama_paged_scheduler_add_request(sched, toks.data(), toks.size(), rid)) { inflight++; peak_inflight = std::max(peak_inflight, inflight); live_used_tokens += p.prompt_len; } }; const int64_t t0 = ggml_time_us(); for (int i = 0; i < cfg.target_inflight && next_req < cfg.total_requests; ++i) admit(next_req++); llama_batch batch = {}; std::vector sampled; std::vector stop_flags; while (done < cfg.total_requests) { if (!llama_paged_scheduler_prepare_batch(sched, &batch)) break; const llama_paged_batch_info * info = llama_paged_scheduler_get_batch_info(sched); sampled.assign(info->n_seq, 0); stop_flags.assign(info->n_seq, 0); // (decode is done inside the scheduler/update path in PR #22569; greedy here) for (int i = 0; i < info->n_seq; ++i) { const int rid = info->seq_ids[i]; llama_paged_seq_state st{}; llama_paged_scheduler_get_seq_state(sched, rid, &st); // greedy argmax from the i-th row of logits const float * lg = llama_get_logits_ith(ctx, i); int best = 0; float bv = lg[0]; for (int t = 1; t < llama_vocab_n_tokens(vocab); ++t) if (lg[t] > bv) { bv = lg[t]; best = t; } sampled[i] = best; const bool stop = llama_vocab_is_eog(vocab, best) || st.n_decoded + 1 >= plans[rid].gen_len; stop_flags[i] = stop ? 1 : 0; if (!stop) { total_decoded++; live_used_tokens++; } if (stop) { done++; inflight--; live_used_tokens -= (plans[rid].prompt_len + st.n_decoded); if (next_req < cfg.total_requests) admit(next_req++); // continuous arrival } } // paged peak KV: blocks are allocated per live seq = ceil(used/block); approximate // current paged footprint from live_used_tokens rounded up per the block size. const size_t paged_now = (size_t)std::ceil((double)live_used_tokens / cfg.block_size) * cfg.block_size * kv_bytes_per_token; peak_kv_bytes_paged = std::max(peak_kv_bytes_paged, paged_now); llama_paged_scheduler_update(sched, &batch, sampled.data(), stop_flags.data()); } const double secs = (ggml_time_us() - t0) / 1e6; // Contiguous unified-KV reservation needed to serve the SAME peak concurrency without // mid-generation eviction: every live slot must be backed for the worst-case context. const size_t contig_reserve = (size_t)peak_inflight * max_ctx * kv_bytes_per_token; printf("\n==== paged-loadgen ====\n"); printf("requests served : %d (target inflight %d, peak inflight %d)\n", done, cfg.target_inflight, peak_inflight); printf("goodput (decode) : %.1f tok/s (%ld tokens / %.2f s)\n", total_decoded / secs, total_decoded, secs); printf("kv bytes / token : %zu (n_layer=%d n_head_kv=%d head_dim=%d f16)\n", kv_bytes_per_token, n_layers, n_head_kv, head_dim); printf("paged peak KV : %.2f GiB (allocated on demand)\n", peak_kv_bytes_paged / 1073741824.0); printf("contiguous reserve : %.2f GiB (peak_inflight * max_ctx %d)\n", contig_reserve / 1073741824.0, max_ctx); printf("CAPACITY RATIO : %.2fx <- tenants-per-HBM paging unlocks\n", peak_kv_bytes_paged ? (double)contig_reserve / peak_kv_bytes_paged : 0.0); printf(" (plus cross-request prefix sharing of the %d-token shared prefix, not counted above)\n", cfg.prefix_tokens); llama_paged_scheduler_free(sched); return 0; }