mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-23 08:08:52 -04:00
Deliverables for pushing paged KV toward the real target (2xH200), since GB10 is only the test box and its "no win" result is a low-bandwidth artifact: 1. Correctness verified. test-paged-kv-e2e is greedy-equivalent to the contiguous reference (top-5 argmax ref=paged=3743, overlap 5/5). Found + fixed the blocking bug: common_fit_paged_kv_blocks over-reports free VRAM on GB10's unified device and tried 245GB of KV on a 119GB box, OOM-aborting context creation. Patch in patches/0002; durable fix (clamp to free_vram, honor --fit off) noted. 2. paged-loadgen.cpp: a dynamic-load benchmark that actually exercises where paging wins - variable prompt/gen lengths, continuous arrival, shared prefix - and reports the capacity ratio (contiguous reserve / paged peak KV). The stock tools run fixed-length all-at-once load, which is why they never show a paged win. 3. Projection to 2xH200, grounded in measured GB10 plateaus. Decode is bandwidth- bound, so the ceiling (~16k t/s for 32B) needs ~3,800 concurrent seqs, but contiguous KV fits only ~490 in HBM at 2k ctx - so KV memory IS the binding constraint on the target (unlike GB10), and paged KV's ~5-10x capacity (no over-reservation + prefix sharing) is what reaches the ceiling. The thesis holds on the target; remaining work is hardening/finishing the paged op (PR22569 was 12-13% slower and lacks prefix sharing). Assisted-by: Claude:opus-4.8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
170 lines
9.1 KiB
C++
170 lines
9.1 KiB
C++
// paged-loadgen: a dynamic-load benchmark for paged KV that actually exercises the
|
|
// regime where paging wins - variable prompt lengths, variable generation lengths,
|
|
// staggered (continuous) arrival, and a shared system prefix. The stock
|
|
// examples/paged/paged.cpp adds all requests up front with a fixed n_predict from a
|
|
// 20-prompt pool, so it never creates KV-memory pressure or fragmentation and
|
|
// therefore never shows a paged advantage (see PAGED_KV_HIGH_CONCURRENCY.md).
|
|
//
|
|
// Build: drop into PR #22569's examples/paged/ and add to its CMakeLists.txt next to
|
|
// llama-paged (it uses the same llama_paged_scheduler_* API). Run on the TARGET GPU
|
|
// (e.g. 2xH200) where bandwidth lets decode scale to thousands of sequences and KV
|
|
// memory becomes the binding constraint - that is where paged KV pays off and where
|
|
// this harness produces a meaningful number. On a low-bandwidth box (GB10) throughput
|
|
// plateaus long before memory binds, so the win is not observable there regardless.
|
|
//
|
|
// Metrics reported:
|
|
// - goodput (decode tokens/s aggregate) under the dynamic load
|
|
// - peak concurrent in-flight sequences actually sustained
|
|
// - paged peak KV bytes used vs the contiguous reservation a unified cache needs
|
|
// (n_seq_peak * max_ctx), i.e. the capacity ratio = the headroom paging unlocks
|
|
//
|
|
// The capacity ratio is the load-bearing number for the buy decision: it is how many
|
|
// more concurrent tenants a fixed HBM budget serves with paging than without.
|
|
|
|
#include "common.h"
|
|
#include "llama.h"
|
|
|
|
#include <cmath>
|
|
#include <cstdio>
|
|
#include <cstring>
|
|
#include <random>
|
|
#include <string>
|
|
#include <vector>
|
|
|
|
// ---- workload knobs (env-overridable so the harness is sweepable without rebuilds) ----
|
|
static int env_int(const char * k, int dflt) { const char * v = getenv(k); return v ? atoi(v) : dflt; }
|
|
|
|
struct workload_cfg {
|
|
int total_requests = env_int("LG_TOTAL", 2000); // total requests to serve
|
|
int target_inflight = env_int("LG_INFLIGHT", 256); // continuous-batching concurrency target
|
|
int prefix_tokens = env_int("LG_PREFIX", 512); // shared system-prompt prefix (prefix-cache target)
|
|
int suffix_min = env_int("LG_SUFMIN", 16); // per-request unique prompt suffix range
|
|
int suffix_max = env_int("LG_SUFMAX", 768);
|
|
int gen_short = env_int("LG_GENSHORT", 32); // bimodal generation: most short...
|
|
int gen_long = env_int("LG_GENLONG", 1024); // ...some long (the over-reservation driver)
|
|
int gen_long_pct = env_int("LG_LONGPCT", 15); // % of requests that are long
|
|
int block_size = env_int("LG_BLOCK", 16); // must match -kvbls
|
|
unsigned seed = (unsigned) env_int("LG_SEED", 1234);
|
|
};
|
|
|
|
// Per-request plan drawn from the workload distribution.
|
|
struct req_plan { int prompt_len; int gen_len; };
|
|
|
|
int main(int argc, char ** argv) {
|
|
common_params params;
|
|
params.n_predict = -1; // per-request, controlled by the plan below
|
|
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PAGED)) {
|
|
fprintf(stderr, "usage: %s -m <model> -kvp --fit off -ngpub N -ncpub M -ngl 99\n", argv[0]);
|
|
return 1;
|
|
}
|
|
params.kv_paged = true;
|
|
|
|
common_init_result init = common_init_from_params(params);
|
|
llama_model * model = init.model.get();
|
|
llama_context * ctx = init.context.get();
|
|
if (!model || !ctx) { fprintf(stderr, "load failed\n"); return 1; }
|
|
const llama_vocab * vocab = llama_model_get_vocab(model);
|
|
|
|
workload_cfg cfg;
|
|
std::mt19937 rng(cfg.seed);
|
|
std::uniform_int_distribution<int> suf(cfg.suffix_min, cfg.suffix_max);
|
|
std::uniform_int_distribution<int> pct(1, 100);
|
|
|
|
// KV bytes/token = 2(K,V) * n_layers * n_head_kv * head_dim * sizeof(f16). Confirmed
|
|
// against llama-kv-cache-paged.cpp (block_bytes formula). Used for the capacity ratio.
|
|
const int n_layers = llama_model_n_layer(model);
|
|
const int n_head_kv = llama_model_n_head_kv(model);
|
|
const int head_dim = llama_model_n_embd(model) / llama_model_n_head(model);
|
|
const size_t kv_bytes_per_token = (size_t)2 * n_layers * n_head_kv * head_dim * sizeof(uint16_t);
|
|
|
|
// A long shared system prefix that every request reuses (the prefix-cache target).
|
|
std::vector<llama_token> prefix = common_tokenize(ctx, std::string(cfg.prefix_tokens, 'x'), true);
|
|
|
|
// Pre-draw all request plans so paged peak usage and the contiguous reservation are
|
|
// computed from the SAME workload.
|
|
std::vector<req_plan> plans(cfg.total_requests);
|
|
int max_ctx = 0;
|
|
for (auto & p : plans) {
|
|
p.prompt_len = cfg.prefix_tokens + suf(rng);
|
|
p.gen_len = (pct(rng) <= cfg.gen_long_pct) ? cfg.gen_long : cfg.gen_short;
|
|
max_ctx = std::max(max_ctx, p.prompt_len + p.gen_len);
|
|
}
|
|
|
|
llama_paged_scheduler * sched = llama_paged_scheduler_init(ctx);
|
|
if (!sched) { fprintf(stderr, "scheduler init failed\n"); return 1; }
|
|
|
|
// ---- continuous-arrival loop: keep ~target_inflight requests live at all times ----
|
|
int next_req = 0, done = 0, inflight = 0, peak_inflight = 0;
|
|
long total_decoded = 0;
|
|
size_t peak_kv_bytes_paged = 0; // sum over live seqs of ceil(used/block)*block*kv_bytes
|
|
size_t live_used_tokens = 0; // running sum of actual KV tokens held by live seqs
|
|
|
|
auto admit = [&](int rid) {
|
|
const req_plan & p = plans[rid];
|
|
std::vector<llama_token> toks = prefix; // shared prefix...
|
|
std::vector<llama_token> suff = common_tokenize(ctx, std::string(p.prompt_len - cfg.prefix_tokens, 'y'), false);
|
|
toks.insert(toks.end(), suff.begin(), suff.end()); // ...+ unique suffix
|
|
if (llama_paged_scheduler_add_request(sched, toks.data(), toks.size(), rid)) {
|
|
inflight++; peak_inflight = std::max(peak_inflight, inflight);
|
|
live_used_tokens += p.prompt_len;
|
|
}
|
|
};
|
|
|
|
const int64_t t0 = ggml_time_us();
|
|
for (int i = 0; i < cfg.target_inflight && next_req < cfg.total_requests; ++i) admit(next_req++);
|
|
|
|
llama_batch batch = {};
|
|
std::vector<llama_token> sampled; std::vector<int8_t> stop_flags;
|
|
|
|
while (done < cfg.total_requests) {
|
|
if (!llama_paged_scheduler_prepare_batch(sched, &batch)) break;
|
|
const llama_paged_batch_info * info = llama_paged_scheduler_get_batch_info(sched);
|
|
sampled.assign(info->n_seq, 0); stop_flags.assign(info->n_seq, 0);
|
|
|
|
// (decode is done inside the scheduler/update path in PR #22569; greedy here)
|
|
for (int i = 0; i < info->n_seq; ++i) {
|
|
const int rid = info->seq_ids[i];
|
|
llama_paged_seq_state st{};
|
|
llama_paged_scheduler_get_seq_state(sched, rid, &st);
|
|
// greedy argmax from the i-th row of logits
|
|
const float * lg = llama_get_logits_ith(ctx, i);
|
|
int best = 0; float bv = lg[0];
|
|
for (int t = 1; t < llama_vocab_n_tokens(vocab); ++t) if (lg[t] > bv) { bv = lg[t]; best = t; }
|
|
sampled[i] = best;
|
|
const bool stop = llama_vocab_is_eog(vocab, best) || st.n_decoded + 1 >= plans[rid].gen_len;
|
|
stop_flags[i] = stop ? 1 : 0;
|
|
if (!stop) { total_decoded++; live_used_tokens++; }
|
|
if (stop) {
|
|
done++; inflight--;
|
|
live_used_tokens -= (plans[rid].prompt_len + st.n_decoded);
|
|
if (next_req < cfg.total_requests) admit(next_req++); // continuous arrival
|
|
}
|
|
}
|
|
// paged peak KV: blocks are allocated per live seq = ceil(used/block); approximate
|
|
// current paged footprint from live_used_tokens rounded up per the block size.
|
|
const size_t paged_now = (size_t)std::ceil((double)live_used_tokens / cfg.block_size)
|
|
* cfg.block_size * kv_bytes_per_token;
|
|
peak_kv_bytes_paged = std::max(peak_kv_bytes_paged, paged_now);
|
|
|
|
llama_paged_scheduler_update(sched, &batch, sampled.data(), stop_flags.data());
|
|
}
|
|
const double secs = (ggml_time_us() - t0) / 1e6;
|
|
|
|
// Contiguous unified-KV reservation needed to serve the SAME peak concurrency without
|
|
// mid-generation eviction: every live slot must be backed for the worst-case context.
|
|
const size_t contig_reserve = (size_t)peak_inflight * max_ctx * kv_bytes_per_token;
|
|
|
|
printf("\n==== paged-loadgen ====\n");
|
|
printf("requests served : %d (target inflight %d, peak inflight %d)\n", done, cfg.target_inflight, peak_inflight);
|
|
printf("goodput (decode) : %.1f tok/s (%ld tokens / %.2f s)\n", total_decoded / secs, total_decoded, secs);
|
|
printf("kv bytes / token : %zu (n_layer=%d n_head_kv=%d head_dim=%d f16)\n", kv_bytes_per_token, n_layers, n_head_kv, head_dim);
|
|
printf("paged peak KV : %.2f GiB (allocated on demand)\n", peak_kv_bytes_paged / 1073741824.0);
|
|
printf("contiguous reserve : %.2f GiB (peak_inflight * max_ctx %d)\n", contig_reserve / 1073741824.0, max_ctx);
|
|
printf("CAPACITY RATIO : %.2fx <- tenants-per-HBM paging unlocks\n",
|
|
peak_kv_bytes_paged ? (double)contig_reserve / peak_kv_bytes_paged : 0.0);
|
|
printf(" (plus cross-request prefix sharing of the %d-token shared prefix, not counted above)\n", cfg.prefix_tokens);
|
|
|
|
llama_paged_scheduler_free(sched);
|
|
return 0;
|
|
}
|