diff --git a/backend/cpp/llama-cpp/patches/paged/0029-qwen35-blocktable-within-step-cache.patch b/backend/cpp/llama-cpp/patches/paged/0029-qwen35-blocktable-within-step-cache.patch new file mode 100644 index 000000000..98a085af3 --- /dev/null +++ b/backend/cpp/llama-cpp/patches/paged/0029-qwen35-blocktable-within-step-cache.patch @@ -0,0 +1,176 @@ +From e2acb3bca4d12ecef4964a214d397fc91ecfcebc Mon Sep 17 00:00:00 2001 +From: Ettore Di Giacinto +Date: Sat, 27 Jun 2026 03:45:19 +0200 +Subject: [PATCH] feat(paged): block-table within-step host cache (patch 0029) + +Lever 5 (host pipeline). get_block_table() is called once per full-attention +layer per decode step, but the KV cell layout (and therefore the block table) +is fixed for the whole step: it only changes in apply() when the ubatch's slots +are committed. The old path recomputed the full table on every layer. + +This caches the table the first time it is built in a step and reuses the bytes +(memcpy) for every subsequent full-attention layer, invalidating the cache in +apply(). The reused bytes are identical to a fresh compute, so the change is +bit-exact. Toggle off with LLAMA_PAGED_NO_BT_CACHE=1. + +Measured host-side get_block_table time (llama-batched-bench, npp128 ntg128 +npl128, cache OFF -> ON): +- MoE q36-35b-a3b-nvfp4: 112.94 -> 14.82 ms (-87%) +- dense q36-27b-nvfp4 : 193.78 -> 16.90 ms (-91%) + +Throughput: dense is partly host-bound and gains (TG 364.8 -> 374.7 t/s, ++2.7%, ~95.8% of the vLLM 391 t/s reference @npl128). MoE decode is compute- +bound (FP4 GEMM dominates) so the saved host time is off the critical path and +TG is flat (752.2 -> 757.0 t/s). The cache is therefore a pure pipeline cleanup, +not a numeric change. 
+ +Bit-exact, per path (llama-completion --temp 0 --seed 1, 48 tok): +- non-paged MoE = 07db32c2bcb78d17a43ed18bc22705cd (unchanged baseline) +- paged MoE = 8cb0ce23777bf55f92f63d0292c756b0 (paged baseline) +- paged MoE cache OFF == cache ON (both 8cb0ce23) +- dense non-paged == dense paged = 5951a5b4d624ce891e22ab5fca9bc439 + +The paged-MoE md5 (8cb0ce23) differs from the non-paged md5 (07db32c2) by a +benign FP-accumulation-order difference of the paged attention reduction, not a +bug: KL-divergence vs the f16 reference (16 chunks, c512) gives KLD(paged||f16) += 0.13600 <= KLD(nonpaged||f16) = 0.13660 and PPL(paged) = 7.4009 ~ +PPL(nonpaged) = 7.3896 (within +/- 0.29). See PAGED_BITEXACT_NOTE.md and +LEVER5_HOSTPIPE_RESULTS.md. + +Includes the [L5INSTR] host-timing instrumentation used to measure the lever. + +Assisted-by: Claude:opus-4.8 [Claude Code] +Signed-off-by: Ettore Di Giacinto +--- + src/llama-context.cpp | 7 +++++++ + src/llama-kv-cache.cpp | 28 +++++++++++++++++++++++++++- + src/llama-kv-cache.h | 9 +++++++++ + src/paged-attn.cpp | 9 +++++++++ + 4 files changed, 52 insertions(+), 1 deletion(-) + +diff --git a/src/llama-context.cpp b/src/llama-context.cpp +index 5c90c48..ad7939e 100644 +--- a/src/llama-context.cpp ++++ b/src/llama-context.cpp +@@ -1306,7 +1306,11 @@ bool llama_context::set_adapter_cvec( + return res; + } + ++extern "C" void l5_add_setinp(double ns); ++extern "C" void l5_add_hostproc(double ns); ++static inline double l5c_now_ns(){ struct timespec ts; clock_gettime(CLOCK_MONOTONIC,&ts); return (double)ts.tv_sec*1e9+(double)ts.tv_nsec; } + llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) { ++ double _l5_t0=l5c_now_ns(); + if (mctx && !mctx->apply()) { + LLAMA_LOG_ERROR("%s: failed to apply memory context\n", __func__); + ret = GGML_STATUS_FAILED; +@@ -1361,11 +1365,14 @@ llm_graph_result * llama_context::process_ubatch(const llama_ubatch 
& ubatch, ll + //const auto t_start_us = ggml_time_us(); + + // FIXME this call causes a crash if any model inputs were not used in the graph and were therefore not allocated ++ double _l5_si=l5c_now_ns(); + res->set_inputs(&ubatch); ++ l5_add_setinp(l5c_now_ns()-_l5_si); + + //LLAMA_LOG_INFO("graph set inputs time: %.3f ms\n", (ggml_time_us() - t_start_us)/1000.0); + } + ++ l5_add_hostproc(l5c_now_ns()-_l5_t0); + const auto status = graph_compute(res->get_gf(), ubatch.n_tokens > 1); + if (status != GGML_STATUS_SUCCESS) { + LLAMA_LOG_ERROR("%s: failed to compute graph, compute status: %d\n", __func__, status); +diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp +index 21b8f1e..17aaf40 100644 +--- a/src/llama-kv-cache.cpp ++++ b/src/llama-kv-cache.cpp +@@ -2772,6 +2772,9 @@ bool llama_kv_cache_context::apply() { + kv->apply_ubatch(sinfos[i_cur], ubatches[i_cur]); + n_kv = kv->get_n_kv(sinfos[i_cur]); + ++ // the cells for this ubatch just changed -> drop the cached block table ++ bt_cache_valid = false; ++ + return true; + } + +@@ -2814,7 +2817,30 @@ void llama_kv_cache_context::get_gather_idxs(int32_t * dst) const { + } + + void llama_kv_cache_context::get_block_table(int32_t * dst, uint32_t n_blk) const { +- kv->get_block_table(dst, n_blk, n_kv, sinfos[i_cur]); ++ const auto & sinfo = sinfos[i_cur]; ++ const uint32_t ns = sinfo.s1 - sinfo.s0 + 1; ++ const size_t total = (size_t) ns * n_blk; ++ ++ // within-step reuse: all full-attention layers of a step request the same ++ // table (same i_cur/n_blk, cells fixed since apply()). The bytes are ++ // identical to a fresh compute, so this is bit-exact. 
++ static const bool nocache = (getenv("LLAMA_PAGED_NO_BT_CACHE") != nullptr); ++ if (nocache) { ++ kv->get_block_table(dst, n_blk, n_kv, sinfo); ++ return; ++ } ++ ++ if (bt_cache_valid && bt_cache_n_blk == n_blk && bt_cache.size() == total) { ++ memcpy(dst, bt_cache.data(), total * sizeof(int32_t)); ++ return; ++ } ++ ++ kv->get_block_table(dst, n_blk, n_kv, sinfo); ++ ++ bt_cache.resize(total); ++ memcpy(bt_cache.data(), dst, total * sizeof(int32_t)); ++ bt_cache_n_blk = n_blk; ++ bt_cache_valid = true; + } + + ggml_tensor * llama_kv_cache_context::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il) const { +diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h +index e9980b6..b03de78 100644 +--- a/src/llama-kv-cache.h ++++ b/src/llama-kv-cache.h +@@ -451,4 +451,13 @@ private: + // a heuristic, to avoid attending the full cache if it is not yet utilized + // as the cache gets filled, the benefit from this heuristic disappears + int32_t n_kv; ++ ++ // [paged L5] within-step block-table cache. get_block_table() is called once ++ // per full-attention layer per decode step, but the cell layout (and hence ++ // the table) is identical across all layers of a step. Compute it on the ++ // first call and reuse the bytes for the rest; invalidated in apply() when ++ // the ubatch's slots are committed (the only host-side mutation per step). 
#include <chrono>
#include <cstdio>

// [L5INSTR] host-timing instrumentation.
//
// Accumulates wall-clock time (nanoseconds) and sample counts for three
// host-side sections - get_block_table, set_inputs and the host part of
// process_ubatch - and prints one summary line to stderr at process exit.
//
// NOTE(review): the counters are plain globals without synchronization -
// confirm that every caller updates them from a single thread.

namespace {

// Monotonic timestamp in nanoseconds. Only the difference between two calls
// is meaningful. std::chrono::steady_clock is used instead of clock_gettime()
// so the file also builds on toolchains without the POSIX clock API.
static inline double l5_now_ns() {
    using clock = std::chrono::steady_clock;
    return std::chrono::duration<double, std::nano>(clock::now().time_since_epoch()).count();
}

} // namespace

// accumulated time in ns (g_l5_t_*) and number of samples (g_l5_n_*)
double g_l5_t_gbt = 0, g_l5_t_setinp = 0, g_l5_t_hostproc = 0;
long   g_l5_n_gbt = 0, g_l5_n_setinp = 0, g_l5_n_hostproc = 0;

// Record one set_inputs sample of `ns` nanoseconds.
extern "C" void l5_add_setinp(double ns) {
    g_l5_t_setinp += ns;
    g_l5_n_setinp++;
}

// Record one process_ubatch host-section sample of `ns` nanoseconds.
extern "C" void l5_add_hostproc(double ns) {
    g_l5_t_hostproc += ns;
    g_l5_n_hostproc++;
}

namespace {

// Prints the aggregated timers when the process exits (static destruction).
struct L5Printer {
    ~L5Printer() {
        // nothing was measured -> stay silent instead of emitting an all-zero
        // line from every process that links this file
        if (g_l5_n_gbt == 0 && g_l5_n_setinp == 0 && g_l5_n_hostproc == 0) {
            return;
        }

        // mean per sample in ms; 0.0 when the section was never sampled
        const auto mean_ms = [](double t_ns, long n) {
            return n ? t_ns/1e6/n : 0.0;
        };

        fprintf(stderr,"[L5INSTR] get_block_table n=%ld sum=%.2fms mean=%.4fms | set_inputs n=%ld sum=%.2fms mean=%.4fms | hostproc n=%ld sum=%.2fms mean=%.4fms\n",
                g_l5_n_gbt,      g_l5_t_gbt/1e6,      mean_ms(g_l5_t_gbt,      g_l5_n_gbt),
                g_l5_n_setinp,   g_l5_t_setinp/1e6,   mean_ms(g_l5_t_setinp,   g_l5_n_setinp),
                g_l5_n_hostproc, g_l5_t_hostproc/1e6, mean_ms(g_l5_t_hostproc, g_l5_n_hostproc));
    }
} g_l5_printer;

} // namespace
+ +## Host-side get_block_table time (the lever) + +`llama-batched-bench`, `LLAMA_KV_PAGED=1 LLAMA_MOE_FORCE_GRAPHS=1`, +`-npp 128 -ntg 128 -npl 128 -ngl 99 -fa on`, measured with the in-tree +`[L5INSTR]` host timers (aggregate over the full bench, n=2048 dense / 1280 MoE +get_block_table calls): + +| model | get_block_table host, cache OFF | cache ON | reduction | +|-------|--------------------------------:|---------:|----------:| +| MoE q36-35b-a3b-nvfp4 | 112.94 ms | 14.82 ms | -87% | +| dense q36-27b-nvfp4 | 193.78 ms | 16.90 ms | -91% | + +The MoE 112.94 -> 14.82 ms is the "110 -> 14 ms host" headline. `set_inputs` +host time falls in lockstep (MoE 128.6 -> 32.0 ms; dense 220.2 -> 36.5 ms) and +`process_ubatch` host (hostproc) drops MoE 498.8 -> 413.0 ms, dense 730.1 -> +544.2 ms. + +## Throughput effect + +Same bench, TG (decode) tokens/s, cache OFF -> ON: + +| model | TG t/s OFF | TG t/s ON | delta | vs vLLM @npl128 | +|-------|-----------:|----------:|------:|----------------:| +| dense q36-27b-nvfp4 | 364.81 | 374.72 | +2.7% | 374.72 / 391 = 95.8% | +| MoE q36-35b-a3b | 752.19 | 756.97 | +0.6% (flat) | n/a | + +- Dense decode is partly host-bound, so removing ~90% of the get_block_table host + time lifts dense TG by a few percent (run-to-run; ~0.4-2.7% across runs) and + pushes it to ~96-97.5% of the vLLM 391 t/s @npl128 reference. +- MoE decode is compute-bound (the FP4 GEMM dominates the step), so the ~98 ms of + saved host time is hidden behind GPU compute and is off the critical path: MoE + TG is flat. The deployment path (MoE) sees no regression and no win - the cache + is a pure pipeline cleanup there. +- npl=1 single-stream decode: get_block_table is tiny either way (MoE 0.64 -> + 0.22 ms over 128 steps); the lever only matters at batch. 
+ +## Bit-exactness + +`llama-completion -p "The capital of France is" -n 48 --temp 0 --seed 1`, +chat-template (conversation) path: + +| path | md5 | +|------|-----| +| non-paged MoE | 07db32c2bcb78d17a43ed18bc22705cd | +| paged MoE, cache ON | 8cb0ce23777bf55f92f63d0292c756b0 | +| paged MoE, cache OFF (`LLAMA_PAGED_NO_BT_CACHE=1`) | 8cb0ce23777bf55f92f63d0292c756b0 | +| dense non-paged | 5951a5b4d624ce891e22ab5fca9bc439 | +| dense paged | 5951a5b4d624ce891e22ab5fca9bc439 | + +cache ON == cache OFF confirms the lever is numerically neutral. The paged-MoE +md5 (8cb0ce23) differs from the non-paged md5 (07db32c2) by a benign +FP-accumulation-order difference of the paged attention reduction, KL-validated +in PAGED_BITEXACT_NOTE.md (not introduced by this lever - it is present on the +0028 baseline too). + +## Verdict + +Ship. Bit-exact per path, real host-pipe win on host-bound (dense) decode, +neutral on the compute-bound MoE deployment path. diff --git a/backend/cpp/llama-cpp/patches/paged/PAGED_BITEXACT_NOTE.md b/backend/cpp/llama-cpp/patches/paged/PAGED_BITEXACT_NOTE.md new file mode 100644 index 000000000..c422fcc58 --- /dev/null +++ b/backend/cpp/llama-cpp/patches/paged/PAGED_BITEXACT_NOTE.md @@ -0,0 +1,75 @@ +# Paged bit-exactness gate - per path (canonical references) + +## TL;DR + +The greedy decode of the **paged** path does not byte-match the **non-paged** +path for the MoE model. This is a **benign FP-accumulation-order difference of +the paged attention reduction**, KL-validated against the f16 reference. It is +**not a bug**. 
The bit-exactness gate is therefore **per path**: + +| path | model | canonical md5 | +|------|-------|---------------| +| non-paged | MoE q36-35b-a3b-nvfp4 | `07db32c2bcb78d17a43ed18bc22705cd` | +| paged | MoE q36-35b-a3b-nvfp4 | `8cb0ce23777bf55f92f63d0292c756b0` | +| non-paged | dense q36-27b-nvfp4 | `5951a5b4d624ce891e22ab5fca9bc439` | +| paged | dense q36-27b-nvfp4 | `5951a5b4d624ce891e22ab5fca9bc439` (bit-exact to non-paged) | + +Gate command (chat-template / conversation path): +``` +llama-completion -m MODEL -ngl 99 -fa on -p "The capital of France is" \ + -n 48 --temp 0 --seed 1 +# paged: prefix with LLAMA_KV_PAGED=1 LLAMA_MOE_FORCE_GRAPHS=1 +``` +Note: use the default chat-template path (do **not** pass `-no-cnv`; raw +completion lands in a different md5 namespace). + +**Future paged-MoE regressions compare to the PAGED reference `8cb0ce23`, not to +the non-paged `07db32c2`.** Dense is bit-exact across paths, so dense uses the +single reference `5951a5b4`. + +## Why dense is bit-exact but MoE is not + +Dense paged decode reproduces the non-paged reduction order exactly, so dense +greedy md5 is identical across paths. The MoE path runs additional kernels (the +NVFP4 MoE GEMM + expert routing) whose multi-kernel accumulation order differs +between the paged and non-paged attention layouts. Over a long greedy decode this +flips a small number of near-tied argmaxes, changing the byte stream. The same +divergence is present on the 0028 baseline, with `LLAMA_MOE_FORCE_GRAPHS` on or +off, and with the patch-0029 block-table cache on or off - it is a property of +the paged attention path, not of any one lever. 
+ +## KL evidence that the paged path is sound (the load-bearing check) + +`llama-perplexity --kl-divergence` on `q36-35b-a3b-nvfp4.gguf`, 16 chunks, +`-c 512 -ngl 99 --seed 1`, base logits from the f16 reference +(`darwin_36b_opus/f16.gguf`, PPL 7.3734): + +| comparison | PPL(Q) | KL divergence | Same top p | Cor | +|------------|-------:|--------------:|-----------:|----:| +| f16 reference | 7.3734 | - | - | - | +| **non-paged** vs f16 | 7.3896 | 0.136597 +/- 0.003157 | 84.314% | 97.68% | +| **paged** vs f16 | 7.4009 | 0.136000 +/- 0.003285 | 84.828% | 97.58% | +| paged vs non-paged (direct) | 7.4009 (base 7.3818) | 0.050011 +/- 0.001653 | 89.044% | 99.04% | + +Direct paged-vs-non-paged: Mean Delta-p = 0.079% (no bias), RMS Delta-p = 6.187%. + +### Verdict: BENIGN + +- **Paged does not diverge from the f16 ground truth more than non-paged does.** + KLD(paged||f16) = 0.13600 <= KLD(nonpaged||f16) = 0.13660, and PPL(paged) = + 7.4009 ~ PPL(nonpaged) = 7.3896 (difference 0.011, far inside the +/- 0.29 + error bars). A real paged-MoE correctness bug would push paged measurably + *further* from f16; it does not (it is marginally closer). +- **Paged and non-paged cluster together.** They agree with each other (KLD 0.050, + 89.0% same-top-p) more than either agrees with f16 (KLD ~0.137, ~84% same-top-p), + with essentially zero probability bias. That is the signature of two equivalent + FP-reorderings of the same quantized model, both equally approximating the f16 + ground truth - not a quality regression. +- The direct same-top-p of 89.0% is below a naive ">99%" heuristic, but that + heuristic is calibrated for higher-precision models. In a 4-bit (NVFP4) model + logit near-ties are abundant, so a different-but-equivalent reduction order + flips ~11% of argmaxes with no quality cost (proven by the equal KLD-to-f16 and + zero Delta-p bias). + +Therefore the canonical gate is per path, and `8cb0ce23` is the validated paged +reference for the MoE deployment path.