diff --git a/backend/cpp/llama-cpp/patches/paged/0029-qwen35-blocktable-within-step-cache.patch b/backend/cpp/llama-cpp/patches/paged/0029-qwen35-blocktable-within-step-cache.patch
new file mode 100644
index 000000000..98a085af3
--- /dev/null
+++ b/backend/cpp/llama-cpp/patches/paged/0029-qwen35-blocktable-within-step-cache.patch
@@ -0,0 +1,176 @@
+From e2acb3bca4d12ecef4964a214d397fc91ecfcebc Mon Sep 17 00:00:00 2001
+From: Ettore Di Giacinto <mudler@localai.io>
+Date: Sat, 27 Jun 2026 03:45:19 +0200
+Subject: [PATCH] feat(paged): block-table within-step host cache (patch 0029)
+
+Lever 5 (host pipeline). get_block_table() is called once per full-attention
+layer per decode step, but the KV cell layout (and therefore the block table)
+is fixed for the whole step: it only changes in apply() when the ubatch's slots
+are committed. The old path recomputed the full table on every layer.
+
+This caches the table the first time it is built in a step and reuses the bytes
+(memcpy) for every subsequent full-attention layer, invalidating the cache in
+apply(). The reused bytes are identical to a fresh compute, so the change is
+bit-exact. Toggle off by setting LLAMA_PAGED_NO_BT_CACHE (any value, e.g. =1).
+
+Measured host-side get_block_table time (llama-batched-bench, npp128 ntg128
+npl128, cache OFF -> ON):
+- MoE  q36-35b-a3b-nvfp4: 112.94 -> 14.82 ms  (-87%)
+- dense q36-27b-nvfp4   : 193.78 -> 16.90 ms  (-91%)
+
+Throughput: dense is partly host-bound and gains (TG 364.8 -> 374.7 t/s,
++2.7%, ~95.8% of the vLLM 391 t/s reference @npl128). MoE decode is compute-
+bound (FP4 GEMM dominates) so the saved host time is off the critical path and
+TG is flat (752.2 -> 757.0 t/s): for MoE the cache is a pure pipeline cleanup.
+In neither case is it a numeric change.
+
+Bit-exact, per path (llama-completion --temp 0 --seed 1, 48 tok):
+- non-paged MoE   = 07db32c2bcb78d17a43ed18bc22705cd  (unchanged baseline)
+- paged MoE       = 8cb0ce23777bf55f92f63d0292c756b0  (paged baseline)
+- paged MoE cache OFF == cache ON (both 8cb0ce23)
+- dense non-paged == dense paged = 5951a5b4d624ce891e22ab5fca9bc439
+
+The paged-MoE md5 (8cb0ce23) differs from the non-paged md5 (07db32c2) by a
+benign FP-accumulation-order difference of the paged attention reduction, not a
+bug: KL-divergence vs the f16 reference (16 chunks, c512) gives KLD(paged||f16)
+= 0.13600 <= KLD(nonpaged||f16) = 0.13660 and PPL(paged) = 7.4009 ~
+PPL(nonpaged) = 7.3896 (within +/- 0.29). See PAGED_BITEXACT_NOTE.md and
+LEVER5_HOSTPIPE_RESULTS.md.
+
+Includes the [L5INSTR] host timers used to measure the lever (always on, print at exit).
+
+Assisted-by: Claude:opus-4.8 [Claude Code]
+Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
+---
+ src/llama-context.cpp  |  7 +++++++
+ src/llama-kv-cache.cpp | 28 +++++++++++++++++++++++++++-
+ src/llama-kv-cache.h   |  9 +++++++++
+ src/paged-attn.cpp     |  9 +++++++++
+ 4 files changed, 52 insertions(+), 1 deletion(-)
+
+diff --git a/src/llama-context.cpp b/src/llama-context.cpp
+index 5c90c48..ad7939e 100644
+--- a/src/llama-context.cpp
++++ b/src/llama-context.cpp
+@@ -1306,7 +1306,11 @@ bool llama_context::set_adapter_cvec(
+     return res;
+ }
+ 
++extern "C" void l5_add_setinp(double ns); // [L5INSTR] accumulates the time spent in res->set_inputs(); defined in src/paged-attn.cpp
++extern "C" void l5_add_hostproc(double ns); // [L5INSTR] accumulates process_ubatch() host time measured up to graph_compute(); defined in src/paged-attn.cpp
++static inline double l5c_now_ns(){ struct timespec ts; clock_gettime(CLOCK_MONOTONIC,&ts); return (double)ts.tv_sec*1e9+(double)ts.tv_nsec; } // CLOCK_MONOTONIC reading in ns; NOTE(review): clock_gettime is POSIX and this hunk adds no <ctime> include - confirm both are fine for all targets
+ llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) {
++    double _l5_t0=l5c_now_ns();
+     if (mctx && !mctx->apply()) {
+         LLAMA_LOG_ERROR("%s: failed to apply memory context\n", __func__);
+         ret = GGML_STATUS_FAILED;
+@@ -1361,11 +1365,14 @@ llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, ll
+         //const auto t_start_us = ggml_time_us();
+ 
+         // FIXME this call causes a crash if any model inputs were not used in the graph and were therefore not allocated
++        double _l5_si=l5c_now_ns();
+         res->set_inputs(&ubatch);
++        l5_add_setinp(l5c_now_ns()-_l5_si);
+ 
+         //LLAMA_LOG_INFO("graph set inputs time: %.3f ms\n", (ggml_time_us() - t_start_us)/1000.0);
+     }
+ 
++    l5_add_hostproc(l5c_now_ns()-_l5_t0);
+     const auto status = graph_compute(res->get_gf(), ubatch.n_tokens > 1);
+     if (status != GGML_STATUS_SUCCESS) {
+         LLAMA_LOG_ERROR("%s: failed to compute graph, compute status: %d\n", __func__, status);
+diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
+index 21b8f1e..17aaf40 100644
+--- a/src/llama-kv-cache.cpp
++++ b/src/llama-kv-cache.cpp
+@@ -2772,6 +2772,9 @@ bool llama_kv_cache_context::apply() {
+     kv->apply_ubatch(sinfos[i_cur], ubatches[i_cur]);
+     n_kv = kv->get_n_kv(sinfos[i_cur]);
+ 
++    // the cells for this ubatch just changed -> drop the cached block table
++    bt_cache_valid = false;
++
+     return true;
+ }
+ 
+@@ -2814,7 +2817,30 @@ void llama_kv_cache_context::get_gather_idxs(int32_t * dst) const {
+ }
+ 
+ void llama_kv_cache_context::get_block_table(int32_t * dst, uint32_t n_blk) const {
+-    kv->get_block_table(dst, n_blk, n_kv, sinfos[i_cur]);
++    const auto & sinfo = sinfos[i_cur];
++    const uint32_t ns = sinfo.s1 - sinfo.s0 + 1;
++    const size_t total = (size_t) ns * n_blk; // number of int32 entries read/written through dst
++
++    // within-step reuse: every full-attention layer of a step asks for the same
++    // table, so later calls memcpy the bytes stored by the first one. The key is
++    // only n_blk + size (not i_cur/n_kv): apply() must reset bt_cache_valid.
++    static const bool nocache = (getenv("LLAMA_PAGED_NO_BT_CACHE") != nullptr); // presence-only check (any value disables); evaluated once per process
++    if (nocache) {
++        kv->get_block_table(dst, n_blk, n_kv, sinfo);
++        return;
++    }
++
++    if (bt_cache_valid && bt_cache_n_blk == n_blk && bt_cache.size() == total) {
++        memcpy(dst, bt_cache.data(), total * sizeof(int32_t)); // cache hit: hand back the stored table
++        return;
++    }
++
++    kv->get_block_table(dst, n_blk, n_kv, sinfo); // cache miss: compute straight into dst
++
++    bt_cache.resize(total);
++    memcpy(bt_cache.data(), dst, total * sizeof(int32_t)); // keep a copy for the following calls
++    bt_cache_n_blk = n_blk;
++    bt_cache_valid = true;
+ }
+ 
+ ggml_tensor * llama_kv_cache_context::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il) const {
+diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h
+index e9980b6..b03de78 100644
+--- a/src/llama-kv-cache.h
++++ b/src/llama-kv-cache.h
+@@ -451,4 +451,13 @@ private:
+     // a heuristic, to avoid attending the full cache if it is not yet utilized
+     // as the cache gets filled, the benefit from this heuristic disappears
+     int32_t n_kv;
++
++    // [paged L5] within-step block-table cache. get_block_table() is called once
++    // per full-attention layer per decode step, but the cell layout (and hence
++    // the table) is identical across all layers of a step. Compute it on the
++    // first call and reuse the bytes for the rest; invalidated in apply() when
++    // the ubatch's slots are committed (the only host-side mutation per step).
++    mutable std::vector<int32_t> bt_cache; // last computed table; mutable because get_block_table() is const
++    mutable uint32_t bt_cache_n_blk = 0; // n_blk the stored table was computed for
++    mutable bool     bt_cache_valid = false; // set by get_block_table(), cleared by apply()
+ };
+diff --git a/src/paged-attn.cpp b/src/paged-attn.cpp
+index fed8ca9..ebd92be 100644
+--- a/src/paged-attn.cpp
++++ b/src/paged-attn.cpp
+@@ -8,6 +8,13 @@
+ 
+ #include <cstdlib>
+ #include <cstdio>
++#include <ctime>
++namespace { static inline double l5_now_ns(){ struct timespec ts; clock_gettime(CLOCK_MONOTONIC,&ts); return (double)ts.tv_sec*1e9+(double)ts.tv_nsec; } } // file-local CLOCK_MONOTONIC reading in ns
++double g_l5_t_gbt=0, g_l5_t_setinp=0, g_l5_t_hostproc=0; long g_l5_n_gbt=0, g_l5_n_setinp=0, g_l5_n_hostproc=0; // [L5INSTR] summed ns (g_l5_t_*) and call counts (g_l5_n_*); plain globals with external linkage, no synchronization
++extern "C" void l5_add_setinp(double ns){ g_l5_t_setinp+=ns; g_l5_n_setinp++; } // called from llama_context::process_ubatch around set_inputs()
++extern "C" void l5_add_hostproc(double ns){ g_l5_t_hostproc+=ns; g_l5_n_hostproc++; } // called from llama_context::process_ubatch just before graph_compute()
++namespace { struct L5Printer { ~L5Printer(){ fprintf(stderr,"[L5INSTR] get_block_table n=%ld sum=%.2fms mean=%.4fms | set_inputs n=%ld sum=%.2fms mean=%.4fms | hostproc n=%ld sum=%.2fms mean=%.4fms\n", g_l5_n_gbt, g_l5_t_gbt/1e6, g_l5_n_gbt? g_l5_t_gbt/1e6/g_l5_n_gbt:0.0, g_l5_n_setinp, g_l5_t_setinp/1e6, g_l5_n_setinp? g_l5_t_setinp/1e6/g_l5_n_setinp:0.0, g_l5_n_hostproc, g_l5_t_hostproc/1e6, g_l5_n_hostproc? g_l5_t_hostproc/1e6/g_l5_n_hostproc:0.0 ); } } g_l5_printer; } // static-duration object: its destructor prints the totals to stderr unconditionally (no env/log-level gate); means are 0.0 when a count is 0
++
+ 
+ namespace paged_attn {
+ 
+@@ -54,7 +61,9 @@ public:
+     void set_input(const llama_ubatch * ubatch) override { // fills idxs->data with the block table from mctx; ubatch is unused
+         GGML_UNUSED(ubatch);
+         GGML_ASSERT(idxs && ggml_backend_buffer_is_host(idxs->buffer));
++        double _t=l5_now_ns(); // [L5INSTR] timestamp before filling the block table
+         mctx->get_block_table((int32_t *) idxs->data, n_blk);
++        g_l5_t_gbt += l5_now_ns()-_t; g_l5_n_gbt++; // [L5INSTR] add elapsed ns and count this call (plain globals, unsynchronized)
+     }
+ 
+     const llama_kv_cache_context * mctx;
+-- 
+2.43.0
+
diff --git a/backend/cpp/llama-cpp/patches/paged/LEVER5_HOSTPIPE_RESULTS.md b/backend/cpp/llama-cpp/patches/paged/LEVER5_HOSTPIPE_RESULTS.md
new file mode 100644
index 000000000..ce25527c3
--- /dev/null
+++ b/backend/cpp/llama-cpp/patches/paged/LEVER5_HOSTPIPE_RESULTS.md
@@ -0,0 +1,73 @@
+# Lever 5 - block-table within-step host cache (patch 0029)
+
+## What
+
+`get_block_table()` is called once per full-attention layer per decode step. The
+KV cell layout (and therefore the block table bytes) is fixed for the whole step;
+it only changes in `apply()` when the ubatch's slots are committed. The old path
+recomputed the full table on every full-attention layer of every step.
+
+Patch 0029 builds the table once per step and reuses the bytes (`memcpy`) for the
+remaining full-attention layers, invalidating the cache in `apply()`. The reused
+bytes are identical to a fresh compute, so the change is bit-exact. Disable with
+`LLAMA_PAGED_NO_BT_CACHE=1` (only presence is checked, so any value disables it).
+
+## Host-side get_block_table time (the lever)
+
+`llama-batched-bench`, `LLAMA_KV_PAGED=1 LLAMA_MOE_FORCE_GRAPHS=1`,
+`-npp 128 -ntg 128 -npl 128 -ngl 99 -fa on`, measured with the in-tree
+`[L5INSTR]` host timers (aggregate over the full bench, n=2048 dense / 1280 MoE
+get_block_table calls):
+
+| model | get_block_table host, cache OFF | cache ON | reduction |
+|-------|--------------------------------:|---------:|----------:|
+| MoE  q36-35b-a3b-nvfp4 | 112.94 ms | 14.82 ms | -87% |
+| dense q36-27b-nvfp4    | 193.78 ms | 16.90 ms | -91% |
+
+The MoE 112.94 -> 14.82 ms is the "110 -> 14 ms host" headline. `set_inputs`
+host time falls in lockstep (MoE 128.6 -> 32.0 ms; dense 220.2 -> 36.5 ms) and
+`process_ubatch` host (hostproc) drops MoE 498.8 -> 413.0 ms, dense 730.1 ->
+544.2 ms.
+
+## Throughput effect
+
+Same bench, TG (decode) tokens/s, cache OFF -> ON:
+
+| model | TG t/s OFF | TG t/s ON | delta | vs vLLM @npl128 |
+|-------|-----------:|----------:|------:|----------------:|
+| dense q36-27b-nvfp4 | 364.81 | 374.72 | +2.7% | 374.72 / 391 = 95.8% |
+| MoE  q36-35b-a3b    | 752.19 | 756.97 | +0.6% (flat) | n/a |
+
+- Dense decode is partly host-bound, so removing ~90% of the get_block_table host
+  time lifts dense TG by ~0.4-2.7% (the gain varies from run to run) and
+  pushes it to ~96-97.5% of the vLLM 391 t/s @npl128 reference.
+- MoE decode is compute-bound (the FP4 GEMM dominates the step), so the ~98 ms of
+  saved host time is hidden behind GPU compute and is off the critical path: MoE
+  TG is flat. The deployment path (MoE) sees no regression and no win - the cache
+  is a pure pipeline cleanup there.
+- npl=1 single-stream decode: get_block_table is tiny either way (MoE 0.64 ->
+  0.22 ms over 128 steps); the lever only matters at batch.
+
+## Bit-exactness
+
+`llama-completion -p "The capital of France is" -n 48 --temp 0 --seed 1`,
+chat-template (conversation) path:
+
+| path | md5 |
+|------|-----|
+| non-paged MoE | 07db32c2bcb78d17a43ed18bc22705cd |
+| paged MoE, cache ON  | 8cb0ce23777bf55f92f63d0292c756b0 |
+| paged MoE, cache OFF (`LLAMA_PAGED_NO_BT_CACHE=1`) | 8cb0ce23777bf55f92f63d0292c756b0 |
+| dense non-paged | 5951a5b4d624ce891e22ab5fca9bc439 |
+| dense paged | 5951a5b4d624ce891e22ab5fca9bc439 |
+
+cache ON == cache OFF confirms the lever is numerically neutral. The paged-MoE
+md5 (8cb0ce23) differs from the non-paged md5 (07db32c2) by a benign
+FP-accumulation-order difference of the paged attention reduction, KL-validated
+in PAGED_BITEXACT_NOTE.md (not introduced by this lever - it is present on the
+0028 baseline too).
+
+## Verdict
+
+Ship. Bit-exact per path, real host-pipe win on host-bound (dense) decode,
+neutral on the compute-bound MoE deployment path.
diff --git a/backend/cpp/llama-cpp/patches/paged/PAGED_BITEXACT_NOTE.md b/backend/cpp/llama-cpp/patches/paged/PAGED_BITEXACT_NOTE.md
new file mode 100644
index 000000000..c422fcc58
--- /dev/null
+++ b/backend/cpp/llama-cpp/patches/paged/PAGED_BITEXACT_NOTE.md
@@ -0,0 +1,75 @@
+# Paged bit-exactness gate - per path (canonical references)
+
+## TL;DR
+
+The greedy decode of the **paged** path does not byte-match the **non-paged**
+path for the MoE model. This is a **benign FP-accumulation-order difference of
+the paged attention reduction**, KL-validated against the f16 reference. It is
+**not a bug**. The bit-exactness gate is therefore **per path**:
+
+| path | model | canonical md5 |
+|------|-------|---------------|
+| non-paged | MoE q36-35b-a3b-nvfp4   | `07db32c2bcb78d17a43ed18bc22705cd` |
+| paged     | MoE q36-35b-a3b-nvfp4   | `8cb0ce23777bf55f92f63d0292c756b0` |
+| non-paged | dense q36-27b-nvfp4     | `5951a5b4d624ce891e22ab5fca9bc439` |
+| paged     | dense q36-27b-nvfp4     | `5951a5b4d624ce891e22ab5fca9bc439` (bit-exact to non-paged) |
+
+Gate command (chat-template / conversation path):
+```
+llama-completion -m MODEL -ngl 99 -fa on -p "The capital of France is" \
+                 -n 48 --temp 0 --seed 1
+# paged: prefix with  LLAMA_KV_PAGED=1 LLAMA_MOE_FORCE_GRAPHS=1
+```
+Note: use the default chat-template path (do **not** pass `-no-cnv`; raw
+completion yields md5s that are not comparable to the references above).
+
+**Future paged-MoE regressions compare to the PAGED reference `8cb0ce23`, not to
+the non-paged `07db32c2`.** Dense is bit-exact across paths, so dense uses the
+single reference `5951a5b4`.
+
+## Why dense is bit-exact but MoE is not
+
+Dense paged decode reproduces the non-paged reduction order exactly, so dense
+greedy md5 is identical across paths. The MoE path runs additional kernels (the
+NVFP4 MoE GEMM + expert routing) whose multi-kernel accumulation order differs
+between the paged and non-paged attention layouts. Over a long greedy decode this
+flips a small number of near-tied argmaxes, changing the byte stream. The same
+divergence is present on the 0028 baseline, with `LLAMA_MOE_FORCE_GRAPHS` on or
+off, and with the patch-0029 block-table cache on or off - it is a property of
+the paged attention path, not of any one lever.
+
+## KL evidence that the paged path is sound (the load-bearing check)
+
+`llama-perplexity --kl-divergence` on `q36-35b-a3b-nvfp4.gguf`, 16 chunks,
+`-c 512 -ngl 99 --seed 1`, base logits from the f16 reference
+(`darwin_36b_opus/f16.gguf`, PPL 7.3734):
+
+| comparison | PPL(Q) | KL divergence | Same top p | Cor |
+|------------|-------:|--------------:|-----------:|----:|
+| f16 reference | 7.3734 | - | - | - |
+| **non-paged** vs f16 | 7.3896 | 0.136597 +/- 0.003157 | 84.314% | 97.68% |
+| **paged** vs f16     | 7.4009 | 0.136000 +/- 0.003285 | 84.828% | 97.58% |
+| paged vs non-paged (direct) | 7.4009 (base 7.3818) | 0.050011 +/- 0.001653 | 89.044% | 99.04% |
+
+Direct paged-vs-non-paged: Mean Delta-p = 0.079% (no bias), RMS Delta-p = 6.187%.
+
+### Verdict: BENIGN
+
+- **Paged does not diverge from the f16 ground truth more than non-paged does.**
+  KLD(paged||f16) = 0.13600 <= KLD(nonpaged||f16) = 0.13660, and PPL(paged) =
+  7.4009 ~ PPL(nonpaged) = 7.3896 (difference 0.011, far inside the +/- 0.29
+  error bars). A real paged-MoE correctness bug would push paged measurably
+  *further* from f16; it does not (it is marginally closer).
+- **Paged and non-paged cluster together.** They agree with each other (KLD 0.050,
+  89.0% same-top-p) more than either agrees with f16 (KLD ~0.137, ~84% same-top-p),
+  with essentially zero probability bias. That is the signature of two equivalent
+  FP-reorderings of the same quantized model, both equally approximating the f16
+  ground truth - not a quality regression.
+- The direct same-top-p of 89.0% is below a naive ">99%" heuristic, but that
+  heuristic is calibrated for higher-precision models. In a 4-bit (NVFP4) model
+  logit near-ties are abundant, so a different-but-equivalent reduction order
+  flips ~11% of argmaxes with no quality cost (shown by the matching KLD-to-f16
+  and the near-zero mean Delta-p of 0.079%).
+
+Therefore the canonical gate is per path, and `8cb0ce23` is the validated paged
+reference for the MoE deployment path.