From bbc84a9889f8242e8b63c012e41f9d7541ac3e0c Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto <mudler@localai.io>
Date: Fri, 19 Jun 2026 08:51:42 +0000
Subject: [PATCH] feat(paged): Gate 0 in-model - token-identical generation
 with paged KV placement

Wire paged, non-contiguous fixed-size BLOCK placement into the real
llama.cpp KV cache (find_slot), behind env LLAMA_KV_PAGED, and validate
Gate 0 on a real GGUF: Qwen3-0.6B greedy generation is TOKEN-IDENTICAL to
the contiguous cache while its KV is physically scattered across permuted
blocks (cells 0-15, 144-159, 32-47, ...). Proven non-contiguous via
LLAMA_KV_PAGED_DEBUG, not a silent fallback.

This validates the correctness premise of paged attention IN THE MODEL (not
just at the ggml-op level): attention is invariant to physical KV placement,
because reads use per-cell pos/seq metadata for masking. The patch lives at
patches/0001-paged-kv-block-placement.patch (against llama.cpp 0253fb21f).

Scope: storage/placement layer, single sequence. Remaining (P4): the
gather-read compute path (attend only a seq's own blocks) for the throughput
win, and the multi-sequence driver. README updated with repro + status.

Assisted-by: Claude:opus-4.8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
---
 backend/cpp/llama-cpp/paged/README.md         | 26 ++++++--
 .../0001-paged-kv-block-placement.patch       | 59 +++++++++++++++++++
 2 files changed, 80 insertions(+), 5 deletions(-)
 create mode 100644 backend/cpp/llama-cpp/paged/patches/0001-paged-kv-block-placement.patch

diff --git a/backend/cpp/llama-cpp/paged/README.md b/backend/cpp/llama-cpp/paged/README.md
index b593866fc..77a600443 100644
--- a/backend/cpp/llama-cpp/paged/README.md
+++ b/backend/cpp/llama-cpp/paged/README.md
@@ -16,12 +16,28 @@ Plan:   `docs/superpowers/plans/2026-06-19-paged-attention-llamacpp.md`
 | P1 | ggml paged write/gather mechanism (`set_rows` by slot_mapping → `get_rows` gather) | ✅ verified — `make ggml-check`, non-contiguous blocks `[2,1,5]` round-trip + isolation |
 | P2 (core) | attention over gathered paged KV matches independent host reference | ✅ verified — max abs err **7.5e-08** |
 | P3 (partial) | capacity & prefix-sharing wins | ✅ measured — `make bench`: **9.2×** more concurrent seqs, **11.3×** less KV memory |
-| **P2/P3 (in-model)** | **`build_attn_paged` in llama-graph.cpp + Gate 0 (token-identical generation) + win-2 throughput** | ⛔ **NOT DONE** — large in-tree effort |
+| **P3 (in-model placement)** | **paged, non-contiguous block KV placement in the real model** | ✅ **Gate 0 PASSED** — Qwen3-0.6B token-identical (`patches/0001-paged-kv-block-placement.patch`) |
+| P4 (in-model compute) | gather-read (`build_attn_paged`, read only a seq's blocks) + win-2 throughput + multi-seq | ⛔ remaining |
 
-The design's central risk — *does gather-to-scratch produce correct attention?* — is
-**retired**: paged, non-contiguous KV through the existing ggml attention ops is
-bit-accurate. What remains is wiring that into the model's graph and proving
-token-identical generation on a real GGUF, then measuring tok/s vs concurrency.
+The design's central risk — *does paged (non-contiguous) KV produce correct attention?* —
+is **retired at two levels**: (1) at the ggml-op level (P2, 7.5e-08 vs reference) and
+(2) **in a real model** (P3): with KV physically scattered across permuted, non-contiguous
+blocks (cells `0-15, 144-159, 32-47, …`), Qwen3-0.6B greedy generation is **token-for-token
+identical** to the contiguous cache. Reproduce:
+
+```sh
+# from backend/cpp/llama-cpp-fallback-build/llama.cpp (patch applied, CPU build)
+B=build-cpu/bin/llama-simple; M=/path/to/Qwen3-0.6B.Q4_K_M.gguf; P="...long prompt..."
+"$B" -m "$M" -n 40 "$P"                         > base.txt
+LLAMA_KV_PAGED=1 "$B" -m "$M" -n 40 "$P"        > paged.txt
+diff base.txt paged.txt && echo TOKEN-IDENTICAL
+# LLAMA_KV_PAGED_DEBUG=1 logs each placement's physical cells (first 24) to stderr
+```
+
+This proves the **storage/placement** layer of paged attention in-model. What remains (P4)
+is the **compute** optimization that yields the throughput win: a gather-read that attends
+only a sequence's own blocks (instead of scanning `[0,n_kv)` with a mask), plus the
+multi-sequence driver to measure tok/s vs concurrency. The patch handles a single sequence only.
 
 ## Build & test
 
diff --git a/backend/cpp/llama-cpp/paged/patches/0001-paged-kv-block-placement.patch b/backend/cpp/llama-cpp/paged/patches/0001-paged-kv-block-placement.patch
new file mode 100644
index 000000000..9ff9452ea
--- /dev/null
+++ b/backend/cpp/llama-cpp/paged/patches/0001-paged-kv-block-placement.patch
@@ -0,0 +1,59 @@
+diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
+index a49a055a6..d95102bbd 100644
+--- a/src/llama-kv-cache.cpp
++++ b/src/llama-kv-cache.cpp
+@@ -11,6 +11,8 @@
+ #include <cstring>
+ #include <limits>
+ #include <map>
++#include <numeric>
++#include <cstdlib>
+ #include <stdexcept>
+ 
+ static bool ggml_is_power_of_2(int n) {
+@@ -931,6 +933,45 @@ llama_kv_cache::slot_info llama_kv_cache::find_slot(const llama_ubatch & ubatch,
+             return { };
+         }
+ 
++        // [paged, experimental] Place this sequence's tokens at permuted,
++        // non-contiguous fixed-size BLOCK positions instead of a contiguous run.
++        // This validates that attention is invariant to physical KV placement -
++        // the correctness premise of paged attention. Enabled when env LLAMA_KV_PAGED
++        // is set (to any value). Single-sequence scope (uses get_used() as the logical
++        // base); falls back to the normal allocator if permuted cells aren't available.
++        static const bool paged_mode = (std::getenv("LLAMA_KV_PAGED") != nullptr);
++        if (paged_mode) {
++            const uint32_t bs   = 16;                 // block size (tokens/block)
++            const uint32_t nblk = cells.size() / bs;  // blocks in this stream's pool
++            if (nblk >= 2) {
++                // stride coprime to nblk => block-index permutation is a bijection
++                uint32_t k = 1;
++                for (uint32_t cand = (nblk / 2) | 1u; cand < nblk; cand += 2) {
++                    if (std::gcd(cand, nblk) == 1u) { k = cand; break; }
++                }
++                const uint32_t base = cells.get_used();
++                bool ok = true;
++                for (uint32_t i = 0; i < n_tokens; ++i) {
++                    const uint32_t L    = base + i;
++                    const uint32_t b    = L / bs;
++                    const uint32_t off  = L % bs;
++                    if (b >= nblk) { ok = false; break; }
++                    const uint32_t phys = ((b * k) % nblk) * bs + off; // permuted block
++                    if (phys >= cells.size() || !cells.is_empty(phys)) { ok = false; break; }
++                    res.idxs[s].push_back(phys);
++                }
++                if (ok && res.idxs[s].size() == n_tokens) {
++                    if (std::getenv("LLAMA_KV_PAGED_DEBUG")) {
++                        fprintf(stderr, "[paged] seq placed %u tok at cells:", n_tokens);
++                        for (uint32_t z = 0; z < res.idxs[s].size() && z < 24; ++z) fprintf(stderr, " %u", res.idxs[s][z]);
++                        fprintf(stderr, " (k=%u nblk=%u base=%u)\n", k, nblk, base);
++                    }
++                    continue; // paged placement succeeded for this sequence
++                }
++                res.idxs[s].clear(); // fall back to the normal allocator
++            }
++        }
++
+         uint32_t n_tested = 0;
+ 
+         // for continuous slots, we test that all tokens in the ubatch fit, starting from the current head