diff --git a/backend/cpp/llama-cpp/patches/0002-paged-kv-block-placement-env-LLAMA_KV_PAGED.patch b/backend/cpp/llama-cpp/patches/0002-paged-kv-block-placement-env-LLAMA_KV_PAGED.patch new file mode 100644 index 000000000..3ba88af4c --- /dev/null +++ b/backend/cpp/llama-cpp/patches/0002-paged-kv-block-placement-env-LLAMA_KV_PAGED.patch @@ -0,0 +1,75 @@ +From 5c9c709e6c6b07e0399b75fd4e46e752d418a9a8 Mon Sep 17 00:00:00 2001 +From: Ettore Di Giacinto +Date: Fri, 19 Jun 2026 23:04:17 +0000 +Subject: [PATCH] paged kv block placement (env LLAMA_KV_PAGED) + +Place each sequence's tokens at permuted, non-contiguous fixed-size block +positions in find_slot, proving attention is invariant to physical KV placement +(token-identical greedy generation). Default off; single-sequence scope; falls +back to the normal allocator. The paged-placement substrate for the gather-read. +--- + src/llama-kv-cache.cpp | 41 +++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 41 insertions(+) + +diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp +index 2802103bd..999e2ae61 100644 +--- a/src/llama-kv-cache.cpp ++++ b/src/llama-kv-cache.cpp +@@ -11,6 +11,8 @@ + #include + #include + #include ++#include ++#include + #include + + static bool ggml_is_power_of_2(int n) { +@@ -1020,6 +1022,45 @@ llama_kv_cache::slot_info llama_kv_cache::find_slot(const llama_ubatch & ubatch, + return { }; + } + ++ // [paged, experimental] Place this sequence's tokens at permuted, ++ // non-contiguous fixed-size BLOCK positions instead of a contiguous run. ++ // This validates that attention is invariant to physical KV placement - ++ // the correctness premise of paged attention. Enabled via LLAMA_KV_PAGED. ++ // Single-sequence scope (uses get_used() as the logical base); falls back ++ // to the normal allocator if the permuted cells aren't available. ++ static const bool paged_mode = (std::getenv("LLAMA_KV_PAGED") != nullptr); ++ if (paged_mode) { ++ const uint32_t bs = 16; // block size (tokens/block) ++ const uint32_t nblk = cells.size() / bs; // blocks in this stream's pool ++ if (nblk >= 2) { ++ // stride coprime to nblk => block-index permutation is a bijection ++ uint32_t k = 1; ++ for (uint32_t cand = (nblk / 2) | 1u; cand < nblk; cand += 2) { ++ if (std::gcd(cand, nblk) == 1u) { k = cand; break; } ++ } ++ const uint32_t base = cells.get_used(); ++ bool ok = true; ++ for (uint32_t i = 0; i < n_tokens; ++i) { ++ const uint32_t L = base + i; ++ const uint32_t b = L / bs; ++ const uint32_t off = L % bs; ++ if (b >= nblk) { ok = false; break; } ++ const uint32_t phys = ((b * k) % nblk) * bs + off; // permuted block ++ if (phys >= cells.size() || !cells.is_empty(phys)) { ok = false; break; } ++ res.idxs[s].push_back(phys); ++ } ++ if (ok && res.idxs[s].size() == n_tokens) { ++ if (std::getenv("LLAMA_KV_PAGED_DEBUG")) { ++ fprintf(stderr, "[paged] seq placed %u tok at cells:", n_tokens); ++ for (uint32_t z = 0; z < res.idxs[s].size() && z < 24; ++z) fprintf(stderr, " %u", res.idxs[s][z]); ++ fprintf(stderr, " (k=%u nblk=%u base=%u)\n", k, nblk, base); ++ } ++ continue; // paged placement succeeded for this sequence ++ } ++ res.idxs[s].clear(); // fall back to the normal allocator ++ } ++ } ++ + uint32_t n_tested = 0; + + // for continuous slots, we test that all tokens in the ubatch fit, starting from the current head +-- +2.43.0 +