From 4968cd8a94bd568ed45200ad1158b37911f0b964 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Mon, 22 Jun 2026 08:50:57 +0000 Subject: [PATCH] paged-attn 0004: on-demand KV block allocation Wire the paged placement in find_slot through the vendored PagedKVManager (patch 0001) instead of a fixed full-pool permutation. Blocks are popped from a free pool on demand as a sequence crosses block boundaries, and returned on sequence end (full seq_rm / clear). One manager per (kv-cache, stream); all state lives in a new src/paged-alloc unit keyed by a static registry, so the core kv-cache struct is untouched (find_slot/clear/seq_rm gain only a gated call). Default off; stock path byte-identical. Gate 0 (CPU, Qwen3-0.6B-Q8_0): with LLAMA_KV_PAGED=1 the output is token-identical to stock: - single-stream llama-simple, 48 tok: identical - multi-stream driver, 3 seqs x 40 tok: identical Demand-driven allocation confirmed via the debug log: the block count grows 0->1->2->3->4, with a new block taken as the sequence crosses logical positions 16/32/48 (peak 4 blocks vs 16-block budget), independently per stream. 
Assisted-by: Claude:opus-4.8 [Claude Code] Signed-off-by: Ettore Di Giacinto --- ...-block-allocation-env-LLAMA_KV_PAGED.patch | 298 ++++++++++++++++++ 1 file changed, 298 insertions(+) create mode 100644 backend/cpp/llama-cpp/patches/0004-paged-on-demand-block-allocation-env-LLAMA_KV_PAGED.patch diff --git a/backend/cpp/llama-cpp/patches/0004-paged-on-demand-block-allocation-env-LLAMA_KV_PAGED.patch b/backend/cpp/llama-cpp/patches/0004-paged-on-demand-block-allocation-env-LLAMA_KV_PAGED.patch new file mode 100644 index 000000000..35ab5f942 --- /dev/null +++ b/backend/cpp/llama-cpp/patches/0004-paged-on-demand-block-allocation-env-LLAMA_KV_PAGED.patch @@ -0,0 +1,298 @@ +From 7c294973de28d1ac991505638d726acfb371d541 Mon Sep 17 00:00:00 2001 +From: Ettore Di Giacinto +Date: Mon, 22 Jun 2026 10:50:35 +0200 +Subject: [PATCH] paged on-demand block allocation (env LLAMA_KV_PAGED) - patch + 0004 + +Drive the paged placement in find_slot through the vendored PagedKVManager +(patch 0001) instead of a fixed full-pool permutation. Blocks are popped from a +free pool on demand as the sequence crosses block boundaries (peak << full +reservation) and returned on sequence end (seq_rm full removal / clear). One +manager per (kv-cache, stream); all state lives in the new src/paged-alloc unit, +so the core kv-cache struct is untouched - find_slot/clear/seq_rm gain only a +gated call. Default off; stock path byte-identical. 
+--- + src/CMakeLists.txt | 1 + + src/llama-kv-cache.cpp | 69 +++++++++++++++++---------- + src/paged-alloc.cpp | 106 +++++++++++++++++++++++++++++++++++++++++ + src/paged-alloc.h | 39 +++++++++++++++ + 4 files changed, 190 insertions(+), 25 deletions(-) + create mode 100644 src/paged-alloc.cpp + create mode 100644 src/paged-alloc.h + +diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt +index 58083b3..4d9d7d1 100644 +--- a/src/CMakeLists.txt ++++ b/src/CMakeLists.txt +@@ -26,6 +26,7 @@ add_library(llama + llama-kv-cache-iswa.cpp + paged-kv-manager.cpp + paged-attn.cpp ++ paged-alloc.cpp + llama-kv-cache-dsa.cpp + llama-memory.cpp + llama-memory-hybrid.cpp +diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp +index 30d02d7..1125d9a 100644 +--- a/src/llama-kv-cache.cpp ++++ b/src/llama-kv-cache.cpp +@@ -1,4 +1,5 @@ + #include "llama-kv-cache.h" ++#include "paged-alloc.h" + #include + #include + +@@ -381,6 +382,11 @@ llama_kv_cache::llama_kv_cache( + } + + void llama_kv_cache::clear(bool data) { ++ // [paged 0004] return all on-demand blocks to the pool on cache clear. ++ if (paged_alloc::active()) { ++ paged_alloc::release_all(this); ++ } ++ + for (uint32_t s = 0; s < n_stream; ++s) { + v_cells[s].reset(); + v_heads[s] = 0; +@@ -409,6 +415,16 @@ bool llama_kv_cache::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) { + p1 = std::numeric_limits::max(); + } + ++ // [paged 0004] free a stream's on-demand blocks when its whole sequence is ++ // removed (sequence end), so they return to the pool for reuse. 
++ if (paged_alloc::active() && p0 == 0 && p1 == std::numeric_limits::max()) { ++ if (seq_id >= 0) { ++ paged_alloc::release(this, (int) seq_to_stream[seq_id]); ++ } else { ++ paged_alloc::release_all(this); ++ } ++ } ++ + if (seq_id >= 0) { + auto & cells = v_cells[seq_to_stream[seq_id]]; + auto & head = v_heads[seq_to_stream[seq_id]]; +@@ -1030,36 +1046,39 @@ llama_kv_cache::slot_info llama_kv_cache::find_slot(const llama_ubatch & ubatch, + // the correctness premise of paged attention. Enabled via LLAMA_KV_PAGED. + // Single-sequence scope (uses get_used() as the logical base); falls back + // to the normal allocator if the permuted cells aren't available. +- static const bool paged_mode = (std::getenv("LLAMA_KV_PAGED") != nullptr); +- if (paged_mode) { ++ // [paged 0004] On-demand block allocation. Patch 0002 proved attention is ++ // invariant to physical KV placement; here that placement is driven by ++ // the vendored PagedKVManager (patch 0001): blocks are popped from a free ++ // pool only as the sequence crosses block boundaries (peak << full ++ // reservation) and returned on sequence end. Enabled via LLAMA_KV_PAGED; ++ // falls back to the normal allocator on pool exhaustion or any conflict. 
++ if (paged_alloc::active()) { + const uint32_t bs = 16; // block size (tokens/block) +- const uint32_t nblk = cells.size() / bs; // blocks in this stream's pool ++ const uint32_t nblk = cells.size() / bs; // this stream's block budget + if (nblk >= 2) { +- // stride coprime to nblk => block-index permutation is a bijection +- uint32_t k = 1; +- for (uint32_t cand = (nblk / 2) | 1u; cand < nblk; cand += 2) { +- if (std::gcd(cand, nblk) == 1u) { k = cand; break; } +- } + const uint32_t base = cells.get_used(); +- bool ok = true; +- for (uint32_t i = 0; i < n_tokens; ++i) { +- const uint32_t L = base + i; +- const uint32_t b = L / bs; +- const uint32_t off = L % bs; +- if (b >= nblk) { ok = false; break; } +- const uint32_t phys = ((b * k) % nblk) * bs + off; // permuted block +- if (phys >= cells.size() || !cells.is_empty(phys)) { ok = false; break; } +- res.idxs[s].push_back(phys); +- } +- if (ok && res.idxs[s].size() == n_tokens) { +- if (std::getenv("LLAMA_KV_PAGED_DEBUG")) { +- fprintf(stderr, "[paged] seq placed %u tok at cells:", n_tokens); +- for (uint32_t z = 0; z < res.idxs[s].size() && z < 24; ++z) fprintf(stderr, " %u", res.idxs[s][z]); +- fprintf(stderr, " (k=%u nblk=%u base=%u)\n", k, nblk, base); ++ const int strm = (int) seq_to_stream[seq_id]; ++ std::vector placed; ++ if (paged_alloc::place(this, strm, base, n_tokens, bs, nblk, placed)) { ++ bool ok = (placed.size() == n_tokens); ++ for (uint32_t i = 0; ok && i < n_tokens; ++i) { ++ if (placed[i] >= cells.size() || !cells.is_empty(placed[i])) { ++ ok = false; ++ } ++ } ++ if (ok) { ++ for (uint32_t phys : placed) { ++ res.idxs[s].push_back(phys); ++ } ++ if (std::getenv("LLAMA_KV_PAGED_DEBUG")) { ++ fprintf(stderr, "[paged] stream %d placed %u tok at cells:", strm, n_tokens); ++ for (uint32_t z = 0; z < res.idxs[s].size() && z < 24; ++z) fprintf(stderr, " %u", res.idxs[s][z]); ++ fprintf(stderr, " (nblk=%u base=%u)\n", nblk, base); ++ } ++ continue; // on-demand paged placement succeeded + } +- 
continue; // paged placement succeeded for this sequence ++ res.idxs[s].clear(); // fall back to the normal allocator + } +- res.idxs[s].clear(); // fall back to the normal allocator + } + } + +diff --git a/src/paged-alloc.cpp b/src/paged-alloc.cpp +new file mode 100644 +index 0000000..1d13f9c +--- /dev/null ++++ b/src/paged-alloc.cpp +@@ -0,0 +1,106 @@ ++#include "paged-alloc.h" ++#include "paged-kv-manager.h" ++ ++#include ++#include ++#include ++#include ++#include ++ ++namespace paged_alloc { ++ ++bool active() { ++ static const bool a = (std::getenv("LLAMA_KV_PAGED") != nullptr); ++ return a; ++} ++ ++static bool debug() { ++ static const bool d = (std::getenv("LLAMA_KV_PAGED_DEBUG") != nullptr); ++ return d; ++} ++ ++namespace { ++ ++using key_t = std::pair; ++ ++// One PagedKVManager per (kv-cache, stream): each stream owns a separate ++// physical pool of cells.size() cells, so a manager's block ids map directly to ++// cell ranges within that stream's pool. The internal request id is always 0. ++std::map> g_managers; ++ ++paged::PagedKVManager * get_mgr(const void * cache, int stream, ++ uint32_t pool_blocks, uint32_t block_size) { ++ const key_t k{cache, stream}; ++ auto it = g_managers.find(k); ++ if (it == g_managers.end()) { ++ // enable_caching=false: prefix caching is a later patch; 0004 exercises ++ // only on-demand allocate / free. ++ auto mgr = std::make_unique( ++ (int32_t) pool_blocks, (int) block_size, /*enable_caching=*/false); ++ it = g_managers.emplace(k, std::move(mgr)).first; ++ } ++ return it->second.get(); ++} ++ ++} // namespace ++ ++bool place(const void * cache, int stream, uint32_t base, uint32_t n_tokens, ++ uint32_t block_size, uint32_t pool_blocks, ++ std::vector & out) { ++ if (n_tokens == 0) { ++ return true; ++ } ++ ++ paged::PagedKVManager * mgr = get_mgr(cache, stream, pool_blocks, block_size); ++ ++ const size_t before = mgr->block_table(0).size(); ++ ++ // Grow the request to cover the highest logical position. 
The manager pops ++ // free blocks only for the boundaries actually crossed - that is the on- ++ // demand behavior; an already-covered range adds nothing. ++ if (!mgr->allocate(0, (size_t) base + n_tokens)) { ++ return false; // pool exhausted -> caller falls back to the stock path ++ } ++ ++ out.reserve(out.size() + n_tokens); ++ for (uint32_t i = 0; i < n_tokens; ++i) { ++ const int64_t s = mgr->slot(0, (int) (base + i)); ++ out.push_back((uint32_t) s); ++ } ++ ++ if (debug()) { ++ const size_t after = mgr->block_table(0).size(); ++ if (after != before) { ++ fprintf(stderr, ++ "[paged-alloc] cache=%p stream=%d grew %zu->%zu blocks " ++ "(budget=%u; base=%u +%u tok)\n", ++ cache, stream, before, after, pool_blocks, base, n_tokens); ++ } ++ } ++ ++ return true; ++} ++ ++void release(const void * cache, int stream) { ++ auto it = g_managers.find({cache, stream}); ++ if (it == g_managers.end()) { ++ return; ++ } ++ it->second->free(0); ++ g_managers.erase(it); ++ if (debug()) { ++ fprintf(stderr, "[paged-alloc] released cache=%p stream=%d\n", cache, stream); ++ } ++} ++ ++void release_all(const void * cache) { ++ for (auto it = g_managers.begin(); it != g_managers.end(); ) { ++ if (it->first.first == cache) { ++ it = g_managers.erase(it); ++ } else { ++ ++it; ++ } ++ } ++} ++ ++} // namespace paged_alloc +diff --git a/src/paged-alloc.h b/src/paged-alloc.h +new file mode 100644 +index 0000000..bf66665 +--- /dev/null ++++ b/src/paged-alloc.h +@@ -0,0 +1,39 @@ ++#pragma once ++// On-demand paged KV block allocation (patch 0004, experimental). ++// ++// Backs the paged placement in llama_kv_cache::find_slot (patch 0002) with the ++// vendored host-side PagedKVManager (patch 0001). Instead of mapping a ++// sequence's logical positions onto a fixed full-pool permutation, blocks are ++// popped from a free pool ON DEMAND as the sequence crosses block boundaries, ++// and returned to the pool on sequence end. 
This is where the paged memory- ++// capacity benefit begins: a short sequence holds only a few blocks, not the ++// whole reserved window. ++// ++// Gated behind env LLAMA_KV_PAGED; a no-op when unset. All state lives in this ++// unit (a static registry keyed by kv-cache + stream), so the core kv-cache ++// struct stays untouched - find_slot only gains a gated call. ++ ++#include ++#include ++ ++namespace paged_alloc { ++ ++// true iff env LLAMA_KV_PAGED is set (evaluated once). ++bool active(); ++ ++// Place n_tokens logical positions [base, base+n_tokens) of one stream on ++// demand, appending their physical cell indices to `out`. pool_blocks = ++// cells.size()/block_size is this stream's block budget. Returns false (leaving ++// `out` unchanged) on pool exhaustion, so the caller falls back to the stock ++// allocator. The caller still validates each returned cell is empty. ++bool place(const void * cache, int stream, uint32_t base, uint32_t n_tokens, ++ uint32_t block_size, uint32_t pool_blocks, ++ std::vector & out); ++ ++// Return a stream's blocks to the pool (sequence end). ++void release(const void * cache, int stream); ++ ++// Return every stream's blocks for a kv-cache (clear() / teardown). ++void release_all(const void * cache); ++ ++} // namespace paged_alloc +-- +2.43.0 +