mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-23 16:19:07 -04:00
patch(paged) 0002: LLAMA_KV_PAGED block placement, Gate 0 token-identical
find_slot places a sequence's tokens at permuted non-contiguous blocks; greedy generation is token-identical to stock (verified on Qwen3-0.6B at the pin), branch confirmed firing. Default off. The placement substrate for the gather-read. Assisted-by: Claude:opus-4.8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
@@ -0,0 +1,75 @@
|
||||
From 5c9c709e6c6b07e0399b75fd4e46e752d418a9a8 Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Fri, 19 Jun 2026 23:04:17 +0000
|
||||
Subject: [PATCH] paged kv block placement (env LLAMA_KV_PAGED)
|
||||
|
||||
Place each sequence's tokens at permuted, non-contiguous fixed-size block
|
||||
positions in find_slot, proving attention is invariant to physical KV placement
|
||||
(token-identical greedy generation). Default off; single-sequence scope; falls
|
||||
back to the normal allocator. The paged-placement substrate for the gather-read.
|
||||
---
|
||||
src/llama-kv-cache.cpp | 41 +++++++++++++++++++++++++++++++++++++++++
|
||||
1 file changed, 41 insertions(+)
|
||||
|
||||
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
|
||||
index 2802103bd..999e2ae61 100644
|
||||
--- a/src/llama-kv-cache.cpp
|
||||
+++ b/src/llama-kv-cache.cpp
|
||||
@@ -11,6 +11,8 @@
|
||||
#include <cstring>
|
||||
#include <limits>
|
||||
#include <map>
|
||||
+#include <numeric>
|
||||
+#include <cstdlib>
|
||||
#include <stdexcept>
|
||||
|
||||
static bool ggml_is_power_of_2(int n) {
|
||||
@@ -1020,6 +1022,45 @@ llama_kv_cache::slot_info llama_kv_cache::find_slot(const llama_ubatch & ubatch,
|
||||
return { };
|
||||
}
|
||||
|
||||
+ // [paged, experimental] Place this sequence's tokens at permuted,
|
||||
+ // non-contiguous fixed-size BLOCK positions instead of a contiguous run.
|
||||
+ // This validates that attention is invariant to physical KV placement -
|
||||
+ // the correctness premise of paged attention. Enabled via LLAMA_KV_PAGED.
|
||||
+ // Single-sequence scope (uses get_used() as the logical base); falls back
|
||||
+ // to the normal allocator if the permuted cells aren't available.
|
||||
+ static const bool paged_mode = (std::getenv("LLAMA_KV_PAGED") != nullptr);
|
||||
+ if (paged_mode) {
|
||||
+ const uint32_t bs = 16; // block size (tokens/block)
|
||||
+ const uint32_t nblk = cells.size() / bs; // blocks in this stream's pool
|
||||
+ if (nblk >= 2) {
|
||||
+ // stride coprime to nblk => block-index permutation is a bijection
|
||||
+ uint32_t k = 1;
|
||||
+ for (uint32_t cand = (nblk / 2) | 1u; cand < nblk; cand += 2) {
|
||||
+ if (std::gcd(cand, nblk) == 1u) { k = cand; break; }
|
||||
+ }
|
||||
+ const uint32_t base = cells.get_used();
|
||||
+ bool ok = true;
|
||||
+ for (uint32_t i = 0; i < n_tokens; ++i) {
|
||||
+ const uint32_t L = base + i;
|
||||
+ const uint32_t b = L / bs;
|
||||
+ const uint32_t off = L % bs;
|
||||
+ if (b >= nblk) { ok = false; break; }
|
||||
+ const uint32_t phys = ((b * k) % nblk) * bs + off; // permuted block
|
||||
+ if (phys >= cells.size() || !cells.is_empty(phys)) { ok = false; break; }
|
||||
+ res.idxs[s].push_back(phys);
|
||||
+ }
|
||||
+ if (ok && res.idxs[s].size() == n_tokens) {
|
||||
+ if (std::getenv("LLAMA_KV_PAGED_DEBUG")) {
|
||||
+ fprintf(stderr, "[paged] seq placed %u tok at cells:", n_tokens);
|
||||
+ for (uint32_t z = 0; z < res.idxs[s].size() && z < 24; ++z) fprintf(stderr, " %u", res.idxs[s][z]);
|
||||
+ fprintf(stderr, " (k=%u nblk=%u base=%u)\n", k, nblk, base);
|
||||
+ }
|
||||
+ continue; // paged placement succeeded for this sequence
|
||||
+ }
|
||||
+ res.idxs[s].clear(); // fall back to the normal allocator
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
uint32_t n_tested = 0;
|
||||
|
||||
// for continuous slots, we test that all tokens in the ubatch fit, starting from the current head
|
||||
--
|
||||
2.43.0
|
||||
|
||||
Reference in New Issue
Block a user