mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-23 16:19:07 -04:00
Host-side paged-attention block manager ported faithfully from vLLM V1 (block_pool.py, kv_cache_utils.py, single_type_kv_cache_manager.py):
- KVCacheBlock + intrusive LRU FreeBlockQueue (O(1) middle removal)
- BlockPool: get_new_blocks / touch / free_blocks eviction ordering / cache_full_blocks / lazy eviction on reuse
- PagedKVManager: on-demand allocate, block_table, slot arithmetic (slot = block_id*block_size + offset), free
- Prefix caching: chained block hashing + find_longest_cache_hit (first-miss stop), enabling automatic cross-tenant prefix sharing

Pure C++17, zero ggml/llama.cpp dependency, unit-tested to vLLM behavioral parity (4/4 suites green). Parity is on algorithm/behavior, not hash bytes.

Phase 0 of docs/superpowers/plans/2026-06-19-paged-attention-llamacpp.md. Phases 1-5 (ggml storage, gather-to-scratch read path, Gate 0 correctness, benchmark wins, prefix-share serving) follow.

Assisted-by: Claude:opus-4.8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
297 lines
10 KiB
C++
297 lines
10 KiB
C++
#include "paged_kv_manager.h"
|
|
#include <cassert>
|
|
#include <stdexcept>
|
|
|
|
namespace paged {
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// FreeBlockQueue (port of kv_cache_utils.py FreeKVCacheBlockQueue)
|
|
// ---------------------------------------------------------------------------
|
|
|
|
// Builds the doubly linked free list from `blocks`, keeping their order, and
// brackets it with the fake_head / fake_tail sentinel nodes.
FreeBlockQueue::FreeBlockQueue(const std::vector<KVCacheBlock*>& blocks) {
    num_free_blocks = blocks.size();

    // Nothing to link: the two sentinels point straight at each other.
    if (blocks.empty()) {
        fake_head.next_free = &fake_tail;
        fake_tail.prev_free = &fake_head;
        return;
    }

    // Thread fake_head -> blocks[0] -> ... -> blocks[n-1] -> fake_tail in one pass.
    KVCacheBlock* prev = &fake_head;
    for (KVCacheBlock* blk : blocks) {
        prev->next_free = blk;
        blk->prev_free = prev;
        prev = blk;
    }
    prev->next_free = &fake_tail;
    fake_tail.prev_free = prev;
}
|
|
|
|
// Detaches and returns the block at the front of the free list.
// Throws std::runtime_error when the list holds no real block.
KVCacheBlock* FreeBlockQueue::popleft() {
    KVCacheBlock* const head_block = fake_head.next_free;
    const bool is_empty = (head_block == &fake_tail) || (head_block == nullptr);
    if (is_empty) {
        assert(num_free_blocks == 0);
        throw std::runtime_error("No free blocks available");
    }

    // Splice the front block out by joining the head sentinel to its successor.
    KVCacheBlock* const successor = head_block->next_free;
    fake_head.next_free = successor;
    successor->prev_free = &fake_head;

    // A block outside the list carries no links.
    head_block->next_free = nullptr;
    head_block->prev_free = nullptr;

    --num_free_blocks;
    return head_block;
}
|
|
|
|
std::vector<KVCacheBlock*> FreeBlockQueue::popleft_n(size_t n) {
|
|
std::vector<KVCacheBlock*> ret;
|
|
if (n == 0) return ret;
|
|
assert(num_free_blocks >= n);
|
|
num_free_blocks -= n;
|
|
KVCacheBlock* curr = fake_head.next_free;
|
|
ret.reserve(n);
|
|
for (size_t i = 0; i < n; ++i) {
|
|
assert(curr != nullptr);
|
|
ret.push_back(curr);
|
|
KVCacheBlock* last = curr;
|
|
curr = curr->next_free;
|
|
last->prev_free = last->next_free = nullptr;
|
|
}
|
|
if (curr != nullptr) {
|
|
fake_head.next_free = curr;
|
|
curr->prev_free = &fake_head;
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
// Unlinks `block` from wherever it sits in the free list.
// Throws std::runtime_error if the block has a missing neighbour link, which
// is how a block that is not currently in the list looks.
void FreeBlockQueue::remove(KVCacheBlock* block) {
    KVCacheBlock* const before = block->prev_free;
    KVCacheBlock* const after = block->next_free;
    if (before == nullptr || after == nullptr) {
        throw std::runtime_error("remove() called on an invalid block");
    }

    // Join the two neighbours, then clear the removed block's own links.
    before->next_free = after;
    after->prev_free = before;
    block->next_free = nullptr;
    block->prev_free = nullptr;

    --num_free_blocks;
}
|
|
|
|
// Links `block` in as the last real entry, immediately before the tail sentinel.
void FreeBlockQueue::append(KVCacheBlock* block) {
    KVCacheBlock* const old_last = fake_tail.prev_free;

    old_last->next_free = block;
    block->prev_free = old_last;

    block->next_free = &fake_tail;
    fake_tail.prev_free = block;

    ++num_free_blocks;
}
|
|
|
|
// Links every block of `blocks`, in order, at the back of the free list.
// An empty input leaves the list untouched.
void FreeBlockQueue::append_n(const std::vector<KVCacheBlock*>& blocks) {
    if (blocks.empty()) {
        return;
    }

    // `cursor` is always the node the next block must hang off.
    KVCacheBlock* cursor = fake_tail.prev_free;
    for (auto it = blocks.begin(); it != blocks.end(); ++it) {
        KVCacheBlock* const blk = *it;
        blk->prev_free = cursor;
        cursor->next_free = blk;
        cursor = blk;
    }

    // Close the chain on the tail sentinel.
    cursor->next_free = &fake_tail;
    fake_tail.prev_free = cursor;

    num_free_blocks += blocks.size();
}
|
|
|
|
// Links every block of `blocks`, in order, at the front of the free list so
// that blocks[0] becomes the next block handed out by popleft().
// An empty input leaves the list untouched.
void FreeBlockQueue::prepend_n(const std::vector<KVCacheBlock*>& blocks) {
    if (blocks.empty()) {
        return;
    }

    // Remember the current front; the new run is spliced in ahead of it.
    KVCacheBlock* const old_front = fake_head.next_free;

    KVCacheBlock* cursor = &fake_head;
    for (auto it = blocks.begin(); it != blocks.end(); ++it) {
        KVCacheBlock* const blk = *it;
        blk->prev_free = cursor;
        cursor->next_free = blk;
        cursor = blk;
    }

    cursor->next_free = old_front;
    old_front->prev_free = cursor;

    num_free_blocks += blocks.size();
}
|
|
|
|
std::vector<KVCacheBlock*> FreeBlockQueue::get_all_free_blocks() const {
|
|
std::vector<KVCacheBlock*> ret;
|
|
const KVCacheBlock* curr = fake_head.next_free;
|
|
while (curr && curr->next_free != nullptr) {
|
|
ret.push_back(const_cast<KVCacheBlock*>(curr));
|
|
curr = curr->next_free;
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// BlockPool (port of block_pool.py)
|
|
// ---------------------------------------------------------------------------
|
|
|
|
static std::vector<KVCacheBlock*> make_ptrs(std::vector<KVCacheBlock>& v) {
|
|
std::vector<KVCacheBlock*> p;
|
|
p.reserve(v.size());
|
|
for (auto& b : v) p.push_back(&b);
|
|
return p;
|
|
}
|
|
|
|
static std::vector<KVCacheBlock> make_block_vec(int32_t num_blocks) {
|
|
std::vector<KVCacheBlock> v;
|
|
v.reserve(num_blocks);
|
|
for (int32_t i = 0; i < num_blocks; ++i) v.emplace_back(i);
|
|
return v;
|
|
}
|
|
|
|
// Creates the block storage, puts every block on the free list, then takes the
// first block back off the list to serve as the null block.
BlockPool::BlockPool(int32_t num_blocks, bool enable_caching)
    : enable_caching_(enable_caching),
      blocks_(make_block_vec(num_blocks)),
      ptrs_(make_ptrs(blocks_)),
      free_queue_(ptrs_) {
    // vLLM reserves block_id 0 as the null block (never cached). popleft()
    // throws if the pool was created without any block.
    KVCacheBlock* const reserved = free_queue_.popleft();
    reserved->is_null = true;
    null_block = reserved;
}
|
|
|
|
// Drops the prefix-cache identity of `block`.
//
// Returns true when the hash -> block map entry for the block's hash pointed
// at this block and was erased; false when the block had no hash or the map
// entry for that hash is missing or belongs to a different block.
//
// The block's hash is cleared in every case where it had one. The map keeps a
// single block per hash and cache_full_blocks() may repoint an entry to a
// newer block, so a block can carry a hash without owning the map entry.
// Leaving that hash in place would make cache_full_blocks() skip the block
// after it is handed out again for different content.
bool BlockPool::maybe_evict_cached_block(KVCacheBlock* block) {
    if (!block->has_hash) return false;

    auto it = cached_block_hash_to_block_.find(block->block_hash);
    const bool owns_entry =
        (it != cached_block_hash_to_block_.end()) && (it->second == block);
    if (owns_entry) cached_block_hash_to_block_.erase(it);

    block->reset_hash();
    return owns_entry;
}
|
|
|
|
std::vector<KVCacheBlock*> BlockPool::get_new_blocks(size_t n) {
|
|
if (n > get_num_free_blocks())
|
|
throw std::runtime_error("Cannot get free blocks from pool");
|
|
auto ret = free_queue_.popleft_n(n);
|
|
for (KVCacheBlock* b : ret) {
|
|
if (enable_caching_) maybe_evict_cached_block(b);
|
|
assert(b->ref_cnt == 0);
|
|
b->ref_cnt += 1;
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
// Looks up the block registered for `block_hash`; nullptr when there is none.
KVCacheBlock* BlockPool::get_cached_block(uint64_t block_hash) {
    const auto found = cached_block_hash_to_block_.find(block_hash);
    if (found == cached_block_hash_to_block_.end()) {
        return nullptr;
    }
    return found->second;
}
|
|
|
|
// Takes one additional reference on every block in `blocks`.
void BlockPool::touch(const std::vector<KVCacheBlock*>& blocks) {
    for (size_t i = 0; i < blocks.size(); ++i) {
        KVCacheBlock* const blk = blocks[i];
        // An unreferenced, non-null block is sitting on the free list as an
        // eviction candidate; take it off before it is referenced again.
        const bool on_free_list = (blk->ref_cnt == 0) && !blk->is_null;
        if (on_free_list) {
            free_queue_.remove(blk);
        }
        ++blk->ref_cnt;
    }
}
|
|
|
|
// Releases one reference on every non-null block in `ordered_blocks` and puts
// the blocks whose count reached zero back on the free list.
void BlockPool::free_blocks(const std::vector<KVCacheBlock*>& ordered_blocks) {
    std::vector<KVCacheBlock*> unhashed;  // released blocks without a cached hash
    std::vector<KVCacheBlock*> hashed;    // released blocks that still carry a hash

    for (size_t i = 0; i < ordered_blocks.size(); ++i) {
        KVCacheBlock* const blk = ordered_blocks[i];
        if (blk->is_null) {
            continue;
        }
        blk->ref_cnt -= 1;
        if (blk->ref_cnt != 0) {
            continue;  // still referenced elsewhere
        }
        if (blk->has_hash) {
            hashed.push_back(blk);
        } else {
            unhashed.push_back(blk);
        }
    }

    // Un-hashed blocks go to the front so they are handed out first; hashed
    // blocks go to the back so they stay available for cache hits longer.
    free_queue_.prepend_n(unhashed);
    free_queue_.append_n(hashed);
}
|
|
|
|
// Registers the blocks req_blocks[num_cached_blocks .. num_full_blocks) in the
// hash -> block map under block_hashes[i].
//
// - The upper bound is clamped to the sizes of `req_blocks` and
//   `block_hashes`, so a caller that passes a larger `num_full_blocks` cannot
//   cause an out-of-range read.
// - The null block and blocks that already carry a hash are skipped.
// - The map holds one block per hash. When the hash is already registered to a
//   different block, the entry is repointed to the new block (as before) and
//   the displaced block's hash is cleared, so that "has_hash" always means
//   "owns the map entry" and maybe_evict_cached_block() can clean it up.
void BlockPool::cache_full_blocks(const std::vector<KVCacheBlock*>& req_blocks,
                                  size_t num_cached_blocks, size_t num_full_blocks,
                                  const std::vector<uint64_t>& block_hashes) {
    size_t end = num_full_blocks;
    if (end > req_blocks.size()) end = req_blocks.size();
    if (end > block_hashes.size()) end = block_hashes.size();

    for (size_t i = num_cached_blocks; i < end; ++i) {
        KVCacheBlock* blk = req_blocks[i];
        if (blk->is_null || blk->has_hash) continue;

        const uint64_t hash = block_hashes[i];
        auto inserted = cached_block_hash_to_block_.emplace(hash, blk);
        if (!inserted.second && inserted.first->second != blk) {
            // Another block was registered under this hash: it loses the entry.
            inserted.first->second->reset_hash();
            inserted.first->second = blk;
        }
        blk->has_hash = true;
        blk->block_hash = hash;
    }
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// PagedKVManager (port of SingleTypeKVCacheManager / FullAttentionManager)
|
|
// ---------------------------------------------------------------------------
|
|
|
|
// Ceiling division: the smallest q with q * b >= a. Requires b != 0.
// Written as quotient plus a remainder test so that it cannot wrap around for
// values of `a` close to the maximum of size_t (the a + b - 1 form can).
static inline size_t cdiv(size_t a, size_t b) {
    return a / b + (a % b != 0 ? 1 : 0);
}
|
|
|
|
// Creates a manager over a pool of `num_blocks` blocks of `block_size` tokens.
//
// Throws std::invalid_argument when `block_size` is not positive: every
// position/slot computation in this class divides by the block size, so a
// zero or negative value would only fail later as a division by zero or a
// nonsensical index.
PagedKVManager::PagedKVManager(int32_t num_blocks, int block_size, bool enable_caching)
    : block_size_(block_size), pool_(num_blocks, enable_caching) {
    if (block_size <= 0) {
        throw std::invalid_argument("PagedKVManager: block_size must be positive");
    }
}
|
|
|
|
bool PagedKVManager::allocate(int seq_id, size_t total_tokens) {
|
|
auto& req = req_to_blocks_[seq_id];
|
|
size_t need = cdiv(total_tokens, block_size_);
|
|
if (need <= req.size()) return true;
|
|
size_t add = need - req.size();
|
|
if (add > pool_.get_num_free_blocks()) return false; // OOM
|
|
auto nb = pool_.get_new_blocks(add);
|
|
req.insert(req.end(), nb.begin(), nb.end());
|
|
return true;
|
|
}
|
|
|
|
std::vector<int32_t> PagedKVManager::block_table(int seq_id) const {
|
|
std::vector<int32_t> bt;
|
|
auto it = req_to_blocks_.find(seq_id);
|
|
if (it == req_to_blocks_.end()) return bt;
|
|
bt.reserve(it->second.size());
|
|
for (KVCacheBlock* b : it->second) bt.push_back(b->block_id);
|
|
return bt;
|
|
}
|
|
|
|
int64_t PagedKVManager::slot(int seq_id, int pos) const {
|
|
const auto& req = req_to_blocks_.at(seq_id);
|
|
int32_t phys = req[pos / block_size_]->block_id;
|
|
return (int64_t)phys * block_size_ + (pos % block_size_);
|
|
}
|
|
|
|
std::vector<int64_t> PagedKVManager::slot_mapping(int seq_id, const std::vector<int>& positions) const {
|
|
std::vector<int64_t> sm;
|
|
sm.reserve(positions.size());
|
|
for (int p : positions) sm.push_back(slot(seq_id, p));
|
|
return sm;
|
|
}
|
|
|
|
void PagedKVManager::free(int seq_id) {
|
|
auto it = req_to_blocks_.find(seq_id);
|
|
if (it == req_to_blocks_.end()) return;
|
|
// Free in reverse so the tail of the block chain is evicted first (vLLM order).
|
|
std::vector<KVCacheBlock*> ordered(it->second.rbegin(), it->second.rend());
|
|
pool_.free_blocks(ordered);
|
|
req_to_blocks_.erase(it);
|
|
}
|
|
|
|
// FNV-1a chained block hash. Deterministic and prefix-sensitive; folds the parent
|
|
// hash into the seed so each block hash transitively encodes its whole prefix
|
|
// (behavioral parity with vLLM hash_block_tokens chaining; vLLM uses sha256 bytes).
|
|
uint64_t PagedKVManager::hash_block(uint64_t parent_hash, const std::vector<int>& token_ids) {
|
|
uint64_t h = 1469598103934665603ull ^ parent_hash;
|
|
for (int t : token_ids) {
|
|
h ^= (uint64_t)(uint32_t)t;
|
|
h *= 1099511628211ull;
|
|
}
|
|
if (h == 0) h = 0x9e3779b97f4a7c15ull; // never 0 (0 reads as "no hash")
|
|
return h;
|
|
}
|
|
|
|
std::vector<uint64_t> PagedKVManager::compute_block_hashes(const std::vector<int>& token_ids) const {
|
|
std::vector<uint64_t> hashes;
|
|
uint64_t parent = 0; // NONE_HASH analogue
|
|
size_t n_full = token_ids.size() / block_size_;
|
|
for (size_t i = 0; i < n_full; ++i) {
|
|
std::vector<int> blk(token_ids.begin() + i * block_size_,
|
|
token_ids.begin() + (i + 1) * block_size_);
|
|
parent = hash_block(parent, blk);
|
|
hashes.push_back(parent);
|
|
}
|
|
return hashes;
|
|
}
|
|
|
|
size_t PagedKVManager::get_computed_blocks(const std::vector<uint64_t>& block_hashes) {
|
|
std::vector<KVCacheBlock*> hits;
|
|
for (uint64_t bh : block_hashes) { // stop at first miss (prefix property)
|
|
KVCacheBlock* cb = pool_.get_cached_block(bh);
|
|
if (!cb) break;
|
|
hits.push_back(cb);
|
|
}
|
|
pool_.touch(hits); // ++ref_cnt, pull from free list
|
|
return hits.size() * (size_t)block_size_;
|
|
}
|
|
|
|
void PagedKVManager::cache_blocks(int seq_id, const std::vector<uint64_t>& block_hashes, size_t num_tokens) {
|
|
auto& req = req_to_blocks_[seq_id];
|
|
size_t n_full = num_tokens / block_size_;
|
|
pool_.cache_full_blocks(req, /*num_cached=*/0, n_full, block_hashes);
|
|
}
|
|
|
|
} // namespace paged
|