From edb1a11abc1b04b71010630f7f9afd1821033eca Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 19 Jun 2026 08:26:31 +0000 Subject: [PATCH] feat(paged): vLLM-parity KV block manager (Phase 0, CPU-first prototype) Host-side paged-attention block manager ported faithfully from vLLM V1 (block_pool.py, kv_cache_utils.py, single_type_kv_cache_manager.py): - KVCacheBlock + intrusive LRU FreeBlockQueue (O(1) middle removal) - BlockPool: get_new_blocks / touch / free_blocks eviction ordering / cache_full_blocks / lazy eviction on reuse - PagedKVManager: on-demand allocate, block_table, slot arithmetic (slot = block_id*block_size + offset), free - Prefix caching: chained block hashing + find_longest_cache_hit (first-miss stop), enabling automatic cross-tenant prefix sharing Pure C++17, zero ggml/llama.cpp dependency, unit-tested to vLLM behavioral parity (4/4 suites green). Parity is on algorithm/behavior, not hash bytes. Phase 0 of docs/superpowers/plans/2026-06-19-paged-attention-llamacpp.md. Phases 1-5 (ggml storage, gather-to-scratch read path, Gate 0 correctness, benchmark wins, prefix-share serving) follow. 
Assisted-by: Claude:opus-4.8 [Claude Code] Signed-off-by: Ettore Di Giacinto --- backend/cpp/llama-cpp/paged/.gitignore | 4 + backend/cpp/llama-cpp/paged/Makefile | 18 ++ .../cpp/llama-cpp/paged/paged_kv_manager.cpp | 296 ++++++++++++++++++ .../cpp/llama-cpp/paged/paged_kv_manager.h | 108 +++++++ .../llama-cpp/paged/tests/test_block_pool.cpp | 42 +++ .../paged/tests/test_free_block_queue.cpp | 44 +++ .../paged/tests/test_paged_kv_manager.cpp | 32 ++ .../paged/tests/test_prefix_cache.cpp | 35 +++ 8 files changed, 579 insertions(+) create mode 100644 backend/cpp/llama-cpp/paged/.gitignore create mode 100644 backend/cpp/llama-cpp/paged/Makefile create mode 100644 backend/cpp/llama-cpp/paged/paged_kv_manager.cpp create mode 100644 backend/cpp/llama-cpp/paged/paged_kv_manager.h create mode 100644 backend/cpp/llama-cpp/paged/tests/test_block_pool.cpp create mode 100644 backend/cpp/llama-cpp/paged/tests/test_free_block_queue.cpp create mode 100644 backend/cpp/llama-cpp/paged/tests/test_paged_kv_manager.cpp create mode 100644 backend/cpp/llama-cpp/paged/tests/test_prefix_cache.cpp diff --git a/backend/cpp/llama-cpp/paged/.gitignore b/backend/cpp/llama-cpp/paged/.gitignore new file mode 100644 index 000000000..4e904a5d8 --- /dev/null +++ b/backend/cpp/llama-cpp/paged/.gitignore @@ -0,0 +1,4 @@ +tests/test_free_block_queue +tests/test_block_pool +tests/test_paged_kv_manager +tests/test_prefix_cache diff --git a/backend/cpp/llama-cpp/paged/Makefile b/backend/cpp/llama-cpp/paged/Makefile new file mode 100644 index 000000000..c0301fe18 --- /dev/null +++ b/backend/cpp/llama-cpp/paged/Makefile @@ -0,0 +1,18 @@ +CXX ?= g++ +CXXFLAGS ?= -std=c++17 -O2 -Wall -Wextra -I. 
+ +TESTS = test_free_block_queue test_block_pool test_paged_kv_manager test_prefix_cache +BINS = $(addprefix tests/,$(TESTS)) + +all: $(BINS) + +tests/%: tests/%.cpp paged_kv_manager.cpp paged_kv_manager.h + $(CXX) $(CXXFLAGS) -o $@ $< paged_kv_manager.cpp + +check: all + @for t in $(BINS); do echo "== $$t =="; ./$$t || exit 1; done + +clean: + rm -f $(BINS) + +.PHONY: all check clean diff --git a/backend/cpp/llama-cpp/paged/paged_kv_manager.cpp b/backend/cpp/llama-cpp/paged/paged_kv_manager.cpp new file mode 100644 index 000000000..20ff191ed --- /dev/null +++ b/backend/cpp/llama-cpp/paged/paged_kv_manager.cpp @@ -0,0 +1,296 @@ +#include "paged_kv_manager.h" +#include +#include + +namespace paged { + +// --------------------------------------------------------------------------- +// FreeBlockQueue (port of kv_cache_utils.py FreeKVCacheBlockQueue) +// --------------------------------------------------------------------------- + +FreeBlockQueue::FreeBlockQueue(const std::vector& blocks) { + num_free_blocks = blocks.size(); + for (size_t i = 0; i < blocks.size(); ++i) { + if (i > 0) blocks[i]->prev_free = blocks[i - 1]; + if (i + 1 < blocks.size()) blocks[i]->next_free = blocks[i + 1]; + } + if (!blocks.empty()) { + fake_head.next_free = blocks.front(); + blocks.front()->prev_free = &fake_head; + fake_tail.prev_free = blocks.back(); + blocks.back()->next_free = &fake_tail; + } else { + fake_head.next_free = &fake_tail; + fake_tail.prev_free = &fake_head; + } +} + +KVCacheBlock* FreeBlockQueue::popleft() { + KVCacheBlock* first = fake_head.next_free; + if (first == &fake_tail || first == nullptr) { + assert(num_free_blocks == 0); + throw std::runtime_error("No free blocks available"); + } + fake_head.next_free = first->next_free; + first->next_free->prev_free = &fake_head; + first->prev_free = first->next_free = nullptr; + num_free_blocks--; + return first; +} + +std::vector FreeBlockQueue::popleft_n(size_t n) { + std::vector ret; + if (n == 0) return ret; + 
assert(num_free_blocks >= n); + num_free_blocks -= n; + KVCacheBlock* curr = fake_head.next_free; + ret.reserve(n); + for (size_t i = 0; i < n; ++i) { + assert(curr != nullptr); + ret.push_back(curr); + KVCacheBlock* last = curr; + curr = curr->next_free; + last->prev_free = last->next_free = nullptr; + } + if (curr != nullptr) { + fake_head.next_free = curr; + curr->prev_free = &fake_head; + } + return ret; +} + +void FreeBlockQueue::remove(KVCacheBlock* block) { + if (!block->prev_free || !block->next_free) + throw std::runtime_error("remove() called on an invalid block"); + block->prev_free->next_free = block->next_free; + block->next_free->prev_free = block->prev_free; + block->prev_free = block->next_free = nullptr; + num_free_blocks--; +} + +void FreeBlockQueue::append(KVCacheBlock* block) { + KVCacheBlock* last = fake_tail.prev_free; + last->next_free = block; + block->prev_free = last; + block->next_free = &fake_tail; + fake_tail.prev_free = block; + num_free_blocks++; +} + +void FreeBlockQueue::append_n(const std::vector& blocks) { + if (blocks.empty()) return; + KVCacheBlock* last = fake_tail.prev_free; + for (KVCacheBlock* b : blocks) { + b->prev_free = last; + last->next_free = b; + last = b; + } + last->next_free = &fake_tail; + fake_tail.prev_free = last; + num_free_blocks += blocks.size(); +} + +void FreeBlockQueue::prepend_n(const std::vector& blocks) { + if (blocks.empty()) return; + KVCacheBlock* first = fake_head.next_free; + KVCacheBlock* prev = &fake_head; + for (KVCacheBlock* b : blocks) { + b->prev_free = prev; + prev->next_free = b; + prev = b; + } + prev->next_free = first; + first->prev_free = prev; + num_free_blocks += blocks.size(); +} + +std::vector FreeBlockQueue::get_all_free_blocks() const { + std::vector ret; + const KVCacheBlock* curr = fake_head.next_free; + while (curr && curr->next_free != nullptr) { + ret.push_back(const_cast(curr)); + curr = curr->next_free; + } + return ret; +} + +// 
--------------------------------------------------------------------------- +// BlockPool (port of block_pool.py) +// --------------------------------------------------------------------------- + +static std::vector make_ptrs(std::vector& v) { + std::vector p; + p.reserve(v.size()); + for (auto& b : v) p.push_back(&b); + return p; +} + +static std::vector make_block_vec(int32_t num_blocks) { + std::vector v; + v.reserve(num_blocks); + for (int32_t i = 0; i < num_blocks; ++i) v.emplace_back(i); + return v; +} + +BlockPool::BlockPool(int32_t num_blocks, bool enable_caching) + : enable_caching_(enable_caching), + blocks_(make_block_vec(num_blocks)), + ptrs_(make_ptrs(blocks_)), + free_queue_(ptrs_) { + // vLLM reserves block_id 0 as the null block (never cached). + null_block = free_queue_.popleft(); + null_block->is_null = true; +} + +bool BlockPool::maybe_evict_cached_block(KVCacheBlock* block) { + if (!block->has_hash) return false; + auto it = cached_block_hash_to_block_.find(block->block_hash); + if (it == cached_block_hash_to_block_.end() || it->second != block) return false; + cached_block_hash_to_block_.erase(it); + block->reset_hash(); + return true; +} + +std::vector BlockPool::get_new_blocks(size_t n) { + if (n > get_num_free_blocks()) + throw std::runtime_error("Cannot get free blocks from pool"); + auto ret = free_queue_.popleft_n(n); + for (KVCacheBlock* b : ret) { + if (enable_caching_) maybe_evict_cached_block(b); + assert(b->ref_cnt == 0); + b->ref_cnt += 1; + } + return ret; +} + +KVCacheBlock* BlockPool::get_cached_block(uint64_t block_hash) { + auto it = cached_block_hash_to_block_.find(block_hash); + return it == cached_block_hash_to_block_.end() ? nullptr : it->second; +} + +void BlockPool::touch(const std::vector& blocks) { + for (KVCacheBlock* b : blocks) { + // ref_cnt==0 means the block is a free-list eviction candidate; pull it out. 
+ if (b->ref_cnt == 0 && !b->is_null) free_queue_.remove(b); + b->ref_cnt += 1; + } +} + +void BlockPool::free_blocks(const std::vector& ordered_blocks) { + std::vector without_hash, with_hash; + for (KVCacheBlock* b : ordered_blocks) { + if (b->is_null) continue; + b->ref_cnt -= 1; + if (b->ref_cnt == 0) (b->has_hash ? with_hash : without_hash).push_back(b); + } + free_queue_.prepend_n(without_hash); // un-hashed: evicted first (front) + free_queue_.append_n(with_hash); // hashed: kept warm (tail) +} + +void BlockPool::cache_full_blocks(const std::vector& req_blocks, + size_t num_cached_blocks, size_t num_full_blocks, + const std::vector& block_hashes) { + for (size_t i = num_cached_blocks; i < num_full_blocks; ++i) { + KVCacheBlock* blk = req_blocks[i]; + if (blk->has_hash) continue; + blk->has_hash = true; + blk->block_hash = block_hashes[i]; + cached_block_hash_to_block_[blk->block_hash] = blk; + } +} + +// --------------------------------------------------------------------------- +// PagedKVManager (port of SingleTypeKVCacheManager / FullAttentionManager) +// --------------------------------------------------------------------------- + +static inline size_t cdiv(size_t a, size_t b) { return (a + b - 1) / b; } + +PagedKVManager::PagedKVManager(int32_t num_blocks, int block_size, bool enable_caching) + : block_size_(block_size), pool_(num_blocks, enable_caching) {} + +bool PagedKVManager::allocate(int seq_id, size_t total_tokens) { + auto& req = req_to_blocks_[seq_id]; + size_t need = cdiv(total_tokens, block_size_); + if (need <= req.size()) return true; + size_t add = need - req.size(); + if (add > pool_.get_num_free_blocks()) return false; // OOM + auto nb = pool_.get_new_blocks(add); + req.insert(req.end(), nb.begin(), nb.end()); + return true; +} + +std::vector PagedKVManager::block_table(int seq_id) const { + std::vector bt; + auto it = req_to_blocks_.find(seq_id); + if (it == req_to_blocks_.end()) return bt; + bt.reserve(it->second.size()); + for 
(KVCacheBlock* b : it->second) bt.push_back(b->block_id); + return bt; +} + +int64_t PagedKVManager::slot(int seq_id, int pos) const { + const auto& req = req_to_blocks_.at(seq_id); + int32_t phys = req[pos / block_size_]->block_id; + return (int64_t)phys * block_size_ + (pos % block_size_); +} + +std::vector PagedKVManager::slot_mapping(int seq_id, const std::vector& positions) const { + std::vector sm; + sm.reserve(positions.size()); + for (int p : positions) sm.push_back(slot(seq_id, p)); + return sm; +} + +void PagedKVManager::free(int seq_id) { + auto it = req_to_blocks_.find(seq_id); + if (it == req_to_blocks_.end()) return; + // Free in reverse so the tail of the block chain is evicted first (vLLM order). + std::vector ordered(it->second.rbegin(), it->second.rend()); + pool_.free_blocks(ordered); + req_to_blocks_.erase(it); +} + +// FNV-1a chained block hash. Deterministic and prefix-sensitive; folds the parent +// hash into the seed so each block hash transitively encodes its whole prefix +// (behavioral parity with vLLM hash_block_tokens chaining; vLLM uses sha256 bytes). 
+// Parameters: parent_hash is the previous block's hash (0 for the first block);
+// token_ids are the tokens of one block. Returns a non-zero 64-bit hash.
+uint64_t PagedKVManager::hash_block(uint64_t parent_hash, const std::vector<int>& token_ids) {
+    // Standard 64-bit FNV-1a offset basis (0xcbf29ce484222325) and prime (0x100000001b3).
+    constexpr uint64_t kFnvOffsetBasis = 14695981039346656037ull;
+    constexpr uint64_t kFnvPrime       = 1099511628211ull;
+
+    // Seeding with the parent hash chains this block to everything before it.
+    uint64_t h = kFnvOffsetBasis ^ parent_hash;
+    for (int t : token_ids) {
+        // Each token is folded in as one 32-bit unit (not byte by byte).
+        h ^= static_cast<uint64_t>(static_cast<uint32_t>(t));
+        h *= kFnvPrime;
+    }
+    if (h == 0) h = 0x9e3779b97f4a7c15ull; // never 0 (0 reads as "no hash")
+    return h;
+}
+
+// Computes the chained hash of every full block of token_ids. A trailing partial
+// block is ignored. Returns one hash per full block, in order; entry i depends on
+// all tokens of blocks 0..i. Returns an empty vector when block_size_ is not positive.
+std::vector<uint64_t> PagedKVManager::compute_block_hashes(const std::vector<int>& token_ids) const {
+    std::vector<uint64_t> hashes;
+    if (block_size_ <= 0) return hashes; // avoid dividing by zero below
+
+    const size_t bs = static_cast<size_t>(block_size_);
+    const size_t n_full = token_ids.size() / bs;
+    hashes.reserve(n_full);
+
+    uint64_t parent = 0; // NONE_HASH analogue
+    for (size_t i = 0; i < n_full; ++i) {
+        const std::vector<int> blk(token_ids.begin() + i * bs,
+                                   token_ids.begin() + (i + 1) * bs);
+        parent = hash_block(parent, blk);
+        hashes.push_back(parent);
+    }
+    return hashes;
+}
+
+// Looks up block_hashes in order and stops at the first hash that is not cached.
+// Every hit is passed to BlockPool::touch, which increments its ref_cnt and removes
+// it from the free queue if it was an eviction candidate.
+// Returns the number of cached tokens (hit blocks * block size).
+// NOTE(review): the references taken here are not recorded against any sequence in
+// this function, and PagedKVManager::free only releases blocks stored in
+// req_to_blocks_ -- confirm how callers are expected to release these hits.
+size_t PagedKVManager::get_computed_blocks(const std::vector<uint64_t>& block_hashes) {
+    std::vector<KVCacheBlock*> hits;
+    for (uint64_t bh : block_hashes) { // stop at first miss (prefix property)
+        KVCacheBlock* cb = pool_.get_cached_block(bh);
+        if (!cb) break;
+        hits.push_back(cb);
+    }
+    pool_.touch(hits); // ++ref_cnt, pull from free list
+    return hits.size() * static_cast<size_t>(block_size_);
+}
+
+// Registers the full blocks of seq_id in the prefix cache under block_hashes.
+// num_tokens is the number of tokens of the sequence that are already computed;
+// only num_tokens / block_size full blocks are considered.
+// The count is clamped to the blocks the sequence actually owns and to the number
+// of hashes supplied, because cache_full_blocks indexes both without bounds checks.
+// An unknown seq_id is a no-op and no longer creates an empty entry.
+void PagedKVManager::cache_blocks(int seq_id, const std::vector<uint64_t>& block_hashes, size_t num_tokens) {
+    if (block_size_ <= 0) return;
+    const auto it = req_to_blocks_.find(seq_id);
+    if (it == req_to_blocks_.end()) return;
+
+    const std::vector<KVCacheBlock*>& req = it->second;
+    size_t n_full = num_tokens / static_cast<size_t>(block_size_);
+    if (n_full > req.size()) n_full = req.size();
+    if (n_full > block_hashes.size()) n_full = block_hashes.size();
+    pool_.cache_full_blocks(req, /*num_cached=*/0, n_full, block_hashes);
+}
+
+} // namespace paged
diff --git a/backend/cpp/llama-cpp/paged/paged_kv_manager.h b/backend/cpp/llama-cpp/paged/paged_kv_manager.h
new file mode 100644
index 000000000..740280a7f
--- /dev/null
+++ b/backend/cpp/llama-cpp/paged/paged_kv_manager.h
@@ -0,0 +1,108 @@
+#pragma once
+// Paged KV cache block manager for llama.cpp (CPU-first prototype).
+//
+// Host-side block management is a faithful port of vLLM V1:
+//   vllm/v1/core/kv_cache_utils.py  (KVCacheBlock, FreeKVCacheBlockQueue, hash_block_tokens)
+//   vllm/v1/core/block_pool.py      (BlockPool: get_new_blocks/touch/free/evict/cache_full_blocks)
+//   vllm/v1/core/single_type_kv_cache_manager.py (allocate_new_blocks, find_longest_cache_hit)
+//
+// Parity is on behavior/algorithm (block chaining, first-miss stop, ref-counting,
+// LRU eviction order), not on exact hash bytes. This unit has zero ggml/llama.cpp
+// dependency so it can be unit-tested in isolation.
+
+#include <cstddef>
+#include <cstdint>
+#include <map>
+#include <unordered_map>
+#include <vector>
+
+namespace paged {
+
+// vLLM KVCacheBlock (kv_cache_utils.py).
+// Descriptor for one physical KV block. It carries no KV data, only bookkeeping.
+struct KVCacheBlock {
+    int32_t block_id = 0;          // physical block index
+    int ref_cnt = 0;               // number of users; 0 means it is a free-list candidate
+    bool has_hash = false;         // vLLM: _block_hash is set only when full+cached
+    uint64_t block_hash = 0;       // meaningful only while has_hash is true
+    bool is_null = false;          // set on the pool's reserved null block
+    KVCacheBlock* prev_free = nullptr; // intrusive free-list links; both are
+    KVCacheBlock* next_free = nullptr; // nullptr while the block is not queued
+
+    explicit KVCacheBlock(int32_t id = 0) : block_id(id) {}
+
+    // Drops the cached-hash marker so the block reads as "not cached".
+    void reset_hash() { has_hash = false; block_hash = 0; }
+};
+
+// Intrusive doubly-linked free list with fake head/tail (vLLM FreeKVCacheBlockQueue).
+// O(1) middle removal is required so touch() can pull a warm cached block out of the
+// free list when a later request hits its prefix.
+//
+// The queue does not own the blocks: it only links KVCacheBlock objects that live
+// elsewhere, so those objects must outlive the queue and must not be relocated
+// while they are linked.
+class FreeBlockQueue {
+public:
+    size_t num_free_blocks = 0; // number of blocks currently linked
+
+    // Links `blocks` in the given order; front() becomes the first block popped.
+    explicit FreeBlockQueue(const std::vector<KVCacheBlock*>& blocks);
+
+    // Linked blocks store the addresses of fake_head / fake_tail, so a copy would
+    // be wired to the source object's sentinels. Deleting the copy operations also
+    // suppresses the implicit moves.
+    FreeBlockQueue(const FreeBlockQueue&) = delete;
+    FreeBlockQueue& operator=(const FreeBlockQueue&) = delete;
+
+    // Removes and returns the first block; throws std::runtime_error when empty.
+    KVCacheBlock* popleft();
+    // Removes and returns the first n blocks, in queue order.
+    std::vector<KVCacheBlock*> popleft_n(size_t n);
+    // Unlinks `block` from anywhere in the queue; throws std::runtime_error if the
+    // block is not currently linked.
+    void remove(KVCacheBlock* block);
+    // Links `block` at the tail.
+    void append(KVCacheBlock* block);
+    // Links `blocks` at the tail, preserving their order.
+    void append_n(const std::vector<KVCacheBlock*>& blocks);
+    // Links `blocks` at the head, preserving their order.
+    void prepend_n(const std::vector<KVCacheBlock*>& blocks);
+    // Returns the linked blocks from head to tail without modifying the queue.
+    std::vector<KVCacheBlock*> get_all_free_blocks() const;
+
+private:
+    KVCacheBlock fake_head{-1}; // sentinel before the first free block
+    KVCacheBlock fake_tail{-1}; // sentinel after the last free block
+};
+
+// vLLM BlockPool (block_pool.py).
+class BlockPool { +public: + KVCacheBlock* null_block = nullptr; + + BlockPool(int32_t num_blocks, bool enable_caching); + std::vector get_new_blocks(size_t n); + KVCacheBlock* get_cached_block(uint64_t block_hash); + void touch(const std::vector& blocks); + void free_blocks(const std::vector& ordered_blocks); + void cache_full_blocks(const std::vector& req_blocks, + size_t num_cached_blocks, size_t num_full_blocks, + const std::vector& block_hashes); + size_t get_num_free_blocks() const { return free_queue_.num_free_blocks; } + +private: + bool maybe_evict_cached_block(KVCacheBlock* block); + + bool enable_caching_; + std::vector blocks_; // owns all block descriptors + std::vector ptrs_; + FreeBlockQueue free_queue_; + // vLLM stores hash -> {block_id: block} to allow duplicate-content blocks; the + // prototype keeps the last writer (single KV-cache group is sufficient for the wins). + std::unordered_map cached_block_hash_to_block_; +}; + +// Allocation + prefix-caching surface, ported from SingleTypeKVCacheManager / +// FullAttentionManager. Single KV-cache group; no extra_keys / eagle / spec-decode. +class PagedKVManager { +public: + PagedKVManager(int32_t num_blocks, int block_size, bool enable_caching); + + // Grow seq_id to cover total_tokens slots. Returns false on OOM (free queue empty). + bool allocate(int seq_id, size_t total_tokens); + std::vector block_table(int seq_id) const; + int64_t slot(int seq_id, int pos) const; + std::vector slot_mapping(int seq_id, const std::vector& positions) const; + void free(int seq_id); + int block_size() const { return block_size_; } + + // Prefix caching (win 3). 
+ static uint64_t hash_block(uint64_t parent_hash, const std::vector& token_ids); + std::vector compute_block_hashes(const std::vector& token_ids) const; + size_t get_computed_blocks(const std::vector& block_hashes); // returns num cached tokens + void cache_blocks(int seq_id, const std::vector& block_hashes, size_t num_tokens); + +protected: + int block_size_; + BlockPool pool_; + std::map> req_to_blocks_; +}; + +} // namespace paged diff --git a/backend/cpp/llama-cpp/paged/tests/test_block_pool.cpp b/backend/cpp/llama-cpp/paged/tests/test_block_pool.cpp new file mode 100644 index 000000000..a896fb1e8 --- /dev/null +++ b/backend/cpp/llama-cpp/paged/tests/test_block_pool.cpp @@ -0,0 +1,42 @@ +#include "../paged_kv_manager.h" +#include +#include +using namespace paged; + +int main() { + BlockPool pool(/*num_blocks=*/8, /*enable_caching=*/true); + // block 0 is reserved as null_block (vLLM pops one at init) + assert(pool.null_block != nullptr && pool.null_block->block_id == 0); + assert(pool.get_num_free_blocks() == 7); + + // get_new_blocks sets ref_cnt=1 and removes from free list + auto b = pool.get_new_blocks(2); + assert(b.size() == 2 && b[0]->ref_cnt == 1 && b[1]->ref_cnt == 1); + assert(pool.get_num_free_blocks() == 5); + + // cache two full blocks with chained hashes, then look them up + std::vector hashes = {1111, 2222}; + pool.cache_full_blocks(b, /*num_cached=*/0, /*num_full=*/2, hashes); + assert(b[0]->has_hash && b[0]->block_hash == 1111); + assert(pool.get_cached_block(1111) == b[0]); + assert(pool.get_cached_block(2222) == b[1]); + assert(pool.get_cached_block(9999) == nullptr); + + // free: hashed blocks go to tail (kept warm), so they remain queryable. 
+ pool.free_blocks(b); + assert(b[0]->ref_cnt == 0); + assert(pool.get_num_free_blocks() == 7); + assert(pool.get_cached_block(1111) == b[0]); // still cached/warm + + // touch a warm cached block: pulls it out of free list, ++ref_cnt + pool.touch({b[0]}); + assert(b[0]->ref_cnt == 1); + assert(pool.get_num_free_blocks() == 6); + + // exhausting the pool then allocating evicts a warm cached hash + auto rest = pool.get_new_blocks(pool.get_num_free_blocks()); + (void) rest; + assert(pool.get_cached_block(2222) == nullptr); // evicted on reuse + printf("test_block_pool: OK\n"); + return 0; +} diff --git a/backend/cpp/llama-cpp/paged/tests/test_free_block_queue.cpp b/backend/cpp/llama-cpp/paged/tests/test_free_block_queue.cpp new file mode 100644 index 000000000..f799f2a5e --- /dev/null +++ b/backend/cpp/llama-cpp/paged/tests/test_free_block_queue.cpp @@ -0,0 +1,44 @@ +#include "../paged_kv_manager.h" +#include +#include +#include + +using namespace paged; + +static std::vector make_blocks(int n) { + std::vector v; + v.reserve(n); + for (int i = 0; i < n; ++i) v.push_back(KVCacheBlock{i}); + return v; +} + +int main() { + // ordered 0..9 at init; popleft yields ascending block_ids + auto blocks = make_blocks(10); + std::vector ptrs; + for (auto& b : blocks) ptrs.push_back(&b); + FreeBlockQueue q(ptrs); + assert(q.num_free_blocks == 10); + + KVCacheBlock* b0 = q.popleft(); + assert(b0->block_id == 0); + assert(q.num_free_blocks == 9); + + auto two = q.popleft_n(2); // {1,2} + assert(two.size() == 2 && two[0]->block_id == 1 && two[1]->block_id == 2); + assert(q.num_free_blocks == 7); + + // O(1) middle removal: remove block 5 (currently free), count drops + q.remove(ptrs[5]); + assert(q.num_free_blocks == 6); // free: 3,4,6,7,8,9 + + // append puts a block at the tail; it comes back out only after the rest + q.append(b0); // free order now: 3,4,6,7,8,9,0 + assert(q.num_free_blocks == 7); + auto all = q.get_all_free_blocks(); + assert(all.front()->block_id == 3); + 
assert(all.back()->block_id == 0); + + printf("test_free_block_queue: OK\n"); + return 0; +} diff --git a/backend/cpp/llama-cpp/paged/tests/test_paged_kv_manager.cpp b/backend/cpp/llama-cpp/paged/tests/test_paged_kv_manager.cpp new file mode 100644 index 000000000..b4f63c3a0 --- /dev/null +++ b/backend/cpp/llama-cpp/paged/tests/test_paged_kv_manager.cpp @@ -0,0 +1,32 @@ +#include "../paged_kv_manager.h" +#include +#include +using namespace paged; + +int main() { + PagedKVManager m(/*num_blocks=*/8, /*block_size=*/16, /*enable_caching=*/false); + // 20 tokens -> ceil(20/16)=2 blocks + assert(m.allocate(/*seq=*/0, 20)); + auto bt = m.block_table(0); + assert(bt.size() == 2); + + // slot arithmetic: pos 0 -> block bt[0]*16 + 0 ; pos 17 -> bt[1]*16 + 1 + assert(m.slot(0, 0) == (int64_t)bt[0] * 16 + 0); + assert(m.slot(0, 17) == (int64_t)bt[1] * 16 + 1); + + auto sm = m.slot_mapping(0, {0, 16, 17}); + assert(sm.size() == 3 && sm[1] == (int64_t)bt[1] * 16 + 0); + + // growing the same seq reuses existing blocks, adds only new ones + assert(m.allocate(0, 40)); // ceil(40/16)=3 -> +1 block + assert(m.block_table(0).size() == 3); + + // OOM: blocks left = 8 - 1(null) - 3 = 4 blocks; ask for 5 blocks + assert(m.allocate(1, 5 * 16) == false); + + // free returns blocks to the pool for reuse + m.free(0); + assert(m.allocate(1, 5 * 16)); // now fits + printf("test_paged_kv_manager: OK\n"); + return 0; +} diff --git a/backend/cpp/llama-cpp/paged/tests/test_prefix_cache.cpp b/backend/cpp/llama-cpp/paged/tests/test_prefix_cache.cpp new file mode 100644 index 000000000..b8151936a --- /dev/null +++ b/backend/cpp/llama-cpp/paged/tests/test_prefix_cache.cpp @@ -0,0 +1,35 @@ +#include "../paged_kv_manager.h" +#include +#include +#include +using namespace paged; + +int main() { + PagedKVManager m(/*num_blocks=*/64, /*block_size=*/16, /*enable_caching=*/true); + + // shared prefix of 32 tokens (2 full blocks) + distinct suffix + std::vector shared(32); + for (int i = 0; i < 32; ++i) 
shared[i] = 100 + i; + + // chained hashing is deterministic and prefix-sensitive + auto h = m.compute_block_hashes(shared); + assert(h.size() == 2); + auto h2 = m.compute_block_hashes(shared); + assert(h == h2); // deterministic + std::vector other = shared; other[0] = 999; + assert(m.compute_block_hashes(other)[0] != h[0]); // sensitive to content + + // seq 0: cold, no cache hit yet + assert(m.get_computed_blocks(h) == 0); + assert(m.allocate(0, 32)); + m.cache_blocks(0, h, 32); + + // seq 1: warm — the 2 shared blocks are a cache hit (32 tokens) + assert(m.get_computed_blocks(h) == 32); + + // first-miss stop: a chain that diverges after block 1 hits only 1 block + auto hmix = h; hmix[1] = 0xDEADBEEF; + assert(m.get_computed_blocks(hmix) == 16); + printf("test_prefix_cache: OK\n"); + return 0; +}