feat(paged): vLLM-parity KV block manager (Phase 0, CPU-first prototype)

Host-side paged-attention block manager ported faithfully from vLLM V1 (block_pool.py, kv_cache_utils.py, single_type_kv_cache_manager.py): - KVCacheBlock + intrusive LRU FreeBlockQueue (O(1) middle removal) - BlockPool: get_new_blocks / touch / free_blocks eviction ordering / cache_full_blocks / lazy eviction on reuse - PagedKVManager: on-demand allocate, block_table, slot arithmetic (slot = block_id*block_size + offset), free - Prefix caching: chained block hashing + find_longest_cache_hit (first-miss stop), enabling automatic cross-tenant prefix sharing Pure C++17, zero ggml/llama.cpp dependency, unit-tested to vLLM behavioral parity (4/4 suites green). Parity is on algorithm/behavior, not hash bytes. Phase 0 of docs/superpowers/plans/2026-06-19-paged-attention-llamacpp.md. Phases 1-5 (ggml storage, gather-to-scratch read path, Gate 0 correctness, benchmark wins, prefix-share serving) follow. Assisted-by: Claude:opus-4.8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-06-23 16:19:07 -04:00 · 2026-06-19 08:26:31 +00:00
parent 29dbba7a25
commit edb1a11abc
8 changed files with 579 additions and 0 deletions
--- a/backend/cpp/llama-cpp/paged/.gitignore
+++ b/backend/cpp/llama-cpp/paged/.gitignore
@@ -0,0 +1,4 @@
+tests/test_free_block_queue
+tests/test_block_pool
+tests/test_paged_kv_manager
+tests/test_prefix_cache
--- a/backend/cpp/llama-cpp/paged/Makefile
+++ b/backend/cpp/llama-cpp/paged/Makefile
@@ -0,0 +1,18 @@
+# Builds and runs the stand-alone unit tests for the paged KV block manager.
+# CXX and CXXFLAGS use ?= so they can be overridden from the environment.
+CXX ?= g++
+CXXFLAGS ?= -std=c++17 -O2 -Wall -Wextra -I.
+
+TESTS = test_free_block_queue test_block_pool test_paged_kv_manager test_prefix_cache
+BINS  = $(addprefix tests/,$(TESTS))
+
+all: $(BINS)
+
+# The tests verify everything through assert(), so NDEBUG must stay undefined
+# even when the caller supplies its own CXXFLAGS (e.g. a release set containing
+# -DNDEBUG). -UNDEBUG is placed after $(CXXFLAGS) so it takes precedence.
+tests/%: tests/%.cpp paged_kv_manager.cpp paged_kv_manager.h
+	$(CXX) $(CXXFLAGS) -UNDEBUG -o $@ $< paged_kv_manager.cpp
+
+# Run every test binary; stop at the first failure.
+check: all
+	@for t in $(BINS); do echo "== $$t =="; ./$$t || exit 1; done
+
+clean:
+	rm -f $(BINS)
+
+.PHONY: all check clean
--- a/backend/cpp/llama-cpp/paged/paged_kv_manager.cpp
+++ b/backend/cpp/llama-cpp/paged/paged_kv_manager.cpp
@@ -0,0 +1,296 @@
+#include "paged_kv_manager.h"
+#include <cassert>
+#include <stdexcept>
+
+namespace paged {
+
+// ---------------------------------------------------------------------------
+// FreeBlockQueue  (port of kv_cache_utils.py FreeKVCacheBlockQueue)
+// ---------------------------------------------------------------------------
+
+// Thread `blocks` into a doubly-linked list, in the given order, between the
+// two sentinel nodes. With an empty input the sentinels are linked to each other.
+FreeBlockQueue::FreeBlockQueue(const std::vector<KVCacheBlock*>& blocks) {
+    num_free_blocks = blocks.size();
+
+    if (blocks.empty()) {
+        fake_head.next_free = &fake_tail;
+        fake_tail.prev_free = &fake_head;
+        return;
+    }
+
+    // Link each adjacent pair; the two outer ends are attached to the sentinels below.
+    for (size_t idx = 1; idx < blocks.size(); ++idx) {
+        KVCacheBlock* left  = blocks[idx - 1];
+        KVCacheBlock* right = blocks[idx];
+        right->prev_free = left;
+        left->next_free  = right;
+    }
+
+    KVCacheBlock* first = blocks.front();
+    KVCacheBlock* last  = blocks.back();
+    fake_head.next_free = first;
+    first->prev_free    = &fake_head;
+    fake_tail.prev_free = last;
+    last->next_free     = &fake_tail;
+}
+
+// Detach and return the block at the front of the free list.
+// Throws std::runtime_error when the list holds no block.
+KVCacheBlock* FreeBlockQueue::popleft() {
+    KVCacheBlock* const front = fake_head.next_free;
+    const bool is_empty = (front == &fake_tail) || (front == nullptr);
+    if (is_empty) {
+        assert(num_free_blocks == 0);
+        throw std::runtime_error("No free blocks available");
+    }
+
+    // Splice the head sentinel onto the successor of the removed block.
+    fake_head.next_free = front->next_free;
+    front->next_free->prev_free = &fake_head;
+
+    // A detached block carries no links.
+    front->prev_free = nullptr;
+    front->next_free = nullptr;
+    num_free_blocks -= 1;
+    return front;
+}
+
+// Detach the first `n` blocks from the front of the free list and return them
+// in list order. `n == 0` returns an empty vector and leaves the list untouched.
+//
+// Throws std::runtime_error if fewer than `n` blocks are free. This is a real
+// runtime check rather than an assert: with NDEBUG defined an oversized request
+// would otherwise wrap `num_free_blocks` and hand the tail sentinel to the caller.
+std::vector<KVCacheBlock*> FreeBlockQueue::popleft_n(size_t n) {
+    std::vector<KVCacheBlock*> ret;
+    if (n == 0) return ret;
+    if (n > num_free_blocks)
+        throw std::runtime_error("No free blocks available");
+
+    ret.reserve(n);
+    KVCacheBlock* curr = fake_head.next_free;
+    for (size_t i = 0; i < n; ++i) {
+        // The count check above guarantees a real block here unless the list is corrupt.
+        assert(curr != nullptr && curr != &fake_tail);
+        ret.push_back(curr);
+        KVCacheBlock* last = curr;
+        curr = curr->next_free;
+        last->prev_free = last->next_free = nullptr;   // detached blocks carry no links
+    }
+    num_free_blocks -= n;
+
+    // `curr` is now the first surviving block (or the tail sentinel); reattach it to the head.
+    if (curr != nullptr) {
+        fake_head.next_free = curr;
+        curr->prev_free = &fake_head;
+    }
+    return ret;
+}
+
+// Unlink `block` from wherever it sits in the free list, in O(1).
+// Throws std::runtime_error if either neighbour link is null, i.e. the block
+// is not currently threaded into a list.
+void FreeBlockQueue::remove(KVCacheBlock* block) {
+    KVCacheBlock* const before = block->prev_free;
+    KVCacheBlock* const after  = block->next_free;
+    if (before == nullptr || after == nullptr) {
+        throw std::runtime_error("remove() called on an invalid block");
+    }
+
+    // Bridge the two neighbours over the removed node.
+    before->next_free = after;
+    after->prev_free  = before;
+
+    block->prev_free = nullptr;
+    block->next_free = nullptr;
+    num_free_blocks -= 1;
+}
+
+// Insert `block` at the back of the free list, directly before the tail sentinel.
+void FreeBlockQueue::append(KVCacheBlock* block) {
+    KVCacheBlock* const old_back = fake_tail.prev_free;
+
+    // Link old back <-> block first, then block <-> tail sentinel.
+    old_back->next_free = block;
+    block->prev_free    = old_back;
+    block->next_free    = &fake_tail;
+    fake_tail.prev_free = block;
+
+    num_free_blocks += 1;
+}
+
+// Insert every block of `blocks` at the back of the free list, keeping their
+// relative order. An empty input is a no-op.
+void FreeBlockQueue::append_n(const std::vector<KVCacheBlock*>& blocks) {
+    if (blocks.empty()) {
+        return;
+    }
+
+    // `cursor` is always the node the next incoming block gets chained after.
+    KVCacheBlock* cursor = fake_tail.prev_free;
+    for (size_t idx = 0; idx < blocks.size(); ++idx) {
+        KVCacheBlock* incoming = blocks[idx];
+        incoming->prev_free = cursor;
+        cursor->next_free   = incoming;
+        cursor = incoming;
+    }
+
+    // Close the chain on the tail sentinel.
+    cursor->next_free   = &fake_tail;
+    fake_tail.prev_free = cursor;
+    num_free_blocks += blocks.size();
+}
+
+// Insert every block of `blocks` at the front of the free list, keeping their
+// relative order (blocks[0] becomes the new front). An empty input is a no-op.
+void FreeBlockQueue::prepend_n(const std::vector<KVCacheBlock*>& blocks) {
+    if (blocks.empty()) {
+        return;
+    }
+
+    // Remember the current front (possibly the tail sentinel) to reattach afterwards.
+    KVCacheBlock* const old_front = fake_head.next_free;
+    KVCacheBlock* cursor = &fake_head;
+    for (size_t idx = 0; idx < blocks.size(); ++idx) {
+        KVCacheBlock* incoming = blocks[idx];
+        incoming->prev_free = cursor;
+        cursor->next_free   = incoming;
+        cursor = incoming;
+    }
+
+    cursor->next_free    = old_front;
+    old_front->prev_free = cursor;
+    num_free_blocks += blocks.size();
+}
+
+// Return the blocks currently in the free list, front to back, without
+// modifying the list.
+std::vector<KVCacheBlock*> FreeBlockQueue::get_all_free_blocks() const {
+    std::vector<KVCacheBlock*> snapshot;
+    // The tail sentinel's next_free is never set, so the walk stops on it
+    // without emitting it.
+    for (const KVCacheBlock* node = fake_head.next_free;
+         node != nullptr && node->next_free != nullptr;
+         node = node->next_free) {
+        snapshot.push_back(const_cast<KVCacheBlock*>(node));
+    }
+    return snapshot;
+}
+
+// ---------------------------------------------------------------------------
+// BlockPool  (port of block_pool.py)
+// ---------------------------------------------------------------------------
+
+// Collect the address of every element of `v`, in order. The result is only
+// valid while `v` is neither reallocated nor destroyed.
+static std::vector<KVCacheBlock*> make_ptrs(std::vector<KVCacheBlock>& v) {
+    std::vector<KVCacheBlock*> addrs;
+    addrs.reserve(v.size());
+    for (size_t idx = 0; idx < v.size(); ++idx) {
+        addrs.push_back(&v[idx]);
+    }
+    return addrs;
+}
+
+// Create `num_blocks` block descriptors whose block_id runs 0 .. num_blocks-1.
+static std::vector<KVCacheBlock> make_block_vec(int32_t num_blocks) {
+    std::vector<KVCacheBlock> descriptors;
+    descriptors.reserve(num_blocks);
+    int32_t next_id = 0;
+    while (next_id < num_blocks) {
+        descriptors.emplace_back(next_id);
+        ++next_id;
+    }
+    return descriptors;
+}
+
+// Create `num_blocks` descriptors, put all of them in the free list, then pop
+// the first one and keep it aside as the null block.
+// popleft() throws std::runtime_error if the free list is empty (num_blocks == 0).
+BlockPool::BlockPool(int32_t num_blocks, bool enable_caching)
+    : enable_caching_(enable_caching),
+      blocks_(make_block_vec(num_blocks)),
+      ptrs_(make_ptrs(blocks_)),
+      free_queue_(ptrs_) {
+    // vLLM reserves block_id 0 as the null block (never cached).
+    KVCacheBlock* reserved = free_queue_.popleft();
+    reserved->is_null = true;
+    null_block = reserved;
+}
+
+// Drop `block`'s cached identity so it can be reused for new content.
+//
+// Returns true only when the hash -> block index entry pointed at `block` and
+// was erased; returns false when the block had no hash or the entry belongs to
+// another block.
+//
+// The block's own hash is cleared in both cases. cache_full_blocks() keeps the
+// last writer for a given hash, so an older block with the same hash can lose
+// its index entry while still flagged has_hash. If that stale flag survived
+// reuse, cache_full_blocks() would skip the block (it ignores blocks that
+// already have a hash) and its new content could never be cached.
+bool BlockPool::maybe_evict_cached_block(KVCacheBlock* block) {
+    if (!block->has_hash) return false;
+    auto it = cached_block_hash_to_block_.find(block->block_hash);
+    const bool owns_entry =
+        (it != cached_block_hash_to_block_.end() && it->second == block);
+    if (owns_entry) cached_block_hash_to_block_.erase(it);
+    block->reset_hash();
+    return owns_entry;
+}
+
+// Take `n` blocks from the front of the free list and hand them out with
+// ref_cnt == 1. When caching is enabled, any cached hash a block still carries
+// is evicted first.
+// Throws std::runtime_error if fewer than `n` blocks are free.
+std::vector<KVCacheBlock*> BlockPool::get_new_blocks(size_t n) {
+    const size_t available = get_num_free_blocks();
+    if (available < n) {
+        throw std::runtime_error("Cannot get free blocks from pool");
+    }
+
+    std::vector<KVCacheBlock*> fresh = free_queue_.popleft_n(n);
+    for (size_t idx = 0; idx < fresh.size(); ++idx) {
+        KVCacheBlock* blk = fresh[idx];
+        if (enable_caching_) {
+            maybe_evict_cached_block(blk);
+        }
+        assert(blk->ref_cnt == 0);
+        ++blk->ref_cnt;
+    }
+    return fresh;
+}
+
+// Look up the block registered under `block_hash`; nullptr when none is cached.
+KVCacheBlock* BlockPool::get_cached_block(uint64_t block_hash) {
+    const auto found = cached_block_hash_to_block_.find(block_hash);
+    if (found != cached_block_hash_to_block_.end()) {
+        return found->second;
+    }
+    return nullptr;
+}
+
+// Take one additional reference on every block in `blocks`. A non-null block
+// whose ref_cnt is 0 is first removed from the free list.
+void BlockPool::touch(const std::vector<KVCacheBlock*>& blocks) {
+    for (size_t idx = 0; idx < blocks.size(); ++idx) {
+        KVCacheBlock* blk = blocks[idx];
+        // ref_cnt == 0 means the block is parked in the free list as an eviction candidate.
+        const bool parked_in_free_list = (blk->ref_cnt == 0) && !blk->is_null;
+        if (parked_in_free_list) {
+            free_queue_.remove(blk);
+        }
+        ++blk->ref_cnt;
+    }
+}
+
+// Release one reference on every non-null block in `ordered_blocks`. Blocks
+// whose count reaches 0 go back to the free list: un-hashed ones at the front
+// (reused first), hashed ones at the back (kept available for cache hits longer).
+void BlockPool::free_blocks(const std::vector<KVCacheBlock*>& ordered_blocks) {
+    std::vector<KVCacheBlock*> cold;   // released, carries no hash
+    std::vector<KVCacheBlock*> warm;   // released, still registered under a hash
+
+    for (size_t idx = 0; idx < ordered_blocks.size(); ++idx) {
+        KVCacheBlock* blk = ordered_blocks[idx];
+        if (blk->is_null) {
+            continue;
+        }
+        --blk->ref_cnt;
+        if (blk->ref_cnt != 0) {
+            continue;
+        }
+        if (blk->has_hash) {
+            warm.push_back(blk);
+        } else {
+            cold.push_back(blk);
+        }
+    }
+
+    free_queue_.prepend_n(cold);
+    free_queue_.append_n(warm);
+}
+
+// Register blocks [num_cached_blocks, num_full_blocks) of `req_blocks` in the
+// hash -> block index, using block_hashes[i] for req_blocks[i].
+//
+// - No-op when caching is disabled: get_new_blocks() only evicts index entries
+//   when caching is enabled, so entries added here would otherwise keep
+//   pointing at blocks that have since been reused for other content.
+// - The upper bound is clamped to both vector sizes, so a caller passing more
+//   "full" blocks than it owns or has hashes for cannot index out of bounds.
+// - Blocks that already carry a hash, and the null block, are skipped.
+// - If the hash is already registered for a different block, the entry is
+//   overwritten (last writer wins).
+void BlockPool::cache_full_blocks(const std::vector<KVCacheBlock*>& req_blocks,
+                                  size_t num_cached_blocks, size_t num_full_blocks,
+                                  const std::vector<uint64_t>& block_hashes) {
+    if (!enable_caching_) return;
+
+    size_t end = num_full_blocks;
+    if (end > req_blocks.size())   end = req_blocks.size();
+    if (end > block_hashes.size()) end = block_hashes.size();
+
+    for (size_t i = num_cached_blocks; i < end; ++i) {
+        KVCacheBlock* blk = req_blocks[i];
+        if (blk->has_hash || blk->is_null) continue;
+        blk->has_hash = true;
+        blk->block_hash = block_hashes[i];
+        cached_block_hash_to_block_[blk->block_hash] = blk;
+    }
+}
+
+// ---------------------------------------------------------------------------
+// PagedKVManager  (port of SingleTypeKVCacheManager / FullAttentionManager)
+// ---------------------------------------------------------------------------
+
+// Ceiling division: smallest q with q * b >= a. `b` must be non-zero.
+static inline size_t cdiv(size_t a, size_t b) { return (a + b - 1) / b; }
+
+// Create a manager over `num_blocks` blocks of `block_size` token slots each.
+//
+// Throws std::invalid_argument if `block_size` is not positive: every later
+// computation (cdiv in allocate, pos / block_size_ in slot, token count /
+// block_size_ in the hashing helpers) divides by it.
+PagedKVManager::PagedKVManager(int32_t num_blocks, int block_size, bool enable_caching)
+    : block_size_(block_size), pool_(num_blocks, enable_caching) {
+    if (block_size <= 0)
+        throw std::invalid_argument("block_size must be positive");
+}
+
+// Grow the block list of `seq_id` until it covers `total_tokens` slots.
+// Returns true if the sequence already had enough blocks or the growth
+// succeeded; returns false, allocating nothing, when the pool has fewer free
+// blocks than the growth needs. An unseen seq_id gets an (initially empty) entry.
+bool PagedKVManager::allocate(int seq_id, size_t total_tokens) {
+    std::vector<KVCacheBlock*>& owned = req_to_blocks_[seq_id];
+
+    const size_t required = cdiv(total_tokens, block_size_);
+    if (owned.size() >= required) {
+        return true;
+    }
+
+    const size_t missing = required - owned.size();
+    if (pool_.get_num_free_blocks() < missing) {
+        return false; // OOM
+    }
+
+    const std::vector<KVCacheBlock*> fresh = pool_.get_new_blocks(missing);
+    for (KVCacheBlock* blk : fresh) {
+        owned.push_back(blk);
+    }
+    return true;
+}
+
+// Return the block ids owned by `seq_id`, in logical order. An unknown seq_id
+// yields an empty table.
+std::vector<int32_t> PagedKVManager::block_table(int seq_id) const {
+    std::vector<int32_t> ids;
+    const auto entry = req_to_blocks_.find(seq_id);
+    if (entry != req_to_blocks_.end()) {
+        const std::vector<KVCacheBlock*>& owned = entry->second;
+        ids.reserve(owned.size());
+        for (size_t idx = 0; idx < owned.size(); ++idx) {
+            ids.push_back(owned[idx]->block_id);
+        }
+    }
+    return ids;
+}
+
+// Translate logical token position `pos` of `seq_id` into a flat slot index:
+//   slot = block_id * block_size + (pos % block_size)
+//
+// Throws std::out_of_range if `seq_id` is unknown (from std::map::at), if `pos`
+// is negative, or if `pos` lies beyond the blocks currently allocated to the
+// sequence. The last two cases previously indexed the block vector out of bounds.
+int64_t PagedKVManager::slot(int seq_id, int pos) const {
+    const auto& req = req_to_blocks_.at(seq_id);
+    if (pos < 0)
+        throw std::out_of_range("slot(): negative position");
+    const size_t logical = static_cast<size_t>(pos) / static_cast<size_t>(block_size_);
+    if (logical >= req.size())
+        throw std::out_of_range("slot(): position beyond allocated blocks");
+    const int32_t phys = req[logical]->block_id;
+    return static_cast<int64_t>(phys) * block_size_ + (pos % block_size_);
+}
+
+// Apply slot() to every entry of `positions`, preserving order.
+std::vector<int64_t> PagedKVManager::slot_mapping(int seq_id, const std::vector<int>& positions) const {
+    std::vector<int64_t> slots;
+    slots.reserve(positions.size());
+    for (size_t idx = 0; idx < positions.size(); ++idx) {
+        slots.push_back(slot(seq_id, positions[idx]));
+    }
+    return slots;
+}
+
+// Release every block owned by `seq_id` and forget the sequence. Unknown
+// sequences are ignored.
+void PagedKVManager::free(int seq_id) {
+    const auto entry = req_to_blocks_.find(seq_id);
+    if (entry == req_to_blocks_.end()) {
+        return;
+    }
+
+    // Hand the blocks back last-to-first so the tail of the block chain is
+    // evicted first (vLLM order).
+    const std::vector<KVCacheBlock*>& owned = entry->second;
+    std::vector<KVCacheBlock*> tail_first;
+    tail_first.reserve(owned.size());
+    for (size_t idx = owned.size(); idx > 0; --idx) {
+        tail_first.push_back(owned[idx - 1]);
+    }
+
+    pool_.free_blocks(tail_first);
+    req_to_blocks_.erase(entry);
+}
+
+// FNV-1a-style chained block hash (64-bit FNV offset basis and prime, one
+// 32-bit token folded per step). The parent hash is XOR-ed into the seed, so
+// each block hash depends on every block before it. This gives behavioral
+// parity with vLLM's hash_block_tokens chaining; the hash bytes differ.
+// The result is never 0, because 0 reads as "no hash".
+uint64_t PagedKVManager::hash_block(uint64_t parent_hash, const std::vector<int>& token_ids) {
+    const uint64_t offset_basis    = 1469598103934665603ull;
+    const uint64_t prime           = 1099511628211ull;
+    const uint64_t zero_substitute = 0x9e3779b97f4a7c15ull;
+
+    uint64_t acc = offset_basis ^ parent_hash;
+    for (size_t idx = 0; idx < token_ids.size(); ++idx) {
+        const uint32_t token_bits = static_cast<uint32_t>(token_ids[idx]);
+        acc = (acc ^ static_cast<uint64_t>(token_bits)) * prime;
+    }
+    return acc != 0 ? acc : zero_substitute;
+}
+
+// Hash every full block of `token_ids`, chaining each hash into the next.
+// Trailing tokens that do not fill a whole block are ignored. The first block
+// is seeded with parent hash 0 (NONE_HASH analogue).
+std::vector<uint64_t> PagedKVManager::compute_block_hashes(const std::vector<int>& token_ids) const {
+    const size_t full_blocks = token_ids.size() / block_size_;
+
+    std::vector<uint64_t> chain;
+    uint64_t running = 0;
+    auto cursor = token_ids.begin();
+    for (size_t done = 0; done < full_blocks; ++done) {
+        const auto block_end = cursor + block_size_;
+        const std::vector<int> window(cursor, block_end);
+        running = hash_block(running, window);
+        chain.push_back(running);
+        cursor = block_end;
+    }
+    return chain;
+}
+
+// Find the longest cached prefix of `block_hashes` and return its length in
+// tokens (hit blocks * block_size). The scan stops at the first hash that is
+// not cached. Every hit block is touched: its ref_cnt is incremented and it is
+// removed from the free list if it was parked there.
+//
+// NOTE(review): the references taken here are not recorded under any seq_id in
+// this function, and free() only releases blocks listed in req_to_blocks_.
+// Confirm that a later phase attaches the hit blocks to the requesting
+// sequence; otherwise these references are never dropped.
+size_t PagedKVManager::get_computed_blocks(const std::vector<uint64_t>& block_hashes) {
+    std::vector<KVCacheBlock*> matched;
+    for (size_t idx = 0; idx < block_hashes.size(); ++idx) {
+        KVCacheBlock* cached = pool_.get_cached_block(block_hashes[idx]);
+        if (cached == nullptr) {
+            break;
+        }
+        matched.push_back(cached);
+    }
+
+    pool_.touch(matched);
+
+    const size_t tokens_per_block = static_cast<size_t>(block_size_);
+    return matched.size() * tokens_per_block;
+}
+
+// Register the full blocks of `seq_id` (the first num_tokens / block_size of
+// them) in the prefix cache under the matching entries of `block_hashes`.
+//
+// Unknown sequences are ignored rather than created: the previous operator[]
+// lookup inserted an empty entry and then indexed into it. The block count is
+// also clamped to the blocks the sequence owns and to the hashes supplied, so
+// an oversized `num_tokens` or a short hash list cannot read out of bounds.
+void PagedKVManager::cache_blocks(int seq_id, const std::vector<uint64_t>& block_hashes, size_t num_tokens) {
+    const auto it = req_to_blocks_.find(seq_id);
+    if (it == req_to_blocks_.end()) return;
+    const std::vector<KVCacheBlock*>& req = it->second;
+
+    size_t n_full = num_tokens / block_size_;
+    if (n_full > req.size())          n_full = req.size();
+    if (n_full > block_hashes.size()) n_full = block_hashes.size();
+
+    pool_.cache_full_blocks(req, /*num_cached=*/0, n_full, block_hashes);
+}
+
+} // namespace paged
--- a/backend/cpp/llama-cpp/paged/paged_kv_manager.h
+++ b/backend/cpp/llama-cpp/paged/paged_kv_manager.h
@@ -0,0 +1,108 @@
+#pragma once
+// Paged KV cache block manager for llama.cpp (CPU-first prototype).
+//
+// Host-side block management is a faithful port of vLLM V1:
+//   vllm/v1/core/kv_cache_utils.py            (KVCacheBlock, FreeKVCacheBlockQueue, hash_block_tokens)
+//   vllm/v1/core/block_pool.py                (BlockPool: get_new_blocks/touch/free/evict/cache_full_blocks)
+//   vllm/v1/core/single_type_kv_cache_manager.py (allocate_new_blocks, find_longest_cache_hit)
+//
+// Parity is on behavior/algorithm (block chaining, first-miss stop, ref-counting,
+// LRU eviction order), not on exact hash bytes. This unit has zero ggml/llama.cpp
+// dependency so it can be unit-tested in isolation.
+
+#include <cstdint>
+#include <vector>
+#include <unordered_map>
+#include <map>
+
+namespace paged {
+
+// vLLM KVCacheBlock (kv_cache_utils.py): per-block bookkeeping record.
+struct KVCacheBlock {
+    int32_t  block_id   = 0;       // block index; slot() multiplies it by block_size
+    int      ref_cnt    = 0;       // reference count; get_new_blocks() expects 0 on blocks taken from the free list
+    bool     has_hash   = false;   // vLLM: _block_hash is set only when full+cached
+    uint64_t block_hash = 0;       // meaningful only while has_hash is true
+    bool     is_null    = false;   // set on the one block BlockPool reserves at construction
+    KVCacheBlock* prev_free = nullptr;   // free-list links; reset to nullptr when the block leaves the list
+    KVCacheBlock* next_free = nullptr;
+
+    explicit KVCacheBlock(int32_t id = 0) : block_id(id) {}
+    void reset_hash() { has_hash = false; block_hash = 0; }   // drop the cached identity
+};
+
+// Intrusive doubly-linked free list with fake head/tail (vLLM FreeKVCacheBlockQueue).
+// O(1) middle removal is required so touch() can pull a warm cached block out of the
+// free list when a later request hits its prefix.
+//
+// The queue does not own the blocks; it threads them through their
+// prev_free/next_free fields. The first and last blocks point at the sentinel
+// nodes stored inside this object, so a copied or moved queue would leave the
+// blocks linked to the original object's sentinels. Copying is therefore
+// deleted, which also suppresses the implicit move operations.
+class FreeBlockQueue {
+public:
+    size_t num_free_blocks = 0;   // number of blocks currently linked between the sentinels
+
+    // Links `blocks` in the given order; the pointed-to blocks must outlive the queue.
+    explicit FreeBlockQueue(const std::vector<KVCacheBlock*>& blocks);
+    FreeBlockQueue(const FreeBlockQueue&) = delete;
+    FreeBlockQueue& operator=(const FreeBlockQueue&) = delete;
+
+    KVCacheBlock* popleft();                                   // pop the front block; throws std::runtime_error when empty
+    std::vector<KVCacheBlock*> popleft_n(size_t n);            // pop the first n blocks, in list order
+    void remove(KVCacheBlock* block);                          // unlink from any position; throws if the block is not linked
+    void append(KVCacheBlock* block);                          // insert at the back
+    void append_n(const std::vector<KVCacheBlock*>& blocks);   // insert all at the back, order preserved
+    void prepend_n(const std::vector<KVCacheBlock*>& blocks);  // insert all at the front, order preserved
+    std::vector<KVCacheBlock*> get_all_free_blocks() const;    // front-to-back snapshot
+
+private:
+    KVCacheBlock fake_head{-1};   // sentinel before the first block
+    KVCacheBlock fake_tail{-1};   // sentinel after the last block
+};
+
+// vLLM BlockPool (block_pool.py).
+//
+// Owns every block descriptor and tracks which are free, which are referenced,
+// and which are registered in the prefix-cache index. Raw pointers into
+// `blocks_` are held by `ptrs_`, by the free list and by the hash index, so an
+// implicit copy would duplicate `blocks_` while all those pointers kept
+// referring to the source object. Copying is therefore deleted, which also
+// suppresses the implicit move operations.
+class BlockPool {
+public:
+    KVCacheBlock* null_block = nullptr;   // the block reserved at construction (first one popped)
+
+    BlockPool(int32_t num_blocks, bool enable_caching);
+    BlockPool(const BlockPool&) = delete;
+    BlockPool& operator=(const BlockPool&) = delete;
+
+    // Pop n free blocks and return them with ref_cnt == 1; throws std::runtime_error if fewer are free.
+    std::vector<KVCacheBlock*> get_new_blocks(size_t n);
+    // Block registered under block_hash, or nullptr.
+    KVCacheBlock* get_cached_block(uint64_t block_hash);
+    // Add one reference to each block, removing it from the free list if it was there.
+    void touch(const std::vector<KVCacheBlock*>& blocks);
+    // Drop one reference from each block; blocks reaching 0 return to the free list.
+    void free_blocks(const std::vector<KVCacheBlock*>& ordered_blocks);
+    // Register req_blocks[num_cached_blocks .. num_full_blocks) under block_hashes[i].
+    void cache_full_blocks(const std::vector<KVCacheBlock*>& req_blocks,
+                           size_t num_cached_blocks, size_t num_full_blocks,
+                           const std::vector<uint64_t>& block_hashes);
+    size_t get_num_free_blocks() const { return free_queue_.num_free_blocks; }
+
+private:
+    bool maybe_evict_cached_block(KVCacheBlock* block);
+
+    // Declaration order matters: ptrs_ is built from blocks_, free_queue_ from ptrs_.
+    bool enable_caching_;
+    std::vector<KVCacheBlock> blocks_;     // owns all block descriptors
+    std::vector<KVCacheBlock*> ptrs_;
+    FreeBlockQueue free_queue_;
+    // vLLM stores hash -> {block_id: block} to allow duplicate-content blocks; the
+    // prototype keeps the last writer (single KV-cache group is sufficient for the wins).
+    std::unordered_map<uint64_t, KVCacheBlock*> cached_block_hash_to_block_;
+};
+
+// Allocation + prefix-caching surface, ported from SingleTypeKVCacheManager /
+// FullAttentionManager. Single KV-cache group; no extra_keys / eagle / spec-decode.
+//
+// `req_to_blocks_` stores raw pointers into `pool_`, so an implicit copy would
+// leave the copy's tables pointing into the source object's pool. Copying is
+// therefore deleted, which also suppresses the implicit move operations.
+class PagedKVManager {
+public:
+    PagedKVManager(int32_t num_blocks, int block_size, bool enable_caching);
+    PagedKVManager(const PagedKVManager&) = delete;
+    PagedKVManager& operator=(const PagedKVManager&) = delete;
+
+    // Grow seq_id to cover total_tokens slots. Returns false, allocating nothing,
+    // when the pool has fewer free blocks than the growth needs.
+    bool allocate(int seq_id, size_t total_tokens);
+    // Block ids owned by seq_id in logical order; empty for an unknown seq_id.
+    std::vector<int32_t> block_table(int seq_id) const;
+    // Flat slot index of token position pos: block_id * block_size + pos % block_size.
+    int64_t slot(int seq_id, int pos) const;
+    // slot() applied to every position, order preserved.
+    std::vector<int64_t> slot_mapping(int seq_id, const std::vector<int>& positions) const;
+    // Release all blocks of seq_id and forget it; unknown ids are ignored.
+    void free(int seq_id);
+    int block_size() const { return block_size_; }
+
+    // Prefix caching (win 3).
+    static uint64_t hash_block(uint64_t parent_hash, const std::vector<int>& token_ids);
+    // One chained hash per full block of token_ids; a trailing partial block is ignored.
+    std::vector<uint64_t> compute_block_hashes(const std::vector<int>& token_ids) const;
+    size_t get_computed_blocks(const std::vector<uint64_t>& block_hashes); // returns num cached tokens
+    // Register the first num_tokens / block_size blocks of seq_id under block_hashes.
+    void cache_blocks(int seq_id, const std::vector<uint64_t>& block_hashes, size_t num_tokens);
+
+protected:
+    int block_size_;   // token slots per block
+    BlockPool pool_;
+    std::map<int, std::vector<KVCacheBlock*>> req_to_blocks_;   // seq_id -> blocks in logical order
+};
+
+} // namespace paged
--- a/backend/cpp/llama-cpp/paged/tests/test_block_pool.cpp
+++ b/backend/cpp/llama-cpp/paged/tests/test_block_pool.cpp
@@ -0,0 +1,42 @@
+#include "../paged_kv_manager.h"
+#include <cassert>
+#include <cstdio>
+using namespace paged;
+
+int main() {
+    BlockPool pool(/*num_blocks=*/8, /*enable_caching=*/true);
+    // The constructor pops the first block (id 0) and keeps it as null_block, leaving 7 free.
+    assert(pool.null_block != nullptr && pool.null_block->block_id == 0);
+    assert(pool.get_num_free_blocks() == 7);
+
+    // get_new_blocks pops from the front of the free list and sets ref_cnt to 1 on each block
+    auto b = pool.get_new_blocks(2);
+    assert(b.size() == 2 && b[0]->ref_cnt == 1 && b[1]->ref_cnt == 1);
+    assert(pool.get_num_free_blocks() == 5);
+
+    // Register the two blocks under hashes 1111/2222, then look them up (9999 was never cached).
+    std::vector<uint64_t> hashes = {1111, 2222};
+    pool.cache_full_blocks(b, /*num_cached=*/0, /*num_full=*/2, hashes);
+    assert(b[0]->has_hash && b[0]->block_hash == 1111);
+    assert(pool.get_cached_block(1111) == b[0]);
+    assert(pool.get_cached_block(2222) == b[1]);
+    assert(pool.get_cached_block(9999) == nullptr);
+
+    // free: both blocks carry a hash, so they are appended to the tail and stay in the lookup index.
+    pool.free_blocks(b);
+    assert(b[0]->ref_cnt == 0);
+    assert(pool.get_num_free_blocks() == 7);
+    assert(pool.get_cached_block(1111) == b[0]); // still cached/warm
+
+    // touch a warm cached block: removes it from the free list and bumps ref_cnt 0 -> 1
+    pool.touch({b[0]});
+    assert(b[0]->ref_cnt == 1);
+    assert(pool.get_num_free_blocks() == 6);
+
+    // Drain every remaining free block; b[1] is among them, so its hash 2222 is evicted.
+    auto rest = pool.get_new_blocks(pool.get_num_free_blocks());
+    (void) rest;
+    assert(pool.get_cached_block(2222) == nullptr); // evicted on reuse
+    printf("test_block_pool: OK\n");
+    return 0;
+}
--- a/backend/cpp/llama-cpp/paged/tests/test_free_block_queue.cpp
+++ b/backend/cpp/llama-cpp/paged/tests/test_free_block_queue.cpp
@@ -0,0 +1,44 @@
+#include "../paged_kv_manager.h"
+#include <cassert>
+#include <cstdio>
+#include <vector>
+
+using namespace paged;
+
+// Build n block descriptors with block_id 0 .. n-1.
+static std::vector<KVCacheBlock> make_blocks(int n) {
+    std::vector<KVCacheBlock> out;
+    out.reserve(n);
+    int id = 0;
+    while (id < n) {
+        out.push_back(KVCacheBlock{id});
+        ++id;
+    }
+    return out;
+}
+
+int main() {
+    // The queue is built from blocks 0..9 in order, so pops come out in ascending block_id.
+    auto blocks = make_blocks(10);
+    std::vector<KVCacheBlock*> ptrs;
+    for (auto& b : blocks) ptrs.push_back(&b);
+    FreeBlockQueue q(ptrs);
+    assert(q.num_free_blocks == 10);
+
+    KVCacheBlock* b0 = q.popleft();
+    assert(b0->block_id == 0);
+    assert(q.num_free_blocks == 9);
+
+    auto two = q.popleft_n(2);            // pops blocks {1,2}
+    assert(two.size() == 2 && two[0]->block_id == 1 && two[1]->block_id == 2);
+    assert(q.num_free_blocks == 7);
+
+    // Middle removal: block 5 is still linked in the list; unlinking it drops the count by one.
+    q.remove(ptrs[5]);
+    assert(q.num_free_blocks == 6);       // free: 3,4,6,7,8,9
+
+    // append links the block at the tail, so it becomes the last element of the snapshot.
+    q.append(b0);                          // free order now: 3,4,6,7,8,9,0
+    assert(q.num_free_blocks == 7);
+    auto all = q.get_all_free_blocks();
+    assert(all.front()->block_id == 3);
+    assert(all.back()->block_id == 0);
+
+    printf("test_free_block_queue: OK\n");
+    return 0;
+}
--- a/backend/cpp/llama-cpp/paged/tests/test_paged_kv_manager.cpp
+++ b/backend/cpp/llama-cpp/paged/tests/test_paged_kv_manager.cpp
@@ -0,0 +1,32 @@
+#include "../paged_kv_manager.h"
+#include <cassert>
+#include <cstdio>
+using namespace paged;
+
+int main() {
+    PagedKVManager m(/*num_blocks=*/8, /*block_size=*/16, /*enable_caching=*/false);
+    // 20 tokens need ceil(20/16) = 2 blocks
+    assert(m.allocate(/*seq=*/0, 20));
+    auto bt = m.block_table(0);
+    assert(bt.size() == 2);
+
+    // Slot arithmetic: pos 0 maps to bt[0]*16 + 0; pos 17 is offset 1 in the second block, bt[1]*16 + 1.
+    assert(m.slot(0, 0)  == (int64_t)bt[0] * 16 + 0);
+    assert(m.slot(0, 17) == (int64_t)bt[1] * 16 + 1);
+
+    auto sm = m.slot_mapping(0, {0, 16, 17});
+    assert(sm.size() == 3 && sm[1] == (int64_t)bt[1] * 16 + 0);
+
+    // Growing the same sequence keeps its existing blocks and allocates only the shortfall.
+    assert(m.allocate(0, 40)); // ceil(40/16)=3 -> +1 block
+    assert(m.block_table(0).size() == 3);
+
+    // OOM: 8 blocks - 1 (null) - 3 (seq 0) = 4 free, so a 5-block request must fail.
+    assert(m.allocate(1, 5 * 16) == false);
+
+    // Freeing seq 0 returns its 3 blocks to the pool (7 free), so the 5-block request now succeeds.
+    m.free(0);
+    assert(m.allocate(1, 5 * 16)); // now fits
+    printf("test_paged_kv_manager: OK\n");
+    return 0;
+}
--- a/backend/cpp/llama-cpp/paged/tests/test_prefix_cache.cpp
+++ b/backend/cpp/llama-cpp/paged/tests/test_prefix_cache.cpp
@@ -0,0 +1,35 @@
+#include "../paged_kv_manager.h"
+#include <cassert>
+#include <cstdio>
+#include <vector>
+using namespace paged;
+
+int main() {
+    PagedKVManager m(/*num_blocks=*/64, /*block_size=*/16, /*enable_caching=*/true);
+
+    // A 32-token prompt: exactly 2 full blocks at block_size 16.
+    std::vector<int> shared(32);
+    for (int i = 0; i < 32; ++i) shared[i] = 100 + i;
+
+    // Chained hashing: the same tokens give the same hashes, and changing a token changes the block hash.
+    auto h = m.compute_block_hashes(shared);
+    assert(h.size() == 2);
+    auto h2 = m.compute_block_hashes(shared);
+    assert(h == h2);                          // deterministic
+    std::vector<int> other = shared; other[0] = 999;
+    assert(m.compute_block_hashes(other)[0] != h[0]); // sensitive to content
+
+    // seq 0: nothing is cached yet, so the lookup reports 0 tokens; then allocate and register its 2 blocks.
+    assert(m.get_computed_blocks(h) == 0);
+    assert(m.allocate(0, 32));
+    m.cache_blocks(0, h, 32);
+
+    // A second lookup with the same hashes now hits both blocks (32 tokens); each hit takes a reference.
+    assert(m.get_computed_blocks(h) == 32);
+
+    // First-miss stop: the second hash is replaced, so only the first block (16 tokens) hits.
+    auto hmix = h; hmix[1] = 0xDEADBEEF;
+    assert(m.get_computed_blocks(hmix) == 16);
+    printf("test_prefix_cache: OK\n");
+    return 0;
+}