mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-23 16:19:07 -04:00
Host-side paged-attention block manager ported faithfully from vLLM V1 (block_pool.py, kv_cache_utils.py, single_type_kv_cache_manager.py):

- KVCacheBlock + intrusive LRU FreeBlockQueue (O(1) middle removal)
- BlockPool: get_new_blocks / touch / free_blocks eviction ordering / cache_full_blocks / lazy eviction on reuse
- PagedKVManager: on-demand allocate, block_table, slot arithmetic (slot = block_id*block_size + offset), free
- Prefix caching: chained block hashing + find_longest_cache_hit (first-miss stop), enabling automatic cross-tenant prefix sharing

Pure C++17, zero ggml/llama.cpp dependency, unit-tested to vLLM behavioral parity (4/4 suites green). Parity is on algorithm/behavior, not hash bytes.

Phase 0 of docs/superpowers/plans/2026-06-19-paged-attention-llamacpp.md. Phases 1-5 (ggml storage, gather-to-scratch read path, Gate 0 correctness, benchmark wins, prefix-share serving) follow.

Assisted-by: Claude:opus-4.8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
43 lines
1.6 KiB
C++
43 lines
1.6 KiB
C++
#include "../paged_kv_manager.h"
|
|
#include <cassert>
|
|
#include <cstdio>
|
|
using namespace paged;
|
|
|
|
// Always-on check used by this test. Unlike assert(), it is not compiled out
// when NDEBUG is defined (as in typical optimized builds), so the test can
// never print "OK" without having evaluated its conditions. On failure it
// reports the failing expression with its location on stderr and makes the
// enclosing function (main) return 1.
#define PAGED_TEST_CHECK(cond)                                               \
    do {                                                                     \
        if (!(cond)) {                                                       \
            std::fprintf(stderr, "test_block_pool: FAILED %s:%d: %s\n",      \
                         __FILE__, __LINE__, #cond);                         \
            return 1;                                                        \
        }                                                                    \
    } while (0)

// Walks a BlockPool through allocate -> cache -> free -> touch -> exhaust and
// checks the free-block count, ref counts and hash lookups after each step.
// Returns 0 and prints "test_block_pool: OK" when every check holds, 1 on the
// first failed check.
int main() {
    BlockPool pool(/*num_blocks=*/8, /*enable_caching=*/true);

    // block 0 is reserved as null_block (vLLM pops one at init), which leaves
    // 7 of the 8 blocks on the free list.
    PAGED_TEST_CHECK(pool.null_block != nullptr);
    PAGED_TEST_CHECK(pool.null_block->block_id == 0);
    PAGED_TEST_CHECK(pool.get_num_free_blocks() == 7);

    // get_new_blocks sets ref_cnt=1 and removes from free list
    auto b = pool.get_new_blocks(2);
    PAGED_TEST_CHECK(b.size() == 2);
    PAGED_TEST_CHECK(b[0]->ref_cnt == 1);
    PAGED_TEST_CHECK(b[1]->ref_cnt == 1);
    PAGED_TEST_CHECK(pool.get_num_free_blocks() == 5);

    // cache two full blocks with chained hashes, then look them up
    std::vector<uint64_t> hashes = {1111, 2222};
    pool.cache_full_blocks(b, /*num_cached=*/0, /*num_full=*/2, hashes);
    PAGED_TEST_CHECK(b[0]->has_hash);
    PAGED_TEST_CHECK(b[0]->block_hash == 1111);
    PAGED_TEST_CHECK(pool.get_cached_block(1111) == b[0]);
    PAGED_TEST_CHECK(pool.get_cached_block(2222) == b[1]);
    PAGED_TEST_CHECK(pool.get_cached_block(9999) == nullptr);  // unknown hash

    // free: hashed blocks go to tail (kept warm), so they remain queryable.
    pool.free_blocks(b);
    PAGED_TEST_CHECK(b[0]->ref_cnt == 0);
    PAGED_TEST_CHECK(pool.get_num_free_blocks() == 7);
    PAGED_TEST_CHECK(pool.get_cached_block(1111) == b[0]);  // still cached/warm

    // touch a warm cached block: pulls it out of free list, ++ref_cnt
    pool.touch({b[0]});
    PAGED_TEST_CHECK(b[0]->ref_cnt == 1);
    PAGED_TEST_CHECK(pool.get_num_free_blocks() == 6);

    // exhausting the pool then allocating evicts a warm cached hash
    auto rest = pool.get_new_blocks(pool.get_num_free_blocks());
    (void) rest;  // only the eviction side effect is checked below
    PAGED_TEST_CHECK(pool.get_cached_block(2222) == nullptr);  // evicted on reuse

    printf("test_block_pool: OK\n");
    return 0;
}

#undef PAGED_TEST_CHECK
|