mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-23 16:19:07 -04:00
Host-side paged-attention block manager ported faithfully from vLLM V1 (block_pool.py, kv_cache_utils.py, single_type_kv_cache_manager.py): - KVCacheBlock + intrusive LRU FreeBlockQueue (O(1) middle removal) - BlockPool: get_new_blocks / touch / free_blocks eviction ordering / cache_full_blocks / lazy eviction on reuse - PagedKVManager: on-demand allocate, block_table, slot arithmetic (slot = block_id*block_size + offset), free - Prefix caching: chained block hashing + find_longest_cache_hit (first-miss stop), enabling automatic cross-tenant prefix sharing Pure C++17, zero ggml/llama.cpp dependency, unit-tested to vLLM behavioral parity (4/4 suites green). Parity is on algorithm/behavior, not hash bytes. Phase 0 of docs/superpowers/plans/2026-06-19-paged-attention-llamacpp.md. Phases 1-5 (ggml storage, gather-to-scratch read path, Gate 0 correctness, benchmark wins, prefix-share serving) follow. Assisted-by: Claude:opus-4.8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
36 lines
1.2 KiB
C++
36 lines
1.2 KiB
C++
#include "../paged_kv_manager.h"
|
|
#include <cassert>
|
|
#include <cstdio>
|
|
#include <vector>
|
|
using namespace paged;
|
|
|
|
// Prefix-cache test for PagedKVManager.
//
// Exercises chained block hashing (determinism, content sensitivity) and the
// cached-prefix lookup: cold miss, warm hit after cache_blocks(), and the
// first-miss stop when a hash chain diverges part-way through.
//
// Every check is an explicit runtime test rather than an assert(), so the test
// still verifies behavior (and still performs the allocation it depends on)
// when the translation unit is compiled with NDEBUG defined.
//
// Returns 0 on success, 1 on the first failed check (reason printed to stderr).
int main() {
    // Reports a failed expectation and yields the process exit code.
    const auto fail = [](const char* what) {
        std::fprintf(stderr, "test_prefix_cache: FAILED: %s\n", what);
        return 1;
    };

    PagedKVManager m(/*num_blocks=*/64, /*block_size=*/16, /*enable_caching=*/true);

    // shared prefix of 32 tokens (2 full blocks) + distinct suffix
    std::vector<int> shared(32);
    for (int i = 0; i < 32; ++i) shared[i] = 100 + i;

    // chained hashing is deterministic and prefix-sensitive
    auto h = m.compute_block_hashes(shared);
    if (h.size() != 2) return fail("expected 2 block hashes for 32 tokens");

    auto h2 = m.compute_block_hashes(shared);
    if (!(h == h2)) return fail("block hashing is not deterministic");

    std::vector<int> other = shared;
    other[0] = 999;
    if (!(m.compute_block_hashes(other)[0] != h[0])) {
        return fail("block hash is not sensitive to token content");
    }

    // seq 0: cold, no cache hit yet
    if (!(m.get_computed_blocks(h) == 0)) return fail("cold lookup should miss");

    // The allocation is a required side effect, so it must never live inside
    // an assert(): with NDEBUG it would be skipped and cache_blocks() below
    // would run against a sequence that was never allocated.
    if (!m.allocate(0, 32)) return fail("allocate(0, 32) failed");
    m.cache_blocks(0, h, 32);

    // seq 1: warm — the 2 shared blocks are a cache hit (32 tokens)
    if (!(m.get_computed_blocks(h) == 32)) return fail("warm lookup should hit 32 tokens");

    // first-miss stop: a chain that diverges after block 1 hits only 1 block
    auto hmix = h;
    hmix[1] = 0xDEADBEEF;
    if (!(m.get_computed_blocks(hmix) == 16)) {
        return fail("diverging chain should hit only the first block (16 tokens)");
    }

    std::printf("test_prefix_cache: OK\n");
    return 0;
}
|