mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-23 08:08:52 -04:00
Host-side paged-attention block manager ported faithfully from vLLM V1 (block_pool.py, kv_cache_utils.py, single_type_kv_cache_manager.py): - KVCacheBlock + intrusive LRU FreeBlockQueue (O(1) middle removal) - BlockPool: get_new_blocks / touch / free_blocks eviction ordering / cache_full_blocks / lazy eviction on reuse - PagedKVManager: on-demand allocate, block_table, slot arithmetic (slot = block_id*block_size + offset), free - Prefix caching: chained block hashing + find_longest_cache_hit (first-miss stop), enabling automatic cross-tenant prefix sharing Pure C++17, zero ggml/llama.cpp dependency, unit-tested to vLLM behavioral parity (4/4 suites green). Parity is on algorithm/behavior, not hash bytes. Phase 0 of docs/superpowers/plans/2026-06-19-paged-attention-llamacpp.md. Phases 1-5 (ggml storage, gather-to-scratch read path, Gate 0 correctness, benchmark wins, prefix-share serving) follow. Assisted-by: Claude:opus-4.8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
109 lines
4.4 KiB
C++
109 lines
4.4 KiB
C++
#pragma once
|
|
// Paged KV cache block manager for llama.cpp (CPU-first prototype).
|
|
//
|
|
// Host-side block management is a faithful port of vLLM V1:
|
|
// vllm/v1/core/kv_cache_utils.py (KVCacheBlock, FreeKVCacheBlockQueue, hash_block_tokens)
|
|
// vllm/v1/core/block_pool.py (BlockPool: get_new_blocks/touch/free/evict/cache_full_blocks)
|
|
// vllm/v1/core/single_type_kv_cache_manager.py (allocate_new_blocks, find_longest_cache_hit)
|
|
//
|
|
// Parity is on behavior/algorithm (block chaining, first-miss stop, ref-counting,
|
|
// LRU eviction order), not on exact hash bytes. This unit has zero ggml/llama.cpp
|
|
// dependency so it can be unit-tested in isolation.
|
|
|
|
#include <cstdint>
|
|
#include <vector>
|
|
#include <unordered_map>
|
|
#include <map>
|
|
|
|
namespace paged {
|
|
|
|
// vLLM KVCacheBlock (kv_cache_utils.py).
|
|
struct KVCacheBlock {
|
|
int32_t block_id = 0;
|
|
int ref_cnt = 0;
|
|
bool has_hash = false; // vLLM: _block_hash is set only when full+cached
|
|
uint64_t block_hash = 0;
|
|
bool is_null = false;
|
|
KVCacheBlock* prev_free = nullptr;
|
|
KVCacheBlock* next_free = nullptr;
|
|
|
|
explicit KVCacheBlock(int32_t id = 0) : block_id(id) {}
|
|
void reset_hash() { has_hash = false; block_hash = 0; }
|
|
};
|
|
|
|
// Intrusive doubly-linked free list with fake head/tail (vLLM FreeKVCacheBlockQueue).
|
|
// O(1) middle removal is required so touch() can pull a warm cached block out of the
|
|
// free list when a later request hits its prefix.
|
|
class FreeBlockQueue {
|
|
public:
|
|
size_t num_free_blocks = 0;
|
|
|
|
explicit FreeBlockQueue(const std::vector<KVCacheBlock*>& blocks);
|
|
KVCacheBlock* popleft();
|
|
std::vector<KVCacheBlock*> popleft_n(size_t n);
|
|
void remove(KVCacheBlock* block);
|
|
void append(KVCacheBlock* block);
|
|
void append_n(const std::vector<KVCacheBlock*>& blocks);
|
|
void prepend_n(const std::vector<KVCacheBlock*>& blocks);
|
|
std::vector<KVCacheBlock*> get_all_free_blocks() const;
|
|
|
|
private:
|
|
KVCacheBlock fake_head{-1};
|
|
KVCacheBlock fake_tail{-1};
|
|
};
|
|
|
|
// vLLM BlockPool (block_pool.py).
|
|
class BlockPool {
|
|
public:
|
|
KVCacheBlock* null_block = nullptr;
|
|
|
|
BlockPool(int32_t num_blocks, bool enable_caching);
|
|
std::vector<KVCacheBlock*> get_new_blocks(size_t n);
|
|
KVCacheBlock* get_cached_block(uint64_t block_hash);
|
|
void touch(const std::vector<KVCacheBlock*>& blocks);
|
|
void free_blocks(const std::vector<KVCacheBlock*>& ordered_blocks);
|
|
void cache_full_blocks(const std::vector<KVCacheBlock*>& req_blocks,
|
|
size_t num_cached_blocks, size_t num_full_blocks,
|
|
const std::vector<uint64_t>& block_hashes);
|
|
size_t get_num_free_blocks() const { return free_queue_.num_free_blocks; }
|
|
|
|
private:
|
|
bool maybe_evict_cached_block(KVCacheBlock* block);
|
|
|
|
bool enable_caching_;
|
|
std::vector<KVCacheBlock> blocks_; // owns all block descriptors
|
|
std::vector<KVCacheBlock*> ptrs_;
|
|
FreeBlockQueue free_queue_;
|
|
// vLLM stores hash -> {block_id: block} to allow duplicate-content blocks; the
|
|
// prototype keeps the last writer (single KV-cache group is sufficient for the wins).
|
|
std::unordered_map<uint64_t, KVCacheBlock*> cached_block_hash_to_block_;
|
|
};
|
|
|
|
// Allocation + prefix-caching surface, ported from SingleTypeKVCacheManager /
|
|
// FullAttentionManager. Single KV-cache group; no extra_keys / eagle / spec-decode.
|
|
class PagedKVManager {
|
|
public:
|
|
PagedKVManager(int32_t num_blocks, int block_size, bool enable_caching);
|
|
|
|
// Grow seq_id to cover total_tokens slots. Returns false on OOM (free queue empty).
|
|
bool allocate(int seq_id, size_t total_tokens);
|
|
std::vector<int32_t> block_table(int seq_id) const;
|
|
int64_t slot(int seq_id, int pos) const;
|
|
std::vector<int64_t> slot_mapping(int seq_id, const std::vector<int>& positions) const;
|
|
void free(int seq_id);
|
|
int block_size() const { return block_size_; }
|
|
|
|
// Prefix caching (win 3).
|
|
static uint64_t hash_block(uint64_t parent_hash, const std::vector<int>& token_ids);
|
|
std::vector<uint64_t> compute_block_hashes(const std::vector<int>& token_ids) const;
|
|
size_t get_computed_blocks(const std::vector<uint64_t>& block_hashes); // returns num cached tokens
|
|
void cache_blocks(int seq_id, const std::vector<uint64_t>& block_hashes, size_t num_tokens);
|
|
|
|
protected:
|
|
int block_size_;
|
|
BlockPool pool_;
|
|
std::map<int, std::vector<KVCacheBlock*>> req_to_blocks_;
|
|
};
|
|
|
|
} // namespace paged
|