#pragma once

// Paged KV cache block manager for llama.cpp (CPU-first prototype).
//
// Host-side block management is a faithful port of vLLM V1:
//   vllm/v1/core/kv_cache_utils.py               (KVCacheBlock, FreeKVCacheBlockQueue, hash_block_tokens)
//   vllm/v1/core/block_pool.py                   (BlockPool: get_new_blocks/touch/free/evict/cache_full_blocks)
//   vllm/v1/core/single_type_kv_cache_manager.py (allocate_new_blocks, find_longest_cache_hit)
//
// Parity is on behavior/algorithm (block chaining, first-miss stop, ref-counting,
// LRU eviction order), not on exact hash bytes. This unit has zero ggml/llama.cpp
// dependency so it can be unit-tested in isolation.
//
// NOTE(review): the container element types in this header (KVCacheBlock* for block
// lists, uint64_t for block hashes, int32_t for block ids and token ids, int for
// positions, int64_t for slots) were restored from the surrounding declarations;
// confirm them against the implementation file.

#include <cstddef>
#include <cstdint>
#include <map>
#include <unordered_map>
#include <vector>

namespace paged {

// vLLM KVCacheBlock (kv_cache_utils.py).
// Descriptor for one KV cache block; it carries the intrusive free-list links itself.
struct KVCacheBlock {
    int32_t       block_id   = 0;
    int           ref_cnt    = 0;
    bool          has_hash   = false;  // vLLM: _block_hash is set only when full+cached
    uint64_t      block_hash = 0;
    bool          is_null    = false;
    KVCacheBlock* prev_free  = nullptr;
    KVCacheBlock* next_free  = nullptr;

    explicit KVCacheBlock(int32_t id = 0) : block_id(id) {}

    // Clears the cached-hash state: has_hash becomes false and block_hash becomes 0.
    void reset_hash() {
        has_hash   = false;
        block_hash = 0;
    }
};

// Intrusive doubly-linked free list with fake head/tail (vLLM FreeKVCacheBlockQueue).
// O(1) middle removal is required so touch() can pull a warm cached block out of the
// free list when a later request hits its prefix.
//
// NOTE(review): the sentinels are by-value members and are presumably linked to their
// neighbours by address, in which case a copied or moved queue would still point at
// the source object's sentinels. Treat instances as pinned in place - confirm.
class FreeBlockQueue {
public:
    size_t num_free_blocks = 0;

    // Builds the queue from `blocks`; the queue does not own the descriptors.
    explicit FreeBlockQueue(const std::vector<KVCacheBlock*>& blocks);

    // Head-side removal of one block / of `n` blocks (vLLM popleft / popleft_n).
    KVCacheBlock*              popleft();
    std::vector<KVCacheBlock*> popleft_n(size_t n);

    // Unlinks `block` from wherever it currently sits in the list.
    void remove(KVCacheBlock* block);

    // Tail-side insertion of one block / of several blocks.
    void append(KVCacheBlock* block);
    void append_n(const std::vector<KVCacheBlock*>& blocks);

    // Head-side insertion of several blocks.
    void prepend_n(const std::vector<KVCacheBlock*>& blocks);

    // Snapshot of the blocks currently in the list.
    std::vector<KVCacheBlock*> get_all_free_blocks() const;

private:
    KVCacheBlock fake_head{-1};
    KVCacheBlock fake_tail{-1};
};

// vLLM BlockPool (block_pool.py).
class BlockPool {
public:
    KVCacheBlock* null_block = nullptr;

    BlockPool(int32_t num_blocks, bool enable_caching);

    // vLLM get_new_blocks: hands out `n` blocks taken from the free queue.
    std::vector<KVCacheBlock*> get_new_blocks(size_t n);

    // Looks up a cached block by hash; presumably nullptr on a miss - confirm.
    KVCacheBlock* get_cached_block(uint64_t block_hash);

    // vLLM touch: marks `blocks` as in use again (see the FreeBlockQueue note on
    // pulling warm cached blocks out of the free list).
    void touch(const std::vector<KVCacheBlock*>& blocks);

    // vLLM free_blocks: releases `ordered_blocks`; the order is presumably the
    // eviction order - confirm against the implementation.
    void free_blocks(const std::vector<KVCacheBlock*>& ordered_blocks);

    // vLLM cache_full_blocks: registers the hashes of newly full blocks of a request.
    void cache_full_blocks(const std::vector<KVCacheBlock*>& req_blocks,
                           size_t                            num_cached_blocks,
                           size_t                            num_full_blocks,
                           const std::vector<uint64_t>&      block_hashes);

    // Number of blocks currently sitting in the free queue.
    size_t get_num_free_blocks() const { return free_queue_.num_free_blocks; }

private:
    bool maybe_evict_cached_block(KVCacheBlock* block);

    bool                       enable_caching_;
    std::vector<KVCacheBlock>  blocks_;  // owns all block descriptors
    std::vector<KVCacheBlock*> ptrs_;
    FreeBlockQueue             free_queue_;

    // vLLM stores hash -> {block_id: block} to allow duplicate-content blocks; the
    // prototype keeps the last writer (single KV-cache group is sufficient for the wins).
    std::unordered_map<uint64_t, KVCacheBlock*> cached_block_hash_to_block_;
};

// Allocation + prefix-caching surface, ported from SingleTypeKVCacheManager /
// FullAttentionManager. Single KV-cache group; no extra_keys / eagle / spec-decode.
class PagedKVManager {
public:
    PagedKVManager(int32_t num_blocks, int block_size, bool enable_caching);

    // Grow seq_id to cover total_tokens slots. Returns false on OOM (free queue empty).
    bool allocate(int seq_id, size_t total_tokens);

    // Block ids currently assigned to seq_id.
    std::vector<int32_t> block_table(int seq_id) const;

    // Cache slot for token position `pos` of seq_id.
    int64_t slot(int seq_id, int pos) const;

    // slot() applied to each entry of `positions`.
    std::vector<int64_t> slot_mapping(int seq_id, const std::vector<int>& positions) const;

    // Releases the blocks held by seq_id.
    void free(int seq_id);

    int block_size() const { return block_size_; }

    // Prefix caching (win 3).
    static uint64_t       hash_block(uint64_t parent_hash, const std::vector<int32_t>& token_ids);
    std::vector<uint64_t> compute_block_hashes(const std::vector<int32_t>& token_ids) const;
    size_t get_computed_blocks(const std::vector<uint64_t>& block_hashes);  // returns num cached tokens
    void   cache_blocks(int seq_id, const std::vector<uint64_t>& block_hashes, size_t num_tokens);

protected:
    int                                       block_size_;
    BlockPool                                 pool_;
    std::map<int, std::vector<KVCacheBlock*>> req_to_blocks_;
};

}  // namespace paged