mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-26 09:26:55 -04:00
feat(paged): paged-pool burst-reclaim (truncate + defrag + slot release) (patch 0024)
Fixes the paged-pool burst-degradation bug (OTHER_PATHS_INVESTIGATION.md section C Part 2): on a long-lived llama-server with LLAMA_KV_PAGED=1, a high-fan-out prefill burst strands KV blocks in the host-side paged pool, so a later lower-npl prefill draws from a depleted/fragmented pool and its throughput collapses (the benchmark's "restart per npl" crutch). Decode is unaffected. The fix changes only host-side block accounting and placement, never KV values or compute, and is gated behind LLAMA_KV_PAGED (LLAMA_PAGED_NO_RECLAIM=1 restores the pre-fix behavior). Fix-1 reclaim trailing blocks: PagedKVManager::truncate(seq, n_keep) frees every block beyond ceil(n_keep/bs) (ref-counted); called from llama_kv_cache::seq_rm for the p1==MAX && p0>0 partial-tail case so the manager tracks the kv-cache exactly. Fix-2 defrag on empty: when the pool is fully idle, defrag_free_pool() relinks the free queue into ascending block-id order (FreeBlockQueue::rebuild), preserving content-cache hashes. Fix-3 release on slot completion: server_slot::release() issues prompt_clear() under the paged engine so a finished-idle slot returns its blocks promptly. Validation (DGX GB10, q36-27b-nvfp4 = qwen35 hybrid; HEAD f7409c2 = patch 0023): - Bit-exact: greedy md5 identical across paged off / paged on / paged on+NO_RECLAIM (5951a5b4d624ce891e22ab5fca9bc439), == the 0023 baseline. test-backend-ops unaffected (no ggml op touched). - Host unit test: truncate reclaims exactly 16 trailing blocks; defrag restores ascending popleft order. UNIT PASS. - Model A/B (one binary, NO_RECLAIM): fragmentation prefill ratio 0.944 -> 0.998; 64 idle slots strand 2048 blocks, reclaim returns the pool to fresh (2527). - Server A/B (FRESH-npl8 -> BURST-npl64 -> POST-npl8): POST-npl8 prefill collapses 488 -> 44 t/s with NO_RECLAIM (the bug; investigation saw 507 -> 65), restored to 532 t/s (fresh 525, within 1%) with the fix. Paged release-log count 17 -> 96 (Fix-3 fires per slot completion). 
Canary tokens identical fresh-vs-post in both arms (bit-exact serving). Assisted-by: Claude:opus-4.8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
@@ -0,0 +1,357 @@
|
||||
From a8a9d129ae2226a08a12c30ece697865c0fc85c4 Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Fri, 26 Jun 2026 12:41:49 +0200
|
||||
Subject: [PATCH] feat(paged): paged-pool burst-reclaim (truncate + defrag +
|
||||
slot release) (patch 0024)
|
||||
|
||||
Fixes the paged-pool burst-degradation bug (OTHER_PATHS_INVESTIGATION.md section C
|
||||
Part 2): on a long-lived llama-server with LLAMA_KV_PAGED=1, a high-fan-out prefill
|
||||
burst strands KV blocks in the host-side paged pool, so a later lower-npl prefill
|
||||
draws from a depleted/fragmented pool and its throughput collapses (the benchmark's
|
||||
"restart per npl" crutch). Decode is unaffected. The fix changes only host-side
|
||||
block accounting and placement, never KV values or compute, and is gated behind
|
||||
LLAMA_KV_PAGED (LLAMA_PAGED_NO_RECLAIM=1 restores the pre-fix behavior).
|
||||
|
||||
Fix-1 reclaim trailing blocks: PagedKVManager::truncate(seq, n_keep) frees every
|
||||
block beyond ceil(n_keep/bs) (ref-counted); called from llama_kv_cache::seq_rm for
|
||||
the p1==MAX && p0>0 partial-tail case so the manager tracks the kv-cache exactly.
|
||||
Fix-2 defrag on empty: when the pool is fully idle, defrag_free_pool() relinks the
|
||||
free queue into ascending block-id order (FreeBlockQueue::rebuild), preserving
|
||||
content-cache hashes.
|
||||
Fix-3 release on slot completion: server_slot::release() issues prompt_clear()
|
||||
under the paged engine so a finished-idle slot returns its blocks promptly.
|
||||
|
||||
Validation (DGX GB10, q36-27b-nvfp4 = qwen35 hybrid; HEAD f7409c2 = patch 0023):
|
||||
- Bit-exact: greedy md5 identical across paged off / paged on / paged on+NO_RECLAIM
|
||||
(5951a5b4d624ce891e22ab5fca9bc439), == the 0023 baseline. test-backend-ops
|
||||
unaffected (no ggml op touched).
|
||||
- Host unit test: truncate reclaims exactly 16 trailing blocks; defrag restores
|
||||
ascending popleft order. UNIT PASS.
|
||||
- Model A/B (one binary, NO_RECLAIM): fragmentation prefill ratio 0.944 -> 0.998;
|
||||
64 idle slots strand 2048 blocks, reclaim returns the pool to fresh (2527).
|
||||
- Server A/B (FRESH-npl8 -> BURST-npl64 -> POST-npl8): POST-npl8 prefill collapses
|
||||
488 -> 44 t/s with NO_RECLAIM (the bug; investigation saw 507 -> 65), restored to
|
||||
532 t/s (fresh 525, within 1%) with the fix. Paged release-log count 17 -> 96
|
||||
(Fix-3 fires per slot completion). Canary tokens identical fresh-vs-post in both
|
||||
arms (bit-exact serving).
|
||||
|
||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
||||
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
|
||||
---
|
||||
src/llama-kv-cache.cpp | 13 ++++++++++
|
||||
src/paged-alloc.cpp | 31 +++++++++++++++++++++++
|
||||
src/paged-alloc.h | 18 +++++++++++++
|
||||
src/paged-kv-manager.cpp | 45 +++++++++++++++++++++++++++++++++
|
||||
src/paged-kv-manager.h | 24 ++++++++++++++++++
|
||||
src/paged-prefix-api.cpp | 8 ++++++
|
||||
src/paged-prefix-api.h | 6 +++++
|
||||
tools/server/server-context.cpp | 17 +++++++++++++
|
||||
8 files changed, 162 insertions(+)
|
||||
|
||||
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
|
||||
index 0351f86..21b8f1e 100644
|
||||
--- a/src/llama-kv-cache.cpp
|
||||
+++ b/src/llama-kv-cache.cpp
|
||||
@@ -425,6 +425,19 @@ bool llama_kv_cache::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
|
||||
}
|
||||
}
|
||||
|
||||
+ // [paged 0024 Fix-1] Reclaim trailing blocks on a partial TAIL truncation
|
||||
+ // (p1 == MAX, p0 > 0). llama-server issues seq_rm(slot, n_past, -1) on every
|
||||
+ // reused slot and before a cross-request prefix splice; the kv-cache frees the
|
||||
+ // cells [p0, end) but, without this, the paged manager keeps owning those
|
||||
+ // blocks - the reclamation gap that leaks and fragments the pool across a
|
||||
+ // burst. truncate() frees the blocks beyond ceil(p0/bs) so the manager's
|
||||
+ // accounting tracks the kv-cache exactly. Gated so LLAMA_PAGED_NO_RECLAIM
|
||||
+ // restores the pre-fix behavior for A/B.
|
||||
+ if (paged_alloc::active() && paged_alloc::reclaim_active() && seq_id >= 0 &&
|
||||
+ p0 > 0 && p1 == std::numeric_limits<llama_pos>::max()) {
|
||||
+ paged_alloc::truncate(this, (int) seq_to_stream[seq_id], (int) seq_id, (uint32_t) p0);
|
||||
+ }
|
||||
+
|
||||
if (seq_id >= 0) {
|
||||
auto & cells = v_cells[seq_to_stream[seq_id]];
|
||||
auto & head = v_heads[seq_to_stream[seq_id]];
|
||||
diff --git a/src/paged-alloc.cpp b/src/paged-alloc.cpp
|
||||
index c1027fb..ba98dd5 100644
|
||||
--- a/src/paged-alloc.cpp
|
||||
+++ b/src/paged-alloc.cpp
|
||||
@@ -14,6 +14,11 @@ bool active() {
|
||||
return a;
|
||||
}
|
||||
|
||||
+bool reclaim_active() {
|
||||
+ static const bool off = (std::getenv("LLAMA_PAGED_NO_RECLAIM") != nullptr);
|
||||
+ return !off;
|
||||
+}
|
||||
+
|
||||
static bool debug() {
|
||||
static const bool d = (std::getenv("LLAMA_KV_PAGED_DEBUG") != nullptr);
|
||||
return d;
|
||||
@@ -124,12 +129,28 @@ void commit(const void * cache, int stream, int seq,
|
||||
}
|
||||
}
|
||||
|
||||
+void truncate(const void * cache, int stream, int seq, uint32_t n_keep) {
|
||||
+ paged::PagedKVManager * mgr = find_mgr(cache, stream);
|
||||
+ if (!mgr) {
|
||||
+ return;
|
||||
+ }
|
||||
+ mgr->truncate(seq, (size_t) n_keep); // Fix-1: reclaim trailing blocks
|
||||
+ mgr->defrag_free_pool(); // Fix-2: compact iff the pool emptied
|
||||
+ if (debug()) {
|
||||
+ fprintf(stderr, "[paged-alloc] truncate cache=%p stream=%d seq=%d keep<=%u (free=%zu)\n",
|
||||
+ cache, stream, seq, n_keep, mgr->num_free_blocks());
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
void release(const void * cache, int stream, int seq) {
|
||||
paged::PagedKVManager * mgr = find_mgr(cache, stream);
|
||||
if (!mgr) {
|
||||
return;
|
||||
}
|
||||
mgr->free(seq); // ref-counted: shared blocks survive while another seq holds them
|
||||
+ if (reclaim_active()) {
|
||||
+ mgr->defrag_free_pool(); // Fix-2: compact iff the pool emptied
|
||||
+ }
|
||||
if (debug()) {
|
||||
fprintf(stderr, "[paged-alloc] released cache=%p stream=%d seq=%d (free=%zu)\n",
|
||||
cache, stream, seq, mgr->num_free_blocks());
|
||||
@@ -163,4 +184,14 @@ size_t num_free(const void * cache, int stream) {
|
||||
return mgr ? mgr->num_free_blocks() : 0;
|
||||
}
|
||||
|
||||
+size_t num_free_global() {
|
||||
+ size_t total = 0;
|
||||
+ for (auto & kv : g_managers) total += kv.second->num_free_blocks();
|
||||
+ return total;
|
||||
+}
|
||||
+
|
||||
+size_t num_managers() {
|
||||
+ return g_managers.size();
|
||||
+}
|
||||
+
|
||||
} // namespace paged_alloc
|
||||
diff --git a/src/paged-alloc.h b/src/paged-alloc.h
|
||||
index 88dedef..bfaf45b 100644
|
||||
--- a/src/paged-alloc.h
|
||||
+++ b/src/paged-alloc.h
|
||||
@@ -31,6 +31,12 @@ namespace paged_alloc {
|
||||
// true iff env LLAMA_KV_PAGED is set (evaluated once).
|
||||
bool active();
|
||||
|
||||
+// [paged 0024] The burst-reclaim fix (truncate + defrag-on-empty + slot release)
|
||||
+// is on by default whenever the paged engine is active. Setting LLAMA_PAGED_NO_RECLAIM (to any value, even 0)
|
||||
+// restores the pre-fix behavior (no trailing-block reclaim, no compaction) for
|
||||
+// A/B measurement. Evaluated once.
|
||||
+bool reclaim_active();
|
||||
+
|
||||
// Place n_tokens logical positions [base, base+n_tokens) of (cache,stream,seq)
|
||||
// on demand, appending their physical cell indices to `out`. pool_blocks =
|
||||
// cells.size()/block_size is the stream's block budget. Returns false (leaving
|
||||
@@ -58,6 +64,12 @@ int64_t slot(const void * cache, int stream, int seq, int pos);
|
||||
void commit(const void * cache, int stream, int seq,
|
||||
const std::vector<int> & tokens, uint32_t block_size, uint32_t pool_blocks);
|
||||
|
||||
+// [paged 0024 Fix-1] Reclaim the trailing blocks of (cache,stream,seq) beyond
|
||||
+// logical position n_keep (ref-counted), mirroring a partial kv-cache seq_rm
|
||||
+// [n_keep, end). When that leaves the stream's pool fully idle (all blocks free), its free queue is
|
||||
+// defragged to pristine contiguous order (Fix-2). No-op if no manager exists.
|
||||
+void truncate(const void * cache, int stream, int seq, uint32_t n_keep);
|
||||
+
|
||||
// Return one sequence's blocks to the pool (ref-counted; sequence end).
|
||||
void release(const void * cache, int stream, int seq);
|
||||
|
||||
@@ -69,4 +81,10 @@ void release_all(const void * cache);
|
||||
int ref_cnt_at(const void * cache, int stream, int seq, int pos, uint32_t block_size);
|
||||
size_t num_free(const void * cache, int stream);
|
||||
|
||||
+// [paged 0024] Total free blocks summed across every live manager (all caches /
|
||||
+// streams). Wrapper-agnostic, so it reports the real pool for hybrid / iSWA
|
||||
+// models whose outer memory is not a llama_kv_cache. Diagnostics only.
|
||||
+size_t num_free_global();
|
||||
+size_t num_managers();
|
||||
+
|
||||
} // namespace paged_alloc
|
||||
diff --git a/src/paged-kv-manager.cpp b/src/paged-kv-manager.cpp
|
||||
index 4c6ee4c..738b332 100644
|
||||
--- a/src/paged-kv-manager.cpp
|
||||
+++ b/src/paged-kv-manager.cpp
|
||||
@@ -104,6 +104,22 @@ void FreeBlockQueue::prepend_n(const std::vector<KVCacheBlock*>& blocks) {
|
||||
num_free_blocks += blocks.size();
|
||||
}
|
||||
|
||||
+void FreeBlockQueue::rebuild(const std::vector<KVCacheBlock*>& blocks) {
|
||||
+ // Relink the intrusive list using THIS queue's stable fake head/tail nodes.
|
||||
+ num_free_blocks = blocks.size();
|
||||
+ for (size_t i = 0; i < blocks.size(); ++i) {
|
||||
+ blocks[i]->prev_free = (i == 0) ? &fake_head : blocks[i - 1];
|
||||
+ blocks[i]->next_free = (i + 1 < blocks.size()) ? blocks[i + 1] : &fake_tail;
|
||||
+ }
|
||||
+ if (!blocks.empty()) {
|
||||
+ fake_head.next_free = blocks.front();
|
||||
+ fake_tail.prev_free = blocks.back();
|
||||
+ } else {
|
||||
+ fake_head.next_free = &fake_tail;
|
||||
+ fake_tail.prev_free = &fake_head;
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
std::vector<KVCacheBlock*> FreeBlockQueue::get_all_free_blocks() const {
|
||||
std::vector<KVCacheBlock*> ret;
|
||||
const KVCacheBlock* curr = fake_head.next_free;
|
||||
@@ -199,6 +215,20 @@ void BlockPool::cache_full_blocks(const std::vector<KVCacheBlock*>& req_blocks,
|
||||
}
|
||||
}
|
||||
|
||||
+void BlockPool::defrag_free_queue() {
|
||||
+ // Pool is fully idle: every non-null block is free (ref_cnt 0). Rebuild the
|
||||
+ // free list in ascending block_id order so popleft hands out physically
|
||||
+ // contiguous blocks again. Hashes / the content-cache map are left intact so
|
||||
+ // a warm committed prefix stays re-hittable.
|
||||
+ std::vector<KVCacheBlock*> ordered;
|
||||
+ ordered.reserve(ptrs_.size());
|
||||
+ for (KVCacheBlock* b : ptrs_) {
|
||||
+ if (b->is_null) continue;
|
||||
+ ordered.push_back(b);
|
||||
+ }
|
||||
+ free_queue_.rebuild(ordered);
|
||||
+}
|
||||
+
|
||||
// ---------------------------------------------------------------------------
|
||||
// PagedKVManager (port of SingleTypeKVCacheManager / FullAttentionManager)
|
||||
// ---------------------------------------------------------------------------
|
||||
@@ -250,6 +280,21 @@ void PagedKVManager::free(int seq_id) {
|
||||
req_to_blocks_.erase(it);
|
||||
}
|
||||
|
||||
+void PagedKVManager::truncate(int seq_id, size_t n_keep) {
|
||||
+ auto it = req_to_blocks_.find(seq_id);
|
||||
+ if (it == req_to_blocks_.end()) return;
|
||||
+ auto & blocks = it->second;
|
||||
+ const size_t keep = cdiv(n_keep, block_size_); // blocks covering [0, n_keep)
|
||||
+ if (keep >= blocks.size()) return; // nothing trailing to reclaim
|
||||
+ // Free the trailing blocks [keep, end) tail-first (vLLM eviction order). Their
|
||||
+ // cells were just cleared by the partial seq_rm, so they are safe to reuse.
|
||||
+ std::vector<KVCacheBlock*> ordered(blocks.rbegin(),
|
||||
+ blocks.rbegin() + (blocks.size() - keep));
|
||||
+ pool_.free_blocks(ordered);
|
||||
+ blocks.resize(keep);
|
||||
+ if (blocks.empty()) req_to_blocks_.erase(it);
|
||||
+}
|
||||
+
|
||||
// FNV-1a chained block hash. Deterministic and prefix-sensitive; folds the parent
|
||||
// hash into the seed so each block hash transitively encodes its whole prefix
|
||||
// (behavioral parity with vLLM hash_block_tokens chaining; vLLM uses sha256 bytes).
|
||||
diff --git a/src/paged-kv-manager.h b/src/paged-kv-manager.h
|
||||
index 34decbc..e410d58 100644
|
||||
--- a/src/paged-kv-manager.h
|
||||
+++ b/src/paged-kv-manager.h
|
||||
@@ -47,6 +47,11 @@ public:
|
||||
void append_n(const std::vector<KVCacheBlock*>& blocks);
|
||||
void prepend_n(const std::vector<KVCacheBlock*>& blocks);
|
||||
std::vector<KVCacheBlock*> get_all_free_blocks() const;
|
||||
+ // [paged 0024 Fix-2] Relink the intrusive free list to the given order using
|
||||
+ // THIS queue's fake head/tail (the nodes' addresses are stable; a temporary
|
||||
+ // FreeBlockQueue would leave dangling fake-node pointers). Used to restore a
|
||||
+ // pristine, contiguous popleft order after a fragmenting burst drains.
|
||||
+ void rebuild(const std::vector<KVCacheBlock*>& blocks);
|
||||
|
||||
private:
|
||||
KVCacheBlock fake_head{-1};
|
||||
@@ -67,6 +72,14 @@ public:
|
||||
size_t num_cached_blocks, size_t num_full_blocks,
|
||||
const std::vector<uint64_t>& block_hashes);
|
||||
size_t get_num_free_blocks() const { return free_queue_.num_free_blocks; }
|
||||
+ // [paged 0024 Fix-2] Total non-null blocks, and whether the pool is fully
|
||||
+ // idle (every non-null block back in the free queue). defrag_free_queue()
|
||||
+ // relinks the free queue into pristine ascending-block-id order; only valid
|
||||
+ // when all_free() so no live request's block table is disturbed. Block hashes
|
||||
+ // are preserved, so a warm committed prefix stays re-hittable.
|
||||
+ size_t total_blocks() const { return blocks_.size(); }
|
||||
+ bool all_free() const { return free_queue_.num_free_blocks + 1 == blocks_.size(); }
|
||||
+ void defrag_free_queue();
|
||||
|
||||
private:
|
||||
bool maybe_evict_cached_block(KVCacheBlock* block);
|
||||
@@ -94,6 +107,17 @@ public:
|
||||
void free(int seq_id);
|
||||
int block_size() const { return block_size_; }
|
||||
|
||||
+ // [paged 0024 Fix-1] Reclaim the trailing blocks of seq_id beyond logical
|
||||
+ // position n_keep: free every block at index >= ceil(n_keep/bs) (ref-counted,
|
||||
+ // mirroring vLLM's free of a truncated block suffix). Called on a partial tail
|
||||
+ // seq_rm [n_keep, end) so the manager's block accounting tracks the kv-cache
|
||||
+ // exactly instead of stranding the blocks whose cells were just cleared.
|
||||
+ void truncate(int seq_id, size_t n_keep);
|
||||
+
|
||||
+ // [paged 0024 Fix-2] When no live request holds a block, relink the free
|
||||
+ // queue into pristine contiguous order (undo a burst's scrambled free order).
|
||||
+ void defrag_free_pool() { if (pool_.all_free()) pool_.defrag_free_queue(); }
|
||||
+
|
||||
// Prefix caching (win 3).
|
||||
static uint64_t hash_block(uint64_t parent_hash, const std::vector<int>& token_ids);
|
||||
std::vector<uint64_t> compute_block_hashes(const std::vector<int>& token_ids) const;
|
||||
diff --git a/src/paged-prefix-api.cpp b/src/paged-prefix-api.cpp
|
||||
index 8573cd2..209cee8 100644
|
||||
--- a/src/paged-prefix-api.cpp
|
||||
+++ b/src/paged-prefix-api.cpp
|
||||
@@ -45,4 +45,12 @@ long num_free(llama_context * ctx) {
|
||||
return (long) paged_alloc::num_free((const void *) kv, /*stream=*/0);
|
||||
}
|
||||
|
||||
+long num_free_global() {
|
||||
+ return (long) paged_alloc::num_free_global();
|
||||
+}
|
||||
+
|
||||
+long num_managers() {
|
||||
+ return (long) paged_alloc::num_managers();
|
||||
+}
|
||||
+
|
||||
} // namespace paged_prefix_api
|
||||
diff --git a/src/paged-prefix-api.h b/src/paged-prefix-api.h
|
||||
index 78a3864..8dd817e 100644
|
||||
--- a/src/paged-prefix-api.h
|
||||
+++ b/src/paged-prefix-api.h
|
||||
@@ -24,4 +24,10 @@ int ref_at(llama_context * ctx, llama_seq_id seq, int pos);
|
||||
// Number of free blocks in the unified stream-0 pool, or 0 if no manager.
|
||||
long num_free(llama_context * ctx);
|
||||
|
||||
+// [paged 0024] Total free blocks across every live paged manager (all caches /
|
||||
+// streams). Wrapper-agnostic, so it reports the real pool for hybrid / iSWA
|
||||
+// models whose outer memory is not a llama_kv_cache. Diagnostics only.
|
||||
+long num_free_global();
|
||||
+long num_managers();
|
||||
+
|
||||
} // namespace paged_prefix_api
|
||||
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
|
||||
index f7a114c..8c19cfb 100644
|
||||
--- a/tools/server/server-context.cpp
|
||||
+++ b/tools/server/server-context.cpp
|
||||
@@ -411,6 +411,23 @@ struct server_slot {
|
||||
|
||||
reset();
|
||||
|
||||
+ // [paged 0024 Fix-3] Return this finished slot's paged blocks to the
|
||||
+ // pool promptly. Stock llama-server keeps an idle slot's KV for its own
|
||||
+ // next-prompt cache, but under the paged engine that strands blocks in
|
||||
+ // idle slots after a high-fan-out burst, so a later low-npl run sees a
|
||||
+ // depleted, fragmented pool and its prefill collapses. prompt_clear()
|
||||
+ // issues a full seq_rm (clearing the cells AND, via the paged hook,
|
||||
+ // releasing + defragging the blocks) and clears the slot-local prompt
|
||||
+ // cache so the next reuse recomputes from a pristine pool; cross-request
|
||||
+ // reuse still works through the committed paged content cache. Gated on
|
||||
+ // LLAMA_KV_PAGED (LLAMA_PAGED_NO_RECLAIM opts out for A/B); stock
|
||||
+ // (paged off) is byte-identical.
|
||||
+ static const bool paged_release_on_idle =
|
||||
+ getenv("LLAMA_KV_PAGED") != nullptr && getenv("LLAMA_PAGED_NO_RECLAIM") == nullptr;
|
||||
+ if (paged_release_on_idle && prompt.n_tokens() > 0) {
|
||||
+ prompt_clear(false);
|
||||
+ }
|
||||
+
|
||||
callback_on_release(id);
|
||||
}
|
||||
}
|
||||
--
|
||||
2.43.0
|
||||
|
||||
120
backend/cpp/llama-cpp/patches/paged/PAGED_POOL_BURST_FIX.md
Normal file
120
backend/cpp/llama-cpp/patches/paged/PAGED_POOL_BURST_FIX.md
Normal file
@@ -0,0 +1,120 @@
|
||||
# PAGED_POOL_BURST_FIX (patch 0024)
|
||||
|
||||
Fixes the paged-pool **burst-degradation bug** identified in `OTHER_PATHS_INVESTIGATION.md`
|
||||
(section C, Part 2): on a long-lived `llama-server` with `LLAMA_KV_PAGED=1`, a high-fan-out
|
||||
prefill burst strands KV blocks in the host-side paged pool, so a subsequent lower-npl prefill
|
||||
draws from a depleted / fragmented pool and its throughput collapses (the benchmark's documented
|
||||
"restart the server per npl" crutch). Decode is unaffected. The fix touches **only host-side block
|
||||
accounting and placement - never KV values or compute** - so it is gated behind `LLAMA_KV_PAGED`
|
||||
and is byte-identical to HEAD with the flag unset.
|
||||
|
||||
## Root cause (two compounding host-side defects)
|
||||
|
||||
1. **Reclamation gap.** `paged_alloc` returned a sequence's blocks only on a full-range wipe
|
||||
(`seq_rm(seq, 0, MAX)`). A partial **tail** truncation `seq_rm(seq, p0>0, MAX)` - which
|
||||
`llama-server` issues on every reused slot and before a cross-request prefix splice - freed the
|
||||
kv-cache CELLS but left the manager owning the trailing BLOCKS. The two desync; the free pool
|
||||
shrinks. (Applies to pure-attention paged caches; on hybrid SSM models the partial seq_rm is
|
||||
rejected by the recurrent cache before it reaches the attention cache, so the dominant leak there
|
||||
is #1b below.)
|
||||
1b. **Idle-slot retention.** Stock `llama-server` keeps a finished slot's KV resident for that
|
||||
slot's own next-prompt cache. Under the paged engine, the blocks of the many slots a burst
|
||||
touches but a later low-npl run never reassigns are stranded for the process lifetime - a later
|
||||
run sees a depleted pool.
|
||||
2. **No compaction.** `BlockPool::free_blocks` returns blocks in free order; after a burst the free
|
||||
queue is a scrambled permutation of physical ids, so a later prefill pops physically scattered
|
||||
blocks and its KV scatter-write + paged-attention gather lose locality.
|
||||
|
||||
## The fix (all behind `LLAMA_KV_PAGED`; `LLAMA_PAGED_NO_RECLAIM=1` restores pre-fix behavior)
|
||||
|
||||
- **Fix-1 - reclaim trailing blocks.** `paged::PagedKVManager::truncate(seq, n_keep)` frees every
|
||||
block at index >= `ceil(n_keep/bs)` (ref-counted, mirroring vLLM's free of a truncated suffix),
|
||||
exposed as `paged_alloc::truncate(cache, stream, seq, n_keep)` and called from
|
||||
`llama_kv_cache::seq_rm` for the `p1 == MAX && p0 > 0` case. Manager accounting now tracks the
|
||||
kv-cache exactly. (`src/paged-kv-manager.*`, `src/paged-alloc.*`, `src/llama-kv-cache.cpp`)
|
||||
- **Fix-2 - defrag on empty.** When the pool becomes fully idle (`all_free()`),
|
||||
`defrag_free_pool()` relinks the free queue into ascending block-id order (`FreeBlockQueue::rebuild`),
|
||||
preserving content-cache hashes. Triggered after `release`/`truncate`. (`src/paged-kv-manager.*`,
|
||||
`src/paged-alloc.*`)
|
||||
- **Fix-3 - release on slot completion.** At `server_slot::release()` the paged engine issues
|
||||
`prompt_clear()` (full seq_rm: clears cells AND releases+defrags the blocks) and drops the
|
||||
slot-local prompt cache, so a finished-idle slot returns its blocks promptly; cross-request reuse
|
||||
still works through the committed paged content cache. (`tools/server/server-context.cpp`)
|
||||
|
||||
## Validation (DGX GB10, dense q36-27b-nvfp4 = qwen35 hybrid; HEAD f7409c2 = patch 0023)
|
||||
|
||||
### Bit-exactness (the parity-safe property)
|
||||
Greedy decode, fixed prompt/seed, 48 tokens, `llama-completion`:
|
||||
|
||||
| build / flag | md5 |
|
||||
|---|---|
|
||||
| 0023 baseline (paged off) | `5951a5b4d624ce891e22ab5fca9bc439` |
|
||||
| AFTER paged **off** | `5951a5b4d624ce891e22ab5fca9bc439` (== baseline) |
|
||||
| AFTER paged **on**, reclaim default-on | `5951a5b4d624ce891e22ab5fca9bc439` (== baseline) |
|
||||
| AFTER paged **on**, `LLAMA_PAGED_NO_RECLAIM=1` | `5951a5b4d624ce891e22ab5fca9bc439` (== baseline) |
|
||||
|
||||
Identical across the board: the fix changes no KV value or compute. `test-backend-ops` is unaffected
|
||||
by construction (the change touches only host-side block accounting in libllama and the server; no
|
||||
ggml operator is modified) and was re-run green against the fixed `libllama`.
|
||||
|
||||
### Host-side unit test (`llama-paged-reclaim-unit`, no GPU)
|
||||
- Fix-1: `allocate(0,512)` -> 32 blocks; `truncate(0,256)` reclaims exactly **16** trailing blocks;
|
||||
`truncate(0,16)` returns to 1 block; `free` returns to pristine.
|
||||
- Fix-2: 8 blocks freed in scrambled order then `defrag_free_pool()` -> next `block_table` pops
|
||||
**ascending** physical ids. `UNIT PASS`.
|
||||
|
||||
### Repro on the model (`llama-paged-burst-bench`, A/B on one binary via `LLAMA_PAGED_NO_RECLAIM`)
|
||||
NSLOT=64, NPL=8, PP=512, pool=2527 blocks. Same binary, A/B by env.
|
||||
|
||||
- **Fix-2 (fragmentation -> prefill).** Fresh npl8 vs npl8 after a scrambling burst+drain:
|
||||
- BEFORE (`NO_RECLAIM`): prefill 870.5 -> 822.1 t/s, **ratio 0.944** (fragmented free queue).
|
||||
- AFTER (defrag on): prefill 869.2 -> 867.8 t/s, **ratio 0.998** (free queue compacted).
|
||||
- **Fix-3 mechanism (idle-slot leak -> reclaim).** Burst 64 sequences left idle, then full-release
|
||||
(what Fix-3's `prompt_clear` issues at `slot.release()`): pool free
|
||||
**2527 (pristine) -> 479 (64 idle slots strand 2048 blocks) -> 2527 (reclaimed == fresh)**. The
|
||||
leaked-block count is exactly 64 x ceil(512/16) = 2048.
|
||||
- Decode is untouched throughout (single-token append; the fix only moves/accounts blocks).
|
||||
|
||||
### Server repro (`llama-server`, one long-lived process, FRESH-npl8 -> BURST-npl64 -> POST-npl8)
|
||||
`-c 36000 -np 64 -b 2048 -ub 512`, `LLAMA_MAX_BATCH_TOKENS=512`, distinct 512-token prompts,
|
||||
`cache_prompt:false`, A/B by `LLAMA_PAGED_NO_RECLAIM`. Aggregate prefill = total prompt tokens / wave
|
||||
wall.
|
||||
|
||||
| wave | BEFORE (`NO_RECLAIM`) | AFTER (fix) |
|
||||
|---|---|---|
|
||||
| FRESH-npl8 | 488 t/s (wall 8.4 s) | 525 t/s (wall 7.8 s) |
|
||||
| POST-npl8 (after burst) | **44 t/s (wall 93 s)** | **532 t/s (wall 7.7 s)** |
|
||||
| post / fresh | **0.090 (11x collapse)** | **1.01 (recovered, within 1%)** |
|
||||
| paged release lines in log | 17 | **96** (Fix-3 fires at each slot completion) |
|
||||
| `CANARY_TOKENS_MATCH` (fresh vs post, identical prompts) | **YES** | **YES** |
|
||||
|
||||
The bug reproduces exactly (the investigation's 507 -> 65 collapse; here 488 -> 44); the fix restores
|
||||
POST-npl8 to within ~1% of fresh and the release-log count jumps from 17 to 96, confirming Fix-3
|
||||
returns each finished slot's blocks. The canary tokens are identical fresh-vs-post in BOTH arms:
|
||||
paged placement is value-invariant, so the fix never changes the served output - only when the pool
|
||||
recovers. Decode is structurally untouched (release happens after a request completes); greedy md5
|
||||
above proves decode values are byte-identical.
|
||||
|
||||
## Tradeoff / scope notes
|
||||
- On **hybrid SSM models** (qwen35), the recurrent cache rejects a partial tail `seq_rm`, so the
|
||||
hybrid wrapper never forwards it to the attention cache: Fix-1 effectively applies to
|
||||
pure-attention paged caches, while the hybrid leak is dominated by idle-slot retention (Fix-3) and
|
||||
fragmentation (Fix-2). Confirmed by the unit test (Fix-1 logic) and Test-C (2048 blocks stranded
|
||||
by 64 idle slots, returned to fresh on reclaim).
|
||||
- Fix-3 clears a finished slot's KV at `release()`, so a repeated-prompt workload loses the
|
||||
slot-local prompt cache. Cross-request reuse normally falls back to the committed paged content
|
||||
cache, but that publish path (`paged_prefix_api::commit`) is itself a no-op on hybrid wrappers, so
|
||||
for hybrid + repeated prompts Fix-3 trades prompt-cache reuse for pool hygiene. Gated behind
|
||||
`LLAMA_KV_PAGED`; `LLAMA_PAGED_NO_RECLAIM=1` restores the stock retain-idle behavior.
|
||||
|
||||
## Files
|
||||
- `src/paged-kv-manager.{h,cpp}` - `truncate`, `defrag_free_pool`/`defrag_free_queue`,
|
||||
`FreeBlockQueue::rebuild`, `all_free`/`total_blocks`.
|
||||
- `src/paged-alloc.{h,cpp}` - `truncate`, `reclaim_active`, defrag-on-empty in `release`/`truncate`,
|
||||
`num_free_global`/`num_managers`.
|
||||
- `src/llama-kv-cache.cpp` - partial-tail-seq_rm reclaim hook.
|
||||
- `src/paged-prefix-api.{h,cpp}` - `num_free_global`/`num_managers` introspection passthrough.
|
||||
- `tools/server/server-context.cpp` - Fix-3 paged release at `slot.release()`.
|
||||
- `examples/simple/paged-reclaim-unit.cpp`, `paged-burst-bench.cpp` - dev test scaffolding.
|
||||
|
||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
||||
217
backend/cpp/llama-cpp/patches/paged/paged-burst-bench.cpp
Normal file
217
backend/cpp/llama-cpp/patches/paged/paged-burst-bench.cpp
Normal file
@@ -0,0 +1,217 @@
|
||||
// Paged-pool burst-degradation repro (patch 0024). DEV SCAFFOLDING ONLY.
|
||||
//
|
||||
// Reproduces, at the libllama level, the two host-side defects behind the
|
||||
// "later lower-npl prefill collapses, decode fine, restart cures it" benchmark
|
||||
// signature:
|
||||
//
|
||||
// * RECLAMATION GAP (Fix-1): a partial tail seq_rm(seq, p0>0, -1) - exactly
|
||||
// what llama-server issues on every reused slot - frees the kv-cache CELLS
|
||||
// but the paged manager keeps owning the trailing BLOCKS. The manager's
|
||||
// free pool silently shrinks. Test A measures the reclaimed-block delta.
|
||||
//
|
||||
// * FRAGMENTATION / NO COMPACTION (Fix-2): a high-fan-out burst that allocates
|
||||
// many sequences and frees them in a scrambled order leaves the free queue a
|
||||
// scrambled permutation of physical block ids. A later low-npl prefill then
|
||||
// pops physically scattered blocks, so its KV scatter-write + in-kernel
|
||||
// paged-attention gather lose locality and prefill throughput collapses;
|
||||
// decode (single-token append) barely notices. Test B times an npl8 prefill
|
||||
// on a FRESH pool vs an npl8 prefill AFTER a scrambling burst+drain.
|
||||
//
|
||||
// PASS (post-fix): Test A reclaims ceil(PP/bs) - ceil(KEEP/bs) trailing blocks on the
|
||||
// partial seq_rm (0 pre-fix); Test B's post-burst npl8 prefill_tps is within ~10%
|
||||
// of the fresh npl8 and num_free returns to the pristine value after the drain.
|
||||
//
|
||||
// Run with LLAMA_KV_PAGED=1. Env: BURST_NSLOT(64) NPL(8) PP(512) KEEP(256)
|
||||
// GEN(4) PAGED_NGL(99). All sequences use distinct content so nothing is shared.
|
||||
|
||||
#include "llama.h"
|
||||
#include "paged-prefix-api.h"
|
||||
|
||||
#include <chrono>
|
||||
#include <clocale>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <vector>
|
||||
|
||||
// Read an integer tuning knob from the environment.
//
//   k    - environment variable name
//   dflt - value returned when the knob cannot be read
//
// Returns the leading base-10 integer of the variable's value (trailing text is
// ignored, as atoi did). Returns dflt when the variable is unset, when its value
// does not start with a number, or when the number does not fit in an int.
// The previous atoi-based version silently produced 0 for non-numeric text and
// had undefined results for out-of-range values.
static int env_i(const char * k, int dflt) {
    const char * v = getenv(k);
    if (!v) {
        return dflt;
    }
    char * end = nullptr;
    const long parsed = strtol(v, &end, 10);
    if (end == v) {
        return dflt; // no digits consumed: empty or non-numeric value
    }
    if (parsed != (long) (int) parsed) {
        return dflt; // does not round-trip through int: out of range
    }
    return (int) parsed;
}
|
||||
|
||||
using clk = std::chrono::steady_clock;

// Signed elapsed time from a to b, expressed in fractional seconds.
static double secs(clk::time_point a, clk::time_point b) {
    const std::chrono::duration<double> elapsed = b - a;
    return elapsed.count();
}
|
||||
|
||||
// Bundle of handles threaded through the benchmark helpers.
struct Ctx {
    llama_context * ctx;     // passed to llama_decode / llama_synchronize
    llama_memory_t  mem;     // memory handle; not touched by the helpers below
    llama_batch     batch;   // scratch batch refilled by prefill() / decode1()
    int             n_vocab; // forwarded to tok_of() to bound generated token ids
};
|
||||
|
||||
// Deterministic, content-distinct token for (seq, pos): keeps every sequence's
|
||||
// blocks unique so no cross-request prefix sharing masks the accounting.
|
||||
static llama_token tok_of(int seq, int pos, int n_vocab) {
|
||||
return (llama_token) (((seq * 1000003 + pos * 131 + 7) % (n_vocab - 200)) + 100);
|
||||
}
|
||||
|
||||
// Prefill n tokens of seq at [pos0, pos0+n) in one ubatch (n <= n_batch).
|
||||
// Returns wall seconds (sync'd).
|
||||
static double prefill(Ctx & C, int seq, int pos0, int n) {
|
||||
clk::time_point t0 = clk::now();
|
||||
C.batch.n_tokens = 0;
|
||||
for (int j = 0; j < n; ++j) {
|
||||
int i = C.batch.n_tokens;
|
||||
C.batch.token[i] = tok_of(seq, pos0 + j, C.n_vocab);
|
||||
C.batch.pos[i] = pos0 + j;
|
||||
C.batch.n_seq_id[i] = 1;
|
||||
C.batch.seq_id[i][0]= seq;
|
||||
C.batch.logits[i] = (j + 1 == n) ? 1 : 0;
|
||||
C.batch.n_tokens++;
|
||||
}
|
||||
if (llama_decode(C.ctx, C.batch)) { fprintf(stderr, "prefill decode failed seq=%d\n", seq); return -1; }
|
||||
llama_synchronize(C.ctx);
|
||||
return secs(t0, clk::now());
|
||||
}
|
||||
|
||||
// One decode step (single token) for seq at pos.
|
||||
static void decode1(Ctx & C, int seq, int pos) {
|
||||
C.batch.n_tokens = 1;
|
||||
C.batch.token[0] = tok_of(seq, pos, C.n_vocab);
|
||||
C.batch.pos[0] = pos; C.batch.n_seq_id[0] = 1; C.batch.seq_id[0][0] = seq; C.batch.logits[0] = 1;
|
||||
if (llama_decode(C.ctx, C.batch)) fprintf(stderr, "decode1 failed seq=%d\n", seq);
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
std::setlocale(LC_NUMERIC, "C");
|
||||
const char * model_path = nullptr;
|
||||
for (int i = 1; i < argc; ++i) if (!strcmp(argv[i], "-m") && i + 1 < argc) model_path = argv[++i];
|
||||
if (!model_path) { fprintf(stderr, "usage: %s -m model.gguf\n", argv[0]); return 2; }
|
||||
|
||||
const int NSLOT = env_i("BURST_NSLOT", 64);
|
||||
const int NPL = env_i("NPL", 8);
|
||||
const int PP = env_i("PP", 512);
|
||||
const int KEEP = env_i("KEEP", 256);
|
||||
const int GEN = env_i("GEN", 4);
|
||||
const int ngl = env_i("PAGED_NGL", 99);
|
||||
const bool paged = getenv("LLAMA_KV_PAGED") != nullptr;
|
||||
|
||||
ggml_backend_load_all();
|
||||
llama_model_params mp = llama_model_default_params();
|
||||
mp.n_gpu_layers = ngl;
|
||||
llama_model * model = llama_model_load_from_file(model_path, mp);
|
||||
if (!model) { fprintf(stderr, "model load failed\n"); return 1; }
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
const int n_vocab = llama_vocab_n_tokens(vocab);
|
||||
|
||||
// Pool sized for the burst plus headroom so the burst fits but a later npl
|
||||
// run draws from whatever the burst's churn left behind.
|
||||
const long cells = (long) (NSLOT + NPL + 4) * (PP + GEN + 16);
|
||||
llama_context_params cp = llama_context_default_params();
|
||||
cp.n_ctx = (uint32_t) cells;
|
||||
cp.n_batch = (uint32_t) (PP + 16);
|
||||
cp.n_ubatch = (uint32_t) (PP + 16);
|
||||
cp.n_seq_max = NSLOT + NPL + 2;
|
||||
cp.kv_unified = true; // one unified stream-0 pool -> num_free(ctx) is the whole pool
|
||||
cp.no_perf = true;
|
||||
llama_context * ctx = llama_init_from_model(model, cp);
|
||||
if (!ctx) { fprintf(stderr, "ctx init failed (cells=%ld)\n", cells); return 1; }
|
||||
|
||||
Ctx C; C.ctx = ctx; C.mem = llama_get_memory(ctx); C.n_vocab = n_vocab;
|
||||
C.batch = llama_batch_init(cp.n_batch, 0, 1);
|
||||
|
||||
printf("== paged-burst-bench == paged=%d NSLOT=%d NPL=%d PP=%d KEEP=%d GEN=%d n_ctx=%ld\n",
|
||||
paged, NSLOT, NPL, PP, KEEP, GEN, cells);
|
||||
|
||||
llama_memory_clear(C.mem, true);
|
||||
const long F_start = paged_prefix_api::num_free_global();
|
||||
|
||||
// ---- Test A: Fix-1 reclamation gap on a partial tail seq_rm --------------
|
||||
{
|
||||
prefill(C, 0, 0, PP);
|
||||
const long f_after_prefill = paged_prefix_api::num_free_global();
|
||||
llama_memory_seq_rm(C.mem, 0, KEEP, -1); // partial tail removal
|
||||
const long f_after_rm = paged_prefix_api::num_free_global();
|
||||
llama_memory_seq_rm(C.mem, 0, -1, -1); // full free -> pristine
|
||||
const long f_after_full = paged_prefix_api::num_free_global();
|
||||
const long bs = 16;
|
||||
const long expect = (PP + bs - 1)/bs - (KEEP + bs - 1)/bs; // trailing blocks
|
||||
printf("[TEST-A Fix-1] start=%ld afterPrefill=%ld afterPartialRm=%ld reclaimed=%ld "
|
||||
"(expect %ld post-fix, 0 pre-fix) afterFullFree=%ld\n",
|
||||
F_start, f_after_prefill, f_after_rm, f_after_rm - f_after_prefill, expect, f_after_full);
|
||||
}
|
||||
|
||||
// ---- Test B: fragmentation -> npl prefill collapse -----------------------
|
||||
// Fresh npl prefill baseline on a pristine pool.
|
||||
llama_memory_clear(C.mem, true);
|
||||
double tps_fresh;
|
||||
{
|
||||
clk::time_point t0 = clk::now();
|
||||
long ntok = 0;
|
||||
for (int s = 0; s < NPL; ++s) { double d = prefill(C, s, 0, PP); if (d < 0) return 1; ntok += PP; }
|
||||
tps_fresh = ntok / secs(t0, clk::now());
|
||||
for (int s = 0; s < NPL; ++s) llama_memory_seq_rm(C.mem, s, -1, -1);
|
||||
}
|
||||
const long F_pristine = paged_prefix_api::num_free_global();
|
||||
|
||||
// High-fan-out burst: allocate NSLOT sequences, each prefilled + a few decode
|
||||
// steps (mixed alloc), then drain them in a scrambled order (odd ids first,
|
||||
// then even, each truncated before the full free) so the free queue becomes a
|
||||
// scrambled permutation - the fragmentation the bug never compacts.
|
||||
for (int s = 0; s < NSLOT; ++s) {
|
||||
if (prefill(C, NPL + s, 0, PP) < 0) return 1;
|
||||
for (int g = 0; g < GEN; ++g) decode1(C, NPL + s, PP + g);
|
||||
}
|
||||
const long F_during_burst = paged_prefix_api::num_free_global();
|
||||
// Drain: partial tail seq_rm (the reused-slot pattern) then full free, in a
|
||||
// scrambled slot order to scramble the physical free order.
|
||||
for (int parity = 1; parity >= 0; --parity)
|
||||
for (int s = 0; s < NSLOT; ++s) if ((s & 1) == parity) {
|
||||
llama_memory_seq_rm(C.mem, NPL + s, KEEP, -1); // partial (Fix-1 path)
|
||||
llama_memory_seq_rm(C.mem, NPL + s, -1, -1); // full free
|
||||
}
|
||||
const long F_after_drain = paged_prefix_api::num_free_global();
|
||||
|
||||
// Post-burst npl prefill: pops from the (pre-fix scrambled / post-fix
|
||||
// defragged) free queue.
|
||||
double tps_post;
|
||||
{
|
||||
clk::time_point t0 = clk::now();
|
||||
long ntok = 0;
|
||||
for (int s = 0; s < NPL; ++s) { double d = prefill(C, s, 0, PP); if (d < 0) return 1; ntok += PP; }
|
||||
tps_post = ntok / secs(t0, clk::now());
|
||||
for (int s = 0; s < NPL; ++s) llama_memory_seq_rm(C.mem, s, -1, -1);
|
||||
}
|
||||
|
||||
const double ratio = tps_fresh > 0 ? tps_post / tps_fresh : 0;
|
||||
printf("[TEST-B frag] num_free: start=%ld pristine=%ld duringBurst=%ld afterDrain=%ld "
|
||||
"(afterDrain==pristine? %s)\n",
|
||||
F_start, F_pristine, F_during_burst, F_after_drain,
|
||||
F_after_drain == F_pristine ? "YES" : "NO");
|
||||
printf("[TEST-B frag] prefill_tps fresh=%.1f post-burst=%.1f ratio=%.3f "
|
||||
"(PASS if >=0.90)\n", tps_fresh, tps_post, ratio);
|
||||
|
||||
// ---- Test C: idle-slot retention leak -> reclaim (the Fix-3 scenario) -----
|
||||
// Burst NSLOT sequences and leave them IDLE (stock llama-server keeps an idle
|
||||
// slot's KV; the blocks are stranded). F_idle shows the depleted pool a later
|
||||
// low-npl run would see. Then full-seq_rm each (exactly what Fix-3's
|
||||
// prompt_clear() issues at slot.release): F_reclaimed must return to pristine.
|
||||
llama_memory_clear(C.mem, true);
|
||||
// Touch the pool once so the manager exists, then read the full-pool size
|
||||
// (num_free is 0 while no manager is registered).
|
||||
if (prefill(C, 0, 0, 16) < 0) return 1;
|
||||
llama_memory_seq_rm(C.mem, 0, -1, -1);
|
||||
const long F_pre_c = paged_prefix_api::num_free_global();
|
||||
for (int s = 0; s < NSLOT; ++s) { if (prefill(C, NPL + s, 0, PP) < 0) return 1; }
|
||||
const long F_idle = paged_prefix_api::num_free_global();
|
||||
for (int s = 0; s < NSLOT; ++s) llama_memory_seq_rm(C.mem, NPL + s, -1, -1); // Fix-3 release
|
||||
const long F_reclaimed = paged_prefix_api::num_free_global();
|
||||
printf("[TEST-C idle] pristine=%ld idle_after_burst=%ld (leaked=%ld) reclaimed=%ld "
|
||||
"(returns_to_fresh? %s)\n",
|
||||
F_pre_c, F_idle, F_pre_c - F_idle, F_reclaimed,
|
||||
F_reclaimed == F_pre_c ? "YES" : "NO");
|
||||
|
||||
printf("RESULT paged=%d frag_fix2_ratio=%.3f drain_numfree_returns=%s idle_reclaim_returns=%s\n",
|
||||
paged, ratio,
|
||||
F_after_drain == F_pristine ? "YES" : "NO",
|
||||
F_reclaimed == F_pre_c ? "YES" : "NO");
|
||||
|
||||
llama_batch_free(C.batch);
|
||||
llama_free(ctx);
|
||||
llama_model_free(model);
|
||||
return 0;
|
||||
}
|
||||
59
backend/cpp/llama-cpp/patches/paged/paged-reclaim-unit.cpp
Normal file
59
backend/cpp/llama-cpp/patches/paged/paged-reclaim-unit.cpp
Normal file
@@ -0,0 +1,59 @@
|
||||
// Host-side unit test for the paged-pool burst-reclaim fix (patch 0024).
|
||||
// Compiles paged-kv-manager.cpp directly; no ggml / llama / GPU dependency.
|
||||
//
|
||||
// Fix-1 PagedKVManager::truncate(seq, n_keep) reclaims the trailing blocks
|
||||
// beyond ceil(n_keep/bs) (ref-counted), so a partial tail seq_rm no
|
||||
// longer strands blocks whose cells were cleared.
|
||||
// Fix-2 defrag_free_pool() relinks the free queue into ascending block-id
|
||||
// order once the pool is fully idle, undoing a burst's scrambled frees
|
||||
// so a later prefill pops physically contiguous blocks again.
|
||||
|
||||
#include "paged-kv-manager.h"
|
||||
#include <cstdio>
|
||||
|
||||
using paged::PagedKVManager;
|
||||
|
||||
int main() {
|
||||
int rc = 0;
|
||||
|
||||
// ---- Fix-1: truncate reclaims the trailing block suffix -----------------
|
||||
{
|
||||
PagedKVManager m(/*num_blocks=*/64, /*block_size=*/16, /*caching=*/true);
|
||||
const size_t f0 = m.num_free_blocks(); // 63 (block 0 reserved as null)
|
||||
m.allocate(0, 512); // ceil(512/16)=32 blocks
|
||||
const size_t f1 = m.num_free_blocks(); // 31
|
||||
m.truncate(0, 256); // keep ceil(256/16)=16, free 16
|
||||
const size_t f2 = m.num_free_blocks(); // 47
|
||||
printf("[unit Fix-1] free=%zu alloc512=%zu truncate256=%zu reclaimed=%zu (expect 16)\n",
|
||||
f0, f1, f2, f2 - f1);
|
||||
if (f2 - f1 != 16) rc = 1;
|
||||
m.truncate(0, 16); // keep 1 block, free 15 more
|
||||
const size_t f3 = m.num_free_blocks(); // 62
|
||||
printf("[unit Fix-1] truncate16=%zu (expect %zu)\n", f3, f0 - 1);
|
||||
if (f3 != f0 - 1) rc = 1;
|
||||
m.free(0);
|
||||
if (m.num_free_blocks() != f0) { printf("[unit Fix-1] free mismatch\n"); rc = 1; }
|
||||
}
|
||||
|
||||
// ---- Fix-2: defrag restores ascending popleft order ---------------------
|
||||
{
|
||||
PagedKVManager m(/*num_blocks=*/64, /*block_size=*/16, /*caching=*/false);
|
||||
for (int s = 0; s < 8; ++s) m.allocate(s, 16); // pop blocks 1..8
|
||||
const int scrambled[8] = {3, 7, 1, 5, 0, 6, 2, 4}; // free out of order
|
||||
for (int i = 0; i < 8; ++i) m.free(scrambled[i]);
|
||||
m.defrag_free_pool(); // all idle -> compact
|
||||
m.allocate(100, 16 * 3); // pop 3 blocks
|
||||
const auto bt = m.block_table(100);
|
||||
bool asc = true;
|
||||
printf("[unit Fix-2] post-defrag block_table:");
|
||||
for (size_t i = 0; i < bt.size(); ++i) {
|
||||
printf(" %d", bt[i]);
|
||||
if (i && bt[i] < bt[i - 1]) asc = false;
|
||||
}
|
||||
printf(" ascending=%s (expect YES)\n", asc ? "YES" : "NO");
|
||||
if (!asc) rc = 1;
|
||||
}
|
||||
|
||||
printf("UNIT %s\n", rc == 0 ? "PASS" : "FAIL");
|
||||
return rc;
|
||||
}
|
||||
Reference in New Issue
Block a user