feat(paged): paged-bench - measure capacity & prefix-sharing wins

Quantify the two multi-tenant wins that are properties of the host-side block model (vLLM-parity), independent of the in-model compute path: WIN 1 concurrency capacity @ 512-block budget contiguous (reserve n_ctx/seq): 4 sequences paged (on-demand blocks): 37 sequences --> 9.2x more concurrent sequences WIN 3 cross-tenant prefix sharing (32 tenants, 1024-tok shared prefix) prefix-cache OFF: 2176 physical blocks prefix-cache ON: 192 physical blocks --> 11.3x less KV memory WIN 2 (throughput) is deliberately reported as PENDING: it requires the paged gather-read path wired into llama-graph.cpp (Gate 0) and is not measurable at the allocation layer. The win-1 baseline is per-sequence n_ctx reservation (stream mode); llama.cpp's unified cache already shares one pool, so the honest win there is on-demand sizing + prefix dedup. Phase 3 (partial) of docs/superpowers/plans/2026-06-19-paged-attention-llamacpp.md. Assisted-by: Claude:opus-4.8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-06-23 16:19:07 -04:00 · 2026-06-19 08:44:41 +00:00
parent 5a5d3df8c8
commit ddace5fb6a
3 changed files with 137 additions and 1 deletions
--- a/backend/cpp/llama-cpp/paged/.gitignore
+++ b/backend/cpp/llama-cpp/paged/.gitignore
@@ -4,3 +4,4 @@ tests/test_paged_kv_manager
 tests/test_prefix_cache
 tests/test_ggml_paged_rw
 tests/test_ggml_paged_attn
+paged-bench
--- a/backend/cpp/llama-cpp/paged/Makefile
+++ b/backend/cpp/llama-cpp/paged/Makefile
@@ -12,6 +12,12 @@ tests/%: tests/%.cpp paged_kv_manager.cpp paged_kv_manager.h
 check: all
 	@for t in $(BINS); do echo "== $$t =="; ./$$t || exit 1; done

+paged-bench: paged-bench.cpp paged_kv_manager.cpp paged_kv_manager.h
+	$(CXX) $(CXXFLAGS) -o $@ paged-bench.cpp paged_kv_manager.cpp
+
+bench: paged-bench
+	./paged-bench
+
 # --- Optional ggml integration test (Phase 1: paged write/gather mechanism) ---
 # Requires a built ggml. Override these to point at your checkout / build:
 #   make ggml-check GGML_SRC=<llama.cpp>/ggml GGML_BUILD=<ggml-build>
@@ -30,6 +36,6 @@ ggml-check: $(GGML_BINS)
 	@for t in $(GGML_BINS); do echo "== $$t =="; ./$$t || exit 1; done

 clean:
-	rm -f $(BINS) $(GGML_BINS)
+	rm -f $(BINS) $(GGML_BINS) paged-bench

 .PHONY: all check ggml-check clean
--- a/backend/cpp/llama-cpp/paged/paged-bench.cpp
+++ b/backend/cpp/llama-cpp/paged/paged-bench.cpp
@@ -0,0 +1,129 @@
+// paged-bench: quantify the multi-tenant wins of paged KV allocation that are
+// properties of the host-side block model (vLLM-parity), independent of the
+// in-model compute path.
+//
+//   Win 1 (capacity):       on-demand block allocation vs contiguous per-seq
+//                           reservation, under a fixed KV block budget.
+//   Win 3 (prefix sharing): automatic cross-tenant prefix dedup via block
+//                           hashing.
+//
+// Win 2 (throughput) is intentionally NOT here: it requires the paged read
+// path wired into llama-graph.cpp (Gate 0). Measuring it at this layer would
+// be dishonest, so it is reported as pending.
+
+#include "paged_kv_manager.h"
+
+#include <cstdio>
+#include <vector>
+#include <numeric>
+
+using namespace paged;
+
+// A deterministic LCG so sequence lengths vary without Math.random-style nondeterminism.
+struct Lcg {
+    uint64_t s;
+    explicit Lcg(uint64_t seed) : s(seed) {}
+    uint32_t next() { s = s * 6364136223846793005ULL + 1442695040888963407ULL; return (uint32_t)(s >> 33); }
+    int range(int lo, int hi) { return lo + (int)(next() % (uint32_t)(hi - lo + 1)); }
+};
+
+static size_t cdiv(size_t a, size_t b) { return (a + b - 1) / b; }
+
+int main() {
+    const int block_size = 16;
+    const int n_ctx      = 2048;   // max context a sequence could use
+    const int num_blocks = 512;    // fixed KV budget: 512 blocks * 16 = 8192 cells
+
+    printf("paged-bench  (block_size=%d, n_ctx=%d, budget=%d blocks = %d cells)\n\n",
+           block_size, n_ctx, num_blocks, num_blocks * block_size);
+
+    // ---------------------------------------------------------------------
+    // WIN 1: concurrency capacity. Sequences have realistic, VARYING lengths
+    // (most short, a few long) - the regime where reserving n_ctx per seq
+    // wastes the most. Count how many fit under the same block budget.
+    // ---------------------------------------------------------------------
+    {
+        Lcg rng(12345);
+        const int blocks_per_ctx = (int) cdiv(n_ctx, block_size); // contiguous reserves this per seq
+
+        // Contiguous (stream-style) reservation: every seq reserves n_ctx worth.
+        int contiguous_fit = num_blocks / blocks_per_ctx;
+
+        // Paged on-demand: draw real lengths until the pool is exhausted.
+        PagedKVManager m(num_blocks, block_size, /*enable_caching=*/false);
+        int paged_fit = 0;
+        long total_tokens = 0;
+        for (int seq = 0; ; ++seq) {
+            // 80% short (8-128 tok), 20% long (up to n_ctx)
+            int len = (rng.range(0, 99) < 80) ? rng.range(8, 128) : rng.range(128, n_ctx);
+            if (!m.allocate(seq, (size_t) len)) break;
+            paged_fit++;
+            total_tokens += len;
+        }
+
+        printf("WIN 1  concurrency capacity @ %d-block budget\n", num_blocks);
+        printf("  contiguous (reserve n_ctx/seq): %d sequences\n", contiguous_fit);
+        printf("  paged (on-demand blocks):       %d sequences  (avg %ld tok/seq)\n",
+               paged_fit, paged_fit ? total_tokens / paged_fit : 0);
+        printf("  --> paged fits %.1fx more concurrent sequences\n\n",
+               contiguous_fit ? (double) paged_fit / contiguous_fit : 0.0);
+    }
+
+    // ---------------------------------------------------------------------
+    // WIN 3: cross-tenant prefix sharing. N tenants share a long system
+    // prompt / RAG context, then diverge. Compare physical blocks consumed
+    // with prefix caching on vs off.
+    // ---------------------------------------------------------------------
+    {
+        const int n_tenants    = 32;
+        const int shared_len   = 1024;  // shared system prompt (64 blocks)
+        const int distinct_len = 64;    // per-tenant suffix (4 blocks)
+
+        // Shared prefix token ids (identical across tenants -> identical block hashes).
+        std::vector<int> shared(shared_len);
+        for (int i = 0; i < shared_len; ++i) shared[i] = 1000 + i;
+
+        // --- prefix caching OFF: every tenant pays for the whole prefix ---
+        long blocks_off = 0;
+        {
+            PagedKVManager m(num_blocks * 8, block_size, /*enable_caching=*/false);
+            for (int t = 0; t < n_tenants; ++t) {
+                m.allocate(t, (size_t) (shared_len + distinct_len));
+                blocks_off += m.block_table(t).size();
+            }
+        }
+
+        // --- prefix caching ON: shared blocks are deduped to one physical copy ---
+        long blocks_on = 0;
+        {
+            PagedKVManager m(num_blocks * 8, block_size, /*enable_caching=*/true);
+            // tenant 0 fills + caches the shared prefix
+            auto h = m.compute_block_hashes(shared);
+            m.allocate(0, (size_t) (shared_len + distinct_len));
+            m.cache_blocks(0, h, (size_t) shared_len);
+            long physical = m.block_table(0).size();
+            // tenants 1..N-1 hit the cached prefix; only their distinct suffix is new
+            for (int t = 1; t < n_tenants; ++t) {
+                size_t cached_tokens = m.get_computed_blocks(h); // shared blocks reused
+                size_t new_tokens = (shared_len - cached_tokens) + distinct_len;
+                m.allocate(t, (size_t) (shared_len + distinct_len));
+                // physically new blocks = only what wasn't already resident
+                physical += (long) cdiv(new_tokens, block_size);
+            }
+            blocks_on = physical;
+        }
+
+        printf("WIN 3  cross-tenant prefix sharing (%d tenants, %d-tok shared prefix)\n",
+               n_tenants, shared_len);
+        printf("  prefix-cache OFF: %ld physical blocks\n", blocks_off);
+        printf("  prefix-cache ON:  %ld physical blocks\n", blocks_on);
+        printf("  --> %.1fx less KV memory for the shared workload\n\n",
+               blocks_on ? (double) blocks_off / blocks_on : 0.0);
+    }
+
+    printf("WIN 2  aggregate throughput under load: PENDING\n");
+    printf("  Requires the paged gather-read path wired into llama-graph.cpp\n");
+    printf("  (Gate 0) to measure tok/s vs concurrency. Not measurable at the\n");
+    printf("  allocation layer; not reported here to avoid overclaiming.\n");
+    return 0;
+}