mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-23 16:19:07 -04:00
feat(paged): paged-bench - measure capacity & prefix-sharing wins
Quantify the two multi-tenant wins that are properties of the host-side
block model (vLLM-parity), independent of the in-model compute path:
WIN 1 concurrency capacity @ 512-block budget
contiguous (reserve n_ctx/seq): 4 sequences
paged (on-demand blocks): 37 sequences
--> 9.2x more concurrent sequences
WIN 3 cross-tenant prefix sharing (32 tenants, 1024-tok shared prefix)
prefix-cache OFF: 2176 physical blocks
prefix-cache ON: 192 physical blocks
--> 11.3x less KV memory
WIN 2 (throughput) is deliberately reported as PENDING: it requires the
paged gather-read path wired into llama-graph.cpp (Gate 0) and is not
measurable at the allocation layer. The win-1 baseline is per-sequence
n_ctx reservation (stream mode); llama.cpp's unified cache already shares
one pool, so the honest win there is on-demand sizing + prefix dedup.
Phase 3 (partial) of docs/superpowers/plans/2026-06-19-paged-attention-llamacpp.md.
Assisted-by: Claude:opus-4.8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
@@ -12,6 +12,12 @@ tests/%: tests/%.cpp paged_kv_manager.cpp paged_kv_manager.h
|
||||
# Bring `all` up to date, then run every binary listed in $(BINS) in turn,
# printing a "== name ==" banner before each one. The loop exits non-zero at
# the first binary that fails, so `make check` fails as soon as one does.
check: all
|
||||
@for t in $(BINS); do echo "== $$t =="; ./$$t || exit 1; done
|
||||
|
||||
# Compile and link the paged-bench executable in one step from paged-bench.cpp
# and paged_kv_manager.cpp. paged_kv_manager.h is listed as a prerequisite only
# so that editing the header triggers a rebuild; it is not passed to $(CXX).
paged-bench: paged-bench.cpp paged_kv_manager.cpp paged_kv_manager.h
|
||||
$(CXX) $(CXXFLAGS) -o $@ paged-bench.cpp paged_kv_manager.cpp
|
||||
|
||||
# Convenience target: (re)build paged-bench if needed, then run it from the
# current directory.
bench: paged-bench
|
||||
./paged-bench
|
||||
|
||||
# --- Optional ggml integration test (Phase 1: paged write/gather mechanism) ---
|
||||
# Requires a built ggml. Override these to point at your checkout / build:
|
||||
# make ggml-check GGML_SRC=<llama.cpp>/ggml GGML_BUILD=<ggml-build>
|
||||
@@ -30,6 +36,6 @@ ggml-check: $(GGML_BINS)
|
||||
@for t in $(GGML_BINS); do echo "== $$t =="; ./$$t || exit 1; done
|
||||
|
||||
# Remove the binaries listed in $(BINS) and $(GGML_BINS), plus paged-bench.
# NOTE(review): this span looks like a rendered diff whose +/- markers were
# lost -- the two rm lines are presumably the pre- and post-change versions of
# a single recipe line (the later one additionally removes paged-bench).
# Confirm against the real Makefile before relying on both being present.
clean:
|
||||
rm -f $(BINS) $(GGML_BINS)
|
||||
rm -f $(BINS) $(GGML_BINS) paged-bench
|
||||
|
||||
# Command-style targets that do not correspond to files. `bench` only runs
# ./paged-bench and never creates a file named "bench", so it must be phony;
# otherwise a stray file or directory called `bench` would make `make bench`
# report "up to date" and skip the benchmark. `paged-bench` is a real output
# file and is intentionally NOT listed here.
.PHONY: all check bench ggml-check clean
|
||||
|
||||
Reference in New Issue
Block a user