From 5a5d3df8c8fe83e4926892f01d90e53835a156d9 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 19 Jun 2026 08:35:35 +0000 Subject: [PATCH] feat(paged): Phase 2 core - attention over paged KV matches reference Retire the central numeric risk from the design: feeding gather-to-scratch KV (a sequence whose blocks are non-contiguous in the shared pool, [2,1,5]) into ggml's standard attention ops produces correct attention. Path under test: set_rows write -> get_rows gather (K and V) -> mul_mat(K,Q) -> soft_max_ext -> mul_mat(V^T, probs). Result is compared against an independent host-computed softmax attention over the same K/V/Q. Max abs error ~7.5e-08 (n_kv=48, d=8, n_q=4). This proves the paged read path is numerically sound on CPU with no new ggml op. Remaining: wire build_attn_paged into llama-graph.cpp and validate Gate 0 (token-identical greedy generation in a real model). Phase 2 (core) of docs/superpowers/plans/2026-06-19-paged-attention-llamacpp.md. Assisted-by: Claude:opus-4.8 [Claude Code] Signed-off-by: Ettore Di Giacinto --- backend/cpp/llama-cpp/paged/.gitignore | 1 + backend/cpp/llama-cpp/paged/Makefile | 11 +- .../paged/tests/test_ggml_paged_attn.cpp | 133 ++++++++++++++++++ 3 files changed, 141 insertions(+), 4 deletions(-) create mode 100644 backend/cpp/llama-cpp/paged/tests/test_ggml_paged_attn.cpp diff --git a/backend/cpp/llama-cpp/paged/.gitignore b/backend/cpp/llama-cpp/paged/.gitignore index 66c7d044a..eaba3ba44 100644 --- a/backend/cpp/llama-cpp/paged/.gitignore +++ b/backend/cpp/llama-cpp/paged/.gitignore @@ -3,3 +3,4 @@ tests/test_block_pool tests/test_paged_kv_manager tests/test_prefix_cache tests/test_ggml_paged_rw +tests/test_ggml_paged_attn diff --git a/backend/cpp/llama-cpp/paged/Makefile b/backend/cpp/llama-cpp/paged/Makefile index 0e3f9e135..61c5e562a 100644 --- a/backend/cpp/llama-cpp/paged/Makefile +++ b/backend/cpp/llama-cpp/paged/Makefile @@ -19,14 +19,17 @@ GGML_SRC ?= ../../llama-cpp-fallback-build/llama.cpp/ggml GGML_BUILD ?= /tmp/ggml-build GGML_LIBDIR = $(GGML_BUILD)/src -tests/test_ggml_paged_rw: tests/test_ggml_paged_rw.cpp paged_kv_manager.cpp paged_kv_manager.h +GGML_TESTS = test_ggml_paged_rw test_ggml_paged_attn +GGML_BINS = $(addprefix tests/,$(GGML_TESTS)) + +tests/test_ggml_%: tests/test_ggml_%.cpp paged_kv_manager.cpp paged_kv_manager.h $(CXX) $(CXXFLAGS) -I$(GGML_SRC)/include -o $@ $< paged_kv_manager.cpp \ -L$(GGML_LIBDIR) -lggml -lggml-base -lggml-cpu -Wl,-rpath,$(GGML_LIBDIR) -ggml-check: tests/test_ggml_paged_rw - @echo "== tests/test_ggml_paged_rw =="; ./tests/test_ggml_paged_rw +ggml-check: $(GGML_BINS) + @for t in $(GGML_BINS); do echo "== $$t =="; ./$$t || exit 1; done clean: - rm -f $(BINS) tests/test_ggml_paged_rw + rm -f $(BINS) $(GGML_BINS) .PHONY: all check ggml-check clean diff --git a/backend/cpp/llama-cpp/paged/tests/test_ggml_paged_attn.cpp b/backend/cpp/llama-cpp/paged/tests/test_ggml_paged_attn.cpp new file mode 100644 index 000000000..0a8b59ff7 --- /dev/null +++ b/backend/cpp/llama-cpp/paged/tests/test_ggml_paged_attn.cpp @@ -0,0 +1,133 @@ +// Phase 2 (core numeric de-risk): attention over GATHERED paged KV must equal +// an independent host-computed reference. +// +// This answers the central risk in the design: feeding gather-to-scratch KV +// (a sequence whose blocks are non-contiguous in the shared pool) into ggml's +// standard attention ops (mul_mat -> soft_max_ext -> mul_mat) produces correct +// attention. If this holds, the paged read path is numerically sound; the +// remaining work is wiring it into llama-graph.cpp (Gate 0 in a real model). + +#include "../paged_kv_manager.h" + +#include "ggml.h" +#include "ggml-cpu.h" +#include "ggml-alloc.h" +#include "ggml-backend.h" + +#include +#include +#include +#include + +using namespace paged; + +int main() { + const int d = 8; // head dim + const int n_kv = 48; // 3 blocks worth of KV tokens + const int n_q = 4; // query tokens + const int block_size = 16; + const int num_blocks = 8; + const int total_slots = block_size * num_blocks; + const float scale = 1.0f / std::sqrt((float) d); + + // Non-contiguous physical layout for the KV sequence (blocks [2,1,5]). + PagedKVManager m(num_blocks, block_size, /*enable_caching=*/false); + assert(m.allocate(0, 2 * block_size)); + assert(m.allocate(1, 2 * block_size)); + m.free(0); + assert(m.allocate(2, n_kv)); + std::vector positions(n_kv); + for (int i = 0; i < n_kv; ++i) positions[i] = i; + auto slots64 = m.slot_mapping(2, positions); + std::vector slots32(slots64.begin(), slots64.end()); + + // Deterministic K, V, Q in logical [d, n] layout (column-major: col = token). + std::vector K(d * n_kv), V(d * n_kv), Q(d * n_q); + for (int t = 0; t < n_kv; ++t) + for (int e = 0; e < d; ++e) { + K[t * d + e] = std::sin(0.1f * t + 0.3f * e); + V[t * d + e] = std::cos(0.2f * t - 0.1f * e); + } + for (int q = 0; q < n_q; ++q) + for (int e = 0; e < d; ++e) Q[q * d + e] = std::sin(0.05f * q + 0.7f * e); + + // ---- Independent host reference attention ------------------------------- + std::vector ref(d * n_q, 0.0f); + for (int q = 0; q < n_q; ++q) { + std::vector score(n_kv); + float mx = -1e30f; + for (int t = 0; t < n_kv; ++t) { + float dot = 0.0f; + for (int e = 0; e < d; ++e) dot += K[t * d + e] * Q[q * d + e]; + score[t] = dot * scale; + mx = std::fmax(mx, score[t]); + } + float sum = 0.0f; + for (int t = 0; t < n_kv; ++t) { score[t] = std::exp(score[t] - mx); sum += score[t]; } + for (int t = 0; t < n_kv; ++t) { + float p = score[t] / sum; + for (int e = 0; e < d; ++e) ref[q * d + e] += p * V[t * d + e]; + } + } + + // ---- ggml paged path ---------------------------------------------------- + ggml_backend_t backend = ggml_backend_cpu_init(); + struct ggml_init_params dp = { ggml_tensor_overhead() * 16, NULL, true }; + struct ggml_context * ctx_data = ggml_init(dp); + + struct ggml_tensor * poolK = ggml_new_tensor_2d(ctx_data, GGML_TYPE_F32, d, total_slots); + struct ggml_tensor * poolV = ggml_new_tensor_2d(ctx_data, GGML_TYPE_F32, d, total_slots); + struct ggml_tensor * kSrc = ggml_new_tensor_2d(ctx_data, GGML_TYPE_F32, d, n_kv); + struct ggml_tensor * vSrc = ggml_new_tensor_2d(ctx_data, GGML_TYPE_F32, d, n_kv); + struct ggml_tensor * qT = ggml_new_tensor_2d(ctx_data, GGML_TYPE_F32, d, n_q); + struct ggml_tensor * wIdx = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I64, n_kv); + struct ggml_tensor * gIdx = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I32, n_kv); + + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx_data, backend); + std::vector zeros(d * total_slots, 0.0f); + ggml_backend_tensor_set(poolK, zeros.data(), 0, ggml_nbytes(poolK)); + ggml_backend_tensor_set(poolV, zeros.data(), 0, ggml_nbytes(poolV)); + ggml_backend_tensor_set(kSrc, K.data(), 0, ggml_nbytes(kSrc)); + ggml_backend_tensor_set(vSrc, V.data(), 0, ggml_nbytes(vSrc)); + ggml_backend_tensor_set(qT, Q.data(), 0, ggml_nbytes(qT)); + ggml_backend_tensor_set(wIdx, slots64.data(), 0, ggml_nbytes(wIdx)); + ggml_backend_tensor_set(gIdx, slots32.data(), 0, ggml_nbytes(gIdx)); + + struct ggml_init_params cp = { ggml_tensor_overhead() * 64 + ggml_graph_overhead(), NULL, true }; + struct ggml_context * ctx = ggml_init(cp); + + struct ggml_tensor * wroteK = ggml_set_rows(ctx, poolK, kSrc, wIdx); + struct ggml_tensor * wroteV = ggml_set_rows(ctx, poolV, vSrc, wIdx); + struct ggml_tensor * gK = ggml_get_rows(ctx, wroteK, gIdx); // [d, n_kv] + struct ggml_tensor * gV = ggml_get_rows(ctx, wroteV, gIdx); // [d, n_kv] + + struct ggml_tensor * kq = ggml_mul_mat(ctx, gK, qT); // [n_kv, n_q] + struct ggml_tensor * probs = ggml_soft_max_ext(ctx, kq, NULL, scale, 0.0f); + struct ggml_tensor * vT = ggml_cont(ctx, ggml_transpose(ctx, gV)); // [n_kv, d] + struct ggml_tensor * out = ggml_mul_mat(ctx, vT, probs); // [d, n_q] + ggml_set_output(out); + + struct ggml_cgraph * gf = ggml_new_graph(ctx); + ggml_build_forward_expand(gf, out); + ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type()); + assert(ggml_gallocr_alloc_graph(galloc, gf)); + assert(ggml_backend_graph_compute(backend, gf) == GGML_STATUS_SUCCESS); + + std::vector got(d * n_q); + ggml_backend_tensor_get(out, got.data(), 0, ggml_nbytes(out)); + + // ---- compare ------------------------------------------------------------ + double max_err = 0.0; + for (int i = 0; i < d * n_q; ++i) max_err = std::fmax(max_err, std::fabs(got[i] - ref[i])); + printf("paged attention max abs err vs host reference: %.3e\n", max_err); + assert(max_err < 1e-4 && "paged-gathered attention must match host reference"); + + ggml_gallocr_free(galloc); + ggml_free(ctx); + ggml_free(ctx_data); + ggml_backend_buffer_free(buf); + ggml_backend_free(backend); + + printf("test_ggml_paged_attn: OK (attention over non-contiguous paged KV matches reference)\n"); + return 0; +}