feat(paged): Phase 2 core - attention over paged KV matches reference

Retire the central numeric risk from the design: feeding gather-to-scratch KV (a sequence whose blocks are non-contiguous in the shared pool, [2,1,5]) into ggml's standard attention ops produces correct attention. Path under test: set_rows write -> get_rows gather (K and V) -> mul_mat(K,Q) -> soft_max_ext -> mul_mat(V^T, probs). Result is compared against an independent host-computed softmax attention over the same K/V/Q. Max abs error ~7.5e-08 (n_kv=48, d=8, n_q=4). This proves the paged read path is numerically sound on CPU with no new ggml op. Remaining: wire build_attn_paged into llama-graph.cpp and validate Gate 0 (token-identical greedy generation in a real model). Phase 2 (core) of docs/superpowers/plans/2026-06-19-paged-attention-llamacpp.md. Assisted-by: Claude:opus-4.8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-06-23 08:08:52 -04:00 · 2026-06-19 08:35:35 +00:00
parent c6698dd4bf
commit 5a5d3df8c8
3 changed files with 141 additions and 4 deletions
--- a/backend/cpp/llama-cpp/paged/.gitignore
+++ b/backend/cpp/llama-cpp/paged/.gitignore
@@ -3,3 +3,4 @@ tests/test_block_pool
 tests/test_paged_kv_manager
 tests/test_prefix_cache
 tests/test_ggml_paged_rw
+tests/test_ggml_paged_attn
--- a/backend/cpp/llama-cpp/paged/Makefile
+++ b/backend/cpp/llama-cpp/paged/Makefile
@@ -19,14 +19,17 @@ GGML_SRC   ?= ../../llama-cpp-fallback-build/llama.cpp/ggml
 GGML_BUILD ?= /tmp/ggml-build
 GGML_LIBDIR = $(GGML_BUILD)/src

-tests/test_ggml_paged_rw: tests/test_ggml_paged_rw.cpp paged_kv_manager.cpp paged_kv_manager.h
+GGML_TESTS = test_ggml_paged_rw test_ggml_paged_attn
+GGML_BINS  = $(addprefix tests/,$(GGML_TESTS))
+
+tests/test_ggml_%: tests/test_ggml_%.cpp paged_kv_manager.cpp paged_kv_manager.h
 	$(CXX) $(CXXFLAGS) -I$(GGML_SRC)/include -o $@ $< paged_kv_manager.cpp \
 		-L$(GGML_LIBDIR) -lggml -lggml-base -lggml-cpu -Wl,-rpath,$(GGML_LIBDIR)

-ggml-check: tests/test_ggml_paged_rw
-	@echo "== tests/test_ggml_paged_rw =="; ./tests/test_ggml_paged_rw
+ggml-check: $(GGML_BINS)
+	@for t in $(GGML_BINS); do echo "== $$t =="; ./$$t || exit 1; done

 clean:
-	rm -f $(BINS) tests/test_ggml_paged_rw
+	rm -f $(BINS) $(GGML_BINS)

 .PHONY: all check ggml-check clean
--- a/backend/cpp/llama-cpp/paged/tests/test_ggml_paged_attn.cpp
+++ b/backend/cpp/llama-cpp/paged/tests/test_ggml_paged_attn.cpp
@@ -0,0 +1,133 @@
+// Phase 2 (core numeric de-risk): attention over GATHERED paged KV must equal
+// an independent host-computed reference.
+//
+// This answers the central risk in the design: feeding gather-to-scratch KV
+// (a sequence whose blocks are non-contiguous in the shared pool) into ggml's
+// standard attention ops (mul_mat -> soft_max_ext -> mul_mat) produces correct
+// attention. If this holds, the paged read path is numerically sound; the
+// remaining work is wiring it into llama-graph.cpp (Gate 0 in a real model).
+
+#include "../paged_kv_manager.h"
+
+#include "ggml.h"
+#include "ggml-cpu.h"
+#include "ggml-alloc.h"
+#include "ggml-backend.h"
+
+#include <cassert>
+#include <cstdio>
+#include <cmath>
+#include <vector>
+
+using namespace paged;
+
+int main() {
+    const int d          = 8;     // head dim
+    const int n_kv       = 48;    // 3 blocks worth of KV tokens
+    const int n_q        = 4;     // query tokens
+    const int block_size = 16;
+    const int num_blocks = 8;
+    const int total_slots = block_size * num_blocks;
+    const float scale = 1.0f / std::sqrt((float) d);
+
+    // Non-contiguous physical layout for the KV sequence (blocks [2,1,5]).
+    PagedKVManager m(num_blocks, block_size, /*enable_caching=*/false);
+    assert(m.allocate(0, 2 * block_size));
+    assert(m.allocate(1, 2 * block_size));
+    m.free(0);
+    assert(m.allocate(2, n_kv));
+    std::vector<int> positions(n_kv);
+    for (int i = 0; i < n_kv; ++i) positions[i] = i;
+    auto slots64 = m.slot_mapping(2, positions);
+    std::vector<int32_t> slots32(slots64.begin(), slots64.end());
+
+    // Deterministic K, V, Q in logical [d, n] layout (column-major: col = token).
+    std::vector<float> K(d * n_kv), V(d * n_kv), Q(d * n_q);
+    for (int t = 0; t < n_kv; ++t)
+        for (int e = 0; e < d; ++e) {
+            K[t * d + e] = std::sin(0.1f * t + 0.3f * e);
+            V[t * d + e] = std::cos(0.2f * t - 0.1f * e);
+        }
+    for (int q = 0; q < n_q; ++q)
+        for (int e = 0; e < d; ++e) Q[q * d + e] = std::sin(0.05f * q + 0.7f * e);
+
+    // ---- Independent host reference attention -------------------------------
+    std::vector<float> ref(d * n_q, 0.0f);
+    for (int q = 0; q < n_q; ++q) {
+        std::vector<float> score(n_kv);
+        float mx = -1e30f;
+        for (int t = 0; t < n_kv; ++t) {
+            float dot = 0.0f;
+            for (int e = 0; e < d; ++e) dot += K[t * d + e] * Q[q * d + e];
+            score[t] = dot * scale;
+            mx = std::fmax(mx, score[t]);
+        }
+        float sum = 0.0f;
+        for (int t = 0; t < n_kv; ++t) { score[t] = std::exp(score[t] - mx); sum += score[t]; }
+        for (int t = 0; t < n_kv; ++t) {
+            float p = score[t] / sum;
+            for (int e = 0; e < d; ++e) ref[q * d + e] += p * V[t * d + e];
+        }
+    }
+
+    // ---- ggml paged path ----------------------------------------------------
+    ggml_backend_t backend = ggml_backend_cpu_init();
+    struct ggml_init_params dp = { ggml_tensor_overhead() * 16, NULL, true };
+    struct ggml_context * ctx_data = ggml_init(dp);
+
+    struct ggml_tensor * poolK = ggml_new_tensor_2d(ctx_data, GGML_TYPE_F32, d, total_slots);
+    struct ggml_tensor * poolV = ggml_new_tensor_2d(ctx_data, GGML_TYPE_F32, d, total_slots);
+    struct ggml_tensor * kSrc  = ggml_new_tensor_2d(ctx_data, GGML_TYPE_F32, d, n_kv);
+    struct ggml_tensor * vSrc  = ggml_new_tensor_2d(ctx_data, GGML_TYPE_F32, d, n_kv);
+    struct ggml_tensor * qT    = ggml_new_tensor_2d(ctx_data, GGML_TYPE_F32, d, n_q);
+    struct ggml_tensor * wIdx  = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I64, n_kv);
+    struct ggml_tensor * gIdx  = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I32, n_kv);
+
+    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx_data, backend);
+    std::vector<float> zeros(d * total_slots, 0.0f);
+    ggml_backend_tensor_set(poolK, zeros.data(), 0, ggml_nbytes(poolK));
+    ggml_backend_tensor_set(poolV, zeros.data(), 0, ggml_nbytes(poolV));
+    ggml_backend_tensor_set(kSrc, K.data(), 0, ggml_nbytes(kSrc));
+    ggml_backend_tensor_set(vSrc, V.data(), 0, ggml_nbytes(vSrc));
+    ggml_backend_tensor_set(qT,   Q.data(), 0, ggml_nbytes(qT));
+    ggml_backend_tensor_set(wIdx, slots64.data(), 0, ggml_nbytes(wIdx));
+    ggml_backend_tensor_set(gIdx, slots32.data(), 0, ggml_nbytes(gIdx));
+
+    struct ggml_init_params cp = { ggml_tensor_overhead() * 64 + ggml_graph_overhead(), NULL, true };
+    struct ggml_context * ctx = ggml_init(cp);
+
+    struct ggml_tensor * wroteK = ggml_set_rows(ctx, poolK, kSrc, wIdx);
+    struct ggml_tensor * wroteV = ggml_set_rows(ctx, poolV, vSrc, wIdx);
+    struct ggml_tensor * gK = ggml_get_rows(ctx, wroteK, gIdx);          // [d, n_kv]
+    struct ggml_tensor * gV = ggml_get_rows(ctx, wroteV, gIdx);          // [d, n_kv]
+
+    struct ggml_tensor * kq    = ggml_mul_mat(ctx, gK, qT);              // [n_kv, n_q]
+    struct ggml_tensor * probs = ggml_soft_max_ext(ctx, kq, NULL, scale, 0.0f);
+    struct ggml_tensor * vT    = ggml_cont(ctx, ggml_transpose(ctx, gV)); // [n_kv, d]
+    struct ggml_tensor * out   = ggml_mul_mat(ctx, vT, probs);           // [d, n_q]
+    ggml_set_output(out);
+
+    struct ggml_cgraph * gf = ggml_new_graph(ctx);
+    ggml_build_forward_expand(gf, out);
+    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
+    assert(ggml_gallocr_alloc_graph(galloc, gf));
+    assert(ggml_backend_graph_compute(backend, gf) == GGML_STATUS_SUCCESS);
+
+    std::vector<float> got(d * n_q);
+    ggml_backend_tensor_get(out, got.data(), 0, ggml_nbytes(out));
+
+    // ---- compare ------------------------------------------------------------
+    double max_err = 0.0;
+    for (int i = 0; i < d * n_q; ++i) max_err = std::fmax(max_err, std::fabs(got[i] - ref[i]));
+    printf("paged attention max abs err vs host reference: %.3e\n", max_err);
+    assert(max_err < 1e-4 && "paged-gathered attention must match host reference");
+
+    ggml_gallocr_free(galloc);
+    ggml_free(ctx);
+    ggml_free(ctx_data);
+    ggml_backend_buffer_free(buf);
+    ggml_backend_free(backend);
+
+    printf("test_ggml_paged_attn: OK (attention over non-contiguous paged KV matches reference)\n");
+    return 0;
+}