diff --git a/backend/cpp/llama-cpp/patches/0003-paged-gather-read-env-LLAMA_KV_PAGED.patch b/backend/cpp/llama-cpp/patches/0003-paged-gather-read-env-LLAMA_KV_PAGED.patch new file mode 100644 index 000000000..4a3370988 --- /dev/null +++ b/backend/cpp/llama-cpp/patches/0003-paged-gather-read-env-LLAMA_KV_PAGED.patch @@ -0,0 +1,318 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Ettore Di Giacinto +Date: Mon, 22 Jun 2026 10:24:22 +0200 +Subject: [PATCH] paged gather-read (env LLAMA_KV_PAGED) - patch 0003 + +--- + src/CMakeLists.txt | 1 + + src/llama-graph.cpp | 9 +++- + src/llama-kv-cache.cpp | 51 ++++++++++++++++++++ + src/llama-kv-cache.h | 10 ++++ + src/paged-attn.cpp | 106 +++++++++++++++++++++++++++++++++++++++++ + src/paged-attn.h | 40 ++++++++++++++++ + 6 files changed, 216 insertions(+), 1 deletion(-) + create mode 100644 src/paged-attn.cpp + create mode 100644 src/paged-attn.h + +diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt +index a030940..58083b3 100644 +--- a/src/CMakeLists.txt ++++ b/src/CMakeLists.txt +@@ -25,6 +25,7 @@ add_library(llama + llama-kv-cache.cpp + llama-kv-cache-iswa.cpp + paged-kv-manager.cpp ++ paged-attn.cpp + llama-kv-cache-dsa.cpp + llama-memory.cpp + llama-memory-hybrid.cpp +diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp +index 68c9e60..b59d2a5 100644 +--- a/src/llama-graph.cpp ++++ b/src/llama-graph.cpp +@@ -6,6 +6,8 @@ + #include "llama-cparams.h" + + #include "llama-kv-cache.h" ++ ++#include "paged-attn.h" + #include "llama-kv-cache-iswa.h" + #include "llama-kv-cache-dsa.h" + #include "llama-memory-hybrid.h" +@@ -2356,7 +2358,12 @@ ggml_tensor * llm_graph_context::build_attn( + ggml_tensor * k = mctx_cur->get_k(ctx0, il); + ggml_tensor * v = mctx_cur->get_v(ctx0, il); + +- ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il); ++ // [paged 0003] gather K, V and the mask to the sequence's used cells only ++ // (no-op unless env LLAMA_KV_PAGED is set). 
++ ggml_tensor * kq_mask_g = kq_mask; ++ paged_attn::gather(ctx0, res, mctx_cur, &k, &v, &kq_mask_g); ++ ++ ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask_g, sinks, v_mla, kq_scale, il); + cb(cur, "kqv_out", il); + + if (inp->self_v_rot) { +diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp +index 999e2ae..2306013 100644 +--- a/src/llama-kv-cache.cpp ++++ b/src/llama-kv-cache.cpp +@@ -1,4 +1,6 @@ + #include "llama-kv-cache.h" ++#include ++#include + + #include "llama-impl.h" + #include "llama-io.h" +@@ -1329,6 +1331,47 @@ ggml_tensor * llama_kv_cache::get_v(ggml_context * ctx, int32_t il, uint32_t n_k + ggml_row_size(v->type, kv_size*n_embd_v_gqa)*sinfo.s0); + } + ++// [paged 0003] gather-read: enumerate the non-empty cells in [0, n_kv) for the ++// single stream addressed by sinfo. With paged placement (patch 0002) these are ++// the sequence's scattered block cells; gathering K/V/mask by this index list ++// compacts the attention read while preserving every unmasked (token,cell) pair. ++uint32_t llama_kv_cache::get_n_gather(uint32_t n_kv, const slot_info & sinfo) const { ++ GGML_ASSERT(sinfo.n_stream() == 1); ++ const auto & cells = v_cells[sinfo.strm[0]]; ++ const uint32_t n = std::min(n_kv, cells.size()); ++ uint32_t cnt = 0; ++ for (uint32_t i = 0; i < n; ++i) { ++ if (!cells.is_empty(i)) { ++ ++cnt; ++ } ++ } ++ return cnt; ++} ++ ++void llama_kv_cache::get_gather_idxs(int32_t * dst, uint32_t n_kv, const slot_info & sinfo) const { ++ GGML_ASSERT(sinfo.n_stream() == 1); ++ const auto & cells = v_cells[sinfo.strm[0]]; ++ const uint32_t n = std::min(n_kv, cells.size()); ++ // Collect the non-empty cells, then order them by token POSITION (not by ++ // physical cell index). The attention reduction (flash-attn online softmax, ++ // and the non-flash soft_max) runs over cells in array order and is ++ // order-sensitive in floating point. 
Stock (contiguous) placement happens ++ // to store cells in position order, so emitting the gathered indices in ++ // position order reproduces stock's exact reduction order - making the ++ // paged read bit-identical, not merely mathematically equivalent. ++ std::vector> pc; ++ pc.reserve(n); ++ for (uint32_t i = 0; i < n; ++i) { ++ if (!cells.is_empty(i)) { ++ pc.emplace_back(cells.pos_get(i), (int32_t) i); ++ } ++ } ++ std::sort(pc.begin(), pc.end()); ++ for (size_t j = 0; j < pc.size(); ++j) { ++ dst[j] = pc[j].second; ++ } ++} ++ + ggml_tensor * llama_kv_cache::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il, const slot_info & sinfo) const { + GGML_UNUSED(sinfo); + +@@ -2620,6 +2663,14 @@ ggml_tensor * llama_kv_cache_context::get_v(ggml_context * ctx, int32_t il) cons + return kv->get_v(ctx, il, n_kv, sinfos[i_cur]); + } + ++uint32_t llama_kv_cache_context::get_n_gather() const { ++ return kv->get_n_gather(n_kv, sinfos[i_cur]); ++} ++ ++void llama_kv_cache_context::get_gather_idxs(int32_t * dst) const { ++ kv->get_gather_idxs(dst, n_kv, sinfos[i_cur]); ++} ++ + ggml_tensor * llama_kv_cache_context::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il) const { + return kv->cpy_k(ctx, k_cur, k_idxs, il, sinfos[i_cur]); + } +diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h +index 3d68f98..1b81617 100644 +--- a/src/llama-kv-cache.h ++++ b/src/llama-kv-cache.h +@@ -171,6 +171,11 @@ public: + ggml_tensor * get_k(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const; + ggml_tensor * get_v(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const; + ++ // [paged 0003] paged-attn gather-read: count / list the non-empty cells in [0, n_kv) ++ // of sinfo's single stream (by token position); dst needs get_n_gather() slots. 
++ uint32_t get_n_gather(uint32_t n_kv, const slot_info & sinfo) const; ++ void get_gather_idxs(int32_t * dst, uint32_t n_kv, const slot_info & sinfo) const; ++ + // store k_cur and v_cur in the cache based on the provided head location + ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il, const slot_info & sinfo) const; + ggml_tensor * cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il, const slot_info & sinfo) const; +@@ -368,6 +373,11 @@ public: + ggml_tensor * get_k(ggml_context * ctx, int32_t il) const; + ggml_tensor * get_v(ggml_context * ctx, int32_t il) const; + ++ // [paged 0003] gather-read helpers (delegate to the kv cache for the ++ // current ubatch's stream). ++ uint32_t get_n_gather() const; ++ void get_gather_idxs(int32_t * dst) const; ++ + // store k_cur and v_cur in the cache based on the provided head location + // note: the heads in k_cur and v_cur should be laid out contiguously in memory + // - k_cur [n_embd_head_k, n_head_k, n_tokens] +diff --git a/src/paged-attn.cpp b/src/paged-attn.cpp +new file mode 100644 +index 0000000..4bbf244 +--- /dev/null ++++ b/src/paged-attn.cpp +@@ -0,0 +1,106 @@ ++#include "paged-attn.h" ++ ++#include "llama-graph.h" ++#include "llama-kv-cache.h" ++ ++#include "ggml.h" ++#include "ggml-backend.h" ++ ++#include ++ ++namespace paged_attn { ++ ++bool active() { ++ static const bool a = (std::getenv("LLAMA_KV_PAGED") != nullptr); ++ return a; ++} ++ ++namespace { ++ ++// Graph input that, at set_input time, fills an I32 [n_gather] tensor with the ++// current stream's non-empty cell indices (in token-position order) by delegating ++// to the kv-cache context. Private to this unit; default can_reuse()==false keeps ++// the graph from being reused across decodes (n_gather grows every step). 
++class input_gather_idxs : public llm_graph_input_i { ++public: ++ input_gather_idxs(const llama_kv_cache_context * mctx, ggml_tensor * idxs) ++ : mctx(mctx), idxs(idxs) {} ++ ++ void set_input(const llama_ubatch * ubatch) override { ++ GGML_UNUSED(ubatch); ++ GGML_ASSERT(idxs && ggml_backend_buffer_is_host(idxs->buffer) && idxs->ne[0] == (int64_t) mctx->get_n_gather()); ++ mctx->get_gather_idxs((int32_t *) idxs->data); ++ } ++ ++ const llama_kv_cache_context * mctx; ++ ggml_tensor * idxs; ++}; ++ ++} // namespace ++ ++void gather(ggml_context * ctx0, ++ llm_graph_result * res, ++ const llama_kv_cache_context * mctx, ++ ggml_tensor ** k, ++ ggml_tensor ** v, ++ ggml_tensor ** kq_mask) { ++ if (!active()) { ++ return; ++ } ++ ++ ggml_tensor * K = *k; ++ ggml_tensor * V = *v; ++ ggml_tensor * M = *kq_mask; ++ ++ // First cut: single stream only (multi-stream is a follow-up). ++ GGML_ASSERT(K->ne[3] == 1); ++ ++ const int64_t n_gather = (int64_t) mctx->get_n_gather(); ++ if (n_gather <= 0) { ++ // Worst-case graph reserve (empty cache) or nothing placed yet: leave ++ // the full [0, n_kv) read untouched so buffer sizing stays worst-case. ++ return; ++ } ++ ++ // Index tensor sized now, filled at set_input (which asserts the count still matches). ++ ggml_tensor * idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_gather); ++ ggml_set_input(idx); ++ res->add_input(llm_graph_input_ptr(new input_gather_idxs(mctx, idx))); ++ ++ // --- gather K: collapse (head_dim, n_head) so cells become the row axis --- ++ { ++ ggml_tensor * t = ggml_cont(ctx0, K); // [d, h, n_kv, 1] ++ t = ggml_reshape_3d(ctx0, t, K->ne[0]*K->ne[1], K->ne[2], 1); // [d*h, n_kv, 1] ++ t = ggml_get_rows(ctx0, t, idx); // [d*h, n_gather, 1] ++ *k = ggml_reshape_4d(ctx0, t, K->ne[0], K->ne[1], n_gather, 1); // [d, h, n_gather, 1] ++ } ++ ++ // --- gather V --- ++ // Normalize to a non-transposed [d, h, n_kv, 1] view first, so the gathered ++ // result is contiguous and build_attn_mha sees a consistent v_trans==false. 
++ { ++ const bool v_trans = V->nb[1] > V->nb[2]; ++ ggml_tensor * vsrc = v_trans ++ ? ggml_permute(ctx0, V, 2, 1, 0, 3) // [n_kv, h, d, 1] -> [d, h, n_kv, 1] ++ : V; // already [d, h, n_kv, 1] ++ ggml_tensor * t = ggml_cont(ctx0, vsrc); // [d, h, n_kv, 1] ++ t = ggml_reshape_3d(ctx0, t, vsrc->ne[0]*vsrc->ne[1], vsrc->ne[2], 1); // [d*h, n_kv, 1] ++ t = ggml_get_rows(ctx0, t, idx); // [d*h, n_gather, 1] ++ *v = ggml_reshape_4d(ctx0, t, vsrc->ne[0], vsrc->ne[1], n_gather, 1); // [d, h, n_gather, 1] ++ } ++ ++ // --- gather mask (cells are ne0): transpose, gather, transpose back --- ++ { ++ ggml_tensor * m = ggml_reshape_2d(ctx0, M, M->ne[0], M->ne[1]); // [n_kv, n_tps] ++ m = ggml_cont(ctx0, ggml_transpose(ctx0, m)); // [n_tps, n_kv] ++ m = ggml_get_rows(ctx0, m, idx); // [n_tps, n_gather] (F32) ++ m = ggml_cont(ctx0, ggml_transpose(ctx0, m)); // [n_gather, n_tps] ++ m = ggml_reshape_4d(ctx0, m, n_gather, M->ne[1], 1, 1); ++ if (M->type != m->type) { ++ m = ggml_cast(ctx0, m, M->type); // flash-attn requires an F16 mask ++ } ++ *kq_mask = m; ++ } ++} ++ ++} // namespace paged_attn +diff --git a/src/paged-attn.h b/src/paged-attn.h +new file mode 100644 +index 0000000..c5b7bd7 +--- /dev/null ++++ b/src/paged-attn.h +@@ -0,0 +1,40 @@ ++#pragma once ++// Paged attention gather-read (patch 0003, experimental). ++// ++// Companion to the paged block placement in llama_kv_cache::find_slot (patch ++// 0002). Patch 0002 places a sequence's tokens at permuted, non-contiguous ++// fixed-size block cells, but attention still reads the whole [0, n_kv) window ++// (empty cells masked to -inf). This unit compacts that read: it gathers K, V ++// and the kq_mask down to ONLY the sequence's used (non-empty) cells before ++// build_attn_mha. ++// ++// Correctness: dropping already-masked empty cells removes only exp(-inf)=0 ++// terms, and cells are gathered in token-position order so the floating-point ++// reduction order matches stock - greedy output is identical to stock. 
Gated behind env LLAMA_KV_PAGED; a no-op when unset. ++// ++// All logic lives here to keep the core files additive: build_attn gets one ++// call, llama_kv_cache_context gets two thin accessors, CMake gets one line. ++ ++#include ++ ++struct ggml_context; ++struct ggml_tensor; ++class llm_graph_result; ++class llama_kv_cache_context; ++ ++namespace paged_attn { ++ ++// true iff env LLAMA_KV_PAGED is set (evaluated once). ++bool active(); ++ ++// Gather K, V and the kq_mask down to the current stream's non-empty cells. ++// No-op unless active() and at least one cell is in use. Otherwise, on return ++// *k, *v and *kq_mask point at the compacted tensors for build_attn_mha. ++void gather(ggml_context * ctx0, ++ llm_graph_result * res, ++ const llama_kv_cache_context * mctx, ++ ggml_tensor ** k, ++ ggml_tensor ** v, ++ ggml_tensor ** kq_mask); ++ ++} // namespace paged_attn +-- +2.43.0 + diff --git a/backend/cpp/llama-cpp/patches/README.md b/backend/cpp/llama-cpp/patches/README.md index 238647d4a..99fa0b69a 100644 --- a/backend/cpp/llama-cpp/patches/README.md +++ b/backend/cpp/llama-cpp/patches/README.md @@ -56,7 +56,19 @@ All variants (avx/avx2/avx512/cuda/…) copy the patched `llama.cpp/` tree, so t - **0001 vendor manager — DONE.** Applies clean to the pin; builds into `libllama`. - **0002 block placement — DONE + VERIFIED.** Built `llama-simple` at the pin; greedy generation is **token-identical** stock vs `LLAMA_KV_PAGED=1` (Qwen3-0.6B), paged branch confirmed firing. -- **0003 gather-read — NEXT.** The intricate `build_attn` graph surgery; the real engine compute. Multi-session. 
+- **0003 gather-read — DONE + VERIFIED (Gate 0 green).** Implemented in the **additive** form + (`ADDITIVE_DESIGN.md`): all logic in new `src/paged-attn.{h,cpp}` (a `llm_graph_input_i` gather-index + subclass + the K/V/mask gather), hooked by **one** call in `build_attn` + **two** thin accessors on + `llama_kv_cache_context` + 1 CMake line (216 insertions; no edit to `llm_graph_input_attn_kv` or + `llama-graph.h`). Greedy generation is **token-identical** stock vs `LLAMA_KV_PAGED=1` (Qwen3-0.6B, + **9/9** across 3 prompts × {32,96,128} tokens), with `n_gather=71 < n_kv=256` confirming real + compaction. Patch: `0003-paged-gather-read-env-LLAMA_KV_PAGED.patch`. + - **Key correctness finding:** `get_gather_idxs` must emit cells **sorted by token position**. The CPU + flash-attn online softmax reduces cells in physical-array order and is FP-order-sensitive, so 0002's + scattered placement *alone* (full-window read, no gather) diverges from stock once a sequence crosses + the first 16-cell block. The position-sorted gather reproduces stock's exact reduction order -> + bit-identical, not merely mathematically equivalent. So 0002 is the placement substrate; **0003 is what + makes paged placement token-identical under flash-attn.** - 0004–0006 follow. ### Honest parity note (important)