diff --git a/backend/cpp/llama-cpp/patches/0003-paged-gather-read-env-LLAMA_KV_PAGED.patch b/backend/cpp/llama-cpp/patches/0003-paged-gather-read-env-LLAMA_KV_PAGED.patch
new file mode 100644
index 000000000..4a3370988
--- /dev/null
+++ b/backend/cpp/llama-cpp/patches/0003-paged-gather-read-env-LLAMA_KV_PAGED.patch
@@ -0,0 +1,318 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Ettore Di Giacinto <mudler@localai.io>
+Date: Mon, 22 Jun 2026 10:24:22 +0200
+Subject: [PATCH] paged gather-read (env LLAMA_KV_PAGED) - patch 0003
+
+---
+ src/CMakeLists.txt     |   1 +
+ src/llama-graph.cpp    |   9 +++-
+ src/llama-kv-cache.cpp |  51 ++++++++++++++++++++
+ src/llama-kv-cache.h   |  10 ++++
+ src/paged-attn.cpp     | 106 +++++++++++++++++++++++++++++++++++++++++
+ src/paged-attn.h       |  40 ++++++++++++++++
+ 6 files changed, 216 insertions(+), 1 deletion(-)
+ create mode 100644 src/paged-attn.cpp
+ create mode 100644 src/paged-attn.h
+
+diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
+index a030940..58083b3 100644
+--- a/src/CMakeLists.txt
++++ b/src/CMakeLists.txt
+@@ -25,6 +25,7 @@ add_library(llama
+             llama-kv-cache.cpp
+             llama-kv-cache-iswa.cpp
+             paged-kv-manager.cpp
++            paged-attn.cpp
+             llama-kv-cache-dsa.cpp
+             llama-memory.cpp
+             llama-memory-hybrid.cpp
+diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
+index 68c9e60..b59d2a5 100644
+--- a/src/llama-graph.cpp
++++ b/src/llama-graph.cpp
+@@ -6,6 +6,8 @@
+ #include "llama-cparams.h"
+ 
+ #include "llama-kv-cache.h"
++
++#include "paged-attn.h"
+ #include "llama-kv-cache-iswa.h"
+ #include "llama-kv-cache-dsa.h"
+ #include "llama-memory-hybrid.h"
+@@ -2356,7 +2358,12 @@ ggml_tensor * llm_graph_context::build_attn(
+     ggml_tensor * k = mctx_cur->get_k(ctx0, il);
+     ggml_tensor * v = mctx_cur->get_v(ctx0, il);
+ 
+-    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
++    // [paged 0003] gather K, V and the mask to the sequence's used cells only
++    //   (no-op unless env LLAMA_KV_PAGED is set).
++    ggml_tensor * kq_mask_g = kq_mask;
++    paged_attn::gather(ctx0, res, mctx_cur, &k, &v, &kq_mask_g);
++
++    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask_g, sinks, v_mla, kq_scale, il);
+     cb(cur, "kqv_out", il);
+ 
+     if (inp->self_v_rot) {
+diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
+index 999e2ae..2306013 100644
+--- a/src/llama-kv-cache.cpp
++++ b/src/llama-kv-cache.cpp
+@@ -1,4 +1,6 @@
+ #include "llama-kv-cache.h"
++#include <vector>
++#include <utility>
+ 
+ #include "llama-impl.h"
+ #include "llama-io.h"
+@@ -1329,6 +1331,47 @@ ggml_tensor * llama_kv_cache::get_v(ggml_context * ctx, int32_t il, uint32_t n_k
+             ggml_row_size(v->type, kv_size*n_embd_v_gqa)*sinfo.s0);
+ }
+ 
++// [paged 0003] gather-read: enumerate the non-empty cells in [0, n_kv) for the
++// single stream addressed by sinfo. With paged placement (patch 0002) these are
++// the sequence's scattered block cells; gathering K/V/mask by this index list
++// compacts the attention read while preserving every unmasked (token,cell) pair.
++uint32_t llama_kv_cache::get_n_gather(uint32_t n_kv, const slot_info & sinfo) const {
++    GGML_ASSERT(sinfo.n_stream() == 1); // gather-read only handles a single-stream slot
++    const auto & cells = v_cells[sinfo.strm[0]];
++    const uint32_t n = std::min<uint32_t>(n_kv, cells.size()); // never scan past the stream's cells
++    uint32_t cnt = 0; // number of non-empty cells found in [0, n)
++    for (uint32_t i = 0; i < n; ++i) {
++        if (!cells.is_empty(i)) {
++            ++cnt;
++        }
++    }
++    return cnt; // same count get_gather_idxs writes for these arguments (on unchanged cache state)
++}
++
++void llama_kv_cache::get_gather_idxs(int32_t * dst, uint32_t n_kv, const slot_info & sinfo) const {
++    GGML_ASSERT(sinfo.n_stream() == 1); // single-stream slots only, same restriction as get_n_gather
++    const auto & cells = v_cells[sinfo.strm[0]];
++    const uint32_t n = std::min<uint32_t>(n_kv, cells.size());
++    // Collect the non-empty cells, then order them by token POSITION (not by
++    // physical cell index). The attention reduction (flash-attn online softmax,
++    // and the non-flash soft_max) runs over cells in array order and is
++    // order-sensitive in floating point. Stock (contiguous) placement happens
++    // to store cells in position order, so emitting the gathered indices in
++    // position order reproduces stock's exact reduction order - making the
++    // paged read bit-identical, not merely mathematically equivalent.
++    std::vector<std::pair<llama_pos, int32_t>> pc; // (token position, cell index) pairs
++    pc.reserve(n);
++    for (uint32_t i = 0; i < n; ++i) {
++        if (!cells.is_empty(i)) {
++            pc.emplace_back(cells.pos_get(i), (int32_t) i);
++        }
++    }
++    std::sort(pc.begin(), pc.end()); // pair ordering: by position, ties broken by cell index
++    for (size_t j = 0; j < pc.size(); ++j) {
++        dst[j] = pc[j].second; // no bounds check: dst must hold one entry per non-empty cell
++    }
++}
++
+ ggml_tensor * llama_kv_cache::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il, const slot_info & sinfo) const {
+     GGML_UNUSED(sinfo);
+ 
+@@ -2620,6 +2663,14 @@ ggml_tensor * llama_kv_cache_context::get_v(ggml_context * ctx, int32_t il) cons
+     return kv->get_v(ctx, il, n_kv, sinfos[i_cur]);
+ }
+ 
++uint32_t llama_kv_cache_context::get_n_gather() const {
++    return kv->get_n_gather(n_kv, sinfos[i_cur]); // forwards this context's n_kv and the slot info at i_cur
++}
++
++void llama_kv_cache_context::get_gather_idxs(int32_t * dst) const {
++    kv->get_gather_idxs(dst, n_kv, sinfos[i_cur]); // dst is passed through unchecked; the caller sizes it
++}
++
+ ggml_tensor * llama_kv_cache_context::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il) const {
+     return kv->cpy_k(ctx, k_cur, k_idxs, il, sinfos[i_cur]);
+ }
+diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h
+index 3d68f98..1b81617 100644
+--- a/src/llama-kv-cache.h
++++ b/src/llama-kv-cache.h
+@@ -171,6 +171,11 @@ public:
+     ggml_tensor * get_k(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const;
+     ggml_tensor * get_v(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const;
+ 
++    // [paged 0003] count / list the non-empty cells in [0, n_kv) for the single
++    //   stream of sinfo (listed in token-position order). Used by paged-attn gather-read.
++    uint32_t get_n_gather(uint32_t n_kv, const slot_info & sinfo) const;
++    void     get_gather_idxs(int32_t * dst, uint32_t n_kv, const slot_info & sinfo) const;
++
+     // store k_cur and v_cur in the cache based on the provided head location
+     ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il, const slot_info & sinfo) const;
+     ggml_tensor * cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il, const slot_info & sinfo) const;
+@@ -368,6 +373,11 @@ public:
+     ggml_tensor * get_k(ggml_context * ctx, int32_t il) const;
+     ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;
+ 
++    // [paged 0003] gather-read helpers (delegate to the kv cache for the
++    //   current ubatch's stream).
++    uint32_t get_n_gather() const;
++    void     get_gather_idxs(int32_t * dst) const;
++
+     // store k_cur and v_cur in the cache based on the provided head location
+     // note: the heads in k_cur and v_cur should be laid out contiguously in memory
+     //   - k_cur  [n_embd_head_k, n_head_k, n_tokens]
+diff --git a/src/paged-attn.cpp b/src/paged-attn.cpp
+new file mode 100644
+index 0000000..4bbf244
+--- /dev/null
++++ b/src/paged-attn.cpp
+@@ -0,0 +1,106 @@
++#include "paged-attn.h"
++
++#include "llama-graph.h"
++#include "llama-kv-cache.h"
++
++#include "ggml.h"
++#include "ggml-backend.h"
++
++#include <cstdlib>
++
++namespace paged_attn {
++
++bool active() {
++    static const bool a = (std::getenv("LLAMA_KV_PAGED") != nullptr); // read once on first call; any value, even empty, enables it
++    return a;
++}
++
++namespace {
++
++// Graph input that, at set_input time, fills an I32 [n_gather] tensor with the
++// non-empty cell indices, ordered by token position (not ascending cell index),
++// by delegating to the kv-cache context. Private to this unit; default can_reuse()==false
++// keeps the graph from being reused across decodes (n_gather grows every step).
++class input_gather_idxs : public llm_graph_input_i {
++public:
++    input_gather_idxs(const llama_kv_cache_context * mctx, ggml_tensor * idxs)
++        : mctx(mctx), idxs(idxs) {}
++
++    void set_input(const llama_ubatch * ubatch) override {
++        GGML_UNUSED(ubatch);
++        GGML_ASSERT(idxs && ggml_backend_buffer_is_host(idxs->buffer)); // written through idxs->data below, so it must be host memory
++        mctx->get_gather_idxs((int32_t *) idxs->data);
++    }
++
++    const llama_kv_cache_context * mctx; // source of the indices; not freed by this class
++    ggml_tensor * idxs;                  // destination index tensor; not freed by this class
++};
++
++} // namespace
++
++void gather(ggml_context * ctx0,
++            llm_graph_result * res,
++            const llama_kv_cache_context * mctx,
++            ggml_tensor ** k,
++            ggml_tensor ** v,
++            ggml_tensor ** kq_mask) {
++    if (!active()) { // env gate off: *k, *v and *kq_mask are left untouched
++        return;
++    }
++
++    ggml_tensor * K = *k;
++    ggml_tensor * V = *v;
++    ggml_tensor * M = *kq_mask;
++
++    // First cut: single stream only (multi-stream is a follow-up).
++    GGML_ASSERT(K->ne[3] == 1);
++
++    const int64_t n_gather = (int64_t) mctx->get_n_gather(); // counted at graph-build time; set_input must write exactly this many indices
++    if (n_gather <= 0) {
++        // Worst-case graph reserve (empty cache) or nothing placed yet: leave
++        // the full [0, n_kv) read untouched so buffer sizing stays worst-case.
++        return;
++    }
++
++    // Index tensor, filled at set_input with the non-empty cells in token-position order.
++    ggml_tensor * idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_gather);
++    ggml_set_input(idx);
++    res->add_input(llm_graph_input_ptr(new input_gather_idxs(mctx, idx)));
++
++    // --- gather K: collapse (head_dim, n_head) so cells become the row axis ---
++    {
++        ggml_tensor * t = ggml_cont(ctx0, K);                                  // [d, h, n_kv, 1]
++        t = ggml_reshape_3d(ctx0, t, K->ne[0]*K->ne[1], K->ne[2], 1);          // [d*h, n_kv, 1]
++        t = ggml_get_rows(ctx0, t, idx);                                       // [d*h, n_gather, 1]
++        *k = ggml_reshape_4d(ctx0, t, K->ne[0], K->ne[1], n_gather, 1);        // [d, h, n_gather, 1]
++    }
++
++    // --- gather V ---
++    // Normalize to a non-transposed [d, h, n_kv, 1] view first, so the gathered
++    // result is contiguous and build_attn_mha sees a consistent v_trans==false.
++    {
++        const bool v_trans = V->nb[1] > V->nb[2]; // NOTE(review): treats V as transposed when the dim-1 stride exceeds the dim-2 stride - confirm
++        ggml_tensor * vsrc = v_trans
++            ? ggml_permute(ctx0, V, 2, 1, 0, 3)   // [n_kv, h, d, 1] -> [d, h, n_kv, 1]
++            : V;                                  // already [d, h, n_kv, 1]
++        ggml_tensor * t = ggml_cont(ctx0, vsrc);                               // [d, h, n_kv, 1]
++        t = ggml_reshape_3d(ctx0, t, vsrc->ne[0]*vsrc->ne[1], vsrc->ne[2], 1); // [d*h, n_kv, 1]
++        t = ggml_get_rows(ctx0, t, idx);                                       // [d*h, n_gather, 1]
++        *v = ggml_reshape_4d(ctx0, t, vsrc->ne[0], vsrc->ne[1], n_gather, 1);  // [d, h, n_gather, 1]
++    }
++
++    // --- gather mask (cells are ne0): transpose, gather, transpose back ---
++    {
++        ggml_tensor * m = ggml_reshape_2d(ctx0, M, M->ne[0], M->ne[1]);        // [n_kv, n_tps]
++        m = ggml_cont(ctx0, ggml_transpose(ctx0, m));                          // [n_tps, n_kv]
++        m = ggml_get_rows(ctx0, m, idx);                                       // [n_tps, n_gather] (F32)
++        m = ggml_cont(ctx0, ggml_transpose(ctx0, m));                          // [n_gather, n_tps]
++        m = ggml_reshape_4d(ctx0, m, n_gather, M->ne[1], 1, 1); // NOTE(review): assumes M->ne[2] == M->ne[3] == 1 - confirm
++        if (M->type != m->type) {
++            m = ggml_cast(ctx0, m, M->type);   // flash-attn requires an F16 mask
++        }
++        *kq_mask = m;
++    }
++}
++
++} // namespace paged_attn
+diff --git a/src/paged-attn.h b/src/paged-attn.h
+new file mode 100644
+index 0000000..c5b7bd7
+--- /dev/null
++++ b/src/paged-attn.h
+@@ -0,0 +1,40 @@
++#pragma once
++// Paged attention gather-read (patch 0003, experimental).
++//
++// Companion to the paged block placement in llama_kv_cache::find_slot (patch
++// 0002). Patch 0002 places a sequence's tokens at permuted, non-contiguous
++// fixed-size block cells, but attention still reads the whole [0, n_kv) window
++// (empty cells masked to -inf). This unit compacts that read: it gathers K, V
++// and the kq_mask down to ONLY the sequence's used (non-empty) cells before
++// build_attn_mha.
++//
++// Correctness: attention is mathematically permutation-invariant over the KV set, and dropping already-masked
++// empty cells removes only exp(-inf)=0 terms; cells are gathered in token-position order so the FP reduction order
++// also matches stock - so greedy output is identical. Gated behind env LLAMA_KV_PAGED; a no-op when unset.
++//
++// All logic lives here to keep the core files additive: build_attn gets one
++// call, llama_kv_cache_context gets two thin accessors, CMake gets one line.
++
++#include <cstdint>
++
++struct ggml_context;
++struct ggml_tensor;
++class  llm_graph_result;
++class  llama_kv_cache_context;
++
++namespace paged_attn {
++
++// true iff env LLAMA_KV_PAGED is set (evaluated once).
++bool active();
++
++// Gather K, V and the kq_mask down to the current sequence's non-empty cells.
++// No-op (returns immediately) unless active(). On return *k, *v and *kq_mask
++// point at the compacted tensors; pass them straight to build_attn_mha.
++void gather(ggml_context * ctx0,
++            llm_graph_result * res,
++            const llama_kv_cache_context * mctx,
++            ggml_tensor ** k,
++            ggml_tensor ** v,
++            ggml_tensor ** kq_mask);
++
++} // namespace paged_attn
+-- 
+2.43.0
+
diff --git a/backend/cpp/llama-cpp/patches/README.md b/backend/cpp/llama-cpp/patches/README.md
index 238647d4a..99fa0b69a 100644
--- a/backend/cpp/llama-cpp/patches/README.md
+++ b/backend/cpp/llama-cpp/patches/README.md
@@ -56,7 +56,19 @@ All variants (avx/avx2/avx512/cuda/…) copy the patched `llama.cpp/` tree, so t
 - **0001 vendor manager — DONE.** Applies clean to the pin; builds into `libllama`.
 - **0002 block placement — DONE + VERIFIED.** Built `llama-simple` at the pin; greedy generation is
   **token-identical** stock vs `LLAMA_KV_PAGED=1` (Qwen3-0.6B), paged branch confirmed firing.
-- **0003 gather-read — NEXT.** The intricate `build_attn` graph surgery; the real engine compute. Multi-session.
+- **0003 gather-read — DONE + VERIFIED (Gate 0 green).** Implemented in the **additive** form
+  (`ADDITIVE_DESIGN.md`): all logic in new `src/paged-attn.{h,cpp}` (a `llm_graph_input_i` gather-index
+  subclass + the K/V/mask gather), hooked by **one** line in `build_attn` + **two** thin accessors on
+  `llama_kv_cache_context` + 1 CMake line (216 insertions; no edit to `llm_graph_input_attn_kv` or
+  `llama-graph.h`). Greedy generation is **token-identical** stock vs `LLAMA_KV_PAGED=1` (Qwen3-0.6B,
+  **9/9** across 3 prompts × {32,96,128} tokens), with `n_gather=71 < n_kv=256` confirming real
+  compaction. Patch: `0003-paged-gather-read-env-LLAMA_KV_PAGED.patch`.
+  - **Key correctness finding:** `get_gather_idxs` must emit cells **sorted by token position**. The CPU
+    flash-attn online softmax reduces cells in physical-array order and is FP-order-sensitive, so 0002's
+    scattered placement *alone* (full-window read, no gather) diverges from stock once a sequence crosses
+    the first 16-cell block. The position-sorted gather reproduces stock's exact reduction order ->
+    bit-identical, not merely mathematically equivalent. So 0002 is the placement substrate; **0003 is
+    what makes paged placement token-identical under flash-attn.**
 - 0004–0006 follow.
 
 ### Honest parity note (important)