mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-30 11:26:32 -04:00
Compare commits
201 Commits
fix/watchd
...
worktree-f
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
1b9176c2c8 | ||
|
|
2033086f60 | ||
|
|
8bb47e5a8a | ||
|
|
2431090ff3 | ||
|
|
baf1025245 | ||
|
|
6edbb56b06 | ||
|
|
bd100dd20a | ||
|
|
be65438eac | ||
|
|
7b38c6b2a3 | ||
|
|
042deab40e | ||
|
|
c4058eb4da | ||
|
|
f1c98ff0b9 | ||
|
|
b028c81eda | ||
|
|
2fa8ef8fc5 | ||
|
|
d706980c2b | ||
|
|
000705321f | ||
|
|
4bdd26a7f0 | ||
|
|
9a28f23134 | ||
|
|
e610347367 | ||
|
|
11128cb080 | ||
|
|
4cd90bfae9 | ||
|
|
2c59805267 | ||
|
|
c51ff4cec9 | ||
|
|
ea72a56e2c | ||
|
|
1f3e5ba301 | ||
|
|
4da769c1ca | ||
|
|
23b11a5239 | ||
|
|
9bb8994c4e | ||
|
|
0b84fda496 | ||
|
|
1431f72b92 | ||
|
|
266fcc79ad | ||
|
|
3466094c68 | ||
|
|
ed5eb705c7 | ||
|
|
53f66a6f03 | ||
|
|
08b754f910 | ||
|
|
db14006fcd | ||
|
|
a4e730979d | ||
|
|
9115c2c52c | ||
|
|
984c8fcbea | ||
|
|
4a9a1dd247 | ||
|
|
78fac9a28f | ||
|
|
fb2dc33d52 | ||
|
|
a5a5b2ad80 | ||
|
|
7e1832b868 | ||
|
|
2bee7a5ab1 | ||
|
|
e160041f05 | ||
|
|
400930db19 | ||
|
|
202a29f980 | ||
|
|
621a20d2b5 | ||
|
|
2332587fdc | ||
|
|
af6e133759 | ||
|
|
87cfd1fadb | ||
|
|
2a2de1d6c1 | ||
|
|
5667dfe461 | ||
|
|
34abf392fc | ||
|
|
683e22500f | ||
|
|
db6ebc53b2 | ||
|
|
9b0e4e544c | ||
|
|
e3f8149f3b | ||
|
|
9a1be79f04 | ||
|
|
62c407ed55 | ||
|
|
c1f1d1e8ea | ||
|
|
6dd8a3d895 | ||
|
|
79edfd26a3 | ||
|
|
bf9b4fafa8 | ||
|
|
b1667b48ea | ||
|
|
6c6a925213 | ||
|
|
3b59571579 | ||
|
|
b3d3323105 | ||
|
|
9c1c2a6a16 | ||
|
|
1f857f179e | ||
|
|
33dfe7fd41 | ||
|
|
fe5bd3f53d | ||
|
|
6bfca146d6 | ||
|
|
4d3fecd524 | ||
|
|
ec7c1b1f68 | ||
|
|
30a2b590d9 | ||
|
|
167768cac3 | ||
|
|
125d10a782 | ||
|
|
b061e4aef0 | ||
|
|
89e62fc74f | ||
|
|
001d833426 | ||
|
|
00f92659f8 | ||
|
|
7dd3431040 | ||
|
|
ae0042f214 | ||
|
|
aaaa90ae4b | ||
|
|
7c45447c9e | ||
|
|
24833f0966 | ||
|
|
634c0e5a0f | ||
|
|
64766ecc85 | ||
|
|
02cbae5ea9 | ||
|
|
3c1ed67b4b | ||
|
|
8f8777e0f4 | ||
|
|
5cec1a6a21 | ||
|
|
17855735c7 | ||
|
|
2a8103c419 | ||
|
|
fd4332e8f0 | ||
|
|
5825b073a5 | ||
|
|
a72385257a | ||
|
|
2b57997df0 | ||
|
|
e597a8ac78 | ||
|
|
b895f4dff8 | ||
|
|
c0e0ed3865 | ||
|
|
ee13fd18ce | ||
|
|
6f0792c3be | ||
|
|
5ce2f1df51 | ||
|
|
34cadb64af | ||
|
|
2dd5d68e6d | ||
|
|
da67fd87e2 | ||
|
|
40f019e761 | ||
|
|
39e16cc2c4 | ||
|
|
7434d64c75 | ||
|
|
c1d7f336cb | ||
|
|
ea634ee958 | ||
|
|
e4c63179e0 | ||
|
|
f7500df64e | ||
|
|
24ce7d0823 | ||
|
|
fccbb4082d | ||
|
|
5a38dd3f09 | ||
|
|
ed17fc804e | ||
|
|
362eea90ff | ||
|
|
c7075fb796 | ||
|
|
c8b1f16507 | ||
|
|
2975a74fb4 | ||
|
|
ee78ae4a11 | ||
|
|
acb22a66ed | ||
|
|
010067d900 | ||
|
|
8925c009b7 | ||
|
|
a3abd60ae0 | ||
|
|
dd6a4425e0 | ||
|
|
4bc2b4a9b2 | ||
|
|
ba6bd94976 | ||
|
|
e983919516 | ||
|
|
2c5adda28c | ||
|
|
ee13a94a8c | ||
|
|
4dcbcfcf92 | ||
|
|
80e0c1ac6b | ||
|
|
52f0f7b8cf | ||
|
|
f347f7ca1d | ||
|
|
0dd45f0da5 | ||
|
|
9537726649 | ||
|
|
d1ba327843 | ||
|
|
ecffd4b097 | ||
|
|
67c6208b3a | ||
|
|
667a21c119 | ||
|
|
04e3d04ab8 | ||
|
|
4968cd8a94 | ||
|
|
37e0e1ef55 | ||
|
|
d9d846e04b | ||
|
|
84d59e659b | ||
|
|
931793aa24 | ||
|
|
0337505dc8 | ||
|
|
faeb5b457c | ||
|
|
6e0b910210 | ||
|
|
aaf7b4112e | ||
|
|
037ad82b7c | ||
|
|
1887385b79 | ||
|
|
40ee9cdd13 | ||
|
|
d6c91b7d62 | ||
|
|
92e93dfc34 | ||
|
|
fdb7f56bb7 | ||
|
|
07985ba45b | ||
|
|
fc589b3fad | ||
|
|
2b79083b71 | ||
|
|
2f648dc6a0 | ||
|
|
9973fa995a | ||
|
|
4de0c3b1b2 | ||
|
|
9a71e81fc4 | ||
|
|
718b31d063 | ||
|
|
d291e15114 | ||
|
|
dae2679c3b | ||
|
|
13e6ee89c7 | ||
|
|
76cc0b6abc | ||
|
|
122df1c620 | ||
|
|
14e3da25b6 | ||
|
|
f5e9caece1 | ||
|
|
d2651c86d9 | ||
|
|
19742aee64 | ||
|
|
ce60737fc5 | ||
|
|
37cbc089b0 | ||
|
|
b7b2e8291c | ||
|
|
cb28deda6b | ||
|
|
2a500c371f | ||
|
|
48fbb9384f | ||
|
|
145e45b6f2 | ||
|
|
c4b4f3a3e4 | ||
|
|
61ff738177 | ||
|
|
ce48cc0751 | ||
|
|
ba3fa5a633 | ||
|
|
62f0ae17e3 | ||
|
|
b14214620c | ||
|
|
1449b806ab | ||
|
|
9f16a907be | ||
|
|
aba0bfd24f | ||
|
|
7aa61d4c32 | ||
|
|
bbc84a9889 | ||
|
|
3ed3279739 | ||
|
|
ddace5fb6a | ||
|
|
5a5d3df8c8 | ||
|
|
c6698dd4bf | ||
|
|
edb1a11abc |
143
.agents/llama-cpp-localai-paged-backend.md
Normal file
143
.agents/llama-cpp-localai-paged-backend.md
Normal file
@@ -0,0 +1,143 @@
|
||||
# llama-cpp-localai-paged Backend (paged attention + Blackwell NVFP4 decode)
|
||||
|
||||
`llama-cpp-localai-paged` is LocalAI's **CUDA-only** paged-attention variant of the
|
||||
llama.cpp backend. It targets high-concurrency decode for the Qwen3.6 hybrid
|
||||
gated-DeltaNet (SSM) models on Blackwell (GB10 / DGX Spark). It reuses the stock
|
||||
`llama-cpp` backend's sources and applies a vendored patch series on top at build
|
||||
time. The backend itself is **not** a fork of llama.cpp: it ships as a source-only `*.patch` stack plus one canonical doc.
|
||||
|
||||
**Canonical reference:** `backend/cpp/llama-cpp-localai-paged/README.md`
|
||||
(architecture, the patch series 0001-0030, benchmarks, dev notes, generality,
|
||||
pin/canary policy). Read it for any technical detail; this guide is the maintenance
|
||||
how-to.
|
||||
|
||||
## Where things live
|
||||
|
||||
- `backend/cpp/llama-cpp-localai-paged/Makefile` - the thin wrapper. It copies the
|
||||
stock `backend/cpp/llama-cpp/` build infra into a build dir, clones llama.cpp at
|
||||
this backend's **own** pin (`LLAMA_VERSION`), applies the paged series via the
|
||||
`apply-paged-patches` define (strict `git apply`), then builds `grpc-server`.
|
||||
- `backend/cpp/llama-cpp-localai-paged/patches/paged/` - the source-only `.patch`
|
||||
series (0001-0030), nothing else.
|
||||
- `backend/cpp/llama-cpp-localai-paged/README.md` - the canonical doc. The
|
||||
operational docs (`PAGED_BITEXACT_NOTE.md`, `UPSTREAM_LAYER2_SCOPE.md`) and
|
||||
dev artifacts live in
|
||||
`backend/cpp/llama-cpp-localai-paged/docs/`.
|
||||
- `backend/Dockerfile.llama-cpp-localai-paged`, `.docker/llama-cpp-localai-paged-compile.sh`
|
||||
- the CUDA build entry points.
|
||||
- `backend/cpp/llama-cpp/` - the **stock** backend, pure upstream. It carries no
|
||||
paged patches.
|
||||
|
||||
## Invariants (do not break these)
|
||||
|
||||
- **Stock stays pure.** The paged patches live ONLY in this backend. Never add a
|
||||
`patches/paged/` dir or `LLAMA_PAGED` logic to `backend/cpp/llama-cpp/`.
|
||||
- **CUDA-only.** Ship cublas/cuda targets only. Off-CUDA the fusions are gated off
|
||||
(patch 0030) and NVFP4 falls back to dequant, so the backend is neutral-to-
|
||||
slightly-negative there - non-CUDA users use the stock `llama-cpp`. Do not add
|
||||
cpu/vulkan/sycl/metal rows for this backend in `.github/backend-matrix.yml`.
|
||||
(Those builds also fail to link `grpc-server` on darwin/arm64 against upstream
|
||||
`stream_*` server symbols - another reason it is CUDA-only.)
|
||||
- **Source-only patches.** A `.patch` may touch only llama.cpp source - never a
|
||||
dev doc or `*.md`. Strict `git apply` on a clean checkout must reach exit 0. (A
|
||||
stray `SSM_DECODE_FIX_RESULTS.md` hunk in patch 0019 once broke the CI build.)
|
||||
- **Bit-exact by default.** Every shipped patch keeps the output byte-identical to the f32
|
||||
baseline. (The one opt-in precision trade, `ssm_bf16_tau` / patch 0026, was
|
||||
DROPPED: it went flat once the decode fusions landed - forcing all gated-DeltaNet
|
||||
heads to bf16 gave 780.6 vs 780.0 t/s, zero benefit - so the series is now
|
||||
bit-exact end to end. Do not reintroduce a per-head SSM-precision lever; see the
|
||||
rejected-levers note in the backend README section 5.)
|
||||
|
||||
## Fork-first workflow (MANDATORY)
|
||||
|
||||
The fork **`mudler/llama.cpp` branch `localai-paged`** is the CANONICAL source
|
||||
of truth for ALL paged-backend kernel and patch work. The vendored
|
||||
`patches/paged/*.patch` series is a **derivative**: the fork is the source, the
|
||||
series is a generated mirror of it.
|
||||
|
||||
**Always update the fork FIRST, in this exact order:**
|
||||
|
||||
1. **Commit the change on the `localai-paged` branch and push it.** Every
|
||||
kernel or patch change lands as a fork commit first.
|
||||
2. **Then regenerate the LocalAI series from the fork** via `git format-patch`
|
||||
(one patch per fork commit, source-only) into
|
||||
`backend/cpp/llama-cpp-localai-paged/patches/paged/`, so the series stays a
|
||||
**1:1, drift-free mirror** of the branch.
|
||||
|
||||
Hard rules, no exceptions:
|
||||
|
||||
- **NEVER edit the `patches/paged/*.patch` files directly.** They are generated
|
||||
output, not source.
|
||||
- **NEVER add a patch to the series that has no corresponding fork-branch
|
||||
commit.** Every `.patch` must be the `git format-patch` of a real commit on
|
||||
`localai-paged`.
|
||||
- The fork branch is **where the build and the per-path bit-exact md5 gate
|
||||
actually run**, so it is the **only** place a change is truly validated. A
|
||||
patch living only in the LocalAI series has never been built or gated.
|
||||
|
||||
Verify the mirror by tree hash: applying the full on-disk series on the pin
|
||||
must reproduce the fork branch tree byte-for-byte. (The patch maintenance
|
||||
detail is in `backend/cpp/llama-cpp-localai-paged/docs/PATCH_MAINTENANCE.md`;
|
||||
the hard-gate is section 2.5 of `docs/PARITY_HANDOFF.md`.)
|
||||
|
||||
## Maintaining the pin against new llama.cpp
|
||||
|
||||
The pin (`LLAMA_VERSION` in the wrapper Makefile) is advanced ONLY by the manual
|
||||
pin-sync. It is deliberately **excluded from the nightly auto-bumper**
|
||||
(`bump_deps.yaml`): a naive bump would shift the tree out from under the patches
|
||||
and break `git apply` at build time.
|
||||
|
||||
1. **The canary tells you when to sync.** `.github/workflows/llama-cpp-paged-canary.yml`
|
||||
runs weekly: it applies + builds the series against the latest upstream tip and
|
||||
goes **red** when upstream drifts past the patches. Canary red -> run a pin-sync.
|
||||
2. **The pin-sync** (recorded in the README section 7 and git history): rebase the series onto the new
|
||||
tip (resolve conflicts; re-export **source-only** with a pathspec like
|
||||
`-- src/ ggml/ common/ include/ tools/ tests/ cmake/`), rebuild on a CUDA box,
|
||||
pass the bit-exact gate on **every** path + `test-backend-ops`, **and confirm
|
||||
the full grpc-server build/link is green on CI**, then bump `LLAMA_VERSION`.
|
||||
|
||||
**Hard constraint: keep the pin == the stock `llama-cpp` pin.** `grpc-server.cpp`
|
||||
is shared with the stock backend and tracks the stock pin. A paged pin that
|
||||
diverges PAST an upstream server-API refactor breaks the grpc-server LINK even
|
||||
when the patches are byte-for-byte bit-exact - the bit-exact gate alone does NOT
|
||||
catch it. The `c299a92c` bump did exactly this (patches applied + greedy-md5
|
||||
bit-exact, but `grpc-server.cpp` failed to link with undefined `stream_*` server
|
||||
helpers the refactor pulled into its headers), so it was reverted to `9d5d882d`.
|
||||
A pin bump is shippable only once the full CI grpc-server build is green, which in
|
||||
practice means moving in lockstep with the stock pin (or vendoring a
|
||||
pin-matched grpc-server.cpp, which we deliberately do not, to keep stock pure).
|
||||
|
||||
## The bit-exact gate (run for every change)
|
||||
|
||||
- greedy md5: `llama-completion -m MODEL -ngl 99 -fa on -p "The capital of France is" -n 48 --temp 0 --seed 1 </dev/null | md5sum`,
|
||||
paged paths prefixed `LLAMA_KV_PAGED=1` (+ `LLAMA_MOE_FORCE_GRAPHS=1` for paged
|
||||
MoE). Must match the recorded baseline. Redirect stdin from `/dev/null` or
|
||||
`llama-completion` hangs in conversation mode.
|
||||
- `test-backend-ops` (CUDA0 vs CPU oracle) for every touched op (`SSM_CONV*`,
|
||||
`GATED_DELTA_NET`, `MUL_MAT`, `MUL_MAT_ID`).
|
||||
- **The gate is per-path.** The paged-MoE md5 differs from the non-paged md5 - a
|
||||
benign, KL-validated FP-accumulation-order difference (see `docs/PAGED_BITEXACT_NOTE.md`).
|
||||
Compare a paged-MoE change to the **paged** reference, not the non-paged one.
|
||||
|
||||
## Encapsulating your work
|
||||
|
||||
- When you change a kernel, follow the **Fork-first workflow** above: commit and
|
||||
push on the `localai-paged` branch first, then regenerate the `.patch`
|
||||
(source-only) from the fork so this worktree mirrors the branch byte-for-byte.
|
||||
Commit with sign-off.
|
||||
- New optimization -> next patch number (gaps 0005/0027 are intentional). Update
|
||||
the README's patch table and dev notes - keep the README the single doc; do not
|
||||
scatter `*_RESULTS.md` files.
|
||||
- Record rejected/flat levers in the README too (they stop the next person from
|
||||
re-running dead ends).
|
||||
|
||||
## Follow-ups (Metal / SYCL / Vulkan)
|
||||
|
||||
The decode fusions are implemented for **CUDA + CPU only**. The base
|
||||
gated-DeltaNet + SSM_CONV ops already exist upstream on Metal, SYCL, and Vulkan,
|
||||
so the models **run** there via the non-fused path - what is missing is the
|
||||
fusion speedup. Porting it (strictly mirroring the CUDA kernels, since we have no
|
||||
Metal/SYCL/Vulkan hardware to test on here) is scoped in `docs/UPSTREAM_LAYER2_SCOPE.md`
|
||||
(recommended order: Metal, then SYCL, then Vulkan; ops-first upstream PR, then one
|
||||
PR per backend, each gated by `test-backend-ops` on the target hardware). The
|
||||
methodology for that work is in [.agents/vllm-parity-methodology.md](vllm-parity-methodology.md).
|
||||
101
.agents/vllm-parity-methodology.md
Normal file
101
.agents/vllm-parity-methodology.md
Normal file
@@ -0,0 +1,101 @@
|
||||
# Methodology: Closing the vLLM Decode-Throughput Gap in llama.cpp
|
||||
|
||||
This is the playbook that took the paged backend
|
||||
([.agents/llama-cpp-localai-paged-backend.md](llama-cpp-localai-paged-backend.md))
|
||||
from ~38% of vLLM decode to **parity-to-ahead on dense** (and a proven, honest
|
||||
ceiling on MoE) on GB10. Use it for any "make llama.cpp match or beat engine X on
|
||||
accelerator Y" effort. The *levers* are model- and hardware-specific; the
|
||||
*discipline* below is not. The worked example, with all numbers, is the paged
|
||||
backend README.
|
||||
|
||||
## The core loop
|
||||
|
||||
1. **Establish a bit-exact baseline and gate FIRST.** Record the greedy md5 (per
|
||||
path) and an f32 reference. Every optimization must stay byte-identical to it -
|
||||
or ship as an explicit, default-off precision opt-in. This is what lets you
|
||||
optimize aggressively without silently regressing quality. Gate two ways:
|
||||
greedy md5, and `test-backend-ops` against the CPU oracle.
|
||||
|
||||
2. **Profile - do not assume.** nsys the steady-state decode step, broken down per
|
||||
*kernel* AND per *memcpy*. Find the dominant cost. "It's the GEMM" was wrong
|
||||
here: on hybrid gated-DeltaNet models the bottleneck was the recurrent-state
|
||||
**plumbing** (state memcpy + gathers, ~67% of the step), not the weight GEMM.
|
||||
Also sanity-check GPU-busy %: an early "low utilization" reading was a profiling
|
||||
window artifact (decode was 96-99% GPU-busy), not real idle.
|
||||
|
||||
3. **Ground-truth BOTH engines.** Decompose *your* decode step AND the
|
||||
competitor's, side by side, per bucket, and compute the per-bucket delta. This
|
||||
tells you WHERE the gap actually is - not where you would guess. It overturned
|
||||
premises here: e.g. vLLM does NOT run the GDN/attn projections as NVFP4 (it
|
||||
keeps them bf16, same as us); the MoE expert GEMM was a llama *win*, not the gap.
|
||||
|
||||
4. **Per-lever discipline.** For each candidate: implement -> bit-exact gate ->
|
||||
same-harness A/B bench. Use a runtime env-toggle (flag off vs on) ONLY for
|
||||
levers that are actually runtime-gated; a lever **compiled into** the binary
|
||||
(e.g. the SSM decode fusions here) is NOT isolated by a runtime flag, so measure
|
||||
it build-vs-build. The full-patchset "stock" baseline likewise needs a
|
||||
**separately-built unpatched binary at the same pin** - toggling the runtime
|
||||
flag on the patched binary does not reproduce stock (it measures only the gated
|
||||
part; here that was ~neutral, which is exactly how this gotcha hides). Bank only
|
||||
what lifts AND gates. **Record every rejected or flat lever with the reason** -
|
||||
over time this is the most valuable part: it stops the next person re-running
|
||||
dead ends.
|
||||
|
||||
5. **Name the structural floor.** Prove the bit-exact ceiling exhaustively (every
|
||||
lever measured, not assumed). What remains is physical - the memory-bandwidth
|
||||
floor, the irreducible serial-SSM host loop (sampling can't start until logits
|
||||
land). Name it; do not claim more than you measured.
|
||||
|
||||
## Hard rules learned
|
||||
|
||||
- **Apples-to-apples, or label it.** Stock-vs-patched on the SAME harness
|
||||
(`llama-batched-bench`) is exact - lead with it. But "stock" must be a
|
||||
separately-built unpatched binary at the SAME pin, NOT the patched binary with
|
||||
the runtime flag off (compiled-in wins survive the toggle). Cross-engine "% of vLLM"
|
||||
(batched-bench vs vLLM server+client) is *indicative*; always caveat the harness
|
||||
and config (context length alone shifted the MoE figure 76% <-> 86%).
|
||||
- **Re-measure a "win" after later levers land - it may evaporate.** bf16 SSM
|
||||
state (the `ssm_bf16_tau` lever) benched +12% early and failed the f32 KL gate
|
||||
(vLLM keeps f32 too), so it was kept default-off opt-in. Once the decode fusions
|
||||
(recurrent-state gather-fusion + block-table cache) landed, a clean re-measure
|
||||
forcing ALL gated-DeltaNet heads to bf16 (`tau=100000`) went **flat** - 780.6 vs
|
||||
780.0 t/s. The "+12%" was subsumed by the fusions: the lever bought nothing, so
|
||||
it was **dropped** (precision trade + bug surface + extra CUDA template-instantiation
|
||||
compile cost, zero benefit). A win measured before the rest of the series is not a
|
||||
win after it.
|
||||
- **Reject the obvious-but-wrong, with evidence.** A faster kernel that is off the
|
||||
critical path benches FLAT (the freed time becomes idle). Quantizing the bf16
|
||||
projections to NVFP4 cost ~6% PPL - and vLLM keeps them bf16 for the same reason.
|
||||
Always measure before believing; a plausible mechanism is not a result.
|
||||
- **The gate can be per-path.** Paged vs non-paged attention legitimately produces
|
||||
different (equivalent) FP-reduction orders; validate the difference is benign
|
||||
(KLD to f32) and then gate each path against its own reference.
|
||||
|
||||
## Orchestration (multi-agent)
|
||||
|
||||
- **One GPU profiler/bencher at a time** (the GPU-contention rule). Parallel
|
||||
design/analysis/read agents are fine; concurrent GPU benches pollute each other's
|
||||
numbers.
|
||||
- **Adversarial verify.** Before banking a finding, spawn skeptics prompted to
|
||||
*refute* it; majority-refute kills it. Prevents plausible-but-wrong results.
|
||||
- **Anti-punt.** Use foreground, blocking ssh loops with short benches and a
|
||||
progress-file checkpoint. Agents that background work and "wait for the monitor
|
||||
event" stall - forbid that pattern.
|
||||
- **GPU coexistence.** On a shared host, stop the user's deployments for a clean
|
||||
benchmark window (with their OK) and ALWAYS restore them (wrap the bench so a
|
||||
failure cannot strand them).
|
||||
|
||||
## What generalizes (and what doesn't)
|
||||
|
||||
The *speedups* may be hardware-specific (here: CUDA/Blackwell - the SSM fusions,
|
||||
NVFP4 FP4-MMA, the occupancy tune), which is why other accelerators did not
|
||||
benefit. But the *findings* often generalize and are worth upstreaming: the
|
||||
"decode is plumbing-bound, not GEMM-bound" insight and the bit-exact, CPU-mirrored
|
||||
fusion ops help any backend running these models. Separate "ship our tuned backend"
|
||||
from "upstream the portable op" - they are different deliverables.
|
||||
|
||||
## The closing record
|
||||
|
||||
Write up the result HONESTLY: the shipped wins, the rejected levers (with reasons),
|
||||
the structural ceiling, and the cross-backend / cross-quant generality. Negative
|
||||
results are as valuable as wins. The paged backend README is the template.
|
||||
39
.docker/llama-cpp-localai-paged-compile.sh
Executable file
39
.docker/llama-cpp-localai-paged-compile.sh
Executable file
@@ -0,0 +1,39 @@
|
||||
#!/usr/bin/env bash
# Shared compile logic for backend/Dockerfile.llama-cpp-localai-paged.
# Sourced (via bind mount) from both builder-fromsource and builder-prebuilt stages.
#
# Environment inputs (all optional):
#   CMAKE_ARGS        extra CMake flags; the ccache compiler-launcher flags are appended.
#   CUDA_DOCKER_ARCH  CUDA architecture list; when non-empty it is forwarded as
#                     -DCMAKE_CUDA_ARCHITECTURES and existing *-build dirs are removed.
#   BUILD_TYPE        empty/unset -> CPU all-variants build; anything else -> the
#                     single fallback CPU build.
#   TARGETARCH        target architecture; falls back to `uname -m` when unset.

set -euxo pipefail

export CCACHE_DIR=/root/.ccache
# ccache tuning is best-effort: a failure here must not abort the build.
ccache --max-size=5G || true
ccache -z || true

export CMAKE_ARGS="${CMAKE_ARGS:-} -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache"

if [[ -n "${CUDA_DOCKER_ARCH:-}" ]]; then
    # Backslash-escape every ';' in the architecture list before it is embedded
    # in CMAKE_ARGS (presumably so the list survives later word splitting /
    # make expansion as one value - confirm against the Makefile).
    CUDA_ARCH_ESC="${CUDA_DOCKER_ARCH//;/\\;}"
    export CMAKE_ARGS="${CMAKE_ARGS} -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH_ESC}"
    echo "CMAKE_ARGS(env) = ${CMAKE_ARGS}"
    rm -rf /LocalAI/backend/cpp/llama-cpp-localai-paged-*-build
fi

cd /LocalAI/backend/cpp/llama-cpp-localai-paged

if [ -z "${BUILD_TYPE:-}" ]; then
    # Pure CPU image: one ggml CPU_ALL_VARIANTS build replaces the per-microarch binaries.
    # arm64: the armv9.2 SME variants need gcc-14 (gcc-13 rejects +sme).
    #
    # TARGETARCH is read with a default because the script runs under `set -u`:
    # a bare ${TARGETARCH} aborts with "unbound variable" when the caller did
    # not export it. Fall back to the machine architecture and accept both
    # spellings (Docker's "arm64", uname's "aarch64").
    target_arch="${TARGETARCH:-$(uname -m)}"
    case "${target_arch}" in
        arm64|aarch64)
            apt-get update -qq && apt-get install -y -qq gcc-14 g++-14
            export CC=gcc-14 CXX=g++-14
            ;;
    esac
    make llama-cpp-localai-paged-cpu-all
else
    # GPU build (cublas/hipblas/sycl/vulkan/...): single fallback CPU build, the accelerator
    # does the compute. Keeps the GPU compile from also building the CPU variant matrix and
    # avoids the gcc-14 apt step on GPU base images such as nvidia l4t.
    make llama-cpp-localai-paged-fallback
fi
make llama-cpp-localai-paged-grpc
make llama-cpp-localai-paged-rpc-server

# Print cache statistics for the build log; never fail the build on it.
ccache -s || true
|
||||
@@ -7,11 +7,8 @@
|
||||
# Runs only the checks relevant to what's staged:
|
||||
# - Go files -> make lint + make test-coverage-check
|
||||
# - core/http/react-ui -> make test-ui-coverage-check (Playwright e2e + gate)
|
||||
# - realtime state machines / specs -> make test-realtime-conformance
|
||||
# (respcoord/**, turncoord/**, or formal-verification/** -- a pure .fizz
|
||||
# spec edit must still re-verify the design, detected separately from Go)
|
||||
# A commit touching none of these is skipped entirely (other docs/YAML can't
|
||||
# change lint findings, Go coverage, the UI, or the realtime conformance gate).
|
||||
# A commit touching neither is skipped entirely (docs/YAML/etc. can't change
|
||||
# lint findings, Go coverage, or the UI).
|
||||
#
|
||||
# To bypass for a single commit (e.g. a WIP checkpoint): git commit --no-verify
|
||||
set -eu
|
||||
@@ -23,13 +20,11 @@ staged="$(git diff --cached --name-only --diff-filter=ACMRD)"
|
||||
|
||||
go_changed=0
|
||||
ui_changed=0
|
||||
rt_changed=0
|
||||
if echo "$staged" | grep -qE '\.go$'; then go_changed=1; fi
|
||||
if echo "$staged" | grep -qE '^core/http/react-ui/'; then ui_changed=1; fi
|
||||
if echo "$staged" | grep -qE '^(core/http/endpoints/openai/(coordinator|respcoord|turncoord|conncoord|compactcoord|ttscoord)/|formal-verification/)'; then rt_changed=1; fi
|
||||
|
||||
if [ "$go_changed" -eq 0 ] && [ "$ui_changed" -eq 0 ] && [ "$rt_changed" -eq 0 ]; then
|
||||
echo "pre-commit: no Go, React UI, or realtime-spec changes staged — skipping."
|
||||
if [ "$go_changed" -eq 0 ] && [ "$ui_changed" -eq 0 ]; then
|
||||
echo "pre-commit: no Go or React UI changes staged — skipping."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
@@ -62,11 +57,4 @@ if [ "$ui_changed" -eq 1 ]; then
|
||||
make test-ui-coverage-check
|
||||
fi
|
||||
|
||||
if [ "$rt_changed" -eq 1 ]; then
|
||||
echo "pre-commit ▶ realtime state-machine conformance (make test-realtime-conformance) —"
|
||||
echo " Go transition/rapid tests under -race + FizzBee model check of the"
|
||||
echo " authoritative specs. Fail-closed: needs FizzBee (make install-fizzbee)."
|
||||
make test-realtime-conformance
|
||||
fi
|
||||
|
||||
echo "pre-commit ✓ all relevant checks passed"
|
||||
|
||||
33
.github/backend-matrix.yml
vendored
33
.github/backend-matrix.yml
vendored
@@ -5177,6 +5177,39 @@ include:
|
||||
dockerfile: "./backend/Dockerfile.golang"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
# llama-cpp-localai-paged: the LocalAI paged-attention llama.cpp variant. Each
|
||||
# row mirrors the corresponding llama-cpp row with backend/dockerfile/tag-suffix
|
||||
# swapped; builder-base-image is left UNCHANGED so these reuse the same
|
||||
# base-grpc-* prebuilt bases (same gRPC + same toolchain), needing no new
|
||||
# base-images.yml variant.
|
||||
- build-type: 'cublas'
|
||||
cuda-major-version: "13"
|
||||
cuda-minor-version: "0"
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-gpu-nvidia-cuda-13-llama-cpp-localai-paged'
|
||||
builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-cuda-13-amd64'
|
||||
runs-on: 'bigger-runner'
|
||||
base-image: "ubuntu:24.04"
|
||||
skip-drivers: 'false'
|
||||
backend: "llama-cpp-localai-paged"
|
||||
dockerfile: "./backend/Dockerfile.llama-cpp-localai-paged"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: 'cublas'
|
||||
cuda-major-version: "13"
|
||||
cuda-minor-version: "0"
|
||||
platforms: 'linux/arm64'
|
||||
skip-drivers: 'false'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-nvidia-l4t-cuda-13-arm64-llama-cpp-localai-paged'
|
||||
builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-cuda-13-arm64'
|
||||
base-image: "ubuntu:24.04"
|
||||
runs-on: 'ubuntu-24.04-arm'
|
||||
ubuntu-version: '2404'
|
||||
backend: "llama-cpp-localai-paged"
|
||||
dockerfile: "./backend/Dockerfile.llama-cpp-localai-paged"
|
||||
context: "./"
|
||||
|
||||
# Darwin matrix (consumed by backend-jobs-darwin).
|
||||
includeDarwin:
|
||||
|
||||
77
.github/scripts/paged-canary-apply.sh
vendored
Executable file
77
.github/scripts/paged-canary-apply.sh
vendored
Executable file
@@ -0,0 +1,77 @@
|
||||
#!/usr/bin/env bash
|
||||
#
|
||||
# paged-canary-apply.sh - apply the vendored paged-attention patch series
|
||||
# (backend/cpp/llama-cpp-localai-paged/patches/paged/0001-0030) to a llama.cpp checkout, the
|
||||
# same way the build does, but tolerating the ONE known-benign pre-existing
|
||||
# quirk in the series. Used by the early-warning canary
|
||||
# (.github/workflows/llama-cpp-paged-canary.yml) so it only goes red on a REAL
|
||||
# upstream break, never on that quirk.
|
||||
#
|
||||
# Usage: paged-canary-apply.sh <llama.cpp-checkout-dir> <patches-dir>
|
||||
# <patches-dir> is normally backend/cpp/llama-cpp-localai-paged/patches (it holds the
|
||||
# top-level base series 0*.patch, currently empty, and the paged/ subseries).
|
||||
#
|
||||
# Exit 0 = the whole series applied -> patches still fit upstream.
|
||||
# Exit !=0 = a patch failed to apply = the red signal: an upstream change moved
|
||||
# the tree out from under the patches, so it is time to run a PIN_SYNC.
|
||||
#
|
||||
# Apply method MIRRORS backend/cpp/llama-cpp/Makefile's `llama.cpp` target:
|
||||
# plain `git apply --verbose`, which natively tolerates @@ line-number offsets
|
||||
# but NOT context-line changes. Matching the build's method is the point - the
|
||||
# canary's apply result is exactly what the real build's apply would do.
|
||||
#
|
||||
# The ONLY tolerance, and it is path-scoped (not a blanket `|| true`): patch
|
||||
# 0019 carries a stray *modify* hunk against the dev-only doc
|
||||
# SSM_DECODE_FIX_RESULTS.md, a file that exists only on the DGX dev tree and is
|
||||
# absent from any clean upstream checkout. `git apply` is atomic, so that single
|
||||
# missing-file hunk rejects the whole patch - and because 0021/0022/0026/0028
|
||||
# build on 0019's code, the rejection cascades to them too. This is a
|
||||
# PRE-EXISTING shipped-series defect, present identically on every pin, NOT an
|
||||
# upstream break (see backend/cpp/llama-cpp-localai-paged/README.md section 7,
|
||||
# "Pin + maintenance policy"). We exclude ONLY that dev-doc path and still
|
||||
# apply 0019's real code hunks atomically, so a genuine code-hunk break in 0019
|
||||
# still fails the canary. prepare.sh tolerates the same hunk via
|
||||
# `patch ... || true`; this mirrors that tolerance precisely.
|
||||
|
||||
# Strict mode: abort on errors, on unset variables, and on failures inside pipelines.
set -o errexit -o nounset -o pipefail

# A glob that matches nothing expands to nothing, so an empty patch
# directory turns the corresponding `for` loop into a no-op.
shopt -s nullglob

# Positional arguments; both are mandatory and abort with the usage line if missing.
CHECKOUT="${1:?usage: paged-canary-apply.sh <llama.cpp-checkout> <patches-dir>}"
PATCHES="${2:?usage: paged-canary-apply.sh <llama.cpp-checkout> <patches-dir>}"

# The single dev-doc path that may be excluded, and the one patch that is
# allowed to carry a hunk against it.
DEVDOC_PATCH='0019-qwen35-ssm-decode-fused-gather.patch'
DEVDOC_GLOB='*SSM_DECODE_FIX_RESULTS.md'

# Make the patches dir absolute BEFORE changing directory: every later
# `git apply` runs from inside the checkout.
PATCHES="$(cd "$PATCHES" && pwd)"
cd "$CHECKOUT"
|
||||
|
||||
# apply_one <patch-file> [extra git-apply options...]
#
# Applies one patch to the current directory with `git apply --verbose`,
# forwarding any extra arguments to git ahead of the patch path. On failure it
# prints two GitHub `::error::` workflow annotations and terminates the whole
# script with status 1.
apply_one() {
    local patch_file="$1"
    shift
    local patch_name
    patch_name="$(basename "$patch_file")"

    echo "paged-canary: applying ${patch_name}"
    # Success path returns early; everything below is the failure report.
    git apply --verbose "$@" "$patch_file" && return 0

    echo "::error::paged patch no longer applies to the upstream llama.cpp tip: ${patch_name}"
    echo "::error::upstream drifted past the vendored paged series - run a PIN_SYNC (see backend/cpp/llama-cpp-localai-paged/README.md section 7, Pin + maintenance policy), do NOT bump the pin blindly"
    exit 1
}
|
||||
|
||||
# Base series first (parity with the build: patches/0*.patch before
# patches/paged/0*.patch). Currently empty; nullglob makes this a no-op.
for p in "$PATCHES"/0*.patch; do
    apply_one "$p"
done

# Paged series, in order. Count what was applied: with nullglob a wrong
# <patches-dir> (or a moved series) matches zero files, and without this guard
# the canary would report green without having tested a single patch.
paged_applied=0
for p in "$PATCHES"/paged/0*.patch; do
    if [ "$(basename "$p")" = "$DEVDOC_PATCH" ]; then
        # Apply 0019's real code hunks; exclude ONLY the benign dev-doc hunk.
        apply_one "$p" --exclude="$DEVDOC_GLOB"
    else
        apply_one "$p"
    fi
    # Plain arithmetic assignment: `((paged_applied++))` would return status 1
    # on the first increment and trip `set -e`.
    paged_applied=$((paged_applied + 1))
done

if [ "$paged_applied" -eq 0 ]; then
    # Exit 2 (not 1) so a misconfigured invocation is distinguishable from a
    # genuine "patch no longer applies" failure reported by apply_one.
    echo "::error::paged-canary: no patches matched ${PATCHES}/paged/0*.patch - wrong <patches-dir>? refusing to report an empty series as green"
    exit 2
fi

echo "paged-canary: the full paged patch series applied cleanly to the upstream tip"
|
||||
25
.github/workflows/backend_build_darwin.yml
vendored
25
.github/workflows/backend_build_darwin.yml
vendored
@@ -82,7 +82,7 @@ jobs:
|
||||
# as the Linux registry cache.
|
||||
- name: Restore Homebrew cache
|
||||
id: brew-cache
|
||||
uses: actions/cache/restore@v6
|
||||
uses: actions/cache/restore@v4
|
||||
with:
|
||||
path: |
|
||||
~/Library/Caches/Homebrew/downloads
|
||||
@@ -142,7 +142,7 @@ jobs:
|
||||
|
||||
- name: Save Homebrew cache
|
||||
if: github.event_name != 'pull_request' && steps.brew-cache.outputs.cache-hit != 'true'
|
||||
uses: actions/cache/save@v6
|
||||
uses: actions/cache/save@v4
|
||||
with:
|
||||
path: |
|
||||
~/Library/Caches/Homebrew/downloads
|
||||
@@ -169,16 +169,16 @@ jobs:
|
||||
# invalidates cleanly; restore-keys fall back to the latest entry for the
|
||||
# same pin so unchanged TUs stay warm even when the cache is fresh.
|
||||
- name: Compute llama.cpp version
|
||||
if: inputs.backend == 'llama-cpp'
|
||||
if: inputs.backend == 'llama-cpp' || inputs.backend == 'llama-cpp-localai-paged'
|
||||
id: llama-version
|
||||
run: |
|
||||
version=$(grep '^LLAMA_VERSION' backend/cpp/llama-cpp/Makefile | head -1 | cut -d= -f2 | cut -d'?' -f1 | tr -d ' ')
|
||||
echo "version=${version}" >> "$GITHUB_OUTPUT"
|
||||
|
||||
- name: Restore ccache
|
||||
if: inputs.backend == 'llama-cpp'
|
||||
if: inputs.backend == 'llama-cpp' || inputs.backend == 'llama-cpp-localai-paged'
|
||||
id: ccache-cache
|
||||
uses: actions/cache/restore@v6
|
||||
uses: actions/cache/restore@v4
|
||||
with:
|
||||
path: ~/Library/Caches/ccache
|
||||
key: ccache-llama-${{ runner.arch }}-${{ steps.llama-version.outputs.version }}-${{ github.run_id }}
|
||||
@@ -186,7 +186,7 @@ jobs:
|
||||
ccache-llama-${{ runner.arch }}-${{ steps.llama-version.outputs.version }}-
|
||||
|
||||
- name: Configure ccache
|
||||
if: inputs.backend == 'llama-cpp'
|
||||
if: inputs.backend == 'llama-cpp' || inputs.backend == 'llama-cpp-localai-paged'
|
||||
run: |
|
||||
mkdir -p "$HOME/Library/Caches/ccache"
|
||||
ccache -M 2G
|
||||
@@ -211,7 +211,7 @@ jobs:
|
||||
- name: Restore Python wheel cache
|
||||
if: inputs.lang == 'python'
|
||||
id: pyenv-cache
|
||||
uses: actions/cache/restore@v6
|
||||
uses: actions/cache/restore@v4
|
||||
with:
|
||||
path: |
|
||||
~/Library/Caches/pip
|
||||
@@ -251,19 +251,24 @@ jobs:
|
||||
BACKEND=${{ inputs.backend }} BUILD_TYPE=${{ inputs.build-type }} USE_PIP=${{ inputs.use-pip }} make build-darwin-${{ inputs.lang }}-backend
|
||||
|
||||
- name: ccache stats
|
||||
if: inputs.backend == 'llama-cpp'
|
||||
if: inputs.backend == 'llama-cpp' || inputs.backend == 'llama-cpp-localai-paged'
|
||||
run: ccache -s
|
||||
|
||||
# Only stock llama-cpp persists the ccache: both backends share the same
|
||||
# ccache-llama-<arch>-<version>-<run_id> key, so the paged job restores from
|
||||
# the shared prefix (warm) but must NOT also save under the identical key in
|
||||
# the same run (it would collide). The shared upstream TUs stay warm via the
|
||||
# stock save; the paged-only patched TUs are a small recompile.
|
||||
- name: Save ccache
|
||||
if: inputs.backend == 'llama-cpp' && github.event_name != 'pull_request'
|
||||
uses: actions/cache/save@v6
|
||||
uses: actions/cache/save@v4
|
||||
with:
|
||||
path: ~/Library/Caches/ccache
|
||||
key: ccache-llama-${{ runner.arch }}-${{ steps.llama-version.outputs.version }}-${{ github.run_id }}
|
||||
|
||||
- name: Save Python wheel cache
|
||||
if: inputs.lang == 'python' && github.event_name != 'pull_request' && steps.pyenv-cache.outputs.cache-hit != 'true'
|
||||
uses: actions/cache/save@v6
|
||||
uses: actions/cache/save@v4
|
||||
with:
|
||||
path: |
|
||||
~/Library/Caches/pip
|
||||
|
||||
17
.github/workflows/bump_deps.yaml
vendored
17
.github/workflows/bump_deps.yaml
vendored
@@ -9,6 +9,23 @@ jobs:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
# NOTE: there is intentionally NO entry for the llama-cpp-localai-paged
|
||||
# backend. It carries a vendored paged-attention patch series
|
||||
# (backend/cpp/llama-cpp-localai-paged/patches/paged/) hand-verified bit-exact against
|
||||
# ONE specific llama.cpp tip; a naive nightly bump would move the tip out
|
||||
# from under the patches and break `git apply` at build time. Its pin is
|
||||
# therefore decoupled (its own LLAMA_VERSION in
|
||||
# backend/cpp/llama-cpp-localai-paged/Makefile) and advanced ONLY by the
|
||||
# manual PIN_SYNC process. Do not add it here. (turboquant CAN be
|
||||
# auto-bumped below because its fork branch carries the patches.)
|
||||
#
|
||||
# Excluding it from the auto-bumper removed the early warning of upstream
|
||||
# drift; that signal is restored separately by the dedicated canary
|
||||
# .github/workflows/llama-cpp-paged-canary.yml, which weekly applies +
|
||||
# compiles the paged series against the latest llama.cpp tip and goes red
|
||||
# when upstream breaks it (prompting a PIN_SYNC). The canary is
|
||||
# signal-only - it never opens a bump PR and never moves the pin - so
|
||||
# this dep-bump workflow and its PRs stay green regardless.
|
||||
include:
|
||||
- repository: "ggml-org/llama.cpp"
|
||||
variable: "LLAMA_VERSION"
|
||||
|
||||
179
.github/workflows/llama-cpp-paged-canary.yml
vendored
Normal file
179
.github/workflows/llama-cpp-paged-canary.yml
vendored
Normal file
@@ -0,0 +1,179 @@
|
||||
name: 'llama.cpp paged patches: upstream canary'
|
||||
|
||||
# EARLY-WARNING CANARY for the vendored paged-attention patch series
|
||||
# (backend/cpp/llama-cpp-localai-paged/patches/paged/0001-0030).
|
||||
#
|
||||
# WHY THIS EXISTS
|
||||
# The paged backend (backend/cpp/llama-cpp-localai-paged) pins its OWN verified
|
||||
# llama.cpp tip (LLAMA_VERSION in backend/cpp/llama-cpp-localai-paged/Makefile)
|
||||
# and is intentionally EXCLUDED from the nightly auto-bumper
|
||||
# (.github/workflows/bump_deps.yaml), so a naive upstream bump can never silently
|
||||
# break the shipped build. The cost of that safety: nobody finds out when
|
||||
# upstream DRIFTS past the patches. This canary restores that signal WITHOUT
|
||||
# touching the shipped pin - weekly it tries the patch series + a real compile
|
||||
# against the LATEST llama.cpp master tip and goes red the moment upstream breaks
|
||||
# the patches.
|
||||
#
|
||||
# RED HERE means: time to run a PIN_SYNC (rebase the patches onto the new tip,
|
||||
# pass the bit-exact gate on the GPU, re-export the .patch files, THEN advance
|
||||
# the pin in backend/cpp/llama-cpp-localai-paged/Makefile). See the backend README
|
||||
# section 7 (Pin + maintenance policy):
|
||||
# backend/cpp/llama-cpp-localai-paged/README.md.
|
||||
#
|
||||
# SIGNAL-ONLY: this workflow moves no pinned version, ships nothing, and is fully
|
||||
# decoupled from bump_deps - so the main dep-bump PR stays green regardless. A
|
||||
# green run means "the paged series still applies and compiles on upstream HEAD";
|
||||
# a red run means "upstream moved - schedule a pin-sync".
|
||||
|
||||
on:
|
||||
schedule:
|
||||
# Weekly (Mondays 06:00 UTC), mirroring the weekly DEPS_REFRESH
|
||||
# cadence. Offset from bump_deps' nightly 20:00 so the two never pile up.
|
||||
- cron: '0 6 * * 1'
|
||||
workflow_dispatch:
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
concurrency:
|
||||
group: llama-cpp-paged-canary
|
||||
cancel-in-progress: false
|
||||
|
||||
env:
|
||||
# Upstream source of truth - the same repo/branch bump_deps tracks for the
|
||||
# stock llama-cpp pin.
|
||||
LLAMA_UPSTREAM: 'https://github.com/ggml-org/llama.cpp'
|
||||
|
||||
jobs:
|
||||
apply-check:
|
||||
# Cheap, fast, toolchain-free early warning: does the series still APPLY to
|
||||
# the latest upstream tip? A patch no longer applying is by far the most
|
||||
# common way upstream breaks a vendored series, so this runs first, is
|
||||
# reliable on a free runner, and feeds the resolved tip to the compile job.
|
||||
if: github.repository == 'mudler/LocalAI'
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 20
|
||||
outputs:
|
||||
tip: ${{ steps.resolve.outputs.tip }}
|
||||
steps:
|
||||
- name: Checkout LocalAI
|
||||
uses: actions/checkout@v7
|
||||
|
||||
- name: Resolve latest llama.cpp master tip
|
||||
id: resolve
|
||||
run: |
|
||||
tip="$(git ls-remote "$LLAMA_UPSTREAM" refs/heads/master | cut -f1)"
|
||||
if [ -z "$tip" ]; then
|
||||
echo "::error::could not resolve llama.cpp master tip from $LLAMA_UPSTREAM"
|
||||
exit 1
|
||||
fi
|
||||
pin="$(grep -m1 'LLAMA_VERSION?=' backend/cpp/llama-cpp-localai-paged/Makefile | cut -d= -f2)"
|
||||
echo "latest llama.cpp master tip: $tip"
|
||||
echo "shipped paged pin: $pin"
|
||||
echo "tip=$tip" >> "$GITHUB_OUTPUT"
|
||||
{
|
||||
echo "## llama.cpp paged canary"
|
||||
echo ""
|
||||
echo "- upstream master tip: \`$tip\`"
|
||||
echo "- shipped paged pin: \`$pin\`"
|
||||
} >> "$GITHUB_STEP_SUMMARY"
|
||||
|
||||
- name: Checkout llama.cpp at latest tip (shallow)
|
||||
run: |
|
||||
mkdir -p /tmp/llama.cpp
|
||||
cd /tmp/llama.cpp
|
||||
git init -q
|
||||
git remote add origin "$LLAMA_UPSTREAM"
|
||||
git fetch -q --depth 1 origin "${{ steps.resolve.outputs.tip }}"
|
||||
git checkout -q FETCH_HEAD
|
||||
git log --oneline -1
|
||||
|
||||
- name: Apply paged patch series (build's git-apply method)
|
||||
run: |
|
||||
bash .github/scripts/paged-canary-apply.sh \
|
||||
/tmp/llama.cpp \
|
||||
"$PWD/backend/cpp/llama-cpp-localai-paged/patches"
|
||||
echo "- apply: full paged series applies to the upstream tip :white_check_mark:" >> "$GITHUB_STEP_SUMMARY"
|
||||
|
||||
compile:
|
||||
# Proves the patches still COMPILE against the latest tip, using the SAME
|
||||
# toolchain + build target the shipped paged backend uses (the
|
||||
# base-grpc-cuda-12 builder base + the Makefile `grpc-server` cublas target),
|
||||
# so a failure means upstream drift, not toolchain noise. CUDA is compiled
|
||||
# (nvcc; no GPU required) because most of the paged series is CUDA kernels.
|
||||
# Runs only if the apply check passed, on the exact tip it validated.
|
||||
#
|
||||
# If a full CUDA compile on the hosted runner ever proves too heavy/flaky,
|
||||
# switch `runs-on` to 'bigger-runner' (the runner class the real paged CUDA
|
||||
# build uses), or drop to a CPU build (BUILD_TYPE='') which still compiles
|
||||
# all host + CPU paged code, leaving CUDA-kernel coverage to the apply check
|
||||
# plus the manual PIN_SYNC GPU gate.
|
||||
needs: apply-check
|
||||
if: github.repository == 'mudler/LocalAI'
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 180
|
||||
steps:
|
||||
- name: Checkout LocalAI
|
||||
uses: actions/checkout@v7
|
||||
|
||||
- name: Free disk space
|
||||
uses: ./.github/actions/free-disk-space
|
||||
with:
|
||||
mode: hosted
|
||||
|
||||
- name: Login to Quay.io
|
||||
uses: docker/login-action@v4
|
||||
with:
|
||||
registry: quay.io
|
||||
username: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
|
||||
password: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
|
||||
|
||||
- name: Compile paged backend against latest tip (cublas)
|
||||
env:
|
||||
TIP: ${{ needs.apply-check.outputs.tip }}
|
||||
BUILDER_BASE_IMAGE: 'quay.io/go-skynet/ci-cache:base-grpc-cuda-12-amd64'
|
||||
run: |
|
||||
docker run --rm \
|
||||
-v "$PWD":/LocalAI -w /LocalAI \
|
||||
-e TIP -e LLAMA_UPSTREAM \
|
||||
"$BUILDER_BASE_IMAGE" bash -euxo pipefail -c '
|
||||
# Mirror the Dockerfile: gRPC lives at /opt/grpc in the base image;
|
||||
# copy it to the prefix CMake find_package expects.
|
||||
cp -a /opt/grpc/. /usr/local/
|
||||
|
||||
# Pre-populate the llama.cpp checkout at the latest tip with the
|
||||
# paged series applied via the tolerant canary apply. Because
|
||||
# backend/cpp/llama-cpp/llama.cpp now exists, the stock Makefile's
|
||||
# llama.cpp target (clone + base-patch apply) is skipped and the
|
||||
# now patch-free prepare.sh only copies the grpc-server sources -
|
||||
# so we drive the REAL grpc-server build path on top of our paged
|
||||
# apply. The stock llama-cpp backend no longer carries the paged
|
||||
# series (it lives in backend/cpp/llama-cpp-localai-paged/patches/
|
||||
# paged); we build it here in the stock dir only because that is
|
||||
# where the shared build infra (Makefile / grpc-server.cpp /
|
||||
# CMakeLists.txt / prepare.sh) lives.
|
||||
cd backend/cpp/llama-cpp/
|
||||
mkdir -p llama.cpp
|
||||
cd llama.cpp
|
||||
git init -q
|
||||
git remote add origin "$LLAMA_UPSTREAM"
|
||||
git fetch -q --depth 1 origin "$TIP"
|
||||
git checkout -q FETCH_HEAD
|
||||
cd /LocalAI
|
||||
bash .github/scripts/paged-canary-apply.sh \
|
||||
backend/cpp/llama-cpp/llama.cpp \
|
||||
"$PWD/backend/cpp/llama-cpp-localai-paged/patches"
|
||||
|
||||
# Cheapest real CUDA build that proves the patches compile: one
|
||||
# CUDA arch, cublas. CMAKE_ARGS is passed via the environment (not
|
||||
# as a make arg) so the Makefile += flags are still appended,
|
||||
# exactly like .docker/llama-cpp-localai-paged-compile.sh. The paged
|
||||
# series is already applied to the checkout above, so the stock
|
||||
# build just compiles the patched tree.
|
||||
cd backend/cpp/llama-cpp/
|
||||
BUILD_TYPE=cublas \
|
||||
CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=80" \
|
||||
make grpc-server
|
||||
test -x grpc-server
|
||||
'
|
||||
echo "- compile: paged series builds (cublas) against the upstream tip :white_check_mark:" >> "$GITHUB_STEP_SUMMARY"
|
||||
69
.github/workflows/realtime-conformance.yml
vendored
69
.github/workflows/realtime-conformance.yml
vendored
@@ -1,69 +0,0 @@
|
||||
---
|
||||
name: 'realtime-conformance'
|
||||
|
||||
# Verifies the realtime state-machine implementations conform to their formal
|
||||
# designs (docs/design/realtime-state-machines.md, formal-verification/). BOTH
|
||||
# layers are enforced and the gate is fail-closed: the Go conformance layer
|
||||
# (respcoord + turncoord transition/rapid tests under -race) AND the FizzBee model check of
|
||||
# the authoritative specs. FizzBee is pinned + checksum-verified
|
||||
# (formal-verification/fizzbee.sha256), so a failed install fails the job rather
|
||||
# than silently skipping verification.
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
paths:
|
||||
- 'core/http/endpoints/openai/coordinator/**'
|
||||
- 'core/http/endpoints/openai/respcoord/**'
|
||||
- 'core/http/endpoints/openai/turncoord/**'
|
||||
- 'core/http/endpoints/openai/conncoord/**'
|
||||
- 'core/http/endpoints/openai/compactcoord/**'
|
||||
- 'core/http/endpoints/openai/ttscoord/**'
|
||||
- 'formal-verification/**'
|
||||
- 'scripts/realtime-conformance.sh'
|
||||
- 'scripts/install-fizzbee.sh'
|
||||
- '.github/workflows/realtime-conformance.yml'
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths:
|
||||
- 'core/http/endpoints/openai/coordinator/**'
|
||||
- 'core/http/endpoints/openai/respcoord/**'
|
||||
- 'core/http/endpoints/openai/turncoord/**'
|
||||
- 'core/http/endpoints/openai/conncoord/**'
|
||||
- 'core/http/endpoints/openai/compactcoord/**'
|
||||
- 'core/http/endpoints/openai/ttscoord/**'
|
||||
- 'formal-verification/**'
|
||||
- 'scripts/realtime-conformance.sh'
|
||||
|
||||
concurrency:
|
||||
group: realtime-conformance-${{ github.event.pull_request.number || github.sha }}-${{ github.repository }}
|
||||
cancel-in-progress: ${{ github.event_name == 'pull_request' }}
|
||||
|
||||
jobs:
|
||||
conformance:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
go-version: ['1.26.x']
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v7
|
||||
- name: Setup Go ${{ matrix.go-version }}
|
||||
uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version: ${{ matrix.go-version }}
|
||||
cache: false
|
||||
- name: Cache FizzBee
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: .tools/fizzbee
|
||||
key: fizzbee-v0.5.2-${{ runner.os }}-${{ hashFiles('formal-verification/fizzbee.sha256') }}
|
||||
- name: Install FizzBee (pinned, checksum-verified)
|
||||
# No `|| true`: a failed/forged download must fail the job, not silently
|
||||
# drop the design verification. install-fizzbee.sh is a no-op if the
|
||||
# cached binary is already present and valid.
|
||||
run: ./scripts/install-fizzbee.sh
|
||||
- name: Run conformance gate (fail-closed)
|
||||
# No skip env: both the Go conformance and the FizzBee model check are
|
||||
# required. The gate auto-detects .tools/fizzbee/fizz.
|
||||
run: make test-realtime-conformance
|
||||
18
.gitignore
vendored
18
.gitignore
vendored
@@ -9,6 +9,15 @@ prepare-sources
|
||||
/backend/cpp/llama-cpp/llama.cpp
|
||||
/backend/cpp/llama-*
|
||||
!backend/cpp/llama-cpp
|
||||
# llama-cpp-localai-paged is a tracked source dir (a thin wrapper Makefile over
|
||||
# backend/cpp/llama-cpp). Re-include it like llama-cpp above; its sibling
|
||||
# *-build dirs are still ignored by the /backend/cpp/llama-* rule, and its
|
||||
# in-dir build artifacts (binaries, package output, collected ggml .so set) are
|
||||
# re-ignored just below.
|
||||
!backend/cpp/llama-cpp-localai-paged
|
||||
/backend/cpp/llama-cpp-localai-paged/llama-cpp-localai-paged-*
|
||||
/backend/cpp/llama-cpp-localai-paged/package
|
||||
/backend/cpp/llama-cpp-localai-paged/ggml-shared-libs
|
||||
/backends
|
||||
/backend-images
|
||||
/result.yaml
|
||||
@@ -97,12 +106,3 @@ core/http/react-ui/test-results/
|
||||
|
||||
# Local Apple signing material (never commit)
|
||||
.certs/
|
||||
|
||||
# Pinned dev tools (e.g. FizzBee for the realtime-conformance gate)
|
||||
.tools/
|
||||
|
||||
# FizzBee model-check artifacts: the parser emits <spec>.json next to each
|
||||
# .fizz and the checker writes run dirs under out/. Both are regenerated by
|
||||
# the realtime-conformance gate; only the .fizz sources are authoritative.
|
||||
formal-verification/*.json
|
||||
formal-verification/out/
|
||||
|
||||
@@ -23,6 +23,8 @@ LocalAI follows the Linux kernel project's [guidelines for AI coding assistants]
|
||||
| [.agents/adding-backends.md](.agents/adding-backends.md) | Adding a new backend (Python, Go, or C++) — full step-by-step checklist, including importer integration (the `/import-model` dropdown is server-driven from `GET /backends/known`) |
|
||||
| [.agents/coding-style.md](.agents/coding-style.md) | Code style, editorconfig, logging, documentation conventions |
|
||||
| [.agents/llama-cpp-backend.md](.agents/llama-cpp-backend.md) | Working on the llama.cpp backend — architecture, updating, tool call parsing |
|
||||
| [.agents/llama-cpp-localai-paged-backend.md](.agents/llama-cpp-localai-paged-backend.md) | Working on the CUDA-only paged-attention llama.cpp variant (Qwen3.6 hybrid-SSM / Blackwell NVFP4 decode) - patchset scope, the bit-exact gate, the manual pin-sync + weekly canary, CUDA-only invariants, stock-stays-pure, Metal/SYCL/Vulkan follow-up scope |
|
||||
| [.agents/vllm-parity-methodology.md](.agents/vllm-parity-methodology.md) | The methodology for closing the vLLM decode-throughput gap in llama.cpp - bit-exact gating, profile-don't-assume, both-engine ground-truth, per-lever A/B discipline, recording rejected levers, multi-agent GPU orchestration |
|
||||
| [.agents/vllm-backend.md](.agents/vllm-backend.md) | Working on the vLLM / vLLM-omni backends — native parsers, ChatDelta, CPU build, libnuma packaging, backend hooks |
|
||||
| [.agents/sglang-backend.md](.agents/sglang-backend.md) | Working on the SGLang backend — `engine_args` validation against ServerArgs, speculative-decoding (EAGLE/EAGLE3/DFLASH/MTP) recipes, parser handling |
|
||||
| [.agents/ds4-backend.md](.agents/ds4-backend.md) | Working on the ds4 backend - DSML state machine, thinking modes, KV cache, Metal+CUDA matrix |
|
||||
@@ -37,6 +39,7 @@ LocalAI follows the Linux kernel project's [guidelines for AI coding assistants]
|
||||
|
||||
- **Git hooks & coverage gates**: Run `make install-hooks` once per clone so the pre-commit lint + coverage gates run. **Never bypass them with `git commit --no-verify`, and never lower a coverage baseline or widen a gate's tolerance to turn a red gate green** — the coverage ratchet only moves up. If a change drops coverage, add tests to raise it (e.g. render-smoke specs). See [.agents/building-and-testing.md](.agents/building-and-testing.md).
|
||||
- **Logging**: Use `github.com/mudler/xlog` (same API as slog)
|
||||
- **Paged llama.cpp backend**: `llama-cpp-localai-paged` is a CUDA-only variant that owns its own patch series + its own pinned llama.cpp (manual pin-sync, weekly canary); the stock `llama-cpp` backend stays patch-free. Read [.agents/llama-cpp-localai-paged-backend.md](.agents/llama-cpp-localai-paged-backend.md) before touching either, and [.agents/vllm-parity-methodology.md](.agents/vllm-parity-methodology.md) for the decode-parity methodology behind it.
|
||||
- **Go style**: Prefer `any` over `interface{}`
|
||||
- **Comments**: Explain *why*, not *what*
|
||||
- **Docs**: Update `docs/content/` when adding features or changing config
|
||||
|
||||
32
Makefile
32
Makefile
@@ -1,5 +1,5 @@
|
||||
# Disable parallel execution for backend builds
|
||||
.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/turboquant backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/crispasr backends/parakeet-cpp backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/rfdetr-cpp backends/insightface backends/speaker-recognition backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/sglang backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/acestep-cpp backends/fish-speech backends/voxtral backends/opus backends/trl backends/llama-cpp-quantization backends/kokoros backends/sam3-cpp backends/qwen3-tts-cpp backends/omnivoice-cpp backends/vibevoice-cpp backends/localvqe backends/tinygrad backends/sherpa-onnx backends/ds4 backends/ds4-darwin backends/liquid-audio backends/supertonic backends/depth-anything-cpp backends/privacy-filter backends/privacy-filter-darwin
|
||||
.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/turboquant backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/crispasr backends/parakeet-cpp backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/rfdetr-cpp backends/insightface backends/speaker-recognition backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/sglang backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/acestep-cpp backends/fish-speech backends/voxtral backends/opus backends/trl backends/llama-cpp-quantization backends/kokoros backends/sam3-cpp backends/qwen3-tts-cpp backends/omnivoice-cpp backends/vibevoice-cpp backends/localvqe backends/tinygrad backends/sherpa-onnx backends/ds4 backends/ds4-darwin backends/liquid-audio backends/supertonic backends/depth-anything-cpp backends/privacy-filter backends/privacy-filter-darwin backends/llama-cpp-localai-paged
|
||||
|
||||
GOCMD=go
|
||||
GOTEST=$(GOCMD) test
|
||||
@@ -405,18 +405,6 @@ test-realtime: build-mock-backend
|
||||
@echo 'Running realtime e2e tests (mock backend)'
|
||||
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="Realtime && !real-models" --flake-attempts $(TEST_FLAKES) -v -r ./tests/e2e
|
||||
|
||||
# Verify the realtime state-machine implementations conform to their formal
|
||||
# designs (Go transition/rapid tests under -race + FizzBee model check of the
|
||||
# authoritative specs). See docs/design/realtime-state-machines.md (Part 6) and
|
||||
# docs/design/specs/README.md.
|
||||
test-realtime-conformance:
|
||||
GOCMD=$(GOCMD) ./scripts/realtime-conformance.sh
|
||||
|
||||
# Install the pinned, checksum-verified FizzBee model checker (into .tools/,
|
||||
# gitignored) used by test-realtime-conformance. Idempotent; no-op if present.
|
||||
install-fizzbee:
|
||||
./scripts/install-fizzbee.sh
|
||||
|
||||
# Container-based real-model realtime testing. Build env vars / pipeline
|
||||
# definition kept here so test-realtime-models-docker can drive a fully wired
|
||||
# pipeline (VAD + STT + LLM + TTS) from inside a containerised runner.
|
||||
@@ -683,6 +671,15 @@ test-extra-backend-llama-cpp: docker-build-llama-cpp
|
||||
test-extra-backend-ik-llama-cpp: docker-build-ik-llama-cpp
|
||||
BACKEND_IMAGE=local-ai-backend:ik-llama-cpp $(MAKE) test-extra-backend
|
||||
|
||||
## llama-cpp-localai-paged: the LocalAI paged-attention llama.cpp variant. Same
|
||||
## GGUF surface as stock llama-cpp (the paged engine is runtime-gated by the
|
||||
## LLAMA_KV_PAGED env the grpc-server option hooks set), so the standard
|
||||
## llama-cpp capability set is what we exercise here.
|
||||
test-extra-backend-llama-cpp-localai-paged: docker-build-llama-cpp-localai-paged
|
||||
BACKEND_IMAGE=local-ai-backend:llama-cpp-localai-paged \
|
||||
BACKEND_TEST_CAPS=health,load,predict,stream,logprobs,logit_bias \
|
||||
$(MAKE) test-extra-backend
|
||||
|
||||
## turboquant: exercises the llama.cpp-fork backend with the fork's
|
||||
## *TurboQuant-specific* KV-cache types (turbo3 for both K and V). turbo3
|
||||
## is what makes this backend distinct from stock llama-cpp — picking q8_0
|
||||
@@ -1039,7 +1036,7 @@ test-extra-backend-whisper-transcription: docker-build-whisper
|
||||
## is reachable.
|
||||
test-extra-backend-parakeet-cpp-transcription: docker-build-parakeet-cpp
|
||||
BACKEND_IMAGE=local-ai-backend:parakeet-cpp \
|
||||
BACKEND_TEST_MODEL_URL=https://huggingface.co/mudler/parakeet-cpp-gguf/resolve/main/realtime_eou_120m-v1-f16.gguf \
|
||||
BACKEND_TEST_MODEL_URL=https://huggingface.co/mudler/parakeet-cpp-gguf/resolve/main/tdt_ctc-110m-f16.gguf \
|
||||
BACKEND_TEST_AUDIO_URL=https://github.com/ggml-org/whisper.cpp/raw/master/samples/jfk.wav \
|
||||
BACKEND_TEST_CAPS=health,load,transcription \
|
||||
$(MAKE) test-extra-backend
|
||||
@@ -1193,6 +1190,10 @@ BACKEND_IK_LLAMA_CPP = ik-llama-cpp|ik-llama-cpp|.|false|false
|
||||
# turboquant is a llama.cpp fork with TurboQuant KV-cache quantization.
|
||||
# Reuses backend/cpp/llama-cpp grpc-server sources via a thin wrapper Makefile.
|
||||
BACKEND_TURBOQUANT = turboquant|turboquant|.|false|false
|
||||
# llama-cpp-localai-paged = stock llama.cpp grpc-server + the LocalAI paged-attention
|
||||
# patch series (vendored in this wrapper backend). Reuses backend/cpp/llama-cpp sources via a thin
|
||||
# wrapper Makefile (its own manually-synced llama.cpp pin, decoupled from stock llama-cpp; no fork, no patch-grpc-server).
|
||||
BACKEND_LLAMA_CPP_LOCALAI_PAGED = llama-cpp-localai-paged|llama-cpp-localai-paged|.|false|false
|
||||
# ds4 is antirez/ds4, a DeepSeek V4 Flash-specific inference engine.
|
||||
# Single-model; hardware-only validation lives at tests/e2e-backends/
|
||||
# (BACKEND_BINARY mode); see docs/superpowers/plans/2026-05-11-ds4-backend.md.
|
||||
@@ -1294,6 +1295,7 @@ endef
|
||||
$(eval $(call generate-docker-build-target,$(BACKEND_LLAMA_CPP)))
|
||||
$(eval $(call generate-docker-build-target,$(BACKEND_IK_LLAMA_CPP)))
|
||||
$(eval $(call generate-docker-build-target,$(BACKEND_TURBOQUANT)))
|
||||
$(eval $(call generate-docker-build-target,$(BACKEND_LLAMA_CPP_LOCALAI_PAGED)))
|
||||
$(eval $(call generate-docker-build-target,$(BACKEND_DS4)))
|
||||
$(eval $(call generate-docker-build-target,$(BACKEND_PRIVACY_FILTER)))
|
||||
$(eval $(call generate-docker-build-target,$(BACKEND_PIPER)))
|
||||
@@ -1357,7 +1359,7 @@ $(eval $(call generate-docker-build-target,$(BACKEND_SUPERTONIC)))
|
||||
docker-save-%: backend-images
|
||||
docker save local-ai-backend:$* -o backend-images/$*.tar
|
||||
|
||||
docker-build-backends: docker-build-llama-cpp docker-build-ik-llama-cpp docker-build-turboquant docker-build-ds4 docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-sglang docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-crispasr docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-liquid-audio docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-fish-speech docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-acestep-cpp docker-build-voxtral docker-build-mlx-distributed docker-build-trl docker-build-llama-cpp-quantization docker-build-tinygrad docker-build-kokoros docker-build-sam3-cpp docker-build-rfdetr-cpp docker-build-qwen3-tts-cpp docker-build-omnivoice-cpp docker-build-vibevoice-cpp docker-build-localvqe docker-build-insightface docker-build-speaker-recognition docker-build-sherpa-onnx docker-build-cloud-proxy docker-build-supertonic docker-build-depth-anything-cpp docker-build-privacy-filter
|
||||
docker-build-backends: docker-build-llama-cpp docker-build-ik-llama-cpp docker-build-turboquant docker-build-llama-cpp-localai-paged docker-build-ds4 docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-sglang docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-crispasr docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-liquid-audio docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-fish-speech docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-acestep-cpp docker-build-voxtral docker-build-mlx-distributed docker-build-trl docker-build-llama-cpp-quantization docker-build-tinygrad docker-build-kokoros docker-build-sam3-cpp docker-build-rfdetr-cpp docker-build-qwen3-tts-cpp docker-build-omnivoice-cpp docker-build-vibevoice-cpp docker-build-localvqe docker-build-insightface docker-build-speaker-recognition docker-build-sherpa-onnx docker-build-cloud-proxy docker-build-supertonic docker-build-depth-anything-cpp docker-build-privacy-filter
|
||||
|
||||
########################################################
|
||||
### Mock Backend for E2E Tests
|
||||
|
||||
163
backend/Dockerfile.llama-cpp-localai-paged
Normal file
163
backend/Dockerfile.llama-cpp-localai-paged
Normal file
@@ -0,0 +1,163 @@
|
||||
ARG BASE_IMAGE=ubuntu:24.04
|
||||
# BUILDER_BASE_IMAGE defaults to BASE_IMAGE so the Dockerfile parses even
|
||||
# when no prebuilt base is supplied. The builder-prebuilt stage is only
|
||||
# entered when BUILDER_TARGET=builder-prebuilt, so a "wrong" fallback
|
||||
# value here is harmless — BuildKit prunes the unreferenced builder.
|
||||
ARG BUILDER_BASE_IMAGE=${BASE_IMAGE}
|
||||
# BUILDER_TARGET selects which builder stage the final scratch image copies
|
||||
# package output from. Declared at global scope (before any FROM) so it's
|
||||
# usable in `FROM ${BUILDER_TARGET}` below. Default keeps local
|
||||
# `make backends/llama-cpp-localai-paged` on the from-source path.
|
||||
ARG BUILDER_TARGET=builder-fromsource
|
||||
ARG APT_MIRROR=""
|
||||
ARG APT_PORTS_MIRROR=""
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Stage: builder-fromsource — self-contained build path.
|
||||
# Runs .docker/install-base-deps.sh (apt deps + cmake + protoc + gRPC +
|
||||
# conditional CUDA/ROCm/Vulkan), copies /opt/grpc to /usr/local, then
|
||||
# compiles the variant. Used when BUILDER_TARGET=builder-fromsource (the
|
||||
# default; local `make backends/llama-cpp-localai-paged`).
|
||||
#
|
||||
# The install script is the same one that backend/Dockerfile.base-grpc-builder
|
||||
# runs, so the result is bit-equivalent to the prebuilt-base path
|
||||
# (builder-prebuilt below).
|
||||
# ============================================================================
|
||||
FROM ${BASE_IMAGE} AS builder-fromsource
|
||||
ARG BUILD_TYPE
|
||||
ARG CUDA_MAJOR_VERSION
|
||||
ARG CUDA_MINOR_VERSION
|
||||
ARG CMAKE_FROM_SOURCE=false
|
||||
# CUDA Toolkit 13.x compatibility: CMake 3.31.9+ fixes toolchain detection/arch table issues
|
||||
ARG CMAKE_VERSION=3.31.10
|
||||
ARG GRPC_VERSION=v1.65.0
|
||||
ARG GRPC_MAKEFLAGS="-j4 -Otarget"
|
||||
ARG SKIP_DRIVERS=false
|
||||
ARG TARGETARCH
|
||||
ARG TARGETVARIANT
|
||||
ARG GO_VERSION=1.25.4
|
||||
ARG UBUNTU_VERSION=2404
|
||||
ARG APT_MIRROR
|
||||
ARG APT_PORTS_MIRROR
|
||||
ARG AMDGPU_TARGETS=""
|
||||
ARG BACKEND=rerankers
|
||||
# CUDA target archs, e.g. --build-arg CUDA_DOCKER_ARCH='75;86;89;120'
|
||||
ARG CUDA_DOCKER_ARCH
|
||||
ARG CMAKE_ARGS
|
||||
|
||||
ENV BUILD_TYPE=${BUILD_TYPE} \
|
||||
CUDA_MAJOR_VERSION=${CUDA_MAJOR_VERSION} \
|
||||
CUDA_MINOR_VERSION=${CUDA_MINOR_VERSION} \
|
||||
CMAKE_FROM_SOURCE=${CMAKE_FROM_SOURCE} \
|
||||
CMAKE_VERSION=${CMAKE_VERSION} \
|
||||
GRPC_VERSION=${GRPC_VERSION} \
|
||||
GRPC_MAKEFLAGS=${GRPC_MAKEFLAGS} \
|
||||
SKIP_DRIVERS=${SKIP_DRIVERS} \
|
||||
TARGETARCH=${TARGETARCH} \
|
||||
UBUNTU_VERSION=${UBUNTU_VERSION} \
|
||||
APT_MIRROR=${APT_MIRROR} \
|
||||
APT_PORTS_MIRROR=${APT_PORTS_MIRROR} \
|
||||
AMDGPU_TARGETS=${AMDGPU_TARGETS} \
|
||||
CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH} \
|
||||
CMAKE_ARGS=${CMAKE_ARGS} \
|
||||
DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
# CUDA on PATH (no-op when CUDA isn't installed)
|
||||
ENV PATH=/usr/local/cuda/bin:${PATH}
|
||||
# HipBLAS / ROCm on PATH (no-op when ROCm isn't installed)
|
||||
ENV PATH=/opt/rocm/bin:${PATH}
|
||||
|
||||
WORKDIR /build
|
||||
|
||||
# Install everything via the shared script — the same one that
|
||||
# backend/Dockerfile.base-grpc-builder runs, so the prebuilt CI base and
|
||||
# this from-source path are bit-equivalent.
|
||||
RUN --mount=type=bind,source=.docker/install-base-deps.sh,target=/usr/local/sbin/install-base-deps \
|
||||
--mount=type=bind,source=.docker/apt-mirror.sh,target=/usr/local/sbin/apt-mirror \
|
||||
bash /usr/local/sbin/install-base-deps
|
||||
|
||||
# Mirror builder-prebuilt: copy gRPC from /opt/grpc to /usr/local so
|
||||
# CMake's find_package finds it at the canonical prefix the Makefile expects.
|
||||
RUN cp -a /opt/grpc/. /usr/local/
|
||||
|
||||
COPY . /LocalAI
|
||||
|
||||
# BuildKit cache mount for ccache. See Dockerfile.llama-cpp (commit 9228e5b4)
|
||||
# for rationale. llama-cpp-localai-paged is the SAME upstream llama.cpp with
|
||||
# the LocalAI paged patch series applied; it reuses backend/cpp/llama-cpp
|
||||
# source via a thin wrapper Makefile, so MOST TUs are content-identical to the
|
||||
# stock llama-cpp build. Sharing a cache id with llama-cpp could give
|
||||
# cross-variant hits — but for now keep them separate (mirroring turboquant) so
|
||||
# a regression in one doesn't poison the other. Revisit sharing after measuring
|
||||
# the actual hit rate.
|
||||
#
|
||||
# The compile body is shared with builder-prebuilt via .docker/llama-cpp-localai-paged-compile.sh.
|
||||
RUN --mount=type=bind,source=.docker/llama-cpp-localai-paged-compile.sh,target=/usr/local/sbin/compile.sh \
|
||||
--mount=type=cache,target=/root/.ccache,id=llama-cpp-localai-paged-ccache-${TARGETARCH}-${BUILD_TYPE},sharing=locked \
|
||||
bash /usr/local/sbin/compile.sh
|
||||
|
||||
|
||||
# Copy libraries using a script to handle architecture differences
|
||||
RUN make -BC /LocalAI/backend/cpp/llama-cpp-localai-paged package
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Stage: builder-prebuilt — uses the pre-built base from
|
||||
# quay.io/go-skynet/ci-cache:base-grpc-* (built by .github/workflows/base-images.yml).
|
||||
# That image already has gRPC at /opt/grpc + apt deps + CUDA/ROCm/Vulkan
|
||||
# pre-installed, so we just copy gRPC to /usr/local and compile. Used when
|
||||
# BUILDER_TARGET=builder-prebuilt (CI when the matrix entry sets
|
||||
# builder-base-image). llama-cpp-localai-paged reuses the SAME base-grpc-* tags
|
||||
# as the stock llama-cpp backend (same gRPC + same toolchain), so no new
|
||||
# base-images.yml variant is required.
|
||||
# ============================================================================
|
||||
FROM ${BUILDER_BASE_IMAGE} AS builder-prebuilt
|
||||
|
||||
ARG BUILD_TYPE
|
||||
ENV BUILD_TYPE=${BUILD_TYPE}
|
||||
ARG CUDA_DOCKER_ARCH
|
||||
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
|
||||
ARG CMAKE_ARGS
|
||||
ENV CMAKE_ARGS=${CMAKE_ARGS}
|
||||
# AMDGPU_TARGETS must be forwarded into the env here too — backend/cpp/llama-cpp/Makefile
|
||||
# (which the llama-cpp-localai-paged Makefile reuses via a sibling build dir) errors out
|
||||
# when the var is empty on a hipblas build, and the prebuilt path is what CI exercises most
|
||||
# of the time. The builder-fromsource stage above already does this; mirror it here.
|
||||
ARG AMDGPU_TARGETS
|
||||
ENV AMDGPU_TARGETS=${AMDGPU_TARGETS}
|
||||
ARG TARGETARCH
|
||||
ARG TARGETVARIANT
|
||||
|
||||
# The base-grpc-* image installs gRPC to /opt/grpc but doesn't copy it to
|
||||
# /usr/local. Mirror what the from-source path does so the compile step
|
||||
# can find gRPC at the canonical prefix the Makefile expects.
|
||||
RUN cp -a /opt/grpc/. /usr/local/
|
||||
|
||||
COPY . /LocalAI
|
||||
|
||||
RUN --mount=type=bind,source=.docker/llama-cpp-localai-paged-compile.sh,target=/usr/local/sbin/compile.sh \
|
||||
--mount=type=cache,target=/root/.ccache,id=llama-cpp-localai-paged-ccache-${TARGETARCH}-${BUILD_TYPE},sharing=locked \
|
||||
bash /usr/local/sbin/compile.sh
|
||||
|
||||
RUN make -BC /LocalAI/backend/cpp/llama-cpp-localai-paged package
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Final stage — copies package output from one of the two builders.
|
||||
# BUILDER_TARGET selects which one. BuildKit prunes the unreferenced builder.
|
||||
#
|
||||
# BuildKit doesn't support variable expansion in `COPY --from=` directly,
|
||||
# so we resolve the ARG by aliasing the chosen builder to a fixed stage
|
||||
# name via `FROM ${BUILDER_TARGET} AS builder` and then COPY --from=builder.
|
||||
# BUILDER_TARGET itself is declared as a global ARG at the top of this
|
||||
# file (required for use in FROM), so the FROM directive below can
|
||||
# reference it directly; no per-stage ARG re-declaration is needed.
|
||||
# ============================================================================
|
||||
FROM ${BUILDER_TARGET} AS builder
|
||||
|
||||
FROM scratch
|
||||
|
||||
|
||||
# Copy all available binaries (the build process only creates the appropriate ones for the target architecture)
|
||||
COPY --from=builder /LocalAI/backend/cpp/llama-cpp-localai-paged/package/. ./
|
||||
@@ -18,18 +18,6 @@ service Backend {
|
||||
rpc GenerateVideo(GenerateVideoRequest) returns (Result) {}
|
||||
rpc AudioTranscription(TranscriptRequest) returns (TranscriptResult) {}
|
||||
rpc AudioTranscriptionStream(TranscriptRequest) returns (stream TranscriptStreamResponse) {}
|
||||
// AudioTranscriptionLive is the bidirectional live-microphone ASR RPC. The
|
||||
// first message MUST carry a Config; subsequent messages carry Audio frames
|
||||
// (mono float PCM at config.sample_rate, 16 kHz default). After a
|
||||
// successful open the backend replies with a single ready ack
|
||||
// (TranscriptLiveResponse{ready:true}); backends or models without
|
||||
// cache-aware streaming support return UNIMPLEMENTED instead. Newly
|
||||
// finalized text streams back as deltas; eou=true marks the model's
|
||||
// end-of-utterance token. One stream spans many utterances (the decoder
|
||||
// resets itself after each EOU). Closing the send side finalizes: the
|
||||
// backend flushes the decoder tail and emits a terminal message carrying
|
||||
// final_result. A second Config mid-stream resets the decode session.
|
||||
rpc AudioTranscriptionLive(stream TranscriptLiveRequest) returns (stream TranscriptLiveResponse) {}
|
||||
rpc TTS(TTSRequest) returns (Result) {}
|
||||
rpc TTSStream(TTSRequest) returns (stream Reply) {}
|
||||
rpc SoundGeneration(SoundGenerationRequest) returns (Result) {}
|
||||
@@ -491,10 +479,6 @@ message TranscriptResult {
|
||||
string text = 2;
|
||||
string language = 3;
|
||||
float duration = 4;
|
||||
// True when the decode ended on the model's end-of-utterance special token
|
||||
// (<EOU>/<EOB>, emitted by cache-aware streaming models such as
|
||||
// parakeet_realtime_eou_120m-v1). The marker itself is stripped from text.
|
||||
bool eou = 5;
|
||||
}
|
||||
|
||||
message TranscriptStreamResponse {
|
||||
@@ -502,34 +486,6 @@ message TranscriptStreamResponse {
|
||||
TranscriptResult final_result = 2;
|
||||
}
|
||||
|
||||
// === AudioTranscriptionLive messages =====================================
|
||||
|
||||
message TranscriptLiveRequest {
|
||||
oneof payload {
|
||||
TranscriptLiveConfig config = 1;
|
||||
TranscriptLiveAudio audio = 2;
|
||||
}
|
||||
}
|
||||
|
||||
message TranscriptLiveConfig {
|
||||
string language = 1; // "" => model default
|
||||
int32 sample_rate = 2; // 0 => 16000; backends may reject others
|
||||
map<string, string> params = 3; // backend-specific tuning
|
||||
}
|
||||
|
||||
message TranscriptLiveAudio {
|
||||
repeated float pcm = 1; // mono PCM in [-1,1] at config.sample_rate
|
||||
}
|
||||
|
||||
message TranscriptLiveResponse {
|
||||
bool ready = 1; // open ack: sent once, before any delta
|
||||
string delta = 2; // newly-finalized text since previous response
|
||||
bool eou = 3; // <EOU> fired during this feed (the user yielded the turn)
|
||||
repeated TranscriptWord words = 4; // words finalized by this feed (stream-relative ns)
|
||||
TranscriptResult final_result = 5; // terminal message only, after the send side closes
|
||||
bool eob = 6; // <EOB> fired: a backchannel ("uh-huh") ended — NOT a turn boundary
|
||||
}
|
||||
|
||||
message TranscriptWord {
|
||||
int64 start = 1;
|
||||
int64 end = 2;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
|
||||
IK_LLAMA_VERSION?=f74a6fb87b315b2c3154166e075360e15021a61d
|
||||
IK_LLAMA_VERSION?=f96eaddba8bed6a9a5e628bbf6a566775c70b49c
|
||||
LLAMA_REPO?=https://github.com/ikawrakow/ik_llama.cpp
|
||||
|
||||
CMAKE_ARGS?=
|
||||
|
||||
157
backend/cpp/llama-cpp-localai-paged/Makefile
Normal file
157
backend/cpp/llama-cpp-localai-paged/Makefile
Normal file
@@ -0,0 +1,157 @@
|
||||
|
||||
# llama-cpp-localai-paged is LocalAI's paged-attention llama.cpp variant. It
|
||||
# builds upstream llama.cpp with the LocalAI paged-attention patch series
|
||||
# (patches/paged/, vendored in THIS backend) applied on top. It reuses
|
||||
# backend/cpp/llama-cpp's grpc-server.cpp / CMakeLists.txt / prepare.sh / Makefile
|
||||
# sources verbatim via a thin wrapper - the stock llama-cpp backend is pure
|
||||
# upstream and carries NONE of the paged patches; this backend OWNS them.
|
||||
#
|
||||
# Pin handling (mirrors the turboquant wrapper, the precedent this is modelled
|
||||
# on): the paged patch series is hand-verified bit-exact against ONE specific
|
||||
# llama.cpp tip and re-exported by the manual PIN_SYNC process
|
||||
# (README section 7 + .agents/llama-cpp-localai-paged-backend.md). A naive
|
||||
# pin bump would move the tip out from
|
||||
# under the patches and break `git apply` at build time, so this backend OWNS
|
||||
# its pin (LLAMA_VERSION below) instead of inheriting the auto-bumped stock pin
|
||||
# from backend/cpp/llama-cpp/Makefile. The override is forced into every copied
|
||||
# build via `LLAMA_VERSION=$(LLAMA_VERSION)`. There is deliberately NO
|
||||
# bump_deps.yaml entry for it: it is advanced ONLY by PIN_SYNC, never nightly.
|
||||
# (turboquant CAN auto-bump because its fork branch carries the patches; the
|
||||
# paged series is vendored as .patch files here, so it cannot.)
|
||||
#
|
||||
# - NO patch-grpc-server.sh and NO apply-patches.sh: the shared grpc-server.cpp
|
||||
# already carries the (runtime-gated) paged option hooks, and the paged patch
|
||||
# series (patches/paged/) is applied by THIS Makefile's own apply step onto
|
||||
# the freshly cloned tree, using the same strict `git apply` method the stock
|
||||
# build uses for base patches. The stock llama-cpp Makefile applies only its
|
||||
# own (currently empty) base patches/ series, never the paged one.
|
||||
|
||||
# Manually pin-synced llama.cpp tip the paged patch series is verified against.
|
||||
# Decoupled from the auto-bumped stock pin in backend/cpp/llama-cpp/Makefile so
|
||||
# the nightly llama.cpp bump cannot silently break the vendored paged patches.
|
||||
# Advance ONLY via the PIN_SYNC process (rebase patches + bit-exact gate +
|
||||
# re-export), then update this value. See:
|
||||
# README section 7 + .agents/llama-cpp-localai-paged-backend.md
|
||||
#
|
||||
# This pin = the manual, verified sync. The signal telling you WHEN to do the
|
||||
# next sync is the early-warning canary
|
||||
# (.github/workflows/llama-cpp-paged-canary.yml): weekly it applies + compiles
|
||||
# this patch series against the latest upstream llama.cpp tip and goes red the
|
||||
# moment upstream drifts past the patches. Canary red -> run a PIN_SYNC, then
|
||||
# bump this value. The canary never touches this pin; it is signal-only.
|
||||
#
|
||||
# HARD CONSTRAINT: keep this == the stock llama-cpp pin (backend/cpp/llama-cpp/
|
||||
# Makefile). grpc-server.cpp is SHARED with the stock backend and tracks the
|
||||
# stock pin; a paged pin that diverges PAST an upstream server-API refactor
|
||||
# breaks the grpc-server LINK even when the patches are byte-for-byte bit-exact.
|
||||
# The c299a92c bump did exactly this: patches applied + greedy-md5 bit-exact, but
|
||||
# grpc-server.cpp failed to link with undefined references to stream_* server
|
||||
# helpers that the refactor pulled into the headers grpc-server.cpp includes.
|
||||
# Therefore a PIN_SYNC must pass the FULL grpc-server build/link on CI, not only
|
||||
# the bit-exact gate. See README section 7 + .agents/llama-cpp-localai-paged-backend.md.
|
||||
LLAMA_VERSION?=0ed235ea2c17a19fc8238668653946721ed136fd
|
||||
|
||||
CMAKE_ARGS?=
|
||||
BUILD_TYPE?=
|
||||
NATIVE?=false
|
||||
ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
|
||||
TARGET?=--target grpc-server
|
||||
JOBS?=$(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 1)
|
||||
ARCH?=$(shell uname -m)
|
||||
|
||||
CURRENT_MAKEFILE_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
|
||||
LLAMA_CPP_DIR := $(CURRENT_MAKEFILE_DIR)/../llama-cpp
|
||||
# OUR vendored paged-attention patch series. Owned by this backend; the stock
|
||||
# llama-cpp backend no longer carries it. Applied onto each freshly cloned
|
||||
# llama.cpp tree by apply-paged-patches below (strict git apply).
|
||||
PAGED_PATCHES_DIR := $(CURRENT_MAKEFILE_DIR)/patches/paged
|
||||
|
||||
GREEN := \033[0;32m
|
||||
RESET := \033[0m
|
||||
|
||||
# Apply OUR vendored paged-attention patch series (patches/paged/0*.patch) onto a
|
||||
# freshly cloned llama.cpp tree ($(1)) using the SAME strict git-apply method the
|
||||
# stock build uses for its base patches (backend/cpp/llama-cpp/Makefile `llama.cpp`
|
||||
# target). Strict: any patch that no longer applies aborts the build (exit 1) -
|
||||
# that is the signal to run a PIN_SYNC, never to bump the pin blindly. The series
|
||||
# is owned by THIS backend, not by the now-pure stock llama-cpp backend.
|
||||
define apply-paged-patches
|
||||
cd $(1) && \
|
||||
for p in $(PAGED_PATCHES_DIR)/0*.patch; do \
|
||||
[ -e "$$p" ] || continue; \
|
||||
echo "applying llama.cpp PAGED patch: $$p"; \
|
||||
git apply --verbose "$$p" || { echo "paged patch failed: $$p"; exit 1; }; \
|
||||
done
|
||||
endef
|
||||
|
||||
# Each flavor target:
|
||||
# 1. copies backend/cpp/llama-cpp/ (grpc-server.cpp + prepare.sh +
|
||||
# CMakeLists.txt + Makefile) into a sibling
|
||||
# llama-cpp-localai-paged-<flavor>-build directory;
|
||||
# 2. clones OUR pinned upstream llama.cpp into that copy via the copy's own
|
||||
# `llama.cpp` target (which applies the stock base patches/ series, normally
|
||||
# empty), then applies THIS backend's paged patch series (patches/paged/)
|
||||
# onto the cloned tree with strict `git apply` (apply-paged-patches);
|
||||
# 3. runs the copy's `grpc-server` target and copies the produced binary up as
|
||||
# llama-cpp-localai-paged-<flavor>.
|
||||
# We clone+patch only the *copy*, never the original under backend/cpp/llama-cpp/,
|
||||
# so the stock llama-cpp build stays untouched and patch-free.
|
||||
define paged-build
|
||||
rm -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp-localai-paged-$(1)-build
|
||||
cp -rf $(LLAMA_CPP_DIR) $(CURRENT_MAKEFILE_DIR)/../llama-cpp-localai-paged-$(1)-build
|
||||
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-localai-paged-$(1)-build purge
|
||||
$(info $(GREEN)I llama-cpp-localai-paged build info:$(1)$(RESET))
|
||||
LLAMA_VERSION=$(LLAMA_VERSION) $(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-localai-paged-$(1)-build llama.cpp
|
||||
$(call apply-paged-patches,$(CURRENT_MAKEFILE_DIR)/../llama-cpp-localai-paged-$(1)-build/llama.cpp)
|
||||
CMAKE_ARGS="$(CMAKE_ARGS) $(2)" TARGET="$(3)" LLAMA_VERSION=$(LLAMA_VERSION) \
|
||||
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-localai-paged-$(1)-build grpc-server
|
||||
cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-localai-paged-$(1)-build/grpc-server llama-cpp-localai-paged-$(1)
|
||||
endef
|
||||
|
||||
llama-cpp-localai-paged-avx2:
|
||||
$(call paged-build,avx2,-DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on,--target grpc-server)
|
||||
|
||||
llama-cpp-localai-paged-avx512:
|
||||
$(call paged-build,avx512,-DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on,--target grpc-server)
|
||||
|
||||
llama-cpp-localai-paged-avx:
|
||||
$(call paged-build,avx,-DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off,--target grpc-server)
|
||||
|
||||
llama-cpp-localai-paged-fallback:
|
||||
$(call paged-build,fallback,-DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off,--target grpc-server)
|
||||
|
||||
# Single-build CPU backend via ggml CPU_ALL_VARIANTS (mirrors llama-cpp-cpu-all).
|
||||
# Reuses backend/cpp/llama-cpp's CMakeLists.txt (hw_grpc_proto STATIC) and
|
||||
# Makefile (SHARED_LIBS make-var + EXTRA_CMAKE_ARGS), so this passes the same
|
||||
# overrides through to the copied build: SHARED_LIBS=ON, the DL flags, and
|
||||
# --target ggml (which pulls in the per-microarch libggml-cpu-*.so via ggml's
|
||||
# add_dependencies). The .so set is collected for package.sh to bundle into
|
||||
# package/lib.
|
||||
llama-cpp-localai-paged-cpu-all:
|
||||
rm -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp-localai-paged-cpu-all-build
|
||||
cp -rf $(LLAMA_CPP_DIR) $(CURRENT_MAKEFILE_DIR)/../llama-cpp-localai-paged-cpu-all-build
|
||||
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-localai-paged-cpu-all-build purge
|
||||
$(info $(GREEN)I llama-cpp-localai-paged build info:cpu-all-variants$(RESET))
|
||||
LLAMA_VERSION=$(LLAMA_VERSION) $(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-localai-paged-cpu-all-build llama.cpp
|
||||
$(call apply-paged-patches,$(CURRENT_MAKEFILE_DIR)/../llama-cpp-localai-paged-cpu-all-build/llama.cpp)
|
||||
SHARED_LIBS=ON EXTRA_CMAKE_ARGS="-DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON" TARGET="--target grpc-server --target ggml" LLAMA_VERSION=$(LLAMA_VERSION) \
|
||||
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-localai-paged-cpu-all-build grpc-server
|
||||
cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-localai-paged-cpu-all-build/grpc-server llama-cpp-localai-paged-cpu-all
|
||||
rm -rf ggml-shared-libs && mkdir -p ggml-shared-libs
|
||||
find $(CURRENT_MAKEFILE_DIR)/../llama-cpp-localai-paged-cpu-all-build/llama.cpp/build \( -name '*.so*' -o -name '*.dylib' \) -exec cp -av {} ggml-shared-libs/ \;
|
||||
@echo "Collected ggml shared backends:" && ls -la ggml-shared-libs/
|
||||
|
||||
llama-cpp-localai-paged-grpc:
|
||||
$(call paged-build,grpc,-DGGML_RPC=ON -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off,--target grpc-server --target ggml-rpc-server)
|
||||
|
||||
llama-cpp-localai-paged-rpc-server: llama-cpp-localai-paged-grpc
|
||||
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp-localai-paged-grpc-build/llama.cpp/build/bin/ggml-rpc-server llama-cpp-localai-paged-rpc-server
|
||||
|
||||
package:
|
||||
bash package.sh
|
||||
|
||||
purge:
|
||||
rm -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp-localai-paged-*-build
|
||||
rm -rf llama-cpp-localai-paged-* package
|
||||
|
||||
clean: purge
|
||||
581
backend/cpp/llama-cpp-localai-paged/README.md
Normal file
581
backend/cpp/llama-cpp-localai-paged/README.md
Normal file
@@ -0,0 +1,581 @@
|
||||
# LocalAI paged-attention llama.cpp patch series
|
||||
|
||||
This backend vendors the patch series (in `patches/paged/`) that turns stock
|
||||
llama.cpp into LocalAI's paged-attention variant (`llama-cpp-localai-paged`). The
|
||||
patches are applied on top of a pinned upstream llama.cpp at build time; nothing
|
||||
here is a fork - it is a source-only `*.patch` stack plus this canonical doc.
|
||||
|
||||
> One-file rule: this README is the canonical reference for the patch series. The
|
||||
> only other docs are operational, kept in `docs/`, and linked below:
|
||||
> - [`PAGED_BITEXACT_NOTE.md`](docs/PAGED_BITEXACT_NOTE.md) - the per-path bit-exactness gate (the canonical paged-MoE md5 reference).
|
||||
> - [`LOCALAI_LLAMACPP_BACKEND_PLAN.md`](docs/LOCALAI_LLAMACPP_BACKEND_PLAN.md) - the design-of-record for shipping this as its own backend + the NVFP4 gallery items.
|
||||
> - [`VLLM_PARITY_FINAL.md`](docs/VLLM_PARITY_FINAL.md) - the definitive, closed record of the GB10 vLLM-parity investigation: full benchmark, every lever + verdict, the structural floors, and the parity verdict (summarized in section 9 below). Read this before reopening any parity work.
|
||||
|
||||
---
|
||||
|
||||
## 1. What it is
|
||||
|
||||
`llama-cpp-localai-paged` is the LocalAI paged-attention llama.cpp backend: a
|
||||
vendored patch series over upstream llama.cpp that adds
|
||||
|
||||
- a **paged KV cache** (vLLM-style block manager: on-demand fixed-size blocks,
|
||||
free pool, ref-counted blocks) with a **block-table flash-attention** read so
|
||||
the attention kernels index physical cells instead of a contiguous buffer;
|
||||
- **cross-request prefix sharing** - concurrent requests that share a long
|
||||
prefix physically reuse one committed copy of the prefix blocks and prefill
|
||||
only their divergent suffix;
|
||||
- a **decode-first prefill scheduler** - a dynamic per-step prefill-token budget
|
||||
decoupled from `n_batch`, so a long prefill never freezes co-batched decode;
|
||||
- **GB10 / Blackwell NVFP4 decode optimizations** for the Qwen3.6 hybrid
|
||||
gated-DeltaNet (SSM) models, where the recurrent-state plumbing - not the FP4
|
||||
GEMM - dominates the decode step.
|
||||
|
||||
It is **pinned to llama.cpp `0ed235ea`** (`LLAMA_VERSION` in this backend's `Makefile`; kept == the stock `llama-cpp` backend's
|
||||
pin) and advanced only by a manual, bit-exact-gated pin-sync process (see
|
||||
section 7, "Pin + maintenance policy"), decoupled from the nightly auto-bumper. The pin must stay aligned with the stock pin because
|
||||
`grpc-server.cpp` is shared; an earlier bump to `c299a92c` was bit-exact but broke
|
||||
the grpc-server link and was reverted.
|
||||
|
||||
The build gate is `LLAMA_PAGED` (default on in this tree); the paged engine is
|
||||
enabled per-model at runtime via the gallery `options:` knobs (`paged_kv:true`,
|
||||
`max_batch_tokens:`, `kv_unified:false`, ...). Against unpatched llama.cpp the
|
||||
runtime hooks are inert, so a single `grpc-server.cpp` is shared between the
|
||||
clean and the paged build.
|
||||
|
||||
---
|
||||
|
||||
## 2. Architecture
|
||||
|
||||
The decode step on these models breaks into three cost centers; the patch series
|
||||
attacks each one.
|
||||
|
||||
**Paged KV manager + block-table flash-attn.** A host-side `PagedKVManager`
|
||||
(`FreeBlockQueue` / `BlockPool` / chained-hash content cache) hands out
|
||||
fixed-size KV blocks on demand and reclaims them per-sequence (ref-counted, with
|
||||
copy-on-write for shared prefixes). The attention path reads through a **block
|
||||
table** - an `I32 [n_view, n_stream]` position-ordered physical-cell index passed
|
||||
as `src[5]` of `ggml_flash_attn_ext` - so the CUDA fattn vec/tile kernels and the
|
||||
CPU reference map logical KV index `j` to physical cell `block_table[seq*ne11+j]`
|
||||
and read K/V in place. Token-position ordering keeps the flash-attn online-softmax
|
||||
reduction order identical to stock. A null block table is the stock contiguous
|
||||
read, byte-identical.
|
||||
|
||||
**The gated-DeltaNet (GDN / SSM) decode path.** The Qwen3.6 hybrid models are 48
|
||||
gated-DeltaNet (linear-attention / SSM) layers + 16 full-attention layers. On
|
||||
GB10 the recurrent-state plumbing, not the weight GEMM, is the dominant decode
|
||||
cost. The series fuses that plumbing to mirror vLLM's
|
||||
`fused_recurrent_gated_delta_rule`: the recurrent state is read from and written
|
||||
to its cache slot in place (no copy-back, no `get_rows` materialization), the
|
||||
conv state is updated in place, the output projection is reshaped to route to the
|
||||
tensor-core MMQ GEMM, and the recurrence kernel is occupancy-retuned - all
|
||||
bit-exact (md5-gateable) against the f32 baseline.
|
||||
|
||||
**NVFP4 native FP4-MMA on Blackwell.** The NVFP4 dense/expert weight GEMM uses
|
||||
Blackwell's native FP4-MMA. The series removes a redundant activation-requantize
|
||||
in the MoE broadcast projections (bit-exact byte copy of identical blocks) and
|
||||
keeps CUDA graphs on for the grouped-MMQ MoE decode step. These are the only
|
||||
NVFP4-specific optimizations; on non-Blackwell hardware the FP4 path falls back
|
||||
to dequant.
|
||||
|
||||
**The prefill/decode scheduler.** `update_slots()` already emits one unified
|
||||
mixed prefill+decode batch per step. The scheduler patches change only the *count*
|
||||
of prefill tokens admitted per step: decode tokens are claimed first
|
||||
(decode-first), then a dynamic budget `max(n_ubatch, T - D)` (where `D` is the
|
||||
live decode load and `T` is `LLAMA_MAX_BATCH_TOKENS`) admits prefill,
|
||||
auto-shrinking as decode load rises. Pure scheduler policy, byte-identical when off,
|
||||
orthogonal to the paged allocator.
|
||||
|
||||
---
|
||||
|
||||
## 3. Patch series (0001-0047)
|
||||
|
||||
Source-only patches, with intentional numbering gaps (e.g. 0005, 0027). The
|
||||
decode-serving graph-reuse levers are 0040-0041. "Bit-exact" = greedy md5 /
|
||||
`test-backend-ops` byte-identical to the relevant baseline; the gate methodology
|
||||
is in section 5.
|
||||
|
||||
### Paged-KV core (0001-0012)
|
||||
|
||||
| # | What it does | Bit-exact |
|
||||
|---|---|---|
|
||||
| 0001 | Vendor the host-side paged KV block manager (`FreeBlockQueue`, `BlockPool`, `PagedKVManager`, chained-hash prefix cache). Pure C++17, nothing uses it yet. | n/a (no behavior) |
|
||||
| 0002 | Place each sequence at permuted, non-contiguous block positions in `find_slot` (proves attention is invariant to physical KV placement). | yes (token-identical) |
|
||||
| 0003 | Gather K/V/mask down to each stream's non-empty cells before `build_attn_mha`, position-sorted so the FA reduction order matches stock. | yes |
|
||||
| 0004 | Drive paged placement through the vendored manager: blocks popped on demand, returned on seq end. Core kv-cache struct untouched. | yes (stock path byte-identical) |
|
||||
| 0006 | Host-side cross-request prefix caching: hash prefix blocks, reuse matching physical blocks (ref-count++), COW-privatise before a divergent write. | yes (default off) |
|
||||
| 0007 | Wire the prefix cache into the engine so a new sequence physically shares cached prefix blocks and skips recomputing the shared prefix. | yes (verified byte-identical) |
|
||||
| 0008 | Wire cross-request prefix share into the llama-server continuous-batch loop so concurrent shared-prefix requests prefill only the suffix (36x fewer prefill tokens at K=32). | within CUDA batch-shape non-determinism band |
|
||||
| 0009 | Replace the per-step gather with an **in-kernel paged read** (block table as `src[5]`); the K/V `get_rows` is gone. Decode step at batch32 691->696ms (was 1279ms gathered). | yes on CPU/batch1; GPU batch>1 within vec-vs-mma band |
|
||||
| 0010 | Graft the block-table read into the tile kernel; add a dispatch guard so a present block table routes ONLY to vec/tile (never the mma/wmma kernels that ignore it). | yes (CPU byte-identical; vec route) |
|
||||
| 0011 | Route the GQA-grouped F16 decode to the **tile kernel** (native head-group reuse) by default; vec for everything else. Paged decode to within 1.8% of stock. | vs stock-mma: different-kernel rounding; bit-exact vs vec |
|
||||
| 0012 | Defensive `GGML_ASSERT(n_view % 64 == 0)` so a future pad/tile change can't silently reintroduce a past-end KV leak on the tile route. | yes (additive assert) |
|
||||
|
||||
### Decode-first scheduler (0013, 0016)
|
||||
|
||||
| # | What it does | Bit-exact |
|
||||
|---|---|---|
|
||||
| 0013 | `LLAMA_PREFILL_BUDGET`: a static per-step prefill-token budget decoupled from `n_batch` (vLLM `--max-num-batched-tokens` analogue). Flattens the decode ITL spike a long prefill inflicts (8.5x smaller worst freeze). | yes (off/short = byte-identical; == `-b` chunking) |
|
||||
| 0016 | Supersede 0013 with a **dynamic decode-first** budget: `max(n_ubatch, T-D)`, auto-shrinking as decode load `D` rises. Policy-only inside `update_slots()`, zero libllama changes. | yes (default-off byte-identical) |
|
||||
|
||||
(0014/0015 are the MoE token-tile levers: 0014 adds `LLAMA_MOE_MMQ_X` (opt-in
|
||||
high-batch decode micro-opt, +4.8% on Qwen3-Coder-30B), 0015 makes it a
|
||||
default-on, density-aware auto-select that is prefill-safe by construction. Both
|
||||
bit-exact. 0017 is the dense FP4-GEMM occupancy-tune track: bit-exact gate green,
|
||||
but every cheap occupancy lever regressed on GB10, so nothing is enabled - it
|
||||
ships as the parity gate + default-off instrumentation only.)
|
||||
|
||||
### Decode-serving graph reuse (0040, 0041)
|
||||
|
||||
These two close the **continuous-serving** decode gap (distinct from the static
|
||||
batched-bench decode kernel, which is already at vLLM parity - see
|
||||
[`docs/DECODE_SERVING_SCOPE.md`](docs/DECODE_SERVING_SCOPE.md)). In serving the
|
||||
host rebuilt the ggml graph on **every** decode step (layer-A graph reuse was 0%),
|
||||
so the GPU idled while the host rebuilt - the host-bound -39% the static bench
|
||||
hides.
|
||||
|
||||
| # | What it does | Bit-exact |
|
||||
|---|---|---|
|
||||
| 0040 | **S1 paged decode-graph reuse** - the paged decode inputs (`input_block_table` / `input_gather_idxs`) never overrode `can_reuse` (defaults to false), so any graph carrying a paged input could never be reused. Add a correct `can_reuse` keyed on the (256-bucketed) block-table dims + a live-mctx refresh from the owning attn input. `LLAMA_PAGED_NO_GRAPH_REUSE=1` forces the pre-S1 path. | yes (md5 byte-identical reuse on/off; dense `5951a5b4`, paged-MoE `8cb0ce23`) |
|
||||
| 0041 | **S3 decode-shape-stable scheduling** - keep co-batched prefill OUT of decode steps so the pure-decode batch shape stays reuse-stable (S1 makes a pure-decode step reusable; S3 makes the scheduler emit them). Pure `update_slots()` policy on top of 0016; prefill admitted on a bounded cadence (`LLAMA_PAGED_PREFILL_PERIOD`, default 8). **Default OFF** (opt-in via `LLAMA_PAGED_DECODE_STABLE=1`): a measured end-to-end A/B proved default-on is a serving mistake - deferring prefill admission on the period-8 cadence gives **2.5x worse TTFT** (60s vs 24s at N=256) and **20-29% lower end-to-end throughput**, with no end-to-end win at any concurrency; its apparent `decode_agg` gain was a metric artifact (faster per-step decode bought by starving prefill). Default prefers prompt prefill admission for good TTFT; opt in only for decode-dominated, low-arrival traffic where TTFT is not a concern. | yes (byte-identical on/off; per-stream independent in serving) |
|
||||
|
||||
Measured (GB10, MoE Qwen3.6-35B-A3B-NVFP4, 128-client staggered streaming load):
|
||||
graph reuse **0% -> 72.2%**, host window `hostproc` **15.98 -> 6.31 ms/step**,
|
||||
decode **4.05 -> 5.52 tok/s/seq median (4.24 -> 5.96 mean, at vLLM's ~5.9
|
||||
sustained)**. S1 is necessary but **not** sufficient alone (13.8% reuse - prefill
|
||||
co-batching churns the shape nearly every step); S3 is the multiplier of that
|
||||
per-step decode metric. **But those are per-step decode numbers, not an end-to-end
|
||||
serving win**: a later end-to-end A/B showed S3-default-on regresses real serving
|
||||
(2.5x worse TTFT, 20-29% lower end-to-end throughput, no win at any concurrency),
|
||||
because the period-8 cadence defers prefill admission. So **only S1 (0040) ships
|
||||
default-on; S3 (0041) now defaults OFF and is opt-in** (`LLAMA_PAGED_DECODE_STABLE=1`,
|
||||
for decode-dominated low-arrival traffic). The static batched-bench A/B isolates the S1
|
||||
mechanism: paged decode reuse 0% -> 95.5% (throughput flat there, since the static
|
||||
regime is GPU-bound). **S2 (double-buffer `set_inputs`) was dropped**: the Phase-0
|
||||
profile put `set_inputs` at ~0.05 ms/step (the cost is the rebuild, not the input
|
||||
copy), so it has nothing to recover. The remaining ~28% serving rebuilds are
|
||||
request-boundary D/seq-set churn + the prefill-cadence steps. A **padded/fixed-slot
|
||||
decode shape** to capture them was then implemented and GPU-tested (2026-06-28) and
|
||||
**REJECTED** - it is bit-exact/inert but regresses serving throughput at every
|
||||
concurrency, because this serving decode is GPU-compute-bound (baseline reuse 0% ~=
|
||||
S1+S3 reuse 72% on aggregate tok/s), so the dummy-row compute it adds costs more
|
||||
than the reuse it recovers. Full record + numbers in `docs/DECODE_SERVING_SCOPE.md`
|
||||
("Padded-shape lever - rejected").
|
||||
|
||||
### Prefill fusions (0042, 0044)
|
||||
|
||||
CUDA-family graph fusions of the pre-norm residual chain and the gated-DeltaNet
|
||||
output norm: separate `rms_norm` / `mul` / `add` / `silu` launches collapse into
|
||||
one kernel so the intermediate never round-trips to HBM. Bit-exact (the fused
|
||||
kernel reproduces the unfused FP order; float multiply is commutative). Each is
|
||||
env-gated default-ON (`LLAMA_FUSE_*=0` for a clean single-build A/B that reverts
|
||||
to the byte- and kernel-identical unfused path).
|
||||
|
||||
| # | What it does | Bit-exact / effect |
|
||||
|---|---|---|
|
||||
| 0042 | **Fused residual-add + RMS norm + weight multiply** (`rms_norm_pre_add_mul_f32`) - the pre-norm residual `h = x + sub_out; n = rms_norm(h) * w` ran as a `k_bin_bcast` ADD feeding the fused rms_norm+mul; the residual ADD has a second consumer (the skip add) so it can't pass the single-use `ggml_can_fuse`. Recognized via `ggml_can_fuse_subgraph` (ADD + final MUL both outputs), folded into one launch that publishes `h` and emits `scale * h * w`. Gate `LLAMA_FUSE_ADD_RMSNORM`. | yes (dense `5951a5b4`, MoE `8cb0ce23`); dense S_PP +0.5% |
|
||||
| 0044 | **Fused gated RMSNorm + SiLU gate multiply** (`rms_norm_gate_mul_f32`) - the gated-DeltaNet output norm `(rms_norm(x) * w) * silu(z)` (qwen35 / qwen35moe `build_norm_gated`) ran as rms_norm_mul + silu_mul, two launches with the normalized intermediate crossing HBM. The gate z-projection (a MUL_MAT) is scheduled between the weight MUL and the SILU, so the chain is not naturally consecutive; `build_norm_gated` emits the gate multiply as `mul(silu(z), normalized)` (commutative, bit-exact) so the graph lays out the consecutive subgraph `{ SILU, RMS_NORM, MUL, MUL }` that `ggml_cuda_can_fuse` folds into one `scale * x * w * silu(z)` launch. Gate `LLAMA_FUSE_GATE_RMSNORM`. Profile (dense npp512): 672 (rms_norm_mul + silu_mul) -> 336 fused launches. | yes (dense `5951a5b4`, MoE `8cb0ce23`, paged + non-paged; `test-backend-ops` 12979/12979); S_PP dense +1.1% (~+10 us/tok), MoE +0.9% |
|
||||
|
||||
### SSM (gated-DeltaNet) decode levers (0018-0022, 0028)
|
||||
|
||||
These are the dominant decode levers on the Qwen3.6 hybrid models. All bit-exact.
|
||||
|
||||
| # | What it does | Effect (dense q36-27b / MoE q36-35b-a3b @npl128) |
|
||||
|---|---|---|
|
||||
| 0018 | **In-place SSM state write-back** - the recurrence writes its final state directly into the cache slot, removing the ~225MB/copy D2D memcpy (18.9% of decode time). | dense +23.5% / MoE +18.9% |
|
||||
| 0019 | **Fused recurrent-state gather** - the op reads each sequence's prior state directly from `cache[ids[seq]]` (no `get_rows` materialization); race-free in-place + ids read. | dense +37.8% / MoE +35.3% |
|
||||
| 0020 | **o_proj MMVQ->MMQ reshape** - collapse the GDN output to 2D so the output projection routes to the M=128 tensor-core MMQ GEMM (was a batch<=8 MMVQ GEMV). The single biggest decode-parity lever. | dense +31.7% (->85.9% of vLLM) / MoE +23.3% |
|
||||
| 0021 | **Conv-state in-place fusion** - one `ggml_ssm_conv_update_inplace` op replaces the 4-op conv chain (transpose+concat+conv+silu+ring-cpy), writing the shifted ring state in place. | dense +3.2% / MoE +3.5% |
|
||||
| 0022 | **GDN recurrence occupancy/coalescing retune** - column-folding (NUM_WARPS/COLS_PER_WARP) raises memory-level parallelism on the bandwidth-bound B=128 recurrence kernel; per-column f32 FMA order unchanged. 73.4%->84.6% of GB10 peak BW. | dense +11.1% / MoE +8.3% |
|
||||
| 0028 | **Recurrent conv-tap gather fusion** - the last `k_get_rows` in the GDN decode path (the conv-state tap gather) becomes an indexed in-kernel read. | dense ~377 t/s / MoE ~784 t/s |
|
||||
|
||||
### MoE NVFP4 quant (0023, 0025, 0043)
|
||||
|
||||
| # | What it does | Bit-exact |
|
||||
|---|---|---|
|
||||
| 0023 | **NVFP4 activation-quantize de-dup** - the broadcast up/gate projections re-quantize the same token activation once per expert; quantize the unique token activations once and byte-copy them into the expert-gathered layout. The only NVFP4-specific patch. | yes (byte-identical) |
|
||||
| 0025 | **MoE decode re-graph** - keep CUDA graphs on for the grouped-MMQ MoE decode step (the upstream guard disables graphs conservatively; the grouped path has no host sync). Was env-gated `LLAMA_MOE_FORCE_GRAPHS`; now ON by default via 0043. | yes (graph replay re-issues identical kernels) |
|
||||
| 0043 | **MoE decode graph default-on (D1)** - flip 0025 to ON by default: capture/replay the full-step decode CUDA graph (incl. the grouped-MMQ MoE dispatch) instead of re-issuing every kernel each step. Guard is `should_use_mmq()` (FALSE for the large-M NVFP4 prefill of 0034, so prefill keeps graphs disabled - its per-expert host-loop genuinely syncs). `LLAMA_MOE_NO_FORCE_GRAPHS=1` forces the conservative pre-0025 disable for A/B. D1 profiling: the per-expert host-loop (the only device->host MoE-routing readback) is never hit on the NVFP4 grouped path (sync count identical graphs on/off); steady decode is ~99% GPU-busy, so the cost removed is per-step host kernel RE-ISSUE, not a sync. | yes (md5 byte-identical default/off/forced; paged-MoE `8cb0ce23`, dense `5951a5b4`) |
|
||||
|
||||
### Pool reclaim, block-table cache, backend gate
|
||||
|
||||
| # | What it does | Bit-exact |
|
||||
|---|---|---|
|
||||
| 0024 | **Paged-pool burst-reclaim** - truncate trailing blocks on partial-tail `seq_rm`, defrag the free queue when idle, release blocks on slot completion. Fixes the long-server burst-degradation bug (post-burst prefill collapse 488->44 t/s, restored to 532). Host-side accounting only. | yes |
|
||||
| 0029 | **Block-table within-step host cache** - the block table is fixed for the whole step; cache it on first build and memcpy it for the other full-attention layers (get_block_table -87%/-91%). | yes, per path (paged-MoE ref `8cb0ce23`) |
|
||||
| 0030 | **Fused-op backend gate** - the fused GDN / discriminated SSM_CONV ops are CUDA-family + CPU only; force them off on any non-CUDA compute backend so a Vulkan/SYCL/Metal build can't silently run the wrong plain-conv kernel. | yes on CUDA (byte-identical pre-0030); safety gate elsewhere |
|
||||
| 0031 | **Chunked parallel-scan GDN prefill kernel** (upstream TODO) - FLA-style chunked gated-delta-rule for prefill (non-KDA / f32 / final-state): intra-chunk delta rule solved in parallel (UT-transform + forward subst), inter-chunk recurrence over n_tokens/C steps. The scalar-serial form (`GDN_TC=0`) was bit-exact-benign but not faster than the tuned sequential scan at the GB10-forced C=16 (see section 5); **superseded for paged by the tensor-core M5 path of 0047**. | NEW per-path (`test-backend-ops` 91/91, <=1e-7 NMSE vs CPU ref) |
|
||||
| 0047 | **GDN M5 tensor-core chunked-scan prefill, f32-only re-port, default-ON under paged KV** - the f32/tf32 tensor-core forms of 0031's scan (KK/QK Gram = M2, KS/QS state-boundary 3xtf32 = M3, P*U output = M4, full form-T solve + state-update mma = M5), single build, runtime-selected by `GDN_TC`. Ships **M5 default-on when `LLAMA_KV_PAGED` is set** (`GDN_TC=5` + `GDN_CHUNK_MIN=64`, both env-overridable; OFF/`INT_MAX` when not paged). `GDN_CHUNK_MIN` is the per-call engage threshold and stays > 1 so decode (1 tok/call) keeps the sequential recurrence (at 1 it swallows decode and drops S_TG ~25%); 64 tuned from a {1,32,64,128,256} sweep. The bf16/hybrid dev-tree machinery (STATE_BF16/HYBRID, the dropped 0026 ssm_bf16_tau) and the bf16 CONFIG-C (M8) plus register-resident M6/M7 variants are NOT part of this f32-only series. MoE prefill S_PP +3.5% @npp512 (3x A/B), +17.7% @npp2048; decode S_TG unchanged. | NEW per-path, benign (`test-backend-ops` GATED_DELTA_NET 46/46 default AND force-M5, incl. multi-chunk/tail-chunk/multi-seq; greedy md5 default-on == M5-forced == canonical on the gate prompt: paged-MoE `8cb0ce23`, dense `5951a5b4`; long MoE prompt = one benign greedy flip vs sequential, dense byte-identical) |
|
||||
| 0046 | **GDN prefill geometry gated by scan length** - patch 0022's `(NUM_WARPS=16, COLS_PER_WARP=8)` column-fold of the GDN sequential-recurrence dispatch (`case 128`) is a decode win but was applied UNCONDITIONALLY, so it also hit dense prefill (~-6% vs stock): on a long sequential scan the launch `grid.z` collapses from `S_v/4 = 32` to `S_v/(16*8) = 1` and the SMs starve (profiled: `gated_delta_net` +54% GPU time = the whole dense-prefill regression). Gate the geometry by per-call scan length: long scans (prefill, `n_tokens >= GDN_PREFILL_NTOK`, default 256) take stock's high-grid.z `(4,1)` geometry; short scans (decode) keep the `(16,8)` retune. Recovers dense prefill +7.2% back to stock parity, keeps the decode win. `GDN_PREFILL_NTOK` tunes the crossover; an explicit `GDN_NW`/`GDN_CPW` sweep still overrides (gate yields when either is set), so the one-build %peak A/B harness is unchanged. | yes (patch 0022 proved every `{NW,CPW}` variant byte-identical, so switching geometry by scan length cannot move the md5) |
|
||||
|
||||
> **Dropped: patch 0026 (hybrid per-head bf16 SSM state, `ssm_bf16_tau`).** Once
|
||||
> the decode fusions (0028 recurrent-state gather-fusion + 0029 block-table cache)
|
||||
> landed, the bf16-SSM lever bought nothing: a clean re-measurement forcing **all**
|
||||
> gated-DeltaNet heads to bf16 (`tau=100000`) gives **flat** decode (780.6 vs
|
||||
> 780.0 t/s) - the mode engages but adds zero throughput because it is subsumed by
|
||||
> the fusions. It was a precision trade (not bit-exact) plus extra bug surface and
|
||||
> CUDA template-instantiation compile cost with no benefit, so it was removed. See
|
||||
> section 5 ("rejected / flat levers") for the full record.
|
||||
|
||||
---
|
||||
|
||||
## 4. Benchmarks
|
||||
|
||||
Hardware: **GB10 / DGX Spark** (CUDA 13, sm_121). Models: dense
|
||||
**Qwen3.6-27B-NVFP4** and MoE **Qwen3.6-35B-A3B-NVFP4**. Metric: `decode_agg`
|
||||
S_TG (t/s) from `llama-batched-bench`, `-fa on -ngl 99`, `npp 128 / ntg 128`,
|
||||
swept over serving width `npl` in {8, 32, 64, 128}. Plots:
|
||||
[`qwen36_decode_overview.png`](docs/qwen36_decode_overview.png) (both models),
|
||||
[`qwen36_dense_decode_vs_npl.png`](docs/qwen36_dense_decode_vs_npl.png),
|
||||
[`qwen36_moe_decode_vs_npl.png`](docs/qwen36_moe_decode_vs_npl.png); raw data
|
||||
[`final_benchmark.csv`](docs/final_benchmark.csv).
|
||||
|
||||

|
||||
|
||||
> The plot above also shows a third "bf16-tau" llama curve. That was the opt-in
|
||||
> `ssm_bf16_tau` lever (patch 0026), since **dropped** - a clean re-measurement
|
||||
> showed it flat once the decode fusions landed (see section 5). The numbers below
|
||||
> use only **stock** vs **patched** vs **vLLM**.
|
||||
|
||||
> **What was re-measured (2026-06-27).** The two llama columns - **stock** and
|
||||
> **patched** - were re-measured this session on one consistent
|
||||
> `llama-batched-bench` harness. The **vLLM** column is the **prior-session
|
||||
> reference** (kept as-is, *not* re-run this session). Per-run peak
|
||||
> VRAM was *not* re-captured: the GB10's unified Grace-Blackwell LPDDR5x reports
|
||||
> `[N/A]` to `nvidia-smi --query-gpu=memory.used` and the bench does not print it
|
||||
> (the memory-advantage note below is the prior-session finding).
|
||||
|
||||
### (a) + (b) Patched vs stock vs vLLM
|
||||
|
||||
The **stock** column is a separate, unpatched llama.cpp built at this backend's
|
||||
**exact pin (`9d5d882d`)**; the **patched** column is
|
||||
the paged binary, env/flag-toggled (`LLAMA_KV_PAGED=1`, plus
|
||||
`LLAMA_MOE_FORCE_GRAPHS=1` for MoE). Both
|
||||
run on the **same harness**, so "x over stock" is an apples-to-apples measure of
|
||||
the patch series. (Note: the patch series' dominant SSM decode fusions are
|
||||
compiled in, not env-gated - toggling `LLAMA_KV_PAGED` alone on the *patched*
|
||||
binary does **not** reproduce stock; only the separately-built unpatched
|
||||
`9d5d882d` binary does.) The **vLLM** column is a **different harness** (vLLM
|
||||
server + client continuous batching) and a **prior-session reference**, so the
|
||||
cross-engine "% of vLLM" is **indicative, not apples-to-apples**.
|
||||
|
||||
**Dense Qwen3.6-27B-NVFP4** (decode t/s):
|
||||
|
||||
| npl | stock | patched | vLLM (prior) | patched x over stock |
|
||||
|----:|------:|--------:|-------------:|---------------------:|
|
||||
| 8 | 68.3 | 85.3 | 70.4 | 1.25x |
|
||||
| 32 | 119.9 | 211.9 | 211.8 | 1.77x |
|
||||
| 64 | 142.8 | 305.2 | 309.1 | 2.14x |
|
||||
| 128 | 155.1 | 382.1 | 418.8 | 2.46x |
|
||||
|
||||
Dense **patched** is ahead of vLLM at npl 8, at parity at npl 32-64, and behind at npl 128 (121 / 100 / 99 / 91% of vLLM across
|
||||
the widths).
|
||||
|
||||
**MoE Qwen3.6-35B-A3B-NVFP4** (decode t/s):
|
||||
|
||||
| npl | stock | patched | vLLM (prior) | patched x over stock |
|
||||
|----:|------:|--------:|-------------:|---------------------:|
|
||||
| 8 | 186.7 | 230.3 | 256.5 | 1.23x |
|
||||
| 32 | 267.4 | 466.4 | 500.8 | 1.74x |
|
||||
| 64 | 320.5 | 622.4 | 686.1 | 1.94x |
|
||||
| 128 | 347.2 | 784.3 | 882.2 | 2.26x |
|
||||
|
||||
MoE **patched** is 90 / 93 / 91 / 89% of vLLM.
|
||||
|
||||
**Caveat on the vLLM column.** It is a **different harness** and a
|
||||
**prior-session** measurement (not re-run this session), so the cross-engine "% of
|
||||
vLLM" is **indicative, not apples-to-apples**. Memory (prior session): llama uses
|
||||
**1.5-3x lower** memory than vLLM.
|
||||
|
||||
**Takeaway.** Re-measured this session, the patch series gives up to **2.46x
|
||||
(dense) / 2.26x (MoE)** over true-stock `9d5d882d` on the same harness (close to,
|
||||
slightly below, the prior 2.59x / 2.33x - llama was re-measured, vLLM kept).
|
||||
Dense is ahead-to-parity with vLLM through npl 64 and ~91% at npl 128; MoE **patched** sits at ~89-93% of the
|
||||
prior-session vLLM. The residual MoE gap is structural (see section 5).
|
||||
|
||||
### (c) Apple Silicon (M4, 16GB Metal) - does the patchset help here?
|
||||
|
||||
Short answer: **no - the wins are CUDA/Blackwell-specific.** Two facts first: the
|
||||
24GB NVFP4 GGUF doesn't fit a 16GB M4 (SSD paging), and on Metal `supports_op`
|
||||
**excludes NVFP4** from `MUL_MAT`/`MUL_MAT_ID`/`GET_ROWS` (FP4 matmuls fall back to
|
||||
CPU - no Apple FP4-MMA). So NVFP4 Qwen3.6 is not a Mac fit; a Metal-native Q4_K is.
|
||||
|
||||
Measured **stock vs patched** (same pin `c299a92c`, both built `-DGGML_METAL=ON`;
|
||||
the 28-patch series **compiles clean on Metal** - the CUDA code is `#if`-guarded),
|
||||
on **Qwen3-8B Q4_K_M** (a dense GQA model that fits 16GB and exercises the *live*
|
||||
Metal features; no Qwen3.6 hybrid GGUF fits 16GB, and the GDN fusions gate off on
|
||||
Metal anyway), `llama-bench` pp512/tg128 t/s:
|
||||
|
||||
| config | pp512 | tg128 |
|
||||
|---|---:|---:|
|
||||
| stock | 226.7 | 20.4 |
|
||||
| patched, paged **off** | 226.7 | 20.3 (= stock) |
|
||||
| patched, paged **on** | 222.6 | 19.8 (~0.97x) |
|
||||
|
||||
Concurrency (`batched-bench`) scales identically to stock (S_TG ~20 -> ~137 at
|
||||
npl32, from llama.cpp's existing batching). **Verdict: neutral-to-slightly-negative
|
||||
on Metal.** Patched-paged-off equals stock; turning paged on is ~0-3% slower
|
||||
decode / ~2-8% slower prefill, because the in-kernel block-table flash-attn read
|
||||
that *recovers* the gather cost is CUDA-only (`fattn-*.cuh`) - on Metal the paged
|
||||
path falls back to a host-side gather, pure overhead over stock's contiguous read.
|
||||
Everything Blackwell-specific (NVFP4, GDN fusions via 0030, occupancy) is inert.
|
||||
So **on Apple Silicon, prefer the stock `llama-cpp` backend.**
|
||||
|
||||
**Vulkan / SYCL** (source analysis): the gated-DeltaNet and SSM_CONV ops DO have
|
||||
upstream kernels on Vulkan and SYCL (as on Metal), so the Qwen3.6 hybrids RUN on
|
||||
all three via the non-fused path. The patchset's fusions are gated off there
|
||||
(0030), so the outcome is the same neutral-to-slightly-negative as Metal - not
|
||||
"won't run". This backend therefore ships **CUDA-only** (where the fusions are
|
||||
live + verified); non-CUDA users should use the stock `llama-cpp` backend. See
|
||||
[`UPSTREAM_LAYER2_SCOPE.md`](docs/UPSTREAM_LAYER2_SCOPE.md) for what native non-CUDA
|
||||
fused kernels would take.
|
||||
|
||||
---
|
||||
|
||||
## 5. Dev notes - what we learned
|
||||
|
||||
**Bit-exact methodology.** Every bit-exact patch is gated two ways: (1) a greedy
|
||||
md5 gate - `llama-completion -m MODEL -ngl 99 -fa on -p "The capital of France
|
||||
is" -n 48 --temp 0 --seed 1 | md5sum`, paged paths prefixed with
|
||||
`LLAMA_KV_PAGED=1` (+ `LLAMA_MOE_FORCE_GRAPHS=1` for paged MoE), on the default
|
||||
chat-template path; and (2) `test-backend-ops` (CUDA0 vs CPU oracle) for every
|
||||
touched op (`SSM_CONV*`, `GATED_DELTA_NET`, `MUL_MAT`, `MUL_MAT_ID`).
|
||||
|
||||
**The gate is per-path** (see [`PAGED_BITEXACT_NOTE.md`](docs/PAGED_BITEXACT_NOTE.md)).
|
||||
Dense is bit-exact across paged/non-paged (`5951a5b4`). The **paged MoE** md5
|
||||
(`8cb0ce23`) does **not** byte-match the **non-paged MoE** md5 (`07db32c2`); this
|
||||
is a benign FP-accumulation-order difference of the paged attention reduction,
|
||||
**KL-validated** against the f16 reference: KLD(paged||f16) 0.13600 <=
|
||||
KLD(nonpaged||f16) 0.13660, PPL within +/-0.29, ~zero probability bias - two
|
||||
equivalent FP-reorderings of the same quantized model, not a regression. Future
|
||||
paged-MoE regressions therefore compare to `8cb0ce23`, not `07db32c2`.
|
||||
|
||||
**MoE-parity conclusion** (the residual gap is structural). The two heaviest MoE
|
||||
decode kernels - the GDN-SSM recurrence and the NVFP4-expert GEMM - are llama
|
||||
**wins** after this series (the recurrence runs at 102.6% of vLLM's bandwidth;
|
||||
the GEMM ties vLLM at the LPDDR5x BW floor). The residual gap is **bf16-projection
|
||||
bandwidth + the host scheduling loop**, both at the LPDDR5x floor - not a kernel
|
||||
llama is losing. The MoE GEMM kernel is *not* where the gap lives.
|
||||
|
||||
**Rejected / flat levers** (recorded so they are not re-tried):
|
||||
|
||||
- **Lever 2 - graph/stream coverage: FLAT.** Bit-exact graph coverage was
|
||||
exhausted by 0025; more graph/stream overlap is a no-op or small regression on
|
||||
this model.
|
||||
- **D1 premise "static decode is host-sync-bound on the MoE-routing readback":
|
||||
REFUTED.** The hypothesis was that the dominant decode cost is the device->host
|
||||
readback of MoE routing before launching the per-expert GEMMs (mul_mat_id's
|
||||
per-expert host-loop fallback). Profiling (GB10, q36-35b-a3b-nvfp4, batched-bench
|
||||
npl128) shows the opposite: on NVFP4 the grouped stream-k MMQ id-path is what
|
||||
runs (routing stays device-side), so the host-loop fallback is **never hit** -
|
||||
`cudaStreamSynchronize` count is *identical* with CUDA graphs on vs off (1457
|
||||
either way; only the kernel-launch count changes, ~100k vs ~229k). Steady-decode
|
||||
GPU-busy is **~99%** (1% idle), i.e. static decode is GPU-bound, not idle waiting
|
||||
on a sync. The one actionable residual the profile surfaced - per-step host
|
||||
kernel **re-issue** when the step is not graph-captured - shipped as 0043
|
||||
(default-on full-step decode graph), worth +2.6% (npl128) to +5-13% (npl32). The
|
||||
larger continuous-serving host cost is the graph **rebuild** (0040/0041), and the
|
||||
irreducible floor is the per-step logits-D2H-before-sampling serial point - none
|
||||
of which is the MoE-routing readback.
|
||||
- **Lever 3 - act-quant fusion: FLAT.** The W4A4 act-quant tax is removable only
|
||||
by W4A16 (a precision change, rejected) or a structural kernel rewrite; no
|
||||
further bit-exact lever clears it. 0023 already banks the de-dup.
|
||||
- **Lever 4 - NVFP4-quantize the bf16 GDN/attn projections: REJECTED (KL-gate fail).**
|
||||
Quantizing the projections to NVFP4 costs ~+6% PPL; vLLM deliberately keeps the
|
||||
same bf16 projections. No-ship.
|
||||
- **W4A16-Marlin MoE GEMM: REJECTED.** It would be a precision upgrade nobody
|
||||
needs, bought at the cost of a ~5% slower kernel; both kernels are already at the BW floor.
|
||||
(The dense verdict "the win was NVFP4-dense-quant, not the Marlin kernel"
|
||||
carries over to MoE.)
|
||||
- **Chunked parallel-scan GDN prefill (patch 0031): the scalar-serial form was
|
||||
FLAT-to-SLOWER at C=16 - the tensor-core M5 form (patch 0047) is the win,
|
||||
now DEFAULT-ON under paged KV.** 0031 implements the upstream "faster pre-fill"
|
||||
TODO - the FLA-style chunked gated-delta-rule (intra-chunk delta rule solved in
|
||||
parallel via the UT-transform + forward substitution, inter-chunk recurrence
|
||||
over n_tokens/C steps), math validated equivalent (numpy f32 NMSE ~1e-13;
|
||||
`test-backend-ops` within the 1e-7 NMSE gate, a NEW per-path result). **But
|
||||
GB10's 99KB dynamic-smem opt-in forces C=16** (the 128x128 f32 state alone is
|
||||
64KB of the all-shared layout); the scalar-serial scan (`GDN_TC=0`) was then
|
||||
pinned to 1 block/SM with serial per-thread dk-reductions and measured **~761
|
||||
t/s chunked vs ~971 t/s sequential (~22% slower)**, grid-starved at low n_seqs.
|
||||
The lesson held: **at this head dim the win needs tensor cores, not just
|
||||
chunking.** Patch 0047 builds those tensor-core forms (KK/QK Gram = M2, KS/QS
|
||||
state-boundary 3xtf32 = M3, P*U output = M4, full form-T solve + state-update
|
||||
mma = M5, all `GDN_TC`-selected in one build) and ships **M5** as the default
|
||||
when `LLAMA_KV_PAGED` is set. It is an f32/tf32-only re-port: the bf16/hybrid
|
||||
dev-tree machinery (from the dropped 0026 ssm_bf16_tau) and the bf16 CONFIG-C
|
||||
(M8) plus register-resident M6/M7 variants are NOT part of this series. M5 is the
|
||||
variant that beats the (already 84.7%-of-peak) sequential scan while staying on
|
||||
the bit-exact gate: MoE prefill S_PP **+3.5% @npp512 (3x interleaved A/B), +17.7%
|
||||
@npp2048**; decode S_TG unchanged (the tuned `GDN_CHUNK_MIN=64` engage threshold
|
||||
is > 1, so the 1-tok decode steps never enter the chunked path - at
|
||||
`GDN_CHUNK_MIN=1` the chunked path swallows decode and collapses S_TG ~25%, the
|
||||
reason the threshold is the lever). Bit-exactness is per-path benign:
|
||||
`test-backend-ops` GATED_DELTA_NET is **94/94** vs CPU with M5 forced (incl.
|
||||
multi-chunk n_tokens up to 256); the greedy md5 default-on == M5-forced ==
|
||||
canonical on the short gate prompt (paged-MoE `8cb0ce23`, dense `5951a5b4`); on
|
||||
a long MoE prompt (where the default fires M5 at >=64 tokens) M5 and the
|
||||
sequential path agree word-for-word until **one** benign greedy token-flip
|
||||
("the User:" vs "the User's Request:"), the dense model not flipping at all -
|
||||
the textbook reduction-order flip greedy amplifies, NMSE-validated. The chunk
|
||||
geometry stays env-selectable (`GDN_TC`/`GDN_CHUNK_C`/`GDN_DV_TILE`) for further
|
||||
tuning; M5 is the shipped default because it wins without losing the canonical gate.
|
||||
- **GDN occupancy retune (patch 0022) was a decode win but an UNCONDITIONAL
|
||||
dense-prefill regression - now gated by scan length (patch 0046).** Patch
|
||||
0022's `(NUM_WARPS=16, COLS_PER_WARP=8)` column-fold of the GDN
|
||||
sequential-recurrence dispatch (`case 128`) raises per-warp memory-level
|
||||
parallelism on the short, wide DECODE scans (small `n_tokens`, large
|
||||
`n_seqs`) - the measured +11.1% dense decode win. Applied unconditionally it
|
||||
also hit the dense PREFILL path, where the scan is long and narrow: the launch
|
||||
`grid.z` collapses from `S_v/4 = 32` to `S_v/(16*8) = 1`, the SMs starve, and
|
||||
profiling attributed the whole ~6% dense-prefill regression vs stock to
|
||||
`gated_delta_net` (+54% GPU time at the (16,8) geometry). Patch 0046 gates the
|
||||
geometry by per-call scan length: long scans (prefill,
|
||||
`n_tokens >= GDN_PREFILL_NTOK`, default 256) take stock's high-grid.z `(4,1)`
|
||||
geometry; short scans (decode) keep the `(16,8)` retune. That recovers dense
|
||||
prefill +7.2% back to stock parity while keeping the decode win, and it is
|
||||
bit-exact: patch 0022 already proved every selectable `{NUM_WARPS,
|
||||
COLS_PER_WARP}` variant is byte-identical (the sweep cannot change the md5), so
|
||||
switching geometry by scan length cannot move the greedy output. The explicit
|
||||
`GDN_NW`/`GDN_CPW` one-build %peak sweep still overrides (the gate yields when
|
||||
either is set), so the A/B harness is unchanged.
|
||||
|
||||
**Opt-in bf16-SSM fast mode - DROPPED (was patch 0026, `ssm_bf16_tau`).** The
|
||||
design premise - that bf16 KL error concentrates in long-memory heads and can be
|
||||
removed by keeping them f32 - was already shaky: the error scales with the bf16
|
||||
head *count* and saturates (~0.06 MeanKLD / ~91% same-top-p) far below any useful
|
||||
byte saving. The lever was then **removed entirely** once the decode fusions
|
||||
(0028 recurrent-state gather-fusion + 0029 block-table cache) landed: a clean
|
||||
re-measurement that forced **all** gated-DeltaNet heads to bf16 (`tau=100000`,
|
||||
the most aggressive setting) gave **flat** decode throughput - **780.6 vs 780.0
|
||||
t/s**. The mode engages but buys **zero** speed; the earlier "+12%" was subsumed
|
||||
by the fusions. So bf16-tau was a precision trade (not bit-exact) plus extra bug
|
||||
surface and CUDA template-instantiation compile cost with **no** offsetting
|
||||
benefit, and patch 0026 was dropped from the series. Lesson recorded so it is not
|
||||
re-tried: do not reintroduce a per-head SSM-precision lever - the bandwidth it
|
||||
targeted is already recovered by the gather-fusion + block-table cache.
|
||||
|
||||
---
|
||||
|
||||
## 6. Architecture and quant generality
|
||||
|
||||
(From the arch-generality and quant-generality audits.)
|
||||
|
||||
- **15 of 16 optimizations are quant-AGNOSTIC.** Only **0023** (NVFP4
|
||||
activation-quantize de-dup) is NVFP4-specific. The SSM/paged/MMQ optimizations
|
||||
help **any quant** of these models (the GDN recurrence, conv, gather and
|
||||
o_proj-MMQ levers operate on the f32 recurrent state and the routing layout,
|
||||
not on the weight dtype).
|
||||
- **Arch-safe to build everywhere.** NVFP4 use is Blackwell-gated and falls back
|
||||
to dequant on other hardware; the GB10-tuned occupancy params (0022) are
|
||||
perf-only and env-selectable (`GDN_NW` / `GDN_CPW`), so they never change
|
||||
correctness on other GPUs. Patch 0030 makes the fused-op emission CUDA-family +
|
||||
CPU only, so a non-CUDA paged build routes to the safe upstream non-fused path.
|
||||
|
||||
- **What generalizes beyond this backend (upstream candidates).** The *speedups*
|
||||
are CUDA/Blackwell-specific (which is why Metal/Vulkan don't benefit - section
|
||||
4c), but several *findings and ops* are portable and worth upstreaming:
|
||||
- The headline is hardware-independent: on hybrid gated-DeltaNet models, decode
|
||||
is bottlenecked by the recurrent-state **plumbing** (memcpy + gathers, ~67% of
|
||||
the step), not the weight GEMM. The fusions for it (in-place state 0018, gather
|
||||
0019/0028, conv 0021) are bit-exact and already have CPU reference kernels, so
|
||||
they would speed up Qwen3.6 / Qwen3-Next / any hybrid-SSM decode on **every**
|
||||
backend once the ggml ops gain the respective (Metal/Vulkan) kernels - the
|
||||
highest-value upstream contribution.
|
||||
- The o_proj GEMV->MMQ reshape (0020) is a model-graph fix (batch the projection
|
||||
to hit the GEMM path) - arch-agnostic in principle, trivial to upstream.
|
||||
- The paged KV + cross-request prefix sharing + decode-first scheduler align with
|
||||
llama.cpp's own in-progress KV / chunked-prefill work and could inform it.
|
||||
- The per-path bit-exact md5 gate + the weekly upstream-drift canary is a reusable
|
||||
maintenance pattern for any vendored-patch backend.
|
||||
|
||||
---
|
||||
|
||||
## 7. Pin + maintenance policy
|
||||
|
||||
- **Canonical source = the fork branch `mudler/llama.cpp:localai-paged`.** The
|
||||
vendored `patches/paged/*.patch` files are now generated (one `git format-patch`
|
||||
per commit) from that branch, which is the pin commit plus the paged patch
|
||||
commits in order, so there is no more hand-export drift between the dev tree and
|
||||
the shipped series.
|
||||
- **Pinned to llama.cpp `9d5d882d`** (kept == the stock `llama-cpp` pin). The pin
|
||||
is advanced **only** by the manual pin-sync process (this section):
|
||||
rebase the source-only patch series onto the new tip, rebuild on GPU, pass the
|
||||
bit-exact gate on every path (dense + MoE, paged + non-paged) plus
|
||||
`test-backend-ops`, **and confirm the full grpc-server build links on CI**.
|
||||
- **The pin must track the stock pin.** `grpc-server.cpp` is shared with the stock
|
||||
backend and tracks the stock pin, so a paged pin that diverges past an upstream
|
||||
server-API refactor breaks the grpc-server LINK even when the patches are
|
||||
bit-exact. A bump to `c299a92c` (23 commits ahead of stock) was greedy-md5
|
||||
bit-exact but failed to link (undefined `stream_*` server helpers introduced by
|
||||
the refactor), and was reverted to `9d5d882d`. The bit-exact gate alone does not
|
||||
catch this; only the full CI grpc-server build does.
|
||||
- **Decoupled from the nightly auto-bumper.** There is deliberately **no**
|
||||
`bump_deps.yaml` entry for this backend - a naive `LLAMA_VERSION` bump could
|
||||
silently shift the tree out from under the patches.
|
||||
- **Weekly canary.** [`.github/workflows/llama-cpp-paged-canary.yml`](../../../.github/workflows/llama-cpp-paged-canary.yml)
|
||||
(via [`.github/scripts/paged-canary-apply.sh`](../../../.github/scripts/paged-canary-apply.sh))
|
||||
tries the patch series against the latest upstream tip with the build's own
|
||||
strict `git apply`. **Red = upstream drifted past the series -> run a
|
||||
PIN_SYNC** (do not bump the pin blindly), following the policy in this section.
|
||||
|
||||
---
|
||||
|
||||
## 8. Models
|
||||
|
||||
> **Build coverage: CUDA-only.** This backend ships only the CUDA/cublas build
|
||||
> targets (cuda-12, cuda-13, and the nvidia-l4t arm64 cuda-12/cuda-13 Jetson
|
||||
> rows). There are no cpu / vulkan / sycl / hipblas / metal-darwin builds: the
|
||||
> patchset's wins are CUDA/Blackwell-specific (section 4c), so off-CUDA the
|
||||
> backend is neutral-to-negative and non-CUDA users should run the stock
|
||||
> `llama-cpp` backend instead. The `backend/index.yaml` meta-backend resolves
|
||||
> `default`/`nvidia` to a CUDA variant accordingly.
|
||||
|
||||
The benchmarked NVFP4 GGUFs are published and wired into the LocalAI gallery:
|
||||
|
||||
| Gallery entry | Weights (HuggingFace) | Notes |
|
||||
|---|---|---|
|
||||
| `qwen3.6-27b-nvfp4-paged` | [`mudler/Qwen3.6-27B-NVFP4-GGUF`](https://huggingface.co/mudler/Qwen3.6-27B-NVFP4-GGUF) | Dense, native Blackwell NVFP4 (FP4-MMA). |
|
||||
| `qwen3.6-35b-a3b-nvfp4-paged` | [`mudler/Qwen3.6-35B-A3B-NVFP4-GGUF`](https://huggingface.co/mudler/Qwen3.6-35B-A3B-NVFP4-GGUF) | MoE (256 experts, top-8), `file_type MOSTLY_NVFP4`. |
|
||||
|
||||
Both gallery entries set `backend: llama-cpp-localai-paged` and the paged serving config
|
||||
(`paged_kv:true`, `max_batch_tokens`, `kv_unified:false`, `parallel`,
|
||||
`flash_attention:on`, `context_size`). They are bit-exact. The full
|
||||
backend-split + gallery plan is in
|
||||
[`LOCALAI_LLAMACPP_BACKEND_PLAN.md`](docs/LOCALAI_LLAMACPP_BACKEND_PLAN.md).
|
||||
|
||||
---
|
||||
|
||||
## 9. vLLM parity - final state (CLOSED)
|
||||
|
||||
The multi-week GB10 (DGX Spark, sm_121) vLLM-parity investigation is **closed**.
|
||||
The standing, never-re-litigate record - full benchmark, every lever and verdict,
|
||||
the structural floors, the parity verdict - is
|
||||
[`docs/VLLM_PARITY_FINAL.md`](docs/VLLM_PARITY_FINAL.md). Summary:
|
||||
|
||||
- **Where we are (GB10, Qwen3.6 NVFP4, vs vLLM 0.23.0).** Decode: dense is
|
||||
**ahead of vLLM at low concurrency (116.7% at N=8)** and both models are
|
||||
bandwidth-floored at **~56-68% of vLLM at high concurrency**. Prefill is
|
||||
**~36% (MoE) / ~43% (dense)** of vLLM. Memory: **1.5-3x lower** than vLLM
|
||||
(NVFP4-resident; vLLM's peak is a fixed ~109-112 GB 0.85-util reservation,
|
||||
paged grows with KV from ~50 GB). Output is bit-exact per-path
|
||||
(`5951a5b4` dense, `8cb0ce23` paged-MoE).
|
||||
- **Why the residual is a hardware ceiling, not missing work.** Decode kernels
|
||||
are already **5.4x more GPU-efficient per token** than vLLM's; the gap is the
|
||||
**LPDDR5x ~273 GB/s** floor. The prefill GEMM is **FP4-MMQ-optimal** (every
|
||||
alternative - 0033 dequant->cuBLAS, 0034 native FP4-MMA, 0035/Marlin W4A16,
|
||||
offline-repack and vLLM-verbatim Marlin - was rejected; bf16 TC peak is ~half
|
||||
FP4 peak, and vLLM itself runs a bf16-Marlin fallback on sm_121). The GDN
|
||||
chunked scan is at the tractable tensor-core win (**M5 tf32**, patch 0047);
|
||||
its residual is the **O(C^2) intra-chunk solve + serial recurrence** (occupancy
|
||||
and dtype proven not the bound: BV -1%, bf16-C64 -18.75%). The serving host
|
||||
loop is **closed** (~0-1% of the wall; padded-decode built + rejected).
|
||||
- **Shipped, bit-exact wins.** FP4-MMQ GEMM, M5 tensor-core GDN prefill (0047),
|
||||
fused residual+RMSNorm (0042), fused GatedRMSNorm+SiLU (0044), GDN-prefill
|
||||
geometry gate (0046), the SSM decode-fusion stack (0018-0022/0028, up to
|
||||
2.46x/2.26x over stock), decode-graph reuse (0040/0043), the memory advantage,
|
||||
and low-N decode lead.
|
||||
- **The path to parity is different hardware.** Datacenter Blackwell (HBM,
|
||||
native tcgen05/CUTLASS FP4) lifts the bandwidth floor and **restores exactly
|
||||
the vLLM advantages that lose on GB10** (FLA blocked-solve GDN, Marlin/CUTLASS
|
||||
grouped FP4, HBM-tuned full-cudagraph decode). Re-run the methodology on new
|
||||
silicon; do not reopen the GB10 levers.
|
||||
@@ -0,0 +1,374 @@
|
||||
# Accelerator-porting scope: bringing the paged backend's portable benefits to Metal / SYCL / Vulkan (+ a ROCm note)
|
||||
|
||||
Source-only analysis (no GPU, no build) of which `llama-cpp-localai-paged` benefits
|
||||
are portable off the CUDA family, and what each port costs per accelerator. This is
|
||||
the umbrella doc; it BUILDS ON, and does not repeat,
|
||||
[`UPSTREAM_LAYER2_SCOPE.md`](UPSTREAM_LAYER2_SCOPE.md) (the GDN/SSM fusion kernel
|
||||
scope) - that doc remains the authoritative reference for benefit #1 below.
|
||||
|
||||
The backend ships **CUDA-only** today (README sections 4c, 8): off-CUDA the fusions
|
||||
gate off (patch 0030) and NVFP4 falls back to dequant, so it is
|
||||
neutral-to-slightly-negative there and non-CUDA users run the stock `llama-cpp`.
|
||||
"Porting the benefits" is the upstream-contribution track that would make these
|
||||
wins real on the other accelerators. Methodology for the work itself is in
|
||||
[`.agents/vllm-parity-methodology.md`](../../../../.agents/vllm-parity-methodology.md).
|
||||
|
||||
We have **no Metal / SYCL / Vulkan / ROCm hardware here**, so every port is gated
|
||||
by `test-backend-ops` (backendX-vs-CPU) **on the target hardware** - the same gate
|
||||
discipline the existing layer-2 doc sets out.
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
## 0. The four benefits and their portability class
|
||||
|
||||
| # | Benefit (patches) | Portable off CUDA? | Where scoped |
|
||||
|---|---|---|---|
|
||||
| 1 | **GDN/SSM decode fusions** (0018-0022, 0028) - in-place state write-back, fused recurrent-state gather, conv-state in-place fusion, o_proj MMQ reshape, occupancy retune | YES - per-backend KERNEL work | [`UPSTREAM_LAYER2_SCOPE.md`](UPSTREAM_LAYER2_SCOPE.md) (consolidated in section 1 here) |
|
||||
| 2 | **Paged KV in-kernel block-table flash-attn read** (0009-0011) | YES - per-backend KERNEL work | **Section 2 here (the new analysis)** |
|
||||
| 3 | **Decode-first prefill scheduler** (0013/0016) | YES - FREE, host-side, zero kernel work | Section 3 here |
|
||||
| 4 | **NVFP4 FP4-MMA + its decode levers** (0017/0023/0025) | NO (Blackwell FP4-MMA) - out of scope; two analogues flagged | Section 4 here |
|
||||
|
||||
The two kernel-bearing tracks (#1 and #2) share an identical port SHAPE - they touch
|
||||
the same decode kernel(s), the same `supports_op`, the same dispatch guard, and
|
||||
sequence the same way (ops-first PR, then one PR per backend). They should be
|
||||
**bundled into one per-backend PR**, not pursued as two separate efforts; section 5
|
||||
sequences them together. Tracks #3 (free) and #4 (out of scope) are independent.
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
## 1. Benefit #1 - GDN/SSM decode fusions (consolidated; full scope is the layer-2 doc)
|
||||
|
||||
Do not re-derive this here. [`UPSTREAM_LAYER2_SCOPE.md`](UPSTREAM_LAYER2_SCOPE.md)
|
||||
already establishes, and this doc adopts wholesale:
|
||||
|
||||
- The base `GGML_OP_GATED_DELTA_NET` + `GGML_OP_SSM_CONV` + `GGML_OP_SSM_SCAN`
|
||||
kernels **already exist on Metal, Vulkan AND SYCL**, so the Qwen3.6 hybrids RUN
|
||||
on all three today via the upstream non-fused path. Layer-2 is the decode
|
||||
SPEEDUP, not "make it run." (NB: the README section 4c no longer carries the
|
||||
stale "no Vulkan kernel" line that the layer-2 doc section 0 was written to
|
||||
correct - that correction has since been folded into the README, so treat
|
||||
layer-2 section 0 as historical context, not a live correction.)
|
||||
- The four fusion ops (A in-place state 0018, B fused state gather 0019, C
|
||||
conv-update in-place 0021, D conv-tap gather 0028) reuse the existing op enums
|
||||
with extra `src[]` discriminators; only OP C is a genuinely new kernel, the rest
|
||||
redirect the read source / write target of the EXISTING kernel. The builders,
|
||||
CPU reference kernels, model graph and `test-backend-ops` cases are SHARED and
|
||||
already done.
|
||||
- Per-backend net-new work, effort and gotchas: **SYCL easiest** (near-verbatim
|
||||
CUDA mirror, ~250-350 LOC, no shader-gen), **Metal medium** (~350-500 LOC,
|
||||
fixed 32-wide simdgroup = simplest bit-exactness), **Vulkan hardest** (~450-650 LOC +
|
||||
shaders-gen + descriptor growth + per-vendor subgroup validation).
|
||||
- Bit-exactness is per-backend BY CONSTRUCTION (the fusions redirect addresses, not
|
||||
the f32 reduce order); gated by `test-backend-ops` (backendX-vs-CPU).
|
||||
- Upstream path: ops-first PR (incl. the capability-driven replacement for patch
|
||||
0030's backend-name allow-list), then one PR per backend.
|
||||
|
||||
The value/effort ranking from that doc (**Metal 1st, SYCL 2nd, Vulkan 3rd**) is
|
||||
adopted unchanged here and, as section 5 shows, coincides with benefit #2's ranking
|
||||
(which is why the two bundle cleanly per backend).
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
## 2. Benefit #2 - paged KV in-kernel block-table flash-attn read (NEW scope)
|
||||
|
||||
### 2.0 What it is, and why it is the lever that makes paged KV non-negative off-CUDA
|
||||
|
||||
On CUDA, patches 0009-0011 replaced the per-step host-side K/V gather (patch 0003)
|
||||
with an **in-kernel paged read**. `ggml_flash_attn_ext` gained an optional
|
||||
`src[5]` = an I32 block table `[n_view, n_stream]` in token-POSITION order; the
|
||||
fattn vec/tile kernel maps logical KV index `j` to physical cell
|
||||
`block_table[seq*ne11 + j]` and reads `K0 + cell*nb11` / `V0 + cell*nb21` in place,
|
||||
so the `get_rows` of K and V (the bulk of the gather) is gone. A null block table is
|
||||
the stock contiguous read, byte-identical. Position ordering keeps the online-softmax
|
||||
reduction order identical to stock, so it is bit-exact (CPU/batch1) by construction.
|
||||
|
||||
The crucial point for portability: **the entire host side is already
|
||||
backend-agnostic.** The block-table fill (`llama_kv_cache::get_block_table`), the
|
||||
K/V views, the mask compaction, the `input_block_table` graph input, and the
|
||||
`ggml.c` / `ggml.h` builder (`ggml_flash_attn_ext_set_block_table`) all live in
|
||||
`src/` and `ggml/...` shared code. The ONLY per-backend work is, in each backend's
|
||||
flash-attn kernel: (a) thread one extra source through to the kernel, and (b) do the
|
||||
indexed read at the K/V load sites. The CPU reference already does it (patch 0009,
|
||||
`ops.cpp`).
|
||||
|
||||
Off-CUDA today the paged path falls back to the **host-side gather** (patch 0003),
|
||||
which the README section 4c measured as neutral-to-slightly-negative on the Apple M4
|
||||
(~0-3% slower decode / ~2-8% slower prefill vs stock's contiguous read - pure
|
||||
overhead, because the in-kernel read that *recovers* the gather cost is CUDA-only).
|
||||
**Porting the block-table read is exactly what flips paged KV from
|
||||
"neutral-to-negative" to "neutral-to-positive" off CUDA** - it removes the gather
|
||||
overhead so paged KV's memory-management and prefix-sharing wins come for free
|
||||
instead of at a decode tax. (The big decode multipliers on the hybrids are still the
|
||||
benefit-#1 GDN fusions; this benefit is what makes the paged *allocator* pay its own
|
||||
way off CUDA.)
|
||||
|
||||
### 2.1 The cross-cutting finding (applies to all three backends)
|
||||
|
||||
The indexed per-cell read only fits the **vec / scalar decode kernel**. Every
|
||||
backend's FAST attention path - CUDA mma, Metal `simdgroup_load` MM, Vulkan
|
||||
coopmat2, SYCL tile - loads K/V as **contiguous tiles** (8-cell `simdgroup_load`,
|
||||
`coopMatLoadTensorNV` over a linear stride, shared-memory tile loads) that cannot
|
||||
express an arbitrary per-cell gather without a staging pre-pass. This is exactly why
|
||||
the CUDA port (0009-0010) wired ONLY the vec kernel and added a dispatch guard
|
||||
(`if (dst->src[5]) force vec`).
|
||||
|
||||
So each port mirrors that: **route any FA op carrying a block table onto the vec /
|
||||
scalar kernel; leave the fast MM path contiguous-only**, and keep the null-table
|
||||
contiguous read on the fast path untouched. The decode shape (1 query token/stream)
|
||||
naturally lands on or near the vec/scalar kernel on all three, so this is a small
|
||||
routing change, not a rewrite of the fast path.
|
||||
|
||||
### 2.2 SYCL - EASIEST (near line-for-line CUDA mirror)
|
||||
|
||||
- **Exists today:** `ggml-sycl/fattn-vec.hpp` is a DPCT-style near-verbatim mirror
|
||||
of CUDA `fattn-vec.cuh`; the kernel signature ends in the same `nb11..nb33`
|
||||
cluster the CUDA patch appends `const int* block_table` to (fattn-vec.hpp:65-76).
|
||||
Args are passed by SYCL lambda value-capture - **no
|
||||
descriptor/binding/push-constant bookkeeping at all** (strictly easier than CUDA). `supports_op`
|
||||
(`fattn.cpp` -> `ggml_sycl_get_best_fattn_kernel`) needs no change to ACCEPT
|
||||
`src[5]`.
|
||||
- **Port shape (value: medium / effort: LOW):** append `const int* block_table`
|
||||
to the kernel + `fattn_kernel_t` typedef + `lauch_kernel`/`launch_fattn`
|
||||
(sourcing `dst->src[5]->data`); 3 read-site substitutions (K at line 318, V at
|
||||
389 and 410): `K0 + block_table[seq*ne11 + k_VKQ_0 + i_KQ]*nb11`.
|
||||
- **Two SYCL-specific gotchas:**
|
||||
1. **Pointer pre-advance.** The vec kernel advances `K`/`V` by `k_VKQ_0` OUTSIDE
|
||||
the inner read (fattn-vec.hpp:293-300), so `i_KQ`/`k` are tile-local. The port
|
||||
must keep an UN-advanced base `K0`/`V0`, drop the per-iteration `K +=`/`V +=`
|
||||
on the paged path, and reconstruct the absolute cell. Get this wrong and you
|
||||
read the wrong cells with NO compile error.
|
||||
2. **Dispatch guard is bigger than CUDA's.** f16-GQA decode routes to the TILE
|
||||
kernel, not vec (`fattn.cpp:198-208` fall-through). Add
|
||||
`if (dst->src[5]) return BEST_FATTN_KERNEL_VEC;` near the top of
|
||||
`ggml_sycl_get_best_fattn_kernel`. The shared `fattn_kernel_t` typedef means
|
||||
the tile kernel must gain a matching ignored `block_table` param (or split the
|
||||
typedef) - a trivial chore.
|
||||
- **Bit-exact:** sub-group width (16) is fixed and the indexed read does not touch
|
||||
lane assignment, loop bounds, or the XOR-reduction stride - reduction order is
|
||||
invariant, so the paged vec path is byte-identical to SYCL's own contiguous vec
|
||||
path. Gate: `test-backend-ops` FLASH_ATTN_EXT (with a block-table case) on Intel
|
||||
GPU.
|
||||
|
||||
### 2.3 Metal - EASY-MEDIUM (decode already routes to the vec kernel)
|
||||
|
||||
- **Exists today:** decode (1 query token/stream, GQA) dispatches to
|
||||
`kernel_flash_attn_ext_vec` (`ggml-metal-ops.cpp` `..._use_vec`: `ne01 < 20`).
|
||||
Metal HAS a true vec-equivalent kernel (not a single unified FA kernel), and the vec
|
||||
kernel's quantized K/V branches ALREADY compute a per-cell base address
|
||||
(`k + ((ic + NE*cc + ty)*nb11)`, ggml-metal.metal:6934 / V at :7045) - so a
|
||||
per-cell indexed read is unambiguously admissible. `supports_op`
|
||||
(`ggml-metal-device.m` FLASH_ATTN_EXT) inspects no src count, so `src[5]` is
|
||||
accepted as-is.
|
||||
- **Port shape (value: HIGH / effort: EASY-MEDIUM):** append a
|
||||
`device const char * block_table` param after `dst` (**buffer index 8** for vec)
|
||||
+ a kargs field + a `has_block_table` function-constant; reuse the existing
|
||||
"bind dummy when null" idiom for a missing table; substitute the cell index with
|
||||
`block_table[seq*ne11 + cell]` at the K reads (lines 6919/6934) and V reads
|
||||
(7032/7045) - a localized rewrite of ~2 loops (the vec kernel's non-quantized fast path must adopt the
|
||||
per-cell base form the quantized branch already uses).
|
||||
- **Gotcha:** the **non-vec MM kernel is a HARD blocker** -
|
||||
`simdgroup_load(..., NS10, ...)` reads 8 physically-CONTIGUOUS KV cells as one
|
||||
matrix tile (lines 6160 / 6339-6363); an arbitrary gather can't be a single
|
||||
strided matrix load. Mitigate exactly as CUDA did: force any block-table op onto
|
||||
the vec kernel in `..._use_vec` (ggml-metal-ops.cpp:2517); leave the MM path
|
||||
contiguous-only. Also watch a NAME COLLISION: `kernel_flash_attn_ext_blk` is an
|
||||
existing mask-skip optimization, NOT a paged block table.
|
||||
- **Bit-exact:** fixed 32-wide simdgroup + address-only redirect = byte-identical to
|
||||
Metal's own vec contiguous path. Gate: `test-backend-ops` on Apple Silicon.
|
||||
|
||||
### 2.4 Vulkan - MEDIUM (the fast NVIDIA decode path cannot do it)
|
||||
|
||||
- **Exists today:** three FA shaders - `flash_attn.comp` (scalar/vec),
|
||||
`flash_attn_cm1.comp` (coopmat1, stages K/V through shared memory),
|
||||
`flash_attn_cm2.comp` (coopmat2, the fast NVIDIA path). FA uses **7 descriptor
|
||||
bindings (0-6)**; `supports_op` (`ggml-vulkan.cpp` FLASH_ATTN_EXT) checks
|
||||
specific srcs only, no count check; but `src[5]` is **not even threaded today** -
|
||||
`ggml_vk_flash_attn` stops at `src[4]` (ggml-vulkan.cpp:14537), so wiring it
|
||||
through is part of the work.
|
||||
- **Port shape (value: HIGHEST breadth / effort: MEDIUM):** add binding 7 in the
|
||||
shader(s), bump `7`->`8` in the three `ggml_vk_create_pipeline` calls (:3997,
|
||||
:4033, :4070) and the two dispatch subbuffer lists (passing a dummy when null),
|
||||
and wrap the indexed read in one `phys_kv()` helper applied at the ~4 K + 2 V
|
||||
load sites (flash_attn.comp; the logical index is the same `(j*Bc + ...)`
|
||||
expression at every site).
|
||||
- **Two gotchas, one structural:**
|
||||
1. **Push constants are FULL.** `vk_flash_attn_push_constants` is exactly
|
||||
128 bytes with a `static_assert(... <= 128)` (the Vulkan guaranteed minimum) -
|
||||
**no room for a new field.** Signal "block-table enabled" via the existing
|
||||
`Flags` spec constant (flash_attn_base.glsl, `constant_id=10`, already
|
||||
bit-packed) - add a `BLOCK_TABLE_ENABLE` bit. The per-seq stride is already
|
||||
`p.KV`; the seq index is derivable in `init_indices()`.
|
||||
2. **coopmat2 (the fast NVIDIA GQA-decode path) is INCOMPATIBLE.** Its K/V load
|
||||
is a hardware `coopMatLoadTensorNV` over a LINEAR stride
|
||||
(flash_attn_cm2.comp:307-313/377-383); the decode callback only dequantizes,
|
||||
it cannot remap the physical address. The indexed read drops cleanly into
|
||||
**scalar** (which non-GQA decode already uses) and **cm1** (which stages
|
||||
through shmem - remap the staging loop), but **not cm2**. With a block table
|
||||
present, NVIDIA GQA decode falls back to scalar/cm1 (slower than cm2, still
|
||||
correct); the **null-table path keeps using cm2 unchanged**. AMD/Intel (no
|
||||
cm2) are fully covered by scalar/cm1.
|
||||
- **Net positive?** Yes. Non-GQA decode already runs scalar (paged read ~free);
|
||||
AMD/Intel covered; only NVIDIA GQA decode trades cm2 for scalar/cm1 *when a table
|
||||
is supplied*, and paged KV's payoff is allocator/memory + prefix-sharing, not raw
|
||||
FA throughput, so the trade is contained and the fast contiguous path is
|
||||
untouched.
|
||||
- **Bit-exact:** the read is a per-thread scalar load, subgroup-size agnostic
|
||||
(already abstracted via the `SubGroupSize` spec constant); position ordering keeps
|
||||
the reduction order identical, so byte-identical to the backend's own
|
||||
scalar/cm1 contiguous path. **Build burden is low** - these are EXISTING shader
|
||||
variants recompiling (no new `string_to_spv` shape), so no shaders-gen matrix
|
||||
growth. Gate: `test-backend-ops` per vendor (AMD + Intel + NVIDIA).
|
||||
|
||||
### 2.5 Benefit-#2 ranking and the shared dispatch/supports_op pattern
|
||||
|
||||
| backend | value | author effort | structural risk | rank |
|
||||
|---|---|---|---|---|
|
||||
| SYCL | medium (Intel GPU) | **LOW** (line-for-line; no bindings) | low (pointer pre-advance; force-vec guard) | easiest |
|
||||
| Metal | **HIGH** (largest non-CUDA base) | EASY-MEDIUM (decode = vec already) | medium (MM blocker -> force vec) | mid |
|
||||
| Vulkan | **HIGHEST breadth** (AMD+Intel+NVIDIA) | MEDIUM (7->8 bindings; Flags bit) | medium (cm2 can't; full push-const) | hardest |
|
||||
|
||||
Common to all three (mirrors CUDA 0009-0010): (1) `supports_op` needs no change to
|
||||
ACCEPT `src[5]`; (2) a **dispatch guard forces any block-table op onto the
|
||||
vec/scalar kernel**; (3) the fast MM/coopmat2 path stays contiguous-only and the
|
||||
null-table read on it is byte-identical to stock.
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
## 3. Benefit #3 - decode-first prefill scheduler (FREE portable win, confirmed)
|
||||
|
||||
Patches 0013 (static `LLAMA_PREFILL_BUDGET`) and 0016 (dynamic decode-first
|
||||
`max(n_ubatch, T-D)`) are **pure host-side scheduler policy inside `update_slots()`
|
||||
with zero libllama / zero ggml-backend changes** (README sections 2, 3). They change
|
||||
only the *count* of prefill tokens admitted per step; they touch no kernel, no
|
||||
`supports_op`, no device code. They are therefore **already backend-portable with no
|
||||
per-accelerator work** - they run identically on Metal, SYCL, Vulkan, ROCm, CPU.
|
||||
Byte-identical when off (default-off / short prefill == upstream `-b` chunking).
|
||||
|
||||
This is the cheapest portable benefit: it needs no port at all, only the decision to
|
||||
leave it enabled in the (currently CUDA-only) build, or to upstream the policy. The
|
||||
only reason it is not "live everywhere" today is that the backend ships CUDA-only;
|
||||
the code itself is accelerator-neutral. If the scheduler levers are upstreamed
|
||||
independently of the kernels, they help any llama.cpp build on any accelerator at
|
||||
once - the lowest-effort, broadest-reach contribution of the whole series.
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
## 4. Benefit #4 - NVFP4 FP4-MMA (NOT portable) + two backend-agnostic analogues
|
||||
|
||||
The NVFP4 decode track is **Blackwell-specific and out of scope** for accelerator
|
||||
porting: Metal, SYCL, Vulkan and ROCm/AMD lack native FP4-MMA (Metal `supports_op`
|
||||
already excludes NVFP4 from `MUL_MAT`/`MUL_MAT_ID`/`GET_ROWS`; on non-Blackwell the
|
||||
FP4 path dequants). Patch 0017 (dense FP4-GEMM occupancy tune) ships only as the
|
||||
parity gate + default-off instrumentation even on CUDA, so there is nothing to port.
|
||||
|
||||
Two of the NVFP4 *decode levers*, however, have backend-agnostic analogues worth a
|
||||
note (do not over-claim - these are observations, not scoped ports):
|
||||
|
||||
- **0023 (NVFP4 activation-quantize de-dup)** - the IDEA generalizes, the patch does
|
||||
not. The MoE broadcast up/gate projections re-quantize the same token activation
|
||||
once per expert; 0023 quantizes the unique activations once and byte-copies them
|
||||
into the expert-gathered layout. Any backend whose MoE path requantizes a shared
|
||||
activation per-expert (e.g. a Q8 activation-quant before an integer-dot MoE GEMM)
|
||||
could dedup the same way. It is NOT NVFP4-specific in PRINCIPLE - but it IS the
|
||||
one quant-specific patch in the series (README section 6), so a port is a
|
||||
per-backend MoE-quant investigation, not a lift-and-shift. Low priority.
|
||||
- **0025 (MoE decode re-graph / `LLAMA_MOE_FORCE_GRAPHS`)** - keeping the
|
||||
graph/capture path on across the grouped-MMQ MoE decode step is a CUDA-graphs concept.
|
||||
Metal/Vulkan/SYCL have their own command-buffer/graph reuse machinery; the
|
||||
generalizable finding is "the grouped MoE decode step has no host sync, so it is
|
||||
safe to keep in a captured/replayed command buffer." Whether each backend's graph
|
||||
layer already covers this is a per-backend question. The methodology note (README
|
||||
dev notes: graph/stream coverage was a FLAT lever beyond 0025 on CUDA) is the
|
||||
more durable takeaway - do not expect a large graph-coverage win on any backend.
|
||||
|
||||
Neither analogue is on the critical path; both are recorded so the next person does
|
||||
not mistake them for free ports.
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
## 5. Combined sequencing and top recommendations
|
||||
|
||||
Benefits #1 (GDN fusions) and #2 (block-table FA read) share the port shape
|
||||
(vec/scalar decode kernel + `supports_op`/dispatch guard + ops-first-then-per-backend
|
||||
PR) and rank in the SAME order per backend. So sequence them TOGETHER, per backend,
|
||||
behind one shared ops-first PR:
|
||||
|
||||
1. **PR #1 - OPS (largely done, upstreamable as-is):** the `ggml.h`/`ggml.c`
|
||||
builders, the CPU reference kernels, the CUDA kernels, the `test-backend-ops`
|
||||
cases (GDN fusions AND a FLASH_ATTN_EXT block-table case), and the
|
||||
**capability-driven gate** replacing patch 0030's backend-name allow-list (make
|
||||
`supports_op` + the dispatch guard authoritative, so routing falls out of the
|
||||
normal scheduler fallback and no backend name is hard-coded). Independently
|
||||
mergeable.
|
||||
2. **PR #2 - Metal:** GDN fusion kernels (layer-2 doc) + block-table read into
|
||||
`kernel_flash_attn_ext_vec` + the force-vec routing guard. Gate on Apple Silicon.
|
||||
3. **PR #3 - SYCL:** the near-verbatim CUDA mirror of both tracks + the force-vec
|
||||
guard. Gate on Intel GPU.
|
||||
4. **PR #4 - Vulkan:** GDN fusion shaders + the scalar/cm1 block-table read (cm2
|
||||
stays contiguous, falls back when a table is present) + the `Flags` spec-constant
|
||||
bit + the 7->8 binding bump. Gate per vendor.
|
||||
|
||||
Do NOT bundle the backends into one PR (each needs its own hardware for
|
||||
`test-backend-ops`; reviewers are backend-specialized; a regression in one must not
|
||||
block the others).
|
||||
|
||||
### Top recommendations
|
||||
|
||||
1. **Metal first, both benefits together.** Largest non-CUDA LocalAI base; the
|
||||
decode shape already routes to the Metal vec kernel (block-table read is
|
||||
EASY-MEDIUM there) and the base GDN/conv kernels already exist (fusions are
|
||||
MEDIUM); fixed 32-wide simdgroup makes bit-exactness the simplest of the three.
|
||||
Highest value at moderate effort.
|
||||
2. **SYCL second as the cheap mechanical follow-on.** Both tracks are near
|
||||
line-for-line CUDA mirrors with no binding/shader-gen bookkeeping, so it is
|
||||
low-cost insurance even though the Intel-GPU audience is smaller. Budget the
|
||||
effort on the two SYCL gotchas (pointer pre-advance; the force-vec guard since
|
||||
f16-GQA decode routes to tile), not on plumbing.
|
||||
3. **Vulkan last as the high-breadth capstone.** Reaches AMD + Intel + NVIDIA, but
|
||||
carries the most host glue and the coopmat2 limitation (NVIDIA GQA decode trades
|
||||
the fast path for scalar/cm1 only when a table is present). Do it once the
|
||||
pattern is proven on Metal + SYCL.
|
||||
|
||||
A cheaper variant (from the layer-2 doc, reaffirmed): ship **Metal + SYCL together**
|
||||
right after the ops PR and treat Vulkan as a separate later effort.
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
## 6. ROCm note
|
||||
|
||||
ROCm is in the **CUDA family**, not a separate port: patch 0030's allow-list already
|
||||
admits `"CUDA"/"ROCm"/"MUSA"`, and the CUDA kernels compile for HIP, so benefits #1
|
||||
and #2 are largely already-built or near-free on ROCm rather than a from-scratch
|
||||
accelerator port. Two caveats:
|
||||
|
||||
- **FP4-MMA (benefit #4) stays NVIDIA-Blackwell-only** - AMD has no native FP4-MMA,
|
||||
so the NVFP4 path dequants on ROCm exactly as elsewhere.
|
||||
- **The block-table read's force-vec routing matters on AMD too.** The AMD fast FA
|
||||
path is the wmma/mma kernel (`fattn-wmma-f16`), which - like CUDA mma, Metal MM
|
||||
and Vulkan cm2 - ignores the block table; the CUDA dispatch guard already forces a
|
||||
block-table op onto the vec kernel, so ROCm inherits correct routing, but the
|
||||
perf trade (vec vs wmma for AMD GQA decode with a table present) should be
|
||||
measured on AMD hardware before claiming a win. The GDN fusions, being plain
|
||||
CUDA-C, port to HIP with the rest of the CUDA path.
|
||||
|
||||
Net: ROCm is a "validate, don't re-port" follow-up - confirm the HIP build picks up
|
||||
the fusions + the force-vec block-table routing and gate it with `test-backend-ops`
|
||||
on an AMD GPU. It is genuinely separate from, and lighter than, the Metal / SYCL /
|
||||
Vulkan ports.
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
## 7. Summary
|
||||
|
||||
- **Benefit #3 (decode-first scheduler) is free and already portable** - host-side
|
||||
policy, zero kernel work; it only needs to be left enabled / upstreamed.
|
||||
- **Benefits #1 (GDN fusions) and #2 (block-table FA read) are the real ports** -
|
||||
both are vec/scalar-decode-kernel + `supports_op`/dispatch-guard changes, both
|
||||
rank Metal-then-SYCL-then-Vulkan, and they bundle into one per-backend PR behind a
|
||||
shared ops-first PR.
|
||||
- **Benefit #2 is the lever that makes paged KV non-negative off CUDA** - it removes
|
||||
the host-gather overhead the README measured as neutral-to-slightly-negative on
|
||||
the Mac. Feasibility: SYCL EASY, Metal EASY-MEDIUM, Vulkan MEDIUM. The universal
|
||||
constraint is that only the vec/scalar kernel admits the indexed read; the fast
|
||||
MM/coopmat2 path is contiguous-only, so route block-table ops onto vec (as CUDA
|
||||
already does) and leave the fast path's null-table read byte-identical.
|
||||
- **Benefit #4 (NVFP4 FP4-MMA) is out of scope** (Blackwell only); 0023's de-dup and
|
||||
0025's graph-coverage have backend-agnostic *ideas* but no lift-and-shift port.
|
||||
- **ROCm rides the CUDA path** (validate, don't re-port); FP4-MMA stays Blackwell-only.
|
||||
- Everything is bit-exact per-backend BY CONSTRUCTION (position-ordered table +
|
||||
address-only redirect = identical reduction order), gated by `test-backend-ops`
|
||||
(backendX-vs-CPU) **on the target hardware**, which we do not have here.
|
||||
</content>
|
||||
</invoke>
|
||||
422
backend/cpp/llama-cpp-localai-paged/docs/DECODE_SERVING_SCOPE.md
Normal file
422
backend/cpp/llama-cpp-localai-paged/docs/DECODE_SERVING_SCOPE.md
Normal file
@@ -0,0 +1,422 @@
|
||||
# DECODE_SERVING_SCOPE - the continuous-serving decode gap
|
||||
|
||||
**Status: S1 + S3 IMPLEMENTED, GPU-validated, bit-exact, shipped as patches
|
||||
0040 (S1) + 0041 (S3). S2 DROPPED (measured non-target). See the results block
|
||||
below; the rest of this doc is the design/rationale those patches implement.**
|
||||
|
||||
## Results (GB10, measured)
|
||||
|
||||
Phase 0 confirmed host-bound: serving graph reuse **0% over ~5k steps** (layer-A
|
||||
rebuilds every step), `hostproc` 3.44 ms/step vs 1.59 static - the +1.85 ms IS the
|
||||
graph rebuild; `set_inputs` 0.047 ms and block-table 0.002 ms are negligible.
|
||||
|
||||
- **S1 (patch 0040)** - root cause: the paged decode inputs never overrode
|
||||
`can_reuse` (defaults false), so the graph could never be reused. Fixed with a
|
||||
256-bucketed-shape `can_reuse` + live-mctx refresh. Static batched-bench A/B:
|
||||
paged decode reuse **0% -> 95.5%**, bit-exact (md5 byte-identical reuse on/off).
|
||||
Necessary but **not** sufficient in serving (13.8% reuse alone - prefill
|
||||
co-batching churns the shape).
|
||||
- **S3 (patch 0041)** - keeps prefill out of decode steps so the scheduler emits
|
||||
reuse-stable pure-decode steps. **S1+S3 together (128-client staggered serving,
|
||||
MoE Qwen3.6-35B-A3B-NVFP4): reuse 0% -> 72.2%, `hostproc` 15.98 -> 6.31 ms/step,
|
||||
decode 4.05 -> 5.52 tok/s/seq median (4.24 -> 5.96 mean, at vLLM's ~5.9).**
|
||||
- **S2 (double-buffer set_inputs) - DROPPED.** Phase 0 put `set_inputs` at
|
||||
~0.05 ms/step: it is not the cost (the rebuild is), so S2 has nothing to recover.
|
||||
- **Follow-up to ~100% reuse - PADDED/FIXED-SLOT DECODE SHAPE: IMPLEMENTED,
|
||||
GPU-TESTED, REJECTED (not shipped).** See the "Padded-shape lever - rejected"
|
||||
block below. Summary: it does NOT close the serving gap. Padding holds the
|
||||
pure-decode width constant by emitting masked-inert dummy decodes for idle
|
||||
slots, and it is provably inert (single-seq md5 bit-exact + per-stream
|
||||
noise-floor determinism), but it **regresses throughput at every concurrency**
|
||||
(catastrophically at low load) because the serving decode here is
|
||||
**GPU-compute-bound, not host-rebuild-bound** - so the dummy-row compute it adds
|
||||
costs more than the graph-reuse it recovers. The original "remaining ~28% is
|
||||
request-boundary churn -> pad it" hypothesis stands mechanically, but the payoff
|
||||
premise (closing reuse pulls decode toward vLLM) is **not supported by
|
||||
measurement**.
|
||||
|
||||
---
|
||||
|
||||
## Padded-shape lever - rejected (implemented + GPU-tested, 2026-06-28)
|
||||
|
||||
The S1 section-(a) **padded / fixed-slot decode shape** was implemented in an
|
||||
isolated worktree off the committed S1/S3/tail base (paged HEAD `05eceb4`), built
|
||||
CUDA-only, and benched on GB10. **Verdict: REJECTED - it regresses serving
|
||||
throughput and does not close the vLLM gap.** Recorded here so it is not re-tried.
|
||||
|
||||
**Implementation** (default-off, `LLAMA_PAGED_PAD_DECODE=1`; `LLAMA_PAGED_PAD_WIDTH`
|
||||
caps the slot range): at the end of `pre_decode()`, on any step where no prompt
|
||||
tokens were admitted (`n_prompt_budgeted == 0`) and there is decode load, emit a
|
||||
masked-inert dummy decode for **every IDLE slot** (`batch.add(slot.id, 0,
|
||||
pos_max+1, /*output=*/true)`; cold slot -> fresh pos-0). This holds `n_tokens`,
|
||||
`n_seqs`, `n_seqs_unq`, `n_outputs` and the participating seq-id SET constant
|
||||
across arrivals/completions. A `release()`-side guard keeps a finished slot warm
|
||||
under padding (else patch 0024's reclaim-on-idle frees its KV and the next-step
|
||||
pos-0 re-warm churns paged-block allocation, destroying reuse). Each dummy is its
|
||||
OWN sequence, so its recurrent (gated-DeltaNet) state is private and its paged
|
||||
attention reads only its own cells; its logits are computed but never read
|
||||
(`post_decode()` only consumes `slot.i_batch` of GENERATING slots).
|
||||
|
||||
**Gates.** (1) Single-seq greedy md5 **bit-exact PASS** - dense
|
||||
`5951a5b4d624ce891e22ab5fca9bc439`, paged-MoE `8cb0ce23777bf55f92f63d0292c756b0`
|
||||
(the lever lives only in `llama-server`'s `update_slots()`, never in
|
||||
`llama-completion`). (2) **Per-stream serving determinism**: the literal
|
||||
"ON-vs-OFF token sequences identical" gate is **unachievable** - concurrent
|
||||
cuBLAS/FA decode is **not bit-reproducible run-to-run** even with padding OFF
|
||||
(OFF-vs-OFF diverging streams: dense 3/16, MoE 8/16, lockstep K=16). The
|
||||
**achievable inertness gate PASSED**: per-stream prefix-agreement ON-vs-OFF equals
|
||||
the OFF-vs-OFF noise floor exactly (MoE 0.940/0.940, dense 0.812/0.812), i.e. the
|
||||
dummy slots inject no systematic divergence beyond the pre-existing concurrent FP
|
||||
noise. So padding is provably inert; it just does not help.
|
||||
|
||||
**Bench (MoE Qwen3.6-35B-A3B-NVFP4, GB10).** Burst h2h, decode tok/s/seq:
|
||||
|
||||
| n | S1+S3 | PAD | vLLM |
|
||||
|-----|-------|------|------|
|
||||
| 8 | 28.16 | 6.05 | 44.8 |
|
||||
| 32 | 11.66 | 4.84 | 17.45|
|
||||
| 64 | 7.16 | 4.33 | 11.07|
|
||||
| 128 | 4.53 | 4.32 | 6.87 |
|
||||
|
||||
Staggered (`serve_bench.py` k=128 n=160 stagger0.25), aggregate decode tok/s and
|
||||
graph-reuse: baseline (reuse 0%) **757.6**; S1+S3 (reuse 72%) **763.3**; **PAD
|
||||
(reuse 38%) 558.0**.
|
||||
|
||||
**Why it fails (four independent reasons):**
|
||||
|
||||
1. **Serving decode is GPU-compute-bound, not host-rebuild-bound (this run).**
|
||||
Baseline reuse 0% (757.6 agg) is statistically equal to S1+S3 reuse 72% (763.3
|
||||
agg): `hostproc` is only ~4-8% of the per-step wall, so eliminating the host
|
||||
graph rebuild buys ~nothing. (This **corrects the host-bound hypothesis** above
|
||||
for this hardware: the earlier 542->762 host-bound delta did **not** reproduce -
|
||||
it was GPU-state/contention variance, not a stable reuse effect.)
|
||||
2. **Padding ADDS dummy-row compute** (full-width decode), costing throughput in
|
||||
direct proportion to `pad_width - real_load`: catastrophic at low concurrency
|
||||
(n=8: 28.16 -> 6.05, ~4.6x slower, because 8 real streams pay for a 128-wide
|
||||
step).
|
||||
3. **In continuous serving padding can't even hold the width constant**: arrivals
|
||||
are perpetually mid-prefill, so the idle-slot count varies and reuse DROPS
|
||||
72% -> 38% (the opposite of the goal). It only stabilises the pure-decode
|
||||
*tail* of a burst (verified: width pinned at 64 as real decoders fell 49->5),
|
||||
which is exactly where the dummy compute is most wasteful.
|
||||
4. **The completion-driven batch shrink that padding prevents is itself a
|
||||
throughput WIN** in a compute-bound regime (fewer real streams -> cheaper
|
||||
steps -> survivors finish faster); forcing constant width forfeits it.
|
||||
|
||||
**Conclusion.** The residual burst gap (paged 4.53 vs vLLM 6.87 at n=128, i.e. paged at ~66% of vLLM)
|
||||
is a **GPU-compute** gap (vLLM's MoE decode kernel + scheduler are ~1.3x faster on
|
||||
aggregate), not a host-loop gap. A host-side graph-reuse lever cannot close it.
|
||||
Do not re-pursue padded/fixed-slot shapes for throughput; if the host loop is ever
|
||||
re-confirmed dominant on other hardware (re-run reason 1's baseline-vs-S1+S3 A/B
|
||||
first), revisit - but only with an *adaptive* width matched to live load, never a
|
||||
fixed pad-to-`--parallel`.
|
||||
|
||||
---
|
||||
|
||||
Per the
|
||||
"profile-don't-assume" rule in
|
||||
[`.agents/vllm-parity-methodology.md`](../../../../.agents/vllm-parity-methodology.md),
|
||||
**Phase 0 (section 5) is to confirm the bottleneck on GPU before touching any
|
||||
code.** Everything below the Phase-0 line is a hypothesis ranked by
|
||||
value/effort/risk, not a measured result.
|
||||
|
||||
> **Regime warning (read first).** Every "decode is at the BW floor / ties vLLM"
|
||||
> and "host scheduling loop is the structural residual" conclusion in
|
||||
> [`README.md`](../README.md) section 5 was measured with **`llama-batched-bench`**:
|
||||
> a STATIC serving width (fixed `npl`, all sequences in lockstep, constant
|
||||
> batch shape every step). That is the **decode KERNEL** regime, and there the
|
||||
> patch series is at parity (paged ~6.1 tok/s/seq vs vLLM ~5.9 at npl128). This
|
||||
> document is about a **different regime**: real **continuous SERVING** through
|
||||
> `llama-server`'s `update_slots()` loop, where requests arrive and complete
|
||||
> asynchronously, the batch shape churns every step, and paged drops to ~3.7
|
||||
> tok/s/seq (-39%) while vLLM sustains ~5.9. The gap is the **scheduler / host
|
||||
> loop**, not the kernel. This is the serving analogue of the prefill-GEMM regime
|
||||
> split called out in [`PREFILL_GEMM_SCOPE.md`](PREFILL_GEMM_SCOPE.md).
|
||||
|
||||
Cross-links: [`README.md`](../README.md) sections 2 (scheduler), 3 (patches
|
||||
0008/0013/0016/0024/0025/0029), 5 (rejected levers - lever 2 graph coverage was
|
||||
FLAT *in the static regime*; this doc reopens it for the *serving* regime);
|
||||
[`.agents/llama-cpp-localai-paged-backend.md`](../../../../.agents/llama-cpp-localai-paged-backend.md)
|
||||
(bit-exact gate);
|
||||
[`.agents/vllm-parity-methodology.md`](../../../../.agents/vllm-parity-methodology.md)
|
||||
(both-engine ground-truth, per-lever A/B, record-rejected-levers).
|
||||
|
||||
---
|
||||
|
||||
## 1. The two regimes, and why the kernel-parity result does not carry over
|
||||
|
||||
`llama-batched-bench` and a real serving workload exercise the **same decode
|
||||
kernels** but **different host loops**:
|
||||
|
||||
| | `llama-batched-bench` (kernel regime) | `llama-server` continuous serving |
|
||||
|---|---|---|
|
||||
| batch shape per step | **constant** (fixed `npl`, lockstep) | **churns** (arrivals/completions, interleaved prefill) |
|
||||
| participating seq-set | **fixed** for the whole run | **changes** as requests start/finish |
|
||||
| graph reuse (see s.2) | holds after warmup -> 1 capture, replayed | breaks nearly every step -> rebuild + re-capture |
|
||||
| measured | paged ~6.1 tok/s/seq ~ vLLM ~5.9 | paged ~3.7 vs vLLM ~5.9 (-39%) |
|
||||
|
||||
The README's decode parity, BW-floor, and "host loop is the irreducible
|
||||
residual" findings are all **kernel-regime** findings. They prove the *kernels*
|
||||
are not the serving gap. They do **not** prove the host loop is irreducible *in
|
||||
serving* - the static bench holds the batch shape constant, which is exactly the
|
||||
condition that lets both graph-reuse layers (section 2) stay hot. Serving
|
||||
violates that condition. So the serving gap is reopened here as a host /
|
||||
scheduler problem, orthogonal to the kernel.
|
||||
|
||||
---
|
||||
|
||||
## 2. Root-cause hypothesis (from source, pin `9d5d882d` + the dev tree)
|
||||
|
||||
There are **two independent graph-reuse layers**, and continuous batching breaks
|
||||
**both** on nearly every step. This is the leading hypothesis for the -39%.
|
||||
|
||||
### 2a. Layer A - llama-context graph reuse (`can_reuse` / `allow_reuse`)
|
||||
|
||||
`llama_context::process_ubatch` (`src/llama-context.cpp` ~L1366) only **reuses
|
||||
the built ggml graph** when `res->can_reuse(gparams)` holds. `allow_reuse`
|
||||
(`src/llama-graph.h` ~L631) requires, among others:
|
||||
|
||||
```
|
||||
ubatch.n_tokens == other.ubatch.n_tokens &&
|
||||
ubatch.n_seqs == other.ubatch.n_seqs &&
|
||||
ubatch.n_seqs_unq == other.ubatch.n_seqs_unq &&
|
||||
ubatch.equal_seqs() == other.ubatch.equal_seqs()
|
||||
// + (when equal_seqs) the participating sequence-id SET must match
|
||||
```
|
||||
|
||||
In serving, `n_tokens` changes whenever the decode load `D` changes or a prefill
|
||||
chunk is co-batched, and the **sequence-id set** changes whenever a request
|
||||
starts or finishes. Either makes `can_reuse` return false, so `process_ubatch`
|
||||
falls into the `else` branch: **rebuild the graph** (`model.build_graph`) +
|
||||
`ggml_backend_sched_reset` + `ggml_backend_sched_alloc_graph` - full host-side
|
||||
graph construction + allocation, **every step**. In batched-bench all sequences
|
||||
are lockstep so `n_tokens`/seq-set are constant and `can_reuse` is true after
|
||||
warmup (the `graphs reused = N` perf line is ~all steps).
|
||||
|
||||
### 2b. Layer B - CUDA graph capture (`ggml_cuda_graph_*`)
|
||||
|
||||
Even when layer A reuses, the CUDA backend re-checks
|
||||
`ggml_cuda_graph_update_required` (`ggml-cuda.cu` ~L3367): it `memcmp`s every
|
||||
node's `ne`, `nb`, and `src[]->data` pointers against the captured graph. Any
|
||||
shape change -> `cudaGraphExecUpdate` / re-instantiate. Two serving-specific
|
||||
triggers:
|
||||
|
||||
- **shape churn** (same root cause as layer A): different `n_tokens` -> different
|
||||
node `ne` -> update required.
|
||||
- **paged data-pointer churn**: when a co-batched prefill allocates new KV blocks
|
||||
(or a finished sequence frees them), the per-step KV view tensors' `data`
|
||||
pointers move, so even a constant-shape decode step can trip the `memcmp`. (The
|
||||
block-table *contents* live in a fixed device buffer filled by `set_inputs`, so
|
||||
the table tensor pointer itself is stable - 0029 keeps that cheap - but the K/V
|
||||
cache views are not.)
|
||||
|
||||
Net: under serving, the GPU sits idle between launches while the host rebuilds
|
||||
the graph (layer A) and re-instantiates the CUDA graph (layer B), then runs an
|
||||
un-graphed `set_inputs` (H2D input copies) before each launch. vLLM avoids this
|
||||
with **padded/bucketed decode batch shapes + piecewise CUDA graphs**: it pads the
|
||||
decode batch to a fixed set of sizes and captures one persistent graph per
|
||||
bucket, so the steady-state decode step is a single `cudaGraphLaunch` with no
|
||||
host rebuild. Its scheduler is also a tight C++ loop with chunked-prefill
|
||||
interleave that keeps the GPU fed.
|
||||
|
||||
### 2c. Per-step host work that runs un-graphed regardless (already instrumented)
|
||||
|
||||
The dev tree carries a built-in `[L5INSTR]` profiler (`src/paged-attn.cpp`,
|
||||
hooks in `src/llama-context.cpp` and `src/llama-kv-cache.cpp`) that already
|
||||
isolates the host buckets we care about, printed at process exit:
|
||||
|
||||
```
|
||||
[L5INSTR] get_block_table n=.. sum=..ms mean=..ms | set_inputs n=.. mean=..ms | hostproc n=.. mean=..ms
|
||||
```
|
||||
|
||||
- `hostproc` = `mctx->apply()` + graph reuse-check/rebuild + `set_inputs`, i.e.
|
||||
the whole host window **before** `graph_compute` (it does NOT include the GPU
|
||||
launch). Prior profiles put this at ~1.4 ms/step.
|
||||
- `set_inputs` = the H2D input fills (positions, masks, block table, idxs).
|
||||
- `get_block_table` = the paged block-table host build (0029 caches it
|
||||
within-step; `LLAMA_PAGED_NO_BT_CACHE` A/B-toggles that).
|
||||
|
||||
If `hostproc` per step is a large fraction of the serving per-step wall time
|
||||
(and the `graphs reused` count is low), the gap is host-bound, not kernel-bound.
|
||||
|
||||
### 2d. The serial-SSM host loop (named in README s.5, secondary here)
|
||||
|
||||
The gated-DeltaNet decode advances recurrent state per step; sampling cannot
|
||||
start until logits land. The README already names this as a structural floor in
|
||||
the *kernel* regime. It is the same in serving but is the *smaller* term - the
|
||||
graph-rebuild/re-capture overhead (2a/2b) is the new, serving-specific cost the
|
||||
static bench hides, and it is the one to attack first.
|
||||
|
||||
---
|
||||
|
||||
## 3. What the already-shipped scheduler patches do (and do NOT do)
|
||||
|
||||
These exist; understand them before proposing anything. **None of them touch the
|
||||
two graph-reuse layers** - they target prefill freezing and burst collapse, not
|
||||
steady-state decode-step host overhead. That is why the serving gap survives them.
|
||||
|
||||
| Patch | What it does | What it does NOT do |
|
||||
|---|---|---|
|
||||
| 0008 cross-request prefix-share (server loop) | Concurrent shared-prefix requests prefill only the divergent suffix (fewer prefill tokens). | Does not stabilise decode batch shape; does not graph-reuse. |
|
||||
| 0013 `LLAMA_PREFILL_BUDGET` | Static per-step prefill-token cap (vLLM `--max-num-batched-tokens` analogue); flattens the ITL spike a long prefill inflicts on co-batched decode. | Ignores decode load; per-workload tuning; no effect on decode-step graph reuse. |
|
||||
| 0016 dynamic decode-first budget | `max(n_ubatch, T-D)` leftover-after-decode budget + per-slot chunk cap; decode claimed first, auto-shrinks as `D` rises. Stops a prefill chunk from inflating the step past `T`. | **Still lets the per-step decode `n_tokens` and seq-set vary**, so it does not make the decode step graph-reusable; it shapes prefill admission, not decode-shape stability. |
|
||||
| 0024 paged-pool burst-reclaim | Truncate/defrag/release KV blocks; fixes long-server prefill burst collapse (488->44->532 t/s). | Host accounting only; nothing about decode-step graph capture. |
|
||||
| 0025 `LLAMA_MOE_FORCE_GRAPHS` | Keeps CUDA graphs ON for the grouped-MMQ MoE decode step (lifts the conservative `MUL_MAT_ID` graph-disable). | Helps the CUDA-graph *eligibility* of one op; does **not** make layer-A/B *reuse* hold across churning steps. It is necessary-not-sufficient: a step that rebuilds anyway gets recaptured regardless. |
|
||||
| 0029 block-table within-step cache | `get_block_table` computed once per step, memcpy'd to other full-attn layers (-87/-91%). | Shrinks one `set_inputs`/`hostproc` sub-term; does not address rebuild/re-capture. |
|
||||
|
||||
**README s.5 "lever 2 (graph/stream coverage): FLAT"** was concluded **in the
|
||||
static batched-bench regime**, where graphs already reuse - so more graph
|
||||
coverage was correctly a no-op there. That conclusion does **not** apply to the
|
||||
serving regime, where graphs do **not** reuse. This doc reopens graph coverage
|
||||
**for serving only**; record it as a regime-scoped reopening, not a contradiction.
|
||||
|
||||
---
|
||||
|
||||
## 4. Ranked lever plan (hypotheses - gate on Phase 0 first)
|
||||
|
||||
Ranked by value/effort with bit-exactness/risk called out. All are **host-side /
|
||||
scheduler** levers (no decode-kernel changes), so all are *bit-exact-safe by
|
||||
construction* provided padding tokens are masked-inert and verified against the
|
||||
per-path md5 gate.
|
||||
|
||||
### Lever S1 (TOP) - bucketed/padded decode-step shape for graph reuse
|
||||
|
||||
**Value: high (targets the dominant -39% mechanism). Effort: medium-high. Risk:
|
||||
medium (correctness of padding inertness; seq-set churn is harder than n_tokens).**
|
||||
|
||||
Make the steady-state decode step present a **stable, bucketed shape** to both
|
||||
reuse layers, mirroring vLLM's padded decode batch + piecewise CUDA graphs:
|
||||
|
||||
- Pad the per-step decode `n_tokens` (and the stream/seq count the graph sees) up
|
||||
to the next bucket in a small fixed set (e.g. {power-of-two or fixed grid}), so
|
||||
`allow_reuse` (layer A) and `update_required` (layer B) hold across steps with
|
||||
the same bucket. Padding tokens are dummy, masked positions that contribute
|
||||
nothing to any real sequence's logits.
|
||||
- Bound the number of distinct live buckets so a handful of persistent CUDA
|
||||
graphs cover steady decode (vLLM captures ~tens).
|
||||
- Handle the seq-set component of `allow_reuse`: bucketing `n_tokens` alone is
|
||||
insufficient because the *participating sequence-id set* must also match. Either
|
||||
(a) pad to a fixed stream-slot layout so the seq-set is stable across
|
||||
arrivals/completions, or (b) relax/extend the reuse key so a pure-decode step keyed on
|
||||
bucket+slot-layout reuses regardless of which slots are occupied. (b) is the
|
||||
higher-leverage but more invasive option.
|
||||
|
||||
Bit-exact gate: greedy md5 per path with padding ON must equal the recorded
|
||||
references (`5951a5b4` dense, `8cb0ce23` paged-MoE); `test-backend-ops`
|
||||
unaffected (no op changes). The risk is that masked/padded positions leak into a
|
||||
real logit (off-by-one in the mask) - the md5 gate catches it.
|
||||
|
||||
### Lever S2 - overlap per-step host work with GPU decode (double-buffer inputs)
|
||||
|
||||
**Value: medium-high (recovers the `hostproc` window even when S1 is only partial).
|
||||
Effort: medium. Risk: low (host-side reordering only, bit-exact-safe).**
|
||||
|
||||
Even with graphs reused, `set_inputs` (+ the pre-`set_inputs` sync) runs
|
||||
un-graphed and serially *before* each launch (`hostproc` ~1.4 ms/step in prior
|
||||
profiles). Overlap the host scheduling + input build of step N+1 with the GPU
|
||||
decode of step N: double-buffer the input device tensors so the host can fill
|
||||
N+1's inputs while N's graph is in flight, and prepare the next ubatch / block
|
||||
table on the host concurrently. This is the llama.cpp analogue of vLLM keeping
|
||||
the GPU fed. Strictly host-side, no numeric change -> bit-exact. (0029 already
|
||||
banks part of this for the block table within a step; S2 extends it across
|
||||
steps.)
|
||||
|
||||
### Lever S3 - graph-shape-stable scheduling (bridge from 0016)
|
||||
|
||||
**Value: medium (multiplies S1; low marginal value without S1). Effort: low-medium
|
||||
(extends the existing 0016 policy). Risk: low (scheduler policy, bit-exact when
|
||||
the decode result is unchanged).**
|
||||
|
||||
Extend the existing decode-first budget (0016) so the scheduler actively *prefers
|
||||
graph-reusable steps*: keep prefill chunks out of the decode step (run prefill in
|
||||
its own steps, or at a fixed chunk size) so the decode batch shape stays on a
|
||||
bucket rather than being perturbed by interleaved prefill tokens every step. This
|
||||
is the policy half of S1 - S1 makes a bucketed step reusable; S3 makes the
|
||||
scheduler emit bucketed steps. Pair them.
|
||||
|
||||
**Rejected/deferred (record so they are not re-tried):**
|
||||
|
||||
- **More CUDA-graph *coverage* alone (the README lever-2 redo): still FLAT
|
||||
without S1.** Forcing more ops graph-eligible (beyond 0025) does nothing while
|
||||
layer A rebuilds the graph every step - the recapture dominates. Only valuable
|
||||
*after* S1 makes reuse hold.
|
||||
- **`GGML_CUDA_DISABLE_GRAPHS` / disabling graphs in serving: REJECTED a priori
|
||||
as a fix** (it is an A/B *probe* for Phase 0, not a lever) - it removes capture
|
||||
cost but also removes replay benefit; expected net-negative.
|
||||
- **Precision levers (W4A16, bf16-SSM): out of scope** - this gap is host-bound,
|
||||
not GEMM/BW-bound (see README s.5 rejections; do not reopen).
|
||||
|
||||
---
|
||||
|
||||
## 5. Phase 0 - confirm it is host-bound BEFORE building (run when the GPU frees)
|
||||
|
||||
Do NOT build any lever until this confirms host-bound. The dev tree already has
|
||||
all the instrumentation; this is a measurement, not a code change. **One GPU
|
||||
bencher at a time** (GPU-contention rule).
|
||||
|
||||
**Workload.** Real continuous serving, not batched-bench: run `llama-server`
|
||||
(paged build) with the paged config and drive it with a steady concurrent
|
||||
streaming load (e.g. a K-client async generator hitting `/completion` with
|
||||
staggered arrivals so requests start/finish asynchronously - the regime
|
||||
batched-bench cannot produce). Use the same models/flags as README s.4:
|
||||
`-fa on -ngl 99`, `LLAMA_KV_PAGED=1` (+ `LLAMA_MOE_FORCE_GRAPHS=1` for MoE),
|
||||
dense Qwen3.6-27B-NVFP4 and MoE Qwen3.6-35B-A3B-NVFP4. Pick K so the *effective
|
||||
decode width* matches a static `npl` you have a kernel-regime number for (e.g.
|
||||
~128) - that gives the apples-to-apples comparison: static 6.1 vs serving 3.7 tok/s/seq.
|
||||
|
||||
**Signals to capture (all already exist):**
|
||||
|
||||
1. **Graph reuse rate.** The `graphs reused = N` perf line (`llama-context.cpp`
|
||||
~L4146, from `data.n_reused`) over total decode steps. Hypothesis: ~100% in
|
||||
batched-bench, near 0% in serving. This is the single most decisive number.
|
||||
A/B with `LLAMA_GRAPH_REUSE_DISABLE=1` (forces the rebuild path) - if serving
|
||||
is already near that floor, layer-A reuse is the gap.
|
||||
2. **`[L5INSTR]` host buckets** (printed at exit): `hostproc`, `set_inputs`,
|
||||
`get_block_table` mean ms/step. Compare serving vs batched-bench. A/B the
|
||||
block-table cache with `LLAMA_PAGED_NO_BT_CACHE`.
|
||||
3. **GPU-busy %** in a steady-state serving window via nsys (sum of kernel
|
||||
durations / wall) and the **inter-launch host gap** (time between consecutive
|
||||
`cudaGraphLaunch`/kernel launches). Hypothesis: batched-bench ~96-99% busy
|
||||
(README/methodology note the early "low util" was a window artifact); serving
|
||||
materially lower, with the gap ~= `hostproc`/step. *Watch the same window
|
||||
artifact* the methodology warns about - measure a clean steady-state span.
|
||||
4. **CUDA-graph re-instantiation count** - confirm layer B is also re-capturing
|
||||
(nsys shows `cudaGraphInstantiate`/`cudaGraphExecUpdate` per step, or add a
|
||||
host-side counter print - host-side only, no kernel code).
|
||||
|
||||
**Decision rule.** Host-bound (proceed with S1/S2/S3) if: serving `graphs reused`
|
||||
is low AND `hostproc`/step is a large fraction of serving per-step wall AND
|
||||
GPU-busy% drops vs batched-bench by ~the observed throughput ratio (~3.7/6.1).
|
||||
If instead GPU-busy% stays high and per-kernel time grows, the cause is
|
||||
elsewhere (e.g. serving runs a worse effective batch shape into the kernels) -
|
||||
re-scope before building.
|
||||
|
||||
**Ground-truth vLLM (both-engine rule).** Capture vLLM at the same concurrency:
|
||||
GPU-busy% / step cadence (nsys) and its scheduler step time. Confirm vLLM stays
|
||||
GPU-bound (persistent graphs) where paged goes host-bound - that is the
|
||||
direct evidence the gap is the host loop, and it sizes the achievable win.
|
||||
|
||||
---
|
||||
|
||||
## 6. Summary
|
||||
|
||||
- The serving gap (paged 3.7 vs vLLM 5.9 tok/s/seq; -39% vs paged's own static 6.1) is a **host/scheduler**
|
||||
problem, distinct from the decode **kernel** (at parity in batched-bench). The
|
||||
README's BW-floor/host-loop-residual findings are kernel-regime and do not
|
||||
bound the serving regime.
|
||||
- Leading mechanism: continuous batching's **batch-shape + seq-set churn breaks
|
||||
both graph-reuse layers** (llama-context `can_reuse`, CUDA `update_required`)
|
||||
every step, so the GPU idles while the host rebuilds + re-captures + runs
|
||||
un-graphed `set_inputs`. vLLM avoids this with padded/bucketed decode shapes +
|
||||
piecewise CUDA graphs.
|
||||
- The shipped scheduler patches (0008/0013/0016/0024/0025/0029) target prefill
|
||||
freezing + burst collapse, **not** decode-step graph reuse - which is why the
|
||||
serving gap survives them.
|
||||
- Top levers (all host-side, bit-exact-safe): **S1** bucketed/padded decode-step
|
||||
shape for graph reuse, **S2** double-buffer/overlap per-step host work, **S3**
|
||||
graph-shape-stable scheduling (extend 0016). Gate everything on **Phase 0**:
|
||||
the `graphs reused` rate + `[L5INSTR]` host buckets + nsys GPU-busy% in real
|
||||
`llama-server` serving vs batched-bench, with vLLM ground-truthed at the same
|
||||
concurrency.
|
||||
</content>
|
||||
</invoke>
|
||||
@@ -0,0 +1,514 @@
|
||||
# Plan: ship the paged llama.cpp as its OWN backend + NVFP4 Qwen3.6 gallery items
|
||||
|
||||
Scoping deliverable only. NOTHING is changed by this document. It is grounded in the
|
||||
actual repo structure (read 2026-06-26 in worktree feat+paged-attention), not assumptions.
|
||||
|
||||
SHIPPED REALITY (update 2026-06-27): the backend ships CUDA-only. The matrix rows and
|
||||
the index.yaml meta-backend keep ONLY the CUDA/cublas variants (cuda-12, cuda-13, and
|
||||
the nvidia-l4t arm64 cuda-12/cuda-13 Jetson rows). The cpu / vulkan / sycl / hipblas /
|
||||
metal-darwin variants discussed below (cpu as Phase 1, the rest as optional/phase-2) were NOT shipped (and the
|
||||
darwin row was removed): off-CUDA the patchset's wins gate off, so it is neutral-to-
|
||||
negative there and non-CUDA users should use the stock llama-cpp backend (README 4c).
|
||||
|
||||
================================================================================
|
||||
0. GROUND TRUTH (what the repo actually does today)
|
||||
================================================================================
|
||||
|
||||
The paged patchset is ALREADY integrated into the stock llama-cpp backend in this
|
||||
worktree. Two mechanisms, both already present:
|
||||
|
||||
(a) BUILD: backend/cpp/llama-cpp/Makefile has `LLAMA_PAGED?=on`. The `llama.cpp:`
|
||||
target git-applies patches/0*.patch (base series) then, when LLAMA_PAGED != off,
|
||||
patches/paged/0*.patch (the 0018-0023 paged series + the earlier 0001-0017).
|
||||
prepare.sh has a fallback `patch`-based apply guarded by a sentinel
|
||||
(llama.cpp/src/paged-kv-manager.cpp). So a stock `make backends/llama-cpp` TODAY
|
||||
already ships the paged engine compiled in.
|
||||
|
||||
(b) RUNTIME GATING: backend/cpp/llama-cpp/grpc-server.cpp ALREADY carries the option
|
||||
hooks (lines ~752-842). They only call setenv() before context init:
|
||||
- option `kv_paged` / `paged_kv` / `paged_attention` -> setenv LLAMA_KV_PAGED=1
|
||||
- option `kv_paged_debug` / `paged_kv_debug` -> setenv LLAMA_KV_PAGED_DEBUG=1
|
||||
- option `max_prefill_tokens` / `mpt` / `prefill_budget` -> setenv LLAMA_PREFILL_BUDGET
|
||||
- option `max_batch_tokens` / `mbt` -> setenv LLAMA_MAX_BATCH_TOKENS
|
||||
- option `prefill_cap` -> setenv LLAMA_PREFILL_CAP
|
||||
Against UNPATCHED llama.cpp these setenv() calls are inert (nothing reads the env),
|
||||
so grpc-server.cpp is byte-safe to share between a clean build and a paged build.
|
||||
The paged engine itself lives entirely inside the patched llama.cpp lib
|
||||
(paged-kv-manager.cpp etc.), NOT in grpc-server.cpp.
|
||||
|
||||
Conclusion: "stock llama-cpp + paged patchset, runtime-gated" is the CURRENT state of
|
||||
ONE backend. The task is to SPLIT that into two backends:
|
||||
- llama-cpp = clean upstream llama.cpp (de-risked: a dep-bump can never break on a
|
||||
paged hook), grpc-server.cpp keeps the dormant hooks.
|
||||
- <newname> = stock grpc-server.cpp + paged patch series applied + paged on.
|
||||
|
||||
The turboquant backend is the EXACT precedent for "a llama.cpp variant that reuses the
|
||||
backend/cpp/llama-cpp grpc-server sources via a thin wrapper Makefile + its own Dockerfile
|
||||
+ its own matrix rows". Copy turboquant's shape, with two simplifications (see section 1).
|
||||
|
||||
CPU_ALL_VARIANTS reuse: backend/cpp/llama-cpp/Makefile already has `llama-cpp-cpu-all`
|
||||
(one grpc-server + dlopen libggml-cpu-*.so via -DGGML_BACKEND_DL/-DGGML_CPU_ALL_VARIANTS,
|
||||
SHARED_LIBS=ON make-var). turboquant mirrors it with `turboquant-cpu-all`. The new backend
|
||||
gets the same single-build CPU target for free by reusing the same Makefile machinery.
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
RECOMMENDED BACKEND NAME: `llama-cpp-paged` (see section 4 for the full rationale)
|
||||
--------------------------------------------------------------------------------
|
||||
Everywhere below, NAME = llama-cpp-paged, DOCKERFILE = Dockerfile.llama-cpp-paged,
|
||||
SRC DIR = backend/cpp/llama-cpp-paged/, MAKE VAR = BACKEND_LLAMA_CPP_PAGED.
|
||||
DO NOT use the dotted working name `localai-llama.cpp`: a dot in Dockerfile.<suffix> and
|
||||
in the tag-suffix is unprecedented (no sibling name contains a dot: llama-cpp, ik-llama-cpp,
|
||||
turboquant, ds4) and complicates the changed-backends.js endsWith() suffix matching.
|
||||
|
||||
================================================================================
|
||||
1. NEW BACKEND - file by file
|
||||
================================================================================
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
1.1 backend/cpp/llama-cpp/Makefile (the ONE necessary touch to stock)
|
||||
--------------------------------------------------------------------------------
|
||||
Change exactly one default so the STOCK image ships clean against upstream:
|
||||
|
||||
-LLAMA_PAGED?=on
|
||||
+LLAMA_PAGED?=off
|
||||
|
||||
Why: this is the entire point of the split - stock llama-cpp must build clean so an
|
||||
upstream LLAMA_VERSION bump can never fail on a paged hook. The runtime hooks in
|
||||
grpc-server.cpp stay (inert). The new backend forces LLAMA_PAGED=on explicitly (1.2), so
|
||||
it does not depend on this default. NOTE this DOES change stock's shipped artifact (it
|
||||
currently ships paged-compiled-in-but-gated); that is intended de-risking, call it out in
|
||||
the PR. If the team prefers stock literally untouched, the alternative is to leave
|
||||
`?=on` and accept that stock keeps carrying the patch series - but then "clean stock" is
|
||||
not achieved. Recommendation: flip to off.
|
||||
|
||||
(No other change to backend/cpp/llama-cpp/ - grpc-server.cpp, CMakeLists.txt, prepare.sh,
|
||||
patches/, patches/paged/ are all reused as-is by the new backend.)
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
1.2 backend/cpp/llama-cpp-paged/Makefile (NEW - thin wrapper, model on turboquant)
|
||||
--------------------------------------------------------------------------------
|
||||
Mirror backend/cpp/turboquant/Makefile, but SIMPLER (two things turboquant needs that we
|
||||
do NOT):
|
||||
- turboquant overrides LLAMA_REPO/LLAMA_VERSION to a fork. We use the SAME upstream pin
|
||||
as stock (it lives in backend/cpp/llama-cpp/Makefile, already auto-bumped). So we do
|
||||
NOT set LLAMA_VERSION here -> no bump_deps.yaml entry needed (big simplification vs
|
||||
turboquant). We only force LLAMA_PAGED=on.
|
||||
- turboquant runs patch-grpc-server.sh (augments the KV-cache type allow-list) and
|
||||
apply-patches.sh (fork catch-up). We need NEITHER: grpc-server.cpp already has the
|
||||
paged hooks, and the paged patch series is applied by the copied llama-cpp Makefile's
|
||||
own `llama.cpp:` target when LLAMA_PAGED=on.
|
||||
|
||||
Shape (one flavor shown; replicate the turboquant flavor set: avx/avx2/avx512/fallback/
|
||||
cpu-all/grpc/rpc-server):
|
||||
|
||||
LLAMA_CPP_DIR := $(CURRENT_MAKEFILE_DIR)/../llama-cpp
|
||||
|
||||
define paged-build # $(1)=flavor $(2)=cmake flags $(3)=target
|
||||
rm -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp-paged-$(1)-build
|
||||
cp -rf $(LLAMA_CPP_DIR) $(CURRENT_MAKEFILE_DIR)/../llama-cpp-paged-$(1)-build
|
||||
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-paged-$(1)-build purge
|
||||
# clone upstream + apply base AND paged patch series (LLAMA_PAGED=on forces it)
|
||||
LLAMA_PAGED=on $(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-paged-$(1)-build llama.cpp
|
||||
CMAKE_ARGS="$(CMAKE_ARGS) $(2)" TARGET="$(3)" LLAMA_PAGED=on \
|
||||
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-paged-$(1)-build grpc-server
|
||||
cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-paged-$(1)-build/grpc-server llama-cpp-paged-$(1)
|
||||
endef
|
||||
|
||||
llama-cpp-paged-cpu-all:
|
||||
# identical to turboquant-cpu-all: SHARED_LIBS=ON + GGML_BACKEND_DL + CPU_ALL_VARIANTS
|
||||
# + --target ggml; then collect ggml-shared-libs/ for package.sh to bundle.
|
||||
... LLAMA_PAGED=on SHARED_LIBS=ON \
|
||||
EXTRA_CMAKE_ARGS="-DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON" \
|
||||
TARGET="--target grpc-server --target ggml" ...
|
||||
|
||||
package: ; bash package.sh
|
||||
purge: ; rm -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp-paged-*-build; rm -rf llama-cpp-paged-* package
|
||||
clean: purge
|
||||
|
||||
Binaries are named llama-cpp-paged-{cpu-all,fallback,grpc,rpc-server,...} so run.sh and
|
||||
package.sh glob them.
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
1.3 backend/cpp/llama-cpp-paged/run.sh (NEW - copy turboquant/run.sh, rename binaries)
|
||||
--------------------------------------------------------------------------------
|
||||
s/turboquant/llama-cpp-paged/g. Prefers llama-cpp-paged-cpu-all if present, falls back to
|
||||
llama-cpp-paged-fallback; llama-cpp-paged-grpc when LLAMACPP_GRPC_SERVERS set; Darwin
|
||||
DYLD_LIBRARY_PATH branch; lib/ld.so launch. Keep verbatim otherwise.
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
1.4 backend/cpp/llama-cpp-paged/package.sh (NEW - copy turboquant/package.sh, rename)
|
||||
--------------------------------------------------------------------------------
|
||||
s/turboquant/llama-cpp-paged/g. Copies llama-cpp-paged-* into package/, bundles
|
||||
ggml-shared-libs/*.so* into package/lib (the CPU_ALL_VARIANTS dlopen set), copies run.sh,
|
||||
and the per-arch libc/ld.so set (unchanged).
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
1.5 backend/Dockerfile.llama-cpp-paged (NEW - copy Dockerfile.turboquant, swap paths)
|
||||
--------------------------------------------------------------------------------
|
||||
Identical 3-stage structure (builder-fromsource / builder-prebuilt / FROM scratch). Edits:
|
||||
- bind/run .docker/llama-cpp-paged-compile.sh (new, 1.6) instead of turboquant-compile.sh
|
||||
- ccache id: id=llama-cpp-paged-ccache-${TARGETARCH}-${BUILD_TYPE}
|
||||
(OPTIONAL OPTIMIZATION: set id=llama-cpp-ccache-${TARGETARCH}-${BUILD_TYPE} to SHARE
|
||||
stock llama-cpp's ccache - the paged TUs are mostly byte-identical to stock, so a warm
|
||||
stock cache would give the paged build near-free object reuse. Trade-off: a regression
|
||||
in one could surface as a cold miss in the other. Recommend sharing; revisit if noisy.)
|
||||
- both make invocations become `make -BC /LocalAI/backend/cpp/llama-cpp-paged package`
|
||||
- final COPY --from=builder /LocalAI/backend/cpp/llama-cpp-paged/package/. ./
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
1.6 .docker/llama-cpp-paged-compile.sh (NEW - copy llama-cpp-compile.sh, swap make targets)
|
||||
--------------------------------------------------------------------------------
|
||||
Identical to .docker/llama-cpp-compile.sh except `cd .../llama-cpp-paged` and call
|
||||
`make llama-cpp-paged-cpu-all` (BUILD_TYPE empty / CPU) or `make llama-cpp-paged-fallback`
|
||||
(GPU), then `make llama-cpp-paged-grpc` + `make llama-cpp-paged-rpc-server`. Keep the
|
||||
arm64 gcc-14 apt step (CPU_ALL_VARIANTS armv9.2 SME needs gcc-14). ccache export unchanged.
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
1.7 Makefile (top-level) - 6 edits, mirror the turboquant lines
|
||||
--------------------------------------------------------------------------------
|
||||
a) .NOTPARALLEL (line 2): append `backends/llama-cpp-paged`
|
||||
b) Backend def (after BACKEND_TURBOQUANT, line ~1172):
|
||||
# llama-cpp-paged = stock llama.cpp grpc-server + LocalAI paged-attention patch
|
||||
# series (LLAMA_PAGED=on). Reuses backend/cpp/llama-cpp sources via a thin wrapper.
|
||||
BACKEND_LLAMA_CPP_PAGED = llama-cpp-paged|llama-cpp-paged|.|false|false
|
||||
(lang field `llama-cpp-paged` -> Dockerfile.llama-cpp-paged, matching the
|
||||
llama-cpp / ik-llama-cpp / turboquant convention where lang==backend name.)
|
||||
c) generate-docker-build-target eval (after BACKEND_TURBOQUANT, line ~1273):
|
||||
$(eval $(call generate-docker-build-target,$(BACKEND_LLAMA_CPP_PAGED)))
|
||||
d) docker-build-backends (line ~1337): append docker-build-llama-cpp-paged
|
||||
e) test-extra-backend-llama-cpp-paged target (mirror test-extra-backend-turboquant,
|
||||
line ~673): BACKEND_IMAGE=local-ai-backend:llama-cpp-paged $(MAKE) test-extra-backend
|
||||
f) (optional) backends/llama-cpp-paged-darwin target if shipping metal (mirror
|
||||
backends/llama-cpp-darwin at line 1124; see 1.11).
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
1.8 .github/backend-matrix.yml - add rows (mirror every llama-cpp row, swap names)
|
||||
--------------------------------------------------------------------------------
|
||||
For EACH variant you choose to ship (see phased recommendation in section 4), add a row
|
||||
copied from the corresponding llama-cpp row with:
|
||||
- backend: "llama-cpp-paged"
|
||||
- dockerfile: "./backend/Dockerfile.llama-cpp-paged"
|
||||
- tag-suffix: swap `-llama-cpp` -> `-llama-cpp-paged`
|
||||
(e.g. -cpu-llama-cpp -> -cpu-llama-cpp-paged;
|
||||
-gpu-nvidia-cuda-12-llama-cpp -> -gpu-nvidia-cuda-12-llama-cpp-paged; etc.)
|
||||
- builder-base-image: UNCHANGED - reuse the same base-grpc-* tags as llama-cpp
|
||||
(this backend compiles the same gRPC + same toolchain; no new base-images.yml variant
|
||||
is needed, so NO base-images bootstrap step). This is the cheap-variant payoff.
|
||||
- CPU: TWO per-arch rows (amd64 ubuntu-latest + arm64 ubuntu-24.04-arm) sharing
|
||||
tag-suffix '-cpu-llama-cpp-paged' so changed-backends.js emits a merge-matrix entry and
|
||||
backend-merge-jobs assembles the manifest list. Same per-arch native + manifest-merge
|
||||
pattern as -cpu-llama-cpp.
|
||||
- Darwin (if shipping): add to includeDarwin:
|
||||
- backend: "llama-cpp-paged"
|
||||
tag-suffix: "-metal-darwin-arm64-llama-cpp-paged"
|
||||
lang: "go"
|
||||
(omit build-type, exactly like the llama-cpp darwin row at line 4908.)
|
||||
|
||||
REMINDER: the CI path filter only builds a backend on a PR when a file under its dir
|
||||
changes. The PR that adds this backend touches backend/cpp/llama-cpp-paged/* so it self-
|
||||
triggers. But also add the cross-trigger in 1.9 so future edits to backend/cpp/llama-cpp/
|
||||
(the shared source) retrigger this backend too.
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
1.9 scripts/changed-backends.js - three edits (mirror turboquant exactly)
|
||||
--------------------------------------------------------------------------------
|
||||
a) inferBackendPath(): add BEFORE the generic `endsWith("llama-cpp")` branch (line 56),
|
||||
next to the turboquant branch (line 45):
|
||||
if (item.dockerfile.endsWith("llama-cpp-paged")) {
|
||||
// reuses backend/cpp/llama-cpp sources via a thin wrapper Makefile
|
||||
return `backend/cpp/llama-cpp-paged/`;
|
||||
}
|
||||
ORDER NOTE: "Dockerfile.llama-cpp-paged".endsWith("llama-cpp") is false, so the generic branch cannot shadow this one today, but
|
||||
keep the specific branch first regardless (defensive, and returns the right path).
|
||||
b) inferBackendPathDarwin(): add a case (next to the llama-cpp one at line 66):
|
||||
if (item.backend === "llama-cpp-paged") { return `backend/cpp/llama-cpp-paged/`; }
|
||||
c) Per-backend cross-trigger (line 274-278, mirror the turboquant block):
|
||||
if (backend === "llama-cpp-paged" && !changed) {
|
||||
changed = changedFiles.some(file => file.startsWith("backend/cpp/llama-cpp/"));
|
||||
}
|
||||
Verify: node -e "... e.dockerfile.endsWith('llama-cpp-paged') ..." per adding-backends.md.
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
1.10 backend/index.yaml - meta + image entries (META-BACKEND - capabilities map, NO uri)
|
||||
--------------------------------------------------------------------------------
|
||||
GOTCHA (project_backend_meta_gotcha): a backend that ships per-platform images MUST be a
|
||||
meta backend = an anchor with a `capabilities:` map and NO top-level `uri:`; the concrete
|
||||
per-platform entries carry the uri. Copy the *llamacpp anchor (lines 3-31).
|
||||
|
||||
Step a - meta anchor in `## metas` (after *turboquant, ~line 74):
|
||||
- &llamacpppaged
|
||||
name: "llama-cpp-paged"
|
||||
alias: "llama-cpp-paged"
|
||||
license: mit
|
||||
icon: <same as llama-cpp>
|
||||
description: |
|
||||
LocalAI's paged-attention llama.cpp: on-demand paged KV cache + decode-first
|
||||
prefill budget. Stock llama.cpp grpc-server + the LocalAI paged patch series.
|
||||
Tuned for NVFP4 dense/MoE on Blackwell/GB10. Reuses the llama-cpp gRPC server.
|
||||
urls: [ https://github.com/ggerganov/llama.cpp ]
|
||||
tags: [ text-to-text, LLM, CPU, GPU, CUDA, Metal, paged-attention, nvfp4 ]
|
||||
capabilities:
|
||||
default: "cpu-llama-cpp-paged"
|
||||
nvidia: "cuda12-llama-cpp-paged"
|
||||
nvidia-cuda-12: "cuda12-llama-cpp-paged"
|
||||
nvidia-cuda-13: "cuda13-llama-cpp-paged"
|
||||
nvidia-l4t: "nvidia-l4t-arm64-llama-cpp-paged"
|
||||
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-llama-cpp-paged"
|
||||
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-llama-cpp-paged"
|
||||
metal: "metal-llama-cpp-paged"
|
||||
# add amd/intel/vulkan keys ONLY for variants you actually build (section 4)
|
||||
|
||||
Step b - a `-development` meta (mirror llama-cpp-development, line 1611) with the same
|
||||
capabilities map pointing at the `*-development` image names.
|
||||
|
||||
Step c - concrete image entries at end of file (mirror the llama-cpp block lines
|
||||
2106-2200), one latest + one development per variant, each as:
|
||||
- !!merge <<: *llamacpppaged
|
||||
name: "cpu-llama-cpp-paged"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-llama-cpp-paged"
|
||||
mirrors: [ localai/localai-backends:latest-cpu-llama-cpp-paged ]
|
||||
- !!merge <<: *llamacpppaged
|
||||
name: "cpu-llama-cpp-paged-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-cpu-llama-cpp-paged"
|
||||
mirrors: [ localai/localai-backends:master-cpu-llama-cpp-paged ]
|
||||
...repeat for cuda12 / cuda13 / l4t / metal etc.
|
||||
The `latest-` / `master-` uri prefix + tag-suffix MUST match the matrix tag-suffix exactly.
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
1.11 Darwin (only if shipping metal; the NVFP4 target is CUDA, so metal is optional/phase 2)
|
||||
--------------------------------------------------------------------------------
|
||||
If metal is shipped, also:
|
||||
- scripts/build/llama-cpp-paged-darwin.sh (copy scripts/build/llama-cpp-darwin.sh; it
|
||||
drives the 3 CMake variants + otool dylib bundling). Ensure it forces LLAMA_PAGED=on.
|
||||
- Makefile `backends/llama-cpp-paged-darwin` target (mirror backends/llama-cpp-darwin).
|
||||
- backend_build_darwin.yml: add the llama-cpp-paged branch (mirror the llama-cpp-specific
|
||||
step that calls `make backends/llama-cpp-darwin`).
|
||||
- index.yaml metal-llama-cpp-paged / -development image entries (already in 1.10).
|
||||
- C++ proto gotcha already handled (reuses llama-cpp CMakeLists.txt with hw_grpc_proto
|
||||
linking protobuf/grpc++), so no Homebrew-include failure.
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
1.12 Importer / /backends/known dropdown (drop-in, NOT a new importer)
|
||||
--------------------------------------------------------------------------------
|
||||
This backend consumes GGUF exactly like llama-cpp -> extend the EXISTING importer, do not
|
||||
add a new one (per adding-backends.md rule 2). Edit core/gallery/importers/llama-cpp.go:
|
||||
- AdditionalBackends() (line 37): append
|
||||
{Name: "llama-cpp-paged", Modality: "text",
|
||||
Description: "Paged-attention llama.cpp (on-demand paged KV + decode-first budget)"}
|
||||
- Import() backend allow-list (line 133): add "llama-cpp-paged" to the switch case so a
|
||||
preferences.backend == "llama-cpp-paged" is honored:
|
||||
case "ik-llama-cpp", "turboquant", "llama-cpp-paged": backend = b
|
||||
- core/gallery/importers/importers_test.go: add a table case asserting the preference
|
||||
override emits backend: llama-cpp-paged (Ginkgo/Gomega; reuse an existing public GGUF
|
||||
HF fixture). Run `go test ./core/gallery/importers/...`.
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
1.13 Docs
|
||||
--------------------------------------------------------------------------------
|
||||
- docs/content/features/backends.md: add llama-cpp-paged to the text-to-text/LLM list,
|
||||
one line noting paged KV + NVFP4 Blackwell tuning. (Not an in-house from-scratch engine
|
||||
-> it is a llama.cpp variant -> do NOT add to the README maintained-engines table.)
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
1.14 Does grpc-server.cpp need the paged hooks? YES - already present, reused unchanged.
|
||||
--------------------------------------------------------------------------------
|
||||
The hooks (kv_paged / max_batch_tokens / prefill_budget / prefill_cap) are already in the
|
||||
SHARED backend/cpp/llama-cpp/grpc-server.cpp. The paged backend reuses that file verbatim
|
||||
(via the Makefile copy). No patch-grpc-server.sh step is needed (unlike turboquant). The
|
||||
hooks are what translate the gallery `options:` (section 2) into the LLAMA_KV_PAGED /
|
||||
LLAMA_MAX_BATCH_TOKENS env that the paged llama.cpp lib reads.
|
||||
|
||||
================================================================================
|
||||
2. GALLERY ITEMS - NVFP4 Qwen3.6 dense + MoE
|
||||
================================================================================
|
||||
|
||||
Add two entries to gallery/index.yaml. Schema (verified against existing GGUF items and
|
||||
the LocalAI config structs): backend selection via `overrides.backend`; runtime knobs via
|
||||
either typed config fields (context_size/f16/flash_attention/gpu_layers/batch) or the
|
||||
`options:` string list (key:value, parsed by grpc-server.cpp set_option).
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
2.1 Benchmark llama-server flags -> LocalAI model-config mapping
|
||||
--------------------------------------------------------------------------------
|
||||
-c 131072 -> context_size: 131072 (LLMConfig.ContextSize, yaml context_size)
|
||||
-fa on -> flash_attention: "on" (LLMConfig.FlashAttention, yaml flash_attention; string)
|
||||
-ngl 99 -> gpu_layers: 99 (LLMConfig.NGPULayers, yaml gpu_layers; or omit -> DefaultNGPULayers offloads all)
|
||||
-b 2048 -> batch: 2048 (schema.PredictionOptions.Batch, yaml batch) [see caveat]
|
||||
--parallel 128 -> options: ["parallel:128"] (grpc-server.cpp:629; alias n_parallel)
|
||||
LLAMA_KV_PAGED=1 -> options: ["paged_kv:true"] (grpc-server.cpp:778)
|
||||
LLAMA_MAX_BATCH_TOKENS=512 -> options: ["max_batch_tokens:512"] (grpc-server.cpp:821; alias mbt)
|
||||
f16 KV -> f16: true (LLMConfig.F16, yaml f16)
|
||||
(recommended for paged) -> options: ["kv_unified:false"] (grpc-server.cpp:746 - the per-slot paged
|
||||
capacity/memory benefit only materializes with a per-sequence cache;
|
||||
the patch comment explicitly recommends pairing paged with kv_unified:false)
|
||||
|
||||
CAVEAT (-ub 512): LocalAI sets params.n_ubatch = params.n_batch = request->nbatch()
|
||||
(grpc-server.cpp:528,532). There is NO separate config field for n_ubatch, so the
|
||||
benchmark's `-b 2048 -ub 512` split is NOT exactly reproducible. Options:
|
||||
(i) set batch: 512 -> n_batch=n_ubatch=512 (matches -ub; the decode-first
|
||||
max_batch_tokens=512 budget is the dominant prefill lever anyway, and the
|
||||
benchmark states decode throughput is budget-independent), OR
|
||||
(ii) set batch: 2048 -> n_ubatch also 2048 (bigger physical batch, more KV scratch).
|
||||
RECOMMEND (i) batch: 512 for the shipped gallery config (closest to the measured run +
|
||||
lighter memory). Flag separately: a tiny grpc-server.cpp option `n_ubatch`/`ubatch` could
|
||||
be added later to honor -b/-ub independently (not required to ship).
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
2.2 gallery/index.yaml entry - DENSE q36-27b-nvfp4
|
||||
--------------------------------------------------------------------------------
|
||||
- name: "qwen3.6-27b-nvfp4-paged"
|
||||
url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
|
||||
urls:
|
||||
- https://huggingface.co/<ORG>/Qwen3.6-27B-NVFP4-GGUF # placeholder, section 3
|
||||
description: |
|
||||
Qwen3.6-27B dense, native Blackwell NVFP4 (FP4-MMA) GGUF. Configured for LocalAI's
|
||||
paged-attention llama.cpp backend: on-demand paged KV + decode-first prefill budget.
|
||||
Benchmarked on GB10/DGX Spark at 90-117% of vLLM dense decode at 1.5-3x lower memory.
|
||||
license: "apache-2.0" # confirm vs Qwen license
|
||||
tags: [ llm, gguf, nvfp4, reasoning ]
|
||||
icon: https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png
|
||||
overrides:
|
||||
backend: llama-cpp-paged
|
||||
f16: true
|
||||
flash_attention: "on"
|
||||
context_size: 131072
|
||||
gpu_layers: 99
|
||||
batch: 512 # see -ub caveat 2.1; matches the 512 ubatch floor
|
||||
known_usecases: [ chat ]
|
||||
options:
|
||||
- use_jinja:true
|
||||
- paged_kv:true # LLAMA_KV_PAGED=1
|
||||
- max_batch_tokens:512 # LLAMA_MAX_BATCH_TOKENS=512 (decode-first QoS budget)
|
||||
- kv_unified:false # enables the per-slot paged capacity/memory benefit
|
||||
- parallel:128 # --parallel 128 serving slots
|
||||
parameters:
|
||||
model: llama-cpp/models/Qwen3.6-27B-NVFP4-GGUF/q36-27b-nvfp4.gguf
|
||||
template:
|
||||
use_tokenizer_template: true
|
||||
files:
|
||||
- filename: llama-cpp/models/Qwen3.6-27B-NVFP4-GGUF/q36-27b-nvfp4.gguf
|
||||
sha256: <FILL after publish>
|
||||
uri: https://huggingface.co/<ORG>/Qwen3.6-27B-NVFP4-GGUF/resolve/main/q36-27b-nvfp4.gguf
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
2.3 gallery/index.yaml entry - MoE q36-35b-a3b-nvfp4
|
||||
--------------------------------------------------------------------------------
|
||||
Same shape; the MoE is lighter on per-token compute and memory bandwidth (~3B active parameters). parallel:128 + budget 256 was the
|
||||
MoE decode-throughput sweet spot in the sweep, but 512 is fine as a default; if optimizing
|
||||
purely for saturated MoE decode use max_batch_tokens:256.
|
||||
- name: "qwen3.6-35b-a3b-nvfp4-paged"
|
||||
urls: [ https://huggingface.co/<ORG>/Qwen3.6-35B-A3B-NVFP4-GGUF ]
|
||||
...
|
||||
overrides:
|
||||
backend: llama-cpp-paged
|
||||
f16: true
|
||||
flash_attention: "on"
|
||||
context_size: 131072
|
||||
batch: 512
|
||||
options:
|
||||
- use_jinja:true
|
||||
- paged_kv:true
|
||||
- max_batch_tokens:512 # or 256 for max saturated MoE decode (sweep winner)
|
||||
- kv_unified:false
|
||||
- parallel:128
|
||||
parameters:
|
||||
model: llama-cpp/models/Qwen3.6-35B-A3B-NVFP4-GGUF/q36-35b-a3b-nvfp4.gguf
|
||||
files:
|
||||
- filename: llama-cpp/models/Qwen3.6-35B-A3B-NVFP4-GGUF/q36-35b-a3b-nvfp4.gguf
|
||||
sha256: <FILL after publish>
|
||||
uri: https://huggingface.co/<ORG>/Qwen3.6-35B-A3B-NVFP4-GGUF/resolve/main/q36-35b-a3b-nvfp4.gguf
|
||||
|
||||
Note: these are the BENCHMARK serving configs. For an interactive single-user default you
|
||||
may want a second lighter gallery variant (context_size 16384, parallel 4, drop the budget)
|
||||
- optional, not required to ship the benchmark reproduction.
|
||||
|
||||
================================================================================
|
||||
3. GGUF PUBLISHING (so the gallery uri: resolves)
|
||||
================================================================================
|
||||
|
||||
The two GGUFs already exist on the DGX dev box (final_benchmark.csv references
|
||||
q36-27b-nvfp4.gguf and q36-35b-a3b-nvfp4.gguf; README.md "Models" + "Benchmarks"
|
||||
document provenance: dense = native Blackwell FP4 unsloth W4A4 lineage; MoE = 241 NVFP4
|
||||
tensors from nvidia modelopt weights). To publish:
|
||||
|
||||
1. HF repos (suggest two, under the org that owns the gallery-referenced weights):
|
||||
<ORG>/Qwen3.6-27B-NVFP4-GGUF (single q36-27b-nvfp4.gguf)
|
||||
<ORG>/Qwen3.6-35B-A3B-NVFP4-GGUF (single q36-35b-a3b-nvfp4.gguf)
|
||||
ORG = localai-org (brand) or mudler (personal); pick per ownership of the conversions.
|
||||
2. Upload each .gguf; compute sha256 (sha256sum) and paste into the gallery `files:` sha256
|
||||
(LocalAI verifies it on download). Without sha256 the entry still works but loses the
|
||||
integrity check - fill it.
|
||||
3. Model card metadata: base_model Qwen/Qwen3.6-*, library_name gguf, quantization NVFP4,
|
||||
pipeline_tag text-generation, license (confirm Qwen3.6 license terms - apache-2.0 vs
|
||||
Qwen community license), a note that it REQUIRES the llama-cpp-paged backend (NVFP4 +
|
||||
paged), and the GB10 benchmark table (link README.md "Benchmarks" numbers).
|
||||
4. NVFP4 requires a llama.cpp new enough to read the NVFP4 GGUF type. Confirm the pinned
|
||||
LLAMA_VERSION in backend/cpp/llama-cpp/Makefile supports NVFP4 tensor types (the dev
|
||||
tree that produced the GGUFs did). If the current pin predates NVFP4 GGUF support, the
|
||||
backend pin must be bumped OR the paged patch series must carry the NVFP4 reader. THIS
|
||||
IS A GATING CHECK before the gallery items are usable - verify on a GPU box.
|
||||
5. Provenance/licensing: the dense conversion derives from unsloth; the MoE from nvidia
|
||||
modelopt weights. Ensure redistribution of the converted GGUFs is permitted and
|
||||
attribute upstream in the card.
|
||||
|
||||
================================================================================
|
||||
4. OPEN DECISIONS / BLOCKERS / BUILD COST
|
||||
================================================================================
|
||||
|
||||
BACKEND NAME - RECOMMEND `llama-cpp-paged`.
|
||||
- llama-cpp-paged (RECOMMENDED): descriptive (it IS the paged variant), dot-free like
|
||||
every sibling (llama-cpp/ik-llama-cpp/turboquant/ds4), collision-free in the
|
||||
changed-backends.js endsWith() suffix scheme, self-documenting in the /backends/known
|
||||
importer dropdown. Reads correctly next to "turboquant" and "ik-llama-cpp".
|
||||
- localai-llama-cpp (branding alternative, ACCEPTABLE): keeps the LocalAI brand without a
|
||||
dot; hyphenated and safe. Use this if marketing wants "LocalAI's own llama.cpp" framing.
|
||||
Slightly less self-explanatory about WHAT differs (paged) in the dropdown.
|
||||
- localai-llama.cpp (the working name; NOT RECOMMENDED): the dot makes Dockerfile.localai-
|
||||
llama.cpp and tag-suffix -cpu-localai-llama.cpp the only dotted ones in the repo, and
|
||||
".cpp" looks like a file extension to the suffix matcher. Avoid.
|
||||
|
||||
BLOCKERS / GATING CHECKS (cannot be closed read-only, no GPU here):
|
||||
1. NVFP4 GGUF read support in the pinned LLAMA_VERSION (section 3.4). Must verify on GPU.
|
||||
If unsupported, bump the pin (which also affects stock llama-cpp) or carry the reader.
|
||||
2. The two GGUFs are not yet on HF (section 3). Gallery uri + sha256 are placeholders
|
||||
until upload. Blocks gallery validation only, not the backend build.
|
||||
3. -ub vs -b split (section 2.1) is not exactly reproducible without a tiny grpc-server
|
||||
option; shipped config uses batch:512. Minor, not a blocker.
|
||||
4. Flipping stock LLAMA_PAGED?=off changes stock's shipped artifact (de-risking, intended)
|
||||
- get explicit sign-off since it alters a heavily-used backend's build.
|
||||
|
||||
PLATFORM SHIP MATRIX (RECOMMENDED PHASING - the variant is cheap because it reuses the same
|
||||
base-grpc-* prebuilt bases and the same compile machinery, so each row is just CI minutes):
|
||||
Phase 1 (the benchmark target - GB10/Blackwell is CUDA):
|
||||
- cuda12 amd64, cuda13 amd64, cuda13 arm64 (sbsa), l4t-cuda-12 arm64 (NVFP4/paged win)
|
||||
- cpu-all amd64 + cpu-all arm64 (the single CPU_ALL_VARIANTS build; baseline coverage)
|
||||
Phase 2 (parity with stock llama-cpp coverage, only if demand):
|
||||
- metal-darwin-arm64 (1.11), vulkan amd64/arm64, rocm amd64, intel sycl f16/f32
|
||||
Defer rocm/sycl/vulkan/metal unless asked - the paged + NVFP4 story is GPU/CUDA-centric
|
||||
and these add CI cost without a clear consumer.
|
||||
|
||||
BUILD-COST ESTIMATE PER PLATFORM (with warm base-grpc-* base + ccache; the paged TUs are
|
||||
~byte-identical to stock so a SHARED ccache id makes most objects free):
|
||||
- CPU_ALL_VARIANTS (per arch): ~15-30 min warm / ~35-50 min cold. arm64 adds a gcc-14
|
||||
apt step. Two arches + a merge job.
|
||||
- CUDA (per arch): ~25-45 min warm / ~45-75 min cold (nvcc dominates; ccache helps less
|
||||
across CUDA arch flag changes). amd64 cuda12 + cuda13, arm64 cuda13 + l4t = 4 jobs.
|
||||
- Metal/Darwin (if Phase 2): native macos-14 runner, ~20-35 min with the ccache cache.
|
||||
- No base-images.yml change and no bootstrap dispatch (reuses existing base-grpc-* tags),
|
||||
so the only new CI cost is the per-row build minutes above. PR builds read cache, don't
|
||||
write; first master build per row pays the cold cost once, then warm.
|
||||
|
||||
VERIFICATION (post-implementation, needs a GPU box - out of scope here):
|
||||
- `make backends/llama-cpp-paged` builds + installs locally (from-source path).
|
||||
- Confirm stock `make backends/llama-cpp` now builds clean (no paged-kv-manager.cpp in the
|
||||
checkout) - proves the split.
|
||||
- Load a published NVFP4 GGUF via the gallery entry, hit /v1/chat/completions, confirm the
|
||||
server log shows LLAMA_KV_PAGED engaged (LLAMA_KV_PAGED_DEBUG trace) and the configured
|
||||
max_batch_tokens/parallel took effect.
|
||||
- go test ./core/gallery/importers/... green (importer drop-in case).
|
||||
- node scripts/changed-backends.js dry-run: editing backend/cpp/llama-cpp/* retriggers
|
||||
llama-cpp-paged (cross-trigger), editing backend/cpp/llama-cpp-paged/* triggers it too.
|
||||
|
||||
================================================================================
|
||||
END OF PLAN
|
||||
================================================================================
|
||||
@@ -0,0 +1,75 @@
|
||||
# Paged bit-exactness gate - per path (canonical references)
|
||||
|
||||
## TL;DR
|
||||
|
||||
The greedy decode of the **paged** path does not byte-match the **non-paged**
|
||||
path for the MoE model. This is a **benign FP-accumulation-order difference of
|
||||
the paged attention reduction**, KL-validated against the f16 reference. It is
|
||||
**not a bug**. The bit-exactness gate is therefore **per path**:
|
||||
|
||||
| path | model | canonical md5 |
|
||||
|------|-------|---------------|
|
||||
| non-paged | MoE q36-35b-a3b-nvfp4 | `07db32c2bcb78d17a43ed18bc22705cd` |
|
||||
| paged | MoE q36-35b-a3b-nvfp4 | `8cb0ce23777bf55f92f63d0292c756b0` |
|
||||
| non-paged | dense q36-27b-nvfp4 | `5951a5b4d624ce891e22ab5fca9bc439` |
|
||||
| paged | dense q36-27b-nvfp4 | `5951a5b4d624ce891e22ab5fca9bc439` (bit-exact to non-paged) |
|
||||
|
||||
Gate command (chat-template / conversation path):
|
||||
```
|
||||
llama-completion -m MODEL -ngl 99 -fa on -p "The capital of France is" \
|
||||
-n 48 --temp 0 --seed 1
|
||||
# paged: prefix with LLAMA_KV_PAGED=1 LLAMA_MOE_FORCE_GRAPHS=1
|
||||
```
|
||||
Note: use the default chat-template path (do **not** pass `-no-cnv`; raw
|
||||
completion lands in a different md5 namespace).
|
||||
|
||||
**Future paged-MoE regressions compare to the PAGED reference `8cb0ce23`, not to
|
||||
the non-paged `07db32c2`.** Dense is bit-exact across paths, so dense uses the
|
||||
single reference `5951a5b4`.
|
||||
|
||||
## Why dense is bit-exact but MoE is not
|
||||
|
||||
Dense paged decode reproduces the non-paged reduction order exactly, so dense
|
||||
greedy md5 is identical across paths. The MoE path runs additional kernels (the
|
||||
NVFP4 MoE GEMM + expert routing) whose multi-kernel accumulation order differs
|
||||
between the paged and non-paged attention layouts. Over a long greedy decode this
|
||||
flips a small number of near-tied argmaxes, changing the byte stream. The same
|
||||
divergence is present on the 0028 baseline, with `LLAMA_MOE_FORCE_GRAPHS` on or
|
||||
off, and with the patch-0029 block-table cache on or off - it is a property of
|
||||
the paged attention path, not of any one lever.
|
||||
|
||||
## KL evidence that the paged path is sound (the load-bearing check)
|
||||
|
||||
`llama-perplexity --kl-divergence` on `q36-35b-a3b-nvfp4.gguf`, 16 chunks,
|
||||
`-c 512 -ngl 99 --seed 1`, base logits from the f16 reference
|
||||
(`darwin_36b_opus/f16.gguf`, PPL 7.3734):
|
||||
|
||||
| comparison | PPL(Q) | KL divergence | Same top p | Cor |
|
||||
|------------|-------:|--------------:|-----------:|----:|
|
||||
| f16 reference | 7.3734 | - | - | - |
|
||||
| **non-paged** vs f16 | 7.3896 | 0.136597 +/- 0.003157 | 84.314% | 97.68% |
|
||||
| **paged** vs f16 | 7.4009 | 0.136000 +/- 0.003285 | 84.828% | 97.58% |
|
||||
| paged vs non-paged (direct) | 7.4009 (base 7.3818) | 0.050011 +/- 0.001653 | 89.044% | 99.04% |
|
||||
|
||||
Direct paged-vs-non-paged: Mean Delta-p = 0.079% (no bias), RMS Delta-p = 6.187%.
|
||||
|
||||
### Verdict: BENIGN
|
||||
|
||||
- **Paged does not diverge from the f16 ground truth more than non-paged does.**
|
||||
KLD(paged||f16) = 0.13600 <= KLD(nonpaged||f16) = 0.13660, and PPL(paged) =
|
||||
7.4009 ~ PPL(nonpaged) = 7.3896 (difference 0.011, far inside the +/- 0.29
|
||||
error bars). A real paged-MoE correctness bug would push paged measurably
|
||||
*further* from f16; it does not (it is marginally closer).
|
||||
- **Paged and non-paged cluster together.** They agree with each other (KLD 0.050,
|
||||
89.0% same-top-p) more than either agrees with f16 (KLD ~0.137, ~84% same-top-p),
|
||||
with essentially zero probability bias. That is the signature of two equivalent
|
||||
FP-reorderings of the same quantized model, both equally approximating the f16
|
||||
ground truth - not a quality regression.
|
||||
- The direct same-top-p of 89.0% is below a naive ">99%" heuristic, but that
|
||||
heuristic is calibrated for higher-precision models. In a 4-bit (NVFP4) model
|
||||
logit near-ties are abundant, so a different-but-equivalent reduction order
|
||||
flips ~11% of argmaxes with no quality cost (proven by the equal KLD-to-f16 and
|
||||
zero Delta-p bias).
|
||||
|
||||
Therefore the canonical gate is per path, and `8cb0ce23` is the validated paged
|
||||
reference for the MoE deployment path.
|
||||
286
backend/cpp/llama-cpp-localai-paged/docs/PARITY_HANDOFF.md
Normal file
286
backend/cpp/llama-cpp-localai-paged/docs/PARITY_HANDOFF.md
Normal file
@@ -0,0 +1,286 @@
|
||||
# PARITY_HANDOFF: how to pick up the GB10 vLLM-parity work
|
||||
|
||||
Audience: an agent with **zero prior context** who has been told to "continue the GB10 vLLM-parity investigation" on the `llama-cpp-localai-paged` backend.
|
||||
|
||||
This file is the **operational how-to**. It is the companion to `VLLM_PARITY_FINAL.md`, which is the **why / authoritative record** ("never re-litigate"). If the two ever disagree on a *fact*, `VLLM_PARITY_FINAL.md` and the bench artifacts it cites win; this file wins on *procedure* (how to ssh, lock, build, bench, profile).
|
||||
|
||||
Read order for a cold start:
|
||||
1. This file (TL;DR + hard gates + quickstart).
|
||||
2. `VLLM_PARITY_FINAL.md` (the closed record, every number cites its artifact).
|
||||
3. `.agents/vllm-parity-methodology.md` (the methodology: bit-exact gating, profile-don't-assume, both-engine ground truth).
|
||||
4. The patch-series `README.md` (~44 KB, canonical backend doc) and `PAGED_BITEXACT_NOTE.md`.
|
||||
|
||||
---
|
||||
|
||||
## 1. TL;DR STATE
|
||||
|
||||
- The investigation is **CLOSED**. Parity is **not reachable on GB10** silicon; the residual is a hardware ceiling, not engineering debt.
|
||||
- **Prefill** is a genuine floor at **~36% (MoE) / ~43% (dense)** of vLLM. Prefill is **not** CUDA-graph-replayed, so these numbers are real, not measurement artifacts.
|
||||
- **Decode** is **near-parity: ~86% of vLLM's TRUE GPU-steady decode** (924 vs 1078 t/s). The long-standing **~56% headline was a CUDA-graph measurement artifact** (nsys without `--cuda-graph-trace=node` collapses each graph replay into one opaque launch). Decode is also **ahead of vLLM at low concurrency** (dense 116.7% at N=8) and uses **1.5-3x less memory**, bit-exact per-path.
|
||||
- The lever search was **exhaustive**: every attempt (prefill GEMM, GDN chunked scan, decode fusions, serving/scheduler) is recorded with its verdict and number so it is **not re-run**.
|
||||
- **The path to parity is different hardware: datacenter Blackwell** (B200, HBM, native tcgen05 / CUTLASS FP4). Do NOT reopen GB10 kernels. Re-run the methodology on the new silicon, where vLLM's GB10-losing FLA/Marlin kernels invert.
|
||||
|
||||
---
|
||||
|
||||
## 2. THE HARD GATES YOU MUST NOT VIOLATE
|
||||
|
||||
These are non-negotiable. Violating any of them invalidates the result or the contribution.
|
||||
|
||||
### 2.1 The per-path greedy-md5 bit-exact gate (sacred)
|
||||
The gate is **per-path**: paged vs non-paged attention legitimately produce different (equivalent) FP-reduction orders. Each path is gated against **its own** reference, validated benign by KL-divergence to the f16 reference. Canonical greedy md5s:
|
||||
|
||||
| Path | Model | Canonical md5 |
|
||||
|---|---|---|
|
||||
| non-paged | MoE q36-35b-a3b-nvfp4 | `07db32c2bcb78d17a43ed18bc22705cd` |
|
||||
| **paged** | MoE q36-35b-a3b-nvfp4 | `8cb0ce23777bf55f92f63d0292c756b0` |
|
||||
| non-paged | dense q36-27b-nvfp4 | `5951a5b4d624ce891e22ab5fca9bc439` |
|
||||
| paged | dense q36-27b-nvfp4 | `5951a5b4d624ce891e22ab5fca9bc439` (bit-exact to non-paged) |
|
||||
|
||||
- **Compare paged-to-paged only.** Future paged-MoE regressions compare to `8cb0ce23`, NOT `07db32c2`.
|
||||
- **Why paged-MoE differs (benign, KL-validated):** `llama-perplexity --kl-divergence` on the MoE GGUF (16 chunks, f16 base PPL 7.3734) shows non-paged-vs-f16 KLD 0.136597 and paged-vs-f16 KLD 0.136000, i.e. paged does NOT diverge from f16 ground truth more than non-paged does. Paged and non-paged are two equivalent FP-reorderings of the same 4-bit model. This holds on the 0028 baseline and with `LLAMA_MOE_FORCE_GRAPHS`/0029 on or off, so it is a property of the paged path, not any one lever.
|
||||
- **Every bit-exact patch is gated two ways:** greedy md5 (per path) AND `test-backend-ops` vs the CPU oracle for every touched op.
|
||||
|
||||
### 2.2 The KL-gate for opt-in lossy paths
|
||||
Any path that is NOT byte-identical (e.g. 0033 dequant-bf16, the 0034/0035 large-M FP paths, FP8-KV) ships **default-off** and is gated by a **KL-divergence band**: it requires `KLD(new||f16) <= KLD(FP4-MMQ||f16)` and PPL within the established band. Lossy levers never ship default-on.
|
||||
|
||||
### 2.3 In-backend A/B is the only proof (hard methodology rule)
|
||||
A lever compiled into the binary is **NOT** isolated by a runtime flag alone. It needs a **separately-built in-backend A/B**. Precedents that burned this in: 0031 chunking math was correct yet -22% in-backend; 0034 had a standalone PoC win that did not hold in-backend.
|
||||
|
||||
### 2.4 Contribution / commit gates (LocalAI policy)
|
||||
- **DCO sign-off required:** every commit ends with `Signed-off-by: Ettore Di Giacinto <mudler@localai.io>`.
|
||||
- **AI attribution via `Assisted-by:` trailer:** `Assisted-by: Claude:opus-4.8 [Claude Code]`.
|
||||
- **NEVER add `Co-Authored-By:` (AI) trailers** and never add an AI `Signed-off-by`.
|
||||
- **No em-dashes** anywhere in output (use `-`, `:`, parentheses, or rephrase).
|
||||
- **Ask before every `git push`.** Prior approval does not carry over.
|
||||
|
||||
### 2.5 Fork-first is MANDATORY (the fork is canonical)
|
||||
- The **canonical source of truth is the fork branch `mudler/llama.cpp:localai-paged`** (= pin commit + paged patch commits in order). It is canonical for ALL paged-backend kernel/patch work. The shipped `patches/paged/*.patch` series is a **derivative**: the fork is the source.
|
||||
- **Always update the fork FIRST, in this exact order:** (1) commit the change on the `localai-paged` branch and **push it**, then (2) regenerate the LocalAI series (`backend/cpp/llama-cpp-localai-paged/patches/paged/`) from the fork via `git format-patch` (one patch per fork commit, source-only, never touching a `*.md`/dev-doc), so the series stays a **1:1, drift-free mirror** of the branch. No hand-export.
|
||||
- **NEVER edit the LocalAI `patches/paged/*.patch` files directly**, and **NEVER add a patch to the series with no corresponding fork-branch commit.** They are generated output, not source.
|
||||
- The fork branch is also **where the build and the per-path bit-exact md5 gate actually run**, so it is the **only** place a change is truly validated. A patch that lives only in the LocalAI series has never been built or gated.
|
||||
- **Mirror invariant (verify by tree hash):** applying the full on-disk series on the pin must reproduce the fork branch tree byte-for-byte. The series has **intentional gaps** (missing 0005, 0026, 0027, 0032, 0036-0039, 0045), so the patch count is not the max number; what must hold is the tree-hash equality, not the count. (Concretely: fork HEAD `51168c5ee` "patch 0044" is byte-identical to worktree `0044-feat-paged-fused-gated-RMSNorm-SiLU-gate-mul.patch`; the f32-only M5 tensor-core scan is worktree patch `0047`.)
|
||||
|
||||
### 2.6 Bench hygiene gates
|
||||
- **NEVER set `LLAMA_MAX_BATCH_TOKENS` in benches** (the harness explicitly logs "NO LLAMA_MAX_BATCH_TOKENS").
|
||||
- Do **not** set `GDN_TC`, `GDN_CHUNK_MIN`, or `LLAMA_PAGED_DECODE_STABLE` in parity benches. Production defaults are compiled in: **GDN M5 on (`GDN_TC=5`, `GDN_CHUNK_MIN=64`), S1 decode-graph on, S3 off.**
|
||||
- **Decode profiling MUST use `nsys --cuda-graph-trace=node`** (see section 3.4). This is a gate, not a suggestion.
|
||||
|
||||
---
|
||||
|
||||
## 3. OPERATIONAL QUICKSTART (copy-pasteable)
|
||||
|
||||
### 3.0 Host
|
||||
```
|
||||
ssh dgx.casa # resolves to hostname promaxgb10-4ad8; GPU = NVIDIA GB10 (unified LPDDR5x, ~273 GB/s, the bandwidth floor)
|
||||
```
|
||||
`nvidia-smi` reports memory as `[N/A]` (unified memory). CUDA 13 / sm_121.
|
||||
|
||||
### 3.1 GPU lock protocol (`~/gpu_bench_lock`) - TWO conventions, reconcile carefully
|
||||
There are two conventions in flight:
|
||||
- **Old harnesses** (`combined_definitive.sh`, `fuse_validate.sh`, `fuse_profile.sh`) treat it as an **empty mutex dir**: `mkdir ~/gpu_bench_lock` to acquire, `rmdir` to release.
|
||||
- **Newer harnesses** (`fp4norm_profile.sh`) use an **owner-file convention**: `mkdir -p ~/gpu_bench_lock` then `echo "$ME $(date +%s)" > ~/gpu_bench_lock/owner`. They poll until `nvidia-smi --query-compute-apps=pid` count is 0 AND `owner` is `FREE*`/absent for 2 consecutive checks, and clear a stale `~/gpu_bench_lock/release` file. Release **writes** `FREE released-by-... $(date +%s)` to `owner` (it does NOT remove the dir).
|
||||
|
||||
Because the dir now permanently contains an `owner` file, the old-harness `rmdir` release no longer works (rmdir fails on the non-empty dir): **if you remove the dir to release, use `rm -rf ~/gpu_bench_lock`, NOT `rmdir`**. Recommended procedure for a future agent:
|
||||
1. Read `~/gpu_bench_lock/owner`. `FREE*`/absent + 0 compute-apps means free.
|
||||
2. Acquire via `mkdir -p ~/gpu_bench_lock` + write `owner`.
|
||||
3. Release by writing `FREE ...` to `owner` (or `rm -rf ~/gpu_bench_lock`).
|
||||
|
||||
A separate 0-byte `~/bench/gpu.lock` is legacy/unrelated - ignore.
|
||||
|
||||
**Always gate on BOTH** `nvidia-smi --query-compute-apps=pid` count == 0 **and** `owner` FREE before benching. Concurrent jobs share this GPU: an offline-repack Marlin workflow, an `~/.cache/autoresearch-quant/` quant pipeline (this is the `llama-imatrix` class of job), and finetune trees. The canonical harnesses poll for GPU-idle up to 2h.
|
||||
|
||||
### 3.2 Build (long; run detached + poll)
|
||||
- **Mainline / canonical grpc-server + binaries: CUDA arch `121`** (`-DCMAKE_CUDA_ARCHITECTURES=121`). Runtime banner shows `ARCHS = 1210 | BLACKWELL_NATIVE_FP4 = 1`.
|
||||
- **FP4-MMA / tensor-core experimental kernels: the accelerated `121a` gencode** (`arch=compute_121a,code=[compute_121a,sm_121a]`). The `a` suffix unlocks the architecture-specific native FP4-MMA intrinsics (the `mxf4nvf4` block-scaled mma used by 0034); it does NOT provide tcgen05/TMEM, which consumer Blackwell lacks (see 4.1). `121a` lives ONLY in the DGX experimental build scripts (`~/gdn_cc.sh` standalone nvcc, `~/gdn_bv_build.sh` `-DCMAKE_CUDA_ARCHITECTURES=121a`, `~/paged-build.sh` `--build-arg CUDA_DOCKER_ARCH=121a`), not in the worktree build files. Supply it at build time via `CMAKE_CUDA_ARCHITECTURES` / `CUDA_DOCKER_ARCH`.
|
||||
- **Long builds: run detached and poll for a marker.** Pattern: `nohup ... > build.log 2>&1 &` then poll for a `.DONE`/`.done` file. Do NOT block a foreground shell.
|
||||
|
||||
Built binaries live at `dgx:~/llama-paged-dev/build-cuda/bin/` (`llama-server`, `llama-batched-bench`, `llama-completion`; thin ~70 KB dynamic wrappers).
|
||||
|
||||
### 3.3 The standard bench env + commands
|
||||
```
|
||||
cd /home/mudler/llama-paged-dev/build-cuda/bin
|
||||
L="LLAMA_KV_PAGED=1 LLAMA_MOE_FORCE_GRAPHS=1 GGML_NO_BACKTRACE=1" # GGML_NO_BACKTRACE is log-hygiene, not a lever
|
||||
MOE=/home/mudler/bench/q36-35b-a3b-nvfp4.gguf # arch qwen35moe, ~22.2 GiB
|
||||
DENSE=/home/mudler/bench/q36-27b-nvfp4.gguf # arch qwen35, ~17.5 GiB
|
||||
|
||||
# (1) Bit-exact / coherence gate. stdin MUST be /dev/null or it hangs in conv mode.
|
||||
env $L ./llama-completion -m "$MOE" -ngl 99 -fa on -c 4096 --temp 0 --seed 1 -n 48 -no-cnv \
|
||||
-p "The capital of France is" </dev/null | md5sum
|
||||
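# NOTE: the -no-cnv md5 above is a coherence smoke check only; raw completion lands in a different md5 namespace and is NOT comparable to the canonical table in section 2.1. The canonical md5s come from the PAGED_BITEXACT_NOTE gate command, which uses the chat-template path (NO -no-cnv):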
|
||||
# ./llama-completion -m MODEL -ngl 99 -fa on -p "The capital of France is" -n 48 --temp 0 --seed 1
|
||||
# (compare to the canonical md5 for that model+path; paged-to-paged only)
|
||||
|
||||
# (2) PREFILL bench (S_PP from llama-batched-bench)
|
||||
env $L ./llama-batched-bench -m "$MOE" -c 131072 -b 2048 -ub 512 -ngl 99 -fa on \
|
||||
-npp 512,2048 -ntg 4 -npl 32
|
||||
|
||||
# (3) SERVING bench: one --parallel 256 server, then drive with h2h_cli3.py
|
||||
env $L nohup ./llama-server -m "$MOE" -c 262144 --parallel 256 -b 2048 -ub 512 \
|
||||
-ngl 99 -fa on --host 127.0.0.1 --port 8090 --no-webui >/home/mudler/bench/paged_server.log 2>&1 &
|
||||
# poll http://127.0.0.1:8090/health for '"ok"', then:
|
||||
python3 /home/mudler/bench/h2h_cli3.py # OpenAI /v1/completions, ignore_eos, fresh-nonce, ptok128 gen128, NPL sweep 8/32/128/256
|
||||
```
|
||||
**vLLM side** (for both-engine parity): `~/vllm-bench/bin/vllm` (version **0.23.0**), served `gpu-util 0.85 max-model-len 4096 max-num-seqs 256 tp1`, models `~/bench/q36-35b-a3b-nvfp4-vllm/` and `~/bench/q36-27b-nvfp4-vllm/`.
|
||||
|
||||
**The full automated both-engine harness is `dgx:~/bench/combined_definitive.sh`** (acquires lock, waits for GPU-idle up to 2h, runs MoE then dense for both engines, writes `COMBINED_DEFINITIVE.txt` + `.done`, traps cleanup to kill servers and release lock on exit). This is the reference harness; clone its discipline for any new run.
|
||||
|
||||
### 3.4 THE DECODE-PROFILING RULE (this trap caused 4 wrong analyses)
|
||||
Decode runs as a **replayed CUDA graph**. `nsys` **without** `--cuda-graph-trace=node` collapses each graph replay into ONE opaque launch, so every per-kernel attribution becomes an artifact. This is exactly what made the old "paged 159 us/tok, GPU ~16% busy, host-bound, 5.4x more GPU-efficient" story wrong, and produced the wrong ~56% headline.
|
||||
|
||||
Mandatory method for any decode profile:
|
||||
- Use **`nsys --cuda-graph-trace=node`**.
|
||||
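- Decompose with the **difference method**: steady-state decode cost = (ntg=64 profile) - (ntg=16 profile). The subtraction cancels the shared prefill/startup cost and leaves 48 extra decode steps; divide by those 48 steps to get the per-step (and from it the per-token) cost.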
|
||||
|
||||
Under the correct method, paged decode at npl=256 is **99% GPU-busy (1.4% idle), NOT host-bound** - the opposite of the collapsed-graph reading. The clean graph-node-traced profiles are at `~/highN_prof2/*.nsys-rep` (paged, npl=256) and `~/highN_vllm/*.nsys-rep` (vLLM), captured 2026-06-30. They **supersede every earlier decode decomposition.**
|
||||
|
||||
### 3.5 Models + artifacts (all on DGX)
|
||||
GGUF (paged): `~/bench/q36-35b-a3b-nvfp4.gguf` (MoE, qwen35moe), `~/bench/q36-27b-nvfp4.gguf` (dense, qwen35). vLLM safetensors: `~/bench/q36-35b-a3b-nvfp4-vllm/` (has `hf_quant_config.json` confirming MIXED_PRECISION / FP8-proj), `~/bench/q36-27b-nvfp4-vllm/`.
|
||||
Authoritative run: `~/bench/COMBINED_DEFINITIVE.txt` (+ `.log`, `.done`, `combined_definitive.sh`, per-engine `COMBINED_*_server.log`). A/B dirs: `~/bench/marlin_gate/`, `~/bench/gdn_p1_ab/`. NOTE: the `*_RESULTS*`/`*_MAP*` docs live only in the worktree `docs/`, not on the DGX.
|
||||
|
||||
---
|
||||
|
||||
## 4. THE COMPLETE LEVER MAP (do NOT re-run the rejected ones)
|
||||
|
||||
Verdicts and numbers are from `VLLM_PARITY_FINAL.md` + the cited artifacts. "BE" = greedy-md5 bit-exact; "KL-benign" = lossy path inside the KL band.
|
||||
|
||||
### 4.1 Prefill weight-GEMM track - WHOLE TRACK REJECTED (FP4-MMQ is optimal on GB10)
|
||||
Decisive surprise: on sm_121 **vLLM itself does NOT run native FP4** - it runs **Marlin W4A16** (FP4 dequant->bf16 in-register + bf16 GEMM) for experts and FP8 projections, capped at ~half FP4 peak, because native CUTLASS NVFP4 grouped-GEMM is broken on consumer Blackwell (TMA-WS init failure, CUTLASS #3096; no tcgen05/TMEM). So MMQ's native FP4 is already structurally competitive here.
|
||||
|
||||
| Lever | What | Verdict | Key number |
|
||||
|---|---|---|---|
|
||||
| 0033 dequant->bf16 cuBLAS | route large-M NVFP4 dense GEMM to dequant->bf16 cuBLAS | REJECTED, ships default-off | dense S_PP -49%/-42%/-29% at M=512/1024/2048; BE + KL-better |
|
||||
| dense-cuBLAS reroute (full) | same across dense+MoE prefill | REJECTED | -31% to -62% band |
|
||||
| 0034 native FP4-MMA W4A4 | Blackwell `mxf4nvf4` OMMA large-M | REJECTED in-backend | PoC 103 TFLOP/s (57.7% FP4 peak, NMSE=0) but win did not hold in-backend |
|
||||
| 0035 W4A16-Marlin grouped MoE | FP4->bf16 in-register + bf16 mma, zero act-quant tax | REJECTED (perf) | correct + KL-benign-and-better but **-39%** S_PP vs MMQ |
|
||||
| 0045/0046 offline-repack / vLLM-verbatim Marlin | repack to Marlin layout; port vLLM kernel verbatim | REJECTED | verbatim correct but -39%; offline-repack same bf16-peak ceiling, no win |
|
||||
|
||||
Why it loses: bf16 TC peak on GB10 is ~half FP4 peak, so any dequant->bf16 kernel caps at ~half FP4-MMQ; the dequant write is an un-amortized weight-sized memory pass (~8x the FP4-read traffic). **The GEMM bucket is not winnable on GB10 with available kernels.**
|
||||
|
||||
### 4.2 Prefill GDN chunked-scan track - M5 tf32 C=16 is the SHIPPED winner
|
||||
GDN is the #1 prefill-gap contributor (+59.2 us/tok, ~30%). vLLM's FLA `chunk_gated_delta_rule` runs the same math at 36.5 vs paged 95.7 us/tok = 2.62x via tensor-core intra-chunk Gram products.
|
||||
|
||||
| Lever | What | Verdict | Key number |
|
||||
|---|---|---|---|
|
||||
| 0031 scalar-serial chunked scan | FLA-style scalar/serial (`GDN_TC=0`) | superseded | correct but ~22% slower at forced C=16 |
|
||||
| **0047 / M5 tf32 tensor-core scan** | tf32 `m16n8k8` mma form-T solve, f32-only | **SHIPPED default-on under paged** | MoE prefill +3.5% @npp512, +17.7% @npp2048; decode unchanged; BE-benign |
|
||||
| bf16 CONFIG-C (M8) | bf16 Kc/Qc + 2 C*C scratch, C->64 | REJECTED (not in f32 series) | confirmed geometry then dropped |
|
||||
| bf16-C16 | bf16 Gram at C=16 | REJECTED | no win; bf16 mantissa unsafe on state-coupled products |
|
||||
| BV block-occupancy A/B (tf32) | raise blocks/SM | REJECTED (occupancy NOT the bound) | 1844 vs 1814 S_PP (-1.04%, within noise) |
|
||||
| bf16-C64 | bf16 Gram at C=64 | REJECTED | -18.75%; O(C^2) intra-chunk + serial recurrence dominates |
|
||||
|
||||
Why not occupancy/dtype: the cost is the **O(C^2) intra-chunk triangular A-inverse solve + the strictly-serial inter-chunk recurrence**, with C forced to **16** by GB10's 99 KB dynamic-smem cap (the 128x128 f32 state alone is 64 KB). M5 captures the tractable TC part; it does not fully close 2.62x because vLLM's FLA blocked-solve is a more complete TC implementation.
|
||||
|
||||
### 4.3 Decode / fusion levers - all REJECTED (near-parity already at ~86% true GPU-steady)
|
||||
| Lever | What | Verdict | Key number |
|
||||
|---|---|---|---|
|
||||
| act-quant folded into ggml MMQ | inline y-quant in MoE expert MMQ | REJECTED | **-79.4%**; ggml MMQ re-quantizes y per weight-row-tile x stream-k split, no TC for inline quant |
|
||||
| norm+quant+silu fusion | one launch (vLLM Triton kernel) | REJECTED (infeasible) | `ggml_cuda_can_fuse` cannot express it: FP4 quant is a mul_mat-internal prologue, silu separated from norm by 2 GEMMs + router |
|
||||
| Q8_0 / FP8 projection | quantize bf16 GDN/attn projections | REJECTED (regime error) | vLLM DOES use FP8 proj, but at N>=128 proj is only ~12% of stream, closes <=6% |
|
||||
| NVFP4 the projections | drop proj to NVFP4 | REJECTED | KL-fail, ~+6% PPL; vLLM keeps SAME bf16/FP8 proj, never NVFP4 |
|
||||
| W4A16-Marlin MoE decode | Marlin grouped expert GEMM at decode | REJECTED | BW-floored wash, ~5% slower |
|
||||
| bf16-tau per-head SSM (0026) | per-head bf16 tau on SSM decode | DROPPED | flat 780.6 vs 780.0 t/s; earlier "+12%" subsumed by 0028/0029 |
|
||||
| D3 FA-split / D4 GDN-width-adaptive | older off-critical-path levers | SUPERSEDED reasoning | were rejected via the debunked "5.4x/host-bound" reading; under the graph-node-traced high-N profile (HNP, section 3.4) the GDN scan IS critical path (51%) but is the shared BW floor where paged leads (83% vs 79%), so still not a win |
|
||||
|
||||
Dense decode is **AHEAD at low N (116.7% @ N=8)** - the one operating point where paged is unambiguously faster.
|
||||
|
||||
### 4.4 Serving / engine levers - host loop and scheduler CLOSED
|
||||
| Lever | What | Verdict | Key number |
|
||||
|---|---|---|---|
|
||||
| **0040 / S1** paged decode-graph reuse | `can_reuse` keyed on bucketed block-table dims | SHIPPED default-on | serving reuse 0% -> 72.2% (with S3); static 0% -> 95.5% |
|
||||
| **0041 / S3** decode-shape-stable scheduling (`LLAMA_PAGED_DECODE_STABLE`) | keep prefill out of decode steps | SHIPPED **default-OFF** (opt-in) | recovers the ~17 pt graph-reuse overhead at a TTFT cost; default-on regressed real serving (2.5x worse TTFT, 20-29% lower e2e throughput) |
|
||||
| **0043 / D1** full-step MoE decode CUDA graph | graph whole decode step incl. grouped-MMQ MoE dispatch | SHIPPED default-on | +2.6% (npl128) to +5-13% (npl32); D1 premise "host-sync on MoE readback" REFUTED (sync count identical 1457 on/off) |
|
||||
| S2 double-buffer set_inputs | overlap host input build with GPU | DROPPED | `set_inputs` ~0.05 ms/step, nothing to recover |
|
||||
| whole-step graph / host loop | host loop as serving residual | CLOSED (~0-1%) | reuse 0% (757.6) == S1+S3 72% (763.3); hostproc only ~4-8% of step wall |
|
||||
| padded / fixed-slot decode | pad decode width to `--parallel` for ~100% reuse | **REJECTED (built, GPU-tested, commit b028c81e)** | inert (BE) but regresses everywhere; N=8 burst 28.16->6.05 tok/s/seq; serving decode is GPU-compute-bound, dummy-row compute > reuse recovered |
|
||||
| speculative decode (MTP) | draft + verify | ORTHOGONAL, not pursued | both engines have it; crux is hybrid-SSM in-place-state (0018) rollback; a feature both can add, not a paged-specific gap |
|
||||
|
||||
### 4.5 SHIPPED WINS (all BE / KL-benign) - keep these, do not regress
|
||||
- **FP4-MMQ MoE/dense GEMM** (native Blackwell FP4-MMA at the FP4 weight-BW floor; this is the reason the 4.1 alternatives stay rejected / default-off).
|
||||
- **M5 tf32 tensor-core chunked GDN prefill (patch 0047)**, default-on under `LLAMA_KV_PAGED` (`GDN_TC=5` + `GDN_CHUNK_MIN=64`).
|
||||
- **0042 fused residual-add + RMSNorm + weight-mul** (dense S_PP +0.5%, BE).
|
||||
- **0044 fused GatedRMSNorm + SiLU gate-mul** (672 -> 336 launches @npp512; dense +1.1%, MoE +0.9%, test-backend-ops 12979/12979).
|
||||
- **0046 GDN-prefill geometry gate** (gates 0022's decode retune by scan length; recovers +7.2% dense prefill, keeps the decode win, BE).
|
||||
- **SSM decode-fusion stack (0018-0022, 0028)**: in-place state (+23.5%/+18.9%), fused gather (+37.8%/+35.3%), o_proj reshape (+31.7%/+23.3%), conv in-place (+3.2%/+3.5%), occupancy retune (+11.1%/+8.3%) = the **2.26x / 2.46x over stock** decode multiplier.
|
||||
- **Serving host loop closed (0040 S1, 0043 D1).**
|
||||
- **The memory advantage** (1.5-3x lower VRAM, NVFP4-resident, no persistent bf16 dequant copies).
|
||||
- **Low-N decode lead** (dense 116.7% @ N=8). **Bit-exact output per-path** through the whole series.
|
||||
|
||||
### 4.6 REMAINING / unattempted levers + EV
|
||||
- **Multi-week persistent-Marlin decode kernel** (vLLM's fused-Marlin MoE persistent-tiling + Triton elementwise): the only path to the residual ~14 pt GPU-steady decode gap. **Low-EV**: decode-only ~4-14%, our own ggml Marlin port already lost -19.6%, needs mature tiling + multi-stream overlap (hard inside a single-stream CUDA graph), GB10-uncertain, and **cannot lift the prefill floor**. Not a free bit-exact lever.
|
||||
- **Datacenter-Blackwell pivot** (B200, ~8 TB/s HBM, native tcgen05/CUTLASS FP4, TMEM): lifts the LPDDR5x GDN bandwidth floor ~30x and restores exactly the vLLM advantages that lose on GB10. **This is the documented path to parity.** Re-run the methodology on the new silicon, do not reopen GB10 levers.
|
||||
|
||||
The `VLLM_PARITY_LEVER_MAP.md` "pursue list" (A1-A7/B1-B7/C1: graph-safe ragged grouped FP4-MMA MoE kernel, FP8 paged KV, MTP spec-decode, etc.) is the **earlier working brainstorm written before the final profiling**. `VLLM_PARITY_FINAL.md` is the authoritative supersession; treat those buckets as rejected / infeasible / different-hardware unless re-validated on new silicon.
|
||||
|
||||
---
|
||||
|
||||
## 5. METHODOLOGY LESSONS (so you do not repeat the mistakes)
|
||||
|
||||
1. **Profile, don't assume. The analysts were wrong 4 times.** Every one was caught only by an in-backend A/B or a corrected profile:
|
||||
- **GDN-scalar grep** (assumed the scan was scalar/serial from reading source) - wrong, retired by the tensor-core port.
|
||||
- **dense-cuBLAS reroute** (assumed dequant->bf16 would win) - wrong, -31% to -62%.
|
||||
- **occupancy** (assumed blocks/SM was the GDN bound) - wrong, 1844 vs 1814 within noise.
|
||||
- **projection-regime** (assumed FP8/NVFP4 projections were a big lever) - wrong, projections are ~12% of the decode stream at high N.
|
||||
**In-backend A/B is the only truth.** A standalone PoC win (0034) is not a result.
|
||||
2. **Per-kernel us/tok overstates end-to-end S_PP/S_TG.** A kernel that is X% faster in isolation does not move throughput X%; always confirm against the end-to-end batched-bench / serving number.
|
||||
3. **The CUDA-graph-trace decode artifact (the big one).** Decode is a replayed graph; nsys without `--cuda-graph-trace=node` collapses it and lies. This single trap produced the wrong "host-bound / 159 us/tok / 56%" story across multiple analyses. Always graph-node-trace + difference method (section 3.4).
|
||||
4. **Beware GPU contention skewing absolutes.** The box runs concurrent quant/repack/finetune jobs. Gate on idle GPU + free lock; prefer the same-session both-engine harness so both numbers move together.
|
||||
5. **The vLLM server number is inflated ~8 pt vs its true GPU-steady.** vLLM's chunked-prefill-overlap inflates its own server-measured decode window (1177 server vs 1078 true GPU-steady). Compare GPU-steady to GPU-steady, or you will chase a phantom gap. The reconciliation chain that must sum: vLLM server 1177 (100%) -> vLLM true GPU-steady 1078 (92%) -> llama GPU-steady 924 (78.5% of 1177, = 86% of 1078) -> llama server 718 (60.7%, the S3-recoverable serving overhead).
|
||||
|
||||
---
|
||||
|
||||
## 6. THE THREE FORWARD DIRECTIONS
|
||||
|
||||
### (a) Close / ship the record (lowest effort, do this first)
|
||||
The investigation is already CLOSED in the docs. Concrete first steps:
|
||||
1. Commit the untracked `patches/paged/0044-feat-paged-fused-gated-RMSNorm-SiLU-gate-mul.patch` into the worktree (it is on the fork as `51168c5ee` and on disk, but shows `??` here).
|
||||
2. Reconcile the **pin discrepancy** (section 7): the Makefile builds with `0ed235ea`, but README section 7 prose and `VLLM_PARITY_FINAL.md` still say `9d5d882d`. Update the prose to the Makefile value (trust the Makefile when building).
|
||||
3. Re-run the bit-exact gate on a clean tree to confirm `8cb0ce23` (paged-MoE) / `5951a5b4` (dense) before any release; resolve the `0921716...` open item in section 7.
|
||||
|
||||
### (b) Datacenter-Blackwell pivot (THE real parity path)
|
||||
The thesis: every vLLM advantage that fails to win on GB10 comes from a kernel that is **broken or capped on consumer Blackwell** and **inverts on datacenter Blackwell** (B200): FLA blocked-solve GDN, Marlin/CUTLASS grouped FP4, HBM-tuned full-cudagraph decode, native tcgen05/TMEM. ~8 TB/s HBM lifts the LPDDR5x GDN bandwidth floor ~30x. Concrete first steps:
|
||||
1. Acquire a B200 (or equivalent HBM tcgen05 part). Reproduce the **both-engine same-session harness** there (`combined_definitive.sh` discipline): build the stock and paged binaries, build vLLM 0.23.0+, run MoE + dense prefill + serving for both engines.
|
||||
2. Re-measure the FP4 path: on B200, native CUTLASS NVFP4 grouped-GEMM should work (the CUTLASS #3096 / TMA-WS failure is consumer-Blackwell-specific). Confirm whether vLLM now runs **native FP4** instead of Marlin W4A16. If so, the 4.1 GEMM track must be re-evaluated from scratch (it was rejected on a GB10-specific ceiling).
|
||||
3. Re-take the decode profile with `--cuda-graph-trace=node`; the GDN scan that floors at 273 GB/s on GB10 should no longer dominate at HBM bandwidth - re-derive the per-token decomposition before choosing any lever.
|
||||
|
||||
### (c) Multi-week persistent-Marlin decode kernel (decode-only, low-EV, CANNOT reach parity)
|
||||
Only pursue if (a)+(b) are not options and someone explicitly wants the residual decode gap closed on GB10. It targets the ~14 pt GPU-steady decode gap (vLLM's fused-Marlin MoE persistent-tiling + single Triton elementwise). Concrete first steps:
|
||||
1. Re-confirm the ceiling first: our own ggml Marlin port already regressed decode by 19.6% (4.3), so the bar is "beat that and beat FP4-MMQ at the decode BW floor".
|
||||
2. Prototype the persistent-tiling grouped-FP4 MoE kernel **standalone**, then prove it **in-backend** (a PoC win is not a result, per 0034). It must live inside a single-stream CUDA graph or bring its own multi-stream overlap.
|
||||
3. Bound the upside honestly: this is **decode-only ~4-14%** and **does nothing for the prefill floor (36-43%)**, so it does not reach parity. Record the verdict either way.
|
||||
|
||||
---
|
||||
|
||||
## 7. KEY FILE / ARTIFACT INDEX
|
||||
|
||||
### Fork (canonical source of truth)
|
||||
- `dgx:~/llama-paged-fork`, remote `fork git@github.com:mudler/llama.cpp.git`, branch **`localai-paged`**, HEAD `51168c5eee2e35348d9006f0b2fab3dc6e7c01cc` ("fused gated RMSNorm + SiLU gate-mul CUDA op (patch 0044)"). **Currently dirty** (uncommitted `M ggml/src/ggml-cuda/gated_delta_net.cu`).
|
||||
- `dgx:~/llama-paged-dev` (experimental dev/build tree), branch **`paged`**, HEAD `a7d439e8ce6990eb09721223c975da4e49d8d136` ("GDN CONFIG C (M8) - bf16 Kc/Qc"). **Dirty** + many untracked profiling artifacts. This tree's `build-cuda/bin/` produced the benchmarked binaries; `COMBINED_DEFINITIVE` recorded `GIT_HEAD=a7d439e` (the M8 bf16 dev config), NOT the fork HEAD. The dev tree carries bf16/hybrid M6/M7/M8 machinery deliberately EXCLUDED from the shipped f32-only series.
|
||||
|
||||
### LocalAI worktree
|
||||
- Path: `/home/mudler/_git/LocalAI/.claude/worktrees/feat+paged-attention`, branch `worktree-feat+paged-attention` (199 ahead, 25 behind origin/master; the ahead count grows with each new commit).
|
||||
- Backend dir: `backend/cpp/llama-cpp-localai-paged/` (`Makefile` thin wrapper, `package.sh`, `run.sh`, `README.md` ~44 KB canonical, `docs/`, `patches/paged/`).
|
||||
- `docs/`: `VLLM_PARITY_FINAL.md` (authoritative record), `VLLM_PARITY_LEVER_MAP.md` (working brainstorm, profile-validated section), `DECODE_SERVING_SCOPE.md`, `PREFILL_GEMM_SCOPE.md`, `PREFILL_GEMM_RESULTS.md`, `TENSORCORE_GDN_SCOPE.md`, `TENSORCORE_GDN_BUILD_PLAN.md`, `ACCELERATOR_PORTING_SCOPE.md`, `UPSTREAM_LAYER2_SCOPE.md`, `LOCALAI_LLAMACPP_BACKEND_PLAN.md`, `PAGED_BITEXACT_NOTE.md`, `PATCH_MAINTENANCE.md`, `final_benchmark.csv`, `paged-burst-bench.cpp`, `paged-reclaim-unit.cpp`, 3 PNGs, and this `PARITY_HANDOFF.md`.
|
||||
- `patches/paged/`: **38** `.patch` files spanning 0001-0047 with intentional gaps (missing 0005, 0026 [dropped ssm_bf16_tau], 0027, 0032, 0036-0039, 0045). Core paged-KV 0001-0012; decode-first scheduler 0013/0016; serving graph reuse 0040/0041; prefill fusions 0042/0044; SSM/GDN decode 0018-0022/0028; MoE NVFP4 quant 0023/0025/0043; FP4-MMA/Marlin scaffolds 0033/0034/0035 (default-off); GDN tensor-core prefill 0031 -> 0046 (geometry gate) -> 0047 (f32-only M5, default-on under paged KV).
|
||||
|
||||
### Bench artifacts (DGX)
|
||||
- `~/bench/COMBINED_DEFINITIVE.txt` (+ `.log`, `.done`, `combined_definitive.sh`, `combined_definitive.out`) - the definitive same-session both-engine run.
|
||||
- Per-engine logs `~/bench/COMBINED_{paged,vllm}_{MOE,DENSE}_server.log`; `~/bench/BENCHMARK_PROGRESS.md`.
|
||||
- Graph-node-traced high-N profiles: `~/highN_prof2/*.nsys-rep` (paged npl=256), `~/highN_vllm/*.nsys-rep` (vLLM), 2026-06-30.
|
||||
- A/B dirs: `~/bench/marlin_gate/`, `~/bench/gdn_p1_ab/`.
|
||||
|
||||
### Unpushed doc commits (in this worktree, not on origin)
|
||||
- `6edbb56b0` "docs(paged): definitive vLLM-parity final-state record (GB10, CLOSED)" - adds `VLLM_PARITY_FINAL.md`.
|
||||
- `baf102524` "docs(paged): correct decode-serving record to ~86% GPU-steady parity (graph-node-traced)" - the ~56% -> ~86% correction.
|
||||
- `bd100dd20` "fix(paged): repair the patch series, sync to the fork branch" - dropped dev-tree 0044/0045, added f32-only M5 as 0047.
|
||||
- `b028c81ed` "docs(paged): record padded/fixed-slot decode shape as tested-and-rejected".
|
||||
|
||||
### Discrepancies to flag / resolve (carried verbatim from the gather, including UNVERIFIED labels)
|
||||
1. **Pin mismatch.** Makefile line 52 `LLAMA_VERSION?=0ed235ea2c17a19fc8238668653946721ed136fd` (authoritative, what builds; recent `ea72a56e2` / `2c5980526` pin-synced to it) vs README section 7 prose `9d5d882d` and `VLLM_PARITY_FINAL.md` "backend pin 9d5d882d" (STALE). Hard rule: the paged pin must equal the stock `llama-cpp` pin (shared `grpc-server.cpp`); a bump to `c299a92c` once broke the grpc-server link despite being bit-exact and was reverted. Trust the Makefile; fix the prose.
|
||||
2. **Both DGX checkouts are dirty** (`gated_delta_net.cu` modified in each), and the fork HEAD (`51168c5ee`, patch 0044) differs from the dev-tree HEAD (`a7d439e`, M8 bf16) that actually produced the `COMBINED_DEFINITIVE` numbers.
|
||||
3. **Worktree patch 0044 is committed on the fork but untracked here** (`patches/paged/0044-*.patch` shows `??`).
|
||||
4. **`sm_121a` is not in the worktree build files** - it lives only in the DGX experimental build scripts (`gdn_cc.sh`, `gdn_bv_build.sh`, `paged-build.sh`); mainline uses arch `121`. **UNVERIFIED** whether the shipped CI Dockerfile build path injects `121a` for the FP4-MMA kernels (`Dockerfile.llama-cpp-localai-paged` does not hardcode a CUDA arch).
|
||||
5. **The `0921716...` paged-MoE md5 open item.** `COMBINED_DEFINITIVE.txt` records `PAGED_GATE_MD5=0921716cd0582b5d15af8c362b811d00` for MoE, but a full doc/patch/`git log -S` grep of the worktree found **no** occurrence of `0921716...` in any committed source; the committed canonical paged-MoE gate is `8cb0ce23`. Treat this as **unreconciled**: the documented, KL-validated paged-MoE gate remains `8cb0ce23`, and any paged-MoE divergence (including `0921716`) must be KL-validated against the f16 reference before being accepted as benign, never on assertion alone. The `0921716` value is **UNVERIFIED** as a sanctioned gate; do not adopt it as canonical without re-running the KL gate. The **dense** run is symmetric: `COMBINED_DEFINITIVE.txt` records `PAGED_GATE_MD5=ecfe924dee6c5622c149f419ff2a6481` for dense, which likewise differs from the canonical dense gate `5951a5b4`. Both CDEF `PAGED_GATE_MD5` values come from the `combined_definitive.sh` harness's own gate command, NOT the canonical bit-exact gate command in section 3.3, which is why they diverge from the committed `8cb0ce23` / `5951a5b4`; neither is a sanctioned gate and both must be KL-validated before being treated as benign.
|
||||
|
||||
---
|
||||
|
||||
*Status: investigation CLOSED. This handoff is procedure; `VLLM_PARITY_FINAL.md` is the record. The path to parity is datacenter Blackwell, not GB10 kernels.*
|
||||
@@ -0,0 +1,86 @@
|
||||
# llama.cpp patch series — paged attention (vLLM-parity engine)
|
||||
|
||||
A **stacking** series: each patch is a small, self-contained, independently-buildable step toward an
|
||||
in-model paged-attention engine. They apply in numeric order on top of the pinned `LLAMA_VERSION`
|
||||
(`backend/cpp/llama-cpp/Makefile`). The build applies them automatically after checkout (see the
|
||||
`llama.cpp:` target). Keeping the work as ordered patches — rather than one big diff — is what lets us
|
||||
**rebase cleanly across llama.cpp bumps and avoid drift**: when a patch stops applying, only that small
|
||||
patch needs fixing, and the failure points at exactly which step the upstream change touched.
|
||||
|
||||
## Base
|
||||
|
||||
- `LLAMA_VERSION` pin in `../Makefile`. **All patches are generated against that exact commit.** Bumping
|
||||
the pin = re-run the regen workflow below and fix only the patches that no longer apply.
|
||||
|
||||
## The series (phases → patches)
|
||||
|
||||
| # | Patch | What | Verifies |
|
||||
|---|-------|------|----------|
|
||||
| 0001 | `0001-vendor-paged-kv-manager.patch` | Add `src/paged-kv-manager.{h,cpp}` (vLLM-parity block manager, CPU foundation) + CMake; no behavior change | builds; unit-tested separately |
|
||||
| 0002 | `0002-paged-kv-storage.patch` | Shared block-pool KV tensor + `set_rows`-by-slot writes, behind `LLAMA_KV_PAGED` | builds; write/gather round-trip |
|
||||
| 0003 | `0003-paged-gather-read.patch` | `build_attn_paged` gather-read in `llama-graph.cpp` | **Gate 0**: token-identical greedy gen, single + multi-seq |
|
||||
| 0004 | `0004-paged-ondemand-alloc.patch` | On-demand block allocation via PagedKVManager | max concurrent seqs before OOM |
|
||||
| 0005 | `0005-paged-continuous-batching.patch` | Block-granular admit/evict in the server slot path | tok/s vs concurrency, mixed-length |
|
||||
| 0006 | `0006-paged-prefix-caching.patch` | Block-hash cross-request prefix dedup | TTFT + memory on shared prefixes |
|
||||
|
||||
Each row is a separate `git commit` on the dev branch (below), exported 1:1 as a patch. Default off
|
||||
(`LLAMA_KV_PAGED`) until Gate 0 (0003) is green, so a partial series never changes stock behavior.
|
||||
|
||||
## Regen workflow (the anti-drift recipe)
|
||||
|
||||
```sh
|
||||
# 1. check out the exact pin into a dev tree
|
||||
git -C /tmp clone https://github.com/ggml-org/llama.cpp llama-dev && cd /tmp/llama-dev
|
||||
git checkout <LLAMA_VERSION from ../Makefile>
|
||||
git checkout -b paged
|
||||
|
||||
# 2. apply the current series (each becomes a commit), or develop the next patch
|
||||
git am /path/to/backend/cpp/llama-cpp-localai-paged/patches/paged/0*.patch # same `0*.patch` glob as the build (a `00*` glob would skip 0010+); or `git apply` + commit per patch
|
||||
|
||||
# 3. iterate a phase as ONE commit, then export the whole series 1:1
|
||||
git format-patch <LLAMA_VERSION>..paged -o /path/to/backend/cpp/llama-cpp-localai-paged/patches/paged/ --zero-commit -N
|
||||
|
||||
# 4. on a pin bump: rebase `paged` onto the new pin; only conflicting patches need edits; re-export.
|
||||
```
|
||||
|
||||
## Build integration
|
||||
|
||||
The series is owned by this backend (`backend/cpp/llama-cpp-localai-paged`), not by the stock
|
||||
`llama-cpp` backend, which is pure upstream. `../Makefile` (the paged wrapper) clones the pinned
|
||||
`llama.cpp` via the copied stock build infra, then applies this series onto the cloned tree with the
|
||||
same strict `git apply` the stock build uses for base patches:
|
||||
```
|
||||
for p in $(PAGED_PATCHES_DIR)/0*.patch; do git apply --verbose "$p" || exit 1; done
|
||||
```
|
||||
All variants (avx/avx2/avx512/cuda/…) clone + apply into their own build copy, so the series ships
|
||||
everywhere without ever touching the stock `llama-cpp` source tree.
|
||||
|
||||
## Status
|
||||
|
||||
- **0001 vendor manager — DONE.** Applies clean to the pin; builds into `libllama`.
|
||||
- **0002 block placement — DONE + VERIFIED.** Built `llama-simple` at the pin; greedy generation is
|
||||
**token-identical** stock vs `LLAMA_KV_PAGED=1` (Qwen3-0.6B), paged branch confirmed firing.
|
||||
- **0003 gather-read — DONE + VERIFIED (Gate 0 green).** Implemented in the **additive** form
|
||||
(see `../README.md`): all logic in new `src/paged-attn.{h,cpp}` (a `llm_graph_input_i` gather-index
|
||||
subclass + the K/V/mask gather), hooked by **one** line in `build_attn` + **two** thin accessors on
|
||||
`llama_kv_cache_context` + 1 CMake line (216 insertions; no edit to `llm_graph_input_attn_kv` or
|
||||
`llama-graph.h`). Greedy generation is **token-identical** stock vs `LLAMA_KV_PAGED=1` (Qwen3-0.6B,
|
||||
**9/9** across 3 prompts × {32,96,128} tokens), with `n_gather=71 < n_kv=256` confirming real
|
||||
compaction. Patch: `0003-paged-gather-read-env-LLAMA_KV_PAGED.patch`.
|
||||
- **Key correctness finding:** `get_gather_idxs` must emit cells **sorted by token position**. The CPU
|
||||
flash-attn online softmax reduces cells in physical-array order and is FP-order-sensitive, so 0002's
|
||||
scattered placement *alone* (full-window read, no gather) diverges from stock once a sequence crosses
|
||||
the first 16-cell block. The position-sorted gather reproduces stock's exact reduction order -> bit-
|
||||
identical, not merely mathematically equivalent. So 0002 is the placement substrate; **0003 is what
|
||||
makes paged placement token-identical under flash-attn.**
|
||||
- 0004–0006 follow.
|
||||
|
||||
### Honest parity note (important)
|
||||
|
||||
This series delivers the paged-attention **engine** (capacity + scheduling + prefix sharing). It does **not**
|
||||
by itself reach vLLM throughput parity, because the measured prefill bottleneck is the **FP4 MoE GEMM kernel**
|
||||
(Lever 3: `mul_mat_q<MXFP4>` ~22 TFLOP/s, ~27× behind vLLM) — a *per-token compute* gap that paging does not
|
||||
touch. Paged attention closes the **concurrency/memory** gap (more sequences, prefix reuse); the prefill/throughput
|
||||
gap additionally needs the tcgen05/CUTLASS grouped-GEMM (deferred, upstream-grade, no shortcut — see
|
||||
`../README.md`). So full vLLM parity = this series **AND** the
|
||||
kernel; neither alone suffices.
|
||||
@@ -0,0 +1,76 @@
|
||||
# PREFILL_GEMM_RESULTS - option (a) dequant->bf16 cuBLAS, measured on GB10
|
||||
|
||||
Companion to `PREFILL_GEMM_SCOPE.md`. This records the GPU A/B for the #1
|
||||
prefill lever (route large-M NVFP4 dense GEMMs off FP4-MMQ onto dequant->bf16
|
||||
cuBLAS / nvjet). Shipped as patch `0033`, **default-off** because the measured
|
||||
result is a regression on this hardware.
|
||||
|
||||
Hardware: NVIDIA GB10 (sm_121), CUDA 13.0. Backend pin `9d5d882d`.
|
||||
Models: `q36-27b-nvfp4.gguf` (dense), `q36-35b-a3b-nvfp4.gguf` (MoE).
|
||||
Binary: `build-cuda/bin/llama-batched-bench -fa on -ngl 99`, `LLAMA_KV_PAGED=1`.
|
||||
A/B is a single build toggled by `LLAMA_FP4_PREFILL_M` (0 = MMQ baseline, >0 =
|
||||
route prefill M>threshold to bf16 cuBLAS), so it isolates exactly this lever.
|
||||
|
||||
## 1. Bit-exact / numeric gate (PASS - divergence benign)
|
||||
|
||||
| Gate | Result |
|
||||
|---|---|
|
||||
| `test-backend-ops -o MUL_MAT` (default, threshold off) | 1146/1146 pass |
|
||||
| `test-backend-ops -o MUL_MAT_ID` (default) | 806/806 pass (MoE untouched) |
|
||||
| `test-backend-ops -o MUL_MAT`, path FORCED (`LLAMA_FP4_PREFILL_M=64`) | NVFP4 large-M cases (m=2048/1600/2050, n=128, k=2048) green CUDA-vs-CPU |
|
||||
| greedy md5, short prefill (< threshold), lever vs base | identical: `5951a5b4d624ce891e22ab5fca9bc439` (== documented dense reference; decode byte-untouched) |
|
||||
| greedy md5, long prefill (> threshold, exercises bf16 path), lever vs base | identical: `5f3967df5781445feeb25762abb9eae7` (the new FP path flips no greedy argmax) |
|
||||
|
||||
The new path (NVFP4->bf16 round, bf16 tensor cores, f32 accumulate) is a
|
||||
different FP path from fused FP4xQ8_1 MMQ, but it is precision-neutral-to-better:
|
||||
keeping activations in bf16 instead of Q8_1 is strictly more precise, and the
|
||||
greedy output is byte-identical. This matches the scope's prediction
|
||||
(KLD(dequant-bf16 || f16) <= KLD(FP4-MMQ || f16)).
|
||||
|
||||
## 2. Performance (REGRESSION - the lever loses on GB10)
|
||||
|
||||
S_PP (prefill tokens/s), q36-27b dense, A/B `LLAMA_FP4_PREFILL_M` off vs on:
|
||||
|
||||
| prefill ubatch M | npl | base S_PP (MMQ) | lever S_PP (bf16 cuBLAS) | delta |
|
||||
|---|---|---|---|---|
|
||||
| 512 | 32 | 958.99 | 486.65 | -49% |
|
||||
| 1024 | 8 | 1013.65 | 587.27 | -42% |
|
||||
| 2048 | 8 | 918.46 | 649.42 | -29% |
|
||||
|
||||
Default-off control (no env): S_PP 966.98 == base (within noise) -> the patch is
|
||||
inert by default.
|
||||
|
||||
## 3. Why it loses (the scope premise was wrong for GB10)
|
||||
|
||||
The scope assumed FP4-MMQ is register-bound to ~3% of FP4 peak at large M, so a
|
||||
vendor large-M kernel would win. **Measured, FP4-MMQ at M=512..2048 beats
|
||||
dequant->bf16 cuBLAS by 29-49%.** Two compounding reasons:
|
||||
|
||||
1. **bf16 tensor-core peak is ~half FP4 peak on GB10.** Even a perfect bf16 GEMM
|
||||
caps at ~half the throughput the FP4-MMA path can reach.
|
||||
2. **The dequant tax is an un-amortized memory pass.** Per prefill step the new
|
||||
path reads FP4 weights (~0.5 B/elt), writes bf16 (2 B/elt), then the GEMM
|
||||
reads bf16 (2 B/elt) = ~8x the weight byte traffic of the FP4-MMQ read
|
||||
(~0.5 B/elt). The dequant write is M-independent, so it only amortizes as M
|
||||
grows: the gap shrinks 49% -> 42% -> 29% from M=512 -> 2048 but never crosses
|
||||
even at M=2048 (above the default n_ubatch).
|
||||
|
||||
This is also consistent with the README decode finding that the dense path was
|
||||
already ~96-97% of vLLM - the dense GEMM was never the bottleneck the way the
|
||||
prefill ground-truth (measured on the MoE decision model) implied.
|
||||
|
||||
## 4. Status of the phases
|
||||
|
||||
- **Phase 1 (dense): REJECTED on GB10**, landed default-off as a validated,
|
||||
env-gated scaffold (mechanism + bit-exact gate reusable by option (b) and by
|
||||
non-GB10 hardware where bf16 may fare differently).
|
||||
- **Phase 2 (MoE grouped large-M): NOT implemented.** It inherits the same
|
||||
bf16-peak < FP4-peak ceiling plus a per-expert dequant, so a grouped
|
||||
bf16-cuBLAS would regress for the same reason; the MoE id-path also has the
|
||||
graph-safety catch (a false `should_use_mmq` falls to the host-sync sorted
|
||||
loop, not CUDA-graph-safe). Not worth the multi-day grouped-cuBLAS + graph
|
||||
work on a path the dense A/B already shows loses.
|
||||
- **The only route to a real prefill GEMM win is option (b)** - a native
|
||||
Blackwell FP4-MMA large-M kernel (multi-week), to greenlight only if the
|
||||
prefill regime is funded. The committed scaffold gives option (b) its
|
||||
M-threshold routing and its bit-exact gate for free.
|
||||
264
backend/cpp/llama-cpp-localai-paged/docs/PREFILL_GEMM_SCOPE.md
Normal file
264
backend/cpp/llama-cpp-localai-paged/docs/PREFILL_GEMM_SCOPE.md
Normal file
@@ -0,0 +1,264 @@
|
||||
# PREFILL_GEMM_SCOPE - large-M NVFP4 expert/dense GEMM (design only)
|
||||
|
||||
**Status: DESIGN + PLAN ONLY. No kernel written, no GPU run in this pass.**
|
||||
This scopes the #1 prefill lever for `llama-cpp-localai-paged`: the NVFP4 weight
|
||||
GEMM at large M (prefill), where llama.cpp's `mul_mat_q` (MMQ) NVFP4 path is far
|
||||
slower than vLLM's `marlin_moe_wna16` (MoE) + cutlass/nvjet (dense). Per the
|
||||
prefill ground-truth that motivated this scope, the GEMM bucket is ~232 us/tok
|
||||
(paged) vs ~68 us/tok (vLLM) - 3.4x slower; that 164 us/tok GEMM bucket gap is
|
||||
~51% of the total paged-vs-vLLM prefill gap.
|
||||
|
||||
> **Regime warning (read first).** Every "GEMM is at the BW floor / ties vLLM"
|
||||
> conclusion in `README.md` section 5 is a **DECODE** finding (M<=128,
|
||||
> bandwidth-bound). This document is about **PREFILL** (large M, compute /
|
||||
> tensor-core-throughput bound) - a different regime, which is exactly why the
|
||||
> rejected "W4A16-Marlin MoE GEMM" lever is revisited here **for prefill only**.
|
||||
> The 232/164/68 us/tok prefill bucket came from the prefill ground-truth that
|
||||
> commissioned this scope and is **not** in a committed in-repo profile (the
|
||||
> committed profiling - `GAP_PROGRESS.md` etc. - is decode-focused). Per the
|
||||
> "profile-don't-assume" rule in `.agents/vllm-parity-methodology.md`, **step 0 of
|
||||
> any build is to re-confirm the prefill GEMM bucket on GPU** (nsys, prefill-only
|
||||
> window) before touching code.
|
||||
|
||||
---
|
||||
|
||||
## 1. Why `mul_mat_q` is slow at large M (confirmed from source)
|
||||
|
||||
Source: `ggml/src/ggml-cuda/mmq.cu`, `mmq.cuh` at this backend's pin (`9d5d882d`).
|
||||
|
||||
MMQ is built for the **M<=128 decode tile**. Three structural facts from the code:
|
||||
|
||||
1. **The M (column/token) tile is capped at 128.**
|
||||
`get_mmq_x_max_host()` / `get_mmq_x_max_device()` (mmq.cuh ~108-140) return
|
||||
`128` on Blackwell (`turing_mma_available(cc)`), and the host launch loop
|
||||
(mmq.cuh ~4237) picks `mmq_x_best` only to *minimise the column-tile count for
|
||||
`ncols_max`, never exceeding `mmq_x_max`*. So a prefill ubatch of M=512 (or
|
||||
4096) tokens is processed as many `mmq_x<=128` column-tiles. The compile-time
|
||||
accumulator tile is `mmq_x`-wide; there is no large-M (e.g. 256-wide) tile
|
||||
variant. The whole tile-selection machinery exists to pick a *small* tile for
|
||||
*small* batches, not to grow for large ones.
|
||||
|
||||
2. **The FP4-MMA kernel is register-bound to 1 CTA/SM.**
|
||||
`mul_mat_q` for FP4 is `__launch_bounds__(warp_size*nwarps, min_blocks=1)`
|
||||
(mmq.cuh ~3579-3585), i.e. 256 threads, 1 resident block/SM (~255 regs/thread).
|
||||
The patch-0017 comment in-tree states this plainly: the kernel is
|
||||
"REGISTER-bound to 1 CTA/SM ... the under-occupancy that strands the kernel at
|
||||
~3% of FP4 peak at M=128." At large M the work per tile is bigger, but with one
|
||||
CTA/SM the tensor cores still stall on LPDDR5x / shared-memory weight loads
|
||||
with no CTA-level latency hiding - the design has no async multi-stage global->
|
||||
shared pipeline (cp.async double-buffering) that large-M GEMMs need.
|
||||
|
||||
3. **Per-tile fixed overheads amortise poorly only because the tile stays small.**
|
||||
Each tile re-stages weights into shared memory, runs the `MMQ_ITER_K_FP4=512`
|
||||
K-loop, and the activations are quantized to Q8_1 (`quantize_mmq_fp4_cuda`,
|
||||
block_fp4_mmq = FP4 weights x int8 activations). For decode this is the right
|
||||
trade (FP4 weight traffic is the bottleneck). For large-M prefill the GEMM is
|
||||
compute-bound, so the right structure is big tensor-core output tiles (e.g.
|
||||
128x256), a deep async load pipeline, and full SM occupancy - exactly what
|
||||
cutlass 3.x / nvjet (cuBLAS) and marlin implement and MMQ does not.
|
||||
|
||||
Patch 0017 already proved every *cheap* large-tile/occupancy lever inside MMQ
|
||||
(`GGML_CUDA_FP4_MMQ_Y`, `GGML_CUDA_FP4_MINBLOCKS`) is a no-win on GB10 - because
|
||||
the limit is the small-tile kernel *structure*, not a tunable. To win at large M
|
||||
you must leave MMQ for a large-M kernel.
|
||||
|
||||
---
|
||||
|
||||
## 2. Options (feasibility / bit-exactness / effort)
|
||||
|
||||
### Key enabling facts already in the tree
|
||||
|
||||
- **NVFP4 -> bf16/f16 dequant kernels already exist.** `convert.cu` defines
|
||||
`dequantize_row_nvfp4_cuda`; `ggml_get_to_bf16_cuda` / `ggml_get_to_fp16_cuda`
|
||||
/ `ggml_get_to_fp16_nc_cuda` all return it for `GGML_TYPE_NVFP4`. The
|
||||
non-Blackwell fallback ("falls back to dequant", README s2) already uses this.
|
||||
- **cuBLAS on GB10 dispatches to nvjet** (NVIDIA's JIT tensor-core GEMM) - the
|
||||
committed profiles already show `nvjet lm_head` and `nvjet non-FP4 cublas GEMM`
|
||||
rows. So a dequant->cuBLAS bf16 GEMM lands on a vendor-tuned large-M kernel for
|
||||
free.
|
||||
- **BUT NVFP4 is explicitly excluded from the tensor-core cuBLAS path.** In
|
||||
`ggml_cuda_op_mul_mat_cublas` (ggml-cuda.cu ~1659) the `use_fp16` predicate
|
||||
begins `src0->type != GGML_TYPE_NVFP4 && ...`. So if NVFP4 reaches cuBLAS today
|
||||
it falls to the `else` branch: dequant to **F32** + `cublasSgemm` (**no tensor
|
||||
cores**) - useless for prefill. Relaxing this one exclusion (route NVFP4 to the
|
||||
bf16/f16 tensor-core branch, where `to_*_cuda(NVFP4)` already exists) is the
|
||||
pivot that makes option (a) a few-line change rather than a kernel.
|
||||
|
||||
### (a) Dequant -> cuBLAS/cutlass bf16 GEMM for large M -- RECOMMENDED
|
||||
|
||||
Dequant the NVFP4 weights to bf16 (transient pool buffer) once per prefill step,
|
||||
then a large-M tensor-core `cublasGemmEx` (CUBLAS_COMPUTE_32F accumulate, bf16
|
||||
inputs). Activations stay bf16 (not Q8_1-quantized).
|
||||
|
||||
- **Feasibility: HIGH.** All pieces exist (dequant kernels, cuBLAS bf16 path,
|
||||
pool allocator). The only code change for the dense path is (i) make
|
||||
`ggml_cuda_should_use_mmq` return false for NVFP4 dense above an M threshold so
|
||||
the dispatch falls through to `ggml_cuda_op_mul_mat_cublas`, and (ii) relax the
|
||||
`src0->type != GGML_TYPE_NVFP4` exclusion so it dequants to bf16 and uses
|
||||
`cublasGemmEx` tensor-core, not f32 Sgemm.
|
||||
- **Cost model (the crux - why it wins ONLY at large M).** Dequant is one extra
|
||||
weight-sized memory pass (read ~0.5B/elt FP4 + scales, write 2B/elt bf16). The
|
||||
bf16 GEMM then reads weights as bf16 = **4x the byte traffic of the FP4-MMQ
|
||||
read**. At small M (decode) this 4x weight traffic dominates -> bf16-cuBLAS
|
||||
loses -> keep MMQ (this is why decode stays FP4-MMQ; consistent with the
|
||||
README decode verdict). At large M the GEMM is compute-bound and weight traffic
|
||||
is amortised over hundreds of columns, so the 4x is cheap and cuBLAS's mature
|
||||
large tiles + async pipeline + full occupancy dominate MMQ's 3%-of-peak small
|
||||
tile. The dequant pass itself is ~one weight-read amortised over the whole
|
||||
prefill step - negligible at large M.
|
||||
- **Honest ceiling.** GB10 bf16 tensor-core peak is ~**half** the FP4 tensor-core
|
||||
peak. A bf16 cuBLAS GEMM at ~70-80% of bf16 peak is ~35-40% of FP4 peak. That
|
||||
is a huge jump from MMQ's ~3% large-M utilisation, but it is **not** automatic
|
||||
full vLLM parity (vLLM prefill uses 4-bit weight tiles, staying near FP4-class
|
||||
throughput). Expect this to recover most, not all, of the 232->68 gap. See s4.
|
||||
- **Bit-exactness: NEW FP path** (NVFP4->bf16 round, bf16 TC, f32 accumulate) vs
|
||||
fused FP4xQ8_1 MMQ. **Not byte-identical** - gate per-path via KLD exactly like
|
||||
the paged-MoE `8cb0ce23` precedent (README s5 / `PAGED_BITEXACT_NOTE.md`). It
|
||||
should pass *easily and favourably*: keeping activations in bf16 instead of
|
||||
Q8_1 is strictly more precise than the MMQ path, so KLD(dequant-bf16 || f16)
|
||||
should be <= KLD(FP4-MMQ || f16). This is a precision-neutral-to-better change,
|
||||
not a precision regression like the rejected lever 4.
|
||||
- **Effort: LOW-MEDIUM (a few days).** Dispatch flip + exclusion relax + an M
|
||||
threshold + the KL gate + a prefill bench. No new kernel. Dense first; MoE is
|
||||
the harder follow-on (see (c)/plan).
|
||||
- **Memory note.** Dequant into a *transient* pool scratch per step (do **not**
|
||||
cache bf16 weights - a persistent bf16 copy is 4x VRAM for those tensors and
|
||||
would erase the backend's "1.5-3x less memory" property). The per-step dequant
|
||||
pass is the price of keeping the model FP4-resident.
|
||||
|
||||
### (b) Marlin-style fused NVFP4 large-M MoE GEMM (port `marlin_moe_wna16`)
|
||||
|
||||
Port vLLM's marlin grouped MoE kernel (4-bit weights, f16 activations, dequant-
|
||||
in-register, async cp.async pipelines, swizzled layouts).
|
||||
|
||||
- **Feasibility: LOW (hardest).** Marlin is a hand-tuned CUTLASS-class kernel and
|
||||
is **not NVFP4-aware** (it targets wna16 group-quant, not NVFP4's 16-elt blocks
|
||||
with ue4m3 micro-scales). You would either (i) adapt marlin to dequant NVFP4
|
||||
in-register and accumulate in f16 (abandoning native Blackwell FP4-MMA), or
|
||||
(ii) write a brand-new Blackwell sm_121 FP4-MMA large-M kernel - which is
|
||||
essentially re-implementing what cutlass 3.x / nvjet already give you via (a).
|
||||
- **Bit-exactness:** new FP path, KL-gate (same as (a)).
|
||||
- **Effort: HIGH (multi-week, high risk),** kernel + layout + Blackwell MMA
|
||||
scheduling + graph-safety + the bit-exact gate.
|
||||
- **Verdict: do NOT start here.** Its only structural advantage over (a) is 4-bit
|
||||
weight traffic, which matters only when BW-bound = small M = **decode**, the
|
||||
regime already rejected. At large M (a) reaches the same vendor large-M kernels
|
||||
for ~1% of the effort. Keep (b) on the shelf as the *only* route to true 68
|
||||
us/tok parity if (a)'s bf16 ceiling proves insufficient and the win justifies a
|
||||
multi-week kernel.
|
||||
|
||||
### (c) M-threshold routing (the integration mechanism for (a))
|
||||
|
||||
Not an alternative to (a) - it is *how* (a) is wired. Keep FP4-MMQ for decode
|
||||
(M<=threshold), switch to the large-M path for prefill.
|
||||
|
||||
- **Cleanest hook:** `ggml_cuda_should_use_mmq(type, cc, ne11_or_ne12, n_experts)`
|
||||
already receives M (`ne11` dense / `ne12` MoE tokens). Add an NVFP4+Blackwell
|
||||
branch: return false when M > `LLAMA_FP4_PREFILL_M` (candidate threshold e.g. 256-512,
|
||||
env/`-D` tunable, default value chosen so default == today's behaviour until
|
||||
validated). It is called from both `ggml_cuda_mul_mat` (~2573/2582) and
|
||||
`ggml_cuda_mul_mat_id` (~2664), so one edit covers dense + MoE routing.
|
||||
- **Dense fallthrough is clean:** `ggml_cuda_mul_mat` final `else` ->
|
||||
`ggml_cuda_op_mul_mat(..., ggml_cuda_op_mul_mat_cublas, ...)` -> with the
|
||||
exclusion relaxed, dequant->bf16->`cublasGemmEx`. Works.
|
||||
- **MoE fallthrough is NOT clean (the catch):** in `ggml_cuda_mul_mat_id`, a
|
||||
false `should_use_mmq` falls to `should_use_mmf` (no NVFP4 support) then to the
|
||||
**host-side sorted per-expert loop** with a `cudaStreamSynchronize` (ggml-cuda.cu
|
||||
~2700) - slow and **not CUDA-graph-safe** (it would break the MoE re-graph,
|
||||
patch 0025). So MoE large-M needs a *dedicated graph-safe grouped GEMM* (dequant
|
||||
the expert-gathered weights to bf16 + `cublasGemmGroupedBatchedEx`, CUDA 12.5+,
|
||||
over the existing `expert_bounds`/`ids_dst` sorted layout), not a bare
|
||||
fallthrough. This is why the plan ships **dense first, MoE second**.
|
||||
|
||||
---
|
||||
|
||||
## 3. Recommended approach + implementation plan
|
||||
|
||||
**Recommendation: (a) dequant->bf16 cuBLAS, wired via (c) M-threshold routing,
|
||||
dense-path first, MoE grouped-cuBLAS second. Reject (b).**
|
||||
|
||||
### Phase 0 - confirm the bucket on GPU (no code)
|
||||
- nsys prefill-only window (`-npp <large> -ntg 0/1`, exclude the graph-capture
|
||||
step) on q36-27b dense and q36-35b-a3b MoE at the backend pin. Confirm the
|
||||
NVFP4 `mul_mat_q` / `mul_mat_id` bucket is ~232 us/tok and that it is
|
||||
compute-bound at prefill M (check tensor-core active % low, not BW-saturated).
|
||||
If the bucket is not what the ground-truth claims, stop and re-scope.
|
||||
|
||||
### Phase 1 - dense large-M NVFP4 -> bf16 cuBLAS (the bankable win)
|
||||
Files / edits:
|
||||
1. `ggml/src/ggml-cuda/mmq.cu` - `ggml_cuda_should_use_mmq`: add
|
||||
`if (type==GGML_TYPE_NVFP4 && blackwell_mma_available(cc) && ne11 > LLAMA_FP4_PREFILL_M && n_experts==0) return false;`
|
||||
(n_experts==0 = dense only in Phase 1). Default threshold == effectively
|
||||
disabled until A/B-validated, env/`-D` overridable (mirror the 0017
|
||||
`GGML_CUDA_FP4_*` knob style + in-tree comment).
|
||||
2. `ggml/src/ggml-cuda/ggml-cuda.cu` - `ggml_cuda_op_mul_mat_cublas`: relax the
|
||||
`src0->type != GGML_TYPE_NVFP4` guard in `use_fp16` (prefer a dedicated bf16
|
||||
branch: NVFP4 -> `ggml_get_to_bf16_cuda` -> `cublasGemmEx` CUDA_R_16BF /
|
||||
COMPUTE_32F, matching the existing BF16 src0 branch for best accuracy).
|
||||
3. Transient pool scratch for the dequanted weights (reuse `ggml_cuda_pool_alloc`
|
||||
as the existing branch does; no persistent allocation).
|
||||
|
||||
### Phase 2 - MoE grouped large-M (the harder, higher-value follow-on)
|
||||
1. New grouped path reached from `ggml_cuda_mul_mat_id` when
|
||||
`should_use_mmq`==false for NVFP4+large-M+`n_experts>0`: dequant the
|
||||
expert-gathered weights to bf16 and run `cublasGemmGroupedBatchedEx` over the
|
||||
existing `expert_bounds` / `ids_dst` sorted layout that `mul_mat_q` already
|
||||
builds. Reuse the patch-0023 de-dup'd activation gather where applicable.
|
||||
2. **Must stay CUDA-graph-safe** - no host sync (do not fall into the legacy
|
||||
sorted loop). Validate the MoE re-graph (patch 0025 / `LLAMA_MOE_FORCE_GRAPHS`)
|
||||
still captures.
|
||||
|
||||
### The bit-exact / KL gate (both phases)
|
||||
- Greedy md5 on the standard prompt (README s5) to detect *unexpected* divergence
|
||||
on the non-prefill paths (must stay == the per-path reference: dense
|
||||
`5951a5b4`, paged-MoE `8cb0ce23`). The large-M path itself will differ -> gate
|
||||
it by KLD vs the f16 reference, requiring `KLD(new||f16) <= KLD(FP4-MMQ||f16)`
|
||||
and PPL within the established band, recorded in `PAGED_BITEXACT_NOTE.md`.
|
||||
- `test-backend-ops` MUL_MAT / MUL_MAT_ID at NVFP4 **prefill shapes** (large M)
|
||||
CUDA0-vs-CPU, plus the existing decode shapes to prove decode is byte-untouched
|
||||
(default threshold keeps decode on MMQ).
|
||||
|
||||
### The bench
|
||||
- `llama-batched-bench -fa on -ngl 99` reporting **S_PP** (prefill t/s), swept
|
||||
over prefill length and `npl`, A/B with `LLAMA_FP4_PREFILL_M` off vs on, dense
|
||||
and MoE, vs stock and vs the vLLM prefill reference. Per-lever A/B discipline
|
||||
(`.agents/vllm-parity-methodology.md`): one knob at a time, record the rejected
|
||||
threshold values too.
|
||||
|
||||
---
|
||||
|
||||
## 4. Honest risk + expected speedup
|
||||
|
||||
- **Phase 1 (dense) is a tractable routing change, not a kernel project** - days,
|
||||
low risk. It reuses existing dequant kernels and the existing nvjet/cuBLAS
|
||||
large-M path; the net new code is a threshold + a one-line exclusion relax + a
|
||||
KL gate.
|
||||
- **Phase 2 (MoE) is medium risk** - the grouped-batched cuBLAS wiring +
|
||||
CUDA-graph-safety is real work (the bare fallthrough is a slow, graph-breaking
|
||||
host loop), but still far short of a from-scratch kernel.
|
||||
- **Will the GEMM bucket hit 232 -> ~68 us/tok (full vLLM parity)? Honestly, no -
|
||||
not from bf16-cuBLAS alone.** bf16 tensor-core peak on GB10 is ~half FP4 peak,
|
||||
so the realistic floor for a dequant->bf16 GEMM is ~**90-130 us/tok** (roughly
|
||||
35-40% of FP4 peak at ~70-80% of bf16 peak). That recovers ~**60-75%** of the
|
||||
232->68 bucket gap = a large prefill win (the GEMM is ~51% of the total prefill
|
||||
gap, so closing ~two-thirds of it is a meaningful S_PP improvement), but it
|
||||
leaves a residual. **True 68 us/tok parity requires a native FP4-MMA large-M
|
||||
kernel (option (b)) - the multi-week project** to greenlight only if Phase 1's
|
||||
measured win proves the prefill regime matters enough to fund it.
|
||||
- **Recommendation:** build Phase 1, measure, and let the measured dense S_PP
|
||||
gain decide whether Phase 2 (MoE grouped cuBLAS) and ultimately (b) (native FP4
|
||||
large-M kernel) are worth funding. Bank the cheap two-thirds before paying for
|
||||
the kernel.
|
||||
|
||||
---
|
||||
|
||||
## 5. Summary table
|
||||
|
||||
| Option | Feasibility | Bit-exact | Effort | Verdict |
|
||||
|---|---|---|---|---|
|
||||
| (a) dequant->bf16 cuBLAS large-M | HIGH (parts exist) | new FP path, KL-gate (likely better PPL) | LOW-MED (days) | **RECOMMENDED** (dense first) |
|
||||
| (b) Marlin/native FP4 large-M kernel | LOW | new FP path, KL-gate | HIGH (multi-week) | shelf - only route to true 68 us/tok |
|
||||
| (c) M-threshold routing | HIGH | n/a (mechanism) | LOW | **the wiring for (a)** |
|
||||
|
||||
Decode is untouched by all of the above (threshold keeps M<=128 on FP4-MMQ); this
|
||||
is a **prefill-only** lever.
|
||||
@@ -0,0 +1,613 @@
|
||||
# Tensor-Core GDN Build Plan
|
||||
|
||||
> Auto-generated from the GDN build-design workflow. Build-ready spec for the full tensor-core chunked-scan kernel (2nd prefill lever).
|
||||
|
||||
## 1. Remaining intra-chunk products -> mma mapping
|
||||
|
||||
I have everything needed: the exact chunked-scan math from patch 0031, the sm_121a constraints from the scope doc, and the concrete ggml tf32 fragment (`mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32`, `tile<16,8,float> D, tile<16,8,float> A, tile<8,8,float> B`) at `mma.cuh:976-984`. Here is the design.
|
||||
|
||||
---
|
||||
|
||||
# Tensor-core mapping of the REMAINING intra-chunk GDN products (patch 0031 steps 3-7)
|
||||
|
||||
## 0. Building block + what the PoC already covered
|
||||
|
||||
**Grounding.** Math: `backend/cpp/llama-cpp-localai-paged/patches/paged/0031-paged-chunked-gdn-prefill-scan-kernel.patch` (steps reproduced inline below). Scope/constraints: `backend/cpp/llama-cpp-localai-paged/docs/TENSORCORE_GDN_SCOPE.md`. Fragment API: `ggml/src/ggml-cuda/mma.cuh:976-984` (the only f32-accumulate tf32 overload on sm_121a).
|
||||
|
||||
The single warp-level primitive on sm_121a is **`m16n8k8` tf32 / f32-accumulate**:
|
||||
- `A` fragment = `tile<16,8,float>` (M=16, K=8; 4 floats/thread, `Axi[0..3]`)
|
||||
- `B` fragment = `tile<8,8,float>` (K=8, N=8, `.col` operand; 2 floats/thread, `Bxi[0..1]`)
|
||||
- `D` accumulator = `tile<16,8,float>` (M=16, N=8; 4 floats/thread)
|
||||
- A GEMM `[M×K]·[K×N]` tiles to `ceil(M/16) × ceil(N/8) × ceil(K/8)` mma calls, f32-accumulating over the K-subtiles.
|
||||
- bf16 alternative `m16n8k16` (`mma.cuh:1064`, K=16/mma, 7-bit mantissa) exists but is **only** admissible for the tf32-safe Gram class — never the state/decay-coupled class.
|
||||
- 3xtf32 ladder = split each f32 operand into 2 tf32 limbs (hi + lo), run 3 limb-products per K-subtile (hi·hi, hi·lo, lo·hi; the negligible lo·lo term is dropped), accumulate high→low. ~3x the mma count, ~f32 accuracy.
|
||||
|
||||
**PoC covered products 1 + 2** (the two `C×C` Gram products, both tf32-safe, NMSE ~3e-9): `KK[t,t']=k_t·k_t'` → `A`, and `QK[t,t']=q_t·k_t'` → `P`. Both are `(C×dk)·(dk×C)`, M=C N=C K=dk=128, decay+beta applied in f32 after. They already share the `Kc^T` B-fragments.
|
||||
|
||||
The remaining families are **steps 3,4,5,6,7**. Notation: `C` = chunk (default 64; PoC 16), `dk=dv=128`, per `(head,seq)` block. Tile counts below are for **C=64**.
|
||||
|
||||
---
|
||||
|
||||
## 1. Per-product mma mapping table (the deliverable)
|
||||
|
||||
| # | Product (0031 step) | Result = matmul | M | N | K | mma tiles `(M/16)·(N/8)·(K/8)` @C=64 | Accumulation order | Precision class | Shares staged operand with |
|
||||
|---|---|---|---|---|---|---|---|---|---|
|
||||
| 1 | `KK→A` (PoC) | `Kc · Kcᵀ` | C | C | dk=128 | 4·8·16 = **512** (~½ tri) | over 16 k-subtiles | **tf32-safe** (proven) | `Kcᵀ` B-frag ↔ P2; `Kc` LHS ↔ P3 |
|
||||
| 2 | `QK→P` (PoC) | `Qc · Kcᵀ` | C | C | dk=128 | 4·8·16 = **512** (~½ tri) | over 16 k-subtiles | **tf32-safe** (proven) | `Kcᵀ` B-frag ↔ P1; `Qc` LHS ↔ P4 |
|
||||
| 3 | `KS = S0ᵀk_t` | `Kc · S0` | C | dv=128 | dk=128 | 4·16·16 = **1024** | 16 k-subtiles, limbs hi→lo | **3xtf32 / f32** (state-boundary, feeds solve) | `S0` B-frag ↔ P4; `Kc` LHS ↔ P1 |
|
||||
| 4 | `QS = S0ᵀq_t` | `Qc · S0` | C | dv=128 | dk=128 | 4·16·16 = **1024** | 16 k-subtiles, limbs hi→lo | **3xtf32 → demote-first** (×γ_t≤1 attenuated) | `S0` B-frag ↔ P3; `Qc` LHS ↔ P2 |
|
||||
| 5 | `O += P·U` | `P · U` | C | dv=128 | C=64 | 4·16·8 = **512** (~½ tri over K) | C/8 k-subtiles, triangular | **tf32-safe** (P decay-masked & bounded in f32 first) | `P`(=Amat) ↔ P2; `U` B-frag ↔ P6 |
|
||||
| 6 | `S_C += Kᵀ(D·U)` | `Kcᵀ · DU` | dk=128 | dv=128 | C=64 | 8·16·8 = **1024** | scale state by γ_last (f32) **first**, then C/8 k-subtiles, limbs hi→lo | **3xtf32 / f32** (THE cross-chunk carry, compounds over n_tok/C) | `U` B-frag ↔ P5; `Kc` (transposed) ↔ P1/3 |
|
||||
| 7 | `U = A⁻¹·RHS` off-diag coupling `A_ij·U_j` | `A_ij · U_j` | b=16 | dv=128 | b=16 | 1·16·2 = **32**/pair → **~192** (6 pairs) +~128 diag | forward sweep i=0..C/b−1; off-diag subtractions before diagonal solve | **tf32-safe off-diag + f32 in-register `16×16` diagonal** | `A`(=Amat) ↔ P1; `U` blocks ↔ P5/P6 |
|
||||
|
||||
3xtf32 inflation if the ladder is taken: P3 1024→**3072**, P4→**3072**, P6 1024→**3072**.
|
||||
|
||||
---
|
||||
|
||||
## 2. Per-product detail (the 5 remaining families)
|
||||
|
||||
### Product 3 - `KS = S0ᵀ k_t` (RHS state-boundary term)
|
||||
0031: `ks = Σ_i Sd[j·dk+i]·Kc[t·dk+i]`; feeds `RHS[t][j] = β_t(v_t[j] − γ_t·ks)`.
|
||||
- **As a GEMM:** `KS[t][j] = Σ_i Kc[t][i]·S0[i][j]` ⇒ `KS = Kc[C×dk] · S0[dk×dv]`. **M=C, N=dv=128, K=dk=128.** Contraction over the state-row index `i`.
|
||||
- **Schedule:** `Kc` is the LHS (M-major over `t`, K over `i`) — already staged for P1. `S0` is the B operand, K-major over `i`, N over `j`. The patch's `Sd[j·dk+i]` layout (i contiguous for fixed j) **is already a K-major B layout** → `ldmatrix`-friendly as `tile<8,8>` B fragments. Accumulate 16 k-subtiles into f32 D.
|
||||
- **Precision: 3xtf32/f32.** This is a state-boundary product: `S0` carries the full sequence history (wide dynamic range), and the result is *differenced* against `v_t` then fed into the solve, so error here propagates through `U` into both `O` and `S_C`. Default to the 3xtf32 ladder; A/B a plain-tf32 demote only after P4.
|
||||
|
||||
### Product 4 - `QS = S0ᵀ q_t` (γ cross-chunk `O` term)
|
||||
0031: `qs = Σ_i Sd[j·dk+i]·Qc[t·dk+i]`; `o = γ_t·qs + Σ P·U`.
|
||||
- **As a GEMM:** `QS = Qc[C×dk] · S0[dk×dv]`. **M=C, N=dv=128, K=dk=128** — identical shape to P3.
|
||||
- **Schedule:** identical to P3 but LHS=`Qc` (shared with P2). **Fuse with P3 on the shared `S0` B-fragments:** stage `S0` once as B, run `Kc·S0` then `Qc·S0` back-to-back — `S0` is the heavy operand (128×128) and is loaded once for both.
|
||||
- **Precision: 3xtf32 but the demote-first candidate.** The term is scaled by `γ_t ≤ 1` in f32 after the mma, so when the chunk has decayed (`γ_t→0`) the absolute error is attenuated. Least sensitive of the three state-boundary products; it is the first to try at plain tf32 in the precision A/B.
|
||||
|
||||
### Product 5 - `O += P · U` (attention-weighted output)
|
||||
0031: `o += Amat[t·Cc+tp]·Ud[j·C+tp]` for `tp≤t`.
|
||||
- **As a GEMM:** `O[C×dv] += P[C×C] · U[C×dv]`. **M=C, N=dv=128, K=C=64.** Contraction over the chunk index `t'`.
|
||||
- **Schedule:** `P` (=Amat scratch from P2, with `d(t',t)` applied in f32) is LHS (M over t, K over t'); `U` (solved, in `Ud`) is the B operand, K-major over t'. `P` is **lower-triangular** ⇒ for 16-row M-tile `m` only the 8-wide K-subtiles `k ≤ 2m+1` are non-zero → ~½ the mma. Accumulate up to `C/8` k-subtiles. Add the `γ_t·QS` term (P4) into the same f32 D accumulator before write-out.
|
||||
- **Precision: tf32-safe.** `P = d·QK` with `d≤1` is formed and bounded **in f32 first** (strong-decay rows already underflowed to ~0), so down-casting the bounded `P` to tf32 for this mma is benign. The decay is never inside the accumulation — it is pre-baked in f32, preserving the bounded de-gating invariant.
|
||||
|
||||
### Product 6 - `S_C += Kᵀ(D·U)` (the state update)
|
||||
0031: `s = γ_last·Sd[j·dk+i] + Σ_t d(t,last)·Kc[t·dk+i]·Ud[j·C+t]`.
|
||||
- **As a GEMM:** let `DU[t][j] = d(t,last)·U[t][j]` (D=diag applied in f32). `S_C[i][j] += Σ_t Kc[t][i]·DU[t][j]` ⇒ `S_C[dk×dv] += Kcᵀ[dk×C] · DU[C×dv]`. **M=dk=128, N=dv=128, K=C=64.** Contraction over the chunk index `t`.
|
||||
- **Schedule:** the accumulator D fragments **are the register-resident state** that persists across the chunk loop. Order is strict: (i) scale the state fragments by `γ_last` in f32 in-register, **then** (ii) mma-accumulate `Kcᵀ·DU` over `C/8` k-subtiles into them. LHS = `Kc` read **transposed** (i as M-row, t as K) — a different fragment view of the same `Kc` smem buffer (use the `ldmatrix` transpose / J-major tile view). B = `DU` = `U` scaled by `d(t,last)` in f32, K-major over t — **same `U` B-layout as P5**.
|
||||
- **Precision: 3xtf32 / f32 — the strongest ladder candidate.** This is the only product whose error *compounds across all `n_tokens/C` chunk steps*; it defines the state trajectory. Keep at 3xtf32 longest; this is the last product to ever consider demoting, and the place where a full-f32 accumulate (3xtf32) is most justified even if everything else passes plain tf32.
|
||||
|
||||
### Product 7 - the A-inverse (blocked forward substitution, FLA UT-transform)
|
||||
0031 does a serial per-thread fwd-subst. Tensor-core form (block `b=16` = one mma M-tile, `C/b=4` blocks at C=64):
|
||||
- For block `i`: `U_i = Ainv_ii·(RHS_i − Σ_{j<i} A_ij·U_j)`.
|
||||
- **The A-inverse-adjacent matmul = the off-diagonal coupling `A_ij·U_j`:** **M=b=16, N=dv=128, K=b=16** ⇒ `1·16·2 = 32` mma/pair; 6 lower pairs at C=64 → **192** mma. Optional materialized-`Ainv_ii` apply is the same shape (~128 more).
|
||||
- **Schedule:** forward sweep `i=0..3`; for each `i` accumulate all `j<i` couplings into a `b×dv` register tile (subtract from `RHS_i`), then apply the `b×b` diagonal inverse. `A`=Amat (from P1, β·d applied in f32) is the LHS; `U_j` blocks are read from `Ud` and updated in place as the sweep advances.
|
||||
- **Precision: split.** Off-diagonal coupling = **tf32-safe** (`A_ij`=β·d·kk is bounded, `d≤1`; well-conditioned for the stable de-gating). The `16×16` **diagonal block inverse stays f32/in-register** (Neumann series on the b-nilpotent, ≤b−1 terms, or a short serial solve) — exact, sensitive, but tiny. This is exactly the scope's recommended structure.
|
||||
|
||||
---
|
||||
|
||||
## 3. Staged-operand sharing graph (load amortization)
|
||||
|
||||
Five smem/register operands, and which products read them — the fusion that makes the added flops nearly free:
|
||||
|
||||
- **`Kc` (C×dk)** — the most-shared buffer. M-major-over-t LHS: P1, P3. K-major-over-i B (`Kcᵀ`): P1, P2. Transposed (i-major, contract t): P6. ⇒ stage once per chunk, feeds 1,2,3,6.
|
||||
- **`Qc` (C×dk)** — LHS for P2 and P4.
|
||||
- **`S0` B-fragments (dk×dv, register-resident state)** — P3 and P4. **Stage once as B, run KS then QS** (heaviest operand, amortized 2×).
|
||||
- **`Amat` (C×C)** — P1 writes `A` → P7 reads `A` → P2 overwrites with `P` → P5 reads `P`. One buffer, lifecycle-reused (as 0031 already does).
|
||||
- **`Ud` (C×dv)** — P7 writes `U` → P5 reads `U` (B, contract t) → P6 reads `U` scaled to `DU` (B, contract t). **P5 and P6 share the identical `U` B-layout** (both contract the chunk dim) → fully shared B-fragments.
|
||||
|
||||
Three concrete fusions worth coding as fused passes:
|
||||
1. **P1+P2** share `Kcᵀ` B (PoC already does this).
|
||||
2. **P3+P4** share `S0` B (stage the 128×128 state-as-B once).
|
||||
3. **P5+P6** share `U` as B (both K=C contractions); compute `P·U` and `Kcᵀ·DU` from one `U` staging, P6 accumulating straight into the persistent state fragments.
|
||||
|
||||
---
|
||||
|
||||
## 4. tf32-safe vs 3xtf32 ladder - summary + recommended A/B order
|
||||
|
||||
**Plain-tf32-safe (well-conditioned, bounded, intra-chunk; bf16 `m16n8k16` is even an option if more throughput is needed):**
|
||||
- P1 `KK`, P2 `QK` (PoC-proven), P5 `P·U` (P bounded/f32-pre-masked), P7 off-diagonal coupling.
|
||||
|
||||
**3xtf32 / f32 ladder (state-boundary, cross-chunk carry, error compounds):**
|
||||
- P6 `Kᵀ(D·U)` — keep at 3xtf32 longest (compounds over every chunk).
|
||||
- P3 `KS` — feeds the solve; 3xtf32 by default.
|
||||
- P4 `QS` — 3xtf32 by default but γ_t-attenuated → **first to demote** to plain tf32 in the precision A/B.
|
||||
- P7 `16×16` diagonal block inverse — stays **f32/in-register** (not a tensor-core op).
|
||||
|
||||
**Recommended precision A/B ladder (drives the KL-gate from `PAGED_BITEXACT_NOTE.md`):** start P3/P4/P6 at 3xtf32 and P1/P2/P5/P7-offdiag at plain tf32. If the KL-gate has margin, demote in order **P4 → P3**, holding **P6 at 3xtf32**. If even all-3xtf32 misses the KL-gate, the residual is not mma precision but the `16×16` diagonal solve — which already stays f32, so further mma promotion will not close the gap.
|
||||
|
||||
---
|
||||
|
||||
## 5. Two honest implementation gotchas (not in the scope doc, surface in the mapping)
|
||||
|
||||
1. **Accumulator→B relayout of the state at each chunk boundary.** The register-resident state lives as P6's **D/accumulator** fragments (`tile<16,8>`), but P3/P4 need it as a **B operand** (`tile<8,8>`, K-major over `i`). These fragment layouts differ, so at chunk entry the state must be re-laid-out accumulator→B. Cheapest correct path: bounce the 128×128 state through a transient smem tile (write D fragments, `ldmatrix` back as B fragments) once per chunk — `n_tokens/C` times total, negligible vs the `C`x state-BW saved, but it means the "freed 64KB" needs a *transient* state-shaped smem tile at boundaries (not resident across the intra-chunk work). With dv-slabbing it's per-slab (`dk×dv_tile`).
|
||||
|
||||
2. **`Kc` needs two fragment views.** P1/P2/P3 read `Kc` contracting over `i` (dk); P6 reads `Kc` contracting over `t` (transposed, i as M-row). One smem buffer, but P6 must use the `ldmatrix` transpose / J-major `tile` view — budget for the transposed load, don't assume one staging serves both.
|
||||
|
||||
Ordering within a chunk is already correct in 0031 and must be preserved by the tensor-core version: P3,P4 read **pre-update** `S0` → P7 solve → P5 → **P6 overwrites** `S0`→`S_C`. Single accumulator, read-first/write-last, no state double-buffer needed.
|
||||
|
||||
## 2. A-inverse solve (form-T then apply, FLA UT transform)
|
||||
|
||||
I have everything I need: the chunked GDN math (patch 0031), the scope doc's recommendation, the ggml tf32 `mma.sync` API (`mma(tile<16,8,float>&D, tile<16,8,float>&A, tile<8,8,float>&B)` = m16n8k8), and the proven Gram PoC layout (`g=lane>>2, t=lane&3`; tf32 NMSE ~3e-9). Here is the design.
|
||||
|
||||
---
|
||||
|
||||
# A-inverse solve on `mma.sync` tensor cores (C=64, sm_121a) - design
|
||||
|
||||
Notation: `C=64`, head dim `dk=dv=128`, block size `b=16` (= one `m16n8k8` m-tile), `n_b=C/b=4`. `A = I + N`, `N = tril(beta_t·d(t',t)·(k_t·k_t'), -1)` strictly-lower (nilpotent, `N^C=0`); `RHS[t][j] = beta_t(v_t[j] - gamma_t(S0^T k_t)[j])` is `C×dv`; we want `U = A^{-1}·RHS`.
|
||||
|
||||
## 0. Core decision: form `T=A^{-1}` explicitly, then one wide apply (not direct forward-subst)
|
||||
|
||||
Two routes were on the table. **Form `T = A^{-1}` in the `C×C` domain (FLA "UT transform"), then `U = T·RHS` as a single tf32 GEMM** - rather than blocked forward-substitution applied directly to the `C×dv` RHS. Reasons, all decisive on this part:
|
||||
|
||||
1. **Confines the only triangular dependency to the cheap `C×C` domain.** The expensive `dv=128`-wide work (`U=T·RHS`) becomes a dependency-free dense GEMM. The serial part is just the tiny `T`-formation. This is the single most important move for "don't serialize the warps."
|
||||
2. **Fewer serial passes vs `dv`.** Inverting the `16×16` diagonal block once = a 16-column solve. Direct-solving against `RHS` re-solves against all `dv=128` columns per block. Form-`T`-once + reuse via mma is far cheaper in serial work.
|
||||
3. **dv-slab reuse (the occupancy lever).** `T` depends only on `K`, not on `dv`. Form once, reuse for every `dv`-slab's `T·RHS_slab` apply. (Improvement over the scope's conservative "recompute per slab": when single-block, `T` lives in 16KB shared and is broadcast; only when dv-slabbing across separate blocks for occupancy do we recompute - which is cheap anyway, ~12% of the apply's mma count.)
|
||||
4. **Isolates the error amplifier.** All recursion (the part that "amplifies error") lives in the small `T`-formation where 3xtf32 is nearly free; the bulk apply is a single well-conditioned GEMM.
|
||||
|
||||
This still **is** the scope's "blocked forward substitution: in-register diagonal solves + mma off-diagonal coupling" - just organized to produce `T` explicitly so the wide apply is dependency-free.
|
||||
|
||||
## 1. Solve algorithm
|
||||
|
||||
Block-partition `A` into a `4×4` lower-triangular grid of `16×16` blocks. `A_ii = I_b + N_ii` (unit-lower-tri, `N_ii` strictly-lower nilpotent); `A_ij` (i>j) full `16×16`. `T=A^{-1}` is block-lower-tri with:
|
||||
|
||||
```
|
||||
T_ii = A_ii^{-1} (diagonal block inverse)
|
||||
T_ij = -A_ii^{-1} · ( Σ_{m=j}^{i-1} A_im · T_mj ) for i > j (block fwd subst)
|
||||
```
|
||||
|
||||
Then `U = T·RHS`, with `U_i = Σ_{j≤i} T_ij·RHS_j`.
|
||||
|
||||
**Phase D - diagonal inverses (4 blocks, fully parallel).** Each `A_ii` is `16×16` unit-lower-tri. Invert **exactly in f32** via shared-memory column-parallel forward substitution: stage `A_ii` to shared; thread `c` (c=0..15) solves `A_ii x = e_c` (`x_c=1`, `x_r = -Σ_{m=c}^{r-1} A_ii[r][m]·x_m`), writes column `c` of `T_ii`. 16 columns in parallel, ≤16 serial MACs each, all 4 blocks on 4 warps simultaneously. **No tensor cores here, and no reduced precision** - this is where the strongest coupling lives (see §4).
|
||||
|
||||
**Phase O - off-diagonal, mma.** For each i>j: accumulate `P_ij = Σ_m A_im·T_mj` (δ block-products), then `T_ij = -T_ii·P_ij`. All on `mma.sync` (`16×16×16` = `2 n-tiles × 2 k-steps` = 4 m16n8k8 per block-product).
|
||||
|
||||
**Apply.** `U = T·RHS`: warp `w` owns output rows `16w..16w+15`, sweeps all `dv=128` (16 n-tiles) × `C=64` (8 k-steps) = 128 m16n8k8/warp. This is the bulk and is embarrassingly parallel.
|
||||
|
||||
`A`, `P` (the QK term), `RHS`, and `T` are all assembled from tf32 Gram mma's (`KK`,`QK`,`KS`,`QS` - the PoC-proven step-1/2 plus step-3/4) with **all decay/`gamma`/`beta` applied in f32 outside the mma** (preserves bounded de-gating).
|
||||
|
||||
## 2. Tile schedule - keeping the triangular dependency off the warps
|
||||
|
||||
Block = 128 threads = 4 warps; **"warp == 16-row m-tile" throughout** (same mapping as the PoC's C=64 kernel, `rowbase = warp*16`, `g=lane>>2`, `t=lane&3`). Three layered mechanisms keep the warps busy despite the triangular dependency:
|
||||
|
||||
**(a) Wavefront (anti-diagonal) parallelism in `T`-formation.** The 6 off-diagonal blocks have a critical path of only `n_b-1=3`, not 6. Group by distance `δ=i-j`:
|
||||
|
||||
| Wave | Blocks (δ) | count | depends on | mapped to |
|
||||
|---|---|---|---|---|
|
||||
| D | (0,0)(1,1)(2,2)(3,3) | 4 | - | 4 warps ‖ |
|
||||
| 1 | (1,0)(2,1)(3,2) | 3 | D | 3 warps ‖ |
|
||||
| 2 | (2,0)(3,1) | 2 | D,1 | 2 warps ‖ |
|
||||
| 3 | (3,0) | 1 | D,1,2 | 1 warp |
|
||||
|
||||
Within each wave all blocks are independent → one block per warp, no intra-wave serialization. Critical path = 4 dependency levels. Total `T`-formation mma: ~10 accumulation block-products + 6 inverse-applies = ~16 block-products × 4 = **~64 m16n8k8**, vs the apply's **512** (128/warp × 4) - so `T`-formation is ~12% of apply width and carries the only dependency.
|
||||
|
||||
**(b) Confinement.** Because we form `T` then apply, the dependency-laden work is the ~64-mma `C×C` formation; the 512-mma `dv`-wide apply has zero triangular dependency. The serial chain never touches the throughput-critical width.
|
||||
|
||||
**(c) Latency hiding via RHS overlap.** `T` depends only on `K` (→ `A` ← KK Gram). `RHS` depends on `V` and `S0^T k` (KS Gram, `dv`-wide, the expensive RHS term) and is **independent of the solve**. Schedule the wavefront `T`-formation (cheap, short critical path) concurrently with the `dv`-wide KS/QS Grams that build `RHS` and the `O` cross-term. The Phase-D shared scalar inverse (~16 shared round-trips × 4 warps) hides entirely under the KS Gram (thousands of cycles). By the time `T` is ready, `RHS` is staged and the apply fires immediately.
|
||||
|
||||
**Shared/register budget (C=64, state register-resident per the scope):**
|
||||
|
||||
| Buffer | bytes | note |
|
||||
|---|---|---|
|
||||
| `Kc`,`Qc` (bf16/tf32-staged) | 16KB+16KB | Gram operands |
|
||||
| `A`→`T` scratch (`C×C` f32, in place) | 16KB | `A` consumed into `T`; reuses scope's A/P slot |
|
||||
| `RHS`/`U` (`C×dv`) | 16-32KB | bf16 for the P·U and KᵀU mma's |
|
||||
| diag-inverse scratch | ~1KB | `16×16` per warp, transient |
|
||||
| gates `cs/gam/beta` | <1KB | f32 |
|
||||
| state `S` | 0 (registers) | frees the 64KB that forced 0031's C=16 |
|
||||
|
||||
Total ~65-80KB, under the 99KB opt-in - the solve adds **no** net shared pressure (T overwrites A; diag scratch is transient). Per-thread diag-inverse needs ~16 regs (one column of `x`), released before the apply - does not compound the already-heavy state-accumulator register budget.
|
||||
|
||||
## 3. Precision risk assessment
|
||||
|
||||
**Error model.** `‖ΔU‖/‖U‖ ≲ κ(A)·(‖ΔA‖/‖A‖ + ‖ΔRHS‖/‖RHS‖) + ‖Δ_apply‖/‖U‖`. The inverse is the amplifier; `κ(A)` is data-dependent. For DeltaNet, keys are L2-normalized so `|k_t·k_t'|≤1` ⇒ `|N[t][t']|≤beta_t≤1`; in the decaying regime `‖N‖<1` and `κ` is modest, but in the weak-decay/aligned-keys corner `κ` grows and the `δ=3` column path (`T_30`) compounds 3 multiplies. tf32 input rounding is ~`2⁻¹¹`≈`5e-4` relative (f32-accumulate; PoC measured Gram NMSE ~`3e-9`). 3xtf32 (hi/lo limb split with 3 partial products, the CUTLASS fp32-emulation trick) buys ~f32 (~`1e-7`) at ~3× that step's mma cost.
|
||||
|
||||
**Where the strong coupling actually sits (the key structural fact):** the *inverse* `T_ii` is computed f32-exact, **but the dominant near-diagonal mixing is applied in the tf32 apply GEMM** (`U_i ⊃ T_ii·RHS_i`), and block-boundary adjacent pairs (e.g. tokens 15↔16) live in the `δ=1` off-diagonal `T_10`. So "f32 protects the strong coupling" is only true for the inverse *computation*; its *application* is tf32 unless promoted. This drives the ladder.
|
||||
|
||||
**Precision config + 3xtf32 ladder (mandatory vs optional):**
|
||||
|
||||
| Step | Default | Mandatory? | 3xtf32 cost |
|
||||
|---|---|---|---|
|
||||
| Diagonal inverse `T_ii` | **f32 (shared scalar)** | **Mandatory-and-free** (it's already f32) | n/a |
|
||||
| Off-diag coupling `A_im·T_mj`, `T_ii·P_ij` | **3xtf32 (default-on)** | Effectively mandatory; ~3× of ~64 tiny mma = **negligible** | free insurance |
|
||||
| KK/QK Gram → A,P | tf32 | optional (rung 1) | 3× of C×C Grams (cheap) |
|
||||
| Apply `U=T·RHS` | tf32 | optional (rungs 2-4) | up to 3× the bulk |
|
||||
| KS/QS Gram → RHS, O | tf32 | optional (rung 5) | vLLM keeps these bf16 (L4-rejected precedent) |
|
||||
|
||||
Decays/`gamma`/`beta` **always f32, outside the mma** - invariant, not a rung.
|
||||
|
||||
**Ladder ordering if the default config misses the KL-gate (cheapest → most expensive):**
|
||||
1. KK Gram (feeds `A`) → 3xtf32 [cheap, C×C].
|
||||
2. Apply **block-diagonal terms only** `T_ii·RHS_i` → 3xtf32 [≈+0.8× apply; protects within-window strong coupling - mixed-precision-by-distance].
|
||||
3. + apply `δ=1` off-diagonal terms → 3xtf32 [covers block-boundary adjacent pairs].
|
||||
4. Full apply → 3xtf32 [≈+2× apply; expensive escape hatch].
|
||||
5. KS/QS Gram → 3xtf32.
|
||||
6. Fall back to direct blocked forward substitution against RHS in 3xtf32 (the alternative route, slightly more accurate than form-`T`-then-multiply at the cost of the parallelism), else keep 0031's serial path.
|
||||
|
||||
**Adversarial `g∈[-20,-1e-4]`:** strong decay ⇒ `d=exp(big-negative)→0` ⇒ off-diagonal `N→0` ⇒ `A≈I`, `T≈I`, apply≈identity, tf32 error vanishes; bounded de-gating (f32) guarantees underflow-to-zero, never inf. Weak decay (`g→0`) ⇒ `d≈1`, `A` well-conditioned unless the keys are strongly aligned (the `κ`-growth corner named in the error model), and tf32's 8-bit exponent (vs f16's 5) holds the `gamma` dynamic range. The dangerous middle and that aligned-keys corner are the KL-empirical risks - re-run this op case explicitly per the scope.
|
||||
|
||||
**KL impact / gating.** Same gate as the backend's new-FP-path precedents: NMSE is expected to *fail* at reduced precision (this is a new path on a new path) - **the binding gate is KL** (`KLD(tc‖f16) ≤ KLD(seq‖f16)` + PPL band) plus greedy-md5 stability (md5 will not match 0031's serial path - per-path, validated benign). Expectation: the **default config (f32 diagonal + 3xtf32 off-diagonal-coupling + tf32 everything-else)** clears the KL-gate, because (i) the dominant apply matches the PoC Gram's `~3e-9`/tf32-input grade and (ii) the recursion-amplified `C×C` work is f32-grade for free. The expensive apply-3xtf32 rungs are reserved escapes. Worst case all-3xtf32 ≈ 3× the mma cost - still an order of magnitude under 0031's serial-f32 reductions and still net-positive given the `~C×` state-BW cut.
|
||||
|
||||
## 4. Integration + validation
|
||||
|
||||
- Build on `ggml/src/ggml-cuda/mma.cuh`: the tf32 path is `mma(tile<16,8,float>&D, tile<16,8,float>&A, tile<8,8,float>&B)` → `mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32` (line ~1089), gated by `AMPERE_MMA_AVAILABLE` (sm_121-correct). tf32 operands stage to shared and load via `load_generic` (or the PoC's `cvt.rna.tf32.f32` register packing); `ldmatrix` is `.b16`-only so it is **not** usable for tf32 fragments - use `load_generic`. 3xtf32 = 3 limb-split tf32 mma's + f32 recombine, reusing the same tile type.
|
||||
- Slots into patch 0031's `gated_delta_net_chunked_cuda`: replace the serial `KK`/`QK` Grams (lines 183-198, 222-235) with the PoC tiles; replace the serial forward-subst (lines 211-218) with this Phase-D + Phase-O `T`-formation; replace the serial `O`/`S_C` reductions (steps 5/6) with the apply + `KᵀU` mma. `C=16`→`C=64`, state register-resident.
|
||||
- Validation step before the full kernel: extend `~/scratch_tc_gdn_poc/gdn_gram_bench.cu` (DGX) to microbench Phase-D+O `T`-formation and the `T·RHS` apply in isolation, with the f64 host oracle measuring `κ(A)` and the tf32-vs-3xtf32 NMSE per rung - mirrors the Gram PoC that proved 6.7×→9.3×. This tells you, cheaply, whether the default precision config will clear the KL-gate before committing to the multi-week kernel (Phase 2 of the scope's plan).
|
||||
|
||||
**Net:** the solve adds zero shared pressure, ~12% mma overhead over the apply, a 4-level (not 64-step) critical path fully hidden under the RHS Grams, an f32-exact inverse, and a cheap-by-default 3xtf32 placement aimed exactly at the recursion-amplified term - so the A-inverse is **not** the bottleneck and **not** the precision wall; the residual risk is the data-dependent `κ(A)` in the weak-decay corner, arbitrated by the KL-gate with the apply-3xtf32 rungs as the escape.
|
||||
|
||||
Files referenced (absolute): `/home/mudler/_git/LocalAI/.claude/worktrees/feat+paged-attention/backend/cpp/llama-cpp-localai-paged/docs/TENSORCORE_GDN_SCOPE.md`; `/home/mudler/_git/LocalAI/.claude/worktrees/feat+paged-attention/backend/cpp/llama-cpp-localai-paged/patches/paged/0031-paged-chunked-gdn-prefill-scan-kernel.patch`; ggml tf32 mma at `ggml/src/ggml-cuda/mma.cuh` (line ~1089); DGX PoC `~/scratch_tc_gdn_poc/gdn_gram_bench.cu`.
|
||||
|
||||
## 3. Register-resident state + C=64 budget
|
||||
|
||||
I have everything I need: the scope doc, the full 0031 kernel (smem layout, the 7 product families, the chunk-loop sequencing), and the sibling FP4-MMA GEMM kernel's concrete patterns (PAD=4 u32 rows, STAGES=2 cp.async, WM=4/WN=2 8-warp grid, ldmatrix, register-resident accumulators). Here is the concrete data-layout design.
|
||||
|
||||
---
|
||||
|
||||
# Register-resident state + C=64 design (tensor-core chunked GDN, sm_121a / mma.sync)
|
||||
|
||||
## 0. Block/warp configuration (the frame everything hangs on)
|
||||
|
||||
0031 used `block_dims(128,1,1)` = 128 threads, "thread j owns v-column j" - a **column-parallel scalar** model. The tensor-core kernel must abandon that and adopt a **warp-tiled** model (same as the sibling GEMM kernel):
|
||||
|
||||
- **256 threads = 8 warps**, arranged as a **WARPS_M x WARPS_N = 4 x 2** warp grid (the GEMM kernel's proven `WM=4, WN=2`).
|
||||
- Threads no longer own columns; warp `(wm,wn)` owns a rectangular sub-tile of each matrix and drives `mma.sync` on it.
|
||||
- Precision: **tf32 m16n8k8** for the S-coupled / decay-coupled products, **bf16 m16n8k16** allowed only for the well-conditioned intra-chunk Gram terms (KK, QK). f32 accumulate throughout. Decays/`gamma`/`beta` stay f32, applied outside the mma (preserve bounded de-gating).
|
||||
|
||||
This 4x2 warp grid is the denominator for every ownership calc below.
|
||||
|
||||
---
|
||||
|
||||
## 1. The one hard problem: S is an *accumulator* for step 6 but an *operand* for steps 3/4
|
||||
|
||||
This is the crux the scope hand-waves ("read S as the stationary operand; step 6 accumulates into it"). The register fragment layouts are **not** interchangeable:
|
||||
|
||||
| Use | Role | mma shape | S indexing | Fragment layout |
|
||||
|---|---|---|---|---|
|
||||
| Step 6 `S += Kᵀ(D·U)` | **accumulator (D/C)** | m=dk, n=dv, k=C | `S[i][j]`, m=i, n=j | `tile<16,8,float>` acc grid |
|
||||
| Step 3 `KS = K·S` | **B operand** | m=C, n=dv, k=dk | `S[i][j]`, k=i, n=j | `tile<8,8,float>` B frag |
|
||||
| Step 4 `QS = Q·S` | **B operand** | m=C, n=dv, k=dk | same as step 3 | `tile<8,8,float>` B frag |
|
||||
|
||||
An accumulator fragment's thread→element map differs from a B-operand fragment's, so **you cannot feed the persistent S registers directly into the step-3/4 mma.** A bridge is mandatory. The design decision:
|
||||
|
||||
> **S lives register-resident in the step-6 ACCUMULATOR layout** (it is written every chunk; that is the hot path). Steps 3/4 reach it via a **once-per-chunk restage to a small smem tile, re-read with `ldmatrix`** as B-operand fragments.
|
||||
|
||||
The restage cost is paid `n_tokens/C` times (not per token) - it is *inside* the BW saving the whole lever buys. And critically, the restage smem **time-multiplexes onto the Uc/Amat region**: at chunk entry (when KS/QS are needed) U and A for this chunk are not yet computed, so their buffers are free to hold the S restage. **Net additional persistent smem for the state: 0KB** - the scope's "0KB shared state" holds, with this scheduling caveat made explicit.
|
||||
|
||||
KS and QS read the **same** pre-update S0, so restage once → do both → then overwrite with U.
|
||||
|
||||
---
|
||||
|
||||
## 2. Register allocation map (per thread, 256-thread block)
|
||||
|
||||
State `S` is `dk x dv = 128 x 128` f32 = 16384 elems. Distributed over 256 threads = **64 f32/thread** at full dv.
|
||||
|
||||
| Register class | Lifetime | Full dv=128 | dv-slab=64 | dv-slab=32 | Layout / ownership |
|
||||
|---|---|---|---|---|---|
|
||||
| **Persistent S accumulator** | whole chunk loop | **64 regs** | **32 regs** | **16 regs** | warp `(wm,wn)` owns dk-rows `[wm·32, +32)` x dv-cols `[wn·(dv/2), +dv/2)`; = 2 m-tiles x (dv/2/8) n-tiles of `tile<16,8,float>`, 4 f32 each |
|
||||
| Transient A-operand frag | per product | 4 regs/tile | same | same | `tile<16,8,float>` (tf32 packs k8) reused across KK/QK/KS/QS/O/Supd |
|
||||
| Transient B-operand frag | per product | 2 regs/tile | same | same | `tile<8,8,float>` |
|
||||
| Transient product accumulator (KK/QK/KS/QS/O) | per product, then spilled to smem | ≤8 tiles·4 = 32 regs | ≤16 | ≤8 | these outputs go to smem; acc is transient, reused |
|
||||
| A⁻¹ diagonal-block solve (16x16, in-registers) | step 7 only | ~8-12 regs | same | same | one `b=16` unit-lower-tri block per warp-row, scalar Neumann/`≤b-1` terms |
|
||||
| loop/index/gate scalars | always | ~12 regs | same | same | c0, Cc, cs/gam/beta locals |
|
||||
|
||||
**Per-thread totals (256 threads):**
|
||||
- Full dv: 64 (S) + ~50 (transients, non-overlapping with S) + ~12 ≈ **~130 regs/thread** → fits **1 block/SM** (256 regs/thread budget at 65536/SM÷256). 2 blocks/SM (128 regs/thread cap) would spill - hence dv-slab for occupancy.
|
||||
- dv-slab 64: 32 (S) + ~50 + 12 ≈ **~94 regs/thread** → fits **2 blocks/SM** (128-reg cap). ✓
|
||||
- dv-slab 32: 16 (S) → ~78 regs/thread → headroom; grid x4.
|
||||
|
||||
Persistent-state register pressure is the occupancy gate; everything else is transient and reused across the 7 products.
|
||||
|
||||
---
|
||||
|
||||
## 3. Shared-memory allocation map (PAD-padded, conflict-free)
|
||||
|
||||
Apply the GEMM kernel's lesson verbatim: **PAD = 4 (in the row's element width)** so a 128-wide row (a multiple of the 32 banks → 8-way conflict for the 8-row `ldmatrix`) becomes stride `132`, and the 8 rows of an `ldmatrix.m8n8` land in 8 distinct banks: `(r·132 + c) mod 32 = (4r + c) mod 32`, distinct for `r=0..7`. ✓
|
||||
|
||||
| Buffer | Logical shape | Row stride (padded) | Element | Notes |
|
||||
|---|---|---|---|---|
|
||||
| `Kc` (chunk K) | `[C][dk]` | `dk + 8` bf16 (= +4 u32) | bf16 | A-operand for KK/QK; A/Bᵀ for KS; transposed-A for Supd. bf16 default |
|
||||
| `Qc` (chunk Q) | `[C][dk]` | `dk + 8` bf16 | bf16 | A-operand for QK/QS/O |
|
||||
| `Uc` (solved U) | `[dv][C]` | `C + 4` f32 | f32 | f32 for the triangular solve accuracy; B-operand (down-cast tf32) for O & Supd |
|
||||
| `Amat` (A then P) | `[C][C]` | `C + 4` f32 | f32 | KK→A→solve, then reused for QK→P; decays applied in f32 here |
|
||||
| `gates` cs/gam/beta | `[3·C]` | (1-D, no pad) | f32 | prefix-sum + `expf`, f32 always |
|
||||
| **S-restage tile** | `[dk][dv_strip]` | `dv_strip + 4` f32 | f32 | **overlays Uc∪Amat** at chunk entry; not additive at peak |
|
||||
| `cp.async` stage dup | (STAGES=2 on Kc/Qc) | as Kc/Qc | bf16 | Phase-3 latency hiding only |
|
||||
|
||||
PAD widths: f32 tiles +4 elems; bf16 tiles +8 elems (= +4 u32, the same bank offset as in the GEMM kernel). `Uc` and `Amat` are padded on the C dimension (their `ldmatrix` access dimension).
|
||||
|
||||
---
|
||||
|
||||
## 4. C=64 shared budget table (under the 99KB opt-in)
|
||||
|
||||
Byte math with PAD included (`KB = bytes/1024`):
|
||||
|
||||
| Buffer | CONFIG A — **default**: C=64, dv=128, K/Q **bf16** | CONFIG B: C=64, dv=128, K/Q **tf32** | CONFIG C — **2 blk/SM**: C=32, dv-slab=64, K/Q bf16 |
|
||||
|---|---|---|---|
|
||||
| `Kc` | 64·136·2 = **17.0KB** | 64·132·4 = 33.0KB | 32·136·2 = 8.5KB |
|
||||
| `Qc` | **17.0KB** | 33.0KB | 8.5KB |
|
||||
| `Uc` (f32) | 128·68·4 = **34.0KB** | 34.0KB | 64·36·4 = 9.0KB |
|
||||
| `Amat` (f32) | 64·68·4 = **17.0KB** | 17.0KB | 32·36·4 = 4.5KB |
|
||||
| gates | **0.75KB** | 0.75KB | 0.4KB |
|
||||
| S-restage | overlay (0 net) | overlay (0 net) | overlay (0 net) |
|
||||
| **Per-block total** | **≈ 85.8KB** ✅ < 99 | **≈ 117.8KB** ❌ | **≈ 30.9KB** |
|
||||
| Blocks/SM (≈100KB/SM) | **1** | n/a | **2** (61.8KB) ✅ |
|
||||
|
||||
Read-out:
|
||||
- **CONFIG A is the recommended default**: C=64 (4x the 0031 chunk), full dv, fits at ~86KB with margin, 1 block/SM. Peak is the O/Supd phase (all of Kc+Qc+Uc+Amat live).
|
||||
- **CONFIG B (tf32 K/Q) is budget-hostile** (117KB) - tf32 K/Q tiles don't shrink with dv-slab (they're `C x dk`), so even dv-slab=64 lands ~101KB. **Conclusion: stage K/Q as bf16; reserve tf32/3xtf32 for the S-coupled and decay-coupled terms** (which arrive via the small streamed S-restage and the f32 gate scaling), exactly per the scope's "bf16 only for well-conditioned Gram terms."
|
||||
- **CONFIG C is the 2-block/SM lever**: C=32 + dv-slab=64 → 31KB/block, two resident blocks under the ~100KB/SM total, and the grid grows to `H x n_seqs x 2`.
|
||||
|
||||
---
|
||||
|
||||
## 5. dv-slab strategy (the 2nd block/SM + grid-starvation fix)
|
||||
|
||||
Split the `dv=128` value dimension into `n_slabs` blocks; each block computes a `dv_tile`-wide vertical strip of O and of the state.
|
||||
|
||||
- **Grid**: `dim3(H, n_seqs, n_slabs)` (was `(H, n_seqs)`). `n_slabs ∈ {1,2,4}` for `dv_tile ∈ {128,64,32}`. This **multiplies the grid by `n_slabs`**, directly attacking 0031's low-`n_seqs` grid starvation.
|
||||
- **What is dv-independent and must be recomputed per slab**: `A` (KK Gram), the `A⁻¹` solve, the gate prefix - all depend only on K and the gates, *not* dv. Each slab recomputes them. Cheap once they are on tensor cores (the whole point); this is the FLA per-slab pattern.
|
||||
- **What is dv-sliced**: the S accumulator (`128 x dv_tile`), `Uc` (`dv_tile x C`), KS/QS/O outputs, step-6 update. Halving/quartering dv halves/quarters both the **S register footprint** (64→32→16 regs/thread, §2) and the dv-scaled smem (`Uc`, restage).
|
||||
- **Restage budget bonus**: at `dv_tile=64` the per-block S is `128 x 64` = 32KB, so the once-per-chunk restage fits the Uc∪Amat overlay window in a single pass (no strip loop). At full dv=128 the restage is done as **2 dv-strips of 32KB** reusing the same overlay (or 16 k-strips of 8x128 if registers are tighter than smem).
|
||||
|
||||
`b`-block forward substitution (step 7) is independent of dv too, so the in-register `16x16` diagonal solves are computed once per block (each slab's block repeats them) and the off-diagonal mma coupling `Uᵢ -= Aᵢⱼ Uⱼ` runs per slab as a `(16x16)·(16 x dv_tile)` mma.
|
||||
|
||||
---
|
||||
|
||||
## 6. Bank-conflict-free layout - the GEMM PAD lesson applied
|
||||
|
||||
Concretely, per the sibling kernel's `ARS = KBLK·SAW + PAD` with `PAD=4` (which gave +19%):
|
||||
|
||||
- Every smem matrix read by `ldmatrix` (or its tf32 equivalent in `ggml/src/ggml-cuda/mma.cuh`) is stored with **row stride = logical_width + PAD**, PAD chosen so `stride mod 32 ≠ 0`: f32 width-128 → 132 (`132 mod 32 = 4`); bf16 width-128 (packed 64 u32) → 68 u32 (`68 mod 32 = 4`).
|
||||
- This guarantees the 8 rows an `m8n8` `ldmatrix` touches map to 8 distinct banks for any fixed column → no replays on the operand loads, which are the kernel's inner-loop smem traffic.
|
||||
- `cp.async` (CONFIG, Phase 3): `STAGES=2` double-buffer on `Kc`/`Qc` only (the GEMM kernel found multistage saturates BW past depth 2). 16B `cp.async.cg` copies into the padded rows; `commit_group`/`wait_group` Ampere-style (no TMA on sm_121). The pad keeps the staged writes coalesced and the mma reads conflict-free simultaneously.
|
||||
|
||||
---
|
||||
|
||||
## 7. Summary of the allocation decisions
|
||||
|
||||
| Decision | Value |
|
||||
|---|---|
|
||||
| Threads / warp grid | 256 / 4x2 (WM=4, WN=2) |
|
||||
| **State residency** | register-resident in **step-6 accumulator layout** (`tile<16,8,float>` grid), 64/32/16 f32-regs/thread at dv 128/64/32 |
|
||||
| **Accumulator↔operand bridge** | once-per-chunk `ldmatrix` restage of S to a small smem tile that **overlays Uc∪Amat** (0 net persistent smem); KS+QS share one restage |
|
||||
| K/Q precision | **bf16** staged (tf32 K/Q breaks the 99KB budget); tf32/f32 reserved for S-coupled + decay-coupled terms |
|
||||
| Uc / Amat | f32, padded on C (+4) |
|
||||
| **PAD** | +4 f32 (+8 bf16 = +4 u32) row stride → `ldmatrix` 8-row conflict-free (GEMM-proven) |
|
||||
| **C=64 default budget** | **≈86KB**, 1 block/SM (CONFIG A) ✅ |
|
||||
| 2 blocks/SM | C=32 + dv-slab=64 → ≈31KB/block, grid x2 (CONFIG C) |
|
||||
| dv-slab | grid `(H, n_seqs, n_slabs)`; A/A⁻¹/gates recomputed per slab; S/Uc/O dv-sliced |
|
||||
| cp.async | STAGES=2 on Kc/Qc (Phase 3 only) |
|
||||
|
||||
One honest caveat surfaced beyond the scope doc: the scope's "~64-80KB / 0KB shared state" budget only holds with **bf16 K/Q staging** and the **overlay restage**; tf32 K/Q (CONFIG B) does not fit even with dv-slab, which is why bf16-Gram + tf32-only-for-S/decay is the forced (and scope-consistent) precision split.
|
||||
|
||||
Files referenced: `/home/mudler/_git/LocalAI/.claude/worktrees/feat+paged-attention/backend/cpp/llama-cpp-localai-paged/docs/TENSORCORE_GDN_SCOPE.md`, `/home/mudler/_git/LocalAI/.claude/worktrees/feat+paged-attention/backend/cpp/llama-cpp-localai-paged/patches/paged/0031-paged-chunked-gdn-prefill-scan-kernel.patch`, and the sibling GEMM patterns in `/home/mudler/_git/LocalAI/.claude/worktrees/feat+paged-attention/backend/cpp/llama-cpp-localai-paged/patches/paged/0034-feat-paged-native-NVFP4-W4A4-FP4-MMA-large-M-prefill.patch`.
|
||||
|
||||
## 4. Occupancy + launch config
|
||||
|
||||
I have everything I need. Key facts confirmed: tf32 tile is `mma.sync.aligned.m16n8k8.f32.tf32.tf32.f32` (mma.cuh:1089, accumulator `tile<16,8,float>` = 4 f32/thread/tile); grid.x = `H` = `n_v_heads` = `ssm_dt_rank` (llama-model.cpp:504; Qwen3-Next family = 32 GDN value heads); the sequential kernel already uses a 3D grid `(H, n_seqs, ceil(S_v/num_warps))` (gated_delta_net.cu:184) — the chunked 0031 collapsed that z-axis to 1, which is exactly what starves the grid.
|
||||
|
||||
Here is the occupancy + launch design.
|
||||
|
||||
---
|
||||
|
||||
# Occupancy + launch design — tensor-core chunked GDN prefill (sm_121a)
|
||||
|
||||
## 0. The two independent caps 0031 hit (must relieve BOTH for ≥2 blocks/SM)
|
||||
|
||||
0031's -22% is not one wall: it is two stacked per-SM walls (shared memory and registers) plus a grid-size cap, and they are relieved by *different* levers:
|
||||
|
||||
| Cap | 0031 value | Binding resource | Lever |
|
||||
|---|---|---|---|
|
||||
| **Shared-memory cap** | 89 KB (64 KB all-shared state) | 100 KB/SM, 99 KB dyn opt-in | state→registers **+ smaller C** |
|
||||
| **Register cap** | n/a (was scalar) | 65536 regs/SM | **dv-slab** the register-resident state |
|
||||
| **Grid cap** | `(H, n_seqs, 1)` = 32·n_seqs blocks | 48 SMs | **dv-slab multiplies grid** by n_slabs |
|
||||
|
||||
sm_120/121-class per-SM limits used throughout: **1536 threads/SM, 65536 32-bit regs/SM, 100 KB shared/SM (99 KB dynamic opt-in), 255 regs/thread, ≤24-32 blocks/SM (hw, never the binding limit here).** The binding limits are **shared and registers.**
|
||||
|
||||
Critical correction to the scope-doc budget table: it assumes **bf16** K/Q staging (2 B). The precision default is **tf32**, which is a 32-bit container in shared — tf32 K/Q would *double* Kc/Qc and blow C=64 past 99 KB. So the occupancy config **stages K/Q as bf16** (the well-conditioned KK/QK Gram products per the scope's "bf16 only for Gram terms"), keeps gates/decays/beta/the solve in f32. This is a real precision↔occupancy coupling, flagged in §6.
|
||||
|
||||
## 1. Grid mapping — three parallel axes, the chunk axis is serial
|
||||
|
||||
The inter-chunk recurrence carries state `S` across chunks, so **the chunk axis cannot be a grid axis** (it's the sequential dependency — that's the whole algorithm). The only legitimate grid axes that don't break the recurrence are:
|
||||
|
||||
```
|
||||
dim3 grid(H, n_seqs, n_slabs); // H = n_v_heads = 32 (ssm_dt_rank)
|
||||
// n_slabs = dv / dv_tile (the new lever)
|
||||
```
|
||||
|
||||
- `blockIdx.x = head` (0..31), `blockIdx.y = seq`, `blockIdx.z = dv-slab`.
|
||||
- A block owns v-columns `[z·dv_tile, (z+1)·dv_tile)`, walks the chunk loop serially, and keeps **only its `dk × dv_tile` state slab** register-resident.
|
||||
- This reuses the **same 3D grid shape the sequential kernel already has** (gated_delta_net.cu:184 uses z for S_v-splitting); the chunked kernel repurposes z from S_v-split to dv-slab. The dispatcher change is minimal.
|
||||
|
||||
**Saturation math (the core of the task).** Target ≥2 blocks/SM on 48 SMs ⇒ **≥96 concurrent blocks**. With H=32:
|
||||
|
||||
| n_seqs | dv_tile=128 (n_slabs=1) | dv_tile=64 (2) | dv_tile=32 (4) |
|
||||
|---|---|---|---|
|
||||
| 1 | 32 (starved, 0031) | 64 (48/48 SMs busy, 67% warp-occ) | **128 (100%)** |
|
||||
| 2 | 64 | **128 (100%)** | 256 |
|
||||
| 4 | 128 | 256 | 512 |
|
||||
|
||||
So **dv-slabbing is simultaneously the register-relief lever and the grid-multiplier** — it's the single most important move. Rejected grid alternatives: split-K over dk (needs cross-block atomic reduction + fights the state carry); batching heads/seqs per block (reduces grid, wrong direction).
|
||||
|
||||
## 2. Block dim / warp count — 8 warps / 256 threads
|
||||
|
||||
```
|
||||
constexpr int WARPS = 8;
|
||||
dim3 block(32 * WARPS, 1, 1); // 256 threads
|
||||
```
|
||||
|
||||
Why 8 warps:
|
||||
- **Clean mma tile partition at C=32:** KK/QK output is `C×C = 32×32` = (32/16)·(32/8) = **8 m16n8 tiles → exactly 1 tile/warp**, dk=128 = 16 k8-steps. Steps 3/4 (KS/QS) and 5 (P·U) → 2 tiles/warp. Step 6 state update `dk×dv_tile`=128×64 → 64 tiles → **8 tiles/warp** (these are the persistent register-resident accumulators).
|
||||
- **Register dilution:** the register-resident state accumulator is spread across all 256 threads (see §3) — more warps = fewer state-regs/thread.
|
||||
- **Threads are not the cap:** 256 threads ⇒ up to 6 blocks/SM by the 1536 thread limit, so registers/shared decide.
|
||||
|
||||
Fallback if register-capped (§6): **12 warps (384 threads)** dilutes the state accumulator further (dv_tile=64: 32→21 state-regs/thread) at the cost of thinner per-warp tiles and ≤4 blocks/SM by threads.
|
||||
|
||||
## 3. Register-resident state ↔ dv-slab ↔ occupancy interaction
|
||||
|
||||
The state slab is held as **tf32 mma accumulator fragments** (`tile<16,8,float>`, 4 f32/thread/tile) persisting across the chunk loop. Per-thread state-register cost = `dk·dv_tile / 256`:
|
||||
|
||||
| dv_tile | state f32/block | state regs/thread (256 thr) | + working (est.) | regs/thread | regs/block | reg-allowed blocks/SM |
|
||||
|---|---|---|---|---|---|---|
|
||||
| 128 (no slab) | 16384 | 64 | ~50 | ~114 | ~29 K | 2 (tight) |
|
||||
| 64 | 8192 | 32 | ~50 | ~82 | ~21 K | 3 |
|
||||
| 32 | 4096 | 16 | ~50 | ~66 | ~17 K | 3 |
|
||||
|
||||
So on registers alone, dv_tile≤64 admits ≥2 blocks/SM. **Shared memory is then the binding cap**, and it's governed by **C**, not dv_tile (Kc/Qc/A all scale with C, only U scales with dv_tile):
|
||||
|
||||
| Config | Kc+Qc (bf16) | A/P (f32) | U (f32) | single | +cp.async dbl-buf K/Q | blocks/SM (shared) |
|
||||
|---|---|---|---|---|---|---|
|
||||
| C=64, dv_tile=128 | 32 KB | 16 KB | 32 KB | 80 KB | 112 KB ✗(no room!) | **1** |
|
||||
| C=64, dv_tile=64 | 32 KB | 16 KB | 16 KB | 64 KB | 96 KB ✓ | **1** |
|
||||
| **C=32, dv_tile=64** | 16 KB | 4 KB | 8 KB | **28 KB** | **44 KB ✓** | **2** |
|
||||
| C=32, dv_tile=32 | 16 KB | 4 KB | 4 KB | 24 KB | 40 KB ✓ | **2** |
|
||||
|
||||
**Finding the scope doc missed:** C=64-no-slab is shared-saturated at 80 KB — there is **no room for cp.async double-buffering**, so the 1-block/SM kernel would have *no latency hiding* and likely still lose. C=64 needs dv_tile≤64 *just to make room for cp.async*, and is still 1 block/SM. **Genuine 2 blocks/SM requires C=32** (to drop Kc/Qc/A under the 49.5 KB/block budget).
|
||||
|
||||
## 4. cp.async double-buffering (depth 2, no TMA)
|
||||
|
||||
At 1 block/SM (C=64 path) cp.async is the *only* latency-hiding mechanism, so it's mandatory, not optional. Plain Ampere `cp.async` (`cp.async.commit_group` / `cp.async.wait_group`) — **no `cp.async.bulk`/TMA on sm_121.** Stage the **next chunk's Kc, Qc** (and Vc if the KL-gate doesn't force V from global) into a second shared buffer while the current chunk's mma runs. Depth **2 only** — the sibling GEMM kernel proved multistage saturates BW past depth 2. The double-buffer cost is already in the "+cp.async" column above (44 KB at C=32 keeps 2 blocks/SM).
|
||||
|
||||
## 5. Launch config (concrete) + honest occupancy estimate
|
||||
|
||||
**Recommended default (batched-prefill serving regime, n_seqs≥2):**
|
||||
```
|
||||
C = 32 ; dv_tile = 64 ; n_slabs = 2 ; WARPS = 8
|
||||
grid = dim3(H=32, n_seqs, 2)
|
||||
block = dim3(256, 1, 1)
|
||||
smem = 44 KB (Kc/Qc bf16 ×2 dbl-buf + A/P f32 + U f32) // cudaFuncSetAttribute return CHECKED (0031 precedent)
|
||||
→ 2 blocks/SM. n_seqs≥2 ⇒ ≥128 blocks ⇒ 48/48 SMs at full 2-block occupancy (100%), 1.33 waves.
|
||||
A/Gram/solve recomputed 2× across slabs (state-update per slab is 2× the A work ⇒ ~25% redundant-flop overhead).
|
||||
```
|
||||
|
||||
**Single-stream prefill (n_seqs=1) saturator:**
|
||||
```
|
||||
C = 32 ; dv_tile = 32 ; n_slabs = 4 ; WARPS = 8
|
||||
grid = dim3(32, 1, 4) = 128 blocks ⇒ 2 blocks/SM on all 48 SMs (100%) even at n_seqs=1.
|
||||
Cost: A recomputed 4×, and at dv_tile=32 the A bucket ≈ the per-slab state bucket ⇒ ~1.5-2× total-flop overhead.
|
||||
```
|
||||
|
||||
**BW-max alternative (1 block/SM, bench against the above):**
|
||||
```
|
||||
C = 64 ; dv_tile = 64 ; n_slabs = 2 ; WARPS = 8 ; smem = 96 KB (dbl-buf, fits 99 KB)
|
||||
→ 1 block/SM, but 4× state-BW cut (vs 2× at C=32) + grid ×2. n_seqs=1 ⇒ 64 blocks ⇒ 48/48 SMs busy (67% warp-occ).
|
||||
```
|
||||
|
||||
**Occupancy summary:**
|
||||
|
||||
| Config | blocks/SM | regs/thread | smem/block | SM util @ n_seqs=1 | SM util @ n_seqs≥2 | state-BW cut | redundant-A |
|
||||
|---|---|---|---|---|---|---|---|
|
||||
| 0031 | 1 | scalar | 89 KB | 32/48 busy (starved) | 1024 blk, no overlap | 1× (C=16) | none |
|
||||
| C=32 dv64 (default) | **2** | ~82 | 44 KB | 48 busy, 67% occ | **100%** | 2× | 2× (~25%) |
|
||||
| C=32 dv32 (1-seq) | **2** | ~66 | 40 KB | **100%** | 100% | 2× | 4× (~1.5-2×) |
|
||||
| C=64 dv64 (BW-max) | 1 | ~82 | 96 KB | 48 busy, 67% occ | 100%, multi-wave | **4×** | 2× |
|
||||
|
||||
The C=32 (occupancy) vs C=64 (BW) choice is the empirical fork the scope doc defers to Phase-3 bench: 2 blocks/SM at half the BW saving, vs 1 block/SM at full BW saving + cp.async. **Wire both behind the existing `GDN_CHUNK_MIN` gate plus a `GDN_CHUNK_C` / `GDN_DV_TILE` selector and A/B them; do not assume.**
|
||||
|
||||
## 6. Residual risk — register pressure likely caps it at 1 block/SM (honest)
|
||||
|
||||
The ≥2-blocks/SM result rests on the **~50 working-regs/thread estimate**, which is optimistic:
|
||||
|
||||
- **The blocked-forward-subst A⁻¹ (step 7) is the swing factor.** The in-register 16×16 unit-lower-triangular diagonal inverse + the off-diagonal mma coupling + mma operand fragments + the **accumulator→operand fragment transpose** for reusing the register-resident S as a step-3/4 operand (a `movmatrix`/shared round-trip, since S lives in C-fragment layout but steps 3/4 need it as an A/B operand) can push working regs to **80-120**. At 256 threads, regs/thread > 128 ⇒ > 32 K regs/block ⇒ **silently drops to 1 block/SM** regardless of the 44 KB shared headroom. The scope doc names this exactly: "blocked-forward-subst register pressure trades against state-register pressure; both compete for the same budget."
|
||||
- **Mitigation ladder, in order:** (i) 12 warps to dilute (dv_tile=64: state 32→21 regs/thread); (ii) `__launch_bounds__(256, 2)` to force the compiler under 128 regs/thread (risks spills to local → BW back); (iii) smaller dv_tile (more grid, more redundant A). If all fail, accept **1 block/SM and lean on cp.async double-buffering + the 4× BW cut + mma throughput** — which is still very likely a win over 0031's serial-f32/-22%, just not the 2-block target.
|
||||
- **Grid-starvation at n_seqs=1 is structural** (H=32 < 96): only dv_tile=32 fully saturates a single stream, and it pays ~1.5-2× redundant-A flops. This is an inherent floor of the chunked formulation on a 32-head layer; batched serving (n_seqs≥2) is where the design cleanly saturates.
|
||||
- **Precision↔occupancy coupling:** the 2-block budget assumes **bf16 K/Q** staging. If the KL-gate demands tf32 for the KS/QS S0-products (decay-coupled), that needs a second 32-bit K/Q copy or 3×tf32 — both inflate shared/registers and can knock C=32 back toward 1 block/SM. The occupancy win is contingent on bf16 Gram clearing the gate.
|
||||
|
||||
**Bottom line:** 2 blocks/SM and full 48-SM saturation are *reachable* — via **C=32 + dv-slab (64 for serving, 32 for single-stream) + 8 warps + depth-2 cp.async** — on the **shared-memory** budget. Whether the **register** budget also permits it is the one genuinely open risk, and the A⁻¹ solve's footprint is what decides it; the realistic outcome remains the scope doc's honest **1-2 blocks/SM**, with the win carried by mma + cp.async + the C× BW cut rather than by high occupancy.
|
||||
|
||||
Files: design lives in `backend/cpp/llama-cpp-localai-paged/docs/TENSORCORE_GDN_SCOPE.md` (§3 occupancy plan — this analysis sharpens it); kernel/dispatcher to modify `backend/cpp/llama-cpp-paged-dev/ggml/src/ggml-cuda/gated_delta_net.cu` (grid at :184, chunked launch added by patch `patches/paged/0031-paged-chunked-gdn-prefill-scan-kernel.patch`); tf32 tile at `ggml/src/ggml-cuda/mma.cuh:1089`; H source `src/llama-model.cpp:504`.
|
||||
|
||||
## 5. Synthesized build plan + milestones + gate
|
||||
|
||||
All anchors verified. 0031's kernel body, the 7-step structure, the `GDN_CHUNK_MIN`/`GDN_CHUNK_OFF` gating at the `if constexpr (!KDA && !keep_rs_t)` site, the `launch_gdn_chunked<128,16>` template, the smem formula, and the test-backend-ops shapes are all confirmed. The scope doc's KL gate, 3xtf32 ladder, risk register, and Phase 0-3 plan are confirmed. Here is the build-ready synthesis.
|
||||
|
||||
---
|
||||
|
||||
# BUILD-READY PLAN: tensor-core chunked-GDN prefill kernel (sm_121a)
|
||||
|
||||
Anchors (absolute): kernel + launch + gate live in `ggml/src/ggml-cuda/gated_delta_net.cu` (grid at :184), added by `/home/mudler/_git/LocalAI/.claude/worktrees/feat+paged-attention/backend/cpp/llama-cpp-localai-paged/patches/paged/0031-paged-chunked-gdn-prefill-scan-kernel.patch`. tf32 tile `mma(tile<16,8,float>&D, tile<16,8,float>&A, tile<8,8,float>&B)` = `mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32` in `ggml/src/ggml-cuda/mma.cuh` (m16n8k8 overload ~976-984, dispatch ~1089), gated by `AMPERE_MMA_AVAILABLE`. PAD/cp.async patterns from `patches/paged/0034-feat-paged-native-NVFP4-W4A4-FP4-MMA-large-M-prefill.patch`. Gate/precedent docs: `docs/TENSORCORE_GDN_SCOPE.md`, `docs/PAGED_BITEXACT_NOTE.md`, `README.md` s5. Microbench: `~/scratch_tc_gdn_poc/gdn_gram_bench.cu` (DGX). Last patch in series is 0042 → this work is patches 0043+.
|
||||
|
||||
The new kernel is `gated_delta_net_chunked_tc_cuda<S_v, C, DV_TILE>`, a sibling to 0031's `gated_delta_net_chunked_cuda`. Symbols below reuse 0031's smem names (`Sd, Kc, Qc, Ud, Amat, csh, gam, bet`).
|
||||
|
||||
---
|
||||
|
||||
## (1) Phase-by-phase kernel structure
|
||||
|
||||
Block = **256 threads / 8 warps** in a **4×2 (WM×WN)** warp grid. State `S` (`dk×dv_tile`) is **register-resident in the step-6 accumulator layout** (`tile<16,8,float>` grid). Grid = `dim3(H, n_seqs, n_slabs)`, `blockIdx.z` = dv-slab. Chunk axis is the serial recurrence (NOT a grid axis). Invariant preserved from 0031: read pre-update `S0` (P3/P4) → solve → output (P5) → **overwrite S last** (P6). Single accumulator, no state double-buffer.
|
||||
|
||||
Per chunk `c0` (the loop body):
|
||||
|
||||
**Phase A - chunk load + gate prefix (f32, cooperative).** Load `Kc[C][dk]`, `Qc[C][dk]` **as bf16** (tf32 K/Q blows the 99KB budget - see §4, the C=64 shared budget table, of the state design), load `V` chunk. Compute `csh = cumsum(g)` (≤0), `gam = exp(csh)` (≤1), `bet` - all f32, identical to 0031 lines (the `j==0` prefix scan, kept scalar; it is <1KB and hides under the Grams). cp.async depth-2 prefetch of the *next* chunk's `Kc/Qc` starts here.
|
||||
|
||||
**Phase B - state restage (accumulator→B bridge).** The crux. `S0` lives as P6's D/accumulator fragments but P3/P4 need it as a **B operand** (`tile<8,8>`, K-major over `i`). Bounce the `dk×dv_tile` state through a transient smem tile that **overlays the `Ud∪Amat` region** (free at chunk entry - U/A not yet computed) → `load_generic` back as B fragments (NOT `ldmatrix`: it is `.b16`-only, unusable for tf32; use `load_generic`). Paid `n_tokens/C` times, **0KB net persistent smem**. KS and QS share this one restage.
|
||||
|
||||
**Phase C - Gram + state-boundary products (the matmuls that read pre-update S0).**
|
||||
- **P1 `KK→A`** = `Kc·Kcᵀ`, M=C N=C K=dk, lower-tri (~½ tiles). **tf32-safe** (PoC-proven NMSE ~3e-9). Apply `A = I + tril(βₜ·d(t',t)·KK, -1)` in **f32** after the mma.
|
||||
- **P3 `KS`** = `Kc·S0`, M=C N=dv K=dk. **3xtf32** (state-boundary, feeds the solve). Output → `Ud` region (becomes RHS).
|
||||
- **P4 `QS`** = `Qc·S0`, M=C N=dv K=dk, **fused with P3 on the shared S0 B-fragments**. **3xtf32** (γ-attenuated → first demote candidate). Seed the **O accumulator fragments register-resident with `γₜ·QS`** immediately (avoids parking QS in smem through to Phase F). Restage overlay is now free; `Ud`/`Amat` reclaim it.
|
||||
|
||||
**Phase D - A-inverse (form T = A⁻¹ explicitly, then wide apply).**
|
||||
- **Phase-D inverses:** 4 diagonal `16×16` unit-lower-tri blocks, **f32 in shared-memory column-parallel forward substitution** (thread `c` solves `A_ii x = e_c`). No tensor cores, no reduced precision (this is the strong-coupling amplifier). 4 blocks on 4 warps in parallel, hides entirely under the Phase-C/RHS Grams.
|
||||
- **Phase-O off-diagonal:** wavefront (anti-diagonal) schedule, critical path `n_b-1=3` not 6. For each i>j: `P_ij = Σ_m A_im·T_mj` then `T_ij = -T_ii·P_ij`, on `m16n8k8`. **3xtf32 default-on** (~64 tiny mma total, negligible). `T` overwrites the `A` scratch in place.
|
||||
|
||||
**Phase E - RHS + apply.** `RHS = βₜ(vₜ - γₜ·KS)` in **f32** (uses P3 result + V) → `Ud`. **`U = T·RHS`** as one dependency-free wide **tf32** GEMM, M=C N=dv K=C (the bulk, 128 mma/warp at full dv), in place → `Ud`.
|
||||
|
||||
**Phase F - intra-chunk output.**
|
||||
- **P2 `QK→P`** = `Qc·Kcᵀ`, reuse `Amat` (now free after T consumed). **tf32-safe**. Apply `P = d(t',t)·QK` in **f32** (bounded, decay pre-baked - preserves the bounded de-gating invariant).
|
||||
- **P5 `O += P·U`**, M=C N=dv K=C, P lower-tri (~½ tiles). **tf32-safe** (P f32-bounded first). Accumulate into the O fragments already seeded with `γₜ·QS`. Write `O*scale` to `dst`.
|
||||
|
||||
**Phase G - state carry (overwrites S0 last).** `DU = d(t,last)·U` in f32. **Scale the persistent S accumulator fragments by `γ_last` in f32 in-register first**, then **P6 `S_C += Kcᵀ·DU`**, M=dk N=dv K=C, **3xtf32 (the strongest ladder candidate - compounds over every chunk)**, accumulated straight into the persistent registers. `Kc` is read **transposed** here (second fragment view, `load_generic` transpose). No restage-out: S stays resident for the next chunk.
|
||||
|
||||
After the loop: final-state write-back (M-layout), identical to 0031's tail.
|
||||
|
||||
Buffer lifecycle (single `Amat`, single `Ud`, as 0031): `Amat`: A(P1) → T(Phase-D/O, in place) → consumed by apply → P(P2) → consumed by P5. `Ud`: KS(P3) → RHS(Phase-E) → U(apply, in place) → read by P5 (B) and P6 (B, scaled to DU). Restage tile overlays `Ud∪Amat` only at chunk entry (Phase B), before either is written.
|
||||
|
||||
---
|
||||
|
||||
## (2) Build sequence - incremental, each independently GPU-verifiable vs 0031
|
||||
|
||||
Each milestone is a **separate patch** stacked on 0031, **green on `test-backend-ops GATED_DELTA_NET` + greedy-md5 stable before the next is started**. Reference for every step = the `test_gated_delta_net` op's f64/CPU oracle (already in-tree) and 0031's serial-chunked output. **No milestone integrates on top of an unverified one.**
|
||||
|
||||
| M | Scope | Patch | GPU verification gate (vs 0031 / op oracle) |
|
||||
|---|---|---|---|
|
||||
| **M0** | Re-confirm regime, NO code (scope Phase 0) | - | Profile 0031 (`GDN_CHUNK_MIN` low): confirm GDN prefill bucket dominates + grid-starved at low n_seqs. If not, kill the lever now. |
|
||||
| **M1** | **DGX microbench (NO kernel yet)** - extend `gdn_gram_bench.cu` with KS/QS/PU/KᵀU microkernels + Phase-D/O T-formation + T·RHS apply, each with f64 host oracle measuring **κ(A)** and tf32-vs-3xtf32 NMSE per rung, incl. adversarial `g∈[-20,-1e-4]` | - | **The cheap go/no-go before multi-week kernel work.** Pass = default precision config (f32 diag + 3xtf32 off-diag + tf32 bulk) reaches ~PoC `3e-9`-grade on benign data and survives the κ(A) weak-decay corner within the ladder. Mirrors the PoC that proved 6.7×→9.3×. |
|
||||
| **M2** | In-kernel: replace **only** step-1/2 serial Grams (KK/QK) with tensor-core tiles. **C=16, scalar everything else, same occupancy** (scope Phase 1 / PoC integration) | 0043 | test-backend-ops 128-shapes green via KL gate (NMSE if it passes); greedy-md5 stable. |
|
||||
| **M3** | Add **P3/P4 (KS/QS)** tensor-core (3xtf32) + S restage bridge. Still C=16, scalar solve + scalar O/state | 0044 | Same gate. Isolates the accumulator→B bridge correctness. |
|
||||
| **M4** | **A-inverse** Phase-D (f32 diag) + Phase-O (3xtf32 off-diag), form T; replace 0031's serial fwd-subst. Still C=16 | 0045 | Same gate + the adversarial-decay op case (this is the amplifier). |
|
||||
| **M5** | **Apply `U=T·RHS`** + **P5 `P·U`** tensor-core. Still C=16 | 0046 | Same gate. |
|
||||
| **M6** | **P6 `Kᵀ(D·U)`** tensor-core + **register-resident state** (step-6 accumulator layout) + accumulator→B restage in steady state. State leaves smem here | 0047 | Same gate. Frees the 64KB that forced C=16. |
|
||||
| **M7** | **Flip C=16→C=64, full dv (CONFIG A ~86KB, 1 blk/SM)**, 8-warp 4×2 grid, PAD=4 smem | 0048 | Gate + **first A/B bench vs sequential** (S_PP at n_seqs≥2). |
|
||||
| **M8** | **Occupancy:** C=32 + dv-slab grid `(H,n_seqs,n_slabs)` (CONFIG C, 2 blk/SM) + cp.async depth-2; selectors `GDN_CHUNK_C`/`GDN_DV_TILE` | 0049 | Gate + A/B bench across {C=32/dv64, C=32/dv32, C=64/dv64-BW-max}; pick winner per regime. |
|
||||
|
||||
---
|
||||
|
||||
## (3) Bit-exact / KL gate plan
|
||||
|
||||
**md5 is per-path and will NOT match** 0031-serial or the sequential recurrence (different FP reduction order). This is the established `-paged` precedent (`PAGED_BITEXACT_NOTE.md`): per-path md5, validated benign. So:
|
||||
|
||||
- **Binding gate = KL** (not strict NMSE): `KLD(tensorcore ‖ f16) ≤ KLD(sequential ‖ f16)` plus a PPL band, on the README s5 harness. NMSE is *expected to fail* at reduced precision (new path on a new path); NMSE-pass is a bonus, KL-pass is the bar.
|
||||
- **Stability gate:** greedy-md5 **stable across runs** (deterministic), not equal to the serial path.
|
||||
- **Adversarial op case mandatory:** `g∈[-20,-1e-4]` (the dangerous middle-decay regime where κ(A) grows); strong-decay underflows to 0 (safe), weak-decay is well-conditioned (tf32's 8-bit exponent holds γ range), the middle is the only empirical risk.
|
||||
|
||||
**Precision default config (the bet that clears the gate):** f32 diagonal inverse (mandatory, already f32) · **3xtf32 off-diagonal coupling** (default-on, negligible ~64-mma cost) · **tf32** Grams + apply · **bf16** K/Q staging (well-conditioned KK/QK only) · decays/γ/β **always f32 outside the mma** (invariant, not a rung). Hold **P6 state carry at 3xtf32 longest** (it compounds over every chunk).
|
||||
|
||||
**3xtf32 ladder (cheapest→dearest) if default misses the gate:** (1) KK Gram→3xtf32; (2) apply **block-diagonal `T_ii·RHS_i`**→3xtf32 (within-window strong coupling, mixed-precision-by-distance); (3) +**δ=1 off-diagonal** apply→3xtf32 (block-boundary adjacent pairs e.g. tokens 15↔16); (4) **full apply**→3xtf32 (≈+2× apply, expensive escape); (5) KS/QS→3xtf32; (6) fall back to direct blocked back-substitution in 3xtf32, else keep 0031's serial path. **Demote order if the gate has margin:** P4→P3, holding P6 at 3xtf32. If even all-3xtf32 misses, the residual is the f32 diagonal solve (already f32) → not fixable by more mma precision → fall to (6). Record the final rung in `PAGED_BITEXACT_NOTE.md` + README s5.
|
||||
|
||||
---
|
||||
|
||||
## (4) Slot into 0031's existing framework (opt-in, default-OFF)
|
||||
|
||||
Same dispatch site - the `if constexpr (!KDA && !keep_rs_t)` block inside `launch_gated_delta_net` (0031 patch, after `init_fastdiv_values`). Extend, don't replace:
|
||||
|
||||
- Keep `GDN_CHUNK_MIN` (token threshold, default `INT_MAX` = off) and `GDN_CHUNK_OFF` (kill switch).
|
||||
- Add **`GDN_CHUNK_TC`** selector: `0` = 0031 serial-solve chunked (fallback, retained), `1` = tensor-core. Add **`GDN_CHUNK_C` ∈ {16,32,64}** and **`GDN_DV_TILE` ∈ {32,64,128}** for A/B; defaults `C=32, DV_TILE=64` (CONFIG C) for serving, `DV_TILE=32` saturator for n_seqs=1.
|
||||
- New launcher `launch_gdn_chunked_tc<128, C, DV_TILE>` mirrors `launch_gdn_chunked`: `cudaFuncSetAttribute(...MaxDynamicSharedMemorySize...)` **return-checked** (0031 precedent), `grid = dim3(H, n_seqs, n_slabs)`, `block = dim3(256,1,1)`. Per-slab the kernel recomputes A/A⁻¹/gates (dv-independent), dv-slices S/Ud/O.
|
||||
- **Default OFF** (`gdn_chunk_min=INT_MAX`) exactly as 0031 ships. Flip the default to on **only when** the M8 A/B shows an S_PP win over the tuned sequential recurrence at the serving regime (n_seqs≥2) **and** the KL gate + adversarial op case hold - recorded in README s5 (dev notes / rejected-flat levers) and `PAGED_BITEXACT_NOTE.md`. Until then it ships like 0031: opt-in, regression-free default.
|
||||
- Extend the test-backend-ops block 0031 added (the `S_v==128` shapes at lines after :9398) so the tc path is exercised at C=64 and C=32 in CI.
|
||||
- New per-path md5 acknowledged in the dispatch comment (tc-md5 ≠ serial-chunked-md5 ≠ sequential-md5; all benign, KL-validated).
|
||||
|
||||
---
|
||||
|
||||
## (5) Top 3 risks that could make it NOT beat sequential + kill-criteria
|
||||
|
||||
**Risk 1 - Register pressure forces 1 block/SM (the swing factor).** The ~50 working-regs/thread estimate is optimistic; the A⁻¹ blocked solve (in-register `16×16` diag inverse), the accumulator→B restage transpose, and the O+state transient accumulators can push working regs to 80-120. At 256 threads, >128 regs/thread → >32K regs/block → **silently 1 block/SM regardless of the 44KB shared headroom**, and local-memory spills push BW back. *Mitigation ladder:* (i) 12 warps (dilute state 32→21 regs/thread); (ii) `__launch_bounds__(256,2)`; (iii) smaller `DV_TILE`. **Kill criterion:** if after the full ladder the M8 occupancy build still spills to local OR stays 1 block/SM, **and** the CONFIG-A BW-max 1-block path (C=64, dv64, 96KB, cp.async, 4× state-BW cut) **also** fails to beat sequential S_PP at n_seqs≥2 in the A/B bench → the occupancy lever is dead; keep 0031 serial-chunked behind `GDN_CHUNK_TC=0`, record rejected in README s5.
|
||||
|
||||
**Risk 2 - Precision: tf32 (and even all-3xtf32) misses the KL gate in the weak-decay/aligned-keys κ(A) corner.** The inverse amplifies error; κ(A) is data-dependent and grows where keys align and decay is weak. **Detected cheaply at M1** (microbench measures κ(A) + per-rung NMSE on the adversarial case *before* the kernel exists). **Kill criterion:** if at M1 the **top of the ladder (all-3xtf32 + f32 diagonal)** cannot reach f32-grade on `g∈[-20,-1e-4]`, OR at M4+ `KLD(tc‖f16) ≤ KLD(seq‖f16)` fails on that op case at the top rung → the tensor-core solve is not numerically viable as a default; fall to ladder rung (6) (direct back-subst 3xtf32); if that also misses, abandon the tc solve and keep 0031 serial. **Fail-fast:** M1 gates this before any multi-week kernel commitment.
|
||||
|
||||
**Risk 3 - Grid starvation at n_seqs=1 is structural (H=32 < the ~96 blocks needed for 2 blk/SM × 48 SM).** Only `DV_TILE=32` (4 slabs) fully saturates a single stream, and it pays ~1.5-2× redundant-A flops (A/A⁻¹/gates recomputed per slab) plus the per-chunk restage. **Kill criterion:** if the M8 bench shows single-stream (n_seqs=1) S_PP is slower than sequential even at full saturation (dv32×4) due to redundant-A + restage overhead, **and** the batched regime (n_seqs≥2) gain also fails to materialize → the lever only helps a regime the target workload doesn't hit → keep default-OFF, ship as opt-in experiment only, record. (If n_seqs≥2 *does* win, ship enabled for the serving regime and gate single-stream back to sequential via `GDN_CHUNK_MIN` + an n_seqs check - a partial, honest win.)
|
||||
|
||||
**Overarching kill gate:** the disposition is the bench, not the theory. The kernel flips to default-on only when it beats the tuned sequential recurrence at the serving regime AND clears the KL + adversarial gates. Any milestone that regresses test-backend-ops or md5-stability halts the stack until fixed; M1 and M0 are the cheap fail-fast exits before the expensive kernel work.
|
||||
362
backend/cpp/llama-cpp-localai-paged/docs/TENSORCORE_GDN_SCOPE.md
Normal file
362
backend/cpp/llama-cpp-localai-paged/docs/TENSORCORE_GDN_SCOPE.md
Normal file
@@ -0,0 +1,362 @@
|
||||
# TENSORCORE_GDN_SCOPE - tensor-core chunked gated-DeltaNet prefill (design only)
|
||||
|
||||
**Status: DESIGN + SCOPE ONLY. No kernel written, no GPU run, no PTX in this pass.**
|
||||
This scopes the follow-up recorded by patch 0031 and README section 5: a
|
||||
tensor-core (`mma`) chunked gated-DeltaNet (GDN) prefill kernel - the path that
|
||||
would actually *beat* the tuned sequential scan and close the GDN prefill bucket
|
||||
toward vLLM. vLLM's chunked GDN scan was measured ~2.5x cheaper in the prefill
|
||||
ground-truth precisely because it pushes the intra-chunk products through
|
||||
tensor-core matmuls; patch 0031 proved the chunking math but, with serial
|
||||
per-thread reductions at the GB10-forced `C=16`, came out ~22% *slower* than the
|
||||
sequential recurrence. This document scopes replacing those reductions with
|
||||
`mma.sync` matmuls and lifting the occupancy ceiling.
|
||||
|
||||
> **Read patch 0031 + README section 5 first.** The bounded/stable de-gating form
|
||||
> (pairwise decays `d <= 1`, `gamma <= 1`), the per-path bit-exact precedent, and
|
||||
> the honest negative ("C=16 all-shared -> 1 block/SM -> serial reductions -> 22%
|
||||
> slower, grid-starved at low n_seqs") are the starting point. This doc does not
|
||||
> re-derive the algebra; it maps it onto tensor cores.
|
||||
|
||||
> **Regime note (the mechanism, read this).** The sequential scan is
|
||||
> **bandwidth-bound**: it re-streams the entire `128x128` f32 state (64KB) once
|
||||
> *per token*. README section 5 already records it runs at ~84.7% of GB10 peak BW
|
||||
> (decode) and the recurrence is a llama *win* vs vLLM's BW. So a tensor-core
|
||||
> kernel does **not** win by doing the same work faster - it wins by **changing
|
||||
> the work**: chunking by `C` reads/writes the state `n_tokens/C` times instead of
|
||||
> `n_tokens` (a ~`C`x cut in state traffic, the dominant prefill GDN cost), and the
|
||||
> price is `O(C^2)` extra intra-chunk dot-products per chunk. The naive 0031 paid
|
||||
> that price in serial f32 reductions, which cost *more* than the BW it saved -
|
||||
> hence 22% slower. **Tensor cores make the added intra-chunk flops nearly free,
|
||||
> so the BW saving becomes a net win.** That is exactly why vLLM's chunked scan is
|
||||
> 2.5x cheaper. The whole lever rests on this trade; if a GPU re-profile shows
|
||||
> prefill GDN is *not* state-BW-bound, stop and re-scope (Phase 0 in section 6 below).
|
||||
|
||||
---
|
||||
|
||||
## 1. GB10 tensor-core reality (sm_121a) - confirmed, not assumed
|
||||
|
||||
GB10 / DGX Spark reports **compute capability 12.1 (sm_121)**, CUDA 13 (README
|
||||
section "Hardware: GB10 / DGX Spark (CUDA 13, sm_121)"). sm_121a is **consumer
|
||||
Blackwell** (the SM12x family, same tensor-core programming model as RTX 50 /
|
||||
sm_120), **not** data-center Blackwell (sm_100a / GB200). This distinction is the
|
||||
single most important input to the design and is confirmed from sources, not
|
||||
assumed:
|
||||
|
||||
- **No `wgmma`.** Warp-group MMA is Hopper (sm_90a) only; targeting SM12x yields
|
||||
`ptxas error: Instruction 'wgmma.fence' not supported on .target 'sm_120'`.
|
||||
Do **not** design around Hopper-style warp-group MMA.
|
||||
- **No `tcgen05` / no TMEM.** SM12x lacks the Tensor Memory hardware entirely, so
|
||||
the autonomous 5th-gen tensor-core path (`tcgen05.mma`, the sm_100a data-center
|
||||
instruction) is unavailable. This is the same wall that makes vLLM/CUTLASS fall
|
||||
back to Marlin and gate FP4 to sm_100a on GB10 (tracked in CUTLASS #2800/#2947,
|
||||
vLLM #43906). We cannot use it either.
|
||||
- **What sm_121a DOES have: extended `mma.sync`.** The Ampere/Ada warp-level
|
||||
`mma.sync` family, extended with the Blackwell numeric formats (FP8/FP6/FP4).
|
||||
"Consumer Blackwell put new data types on top of the oldest programming model."
|
||||
For our operands (q/k/v/state are f32 in the op, see below) the usable tiles are
|
||||
the standard warp-level ones:
|
||||
- **bf16/f16 inputs, f32 accumulate:** `mma.sync.aligned.m16n8k16` (and
|
||||
`m16n8k8`). 7-bit (bf16) / 10-bit (f16) input mantissa.
|
||||
- **tf32 inputs, f32 accumulate:** `m16n8k8` / `m16n8k4`. 10-bit input mantissa -
|
||||
the **highest-precision tensor-core option** on this part, and the one this
|
||||
design defaults to (the GDN is decay-sensitive; see section 4).
|
||||
- FP8 (`m16n8k32`) / FP4 (`m16n8k64.kind::mxf4nvf4`, block-scaled) compile on
|
||||
sm_121a but are **out of scope** here - the GDN q/k/v/state are not 4/8-bit.
|
||||
- **`cp.async` is available** (Ampere+), so global->shared double-buffering of the
|
||||
K/Q chunk tiles is on the table for the occupancy phase. There is **no TMA** on
|
||||
SM12x; staging is plain `cp.async`, not `cp.async.bulk`.
|
||||
|
||||
**Reuse, do not hand-roll PTX.** ggml already ships a warp-level MMA tile
|
||||
abstraction at `ggml/src/ggml-cuda/mma.cuh` (the `tile<M,N,T>` fragments +
|
||||
`mma()` used by the FlashAttention-mma and MMQ kernels), and it already routes
|
||||
through `turing_mma_available(cc)` / `ampere_mma_available(cc)` - i.e. it is
|
||||
sm_121-correct today. Build the GDN matmuls on that API (bf16/half/tf32 fragments,
|
||||
f32 accumulators), not on raw `asm volatile("mma.sync...")`. This de-risks the
|
||||
kernel and keeps it consistent with the backend's other tensor-core paths.
|
||||
|
||||
**Bottom line for the design:** the kernel is a **warp-synchronous `mma.sync`**
|
||||
kernel (Ampere-class programming model with Blackwell silicon), *not* a
|
||||
warp-group / TMA / tcgen05 kernel. Every "wgmma"/"tcgen05" idea from FLA's
|
||||
sm_90/sm_100 kernels must be down-translated to `mma.sync` + `cp.async`. Patch
|
||||
0031's and README's shorthand "mma/wgmma" should be read as **mma only** on this
|
||||
part.
|
||||
|
||||
---
|
||||
|
||||
## 2. Mapping the chunked GDN matmuls onto `mma.sync`
|
||||
|
||||
The chunked gated-delta-rule (patch 0031 header) has seven dot-product families.
|
||||
Six are plain matmuls and map cleanly to `mma`; the seventh (the A-inverse) is a
|
||||
unit-lower-triangular solve and is the one subtle case. Notation: `C` = chunk
|
||||
length, `dk = dv = S_v = 128` (GDN head dim), per `(head, seq)` block.
|
||||
|
||||
| # | Product (0031 step) | Shape | mma form | Notes |
|
||||
|---|---|---|---|---|
|
||||
| 1 | `KK[t,t'] = k_t . k_t'` (for `A`) | `C x C` over `k=dk=128` | `(C x dk) x (dk x C)` | Gram matrix; only strict-lower triangle used. Decay `d(t',t)` + `beta_t` applied **after** mma in f32. |
|
||||
| 2 | `QK[t,t'] = q_t . k_t'` (for `P`/`O`) | `C x C` over `k=dk` | `(C x dk) x (dk x C)` | Lower triangle (`t' <= t`); decay applied after in f32. |
|
||||
| 3 | `KS[t,j] = (S0^T k_t)[j]` | `C x dv` over `k=dk` | `(C x dk) x (dk x dv)` | `S0` is the chunk-entry state (stationary operand). Feeds RHS of the solve. |
|
||||
| 4 | `QS[t,j] = (S0^T q_t)[j]` | `C x dv` over `k=dk` | `(C x dk) x (dk x dv)` | The `gamma_t` cross-chunk term of `O`. |
|
||||
| 5 | `O += P . U` | `C x dv` over `k=C` | `(C x C) x (C x dv)` | `P` (decay-masked `QK`) times the solved `U`. |
|
||||
| 6 | `S_C += K^T (D .* U)` | `dk x dv` over `k=C` | `(dk x C) x (C x dv)` | The state update; `D` = `diag(d(t,last))` applied to `U` in f32 first. |
|
||||
| 7 | `U = A^{-1} RHS` | `C x C` solve, `C x dv` RHS | blocked fwd-subst (see below) | The only non-GEMM. |
|
||||
|
||||
**Critical precision invariant (preserve the bounded de-gating).** Every decay
|
||||
(`gamma_t`, `d(t',t) = exp(cs_t - cs_t')`) and every `beta_t` stays in **f32** and
|
||||
is applied as an elementwise scale **before/after** the mma, never inside it. The
|
||||
mma only ever multiplies the raw, unweighted dot-products (`k.k`, `q.k`,
|
||||
`S0^T k`, `S0^T q`, `P.U`, `K^T U`). This keeps the strong-decay underflow-to-zero
|
||||
behaviour (the adversarial `g in [-20, -1e-4]` op test) exactly as 0031 has it -
|
||||
the numerically delicate part never touches reduced precision. This is the
|
||||
discipline that makes a tf32/bf16 mma kernel safe for a decay-sensitive op.
|
||||
|
||||
### The A-inverse (step 7) - it CAN be tensor-core'd
|
||||
|
||||
`A = I + N`, `N = tril(beta_t d(t',t) k_t.k_t', -1)` is **strictly lower
|
||||
triangular**, hence **nilpotent** (`N^C = 0`). Two routes, both better than 0031's
|
||||
serial per-thread forward substitution:
|
||||
|
||||
- **Blocked forward substitution (RECOMMENDED, this is the FLA "UT transform").**
|
||||
Partition `C` into sub-blocks of `b` (e.g. `b = 16`, one mma `m`-tile). Invert
|
||||
each `b x b` diagonal block in registers (it is unit-lower-triangular `b x b`,
|
||||
cheap: a short serial solve or the finite Neumann series on a `b`-nilpotent,
|
||||
`<= b-1` terms), then propagate to the off-diagonal sub-blocks with **mma**
|
||||
(the inter-block coupling `U_i -= A_ij U_j` is exactly a `(b x b) x (b x dv)`
|
||||
matmul). For `C = 64, b = 16` that is 4 tiny in-register diagonal solves + a
|
||||
triangular sweep of mma updates - the bulk of the solve is on tensor cores, only
|
||||
the `16 x 16` diagonals stay scalar.
|
||||
- **Neumann/Newton-Schulz inverse (fallback).** `A^{-1} = I - N + N^2 - ... ` is
|
||||
finite (`C` terms) but `O(C)` mma's of `C x C`; Newton-Schulz
|
||||
(`X <- X(2I - AX)`) converges in `~log2(C)` steps for the nilpotent part. Cheap
|
||||
in flops, but more numerically exposed than blocked subst for adversarial decays.
|
||||
Keep as a fallback if blocked subst's register pressure hurts occupancy.
|
||||
|
||||
Verdict: **blocked forward substitution** - it keeps the sensitive diagonal solve
|
||||
exact-in-registers and tensor-core's only the well-conditioned off-diagonal
|
||||
coupling. This is precisely the structure FLA/vLLM use, down-translated to `mma`.
|
||||
|
||||
### Tile/chunk design that fits the 99KB shared budget AND feeds the mma
|
||||
|
||||
The 0031 failure was a layout failure: the all-shared `128x128` f32 state (64KB)
|
||||
crowded out everything and forced `C=16`. The fix is to get the state **out of the
|
||||
bulk shared footprint**. Two complementary mechanisms:
|
||||
|
||||
1. **State register-resident across the chunk loop (the key move).** `S` only
|
||||
participates at chunk boundaries (steps 3,4 at entry; step 6 at exit). Keep it
|
||||
as **mma accumulator fragments distributed across the block's warps** (each
|
||||
warp owns a `dk x dv` sub-tile of `S`), persisting in registers across the
|
||||
sequential chunk loop. Steps 3/4 read `S` as the stationary mma operand; step 6
|
||||
accumulates into it. This **frees the entire 64KB** - shared then holds only the
|
||||
per-chunk K/Q/U/A tiles. (The chunked algorithm's whole point is that the heavy
|
||||
work is intra-chunk and state-free, so the state need not be in shared.)
|
||||
2. **dv-slab tiling for occupancy (the secondary move).** If register pressure
|
||||
from a register-resident `128x128` state caps the kernel at 1 block/SM (likely -
|
||||
that is a lot of accumulator registers), split the `dv=128` value dimension
|
||||
into slabs (`dv_tile in {64, 32}`). Each thread block owns a `128 x dv_tile`
|
||||
state slab. `A` and the solve depend only on `K` (not `dv`), so they are
|
||||
computed once and the `C x C` `A^{-1}` is **broadcast/recomputed** per slab
|
||||
(cheap once it is mma'd). This shrinks per-block register/shared pressure and is
|
||||
the lever for >1 block/SM.
|
||||
|
||||
**Shared budget at `C = 64` (state register-resident), staging K/Q as bf16/tf32:**
|
||||
|
||||
| Buffer | Elems | Bytes |
|
||||
|---|---|---|
|
||||
| `Kc` (chunk K) | `C x dk = 64x128` | 16KB (bf16) |
|
||||
| `Qc` (chunk Q) | `C x dk` | 16KB (bf16) |
|
||||
| `Uc` (solved U) | `C x dv = 64x128` | 32KB (f32 for the solve) / 16KB (bf16 for the P.U + K^T U mma) |
|
||||
| `A`/`P` scratch | `C x C = 64x64` | 16KB (f32) |
|
||||
| gates `cs/gam/beta` | `~3C` | <1KB |
|
||||
| **state** | (registers) | **0KB shared** |
|
||||
| **Total** | | **~64-80KB** (under the 99KB opt-in) |
|
||||
|
||||
So **`C = 64` fits the 99KB budget once the state is register-resident** - 4x the
|
||||
0031 chunk, and a natural multiple of the `m16n8k*` tiles. For >1 block/SM, drop
|
||||
to `C = 32` (`8 + 8 + 16 + 4 = 36KB` with f32 U, 28KB with bf16-staged U; two blocks fit
|
||||
under the ~49.5KB/block needed) and/or dv-slab the state. **Recommended default: `C = 64`,
|
||||
tf32 mma, state register-resident** (maximize the BW-saving `C` first; chase the
|
||||
second block/SM only if the bench says occupancy, not BW, is the residual).
|
||||
|
||||
---
|
||||
|
||||
## 3. Occupancy plan (break the 1 block/SM ceiling)
|
||||
|
||||
0031 is pinned to 1 block/SM by the 64KB shared state. The plan, in priority order:
|
||||
|
||||
1. **Free the 64KB: state register-resident** (section 2). This alone may not give
|
||||
2 blocks/SM (the register-distributed `128x128` f32 accumulator is heavy), but
|
||||
it is the precondition for everything and it lets `C` grow to 64 - which is the
|
||||
dominant win (`C`x less state BW). Even at 1 block/SM, `C=64` + mma should flip
|
||||
the sign vs 0031.
|
||||
2. **dv-slab the state** (`dv_tile = 64` then `32`): halve/quarter the per-block
|
||||
accumulator-register and shared pressure to admit a 2nd resident block, at the
|
||||
cost of recomputing the `C x C` `A^{-1}` per slab (cheap on mma). This is the
|
||||
primary occupancy lever once (1) is in.
|
||||
3. **`cp.async` double-buffer the K/Q chunk loads**: overlap the next chunk's
|
||||
global->shared staging with the current chunk's mma, hiding LPDDR5x latency that
|
||||
1-2 blocks/SM cannot. No TMA on sm_121, so plain `cp.async` (`commit_group` /
|
||||
`wait_group`), Ampere-style.
|
||||
4. **Grid starvation at low `n_seqs`** (0031's other failure: grid is `H x n_seqs`,
|
||||
~few hundred blocks): the larger `C` reduces per-block serial chunk steps, and
|
||||
dv-slabbing **multiplies the grid by the slab count** (`H x n_seqs x n_slabs`),
|
||||
directly mitigating the low-`n_seqs` starvation that hurt 0031.
|
||||
|
||||
Honest occupancy caveat: a register-resident `128x128` f32 state is a large
|
||||
register commitment; the realistic outcome is **1-2 blocks/SM**, not high
|
||||
occupancy. The design leans on **mma throughput + cp.async latency hiding + the
|
||||
`C`x BW cut**, not on many resident blocks, to win. If profiling shows the kernel
|
||||
register-capped at 1 block/SM *and* tensor-core-active-% still low, that is the
|
||||
signal to dv-slab harder (smaller `dv_tile`) or accept the achieved win.
|
||||
|
||||
---
|
||||
|
||||
## 4. Bit-exactness + precision risk
|
||||
|
||||
This is a **NEW FP path on top of a NEW FP path**. 0031 is already not byte-equal
|
||||
to the sequential recurrence (different reduction order; README s5 records it as a
|
||||
benign per-path result). Adding tf32/bf16 mma is a *further* reduced-precision
|
||||
step. Gate it exactly like the backend's other new-FP-path precedents
|
||||
(`PAGED_BITEXACT_NOTE.md`, the paged-MoE `8cb0ce23`, the PREFILL_GEMM scope):
|
||||
|
||||
- **Greedy md5 stability** on the standard prompt (README s5 harness) - to catch
|
||||
*unexpected* divergence on the non-prefill paths (decode must stay on the tuned
|
||||
sequential kernel and byte-match its reference; this lever is prefill-only and
|
||||
opt-in, so the default path is untouched).
|
||||
- **`test-backend-ops GATED_DELTA_NET`** at the 0031 prefill shapes (the
|
||||
`S_v=128` exact-multiple / tail / multi-seq / GQA / permuted cases), CUDA0 vs the
|
||||
CPU f32 oracle. **Honest expectation: bf16 mma will likely NOT clear the 1e-7
|
||||
NMSE gate; tf32 is borderline.** So the binding gate is the **KL-gate**, not
|
||||
strict NMSE: require `KLD(tensorcore || f16) <= KLD(sequential || f16)` and PPL
|
||||
within the established band, recorded in `PAGED_BITEXACT_NOTE.md`. tf32 (10-bit
|
||||
mantissa, f32 accumulate) is the precision default precisely to give the KL-gate
|
||||
the best chance.
|
||||
- **Precision fallback ladder if tf32 fails the KL-gate:** (i) **3xtf32**
|
||||
emulation (split each f32 operand into a big + small tf32 pair, 3 mma's, recombine - the
|
||||
CUTLASS fp32-emulation trick; near-f32 accuracy at 3x the mma cost, still far
|
||||
cheaper than serial f32 loops and still a likely net win given the `C`x BW cut);
|
||||
(ii) keep the **decay-coupled and state-boundary products in 3xtf32/f32** while
|
||||
the well-conditioned intra-chunk Gram products use plain tf32 (mixed precision by
|
||||
sensitivity). Do **not** fall back to bf16 for the decay-sensitive terms.
|
||||
- **Preserve the bounded de-gating (section 2):** decays/`gamma`/`beta` stay f32,
|
||||
applied outside the mma. Re-run the adversarial `g in [-20, -1e-4]` op case
|
||||
specifically; a tensor-core kernel that moved a decay inside the mma would be a
|
||||
silent precision regression even if the benign cases pass.
|
||||
|
||||
The likely-favourable framing (as in PREFILL_GEMM): keeping the heavy reductions
|
||||
in f32-accumulate tensor cores is *more* precise than a naive f32 serial loop only
|
||||
if the inputs stay full-width; here inputs are down-cast (tf32/bf16), so this is a
|
||||
genuine precision *trade*, not a free win - hence the KL-gate is mandatory and the
|
||||
3xtf32 ladder exists. Treat NMSE-gate-pass as a bonus, KL-gate-pass as the bar.
|
||||
|
||||
---
|
||||
|
||||
## 5. Honest effort + expected gain
|
||||
|
||||
**This is a multi-week GPU kernel project, not a routing change.** Unlike the
|
||||
PREFILL_GEMM dense lever (a dispatch flip onto an existing vendor kernel), there is
|
||||
no vendor chunked-GDN kernel to route to on sm_121 (CUTLASS/FLA gate the good
|
||||
paths to sm_100a; that is the whole reason vLLM falls back to Marlin on GB10). We
|
||||
must write the `mma` kernel ourselves. Realistic estimate: **4-8 weeks** of
|
||||
focused kernel work, high risk, with non-trivial probability the occupancy/register
|
||||
wall caps the win.
|
||||
|
||||
**Expected gain (mechanism-grounded, see the regime note at the top):** the lever attacks
|
||||
the state-BW that dominates sequential GDN prefill by `~C`x (chunking) while
|
||||
tensor cores absorb the `O(C^2)` intra-chunk flops. Fully realized, it targets
|
||||
vLLM's ~2.5x-cheaper chunked GDN prefill bucket = the ~17% prefill lever the
|
||||
ground-truth attributes to GDN. It should also help the serial-SSM portion of the
|
||||
**decode** residual (README names the irreducible "serial-SSM host loop" as part
|
||||
of the decode floor; a chunked state-update reduces the per-step state traffic
|
||||
there too, though decode `n_tokens` is small so the prefill regime is where it
|
||||
pays). **Honest ceiling:** sm_121 has no wgmma/tcgen05, so we cannot match a
|
||||
hypothetical sm_100a FLA kernel's throughput - the `mma.sync` path is the Ampere-
|
||||
class programming model on Blackwell silicon. But `mma` over serial f32 reductions
|
||||
is an order-of-magnitude flop-rate jump, which is more than enough to flip 0031's
|
||||
-22% into a win and recover most of the GDN prefill bucket. Do not promise full
|
||||
parity with vLLM's sm_100-class kernels; promise "beats the sequential scan and
|
||||
closes most of the GDN prefill gap."
|
||||
|
||||
**Risk register:**
|
||||
- Register-resident `128x128` state may cap occupancy at 1 block/SM (section 3) -
|
||||
mitigated by dv-slabbing, but slabbing recomputes `A^{-1}` per slab.
|
||||
- tf32 may miss the KL-gate -> 3xtf32 ladder (3x mma cost) -> thinner margin.
|
||||
- The win is contingent on prefill GDN being state-BW-bound (regime note); a GPU
|
||||
re-profile that says otherwise kills the lever (Phase 0, section 6).
|
||||
- Blocked-forward-subst register pressure trades against state-register pressure;
|
||||
both compete for the same budget on a 1-block/SM kernel.
|
||||
|
||||
---
|
||||
|
||||
## 6. Phased build plan
|
||||
|
||||
Smallest tensor-core proof-of-concept first, bit-exact/KL-gate + A/B bench at every
|
||||
phase, per `.agents/vllm-parity-methodology.md` (one lever at a time, record
|
||||
rejected/flat variants, ground-truth both engines).
|
||||
|
||||
### Phase 0 - re-confirm the regime on GPU (NO code)
|
||||
nsys a **prefill-only** window (`llama-batched-bench -npp <large> -ntg 0/1`,
|
||||
exclude graph capture) on q36-27b-nvfp4 + q36-35b-a3b, at the backend pin, with
|
||||
`GDN_CHUNK_MIN` set so 0031 runs. Confirm (a) the GDN prefill bucket is
|
||||
state-BW-bound (state memcpy/recurrence dominates, tensor-core-active-% low), and
|
||||
(b) it is ~17% of the prefill step / ~2.5x vLLM's. **If prefill GDN is not
|
||||
state-BW-bound, stop and re-scope** - the entire mechanism (the regime note at the top) fails.
|
||||
|
||||
### Phase 1 - PoC: tensor-core just TWO products, same occupancy
|
||||
Keep 0031's `C=16` all-shared layout and 1 block/SM. Replace **only** the two
|
||||
cleanest `C x C` Gram products - step 1 (`KK` for `A`) and step 2 (`QK` for `P`) -
|
||||
with `ggml/src/ggml-cuda/mma.cuh` tf32 tiles (decays still applied in f32 after).
|
||||
Leave the solve, the `S0` products, and the state update serial. This is the
|
||||
minimal "do tensor cores help here at all" probe at fixed occupancy.
|
||||
- Gate: greedy md5 stable; `test-backend-ops GATED_DELTA_NET` prefill shapes via
|
||||
the KL-gate (NMSE if it passes).
|
||||
- Bench: `llama-batched-bench` S_PP, A/B vs sequential and vs 0031-serial, same
|
||||
harness. **If even this does not move S_PP, the head-dim/occupancy is the wall,
|
||||
not the reductions - learn it cheaply before the big build.**
|
||||
|
||||
### Phase 2 - full intra-chunk tensor-core + register-resident state + C=64
|
||||
State register-resident (free the 64KB), `C=64`, tf32 mma for all of steps 1-6,
|
||||
blocked-forward-subst `A^{-1}` (step 7) with mma off-diagonal coupling +
|
||||
in-register `16x16` diagonal solves. Decays/gamma/beta stay f32 throughout.
|
||||
- Gate: as Phase 1, plus the adversarial `g in [-20,-1e-4]` op case explicitly.
|
||||
If tf32 misses the KL-gate, climb the 3xtf32 ladder (section 4).
|
||||
- Bench: S_PP A/B vs sequential, sweep prefill length and `npl`; record the
|
||||
`C in {32,64,128}` sweep and any rejected `C`.
|
||||
|
||||
### Phase 3 - occupancy + latency hiding
|
||||
dv-slab the state (`dv_tile in {64,32}`) for a 2nd resident block and to multiply
|
||||
the grid (fix low-`n_seqs` starvation); `cp.async` double-buffer the K/Q chunk
|
||||
loads. Tune `C`, `dv_tile`, warp count per the bench.
|
||||
- Gate: unchanged (the FP path does not change; this is scheduling).
|
||||
- Bench: final S_PP vs sequential + indicative % of vLLM prefill; name the
|
||||
residual floor honestly (register-cap / sm_121-has-no-tcgen05).
|
||||
|
||||
### Disposition
|
||||
Like 0031, ship **opt-in default-OFF first** (extend the existing `GDN_CHUNK_MIN`
|
||||
gate, add a `GDN_CHUNK_TC` selector if the serial path is kept as fallback). Flip
|
||||
the default only when a separately-built A/B proves S_PP beats the sequential scan
|
||||
*and* the KL-gate holds, recorded in README section 5 + `PAGED_BITEXACT_NOTE.md`.
|
||||
If a phase comes back flat-or-slower, record it as a rejected lever with the reason
|
||||
(the most valuable output if it fails) and keep 0031's serial path as the shipped
|
||||
prefill kernel.
|
||||
|
||||
---
|
||||
|
||||
## 7. Summary
|
||||
|
||||
| Aspect | Decision |
|
||||
|---|---|
|
||||
| Tensor-core ISA | **`mma.sync` only** (sm_121a: no wgmma, no tcgen05/TMEM - confirmed) |
|
||||
| Building block | reuse `ggml/src/ggml-cuda/mma.cuh` tiles, not raw PTX |
|
||||
| Precision default | **tf32** inputs / f32 accumulate; **3xtf32** ladder if KL-gate misses; bf16 only for well-conditioned Gram terms |
|
||||
| Decay handling | gamma/d/beta stay **f32**, applied outside the mma (preserve bounded de-gating) |
|
||||
| A-inverse | blocked forward substitution (FLA UT-transform): in-register diagonal solves + mma off-diagonal |
|
||||
| Chunk size | **C=64** default (4x 0031), C=32 for 2 blocks/SM |
|
||||
| State | **register-resident** (frees the 64KB that forced C=16); dv-slab for occupancy |
|
||||
| Shared budget | ~64-80KB at C=64 state-register-resident (under the 99KB opt-in) |
|
||||
| Mechanism / why it wins | chunking cuts state-BW by ~Cx; mma absorbs the O(C^2) intra-chunk flops the serial 0031 could not |
|
||||
| Bit-exact | NEW per-path; **KL-gate** binding (NMSE likely fails at reduced precision), greedy md5 + adversarial-decay op case |
|
||||
| Effort | **multi-week (4-8 wk), high risk**; no vendor kernel to route to on sm_121 |
|
||||
| Expected gain | beats the sequential scan, closes most of the ~17% GDN prefill bucket toward vLLM's 2.5x; also helps the decode serial-SSM residual. NOT full sm_100-class parity. |
|
||||
| Phasing | P0 re-profile -> P1 two-product PoC -> P2 full intra-chunk + C=64 + reg-state -> P3 occupancy/cp.async; opt-in default-OFF until A/B-proven |
|
||||
|
||||
Decode is untouched (this is prefill-only, opt-in); the stock `llama-cpp` backend
|
||||
stays patch-free. This lever lives entirely in `llama-cpp-localai-paged`.
|
||||
@@ -0,0 +1,343 @@
|
||||
# Layer-2 upstream scope: native fused-GDN kernels for Metal / Vulkan / SYCL
|
||||
|
||||
Source-only analysis (no GPU, no build) of what it would take to give the
|
||||
gated-DeltaNet (GDN / SSM) decode fusions native kernels on the non-CUDA compute
|
||||
backends, so the patch-series decode win extends past CUDA-family hardware.
|
||||
|
||||
This doc is the GDN/SSM-fusion (benefit #1) detail. For the umbrella scope that
|
||||
also covers the paged KV block-table flash-attn read (benefit #2), the free
|
||||
host-side scheduler (benefit #3), the out-of-scope NVFP4 track (benefit #4) and a
|
||||
ROCm note - and the combined per-backend sequencing - see
|
||||
[`ACCELERATOR_PORTING_SCOPE.md`](ACCELERATOR_PORTING_SCOPE.md).
|
||||
|
||||
In our changeset (patches 0018-0030) these fusions ship with CUDA native kernels
|
||||
+ CPU reference kernels ONLY; patch 0030 force-gates them OFF on Metal / Vulkan /
|
||||
SYCL (a CPU-fallback fused op would regress via the device round-trip, and a
|
||||
backend that ran the plain op on the discriminated node would silently
|
||||
miscompute). "Layer 2" is the upstream work that adds the missing native kernels.
|
||||
|
||||
This doc was written against the ggml backend trees in
|
||||
`backend/cpp/llama-cpp-paged-dev` (upstream base #24732, one commit OLDER than the
|
||||
series pin `c299a92c` #25045, with only the two paged-KV patches applied - neither
|
||||
touches GDN/SSM). So every "kernel already exists" statement below is a
|
||||
conservative lower bound: the pin has at least these kernels.
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
## 0. Headline finding (correct a stale assumption first)
|
||||
|
||||
The series README (section 4c) says "the gated-DeltaNet op has no Vulkan kernel
|
||||
upstream, so the Qwen3.6 hybrid models assert / fall back and don't run there."
|
||||
**That is now stale.** All three backends already carry the BASE compute ops:
|
||||
|
||||
| op | Metal | Vulkan | SYCL |
|
||||
|------------------------|------------------------------------|------------------------------------------|---------------------------------|
|
||||
| GGML_OP_GATED_DELTA_NET| `kernel_gated_delta_net_impl` (f32, NSG 1/2/4) | `gated_delta_net.comp` (d16/32/64/128 x kda, shmem/cluster/nocluster variants) | `gated_delta_net.cpp` (`launch_gated_delta_net<KDA,keep_rs>`) |
|
||||
| GGML_OP_SSM_CONV | `kernel_ssm_conv_f32_f32` (+ `_4`, + batched) | `ssm_conv.comp` (+ APPLY_BIAS, APPLY_SILU specialization consts) | `ssm_conv.cpp` (`kernel_ssm_conv`) |
|
||||
| GGML_OP_SSM_SCAN | yes | `ssm_scan.comp` (mamba2) | `ssm_scan.cpp` (mamba2) |
|
||||
|
||||
Verified: Vulkan `gated_delta_net.comp` was last touched at the upstream base
|
||||
commit (#24732), not by any LocalAI patch. So the GDN COMPUTE op is present on
|
||||
Metal, Vulkan AND SYCL. The Qwen3.6 hybrids therefore DO run on all three today
|
||||
(via the upstream non-fused path that 0030 routes to). The Layer-2 value-add is
|
||||
the decode SPEEDUP from the fusions, NOT enabling the model to run at all.
|
||||
|
||||
Consequence: the GDN-compute op being "partly there" is true on every backend,
|
||||
not just Metal. What is still missing per backend is only the FUSION plumbing
|
||||
(in-place write-back target, the ids gather read, and the conv-update kernel) -
|
||||
a materially smaller scope than "port GDN from scratch."
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
## 1. Per-op semantics (the four fusions to port)
|
||||
|
||||
All four reuse an existing GGML_OP enum with extra `src[]` slots as a
|
||||
discriminator; none adds a new enum value. f32 throughout. The arithmetic core
|
||||
is IDENTICAL to the upstream non-fused op; only the read source and/or the write
|
||||
target are redirected. That single fact drives the whole bit-exactness story
|
||||
(section 3).
|
||||
|
||||
### OP A - `ggml_gated_delta_net_inplace` (patch 0018)
|
||||
- Enum `GGML_OP_GATED_DELTA_NET`, discriminated by a non-null `src[6]` =
|
||||
`state_dst` (a contiguous `[S_v*S_v*H, n_seqs]` view into the recurrent-state
|
||||
cache at `kv_head`). K == 1 only.
|
||||
- Semantics: run the standard GDN recurrence, but write the FINAL recurrent state
|
||||
directly into `state_dst` instead of appending it to the op output. The op
|
||||
output then carries only the attention scores. Removes the per-layer per-step
|
||||
~full-state D2D copy-back (the 0018 win).
|
||||
- Race (in-place read == write): each (seq, head) block owns a disjoint cache
|
||||
slot. The kernel loads the whole prior state `s0` into per-thread registers
|
||||
(`s_shard` on CUDA, `ls[NSG]` on Metal, the column shard on Vulkan/SYCL)
|
||||
BEFORE the in-place state write, so reading and writing the same slot is safe.
|
||||
|
||||
### OP B - `ggml_gated_delta_net_inplace_ids` (patch 0019)
|
||||
- Adds `src[5]` = FULL state cache `[S_v,S_v,H,n_rs_slots]`, `src[7]` = `ids`
|
||||
(I32, per-seq source slot == the recurrent-state `s_copy`), `op_param[1]` =
|
||||
`rs_head` (destination base slot). Still has the OP-A `src[6]` in-place target.
|
||||
- Semantics: read each sequence's prior state directly from `cache[ids[seq]]`
|
||||
(mirrors `ggml_ssm_scan`'s ids source), eliminating the `ggml_get_rows`
|
||||
materialization. Combined with OP A the op now reads AND writes the cache in
|
||||
place.
|
||||
- Race: identity sequences (`ids[s] == rs_head + s`, the steady AR-decode case)
|
||||
read s0 in place from the destination slot (safe via the register snapshot
|
||||
above). Non-identity sequences (reorder / rs_zero remap) are first copied by a
|
||||
TINY separate gather kernel (`gdn_gather_nonident`, one block/seq) into a
|
||||
DISJOINT scratch that the recurrence then reads, so the recurrence never reads
|
||||
a slot another block is writing. Value-preserving memcpy -> bit-identical to
|
||||
the get_rows path.
|
||||
|
||||
### OP C - `ggml_ssm_conv_update_inplace` (patch 0021)
|
||||
- Enum `GGML_OP_SSM_CONV`, discriminated by a non-null `src[3]` =
|
||||
`conv_state_dst` (`[(K-1)*channels, n_seqs]` in-place ring view).
|
||||
`src[0]` = conv_states `[K-1, channels, n_seqs]`, `src[1]` = conv_kernel
|
||||
`[K, channels]`, `src[2]` = x_cur `[channels, 1, n_seqs]`. `op_param[0]` =
|
||||
fuse_silu.
|
||||
- Semantics (decode, n_seq_tokens == 1): per (channel, sequence) assemble the
|
||||
width-K conv window in registers from the K-1 cached taps + the current token,
|
||||
compute the depthwise conv with the SAME ascending-tap FMA order as plain
|
||||
`ssm_conv` (`tap0*w0 + ... + xc*w_{K-1}`, then `+0.0f` to match plain conv's
|
||||
`sumf += b` with b==0), optionally fold SiLU, write the conv output
|
||||
`[channels,1,n_seqs]`, and write the 1-token-shifted ring state back in place.
|
||||
Replaces the 5-op decode conv chain (transpose + concat + conv + silu + ring
|
||||
cpy).
|
||||
- Race: read source (gathered taps) and write target (cache view) are disjoint
|
||||
buffers -> race-free by construction, no ids/identity logic.
|
||||
|
||||
### OP D - `ggml_ssm_conv_update_inplace_ids` (patch 0028)
|
||||
- Same enum, discriminated by a non-null `src[4]` = `ids`; `src[0]` becomes the
|
||||
FULL conv cache `[K-1, channels, n_cells]`; `op_param[1]` = rs_head.
|
||||
- Semantics: gather-free conv-update - read each sequence's prior taps from
|
||||
`cache[ids[s]]` in-kernel (no get_rows). Identity reads in place from
|
||||
`conv_state_dst`; non-identity gathered into a disjoint scratch first by a tiny
|
||||
`ssm_conv_gather_nonident` kernel. The window is copied to a local array
|
||||
BEFORE the (possibly aliasing) ring write so the identity read==write slot is
|
||||
correct. Bit-identical to get_rows + OP C.
|
||||
|
||||
### Net new kernels vs reuse, per op
|
||||
- OP A: NOT a new compute kernel - a write-target redirection of the EXISTING
|
||||
GDN kernel + 1 buffer binding + a supports_op/op-handler branch.
|
||||
- OP B: the GDN kernel gains a per-seq read-base select (identity vs scratch) +
|
||||
1 ids binding + rs_head param + 1 tiny gather kernel.
|
||||
- OP C: a GENUINELY NEW kernel on each backend. The existing `ssm_conv` computes
|
||||
a windowed reduction over a PRE-concatenated input; it does not assemble the
|
||||
window from cached taps + the current token, fold silu, or write the shifted
|
||||
ring state. This is the largest net-new piece.
|
||||
- OP D: the OP-C kernel gains the read-base select + 1 ids binding + rs_head + 1
|
||||
tiny conv gather kernel.
|
||||
|
||||
The `ggml.h` / `ggml.c` builders, the CPU reference kernels, the model-graph
|
||||
emission (`delta-net-base.cpp`, qwen35*), and the `test-backend-ops` cases are
|
||||
SHARED and already done by patches 0018/0019/0021/0028. The only NEW per-backend
|
||||
work is the kernel(s) + the backend wiring.
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
## 2. Per-backend: authoring model, effort, gotchas, wiring
|
||||
|
||||
### 2.1 Metal (MSL)
|
||||
|
||||
Authoring model: `.metal` MSL source (`ggml-metal.metal`), function-constant
|
||||
specialization (e.g. `FC_GATED_DELTA_NET`), kernels templated on `NSG`; host
|
||||
glue split across `ggml-metal-ops.cpp` (`ggml_metal_op_*` encode), the pipeline
|
||||
lookup in `ggml-metal-device.cpp`/`.m`, the kargs struct in `ggml-metal-impl.h`,
|
||||
and `supports_op` in `ggml-metal-device.m`. Threadgroup model; Apple GPU
|
||||
simdgroup width is a FIXED 32, `simd_sum` for the per-column reduce.
|
||||
|
||||
Effort: MEDIUM. ~350-500 LOC. The GDN and plain-ssm_conv kernels already exist
|
||||
and are ergonomic to extend. OP A is a write-base redirect of the existing
|
||||
`kernel_gated_delta_net_impl` (its tail already does
|
||||
`dst_state = dst + attn_size + state_out_base; dst_state[is] = ls[j]` after
|
||||
loading `ls[]` into registers - just point `dst_state` at the `state_dst` buffer
|
||||
and add the binding). OP C is the one net-new MSL kernel (Metal has NO bias/silu
|
||||
ssm_conv variant today - only plain + `_4` + batched - so the silu-fold and ring
|
||||
write are both new). Host glue spans 3-4 files.
|
||||
|
||||
Gotchas:
|
||||
- In-place race: the existing kernel ALREADY snapshots the state column into
|
||||
`ls[NSG]` registers before writing, so OP A/B are safe with no barrier; OP C/D
|
||||
must mirror the `float window[K]` local-copy-before-write that CPU/CUDA use.
|
||||
- Discriminated SSM_CONV: `supports_op` for `GGML_OP_SSM_CONV` currently returns
|
||||
`has_simdgroup_reduction` with NO check of `src[3]`/`src[4]`; GDN returns
|
||||
`has_simdgroup_reduction && src[2]->ne[0] % 32 == 0` with NO check of
|
||||
`src[6]`/`src[7]`. Both must be tightened (accept the discriminated variant
|
||||
only once the kernel exists) AND `ggml_metal_op_ssm_conv` /
|
||||
`ggml_metal_op_gated_delta_net` must branch on the extra src to pick the kernel.
|
||||
- Bit-exactness: fixed 32-wide simdgroup makes this the SIMPLEST of the three -
|
||||
the fused variant only redirects addresses, so it is bit-identical to Metal's
|
||||
own non-fused path by construction (the conv per-channel FMA needs the exact
|
||||
ascending order + the `+0.0f`).
|
||||
- The kargs struct grows by the `state_dst` / `ids` / `rs_head` fields; a new
|
||||
pipeline name (or a function-constant branch) distinguishes the variants.
|
||||
|
||||
### 2.2 Vulkan (GLSL .comp -> SPIR-V)
|
||||
|
||||
Authoring model: GLSL `.comp` in `vulkan-shaders/`, compiled at build time by
|
||||
`vulkan-shaders-gen` into embedded SPIR-V byte arrays (`gated_delta_net_f32_data`
|
||||
etc.); pipeline creation in `ggml-vulkan.cpp` declares the binding count +
|
||||
push-constant size; a push-constant struct per op; host dispatch `ggml_vk_*`
|
||||
binds subbuffers; `supports_op` in the device support function. Subgroup size
|
||||
VARIES by vendor (NVIDIA 32, AMD 64, Intel 8/16/32).
|
||||
|
||||
Effort: HARDEST. ~450-650 LOC + the most build/host glue. Same kernel logic as
|
||||
Metal/SYCL, but every new shader or variant requires: the shaders-gen regen, a
|
||||
new `ggml_vk_create_pipeline` registration with an explicit binding count and
|
||||
push-constant size, a new/extended push-constant struct (add `rs_head`), and
|
||||
GROWING the descriptor binding set from the current 7 (`src[0..5]` + dst) to 8-9
|
||||
(`state_dst`, `ids`). The GDN host dispatch hardcodes a 6-src bind loop and the
|
||||
pipeline is created with `"main", 7, ...` - both must change.
|
||||
|
||||
Gotchas:
|
||||
- Subgroup variance interacts with the EXISTING variant matrix: the GDN comp
|
||||
already ships shmem / cluster / nocluster variants keyed on subgroup size and
|
||||
relies on `S_V % COLS_PER_WG == 0`. The OP-A/B read/write redirect must be
|
||||
applied across ALL of those variants, and re-validated per vendor.
|
||||
- In-place race: GLSL must read the full column shard into local registers before
|
||||
the in-place state write (same pattern); confirm the SPIR-V memory model is not relied on
|
||||
for cross-invocation ordering (it is not - blocks are disjoint per (seq,head)).
|
||||
OP C/D need the explicit window-to-local copy.
|
||||
- Discriminated SSM_CONV: `supports_op` returns `op->src[0]->type == F32` with NO
|
||||
discriminator check; GDN loops `src[0..5]` F32 with NO `src[6]`/`src[7]` check.
|
||||
Both must be tightened. This is the backend where the 0030 hazard is most
|
||||
concrete (a present plain-conv kernel + a permissive supports_op = silent
|
||||
miscompute) - Vulkan is the exact case 0030 was written for.
|
||||
- conv-update is per-channel (one invocation per channel) so it is
|
||||
subgroup-AGNOSTIC; only the GDN recurrence carries the subgroup-width burden.
|
||||
- Vulkan's `ssm_conv.comp` ALREADY has APPLY_SILU + APPLY_BIAS specialization
|
||||
constants, so the silu-fold half of OP C is partly precedented here (unlike
|
||||
Metal); the ring write-back + tap-window assembly are still new.
|
||||
|
||||
### 2.3 SYCL (single-source DPC++)
|
||||
|
||||
Authoring model: plain C++ `.cpp`/`.hpp` per op (`gated_delta_net.cpp`,
|
||||
`ssm_conv.cpp`); a SYCL `queue.parallel_for` over an `nd_range` with
|
||||
`reqd_sub_group_size(WARP_SIZE)`; sub-group reductions (`warp_reduce_sum`);
|
||||
`supports_op` in `ggml-sycl.cpp`. NO separate shader-compile step (single
|
||||
source).
|
||||
|
||||
Effort: EASIEST to author. ~250-350 LOC. The SYCL op handlers + kernels are
|
||||
near-VERBATIM mirrors of the CUDA ones (`launch_gated_delta_net<KDA,keep_rs>`,
|
||||
`s_shard`, `curr_state`, `state = dst + attn_score_elems`, `warp_reduce_sum`) -
|
||||
a dpct/SYCLomatic-style port. The CUDA diffs in 0018/0019/0021/0028 would port
|
||||
almost line-for-line: add the `state_dst` param, the `ids`/`rs_head` params, the
|
||||
read-base select, the two tiny gather kernels, and the new conv-update kernel.
|
||||
No pipeline/push-constant/binding bookkeeping.
|
||||
|
||||
Gotchas:
|
||||
- In-place race: the `s_shard[]` / window arrays are per-work-item private, so
|
||||
the register-snapshot-before-write pattern carries over directly. Safe.
|
||||
- Discriminated SSM_CONV: `supports_op` checks `src[0]`/`src[1]` F32 with NO
|
||||
discriminator check; GDN returns a BARE `true` (the MOST permissive, so the
|
||||
hazard is worst here). Both must be tightened, and `ggml_sycl_op_ssm_conv` /
|
||||
`ggml_sycl_op_gated_delta_net` must branch on the extra src.
|
||||
- Bit-exactness: `WARP_SIZE` is compile-fixed (Intel sub-group 8/16/32), same
|
||||
situation as CUDA; the fused variant matches SYCL's own non-fused path by
|
||||
construction. conv-update is per-channel -> subgroup-agnostic.
|
||||
|
||||
### 2.4 Common wiring (all three) + the 0030 emission-gate change
|
||||
|
||||
Per backend, four wiring touch-points beyond the kernel body:
|
||||
1. `supports_op`: tighten the `GGML_OP_SSM_CONV` and `GGML_OP_GATED_DELTA_NET`
|
||||
entries so the discriminated/extra-src node is reported supported ONLY when
|
||||
the new kernel handles it (and rejected otherwise, instead of today's
|
||||
silently-true-for-the-plain-kernel).
|
||||
2. op handler: branch on `src[3]`/`src[4]` (conv) and `src[6]`/`src[7]` (GDN) to
|
||||
dispatch the fused kernel.
|
||||
3. pipeline/kernel registration (Vulkan: + push-constant struct + descriptor
|
||||
bindings; Metal: + kargs fields + pipeline name; SYCL: just the new functions).
|
||||
4. The patch-0030 gate in `src/llama-context.cpp`.
|
||||
|
||||
The 0030 change today is a hard allow-list: any non-CPU compute backend whose reg
|
||||
name is not `"CUDA"`/`"ROCm"`/`"MUSA"` forces `fused_gdn_ar = fused_gdn_ch =
|
||||
auto_fgdn = false`. As each backend gains kernels this must become
|
||||
capability-driven, in one of two ways:
|
||||
- minimal: add the backend's reg name (e.g. `"Metal"`) to the allow-list once its
|
||||
kernels + tightened supports_op ship; OR
|
||||
- clean (recommended upstream form): DELETE the name allow-list and make
|
||||
`supports_op` authoritative - have the `auto_fgdn` resolution probe
|
||||
`ggml_backend_dev_supports_op` on a representative node that carries the
|
||||
discriminated `src[]` slots. Then routing falls out of the normal scheduler
|
||||
fallback and no backend name is ever hard-coded. This also fixes 0030's stated
|
||||
weakness that the upstream `auto_fgdn` check only inspects GATED_DELTA_NET
|
||||
nodes and covered the discriminated SSM_CONV only incidentally.
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
## 3. Bit-exactness per backend (the md5 gate question)
|
||||
|
||||
Feasible on ALL THREE, and not actually constraining, because of how the gate is
|
||||
scoped:
|
||||
|
||||
- The series md5 gate is a CUDA-vs-CPU comparison; each GPU backend ALREADY has
|
||||
its own f32 reduction order (Metal `simd_sum`, Vulkan subgroup reduce, SYCL
|
||||
`warp_reduce_sum`) that differs from CUDA's and from CPU's. There is no
|
||||
cross-backend md5 and none is expected.
|
||||
- The relevant per-backend invariant is: the FUSED variant must equal that
|
||||
backend's OWN non-fused path. The fusions change only the read source
|
||||
(gather -> indexed read; the gather is a value-preserving memcpy) and the write
|
||||
target (appended output -> in-place cache slot). They do NOT touch the
|
||||
per-column FMA/reduce order. So the fused op is bit-identical to the
|
||||
non-fused op on the same backend BY CONSTRUCTION.
|
||||
- Two arithmetic details each port MUST preserve exactly: (a) the conv
|
||||
ascending-tap order plus the `+0.0f` that matches plain `ssm_conv`'s
|
||||
`sumf += b` with b==0; (b) the existing GDN per-column subgroup reduce (do not
|
||||
re-order it). Get those right and `test-backend-ops` (backendX-vs-CPU, already
|
||||
registered for SSM_CONV / SSM_CONV_UPDATE / SSM_CONV_UPDATE_IDS /
|
||||
GATED_DELTA_NET) is the per-backend gate.
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
## 4. Upstream path and ranked recommendation
|
||||
|
||||
### Ops-first, then one PR per backend (NOT one big PR)
|
||||
|
||||
Recommended sequence:
|
||||
|
||||
1. PR #1 - OPS (already essentially done, upstreamable as-is): the `ggml.h`/
|
||||
`ggml.c` builders, the CPU reference kernels, the CUDA kernels, the
|
||||
`test-backend-ops` cases, and the capability-driven gate (the clean
|
||||
`supports_op`-authoritative version of 0030). This is independently mergeable
|
||||
and mirrors how llama.cpp lands new ops (CPU + CUDA first; GDN itself landed
|
||||
that way).
|
||||
2. PR #2 - Metal kernels + wiring.
|
||||
3. PR #3 - SYCL kernels + wiring.
|
||||
4. PR #4 - Vulkan kernels + wiring.
|
||||
|
||||
Do NOT bundle the backends: each needs its own hardware to validate
|
||||
`test-backend-ops`, reviewers are backend-specialized, and a regression in one
|
||||
must not block the others.
|
||||
|
||||
### Value x effort ranking (which backend first)
|
||||
|
||||
| backend | user base / value | author effort | bit-exact difficulty | net rank |
|
||||
|---------|----------------------------|---------------|----------------------|----------|
|
||||
| Metal | HIGH (Apple Silicon = largest non-CUDA LocalAI base; unified memory makes the no-copy / no-gather plumbing wins map directly) | MEDIUM | LOWEST (fixed 32 simdgroup) | **1st** |
|
||||
| SYCL | LOW-MED (Intel GPU) | LOWEST (near-verbatim CUDA mirror) | LOW | **2nd** |
|
||||
| Vulkan | HIGHEST breadth (AMD + Intel + cross-vendor) | HIGHEST (shaders-gen + variant matrix + subgroup variance + descriptor growth) | MEDIUM (per-vendor subgroup validation) | **3rd** |
|
||||
|
||||
Recommendation: **Metal first.** It banks the biggest user-facing decode win at
|
||||
medium effort, the base GDN + conv kernels already exist, and Apple's fixed
|
||||
simdgroup width makes bit-exactness the simplest. **SYCL second** as a cheap,
|
||||
nearly mechanical follow-on (the port is a line-for-line CUDA mirror, so it is
|
||||
low-cost insurance even though the Intel-GPU audience is smaller). **Vulkan last**
|
||||
as the high-effort / high-breadth capstone - it reaches the widest hardware
|
||||
(AMD + Intel + anything with a Vulkan driver), but the shader-gen pipeline, the
|
||||
existing variant matrix, the subgroup-width variance, and the per-vendor
|
||||
validation burden make it the right capstone once the pattern is proven on
|
||||
Metal + SYCL.
|
||||
|
||||
A reasonable cheaper variant: ship Metal + SYCL together right after the ops PR
|
||||
(both are register-snapshot ports with no shader-gen step) and treat Vulkan as a
|
||||
separate later effort.
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
## 5. Summary
|
||||
|
||||
- GDN-compute and plain SSM_CONV kernels ALREADY EXIST on Metal, Vulkan and SYCL
|
||||
(the README's "no Vulkan kernel" line is stale). The Qwen3.6 hybrids run on all
|
||||
three today via the non-fused path; Layer-2 is about the decode SPEEDUP.
|
||||
- Per backend the NEW work is: redirect the GDN state write (OP A) + add the ids
|
||||
read (OP B) to the existing GDN kernel, write ONE new conv-update kernel
|
||||
(OP C) + its ids variant (OP D), add two tiny gather kernels, and tighten
|
||||
supports_op + the op-handler branch + (Vulkan) the pipeline/push-constant/
|
||||
descriptor wiring. The builders, CPU refs, model graph and tests are shared and
|
||||
already done.
|
||||
- Bit-exactness is feasible everywhere and per-backend by construction (the
|
||||
fusions redirect addresses, not the f32 reduction order); `test-backend-ops`
|
||||
(backendX-vs-CPU) is the gate.
|
||||
- Sequence: ops-first PR (incl. the capability-driven replacement for 0030's
|
||||
name allow-list), then Metal, then SYCL, then Vulkan.
|
||||
401
backend/cpp/llama-cpp-localai-paged/docs/VLLM_PARITY_FINAL.md
Normal file
401
backend/cpp/llama-cpp-localai-paged/docs/VLLM_PARITY_FINAL.md
Normal file
@@ -0,0 +1,401 @@
|
||||
# vLLM Parity - Final State (Qwen3.6 NVFP4 on GB10)
|
||||
|
||||
> **Status: CLOSED.** This is the standing record of the exhaustive GB10 (DGX
|
||||
> Spark, sm_121) parity investigation for `llama-cpp-localai-paged` against vLLM
|
||||
> on the Qwen3.6 hybrid gated-DeltaNet NVFP4 models. It exists so the
|
||||
> investigation is **never re-litigated**: every lever attempted, its verdict,
|
||||
> its key number, and the structural floors that bound the result are recorded
|
||||
> below with the artifact each number came from. The one-line conclusion:
|
||||
> **prefill is genuinely capped at 36-43% of vLLM (FP4-MMQ optimality + GDN
|
||||
> O(C^2) intra-chunk complexity; prefill is not CUDA-graph-replayed, so these are
|
||||
> real floors, not profiling artifacts); decode-serving is near-parity at ~86% of
|
||||
> vLLM's true GPU-steady decode (the long-standing ~56% headline was a
|
||||
> measurement / operating-point artifact, corrected below), with the residual
|
||||
> ~14% being vLLM's mature fused-Marlin + Triton-elementwise kernels that are not
|
||||
> cheaply replicable on GB10.**
|
||||
|
||||
Companion docs (design/rationale, not re-summarized here): the patch-series
|
||||
[`README.md`](../README.md) (section 5 dev-notes), `VLLM_PARITY_LEVER_MAP.md`,
|
||||
`PREFILL_GEMM_SCOPE.md`, `PREFILL_GEMM_RESULTS.md`, `DECODE_SERVING_SCOPE.md`,
|
||||
`TENSORCORE_GDN_SCOPE.md`, `TENSORCORE_GDN_BUILD_PLAN.md`, `PAGED_BITEXACT_NOTE.md`.
|
||||
|
||||
Source key (every number below cites one of these):
|
||||
- **CDEF** = the definitive same-session both-engine run `dgx:~/bench/COMBINED_DEFINITIVE.txt` (2026-06-29, GIT_HEAD `a7d439e`, h2h_cli3 OpenAI `/v1/completions`, fresh-nonce prompts, ignore_eos, ptok128 gen128; paged `LLAMA_KV_PAGED=1 LLAMA_MOE_FORCE_GRAPHS=1`, GDN M5 on, S1 on, S3 off; vLLM 0.23.0 gpu-util 0.85 max-model-len 4096 max-num-seqs 256 tp1).
|
||||
- **README** = the static `llama-batched-bench` table in [`README.md`](../README.md) section 4 (npp128/ntg128; patched vs stock-`9d5d882d` vs vLLM-prior).
|
||||
- **PGR** = `PREFILL_GEMM_RESULTS.md`. **LMAP** = `VLLM_PARITY_LEVER_MAP.md` (profile-validated section). **DSS** = `DECODE_SERVING_SCOPE.md`. **MG** = `dgx:~/bench/marlin_gate/`. **GDNAB** = `dgx:~/bench/gdn_p1_ab/`. **0034/0035** = patch headers in `patches/paged/`.
|
||||
- **HNP** = the clean, uncontended, **graph-node-traced** both-engine high-N decode profile (2026-06-30): `dgx:~/highN_prof2/*.nsys-rep` (paged, npl=256) + `dgx:~/highN_vllm/*.nsys-rep` (vLLM), captured with `nsys --cuda-graph-trace=node` and decomposed by the **difference method** (per-token cost = ntg=64 profile minus ntg=16 profile). **This supersedes every earlier decode decomposition** (LMAP included): those were taken without `--cuda-graph-trace=node`, which collapses each graph replay into one opaque launch and made the per-kernel decode attribution an artifact (see 2c).
|
||||
- "estimated" marks any figure not pinned to one of the above.
|
||||
|
||||
---
|
||||
|
||||
## 1. The benchmark (paged vs vLLM vs stock)
|
||||
|
||||
Two models: the MoE **Qwen3.6-35B-A3B-NVFP4** (decision model, 256 experts top-8,
|
||||
30 GDN + 10 full-attn layers + a dense shared expert per layer) and the dense
|
||||
**Qwen3.6-27B-NVFP4** (48 GDN + 16 full-attn). All numbers GB10 / CUDA 13 /
|
||||
sm_121, backend pin `9d5d882d`.
|
||||
|
||||
### 1a. Prefill (S_PP, prefill tokens/s)
|
||||
|
||||
Paged = static `llama-batched-bench` PP block; vLLM = server prefill-phase rate
|
||||
at the same prompt length. Source: **CDEF**.
|
||||
|
||||
| Model | shape | paged S_PP | vLLM S_PP | paged % of vLLM |
|
||||
|---|---|---:|---:|---:|
|
||||
| MoE 35B-A3B | PP=512, B=32 | 2309.6 | 6418.9 | **36.0%** |
|
||||
| MoE 35B-A3B | PP=2048, B=32 | 2401.9 | 6748.5 | **35.6%** |
|
||||
| Dense 27B | PP=512, B=32 | 960.3 | 2277.3 | **42.2%** |
|
||||
| Dense 27B | PP=2048, B=32 | 1010.2 | 2360.1 | **42.8%** |
|
||||
|
||||
Prefill is the largest absolute gap. The profile-validated decomposition (LMAP,
|
||||
nsys both-engine, MoE decision model) attributes it as: paged **395.9 us/tok** vs
|
||||
vLLM **197.0 us/tok** (total gap ~198.9 us/tok), split GDN **+59.2** (~30%),
|
||||
MoE-GEMM **+56.5** (~28%), ew/layout/glue **+21.4** (~11%), act-quant **+15.2**
|
||||
(~8%), bf16-proj **+13.7** (~7%), gate **+12.4** (~6%), norms **+11.1** (~6%),
|
||||
dispatch **+5.9** (~3%).
|
||||
|
||||
### 1b. Decode / serving (per-seq + aggregate decode t/s), staggered serving
|
||||
|
||||
Source: **CDEF** NPL runs (continuous serving via h2h_cli3). `decode_agg` =
|
||||
aggregate decode t/s; `perseq` = decode tok/s/seq; PEAK_GB = peak process VRAM.
|
||||
|
||||
**MoE Qwen3.6-35B-A3B-NVFP4:**
|
||||
|
||||
| N | paged decode_agg | vLLM decode_agg | paged perseq | vLLM perseq | perseq % of vLLM | paged TTFT_mean ms | vLLM TTFT_mean ms | paged PEAK_GB | vLLM PEAK_GB |
|
||||
|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
|
||||
| 8 | 208.1 | 297.1 | 25.68 | 36.68 | **70.0%** | 747.9 | 204.2 | 50.03 | 112.42 |
|
||||
| 32 | 379.1 | 575.7 | 11.40 | 17.49 | **65.2%** | 2377.9 | 640.8 | 52.13 | 112.20 |
|
||||
| 128 | 611.9 | 958.2 | 4.14 | 6.97 | **59.4%** | 7058.3 | 1965.4 | 60.57 | 112.51 |
|
||||
| 256 | 717.8 | 1177.4| 2.29 | 4.12 | **55.6%** | 13533.6 | 3937.3 | 70.18 | 112.55 |
|
||||
|
||||
**Dense Qwen3.6-27B-NVFP4:**
|
||||
|
||||
| N | paged decode_agg | vLLM decode_agg | paged perseq | vLLM perseq | perseq % of vLLM | paged TTFT_mean ms | vLLM TTFT_mean ms | paged PEAK_GB | vLLM PEAK_GB |
|
||||
|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
|
||||
| 8 | 84.0 | 72.1 | 10.42 | 8.93 | **116.7%** | 1914.7 | 493.1 | 77.97 | 109.63 |
|
||||
| 32 | 196.5 | 214.7 | 5.83 | 6.56 | **88.9%** | 7023.3 | 1735.4 | 83.04 | 109.65 |
|
||||
| 128 | 343.8 | 431.8 | 2.18 | 3.10 | **70.3%** | 19468.9 | 5455.0 | 101.93 | 109.67 |
|
||||
| 256 | 380.3 | 532.5 | 1.13 | 1.82 | **62.1%** | 36306.8 | 10824.1 | 114.63 | 109.67 |
|
||||
|
||||
End-to-end aggregate `agg_tps` (incl. prefill contention), **CDEF**: MoE paged
|
||||
179.7/301.4/425.6/459.9 vs vLLM 278.5/515.6/798.3/915.4 at N=8/32/128/256; dense
|
||||
paged 72.6/141.4/205.8/213.3 vs vLLM 69.4/193.3/346.6/394.7.
|
||||
|
||||
**Reading the table.** Dense decode is **ahead of vLLM at low concurrency
|
||||
(116.7% at N=8)**. The high-N MoE percentages here (perseq ~56%, decode_agg ~61% at
|
||||
N=256) are **server-window** numbers and **understate true engine parity**: they
|
||||
divide the paged serving rate by vLLM's *prefill-overlap-inflated* server rate.
|
||||
The corrected, graph-node-traced decomposition (section 2c, **HNP**) shows paged
|
||||
decode at **~86% of vLLM's true GPU-steady decode**, with the remaining
|
||||
server-window gap being an S3-recoverable serving graph-reuse overhead (2d). The
|
||||
earlier "this is just the bandwidth floor / vLLM pays equally" reading was a
|
||||
**profiling artifact** and is corrected in 2c.
|
||||
|
||||
**PEAK_GB is the structural memory advantage.** vLLM's PEAK_GB is a **fixed
|
||||
~109-112.5 GB reservation** (the `--gpu-memory-utilization 0.85` block-manager
|
||||
pre-allocation of the ~128 GB unified LPDDR5x) and does **not** vary with N. The
|
||||
paged backend allocates KV on demand, so its peak **grows with load** but stays
|
||||
far below vLLM at low/mid concurrency: MoE N=8 uses **50.0 vs 112.4 GB (~2.2x
|
||||
less)**, and even at N=256 MoE is 70.2 vs 112.6 GB. This is the headline of
|
||||
section 5 (memory advantage / higher max concurrency per GPU) and is real,
|
||||
bit-exact, and not an operating-point trick.
|
||||
|
||||
### 1c. Patched vs true-stock (static batched-bench, the patch-series multiplier)
|
||||
|
||||
Stock `9d5d882d` was not in the same-session CDEF run; the patched-vs-stock
|
||||
multiplier is the static `llama-batched-bench` table (**README**, npp128/ntg128,
|
||||
decode t/s):
|
||||
|
||||
| | N=8 | N=32 | N=64 | N=128 | max x over stock |
|
||||
|---|---:|---:|---:|---:|---:|
|
||||
| Dense patched / stock | 85.3 / 68.3 | 211.9 / 119.9 | 305.2 / 142.8 | 382.1 / 155.1 | **2.46x** |
|
||||
| MoE patched / stock | 230.3 / 186.7 | 466.4 / 267.4 | 622.4 / 320.5 | 784.3 / 347.2 | **2.26x** |
|
||||
|
||||
In that **static** regime the patched decode kernel is **at vLLM parity**
|
||||
(dense 121/100/99/91% of vLLM-prior across widths; MoE 90/93/91/89%). The serving
|
||||
table in 1b is the harder continuous regime; the gap between the two regimes is
|
||||
the subject of section 2 (serving) and was fully closed on the host side.
|
||||
|
||||
---
|
||||
|
||||
## 2. Complete lever map (every attempt, verdict, key number)
|
||||
|
||||
Bit-exactness convention (per `PAGED_BITEXACT_NOTE.md`): the gate is **per-path**.
|
||||
Dense greedy md5 `5951a5b4`; paged-MoE greedy md5 `8cb0ce23` (a benign
|
||||
FP-accumulation-order reorder vs non-paged `07db32c2`, KL-validated). "BE" = greedy
|
||||
md5 byte-identical; "KL-benign" = new FP path, gated by KL-divergence within band.
|
||||
|
||||
### 2a. PREFILL - weight GEMM track (verdict: FP4-MMQ is optimal on GB10)
|
||||
|
||||
Four kernels were built or ported to beat MMQ at large-M MoE prefill. **All
|
||||
rejected; FP4-MMQ stays the shipped path.** The decisive surprise (LMAP, both-engine
|
||||
nsys): **on sm_121 vLLM itself does not run native FP4** - it runs **Marlin W4A16**
|
||||
(FP4 dequant to bf16 in-register + bf16 GEMM) for experts and FP8 projections,
|
||||
capped at bf16-tensor-core peak (~half FP4 peak). So MMQ's native FP4 path is
|
||||
already structurally competitive on this exact silicon.
|
||||
|
||||
| Lever | What | Verdict | Key number | Source |
|
||||
|---|---|---|---|---|
|
||||
| **0033** dequant -> bf16 cuBLAS | route large-M NVFP4 dense GEMM off MMQ to dequant->bf16 nvjet/cuBLAS | **REJECTED** (regression) | dense S_PP **-49% / -42% / -29%** at M=512/1024/2048; bit-exact md5 identical, KL-better | PGR |
|
||||
| dense-cuBLAS reroute (full sweep) | the same reroute across the dense + MoE prefill sweep | **REJECTED** | **-31% to -62%** band (estimated; the artifact-pinned dense subset is -29% to -49%, PGR) | LMAP / recorded verdict |
|
||||
| **0034** native FP4-MMA W4A4 | Blackwell `mxf4nvf4` OMMA large-M kernel, PoC verbatim | **REJECTED in-backend** | PoC `~103 TFLOP/s` (57.7% of FP4 peak, beats cuBLAS-bf16, NMSE=0), but the standalone PoC win **did not hold in-backend** | 0034 header / LMAP |
|
||||
| **0035** W4A16-Marlin grouped MoE | FP4->bf16 in-register dequant + bf16 `mma.sync`, zero act-quant tax (vLLM's exact sm_121 shape) | **REJECTED** (perf regression) | correct + bit-exact-gated: `test-backend-ops MUL_MAT_ID` 81/81; KL **benign and better** (marlin KLD **0.131** < MMQ **0.137**, same-top-p 84.6% vs 84.3%); md5 short identical, long one benign flip - but **-39%** S_PP vs MMQ (estimated/recorded; MG holds only the correctness+KL gate) | 0035 header, MG |
|
||||
| offline-repack Marlin / vLLM-verbatim Marlin | repack weights offline to Marlin layout; port vLLM's Marlin kernel verbatim | **REJECTED** | verbatim-Marlin: **correct but -39%**; offline-repack: workflow built (shared the GPU lock, `combined_definitive.sh:29`), same bf16-peak ceiling, no win | recorded verdict / combined_definitive.sh |
|
||||
|
||||
**Why the whole track loses (the structural reason):** bf16 tensor-core peak on
|
||||
GB10 is **~half FP4 peak** (PGR s3), so any dequant->bf16 kernel caps at ~half the
|
||||
throughput the native FP4-MMQ read reaches; and the dequant write is an
|
||||
un-amortized weight-sized memory pass (~8x the FP4-read byte traffic, PGR). The
|
||||
W4A16 angle was the most promising because it *also* erases the ~8% act-quant tax
|
||||
vLLM never pays - but the bf16-peak ceiling still made it a net regression. **MMQ
|
||||
is optimal; the GEMM bucket is not winnable on GB10 with the available kernels.**
|
||||
|
||||
### 2b. PREFILL - GDN chunked-scan track (verdict: M5 tf32 C=16 is the shipped winner)
|
||||
|
||||
The gated-DeltaNet chunked scan is the **#1 single prefill-gap contributor**
|
||||
(+59.2 us/tok, ~30% of the gap; LMAP). vLLM's FLA `chunk_gated_delta_rule` runs the
|
||||
same math at **36.5 us/tok vs paged 95.7 = 2.62x** (LMAP), pushing intra-chunk Gram
|
||||
products through tensor cores. The series chased that headroom.
|
||||
|
||||
| Lever | What | Verdict | Key number | Source |
|
||||
|---|---|---|---|---|
|
||||
| **0031** scalar-serial chunked scan | FLA-style chunk gated-delta-rule, scalar/serial form (`GDN_TC=0`) | superseded | math-correct (`test-backend-ops` 91/91, <=1e-7 NMSE) but **~761 vs ~971 t/s = ~22% slower** at the GB10-forced C=16 | README s5 |
|
||||
| **0047 / M5** tf32 tensor-core scan | full form-T solve + state-update on tf32 `m16n8k8` mma, f32-only re-port | **SHIPPED (default-on under paged)** | MoE prefill S_PP **+3.5% @npp512 (3x A/B), +17.7% @npp2048**; decode unchanged; bit-exact-benign (`GATED_DELTA_NET` 46-94/94, md5 == canonical) | README s3/s5 |
|
||||
| bf16 CONFIG-C (M8) | bf16 `Kc/Qc` + 2 C*C scratch, C->64 + 2 blk/SM | **REJECTED** (not in f32-only series) | the run that confirmed the geometry (CDEF GIT_HEAD), then dropped | CDEF / README s5 |
|
||||
| bf16-C16 | bf16 Gram at C=16 | rejected | no win over tf32-M5; bf16 mantissa unsafe on the state-coupled products | GDN build-plan s4 |
|
||||
| BV block-occupancy A/B (tf32) | raise blocks/SM to test if occupancy is the bound | **REJECTED** (occupancy is NOT the bound; latency is wave-hidden) | two arms statistically equal: **1844 vs 1814 S_PP (-1.04%, within noise)** | GDNAB armA/armB |
|
||||
| bf16-C64 | bf16 Gram at the larger C=64 chunk | **REJECTED** | **-18.75%** - the O(C^2) intra-chunk triangular-solve + serial recurrence dominates, so growing C hurts | recorded verdict / GDN build-plan |
|
||||
|
||||
**Why the bottleneck is not occupancy/dtype:** the cost is the **O(C^2)
|
||||
intra-chunk triangular solve + the serial inter-chunk recurrence dependency**, not
|
||||
grid occupancy (BV: -1.04%, latency is wave-hidden) and not Gram dtype (bf16-C64:
|
||||
-18.75%). GB10's 99 KB
|
||||
dynamic-smem cap forces **C=16** (the 128x128 f32 state alone is 64 KB of the
|
||||
all-shared layout), and at this head dim the only win is tensor cores on the
|
||||
intra-chunk products, not chunking or wider chunks. M5 tf32 at C=16 is exactly
|
||||
that and is the shipped winner; it does not fully close the 2.62x because vLLM's
|
||||
mature FLA blocked-solve is a more complete tensor-core implementation.
|
||||
|
||||
### 2c. DECODE / serving (verdict: near-parity at ~86% of vLLM's true GPU-steady decode; the earlier "BW-floored / vLLM pays equally" was a profiling artifact)
|
||||
|
||||
**Methodology correction - why every earlier decode decomposition was wrong.**
|
||||
Decode runs as a **replayed CUDA graph**. `nsys` *without* `--cuda-graph-trace=node`
|
||||
collapses each graph replay into a **single opaque launch**, so the per-kernel
|
||||
attribution in every prior decode profile (the "paged 159 us/tok, GPU ~16% busy,
|
||||
host-bound, 5.4x more GPU-efficient per token" picture, and the conclusion that the
|
||||
high-N gap was a pure bandwidth floor vLLM pays equally) was an **artifact of graph
|
||||
collapse, not real per-token cost**. The correct method, used for the numbers below
|
||||
(**HNP**, clean uncontended node, 2026-06-30), is `nsys --cuda-graph-trace=node`
|
||||
plus the **difference method**: per-token cost = the ntg=64 profile minus the
|
||||
ntg=16 profile, isolating per-token-linear work from fixed per-step overhead. Under
|
||||
this method **paged decode at npl=256 is 99% GPU-busy (GPU-idle only 1.4%), NOT
|
||||
host-bound** - the opposite of the collapsed-graph reading. This supersedes the
|
||||
LMAP decode decomposition.
|
||||
|
||||
**The real per-token decomposition (paged, npl=256, HNP)** - GPU-steady ~1082
|
||||
us/tok (924 t/s):
|
||||
|
||||
| Bucket | us/tok | % of decode | Note |
|
||||
|---|---:|---:|---|
|
||||
| GDN recurrent scan | 553 | **51%** | **LINEAR in batch** - the dominant cost; shared BW floor (below) |
|
||||
| NVFP4 expert GEMM | 254 | 23% | amortizes with batch |
|
||||
| bf16 projections | 73 | 7% | |
|
||||
| elementwise | 57 | 5% | |
|
||||
| SSM conv | 31 | 3% | |
|
||||
| rest | small | - | |
|
||||
| GPU-idle | - | **1.4%** | not host-bound |
|
||||
|
||||
**The gap reconciled (the numbers must sum).** The headline N=256 figures (perseq
|
||||
~56%, decode_agg ~61%, section 1b) were paged-**server** **718** over vLLM-**server**
|
||||
**1177**. But the vLLM server number is **inflated ~8 pts**: vLLM's true GPU-steady
|
||||
decode is **1078 t/s**, and its chunked-prefill overlap inflates the
|
||||
server-measured decode window. The reconciled chain:
|
||||
|
||||
| Measurement | t/s | % of vLLM-server (1177) |
|
||||
|---|---:|---:|
|
||||
| vLLM server (CDEF) | 1177 | 100% |
|
||||
| vLLM **true GPU-steady** decode | 1078 | 92% |
|
||||
| llama **GPU-steady** decode | 924 | 78.5% (**= 86% of vLLM's true 1078**) |
|
||||
| llama server (CDEF) | 718 | ~60.7% (61%) |
|
||||
|
||||
So **vs vLLM's true GPU-steady decode, paged is ~86%, not ~56%.** The ~56% headline
|
||||
conflated two distinct things: vLLM's prefill-overlap-inflated server window, and
|
||||
the paged serving graph-reuse overhead. The **~17 pt** drop from llama GPU-steady
|
||||
(78.5%) to llama server (60.7%) is exactly that **serving graph-reuse overhead**,
|
||||
which is **S3-recoverable** (2d).
|
||||
|
||||
**GDN is a shared BW floor where paged is ahead.** The GDN recurrent scan moves
|
||||
**~32 GB/step of f32 recurrent-state traffic**; paged runs it at **83% of the
|
||||
273 GB/s LPDDR5x peak vs vLLM's 79%**. Both engines' high-N sublinearity (only
|
||||
**1.17-1.18x throughput for a 2x batch**) comes from this **shared** floor - it is
|
||||
not a paged-specific loss, and paged is the faster of the two on it.
|
||||
|
||||
**The residual ~14 pt GPU-steady gap is real but not cheaply closable.** vLLM's
|
||||
GPU-steady 1078 vs paged 924 decomposes into two buckets: the **MoE expert path
|
||||
(~+11 ms)** - vLLM's fused Marlin persistent-tiling vs ggml's separate act-quant +
|
||||
MMQ - and **elementwise (~+10 ms)** - vLLM fuses it into one Triton kernel. Both
|
||||
fusions were attempted and rejected (table below). Closing the residual needs
|
||||
vLLM's mature Marlin tiling (our own ggml Marlin port already lost **-19.6%**) plus
|
||||
multi-stream overlap (hard inside a single-stream CUDA graph): **low-EV,
|
||||
multi-week, GB10-uncertain**.
|
||||
|
||||
**Decode / fusion levers (verdicts).**
|
||||
|
||||
| Lever | What | Verdict | Key number | Source |
|
||||
|---|---|---|---|---|
|
||||
| act-quant folded into ggml MMQ | erase the act-quant pass by quantizing the y-operand inside the MoE expert MMQ kernel (vLLM's fused-Marlin single-pass shape) | **REJECTED** (regression) | **-79.4%**: ggml MMQ re-quantizes the y-operand **once per weight-row-tile x stream-k split**, with no tensor cores for the inline quant - structural, ggml MMQ lacks vLLM's persistent single-pass tiling | HNP / recorded verdict |
|
||||
| norm + quant + silu fusion | fold the elementwise path into one launch (vLLM's Triton kernel) | **REJECTED** (architecturally infeasible) | `ggml_cuda_can_fuse` cannot express it: FP4 quant is a **mul_mat-internal prologue, not a cgraph node**; the norm is already fused (0042/0044); silu is separated from the norm by **2 GEMMs + the router** | recorded verdict |
|
||||
| Q8_0 / FP8 projection | quantize the bf16 GDN/attn projections (premise: vLLM uses FP8 here) | **REJECTED** (regime error, not premise error) | vLLM **does** use FP8 projections (confirmed from `hf_quant_config.json` `MIXED_PRECISION`), but at N=128/256 projections are only **~12% of the decode stream**, so this closes **<=6%, not the gap** | HNP / hf_quant_config.json |
|
||||
| NVFP4 the bf16 GDN/attn projections | drop projections to NVFP4 (more aggressive than FP8) | **REJECTED** | **KL-fail, ~+6% PPL**; vLLM keeps the SAME bf16/FP8 projections, never NVFP4 | LMAP |
|
||||
| W4A16-Marlin MoE decode | Marlin grouped expert GEMM on the decode path | **REJECTED** | BW-floored wash, **~5% slower** kernel | LMAP |
|
||||
| bf16-tau per-head SSM (0026) | per-head bf16 tau on the SSM decode | **DROPPED** | flat **780.6 vs 780.0 t/s** once the fusion patches landed | README s5 |
|
||||
| D3 FA-split / D4 GDN-width-adaptive | the older "off critical path" decode levers | **SUPERSEDED reasoning** | originally rejected via the now-debunked "5.4x faster / host-bound" reading; under HNP the GDN scan **is** the critical path (51%), but it is the shared BW floor where paged already leads (83% vs 79%), so neither is a win | HNP |
|
||||
|
||||
**Dense decode is AHEAD at low N (116.7% @ N=8, CDEF)** because the GPU is
|
||||
underutilized there and the paged path's per-token efficiency wins; this is the one
|
||||
operating point where paged is unambiguously faster than vLLM.
|
||||
|
||||
### 2d. SERVING / engine (verdict: host loop and scheduler closed; spec-decode orthogonal)
|
||||
|
||||
| Lever | What | Verdict | Key number | Source |
|
||||
|---|---|---|---|---|
|
||||
| **0040 / S1** paged decode-graph reuse | correct `can_reuse` keyed on bucketed block-table dims | **SHIPPED (default-on)** | serving graph reuse **0% -> 72.2%** (with S3); static **0% -> 95.5%** | README, DSS |
|
||||
| **0041 / S3** decode-shape-stable scheduling (`LLAMA_PAGED_DECODE_STABLE`) | keep prefill out of decode steps for reuse-stable shapes | **SHIPPED default-OFF** (opt-in throughput-max knob) | recovers the **~17 pt serving graph-reuse overhead** (llama server 60.7% -> toward GPU-steady 78.5%, 2c) at a TTFT cost; default-on regressed real serving: **2.5x worse TTFT** (60s vs 24s @N=256), **20-29% lower** end-to-end throughput, hence opt-in | README, DSS, HNP |
|
||||
| **0043 / D1** full-step MoE decode CUDA graph | graph the whole decode step incl. grouped-MMQ MoE dispatch | **SHIPPED (default-on)** | +2.6% (npl128) to +5-13% (npl32); the D1 premise "host-sync on MoE-routing readback" was **REFUTED** (sync count identical with graphs on/off; 99% GPU-busy static) | README s5 |
|
||||
| S2 double-buffer set_inputs | overlap host input build with GPU | **DROPPED** | `set_inputs` is **~0.05 ms/step** - nothing to recover (the rebuild was the cost) | DSS |
|
||||
| whole-step graph / host loop | the host scheduling loop as the serving residual | **CLOSED (~0-1%)** | baseline reuse 0% (agg 757.6) **statistically equal** to S1+S3 reuse 72% (agg 763.3); `hostproc` only ~4-8% of the per-step wall = **measured dead** | DSS |
|
||||
| padded / fixed-slot decode | pad decode width to `--parallel` for ~100% reuse | **REJECTED (built, GPU-tested)** | inert (md5 bit-exact) but **regresses at every concurrency**; N=8 burst 28.16 -> 6.05 tok/s/seq (~4.6x slower); serving decode is **GPU-compute-bound**, dummy-row compute > reuse recovered | DSS |
|
||||
| speculative decode (MTP) | draft + verify; greedy is bit-exact | **ORTHOGONAL, not pursued** | both engines have it; the crux is hybrid-SSM in-place-state (0018) rollback. Not a paged-specific gap - a feature both can add | LMAP |
|
||||
|
||||
The serving regime was the one place the static-bench parity did not carry over
|
||||
(paged ~3.7 vs vLLM ~5.9 tok/s/seq, -39%, DSS). S1 made the decode step reusable
|
||||
and the host loop was driven to ~0-1% of the wall. The graph-node-traced HNP
|
||||
profile (2c) then resolves the remaining serving gap into two parts: the **~17 pt
|
||||
serving graph-reuse overhead** (S3-recoverable via the opt-in `LLAMA_PAGED_DECODE_STABLE` knob) and the **~14 pt
|
||||
GPU-steady kernel gap** vs vLLM's true 1078 t/s (vLLM's fused-Marlin MoE + Triton
|
||||
elementwise, 2c). Both are real; neither is the "pure LPDDR5x floor, vLLM pays
|
||||
equally" story the collapsed-graph profile implied.
|
||||
|
||||
---
|
||||
|
||||
## 3. Structural floors (not closable on GB10)
|
||||
|
||||
These are the hardware/algorithm ceilings the investigation hit. They are why
|
||||
parity is unreachable on this part, and they are the levers' "why" in one place.
|
||||
|
||||
1. **LPDDR5x bandwidth (~273 GB/s) bounds the GDN recurrent scan - a *shared*
|
||||
floor where paged leads.** The GDN scan is the dominant decode bucket (553
|
||||
us/tok, 51%, LINEAR in batch; HNP) and moves ~32 GB/step of f32 recurrent
|
||||
state; paged runs it at **83% of the 273 GB/s peak vs vLLM's 79%**, and both
|
||||
engines' high-N sublinearity (1.17-1.18x for a 2x batch) is this same floor.
|
||||
This is **not** the explanation for the high-N server-window gap: the
|
||||
graph-node-traced HNP profile (2c) shows paged decode **99% GPU-busy at ~86% of
|
||||
vLLM's true GPU-steady decode**, with the server-window ~56% being a
|
||||
prefill-overlap measurement artifact (~8 pt) plus an S3-recoverable graph-reuse
|
||||
overhead (~17 pt), not a bandwidth floor vLLM pays equally. The residual ~14 pt
|
||||
GPU-steady gap is kernel maturity (point 4 below + 2c), not bandwidth. On
|
||||
datacenter HBM (B200: ~8 TB/s) this GDN floor lifts ~30x.
|
||||
|
||||
2. **FP4-MMQ optimality at GB10's tensor-core ratios.** Native FP4-MMQ at M<=128 is
|
||||
at the FP4 weight-BW floor (decode) and beats every dequant->bf16 alternative at
|
||||
large M (prefill), because bf16 TC peak is ~half FP4 peak on sm_121 and the
|
||||
dequant pass is an un-amortized memory pass (PGR). vLLM itself is on a **bf16
|
||||
Marlin fallback** here (no tcgen05/CUTLASS-grouped FP4 on consumer Blackwell,
|
||||
CUTLASS #3096), so there is no faster GEMM to port.
|
||||
|
||||
3. **GDN O(C^2) intra-chunk solve + serial inter-chunk recurrence.** The chunked
|
||||
scan's cost is the triangular A-inverse solve (quadratic in chunk size C) plus
|
||||
the strictly-serial cross-chunk state carry, with C forced to 16 by the 99 KB
|
||||
smem cap. Occupancy (BV: -1%) and dtype (bf16-C64: -18.75%) are not the bound;
|
||||
only a fuller tensor-core blocked-solve closes the residual 2.62x, and M5 tf32
|
||||
captures the tractable part.
|
||||
|
||||
4. **vLLM's mature fused kernels (FLA blocked-solve, fused-Marlin MoE, Triton
|
||||
elementwise) are tuned for HBM.** They are the source of both the prefill cap
|
||||
and the residual ~14 pt decode GPU-steady gap (2c): the fused-Marlin
|
||||
persistent-tiling MoE path (~+11 ms) and the single-kernel Triton elementwise
|
||||
(~+10 ms). The matching ggml fusions were rejected as infeasible or regressive
|
||||
(2c): folding act-quant into MMQ regressed -79.4% (no single-pass tiling), and
|
||||
norm+quant+silu cannot be expressed via `ggml_cuda_can_fuse`. The FLA chunked
|
||||
GDN, Marlin grouped GEMM, and FULL/PIECEWISE cudagraphs all assume datacenter
|
||||
bandwidth and TC ratios; they are real wins on B200, which is why closing the
|
||||
residual is a different-hardware question (mature kernels + multi-stream
|
||||
overlap), not a missing single-lever optimization.
|
||||
|
||||
---
|
||||
|
||||
## 4. Shipped wins (all bit-exact / KL-benign)
|
||||
|
||||
What the series actually banks, all gated per-path:
|
||||
|
||||
- **FP4-MMQ MoE/dense GEMM** - native Blackwell FP4-MMA, at the FP4 weight-BW
|
||||
floor (decode parity) and beating every dequant alternative at prefill. This is the
|
||||
reason the whole 2a track stays default-off.
|
||||
- **M5 tf32 tensor-core chunked GDN prefill (patch 0047)** - default-on under
|
||||
`LLAMA_KV_PAGED`; MoE prefill **+3.5% @npp512, +17.7% @npp2048**, decode
|
||||
untouched, bit-exact-benign.
|
||||
- **0042 fused residual-add + RMSNorm + weight-mul** - one kernel for `h = x +
|
||||
sub; n = rms_norm(h) * w`; dense S_PP +0.5%, bit-exact.
|
||||
- **0044 fused gated RMSNorm + SiLU gate-mul (GatedRMSNorm fusion)** - the GDN
|
||||
output norm `(rms_norm(x)*w)*silu(z)` folded into one launch (672 -> 336
|
||||
launches @npp512); S_PP dense +1.1%, MoE +0.9%, `test-backend-ops` 12979/12979.
|
||||
- **0046 GDN-prefill geometry gate** - gates patch 0022's decode occupancy retune
|
||||
by scan length so it stops regressing dense prefill; recovers **+7.2%** dense
|
||||
prefill back to stock parity while keeping the decode win, bit-exact.
|
||||
- **SSM decode fusion stack (0018-0022, 0028)** - in-place state, fused gather,
|
||||
o_proj MMQ reshape, conv in-place, occupancy retune; the **2.26x/2.46x over
|
||||
stock** decode multiplier (README).
|
||||
- **Serving host loop closed (0040 S1, 0043 D1)** - decode-graph reuse and
|
||||
full-step graph capture; host loop driven to ~0-1% of the serving wall.
|
||||
- **The memory advantage** - **1.5-3x lower VRAM** than vLLM (NVFP4-resident, no
|
||||
persistent bf16 dequant copies; CDEF PEAK_GB e.g. MoE N=8 50 vs 112 GB), which
|
||||
is a legitimate higher-max-concurrency-per-GPU operating point.
|
||||
- **Low-N decode efficiency** - dense decode **ahead of vLLM (116.7% @ N=8)**.
|
||||
- **Bit-exact output** - per-path greedy md5 stable (dense `5951a5b4`, paged-MoE
|
||||
`8cb0ce23`), the sacred gate held through the entire series.
|
||||
|
||||
---
|
||||
|
||||
## 5. The parity verdict and the path
|
||||
|
||||
**Verdict (revised): PREFILL is genuinely capped on GB10; DECODE-SERVING is near
|
||||
vLLM parity (~86% of its true GPU-steady decode), with the long-standing ~56%
|
||||
headline now identified as a measurement / operating-point artifact.** Prefill
|
||||
sits at **36% (MoE) / 43% (dense)** of vLLM and is a real floor (FP4-MMQ optimality +
|
||||
GDN O(C^2) intra-chunk complexity; prefill is **not** CUDA-graph-replayed, so
|
||||
unlike decode these numbers are not profiling artifacts). The GDN chunked scan is
|
||||
at its tractable tensor-core win (M5) and the prefill GEMM bucket is FP4-MMQ-optimal
|
||||
(every alternative rejected; vLLM is itself on a bf16-Marlin fallback here). For
|
||||
decode, the graph-node-traced HNP profile corrects the record: paged decode is
|
||||
**99% GPU-busy at ~86% of vLLM's true GPU-steady decode (924 vs 1078 t/s)**; the
|
||||
~56% server-window figure was vLLM's prefill-overlap inflation (~8 pt) plus the
|
||||
S3-recoverable serving graph-reuse overhead (~17 pt). The residual **~14 pt**
|
||||
GPU-steady gap is vLLM's mature fused-Marlin MoE (~+11 ms) and Triton elementwise
|
||||
(~+10 ms) kernels; the matching ggml fusions were rejected (act-quant-into-MMQ
|
||||
-79.4%, norm+quant+silu infeasible), and closing the residual needs mature Marlin
|
||||
tiling (our port lost -19.6%) plus multi-stream overlap - low-EV, multi-week,
|
||||
GB10-uncertain, not a free bit-exact lever.
|
||||
|
||||
**The honest framing:** on GB10 the paged backend is **at or ahead of vLLM at low
|
||||
concurrency (dense 117% @N=8), uses 1.5-3x less memory, and is bit-exact**; it runs
|
||||
high-N decode at **~86% of vLLM's true GPU-steady decode** (the ~56% server-window
|
||||
number is a measurement artifact, 2c), and sits at **~36% (MoE) / ~43% (dense) of
|
||||
vLLM prefill**. The prefill residual is a real FP4-MMQ + GDN-O(C^2) floor; the
|
||||
~14 pt decode residual is vLLM's mature fused kernels, not engineering debt and not
|
||||
a cheap lever.
|
||||
|
||||
**The path to parity is different hardware.** A datacenter Blackwell (B200,
|
||||
~8 TB/s HBM, native tcgen05/CUTLASS FP4, TMEM) lifts the bandwidth floor ~30x and
|
||||
**restores exactly the vLLM advantages that lose on GB10**: its FLA blocked-solve
|
||||
GDN, its Marlin/CUTLASS grouped FP4 GEMM, and its HBM-tuned full-cudagraph decode
|
||||
all assume that bandwidth and those TC ratios. On that hardware the parity question
|
||||
is re-opened from scratch; on GB10 it is closed. Do not re-litigate the GB10 levers -
|
||||
re-run the methodology on the new silicon instead.
|
||||
|
||||
---
|
||||
|
||||
*Recorded per `.agents/vllm-parity-methodology.md` (both-engine ground-truth,
|
||||
per-lever A/B, record-rejected-levers). All GPU numbers from `ssh dgx.casa`
|
||||
artifacts under `~/bench/`; all in-repo numbers from the docs cited in the source
|
||||
key. The GPU lock was not touched in producing this document (CPU-only:
|
||||
artifact-read + write).*
|
||||
@@ -0,0 +1,493 @@
|
||||
# vLLM Parity Lever Map
|
||||
|
||||
> Auto-generated from the parity-exploration workflow. Working artifact (the multi-week path to vLLM parity on prefill + decode, Qwen3.6 NVFP4 / GB10).
|
||||
|
||||
## 1. Prefill gap re-audit
|
||||
|
||||
I have walked the full prefill forward pass against the committed numbers (final_benchmark.csv, PREFILL_GEMM_SCOPE/RESULTS, the 0042 dense nsys profile, the qwen35moe/delta-net graph source). Here is the re-audit.
|
||||
|
||||
---
|
||||
|
||||
# PREFILL gap re-audit - Qwen3.6 NVFP4 on GB10
|
||||
|
||||
## Grounding (what the gap actually is)
|
||||
|
||||
From `docs/final_benchmark.csv`, prefill (S_PP, t/s; patched vs vLLM):
|
||||
- **Dense 27B**: ~922 vs ~1929-2182 → patched is **42-48%** of vLLM.
|
||||
- **MoE 35B-A3B** (the decision model): ~1510-2177 vs ~5186-6223 → patched is **29-41%** of vLLM. In us/tok at npl64: llama ~471, vLLM ~169 → **gap ~302 us/tok**.
|
||||
|
||||
The GEMM scope's bucket (~232 us/tok llama vs ~68 vLLM) = a **164 us/tok** GEMM difference = **~51-54% of the gap**, and GEMM is **~49% of the llama prefill wall** (232/471). GDN is cited at **~17% of the gap** (vLLM chunked scan ~2.5x cheaper). So GEMM+GDN ≈ **~68% of the gap** by the existing framing - leaving ~30% that the two levers' headline numbers do not name. This audit walks every op to place that residual.
|
||||
|
||||
Important structural facts confirmed from source (`models/qwen35moe.cpp`, `delta-net-base.cpp`, `llama-graph.cpp`):
|
||||
- MoE = 40 layers (interval-4 → **30 GDN + 10 full-attention**), 256 experts top-8, **plus a dense shared expert on every layer**. Dense = 64 layers (48 GDN + 16 attn).
|
||||
- **Default prefill GDN is NOT a single kernel.** `fused_gdn_ch`/patch-0031 is default-OFF, so prefill runs `build_delta_net_chunking` - a long graph of `ggml_mul`/`mul_mat`/`solve_tri`/`cumsum`/`tri`/`exp` + many `ggml_cont`/`transpose`/`pad`/`repeat` layout copies + a host-side per-chunk loop. The GDN lever (tensor-core fused kernel) is scoped to replace this **entire** decomposition, so the "11% k_bin_bcast op_mul gating muls" the 0042 patch calls "a separate lever" are in fact **inside the GDN bucket** (a fused GDN kernel subsumes them).
|
||||
|
||||
## Prefill op-share table (MoE decision model; % of the patched/llama prefill wall)
|
||||
|
||||
Estimates triangulated from the committed numbers (232/68 GEMM, 11%/5% from the 0042 dense nsys, the gap arithmetic), not a fresh nsys run.
|
||||
|
||||
| Op (prefill) | ~% of llama wall | vLLM faster? why | Covered by GEMM lever | Covered by GDN lever |
|
||||
|---|---:|---|:---:|:---:|
|
||||
| Token embed (`get_rows`) | <1% | tie | - | - |
|
||||
| **NVFP4 weight GEMMs** total | **~49%** | **Yes** - vLLM W4A16-Marlin/cutlass large-M tiles + async pipeline vs MMQ small-tile / new FP4-MMA at 57.7% of peak | **YES** | - |
|
||||
| ┝ routed-expert grouped GEMM (gate_up+down, `mul_mat_id`) | ~28% | yes (biggest single bucket) | yes | - |
|
||||
| ┝ shared-expert dense GEMMs (all tokens, ×40) | ~9% | yes | yes | - |
|
||||
| ┝ GDN in/out projections (wqkv, wqkv_gate, ssm_out) | ~7% | yes | yes | - |
|
||||
| ┝ attention QKV/O projections (×10) | ~5% | yes | yes | - |
|
||||
| **GDN chunked decomposition** (30 layers) | **~22%** | **Yes** - vLLM chunked scan ~2.5x cheaper (tensor-core intra-chunk vs llama's f32 graph ops + layout copies + host loop) | - | **YES** |
|
||||
| ┝ gating/decay muls (`k_bin_bcast op_mul`) | ~11%* | yes | - | yes (fused kernel absorbs) |
|
||||
| ┝ small f32 mul_mats + `solve_tri` + cumsum/tri/exp | ~7% | yes | - | yes |
|
||||
| ┝ layout `cont`/`transpose`/`pad`/`repeat` copies | ~4% | yes | - | yes |
|
||||
| **FlashAttention prefill** (QK^T·softmax·PV, 10 layers) | **~3-6%**† | maybe - L²-growing; bounded at npp=128, larger at serving context | **NO** | **NO** |
|
||||
| **MoE router + combine/scatter** | **~5-8%** | **Yes** - vLLM fuses gather/weight/scatter into the grouped-GEMM epilogue | **NO** | **NO** |
|
||||
| ┝ `argsort_top_k`(256→8) + softmax + weight-norm | ~2-3% | yes | no | no |
|
||||
| ┝ combine: 7× fp32 `add` + weight `mul` (×40) | ~3-5% | yes | no | no |
|
||||
| **Activation quantization** (W4A4 e4m3 pass per GEMM) | **~3-6%** | **Yes - structurally**: vLLM W4A16-Marlin on GB10 has **no** activation-quant step | **NO**‡ | partial |
|
||||
| Norm + residual tail (attn/post/q/k/ssm/l2/out + adds) | ~4% | small (0042 fused the main one) | - | - |
|
||||
| RoPE + sigmoid/silu gates + scale | ~2-3% | small | - | - |
|
||||
| LM head (last-token only in prefill) | <1% | tie | - | - |
|
||||
|
||||
\* 0042 dense profile; in MoE the relative share is a bit lower (MoE FFN is heavier). † grows quadratically - under-weighted at the benchmark's npp=128; re-measure at real serving lengths. ‡ the quant pass feeds the GEMM but is a *separate kernel*, not inside the GEMM-lever's mul_mat bucket.
|
||||
|
||||
## Verdict: GEMM + GDN are the two dominant buckets but NOT the whole gap
|
||||
|
||||
They cover ~71% of the prefill wall and the bulk of the gap. Three contributors are **materially uncovered** by either lever:
|
||||
|
||||
### Newly-identified lever 1 - MoE router + combine/scatter (the strongest miss on the decision model)
|
||||
llama runs the expert routing and recombination as **separate memory-bound ggml ops**: `argsort_top_k` over 256 experts, softmax/normalize, then a fan-in of **7 fp32 `ggml_add` + a weight `ggml_mul`** per MoE layer (`llama-graph.cpp` ~1797-1824), every one of 40 layers. vLLM's fused-MoE (and Marlin grouped) path folds the gather, the router-weight multiply, and the scatter-accumulate into the **GEMM epilogue/prologue** - so this is overhead vLLM essentially does not pay. Est. ~5-8% of the MoE prefill wall, entirely outside GEMM (the `mul_mat_id` is covered; the surrounding argsort/adds/mul are not) and outside GDN. **Lever: a fused top-k-weighted expert-output accumulation (or a fused-MoE epilogue), removing the 7-add fan-in and the separate weight mul.** Bit-exact-gateable (it is an fp32 reduction-order change, same precedent as the paged-MoE `8cb0ce23`).
|
||||
|
||||
### Newly-identified lever 2 - the W4A4 activation-quant pass (a vLLM-asymmetry, not just a kernel-speed gap)
|
||||
Every NVFP4 GEMM (MMQ today, and the new 0034 FP4-MMA) **quantizes activations to e4m3 (amax/6 + code search) before the matmul** - a distinct, M-proportional kernel. vLLM on **sm_121 falls back to W4A16-Marlin** (the TENSORCORE_GDN_SCOPE confirms this: no tcgen05/cutlass-FP4 on GB10), i.e. **f16 activations, zero activation-quant**. So this pass (~3-6% of prefill) is a structural cost vLLM avoids, and it explains part of why even a peak FP4-MMA GEMM will not fully reach vLLM's prefill. The README's "act-quant FLAT" and "W4A16 rejected" verdicts are **decode/BW-bound findings**; in compute-bound prefill the trade is different and unaudited. **Lever: measure this quant bucket as its own nsys row; consider fusing the activation-quant into the GEMM prologue (cp.async + in-register quant) so it is not a separate global-memory pass.**
|
||||
|
||||
### Flag 3 - FlashAttention prefill (context-dependent, currently under-measured)
|
||||
The 10-16 full-attention layers' QK^T·softmax·PV is a separate kernel covered by neither lever. It is small at the benchmark's npp=128 but **grows as L²**; at the long contexts the decode-serving work targets it can become a real bucket. The whole prefill ground-truth (232/68) was taken at one ubatch size - **re-profile FA share at the real serving prefill lengths** before assuming it is negligible.
|
||||
|
||||
### Confirmed inside the existing levers (not new)
|
||||
- The 0042 "11% gating muls" and all the GDN small-matmuls/`solve_tri`/cumsum/layout-conts are **inside the GDN bucket** - the tensor-core GDN kernel subsumes them; they are only "live and uncovered" *today* because patch 0031 is default-off and losing at C=16.
|
||||
- Shared-expert dense GEMMs, GDN/attention projections = **GEMM lever** (the FP4-MMA 0034 path already routes them).
|
||||
|
||||
## Bottom line
|
||||
Two prefill levers (GEMM, GDN) are correctly the top-2 and own ~the gap's majority, but they are **not** the whole gap. The op-walk surfaces **MoE router+combine/scatter** and the **W4A4 activation-quant pass** as genuine, currently-untracked prefill contributors on the MoE decision model (~8-14% combined), plus **FA prefill** as a context-dependent risk the npp=128 bench hides. Per the methodology, step 0 is an nsys prefill-only window that explicitly breaks out `argsort/add(combine)`, `quantize_mmq_nvfp4`, and `flash_attn` as separate rows to size these three before funding a kernel.
|
||||
|
||||
Relevant files: `/home/mudler/_git/LocalAI/.claude/worktrees/feat+paged-attention/backend/cpp/llama-cpp-localai-paged/docs/{PREFILL_GEMM_SCOPE.md,PREFILL_GEMM_RESULTS.md,TENSORCORE_GDN_SCOPE.md,final_benchmark.csv}`, `/home/mudler/_git/LocalAI/.claude/worktrees/feat+paged-attention/backend/cpp/llama-cpp-localai-paged/patches/paged/0042-feat-paged-fused-residual-add-RMS-norm-weight-multip.patch`, and the graph source `/home/mudler/_git/LocalAI/backend/cpp/llama-cpp-paged-dev/src/models/{qwen35moe.cpp,delta-net-base.cpp}` + `/home/mudler/_git/LocalAI/backend/cpp/llama-cpp-paged-dev/src/llama-graph.cpp` (build_moe_ffn ~1500-1834, build_attn ~2136-2189).
|
||||
|
||||
## 2. Decode-serving compute hypotheses (ranked)
|
||||
|
||||
RANKED DECODE-SERVING GPU-COMPUTE HYPOTHESES (paged llama.cpp vs vLLM, MoE Qwen3.6-35B-A3B-NVFP4 on GB10)
|
||||
|
||||
Grounding facts that constrain the ranking:
|
||||
- The gap is empirically MoE-specific: dense static is parity-to-ahead, MoE static is 89-93% of vLLM, but MoE *burst* serving is ~66% (n=128: paged 4.53 vs vLLM 6.87 tok/s/seq). So whatever degrades is on a path that hurts MoE far more than dense.
|
||||
- It is GPU-compute-bound, NOT host/reuse-bound: padded-shape lever rejected, baseline reuse 0% statistically equal to S1+S3 reuse 72% on aggregate tok/s, hostproc only 4-8% of wall. So the host loop (0040/0041/S2) is closed; the residual lives in per-step kernel time.
|
||||
- The decode KERNELS tie vLLM at a fixed WIDE lockstep shape (static batched-bench). The serving loss is therefore about how a RAGGED/NARROW/fluctuating live batch (varying decoder count D, ragged KV lengths, ragged token->expert assignment) feeds those same kernels, vs how gracefully vLLM's kernels degrade at the same concurrency. This is exactly the Phase-0 "re-scope" branch in DECODE_SERVING_SCOPE.md ("serving runs a worse effective batch shape into the kernels").
|
||||
|
||||
Decisive measurement that arbitrates all of these (run first): nsys a clean steady-state serving window (serve_bench staggered ~128 clients through llama-server, LLAMA_KV_PAGED=1 + LLAMA_MOE_FORCE_GRAPHS=1, -fa on -ngl 99) AND the same nsys on vLLM at the same concurrency (both-engine rule). Decompose per-step GPU-kernel-time into buckets {MoE-expert-GEMM (MUL_MAT_ID), full-attn FA, GDN recurrence, bf16 projections, activation-quant, sampling/logits} and compare serving-narrow vs static-wide vs vLLM. The bucket whose per-useful-token time grows MOST going static->serving (relative to vLLM's same bucket) is the gap. Avoid the known window artifact; measure a steady span. Reference doc: backend/cpp/llama-cpp-localai-paged/docs/DECODE_SERVING_SCOPE.md.
|
||||
|
||||
---
|
||||
|
||||
H1 (TOP) - MoE expert GEMM collapses to per-expert GEMV at ragged/narrow serving width, plus risk of the host-sync sorted per-expert fallback.
|
||||
- Mechanism: top-8 of 256 experts. Tokens/expert ~= D*8/256. Static npl128 -> ~4 tok/expert; serving burst-tail D->8 -> ~0.25 tok/expert, so most active experts get 0-1 tokens. The grouped MMQ id-GEMM's per-expert M collapses to 1 -> pure GEMV that reads the full FP4 expert weight (memory-bound, weight bytes unamortized) and re-loads per-expert scales. This is the "256 tiny-expert weight bandwidth" README s5 names as the residual. Separately, patch 0025 only keeps CUDA graphs on for the should_use_mmq grouped path; any serving step where MUL_MAT_ID ne[2]>8 (mmvq_mmid_max) AND should_use_mmq returns false falls to the per-expert host-loop fallback that cudaStreamSynchronizes per expert (the [TAG_MUL_MAT_ID_CUDA_GRAPHS] disable) - catastrophic, and serving's varying per-step shapes can trip it unevenly.
|
||||
- Why slower than vLLM: vLLM runs ONE fused MoE GEMM with sorted_token_ids/expert_ids computed on-GPU (fused_moe / Marlin-MoE), a single persistent launch that keeps the grouped GEMM dense and amortizes launch + scale loads; it degrades gracefully at small M. llama issues a grouped MMQ that, at ragged narrow width, is many near-empty expert tiles each re-reading scales, and can drop to a host-synced loop.
|
||||
- nsys metric to confirm: (a) MUL_MAT_ID kernel-time as % of per-step GPU wall, static-wide vs serving-narrow vs vLLM; (b) the tokens-per-expert (M) distribution per step - look for M->1 GEMV collapse and achieved FLOP/s vs M; (c) count cudaStreamSynchronize / per-expert cudaMemcpy *between* MUL_MAT_ID launches per step (host-sync fallback firing); (d) vLLM single fused-MoE kernel duration at same concurrency.
|
||||
- Candidate fix: a fused grouped-NVFP4 MoE decode GEMM with on-GPU token sorting (device-computed sorted token offsets + expert ids) so all active experts share one persistent launch and scales amortize - i.e. port vLLM's fused-MoE dispatch shape onto the FP4-MMA MMQ id-path; as a floor, extend 0025 to GUARANTEE the grouped should_use_mmq path for every serving shape so the host-sync loop never fires. Bit-exact-gateable (graph-replay/grouped path re-issues identical kernels).
|
||||
|
||||
H2 - Paged full-attention decode kernel: ragged-KV load imbalance, no tensor cores, indirect block-table reads.
|
||||
- Mechanism: the full-attn layers (10 of 40 on the MoE model, 16 of 64 on dense) run the paged block-table FA decode, pinned by the 0010/0011 dispatch guard to vec/tile and NEVER the mma/wmma tensor-core FA (a present block table routes only to vec/tile; tile loads half2, F16 cache only). Static bench: all sequences one KV length -> balanced. Serving: KV lengths are ragged (each request at a different position), so per-sequence attention work is imbalanced across the grid and the step waits on the longest-context tail; there is no KV-dimension split. Every K/V access is an indirect physical-cell load via the block table (gather-like), less coalesced than a contiguous read.
|
||||
- Why slower than vLLM: vLLM PagedAttention v2 uses a split-K / partitioned reduction designed for ragged long contexts (flash-decoding style) that balances work and lifts occupancy on the tail, and keeps the contiguous-within-block layout. llama's vec/tile paged read has no KV split and leaves tensor cores idle on the full-attn layers.
|
||||
- nsys metric to confirm: FA-decode (vec/tile) kernel duration vs KV-length VARIANCE across the live batch (does it scale with max-KV/tail rather than mean-KV?); tensor-core-active-% during FA layers (expect ~0); achieved memory-BW of the FA kernel under ragged KV; vLLM paged-attn kernel time + util at same concurrency.
|
||||
- Candidate fix: a KV-split (flash-decoding / split-K) paged FA decode so long sequences are partitioned across blocks for balance + occupancy; longer term a tensor-core paged FA for the full-attn layers (mma.sync down-translation, same approach as the GDN tensor-core scope). At minimum a per-sequence work-balanced launch.
|
||||
|
||||
H3 - GDN/SSM recurrence decode kernel under-occupied at narrow/variable serving width.
|
||||
- Mechanism: patch 0022 tuned the recurrence (NUM_WARPS=16, COLS_PER_WARP=8, grid.z = S_v/(NW*CPW)) for the WIDE B=128 lockstep batch; its DRAM-latency coverage / memory-level parallelism (MLP) needs ~128 independent sequence-states in flight, and it is bandwidth-bound (re-streams the 128x128 f32 state per sequence per step at 84.6% of peak BW *at B=128*). In serving, D fluctuates and collapses in the burst tail; at low D the kernel is grid-starved (few independent states), achieved-BW falls below the tuned point and per-token state traffic rises - the same grid-starvation failure mode the chunked-prefill kernel hit at low n_seqs. Plus the serial-SSM host loop (README s2d/s5 structural floor) is amortized over fewer tokens.
|
||||
- Why slower than vLLM: vLLM's fused_recurrent_gated_delta_rule + its scheduler keep the recurrence fed at small batch; llama's fixed B=128-tuned launch params under-saturate when D is small.
|
||||
- nsys metric to confirm: gated_delta_net kernel achieved-BW (GB/s) and occupancy as a function of live D in serving vs the static 84.6%@B128 baseline; recurrence kernel time/token vs D; grid occupancy at the burst tail.
|
||||
- Candidate fix: width-adaptive recurrence launch params - auto-select NUM_WARPS/COLS_PER_WARP (already env GDN_NW/GDN_CPW) by live D so the grid stays saturated at narrow width; bit-exact-safe (0022's column assignment is provably independent of visit order). Longer term the chunked/register-resident state scan cuts state traffic.
|
||||
|
||||
H4 - Continuous-batch ragged-shape overhead: every kernel sized to the batch union/max; bf16 projections become GEMV at narrow D (umbrella + the "bf16-projection bandwidth" half of README's stated residual).
|
||||
- Mechanism: ragged positions/lengths/expert-assignments mean each per-step kernel is launched for the max/union over the live batch, so useful-token efficiency < lockstep. This is the shared root of H1-H3 but is worth isolating because it also covers the q/k/v/gate/o projections (deliberately kept bf16, per README s5) which at narrow D become GEMV-like memory-bound weight reads - the "bf16-projection bandwidth" residual vLLM also pays but amortizes over a steadier batch.
|
||||
- Why slower than vLLM: vLLM's scheduler holds a steadier/denser decode batch (padded bucketed decode + chunked-prefill interleave) so its projection/attn GEMMs run at higher effective M; llama's batch width fluctuates more.
|
||||
- nsys metric to confirm: GPU-busy% in a steady serving window vs static (expect lower in serving) and (sum useful-token FLOPs)/(kernel-time) serving vs static; bf16 projection GEMM achieved FLOP/s vs M (GEMV collapse at small D).
|
||||
- Candidate fix: largely subsumed by fixing H1-H3 at the kernel level. Note: holding D high via admission was effectively probed by the padded-shape lever and REJECTED for throughput (the completion-driven shrink is itself a per-survivor win); so do NOT re-pursue width-padding - the payoff is in the per-kernel fixes.
|
||||
|
||||
H5 - Per-step sampling + logits handling across D independent sequences (low, cheap to exclude).
|
||||
- Mechanism: each live sequence has its own sampler chain run after logits land; at narrow D this fixed per-step cost (+ any D2H logits copy) is amortized over fewer tokens. vLLM batches sampling on-GPU across the whole decode batch.
|
||||
- nsys metric to confirm: sampling/logits-copy time as % of per-step wall serving vs static; D2H logits cudaMemcpy size+time; count of per-sequence sampler launches.
|
||||
- Candidate fix: single on-GPU batched sampler over [D, vocab], no per-sequence D2H. Likely small on the greedy/temp0 path; profile mainly to exclude.
|
||||
|
||||
H6 - NVFP4 activation (re)quantize fixed per-step cost at small M (lowest; likely a wash vs vLLM, no bit-exact lever).
|
||||
- Mechanism: W4A4 quantizes activations to FP4 each step; 0023 de-dups the per-expert re-quant but the base quantize + scale compute is a per-row pass amortized over fewer tokens at narrow D. README lever-3 records the act-quant tax is removable only by W4A16 (rejected) or a kernel rewrite.
|
||||
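- nsys metric to confirm: quantize-kernel time as % of step vs M, serving vs static, vs vLLM (assumed here to also FP4-quantize activations -> expected wash; if vLLM is instead on the W4A16-Marlin fallback on sm_121 it has no activation-quant step, so confirm which path it runs before treating this as a wash).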
|
||||
- Candidate fix: none bit-exact; include only so the profile can rule it out.
|
||||
|
||||
---
|
||||
|
||||
Priority for the profiling workflow: profile the MoE model (the gap model) first; bucket per-step GPU-kernel-time and A/B serving-narrow vs static-wide vs vLLM. Expect the ranking to be dominated by H1 (MoE expert GEMM at collapsed M + any host-sync fallback) and H2 (ragged-KV FA on vec/tile), with H3 (recurrence occupancy at narrow D) third; H4 is the shared root, H5/H6 are exclusion checks. The single most decisive metric is the per-step time share of MUL_MAT_ID vs FA vs gated_delta_net in serving-narrow relative to vLLM's equivalents at the same concurrency.
|
||||
|
||||
## 3. vLLM pipeline deep-dive
|
||||
|
||||
This section synthesizes the vLLM pipeline against our patch series.
|
||||
|
||||
---
|
||||
|
||||
# vLLM vs `llama-cpp-localai-paged`: kernel + scheduler deep-dive (Qwen3-style hybrid SSM+attn MoE, NVFP4, consumer Blackwell sm_121)
|
||||
|
||||
## The single most important grounding fact (changes how every GEMM row reads)
|
||||
|
||||
On **consumer Blackwell (sm_120/sm_121: DGX Spark/GB10, RTX 5090, RTX PRO 6000)** the native CUTLASS NVFP4 grouped-GEMM path is **broken** (TMA-WS grouped-GEMM init failure, CUTLASS #3096) and there is **no `tcgen05`/TMEM**. So vLLM on *this exact hardware* does **not** run a native FP4-MMA grouped GEMM - it **falls back to the Marlin BF16 kernel that dequantizes FP4->BF16 in-register**, capped at bf16-tensor-core peak (~half FP4 peak). Native FP4 (W4A4/tcgen05) and the best FlashInfer/TRT-LLM kernels are gated to **data-center Blackwell sm_100a**. This means several "vLLM advantages" assumed for B200 do **not** hold on GB10, and our native FP4-MMA path (the just-verified 103 TFLOP/s = 57.7% of FP4 peak GEMM) is potentially *ahead of* vLLM's Marlin-bf16 fallback on this part - the opposite of the usual framing.
|
||||
|
||||
## Comparison table
|
||||
|
||||
| # | Component | vLLM (this model class, sm_121 reality) | Ours (`llama-cpp-localai-paged`) | Regime | Verdict / gap |
|
||||
|---|---|---|---|---|---|
|
||||
| 1 | **Dense weight GEMM - decode** (M≤128, BW-bound) | Marlin FP4→bf16 in-register dequant (W4A4 broken→fallback); reads 4-bit weights | Native FP4-MMA MMQ (FP4 wt × Q8_1 int8 act), M≤128 tile | decode | **Parity** - both at FP4 weight-BW floor. Ours ~96-97% of vLLM, ahead at low concurrency |
|
||||
| 2 | **Dense weight GEMM - prefill** (large-M, compute-bound) | Marlin grouped/dense, async cp.async pipeline, big tiles, ~bf16 peak | MMQ small-tile, 1 CTA/SM. **New native FP4-MMA large-M kernel @103 TFLOP/s being integrated** (beats cuBLAS-bf16, bit-exact) | prefill | dequant→bf16-cuBLAS lever (0033) was **rejected** (MMQ beat it 29-49%); the native FP4-MMA kernel is the real fix and could **beat** vLLM's bf16-Marlin here |
|
||||
| 3 | **MoE expert GEMM - decode** | Marlin FP4→bf16 grouped, indirect addressing | Grouped MMQ (`mul_mat_id`), sorted expert layout, native FP4-MMA | decode | **Parity** - both BW-floor. Recurrence/GEMM are *our wins*; residual = bf16-projection BW + host loop |
|
||||
| 4 | **MoE expert GEMM - prefill** | Marlin grouped GEMM, fused, big tiles | MMQ small-tile grouped (1 CTA/SM) | prefill | **GAP (#1 prefill bottleneck per docs).** Native FP4-MMA grouped kernel is the planned fix; today MMQ is small-tile-bound |
|
||||
| 5 | **MoE routing / gather / scatter / epilogue** | Triton persistent fused-MoE: indirect token addressing, **fused gate+up + SwiGLU epilogue**, once-quantize, scatter+weighted-combine fused | Sorted per-expert layout; **NVFP4 act-quant de-dup (0023)** mirrors once-quantize; SwiGLU is **separate ops** (no fused epilogue) | both | Partial parity. **No fused gate+up+SwiGLU epilogue** (extra IO passes); matters at prefill, minor at decode |
|
||||
| 6 | **GDN / linear-attn - decode** | FLA Triton `fused_recurrent_gated_delta_rule` + `fused_sigmoid_gating_delta_rule_update` (sequential, per-step state) | Fused sequential recurrence: in-place state write-back (0018), fused state gather (0019), o_proj MMVQ→MMQ (0020), occupancy retune (0022), conv-tap gather fusion (0028) | decode | **Parity-to-win** - recurrence runs at **102.6% of vLLM bandwidth**, 84.6% of GB10 peak BW. Our strongest area |
|
||||
| 7 | **GDN / linear-attn - prefill** | FLA `chunk_gated_delta_rule`: intra-chunk products on **tensor cores** (UT-transform), ~2.5× cheaper | Tuned **sequential** scan (default); chunked parallel-scan (0031) is **opt-in + ~22% slower** (serial f32 reductions, no TC, C=16 forced by 99KB smem) | prefill | **GAP (#2 prefill bottleneck).** No tensor-core chunked GDN. Scoped (TENSORCORE_GDN_SCOPE, mma.sync only); **Gram products de-risked at 6.7-9.3× over sequential**, kernel not yet built |
|
||||
| 8 | **Causal conv1d (short conv)** | FLA `causal_conv1d_fn`/`_update` Triton | `ggml_ssm_conv_update_inplace` (0021): 5-op chain → 1 op, in-place ring | both | Parity |
|
||||
| 9 | **Full-attention - decode** (16 of 64 layers) | FlashInfer / TRT-LLM paged decode (tensor-core, cascade wrapper, FP8-KV capable) | llama.cpp FA `ggml_flash_attn_ext` with **block-table paged read** (src[5]); routed to **vec/tile** kernels | decode | Parity at decode width (vec/tile is right for small batch) |
|
||||
| 10 | **Full-attention - prefill** (large-M) | FlashInfer/TRT-LLM tensor-core prefill FA | **Forced to vec/tile** (block-table only grafted into vec/tile; mma/wmma FA ignores it, dispatch-guarded off) | prefill | **GAP (secondary).** Paged prefill full-attn gets **no tensor-core FA**. Docs rank it below MoE-GEMM/GDN, so not the dominant prefill term |
|
||||
| 11 | **Paged KV manager (full-attn)** | vLLM block manager + hybrid KV cache manager (co-sizes attn/linear blocks to equal physical bytes, anti-fragmentation) + auto prefix caching | `PagedKVManager` (FreeBlockQueue/BlockPool/COW), cross-request prefix sharing, burst-reclaim (0024) | both | **Parity** on the attn side; we lack vLLM's *unified* hybrid co-sizing (we manage SSM state separately - see #12) |
|
||||
| 12 | **Hybrid SSM-state cache mgmt** | Unified hybrid manager pages linear-attn state alongside attn KV | SSM recurrent + conv state in fixed per-seq slots, updated **in-place** (not paged; O(1)/seq) | both | Different approach, not a perf gap (recurrent state doesn't need paging); we lack unified fragmentation accounting |
|
||||
| 13 | **Sampler** | **GPU FlashInfer sorting-free sampler** (Dual-Pivot rejection sampling, single kernel, no logits sort, ~0 overhead); RejectionSampler for spec-decode | llama.cpp **host-side** sampler chain (CPU partial-sort for top-k/p) | serving | **GAP - NO EQUIVALENT.** Host sampler + D2H logits adds to the per-step host loop at high concurrency (greedy md5 bench hides it) |
|
||||
| 14 | **Scheduler / continuous batching / chunked prefill** | V1: mixed prefill+decode step, **chunked prefill default-on**, decode-prioritized `max_num_batched_tokens` budget, auto-chunk | `update_slots()` unified step, **decode-first dynamic budget** (0016, `max(n_ubatch,T−D)`), prefill budget (0013), prefix-share (0008) | serving | **Parity** - we match the chunked-prefill + decode-first token-budget design |
|
||||
| 15 | **CUDA graphs - decode** | **FULL cudagraph**: padded/bucketed decode shapes → 1 persistent captured graph per bucket → steady decode = single `cudaGraphLaunch`, zero host rebuild | S1+S3 (0040/0041) graph **reuse** keyed on bucketed block-table dims + decode-shape-stable scheduling → serving reuse 0%→**72.2%** | serving | **Partial.** We reuse, not full-capture. **Padded/fixed-slot decode (→~100% like vLLM) was built + GPU-tested + REJECTED** - serving decode here is GPU-compute-bound, so dummy-row compute > reuse recovered |
|
||||
| 16 | **CUDA graphs - prefill** | PIECEWISE cudagraph (default FULL_AND_PIECEWISE) | ggml graph rebuild per prefill step (paged data-ptr churn) | prefill | Gap, low value (prefill is compute-bound; launch overhead amortized over large M) |
|
||||
| 17 | **Speculative decoding / MTP** | **MTP head + EAGLE-style spec-decode** supported for this model class (Qwen3-Next ships an MTP module) | **None** | decode | **GAP - NO EQUIVALENT.** Biggest *unexploited* decode-throughput lever vLLM has and we don't (potential ~1.5-2× at low-medium concurrency) |
|
||||
| 18 | **KV-cache dtype** | FP8 KV cache + FP8 attention (halves KV BW) | F16 paged KV | both | Minor gap; partly offset by our overall 1.5-3× lower memory (NVFP4 weights). FP8-KV would cut KV BW further |
|
||||
|
||||
## Gaps where we have NO equivalent (ranked by value)
|
||||
|
||||
1. **Speculative decoding via the MTP head (#17).** Qwen3-Next/3.6 ships a Multi-Token-Prediction module; vLLM exploits it for spec-decode. We have nothing. This is the single largest *structural* decode-throughput lever vLLM has that is **completely absent** from our series - and unlike the kernel gaps it is not BW-floored. Highest-value greenfield item.
|
||||
|
||||
2. **Tensor-core chunked GDN prefill (#7).** vLLM's FLA `chunk_gated_delta_rule` pushes intra-chunk Gram products through tensor cores (~2.5× cheaper prefill). Our 0031 chunked kernel is opt-in and 22% *slower* (serial f32 reductions). Scoped (mma.sync-only on sm_121, no wgmma/tcgen05), Gram products de-risked at 6.7-9.3×, kernel not built. One of the two named prefill bottlenecks.
|
||||
|
||||
3. **Large-M native FP4-MMA grouped MoE GEMM (#4).** The #1 prefill bottleneck. vLLM uses Marlin-bf16 grouped (capped at bf16 peak on sm_121); our MMQ is small-tile/1-CTA-bound. The new native FP4-MMA GEMM (103 TFLOP/s, beats cuBLAS-bf16) is the integration that closes this - and because vLLM is bf16-Marlin here, a working native FP4 grouped kernel could *exceed* vLLM on this exact hardware.
|
||||
|
||||
4. **GPU fused sorting-free sampler (#13).** vLLM samples on-device (FlashInfer Dual-Pivot rejection, no logits sort); llama.cpp samples on host. Adds to the serving host loop at 128-way concurrency for top-k/p workloads. No GPU-sampler equivalent in the series.
|
||||
|
||||
5. **Fused MoE SwiGLU epilogue (#5).** vLLM fuses gate+up+SwiGLU into the grouped-GEMM epilogue (fewer IO passes). We have the act-quant de-dup (0023) but run SwiGLU as separate ops. Prefill-relevant, decode-minor.
|
||||
|
||||
6. **Tensor-core FA for the paged prefill full-attn path (#10).** Paged forces vec/tile (mma FA ignores the block table). Secondary - docs rank it below #2/#3 in the prefill budget.
|
||||
|
||||
7. **FP8 KV cache / FP8 attention (#18).** Minor; partly offset by our NVFP4 memory lead.
|
||||
|
||||
## Where we are at or ahead of vLLM (not gaps)
|
||||
|
||||
- **GDN decode recurrence (#6):** 102.6% of vLLM bandwidth - our fusion series (0018-0022, 0028) is the strongest area.
|
||||
- **Decode weight GEMMs dense+MoE (#1, #3):** at the FP4 weight-BW floor = parity; dense ahead at low concurrency. The residual MoE serving gap (~66% at n=128 burst) is a **GPU-compute** gap (vLLM's MoE decode kernel+scheduler ~1.3× on aggregate), **not** a host-loop gap that a graph-reuse/padding lever can close (padded-shape lever proved this, rejected 2026-06-28).
|
||||
- **Memory:** 1.5-3× lower than vLLM (NVFP4-resident, no persistent bf16 dequant copies).
|
||||
- **Scheduler design (#14):** chunked-prefill + decode-first budget matches vLLM's V1 model.
|
||||
|
||||
## Net assessment
|
||||
|
||||
Our **decode kernels are at parity-to-ahead** (GDN recurrence, both FP4 GEMMs at BW floor) - confirmed in the kernel regime. The two real, *named-in-docs* **prefill** gaps (MoE grouped GEMM #4, tensor-core chunked GDN #7) are being actively closed with the native FP4-MMA GEMM + the de-risked tensor-core Gram products; on consumer Blackwell specifically these can match-or-beat vLLM because vLLM is itself on a **bf16-Marlin fallback**, not native FP4. The two gaps with **no equivalent in the series at all** are **MTP speculative decoding** (highest-value, structural, decode) and the **GPU fused sampler** (serving host-loop, secondary). The serving-decode residual is GPU-compute-bound (not host/graph-reuse), so vLLM's edge there is its faster MoE decode kernel + scheduler, not something a host-side lever recovers.
|
||||
|
||||
---
|
||||
|
||||
Files read: `/home/mudler/_git/LocalAI/.claude/worktrees/feat+paged-attention/backend/cpp/llama-cpp-localai-paged/README.md` (absolute path) and, relative to that same directory, `docs/DECODE_SERVING_SCOPE.md`, `docs/PREFILL_GEMM_SCOPE.md`, `docs/PREFILL_GEMM_RESULTS.md`, `docs/TENSORCORE_GDN_SCOPE.md`.
|
||||
|
||||
Sources:
|
||||
- [vLLM Now Supports Qwen3-Next (FLA Triton kernels, hybrid KV manager, MTP)](https://blog.vllm.ai/2025/09/11/qwen3-next.html)
|
||||
- [CUTLASS #3096 - SM120 NVFP4 MoE grouped GEMM broken, FlashInfer/Marlin fallback](https://github.com/NVIDIA/cutlass/issues/3096)
|
||||
- [vLLM Quantization Kernels (NVFP4 W4A16/W4A4, Marlin, Machete)](https://deepwiki.com/bytedance-iaas/vllm/11.4-quantization-kernels)
|
||||
- [SM120 NVFP4 MoE perf report - Marlin bf16 fallback on consumer Blackwell](https://discuss.vllm.ai/t/sm120-rtx-pro-6000-nvfp4-moe-performance-report-qwen3-5-397b/2536)
|
||||
- [vLLM Attention Backends - FlashInfer/TRT-LLM default on Blackwell](https://docs.vllm.ai/en/latest/design/attention_backends/)
|
||||
- [vLLM FLA fused_recurrent_gated_delta_rule](https://docs.vllm.ai/en/latest/api/vllm/model_executor/layers/fla/ops/fused_recurrent/)
|
||||
- [vLLM Fused MoE Kernel Features](https://docs.vllm.ai/en/latest/design/moe_kernel_features/)
|
||||
- [vLLM scheduling - chunked prefill, decode-first budget, FULL_AND_PIECEWISE cudagraph](https://docs.vllm.ai/en/stable/configuration/optimization/)
|
||||
- [FlashInfer sorting-free GPU sampling (Dual-Pivot rejection)](https://flashinfer.ai/2025/03/10/sampling.html)
|
||||
- [vLLM #11394 - FlashInfer sampling kernel in V1](https://github.com/vllm-project/vllm/pull/11394)
|
||||
- [vLLM #42960 - batch-invariant GDN_ATTN for Qwen3-Next/Qwen3.6](https://github.com/vllm-project/vllm/issues/42960)
|
||||
|
||||
## 4. Novel levers
|
||||
|
||||
I've grounded myself in the four scope docs, the README patch table + benchmarks (final_benchmark.csv), the methodology doc, and the 0034 FP4-MMA / 0042 fused-residual patch headers. Verified state: prefill is the biggest gap (dense ~920 vs vLLM ~2000 t/s ≈ 44-46%; MoE ~2177 vs ~5300-6223 ≈ 35-41%); decode kernel at parity; serving decode ~65% and measured GPU-compute-bound (host/graph-reuse + padded-shape proved neutral-or-worse). Already-explored/rejected: dequant→bf16 cuBLAS (0033, rejected), bf16-tau (dropped), NVFP4 projections (KL-fail), W4A16-Marlin (rejected), graph coverage (flat), act-quant fusion on decode (flat), padded-shape decode (rejected). Below are levers that go beyond those.
|
||||
|
||||
---
|
||||
|
||||
# Candidate-lever brainstorm: closing the vLLM gap (paged Qwen3.6 NVFP4, GB10 sm_121a)
|
||||
|
||||
Organized by where the verified gap actually is. For each: mechanism / expected gain / gate (bit-exact vs KL) / risk / effort-reward. "Profile-gated" = run Phase-0 nsys before building, per the methodology.
|
||||
|
||||
## A. PREFILL (the largest gap, 35-46% of vLLM) — highest reward bucket
|
||||
|
||||
### A1. Graph-safe ragged grouped FP4-MMA MoE kernel (remove the per-expert host-sync loop)
|
||||
- **Mechanism:** 0034 lands the native FP4-MMA dense kernel but routes MoE prefill through the *per-expert host-sync loop* (a `cudaStreamSynchronize` per expert per layer — e.g. dozens-to-hundreds of syncs/layer). Replace it with ONE ragged/grouped FP4-MMA launch over the existing `expert_bounds`/`ids_dst` sorted layout (variable M per expert, single kernel). This is the follow-up 0034 itself flags.
|
||||
- **Gain:** HIGH. MoE expert GEMM is named the #1 prefill cost; this both removes the serial host syncs and unlocks kernel overlap + graph capture. The single biggest remaining prefill lever after 0034.
|
||||
- **Gate:** bit-exact by construction (same FP4 math, same K-order as the per-expert path) → greedy md5.
|
||||
- **Risk:** medium-high (ragged tiling + boundary handling, graph-safety).
|
||||
- **Effort/reward: HIGH effort / HIGH reward.** The flagged 0034 follow-up; rank #1 for prefill.
|
||||
|
||||
### A2. Multi-stream expert dispatch (cheap stepping-stone to A1)
|
||||
- **Mechanism:** before writing the full ragged kernel, run the independent per-expert FP4-MMA GEMMs on N CUDA streams instead of the serial host-sync loop, overlapping their LPDDR5x weight reads + tensor-core work.
|
||||
- **Gain:** medium (partial overlap; recovers some of the serial-sync stall without the kernel rewrite).
|
||||
- **Gate:** bit-exact (same kernel, reordered launches) → greedy md5.
|
||||
- **Risk:** medium (stream/event mgmt, not graph-safe — prefill isn't graph-replayed so OK).
|
||||
- **Effort/reward: LOW-MED effort / MED reward.** Bank this before A1.
|
||||
|
||||
### A3. Fuse MoE router → token-gather/scatter → GEMM (permutation fusion)
|
||||
- **Mechanism:** vLLM/SGLang fuse routing→permute→grouped-GEMM→unpermute. Here the activation gather (into the sorted-expert layout) and the scatter-back are separate memory passes. Read activations through `ids_dst` in the GEMM prologue and write through the inverse permutation in the epilogue → removes two full activation memory passes per MoE layer.
|
||||
- **Gain:** medium for prefill (large activation tensor); smaller for decode (0019/0028 already fuse the decode gather).
|
||||
- **Gate:** bit-exact (index indirection only, same values) → greedy md5.
|
||||
- **Risk:** medium (epilogue indexing correctness).
|
||||
- **Effort/reward: MED / MED.** Pairs naturally with A1's kernel.
|
||||
|
||||
### A4. Fused MoE FFN (up_proj → SiLU → down_proj, intermediate register/shared-resident)
|
||||
- **Mechanism:** keep the per-expert intermediate activation in shared/registers across up→act→down instead of round-tripping it to global. For large-M prefill the intermediate is big → a real BW save; also helps decode.
|
||||
- **Gain:** medium-high (removes one full intermediate read+write per expert per layer).
|
||||
- **Gate:** bit-exact if SiLU + accumulation order preserved → greedy md5 (else KL-gate).
|
||||
- **Risk:** HIGH (fused FP4 FFN kernel is complex; register pressure on sm_121a).
|
||||
- **Effort/reward: HIGH / MED-HIGH.** Strong but expensive; sequence after A1.
|
||||
|
||||
### A5. Activation-quant fusion into the 0042 residual/RMSNorm epilogue (prefill)
|
||||
- **Mechanism:** the README's "act-quant fusion FLAT" verdict was *decode-only*. For prefill the W4A4 activation-quantize pass is a bigger tensor. 0042 already fuses residual-add+RMSNorm+mul; extend its epilogue to emit the FP4-quantized activation the next GEMM consumes, removing a dedicated act-quant read+write.
|
||||
- **Gain:** low-medium for prefill.
|
||||
- **Gate:** bit-exact (same `quantize_mmq_nvfp4` math, just fused) → greedy md5.
|
||||
- **Risk:** medium (epilogue + the FP4 codepath coupling).
|
||||
- **Effort/reward: MED / LOW-MED.** Cheap-ish add-on once 0034/A1 are in.
|
||||
|
||||
### A6. Stream-K / split-K for the FP4 prefill GEMM (SM occupancy on few-SM GB10)
|
||||
- **Mechanism:** GB10 has relatively few SMs. For layers whose output grid (⌈M/128⌉×⌈N/128⌉) is smaller than the SM count, SMs idle. Stream-K splits the K dimension across CTAs with a reduction, keeping all SMs busy.
|
||||
- **Gain:** medium for small-output-grid layers (profile-gated — only if 0034's grid under-fills the GPU).
|
||||
- **Gate:** bit-exact if the f32-accumulate reduction order is fixed/deterministic; otherwise KL-gate.
|
||||
- **Risk:** medium (reduction correctness, workspace).
|
||||
- **Effort/reward: MED / MED.** Complements 0034; profile first.
|
||||
|
||||
### A7. Prefill CUDA-graph capture (follow-on to A1)
|
||||
- **Mechanism:** with fixed prefill chunk size (0013/0016 budgets already exist) and A1 removing the host-sync MoE loop, the whole prefill chunk becomes graph-capturable.
|
||||
- **Gain:** LOW marginal — prefill kernels are large so launch overhead is amortized; the value is mostly *enabling* it (which A1 already does). Record as low-reward, not a standalone lever.
|
||||
- **Gate:** bit-exact.
|
||||
- **Effort/reward: LOW / LOW.** Note, don't prioritize.
|
||||
|
||||
## B. DECODE-SERVING (~65% of vLLM aggregate, measured GPU-compute-bound)
|
||||
|
||||
### B1. Speculative decoding, greedy = bit-exact (SSM-state rollback is the crux) ⭐ novel
|
||||
- **Mechanism:** draft γ tokens (small draft model, or prompt-lookup/n-gram for zero extra weights), verify in one target forward. At **temp=0 the accepted tokens are argmax-identical to non-spec → the greedy md5 gate PASSES by construction** (lossless). This is the rare throughput-multiplier that's bit-exact-compatible. Especially powerful at low concurrency where paged is farthest below vLLM (n=8 burst: paged 28 vs vLLM 45) and the GPU is underutilized.
|
||||
- **The non-obvious crux:** hybrid-SSM rollback. KV rollback under paged is easy (truncate blocks). But the gated-DeltaNet recurrent state is updated **in-place** (patch 0018), so a rejected draft requires restoring the 128×128 f32 state per layer to the last accepted position — snapshot-before-speculate (memory+BW cost) or recompute. This SSM-state checkpoint/restore is the real engineering risk and is why naive llama.cpp spec-decode plumbing won't transfer.
|
||||
- **Gain:** HIGH (2-3x at favorable acceptance/low concurrency).
|
||||
- **Gate:** **bit-exact for greedy** (md5 holds); distribution-preserving (KL-gate) for temp>0.
|
||||
- **Risk:** HIGH (SSM snapshot/rollback, draft integration with paged KV + recurrent state, acceptance tuning).
|
||||
- **Effort/reward: HIGH / HIGH.** Biggest novel decode lever; start with zero-draft prompt-lookup to de-risk the rollback plumbing before adding a draft model.
|
||||
|
||||
### B2. FP8 / quantized paged KV cache
|
||||
- **Mechanism:** decode is BW-bound; quantizing the paged KV (llama.cpp already has q8_0/q4_0 `--cache-type-k/v`) halves the KV-gather BW and **doubles effective KV capacity → higher max concurrency**. Wire the existing quantized-KV FA-vec path through the paged block-table read (0009/0010). Matches a vLLM feature (fp8 KV).
|
||||
- **Gain:** medium-high for long-context / high-concurrency decode.
|
||||
- **Gate:** KL-gate (KV quant changes attention numerics; watch long-context recall), per the `8cb0ce23` precedent.
|
||||
- **Risk:** medium (paged FA-read FP8 path; precision on long context).
|
||||
- **Effort/reward: MED / MED-HIGH.**
|
||||
|
||||
### B3. Coalesced paged-KV block layout for the in-kernel decode gather
|
||||
- **Mechanism:** decode is at the LPDDR5x floor, so *effective* BW depends on coalescing. vLLM lays K as `[blocks, kv_heads, head_size/x, block_size, x]` precisely to coalesce the FA read. Re-lay-out the paged blocks so 0009/0010's in-kernel gather issues fully-coalesced vectorized loads matching the FA kernel's access pattern.
|
||||
- **Gain:** medium (profile-gated: measure the FA-read achieved-BW / sector efficiency first).
|
||||
- **Gate:** bit-exact (pure memory layout, identical values) → greedy md5.
|
||||
- **Risk:** medium (touches paged KV manager + FA read).
|
||||
- **Effort/reward: MED / MED.** Profile before building.
|
||||
|
||||
### B4. Megakernel / persistent decode (single-launch fused decode step)
|
||||
- **Mechanism:** fuse the per-layer decode ops into one persistent kernel that loops layers internally (à la Mirage/MPK persistent megakernel), eliminating inter-op launch overhead, inter-op global round-trips, and the host loop for the decode step; keep the recurrent state resident across the step.
|
||||
- **Gain:** potentially high for the GPU-compute-bound serving regime (kills launch/scheduling bubbles vLLM avoids). Honest caveat: at 27-35B the activations don't fit SMEM across layers, so the win is mostly launch-overhead + scheduling, less data-residency.
|
||||
- **Gate:** in principle bit-exact (same ops/order) but extremely hard to guarantee → realistically KL-gate.
|
||||
- **Risk:** VERY HIGH (essentially re-implements the decode forward as one kernel).
|
||||
- **Effort/reward: VERY HIGH / HIGH.** The swing-for-the-fences lever; only after cheaper decode levers are exhausted.
|
||||
|
||||
### B5. Pipeline sampling off the decode critical path
|
||||
- **Mechanism:** the doc names the "serial-SSM host loop / sampling can't start until logits land" as a floor. S2 (double-buffer set_inputs) was dropped because set_inputs is cheap — but the *sampling stall* between steps is different. Overlap step N's sampling + step N+1's input build with the GPU launch, so the GPU never idles waiting on host sampling.
|
||||
- **Gain:** medium (recovers the inter-step sampling bubble; this is the precise residual S2 didn't target).
|
||||
- **Gate:** bit-exact (host reordering only) → greedy md5.
|
||||
- **Risk:** medium (ordering correctness vs the recurrent in-place state).
|
||||
- **Effort/reward: MED / MED.**
|
||||
|
||||
### B6. Co-batch chunked prefill INTO decode steps (vLLM-style GPU saturation — flips S3) ⭐ reframe
|
||||
- **Mechanism:** S3 deliberately keeps prefill *out* of decode steps (for graph reuse). But the later measurement proved serving decode is **GPU-compute-bound, not host-bound** — which *removes S3's rationale*. vLLM does the opposite: mixes small prefill chunks into decode steps to fill otherwise-idle GPU at low decode width. Test co-batching a sized prefill chunk with decode to use spare SMs.
|
||||
- **Gain:** medium at low-to-mid decode width (better GPU utilization).
|
||||
- **Gate:** bit-exact (same math, scheduling only) → greedy md5.
|
||||
- **Risk:** low-medium (it partially contradicts S3 — A/B them; the GPU-compute-bound finding says S3's reuse benefit is ~nil here, so co-batching likely wins).
|
||||
- **Effort/reward: LOW-MED / MED.** Cheap A/B with high information value (directly tests the regime conclusion).
|
||||
|
||||
### B7. Adaptive-width bucketed decode graph (doc-sanctioned revisit)
|
||||
- **Mechanism:** the rejected padded-shape lever used fixed pad-to-`--parallel`; the doc explicitly leaves the door open for *adaptive* width (round up to next small bucket 8/16/32/64).
|
||||
- **Gain:** LOW on GB10 — the same doc measured serving decode GPU-compute-bound, so graph reuse buys ~nothing here. Record as: revisit ONLY if the host loop is re-confirmed dominant on other hardware.
|
||||
- **Gate:** bit-exact.
|
||||
- **Effort/reward: MED / LOW (on GB10).** Note, don't build for GB10.
|
||||
|
||||
## C. CROSS-CUTTING / aggregate-throughput reframes
|
||||
|
||||
### C1. Exploit the 1.5-3x memory advantage for higher max concurrency ⭐ reframe
|
||||
- **Mechanism:** the benchmark stops at npl=128 where both engines fit. With 1.5-3x lower memory (and synergistic with B2 FP8-KV), the paged backend can serve npl=256+ in the same VRAM where vLLM OOMs. Per-stream tok/s gap is irrelevant if paged sustains 2x the concurrent streams per GPU — aggregate tok/s/GPU can match or beat vLLM.
|
||||
- **Gain:** HIGH for aggregate throughput-per-GPU at the memory ceiling (a legitimate, honestly-labeled "different operating point," not a per-stream parity claim).
|
||||
- **Gate:** bit-exact (no numeric change) → greedy md5.
|
||||
- **Risk:** low (scheduler/admission tuning to actually pack the streams).
|
||||
- **Effort/reward: LOW / HIGH.** Cheapest high-reward lever — measure aggregate at max-concurrency, pair with B2.
|
||||
|
||||
---
|
||||
|
||||
## Ranked summary (effort vs reward)
|
||||
|
||||
| # | Lever | Regime | Gate | Effort | Reward |
|
||||
|---|-------|--------|------|--------|--------|
|
||||
| C1 | Higher max-concurrency via memory advantage (+B2) | aggregate | bit-exact | LOW | **HIGH** |
|
||||
| A1 | Graph-safe ragged grouped FP4-MMA MoE kernel | prefill | bit-exact | HIGH | **HIGH** |
|
||||
| B1 | Speculative decode (greedy=bit-exact; SSM rollback crux) | decode | bit-exact (greedy) | HIGH | **HIGH** |
|
||||
| A2 | Multi-stream expert dispatch (→A1) | prefill | bit-exact | LOW-MED | MED |
|
||||
| B6 | Co-batch chunked-prefill into decode (flips S3) | serving | bit-exact | LOW-MED | MED |
|
||||
| B2 | FP8/quantized paged KV cache | decode | KL-gate | MED | MED-HIGH |
|
||||
| A3 | MoE router+gather+GEMM permutation fusion | prefill | bit-exact | MED | MED |
|
||||
| B3 | Coalesced paged-KV layout for decode gather | decode | bit-exact | MED | MED |
|
||||
| B5 | Pipeline sampling off decode critical path | serving | bit-exact | MED | MED |
|
||||
| A4 | Fused MoE FFN (up+SiLU+down resident) | prefill+decode | bit-exact | HIGH | MED-HIGH |
|
||||
| A6 | Stream-K/split-K FP4 prefill GEMM | prefill | bit-exact/KL | MED | MED |
|
||||
| A5 | Act-quant fusion into 0042 epilogue (prefill) | prefill | bit-exact | MED | LOW-MED |
|
||||
| B4 | Megakernel/persistent decode | decode | KL-gate | VERY HIGH | HIGH |
|
||||
| A7 | Prefill CUDA-graph capture (→ enabled by A1) | prefill | bit-exact | LOW | LOW |
|
||||
| B7 | Adaptive-width bucketed decode graph | serving | bit-exact | MED | LOW (GB10) |
|
||||
|
||||
**Suggested attack order:** (1) **C1** — near-free aggregate win exploiting the memory advantage, immediately defensible. (2) **A2→A1** — the prefill MoE GEMM is the biggest single gap and 0034 already flags A1. (3) **B6** — cheap A/B that directly tests/exploits the "serving is GPU-compute-bound" conclusion. (4) **B1** — the highest-ceiling decode lever, but gate the SSM-state rollback plumbing first via zero-draft prompt-lookup. (5) **B2/B3/B5** as the BW + bubble cleanup. (6) **A4 / B4** as the high-effort structural swings only if the cheaper levers leave a funded gap.
|
||||
|
||||
**Two highest-value non-obvious insights:** (a) speculative decoding is *bit-exact under greedy* (md5 passes by construction) — the only throughput-multiplier compatible with the sacred gate — but its hybrid-SSM in-place-state rollback (patch 0018) is the unsolved crux. (b) the measured "serving decode is GPU-compute-bound" finding **invalidates S3's keep-prefill-out rationale** and argues for the *opposite* (B6 co-batching, vLLM-style), plus reframes the win toward aggregate-per-GPU concurrency (C1) rather than per-stream parity.
|
||||
|
||||
Relevant files: `/home/mudler/_git/LocalAI/.claude/worktrees/feat+paged-attention/backend/cpp/llama-cpp-localai-paged/docs/{DECODE_SERVING_SCOPE,PREFILL_GEMM_SCOPE,PREFILL_GEMM_RESULTS,TENSORCORE_GDN_SCOPE}.md`, `.../README.md` (s4 benchmarks, s5 rejected levers), `.../docs/final_benchmark.csv`, `.../patches/paged/0034-feat-paged-native-NVFP4-W4A4-FP4-MMA-large-M-prefill.patch` (A1 is its flagged follow-up), `.../patches/paged/0042-feat-paged-fused-residual-add-RMS-norm-weight-multip.patch` (A5 extends it).
|
||||
|
||||
## 5. Synthesized prioritized lever map
|
||||
|
||||
# Prioritized Lever Map - vLLM Parity, Qwen3.6 NVFP4 on GB10 (sm_121a)
|
||||
|
||||
## Bottom line (where the gap actually is)
|
||||
- **Prefill is the largest absolute gap**: dense ~44-48% of vLLM, MoE (decision model) ~29-41%. Two buckets own ~71% of the wall (NVFP4 GEMM ~49%, chunked GDN ~22%); the op-walk surfaces **three uncovered residuals** (MoE router/combine, prefill act-quant, FA-at-length).
|
||||
- **Decode kernels are at parity-to-ahead** (GDN recurrence 102.6% of vLLM BW; both FP4 GEMMs at the BW floor). **Decode-*serving* is the still-open gap** (paged at ~66% of vLLM at n=128 burst); it is **MoE-specific** and **GPU-compute-bound** (host-loop/graph-reuse/padded-shape all proved neutral-or-worse, so they are closed).
|
||||
- The two structural levers vLLM has that the series has **no equivalent for**: **MTP speculative decode** and **GPU fused sampler**. On *this* hardware vLLM is itself on a **bf16-Marlin FP4 fallback** (no tcgen05/CUTLASS-grouped), so a working native FP4 path can **match-or-beat** it, not just chase it.
|
||||
|
||||
## Single highest-leverage NEXT action for the still-open decode-serving gap
|
||||
**Run the both-engine steady-state serving nsys window FIRST (it is the gate before any decode kernel is funded).** Stagger ~128 clients through `llama-server` (`LLAMA_KV_PAGED=1 LLAMA_MOE_FORCE_GRAPHS=1 -fa -ngl 99`) and the identical concurrency on vLLM; bucket per-step GPU-kernel time into `{MUL_MAT_ID, FA-vec/tile, gated_delta_net, bf16-projections, act-quant, sampling}` and compare **serving-narrow vs static-wide vs vLLM**. The decisive single metric: the per-useful-token time share of `MUL_MAT_ID` vs `FA` vs `gated_delta_net` in serving relative to vLLM. **Primary hypothesis to confirm/refute: H1** - MoE grouped GEMM collapsing to per-expert GEMV at ragged width. **Also** count `cudaStreamSynchronize` calls *between* `MUL_MAT_ID` launches to catch the per-expert host-sync fallback firing. This one A/B arbitrates D2 vs D3 vs D4 (all HIGH-effort) at once, and the methodology forbids building a kernel before it. **Bank D1 (grouped-path guarantee) immediately as near-free insurance against the host-sync cliff regardless of outcome.**
|
||||
|
||||
## Master ranked lever table (pursue list)
|
||||
|
||||
| # | Lever | Gap | Gain → parity | Effort | Risk | Gate | Dependency / sequence | Status |
|
||||
|---|-------|-----|--------------|--------|------|------|----------------------|--------|
|
||||
| 0 | **Phase-0 serving nsys (both-engine bucket A/B)** | decode | enabling - sizes/arbitrates H1-H4 | LOW | low | n/a | none - **do first** | NOT DONE |
|
||||
| 1 | **X1 (C1) Exploit 1.5-3× memory → serve npl=256+ where vLLM OOMs** | aggregate | **HIGH** (different operating point: aggregate tok/s/GPU) | LOW | low | BE | pairs w/ D6; admission tuning | NOT STARTED |
|
||||
| 2 | **P1 Native FP4-MMA large-M dense GEMM (patch 0034)** | prefill | **HIGH** - GEMM ~49% of wall; can *beat* vLLM bf16-Marlin | HIGH | med | BE (md5) | foundation for P2/P8 | **IN PROGRESS (0034 scaffold landed)** |
|
||||
| 3 | **D1 Guarantee grouped MMQ path - never host-sync per-expert fallback (extend 0025)** | decode | **HIGH if firing** (removes catastrophic cliff) | LOW | low | BE | gated by #0; bank regardless | NOT STARTED |
|
||||
| 4 | **P3 Multi-stream expert dispatch (→P2)** | prefill | MED (partial overlap of serial syncs) | LOW-MED | med | BE | stepping-stone, bank before P2 | NOT STARTED |
|
||||
| 5 | **P2 (A1) Graph-safe ragged grouped FP4-MMA MoE GEMM** | prefill | **HIGH** - the #1 prefill bucket (~28% of wall) | HIGH | med-high | BE (md5) | after P1/P3; **shares kernel arch w/ D2** | **FLAGGED 0034 follow-up** |
|
||||
| 6 | **D10 (B6) Co-batch chunked-prefill into decode (flips S3)** | serving | MED (fills idle SMs at low D) | LOW-MED | low-med | BE | cheap A/B; tests "GPU-compute-bound" conclusion | NOT STARTED |
|
||||
| 7 | **P4 Tensor-core chunked GDN prefill kernel (rewrite 0031)** | prefill | **HIGH** - #2 prefill bucket (~22% of wall, ~17% of gap) | HIGH | med-high | BE→KL | Gram products de-risked 6.7-9.3× | **DESIGN SCOPED, kernel NOT built** |
|
||||
| 8 | **D2 (H1) Fused grouped-NVFP4 MoE decode GEMM + on-GPU token sort** | decode | **HIGH** - top decode hypothesis (MoE-specific) | HIGH | high | BE | gated by #0; **co-develop kernel w/ P2** | NOT STARTED |
|
||||
| 9 | **D5 (B1) Speculative decode via MTP head** | decode | **HIGH** (2-3× at low/mid concurrency) | HIGH | high | BE (greedy) / KL (temp>0) | crux=SSM in-place state rollback (0018); de-risk w/ zero-draft prompt-lookup | NOT STARTED |
|
||||
| 10 | **D6 (B2) FP8 / quantized paged KV cache** | decode | MED-HIGH (halves KV BW; doubles capacity → enables X1) | MED | med | KL (8cb0ce23 precedent) | wire quantized-KV FA-vec through paged read (0009/0010) | NOT STARTED |
|
||||
| 11 | **D3 (H2) KV-split / flash-decoding paged FA decode** | decode | MED-HIGH (ragged-KV balance + occupancy) | MED-HIGH | med | BE→KL | gated by #0 (build only if FA bucket grows) | NOT STARTED |
|
||||
| 12 | **P5 (A3+PREFILL-L1) Fused MoE router+gather+scatter+combine** | prefill | MED (~5-8% MoE wall, uncovered by P2/P4) | MED | med | BE (fp32 reorder; 8cb0ce23) | pairs w/ P2 kernel | NOT STARTED |
|
||||
| 13 | **D4 (H3) Width-adaptive GDN recurrence launch params** | decode | MED (saturate grid at narrow D) | LOW-MED | low | BE (0022 col-independence) | env GDN_NW/GDN_CPW already exists | NOT STARTED |
|
||||
| 14 | **D7 (B3) Coalesced paged-KV block layout for decode gather** | decode | MED (effective BW / sector efficiency) | MED | med | BE | profile-gated (#0 FA-read BW) | NOT STARTED |
|
||||
| 15 | **P6 (A4) Fused MoE FFN (up→SiLU→down resident)** | prefill+decode | MED-HIGH (removes intermediate round-trip) | HIGH | high | BE→KL | after P2 | NOT STARTED |
|
||||
| 16 | **D9 (B5) Pipeline host sampling off decode critical path** | serving | MED (recovers inter-step sampling bubble) | MED | med | BE | ordering vs in-place recurrent state | NOT STARTED |
|
||||
| 17 | **D8 (H5/#13) GPU fused sorting-free sampler** | serving | MED (small on greedy; matters at 128-way top-k/p) | MED | med | BE-ish | alt to D9; profile to size | NOT STARTED |
|
||||
| 18 | **P8 (A6) Stream-K / split-K FP4 prefill GEMM** | prefill | MED (small-output-grid layers on few-SM GB10) | MED | med | BE if det. else KL | profile-gated; complements P1 | NOT STARTED |
|
||||
| 19 | **P7 (A5/PREFILL-L2) Act-quant fusion into 0042 epilogue (prefill)** | prefill | LOW-MED (~3-6% prefill; vLLM avoids it entirely) | MED | med | BE (md5) | extends landed 0042; after P1 | NOT STARTED |
|
||||
| 20 | **P9 (#10/flag-3) Tensor-core paged prefill FA** | prefill | LOW-MED, **context-dependent (grows L²)** | MED-HIGH | med | BE→KL | re-profile FA share at real serving lengths first | NOT STARTED |
|
||||
| 21 | **D11 (B4) Megakernel / persistent decode** | decode | HIGH (kills launch/scheduling bubbles) | VERY HIGH | very high | KL | last resort, only if funded gap remains | NOT STARTED |
|
||||
|
||||
Gate key: BE = bit-exact (greedy md5); KL = KL-divergence gate; BE→KL = bit-exact preferred, KL fallback.
|
||||
|
||||
## Drop / closed (do NOT pursue)
|
||||
|
||||
| Lever | Why dropped |
|
||||
|-------|-------------|
|
||||
| Padded / fixed-slot decode (pad-to-`--parallel`) | Built, GPU-tested, **REJECTED** - serving decode is GPU-compute-bound; dummy-row compute > reuse recovered |
|
||||
| B7 Adaptive-width bucketed decode graph | LOW value on GB10 (same GPU-compute-bound finding); revisit only if host-loop re-confirmed dominant on other HW |
|
||||
| dequant→bf16 cuBLAS prefill (0033) | **REJECTED** - MMQ beat it 29-49%; superseded by native FP4-MMA (P1) |
|
||||
| W4A16-Marlin / NVFP4 projections (bf16→FP4) | **REJECTED** - KL-fail; vLLM keeps SAME bf16 projections, no advantage to chase |
|
||||
| bf16-tau | Dropped |
|
||||
| Act-quant fusion on **decode** (lever-3) | **FLAT** - decode is BW-bound; the prefill variant (P7) is the live one |
|
||||
| S2 double-buffer set_inputs | Dropped - set_inputs is cheap (host loop closed by 0040/0041) |
|
||||
| H6 NVFP4 act-quant decode tax | No bit-exact lever; **exclusion check only** (expected wash vs vLLM, which also FP4-quantizes) |
|
||||
| P10 (A7) Prefill CUDA-graph capture | LOW/LOW - prefill launch overhead amortized over large M; merely *enabled* by P2, not a standalone item |
|
||||
| H4 ragged-shape umbrella | Not a lever - it is the shared *root* of H1-H3; fixed by D2/D3/D4 at the kernel level |
|
||||
| H5 (as exclusion) / H6 | profile-only rule-outs, not builds (D8 is the actual sampler lever) |
|
||||
|
||||
## Critical-path sequence (two parallel tracks per the multi-agent GPU methodology)
|
||||
|
||||
**Decode-serving track (gated):** #0 serving nsys → bank #3 (D1) → branch on the dominant bucket: if MUL_MAT_ID-GEMV → #8 (D2); if FA → #11 (D3); if recurrence → #13 (D4). In parallel, cheap A/Bs #6 (D10) and #1 (X1). Highest-ceiling greenfield #9 (D5) once SSM-rollback de-risked via zero-draft prompt-lookup. BW cleanup #10 (D6, synergistic with X1).
|
||||
|
||||
**Prefill track (already moving):** #2 (P1, in progress) → #4 (P3) → #5 (P2) - and **co-develop the P2 ragged-grouped kernel with the D2 decode kernel** (one fused-MoE dispatch that degrades gracefully across M = vLLM's single fused_moe shape). In parallel #7 (P4, design ready). Then the residual-coverage adds #12 (P5), #15 (P6), #19 (P7). Profile-gated #18 (P8), #20 (P9).
|
||||
|
||||
**Two highest non-obvious insights to act on:** (a) the P2 prefill kernel and the D2 decode kernel are the **same kernel** (on-GPU token sort + single persistent grouped FP4-MMA launch) at different M - fund them as one effort. (b) the "serving decode is GPU-compute-bound" finding **invalidates S3's keep-prefill-out rationale** - #6 (D10 co-batching, vLLM-style) and #1 (X1 aggregate concurrency) are the cheap wins that follow from it, and are higher-reward-per-effort than any further host-side or graph-reuse work.
|
||||
|
||||
Relevant files (all absolute): `/home/mudler/_git/LocalAI/.claude/worktrees/feat+paged-attention/backend/cpp/llama-cpp-localai-paged/docs/{DECODE_SERVING_SCOPE.md,PREFILL_GEMM_SCOPE.md,PREFILL_GEMM_RESULTS.md,TENSORCORE_GDN_SCOPE.md,final_benchmark.csv}`, `.../README.md`, `.../patches/paged/0034-feat-paged-native-NVFP4-W4A4-FP4-MMA-large-M-prefill.patch` (P1/P2), `.../patches/paged/0042-feat-paged-fused-residual-add-RMS-norm-weight-multip.patch` (P7), `.../patches/paged/0031` (P4), `0025` (D1), `0018/0022` (D4/D5), `0009/0010` (D3/D6/D7); graph source `/home/mudler/_git/LocalAI/backend/cpp/llama-cpp-paged-dev/src/{models/qwen35moe.cpp,models/delta-net-base.cpp,llama-graph.cpp}`.
|
||||
|
||||
---
|
||||
|
||||
# PROFILE-VALIDATED PATH (both-engine nsys, adversarially verified Sun Jun 28 11:55:12 PM UTC 2026)
|
||||
|
||||
## Prefill gap decomposition (paged 396 vs vLLM 197 us/tok)
|
||||
All 4 runs ran on DGX (GB10) via ssh dgx.casa; GPU lock held+released, GPU restored idle. Model = decision MoE Qwen3.6-35B-A3B-NVFP4 (paged GGUF vs q36-35b-a3b-nvfp4-vllm). Buckets = % of GPU-kernel wall (nsys cuda_gpu_kern_sum), and per-prefill-token us.
|
||||
|
||||
PAGED MoE PREFILL (npp512 ntg4 npl32, LLAMA_KV_PAGED=1 +LLAMA_MOE_FORCE_GRAPHS=1): S_PP=2417.8 t/s; kernel 6.485s/16384 tok = 395.9 us/tok. MoE-expert-GEMM(MMQ nvfp4) 26.5% | GDN 24.2% (gdn_core 17.2, gdn_gather 3.3, gdn_conv 2.7, l2norm 1.0) | layout-copy 9.8 (convert_dtype 6.3, concat 2.9) | ew-mul 8.7 | bf16-proj 8.6 | act-quant(quantize_mmq_nvfp4) 4.7 | ew-add 4.6 | silu/sigmoid-gate 4.3 | norms 3.6 | MoE-DISPATCH(argsort 0.4+mm_ids 1.1+gather_mmq 0.7) 2.2 | get_rows 1.0 | FA 0.6 | softmax 0.05 | scatter 0.06.
|
||||
|
||||
vLLM MoE PREFILL (32x512, 5 reps): S_PP=4925.8 t/s; kernel 16.138s/81920 tok = 197.0 us/tok. SURPRISE: on sm_121 vLLM runs experts as Marlin W4A16 (FP4->bf16 dequant + bf16 GEMM), NOT fused-FP4 cutlass; projections are FP8 (sm89_xmma_e4m3). ew-glue(torch elementwise) 31.7% | MoE-expert-GEMM(Marlin) 24.6 | GDN(FLA chunk_* + causal_conv) 18.5 | bf16/fp8-proj 10.4 | reduce(cumsum/softmax) 5.2 | gate 2.3 | act-quant(scaled_fp8) 1.7 | layernorm 1.7 | MoE-DISPATCH(gather/align/count_sort/argsort) 1.4 | FA 1.1.
|
||||
|
||||
Per-token gap decomposition (paged minus vLLM; total gap 198.9 us/tok = 395.9 - 197.0): GDN +59.2 (~30%), MoE-GEMM +56.5 (~28%), ew/layout/glue net +21.4 (~11%), act-quant +15.2 (~8%), bf16-proj +13.7 (~7%), gate +12.4 (~6%), norms +11.1 (~6%), dispatch +5.9 (~3%). The itemized buckets sum to 195.4 us/tok; the remaining ~3.5 us/tok is not itemized here.
|
||||
|
||||
## Decode picture (host-bound, not kernel/graph-reuse)
|
||||
3 decode profiles. KEY: paged decode KERNELS are 5.4x more GPU-efficient than vLLM's, but paged static decode is HOST-BOUND (GPU ~16% busy); vLLM is GPU-bound (99% busy) on a slow recurrent GDN. They tie at static-wide-128 (paged 782 vs vLLM ~819 t/s pure decode) via opposite regimes.
|
||||
|
||||
PAGED DECODE-SERVING (staggered 128 clients, llama-server, steady 22s window, 83.5% GPU-busy): MoE/FFN-GEMM 40.7% (mmq 34.2 + gemv_moe 4.6 + gemv 1.4) | bf16-proj 22.8 (mul_mat_f 11.1 + nvjet 9.1 + cutlass 2.5) | GDN 21.2 (gdn_core 19.9) | act-quant 2.8 | layout 2.1 | get_rows 2.0 | ew-mul 2.0 | FA 1.6 | norms 1.2 | MoE-DISPATCH 1.1 | scatter 0.2 | softmax 0.1.
|
||||
|
||||
PAGED STATIC npl=128 lockstep (PP128+TG256, ~16% GPU-busy, HOST-BOUND): kernel 7.83s/49152 tok=159 us/tok, S_TG=782 t/s. MoE-GEMM 37.5 | GDN 21.6 | layout 9.6 | bf16-proj 9.2 | ew-mul 5.5 | act-quant 4.1 | ew-add 3.4 | norms 2.5 | dispatch 1.8 | FA 0.55. cudaStreamSynchronize=43.4s (84% of API/87% of wall) vs 7.83s GPU kernel => GPU idle ~84%.
|
||||
|
||||
PAGED STATIC npl=1 (batch-1): kernel 0.20s, MEMOPS 0.44s (68% of kern+mem), cudaStreamSync 66.7% => latency/BW-bound, GPU ~4% busy.
|
||||
|
||||
vLLM 128-wide offline (PT128 GEN256, 99% GPU-busy): kernel 42.56s/49152 tok=866 us/tok. GDN 45.2% (fused_recurrent_gated_delta decode 42.8!) | MoE-GEMM(Marlin) 36.2 | bf16/fp8-proj 6.6 | ew-glue 6.3 | FA 2.1 | reduce 1.4 | dispatch 0.7.
|
||||
|
||||
Per-token decode (paged static-128 | vLLM | ratio): MoE-GEMM 59.7|313.5 paged 5.3x faster; GDN 34.3|391.7 paged 11.4x faster; bf16-proj 14.7|57.2; total 159|866 paged 5.4x less GPU.
|
||||
|
||||
H1 verdict (false): the stated mechanism - 'MUL_MAT_ID per-useful-token time growing static->serving from grouped-GEMV collapse' - is REFUTED at the kernel level. The grouped path engages correctly: at width-1 the MoE expert path is GEMV (mul_mat_vec_q), and at width>=~16 it switches to grouped MMQ (mul_mat_q nvfp4) - npl=128 is 37% MMQ/~0 GEMV, serving is 34% MMQ + 6% GEMV (gemv_moe 4.6 + gemv 1.4). It does NOT collapse to per-token GEMV. What IS confirmed (the real H1 mechanism) is HOST-SIDE SERIALIZATION: cudaStreamSynchronize dominates the static-decode wall - npl=1 66.7% of API time (~89% of wall), npl=128 84.3% of API time (43.4s sync vs 7.83s GPU kernel => GPU ~84% idle); the serving window logged 40,902 cudaStreamSynchronize calls. The grouped MMQ also runs at ragged small-M tiles (mmq_x = 16/24/32/40/48/64/80/96) because tokens-per-expert is tiny -> low tensor-core utilization (small-M MMQ, not a GEMV collapse). Mechanistically the device->host sync to read MoE routing before launching per-expert GEMMs is the serializer (task D1/#104 'no host-sync MoE path').
|
||||
|
||||
THE BIG DECODE PICTURE (most important finding): paged and vLLM have OPPOSITE decode profiles. Paged decode kernels are 5.4x more GPU-efficient (159 vs 866 us/tok) but paged static decode is host-bound (GPU ~16% busy, serial SSM+sampling+MoE-dispatch host loop); vLLM is GPU-bound (99% busy) on a recurrent GDN kernel that is 11x slower per token, but it saturates the GPU via CUDA graphs. They tie at static-wide-128 (782 vs ~819 t/s). At SERVING the paged GPU rises to 83.5% busy because overlapping request streams hide the host stalls - so the serving lever for paged is NOT faster decode kernels (they're already fast/idle) but (a) removing host serialization / graphing the whole step incl MoE dispatch, and (b) chunked-prefill: paged's 2x-slower prefill steals serving cycles during continuous batching (the gen-80-128 serving config was ~55% prefill work; the nsys'd run2 gen-256-512 ~25%). Separately, bf16 projections are a bigger paged decode bucket than expected (22.8% of the paged serving profile) because batch-1/small-batch bf16 proj uses mul_mat_f (11.1%) + nvjet (9.1%).
|
||||
|
||||
Methodology/scope: profiled with nsys --trace=cuda + cuda_gpu_kern_sum; no NVTX in either engine so buckets are by kernel-name regex (bucketer at dgx:/home/mudler/bench/bucket2.py; reports at dgx:/home/mudler/bench/profgap/). Shared elementwise (k_bin_bcast add/mul, torch elementwise) straddle resid/MoE-fanin/GDN-glue and are bucketed by dominant use with that caveat; vLLM's torch_ew (31.7% prefill) is GDN-glue+MoE-combine+resid and is genuinely ambiguous. The dense Qwen3.6-27B-NVFP4 was NOT separately profiled (time budget; the MoE decision-model contains both MoE experts AND the same GDN/attention stack, fully answering A/B/C); GDN findings generalize to dense. vLLM decode here is offline 128-wide (continuous-batched), not staggered-server, so the cross-engine serving ratio is taken from prior h2h benches (~55-80% of vLLM at npl 64-128), not a fresh staggered vLLM run. Cross-engine 'gap' numbers are GPU-kernel-time per token (apples for GPU-bound prefill; for decode the host-bound vs GPU-bound asymmetry means wall-throughput parity hides a 5.4x GPU-efficiency paged advantage).
|
||||
|
||||
## Decision
|
||||
### moe_prefill_lever
|
||||
BETTER GROUPED GEMM KERNEL (D2/#105), NOT P5 dispatch fusion. The profile settles this empirically: explicit MoE dispatch (argsort+softmax+get_rows+set_rows+mm_ids+gather_mmq) is only 8.6 us/tok (~2-3% of the paged prefill wall; +5.9 us/tok = ~3% of the gap). P5 is REJECTED as a standalone lever - and the premise it rests on ("vLLM fuses dispatch into the GEMM epilogue") is FALSE on GB10: vLLM runs Marlin W4A16 with its OWN separate dispatch kernels (count_and_sort_expert_tokens/moe_align/vectorized_gather/moe_sum, 2.7 us/tok). Dispatch is cheap in both engines; epilogue-fusing it buys ~3% at most.
|
||||
|
||||
The real lever is the grouped GEMM: paged grouped-MMQ MUL_MAT_ID is 105 us/tok vs vLLM Marlin 48.5 us/tok = 2.16x slower, ~28% of the prefill gap (+56.5 us/tok). It does NOT collapse to GEMV - the grouped path engages correctly; it loses because ragged small-M-per-expert tiles (mmq_x 16-96) under-utilize tensor cores.
|
||||
|
||||
Is it winnable given MMQ already beat our native kernel? YES in principle, but ONLY via a kernel approach we have NOT yet tried correctly. Both prior attempts failed for identifiable reasons: 0033 did dequant as a SEPARATE global-memory pass then cuBLAS (lost to fused FP4 MMQ 29-49%); 0034 native FP4-MMA W4A4 PoC did NOT hold in-backend. vLLM proves the winning shape on THIS EXACT silicon (sm_121, Marlin bf16 fallback - no native FP4) is IN-REGISTER FP4->bf16 dequant feeding bf16 mma.sync with cp.async pipelining + large/grouped tiles, and W4A16 means ZERO activation-quant. That second point is load-bearing: act-quant (quantize_mmq_nvfp4) is +15.2 us/tok = ~8% of the gap that vLLM STRUCTURALLY does not pay because it is W4A16. So a Marlin-style W4A16 grouped MoE-prefill GEMM is a combined ~36% prefill lever (GEMM 28% + act-quant 8%), and it is a DIFFERENT kernel from both rejects (not a separate-pass dequant, not native FP4-MMA). The README's "W4A16 rejected" verdict was DECODE-only (BW-bound, wash); prefill is compute-bound and the act-quant pass is M-proportional, so W4A16 for prefill is unaudited and the most promising structural fix. GATE: must beat MMQ in a SEPARATELY-BUILT in-backend A/B at the real ragged-small-M MoE-prefill shapes (NOT a standalone PoC - the exact lesson from rejecting native FP4-MMA); bit-exact via KL-gate for the bf16-dequant reduction-order change (paged-MoE 8cb0ce23 precedent).
|
||||
|
||||
### gdn_build_go
|
||||
True
|
||||
|
||||
### gdn_rationale
|
||||
GO on #101, with a Phase-1 in-backend kill-gate. The profile makes the regime check the scope doc demanded (TENSORCORE_GDN_SCOPE Phase 0) pass cleanly: (1) GDN is the #1 SINGLE contributor to the prefill gap at +59.2 us/tok (~30% of the gap), edging out MoE-GEMM (+56.5). (2) The cost is MATH-predominant, not layout/host: gdn_core (the hand-written FP32 chunked-scan, NOT tensor-core) is 17.2% of the wall; GDN-attributable layout (gdn_gather 3.3 + head-concat 2.9 + a convert_dtype slice) is only ~6-7% (~1/4). So tensor cores attack the dominant 3/4, and the 1/4 layout folds into the same fused kernel. (3) The headroom is MEASURED on identical silicon: vLLM's FLA chunked GDN runs the SAME math at 36.5 us/tok vs paged 95.7 = 2.62x, confirming the scope's "mma absorbs the O(C^2) intra-chunk flops so the Cx state-BW cut becomes a net win" mechanism. (4) Bonus dual payoff: it also chips the decode serial-SSM residual and, via continuous batching, the serving-decode lever (prefill steals ~25-55% of serving cycles).
|
||||
|
||||
CONDITION (empirical guard, not PoC-optimism): 0031's chunking math was correct yet came back 22% SLOWER in-backend, and we JUST rejected native FP4-MMA because its standalone PoC win did not hold in-backend. So GO funds Phase 1 ONLY (two Gram products on mma.cuh tf32 tiles at fixed C=16/1-block-SM); it must move S_PP in a SEPARATELY-BUILT in-backend A/B vs the sequential scan. If Phase 1 is flat, the occupancy/register wall is the blocker, not the reductions - NO-GO the multi-week Phase 2/3 build. Precision gate is the KL-gate (tf32 default, 3xtf32 ladder), greedy md5 stability, plus the adversarial g in [-20,-1e-4] decay op case; ship opt-in default-off until a separately-built A/B beats sequential.
|
||||
|
||||
### top_decode_lever
|
||||
D1/#104 - the no-host-sync MoE decode path + full-step CUDA-graph capture (graph the WHOLE decode step INCLUDING MoE dispatch), targeting the device->host MoE-routing readback. Ranked decisively by the profile, NOT by raw GPU-bucket size: the dominant decode cost is not a GPU kernel at all - it is cudaStreamSynchronize, 84% of the static-decode wall (43.4s sync vs 7.83s GPU kernel; npl=1 66.7%, npl=128 84.3% of API time; 40,902 syncs in the serving window). Root cause = the device->host sync to read MoE routing before launching per-expert GEMMs. Paged decode KERNELS are already 5.4x more GPU-efficient than vLLM's and the GPU sits 84% idle in static decode, so D1 is the only decode lever that attacks the actual bottleneck.
|
||||
|
||||
D2/D3/D4 for DECODE are all REJECTED by the methodology's "a faster kernel off the critical path benches flat" rule: D2 fused MoE decode GEMM - paged MoE-GEMM is already 5.3x faster/token than vLLM (59.7 vs 313.5 us/tok); making it faster just adds idle. D3 FA-split - FA is 1.6% of decode-serving wall / 0.55% static (H2 refuted; the hybrid is mostly GDN with few full-attn layers); not a lever. D4 GDN-width-adaptive - paged GDN decode is already 11.4x faster/token than vLLM (34 vs 392); H3 confirmed (flat across width, no amortization) but the recurrence is NOT the bottleneck, host serialization is - an occupancy retune yields ~nothing until the host loop is gone.
|
||||
|
||||
Honest scope on D1's payoff: at HIGH-concurrency serving the paged GPU is already 83.5% busy because overlapping request streams hide the host stalls, so D1's win concentrates at LOW-concurrency / latency / batch-1 (GPU 4-16% busy), where it is large. The complementary serving-throughput lever is FIXING PREFILL (GDN #101 + MoE GEMM D2/#105): paged's 2x-slower prefill steals serving cycles under continuous batching (~25-55% of the serving step is prefill work) - so the prefill levers ARE also serving-decode levers. GATE: separately-built in-backend A/B (compiled-in, so a runtime flag does NOT isolate it) showing higher static/low-concurrency decode t/s with no high-concurrency-serving regression; bit-exact greedy md5 (graph replay re-issues identical kernels).
|
||||
|
||||
### next_3_levers
|
||||
Ranked, each with its pass-gate:
|
||||
|
||||
1) #101 TENSOR-CORE mma CHUNKED GDN PREFILL KERNEL (prefill, GO). #1 prefill-gap contributor (+59 us/tok, ~30%), ~3/4 math (tensor cores help) with 2.62x measured headroom on identical silicon, 1/4 layout folds in; also helps serving decode. GATE: Phase-0 regime already satisfied by this profile; Phase-1 two-Gram-product PoC must move S_PP in a SEPARATELY-BUILT in-backend A/B vs sequential (flat => NO-GO the multi-week build); then KL-gate (tf32/3xtf32) + greedy md5 + adversarial-decay op test; ship opt-in default-off until A/B beats sequential.
|
||||
|
||||
2) D1/#104 NO-HOST-SYNC MoE DECODE PATH + FULL-STEP CUDA-GRAPH CAPTURE (decode). Attacks the cudaStreamSynchronize that is 84% of the static-decode wall (the MoE-routing device->host readback). Lowest effort, bit-exact, highest-confidence decode win (concentrated at low-concurrency/latency). GATE: separately-built in-backend A/B (not a runtime-flag toggle) - higher static/low-concurrency decode t/s, no high-concurrency-serving regression; bit-exact greedy md5.
|
||||
|
||||
3) D2/#105 MARLIN-STYLE W4A16 GROUPED MoE PREFILL GEMM (prefill). In-register FP4->bf16 dequant + bf16 mma.sync, cp.async, large grouped tiles - captures the 28% MoE-GEMM gap AND the 8% act-quant gap (W4A16 has no activation-quant), = ~36% combined; this is exactly what vLLM does on sm_121. Ranked #3 because of HIGH risk: two prior in-backend GEMM attempts failed (0033 separate-pass dequant, 0034 native FP4-MMA PoC didn't hold). GATE: must beat MMQ in a SEPARATELY-BUILT in-backend A/B at ragged-small-M MoE-prefill shapes (NOT a standalone PoC); bit-exact via KL-gate (bf16-dequant reduction order).
|
||||
|
||||
Explicitly REJECTED/deprioritized (record so they aren't re-run): P5 dispatch fusion (~3%, and the "vLLM fuses dispatch" premise is false on GB10); D2-for-decode, D3 FA-split, D4 GDN-width-adaptive (their kernels are already 5-11x faster than vLLM and GPU-idle -> bench flat); padded/fixed-slot decode (already tested+rejected, commit b028c81e).
|
||||
|
||||
### notes
|
||||
Empirical discipline applied throughout (per the just-rejected native FP4-MMA): every funded lever is gated on a SEPARATELY-BUILT in-backend A/B, never a standalone PoC - 0031 (chunking math correct, -22% in-backend) and 0034 (PoC win, didn't hold) are the two cautionary precedents. Two compiled-in levers (#101, D1) cannot be isolated by a runtime flag, so they need build-vs-build A/B (methodology hard rule).
|
||||
|
||||
Two profile surprises that reshape the directions: (a) vLLM on sm_121 is NOT native FP4 - it runs Marlin W4A16 (FP4->bf16 in-register dequant + bf16 GEMM) for experts and FP8 projections. So the winnable MoE-prefill GEMM is a W4A16-Marlin-style kernel (which also erases our 8% act-quant tax), not another native-FP4 attempt. (b) Decode is a regime asymmetry, not a kernel gap: paged decode kernels are 5.4x more GPU-efficient than vLLM's but paged static decode is HOST-BOUND (GPU 84% idle on cudaStreamSynchronize); vLLM is GPU-bound at 99% on a recurrence 11x slower/token. They tie at static-wide-128. Hence "make decode kernels faster" is the wrong instinct (benches flat); "remove host serialization / graph the full step" (D1) and "fix prefill so it stops stealing serving cycles" (#101, D2) are the decode-serving levers.
|
||||
|
||||
Cross-cutting: the prefill levers (#101 GDN, D2 MoE GEMM) double as serving-decode levers because continuous batching interleaves ~25-55% prefill work into the serving step. GDN edges MoE-GEMM as the top prefill pick (bigger gap, cleaner math mechanism, 2.6x proven headroom, lower in-backend risk, dual payoff).
|
||||
|
||||
All numbers from the both-engine nsys profile (cuda_gpu_kern_sum buckets, bucketer dgx:/home/mudler/bench/bucket2.py, reports dgx:/home/mudler/bench/profgap/); caveats: no NVTX (kernel-name regex buckets); shared elementwise straddles resid/MoE-fanin/GDN-glue; vLLM decode is offline 128-wide, not staggered-server. Relevant repo paths (absolute): /home/mudler/_git/LocalAI/.claude/worktrees/feat+paged-attention/backend/cpp/llama-cpp-localai-paged/docs/{TENSORCORE_GDN_SCOPE.md,TENSORCORE_GDN_BUILD_PLAN.md,VLLM_PARITY_LEVER_MAP.md,PREFILL_GEMM_SCOPE.md,PREFILL_GEMM_RESULTS.md,DECODE_SERVING_SCOPE.md,PAGED_BITEXACT_NOTE.md,final_benchmark.csv}; patches dir .../patches/paged/ (existing 0031 chunked-GDN serial, 0033 dequant->cuBLAS rejected, 0034 native FP4-MMA, 0040/0041 S1/S3 decode-graph, 0042 fused residual+RMSNorm); methodology /home/mudler/_git/LocalAI/.claude/worktrees/feat+paged-attention/.agents/vllm-parity-methodology.md.
|
||||
|
||||
25
backend/cpp/llama-cpp-localai-paged/docs/final_benchmark.csv
Normal file
25
backend/cpp/llama-cpp-localai-paged/docs/final_benchmark.csv
Normal file
@@ -0,0 +1,25 @@
|
||||
model,engine,npl,decode_agg_tps,prefill_tps
|
||||
q36-27b-nvfp4,llama-stock,8,68.3,937.7
|
||||
q36-27b-nvfp4,llama-stock,32,119.9,885.2
|
||||
q36-27b-nvfp4,llama-stock,64,142.8,885.1
|
||||
q36-27b-nvfp4,llama-stock,128,155.1,887.2
|
||||
q36-27b-nvfp4,llama-patched,8,85.3,915.1
|
||||
q36-27b-nvfp4,llama-patched,32,211.9,919.0
|
||||
q36-27b-nvfp4,llama-patched,64,305.2,923.5
|
||||
q36-27b-nvfp4,llama-patched,128,382.1,922.9
|
||||
q36-27b-nvfp4,vllm,8,70.4,2096.2
|
||||
q36-27b-nvfp4,vllm,32,211.8,2182.6
|
||||
q36-27b-nvfp4,vllm,64,309.1,2088.9
|
||||
q36-27b-nvfp4,vllm,128,418.8,1929.1
|
||||
q36-35b-a3b-nvfp4,llama-stock,8,186.7,1501.5
|
||||
q36-35b-a3b-nvfp4,llama-stock,32,267.4,1856.8
|
||||
q36-35b-a3b-nvfp4,llama-stock,64,320.5,1949.5
|
||||
q36-35b-a3b-nvfp4,llama-stock,128,347.2,1995.4
|
||||
q36-35b-a3b-nvfp4,llama-patched,8,230.3,1510.3
|
||||
q36-35b-a3b-nvfp4,llama-patched,32,466.4,1969.2
|
||||
q36-35b-a3b-nvfp4,llama-patched,64,622.4,2122.8
|
||||
q36-35b-a3b-nvfp4,llama-patched,128,784.3,2177.0
|
||||
q36-35b-a3b-nvfp4,vllm,8,256.5,5186.5
|
||||
q36-35b-a3b-nvfp4,vllm,32,500.8,6223.4
|
||||
q36-35b-a3b-nvfp4,vllm,64,686.1,5926.5
|
||||
q36-35b-a3b-nvfp4,vllm,128,882.2,5300.5
|
||||
|
217
backend/cpp/llama-cpp-localai-paged/docs/paged-burst-bench.cpp
Normal file
217
backend/cpp/llama-cpp-localai-paged/docs/paged-burst-bench.cpp
Normal file
@@ -0,0 +1,217 @@
|
||||
// Paged-pool burst-degradation repro (patch 0024). DEV SCAFFOLDING ONLY.
|
||||
//
|
||||
// Reproduces, at the libllama level, the two host-side defects behind the
|
||||
// "later lower-npl prefill collapses, decode fine, restart cures it" benchmark
|
||||
// signature:
|
||||
//
|
||||
// * RECLAMATION GAP (Fix-1): a partial tail seq_rm(seq, p0>0, -1) - exactly
|
||||
// what llama-server issues on every reused slot - frees the kv-cache CELLS
|
||||
// but the paged manager keeps owning the trailing BLOCKS. The manager's
|
||||
// free pool silently shrinks. Test A measures the reclaimed-block delta.
|
||||
//
|
||||
// * FRAGMENTATION / NO COMPACTION (Fix-2): a high-fan-out burst that allocates
|
||||
// many sequences and frees them in a scrambled order leaves the free queue a
|
||||
// scrambled permutation of physical block ids. A later low-npl prefill then
|
||||
// pops physically scattered blocks, so its KV scatter-write + in-kernel
|
||||
// paged-attention gather lose locality and prefill throughput collapses;
|
||||
// decode (single-token append) barely notices. Test B times an npl8 prefill
|
||||
// on a FRESH pool vs an npl8 prefill AFTER a scrambling burst+drain.
|
||||
//
|
||||
// PASS (post-fix): Test A reclaims ceil(PP/bs)-ceil(KEEP/bs) trailing blocks on the
|
||||
// partial seq_rm (0 pre-fix); Test B's post-burst npl8 prefill_tps is within ~10%
|
||||
// of the fresh npl8 and num_free returns to the pristine value after the drain.
|
||||
//
|
||||
// Run with LLAMA_KV_PAGED=1. Env: BURST_NSLOT(64) NPL(8) PP(512) KEEP(256)
|
||||
// GEN(4) PAGED_NGL(99). All sequences use distinct content so nothing is shared.
|
||||
|
||||
#include "llama.h"
|
||||
#include "paged-prefix-api.h"
|
||||
|
||||
#include <chrono>
|
||||
#include <clocale>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <vector>
|
||||
|
||||
// Reads integer environment variable `k`.
// Returns `dflt` when the variable is unset, empty, not a base-10 integer
// (trailing whitespace is tolerated, any other trailing text is not), or does
// not fit in an int. The previous atoi() version silently turned such values
// into 0, which made e.g. `PP=` or `NPL=abc` run the bench with a zero size.
static int env_i(const char * k, int dflt) {
    const char * v = getenv(k);
    if (!v) {
        return dflt;
    }

    char * end = nullptr;
    const long long parsed = strtoll(v, &end, 10);
    if (end == v) {
        return dflt; // no digits consumed (empty or non-numeric value)
    }
    while (*end == ' ' || *end == '\t' || *end == '\n' || *end == '\r') {
        ++end;
    }
    if (*end != '\0') {
        return dflt; // trailing junk such as "12x"
    }
    // Round-trip through int to reject values outside the int range; this also
    // catches strtoll saturating at LLONG_MAX / LLONG_MIN on overflow.
    if ((long long) (int) parsed != parsed) {
        return dflt;
    }
    return (int) parsed;
}
|
||||
|
||||
// Monotonic clock used for all wall-time measurements in this bench.
using clk = std::chrono::steady_clock;

// Elapsed time from `a` to `b` in (fractional) seconds; negative if b precedes a.
static double secs(clk::time_point a, clk::time_point b) {
    using fsec = std::chrono::duration<double>;
    const fsec elapsed = std::chrono::duration_cast<fsec>(b - a);
    return elapsed.count();
}
|
||||
|
||||
// Bundle of the handles every helper in this bench needs. main() fills in all
// four members before the first helper call.
struct Ctx {
    llama_context * ctx;     // context passed to llama_decode / llama_synchronize
    llama_memory_t  mem;     // memory handle passed to llama_memory_seq_rm / llama_memory_clear
    llama_batch     batch;   // reusable batch, rebuilt by prefill() and decode1()
    int             n_vocab; // vocabulary size, forwarded to tok_of()
};
|
||||
|
||||
// Deterministic, content-distinct token for (seq, pos): keeps every sequence's
|
||||
// blocks unique so no cross-request prefix sharing masks the accounting.
|
||||
static llama_token tok_of(int seq, int pos, int n_vocab) {
|
||||
return (llama_token) (((seq * 1000003 + pos * 131 + 7) % (n_vocab - 200)) + 100);
|
||||
}
|
||||
|
||||
// Prefill n tokens of seq at [pos0, pos0+n) in one ubatch (n <= n_batch).
|
||||
// Returns wall seconds (sync'd).
|
||||
static double prefill(Ctx & C, int seq, int pos0, int n) {
|
||||
clk::time_point t0 = clk::now();
|
||||
C.batch.n_tokens = 0;
|
||||
for (int j = 0; j < n; ++j) {
|
||||
int i = C.batch.n_tokens;
|
||||
C.batch.token[i] = tok_of(seq, pos0 + j, C.n_vocab);
|
||||
C.batch.pos[i] = pos0 + j;
|
||||
C.batch.n_seq_id[i] = 1;
|
||||
C.batch.seq_id[i][0]= seq;
|
||||
C.batch.logits[i] = (j + 1 == n) ? 1 : 0;
|
||||
C.batch.n_tokens++;
|
||||
}
|
||||
if (llama_decode(C.ctx, C.batch)) { fprintf(stderr, "prefill decode failed seq=%d\n", seq); return -1; }
|
||||
llama_synchronize(C.ctx);
|
||||
return secs(t0, clk::now());
|
||||
}
|
||||
|
||||
// One decode step (single token) for seq at pos.
|
||||
static void decode1(Ctx & C, int seq, int pos) {
|
||||
C.batch.n_tokens = 1;
|
||||
C.batch.token[0] = tok_of(seq, pos, C.n_vocab);
|
||||
C.batch.pos[0] = pos; C.batch.n_seq_id[0] = 1; C.batch.seq_id[0][0] = seq; C.batch.logits[0] = 1;
|
||||
if (llama_decode(C.ctx, C.batch)) fprintf(stderr, "decode1 failed seq=%d\n", seq);
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
std::setlocale(LC_NUMERIC, "C");
|
||||
const char * model_path = nullptr;
|
||||
for (int i = 1; i < argc; ++i) if (!strcmp(argv[i], "-m") && i + 1 < argc) model_path = argv[++i];
|
||||
if (!model_path) { fprintf(stderr, "usage: %s -m model.gguf\n", argv[0]); return 2; }
|
||||
|
||||
const int NSLOT = env_i("BURST_NSLOT", 64);
|
||||
const int NPL = env_i("NPL", 8);
|
||||
const int PP = env_i("PP", 512);
|
||||
const int KEEP = env_i("KEEP", 256);
|
||||
const int GEN = env_i("GEN", 4);
|
||||
const int ngl = env_i("PAGED_NGL", 99);
|
||||
const bool paged = getenv("LLAMA_KV_PAGED") != nullptr;
|
||||
|
||||
ggml_backend_load_all();
|
||||
llama_model_params mp = llama_model_default_params();
|
||||
mp.n_gpu_layers = ngl;
|
||||
llama_model * model = llama_model_load_from_file(model_path, mp);
|
||||
if (!model) { fprintf(stderr, "model load failed\n"); return 1; }
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
const int n_vocab = llama_vocab_n_tokens(vocab);
|
||||
|
||||
// Pool sized for the burst plus headroom so the burst fits but a later npl
|
||||
// run draws from whatever the burst's churn left behind.
|
||||
const long cells = (long) (NSLOT + NPL + 4) * (PP + GEN + 16);
|
||||
llama_context_params cp = llama_context_default_params();
|
||||
cp.n_ctx = (uint32_t) cells;
|
||||
cp.n_batch = (uint32_t) (PP + 16);
|
||||
cp.n_ubatch = (uint32_t) (PP + 16);
|
||||
cp.n_seq_max = NSLOT + NPL + 2;
|
||||
cp.kv_unified = true; // one unified stream-0 pool -> num_free(ctx) is the whole pool
|
||||
cp.no_perf = true;
|
||||
llama_context * ctx = llama_init_from_model(model, cp);
|
||||
if (!ctx) { fprintf(stderr, "ctx init failed (cells=%ld)\n", cells); return 1; }
|
||||
|
||||
Ctx C; C.ctx = ctx; C.mem = llama_get_memory(ctx); C.n_vocab = n_vocab;
|
||||
C.batch = llama_batch_init(cp.n_batch, 0, 1);
|
||||
|
||||
printf("== paged-burst-bench == paged=%d NSLOT=%d NPL=%d PP=%d KEEP=%d GEN=%d n_ctx=%ld\n",
|
||||
paged, NSLOT, NPL, PP, KEEP, GEN, cells);
|
||||
|
||||
llama_memory_clear(C.mem, true);
|
||||
const long F_start = paged_prefix_api::num_free_global();
|
||||
|
||||
// ---- Test A: Fix-1 reclamation gap on a partial tail seq_rm --------------
|
||||
{
|
||||
prefill(C, 0, 0, PP);
|
||||
const long f_after_prefill = paged_prefix_api::num_free_global();
|
||||
llama_memory_seq_rm(C.mem, 0, KEEP, -1); // partial tail removal
|
||||
const long f_after_rm = paged_prefix_api::num_free_global();
|
||||
llama_memory_seq_rm(C.mem, 0, -1, -1); // full free -> pristine
|
||||
const long f_after_full = paged_prefix_api::num_free_global();
|
||||
const long bs = 16;
|
||||
const long expect = (PP + bs - 1)/bs - (KEEP + bs - 1)/bs; // trailing blocks
|
||||
printf("[TEST-A Fix-1] start=%ld afterPrefill=%ld afterPartialRm=%ld reclaimed=%ld "
|
||||
"(expect %ld post-fix, 0 pre-fix) afterFullFree=%ld\n",
|
||||
F_start, f_after_prefill, f_after_rm, f_after_rm - f_after_prefill, expect, f_after_full);
|
||||
}
|
||||
|
||||
// ---- Test B: fragmentation -> npl prefill collapse -----------------------
|
||||
// Fresh npl prefill baseline on a pristine pool.
|
||||
llama_memory_clear(C.mem, true);
|
||||
double tps_fresh;
|
||||
{
|
||||
clk::time_point t0 = clk::now();
|
||||
long ntok = 0;
|
||||
for (int s = 0; s < NPL; ++s) { double d = prefill(C, s, 0, PP); if (d < 0) return 1; ntok += PP; }
|
||||
tps_fresh = ntok / secs(t0, clk::now());
|
||||
for (int s = 0; s < NPL; ++s) llama_memory_seq_rm(C.mem, s, -1, -1);
|
||||
}
|
||||
const long F_pristine = paged_prefix_api::num_free_global();
|
||||
|
||||
// High-fan-out burst: allocate NSLOT sequences, each prefilled + a few decode
|
||||
// steps (mixed alloc), then drain them in a scrambled order (odd ids first,
|
||||
// then even, each truncated before the full free) so the free queue becomes a
|
||||
// scrambled permutation - the fragmentation the bug never compacts.
|
||||
for (int s = 0; s < NSLOT; ++s) {
|
||||
if (prefill(C, NPL + s, 0, PP) < 0) return 1;
|
||||
for (int g = 0; g < GEN; ++g) decode1(C, NPL + s, PP + g);
|
||||
}
|
||||
const long F_during_burst = paged_prefix_api::num_free_global();
|
||||
// Drain: partial tail seq_rm (the reused-slot pattern) then full free, in a
|
||||
// scrambled slot order to scramble the physical free order.
|
||||
for (int parity = 1; parity >= 0; --parity)
|
||||
for (int s = 0; s < NSLOT; ++s) if ((s & 1) == parity) {
|
||||
llama_memory_seq_rm(C.mem, NPL + s, KEEP, -1); // partial (Fix-1 path)
|
||||
llama_memory_seq_rm(C.mem, NPL + s, -1, -1); // full free
|
||||
}
|
||||
const long F_after_drain = paged_prefix_api::num_free_global();
|
||||
|
||||
// Post-burst npl prefill: pops from the (pre-fix scrambled / post-fix
|
||||
// defragged) free queue.
|
||||
double tps_post;
|
||||
{
|
||||
clk::time_point t0 = clk::now();
|
||||
long ntok = 0;
|
||||
for (int s = 0; s < NPL; ++s) { double d = prefill(C, s, 0, PP); if (d < 0) return 1; ntok += PP; }
|
||||
tps_post = ntok / secs(t0, clk::now());
|
||||
for (int s = 0; s < NPL; ++s) llama_memory_seq_rm(C.mem, s, -1, -1);
|
||||
}
|
||||
|
||||
const double ratio = tps_fresh > 0 ? tps_post / tps_fresh : 0;
|
||||
printf("[TEST-B frag] num_free: start=%ld pristine=%ld duringBurst=%ld afterDrain=%ld "
|
||||
"(afterDrain==pristine? %s)\n",
|
||||
F_start, F_pristine, F_during_burst, F_after_drain,
|
||||
F_after_drain == F_pristine ? "YES" : "NO");
|
||||
printf("[TEST-B frag] prefill_tps fresh=%.1f post-burst=%.1f ratio=%.3f "
|
||||
"(PASS if >=0.90)\n", tps_fresh, tps_post, ratio);
|
||||
|
||||
// ---- Test C: idle-slot retention leak -> reclaim (the Fix-3 scenario) -----
|
||||
// Burst NSLOT sequences and leave them IDLE (stock llama-server keeps an idle
|
||||
// slot's KV; the blocks are stranded). F_idle shows the depleted pool a later
|
||||
// low-npl run would see. Then full-seq_rm each (exactly what Fix-3's
|
||||
// prompt_clear() issues at slot.release): F_reclaimed must return to pristine.
|
||||
llama_memory_clear(C.mem, true);
|
||||
// Touch the pool once so the manager exists, then read the full-pool size
|
||||
// (num_free is 0 while no manager is registered).
|
||||
if (prefill(C, 0, 0, 16) < 0) return 1;
|
||||
llama_memory_seq_rm(C.mem, 0, -1, -1);
|
||||
const long F_pre_c = paged_prefix_api::num_free_global();
|
||||
for (int s = 0; s < NSLOT; ++s) { if (prefill(C, NPL + s, 0, PP) < 0) return 1; }
|
||||
const long F_idle = paged_prefix_api::num_free_global();
|
||||
for (int s = 0; s < NSLOT; ++s) llama_memory_seq_rm(C.mem, NPL + s, -1, -1); // Fix-3 release
|
||||
const long F_reclaimed = paged_prefix_api::num_free_global();
|
||||
printf("[TEST-C idle] pristine=%ld idle_after_burst=%ld (leaked=%ld) reclaimed=%ld "
|
||||
"(returns_to_fresh? %s)\n",
|
||||
F_pre_c, F_idle, F_pre_c - F_idle, F_reclaimed,
|
||||
F_reclaimed == F_pre_c ? "YES" : "NO");
|
||||
|
||||
printf("RESULT paged=%d frag_fix2_ratio=%.3f drain_numfree_returns=%s idle_reclaim_returns=%s\n",
|
||||
paged, ratio,
|
||||
F_after_drain == F_pristine ? "YES" : "NO",
|
||||
F_reclaimed == F_pre_c ? "YES" : "NO");
|
||||
|
||||
llama_batch_free(C.batch);
|
||||
llama_free(ctx);
|
||||
llama_model_free(model);
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,59 @@
|
||||
// Host-side unit test for the paged-pool burst-reclaim fix (patch 0024).
|
||||
// Compiles paged-kv-manager.cpp directly; no ggml / llama / GPU dependency.
|
||||
//
|
||||
// Fix-1 PagedKVManager::truncate(seq, n_keep) reclaims the trailing blocks
|
||||
// beyond ceil(n_keep/bs) (ref-counted), so a partial tail seq_rm no
|
||||
// longer strands blocks whose cells were cleared.
|
||||
// Fix-2 defrag_free_pool() relinks the free queue into ascending block-id
|
||||
// order once the pool is fully idle, undoing a burst's scrambled frees
|
||||
// so a later prefill pops physically contiguous blocks again.
|
||||
|
||||
#include "paged-kv-manager.h"
|
||||
#include <cstdio>
|
||||
|
||||
using paged::PagedKVManager;
|
||||
|
||||
int main() {
|
||||
int rc = 0;
|
||||
|
||||
// ---- Fix-1: truncate reclaims the trailing block suffix -----------------
|
||||
{
|
||||
PagedKVManager m(/*num_blocks=*/64, /*block_size=*/16, /*caching=*/true);
|
||||
const size_t f0 = m.num_free_blocks(); // 63 (block 0 reserved as null)
|
||||
m.allocate(0, 512); // ceil(512/16)=32 blocks
|
||||
const size_t f1 = m.num_free_blocks(); // 31
|
||||
m.truncate(0, 256); // keep ceil(256/16)=16, free 16
|
||||
const size_t f2 = m.num_free_blocks(); // 47
|
||||
printf("[unit Fix-1] free=%zu alloc512=%zu truncate256=%zu reclaimed=%zu (expect 16)\n",
|
||||
f0, f1, f2, f2 - f1);
|
||||
if (f2 - f1 != 16) rc = 1;
|
||||
m.truncate(0, 16); // keep 1 block, free 15 more
|
||||
const size_t f3 = m.num_free_blocks(); // 62
|
||||
printf("[unit Fix-1] truncate16=%zu (expect %zu)\n", f3, f0 - 1);
|
||||
if (f3 != f0 - 1) rc = 1;
|
||||
m.free(0);
|
||||
if (m.num_free_blocks() != f0) { printf("[unit Fix-1] free mismatch\n"); rc = 1; }
|
||||
}
|
||||
|
||||
// ---- Fix-2: defrag restores ascending popleft order ---------------------
|
||||
{
|
||||
PagedKVManager m(/*num_blocks=*/64, /*block_size=*/16, /*caching=*/false);
|
||||
for (int s = 0; s < 8; ++s) m.allocate(s, 16); // pop blocks 1..8
|
||||
const int scrambled[8] = {3, 7, 1, 5, 0, 6, 2, 4}; // free out of order
|
||||
for (int i = 0; i < 8; ++i) m.free(scrambled[i]);
|
||||
m.defrag_free_pool(); // all idle -> compact
|
||||
m.allocate(100, 16 * 3); // pop 3 blocks
|
||||
const auto bt = m.block_table(100);
|
||||
bool asc = true;
|
||||
printf("[unit Fix-2] post-defrag block_table:");
|
||||
for (size_t i = 0; i < bt.size(); ++i) {
|
||||
printf(" %d", bt[i]);
|
||||
if (i && bt[i] < bt[i - 1]) asc = false;
|
||||
}
|
||||
printf(" ascending=%s (expect YES)\n", asc ? "YES" : "NO");
|
||||
if (!asc) rc = 1;
|
||||
}
|
||||
|
||||
printf("UNIT %s\n", rc == 0 ? "PASS" : "FAIL");
|
||||
return rc;
|
||||
}
|
||||
Binary file not shown.
|
After Width: | Height: | Size: 217 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 123 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 125 KiB |
66
backend/cpp/llama-cpp-localai-paged/package.sh
Executable file
66
backend/cpp/llama-cpp-localai-paged/package.sh
Executable file
@@ -0,0 +1,66 @@
|
||||
#!/bin/bash

# Script to copy the appropriate libraries based on architecture.
# This script is used in the final stage of the Dockerfile.
#
# Every path expansion is quoted so the script keeps working when the checkout
# lives under a directory whose name contains whitespace or glob characters.

set -e

CURDIR=$(dirname "$(realpath "$0")")
REPO_ROOT="${CURDIR}/../../.."

# Create lib directory
mkdir -p "$CURDIR/package/lib"

# The glob stays outside the quotes so it still expands to every built variant.
cp -avrf "$CURDIR"/llama-cpp-localai-paged-* "$CURDIR/package/"
cp -rfv "$CURDIR/run.sh" "$CURDIR/package/"

# Bundle the ggml shared backends from the CPU_ALL_VARIANTS build into package/lib. ggml
# discovers the per-microarch libggml-cpu-*.so by scanning the executable directory, which
# (via the bundled lib/ld.so that run.sh launches through) resolves to lib/. See the
# matching comment in backend/cpp/llama-cpp/package.sh. No-op on the fallback/ROCm builds.
if [ -d "$CURDIR/ggml-shared-libs" ]; then
    echo "Bundling ggml shared backends (CPU_ALL_VARIANTS)..."
    cp -avf "$CURDIR"/ggml-shared-libs/*.so* "$CURDIR/package/lib/"
fi

# Detect architecture: pick the dynamic loader and the multiarch library directory.
if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
    # x86_64 architecture
    echo "Detected x86_64 architecture, copying x86_64 libraries..."
    ld_so="/lib64/ld-linux-x86-64.so.2"
    sys_libdir="/lib/x86_64-linux-gnu"
elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
    # ARM64 architecture
    echo "Detected ARM64 architecture, copying ARM64 libraries..."
    ld_so="/lib/ld-linux-aarch64.so.1"
    sys_libdir="/lib/aarch64-linux-gnu"
else
    echo "Error: Could not detect architecture"
    exit 1
fi

# Copy the loader (renamed to ld.so) and the runtime libraries, dereferencing
# symlinks (-L) so the package holds real files. The list is identical for both
# supported architectures; only the source directory differs.
cp -arfLv "$ld_so" "$CURDIR/package/lib/ld.so"
for lib in libc.so.6 libgcc_s.so.1 libstdc++.so.6 libm.so.6 libgomp.so.1 \
           libdl.so.2 librt.so.1 libpthread.so.0; do
    cp -arfLv "$sys_libdir/$lib" "$CURDIR/package/lib/$lib"
done

# Package GPU libraries based on BUILD_TYPE
GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
if [ -f "$GPU_LIB_SCRIPT" ]; then
    echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
    source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib"
    package_gpu_libs
fi

echo "Packaging completed successfully"
ls -liah "$CURDIR/package/"
ls -liah "$CURDIR/package/lib/"
|
||||
@@ -0,0 +1,448 @@
|
||||
From bef64835d444a44ed8391bc395cdab38164229d5 Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Fri, 19 Jun 2026 22:54:49 +0000
|
||||
Subject: [PATCH] vendor paged kv manager
|
||||
|
||||
vLLM-parity host-side KV block manager (FreeBlockQueue, BlockPool,
|
||||
PagedKVManager, chained-hash prefix cache). Pure C++17, no behavior change -
|
||||
nothing uses it yet; wired in by later patches in the series.
|
||||
---
|
||||
src/CMakeLists.txt | 1 +
|
||||
src/paged-kv-manager.cpp | 296 +++++++++++++++++++++++++++++++++++++++
|
||||
src/paged-kv-manager.h | 108 ++++++++++++++
|
||||
3 files changed, 405 insertions(+)
|
||||
create mode 100644 src/paged-kv-manager.cpp
|
||||
create mode 100644 src/paged-kv-manager.h
|
||||
|
||||
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
|
||||
index d15ccfd99..a030940b8 100644
|
||||
--- a/src/CMakeLists.txt
|
||||
+++ b/src/CMakeLists.txt
|
||||
@@ -24,6 +24,7 @@ add_library(llama
|
||||
llama-io.cpp
|
||||
llama-kv-cache.cpp
|
||||
llama-kv-cache-iswa.cpp
|
||||
+ paged-kv-manager.cpp
|
||||
llama-kv-cache-dsa.cpp
|
||||
llama-memory.cpp
|
||||
llama-memory-hybrid.cpp
|
||||
diff --git a/src/paged-kv-manager.cpp b/src/paged-kv-manager.cpp
|
||||
new file mode 100644
|
||||
index 000000000..ca0dcd83a
|
||||
--- /dev/null
|
||||
+++ b/src/paged-kv-manager.cpp
|
||||
@@ -0,0 +1,296 @@
|
||||
+#include "paged-kv-manager.h"
|
||||
+#include <cassert>
|
||||
+#include <stdexcept>
|
||||
+
|
||||
+namespace paged {
|
||||
+
|
||||
+// ---------------------------------------------------------------------------
|
||||
+// FreeBlockQueue (port of kv_cache_utils.py FreeKVCacheBlockQueue)
|
||||
+// ---------------------------------------------------------------------------
|
||||
+
|
||||
+// Build the free list from `blocks`, keeping their order:
+//   fake_head <-> blocks[0] <-> ... <-> blocks[n-1] <-> fake_tail.
+// An empty input links the two sentinels directly to each other.
+FreeBlockQueue::FreeBlockQueue(const std::vector<KVCacheBlock*>& blocks) {
+    num_free_blocks = blocks.size();
+
+    // Thread every block behind a moving cursor that starts at the head sentinel.
+    KVCacheBlock* tail = &fake_head;
+    for (KVCacheBlock* blk : blocks) {
+        tail->next_free = blk;
+        blk->prev_free  = tail;
+        tail = blk;
+    }
+
+    // Close the list on the tail sentinel (for an empty input `tail` is still fake_head).
+    tail->next_free     = &fake_tail;
+    fake_tail.prev_free = tail;
+}
|
||||
+
|
||||
+// Detach and return the block at the front of the free list.
+// Throws std::runtime_error when the list holds no blocks.
+KVCacheBlock* FreeBlockQueue::popleft() {
+    KVCacheBlock* const head = fake_head.next_free;
+    const bool is_empty = (head == &fake_tail) || (head == nullptr);
+    if (is_empty) {
+        assert(num_free_blocks == 0);
+        throw std::runtime_error("No free blocks available");
+    }
+
+    // Splice the head sentinel onto whatever followed the popped block.
+    KVCacheBlock* const successor = head->next_free;
+    fake_head.next_free  = successor;
+    successor->prev_free = &fake_head;
+
+    // A block outside the list carries no links.
+    head->prev_free = nullptr;
+    head->next_free = nullptr;
+    --num_free_blocks;
+    return head;
+}
|
||||
+
|
||||
+// Detach the first `n` blocks from the front of the free list and return them in
+// list order. Returns an empty vector for n == 0.
+// Throws std::runtime_error when fewer than `n` blocks are free. The check is a
+// real runtime test (not only an assert) so release builds cannot underflow
+// num_free_blocks or walk through the tail sentinel.
+std::vector<KVCacheBlock*> FreeBlockQueue::popleft_n(size_t n) {
+    std::vector<KVCacheBlock*> ret;
+    if (n == 0) return ret;
+    if (n > num_free_blocks)
+        throw std::runtime_error("No free blocks available");
+    num_free_blocks -= n;
+    KVCacheBlock* curr = fake_head.next_free;
+    ret.reserve(n);
+    for (size_t i = 0; i < n; ++i) {
+        assert(curr != nullptr);
+        ret.push_back(curr);
+        KVCacheBlock* last = curr;
+        curr = curr->next_free;
+        // A popped block carries no links.
+        last->prev_free = last->next_free = nullptr;
+    }
+    // `curr` is now the first surviving node (possibly the tail sentinel).
+    if (curr != nullptr) {
+        fake_head.next_free = curr;
+        curr->prev_free = &fake_head;
+    }
+    return ret;
+}
|
||||
+
|
||||
+// Unlink `block` from anywhere in the free list.
+// Throws std::runtime_error when the block has a missing neighbour link.
+void FreeBlockQueue::remove(KVCacheBlock* block) {
+    KVCacheBlock* const before = block->prev_free;
+    KVCacheBlock* const after  = block->next_free;
+    if (before == nullptr || after == nullptr) {
+        throw std::runtime_error("remove() called on an invalid block");
+    }
+
+    // Bridge the neighbours over the removed node.
+    before->next_free = after;
+    after->prev_free  = before;
+
+    block->prev_free = nullptr;
+    block->next_free = nullptr;
+    --num_free_blocks;
+}
|
||||
+
|
||||
+// Insert `block` at the back of the free list, just before the tail sentinel.
+void FreeBlockQueue::append(KVCacheBlock* block) {
+    KVCacheBlock* const old_last = fake_tail.prev_free;
+
+    old_last->next_free = block;
+    block->prev_free    = old_last;
+    block->next_free    = &fake_tail;
+    fake_tail.prev_free = block;
+
+    ++num_free_blocks;
+}
|
||||
+
|
||||
+// Insert `blocks`, in order, at the back of the free list. No-op for an empty vector.
+void FreeBlockQueue::append_n(const std::vector<KVCacheBlock*>& blocks) {
+    if (blocks.empty()) {
+        return;
+    }
+
+    // Chain each block after the current last node.
+    KVCacheBlock* tail = fake_tail.prev_free;
+    for (size_t i = 0; i < blocks.size(); ++i) {
+        KVCacheBlock* const blk = blocks[i];
+        blk->prev_free  = tail;
+        tail->next_free = blk;
+        tail = blk;
+    }
+
+    // Re-attach the tail sentinel behind the last appended block.
+    tail->next_free     = &fake_tail;
+    fake_tail.prev_free = tail;
+    num_free_blocks += blocks.size();
+}
|
||||
+
|
||||
+// Insert `blocks`, in order, at the front of the free list so blocks[0] becomes the
+// next block popleft() returns. No-op for an empty vector.
+void FreeBlockQueue::prepend_n(const std::vector<KVCacheBlock*>& blocks) {
+    if (blocks.empty()) {
+        return;
+    }
+
+    // Remember the current first node; it ends up behind the new blocks.
+    KVCacheBlock* const old_first = fake_head.next_free;
+
+    KVCacheBlock* cursor = &fake_head;
+    for (size_t i = 0; i < blocks.size(); ++i) {
+        KVCacheBlock* const blk = blocks[i];
+        blk->prev_free    = cursor;
+        cursor->next_free = blk;
+        cursor = blk;
+    }
+
+    cursor->next_free    = old_first;
+    old_first->prev_free = cursor;
+    num_free_blocks += blocks.size();
+}
|
||||
+
|
||||
+// Snapshot of the free list, front to back. The walk stops at the first node
+// without a successor (the tail sentinel), so sentinels are never returned.
+std::vector<KVCacheBlock*> FreeBlockQueue::get_all_free_blocks() const {
+    std::vector<KVCacheBlock*> snapshot;
+    for (const KVCacheBlock* node = fake_head.next_free;
+         node != nullptr && node->next_free != nullptr;
+         node = node->next_free) {
+        snapshot.push_back(const_cast<KVCacheBlock*>(node));
+    }
+    return snapshot;
+}
|
||||
+
|
||||
+// ---------------------------------------------------------------------------
|
||||
+// BlockPool (port of block_pool.py)
|
||||
+// ---------------------------------------------------------------------------
|
||||
+
|
||||
+// Return the address of every element of `v`, in element order. The pointers
+// stay valid only as long as `v` is not reallocated.
+static std::vector<KVCacheBlock*> make_ptrs(std::vector<KVCacheBlock>& v) {
+    std::vector<KVCacheBlock*> addrs;
+    addrs.reserve(v.size());
+    for (size_t i = 0; i < v.size(); ++i) {
+        addrs.push_back(&v[i]);
+    }
+    return addrs;
+}
|
||||
+
|
||||
+// Create `num_blocks` block descriptors whose block_id equals their index.
+static std::vector<KVCacheBlock> make_block_vec(int32_t num_blocks) {
+    std::vector<KVCacheBlock> blocks;
+    blocks.reserve(num_blocks);
+    int32_t id = 0;
+    while (id < num_blocks) {
+        blocks.emplace_back(id);
+        ++id;
+    }
+    return blocks;
+}
|
||||
+
|
||||
+// Create the descriptors, put all of them on the free list, then take the first
+// one off again and mark it as the null block.
+// The initialisers depend on each other (ptrs_ reads blocks_, free_queue_ reads
+// ptrs_), so they rely on the members being declared in this order.
+BlockPool::BlockPool(int32_t num_blocks, bool enable_caching)
+    : enable_caching_(enable_caching)
+    , blocks_(make_block_vec(num_blocks))
+    , ptrs_(make_ptrs(blocks_))
+    , free_queue_(ptrs_) {
+    // vLLM reserves block_id 0 as the null block (never cached).
+    KVCacheBlock* const reserved = free_queue_.popleft();
+    reserved->is_null = true;
+    null_block = reserved;
+}
|
||||
+
|
||||
+// Drop `block`'s prefix-cache identity before it is handed out for new content.
+//
+// The hash map keeps only the last writer per hash, so a block can still carry
+// has_hash == true while the map entry for that hash points at a different block.
+// Such a block must lose its stale hash as well: otherwise cache_full_blocks()
+// would skip it forever (it tests has_hash) and free_blocks() would keep queueing
+// it with the cached blocks although no lookup can reach it.
+//
+// Returns true only when a map entry owned by `block` was actually erased.
+bool BlockPool::maybe_evict_cached_block(KVCacheBlock* block) {
+    if (!block->has_hash) return false;
+
+    bool evicted = false;
+    auto it = cached_block_hash_to_block_.find(block->block_hash);
+    // Only erase the entry when it refers to this very block; an entry that
+    // belongs to another block with the same hash stays cached.
+    if (it != cached_block_hash_to_block_.end() && it->second == block) {
+        cached_block_hash_to_block_.erase(it);
+        evicted = true;
+    }
+    block->reset_hash();
+    return evicted;
+}
|
||||
+
|
||||
+// Take `n` blocks from the front of the free list and hand them out with
+// ref_cnt raised to 1. With caching enabled each block is passed through
+// maybe_evict_cached_block() first.
+// Throws std::runtime_error when fewer than `n` blocks are free.
+std::vector<KVCacheBlock*> BlockPool::get_new_blocks(size_t n) {
+    if (n > get_num_free_blocks()) {
+        throw std::runtime_error("Cannot get free blocks from pool");
+    }
+
+    std::vector<KVCacheBlock*> fresh = free_queue_.popleft_n(n);
+    for (size_t i = 0; i < fresh.size(); ++i) {
+        KVCacheBlock* const blk = fresh[i];
+        if (enable_caching_) {
+            maybe_evict_cached_block(blk);
+        }
+        assert(blk->ref_cnt == 0);
+        ++blk->ref_cnt;
+    }
+    return fresh;
+}
|
||||
+
|
||||
+// Look up the block registered under `block_hash`; nullptr when there is none.
+KVCacheBlock* BlockPool::get_cached_block(uint64_t block_hash) {
+    const auto hit = cached_block_hash_to_block_.find(block_hash);
+    if (hit == cached_block_hash_to_block_.end()) {
+        return nullptr;
+    }
+    return hit->second;
+}
|
||||
+
|
||||
+// Take one more reference on each block. A non-null block whose ref_cnt is 0 is
+// treated as sitting in the free list (an eviction candidate) and is unlinked
+// from it before the count is raised.
+void BlockPool::touch(const std::vector<KVCacheBlock*>& blocks) {
+    for (size_t i = 0; i < blocks.size(); ++i) {
+        KVCacheBlock* const blk = blocks[i];
+        const bool eviction_candidate = (blk->ref_cnt == 0) && !blk->is_null;
+        if (eviction_candidate) {
+            free_queue_.remove(blk);
+        }
+        ++blk->ref_cnt;
+    }
+}
|
||||
+
|
||||
+// Release one reference on every block in `ordered_blocks` (the null block is
+// ignored). Blocks whose count reaches 0 return to the free list: blocks without
+// a hash go to the front, hashed blocks go to the back.
+void BlockPool::free_blocks(const std::vector<KVCacheBlock*>& ordered_blocks) {
+    std::vector<KVCacheBlock*> plain;
+    std::vector<KVCacheBlock*> hashed;
+
+    for (KVCacheBlock* blk : ordered_blocks) {
+        if (blk->is_null) {
+            continue;
+        }
+        --blk->ref_cnt;
+        if (blk->ref_cnt != 0) {
+            continue;  // still referenced elsewhere
+        }
+        if (blk->has_hash) {
+            hashed.push_back(blk);
+        } else {
+            plain.push_back(blk);
+        }
+    }
+
+    free_queue_.prepend_n(plain);   // un-hashed: evicted first (front)
+    free_queue_.append_n(hashed);   // hashed: kept warm (tail)
+}
|
||||
+
|
||||
+// Register the blocks req_blocks[num_cached_blocks .. num_full_blocks) in the
+// prefix cache under block_hashes[i]. Blocks that already carry a hash are left
+// untouched; a hash that is already mapped is overwritten (last writer wins).
+//
+// Does nothing when caching is disabled: get_new_blocks() only evicts map entries
+// when caching is enabled, so registering hashes without it would leave entries
+// behind that point at blocks since reused for other content.
+//
+// The upper bound is clamped to the sizes of `req_blocks` and `block_hashes`, so
+// a caller passing more "full" blocks than it has blocks or hashes cannot index
+// out of range.
+void BlockPool::cache_full_blocks(const std::vector<KVCacheBlock*>& req_blocks,
+                                  size_t num_cached_blocks, size_t num_full_blocks,
+                                  const std::vector<uint64_t>& block_hashes) {
+    if (!enable_caching_) return;
+
+    size_t end = num_full_blocks;
+    if (end > req_blocks.size())   end = req_blocks.size();
+    if (end > block_hashes.size()) end = block_hashes.size();
+
+    for (size_t i = num_cached_blocks; i < end; ++i) {
+        KVCacheBlock* blk = req_blocks[i];
+        if (blk->has_hash) continue;
+        blk->has_hash = true;
+        blk->block_hash = block_hashes[i];
+        cached_block_hash_to_block_[blk->block_hash] = blk;
+    }
+}
|
||||
+
|
||||
+// ---------------------------------------------------------------------------
|
||||
+// PagedKVManager (port of SingleTypeKVCacheManager / FullAttentionManager)
|
||||
+// ---------------------------------------------------------------------------
|
||||
+
|
||||
+// Ceiling division a / b for b > 0. Written without the usual (a + b - 1)
+// numerator so it cannot wrap around for values of `a` close to SIZE_MAX.
+static inline size_t cdiv(size_t a, size_t b) { return a / b + (a % b != 0 ? 1 : 0); }
|
||||
+
|
||||
+// Create a manager over `num_blocks` blocks of `block_size` tokens each.
+// Throws std::invalid_argument when block_size is not positive: block_size_ is
+// used as a divisor in allocate(), slot(), compute_block_hashes() and
+// cache_blocks().
+PagedKVManager::PagedKVManager(int32_t num_blocks, int block_size, bool enable_caching)
+    : block_size_(block_size), pool_(num_blocks, enable_caching) {
+    if (block_size <= 0)
+        throw std::invalid_argument("PagedKVManager: block_size must be positive");
+}
|
||||
+
|
||||
+// Grow the block list of `seq_id` until it covers `total_tokens` tokens.
+// Returns true when the sequence already has enough blocks or the extra blocks
+// were obtained; returns false, without allocating anything, when the pool has
+// fewer free blocks than required. An entry for `seq_id` is created in either case.
+bool PagedKVManager::allocate(int seq_id, size_t total_tokens) {
+    std::vector<KVCacheBlock*>& owned = req_to_blocks_[seq_id];
+
+    const size_t wanted = cdiv(total_tokens, block_size_);
+    if (owned.size() >= wanted) {
+        return true;
+    }
+
+    const size_t missing = wanted - owned.size();
+    if (pool_.get_num_free_blocks() < missing) {
+        return false;  // OOM
+    }
+
+    const std::vector<KVCacheBlock*> fresh = pool_.get_new_blocks(missing);
+    owned.insert(owned.end(), fresh.begin(), fresh.end());
+    return true;
+}
|
||||
+
|
||||
+// Block ids owned by `seq_id`, in logical order. Empty for an unknown sequence.
+std::vector<int32_t> PagedKVManager::block_table(int seq_id) const {
+    std::vector<int32_t> ids;
+    const auto entry = req_to_blocks_.find(seq_id);
+    if (entry != req_to_blocks_.end()) {
+        const std::vector<KVCacheBlock*>& owned = entry->second;
+        ids.reserve(owned.size());
+        for (size_t i = 0; i < owned.size(); ++i) {
+            ids.push_back(owned[i]->block_id);
+        }
+    }
+    return ids;
+}
|
||||
+
|
||||
+// Translate logical token position `pos` of `seq_id` into a flat slot index:
+// block_id * block_size + offset inside the block.
+// Throws std::out_of_range for an unknown sequence (from map::at), for a negative
+// position, and for a position beyond the blocks allocated to the sequence,
+// instead of indexing the block list out of bounds.
+int64_t PagedKVManager::slot(int seq_id, int pos) const {
+    const auto& req = req_to_blocks_.at(seq_id);
+    if (pos < 0)
+        throw std::out_of_range("PagedKVManager::slot: negative position");
+    const size_t logical = (size_t) (pos / block_size_);
+    if (logical >= req.size())
+        throw std::out_of_range("PagedKVManager::slot: position not allocated");
+    const int32_t phys = req[logical]->block_id;
+    return (int64_t) phys * block_size_ + (pos % block_size_);
+}
|
||||
+
|
||||
+// Apply slot() to every entry of `positions`, preserving order.
+std::vector<int64_t> PagedKVManager::slot_mapping(int seq_id, const std::vector<int>& positions) const {
+    std::vector<int64_t> slots;
+    slots.reserve(positions.size());
+    for (size_t i = 0; i < positions.size(); ++i) {
+        slots.push_back(slot(seq_id, positions[i]));
+    }
+    return slots;
+}
|
||||
+
|
||||
+// Release every block of `seq_id` and forget the sequence. Unknown ids are ignored.
+void PagedKVManager::free(int seq_id) {
+    const auto entry = req_to_blocks_.find(seq_id);
+    if (entry == req_to_blocks_.end()) {
+        return;
+    }
+
+    // Free in reverse so the tail of the block chain is evicted first (vLLM order).
+    const std::vector<KVCacheBlock*>& owned = entry->second;
+    std::vector<KVCacheBlock*> reversed;
+    reversed.reserve(owned.size());
+    for (size_t i = owned.size(); i > 0; --i) {
+        reversed.push_back(owned[i - 1]);
+    }
+
+    pool_.free_blocks(reversed);
+    req_to_blocks_.erase(entry);
+}
|
||||
+
|
||||
+// FNV-1a-style chained block hash: starts from the FNV offset basis XORed with
+// the parent hash, then for every token XORs in the token (as an unsigned 32-bit
+// value) and multiplies by the FNV prime. Folding the parent hash into the seed
+// makes each block hash depend on its whole prefix (behavioral parity with vLLM
+// hash_block_tokens chaining; vLLM uses sha256 bytes).
+// The result is never 0, because 0 reads as "no hash".
+uint64_t PagedKVManager::hash_block(uint64_t parent_hash, const std::vector<int>& token_ids) {
+    const uint64_t offset_basis = 1469598103934665603ull;
+    const uint64_t prime        = 1099511628211ull;
+
+    uint64_t acc = offset_basis ^ parent_hash;
+    for (size_t i = 0; i < token_ids.size(); ++i) {
+        acc ^= (uint64_t)(uint32_t) token_ids[i];
+        acc *= prime;
+    }
+    return acc != 0 ? acc : 0x9e3779b97f4a7c15ull;
+}
|
||||
+
|
||||
+// Chained hashes for every full block of `token_ids`; a trailing partial block
+// is ignored. Entry i is hash_block(entry i-1, tokens of block i), the chain
+// starting from parent hash 0.
+std::vector<uint64_t> PagedKVManager::compute_block_hashes(const std::vector<int>& token_ids) const {
+    std::vector<uint64_t> chain;
+    const size_t full_blocks = token_ids.size() / block_size_;
+
+    uint64_t prev = 0;  // NONE_HASH analogue
+    for (size_t b = 0; b < full_blocks; ++b) {
+        const auto first = token_ids.begin() + b * block_size_;
+        const auto last  = token_ids.begin() + (b + 1) * block_size_;
+        const std::vector<int> chunk(first, last);
+        prev = hash_block(prev, chunk);
+        chain.push_back(prev);
+    }
+    return chain;
+}
|
||||
+
|
||||
+// Find the longest cached prefix of `block_hashes` and return its length in
+// tokens. The scan stops at the first hash without a cached block.
+// Every hit is passed to BlockPool::touch(), which raises its ref_cnt; this
+// function itself does not record the hit blocks under any sequence.
+size_t PagedKVManager::get_computed_blocks(const std::vector<uint64_t>& block_hashes) {
+    std::vector<KVCacheBlock*> prefix;
+    for (size_t i = 0; i < block_hashes.size(); ++i) {
+        KVCacheBlock* const cached = pool_.get_cached_block(block_hashes[i]);
+        if (cached == nullptr) {
+            break;  // first miss ends the prefix
+        }
+        prefix.push_back(cached);
+    }
+
+    pool_.touch(prefix);  // ++ref_cnt, pull from free list
+    return prefix.size() * (size_t) block_size_;
+}
|
||||
+
|
||||
+// Register the full blocks of `seq_id` (the first num_tokens / block_size of
+// them) in the prefix cache under the matching entries of `block_hashes`.
+// An entry for `seq_id` is created when none exists yet.
+void PagedKVManager::cache_blocks(int seq_id, const std::vector<uint64_t>& block_hashes, size_t num_tokens) {
+    std::vector<KVCacheBlock*>& owned = req_to_blocks_[seq_id];
+    const size_t full_blocks = num_tokens / block_size_;
+    pool_.cache_full_blocks(owned, /*num_cached=*/0, full_blocks, block_hashes);
+}
|
||||
+
|
||||
+} // namespace paged
|
||||
diff --git a/src/paged-kv-manager.h b/src/paged-kv-manager.h
|
||||
new file mode 100644
|
||||
index 000000000..740280a7f
|
||||
--- /dev/null
|
||||
+++ b/src/paged-kv-manager.h
|
||||
@@ -0,0 +1,109 @@
|
||||
+#pragma once
|
||||
+// Paged KV cache block manager for llama.cpp (CPU-first prototype).
|
||||
+//
|
||||
+// Host-side block management is a faithful port of vLLM V1:
|
||||
+// vllm/v1/core/kv_cache_utils.py (KVCacheBlock, FreeKVCacheBlockQueue, hash_block_tokens)
|
||||
+// vllm/v1/core/block_pool.py (BlockPool: get_new_blocks/touch/free/evict/cache_full_blocks)
|
||||
+// vllm/v1/core/single_type_kv_cache_manager.py (allocate_new_blocks, find_longest_cache_hit)
|
||||
+//
|
||||
+// Parity is on behavior/algorithm (block chaining, first-miss stop, ref-counting,
|
||||
+// LRU eviction order), not on exact hash bytes. This unit has zero ggml/llama.cpp
|
||||
+// dependency so it can be unit-tested in isolation.
|
||||
+
|
||||
+#include <cstddef>
|
||||
+#include <cstdint>
|
||||
+#include <vector>
|
||||
+#include <unordered_map>
|
||||
+#include <map>
|
||||
+
|
||||
+namespace paged {
|
||||
+
|
||||
+// Descriptor of one KV-cache block (counterpart of vLLM's KVCacheBlock in
+// kv_cache_utils.py). It holds bookkeeping only, no tensor data.
+struct KVCacheBlock {
+    int32_t block_id = 0;      // index of the block inside its pool
+    int ref_cnt = 0;           // number of holders; 0 while on the free list
+    bool has_hash = false;     // vLLM: _block_hash is set only when full+cached
+    uint64_t block_hash = 0;   // meaningful only while has_hash is true
+    bool is_null = false;      // marks the pool's reserved null block
+    KVCacheBlock* prev_free = nullptr;   // free-list links; both null when
+    KVCacheBlock* next_free = nullptr;   // the block is not on the list
+
+    explicit KVCacheBlock(int32_t id = 0) : block_id(id) {}
+
+    // Forget the prefix-cache identity of this block.
+    void reset_hash() {
+        has_hash = false;
+        block_hash = 0;
+    }
+};
|
||||
+
|
||||
+// Intrusive doubly-linked free list with fake head/tail (vLLM FreeKVCacheBlockQueue).
+// O(1) middle removal is required so touch() can pull a warm cached block out of the
+// free list when a later request hits its prefix.
+//
+// The first and last listed blocks point at the fake_head / fake_tail members of
+// this very object, so a copied or moved queue would leave those blocks pointing
+// into the source object. Copying is therefore deleted (which also suppresses the
+// implicit move operations).
+class FreeBlockQueue {
+public:
+    size_t num_free_blocks = 0;   // number of blocks currently on the list
+
+    explicit FreeBlockQueue(const std::vector<KVCacheBlock*>& blocks);
+    FreeBlockQueue(const FreeBlockQueue&) = delete;
+    FreeBlockQueue& operator=(const FreeBlockQueue&) = delete;
+
+    KVCacheBlock* popleft();                                  // pop the front block; throws when empty
+    std::vector<KVCacheBlock*> popleft_n(size_t n);           // pop the first n blocks
+    void remove(KVCacheBlock* block);                         // unlink a block from anywhere in the list
+    void append(KVCacheBlock* block);                         // push one block to the back
+    void append_n(const std::vector<KVCacheBlock*>& blocks);  // push blocks to the back, in order
+    void prepend_n(const std::vector<KVCacheBlock*>& blocks); // push blocks to the front, in order
+    std::vector<KVCacheBlock*> get_all_free_blocks() const;   // front-to-back snapshot
+
+private:
+    // Sentinels; never handed out.
+    KVCacheBlock fake_head{-1};
+    KVCacheBlock fake_tail{-1};
+};
|
||||
+
|
||||
+// vLLM BlockPool (block_pool.py): owns every block descriptor, the free list and
+// the hash -> block prefix-cache map.
+//
+// ptrs_, the free list and the cache map all hold raw pointers into blocks_, so a
+// copy of the pool would keep pointing at the source object's blocks. Copying is
+// therefore deleted (which also suppresses the implicit move operations).
+class BlockPool {
+public:
+    KVCacheBlock* null_block = nullptr;   // reserved block taken off the free list at construction
+
+    BlockPool(int32_t num_blocks, bool enable_caching);
+    BlockPool(const BlockPool&) = delete;
+    BlockPool& operator=(const BlockPool&) = delete;
+
+    std::vector<KVCacheBlock*> get_new_blocks(size_t n);
+    KVCacheBlock* get_cached_block(uint64_t block_hash);
+    void touch(const std::vector<KVCacheBlock*>& blocks);
+    void free_blocks(const std::vector<KVCacheBlock*>& ordered_blocks);
+    void cache_full_blocks(const std::vector<KVCacheBlock*>& req_blocks,
+                           size_t num_cached_blocks, size_t num_full_blocks,
+                           const std::vector<uint64_t>& block_hashes);
+    size_t get_num_free_blocks() const { return free_queue_.num_free_blocks; }
+
+private:
+    bool maybe_evict_cached_block(KVCacheBlock* block);
+
+    // Declaration order matters: the constructor initialises ptrs_ from blocks_
+    // and free_queue_ from ptrs_.
+    bool enable_caching_;
+    std::vector<KVCacheBlock> blocks_;  // owns all block descriptors
+    std::vector<KVCacheBlock*> ptrs_;
+    FreeBlockQueue free_queue_;
+    // vLLM stores hash -> {block_id: block} to allow duplicate-content blocks; the
+    // prototype keeps the last writer (single KV-cache group is sufficient for the wins).
+    std::unordered_map<uint64_t, KVCacheBlock*> cached_block_hash_to_block_;
+};
|
||||
+
|
||||
+// Allocation + prefix-caching surface, ported from SingleTypeKVCacheManager /
+// FullAttentionManager. Single KV-cache group; no extra_keys / eagle / spec-decode.
+//
+// req_to_blocks_ stores raw pointers into pool_, so a copied manager would refer
+// to the source object's blocks. Copying is therefore deleted (which also
+// suppresses the implicit move operations).
+class PagedKVManager {
+public:
+    PagedKVManager(int32_t num_blocks, int block_size, bool enable_caching);
+    PagedKVManager(const PagedKVManager&) = delete;
+    PagedKVManager& operator=(const PagedKVManager&) = delete;
+
+    // Grow seq_id to cover total_tokens slots. Returns false on OOM, i.e. when the
+    // pool holds fewer free blocks than the growth requires.
+    bool allocate(int seq_id, size_t total_tokens);
+    // Block ids of seq_id in logical order (empty for an unknown sequence).
+    std::vector<int32_t> block_table(int seq_id) const;
+    // Flat slot index (block_id * block_size + offset) of one logical position.
+    int64_t slot(int seq_id, int pos) const;
+    std::vector<int64_t> slot_mapping(int seq_id, const std::vector<int>& positions) const;
+    // Release all blocks of seq_id; unknown ids are ignored.
+    void free(int seq_id);
+    int block_size() const { return block_size_; }
+
+    // Prefix caching (win 3).
+    static uint64_t hash_block(uint64_t parent_hash, const std::vector<int>& token_ids);
+    std::vector<uint64_t> compute_block_hashes(const std::vector<int>& token_ids) const;
+    size_t get_computed_blocks(const std::vector<uint64_t>& block_hashes);  // returns num cached tokens
+    void cache_blocks(int seq_id, const std::vector<uint64_t>& block_hashes, size_t num_tokens);
+
+protected:
+    int block_size_;   // tokens per block
+    BlockPool pool_;
+    std::map<int, std::vector<KVCacheBlock*>> req_to_blocks_;   // seq_id -> blocks in logical order
+};
|
||||
+
|
||||
+} // namespace paged
|
||||
--
|
||||
2.43.0
|
||||
|
||||
@@ -0,0 +1,75 @@
|
||||
From 5c9c709e6c6b07e0399b75fd4e46e752d418a9a8 Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Fri, 19 Jun 2026 23:04:17 +0000
|
||||
Subject: [PATCH] paged kv block placement (env LLAMA_KV_PAGED)
|
||||
|
||||
Place each sequence's tokens at permuted, non-contiguous fixed-size block
|
||||
positions in find_slot, proving attention is invariant to physical KV placement
|
||||
(token-identical greedy generation). Default off; single-sequence scope; falls
|
||||
back to the normal allocator. The paged-placement substrate for the gather-read.
|
||||
---
|
||||
src/llama-kv-cache.cpp | 41 +++++++++++++++++++++++++++++++++++++++++
|
||||
1 file changed, 41 insertions(+)
|
||||
|
||||
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
|
||||
index 2802103bd..999e2ae61 100644
|
||||
--- a/src/llama-kv-cache.cpp
|
||||
+++ b/src/llama-kv-cache.cpp
|
||||
@@ -11,6 +11,8 @@
|
||||
#include <cstring>
|
||||
#include <limits>
|
||||
#include <map>
|
||||
+#include <numeric>
|
||||
+#include <cstdlib>
|
||||
#include <stdexcept>
|
||||
|
||||
static bool ggml_is_power_of_2(int n) {
|
||||
@@ -1020,6 +1022,45 @@ llama_kv_cache::slot_info llama_kv_cache::find_slot(const llama_ubatch & ubatch,
|
||||
return { };
|
||||
}
|
||||
|
||||
+ // [paged, experimental] Place this sequence's tokens at permuted,
|
||||
+ // non-contiguous fixed-size BLOCK positions instead of a contiguous run.
|
||||
+ // This validates that attention is invariant to physical KV placement -
|
||||
+ // the correctness premise of paged attention. Enabled via LLAMA_KV_PAGED.
|
||||
+ // Single-sequence scope (uses get_used() as the logical base); falls back
|
||||
+ // to the normal allocator if the permuted cells aren't available.
|
||||
+ static const bool paged_mode = (std::getenv("LLAMA_KV_PAGED") != nullptr);
|
||||
+ if (paged_mode) {
|
||||
+ const uint32_t bs = 16; // block size (tokens/block)
|
||||
+ const uint32_t nblk = cells.size() / bs; // blocks in this stream's pool
|
||||
+ if (nblk >= 2) {
|
||||
+ // stride coprime to nblk => block-index permutation is a bijection
|
||||
+ uint32_t k = 1;
|
||||
+ for (uint32_t cand = (nblk / 2) | 1u; cand < nblk; cand += 2) {
|
||||
+ if (std::gcd(cand, nblk) == 1u) { k = cand; break; }
|
||||
+ }
|
||||
+ const uint32_t base = cells.get_used();
|
||||
+ bool ok = true;
|
||||
+ for (uint32_t i = 0; i < n_tokens; ++i) {
|
||||
+ const uint32_t L = base + i;
|
||||
+ const uint32_t b = L / bs;
|
||||
+ const uint32_t off = L % bs;
|
||||
+ if (b >= nblk) { ok = false; break; }
|
||||
+ const uint32_t phys = ((b * k) % nblk) * bs + off; // permuted block
|
||||
+ if (phys >= cells.size() || !cells.is_empty(phys)) { ok = false; break; }
|
||||
+ res.idxs[s].push_back(phys);
|
||||
+ }
|
||||
+ if (ok && res.idxs[s].size() == n_tokens) {
|
||||
+ if (std::getenv("LLAMA_KV_PAGED_DEBUG")) {
|
||||
+ fprintf(stderr, "[paged] seq placed %u tok at cells:", n_tokens);
|
||||
+ for (uint32_t z = 0; z < res.idxs[s].size() && z < 24; ++z) fprintf(stderr, " %u", res.idxs[s][z]);
|
||||
+ fprintf(stderr, " (k=%u nblk=%u base=%u)\n", k, nblk, base);
|
||||
+ }
|
||||
+ continue; // paged placement succeeded for this sequence
|
||||
+ }
|
||||
+ res.idxs[s].clear(); // fall back to the normal allocator
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
uint32_t n_tested = 0;
|
||||
|
||||
// for continuous slots, we test that all tokens in the ubatch fit, starting from the current head
|
||||
--
|
||||
2.43.0
|
||||
|
||||
@@ -0,0 +1,370 @@
|
||||
From c1de00f4cc1eb0dd25993880bb4c8562be1937d4 Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Mon, 22 Jun 2026 10:24:22 +0200
|
||||
Subject: [PATCH] paged gather-read (env LLAMA_KV_PAGED) - patch 0003
|
||||
|
||||
Gather K, V and the kq_mask down to each sequence stream's non-empty cells
|
||||
before build_attn_mha. Position-sorted per stream so the flash-attn online
|
||||
softmax reduction order matches stock byte-for-byte. Multi-stream: one index
|
||||
column per stream over k->ne[3], padded to the max non-empty count with a
|
||||
masked (empty) cell. Gated behind LLAMA_KV_PAGED; no-op when unset.
|
||||
---
|
||||
src/CMakeLists.txt | 1 +
|
||||
src/llama-graph.cpp | 9 ++-
|
||||
src/llama-kv-cache.cpp | 74 ++++++++++++++++++++++++
|
||||
src/llama-kv-cache.h | 11 ++++
|
||||
src/paged-attn.cpp | 128 +++++++++++++++++++++++++++++++++++++++++
|
||||
src/paged-attn.h | 40 +++++++++++++
|
||||
6 files changed, 262 insertions(+), 1 deletion(-)
|
||||
create mode 100644 src/paged-attn.cpp
|
||||
create mode 100644 src/paged-attn.h
|
||||
|
||||
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
|
||||
index a030940..58083b3 100644
|
||||
--- a/src/CMakeLists.txt
|
||||
+++ b/src/CMakeLists.txt
|
||||
@@ -25,6 +25,7 @@ add_library(llama
|
||||
llama-kv-cache.cpp
|
||||
llama-kv-cache-iswa.cpp
|
||||
paged-kv-manager.cpp
|
||||
+ paged-attn.cpp
|
||||
llama-kv-cache-dsa.cpp
|
||||
llama-memory.cpp
|
||||
llama-memory-hybrid.cpp
|
||||
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
|
||||
index 68c9e60..b59d2a5 100644
|
||||
--- a/src/llama-graph.cpp
|
||||
+++ b/src/llama-graph.cpp
|
||||
@@ -6,6 +6,8 @@
|
||||
#include "llama-cparams.h"
|
||||
|
||||
#include "llama-kv-cache.h"
|
||||
+
|
||||
+#include "paged-attn.h"
|
||||
#include "llama-kv-cache-iswa.h"
|
||||
#include "llama-kv-cache-dsa.h"
|
||||
#include "llama-memory-hybrid.h"
|
||||
@@ -2356,7 +2358,12 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||
ggml_tensor * k = mctx_cur->get_k(ctx0, il);
|
||||
ggml_tensor * v = mctx_cur->get_v(ctx0, il);
|
||||
|
||||
- ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
|
||||
+ // [paged 0003] gather K, V and the mask to the sequence's used cells only
|
||||
+ // (no-op unless env LLAMA_KV_PAGED is set).
|
||||
+ ggml_tensor * kq_mask_g = kq_mask;
|
||||
+ paged_attn::gather(ctx0, res, mctx_cur, &k, &v, &kq_mask_g);
|
||||
+
|
||||
+ ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask_g, sinks, v_mla, kq_scale, il);
|
||||
cb(cur, "kqv_out", il);
|
||||
|
||||
if (inp->self_v_rot) {
|
||||
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
|
||||
index 999e2ae..30d02d7 100644
|
||||
--- a/src/llama-kv-cache.cpp
|
||||
+++ b/src/llama-kv-cache.cpp
|
||||
@@ -1,4 +1,6 @@
|
||||
#include "llama-kv-cache.h"
|
||||
+#include <vector>
|
||||
+#include <utility>
|
||||
|
||||
#include "llama-impl.h"
|
||||
#include "llama-io.h"
|
||||
@@ -1329,6 +1331,70 @@ ggml_tensor * llama_kv_cache::get_v(ggml_context * ctx, int32_t il, uint32_t n_k
|
||||
ggml_row_size(v->type, kv_size*n_embd_v_gqa)*sinfo.s0);
|
||||
}
|
||||
|
||||
+// [paged 0003] gather-read: count the non-empty cells in [0, n_kv) of every
+// stream in [sinfo.s0, sinfo.s1] and return the largest count.
+// With paged placement (patch 0002) these are the sequence's scattered block
+// cells; gathering K/V/mask by this index list compacts the attention read while
+// preserving every unmasked (token,cell) pair.
+uint32_t llama_kv_cache::get_n_gather(uint32_t n_kv, const slot_info & sinfo) const {
+    // Multi-stream: the gathered K/V/mask tensors are rectangular [.., n_gather,
+    // n_stream], so n_gather is the MAX non-empty count across the batch streams.
+    // Streams with fewer cells are padded (see get_gather_idxs) with a masked
+    // (empty) cell index, which contributes exp(-inf)=0 and is thus a no-op.
+    // K is laid out over physical streams [s0, s1]; index v_cells the same way.
+    const uint32_t n_stream = sinfo.s1 - sinfo.s0 + 1;
+
+    uint32_t n_max = 0;
+    for (uint32_t is = 0; is < n_stream; ++is) {
+        const auto & cells = v_cells[sinfo.s0 + is];
+        const uint32_t limit = std::min<uint32_t>(n_kv, cells.size());
+
+        uint32_t n_used = 0;
+        for (uint32_t ic = 0; ic < limit; ++ic) {
+            n_used += cells.is_empty(ic) ? 0u : 1u;
+        }
+
+        if (n_used > n_max) {
+            n_max = n_used;
+        }
+    }
+    return n_max;
+}
|
||||
+
|
||||
+// [paged 0003] Fill `dst` with one column of cell indices per stream in
+// [sinfo.s0, sinfo.s1]. Each column has get_n_gather() entries: the stream's
+// non-empty cells in [0, n_kv) sorted by token position, followed by padding.
+void llama_kv_cache::get_gather_idxs(int32_t * dst, uint32_t n_kv, const slot_info & sinfo) const {
+    const uint32_t n_stream = sinfo.s1 - sinfo.s0 + 1;
+    const uint32_t n_gather = get_n_gather(n_kv, sinfo);
+
+    // dst is [n_gather, n_stream] (ne0 = n_gather): column s at dst[s*n_gather..].
+    for (uint32_t is = 0; is < n_stream; ++is) {
+        const auto & cells = v_cells[sinfo.s0 + is];
+        const uint32_t limit = std::min<uint32_t>(n_kv, cells.size());
+
+        // Collect the non-empty cells, then order them by token POSITION (not by
+        // physical cell index). The attention reduction (flash-attn online
+        // softmax, and the non-flash soft_max) runs over cells in array order and
+        // is order-sensitive in floating point. Stock (contiguous) placement
+        // happens to store cells in position order, so emitting the gathered
+        // indices in position order reproduces stock's exact reduction order -
+        // making the paged read bit-identical, not merely math-equivalent.
+        std::vector<std::pair<llama_pos, int32_t>> used;
+        used.reserve(limit);
+        int32_t first_empty = -1;
+        for (uint32_t ic = 0; ic < limit; ++ic) {
+            if (cells.is_empty(ic)) {
+                if (first_empty < 0) {
+                    first_empty = (int32_t) ic;  // first empty cell: its mask is -inf -> safe pad
+                }
+                continue;
+            }
+            used.emplace_back(cells.pos_get(ic), (int32_t) ic);
+        }
+        std::sort(used.begin(), used.end());
+
+        int32_t * out = dst + (size_t) is * n_gather;
+        uint32_t n_written = 0;
+        for (const auto & entry : used) {
+            out[n_written++] = entry.second;
+        }
+
+        // Pad the tail to n_gather with a masked (empty) cell so the rectangular
+        // gather drops to zero contribution for streams shorter than the max.
+        // Without an empty cell in range, fall back to the last used cell (or 0).
+        int32_t fill = first_empty;
+        if (fill < 0) {
+            fill = used.empty() ? 0 : used.back().second;
+        }
+        while (n_written < n_gather) {
+            out[n_written++] = fill;
+        }
+    }
+}
|
||||
+
|
||||
ggml_tensor * llama_kv_cache::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il, const slot_info & sinfo) const {
|
||||
GGML_UNUSED(sinfo);
|
||||
|
||||
@@ -2620,6 +2686,14 @@ ggml_tensor * llama_kv_cache_context::get_v(ggml_context * ctx, int32_t il) cons
|
||||
return kv->get_v(ctx, il, n_kv, sinfos[i_cur]);
|
||||
}
|
||||
|
||||
+uint32_t llama_kv_cache_context::get_n_gather() const {
|
||||
+ return kv->get_n_gather(n_kv, sinfos[i_cur]);
|
||||
+}
|
||||
+
|
||||
+void llama_kv_cache_context::get_gather_idxs(int32_t * dst) const {
|
||||
+ kv->get_gather_idxs(dst, n_kv, sinfos[i_cur]);
|
||||
+}
|
||||
+
|
||||
ggml_tensor * llama_kv_cache_context::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il) const {
|
||||
return kv->cpy_k(ctx, k_cur, k_idxs, il, sinfos[i_cur]);
|
||||
}
|
||||
diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h
|
||||
index 3d68f98..494c0fb 100644
|
||||
--- a/src/llama-kv-cache.h
|
||||
+++ b/src/llama-kv-cache.h
|
||||
@@ -171,6 +171,12 @@ public:
|
||||
ggml_tensor * get_k(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const;
|
||||
ggml_tensor * get_v(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const;
|
||||
|
||||
+ // [paged 0003] count / list the non-empty cells in [0, n_kv) per stream of
|
||||
+ // sinfo (position-sorted, padded across streams). Used by paged-attn
|
||||
+ // gather-read. get_n_gather returns the max count across streams.
|
||||
+ uint32_t get_n_gather(uint32_t n_kv, const slot_info & sinfo) const;
|
||||
+ void get_gather_idxs(int32_t * dst, uint32_t n_kv, const slot_info & sinfo) const;
|
||||
+
|
||||
// store k_cur and v_cur in the cache based on the provided head location
|
||||
ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il, const slot_info & sinfo) const;
|
||||
ggml_tensor * cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il, const slot_info & sinfo) const;
|
||||
@@ -368,6 +374,11 @@ public:
|
||||
ggml_tensor * get_k(ggml_context * ctx, int32_t il) const;
|
||||
ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;
|
||||
|
||||
+ // [paged 0003] gather-read helpers (delegate to the kv cache for the
|
||||
+ // current ubatch's stream).
|
||||
+ uint32_t get_n_gather() const;
|
||||
+ void get_gather_idxs(int32_t * dst) const;
|
||||
+
|
||||
// store k_cur and v_cur in the cache based on the provided head location
|
||||
// note: the heads in k_cur and v_cur should be laid out contiguously in memory
|
||||
// - k_cur [n_embd_head_k, n_head_k, n_tokens]
|
||||
diff --git a/src/paged-attn.cpp b/src/paged-attn.cpp
|
||||
new file mode 100644
|
||||
index 0000000..ade75e8
|
||||
--- /dev/null
|
||||
+++ b/src/paged-attn.cpp
|
||||
@@ -0,0 +1,128 @@
|
||||
+#include "paged-attn.h"
|
||||
+
|
||||
+#include "llama-graph.h"
|
||||
+#include "llama-kv-cache.h"
|
||||
+
|
||||
+#include "ggml.h"
|
||||
+#include "ggml-backend.h"
|
||||
+
|
||||
+#include <cstdlib>
|
||||
+#include <cstdio>
|
||||
+
|
||||
+namespace paged_attn {
|
||||
+
|
||||
+bool active() {
|
||||
+ static const bool a = (std::getenv("LLAMA_KV_PAGED") != nullptr);
|
||||
+ return a;
|
||||
+}
|
||||
+
|
||||
+static bool debug() {
|
||||
+ static const bool d = (std::getenv("LLAMA_KV_PAGED_DEBUG") != nullptr);
|
||||
+ return d;
|
||||
+}
|
||||
+
|
||||
+namespace {
|
||||
+
|
||||
+// Graph input that, at set_input time, fills an I32 [n_gather, n_stream] tensor
|
||||
+// with each stream's non-empty cell indices (position-sorted, padded with a
|
||||
+// masked/empty cell) by delegating to the kv-cache context. Private to this
|
||||
+// unit; default can_reuse()==false keeps the graph from being reused across
|
||||
+// decodes (n_gather grows every step).
|
||||
+class input_gather_idxs : public llm_graph_input_i {
|
||||
+public:
|
||||
+ input_gather_idxs(const llama_kv_cache_context * mctx, ggml_tensor * idxs)
|
||||
+ : mctx(mctx), idxs(idxs) {}
|
||||
+
|
||||
+ void set_input(const llama_ubatch * ubatch) override {
|
||||
+ GGML_UNUSED(ubatch);
|
||||
+ GGML_ASSERT(idxs && ggml_backend_buffer_is_host(idxs->buffer));
|
||||
+ mctx->get_gather_idxs((int32_t *) idxs->data);
|
||||
+ }
|
||||
+
|
||||
+ const llama_kv_cache_context * mctx;
|
||||
+ ggml_tensor * idxs;
|
||||
+};
|
||||
+
|
||||
+} // namespace
|
||||
+
|
||||
+void gather(ggml_context * ctx0,
|
||||
+ llm_graph_result * res,
|
||||
+ const llama_kv_cache_context * mctx,
|
||||
+ ggml_tensor ** k,
|
||||
+ ggml_tensor ** v,
|
||||
+ ggml_tensor ** kq_mask) {
|
||||
+ if (!active()) {
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ ggml_tensor * K = *k;
|
||||
+ ggml_tensor * V = *v;
|
||||
+ ggml_tensor * M = *kq_mask;
|
||||
+
|
||||
+ // Number of streams (sequences) in the unified batch. K is laid out
|
||||
+ // [d, h, n_kv, n_stream] and the mask is [n_kv, n_tps, 1, n_stream]; the
|
||||
+ // gather is per-stream (one index column per stream), so a single
|
||||
+ // ggml_get_rows over the stream axis handles 1..N streams uniformly.
|
||||
+ const int64_t n_stream = K->ne[3];
|
||||
+ GGML_ASSERT(M->ne[3] == n_stream);
|
||||
+
|
||||
+ const int64_t n_gather = (int64_t) mctx->get_n_gather();
|
||||
+ if (n_gather <= 0) {
|
||||
+ // Worst-case graph reserve (empty cache) or nothing placed yet: leave
|
||||
+ // the full [0, n_kv) read untouched so buffer sizing stays worst-case.
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ if (debug()) {
|
||||
+ static int64_t once = 0;
|
||||
+ if (once++ < 2) {
|
||||
+ fprintf(stderr, "[paged-attn] gather n_stream=%lld n_kv=%lld n_gather=%lld\n",
|
||||
+ (long long) n_stream, (long long) K->ne[2], (long long) n_gather);
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ // Per-stream index tensor [n_gather, n_stream], filled at set_input from
|
||||
+ // each stream's non-empty cells. ggml_get_rows broadcasts along ne[1]==
|
||||
+ // n_stream, so column s gathers from stream s of the source.
|
||||
+ ggml_tensor * idx = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_gather, n_stream);
|
||||
+ ggml_set_input(idx);
|
||||
+ res->add_input(llm_graph_input_ptr(new input_gather_idxs(mctx, idx)));
|
||||
+
|
||||
+ // --- gather K: collapse (head_dim, n_head) so cells become the row axis ---
|
||||
+ {
|
||||
+ ggml_tensor * t = ggml_cont(ctx0, K); // [d, h, n_kv, ns]
|
||||
+ t = ggml_reshape_3d(ctx0, t, K->ne[0]*K->ne[1], K->ne[2], n_stream); // [d*h, n_kv, ns]
|
||||
+ t = ggml_get_rows(ctx0, t, idx); // [d*h, n_gather, ns]
|
||||
+ *k = ggml_reshape_4d(ctx0, t, K->ne[0], K->ne[1], n_gather, n_stream); // [d, h, n_gather, ns]
|
||||
+ }
|
||||
+
|
||||
+ // --- gather V ---
|
||||
+ // Normalize to a non-transposed [d, h, n_kv, ns] view first, so the gathered
|
||||
+ // result is contiguous and build_attn_mha sees a consistent v_trans==false.
|
||||
+ {
|
||||
+ const bool v_trans = V->nb[1] > V->nb[2];
|
||||
+ ggml_tensor * vsrc = v_trans
|
||||
+ ? ggml_permute(ctx0, V, 2, 1, 0, 3) // [n_kv, h, d, ns] -> [d, h, n_kv, ns]
|
||||
+ : V; // already [d, h, n_kv, ns]
|
||||
+ ggml_tensor * t = ggml_cont(ctx0, vsrc); // [d, h, n_kv, ns]
|
||||
+ t = ggml_reshape_3d(ctx0, t, vsrc->ne[0]*vsrc->ne[1], vsrc->ne[2], n_stream); // [d*h, n_kv, ns]
|
||||
+ t = ggml_get_rows(ctx0, t, idx); // [d*h, n_gather, ns]
|
||||
+ *v = ggml_reshape_4d(ctx0, t, vsrc->ne[0], vsrc->ne[1], n_gather, n_stream); // [d, h, n_gather, ns]
|
||||
+ }
|
||||
+
|
||||
+ // --- gather mask (cells are ne0): transpose so cells become the row axis,
|
||||
+ // gather per stream, transpose back ---
|
||||
+ {
|
||||
+ ggml_tensor * m = ggml_reshape_3d(ctx0, M, M->ne[0], M->ne[1], n_stream); // [n_kv, n_tps, ns]
|
||||
+ m = ggml_cont(ctx0, ggml_transpose(ctx0, m)); // [n_tps, n_kv, ns]
|
||||
+ m = ggml_get_rows(ctx0, m, idx); // [n_tps, n_gather, ns] (F32)
|
||||
+ m = ggml_cont(ctx0, ggml_transpose(ctx0, m)); // [n_gather, n_tps, ns]
|
||||
+ m = ggml_reshape_4d(ctx0, m, n_gather, M->ne[1], 1, n_stream);
|
||||
+ if (M->type != m->type) {
|
||||
+ m = ggml_cast(ctx0, m, M->type); // flash-attn requires an F16 mask
|
||||
+ }
|
||||
+ *kq_mask = m;
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+} // namespace paged_attn
|
||||
diff --git a/src/paged-attn.h b/src/paged-attn.h
|
||||
new file mode 100644
|
||||
index 0000000..c5b7bd7
|
||||
--- /dev/null
|
||||
+++ b/src/paged-attn.h
|
||||
@@ -0,0 +1,41 @@
|
||||
+#pragma once
|
||||
+// Paged attention gather-read (patch 0003, experimental).
|
||||
+//
|
||||
+// Companion to the paged block placement in llama_kv_cache::find_slot (patch
|
||||
+// 0002). Patch 0002 places a sequence's tokens at permuted, non-contiguous
|
||||
+// fixed-size block cells, but attention still reads the whole [0, n_kv) window
|
||||
+// (empty cells masked to -inf). This unit compacts that read: it gathers K, V
|
||||
+// and the kq_mask down to ONLY the sequence's used (non-empty) cells before
|
||||
+// build_attn_mha.
|
||||
+//
|
||||
+// Correctness: attention is permutation-invariant over the KV set, and dropping
|
||||
+// already-masked empty cells removes only exp(-inf)=0 terms - so greedy output
|
||||
+// is identical to stock. Gated behind env LLAMA_KV_PAGED; a no-op when unset.
|
||||
+//
|
||||
+// All logic lives here to keep the core files additive: build_attn gets one
|
||||
+// call, llama_kv_cache_context gets two thin accessors, CMake gets one line.
|
||||
+
|
||||
+#include <cstddef>
|
||||
+#include <cstdint>
|
||||
+
|
||||
+struct ggml_context;
|
||||
+struct ggml_tensor;
|
||||
+class llm_graph_result;
|
||||
+class llama_kv_cache_context;
|
||||
+
|
||||
+namespace paged_attn {
|
||||
+
|
||||
+// true iff env LLAMA_KV_PAGED is set (evaluated once).
|
||||
+bool active();
|
||||
+
|
||||
+// Gather K, V and the kq_mask down to the current sequence's non-empty cells.
|
||||
+// No-op (returns immediately) unless active(). On return *k, *v and *kq_mask
|
||||
+// point at the compacted tensors; pass them straight to build_attn_mha.
|
||||
+void gather(ggml_context * ctx0,
|
||||
+ llm_graph_result * res,
|
||||
+ const llama_kv_cache_context * mctx,
|
||||
+ ggml_tensor ** k,
|
||||
+ ggml_tensor ** v,
|
||||
+ ggml_tensor ** kq_mask);
|
||||
+
|
||||
+} // namespace paged_attn
|
||||
--
|
||||
2.43.0
|
||||
|
||||
@@ -0,0 +1,298 @@
|
||||
From 7c294973de28d1ac991505638d726acfb371d541 Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Mon, 22 Jun 2026 10:50:35 +0200
|
||||
Subject: [PATCH] paged on-demand block allocation (env LLAMA_KV_PAGED) - patch
|
||||
0004
|
||||
|
||||
Drive the paged placement in find_slot through the vendored PagedKVManager
|
||||
(patch 0001) instead of a fixed full-pool permutation. Blocks are popped from a
|
||||
free pool on demand as the sequence crosses block boundaries (peak << full
|
||||
reservation) and returned on sequence end (seq_rm full removal / clear). One
|
||||
manager per (kv-cache, stream); all state lives in the new src/paged-alloc unit,
|
||||
so the core kv-cache struct is untouched - find_slot/clear/seq_rm gain only a
|
||||
gated call. Default off; stock path byte-identical.
|
||||
---
|
||||
src/CMakeLists.txt | 1 +
|
||||
src/llama-kv-cache.cpp | 69 +++++++++++++++++----------
|
||||
src/paged-alloc.cpp | 106 +++++++++++++++++++++++++++++++++++++++++
|
||||
src/paged-alloc.h | 39 +++++++++++++++
|
||||
4 files changed, 190 insertions(+), 25 deletions(-)
|
||||
create mode 100644 src/paged-alloc.cpp
|
||||
create mode 100644 src/paged-alloc.h
|
||||
|
||||
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
|
||||
index 58083b3..4d9d7d1 100644
|
||||
--- a/src/CMakeLists.txt
|
||||
+++ b/src/CMakeLists.txt
|
||||
@@ -26,6 +26,7 @@ add_library(llama
|
||||
llama-kv-cache-iswa.cpp
|
||||
paged-kv-manager.cpp
|
||||
paged-attn.cpp
|
||||
+ paged-alloc.cpp
|
||||
llama-kv-cache-dsa.cpp
|
||||
llama-memory.cpp
|
||||
llama-memory-hybrid.cpp
|
||||
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
|
||||
index 30d02d7..1125d9a 100644
|
||||
--- a/src/llama-kv-cache.cpp
|
||||
+++ b/src/llama-kv-cache.cpp
|
||||
@@ -1,4 +1,5 @@
|
||||
#include "llama-kv-cache.h"
|
||||
+#include "paged-alloc.h"
|
||||
#include <vector>
|
||||
#include <utility>
|
||||
|
||||
@@ -381,6 +382,11 @@ llama_kv_cache::llama_kv_cache(
|
||||
}
|
||||
|
||||
void llama_kv_cache::clear(bool data) {
|
||||
+ // [paged 0004] return all on-demand blocks to the pool on cache clear.
|
||||
+ if (paged_alloc::active()) {
|
||||
+ paged_alloc::release_all(this);
|
||||
+ }
|
||||
+
|
||||
for (uint32_t s = 0; s < n_stream; ++s) {
|
||||
v_cells[s].reset();
|
||||
v_heads[s] = 0;
|
||||
@@ -409,6 +415,16 @@ bool llama_kv_cache::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
|
||||
p1 = std::numeric_limits<llama_pos>::max();
|
||||
}
|
||||
|
||||
+ // [paged 0004] free a stream's on-demand blocks when its whole sequence is
|
||||
+ // removed (sequence end), so they return to the pool for reuse.
|
||||
+ if (paged_alloc::active() && p0 == 0 && p1 == std::numeric_limits<llama_pos>::max()) {
|
||||
+ if (seq_id >= 0) {
|
||||
+ paged_alloc::release(this, (int) seq_to_stream[seq_id]);
|
||||
+ } else {
|
||||
+ paged_alloc::release_all(this);
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
if (seq_id >= 0) {
|
||||
auto & cells = v_cells[seq_to_stream[seq_id]];
|
||||
auto & head = v_heads[seq_to_stream[seq_id]];
|
||||
@@ -1030,36 +1046,39 @@ llama_kv_cache::slot_info llama_kv_cache::find_slot(const llama_ubatch & ubatch,
|
||||
// the correctness premise of paged attention. Enabled via LLAMA_KV_PAGED.
|
||||
// Single-sequence scope (uses get_used() as the logical base); falls back
|
||||
// to the normal allocator if the permuted cells aren't available.
|
||||
- static const bool paged_mode = (std::getenv("LLAMA_KV_PAGED") != nullptr);
|
||||
- if (paged_mode) {
|
||||
+ // [paged 0004] On-demand block allocation. Patch 0002 proved attention is
|
||||
+ // invariant to physical KV placement; here that placement is driven by
|
||||
+ // the vendored PagedKVManager (patch 0001): blocks are popped from a free
|
||||
+ // pool only as the sequence crosses block boundaries (peak << full
|
||||
+ // reservation) and returned on sequence end. Enabled via LLAMA_KV_PAGED;
|
||||
+ // falls back to the normal allocator on pool exhaustion or any conflict.
|
||||
+ if (paged_alloc::active()) {
|
||||
const uint32_t bs = 16; // block size (tokens/block)
|
||||
- const uint32_t nblk = cells.size() / bs; // blocks in this stream's pool
|
||||
+ const uint32_t nblk = cells.size() / bs; // this stream's block budget
|
||||
if (nblk >= 2) {
|
||||
- // stride coprime to nblk => block-index permutation is a bijection
|
||||
- uint32_t k = 1;
|
||||
- for (uint32_t cand = (nblk / 2) | 1u; cand < nblk; cand += 2) {
|
||||
- if (std::gcd(cand, nblk) == 1u) { k = cand; break; }
|
||||
- }
|
||||
const uint32_t base = cells.get_used();
|
||||
- bool ok = true;
|
||||
- for (uint32_t i = 0; i < n_tokens; ++i) {
|
||||
- const uint32_t L = base + i;
|
||||
- const uint32_t b = L / bs;
|
||||
- const uint32_t off = L % bs;
|
||||
- if (b >= nblk) { ok = false; break; }
|
||||
- const uint32_t phys = ((b * k) % nblk) * bs + off; // permuted block
|
||||
- if (phys >= cells.size() || !cells.is_empty(phys)) { ok = false; break; }
|
||||
- res.idxs[s].push_back(phys);
|
||||
- }
|
||||
- if (ok && res.idxs[s].size() == n_tokens) {
|
||||
- if (std::getenv("LLAMA_KV_PAGED_DEBUG")) {
|
||||
- fprintf(stderr, "[paged] seq placed %u tok at cells:", n_tokens);
|
||||
- for (uint32_t z = 0; z < res.idxs[s].size() && z < 24; ++z) fprintf(stderr, " %u", res.idxs[s][z]);
|
||||
- fprintf(stderr, " (k=%u nblk=%u base=%u)\n", k, nblk, base);
|
||||
+ const int strm = (int) seq_to_stream[seq_id];
|
||||
+ std::vector<uint32_t> placed;
|
||||
+ if (paged_alloc::place(this, strm, base, n_tokens, bs, nblk, placed)) {
|
||||
+ bool ok = (placed.size() == n_tokens);
|
||||
+ for (uint32_t i = 0; ok && i < n_tokens; ++i) {
|
||||
+ if (placed[i] >= cells.size() || !cells.is_empty(placed[i])) {
|
||||
+ ok = false;
|
||||
+ }
|
||||
+ }
|
||||
+ if (ok) {
|
||||
+ for (uint32_t phys : placed) {
|
||||
+ res.idxs[s].push_back(phys);
|
||||
+ }
|
||||
+ if (std::getenv("LLAMA_KV_PAGED_DEBUG")) {
|
||||
+ fprintf(stderr, "[paged] stream %d placed %u tok at cells:", strm, n_tokens);
|
||||
+ for (uint32_t z = 0; z < res.idxs[s].size() && z < 24; ++z) fprintf(stderr, " %u", res.idxs[s][z]);
|
||||
+ fprintf(stderr, " (nblk=%u base=%u)\n", nblk, base);
|
||||
+ }
|
||||
+ continue; // on-demand paged placement succeeded
|
||||
}
|
||||
- continue; // paged placement succeeded for this sequence
|
||||
+ res.idxs[s].clear(); // fall back to the normal allocator
|
||||
}
|
||||
- res.idxs[s].clear(); // fall back to the normal allocator
|
||||
}
|
||||
}
|
||||
|
||||
diff --git a/src/paged-alloc.cpp b/src/paged-alloc.cpp
|
||||
new file mode 100644
|
||||
index 0000000..1d13f9c
|
||||
--- /dev/null
|
||||
+++ b/src/paged-alloc.cpp
|
||||
@@ -0,0 +1,106 @@
|
||||
+#include "paged-alloc.h"
|
||||
+#include "paged-kv-manager.h"
|
||||
+
|
||||
+#include <cstdlib>
|
||||
+#include <cstdio>
|
||||
+#include <map>
|
||||
+#include <memory>
|
||||
+#include <utility>
|
||||
+
|
||||
+namespace paged_alloc {
|
||||
+
|
||||
+bool active() {
|
||||
+ static const bool a = (std::getenv("LLAMA_KV_PAGED") != nullptr);
|
||||
+ return a;
|
||||
+}
|
||||
+
|
||||
+static bool debug() {
|
||||
+ static const bool d = (std::getenv("LLAMA_KV_PAGED_DEBUG") != nullptr);
|
||||
+ return d;
|
||||
+}
|
||||
+
|
||||
+namespace {
|
||||
+
|
||||
+using key_t = std::pair<const void *, int>;
|
||||
+
|
||||
+// One PagedKVManager per (kv-cache, stream): each stream owns a separate
|
||||
+// physical pool of cells.size() cells, so a manager's block ids map directly to
|
||||
+// cell ranges within that stream's pool. The internal request id is always 0.
|
||||
+std::map<key_t, std::unique_ptr<paged::PagedKVManager>> g_managers;
|
||||
+
|
||||
+paged::PagedKVManager * get_mgr(const void * cache, int stream,
|
||||
+ uint32_t pool_blocks, uint32_t block_size) {
|
||||
+ const key_t k{cache, stream};
|
||||
+ auto it = g_managers.find(k);
|
||||
+ if (it == g_managers.end()) {
|
||||
+ // enable_caching=false: prefix caching is a later patch; 0004 exercises
|
||||
+ // only on-demand allocate / free.
|
||||
+ auto mgr = std::make_unique<paged::PagedKVManager>(
|
||||
+ (int32_t) pool_blocks, (int) block_size, /*enable_caching=*/false);
|
||||
+ it = g_managers.emplace(k, std::move(mgr)).first;
|
||||
+ }
|
||||
+ return it->second.get();
|
||||
+}
|
||||
+
|
||||
+} // namespace
|
||||
+
|
||||
+bool place(const void * cache, int stream, uint32_t base, uint32_t n_tokens,
|
||||
+ uint32_t block_size, uint32_t pool_blocks,
|
||||
+ std::vector<uint32_t> & out) {
|
||||
+ if (n_tokens == 0) {
|
||||
+ return true;
|
||||
+ }
|
||||
+
|
||||
+ paged::PagedKVManager * mgr = get_mgr(cache, stream, pool_blocks, block_size);
|
||||
+
|
||||
+ const size_t before = mgr->block_table(0).size();
|
||||
+
|
||||
+ // Grow the request to cover the highest logical position. The manager pops
|
||||
+ // free blocks only for the boundaries actually crossed - that is the on-
|
||||
+ // demand behavior; an already-covered range adds nothing.
|
||||
+ if (!mgr->allocate(0, (size_t) base + n_tokens)) {
|
||||
+ return false; // pool exhausted -> caller falls back to the stock path
|
||||
+ }
|
||||
+
|
||||
+ out.reserve(out.size() + n_tokens);
|
||||
+ for (uint32_t i = 0; i < n_tokens; ++i) {
|
||||
+ const int64_t s = mgr->slot(0, (int) (base + i));
|
||||
+ out.push_back((uint32_t) s);
|
||||
+ }
|
||||
+
|
||||
+ if (debug()) {
|
||||
+ const size_t after = mgr->block_table(0).size();
|
||||
+ if (after != before) {
|
||||
+ fprintf(stderr,
|
||||
+ "[paged-alloc] cache=%p stream=%d grew %zu->%zu blocks "
|
||||
+ "(budget=%u; base=%u +%u tok)\n",
|
||||
+ cache, stream, before, after, pool_blocks, base, n_tokens);
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ return true;
|
||||
+}
|
||||
+
|
||||
+void release(const void * cache, int stream) {
|
||||
+ auto it = g_managers.find({cache, stream});
|
||||
+ if (it == g_managers.end()) {
|
||||
+ return;
|
||||
+ }
|
||||
+ it->second->free(0);
|
||||
+ g_managers.erase(it);
|
||||
+ if (debug()) {
|
||||
+ fprintf(stderr, "[paged-alloc] released cache=%p stream=%d\n", cache, stream);
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+void release_all(const void * cache) {
|
||||
+ for (auto it = g_managers.begin(); it != g_managers.end(); ) {
|
||||
+ if (it->first.first == cache) {
|
||||
+ it = g_managers.erase(it);
|
||||
+ } else {
|
||||
+ ++it;
|
||||
+ }
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+} // namespace paged_alloc
|
||||
diff --git a/src/paged-alloc.h b/src/paged-alloc.h
|
||||
new file mode 100644
|
||||
index 0000000..bf66665
|
||||
--- /dev/null
|
||||
+++ b/src/paged-alloc.h
|
||||
@@ -0,0 +1,39 @@
|
||||
+#pragma once
|
||||
+// On-demand paged KV block allocation (patch 0004, experimental).
|
||||
+//
|
||||
+// Backs the paged placement in llama_kv_cache::find_slot (patch 0002) with the
|
||||
+// vendored host-side PagedKVManager (patch 0001). Instead of mapping a
|
||||
+// sequence's logical positions onto a fixed full-pool permutation, blocks are
|
||||
+// popped from a free pool ON DEMAND as the sequence crosses block boundaries,
|
||||
+// and returned to the pool on sequence end. This is where the paged memory-
|
||||
+// capacity benefit begins: a short sequence holds only a few blocks, not the
|
||||
+// whole reserved window.
|
||||
+//
|
||||
+// Gated behind env LLAMA_KV_PAGED; a no-op when unset. All state lives in this
|
||||
+// unit (a static registry keyed by kv-cache + stream), so the core kv-cache
|
||||
+// struct stays untouched - find_slot only gains a gated call.
|
||||
+
|
||||
+#include <cstdint>
|
||||
+#include <vector>
|
||||
+
|
||||
+namespace paged_alloc {
|
||||
+
|
||||
+// true iff env LLAMA_KV_PAGED is set (evaluated once).
|
||||
+bool active();
|
||||
+
|
||||
+// Place n_tokens logical positions [base, base+n_tokens) of one stream on
|
||||
+// demand, appending their physical cell indices to `out`. pool_blocks =
|
||||
+// cells.size()/block_size is this stream's block budget. Returns false (leaving
|
||||
+// `out` unchanged) on pool exhaustion, so the caller falls back to the stock
|
||||
+// allocator. The caller still validates each returned cell is empty.
|
||||
+bool place(const void * cache, int stream, uint32_t base, uint32_t n_tokens,
|
||||
+ uint32_t block_size, uint32_t pool_blocks,
|
||||
+ std::vector<uint32_t> & out);
|
||||
+
|
||||
+// Return a stream's blocks to the pool (sequence end).
|
||||
+void release(const void * cache, int stream);
|
||||
+
|
||||
+// Return every stream's blocks for a kv-cache (clear() / teardown).
|
||||
+void release_all(const void * cache);
|
||||
+
|
||||
+} // namespace paged_alloc
|
||||
--
|
||||
2.43.0
|
||||
|
||||
@@ -0,0 +1,143 @@
|
||||
From 141029beec609e87f24f6f6bba3ec842d7037862 Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Mon, 22 Jun 2026 12:13:44 +0200
|
||||
Subject: [PATCH] paged cross-request prefix caching (env LLAMA_KV_PAGED) -
|
||||
patch 0006
|
||||
|
||||
Add host-side cross-request prefix sharing to the vendored PagedKVManager
|
||||
(patches 0001-0004): on placement, hash a new sequence prefix blocks, reuse the
|
||||
matching cached physical blocks (ref_cnt++) for the shared prefix and allocate
|
||||
fresh blocks only for the divergent suffix. A shared block is freed only at
|
||||
ref 0; copy-on-write privatises a still-shared (ref>1) block before a divergent
|
||||
write so co-owners stay byte-correct. All logic lives in the vendored
|
||||
src/paged-kv-manager unit (place_with_prefix / cow_block / ref-counting); the
|
||||
core kv-cache files are untouched. Default off; gated behind LLAMA_KV_PAGED.
|
||||
|
||||
Wiring the physical-cell reuse into find_slot so the engine itself skips
|
||||
recompute needs core seq-membership changes and is left to a later patch.
|
||||
|
||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
||||
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
|
||||
---
|
||||
src/paged-kv-manager.cpp | 65 ++++++++++++++++++++++++++++++++++++++++
|
||||
src/paged-kv-manager.h | 23 ++++++++++++++
|
||||
2 files changed, 88 insertions(+)
|
||||
|
||||
diff --git a/src/paged-kv-manager.cpp b/src/paged-kv-manager.cpp
|
||||
index ca0dcd8..4c6ee4c 100644
|
||||
--- a/src/paged-kv-manager.cpp
|
||||
+++ b/src/paged-kv-manager.cpp
|
||||
@@ -293,4 +293,69 @@ void PagedKVManager::cache_blocks(int seq_id, const std::vector<uint64_t>& block
|
||||
pool_.cache_full_blocks(req, /*num_cached=*/0, n_full, block_hashes);
|
||||
}
|
||||
|
||||
+// ---------------------------------------------------------------------------
|
||||
+// Cross-request prefix caching + copy-on-write (patch 0006)
|
||||
+// ---------------------------------------------------------------------------
|
||||
+
|
||||
+size_t PagedKVManager::place_with_prefix(int seq_id, const std::vector<int>& token_ids) {
|
||||
+ auto& req = req_to_blocks_[seq_id];
|
||||
+
|
||||
+ // Longest cached prefix: hash the full blocks and stop at the first miss.
|
||||
+ // A block hash transitively encodes its whole prefix (FNV chaining), so the
|
||||
+ // first miss bounds the reusable prefix (vLLM find_longest_cache_hit).
|
||||
+ const std::vector<uint64_t> hashes = compute_block_hashes(token_ids);
|
||||
+ std::vector<KVCacheBlock*> hits;
|
||||
+ for (uint64_t bh : hashes) {
|
||||
+ KVCacheBlock* cb = pool_.get_cached_block(bh);
|
||||
+ if (!cb) break;
|
||||
+ hits.push_back(cb);
|
||||
+ }
|
||||
+
|
||||
+ // Reuse: ++ref_cnt (pulling warm blocks back out of the free list) then
|
||||
+ // splice the shared physical blocks into this sequence's block table.
|
||||
+ pool_.touch(hits);
|
||||
+ req.insert(req.end(), hits.begin(), hits.end());
|
||||
+
|
||||
+ // Allocate fresh blocks only for the divergent suffix.
|
||||
+ const size_t need = cdiv(token_ids.size(), block_size_);
|
||||
+ if (need > req.size()) {
|
||||
+ const size_t add = need - req.size();
|
||||
+ if (add > pool_.get_num_free_blocks()) {
|
||||
+ // OOM: roll the sequence back (un-touch the shared prefix so no ref
|
||||
+ // leaks) and report no placement; the caller falls back to stock.
|
||||
+ std::vector<KVCacheBlock*> ordered(req.rbegin(), req.rend());
|
||||
+ pool_.free_blocks(ordered);
|
||||
+ req.clear();
|
||||
+ return 0;
|
||||
+ }
|
||||
+ auto nb = pool_.get_new_blocks(add);
|
||||
+ req.insert(req.end(), nb.begin(), nb.end());
|
||||
+ }
|
||||
+ return hits.size();
|
||||
+}
|
||||
+
|
||||
+std::pair<int32_t, int32_t> PagedKVManager::cow_block(int seq_id, size_t bi) {
|
||||
+ auto& req = req_to_blocks_.at(seq_id);
|
||||
+ KVCacheBlock* old = req.at(bi);
|
||||
+ if (old->ref_cnt <= 1) {
|
||||
+ return { old->block_id, old->block_id }; // already private - no copy
|
||||
+ }
|
||||
+ // Private copy for this sequence. get_new_blocks sets the fresh block's
|
||||
+ // ref_cnt to 1; free_blocks decrements the shared block, which stays >0 so
|
||||
+ // it is NOT returned to the pool and the other owners are left untouched.
|
||||
+ KVCacheBlock* fresh = pool_.get_new_blocks(1).front();
|
||||
+ pool_.free_blocks({ old });
|
||||
+ req[bi] = fresh;
|
||||
+ return { old->block_id, fresh->block_id };
|
||||
+}
|
||||
+
|
||||
+int PagedKVManager::block_ref_cnt_at(int seq_id, size_t bi) const {
|
||||
+ return req_to_blocks_.at(seq_id).at(bi)->ref_cnt;
|
||||
+}
|
||||
+
|
||||
+size_t PagedKVManager::num_blocks(int seq_id) const {
|
||||
+ auto it = req_to_blocks_.find(seq_id);
|
||||
+ return it == req_to_blocks_.end() ? 0 : it->second.size();
|
||||
+}
|
||||
+
|
||||
} // namespace paged
|
||||
diff --git a/src/paged-kv-manager.h b/src/paged-kv-manager.h
|
||||
index 740280a..34decbc 100644
|
||||
--- a/src/paged-kv-manager.h
|
||||
+++ b/src/paged-kv-manager.h
|
||||
@@ -14,6 +14,7 @@
|
||||
#include <vector>
|
||||
#include <unordered_map>
|
||||
#include <map>
|
||||
+#include <utility>
|
||||
|
||||
namespace paged {
|
||||
|
||||
@@ -99,6 +100,28 @@ public:
|
||||
size_t get_computed_blocks(const std::vector<uint64_t>& block_hashes); // returns num cached tokens
|
||||
void cache_blocks(int seq_id, const std::vector<uint64_t>& block_hashes, size_t num_tokens);
|
||||
|
||||
+ // Cross-request prefix caching + copy-on-write (patch 0006).
|
||||
+ //
|
||||
+ // Splice the longest cached prefix of token_ids into seq_id (reuse the
|
||||
+ // shared physical blocks, ref_cnt++ so a block frees only at ref 0) and
|
||||
+ // allocate fresh blocks only for the divergent suffix. Returns the number of
|
||||
+ // shared (reused) blocks; the caller skips recomputing those tokens. On pool
|
||||
+ // exhaustion the sequence is rolled back (no ref leak) and 0 is returned.
|
||||
+ size_t place_with_prefix(int seq_id, const std::vector<int>& token_ids);
|
||||
+
|
||||
+ // Copy-on-write the block at logical index bi of seq_id. If that block is
|
||||
+ // shared (ref_cnt>1), allocate a fresh private block, drop this seq's ref on
|
||||
+ // the shared one (other owners keep it, content untouched) and install the
|
||||
+ // fresh block at bi. Returns {old_block_id, new_block_id}; new==old when the
|
||||
+ // block was already private (ref_cnt<=1) and no copy is needed. The caller
|
||||
+ // copies the physical cell contents old_block_id -> new_block_id.
|
||||
+ std::pair<int32_t, int32_t> cow_block(int seq_id, size_t bi);
|
||||
+
|
||||
+ // Introspection for the prefix-share gate (debug/tests).
|
||||
+ int block_ref_cnt_at(int seq_id, size_t bi) const;
|
||||
+ size_t num_blocks(int seq_id) const;
|
||||
+ size_t num_free_blocks() const { return pool_.get_num_free_blocks(); }
|
||||
+
|
||||
protected:
|
||||
int block_size_;
|
||||
BlockPool pool_;
|
||||
--
|
||||
2.43.0
|
||||
|
||||
@@ -0,0 +1,534 @@
|
||||
From da20c1c0571e84bc76202d915d4bb82892a3392b Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Mon, 22 Jun 2026 12:46:28 +0200
|
||||
Subject: [PATCH] paged engine prefix recompute-skip (env LLAMA_KV_PAGED) -
|
||||
patch 0007
|
||||
|
||||
Wire the host-side cross-request prefix cache (patch 0006) into the engine so a
|
||||
new sequence physically SHARES the cached prefix blocks and skips recomputing the
|
||||
shared prefix. That is the actual compute win: 0006 only proved the host-side
|
||||
machinery and realised reuse via the stock seq_cp; it did not yet deliver that
|
||||
win from the paged path itself.
|
||||
|
||||
Mechanism (all gated behind LLAMA_KV_PAGED; default off, stock byte-identical):
|
||||
|
||||
* paged-alloc reworked from a per-stream, request-0, destroyed-on-free manager
|
||||
into ONE persistent caching PagedKVManager per (kv-cache, stream) whose
|
||||
requests are keyed by the real llama_seq_id. free(seq) now releases exactly
|
||||
one sequence, so ref-counted shared blocks survive while another sharer holds
|
||||
them. New seams: share_prefix (place_with_prefix -> shared prefix tokens),
|
||||
slot, commit (publish a sequence into the content cache), ref-counted release,
|
||||
plus ref/num-free introspection.
|
||||
|
||||
* Two gated llama_kv_cache methods (the core seq-membership handling 0007 needs):
|
||||
paged_prefix_share() reuses the longest cached content prefix for a sequence
|
||||
and marks the shared physical cells as belonging to it (cells.seq_add) so the
|
||||
engine's attention mask includes the already-computed prefix KV; the caller
|
||||
then decodes ONLY the divergent suffix. paged_prefix_commit() publishes a
|
||||
sequence's full blocks for later reuse.
|
||||
|
||||
* find_slot's paged branch anchors placement on each sequence's own logical base
|
||||
(ubatch.pos) and keys the manager request by seq_id, so an independently-freed
|
||||
sequence and a shared prefix coexist in one unified pool. seq_rm/clear free
|
||||
per-sequence (ref-counted) instead of nuking the whole stream.
|
||||
|
||||
* paged-prefix-api: a thin gated shim so a caller holding only the public
|
||||
llama.h can reach the seam and the introspection without the internal headers.
|
||||
|
||||
Core existing-file touch: src/llama-kv-cache.{cpp,h}, +71 -3. Everything else is
|
||||
additive vendored units. Verified on Qwen3-0.6B-Q8_0 (CPU, unified cache): a
|
||||
sequence B sharing A's prefix decodes greedy tokens byte-identical to B from
|
||||
scratch with the prefill computing ONLY the suffix (32 prefix tokens skipped) at
|
||||
a block boundary AND mid-block; the shared block carries ref_cnt 2 while both
|
||||
hold it, drops to 1 when one sharer is removed (survivor intact, re-shareable, no
|
||||
use-after-free) and returns to the pool only when all sharers are freed. The
|
||||
0004 serving gate (unified and non-unified) stays byte-identical stock vs paged.
|
||||
|
||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
||||
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
|
||||
---
|
||||
src/CMakeLists.txt | 1 +
|
||||
src/llama-kv-cache.cpp | 66 +++++++++++++++++++++++--
|
||||
src/llama-kv-cache.h | 8 +++
|
||||
src/paged-alloc.cpp | 104 ++++++++++++++++++++++++++++++---------
|
||||
src/paged-alloc.h | 69 +++++++++++++++++++-------
|
||||
src/paged-prefix-api.cpp | 48 ++++++++++++++++++
|
||||
src/paged-prefix-api.h | 27 ++++++++++
|
||||
7 files changed, 280 insertions(+), 43 deletions(-)
|
||||
create mode 100644 src/paged-prefix-api.cpp
|
||||
create mode 100644 src/paged-prefix-api.h
|
||||
|
||||
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
|
||||
index 4d9d7d1..432f42d 100644
|
||||
--- a/src/CMakeLists.txt
|
||||
+++ b/src/CMakeLists.txt
|
||||
@@ -27,6 +27,7 @@ add_library(llama
|
||||
paged-kv-manager.cpp
|
||||
paged-attn.cpp
|
||||
paged-alloc.cpp
|
||||
+ paged-prefix-api.cpp
|
||||
llama-kv-cache-dsa.cpp
|
||||
llama-memory.cpp
|
||||
llama-memory-hybrid.cpp
|
||||
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
|
||||
index 1125d9a..7510ff9 100644
|
||||
--- a/src/llama-kv-cache.cpp
|
||||
+++ b/src/llama-kv-cache.cpp
|
||||
@@ -419,7 +419,7 @@ bool llama_kv_cache::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
|
||||
// removed (sequence end), so they return to the pool for reuse.
|
||||
if (paged_alloc::active() && p0 == 0 && p1 == std::numeric_limits<llama_pos>::max()) {
|
||||
if (seq_id >= 0) {
|
||||
- paged_alloc::release(this, (int) seq_to_stream[seq_id]);
|
||||
+ paged_alloc::release(this, (int) seq_to_stream[seq_id], (int) seq_id);
|
||||
} else {
|
||||
paged_alloc::release_all(this);
|
||||
}
|
||||
@@ -1056,10 +1056,15 @@ llama_kv_cache::slot_info llama_kv_cache::find_slot(const llama_ubatch & ubatch,
|
||||
const uint32_t bs = 16; // block size (tokens/block)
|
||||
const uint32_t nblk = cells.size() / bs; // this stream's block budget
|
||||
if (nblk >= 2) {
|
||||
- const uint32_t base = cells.get_used();
|
||||
+ // [paged 0007] Anchor placement on this sequence's own logical
|
||||
+ // base position (ubatch.pos), not the shared used-count, and key
|
||||
+ // the manager request by the real seq_id. slot(seq,pos) is then
|
||||
+ // stable per sequence, so an independently-freed (ref-counted)
|
||||
+ // sequence and a shared prefix can coexist in one unified pool.
|
||||
+ const uint32_t base = (uint32_t) ubatch.pos[s*n_tokens];
|
||||
const int strm = (int) seq_to_stream[seq_id];
|
||||
std::vector<uint32_t> placed;
|
||||
- if (paged_alloc::place(this, strm, base, n_tokens, bs, nblk, placed)) {
|
||||
+ if (paged_alloc::place(this, strm, (int) seq_id, base, n_tokens, bs, nblk, placed)) {
|
||||
bool ok = (placed.size() == n_tokens);
|
||||
for (uint32_t i = 0; ok && i < n_tokens; ++i) {
|
||||
if (placed[i] >= cells.size() || !cells.is_empty(placed[i])) {
|
||||
@@ -1165,6 +1170,61 @@ llama_kv_cache::slot_info llama_kv_cache::find_slot(const llama_ubatch & ubatch,
|
||||
return res;
|
||||
}
|
||||
|
||||
+// [paged 0007] Cross-request prefix recompute-skip.
|
||||
+//
|
||||
+// Reuse a cached content prefix for seq_id: share_prefix() splices the longest
|
||||
+// matching cached physical blocks into seq_id (ref_cnt++) and reserves fresh
|
||||
+// blocks for the divergent suffix. We then mark the shared physical cells as
|
||||
+// belonging to seq_id - those cells already hold the owner's computed KV at the
|
||||
+// matching logical positions, so the caller decodes ONLY the suffix and the
|
||||
+// prefix is never recomputed. Returns the number of shared prefix tokens.
|
||||
+// Gated behind LLAMA_KV_PAGED; a no-op (returns 0) otherwise.
|
||||
+int32_t llama_kv_cache::paged_prefix_share(llama_seq_id seq_id, const std::vector<llama_token> & tokens) {
|
||||
+ if (!paged_alloc::active() || tokens.empty()) {
|
||||
+ return 0;
|
||||
+ }
|
||||
+ const uint32_t bs = 16;
|
||||
+ const uint32_t strm = (uint32_t) seq_to_stream[seq_id];
|
||||
+ auto & cells = v_cells[strm];
|
||||
+ const uint32_t nblk = cells.size() / bs;
|
||||
+ if (nblk < 2) {
|
||||
+ return 0;
|
||||
+ }
|
||||
+
|
||||
+ std::vector<int> toks(tokens.begin(), tokens.end());
|
||||
+ const size_t kshare = paged_alloc::share_prefix(this, (int) strm, (int) seq_id, toks, bs, nblk);
|
||||
+
|
||||
+ for (size_t p = 0; p < kshare; ++p) {
|
||||
+ const int64_t cell = paged_alloc::slot(this, (int) strm, (int) seq_id, (int) p);
|
||||
+ if (cell < 0 || (uint32_t) cell >= cells.size() ||
|
||||
+ cells.is_empty((uint32_t) cell) ||
|
||||
+ cells.pos_get((uint32_t) cell) != (llama_pos) p) {
|
||||
+ // Owner cell missing / repurposed: cannot safely share. Roll the
|
||||
+ // sequence back so the caller recomputes the whole prompt.
|
||||
+ paged_alloc::release(this, (int) strm, (int) seq_id);
|
||||
+ return 0;
|
||||
+ }
|
||||
+ if (!cells.seq_has((uint32_t) cell, seq_id)) {
|
||||
+ cells.seq_add((uint32_t) cell, seq_id);
|
||||
+ }
|
||||
+ }
|
||||
+ return (int32_t) kshare;
|
||||
+}
|
||||
+
|
||||
+// [paged 0007] Publish a sequence's full blocks into the content cache so a
|
||||
+// later paged_prefix_share() can reuse them. Call after the sequence KV is
|
||||
+// computed (its prefill decode has run).
|
||||
+void llama_kv_cache::paged_prefix_commit(llama_seq_id seq_id, const std::vector<llama_token> & tokens) {
|
||||
+ if (!paged_alloc::active() || tokens.empty()) {
|
||||
+ return;
|
||||
+ }
|
||||
+ const uint32_t bs = 16;
|
||||
+ const uint32_t strm = (uint32_t) seq_to_stream[seq_id];
|
||||
+ const uint32_t nblk = v_cells[strm].size() / bs;
|
||||
+ std::vector<int> toks(tokens.begin(), tokens.end());
|
||||
+ paged_alloc::commit(this, (int) strm, (int) seq_id, toks, bs, nblk);
|
||||
+}
|
||||
+
|
||||
void llama_kv_cache::apply_ubatch(const slot_info & sinfo, const llama_ubatch & ubatch) {
|
||||
// TODO: refactor [TAG_KV_CACHE_SHARE_CELLS]
|
||||
if (other) {
|
||||
diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h
|
||||
index 494c0fb..f374ac6 100644
|
||||
--- a/src/llama-kv-cache.h
|
||||
+++ b/src/llama-kv-cache.h
|
||||
@@ -199,6 +199,14 @@ public:
|
||||
// emplace the ubatch context into slot: [sinfo.idxs[0...ubatch.n_tokens - 1]]
|
||||
void apply_ubatch(const slot_info & sinfo, const llama_ubatch & ubatch);
|
||||
|
||||
+ // [paged 0007] Cross-request prefix recompute-skip (experimental, gated by
|
||||
+ // env LLAMA_KV_PAGED). paged_prefix_share() reuses a cached content prefix
|
||||
+ // for seq_id and returns the number of shared prefix tokens (the caller
|
||||
+ // decodes only the suffix); paged_prefix_commit() publishes a sequence into
|
||||
+ // the content cache for later reuse. No-ops when LLAMA_KV_PAGED is unset.
|
||||
+ int32_t paged_prefix_share (llama_seq_id seq_id, const std::vector<llama_token> & tokens);
|
||||
+ void paged_prefix_commit(llama_seq_id seq_id, const std::vector<llama_token> & tokens);
|
||||
+
|
||||
//
|
||||
// input API
|
||||
//
|
||||
diff --git a/src/paged-alloc.cpp b/src/paged-alloc.cpp
|
||||
index 1d13f9c..c1027fb 100644
|
||||
--- a/src/paged-alloc.cpp
|
||||
+++ b/src/paged-alloc.cpp
|
||||
@@ -23,9 +23,13 @@ namespace {
|
||||
|
||||
using key_t = std::pair<const void *, int>;
|
||||
|
||||
-// One PagedKVManager per (kv-cache, stream): each stream owns a separate
|
||||
-// physical pool of cells.size() cells, so a manager's block ids map directly to
|
||||
-// cell ranges within that stream's pool. The internal request id is always 0.
|
||||
+// One persistent PagedKVManager per (kv-cache, stream): each stream owns a
|
||||
+// separate physical pool of cells.size() cells, so a manager's block ids map
|
||||
+// directly to cell ranges within that stream's pool. Requests inside a manager
|
||||
+// are keyed by the real llama_seq_id (NOT a fixed 0), so free(seq) releases one
|
||||
+// sequence and shared blocks survive at ref>0 - this is what makes ref-counted
|
||||
+// cross-request prefix sharing (0007) possible. Caching is enabled so commit()
|
||||
+// can publish blocks and share_prefix() can hit them.
|
||||
std::map<key_t, std::unique_ptr<paged::PagedKVManager>> g_managers;
|
||||
|
||||
paged::PagedKVManager * get_mgr(const void * cache, int stream,
|
||||
@@ -33,18 +37,21 @@ paged::PagedKVManager * get_mgr(const void * cache, int stream,
|
||||
const key_t k{cache, stream};
|
||||
auto it = g_managers.find(k);
|
||||
if (it == g_managers.end()) {
|
||||
- // enable_caching=false: prefix caching is a later patch; 0004 exercises
|
||||
- // only on-demand allocate / free.
|
||||
auto mgr = std::make_unique<paged::PagedKVManager>(
|
||||
- (int32_t) pool_blocks, (int) block_size, /*enable_caching=*/false);
|
||||
+ (int32_t) pool_blocks, (int) block_size, /*enable_caching=*/true);
|
||||
it = g_managers.emplace(k, std::move(mgr)).first;
|
||||
}
|
||||
return it->second.get();
|
||||
}
|
||||
|
||||
+paged::PagedKVManager * find_mgr(const void * cache, int stream) {
|
||||
+ auto it = g_managers.find({cache, stream});
|
||||
+ return it == g_managers.end() ? nullptr : it->second.get();
|
||||
+}
|
||||
+
|
||||
} // namespace
|
||||
|
||||
-bool place(const void * cache, int stream, uint32_t base, uint32_t n_tokens,
|
||||
+bool place(const void * cache, int stream, int seq, uint32_t base, uint32_t n_tokens,
|
||||
uint32_t block_size, uint32_t pool_blocks,
|
||||
std::vector<uint32_t> & out) {
|
||||
if (n_tokens == 0) {
|
||||
@@ -53,43 +60,79 @@ bool place(const void * cache, int stream, uint32_t base, uint32_t n_tokens,
|
||||
|
||||
paged::PagedKVManager * mgr = get_mgr(cache, stream, pool_blocks, block_size);
|
||||
|
||||
- const size_t before = mgr->block_table(0).size();
|
||||
+ const size_t before = mgr->block_table(seq).size();
|
||||
|
||||
- // Grow the request to cover the highest logical position. The manager pops
|
||||
- // free blocks only for the boundaries actually crossed - that is the on-
|
||||
- // demand behavior; an already-covered range adds nothing.
|
||||
- if (!mgr->allocate(0, (size_t) base + n_tokens)) {
|
||||
+ // Grow this sequence's request to cover its highest logical position. The
|
||||
+ // manager pops free blocks only for boundaries actually crossed; if
|
||||
+ // share_prefix() already reserved these blocks, this is a no-op.
|
||||
+ if (!mgr->allocate(seq, (size_t) base + n_tokens)) {
|
||||
return false; // pool exhausted -> caller falls back to the stock path
|
||||
}
|
||||
|
||||
out.reserve(out.size() + n_tokens);
|
||||
for (uint32_t i = 0; i < n_tokens; ++i) {
|
||||
- const int64_t s = mgr->slot(0, (int) (base + i));
|
||||
+ const int64_t s = mgr->slot(seq, (int) (base + i));
|
||||
out.push_back((uint32_t) s);
|
||||
}
|
||||
|
||||
if (debug()) {
|
||||
- const size_t after = mgr->block_table(0).size();
|
||||
+ const size_t after = mgr->block_table(seq).size();
|
||||
if (after != before) {
|
||||
fprintf(stderr,
|
||||
- "[paged-alloc] cache=%p stream=%d grew %zu->%zu blocks "
|
||||
+ "[paged-alloc] cache=%p stream=%d seq=%d grew %zu->%zu blocks "
|
||||
"(budget=%u; base=%u +%u tok)\n",
|
||||
- cache, stream, before, after, pool_blocks, base, n_tokens);
|
||||
+ cache, stream, seq, before, after, pool_blocks, base, n_tokens);
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
-void release(const void * cache, int stream) {
|
||||
- auto it = g_managers.find({cache, stream});
|
||||
- if (it == g_managers.end()) {
|
||||
+size_t share_prefix(const void * cache, int stream, int seq,
|
||||
+ const std::vector<int> & tokens,
|
||||
+ uint32_t block_size, uint32_t pool_blocks) {
|
||||
+ paged::PagedKVManager * mgr = get_mgr(cache, stream, pool_blocks, block_size);
|
||||
+ const size_t shared_blocks = mgr->place_with_prefix(seq, tokens);
|
||||
+ const size_t shared_tokens = shared_blocks * (size_t) block_size;
|
||||
+ if (debug() && shared_blocks > 0) {
|
||||
+ fprintf(stderr,
|
||||
+ "[paged-alloc] cache=%p stream=%d seq=%d shares %zu prefix blocks "
|
||||
+ "(%zu tokens) - prefix NOT recomputed\n",
|
||||
+ cache, stream, seq, shared_blocks, shared_tokens);
|
||||
+ }
|
||||
+ return shared_tokens;
|
||||
+}
|
||||
+
|
||||
+int64_t slot(const void * cache, int stream, int seq, int pos) {
|
||||
+ paged::PagedKVManager * mgr = find_mgr(cache, stream);
|
||||
+ if (!mgr) {
|
||||
+ return -1;
|
||||
+ }
|
||||
+ if ((size_t) (pos / mgr->block_size()) >= mgr->num_blocks(seq)) {
|
||||
+ return -1;
|
||||
+ }
|
||||
+ return mgr->slot(seq, pos);
|
||||
+}
|
||||
+
|
||||
+void commit(const void * cache, int stream, int seq,
|
||||
+ const std::vector<int> & tokens, uint32_t block_size, uint32_t pool_blocks) {
|
||||
+ paged::PagedKVManager * mgr = get_mgr(cache, stream, pool_blocks, block_size);
|
||||
+ mgr->cache_blocks(seq, mgr->compute_block_hashes(tokens), tokens.size());
|
||||
+ if (debug()) {
|
||||
+ fprintf(stderr, "[paged-alloc] cache=%p stream=%d seq=%d committed %zu tokens\n",
|
||||
+ cache, stream, seq, tokens.size());
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+void release(const void * cache, int stream, int seq) {
|
||||
+ paged::PagedKVManager * mgr = find_mgr(cache, stream);
|
||||
+ if (!mgr) {
|
||||
return;
|
||||
}
|
||||
- it->second->free(0);
|
||||
- g_managers.erase(it);
|
||||
+ mgr->free(seq); // ref-counted: shared blocks survive while another seq holds them
|
||||
if (debug()) {
|
||||
- fprintf(stderr, "[paged-alloc] released cache=%p stream=%d\n", cache, stream);
|
||||
+ fprintf(stderr, "[paged-alloc] released cache=%p stream=%d seq=%d (free=%zu)\n",
|
||||
+ cache, stream, seq, mgr->num_free_blocks());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -103,4 +146,21 @@ void release_all(const void * cache) {
|
||||
}
|
||||
}
|
||||
|
||||
+int ref_cnt_at(const void * cache, int stream, int seq, int pos, uint32_t block_size) {
|
||||
+ paged::PagedKVManager * mgr = find_mgr(cache, stream);
|
||||
+ if (!mgr) {
|
||||
+ return -1;
|
||||
+ }
|
||||
+ const size_t bi = (size_t) pos / block_size;
|
||||
+ if (bi >= mgr->num_blocks(seq)) {
|
||||
+ return -1;
|
||||
+ }
|
||||
+ return mgr->block_ref_cnt_at(seq, bi);
|
||||
+}
|
||||
+
|
||||
+size_t num_free(const void * cache, int stream) {
|
||||
+ paged::PagedKVManager * mgr = find_mgr(cache, stream);
|
||||
+ return mgr ? mgr->num_free_blocks() : 0;
|
||||
+}
|
||||
+
|
||||
} // namespace paged_alloc
|
||||
diff --git a/src/paged-alloc.h b/src/paged-alloc.h
|
||||
index bf66665..88dedef 100644
|
||||
--- a/src/paged-alloc.h
|
||||
+++ b/src/paged-alloc.h
|
||||
@@ -1,17 +1,28 @@
|
||||
#pragma once
|
||||
-// On-demand paged KV block allocation (patch 0004, experimental).
|
||||
+// On-demand paged KV block allocation + cross-request prefix reuse
|
||||
+// (patches 0004 + 0007, experimental).
|
||||
//
|
||||
-// Backs the paged placement in llama_kv_cache::find_slot (patch 0002) with the
|
||||
-// vendored host-side PagedKVManager (patch 0001). Instead of mapping a
|
||||
-// sequence's logical positions onto a fixed full-pool permutation, blocks are
|
||||
-// popped from a free pool ON DEMAND as the sequence crosses block boundaries,
|
||||
-// and returned to the pool on sequence end. This is where the paged memory-
|
||||
-// capacity benefit begins: a short sequence holds only a few blocks, not the
|
||||
-// whole reserved window.
|
||||
+// Backs the paged placement in llama_kv_cache::find_slot with the vendored
|
||||
+// host-side PagedKVManager (patch 0001). Two responsibilities:
|
||||
//
|
||||
-// Gated behind env LLAMA_KV_PAGED; a no-op when unset. All state lives in this
|
||||
-// unit (a static registry keyed by kv-cache + stream), so the core kv-cache
|
||||
-// struct stays untouched - find_slot only gains a gated call.
|
||||
+// * On-demand allocation (0004): a sequence's logical positions are mapped to
|
||||
+// physical cells block-by-block, popped from a free pool only as the
|
||||
+// sequence grows and returned on sequence end.
|
||||
+//
|
||||
+// * Cross-request prefix reuse (0007): before a new sequence's suffix is
|
||||
+// decoded, share_prefix() reuses the cached physical blocks of a matching
|
||||
+// content prefix (ref_cnt++), so the engine shares the already-computed KV
|
||||
+// cells and the caller decodes ONLY the divergent suffix - the prefix is not
|
||||
+// recomputed. commit() publishes a sequence's full blocks into the content
|
||||
+// cache so later sequences can hit them. Freeing is ref-counted: a shared
|
||||
+// block returns to the pool only when every sharer has been released.
|
||||
+//
|
||||
+// One persistent PagedKVManager per (kv-cache, stream); requests inside it are
|
||||
+// keyed by the real llama_seq_id, so free(seq) releases exactly one sequence and
|
||||
+// shared blocks survive at ref>0. All state lives in this unit (a static
|
||||
+// registry), so the core kv-cache struct stays untouched - find_slot gains only
|
||||
+// gated calls. Gated behind env LLAMA_KV_PAGED; a no-op when unset.
|
||||
|
||||
+#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <vector>
|
||||
@@ -21,19 +31,42 @@ namespace paged_alloc {
|
||||
// true iff env LLAMA_KV_PAGED is set (evaluated once).
|
||||
bool active();
|
||||
|
||||
-// Place n_tokens logical positions [base, base+n_tokens) of one stream on
|
||||
-// demand, appending their physical cell indices to `out`. pool_blocks =
|
||||
-// cells.size()/block_size is this stream's block budget. Returns false (leaving
|
||||
+// Place n_tokens logical positions [base, base+n_tokens) of (cache,stream,seq)
|
||||
+// on demand, appending their physical cell indices to `out`. pool_blocks =
|
||||
+// cells.size()/block_size is the stream's block budget. Returns false (leaving
|
||||
// `out` unchanged) on pool exhaustion, so the caller falls back to the stock
|
||||
// allocator. The caller still validates each returned cell is empty.
|
||||
-bool place(const void * cache, int stream, uint32_t base, uint32_t n_tokens,
|
||||
+bool place(const void * cache, int stream, int seq, uint32_t base, uint32_t n_tokens,
|
||||
uint32_t block_size, uint32_t pool_blocks,
|
||||
std::vector<uint32_t> & out);
|
||||
|
||||
-// Return a stream's blocks to the pool (sequence end).
|
||||
-void release(const void * cache, int stream);
|
||||
+// [0007] Reuse the longest cached content prefix of `tokens` for (cache,stream,
|
||||
+// seq): splice the shared physical blocks into seq (ref_cnt++) and reserve fresh
|
||||
+// blocks for the divergent suffix. Returns the number of shared PREFIX TOKENS
|
||||
+// (block-aligned); the caller marks those cells for seq and decodes only the
|
||||
+// suffix. 0 if nothing matched or on pool exhaustion (sequence rolled back).
|
||||
+size_t share_prefix(const void * cache, int stream, int seq,
|
||||
+ const std::vector<int> & tokens,
|
||||
+ uint32_t block_size, uint32_t pool_blocks);
|
||||
+
|
||||
+// [0007] Physical cell backing logical position `pos` of (cache,stream,seq), or
|
||||
+// -1 if the manager or seq is unknown, or pos lies beyond seq's blocks. Maps a shared prefix position to its cell.
|
||||
+int64_t slot(const void * cache, int stream, int seq, int pos);
|
||||
|
||||
-// Return every stream's blocks for a kv-cache (clear() / teardown).
|
||||
+// [0007] Publish seq's full (block-aligned) blocks into the content cache so a
|
||||
+// later share_prefix() can reuse them. Call after the sequence's KV is computed.
|
||||
+void commit(const void * cache, int stream, int seq,
|
||||
+ const std::vector<int> & tokens, uint32_t block_size, uint32_t pool_blocks);
|
||||
+
|
||||
+// Return one sequence's blocks to the pool (ref-counted; sequence end).
|
||||
+void release(const void * cache, int stream, int seq);
|
||||
+
|
||||
+// Drop every manager for a kv-cache (clear() / teardown).
|
||||
void release_all(const void * cache);
|
||||
|
||||
+// Introspection for the prefix-share gate (debug/tests). ref_cnt_at returns the
|
||||
+// ref count of the block backing logical position `pos`, or -1 if unknown.
|
||||
+int ref_cnt_at(const void * cache, int stream, int seq, int pos, uint32_t block_size);
|
||||
+size_t num_free(const void * cache, int stream);
|
||||
+
|
||||
} // namespace paged_alloc
|
||||
diff --git a/src/paged-prefix-api.cpp b/src/paged-prefix-api.cpp
|
||||
new file mode 100644
|
||||
index 0000000..8573cd2
|
||||
--- /dev/null
|
||||
+++ b/src/paged-prefix-api.cpp
|
||||
@@ -0,0 +1,48 @@
|
||||
+#include "paged-prefix-api.h"
|
||||
+#include "paged-alloc.h"
|
||||
+#include "llama-kv-cache.h"
|
||||
+
|
||||
+#include <vector>
|
||||
+
|
||||
+namespace paged_prefix_api {
|
||||
+
|
||||
+static llama_kv_cache * kv_of(llama_context * ctx) {
|
||||
+ // The driver targets a plain unified KV-cache model; dynamic_cast yields null
|
||||
+ // for wrapped caches (iSWA / hybrid), where cross-request cell sharing does
|
||||
+ // not apply, so the shim degrades to a safe no-op.
|
||||
+ return dynamic_cast<llama_kv_cache *>(llama_get_memory(ctx));
|
||||
+}
|
||||
+
|
||||
+int32_t share(llama_context * ctx, llama_seq_id seq, const llama_token * tokens, int n) {
|
||||
+ llama_kv_cache * kv = kv_of(ctx);
|
||||
+ if (!kv || n <= 0) {
|
||||
+ return 0;
|
||||
+ }
|
||||
+ return kv->paged_prefix_share(seq, std::vector<llama_token>(tokens, tokens + n));
|
||||
+}
|
||||
+
|
||||
+void commit(llama_context * ctx, llama_seq_id seq, const llama_token * tokens, int n) {
|
||||
+ llama_kv_cache * kv = kv_of(ctx);
|
||||
+ if (!kv || n <= 0) {
|
||||
+ return;
|
||||
+ }
|
||||
+ kv->paged_prefix_commit(seq, std::vector<llama_token>(tokens, tokens + n));
|
||||
+}
|
||||
+
|
||||
+int ref_at(llama_context * ctx, llama_seq_id seq, int pos) {
|
||||
+ llama_kv_cache * kv = kv_of(ctx);
|
||||
+ if (!kv) {
|
||||
+ return -1;
|
||||
+ }
|
||||
+ return paged_alloc::ref_cnt_at((const void *) kv, /*stream=*/0, (int) seq, pos, /*block_size=*/16);
|
||||
+}
|
||||
+
|
||||
+long num_free(llama_context * ctx) {
|
||||
+ llama_kv_cache * kv = kv_of(ctx);
|
||||
+ if (!kv) {
|
||||
+ return 0;
|
||||
+ }
|
||||
+ return (long) paged_alloc::num_free((const void *) kv, /*stream=*/0);
|
||||
+}
|
||||
+
|
||||
+} // namespace paged_prefix_api
|
||||
diff --git a/src/paged-prefix-api.h b/src/paged-prefix-api.h
|
||||
new file mode 100644
|
||||
index 0000000..78a3864
|
||||
--- /dev/null
|
||||
+++ b/src/paged-prefix-api.h
|
||||
@@ -0,0 +1,29 @@
|
||||
+#pragma once
|
||||
+// Thin test/diagnostic shim over the paged cross-request prefix engine seam
|
||||
+// (patch 0007). Lets a driver that only includes the public llama.h reach the
|
||||
+// gated llama_kv_cache::paged_prefix_* methods and the paged-alloc introspection
|
||||
+// without pulling in the internal kv-cache headers. All entry points are no-ops
|
||||
+// (return 0; ref_at returns -1) unless env LLAMA_KV_PAGED is set. Experimental; not a stable API.
|
||||
+
|
||||
+#include <cstddef>
|
||||
+#include <cstdint>
|
||||
+#include "llama.h"
|
||||
+
|
||||
+namespace paged_prefix_api {
|
||||
+
|
||||
+// Reuse the longest cached content prefix of [tokens, tokens+n) for `seq` and
|
||||
+// return the number of shared prefix tokens (the caller decodes only the
|
||||
+// suffix). 0 if nothing was shared.
|
||||
+int32_t share(llama_context * ctx, llama_seq_id seq, const llama_token * tokens, int n);
|
||||
+
|
||||
+// Publish `seq`'s full blocks into the content cache (call after its KV is computed).
|
||||
+void commit(llama_context * ctx, llama_seq_id seq, const llama_token * tokens, int n);
|
||||
+
|
||||
+// Ref count of the paged block backing logical position `pos` of `seq` (unified
|
||||
+// stream 0), or -1 if unknown.
|
||||
+int ref_at(llama_context * ctx, llama_seq_id seq, int pos);
|
||||
+
|
||||
+// Number of free blocks in the unified stream-0 pool, or 0 if no manager.
|
||||
+long num_free(llama_context * ctx);
|
||||
+
|
||||
+} // namespace paged_prefix_api
|
||||
--
|
||||
2.43.0
|
||||
|
||||
@@ -0,0 +1,130 @@
|
||||
From 240758ef7e144619c750aaf1d3339051ecc29098 Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Mon, 22 Jun 2026 17:02:22 +0200
|
||||
Subject: [PATCH] paged server cross-request prefix share (env LLAMA_KV_PAGED)
|
||||
- patch 0008
|
||||
|
||||
Wire the paged cross-request prefix recompute-skip (patch 0007's engine seam,
|
||||
paged_prefix_api::share/commit) into the llama-server continuous-batching loop
|
||||
(update_slots) so CONCURRENT requests that share a long prefix physically reuse
|
||||
one committed copy of the prefix blocks and prefill only their divergent suffix.
|
||||
Patch 0007 proved the engine seam correct via a standalone driver, but the server
|
||||
never called it: two concurrent shared-prefix requests each recomputed the full
|
||||
prefix. The server's native prompt cache only reuses a slot's OWN prior prompt
|
||||
(longest-common-prefix vs slot.prompt.tokens) - it does not share across distinct
|
||||
concurrent slots. 0008 adds that cross-slot share.
|
||||
|
||||
Mechanism (all gated behind LLAMA_KV_PAGED; default off, stock byte-identical):
|
||||
|
||||
* In update_slots prompt-processing, after the native n_past is computed and
|
||||
only for a FRESH slot (n_past < one block, i.e. the native cache did not
|
||||
already cover the prefix), call paged_prefix_api::share() to splice the
|
||||
longest committed cross-request prefix into this sequence (ref_cnt++ on the
|
||||
shared physical blocks) and advance n_past past it, so the batch fill computes
|
||||
ONLY the suffix. The slot's own divergent tail cells are removed first so the
|
||||
shared cells own [n_past, kshare) without colliding (the native path removes
|
||||
these later anyway). The n_past < block gate guarantees any block-aligned
|
||||
share the engine returns is strictly larger than n_past and therefore always
|
||||
adopted, so the engine's reservation always matches the suffix-only batch and
|
||||
never leaves stale blocks (which otherwise fragment the paged pool).
|
||||
|
||||
* When a slot finishes prefill (SLOT_STATE_DONE_PROMPT -> GENERATING, the prefix
|
||||
KV just computed), call paged_prefix_api::commit() to publish its prefix so
|
||||
concurrent/later sharers can reuse it.
|
||||
|
||||
The share() / commit() entry points are forward-declared (defined in libllama,
|
||||
src/paged-prefix-api.cpp) to avoid pulling internal kv-cache headers into the
|
||||
server translation unit.
|
||||
|
||||
Verified in the server (32B NVFP4, CUDA, --kv-unified): with a live sequence
|
||||
holding the prefix, K=16/32 concurrent shared-prefix requests prefill only their
|
||||
~27-token suffix instead of the ~1003-token prefix (36x fewer prefill tokens;
|
||||
K=16 23.9s -> 1.5s, K=32 57.9s -> 2.3s), the engine logs "shares ... prefix
|
||||
blocks - NOT recomputed" with ref_cnt>1, and greedy output stays within the
|
||||
documented CUDA batch-shape non-determinism band (stock native prompt-caching
|
||||
shows the same magnitude). Cross-request sharing requires the unified KV cache.
|
||||
|
||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
||||
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
|
||||
---
|
||||
tools/server/server-context.cpp | 50 +++++++++++++++++++++++++++++++++
|
||||
1 file changed, 50 insertions(+)
|
||||
|
||||
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
|
||||
index 39b7eb2..b5f9d37 100644
|
||||
--- a/tools/server/server-context.cpp
|
||||
+++ b/tools/server/server-context.cpp
|
||||
@@ -16,6 +16,16 @@
|
||||
#include "mtmd.h"
|
||||
#include "mtmd-helper.h"
|
||||
|
||||
+// [paged 0008] Cross-request prefix recompute-skip shim. share()/commit() are
|
||||
+// defined in libllama (src/paged-prefix-api.cpp, patch 0007) and are no-ops
|
||||
+// unless env LLAMA_KV_PAGED is set. Declared here so the paged cross-slot prefix
|
||||
+// cache wires into update_slots() without pulling in internal kv-cache headers.
|
||||
+// Fully gated; stock (paged off) is byte-identical.
|
||||
+namespace paged_prefix_api {
|
||||
+ int32_t share (llama_context * ctx, llama_seq_id seq, const llama_token * tokens, int n);
|
||||
+ void commit(llama_context * ctx, llama_seq_id seq, const llama_token * tokens, int n);
|
||||
+}
|
||||
+
|
||||
#include <algorithm>
|
||||
#include <cstddef>
|
||||
#include <cinttypes>
|
||||
@@ -3335,6 +3345,37 @@ private:
|
||||
}
|
||||
}
|
||||
|
||||
+ // [paged 0008] Cross-request prefix recompute-skip. The native prompt cache
|
||||
+ // above only reuses THIS slot's own prior prompt; when the paged KV
|
||||
+ // engine is active, also reuse a committed CROSS-slot prefix so
|
||||
+ // concurrent requests sharing a long prefix skip recompute. Gated on
|
||||
+ // LLAMA_KV_PAGED (paged_kv_share static); stock stays byte-identical.
|
||||
+ static const bool paged_kv_share = getenv("LLAMA_KV_PAGED") != nullptr;
|
||||
+ // Only attempt the cross-request share on a FRESH slot (the native
|
||||
+ // cache above did not already cover the prefix). With n_past < a
|
||||
+ // block, any block-aligned share the engine returns is strictly
|
||||
+ // larger than n_past and is therefore always adopted below - so the
|
||||
+ // engine's full-prompt reservation always matches the suffix-only
|
||||
+ // submission and never leaves stale blocks (which fragmented the
|
||||
+ // paged pool and crashed the server under high fan-out otherwise).
|
||||
+ if (paged_kv_share && n_past < 16 && slot.task->params.cache_prompt && !input_tokens.has_mtmd) {
|
||||
+ const llama_tokens ptoks = input_tokens.get_text_tokens();
|
||||
+ // Drop this slot's own cells beyond the natively-cached prefix before
|
||||
+ // splicing the shared physical prefix in, so the shared cells can own
|
||||
+ // [n_past, kshare) without colliding (the native path removes exactly
|
||||
+ // these later; a no-op for a fresh slot).
|
||||
+ common_context_seq_rm(ctx_tgt, slot.id, n_past, -1);
|
||||
+ const int32_t kshare = paged_prefix_api::share(ctx_tgt, slot.id, ptoks.data(), (int) ptoks.size());
|
||||
+ if (kshare > n_past) {
|
||||
+ slot.prompt.tokens.keep_first(n_past);
|
||||
+ for (int i = n_past; i < kshare; ++i) {
|
||||
+ slot.prompt.tokens.push_back(ptoks[i]);
|
||||
+ }
|
||||
+ n_past = kshare;
|
||||
+ SLT_INF(slot, "paged: reusing %d cross-request shared prefix tokens - not recomputed\n", n_past);
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
// [TAG_PROMPT_LOGITS]
|
||||
if (n_past == slot.task->n_tokens() && n_past > 0) {
|
||||
SLT_WRN(slot, "need to evaluate at least 1 token for each active slot (n_past = %d, task.n_tokens() = %d)\n", n_past, slot.task->n_tokens());
|
||||
@@ -3741,6 +3782,15 @@ private:
|
||||
// prompt evaluated for next-token prediction
|
||||
slot.state = SLOT_STATE_GENERATING;
|
||||
|
||||
+ // [paged 0008] Publish this slot's computed prefix so concurrent/later
|
||||
+ // slots can share it (no-op unless LLAMA_KV_PAGED). The prefill decode
|
||||
+ // for [0, n_tokens) has just run, so the prefix KV is computed.
|
||||
+ static const bool paged_kv_commit = getenv("LLAMA_KV_PAGED") != nullptr;
|
||||
+ if (paged_kv_commit && slot.task->params.cache_prompt && !slot.prompt.tokens.has_mtmd) {
|
||||
+ const llama_tokens ctoks = slot.prompt.tokens.get_text_tokens();
|
||||
+ paged_prefix_api::commit(ctx_tgt, slot.id, ctoks.data(), (int) ctoks.size());
|
||||
+ }
|
||||
+
|
||||
if (slot.can_speculate()) {
|
||||
common_speculative_begin(spec.get(), slot.id, slot.prompt.tokens.get_text_tokens());
|
||||
}
|
||||
--
|
||||
2.43.0
|
||||
|
||||
@@ -0,0 +1,609 @@
|
||||
From 59490d82e4d0d4ad05ffb5ca3cccc668f4a75281 Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Mon, 22 Jun 2026 20:03:17 +0200
|
||||
Subject: [PATCH] paged in-kernel decode read (env LLAMA_KV_PAGED) - patch 0009
|
||||
|
||||
Replace the per-layer per-step gather (patch 0003: ggml_get_rows of K/V into a
|
||||
contiguous buffer) with an in-kernel paged read on the decode step. build_attn
|
||||
passes the UNMODIFIED physical K/V views plus a block table (src[5] of
|
||||
ggml_flash_attn_ext: an I32 [n_view, n_stream] position-ordered physical-cell
|
||||
index, padded to FATTN_KQ_STRIDE). The CUDA fattn vec kernel and the CPU
|
||||
reference map logical KV index j -> physical cell block_table[seq*ne11+j] and
|
||||
read K_base+cell*nb11 / V_base+cell*nb21 in place, so the get_rows of K and V
|
||||
(the bulk of the gather) is gone. The mask stays a small compacted [n_view]
|
||||
causal mask in the same position order; KV_max / parallel_blocks / stream_k
|
||||
split-K are unchanged. The decode shape is forced onto the vec kernel (the only
|
||||
one wired for the block table); a nullptr block table => the stock contiguous
|
||||
read, byte-identical.
|
||||
|
||||
Token-POSITION ordering keeps the flash-attn reduction order identical to stock,
|
||||
so CPU-paged logits == CPU-stock bit-for-bit (verified: 4-stream FA greedy, 64
|
||||
tokens). On GPU paged(vec) == stock(vec) at batch 1; at batch>1 it stays within
|
||||
the documented vec-vs-mma non-determinism band. Decode step at batch 32 / 1024
|
||||
ctx on GB10 (Qwen3-32B NVFP4): paged-gather 1279 ms -> in-kernel 696 ms (-46%),
|
||||
recovering the gather regression to stock parity (647 ms). Gated behind
|
||||
LLAMA_KV_PAGED; no-op (stock byte-identical) when unset.
|
||||
|
||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
||||
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
|
||||
---
|
||||
ggml/include/ggml.h | 6 ++
|
||||
ggml/src/ggml-cpu/ops.cpp | 10 ++-
|
||||
ggml/src/ggml-cuda/fattn-common.cuh | 8 +-
|
||||
ggml/src/ggml-cuda/fattn-mma-f16.cuh | 4 +-
|
||||
ggml/src/ggml-cuda/fattn-tile.cuh | 4 +-
|
||||
ggml/src/ggml-cuda/fattn-vec.cuh | 25 +++++--
|
||||
ggml/src/ggml-cuda/fattn-wmma-f16.cu | 4 +-
|
||||
ggml/src/ggml-cuda/fattn.cu | 9 +++
|
||||
ggml/src/ggml.c | 14 ++++
|
||||
src/llama-graph.cpp | 23 ++++--
|
||||
src/llama-graph.h | 3 +-
|
||||
src/llama-kv-cache.cpp | 31 ++++++++
|
||||
src/llama-kv-cache.h | 4 +
|
||||
src/paged-attn.cpp | 107 +++++++++++++++++++++++++++
|
||||
src/paged-attn.h | 18 +++++
|
||||
15 files changed, 248 insertions(+), 22 deletions(-)
|
||||
|
||||
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
|
||||
index d6807b6..823f5a9 100644
|
||||
--- a/ggml/include/ggml.h
|
||||
+++ b/ggml/include/ggml.h
|
||||
@@ -2427,6 +2427,12 @@ extern "C" {
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * sinks);
|
||||
|
||||
+ // [paged] optional block table in src[5]: I32 [n_kv_logical, n_stream]; maps each
|
||||
+ // logical KV index to the physical cell within K/V. nullptr => stock contiguous read.
|
||||
+ GGML_API void ggml_flash_attn_ext_set_block_table(
|
||||
+ struct ggml_tensor * a,
|
||||
+ struct ggml_tensor * block_table);
|
||||
+
|
||||
// TODO: needs to be adapted to ggml_flash_attn_ext
|
||||
GGML_API struct ggml_tensor * ggml_flash_attn_back(
|
||||
struct ggml_context * ctx,
|
||||
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
|
||||
index 74611dc..63c07a2 100644
|
||||
--- a/ggml/src/ggml-cpu/ops.cpp
|
||||
+++ b/ggml/src/ggml-cpu/ops.cpp
|
||||
@@ -8330,6 +8330,8 @@ static void ggml_compute_forward_flash_attn_ext_f16_one_chunk(
|
||||
const ggml_tensor * v = dst->src[2];
|
||||
const ggml_tensor * mask = dst->src[3];
|
||||
const ggml_tensor * sinks = dst->src[4];
|
||||
+ const ggml_tensor * block_table = dst->src[5]; // [paged] logical->physical cell map (src[5])
|
||||
+ const int32_t * bt = block_table ? (const int32_t *) block_table->data : nullptr;
|
||||
|
||||
GGML_TENSOR_LOCALS(int64_t, neq, q, ne)
|
||||
GGML_TENSOR_LOCALS(size_t, nbq, q, nb)
|
||||
@@ -8449,7 +8451,9 @@ static void ggml_compute_forward_flash_attn_ext_f16_one_chunk(
|
||||
|
||||
float s; // KQ value
|
||||
|
||||
- const char * k_data = (const char *) k->data + ( ic*nbk1 + ik2*nbk2 + ik3*nbk3);
|
||||
+ // [paged] map the logical KV index ic to its physical cell via the block table.
|
||||
+ const int64_t ic_phys = bt ? (int64_t) bt[ik3*nek1 + ic] : ic;
|
||||
+ const char * k_data = (const char *) k->data + ( ic_phys*nbk1 + ik2*nbk2 + ik3*nbk3);
|
||||
kq_vec_dot(DK, &s, 0, k_data, 0, Q_q, 0, 1);
|
||||
|
||||
s = s*scale; // scale KQ value
|
||||
@@ -8465,7 +8469,7 @@ static void ggml_compute_forward_flash_attn_ext_f16_one_chunk(
|
||||
float ms = 1.0f; // upon new higher max val, scale VKQ and KQ sum with this value
|
||||
float vs = 1.0f; // post-softmax KQ value, expf(s - M)
|
||||
|
||||
- const char * v_data = ((const char *) v->data + (ic*nbv1 + iv2*nbv2 + iv3*nbv3));
|
||||
+ const char * v_data = ((const char *) v->data + (ic_phys*nbv1 + iv2*nbv2 + iv3*nbv3));
|
||||
|
||||
if (v->type == GGML_TYPE_F16) {
|
||||
if (s > M) {
|
||||
@@ -9021,7 +9025,7 @@ static void ggml_compute_forward_flash_attn_ext_f16(
|
||||
const int64_t dr = (nr + nchunk - 1) / nchunk;
|
||||
|
||||
static constexpr int64_t Q_TILE_SZ = ggml_fa_tile_config::Q;
|
||||
- bool use_tiled = !use_ref &&
|
||||
+ bool use_tiled = !use_ref && dst->src[5] == nullptr && // [paged] one_chunk honors the block table
|
||||
(q->type == GGML_TYPE_F32 &&
|
||||
kv_is_f32_or_f16 &&
|
||||
k->type == v->type &&
|
||||
diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh
|
||||
index 8dfa51a..3c6ddd5 100644
|
||||
--- a/ggml/src/ggml-cuda/fattn-common.cuh
|
||||
+++ b/ggml/src/ggml-cuda/fattn-common.cuh
|
||||
@@ -39,7 +39,8 @@ typedef void (* fattn_kernel_t)(
|
||||
const int32_t nb11, const int32_t nb12, const int64_t nb13,
|
||||
const int32_t nb21, const int32_t nb22, const int64_t nb23,
|
||||
const int32_t ne31, const int32_t ne32, const int32_t ne33,
|
||||
- const int32_t nb31, const int32_t nb32, const int64_t nb33);
|
||||
+ const int32_t nb31, const int32_t nb32, const int64_t nb33,
|
||||
+ const int * __restrict__ block_table);
|
||||
|
||||
typedef float (*vec_dot_KQ_t)(
|
||||
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8 , const void * __restrict__ Q_ds);
|
||||
@@ -981,6 +982,8 @@ void launch_fattn(
|
||||
|
||||
const ggml_tensor * mask = dst->src[3];
|
||||
const ggml_tensor * sinks = dst->src[4];
|
||||
+ const ggml_tensor * block_table = dst->src[5]; // [paged] optional logical->physical map
|
||||
+ const int * bt_ptr = block_table ? (const int *) block_table->data : nullptr;
|
||||
|
||||
ggml_tensor * KQV = dst;
|
||||
|
||||
@@ -1217,7 +1220,8 @@ void launch_fattn(
|
||||
K->ne[0], K->ne[1], K->ne[2], K->ne[3], nb11, nb12, nb13,
|
||||
nb21, nb22, nb23,
|
||||
mask ? mask->ne[1] : 0, mask ? mask->ne[2] : 0, mask ? mask->ne[3] : 0,
|
||||
- mask ? mask->nb[1] : 0, mask ? mask->nb[2] : 0, mask ? mask->nb[3] : 0
|
||||
+ mask ? mask->nb[1] : 0, mask ? mask->nb[2] : 0, mask ? mask->nb[3] : 0,
|
||||
+ bt_ptr
|
||||
);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
|
||||
diff --git a/ggml/src/ggml-cuda/fattn-mma-f16.cuh b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
|
||||
index 83478a0..0a92cd6 100644
|
||||
--- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh
|
||||
+++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
|
||||
@@ -1723,7 +1723,9 @@ static __global__ void flash_attn_ext_f16(
|
||||
const int32_t nb11, const int32_t nb12, const int64_t nb13,
|
||||
const int32_t nb21, const int32_t nb22, const int64_t nb23,
|
||||
const int32_t ne31, const int32_t ne32, const int32_t ne33,
|
||||
- const int32_t nb31, const int32_t nb32, const int64_t nb33) {
|
||||
+ const int32_t nb31, const int32_t nb32, const int64_t nb33,
|
||||
+ const int * __restrict__ block_table) {
|
||||
+ GGML_UNUSED(block_table); // [paged] block table is honored only by the vec kernel
|
||||
ggml_cuda_pdl_sync(); // TODO optimize placement
|
||||
#if defined(FLASH_ATTN_AVAILABLE) && (defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE))
|
||||
const char * GGML_CUDA_RESTRICT Q = Q_ptr;
|
||||
diff --git a/ggml/src/ggml-cuda/fattn-tile.cuh b/ggml/src/ggml-cuda/fattn-tile.cuh
|
||||
index 0a09981..0ff14e6 100644
|
||||
--- a/ggml/src/ggml-cuda/fattn-tile.cuh
|
||||
+++ b/ggml/src/ggml-cuda/fattn-tile.cuh
|
||||
@@ -808,7 +808,9 @@ static __global__ void flash_attn_tile(
|
||||
const int32_t nb11, const int32_t nb12, const int64_t nb13,
|
||||
const int32_t nb21, const int32_t nb22, const int64_t nb23,
|
||||
const int32_t ne31, const int32_t ne32, const int32_t ne33,
|
||||
- const int32_t nb31, const int32_t nb32, const int64_t nb33) {
|
||||
+ const int32_t nb31, const int32_t nb32, const int64_t nb33,
|
||||
+ const int * __restrict__ block_table) {
|
||||
+ GGML_UNUSED(block_table); // [paged] block table is honored only by the vec kernel
|
||||
#ifdef FLASH_ATTN_AVAILABLE
|
||||
const char * GGML_CUDA_RESTRICT Q = Q_ptr;
|
||||
const char * GGML_CUDA_RESTRICT K = K_ptr;
|
||||
diff --git a/ggml/src/ggml-cuda/fattn-vec.cuh b/ggml/src/ggml-cuda/fattn-vec.cuh
|
||||
index 69dd936..a09e2fb 100644
|
||||
--- a/ggml/src/ggml-cuda/fattn-vec.cuh
|
||||
+++ b/ggml/src/ggml-cuda/fattn-vec.cuh
|
||||
@@ -39,7 +39,8 @@ static __global__ void flash_attn_ext_vec(
|
||||
const int32_t nb11, const int32_t nb12, const int64_t nb13,
|
||||
const int32_t nb21, const int32_t nb22, const int64_t nb23,
|
||||
const int32_t ne31, const int32_t ne32, const int32_t ne33,
|
||||
- const int32_t nb31, const int32_t nb32, const int64_t nb33) {
|
||||
+ const int32_t nb31, const int32_t nb32, const int64_t nb33,
|
||||
+ const int * __restrict__ block_table) {
|
||||
ggml_cuda_pdl_lc();
|
||||
#ifdef FLASH_ATTN_AVAILABLE
|
||||
const char * GGML_CUDA_RESTRICT Q = Q_ptr;
|
||||
@@ -61,7 +62,7 @@ static __global__ void flash_attn_ext_vec(
|
||||
nb11, nb12, nb13,
|
||||
nb21, nb22, nb23,
|
||||
ne31, ne32, ne33,
|
||||
- nb31, nb32, nb33);
|
||||
+ nb31, nb32, nb33, block_table);
|
||||
NO_DEVICE_CODE;
|
||||
return;
|
||||
}
|
||||
@@ -110,6 +111,14 @@ static __global__ void flash_attn_ext_vec(
|
||||
K += nb13*sequence + nb12*(head / gqa_ratio);
|
||||
V += nb23*sequence + nb22*(head / gqa_ratio);
|
||||
|
||||
+ // [paged] in-kernel block-table read: logical KV index j -> physical cell
|
||||
+ // block_table[sequence*ne11 + j]; read K0 + cell*nb11 / V0 + cell*nb21. The
|
||||
+ // mask/KV_max stay logical (the table is in token-position order). nullptr =>
|
||||
+ // the stock contiguous read below.
|
||||
+ const char * GGML_CUDA_RESTRICT K0 = K;
|
||||
+ const char * GGML_CUDA_RESTRICT V0 = V;
|
||||
+ const int * GGML_CUDA_RESTRICT bt = block_table ? block_table + (size_t) sequence*ne11 : nullptr;
|
||||
+
|
||||
const half * maskh = (const half *) (mask + nb33*(sequence % ne33) + nb31*ic0);
|
||||
|
||||
const float slope = get_alibi_slope(max_bias, head, n_head_log2, m0, m1);
|
||||
@@ -267,10 +276,11 @@ static __global__ void flash_attn_ext_vec(
|
||||
#pragma unroll
|
||||
for (int i_KQ_0 = 0; i_KQ_0 < nthreads_KQ; ++i_KQ_0) {
|
||||
const int i_KQ = threadIdx.y*WARP_SIZE + (nthreads_KQ == WARP_SIZE ? 0 : (threadIdx.x & ~(nthreads_KQ-1))) + i_KQ_0;
|
||||
+ const char * GGML_CUDA_RESTRICT K_blk = bt ? (K0 + (int64_t) bt[k_VKQ_0 + i_KQ]*nb11) : (K + i_KQ*nb11);
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
- float sum = vec_dot_KQ(K + i_KQ*nb11, Q_reg[j], Q_i32[j], Q_ds[j]);
|
||||
+ float sum = vec_dot_KQ(K_blk, Q_reg[j], Q_i32[j], Q_ds[j]);
|
||||
sum = warp_reduce_sum<nthreads_KQ>(sum);
|
||||
|
||||
if (use_logit_softcap) {
|
||||
@@ -324,6 +334,7 @@ static __global__ void flash_attn_ext_vec(
|
||||
#pragma unroll
|
||||
for (int k0 = 0; k0 < WARP_SIZE; k0 += V_cols_per_iter) {
|
||||
const int k = threadIdx.y*WARP_SIZE + k0 + (nthreads_V == WARP_SIZE ? 0 : threadIdx.x / nthreads_V);
|
||||
+ const char * GGML_CUDA_RESTRICT V_blk = bt ? (V0 + (int64_t) bt[k_VKQ_0 + k]*nb21) : (V + k*nb21);
|
||||
|
||||
#ifdef V_DOT2_F32_F16_AVAILABLE
|
||||
half2 KQ_k[ncols];
|
||||
@@ -336,14 +347,14 @@ static __global__ void flash_attn_ext_vec(
|
||||
half2 tmp[V_rows_per_thread/2];
|
||||
if constexpr (type_V == GGML_TYPE_BF16) {
|
||||
float2 tmp_f[V_rows_per_thread/2];
|
||||
- dequantize_V(V + k*nb21, tmp_f,
|
||||
+ dequantize_V(V_blk, tmp_f,
|
||||
2*i_VKQ_0 + (nthreads_V == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_V)*V_rows_per_thread);
|
||||
#pragma unroll
|
||||
for (int i_VKQ_1 = 0; i_VKQ_1 < V_rows_per_thread/2; ++i_VKQ_1) {
|
||||
tmp[i_VKQ_1] = __float22half2_rn(tmp_f[i_VKQ_1]);
|
||||
}
|
||||
} else {
|
||||
- dequantize_V(V + k*nb21, tmp,
|
||||
+ dequantize_V(V_blk, tmp,
|
||||
2*i_VKQ_0 + (nthreads_V == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_V)*V_rows_per_thread);
|
||||
}
|
||||
#pragma unroll
|
||||
@@ -363,7 +374,7 @@ static __global__ void flash_attn_ext_vec(
|
||||
#pragma unroll
|
||||
for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V*V_rows_per_thread/2) {
|
||||
float2 tmp[V_rows_per_thread/2];
|
||||
- dequantize_V(V + k*nb21, tmp,
|
||||
+ dequantize_V(V_blk, tmp,
|
||||
2*i_VKQ_0 + (nthreads_V == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_V)*V_rows_per_thread);
|
||||
#pragma unroll
|
||||
for (int i_VKQ_1 = 0; i_VKQ_1 < V_rows_per_thread/2; ++i_VKQ_1) {
|
||||
@@ -522,7 +533,7 @@ static __global__ void flash_attn_ext_vec(
|
||||
nb11, nb12, nb13,
|
||||
nb21, nb22, nb23,
|
||||
ne31, ne32, ne33,
|
||||
- nb31, nb32, nb33);
|
||||
+ nb31, nb32, nb33, block_table);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // FLASH_ATTN_AVAILABLE
|
||||
}
|
||||
diff --git a/ggml/src/ggml-cuda/fattn-wmma-f16.cu b/ggml/src/ggml-cuda/fattn-wmma-f16.cu
|
||||
index 6850716..5357849 100644
|
||||
--- a/ggml/src/ggml-cuda/fattn-wmma-f16.cu
|
||||
+++ b/ggml/src/ggml-cuda/fattn-wmma-f16.cu
|
||||
@@ -44,7 +44,9 @@ static __global__ void flash_attn_ext_f16(
|
||||
const int32_t nb11, const int32_t nb12, const int64_t nb13,
|
||||
const int32_t nb21, const int32_t nb22, const int64_t nb23,
|
||||
const int32_t ne31, const int32_t ne32, const int32_t ne33,
|
||||
- const int32_t nb31, const int32_t nb32, const int64_t nb33) {
|
||||
+ const int32_t nb31, const int32_t nb32, const int64_t nb33,
|
||||
+ const int * __restrict__ block_table) {
|
||||
+ GGML_UNUSED(block_table); // [paged] block table is honored only by the vec kernel
|
||||
#if defined(FLASH_ATTN_AVAILABLE) && (defined(GGML_HIP_ROCWMMA_FATTN) && defined(GGML_USE_WMMA_FATTN))
|
||||
const char * GGML_CUDA_RESTRICT Q = Q_ptr;
|
||||
const char * GGML_CUDA_RESTRICT K = K_ptr;
|
||||
diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu
|
||||
index d6c501b..e3771ee 100644
|
||||
--- a/ggml/src/ggml-cuda/fattn.cu
|
||||
+++ b/ggml/src/ggml-cuda/fattn.cu
|
||||
@@ -574,6 +574,15 @@ size_t ggml_cuda_flash_attn_ext_get_alloc_size(int device, const ggml_tensor * d
|
||||
|
||||
void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
ggml_cuda_set_device(ctx.device);
|
||||
+
|
||||
+ // [paged] the block table (src[5]) is only honored by the vec kernel's
|
||||
+ // in-kernel read; force it. build_attn only sets it for a vec-supported
|
||||
+ // 1-token-per-stream decode shape.
|
||||
+ if (dst->src[5] != nullptr) {
|
||||
+ ggml_cuda_flash_attn_ext_vec(ctx, dst);
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
switch (ggml_cuda_get_best_fattn_kernel(ggml_cuda_get_device(), dst)) {
|
||||
case BEST_FATTN_KERNEL_NONE:
|
||||
GGML_ABORT("fatal error");
|
||||
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
|
||||
index b43016c..adbe52b 100644
|
||||
--- a/ggml/src/ggml.c
|
||||
+++ b/ggml/src/ggml.c
|
||||
@@ -5442,6 +5442,20 @@ void ggml_flash_attn_ext_add_sinks(
|
||||
a->src[4] = sinks;
|
||||
}
|
||||
|
||||
+void ggml_flash_attn_ext_set_block_table(
|
||||
+ struct ggml_tensor * a,
|
||||
+ struct ggml_tensor * block_table) {
|
||||
+ if (!block_table) {
|
||||
+ a->src[5] = NULL;
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ GGML_ASSERT(a->op == GGML_OP_FLASH_ATTN_EXT);
|
||||
+ GGML_ASSERT(block_table->type == GGML_TYPE_I32);
|
||||
+
|
||||
+ a->src[5] = block_table;
|
||||
+}
|
||||
+
|
||||
// ggml_flash_attn_back
|
||||
|
||||
struct ggml_tensor * ggml_flash_attn_back(
|
||||
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
|
||||
index b59d2a5..abdb48d 100644
|
||||
--- a/src/llama-graph.cpp
|
||||
+++ b/src/llama-graph.cpp
|
||||
@@ -2074,7 +2074,8 @@ ggml_tensor * llm_graph_context::build_attn_mha(
|
||||
ggml_tensor * sinks,
|
||||
ggml_tensor * v_mla,
|
||||
float kq_scale,
|
||||
- int il) const {
|
||||
+ int il,
|
||||
+ ggml_tensor * block_table) const {
|
||||
const bool v_trans = v->nb[1] > v->nb[2];
|
||||
|
||||
// split the batch into streams if needed
|
||||
@@ -2109,6 +2110,9 @@ ggml_tensor * llm_graph_context::build_attn_mha(
|
||||
hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f);
|
||||
cb(cur, LLAMA_TENSOR_NAME_FATTN, il);
|
||||
|
||||
+ if (block_table) {
|
||||
+ ggml_flash_attn_ext_set_block_table(cur, block_table);
|
||||
+ }
|
||||
ggml_flash_attn_ext_add_sinks(cur, sinks);
|
||||
ggml_flash_attn_ext_set_prec (cur, GGML_PREC_F32);
|
||||
|
||||
@@ -2358,12 +2362,19 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||
ggml_tensor * k = mctx_cur->get_k(ctx0, il);
|
||||
ggml_tensor * v = mctx_cur->get_v(ctx0, il);
|
||||
|
||||
- // [paged 0003] gather K, V and the mask to the sequence's used cells only
|
||||
- // (no-op unless env LLAMA_KV_PAGED is set).
|
||||
- ggml_tensor * kq_mask_g = kq_mask;
|
||||
- paged_attn::gather(ctx0, res, mctx_cur, &k, &v, &kq_mask_g);
|
||||
+ // [paged] decode read: when paging is active and this is a 1-token-per-stream
|
||||
+ // decode step, present K/V as n_gather views + a block table so the fattn
|
||||
+ // kernel reads the sequence's cells in-kernel (no get_rows of K/V). Else
|
||||
+ // fall back to the gather-read (prefill, transposed V, or env off). All a
|
||||
+ // no-op unless env LLAMA_KV_PAGED is set => stock byte-identical.
|
||||
+ ggml_tensor * kq_mask_g = kq_mask;
|
||||
+ ggml_tensor * block_table = nullptr;
|
||||
+ const bool is_decode = (q_cur->ne[2] == k->ne[3]); // 1 query token per stream
|
||||
+ if (!(is_decode && paged_attn::in_kernel_decode(ctx0, res, mctx_cur, &k, &v, &kq_mask_g, &block_table))) {
|
||||
+ paged_attn::gather(ctx0, res, mctx_cur, &k, &v, &kq_mask_g);
|
||||
+ }
|
||||
|
||||
- ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask_g, sinks, v_mla, kq_scale, il);
|
||||
+ ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask_g, sinks, v_mla, kq_scale, il, block_table);
|
||||
cb(cur, "kqv_out", il);
|
||||
|
||||
if (inp->self_v_rot) {
|
||||
diff --git a/src/llama-graph.h b/src/llama-graph.h
|
||||
index 5e8a658..c95ae49 100644
|
||||
--- a/src/llama-graph.h
|
||||
+++ b/src/llama-graph.h
|
||||
@@ -969,7 +969,8 @@ struct llm_graph_context {
|
||||
ggml_tensor * sinks, // [n_head_q]
|
||||
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
|
||||
float kq_scale,
|
||||
- int il) const;
|
||||
+ int il,
|
||||
+ ggml_tensor * block_table = nullptr) const; // [paged] optional src[5] block table
|
||||
|
||||
llm_graph_input_attn_no_cache * build_attn_inp_no_cache() const;
|
||||
|
||||
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
|
||||
index 7510ff9..0351f86 100644
|
||||
--- a/src/llama-kv-cache.cpp
|
||||
+++ b/src/llama-kv-cache.cpp
|
||||
@@ -1474,6 +1474,33 @@ void llama_kv_cache::get_gather_idxs(int32_t * dst, uint32_t n_kv, const slot_in
|
||||
}
|
||||
}
|
||||
|
||||
+void llama_kv_cache::get_block_table(int32_t * dst, uint32_t n_blk, uint32_t n_kv, const slot_info & sinfo) const {
|
||||
+ const uint32_t ns = sinfo.s1 - sinfo.s0 + 1;
|
||||
+ for (uint32_t j = 0; j < ns; ++j) {
|
||||
+ const auto & cells = v_cells[sinfo.s0 + j];
|
||||
+ const uint32_t n = std::min<uint32_t>(n_kv, cells.size());
|
||||
+ std::vector<std::pair<llama_pos, int32_t>> pc;
|
||||
+ pc.reserve(n);
|
||||
+ int32_t pad = -1;
|
||||
+ for (uint32_t i = 0; i < n; ++i) {
|
||||
+ if (!cells.is_empty(i)) {
|
||||
+ pc.emplace_back(cells.pos_get(i), (int32_t) i);
|
||||
+ } else if (pad < 0) {
|
||||
+ pad = (int32_t) i;
|
||||
+ }
|
||||
+ }
|
||||
+ std::sort(pc.begin(), pc.end());
|
||||
+ int32_t * col = dst + (size_t) j * n_blk;
|
||||
+ for (size_t k = 0; k < pc.size(); ++k) {
|
||||
+ col[k] = pc[k].second;
|
||||
+ }
|
||||
+ const int32_t padv = (pad >= 0) ? pad : (pc.empty() ? 0 : pc.back().second);
|
||||
+ for (uint32_t k = (uint32_t) pc.size(); k < n_blk; ++k) {
|
||||
+ col[k] = padv;
|
||||
+ }
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
ggml_tensor * llama_kv_cache::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il, const slot_info & sinfo) const {
|
||||
GGML_UNUSED(sinfo);
|
||||
|
||||
@@ -2773,6 +2800,10 @@ void llama_kv_cache_context::get_gather_idxs(int32_t * dst) const {
|
||||
kv->get_gather_idxs(dst, n_kv, sinfos[i_cur]);
|
||||
}
|
||||
|
||||
+void llama_kv_cache_context::get_block_table(int32_t * dst, uint32_t n_blk) const {
|
||||
+ kv->get_block_table(dst, n_blk, n_kv, sinfos[i_cur]);
|
||||
+}
|
||||
+
|
||||
ggml_tensor * llama_kv_cache_context::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il) const {
|
||||
return kv->cpy_k(ctx, k_cur, k_idxs, il, sinfos[i_cur]);
|
||||
}
|
||||
diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h
|
||||
index f374ac6..e9980b6 100644
|
||||
--- a/src/llama-kv-cache.h
|
||||
+++ b/src/llama-kv-cache.h
|
||||
@@ -176,6 +176,9 @@ public:
|
||||
// gather-read. get_n_gather returns the max count across streams.
|
||||
uint32_t get_n_gather(uint32_t n_kv, const slot_info & sinfo) const;
|
||||
void get_gather_idxs(int32_t * dst, uint32_t n_kv, const slot_info & sinfo) const;
|
||||
+ // [paged inc1] block table [n_blk, n_stream] (position order, padded to n_blk
|
||||
+ // per column with the first empty cell, else the last used cell) for the in-kernel paged read.
|
||||
+ void get_block_table(int32_t * dst, uint32_t n_blk, uint32_t n_kv, const slot_info & sinfo) const;
|
||||
|
||||
// store k_cur and v_cur in the cache based on the provided head location
|
||||
ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il, const slot_info & sinfo) const;
|
||||
@@ -386,6 +389,7 @@ public:
|
||||
// current ubatch's stream).
|
||||
uint32_t get_n_gather() const;
|
||||
void get_gather_idxs(int32_t * dst) const;
|
||||
+ void get_block_table(int32_t * dst, uint32_t n_blk) const;
|
||||
|
||||
// store k_cur and v_cur in the cache based on the provided head location
|
||||
// note: the heads in k_cur and v_cur should be laid out contiguously in memory
|
||||
diff --git a/src/paged-attn.cpp b/src/paged-attn.cpp
|
||||
index ade75e8..8eebeaa 100644
|
||||
--- a/src/paged-attn.cpp
|
||||
+++ b/src/paged-attn.cpp
|
||||
@@ -43,6 +43,25 @@ public:
|
||||
ggml_tensor * idxs;
|
||||
};
|
||||
|
||||
+// Block table filler for the in-kernel paged read: fills an I32 [n_blk, n_stream]
|
||||
+// tensor with each stream's position-ordered cells, padded to n_blk (per column)
|
||||
+// with a masked empty cell, by delegating to the kv-cache context.
|
||||
+class input_block_table : public llm_graph_input_i {
|
||||
+public:
|
||||
+ input_block_table(const llama_kv_cache_context * mctx, ggml_tensor * idxs, uint32_t n_blk)
|
||||
+ : mctx(mctx), idxs(idxs), n_blk(n_blk) {}
|
||||
+
|
||||
+ void set_input(const llama_ubatch * ubatch) override {
|
||||
+ GGML_UNUSED(ubatch);
|
||||
+ GGML_ASSERT(idxs && ggml_backend_buffer_is_host(idxs->buffer));
|
||||
+ mctx->get_block_table((int32_t *) idxs->data, n_blk);
|
||||
+ }
|
||||
+
|
||||
+ const llama_kv_cache_context * mctx;
|
||||
+ ggml_tensor * idxs;
|
||||
+ uint32_t n_blk;
|
||||
+};
|
||||
+
|
||||
} // namespace
|
||||
|
||||
void gather(ggml_context * ctx0,
|
||||
@@ -125,4 +144,92 @@ void gather(ggml_context * ctx0,
|
||||
}
|
||||
}
|
||||
|
||||
+bool in_kernel_decode(ggml_context * ctx0,
|
||||
+ llm_graph_result * res,
|
||||
+ const llama_kv_cache_context * mctx,
|
||||
+ ggml_tensor ** k,
|
||||
+ ggml_tensor ** v,
|
||||
+ ggml_tensor ** kq_mask,
|
||||
+ ggml_tensor ** block_table) {
|
||||
+ if (!active()) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ // Bench escape hatch: LLAMA_KV_PAGED_GATHER=1 forces the old gather-read decode
|
||||
+ // path (for a same-build BEFORE/AFTER decode-step comparison). Dev-only.
|
||||
+ static const bool force_gather = (std::getenv("LLAMA_KV_PAGED_GATHER") != nullptr);
|
||||
+ if (force_gather) {
|
||||
+ return false;
|
||||
+ }
|
||||
+
|
||||
+ ggml_tensor * K = *k;
|
||||
+ ggml_tensor * V = *v;
|
||||
+ ggml_tensor * M = *kq_mask;
|
||||
+
|
||||
+ const int64_t n_stream = K->ne[3];
|
||||
+ GGML_ASSERT(M->ne[3] == n_stream);
|
||||
+
|
||||
+ const int64_t n_gather = (int64_t) mctx->get_n_gather();
|
||||
+ if (n_gather <= 0) {
|
||||
+ // Worst-case reserve / nothing placed yet: keep the dense [0,n_kv) read.
|
||||
+ return false;
|
||||
+ }
|
||||
+
|
||||
+ // The in-kernel read addresses V along its d-major (non-transposed) axis. If
|
||||
+ // the cache stores V transposed, fall back to gather() (which normalizes it).
|
||||
+ if (V->nb[1] > V->nb[2]) {
|
||||
+ return false;
|
||||
+ }
|
||||
+
|
||||
+ if (debug()) {
|
||||
+ static int64_t once = 0;
|
||||
+ if (once++ < 2) {
|
||||
+ fprintf(stderr, "[paged-attn] in-kernel decode n_stream=%lld n_kv=%lld n_gather=%lld\n",
|
||||
+ (long long) n_stream, (long long) K->ne[2], (long long) n_gather);
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ // Block table [n_view, n_stream]: column s holds stream s's non-empty cells
|
||||
+ // in token-POSITION order (identical to the gather index, so the reduction
|
||||
+ // order matches stock bit-for-bit), padded with a masked empty cell. Filled
|
||||
+ // at set_input from the kv-cache (get_block_table), exactly like the gather.
|
||||
+ // Pad the logical length to FATTN_KQ_STRIDE (256) so the CUDA fattn vec kernel
|
||||
+ // reads fixed 128-wide KV blocks without overrun and the KV_max mask scan
|
||||
+ // engages; padded entries point at a masked empty cell (0 contribution). Stays
|
||||
+ // <= n_kv since n_kv is itself padded to 256 and n_gather <= n_kv.
|
||||
+ int64_t n_view = GGML_PAD(n_gather, 256);
|
||||
+ if (n_view > K->ne[2]) {
|
||||
+ n_view = K->ne[2];
|
||||
+ }
|
||||
+
|
||||
+ ggml_tensor * idx = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_view, n_stream);
|
||||
+ ggml_set_input(idx);
|
||||
+ res->add_input(llm_graph_input_ptr(new input_block_table(mctx, idx, (uint32_t) n_view)));
|
||||
+
|
||||
+ // Present K and V as [d, h, n_view, ns] VIEWS of the full physical window:
|
||||
+ // identical per-cell (nb1,nb2) and per-stream (nb3) strides, only the cell
|
||||
+ // dim shrinks to n_view. NOT materialized - the kernel reads in place.
|
||||
+ *k = ggml_view_4d(ctx0, K, K->ne[0], K->ne[1], n_view, n_stream,
|
||||
+ K->nb[1], K->nb[2], K->nb[3], 0);
|
||||
+ *v = ggml_view_4d(ctx0, V, V->ne[0], V->ne[1], n_view, n_stream,
|
||||
+ V->nb[1], V->nb[2], V->nb[3], 0);
|
||||
+
|
||||
+ // Compact the mask to [n_view, n_tps, 1, ns] in the same position order so
|
||||
+ // the kernel's logical mask index aligns with the block table. Cheap: the
|
||||
+ // mask is ~(d*h) smaller than K/V, which is why only its get_rows remains.
|
||||
+ {
|
||||
+ ggml_tensor * m = ggml_reshape_3d(ctx0, M, M->ne[0], M->ne[1], n_stream);
|
||||
+ m = ggml_cont(ctx0, ggml_transpose(ctx0, m));
|
||||
+ m = ggml_get_rows(ctx0, m, idx);
|
||||
+ m = ggml_cont(ctx0, ggml_transpose(ctx0, m));
|
||||
+ m = ggml_reshape_4d(ctx0, m, n_view, M->ne[1], 1, n_stream);
|
||||
+ if (M->type != m->type) {
|
||||
+ m = ggml_cast(ctx0, m, M->type);
|
||||
+ }
|
||||
+ *kq_mask = m;
|
||||
+ }
|
||||
+
|
||||
+ *block_table = idx;
|
||||
+ return true;
|
||||
+}
|
||||
+
|
||||
} // namespace paged_attn
|
||||
diff --git a/src/paged-attn.h b/src/paged-attn.h
|
||||
index c5b7bd7..23e2184 100644
|
||||
--- a/src/paged-attn.h
|
||||
+++ b/src/paged-attn.h
|
||||
@@ -37,4 +37,22 @@ void gather(ggml_context * ctx0,
|
||||
ggml_tensor ** v,
|
||||
ggml_tensor ** kq_mask);
|
||||
|
||||
+// [paged inc1] In-kernel paged decode read. Instead of materializing the
|
||||
+// sequence's cells (gather()), present K and V as n_view-length (n_gather padded to 256) VIEWS of the
|
||||
+// full physical window and return the position-ordered physical-cell index list
|
||||
+// as a block table (src[5] of ggml_flash_attn_ext). The fattn kernel/op then
|
||||
+// reads K_base + block_table[j]*nb in-kernel, removing the get_rows of K and V
|
||||
+// (the bulk of the gather). On return (true): *k,*v point at the views, *kq_mask
|
||||
+// at the compacted mask, *block_table at the I32 [n_view, n_stream] index (n_view = n_gather padded to a multiple of 256, capped at the KV window).
|
||||
+// Returns false (leaving *k,*v,*kq_mask untouched) when the in-kernel path does
|
||||
+// not apply - env off, nothing placed, or a transposed V cache - so the caller
|
||||
+// keeps the dense gather()/contiguous read.
|
||||
+bool in_kernel_decode(ggml_context * ctx0,
|
||||
+ llm_graph_result * res,
|
||||
+ const llama_kv_cache_context * mctx,
|
||||
+ ggml_tensor ** k,
|
||||
+ ggml_tensor ** v,
|
||||
+ ggml_tensor ** kq_mask,
|
||||
+ ggml_tensor ** block_table);
|
||||
+
|
||||
} // namespace paged_attn
|
||||
--
|
||||
2.43.0
|
||||
|
||||
@@ -0,0 +1,269 @@
|
||||
From 9ac56933abd5de4a1f349c811c2d74aab09f7ab1 Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Mon, 22 Jun 2026 22:36:09 +0200
|
||||
Subject: [PATCH] paged tile in-kernel decode read + dispatch guard (env
|
||||
LLAMA_KV_PAGED) - patch 0010
|
||||
|
||||
Increment 2 (robustness, ~0 headline ms): make the paged in-kernel decode read
|
||||
safe against silent mis-routing, and plumb the same read into the tile kernel
|
||||
for the increment-3 GQA head-group work.
|
||||
|
||||
fattn-tile.cuh: graft the patch-0009 phys(j) block-table read (mirror of
|
||||
fattn-vec.cuh). Both flash_attn_tile_load_tile overloads, flash_attn_tile_iter_KQ
|
||||
(K) and flash_attn_tile_iter (V) take an optional per-sequence block table; a row
|
||||
i is read from base + block_table[row_base + i]*stride instead of base + i*stride.
|
||||
The table defaults to nullptr (default args + a null bt_seq when src[5] is unset),
|
||||
so every existing non-paged caller is byte-identical to stock. The mask / KV_max
|
||||
stay logical (token-position order), as in vec.
|
||||
|
||||
fattn.cu: DISPATCH GUARD. When the block table (src[5]) is present, route ONLY to
|
||||
the vec or tile kernel and never fall through to the best-kernel switch. The
|
||||
mma/wmma kernels GGML_UNUSED the table and would silently read the wrong
|
||||
(contiguous physical) cells; the guard makes that unreachable. The vec dispatcher
|
||||
GGML_ABORTs for an unsupported D/type rather than mis-reading. Default route is vec
|
||||
(the inc-1 byte-validated path). LLAMA_KV_PAGED_DISPATCH_LOG=1 prints the routed
|
||||
kernel once.
|
||||
|
||||
Gates: CPU byte-identical paged-on vs off (Qwen3-0.6B, build-cpu) PASS. GPU
|
||||
vec-paged == stock at -s 1 PASS. Dispatch confirmed VEC for the real decode shape:
|
||||
Qwen3-0.6B Q ne=[128,1,16,1] and Qwen3-32B NVFP4 Q ne=[128,1,64,N] both route to
|
||||
vec, matching the nsys profile (flash_attn_ext_vec).
|
||||
|
||||
The tile graft is plumbed for increment-3 GQA head-group reuse but is EXPERIMENTAL
|
||||
and NOT yet byte-validated (LLAMA_KV_PAGED_TILE=1). A tile-vs-tile gate shows
|
||||
tile-paged diverging from tile-stock at the first cross-tile KV depth: the
|
||||
GQA-grouped (ncols2>1) tile path reads a full nbatch_fa-row tile with
|
||||
oob_check=false while the compacted paged mask is not padded to cover the tile, so
|
||||
past-end rows leak. vec bounds its KV walk by KV_max and is unaffected. Bounding
|
||||
the tile path is increment-3 work; the default vec route and all stock paths are
|
||||
untouched.
|
||||
|
||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
||||
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
|
||||
---
|
||||
ggml/src/ggml-cuda/fattn-tile.cuh | 45 ++++++++++++++++++++-----------
|
||||
ggml/src/ggml-cuda/fattn.cu | 38 +++++++++++++++++++++++---
|
||||
2 files changed, 64 insertions(+), 19 deletions(-)
|
||||
|
||||
diff --git a/ggml/src/ggml-cuda/fattn-tile.cuh b/ggml/src/ggml-cuda/fattn-tile.cuh
|
||||
index 0ff14e6..bb84d61 100644
|
||||
--- a/ggml/src/ggml-cuda/fattn-tile.cuh
|
||||
+++ b/ggml/src/ggml-cuda/fattn-tile.cuh
|
||||
@@ -373,7 +373,8 @@ static constexpr __device__ int ggml_cuda_fattn_tile_get_nbatch_K(const int DKQ,
|
||||
// TODO: deduplicate with mma-f16
|
||||
template<int warp_size, int nwarps, int I, int J, int J_padding, bool oob_check>
|
||||
static __device__ __forceinline__ void flash_attn_tile_load_tile(
|
||||
- const half2 * const __restrict__ KV, half2 * const __restrict__ tile_KV, const int stride_KV, const int i_sup) {
|
||||
+ const half2 * const __restrict__ KV, half2 * const __restrict__ tile_KV, const int stride_KV, const int i_sup,
|
||||
+ const int * const __restrict__ block_table = nullptr, const int row_base = 0) {
|
||||
constexpr int cpy_nb = ggml_cuda_get_max_cpy_bytes();
|
||||
constexpr int cpy_ne = cpy_nb / 4;
|
||||
|
||||
@@ -402,9 +403,11 @@ static __device__ __forceinline__ void flash_attn_tile_load_tile(
|
||||
const int j = j0*cpy_ne + (stride_j == warp_size ? threadIdx.x : threadIdx.x % stride_j)*cpy_ne;
|
||||
|
||||
const __align__(16) half2 zero[cpy_ne] = {{0.0f, 0.0f}};
|
||||
+ // [paged] remap the row through the block table (nullptr => stock contiguous read).
|
||||
+ const half2 * const KV_row = block_table ? KV + (int64_t) block_table[row_base + i]*stride_KV : KV + i*stride_KV;
|
||||
ggml_cuda_memcpy_1<cpy_nb>(
|
||||
tile_KV + i*(J/2 + J_padding) + j,
|
||||
- !oob_check || i < i_sup ? KV + i*stride_KV + j : zero);
|
||||
+ !oob_check || i < i_sup ? KV_row + j : zero);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -423,7 +426,8 @@ static __device__ __forceinline__ void flash_attn_tile_load_tile(
|
||||
|
||||
template<int warp_size, int nwarps, int I, int J, int J_padding, bool oob_check>
|
||||
static __device__ __forceinline__ void flash_attn_tile_load_tile(
|
||||
- const half2 * const __restrict__ KV, float * const __restrict__ tile_KV, const int stride_KV, const int i_sup) {
|
||||
+ const half2 * const __restrict__ KV, float * const __restrict__ tile_KV, const int stride_KV, const int i_sup,
|
||||
+ const int * const __restrict__ block_table = nullptr, const int row_base = 0) {
|
||||
constexpr int cpy_nb = ggml_cuda_get_max_cpy_bytes();
|
||||
constexpr int cpy_ne = cpy_nb / 4;
|
||||
|
||||
@@ -453,8 +457,10 @@ static __device__ __forceinline__ void flash_attn_tile_load_tile(
|
||||
|
||||
const half2 zero[cpy_ne/2] = {{0.0f, 0.0f}};
|
||||
__align__(16) half2 tmp_h2[cpy_ne/2];
|
||||
+ // [paged] remap the row through the block table (nullptr => stock contiguous read).
|
||||
+ const half2 * const KV_row = block_table ? KV + (int64_t) block_table[row_base + i]*stride_KV : KV + i*stride_KV;
|
||||
ggml_cuda_memcpy_1<sizeof(tmp_h2)>(
|
||||
- tmp_h2, !oob_check || i < i_sup ? KV + i*stride_KV + j : zero);
|
||||
+ tmp_h2, !oob_check || i < i_sup ? KV_row + j : zero);
|
||||
|
||||
__align__(16) float2 tmp_f2[cpy_ne/2];
|
||||
#pragma unroll
|
||||
@@ -487,6 +493,7 @@ static __device__ __forceinline__ void flash_attn_tile_iter_KQ(
|
||||
const int k_VKQ_0,
|
||||
const int k_VKQ_sup,
|
||||
const int k_KQ_0,
|
||||
+ const int * const __restrict__ block_table,
|
||||
float * KQ_acc) {
|
||||
constexpr int cpy_nb = ggml_cuda_get_max_cpy_bytes();
|
||||
constexpr int cpy_ne = cpy_nb / 4;
|
||||
@@ -495,8 +502,10 @@ static __device__ __forceinline__ void flash_attn_tile_iter_KQ(
|
||||
constexpr int cpw = ncols > nwarps ? ncols/nwarps : 1; // Q columns per warp
|
||||
constexpr int np = nwarps > ncols ? nwarps/ncols : 1; // number of parallel warps per Q column
|
||||
|
||||
+ // [paged] when block_table is set K_h2 is the un-offset base; the table supplies the row.
|
||||
+ const half2 * const K_base = block_table ? (K_h2 + k_KQ_0/2) : (K_h2 + int64_t(k_VKQ_0)*stride_K2 + k_KQ_0/2);
|
||||
flash_attn_tile_load_tile<warp_size, nwarps, nbatch_fa, nbatch_K, cpy_ne, oob_check>
|
||||
- (K_h2 + int64_t(k_VKQ_0)*stride_K2 + k_KQ_0/2, KV_tmp, stride_K2, k_VKQ_sup);
|
||||
+ (K_base, KV_tmp, stride_K2, k_VKQ_sup, block_table, k_VKQ_0);
|
||||
__syncthreads();
|
||||
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
@@ -572,7 +581,8 @@ static __device__ __forceinline__ void flash_attn_tile_iter(
|
||||
T_acc * const VKQ,
|
||||
const int k_VKQ_0,
|
||||
const int k_VKQ_max,
|
||||
- const int col_Q_0) {
|
||||
+ const int col_Q_0,
|
||||
+ const int * const __restrict__ block_table) {
|
||||
constexpr int cpy_nb = ggml_cuda_get_max_cpy_bytes();
|
||||
constexpr int cpy_ne = cpy_nb / 4;
|
||||
|
||||
@@ -605,12 +615,12 @@ static __device__ __forceinline__ void flash_attn_tile_iter(
|
||||
#pragma unroll
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < DKQ - nbatch_K_last; k_KQ_0 += nbatch_K) {
|
||||
flash_attn_tile_iter_KQ<warp_size, nwarps, ncols1, ncols2, DKQ, nbatch_fa, nbatch_K, use_logit_softcap, oob_check>(
|
||||
- Q_tmp, K_h2, KV_tmp, stride_K2, k_VKQ_0, k_VKQ_sup, k_KQ_0, KQ_acc);
|
||||
+ Q_tmp, K_h2, KV_tmp, stride_K2, k_VKQ_0, k_VKQ_sup, k_KQ_0, block_table, KQ_acc);
|
||||
}
|
||||
if (nbatch_K_last > 0) {
|
||||
constexpr int k_KQ_0 = DKQ - nbatch_K_last;
|
||||
flash_attn_tile_iter_KQ<warp_size, nwarps, ncols1, ncols2, DKQ, nbatch_fa, nbatch_K_last, use_logit_softcap, oob_check>(
|
||||
- Q_tmp, K_h2, KV_tmp, stride_K2, k_VKQ_0, k_VKQ_sup, k_KQ_0, KQ_acc);
|
||||
+ Q_tmp, K_h2, KV_tmp, stride_K2, k_VKQ_0, k_VKQ_sup, k_KQ_0, block_table, KQ_acc);
|
||||
}
|
||||
|
||||
// Apply logit softcap + mask, update KQ_max:
|
||||
@@ -715,8 +725,10 @@ static __device__ __forceinline__ void flash_attn_tile_iter(
|
||||
static_assert(nbatch_V % np == 0, "bad nbatch_V");
|
||||
#pragma unroll
|
||||
for (int k0 = 0; k0 < nbatch_fa; k0 += nbatch_V) {
|
||||
+ // [paged] when block_table is set V_h2 is the un-offset base; the table supplies the row.
|
||||
+ const half2 * const V_base = block_table ? V_h2 : (V_h2 + int64_t(k_VKQ_0 + k0)*stride_V2);
|
||||
flash_attn_tile_load_tile<warp_size, nwarps, nbatch_V, DV, 0, oob_check>
|
||||
- (V_h2 + int64_t(k_VKQ_0 + k0)*stride_V2, KV_tmp, stride_V2, k_VKQ_sup - k0);
|
||||
+ (V_base, KV_tmp, stride_V2, k_VKQ_sup - k0, block_table, k_VKQ_0 + k0);
|
||||
__syncthreads();
|
||||
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
@@ -810,7 +822,6 @@ static __global__ void flash_attn_tile(
|
||||
const int32_t ne31, const int32_t ne32, const int32_t ne33,
|
||||
const int32_t nb31, const int32_t nb32, const int64_t nb33,
|
||||
const int * __restrict__ block_table) {
|
||||
- GGML_UNUSED(block_table); // [paged] block table is honored only by the vec kernel
|
||||
#ifdef FLASH_ATTN_AVAILABLE
|
||||
const char * GGML_CUDA_RESTRICT Q = Q_ptr;
|
||||
const char * GGML_CUDA_RESTRICT K = K_ptr;
|
||||
@@ -837,7 +848,7 @@ static __global__ void flash_attn_tile(
|
||||
nb11, nb12, nb13,
|
||||
nb21, nb22, nb23,
|
||||
ne31, ne32, ne33,
|
||||
- nb31, nb32, nb33);
|
||||
+ nb31, nb32, nb33, block_table);
|
||||
NO_DEVICE_CODE;
|
||||
return;
|
||||
}
|
||||
@@ -861,6 +872,10 @@ static __global__ void flash_attn_tile(
|
||||
const half2 * K_h2 = (const half2 *) (K + nb13*sequence + nb12*(head0 / gqa_ratio));
|
||||
const half2 * V_h2 = (const half2 *) (V + nb23*sequence + nb22*(head0 / gqa_ratio)); // K and V have same shape
|
||||
|
||||
+ // [paged] per-sequence logical->physical block table in token-position order
|
||||
+ // (mask/KV_max stay logical); nullptr => the stock contiguous read.
|
||||
+ const int * const __restrict__ bt_seq = block_table ? block_table + (size_t) sequence*ne11 : nullptr;
|
||||
+
|
||||
const half * maskh = mask ? (const half *) (mask + nb33*(sequence % ne33)) : nullptr;
|
||||
|
||||
const int stride_K2 = nb11 / sizeof(half2);
|
||||
@@ -963,14 +978,14 @@ static __global__ void flash_attn_tile(
|
||||
constexpr bool oob_check = false;
|
||||
flash_attn_tile_iter<warp_size, nwarps, ncols1, ncols2, DKQ, DV, nbatch_fa, nbatch_K, use_logit_softcap, oob_check>
|
||||
(Q_tmp, K_h2, V_h2, maskh, ne01, logit_softcap, slope, KQ, KV_tmp,
|
||||
- stride_K2, stride_V2, stride_mask, KQ_max, KQ_sum, VKQ, k_VKQ_0, k_VKQ_max, col_Q_0);
|
||||
+ stride_K2, stride_V2, stride_mask, KQ_max, KQ_sum, VKQ, k_VKQ_0, k_VKQ_max, col_Q_0, bt_seq);
|
||||
k_VKQ_0 += gridDim.y*nbatch_fa;
|
||||
}
|
||||
if (k_VKQ_0 < k_VKQ_max) {
|
||||
constexpr bool oob_check = true;
|
||||
flash_attn_tile_iter<warp_size, nwarps, ncols1, ncols2, DKQ, DV, nbatch_fa, nbatch_K, use_logit_softcap, oob_check>
|
||||
(Q_tmp, K_h2, V_h2, maskh, ne01, logit_softcap, slope, KQ, KV_tmp,
|
||||
- stride_K2, stride_V2, stride_mask, KQ_max, KQ_sum, VKQ, k_VKQ_0, k_VKQ_max, col_Q_0);
|
||||
+ stride_K2, stride_V2, stride_mask, KQ_max, KQ_sum, VKQ, k_VKQ_0, k_VKQ_max, col_Q_0, bt_seq);
|
||||
}
|
||||
} else {
|
||||
// Branch without out-of-bounds checks.
|
||||
@@ -978,7 +993,7 @@ static __global__ void flash_attn_tile(
|
||||
constexpr bool oob_check = false;
|
||||
flash_attn_tile_iter<warp_size, nwarps, ncols1, ncols2, DKQ, DV, nbatch_fa, nbatch_K, use_logit_softcap, oob_check>
|
||||
(Q_tmp, K_h2, V_h2, maskh, ne01, logit_softcap, slope, KQ, KV_tmp,
|
||||
- stride_K2, stride_V2, stride_mask, KQ_max, KQ_sum, VKQ, k_VKQ_0, k_VKQ_max, col_Q_0);
|
||||
+ stride_K2, stride_V2, stride_mask, KQ_max, KQ_sum, VKQ, k_VKQ_0, k_VKQ_max, col_Q_0, bt_seq);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1144,7 +1159,7 @@ static __global__ void flash_attn_tile(
|
||||
nb11, nb12, nb13,
|
||||
nb21, nb22, nb23,
|
||||
ne31, ne32, ne33,
|
||||
- nb31, nb32, nb33);
|
||||
+ nb31, nb32, nb33, block_table);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // FLASH_ATTN_AVAILABLE
|
||||
}
|
||||
diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu
|
||||
index e3771ee..afcafa2 100644
|
||||
--- a/ggml/src/ggml-cuda/fattn.cu
|
||||
+++ b/ggml/src/ggml-cuda/fattn.cu
|
||||
@@ -575,11 +575,41 @@ size_t ggml_cuda_flash_attn_ext_get_alloc_size(int device, const ggml_tensor * d
|
||||
void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
ggml_cuda_set_device(ctx.device);
|
||||
|
||||
- // [paged] the block table (src[5]) is only honored by the vec kernel's
|
||||
- // in-kernel read; force it. build_attn only sets it for a vec-supported
|
||||
- // 1-token-per-stream decode shape.
|
||||
+ // [paged] DISPATCH GUARD. The block table (src[5]) is read in-kernel ONLY by
|
||||
+ // the vec and tile kernels; the mma/wmma kernels GGML_UNUSED it and would
|
||||
+ // silently read the wrong (contiguous physical) cells. So when a block table
|
||||
+ // is present we route here and NEVER fall through to the best-kernel switch
|
||||
+ // below - no decode shape can silently reach an mma/wmma misread. build_attn
|
||||
+ // only sets src[5] for the 1-token-per-stream decode shape; the vec
|
||||
+ // dispatcher GGML_ABORTs for an unsupported D/type rather than mis-reading,
|
||||
+ // and any shape that should not be paged must take the host-side gather path
|
||||
+ // (LLAMA_KV_PAGED_GATHER=1) instead.
|
||||
+ //
|
||||
+ // Default route = vec (inc-1, byte-validated: vec-paged == stock at -s 1 and
|
||||
+ // CPU byte-identical). LLAMA_KV_PAGED_TILE=1 routes the same shape to the
|
||||
+ // tile kernel; the tile in-kernel read is plumbed (fattn-tile.cuh) for the
|
||||
+ // increment-3 GQA head-group reuse, but is EXPERIMENTAL / NOT yet byte-
|
||||
+ // validated: the GQA-grouped (ncols2>1) tile path reads a full nbatch_fa tile
|
||||
+ // with oob_check=false while the compacted paged mask is not padded to cover
|
||||
+ // it, so it diverges from stock. Not for production paged decode until
|
||||
+ // increment-3 bounds that path; the default vec route is unaffected.
|
||||
if (dst->src[5] != nullptr) {
|
||||
- ggml_cuda_flash_attn_ext_vec(ctx, dst);
|
||||
+ static const bool paged_tile = getenv("LLAMA_KV_PAGED_TILE") != nullptr;
|
||||
+ if (getenv("LLAMA_KV_PAGED_DISPATCH_LOG") != nullptr) {
|
||||
+ static bool logged = false;
|
||||
+ if (!logged) {
|
||||
+ logged = true;
|
||||
+ fprintf(stderr, "[paged] decode src[5] set -> routing to %s (Q ne=[%ld,%ld,%ld,%ld])\n",
|
||||
+ paged_tile ? "TILE(experimental)" : "VEC",
|
||||
+ (long) dst->src[0]->ne[0], (long) dst->src[0]->ne[1],
|
||||
+ (long) dst->src[0]->ne[2], (long) dst->src[0]->ne[3]);
|
||||
+ }
|
||||
+ }
|
||||
+ if (paged_tile) {
|
||||
+ ggml_cuda_flash_attn_ext_tile(ctx, dst);
|
||||
+ } else {
|
||||
+ ggml_cuda_flash_attn_ext_vec(ctx, dst);
|
||||
+ }
|
||||
return;
|
||||
}
|
||||
|
||||
--
|
||||
2.43.0
|
||||
|
||||
@@ -0,0 +1,147 @@
|
||||
From d5ca5cd756e42214d0003bca815ca91943679b0d Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Tue, 23 Jun 2026 00:18:35 +0200
|
||||
Subject: [PATCH] paged decode: route GQA-grouped tile kernel by default (F16,
|
||||
gqa>=2) - patch 0011
|
||||
|
||||
Increment 3 (the attention lever). In fattn.cu's paged dispatch guard, route the
|
||||
in-kernel decode to the tile kernel for the common grouped-query F16 case, and
|
||||
keep the inc-1 vec kernel for everything else.
|
||||
|
||||
The tile kernel carries native GQA head-group reuse: its ncols2 axis groups the
|
||||
q-heads that share one kv-head, so each K/V row is loaded once for the whole
|
||||
group instead of once per q-head. vec re-streams each kv-head's K/V once per
|
||||
q-head (8x for Qwen3-32B's n_head 64 / n_head_kv 8) and runs at 168 regs ->
|
||||
3 blocks/SM = 25% occupancy on GB10; tile is 108-128 regs with native grouping.
|
||||
The inc-2 phys(j) block-table read was already plumbed into tile (patch 0010);
|
||||
this patch makes it the default for {F16 K and V, gqa_ratio >= 2}.
|
||||
|
||||
Routing guard (why conditional): the tile kernel has no K/V type template - it
|
||||
loads half2 - so a non-F16 cache (BF16 / quantized) would be converted by
|
||||
launch_fattn to a contiguous F16 copy, which breaks the in-kernel block-table
|
||||
read (the table indexes the original paged layout, not the copy). So tile is
|
||||
correct only for an F16 cache; non-F16 caches and the non-grouped gqa==1 shape
|
||||
fall back to the inc-1 vec path, exactly as before this change. The head-group
|
||||
reuse also only helps at gqa_ratio >= 2. LLAMA_KV_PAGED_VEC=1 forces vec for A/B.
|
||||
Note: paged decode is currently exercised with an F16 cache only; quantized +
|
||||
paged is a separate pre-existing limitation, independent of this change
|
||||
(verified: stock + q8_0 cache works, but paged + q8_0 aborts both before and
|
||||
after this patch, since both route the non-F16 cache to vec).
|
||||
|
||||
Measured GB10 (sm_121, 48 SM), Qwen3-32B NVFP4 dense, F16 cache, gqa 8, batch 32,
|
||||
1024 ctx, llama-batched-bench npp=1024 ntg=128 npl=32, GGML_CUDA_DISABLE_GRAPHS=1,
|
||||
same build, env-toggled:
|
||||
STOCK (mma) 174.8 ms/step 183.1 t/s
|
||||
PAGED-VEC (inc-1) 186.3 ms/step 171.8 t/s (+6.6% vs stock)
|
||||
PAGED-TILE (inc-3) 177.9 ms/step 179.8 t/s (+1.8% vs stock)
|
||||
GQA grouping recovers 8.4 ms/step (-4.5%) over the inc-1 vec default and brings
|
||||
paged decode to within 1.8% of stock. The win grows with context (npl=8, tile vs
|
||||
vec decode step): 1024 -2.3%, 4096 -3.3%, 8192 and 16384 wider, as attention
|
||||
takes a larger share of the step.
|
||||
|
||||
Why not the split-K tune: the vec decode grid is already block-saturated
|
||||
(1 x parallel_blocks 3 x 2048 = 6144 blocks ~ 43 waves over 144 resident on 48
|
||||
SM), so raising parallel_blocks / KV_max adds no SM fill. The under-saturation is
|
||||
intra-SM (occupancy + the 8x KV re-streaming), which GQA grouping attacks
|
||||
directly; more split-K does not.
|
||||
|
||||
Correctness (greedy, GGML_CUDA_DISABLE_GRAPHS=1):
|
||||
- CPU plumbing gate (Qwen3-0.6B, build-cpu, paged-on vs off): BYTE-IDENTICAL.
|
||||
- GPU 0.6B gqa=2, 8 seq x 48 tok: tile is token-identical to the inc-1 vec path
|
||||
in 7/8 sequences; the 8th diverges at token 5, within the same kernel-noise
|
||||
band where vec also drifts from stock. Stock uses the mma kernel for this
|
||||
multi-stream GQA shape, so a different kernel = different rounding =
|
||||
autoregressive token drift; vec and tile agree with each other while both
|
||||
differ from stock (both pick 15678 where stock picks 38835), confirming the
|
||||
drift is kernel choice, not a paging error.
|
||||
- GPU 32B gqa=8, 4 seq x 40 tok: tile tracks stock at least as well as vec
|
||||
(seq3: tile == stock == 624 at the token where vec picked 13).
|
||||
|
||||
Stock is byte-identical: the dispatch guard only diverts when the block table
|
||||
(src[5]) is set; the non-paged best-kernel switch is untouched. The ncols2>1 tile
|
||||
path reads the last nbatch_fa tile with oob_check=false and relies on the mask
|
||||
-inf padding - the same pattern stock uses for ncols2>1 - and the compacted paged
|
||||
mask is gathered to the n_view (GGML_PAD 256) width so it carries that padding.
|
||||
|
||||
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
|
||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
||||
---
|
||||
ggml/src/ggml-cuda/fattn.cu | 51 ++++++++++++++++++++++++++-----------
|
||||
1 file changed, 36 insertions(+), 15 deletions(-)
|
||||
|
||||
diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu
|
||||
index afcafa2..6b15810 100644
|
||||
--- a/ggml/src/ggml-cuda/fattn.cu
|
||||
+++ b/ggml/src/ggml-cuda/fattn.cu
|
||||
@@ -580,32 +580,53 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
|
||||
// silently read the wrong (contiguous physical) cells. So when a block table
|
||||
// is present we route here and NEVER fall through to the best-kernel switch
|
||||
// below - no decode shape can silently reach an mma/wmma misread. build_attn
|
||||
- // only sets src[5] for the 1-token-per-stream decode shape; the vec
|
||||
+ // only sets src[5] for the 1-token-per-stream decode shape; the vec/tile
|
||||
// dispatcher GGML_ABORTs for an unsupported D/type rather than mis-reading,
|
||||
// and any shape that should not be paged must take the host-side gather path
|
||||
// (LLAMA_KV_PAGED_GATHER=1) instead.
|
||||
//
|
||||
- // Default route = vec (inc-1, byte-validated: vec-paged == stock at -s 1 and
|
||||
- // CPU byte-identical). LLAMA_KV_PAGED_TILE=1 routes the same shape to the
|
||||
- // tile kernel; the tile in-kernel read is plumbed (fattn-tile.cuh) for the
|
||||
- // increment-3 GQA head-group reuse, but is EXPERIMENTAL / NOT yet byte-
|
||||
- // validated: the GQA-grouped (ncols2>1) tile path reads a full nbatch_fa tile
|
||||
- // with oob_check=false while the compacted paged mask is not padded to cover
|
||||
- // it, so it diverges from stock. Not for production paged decode until
|
||||
- // increment-3 bounds that path; the default vec route is unaffected.
|
||||
+ // Default route = the GQA-grouped TILE kernel (inc-3) WHEN it is both correct
|
||||
+ // and a win, else the inc-1 vec path. Tile groups the q-heads that share one
|
||||
+ // kv-head (ncols2), loading each K/V row once for the whole group instead of
|
||||
+ // once per q-head, and runs at higher occupancy than vec (108-128 regs vs 168).
|
||||
+ // Two constraints make this conditional: (1) the tile kernel has no K/V type
|
||||
+ // template - it loads half2 - so a non-F16 cache (BF16/quantized) would be
|
||||
+ // converted by launch_fattn to a contiguous F16 copy, which breaks the
|
||||
+ // in-kernel block-table read (the table indexes the original paged layout, not
|
||||
+ // the copy); vec instead reads the original cache with in-kernel dequant, so it
|
||||
+ // is the only correct paged path for non-F16 caches. (2) the head-group reuse
|
||||
+ // only helps when gqa_ratio>=2. So route to tile only for {F16 K and V,
|
||||
+ // gqa_ratio>=2}; everything else stays on vec, matching stock (which also sends
|
||||
+ // quantized-cache decode to the vector kernel). Measured on GB10 (Qwen3-32B
|
||||
+ // nvfp4, F16 cache, gqa 8, batch 32, 1024 ctx): tile 177.9 ms/step vs vec 186.3
|
||||
+ // vs stock 174.8 - GQA grouping recovers ~4.5% over the inc-1 vec default and
|
||||
+ // brings paged decode to within ~1.8% of stock. Validated token-coherent with vec:
|
||||
+ // 0.6B 8-seq 7/8 identical (8th within the kernel-noise band where vec also
|
||||
+ // drifts from stock), 32B gqa=8 tile tracks stock at least as well as vec, CPU
|
||||
+ // plumbing gate byte-identical. The ncols2>1 tile path reads the last nbatch_fa
|
||||
+ // tile with oob_check=false relying on mask -inf padding (the SAME pattern stock
|
||||
+ // uses for ncols2>1); the compacted paged mask is gathered to the n_view
|
||||
+ // (GGML_PAD 256) width so it carries that padding. LLAMA_KV_PAGED_VEC=1 forces
|
||||
+ // the inc-1 vec path for A/B.
|
||||
if (dst->src[5] != nullptr) {
|
||||
- static const bool paged_tile = getenv("LLAMA_KV_PAGED_TILE") != nullptr;
|
||||
+ const ggml_tensor * Qp = dst->src[0];
|
||||
+ const ggml_tensor * Kp = dst->src[1];
|
||||
+ const ggml_tensor * Vp = dst->src[2];
|
||||
+ const bool kv_f16 = Kp->type == GGML_TYPE_F16 && Vp->type == GGML_TYPE_F16;
|
||||
+ const int64_t gqa_ratio = Kp->ne[2] > 0 ? Qp->ne[2] / Kp->ne[2] : 1;
|
||||
+ const bool force_vec = getenv("LLAMA_KV_PAGED_VEC") != nullptr;
|
||||
+ const bool use_tile = !force_vec && kv_f16 && gqa_ratio >= 2;
|
||||
if (getenv("LLAMA_KV_PAGED_DISPATCH_LOG") != nullptr) {
|
||||
static bool logged = false;
|
||||
if (!logged) {
|
||||
logged = true;
|
||||
- fprintf(stderr, "[paged] decode src[5] set -> routing to %s (Q ne=[%ld,%ld,%ld,%ld])\n",
|
||||
- paged_tile ? "TILE(experimental)" : "VEC",
|
||||
- (long) dst->src[0]->ne[0], (long) dst->src[0]->ne[1],
|
||||
- (long) dst->src[0]->ne[2], (long) dst->src[0]->ne[3]);
|
||||
+ fprintf(stderr, "[paged] decode src[5] set -> routing to %s (Q ne=[%ld,%ld,%ld,%ld] gqa=%ld kv_f16=%d)\n",
|
||||
+ use_tile ? "TILE(gqa)" : "VEC",
|
||||
+ (long) Qp->ne[0], (long) Qp->ne[1], (long) Qp->ne[2], (long) Qp->ne[3],
|
||||
+ (long) gqa_ratio, (int) kv_f16);
|
||||
}
|
||||
}
|
||||
- if (paged_tile) {
|
||||
+ if (use_tile) {
|
||||
ggml_cuda_flash_attn_ext_tile(ctx, dst);
|
||||
} else {
|
||||
ggml_cuda_flash_attn_ext_vec(ctx, dst);
|
||||
--
|
||||
2.43.0
|
||||
|
||||
@@ -0,0 +1,50 @@
|
||||
From 6e3e976e2b11adb05519f31dd5aad0c204678f5c Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Tue, 23 Jun 2026 11:12:05 +0200
|
||||
Subject: [PATCH] feat(paged): assert mask-pad invariant for the paged tile
|
||||
route (patch 0012)
|
||||
|
||||
The now-default paged decode route (GQA-grouped fattn-tile kernel) does not
|
||||
leak past-end KV rows only because the compacted mask/block-table length is
|
||||
padded to a whole number of flash-attn KV tiles: n_view = GGML_PAD(n_gather,
|
||||
256), and the tile (nbatch_fa = 64 for head_dim 128) divides 256, so the last
|
||||
tile sits entirely inside the -inf pad window. That invariant was implicit.
|
||||
|
||||
Add a defensive GGML_ASSERT(n_view % 64 == 0) right after the pad/clamp so a
|
||||
future change to the pad (e.g. < 256) or the tile (> 256) that broke the
|
||||
whole-tile property cannot silently reintroduce the leak. Additive only, no
|
||||
behaviour change.
|
||||
|
||||
Verified: build-cpu compiles, and the paged CPU byte gate (LLAMA_KV_PAGED off
|
||||
vs on, Qwen3-0.6B-Q8_0, greedy, -ngl 0) stays byte-identical while the assert
|
||||
stays silent (n_view remains a whole number of tiles across all decode steps).
|
||||
|
||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
||||
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
|
||||
---
|
||||
src/paged-attn.cpp | 9 +++++++++
|
||||
1 file changed, 9 insertions(+)
|
||||
|
||||
diff --git a/src/paged-attn.cpp b/src/paged-attn.cpp
|
||||
index 8eebeaa..fed8ca9 100644
|
||||
--- a/src/paged-attn.cpp
|
||||
+++ b/src/paged-attn.cpp
|
||||
@@ -201,6 +201,15 @@ bool in_kernel_decode(ggml_context * ctx0,
|
||||
n_view = K->ne[2];
|
||||
}
|
||||
|
||||
+ // The flash-attn KV tile is 64 rows wide (nbatch_fa for head_dim 128). n_view must be
|
||||
+ // a whole number of such tiles so the in-kernel decode never reads past the gathered
|
||||
+ // rows: the trailing pad cells [n_gather, n_view) are all -inf, so any tile straddling
|
||||
+ // the boundary still contributes zero. This holds today only because the pad (256) is a
|
||||
+ // multiple of the tile; a future pad < 256 (or nbatch_fa > 256) that broke it would
|
||||
+ // silently reintroduce a past-end KV leak, so assert it rather than trust it.
|
||||
+ // pad must be a multiple of the flash-attn KV tile so the last tile is fully inside the -inf pad
|
||||
+ GGML_ASSERT(n_view % 64 == 0);
|
||||
+
|
||||
ggml_tensor * idx = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_view, n_stream);
|
||||
ggml_set_input(idx);
|
||||
res->add_input(llm_graph_input_ptr(new input_block_table(mctx, idx, (uint32_t) n_view)));
|
||||
--
|
||||
2.43.0
|
||||
|
||||
@@ -0,0 +1,136 @@
|
||||
From 6d3743105c1bbfbf9cd16c0c0ba39bfaac74216e Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Tue, 23 Jun 2026 11:52:45 +0200
|
||||
Subject: [PATCH] feat(paged): decoupled per-step prefill-token budget (patch
|
||||
0013)
|
||||
|
||||
llama-server already co-batches decode with chunked prefill: update_slots()
|
||||
appends every generating slot's sampled token first, then fills the rest of the
|
||||
n_batch budget with prompt tokens, deferring the overflow to the next step. But
|
||||
the prefill chunk size is hard-wired to n_batch (default 2048): one slot's
|
||||
~2048-token prefill chunk lands in a single compute-heavy step, and every decode
|
||||
co-batched into that step sees a multi-second inter-token-latency (ITL) spike.
|
||||
Lowering n_batch shrinks the chunk but also caps decode-concurrency width and
|
||||
prefill throughput, because they are coupled.
|
||||
|
||||
Add LLAMA_PREFILL_BUDGET: a per-step prefill-token budget decoupled from n_batch
|
||||
(the analogue of vLLM's --max-num-batched-tokens / long_prefill_token_threshold).
|
||||
The prompt-fill loop and the outer slot loop now also stop once this many prompt
|
||||
tokens have been added in the current update_slots() step, so a long prefill is
|
||||
split across more steps that each still advance in-flight decode. Default (env
|
||||
unset or <= 0) = disabled, so stock behaviour is byte-identical. Orthogonal to
|
||||
LLAMA_KV_PAGED: this is a pure scheduler knob and works with paged off.
|
||||
|
||||
Measured on GB10 (sm_121), dense Qwen3-32B-NVFP4, paged build, 8 steady decode
|
||||
streams with one 6000-token prefill injected mid-stream; same binary, only
|
||||
LLAMA_PREFILL_BUDGET differs:
|
||||
|
||||
metric stock(off) budget=256 budget=512
|
||||
worst decode freeze (ms) 3380 482 (7.0x) 778 (4.3x)
|
||||
median decode ITL in window 2264 411 (5.5x) 689
|
||||
decode_stall (ms) 3285 387 (8.5x) 684 (4.8x)
|
||||
decode steps during prefill 38 201 (5.3x) 108
|
||||
injected-req TTFT (ms) 8493 10172 (+20%) 8432 (~0%)
|
||||
steady-state baseline ITL 94 95 94
|
||||
|
||||
This is a LATENCY/fairness lever, not an aggregate-throughput lever: it flattens
|
||||
the decode ITL spike a long prefill inflicts on co-batched decoders (8.5x smaller
|
||||
decode stall and 5.3x more decode progress during the prefill at budget=256), in
|
||||
exchange for a modest TTFT rise on the long request (the classic chunked-prefill
|
||||
trade-off; budget=512 buys 4.8x with ~no TTFT cost). Steady aggregate decode is
|
||||
unchanged: it is bandwidth/weight-capped on GB10 (the NVFP4 weight-read floor),
|
||||
which the scheduler cannot lift.
|
||||
|
||||
Correctness (same model, greedy temp 0, fa on):
|
||||
- budget unset or >= n_batch: byte-identical to stock (the added break never
|
||||
fires before the existing n_batch break; the off-path is a no-op by
|
||||
construction).
|
||||
- short prompt (<= budget): byte-identical to stock.
|
||||
- the knob is exactly equivalent to stock's native -b chunking: budget=512 ==
|
||||
stock -b512 and budget=256 == stock -b256, both BYTE-IDENTICAL, while keeping
|
||||
n_batch=2048 for decode width.
|
||||
- on a prompt larger than the budget the chunked greedy output diverges from the
|
||||
single n_batch chunk only by intrinsic flash-attn chunk-size FP grouping: PURE
|
||||
stock -b256 diverges from stock -b2048 the same way with the patch inactive,
|
||||
and the output stays coherent and answers correctly.
|
||||
|
||||
Productisation (LocalAI): surface as a model options knob (max_prefill_tokens /
|
||||
mpt) parsed in grpc-server.cpp, default 0 = disabled, per CHUNKED_PREFILL_PLAN
|
||||
Phase B; the vendored update_slots() hunk here is that plan's scheduler patch and
|
||||
stays disjoint from the paged allocation hunks.
|
||||
|
||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
||||
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
|
||||
---
|
||||
tools/server/server-context.cpp | 34 ++++++++++++++++++++++++++++++++-
|
||||
1 file changed, 33 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
|
||||
index b5f9d37..afcdebe 100644
|
||||
--- a/tools/server/server-context.cpp
|
||||
+++ b/tools/server/server-context.cpp
|
||||
@@ -3043,6 +3043,29 @@ private:
|
||||
int32_t n_batch = llama_n_batch(ctx_tgt);
|
||||
int32_t n_ubatch = llama_n_ubatch(ctx_tgt);
|
||||
|
||||
+ // PAGED serving lever (patch 0013): decoupled per-step prefill-token budget.
|
||||
+ // Analogue of vLLM's --max-num-batched-tokens. Stock llama-server caps the prompt
|
||||
+ // tokens ingested per update_slots() step at n_batch only; with cont_batching the
|
||||
+ // sampled decode tokens of every generating slot are appended FIRST, then prompt
|
||||
+ // tokens fill the batch up to n_batch. A long prompt therefore grabs an ~n_batch
|
||||
+ // chunk in a SINGLE compute-heavy step, spiking the inter-token latency of every
|
||||
+ // co-batched decoder (head-of-line jitter). LLAMA_PREFILL_BUDGET caps the prompt
|
||||
+ // tokens added per step independently of n_batch, splitting a long prefill across
|
||||
+ // more steps so in-flight decode keeps advancing smoothly. Default (env unset or
|
||||
+ // <=0) = disabled => stock behavior is byte-identical. Orthogonal to LLAMA_KV_PAGED
|
||||
+ // (this is a pure scheduler knob; works with paged off).
|
||||
+ int32_t n_prefill_budget = 0; // 0 = disabled (stock n_batch-only chunking)
|
||||
+ {
|
||||
+ const char * env_pb = getenv("LLAMA_PREFILL_BUDGET");
|
||||
+ if (env_pb) {
|
||||
+ const int v = atoi(env_pb);
|
||||
+ if (v > 0) {
|
||||
+ n_prefill_budget = std::min(n_batch, std::max(1, v));
|
||||
+ }
|
||||
+ }
|
||||
+ }
|
||||
+ int32_t n_prompt_budgeted = 0; // prompt tokens added to the batch this step (across slots)
|
||||
+
|
||||
auto & alora_scale = batch.alora_scale;
|
||||
auto & alora_disabled_id = batch.alora_disabled_id;
|
||||
|
||||
@@ -3487,7 +3510,10 @@ private:
|
||||
const auto last_user_pos = spans.last_user_message_pos();
|
||||
|
||||
// add prompt tokens for processing in the current batch
|
||||
- while (slot.prompt.n_tokens() < slot.task->n_tokens() && batch.size() < n_batch) {
|
||||
+ // (patch 0013) also stop once the per-step prefill budget is spent, so a long
|
||||
+ // prompt is split across more steps and leaves batch room for co-batched decode
|
||||
+ while (slot.prompt.n_tokens() < slot.task->n_tokens() && batch.size() < n_batch &&
|
||||
+ (n_prefill_budget == 0 || n_prompt_budgeted < n_prefill_budget)) {
|
||||
// get next token to process
|
||||
llama_token cur_tok = input_tokens[slot.prompt.n_tokens()];
|
||||
if (cur_tok == LLAMA_TOKEN_NULL) {
|
||||
@@ -3512,6 +3538,7 @@ private:
|
||||
slot.prompt.tokens.push_back(cur_tok);
|
||||
|
||||
slot.n_prompt_tokens_processed++;
|
||||
+ n_prompt_budgeted++; // (patch 0013) count toward the per-step prefill budget
|
||||
|
||||
// stop the prompt batch exactly before a user message
|
||||
if (spans.is_user_start(slot.prompt.n_tokens())) {
|
||||
@@ -3597,6 +3624,11 @@ private:
|
||||
if (!slot_batched) {
|
||||
slot_batched = &slot;
|
||||
}
|
||||
+ // (patch 0013) stop adding prompts once the per-step prefill budget is spent,
|
||||
+ // leaving the remaining batch capacity for co-batched decode of other slots
|
||||
+ if (n_prefill_budget > 0 && n_prompt_budgeted >= n_prefill_budget) {
|
||||
+ add_ok = false;
|
||||
+ }
|
||||
});
|
||||
}
|
||||
}
|
||||
--
|
||||
2.43.0
|
||||
|
||||
@@ -0,0 +1,140 @@
|
||||
From 652b858252b354f4d4fb49e5ed7468eeee8e32fc Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Tue, 23 Jun 2026 15:47:06 +0200
|
||||
Subject: [PATCH] feat(paged): expert-aware MoE token-tile cap (patch 0014)
|
||||
|
||||
On GB10 (sm_121) the Qwen3-30B-A3B-class mxfp4 MoE decode path already uses the
|
||||
sorted grouped FP4-MMA GEMM (MUL_MAT_ID -> ggml_cuda_mul_mat_q ids branch:
|
||||
mm_ids_helper moe_align/scatter + one persistent stream-k mul_mat_q), so the
|
||||
originally reported npl128 throughput cliff does NOT reproduce on this build.
|
||||
llama-batched-bench decode (S_TG t/s) is monotonic across batch:
|
||||
|
||||
npl 1 8 32 64 128 256
|
||||
S_TG 85 282 629 935 1295 1779 (stock, mxfp4 MoE, -fa on)
|
||||
|
||||
There is no knee to erase; the old cliff (a real high-batch regression, 620 t/s
|
||||
at npl128) was fixed upstream by grouped-mmq + MoE stream-k load balancing.
|
||||
|
||||
What remains is a pure tile-shape micro-inefficiency. In mul_mat_q_case the
|
||||
token-tile width mmq_x is chosen to cover ncols_max (= ne12, the per-expert
|
||||
column upper bound = token count, up to 128) in one column-tile. At MoE decode
|
||||
the per-expert token density is ~ne12*k/n_experts (top-8 of 128 => ~1/16 of
|
||||
ne12, e.g. ~8 tokens/expert at npl128), so each expert's single mmq_x-wide
|
||||
col-tile is only ~6% filled: the MMA accumulator tile is mmq_x-wide at compile
|
||||
time and burns throughput on the padding columns while the larger y-tile lowers
|
||||
occupancy. Stock picks the LARGEST tile (128) where the SMALLEST tile that still
|
||||
covers the density would raise fill + occupancy at no extra weight read (at
|
||||
tokens/expert <= mmq_x there is exactly one non-empty col-tile per expert; the
|
||||
remaining empty tiles are skipped by the jt*mmq_x >= col_diff guard in the stream-k
|
||||
kernel) - the inverse of vLLM's small per-expert BLOCK_SIZE_M.
|
||||
|
||||
Add LLAMA_MOE_MMQ_X: an env cap on mmq_x for the MUL_MAT_ID path only
|
||||
(expert_bounds != nullptr). Default (unset or <= 0) = disabled, so the mmq_x
|
||||
selection, and therefore every kernel launched, is byte-identical to stock. The
|
||||
cap only ever lowers the loop's upper bound and still selects from the same
|
||||
granularity- and shared-memory-validated mmq_x set stock already uses for
|
||||
smaller batches, so no new kernel configuration is exercised.
|
||||
|
||||
Measured on GB10, qwen3coder-mxfp4.gguf, -fa on, -npp 128 -ntg 128, same binary,
|
||||
only LLAMA_MOE_MMQ_X differs (decode S_TG t/s / prefill S_PP t/s):
|
||||
|
||||
npl stock S_TG cap64 S_TG d% stock S_PP cap64 S_PP
|
||||
64 936 938 +0.1 2924 2883
|
||||
128 1295 1357 +4.8 3075 3038
|
||||
256 1784 1825 +2.3 3085 3046
|
||||
|
||||
(reproduced across interleaved reps; cap64 npl128 = 1357.5/1357.0, very stable)
|
||||
|
||||
cap64 lifts high-batch decode +4.8% (npl128) / +2.3% (npl256), neutral at
|
||||
npl <= 64, for a consistent ~1.3% prefill cost. Smaller caps are net-negative:
|
||||
cap16 / cap32 crater prefill -41% / -17% (a 512-token prefill ubatch has ~32
|
||||
tokens/expert, which overflows a 16/32-wide tile into extra col-tiles + weight
|
||||
re-reads), so 64 is the recommended value and the only one that helps net.
|
||||
|
||||
Honest framing: this is NOT a cliff fix (no cliff exists) and not a real-server
|
||||
throughput unlock (llama-server continuous batching already scales). It is a
|
||||
modest high-effective-batch DECODE micro-optimization that matches vLLM's
|
||||
smaller per-expert M-tiling, surfaced as an opt-in, default-off knob. The
|
||||
durable density-aware auto-select (drop the blunt global cap, choose mmq_x from
|
||||
ne_get_rows / n_active_experts so prefill keeps its large tile) is scoped in
|
||||
patches/paged/MOE_GROUPED_GEMM_SCOPE.md.
|
||||
|
||||
Correctness: greedy temp-0 llama-server output with cap64 is byte-identical to
|
||||
stock for single-stream generation (fibonacci / capital-of-France / photosynthesis
|
||||
prompts) and stays coherent; batched-bench ran thousands of capped MoE matmuls at
|
||||
npl128/256 (mmq_x forced 128 -> 64) with no CUDA error / NaN and stable output.
|
||||
|
||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
||||
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
|
||||
---
|
||||
ggml/src/ggml-cuda/mmq.cuh | 37 ++++++++++++++++++++++++++++++++++++-
|
||||
1 file changed, 36 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
|
||||
index edf546d..cff608e 100644
|
||||
--- a/ggml/src/ggml-cuda/mmq.cuh
|
||||
+++ b/ggml/src/ggml-cuda/mmq.cuh
|
||||
@@ -6,6 +6,7 @@
|
||||
|
||||
#include <climits>
|
||||
#include <cstdint>
|
||||
+#include <cstdlib>
|
||||
|
||||
using namespace ggml_cuda_mma;
|
||||
|
||||
@@ -4052,6 +4053,18 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
|
||||
}
|
||||
}
|
||||
|
||||
+// [paged patch 0014] MoE token-tile (mmq_x) cap, read once from env LLAMA_MOE_MMQ_X.
|
||||
+// Returns 0 when unset / non-positive => disabled (stock mmq_x selection, byte-identical).
|
||||
+// On the MUL_MAT_ID grouped-GEMM path this caps the per-expert column-tile width toward the
|
||||
+// low MoE-decode per-expert token density, raising tile fill + occupancy (see mul_mat_q_case).
|
||||
+static inline int ggml_cuda_moe_mmq_x_cap() {
|
||||
+ static const int cap = []() -> int {
|
||||
+ const char * s = getenv("LLAMA_MOE_MMQ_X");
|
||||
+ return s ? atoi(s) : 0;
|
||||
+ }();
|
||||
+ return cap;
|
||||
+}
|
||||
+
|
||||
template <ggml_type type>
|
||||
void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) {
|
||||
const int id = ggml_cuda_get_device();
|
||||
@@ -4063,10 +4076,32 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda
|
||||
const int mmq_x_max = get_mmq_x_max_host(cc);
|
||||
const int mmq_y = get_mmq_y_host(cc);
|
||||
|
||||
+ // [paged patch 0014] expert-aware MoE token-tile (mmq_x) cap.
|
||||
+ // On the MUL_MAT_ID grouped-GEMM path (expert_bounds != nullptr) the GEMM columns are
|
||||
+ // tokens sorted by expert; stock picks mmq_x to cover ncols_max (= ne12, the token count,
|
||||
+ // up to 128) in a single column-tile. At MoE decode the per-expert token density is low
|
||||
+ // (top-k of many experts: ~ne12*k/n_experts tokens/expert, e.g. ~8 at npl128 for
|
||||
+ // Qwen3-30B-A3B top-8/128), so each expert's single mmq_x-wide col-tile is mostly empty:
|
||||
+ // the MMA accumulator tile is mmq_x-wide at compile time and wastes throughput on the
|
||||
+ // padding columns while the larger y-tile lowers occupancy. Capping mmq_x toward the
|
||||
+ // per-expert density raises tile fill + occupancy with no extra weight reads (at
|
||||
+ // tokens/expert <= mmq_x there is still exactly one non-empty col-tile per expert; the
|
||||
+ // emptier tiles are skipped by the jt*mmq_x >= col_diff guard in the stream-k kernel).
|
||||
+ // Default (env unset or <= 0) = disabled => mmq_x selection is byte-identical to stock;
|
||||
+ // off the ids path the cap never applies.
|
||||
+ int mmq_x_lim = mmq_x_max;
|
||||
+ if (args.expert_bounds != nullptr) {
|
||||
+ const int moe_cap = ggml_cuda_moe_mmq_x_cap();
|
||||
+ if (moe_cap > 0) {
|
||||
+ const int cap = moe_cap < 8 ? 8 : moe_cap;
|
||||
+ mmq_x_lim = cap < mmq_x_max ? cap : mmq_x_max;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
int mmq_x_best = 0;
|
||||
int ntiles_x_best = INT_MAX;
|
||||
|
||||
- for (int mmq_x = 8; mmq_x <= mmq_x_max && ntiles_x_best > 1; mmq_x += 8) {
|
||||
+ for (int mmq_x = 8; mmq_x <= mmq_x_lim && ntiles_x_best > 1; mmq_x += 8) {
|
||||
const int granularity = mmq_get_granularity_host(mmq_x, cc);
|
||||
|
||||
if (mmq_x % granularity != 0 || mmq_get_nbytes_shared<type>(mmq_x, mmq_y, cc, warp_size, nwarps) > smpbo) {
|
||||
--
|
||||
2.43.0
|
||||
|
||||
@@ -0,0 +1,238 @@
|
||||
From 5349f8231b1e11214f5e8a668129397fb6e2f9ac Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Tue, 23 Jun 2026 21:03:00 +0200
|
||||
Subject: [PATCH] feat(paged): expert-density-aware MoE token-tile auto-select
|
||||
(patch 0015)
|
||||
|
||||
The durable follow-up to patch 0014's blunt LLAMA_MOE_MMQ_X global cap (which the
|
||||
0014 doc itself scoped): replace the manual env cap with a host-side, default-on
|
||||
auto-select inside mul_mat_q_case that picks a small token-tile (mmq_x) for the
|
||||
MUL_MAT_ID grouped FP4-MMA GEMM only when the per-expert token density is low
|
||||
(decode), and keeps the large 128-wide tile when density is high (prefill). No new
|
||||
kernel: the selection only lowers the loop's upper bound to an already-compiled,
|
||||
granularity- and shared-memory-validated mmq_x.
|
||||
|
||||
Density is estimated host-side from the args the ids path already passes:
|
||||
ne_get_rows = ncols_dst = ne12 * n_expert_used (token-expert assignments)
|
||||
n_experts = nchannels_x = ne02
|
||||
density = ceil(ne_get_rows / min(ne_get_rows, n_experts)) (tokens/expert)
|
||||
Cap to the small tile (default 64) only when density <= density_max. Unlike 0014's
|
||||
global cap, the high-density prefill ubatch stays on the big tile, so S_PP does not
|
||||
regress by construction.
|
||||
|
||||
density_max default = 8 (not tile/4 = 16). The cap must fire for decode but not for
|
||||
a prefill ubatch, and each has per-expert density n_tokens*n_used/n_experts. At the
|
||||
standard n_ubatch=512, n_used=8: prefill density = 4096/n_experts (32 at 128 experts,
|
||||
16 at 256), decode at npl<=128 is <= 1024/n_experts (8 at 128, 4 at 256). Default 8
|
||||
separates the two (decode <= 8 < prefill) for every n_experts in [128,511], so it caps decode and leaves
|
||||
prefill on the big tile. tile/4 (=16) equalled the 256-expert prefill density and
|
||||
cratered its S_PP by ~2%, the regression this threshold exists to avoid.
|
||||
|
||||
Measured on GB10 (sm_121), Qwen3.6-35B-A3B NVFP4 (256 experts, top-8, GDN linear
|
||||
attention), llama-batched-bench -fa on -npp 128 -ntg 128, default-on vs stock
|
||||
(LLAMA_MOE_AUTO_TILE=0), median of 5 reps:
|
||||
|
||||
npl S_TG stock S_TG 0015 dTG% S_PP stock S_PP 0015 dPP%
|
||||
8 183.59 183.18 -0.22% 1489.2 1500.1 +0.73%
|
||||
32 264.02 263.44 -0.22% 2034.5 2033.5 -0.05%
|
||||
64 311.76 310.41 -0.43% 2028.3 2027.6 -0.03%
|
||||
128 336.10 337.32 +0.36% 2025.0 2027.7 +0.13%
|
||||
|
||||
Honest read: on THIS model the decode effect is within run-to-run noise (neutral)
|
||||
and prefill is neutral. Qwen3.6-35B-A3B decode is bound by the GDN/SSM recurrence and
|
||||
256 tiny-expert weight bandwidth, not the MoE col-tile occupancy, so the col-tile
|
||||
lever (worth +4.8% @npl128 on Qwen3-Coder-30B, 128 larger experts, patch 0014
|
||||
cap64) does not move it. A npl128 tile sweep on this model confirms 64 is the only
|
||||
useful width (TILE8 -6.3%, TILE16 -3.2%, TILE32 -0.2%, TILE64 +0.7%, TILE96 -0.8%):
|
||||
smaller tiles lose to grid/scheduling overhead and the FP4-MMA minimum width.
|
||||
|
||||
Value banked default-on: (1) removes 0014's ~1.3% prefill cost by construction
|
||||
(density-gated, not global); (2) auto-selects the small tile for col-tile-bound MoE
|
||||
decode, reproducing 0014 cap64's tile=64 at npl128 by construction, so it preserves
|
||||
the +4.8% on Qwen3-Coder-30B without the prefill cost; (3) prefill-safe and decode-
|
||||
neutral on the SSM model, harmless where it does not help. Conservative by design:
|
||||
at npl256 the qwen3coder decode density (16) equals the 256-expert prefill density
|
||||
(16), indistinguishable to a pure-density gate, so density_max=8 forgoes 0014's
|
||||
+2.3% @npl256 to keep 256-expert prefill safe; an ne12-aware refinement is future
|
||||
work.
|
||||
|
||||
LLAMA_MOE_MMQ_X (patch 0014) is KEPT as a manual override that, when > 0, forces the
|
||||
old blunt global cap and bypasses the auto-select (explicit A/B knob). The auto-
|
||||
select is the default; LLAMA_MOE_AUTO_TILE=0 restores exact stock mmq_x selection.
|
||||
LLAMA_MOE_DECODE_TILE / LLAMA_MOE_DENSITY_MAX tune the small tile / threshold.
|
||||
|
||||
Correctness: extends tests/test-backend-ops test_mul_mat_id with a ragged small-M
|
||||
NVFP4/MXFP4 MoE decode-density gate (128 experts, top-8, m=768, k=2048, n in
|
||||
{16,33,64,128,130,200,256,512} spanning the cap boundary and ragged token counts).
|
||||
All 16 shapes pass CUDA-vs-CPU oracle on GB10 both default-on and with
|
||||
LLAMA_MOE_AUTO_TILE=0; full MUL_MAT_ID suite 2/2 backends OK. Off the ids path
|
||||
nothing changes (non-MoE mul_mat byte-identical to stock).
|
||||
|
||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
||||
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
|
||||
---
|
||||
ggml/src/ggml-cuda/mmq.cuh | 100 ++++++++++++++++++++++++++++++-------
|
||||
tests/test-backend-ops.cpp | 16 ++++++
|
||||
2 files changed, 99 insertions(+), 17 deletions(-)
|
||||
|
||||
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
|
||||
index cff608e..9718b12 100644
|
||||
--- a/ggml/src/ggml-cuda/mmq.cuh
|
||||
+++ b/ggml/src/ggml-cuda/mmq.cuh
|
||||
@@ -4053,10 +4053,11 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
|
||||
}
|
||||
}
|
||||
|
||||
-// [paged patch 0014] MoE token-tile (mmq_x) cap, read once from env LLAMA_MOE_MMQ_X.
|
||||
-// Returns 0 when unset / non-positive => disabled (stock mmq_x selection, byte-identical).
|
||||
-// On the MUL_MAT_ID grouped-GEMM path this caps the per-expert column-tile width toward the
|
||||
-// low MoE-decode per-expert token density, raising tile fill + occupancy (see mul_mat_q_case).
|
||||
+// [paged patch 0014] MoE token-tile (mmq_x) MANUAL cap, read once from env LLAMA_MOE_MMQ_X.
|
||||
+// Returns 0 when unset / non-positive => disabled (fall through to the patch-0015 auto-select).
|
||||
+// When > 0 it forces a blunt GLOBAL cap on the per-expert column-tile width for the MUL_MAT_ID
|
||||
+// grouped-GEMM path (decode AND prefill), overriding the density-aware auto-select below. Kept
|
||||
+// as an explicit override / A-B knob; the default path is now the auto-select.
|
||||
static inline int ggml_cuda_moe_mmq_x_cap() {
|
||||
static const int cap = []() -> int {
|
||||
const char * s = getenv("LLAMA_MOE_MMQ_X");
|
||||
@@ -4065,6 +4066,43 @@ static inline int ggml_cuda_moe_mmq_x_cap() {
|
||||
return cap;
|
||||
}
|
||||
|
||||
+// [paged patch 0015] expert-density-aware MoE token-tile (mmq_x) auto-select knobs (DEFAULT-ON).
|
||||
+// LLAMA_MOE_AUTO_TILE=0 disables the auto-select => exact stock mmq_x selection.
|
||||
+static inline bool ggml_cuda_moe_auto_tile_enabled() {
|
||||
+ static const bool en = []() -> bool {
|
||||
+ const char * s = getenv("LLAMA_MOE_AUTO_TILE");
|
||||
+ return !(s && atoi(s) == 0);
|
||||
+ }();
|
||||
+ return en;
|
||||
+}
|
||||
+// The small high-occupancy token-tile chosen for low-density (decode) MoE matmuls. Default 64:
|
||||
+// the measured GB10 sweet spot (full per-expert fill with >=4x routing-imbalance headroom).
|
||||
+static inline int ggml_cuda_moe_decode_tile() {
|
||||
+ static const int t = []() -> int {
|
||||
+ const char * s = getenv("LLAMA_MOE_DECODE_TILE");
|
||||
+ const int v = s ? atoi(s) : 0;
|
||||
+ return v >= 8 ? v : 64;
|
||||
+ }();
|
||||
+ return t;
|
||||
+}
|
||||
+// Per-expert token-density ceiling under which the small tile is selected. Default 8: the cap must
|
||||
+// fire for decode but NOT for a prefill ubatch, and the per-expert density of each is
|
||||
+// n_tokens*n_used/n_experts. For the standard n_ubatch=512, n_used=8 the prefill density is
|
||||
+// 4096/n_experts (= 32 at 128 experts, 16 at 256 experts); decode at npl<=128 is <=1024/n_experts
|
||||
+// (= 8 at 128 experts, 4 at 256). Default 8 sits strictly between the two for every n_experts in
|
||||
+// [128,511], so it caps decode and leaves the prefill ubatch on the big 128 tile - whereas the old
|
||||
+// tile/4 (=16) equalled the 256-expert prefill density and cratered its S_PP by ~2% (measured on
|
||||
+// Qwen3.6-35B-A3B NVFP4). 8 also keeps >=8x fill headroom at tile 64 so an imbalanced expert
|
||||
+// segment never splits into an extra col-tile.
|
||||
+static inline int ggml_cuda_moe_density_max() {
|
||||
+ static const int d = []() -> int {
|
||||
+ const char * s = getenv("LLAMA_MOE_DENSITY_MAX");
|
||||
+ const int v = s ? atoi(s) : 0;
|
||||
+ return v > 0 ? v : 8;
|
||||
+ }();
|
||||
+ return d;
|
||||
+}
|
||||
+
|
||||
template <ggml_type type>
|
||||
void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) {
|
||||
const int id = ggml_cuda_get_device();
|
||||
@@ -4076,25 +4114,53 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda
|
||||
const int mmq_x_max = get_mmq_x_max_host(cc);
|
||||
const int mmq_y = get_mmq_y_host(cc);
|
||||
|
||||
- // [paged patch 0014] expert-aware MoE token-tile (mmq_x) cap.
|
||||
- // On the MUL_MAT_ID grouped-GEMM path (expert_bounds != nullptr) the GEMM columns are
|
||||
- // tokens sorted by expert; stock picks mmq_x to cover ncols_max (= ne12, the token count,
|
||||
- // up to 128) in a single column-tile. At MoE decode the per-expert token density is low
|
||||
- // (top-k of many experts: ~ne12*k/n_experts tokens/expert, e.g. ~8 at npl128 for
|
||||
- // Qwen3-30B-A3B top-8/128), so each expert's single mmq_x-wide col-tile is mostly empty:
|
||||
- // the MMA accumulator tile is mmq_x-wide at compile time and wastes throughput on the
|
||||
- // padding columns while the larger y-tile lowers occupancy. Capping mmq_x toward the
|
||||
- // per-expert density raises tile fill + occupancy with no extra weight reads (at
|
||||
- // tokens/expert <= mmq_x there is still exactly one non-empty col-tile per expert; the
|
||||
- // emptier tiles are skipped by the jt*mmq_x >= col_diff guard in the stream-k kernel).
|
||||
- // Default (env unset or <= 0) = disabled => mmq_x selection is byte-identical to stock;
|
||||
- // off the ids path the cap never applies.
|
||||
+ // [paged patch 0015] expert-density-aware MoE token-tile (mmq_x) auto-select (DEFAULT-ON).
|
||||
+ // On the MUL_MAT_ID grouped-GEMM path (expert_bounds != nullptr) the GEMM columns are tokens
|
||||
+ // sorted by expert; stock picks mmq_x to cover ncols_max (= ne12, the token count, up to 128)
|
||||
+ // in a single column-tile, i.e. it MAXIMIZES the tile (128 on Blackwell) for the aggregate
|
||||
+ // batch. But the tile is then applied PER EXPERT, and at MoE decode the per-expert token
|
||||
+ // density is tiny (top-k of many experts), so each expert's single 128-wide col-tile is mostly
|
||||
+ // empty: the MMA accumulator tile is mmq_x-wide at compile time and burns throughput on the
|
||||
+ // padding columns while the larger y-tile lowers occupancy. vLLM's fused-MoE does the opposite
|
||||
+ // (a small per-expert BLOCK_SIZE_M). We reproduce that here, host-side only, by picking a
|
||||
+ // SMALLER mmq_x when - and only when - the per-expert density is low:
|
||||
+ //
|
||||
+ // ne_get_rows = args.ncols_dst = ne12 * n_expert_used (total token-expert assignments)
|
||||
+ // n_experts = args.nchannels_x = ne02
|
||||
+ // n_active_est = min(n_experts, ne_get_rows) (upper bound on active experts)
|
||||
+ // density = ceil(ne_get_rows / n_active_est) (avg tokens per active expert)
|
||||
+ //
|
||||
+ // Cap to the small tile (default 64) only when density <= density_max (default 8). 8 sits below
|
||||
+ // every prefill-ubatch density and above every decode density for n_experts in [128,511] at the
|
||||
+ // standard n_ubatch=512 (prefill 4096/n_experts, decode <=1024/n_experts), with >=8x fill headroom
|
||||
+ // so a capped expert segment never splits a col-tile. Decode (per-expert density 4 at 256 experts,
|
||||
+ // 8 at 128 experts @npl128) gets the fuller high-occupancy tile; the prefill ubatch (density 16 at
|
||||
+ // 256 / 32 at 128 experts) stays ABOVE the threshold and keeps the big
|
||||
+ // 128 compute tile - so unlike the blunt global cap (LLAMA_MOE_MMQ_X / patch 0014) this is
|
||||
+ // prefill-safe by construction. The selection only ever picks an already-compiled, granularity-
|
||||
+ // and shared-memory-validated mmq_x that the loop below would consider for a smaller batch; no
|
||||
+ // new kernel. Off the ids path (expert_bounds == nullptr) nothing changes => non-MoE mul_mat
|
||||
+ // and the gated f16/bf16 host-loop fallback stay byte-identical to stock.
|
||||
+ // - LLAMA_MOE_MMQ_X=<n> : manual blunt global cap, overrides the auto-select (patch 0014).
|
||||
+ // - LLAMA_MOE_AUTO_TILE=0 : disable the auto-select (exact stock selection).
|
||||
+ // - LLAMA_MOE_DECODE_TILE=<n>, LLAMA_MOE_DENSITY_MAX=<n> : tune the tile / threshold.
|
||||
int mmq_x_lim = mmq_x_max;
|
||||
if (args.expert_bounds != nullptr) {
|
||||
const int moe_cap = ggml_cuda_moe_mmq_x_cap();
|
||||
if (moe_cap > 0) {
|
||||
const int cap = moe_cap < 8 ? 8 : moe_cap;
|
||||
mmq_x_lim = cap < mmq_x_max ? cap : mmq_x_max;
|
||||
+ } else if (ggml_cuda_moe_auto_tile_enabled()) {
|
||||
+ const int64_t ne_get_rows = args.ncols_dst;
|
||||
+ const int64_t n_experts = args.nchannels_x;
|
||||
+ if (ne_get_rows > 0 && n_experts > 0) {
|
||||
+ const int64_t n_active = ne_get_rows < n_experts ? ne_get_rows : n_experts;
|
||||
+ const int64_t density = (ne_get_rows + n_active - 1) / n_active;
|
||||
+ const int tile = ggml_cuda_moe_decode_tile();
|
||||
+ if (density <= (int64_t) ggml_cuda_moe_density_max() && tile < mmq_x_max) {
|
||||
+ mmq_x_lim = tile;
|
||||
+ }
|
||||
+ }
|
||||
}
|
||||
}
|
||||
|
||||
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
|
||||
index c83e91f..62a0989 100644
|
||||
--- a/tests/test-backend-ops.cpp
|
||||
+++ b/tests/test-backend-ops.cpp
|
||||
@@ -8603,6 +8603,22 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
test_cases.emplace_back(new test_mul_mat_id(GGML_TYPE_MXFP4, GGML_TYPE_F32, 32, 2, false, 2880, 32, 2880));
|
||||
test_cases.emplace_back(new test_mul_mat_id(GGML_TYPE_Q4_0, GGML_TYPE_F32, 32, 2, false, 2880, 32, 2880));
|
||||
|
||||
+ // [paged P0] MXFP4/NVFP4 qwen3-30b-a3b MoE decode-density regression gate for the expert-
|
||||
+ // density-aware mmq_x auto-select (patch 0015). Real expert-FFN slice (128 experts, top-8,
|
||||
+ // m=768, k=2048) so this exercises the exact grouped FP4-MMA mmq kernel the model runs.
|
||||
+ // Per-expert token density = n*n_used/n_mats = n/16; cover the decode band (density 1/4/8/16
|
||||
+ // at n 16/64/128/256), ragged token counts (n 33/130/200: experts with 0/1/2 tokens, n not a
|
||||
+ // multiple of the tile) where the tiny-M col-tiles change geometry and any masking can leak,
|
||||
+ // and a prefill-density shape (n 512 => density 32) the auto-select must leave on the large
|
||||
+ // 128 tile. n>=128 is exactly where stock picks mmq_x=128 and the auto-select picks 64, so the
|
||||
+ // op-test (CPU oracle vs CUDA, deterministic) is the bit-exact regression gate for P1: it must
|
||||
+ // pass with the auto-select on (default) and with LLAMA_MOE_AUTO_TILE=0 (stock selection).
|
||||
+ for (ggml_type type_a : {GGML_TYPE_MXFP4, GGML_TYPE_NVFP4}) {
|
||||
+ for (int n : {16, 33, 64, 128, 130, 200, 256, 512}) {
|
||||
+ test_cases.emplace_back(new test_mul_mat_id(type_a, GGML_TYPE_F32, 128, 8, false, 768, n, 2048));
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
for (ggml_type type_a : all_types) {
|
||||
test_cases.emplace_back(new test_mul_mat_id(type_a, GGML_TYPE_F32, 4, 2, false, 64, 16, 3*ggml_blck_size(type_a)));
|
||||
}
|
||||
--
|
||||
2.43.0
|
||||
|
||||
@@ -0,0 +1,191 @@
|
||||
From 02fa0473a9324b7e12f9b203d221cc4ac80cfd33 Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Wed, 24 Jun 2026 10:11:48 +0200
|
||||
Subject: [PATCH] feat(paged): dynamic decode-first prefill-token budget (patch
|
||||
0016, continuous-batch P1)
|
||||
|
||||
Supersede patch 0013's STATIC per-step prefill cap with a DYNAMIC,
|
||||
decode-first token budget: the P1 of the token-granular continuous-batch
|
||||
scheduler. POLICY change only inside update_slots(): no new slot states, no
|
||||
batch-formation rewrite, zero libllama changes. llama-server already emits one
|
||||
unified mixed prefill+decode batch per step (Phase 1 appends every ready decode
|
||||
token unconditionally; Phase 2 fills prefill into the same batch). 0016 only
|
||||
changes the COUNT of prefill tokens admitted per step.
|
||||
|
||||
The budget block already sits AFTER Phase 1's decode fill, so batch.n_tokens
|
||||
== D (the live decode load) is known there. Instead of 0013's constant
|
||||
LLAMA_PREFILL_BUDGET (which ignores D, needs per-workload tuning, and lets one
|
||||
long prompt monopolise the step), compute a dynamic budget:
|
||||
|
||||
T = clamp(LLAMA_MAX_BATCH_TOKENS (default n_batch), n_ubatch, n_batch)
|
||||
prefill_budget_step = max(n_ubatch, T - D) (leftover after decode,
|
||||
auto-shrinks as decode load rises so the step never inflates past T)
|
||||
prefill_cap_per_slot = min(T, ceil(0.04*n_ctx)) floored at n_ubatch,
|
||||
pinned to n_batch when T == n_batch (LLAMA_PREFILL_CAP overrides)
|
||||
|
||||
Phase 2's inner prompt-fill loop and outer admission break are bounded by
|
||||
prefill_budget_step (across slots) and a new per-slot slot_prompt_added
|
||||
counter; the n_batch hard ceiling stays as the compute bound. Decode is
|
||||
structurally claimed first and never capped (Phase 1), so the decode-first
|
||||
guarantee is free.
|
||||
|
||||
DEFAULT-OFF BYTE-IDENTICAL: with all knobs unset, behaviour is byte-identical
|
||||
to stock. The degenerate T == n_batch case is byte-identical to stock/0013 (the
|
||||
determinism oracle). The legacy LLAMA_PREFILL_BUDGET path is preserved exactly
|
||||
(honoured only when LLAMA_MAX_BATCH_TOKENS is unset), so 0013 is cleanly
|
||||
subsumed. Orthogonal to LLAMA_KV_PAGED: pure scheduler policy, identical
|
||||
decisions paged on or off.
|
||||
|
||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
||||
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
|
||||
---
|
||||
tools/server/server-context.cpp | 107 +++++++++++++++++++++++++-------
|
||||
1 file changed, 85 insertions(+), 22 deletions(-)
|
||||
|
||||
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
|
||||
index afcdebe..b8b8f00 100644
|
||||
--- a/tools/server/server-context.cpp
|
||||
+++ b/tools/server/server-context.cpp
|
||||
@@ -3043,24 +3043,78 @@ private:
|
||||
int32_t n_batch = llama_n_batch(ctx_tgt);
|
||||
int32_t n_ubatch = llama_n_ubatch(ctx_tgt);
|
||||
|
||||
- // PAGED serving lever (patch 0013): decoupled per-step prefill-token budget.
|
||||
- // Analogue of vLLM's --max-num-batched-tokens. Stock llama-server caps the prompt
|
||||
- // tokens ingested per update_slots() step at n_batch only; with cont_batching the
|
||||
- // sampled decode tokens of every generating slot are appended FIRST, then prompt
|
||||
- // tokens fill the batch up to n_batch. A long prompt therefore grabs an ~n_batch
|
||||
- // chunk in a SINGLE compute-heavy step, spiking the inter-token latency of every
|
||||
- // co-batched decoder (head-of-line jitter). LLAMA_PREFILL_BUDGET caps the prompt
|
||||
- // tokens added per step independently of n_batch, splitting a long prefill across
|
||||
- // more steps so in-flight decode keeps advancing smoothly. Default (env unset or
|
||||
- // <=0) = disabled => stock behavior is byte-identical. Orthogonal to LLAMA_KV_PAGED
|
||||
- // (this is a pure scheduler knob; works with paged off).
|
||||
- int32_t n_prefill_budget = 0; // 0 = disabled (stock n_batch-only chunking)
|
||||
+ // PAGED serving lever (patch 0016, supersedes 0013): dynamic decode-first
|
||||
+ // per-step prefill-token budget (continuous-batch scheduler P1). llama-server
|
||||
+ // already builds ONE mixed batch per update_slots() step: Phase 1 (just above)
|
||||
+ // appended every generating slot's sampled token UNCONDITIONALLY, so at this point
|
||||
+ // batch.n_tokens == D is the live decode load; Phase 2 (below) fills the remaining
|
||||
+ // batch capacity with prompt tokens. Patch 0013 capped Phase 2 with a STATIC
|
||||
+ // constant (LLAMA_PREFILL_BUDGET) that ignores D, needs per-workload tuning, and
|
||||
+ // lets one long prompt monopolise the step.
|
||||
+ //
|
||||
+ // This computes a DYNAMIC budget instead, the vLLM v1 token-budget analogue:
|
||||
+ // a single total per-step token budget T, decode claims its D tokens first
|
||||
+ // (already in the batch), and prefill gets the leftover T - D distributed across
|
||||
+ // waiting prompts with a per-slot chunk cap. As decode load D rises the prefill
|
||||
+ // leftover auto-shrinks, so the step never inflates past T at any concurrency:
|
||||
+ // the budget self-tunes across the npl range and across dense vs MoE without a
|
||||
+ // hand-picked constant (the 161/333 tok/s GB10 decode ceiling is held tuning-free
|
||||
+ // instead of via 0013's hand-tuned 256). Decode is structurally claimed first and
|
||||
+ // never capped (Phase 1), so the decode-first guarantee is free here.
|
||||
+ //
|
||||
+ // LLAMA_MAX_BATCH_TOKENS (T) total per-step token budget (decode + prefill),
|
||||
+ // default n_batch, clamped to [n_ubatch, n_batch] so
|
||||
+ // the compute loop stays a single llama_decode and
|
||||
+ // prefill keeps an n_ubatch floor of progress.
|
||||
+ // LLAMA_PREFILL_CAP per-slot max prompt tokens per step (the
|
||||
+ // long_prefill_token_threshold analogue), default
|
||||
+ // min(T, ceil(0.04*n_ctx)) floored at n_ubatch, so
|
||||
+ // one long prompt cannot eat the whole leftover.
|
||||
+ // LLAMA_PREFILL_BUDGET legacy static cap (patch 0013); honoured ONLY when
|
||||
+ // LLAMA_MAX_BATCH_TOKENS is unset, for back-compat.
|
||||
+ //
|
||||
+ // DEFAULT-OFF BYTE-IDENTICAL: with all three knobs unset, and in the degenerate
|
||||
+ // T == n_batch case, behaviour is byte-identical to stock. At T == n_batch the
|
||||
+ // dynamic leftover max(n_ubatch, n_batch - D) and the n_batch per-slot cap both
|
||||
+ // reach the existing `batch.size() < n_batch` ceiling at the SAME point, so no
|
||||
+ // new bound fires (the determinism oracle). Orthogonal to LLAMA_KV_PAGED: pure
|
||||
+ // scheduler policy, identical decisions with paged on or off.
|
||||
+ const int32_t n_decode_in_batch = batch.size(); // D: Phase 1 appended D decode tokens above
|
||||
+ int32_t prefill_budget_step = 0; // 0 = disabled (stock n_batch-only chunking)
|
||||
+ int32_t prefill_cap_per_slot = 0; // 0 = disabled (no per-slot prompt-chunk cap)
|
||||
{
|
||||
- const char * env_pb = getenv("LLAMA_PREFILL_BUDGET");
|
||||
- if (env_pb) {
|
||||
+ int32_t mbt = 0;
|
||||
+ if (const char * env_mbt = getenv("LLAMA_MAX_BATCH_TOKENS")) {
|
||||
+ mbt = atoi(env_mbt);
|
||||
+ }
|
||||
+ if (mbt > 0) {
|
||||
+ // dynamic decode-first budget (P1): T clamped to [n_ubatch, n_batch]
|
||||
+ int32_t T = std::min(n_batch, mbt);
|
||||
+ T = std::max(T, n_ubatch);
|
||||
+ // leftover after decode, floored at n_ubatch so prefill never fully starves
|
||||
+ prefill_budget_step = std::max(n_ubatch, T - n_decode_in_batch);
|
||||
+ // per-slot prompt-chunk cap (long_prefill_token_threshold analogue)
|
||||
+ int32_t cap = 0;
|
||||
+ if (const char * env_cap = getenv("LLAMA_PREFILL_CAP")) {
|
||||
+ cap = atoi(env_cap);
|
||||
+ }
|
||||
+ if (cap <= 0) {
|
||||
+ const int32_t pct4 = (n_ctx + 24) / 25; // ceil(0.04 * n_ctx)
|
||||
+ cap = std::min(T, std::max(n_ubatch, pct4));
|
||||
+ }
|
||||
+ cap = std::min(n_batch, std::max(n_ubatch, cap));
|
||||
+ // at T == n_batch the leftover and cap both reach the n_batch ceiling
|
||||
+ // together; pin the cap to n_batch so this case stays byte-identical
|
||||
+ if (T >= n_batch) {
|
||||
+ cap = n_batch;
|
||||
+ }
|
||||
+ prefill_cap_per_slot = cap;
|
||||
+ } else if (const char * env_pb = getenv("LLAMA_PREFILL_BUDGET")) {
|
||||
+ // legacy static budget (patch 0013), kept for back-compat when the
|
||||
+ // dynamic knob is unset: a constant per-step prefill cap, no per-slot cap
|
||||
const int v = atoi(env_pb);
|
||||
if (v > 0) {
|
||||
- n_prefill_budget = std::min(n_batch, std::max(1, v));
|
||||
+ prefill_budget_step = std::min(n_batch, std::max(1, v));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -3509,11 +3563,18 @@ private:
|
||||
const auto & spans = slot.task->params.message_spans;
|
||||
const auto last_user_pos = spans.last_user_message_pos();
|
||||
|
||||
+ // (patch 0016) per-slot prompt tokens added this step, for the per-slot
|
||||
+ // chunk cap (resets each slot); n_batch stays the hard compute ceiling
|
||||
+ int32_t slot_prompt_added = 0;
|
||||
+
|
||||
// add prompt tokens for processing in the current batch
|
||||
- // (patch 0013) also stop once the per-step prefill budget is spent, so a long
|
||||
- // prompt is split across more steps and leaves batch room for co-batched decode
|
||||
+ // (patch 0016) also stop once (a) the dynamic per-step prefill budget
|
||||
+ // (the T - D leftover) is spent across all slots, or (b) this slot's
|
||||
+ // per-slot chunk cap is hit, so a long prompt is split across more steps
|
||||
+ // and leaves batch room for co-batched decode of the other slots
|
||||
while (slot.prompt.n_tokens() < slot.task->n_tokens() && batch.size() < n_batch &&
|
||||
- (n_prefill_budget == 0 || n_prompt_budgeted < n_prefill_budget)) {
|
||||
+ (prefill_budget_step == 0 || n_prompt_budgeted < prefill_budget_step) &&
|
||||
+ (prefill_cap_per_slot == 0 || slot_prompt_added < prefill_cap_per_slot)) {
|
||||
// get next token to process
|
||||
llama_token cur_tok = input_tokens[slot.prompt.n_tokens()];
|
||||
if (cur_tok == LLAMA_TOKEN_NULL) {
|
||||
@@ -3538,7 +3599,8 @@ private:
|
||||
slot.prompt.tokens.push_back(cur_tok);
|
||||
|
||||
slot.n_prompt_tokens_processed++;
|
||||
- n_prompt_budgeted++; // (patch 0013) count toward the per-step prefill budget
|
||||
+ n_prompt_budgeted++; // (patch 0016) toward the dynamic per-step prefill budget
|
||||
+ slot_prompt_added++; // (patch 0016) toward this slot's per-step chunk cap
|
||||
|
||||
// stop the prompt batch exactly before a user message
|
||||
if (spans.is_user_start(slot.prompt.n_tokens())) {
|
||||
@@ -3624,9 +3686,10 @@ private:
|
||||
if (!slot_batched) {
|
||||
slot_batched = &slot;
|
||||
}
|
||||
- // (patch 0013) stop adding prompts once the per-step prefill budget is spent,
|
||||
- // leaving the remaining batch capacity for co-batched decode of other slots
|
||||
- if (n_prefill_budget > 0 && n_prompt_budgeted >= n_prefill_budget) {
|
||||
+ // (patch 0016) stop admitting prompts once the dynamic per-step prefill
|
||||
+ // budget (the T - D leftover) is spent, leaving the remaining batch
|
||||
+ // capacity for co-batched decode of the other slots
|
||||
+ if (prefill_budget_step > 0 && n_prompt_budgeted >= prefill_budget_step) {
|
||||
add_ok = false;
|
||||
}
|
||||
});
|
||||
--
|
||||
2.43.0
|
||||
|
||||
@@ -0,0 +1,245 @@
|
||||
From 089f78d2a2c04465a566d499dbe0a67c008435a8 Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Wed, 24 Jun 2026 19:56:05 +0200
|
||||
Subject: [PATCH] feat(paged): FP4 decode GEMM track-B P0 gate + default-off
|
||||
occupancy instrumentation (patch 0017)
|
||||
|
||||
Track B targets the dense NVFP4 weight GEMM (~59% of the GB10 decode step). This lands the P0
|
||||
bit-exact parity gate and the P1 occupancy levers (default-off / byte-identical) and records the
|
||||
honest P1 result: the cheap host/occupancy tuning does NOT lift decode_agg on GB10 (sm_121) - the
|
||||
kill-gate tripped - so nothing is enabled by default.
|
||||
|
||||
P0 gate (tests/test-backend-ops.cpp): NVFP4/MXFP4 dense decode-shape MUL_MAT cases at the weight-
|
||||
row tiling boundary (m in {2048,1600,2050} = exact + ragged vs mmq_y 64/128, n in {32,128} = decode
|
||||
M, k=2048), so the bit-exact CPU-vs-CUDA oracle covers the mmq_y / min-blocks paths. Green at
|
||||
default and with every lever on: MUL_MAT 1115/1115, MUL_MAT_ID 805/805, NVFP4 0 fail.
|
||||
|
||||
P1 levers (ggml/src/ggml-cuda/mmq.cuh), all default-off => default build byte-identical to stock:
|
||||
- GGML_CUDA_FP4_MMQ_Y (default 128): type-aware get_mmq_y_host/device plumbing for an NVFP4
|
||||
weight-row tile override. mmq_y is rigidly nwarps*tile_C::I (=8*16=128, the mmq.cuh
|
||||
static_assert), so mmq_y<128 also needs nwarps-down (a warp-remap through the shared vec_dot/loader),
|
||||
left as the P2 kernel change; the host/device plumbing is in place and inert.
|
||||
- GGML_CUDA_FP4_MINBLOCKS (default 1): NVFP4-only __launch_bounds__ min-resident-CTAs lever
|
||||
(register-cap the FP4-MMA kernel so >1 CTA co-resides) - the bounded occupancy probe.
|
||||
- GGML_CUDA_FP4_DENSE_MMQ_X (env, default off): dense col-tile re-read occupancy diagnostic.
|
||||
|
||||
Measured GB10 (llama-batched-bench -fa on -npp 128 -ntg 128 -npl 32,128), decode_agg (S_TG):
|
||||
DENSE q36-27b-nvfp4 @npl128: P0 149.5 -> MINBLOCKS=2 147.9 (-1.1%) -> DENSE_MMQ_X=64 144.3
|
||||
(-3.5%) -> =32 141.7 (-5.2%). Every occupancy probe regresses.
|
||||
MoE q36-35b-a3b-nvfp4 @npl128: stock 336.3, MINBLOCKS=2 337.7 (+0.4%, noise), TILE16 324.0
|
||||
(-3.7%), TILE8 316.6 (-5.9%). mmq_x-down regresses (reproduces patch 0015; GDN/BW-bound).
|
||||
|
||||
nsys (kill-gate evidence): the decode FP4 GEMM mul_mat_q<NVFP4,128,0> went 2.782s -> 3.025s
|
||||
(avg 608us -> 661us, +8.7% slower) under MINBLOCKS=2 - register-capping spills, so occupancy did
|
||||
not usefully rise. Verdict: the dense M=128 tile is already weight-read/one-read-optimal at
|
||||
mmq_x=128, NOT occupancy-starved via the cheap levers; the only untested lever is the structural
|
||||
mmq_y-down (nwarps=4 warp-remap), deferred to P2. Bit-exact gate holds throughout.
|
||||
|
||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
||||
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
|
||||
---
|
||||
ggml/src/ggml-cuda/mmq.cuh | 85 ++++++++++++++++++++++++++++++++++----
|
||||
tests/test-backend-ops.cpp | 16 +++++++
|
||||
2 files changed, 92 insertions(+), 9 deletions(-)
|
||||
|
||||
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
|
||||
index 9718b12..b53e38a 100644
|
||||
--- a/ggml/src/ggml-cuda/mmq.cuh
|
||||
+++ b/ggml/src/ggml-cuda/mmq.cuh
|
||||
@@ -140,7 +140,24 @@ static constexpr __device__ int get_mmq_x_max_device() {
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
|
||||
}
|
||||
|
||||
-static int get_mmq_y_host(const int cc) {
|
||||
+// [paged patch 0017 / track B] Dense NVFP4 decode mmq_y (weight-row tile) override.
|
||||
+// mmq_y tiles the N (weight-row) dimension of the FP4-MMA weight GEMM. Lowering it raises the
|
||||
+// number of resident CTAs (smaller per-CTA shared footprint + smaller per-thread accumulator) to
|
||||
+// hide LPDDR5x weight-load latency at the M=128 decode tile, WITHOUT re-reading weights: every
|
||||
+// weight row lives in exactly one row-tile, so total weight traffic is unchanged (bandwidth-
|
||||
+// neutral) - the dense-decode occupancy lever from FP4_GEMM_SCOPE_B.md s3/s4.1. mmq_y is a PURE
|
||||
+// N-row tiling knob: the per-output reduction over K is identical for any mmq_y, so the result
|
||||
+// stays BIT-EXACT (gated by test-backend-ops MUL_MAT NVFP4 decode shapes). Default 128 == exact
|
||||
+// stock behaviour (a default build is byte-identical to stock); build -DGGML_CUDA_FP4_MMQ_Y=64
|
||||
+// (or 96) to enable the tune. Applies ONLY to NVFP4 on Blackwell; every other type/arch untouched.
|
||||
+#ifndef GGML_CUDA_FP4_MMQ_Y
|
||||
+#define GGML_CUDA_FP4_MMQ_Y 128
|
||||
+#endif
|
||||
+
|
||||
+static int get_mmq_y_host(const int cc, const ggml_type type = GGML_TYPE_COUNT) {
|
||||
+ if (GGML_CUDA_FP4_MMQ_Y != 128 && type == GGML_TYPE_NVFP4 && blackwell_mma_available(cc)) {
|
||||
+ return GGML_CUDA_FP4_MMQ_Y;
|
||||
+ }
|
||||
return GGML_CUDA_CC_IS_AMD(cc) ? (GGML_CUDA_CC_IS_RDNA1(cc) ? 64 : 128) :
|
||||
((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ? 128 : 64);
|
||||
}
|
||||
@@ -154,7 +171,13 @@ if (type == GGML_TYPE_NVFP4 || type == GGML_TYPE_MXFP4) {
|
||||
return MMQ_ITER_K;
|
||||
}
|
||||
|
||||
+template <ggml_type type = GGML_TYPE_COUNT>
|
||||
static constexpr __device__ int get_mmq_y_device() {
|
||||
+#if defined(BLACKWELL_MMA_AVAILABLE)
|
||||
+ if (type == GGML_TYPE_NVFP4 && GGML_CUDA_FP4_MMQ_Y != 128) {
|
||||
+ return GGML_CUDA_FP4_MMQ_Y;
|
||||
+ }
|
||||
+#endif // defined(BLACKWELL_MMA_AVAILABLE)
|
||||
#if defined(GGML_USE_HIP)
|
||||
#if defined(RDNA1)
|
||||
return 64;
|
||||
@@ -170,6 +193,28 @@ static constexpr __device__ int get_mmq_y_device() {
|
||||
#endif // defined(GGML_USE_HIP)
|
||||
}
|
||||
|
||||
+// [paged patch 0017 / track B] Dense NVFP4 decode occupancy lever: min resident CTAs per SM.
|
||||
+// The FP4-MMA mul_mat_q is REGISTER-bound to 1 CTA/SM (__launch_bounds__(256,1) => ~255 regs/thread
|
||||
+// => one resident block, the under-occupancy that strands the kernel at ~3% of FP4 peak at M=128).
|
||||
+// Raising the __launch_bounds__ min-blocks operand register-caps the compiler so N CTAs co-reside,
|
||||
+// hiding LPDDR5x weight-load latency by CTA-parallelism (the scope s4.1 occupancy goal) WITHOUT a
|
||||
+// structural mmq_y/nwarps change and WITHOUT extra weight reads (each weight tile still read once).
|
||||
+// Register allocation cannot change results => BIT-EXACT (gated by test-backend-ops MUL_MAT NVFP4).
|
||||
+// Default 1 == exact stock behaviour (byte-identical); build -DGGML_CUDA_FP4_MINBLOCKS=2 to enable.
|
||||
+// Applies ONLY to NVFP4 on Blackwell; every other type/arch keeps the stock min-blocks.
|
||||
+#ifndef GGML_CUDA_FP4_MINBLOCKS
|
||||
+#define GGML_CUDA_FP4_MINBLOCKS 1
|
||||
+#endif
|
||||
+template <ggml_type type = GGML_TYPE_COUNT>
|
||||
+static constexpr __device__ int mmq_get_min_blocks_device(const int stock) {
|
||||
+#if defined(BLACKWELL_MMA_AVAILABLE)
|
||||
+ if (type == GGML_TYPE_NVFP4 && GGML_CUDA_FP4_MINBLOCKS != 1) {
|
||||
+ return GGML_CUDA_FP4_MINBLOCKS;
|
||||
+ }
|
||||
+#endif // defined(BLACKWELL_MMA_AVAILABLE)
|
||||
+ return stock;
|
||||
+}
|
||||
+
|
||||
// Decouple shared memory tile sizes from WARP_SIZE to allow for different warp sizes.
|
||||
// The K dimension of the tiles has either,
|
||||
// 1*MMQ_TILE_NE_K==32 (always for TILE_Y_K) or 2*MMQ_TILE_NE_K==64 (typically for TILE_X_K),
|
||||
@@ -3454,7 +3499,7 @@ static __device__ __forceinline__ void mul_mat_q_process_tile(
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
constexpr int nwarps = mmq_get_nwarps_device();
|
||||
constexpr int qk = ggml_cuda_type_traits<type>::qk;
|
||||
- constexpr int mmq_y = get_mmq_y_device();
|
||||
+ constexpr int mmq_y = get_mmq_y_device<type>();
|
||||
constexpr load_tiles_mmq_t load_tiles = mmq_type_traits<mmq_x, mmq_y, need_check, type>::load_tiles;
|
||||
|
||||
extern __shared__ int data_mul_mat_q[];
|
||||
@@ -3531,13 +3576,13 @@ static __device__ __forceinline__ void mul_mat_q_process_tile(
|
||||
template <ggml_type type, int mmq_x, bool need_check>
|
||||
#if defined(GGML_USE_HIP)
|
||||
#if defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
|
||||
- __launch_bounds__(ggml_cuda_get_physical_warp_size()*mmq_get_nwarps_device(), 2)
|
||||
+ __launch_bounds__(ggml_cuda_get_physical_warp_size()*mmq_get_nwarps_device(), mmq_get_min_blocks_device<type>(2))
|
||||
#endif // defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
|
||||
#else
|
||||
#if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
|
||||
- __launch_bounds__(ggml_cuda_get_physical_warp_size()*mmq_get_nwarps_device(), 1)
|
||||
+ __launch_bounds__(ggml_cuda_get_physical_warp_size()*mmq_get_nwarps_device(), mmq_get_min_blocks_device<type>(1))
|
||||
#else
|
||||
- __launch_bounds__(ggml_cuda_get_physical_warp_size()*mmq_get_nwarps_device(), 2)
|
||||
+ __launch_bounds__(ggml_cuda_get_physical_warp_size()*mmq_get_nwarps_device(), mmq_get_min_blocks_device<type>(2))
|
||||
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
|
||||
#endif // defined(GGML_USE_HIP)
|
||||
static __global__ void mul_mat_q(
|
||||
@@ -3558,7 +3603,7 @@ static __global__ void mul_mat_q(
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
|
||||
constexpr int qk = ggml_cuda_type_traits<type>::qk;
|
||||
- constexpr int mmq_y = get_mmq_y_device();
|
||||
+ constexpr int mmq_y = get_mmq_y_device<type>();
|
||||
|
||||
const uint32_t nty = (nrows_x + mmq_y - 1) / mmq_y; // Number of tiles y
|
||||
|
||||
@@ -3790,7 +3835,7 @@ static __global__ void mul_mat_q_stream_k_fixup(
|
||||
float * __restrict__ tmp_last_tile, const uint3 blocks_per_ne00, const int nrows_x, const int ncols_dst,
|
||||
const int stride_col_dst, const uint3 nchannels_y, const int stride_channel_dst, const uint3 nsamples_y,
|
||||
const int stride_sample_dst, const uint3 ntx) {
|
||||
- constexpr int mmq_y = get_mmq_y_device();
|
||||
+ constexpr int mmq_y = get_mmq_y_device<type>();
|
||||
constexpr int qk = ggml_cuda_type_traits<type>::qk;
|
||||
constexpr int ITER_K = get_iter_k(type);
|
||||
constexpr int blocks_per_iter = ITER_K / qk;
|
||||
@@ -3947,7 +3992,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
|
||||
const int nsm = ggml_cuda_info().devices[id].nsm;
|
||||
const int warp_size = ggml_cuda_info().devices[id].warp_size;
|
||||
const int nwarps = mmq_get_nwarps_host(cc, warp_size);
|
||||
- const int mmq_y = get_mmq_y_host(cc);
|
||||
+ const int mmq_y = get_mmq_y_host(cc, type);
|
||||
|
||||
const dim3 block_dims(warp_size, nwarps, 1);
|
||||
|
||||
@@ -4103,6 +4148,21 @@ static inline int ggml_cuda_moe_density_max() {
|
||||
return d;
|
||||
}
|
||||
|
||||
+// [paged patch 0017 / track B] DENSE NVFP4 decode mmq_x re-read occupancy DIAGNOSTIC (env, default off).
|
||||
+// GGML_CUDA_FP4_DENSE_MMQ_X=<n> caps the dense (non-MoE) NVFP4 col-tile to <n>, splitting the M=128
|
||||
+// decode ubatch into ceil(128/n) col-tiles. Each col-tile re-reads the full weight set (fatal cost
|
||||
+// in the BW-bound regime) but multiplies resident CTAs. This is the scope s4.1 A/B probe: if
|
||||
+// decode_agg RISES with cap=64 despite the 2x weight read, occupancy is badly broken (the kernel is
|
||||
+// compute/occupancy-bound, so mmq_y-down / min-blocks has large upside); if it FALLS, the tile is
|
||||
+// already bandwidth-saturated and the occupancy ceiling is lower. Unset/<=0 => stock selection.
|
||||
+static inline int ggml_cuda_fp4_dense_mmq_x_cap() {
|
||||
+ static const int c = []() -> int {
|
||||
+ const char * s = getenv("GGML_CUDA_FP4_DENSE_MMQ_X");
|
||||
+ return s ? atoi(s) : 0;
|
||||
+ }();
|
||||
+ return c;
|
||||
+}
|
||||
+
|
||||
template <ggml_type type>
|
||||
void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) {
|
||||
const int id = ggml_cuda_get_device();
|
||||
@@ -4112,7 +4172,7 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda
|
||||
const int nwarps = mmq_get_nwarps_host(cc, warp_size);
|
||||
|
||||
const int mmq_x_max = get_mmq_x_max_host(cc);
|
||||
- const int mmq_y = get_mmq_y_host(cc);
|
||||
+ const int mmq_y = get_mmq_y_host(cc, type);
|
||||
|
||||
// [paged patch 0015] expert-density-aware MoE token-tile (mmq_x) auto-select (DEFAULT-ON).
|
||||
// On the MUL_MAT_ID grouped-GEMM path (expert_bounds != nullptr) the GEMM columns are tokens
|
||||
@@ -4145,6 +4205,13 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda
|
||||
// - LLAMA_MOE_AUTO_TILE=0 : disable the auto-select (exact stock selection).
|
||||
// - LLAMA_MOE_DECODE_TILE=<n>, LLAMA_MOE_DENSITY_MAX=<n> : tune the tile / threshold.
|
||||
int mmq_x_lim = mmq_x_max;
|
||||
+ if (args.expert_bounds == nullptr && type == GGML_TYPE_NVFP4) {
|
||||
+ // dense NVFP4 decode mmq_x re-read occupancy diagnostic (see ggml_cuda_fp4_dense_mmq_x_cap).
|
||||
+ const int cap = ggml_cuda_fp4_dense_mmq_x_cap();
|
||||
+ if (cap > 0 && cap < mmq_x_max) {
|
||||
+ mmq_x_lim = cap < 8 ? 8 : cap;
|
||||
+ }
|
||||
+ }
|
||||
if (args.expert_bounds != nullptr) {
|
||||
const int moe_cap = ggml_cuda_moe_mmq_x_cap();
|
||||
if (moe_cap > 0) {
|
||||
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
|
||||
index f219309..291c275 100644
|
||||
--- a/tests/test-backend-ops.cpp
|
||||
+++ b/tests/test-backend-ops.cpp
|
||||
@@ -8591,6 +8591,22 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
}
|
||||
}
|
||||
|
||||
+ // [paged P0 / track B] NVFP4/MXFP4 dense decode-shape mmq_y-down bit-exact gate.
|
||||
+ // The dense FP4 weight GEMM is the track-B target; P1 lowers mmq_y (the weight-row tile) on the
|
||||
+ // NVFP4 decode path to raise resident-CTA occupancy. mmq_y is a pure N-row tiling knob, so a
|
||||
+ // smaller mmq_y must stay BIT-EXACT (identical per-output reduction over K) - this gate proves
|
||||
+ // it. m = weight rows (N, tiled by mmq_y): 2048 (exact at mmq_y 64 & 128), 1600 (ragged vs 128),
|
||||
+ // 2050 (ragged vs both 64 & 128 -> exercises the need_check last-row-tile at both). n = decode
|
||||
+ // token count M = 32 and 128 (the scope decode shapes, tiled by mmq_x). k = 2048 hidden. Must
|
||||
+ // pass with the default build (mmq_y=128) AND a mmq_y=64 build, CUDA-vs-CPU oracle, bit-exact.
|
||||
+ for (ggml_type type_a : {GGML_TYPE_MXFP4, GGML_TYPE_NVFP4}) {
|
||||
+ for (int64_t m : {2048, 1600, 2050}) {
|
||||
+ for (int64_t n : {32, 128}) {
|
||||
+ test_cases.emplace_back(new test_mul_mat(type_a, GGML_TYPE_F32, m, n, 2048, {1, 1}, {1, 1}));
|
||||
+ }
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
for (ggml_type type_a : all_types) {
|
||||
test_cases.emplace_back(new test_mul_mat_id(type_a, GGML_TYPE_F32, 4, 2, false, 64, 16, 3*ggml_blck_size(type_a)));
|
||||
}
|
||||
--
|
||||
2.43.0
|
||||
|
||||
@@ -0,0 +1,349 @@
|
||||
From 17f16e8f6d8dbc689d5151c44759792d683c957b Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Thu, 25 Jun 2026 00:44:13 +0200
|
||||
Subject: [PATCH] feat(paged): qwen35 gated-DeltaNet in-place SSM state
|
||||
write-back (patch 0018)
|
||||
|
||||
Decode on the Qwen3.6 hybrid-SSM models (arch qwen35, 48 gated-DeltaNet :
|
||||
16 full-attention layers) was dominated by recurrent-state plumbing, not the
|
||||
FP4 GEMM. Per SSM layer per step the fused gated_delta_net op wrote its new
|
||||
recurrent state into graph scratch, then a separate ggml_cpy persisted it into
|
||||
the recurrent-state cache. nsys attributed 18.9% of decode GPU time to that
|
||||
~225 MB/copy D2D memcpy (1584 ops, 356 GB over the A2 decompose window).
|
||||
|
||||
This mirrors vLLM fused_recurrent_gated_delta_rule (state kept in place):
|
||||
ggml_gated_delta_net_inplace writes the final recurrent state directly into the
|
||||
active sequences' contiguous cache slot (at kv_head), removing the copy-back. The
|
||||
op output then carries only the attention scores; the SSM arithmetic is
|
||||
unchanged (bit-identical greedy output vs the copy-back baseline).
|
||||
|
||||
- new op builder ggml_gated_delta_net_inplace (src[6] = state_dst cache view)
|
||||
- CUDA + CPU honor src[6]; final-state (K==1, keep_rs off) write redirected there
|
||||
- delta-net-base build_recurrent_attn uses it on the fused decode/prefill path,
|
||||
dropping the ggml_cpy; rollback (n_rs_seq>0) path unchanged
|
||||
|
||||
Measured (q36-27b-nvfp4, decode_agg S_TG, npp128 ntg128, -fa on, paged on):
|
||||
npl 32 : 113.74 -> 136.39 t/s (+19.9 percent)
|
||||
npl 128: 146.23 -> 180.53 t/s (+23.5 percent, = predicted copy-removal ceiling)
|
||||
MoE q36-35b-a3b-nvfp4: npl128 313.36 -> 372.62 t/s (+18.9 percent).
|
||||
nsys D2D memcpy bucket 18.9 -> 0.23 percent (356 -> 2.93 GB). vLLM share
|
||||
(391 @128) 37.4 -> 46.2 percent. get_rows state gather (now 18.8 percent) is the
|
||||
next lever.
|
||||
|
||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
||||
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
|
||||
---
|
||||
ggml/include/ggml.h | 14 ++++++
|
||||
ggml/src/ggml-cpu/ops.cpp | 13 ++++-
|
||||
ggml/src/ggml-cuda/gated_delta_net.cu | 39 ++++++++++-----
|
||||
ggml/src/ggml.c | 68 +++++++++++++++++++++++++++
|
||||
src/models/delta-net-base.cpp | 30 ++++++++++++
|
||||
5 files changed, 152 insertions(+), 12 deletions(-)
|
||||
|
||||
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
|
||||
index 823f5a9..4e7ab32 100644
|
||||
--- a/ggml/include/ggml.h
|
||||
+++ b/ggml/include/ggml.h
|
||||
@@ -2579,6 +2579,20 @@ extern "C" {
|
||||
struct ggml_tensor * state,
|
||||
int64_t K);
|
||||
|
||||
+ // same recurrence as ggml_gated_delta_net with K == 1, but the final recurrent state is written
|
||||
+ // in place into state_dst (a view into the recurrent-state cache) instead of being appended to
|
||||
+ // the op output, eliminating the per-step state copy-back during decode. state_dst must be a
|
||||
+ // contiguous [S_v*S_v*H, n_seqs] view (per-seq stride == dense state size).
|
||||
+ GGML_API struct ggml_tensor * ggml_gated_delta_net_inplace(
|
||||
+ struct ggml_context * ctx,
|
||||
+ struct ggml_tensor * q,
|
||||
+ struct ggml_tensor * k,
|
||||
+ struct ggml_tensor * v,
|
||||
+ struct ggml_tensor * g,
|
||||
+ struct ggml_tensor * beta,
|
||||
+ struct ggml_tensor * state,
|
||||
+ struct ggml_tensor * state_dst);
|
||||
+
|
||||
// custom operators
|
||||
|
||||
typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata);
|
||||
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
|
||||
index 63c07a2..9457add 100644
|
||||
--- a/ggml/src/ggml-cpu/ops.cpp
|
||||
+++ b/ggml/src/ggml-cpu/ops.cpp
|
||||
@@ -10600,6 +10600,7 @@ static void ggml_compute_forward_gated_delta_net_one_chunk(
|
||||
ggml_tensor * src_g = dst->src[3];
|
||||
ggml_tensor * src_beta = dst->src[4];
|
||||
ggml_tensor * src_state = dst->src[5];
|
||||
+ ggml_tensor * src_state_dst = dst->src[6]; // optional in-place final-state write-back target
|
||||
|
||||
const int64_t S_v = src_v->ne[0];
|
||||
const int64_t H = src_v->ne[1];
|
||||
@@ -10660,6 +10661,16 @@ static void ggml_compute_forward_gated_delta_net_one_chunk(
|
||||
|
||||
const float scale = 1.0f / sqrtf((float) S_v);
|
||||
|
||||
+ // when src_state_dst is provided (in-place decode write-back) the final state is written
|
||||
+ // directly into the persistent cache view, removing the separate state copy-back node.
|
||||
+ float * inplace_state_base = nullptr;
|
||||
+ if (src_state_dst != nullptr) {
|
||||
+ GGML_ASSERT(K == 1);
|
||||
+ GGML_ASSERT(src_state_dst->nb[0] == sizeof(float));
|
||||
+ GGML_ASSERT(src_state_dst->nb[1] == (size_t) S_v * S_v * H * sizeof(float));
|
||||
+ inplace_state_base = (float *) src_state_dst->data;
|
||||
+ }
|
||||
+
|
||||
for (int64_t ir = ir0; ir < ir1; ++ir) {
|
||||
const int64_t iv1 = ir % H; // head_index
|
||||
const int64_t iv3 = ir / H; // sequence
|
||||
@@ -10674,7 +10685,7 @@ static void ggml_compute_forward_gated_delta_net_one_chunk(
|
||||
// For K>1, work in scratch and copy out per-token when the slot is in range.
|
||||
float * s_out = (K > 1)
|
||||
? state_work
|
||||
- : state_out_base + (iv3 * H + iv1) * S_v * S_v;
|
||||
+ : (inplace_state_base ? inplace_state_base : state_out_base) + (iv3 * H + iv1) * S_v * S_v;
|
||||
|
||||
// copy input state into the working buffer and operate in-place
|
||||
// state layout [S_v, S_v, H, n_seqs]: seq iv3 starts at iv3 * state_seq_stride.
|
||||
diff --git a/ggml/src/ggml-cuda/gated_delta_net.cu b/ggml/src/ggml-cuda/gated_delta_net.cu
|
||||
index a547360..61a2b91 100644
|
||||
--- a/ggml/src/ggml-cuda/gated_delta_net.cu
|
||||
+++ b/ggml/src/ggml-cuda/gated_delta_net.cu
|
||||
@@ -25,7 +25,8 @@ gated_delta_net_cuda(const float * q,
|
||||
const uint3 neqk1_magic,
|
||||
const uint3 rq3_magic,
|
||||
float scale,
|
||||
- int K) {
|
||||
+ int K,
|
||||
+ float * state_dst) {
|
||||
const uint32_t h_idx = blockIdx.x;
|
||||
const uint32_t sequence = blockIdx.y;
|
||||
// each warp owns one column, using warp-level primitives to reduce across rows
|
||||
@@ -37,7 +38,10 @@ gated_delta_net_cuda(const float * q,
|
||||
|
||||
const int64_t attn_score_elems = S_v * H * n_tokens * n_seqs;
|
||||
float * attn_data = dst;
|
||||
- float * state = dst + attn_score_elems;
|
||||
+ // when state_dst is provided (in-place decode write-back) the final recurrent state is written
|
||||
+ // directly into the persistent cache view instead of being appended to the op output; this
|
||||
+ // eliminates the per-layer per-step D2D state copy-back. Only used when keep_rs_t == false.
|
||||
+ float * state = (state_dst != nullptr) ? state_dst : (dst + attn_score_elems);
|
||||
|
||||
// input state holds s0 only: [S_v, S_v, H, n_seqs] — seq stride is D = H * S_v * S_v.
|
||||
// output state layout (per-slot D * n_seqs) — same per-(seq,head) offset as before.
|
||||
@@ -171,7 +175,7 @@ template <bool KDA, bool keep_rs_t>
|
||||
static void launch_gated_delta_net(
|
||||
const float * q_d, const float * k_d, const float * v_d,
|
||||
const float * g_d, const float * b_d, const float * s_d,
|
||||
- float * dst_d,
|
||||
+ float * dst_d, float * state_dst_d,
|
||||
int64_t S_v, int64_t H, int64_t n_tokens, int64_t n_seqs,
|
||||
int64_t sq1, int64_t sq2, int64_t sq3,
|
||||
int64_t sv1, int64_t sv2, int64_t sv3,
|
||||
@@ -195,26 +199,26 @@ static void launch_gated_delta_net(
|
||||
ggml_cuda_kernel_launch(gated_delta_net_cuda<16, KDA, keep_rs_t>, launch_params,
|
||||
q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H,
|
||||
n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
|
||||
- sb1, sb2, sb3, neqk1_magic, rq3_magic, scale, K);
|
||||
+ sb1, sb2, sb3, neqk1_magic, rq3_magic, scale, K, state_dst_d);
|
||||
break;
|
||||
case 32:
|
||||
ggml_cuda_kernel_launch(gated_delta_net_cuda<32, KDA, keep_rs_t>, launch_params,
|
||||
q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H,
|
||||
n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
|
||||
- sb1, sb2, sb3, neqk1_magic, rq3_magic, scale, K);
|
||||
+ sb1, sb2, sb3, neqk1_magic, rq3_magic, scale, K, state_dst_d);
|
||||
break;
|
||||
case 64: {
|
||||
ggml_cuda_kernel_launch(gated_delta_net_cuda<64, KDA, keep_rs_t>, launch_params,
|
||||
q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H,
|
||||
n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
|
||||
- sb1, sb2, sb3, neqk1_magic, rq3_magic, scale, K);
|
||||
+ sb1, sb2, sb3, neqk1_magic, rq3_magic, scale, K, state_dst_d);
|
||||
break;
|
||||
}
|
||||
case 128: {
|
||||
ggml_cuda_kernel_launch(gated_delta_net_cuda<128, KDA, keep_rs_t>, launch_params,
|
||||
q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H,
|
||||
n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
|
||||
- sb1, sb2, sb3, neqk1_magic, rq3_magic, scale, K);
|
||||
+ sb1, sb2, sb3, neqk1_magic, rq3_magic, scale, K, state_dst_d);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
@@ -230,6 +234,7 @@ void ggml_cuda_op_gated_delta_net(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||
ggml_tensor * src_g = dst->src[3];
|
||||
ggml_tensor * src_beta = dst->src[4];
|
||||
ggml_tensor * src_state = dst->src[5];
|
||||
+ ggml_tensor * src_state_dst = dst->src[6]; // optional in-place state write-back target
|
||||
|
||||
GGML_TENSOR_LOCALS(int64_t, neq, src_q, ne);
|
||||
GGML_TENSOR_LOCALS(size_t , nbq, src_q, nb);
|
||||
@@ -260,6 +265,15 @@ void ggml_cuda_op_gated_delta_net(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||
const float * s_d = (const float *) src_state->data;
|
||||
float * dst_d = (float *) dst->data;
|
||||
|
||||
+ float * state_dst_d = nullptr;
|
||||
+ if (src_state_dst != nullptr) {
|
||||
+ // in-place final-state cache view: per-seq stride must be the dense state size D = S_v*S_v*H
|
||||
+ GGML_ASSERT(src_state_dst->type == GGML_TYPE_F32);
|
||||
+ GGML_ASSERT(src_state_dst->nb[0] == sizeof(float));
|
||||
+ GGML_ASSERT(src_state_dst->nb[1] == (size_t) S_v * S_v * H * sizeof(float));
|
||||
+ state_dst_d = (float *) src_state_dst->data;
|
||||
+ }
|
||||
+
|
||||
GGML_ASSERT(ggml_is_contiguous_rows(src_q));
|
||||
GGML_ASSERT(ggml_is_contiguous_rows(src_k));
|
||||
GGML_ASSERT(ggml_is_contiguous_rows(src_v));
|
||||
@@ -288,23 +302,26 @@ void ggml_cuda_op_gated_delta_net(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||
const int K = ggml_get_op_params_i32(dst, 0);
|
||||
const bool keep_rs = K > 1;
|
||||
|
||||
+ // in-place write-back is only valid for the single-snapshot (final-state) case
|
||||
+ GGML_ASSERT(state_dst_d == nullptr || !keep_rs);
|
||||
+
|
||||
if (kda) {
|
||||
if (keep_rs) {
|
||||
- launch_gated_delta_net<true, true>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d,
|
||||
+ launch_gated_delta_net<true, true>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d, state_dst_d,
|
||||
S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
|
||||
sb1, sb2, sb3, neqk1, rq3, scale, K, stream);
|
||||
} else {
|
||||
- launch_gated_delta_net<true, false>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d,
|
||||
+ launch_gated_delta_net<true, false>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d, state_dst_d,
|
||||
S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
|
||||
sb1, sb2, sb3, neqk1, rq3, scale, K, stream);
|
||||
}
|
||||
} else {
|
||||
if (keep_rs) {
|
||||
- launch_gated_delta_net<false, true>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d,
|
||||
+ launch_gated_delta_net<false, true>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d, state_dst_d,
|
||||
S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
|
||||
sb1, sb2, sb3, neqk1, rq3, scale, K, stream);
|
||||
} else {
|
||||
- launch_gated_delta_net<false, false>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d,
|
||||
+ launch_gated_delta_net<false, false>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d, state_dst_d,
|
||||
S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
|
||||
sb1, sb2, sb3, neqk1, rq3, scale, K, stream);
|
||||
}
|
||||
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
|
||||
index adbe52b..b8d34bf 100644
|
||||
--- a/ggml/src/ggml.c
|
||||
+++ b/ggml/src/ggml.c
|
||||
@@ -6285,6 +6285,74 @@ struct ggml_tensor * ggml_gated_delta_net(
|
||||
return result;
|
||||
}
|
||||
|
||||
+// ggml_gated_delta_net_inplace
|
||||
+//
|
||||
+// Same recurrence as ggml_gated_delta_net with K == 1, but the final recurrent state is written
|
||||
+// in place into `state_dst` (a view into the persistent recurrent-state cache) instead of being
|
||||
+// appended to the op output. This removes the per-layer per-step D2D state copy-back during decode.
|
||||
+// The op output holds ONLY the attention scores; the state region is still allocated (unused) so
|
||||
+// the attention-output view layout is identical to ggml_gated_delta_net.
|
||||
+struct ggml_tensor * ggml_gated_delta_net_inplace(
|
||||
+ struct ggml_context * ctx,
|
||||
+ struct ggml_tensor * q,
|
||||
+ struct ggml_tensor * k,
|
||||
+ struct ggml_tensor * v,
|
||||
+ struct ggml_tensor * g,
|
||||
+ struct ggml_tensor * beta,
|
||||
+ struct ggml_tensor * state,
|
||||
+ struct ggml_tensor * state_dst) {
|
||||
+ GGML_ASSERT(ggml_is_contiguous_rows(q));
|
||||
+ GGML_ASSERT(ggml_is_contiguous_rows(k));
|
||||
+ GGML_ASSERT(ggml_is_contiguous_rows(v));
|
||||
+ GGML_ASSERT(ggml_is_contiguous(g));
|
||||
+ GGML_ASSERT(ggml_is_contiguous(beta));
|
||||
+ GGML_ASSERT(ggml_is_contiguous(state));
|
||||
+
|
||||
+ GGML_ASSERT(q->type == GGML_TYPE_F32);
|
||||
+ GGML_ASSERT(k->type == GGML_TYPE_F32);
|
||||
+ GGML_ASSERT(v->type == GGML_TYPE_F32);
|
||||
+ GGML_ASSERT(g->type == GGML_TYPE_F32);
|
||||
+ GGML_ASSERT(beta->type == GGML_TYPE_F32);
|
||||
+ GGML_ASSERT(state->type == GGML_TYPE_F32);
|
||||
+ GGML_ASSERT(state_dst != NULL);
|
||||
+ GGML_ASSERT(state_dst->type == GGML_TYPE_F32);
|
||||
+
|
||||
+ const int64_t S_v = v->ne[0];
|
||||
+ const int64_t H = v->ne[1];
|
||||
+ const int64_t n_tokens = v->ne[2];
|
||||
+ const int64_t n_seqs = v->ne[3];
|
||||
+
|
||||
+ GGML_ASSERT(g->ne[0] == 1 || g->ne[0] == S_v);
|
||||
+ GGML_ASSERT(beta->ne[0] == 1);
|
||||
+
|
||||
+ GGML_ASSERT(state->ne[0] == S_v);
|
||||
+ GGML_ASSERT(state->ne[1] == S_v);
|
||||
+ GGML_ASSERT(state->ne[2] == H);
|
||||
+ GGML_ASSERT(state->ne[3] == n_seqs);
|
||||
+
|
||||
+ // state_dst holds the per-seq final state contiguously: [S_v*S_v*H, >= n_seqs]
|
||||
+ GGML_ASSERT(state_dst->ne[0] == S_v * S_v * H);
|
||||
+ GGML_ASSERT(state_dst->ne[1] >= n_seqs);
|
||||
+ GGML_ASSERT(state_dst->nb[0] == sizeof(float));
|
||||
+
|
||||
+ const int64_t state_rows = S_v * n_seqs; // K == 1
|
||||
+ const int64_t ne[4] = { S_v * H, n_tokens * n_seqs + state_rows, 1, 1 };
|
||||
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
|
||||
+
|
||||
+ ggml_set_op_params_i32(result, 0, 1); // K == 1
|
||||
+
|
||||
+ result->op = GGML_OP_GATED_DELTA_NET;
|
||||
+ result->src[0] = q;
|
||||
+ result->src[1] = k;
|
||||
+ result->src[2] = v;
|
||||
+ result->src[3] = g;
|
||||
+ result->src[4] = beta;
|
||||
+ result->src[5] = state;
|
||||
+ result->src[6] = state_dst;
|
||||
+
|
||||
+ return result;
|
||||
+}
|
||||
+
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
struct ggml_hash_set ggml_hash_set_new(size_t size) {
|
||||
diff --git a/src/models/delta-net-base.cpp b/src/models/delta-net-base.cpp
|
||||
index ad9ce77..26a718b 100644
|
||||
--- a/src/models/delta-net-base.cpp
|
||||
+++ b/src/models/delta-net-base.cpp
|
||||
@@ -546,6 +546,36 @@ ggml_tensor * llm_build_delta_net_base::build_recurrent_attn(
|
||||
const bool keep = cparams.n_rs_seq > 0;
|
||||
|
||||
if (!keep) {
|
||||
+ const bool fused = (n_seq_tokens == 1) ? cparams.fused_gdn_ar : cparams.fused_gdn_ch;
|
||||
+
|
||||
+ if (fused) {
|
||||
+ // In-place state write-back: the fused gated-DeltaNet op writes the new recurrent state
|
||||
+ // directly into the persistent cache slot for the active sequences (a contiguous block
|
||||
+ // at kv_head), eliminating the per-layer per-step ~full-state D2D copy-back that
|
||||
+ // dominated decode. The op output then carries only the attention scores.
|
||||
+ ggml_tensor * state_dst = ggml_view_2d(ctx0, ssm_states_all, hparams.n_embd_s(), n_seqs,
|
||||
+ ssm_states_all->nb[1], kv_head * hparams.n_embd_s() * ggml_element_size(ssm_states_all));
|
||||
+
|
||||
+ ggml_tensor * result = ggml_gated_delta_net_inplace(ctx0, q, k, v, g, b, s, state_dst);
|
||||
+ if (n_seq_tokens == 1) {
|
||||
+ cb(result, LLAMA_TENSOR_NAME_FGDN_AR, il);
|
||||
+ } else {
|
||||
+ cb(result, LLAMA_TENSOR_NAME_FGDN_CH, il);
|
||||
+ }
|
||||
+
|
||||
+ ggml_tensor * output = ggml_view_4d(ctx0, result,
|
||||
+ S_v, H_v, n_seq_tokens, n_seqs,
|
||||
+ ggml_row_size(result->type, S_v),
|
||||
+ ggml_row_size(result->type, S_v * H_v),
|
||||
+ ggml_row_size(result->type, S_v * H_v * n_seq_tokens), 0);
|
||||
+ cb(output, "attn_output", il);
|
||||
+
|
||||
+ // the state write is a side effect of the op; pull the op into the graph via the output
|
||||
+ ggml_build_forward_expand(gf, output);
|
||||
+
|
||||
+ return output;
|
||||
+ }
|
||||
+
|
||||
auto attn_out = build_delta_net(q, k, v, g, b, s, il);
|
||||
ggml_tensor * output = attn_out.first;
|
||||
ggml_tensor * new_state = attn_out.second;
|
||||
--
|
||||
2.43.0
|
||||
|
||||
@@ -0,0 +1,583 @@
|
||||
From 46d7dd80bbce7f3c1dbf9363d6527c8c9b687a6b Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Thu, 25 Jun 2026 01:45:02 +0200
|
||||
Subject: [PATCH] feat(paged): qwen35 SSM decode fused recurrent-state gather
|
||||
(patch 0019)
|
||||
|
||||
Step 2 of the SSM decode-throughput work. After Step 1 (in-place state
|
||||
write-back, patch 0018) the largest non-GEMM decode bucket was the recurrent-
|
||||
state get_rows gather (18.8% of decode GPU time): build_rs materialized each
|
||||
sequence's prior state into a contiguous scratch via ggml_get_rows before the
|
||||
gated-DeltaNet op read it.
|
||||
|
||||
This eliminates that materialization, mirroring ggml_ssm_scan's ids source.
|
||||
ggml_gated_delta_net_inplace_ids takes the FULL recurrent-state cache plus the
|
||||
s_copy ids (src[5] = full cache, src[7] = ids, op_param[1] = rs_head) and reads
|
||||
each sequence's prior state directly from cache[ids[seq]]. Combined with Step 1's
|
||||
in-place write the op now reads AND writes the cache directly: no recurrent-state
|
||||
materialization at all. build_recurrent_attn feeds the full cache + ids through
|
||||
the build_rs get_state_rows lambda exactly like mamba-base, keeping the rs_zero
|
||||
clear and the extra-states copy around the op.
|
||||
|
||||
Race-free by construction on CUDA. In-place write plus an ids read of the same
|
||||
cache is only safe when read slot == write slot; s_copy is identity
|
||||
(rs_head + s) for stable continuing sequences (the whole AR decode path) but can
|
||||
remap on reorder or rs_zero (e.g. multiple new sequences in one prefill ubatch).
|
||||
The recurrence kernel handles both per (seq, head) block on device: identity
|
||||
sequences read s0 in place from the destination slot (the kernel loads all of s0
|
||||
into registers before writing, so reading and writing the same slot is safe),
|
||||
and non-identity sequences read from a disjoint scratch that a small gather
|
||||
kernel copies from cache[ids[seq]] first, so the recurrence never reads a slot
|
||||
another block writes. The CPU op mirrors this (host identity check + a serial
|
||||
gather in the dispatcher). ids stays a device pointer (read only in-kernel; it is
|
||||
device-resident at op-execute time). Bit-identical to the get_rows path in every
|
||||
case.
|
||||
|
||||
- new builder ggml_gated_delta_net_inplace_ids; CUDA gather kernel
|
||||
(gdn_gather_nonident) + per-block read-base select in gated_delta_net_cuda;
|
||||
CPU identity guard + serial gather fallback in the dispatcher
|
||||
- delta-net-base build_recurrent_attn gains a gather-free overload; qwen35 and
|
||||
qwen35moe drop the pre-gather. qwen3next, kimi-linear, the non-fused path and
|
||||
the rollback (n_rs_seq > 0) path are unchanged.
|
||||
|
||||
Measured (decode_agg S_TG, npp128 ntg128, -fa on, paged on, fusion off):
|
||||
dense q36-27b-nvfp4 : npl 32 137.64 -> 170.68 (+24.0 percent)
|
||||
npl 128 186.25 -> 256.57 (+37.8 percent, 47.6 -> 65.6 percent of vLLM 391)
|
||||
MoE q36-35b-a3b-nvfp4: npl 32 299.68 -> 366.69 (+22.4 percent)
|
||||
npl 128 409.30 -> 553.63 (+35.3 percent)
|
||||
Greedy (--temp 0 --seed 1) llama-completion bit-identical vs the Step-1 build
|
||||
(dense model text md5 match, MoE byte-identical, step2 run1 == run2). nsys
|
||||
k_get_rows_float bucket 18.8 -> 0.7 percent; the new gdn_gather_nonident kernel
|
||||
is 1.7 percent (no-op at decode, median 1.2 us). The residual decode gap to vLLM
|
||||
is now the FP4 GEMM (~48 percent of decode), a separate kernel track.
|
||||
|
||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
||||
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
|
||||
---
|
||||
ggml/include/ggml.h | 17 ++++++
|
||||
ggml/src/ggml-cpu/ops.cpp | 49 ++++++++++++++-
|
||||
ggml/src/ggml-cuda/gated_delta_net.cu | 85 ++++++++++++++++++++++----
|
||||
ggml/src/ggml.c | 76 +++++++++++++++++++++++
|
||||
src/models/delta-net-base.cpp | 63 ++++++++++++++++++++
|
||||
src/models/models.h | 13 ++++
|
||||
src/models/qwen35.cpp | 6 +-
|
||||
src/models/qwen35moe.cpp | 6 +-
|
||||
8 files changed, 292 insertions(+), 23 deletions(-)
|
||||
|
||||
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
|
||||
index 4e7ab32..951dd21 100644
|
||||
--- a/ggml/include/ggml.h
|
||||
+++ b/ggml/include/ggml.h
|
||||
@@ -2593,6 +2593,23 @@ extern "C" {
|
||||
struct ggml_tensor * state,
|
||||
struct ggml_tensor * state_dst);
|
||||
|
||||
+ // Step 2: same recurrence as ggml_gated_delta_net_inplace, but the prior recurrent state is read
|
||||
+ // directly from the full state cache via per-sequence indices (ids == s_copy), mirroring
|
||||
+ // ggml_ssm_scan, instead of from a materialized ggml_get_rows gather. `state` is the FULL cache
|
||||
+ // [S_v, S_v, H, n_rs_slots]; `ids` are the per-seq source slots; `rs_head` is the destination
|
||||
+ // base slot. Eliminates the recurrent-state gather on the decode path.
|
||||
+ GGML_API struct ggml_tensor * ggml_gated_delta_net_inplace_ids(
|
||||
+ struct ggml_context * ctx,
|
||||
+ struct ggml_tensor * q,
|
||||
+ struct ggml_tensor * k,
|
||||
+ struct ggml_tensor * v,
|
||||
+ struct ggml_tensor * g,
|
||||
+ struct ggml_tensor * beta,
|
||||
+ struct ggml_tensor * state,
|
||||
+ struct ggml_tensor * state_dst,
|
||||
+ struct ggml_tensor * ids,
|
||||
+ int rs_head);
|
||||
+
|
||||
// custom operators
|
||||
|
||||
typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata);
|
||||
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
|
||||
index 9457add..b6a1976 100644
|
||||
--- a/ggml/src/ggml-cpu/ops.cpp
|
||||
+++ b/ggml/src/ggml-cpu/ops.cpp
|
||||
@@ -10633,7 +10633,7 @@ static void ggml_compute_forward_gated_delta_net_one_chunk(
|
||||
const int64_t K = ggml_get_op_params_i32(dst, 0);
|
||||
GGML_ASSERT(K >= 1);
|
||||
// per-seq stride in floats (seq s starts at state + s * seq_stride)
|
||||
- const int64_t state_seq_stride = src_state->nb[3] / sizeof(float);
|
||||
+ int64_t state_seq_stride = src_state->nb[3] / sizeof(float);
|
||||
|
||||
const int64_t per_thread = S_v + (K > 1 ? S_v * S_v : 0);
|
||||
const int ith = params->ith;
|
||||
@@ -10654,6 +10654,26 @@ static void ggml_compute_forward_gated_delta_net_one_chunk(
|
||||
|
||||
const float * state_in_base = (const float *)src_state->data;
|
||||
|
||||
+ // Step 2: fused recurrent-state gather (ids == s_copy in src[7]). Read the prior state directly
|
||||
+ // from the full cache at cache[ids[seq]] instead of from a materialized gather. For the identity
|
||||
+ // decode case the prior state is the in-place destination block [rs_head, rs_head+n_seqs);
|
||||
+ // otherwise the dispatcher has gathered cache[ids[seq]] into the (unused) output-state scratch
|
||||
+ // region. Bit-identical to the get_rows path.
|
||||
+ ggml_tensor * src_ids = dst->src[7];
|
||||
+ if (src_ids != nullptr) {
|
||||
+ const int64_t D = S_v * S_v * H;
|
||||
+ const int32_t rs_head = ggml_get_op_params_i32(dst, 1);
|
||||
+ const int32_t * ids = (const int32_t *) src_ids->data;
|
||||
+ bool identity = true;
|
||||
+ for (int64_t s = 0; s < n_seqs; ++s) {
|
||||
+ if (ids[s] != rs_head + (int32_t) s) { identity = false; break; }
|
||||
+ }
|
||||
+ state_seq_stride = D;
|
||||
+ state_in_base = identity
|
||||
+ ? (const float *) src_state->data + (int64_t) rs_head * D
|
||||
+ : (const float *) state_out_base; // gathered by the dispatcher (non-identity)
|
||||
+ }
|
||||
+
|
||||
//const int64_t rq1 = nev1 / neq1;
|
||||
//const int64_t rk1 = nev1 / nek1;
|
||||
const int64_t rq3 = nev3 / neq3;
|
||||
@@ -10777,6 +10797,33 @@ static void ggml_compute_forward_gated_delta_net_f32(
|
||||
|
||||
if (ith == 0) {
|
||||
ggml_threadpool_chunk_set(params->threadpool, nth);
|
||||
+
|
||||
+ // Step 2: non-identity ids fallback -- serially gather each sequence's prior state from
|
||||
+ // cache[ids[seq]] into the (otherwise unused) output-state scratch region before the parallel
|
||||
+ // recurrence, so the in-place write never aliases another sequence's read.
|
||||
+ ggml_tensor * src_ids = dst->src[7];
|
||||
+ if (src_ids != nullptr) {
|
||||
+ const ggml_tensor * src_state = dst->src[5];
|
||||
+ const int64_t S_v = V->ne[0];
|
||||
+ const int64_t H = V->ne[1];
|
||||
+ const int64_t n_tokens = V->ne[2];
|
||||
+ const int64_t n_seqs = V->ne[3];
|
||||
+ const int64_t D = S_v * S_v * H;
|
||||
+ const int32_t rs_head = ggml_get_op_params_i32(dst, 1);
|
||||
+ const int32_t * ids = (const int32_t *) src_ids->data;
|
||||
+ bool identity = true;
|
||||
+ for (int64_t s = 0; s < n_seqs; ++s) {
|
||||
+ if (ids[s] != rs_head + (int32_t) s) { identity = false; break; }
|
||||
+ }
|
||||
+ if (!identity) {
|
||||
+ const int64_t attn_score_elems = S_v * H * n_tokens * n_seqs;
|
||||
+ const float * cache = (const float *) src_state->data;
|
||||
+ float * scratch = (float *) dst->data + attn_score_elems;
|
||||
+ for (int64_t s = 0; s < n_seqs; ++s) {
|
||||
+ memcpy(scratch + s * D, cache + (int64_t) ids[s] * D, D * sizeof(float));
|
||||
+ }
|
||||
+ }
|
||||
+ }
|
||||
}
|
||||
|
||||
ggml_barrier(params->threadpool);
|
||||
diff --git a/ggml/src/ggml-cuda/gated_delta_net.cu b/ggml/src/ggml-cuda/gated_delta_net.cu
|
||||
index 61a2b91..86d5e2a 100644
|
||||
--- a/ggml/src/ggml-cuda/gated_delta_net.cu
|
||||
+++ b/ggml/src/ggml-cuda/gated_delta_net.cu
|
||||
@@ -1,6 +1,34 @@
|
||||
#include "gated_delta_net.cuh"
|
||||
#include "ggml-cuda/common.cuh"
|
||||
|
||||
+// Step 2: gather only the NON-identity sequences' prior recurrent state from the full cache into a
|
||||
+// disjoint scratch buffer. Identity sequences (ids[s] == rs_head + s) are read in place from the
|
||||
+// destination slot by the recurrence kernel and are skipped here. One block per sequence.
|
||||
+__global__ void gdn_gather_nonident_kernel(const float * cache, const int32_t * ids, int rs_head,
|
||||
+ float * scratch, int64_t D, int n_seqs) {
|
||||
+ const int s = blockIdx.x;
|
||||
+ if (s >= n_seqs) {
|
||||
+ return;
|
||||
+ }
|
||||
+ const int r = ids[s];
|
||||
+ if (r == rs_head + s) {
|
||||
+ return; // identity: prior state already lives in the in-place destination slot
|
||||
+ }
|
||||
+ const float * src = cache + (int64_t) r * D;
|
||||
+ float * dst = scratch + (int64_t) s * D;
|
||||
+ for (int64_t i = threadIdx.x; i < D; i += blockDim.x) {
|
||||
+ dst[i] = src[i];
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+static void ggml_cuda_gdn_gather_nonident(const float * cache, const int32_t * ids, int rs_head,
|
||||
+ float * scratch, int64_t D, int64_t n_seqs, cudaStream_t stream) {
|
||||
+ if (n_seqs <= 0) {
|
||||
+ return;
|
||||
+ }
|
||||
+ gdn_gather_nonident_kernel<<<(unsigned) n_seqs, 256, 0, stream>>>(cache, ids, rs_head, scratch, D, (int) n_seqs);
|
||||
+}
|
||||
+
|
||||
template <int S_v, bool KDA, bool keep_rs_t>
|
||||
__global__ void __launch_bounds__((ggml_cuda_get_physical_warp_size() < S_v ? ggml_cuda_get_physical_warp_size() : S_v) * 4, 2)
|
||||
gated_delta_net_cuda(const float * q,
|
||||
@@ -26,7 +54,9 @@ gated_delta_net_cuda(const float * q,
|
||||
const uint3 rq3_magic,
|
||||
float scale,
|
||||
int K,
|
||||
- float * state_dst) {
|
||||
+ float * state_dst,
|
||||
+ const int32_t * ids,
|
||||
+ int rs_head) {
|
||||
const uint32_t h_idx = blockIdx.x;
|
||||
const uint32_t sequence = blockIdx.y;
|
||||
// each warp owns one column, using warp-level primitives to reduce across rows
|
||||
@@ -48,7 +78,15 @@ gated_delta_net_cuda(const float * q,
|
||||
const int64_t state_in_offset = sequence * H * S_v * S_v + h_idx * S_v * S_v;
|
||||
const int64_t state_out_offset = (sequence * H + h_idx) * S_v * S_v;
|
||||
state += state_out_offset;
|
||||
- curr_state += state_in_offset + col * S_v;
|
||||
+ // Step 2: select the prior-state read base per sequence. For the ids variant, identity
|
||||
+ // sequences (ids[seq] == rs_head + seq) read s0 directly from the in-place destination slot
|
||||
+ // state_dst (no materialization); non-identity sequences read from the pre-gathered scratch
|
||||
+ // (curr_state). state_in_offset == state_out_offset, so both bases use the same per-(seq,head)
|
||||
+ // offset. The whole s0 is loaded into registers before the new state is written, so reading and
|
||||
+ // writing the same slot per block (identity) is race-free.
|
||||
+ const float * read_state = (ids != nullptr && ids[sequence] == rs_head + (int) sequence)
|
||||
+ ? state_dst : curr_state;
|
||||
+ read_state += state_in_offset + col * S_v;
|
||||
attn_data += (sequence * n_tokens * H + h_idx) * S_v;
|
||||
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size() < S_v ? ggml_cuda_get_physical_warp_size() : S_v;
|
||||
@@ -61,7 +99,7 @@ gated_delta_net_cuda(const float * q,
|
||||
#pragma unroll
|
||||
for (int r = 0; r < rows_per_lane; r++) {
|
||||
const int i = r * warp_size + lane;
|
||||
- s_shard[r] = curr_state[i];
|
||||
+ s_shard[r] = read_state[i];
|
||||
}
|
||||
|
||||
for (int t = 0; t < n_tokens; t++) {
|
||||
@@ -176,6 +214,7 @@ static void launch_gated_delta_net(
|
||||
const float * q_d, const float * k_d, const float * v_d,
|
||||
const float * g_d, const float * b_d, const float * s_d,
|
||||
float * dst_d, float * state_dst_d,
|
||||
+ const int32_t * ids_d, int rs_head,
|
||||
int64_t S_v, int64_t H, int64_t n_tokens, int64_t n_seqs,
|
||||
int64_t sq1, int64_t sq2, int64_t sq3,
|
||||
int64_t sv1, int64_t sv2, int64_t sv3,
|
||||
@@ -199,26 +238,26 @@ static void launch_gated_delta_net(
|
||||
ggml_cuda_kernel_launch(gated_delta_net_cuda<16, KDA, keep_rs_t>, launch_params,
|
||||
q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H,
|
||||
n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
|
||||
- sb1, sb2, sb3, neqk1_magic, rq3_magic, scale, K, state_dst_d);
|
||||
+ sb1, sb2, sb3, neqk1_magic, rq3_magic, scale, K, state_dst_d, ids_d, rs_head);
|
||||
break;
|
||||
case 32:
|
||||
ggml_cuda_kernel_launch(gated_delta_net_cuda<32, KDA, keep_rs_t>, launch_params,
|
||||
q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H,
|
||||
n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
|
||||
- sb1, sb2, sb3, neqk1_magic, rq3_magic, scale, K, state_dst_d);
|
||||
+ sb1, sb2, sb3, neqk1_magic, rq3_magic, scale, K, state_dst_d, ids_d, rs_head);
|
||||
break;
|
||||
case 64: {
|
||||
ggml_cuda_kernel_launch(gated_delta_net_cuda<64, KDA, keep_rs_t>, launch_params,
|
||||
q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H,
|
||||
n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
|
||||
- sb1, sb2, sb3, neqk1_magic, rq3_magic, scale, K, state_dst_d);
|
||||
+ sb1, sb2, sb3, neqk1_magic, rq3_magic, scale, K, state_dst_d, ids_d, rs_head);
|
||||
break;
|
||||
}
|
||||
case 128: {
|
||||
ggml_cuda_kernel_launch(gated_delta_net_cuda<128, KDA, keep_rs_t>, launch_params,
|
||||
q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H,
|
||||
n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
|
||||
- sb1, sb2, sb3, neqk1_magic, rq3_magic, scale, K, state_dst_d);
|
||||
+ sb1, sb2, sb3, neqk1_magic, rq3_magic, scale, K, state_dst_d, ids_d, rs_head);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
@@ -262,7 +301,6 @@ void ggml_cuda_op_gated_delta_net(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||
const float * g_d = (const float *) src_g->data;
|
||||
const float * b_d = (const float *) src_beta->data;
|
||||
|
||||
- const float * s_d = (const float *) src_state->data;
|
||||
float * dst_d = (float *) dst->data;
|
||||
|
||||
float * state_dst_d = nullptr;
|
||||
@@ -274,6 +312,29 @@ void ggml_cuda_op_gated_delta_net(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||
state_dst_d = (float *) src_state_dst->data;
|
||||
}
|
||||
|
||||
+ // Step 2: fused recurrent-state gather (src[7] = ids == s_copy). Read the prior state directly
|
||||
+ // from the full cache via ids instead of from a materialized ggml_get_rows gather. The recurrence
|
||||
+ // kernel reads identity sequences (ids[seq] == rs_head + seq) in place from state_dst (no
|
||||
+ // materialization at all); any non-identity sequence (reorder / rs_zero remap) is gathered here
|
||||
+ // into a disjoint scratch that the kernel reads instead. The gather writes a disjoint buffer and
|
||||
+ // the recurrence never reads a slot another block writes, so it is race-free and bit-identical to
|
||||
+ // the get_rows path. ids stays a DEVICE pointer (dereferenced only inside the kernels).
|
||||
+ ggml_tensor * src_ids = dst->src[7];
|
||||
+ const float * s_d = (const float *) src_state->data;
|
||||
+ const int32_t * ids_d = nullptr;
|
||||
+ int rs_head = 0;
|
||||
+ ggml_cuda_pool_alloc<float> ids_state_scratch(ctx.pool());
|
||||
+ if (src_ids != nullptr) {
|
||||
+ GGML_ASSERT(state_dst_d != nullptr);
|
||||
+ GGML_ASSERT(src_ids->type == GGML_TYPE_I32);
|
||||
+ rs_head = ggml_get_op_params_i32(dst, 1);
|
||||
+ ids_d = (const int32_t *) src_ids->data;
|
||||
+ const int64_t D = S_v * S_v * H;
|
||||
+ float * scratch = ids_state_scratch.alloc((size_t) D * n_seqs);
|
||||
+ ggml_cuda_gdn_gather_nonident(s_d, ids_d, rs_head, scratch, D, n_seqs, ctx.stream());
|
||||
+ s_d = scratch;
|
||||
+ }
|
||||
+
|
||||
GGML_ASSERT(ggml_is_contiguous_rows(src_q));
|
||||
GGML_ASSERT(ggml_is_contiguous_rows(src_k));
|
||||
GGML_ASSERT(ggml_is_contiguous_rows(src_v));
|
||||
@@ -307,21 +368,21 @@ void ggml_cuda_op_gated_delta_net(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||
|
||||
if (kda) {
|
||||
if (keep_rs) {
|
||||
- launch_gated_delta_net<true, true>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d, state_dst_d,
|
||||
+ launch_gated_delta_net<true, true>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d, state_dst_d, ids_d, rs_head,
|
||||
S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
|
||||
sb1, sb2, sb3, neqk1, rq3, scale, K, stream);
|
||||
} else {
|
||||
- launch_gated_delta_net<true, false>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d, state_dst_d,
|
||||
+ launch_gated_delta_net<true, false>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d, state_dst_d, ids_d, rs_head,
|
||||
S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
|
||||
sb1, sb2, sb3, neqk1, rq3, scale, K, stream);
|
||||
}
|
||||
} else {
|
||||
if (keep_rs) {
|
||||
- launch_gated_delta_net<false, true>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d, state_dst_d,
|
||||
+ launch_gated_delta_net<false, true>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d, state_dst_d, ids_d, rs_head,
|
||||
S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
|
||||
sb1, sb2, sb3, neqk1, rq3, scale, K, stream);
|
||||
} else {
|
||||
- launch_gated_delta_net<false, false>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d, state_dst_d,
|
||||
+ launch_gated_delta_net<false, false>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d, state_dst_d, ids_d, rs_head,
|
||||
S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
|
||||
sb1, sb2, sb3, neqk1, rq3, scale, K, stream);
|
||||
}
|
||||
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
|
||||
index b8d34bf..1762037 100644
|
||||
--- a/ggml/src/ggml.c
|
||||
+++ b/ggml/src/ggml.c
|
||||
@@ -6353,6 +6353,82 @@ struct ggml_tensor * ggml_gated_delta_net_inplace(
|
||||
return result;
|
||||
}
|
||||
|
||||
+// ggml_gated_delta_net_inplace_ids
|
||||
+//
|
||||
+// Same recurrence as ggml_gated_delta_net_inplace, but the prior recurrent state is read directly
|
||||
+// from the FULL state cache `state` ([S_v, S_v, H, n_rs_slots]) at cache[ids[seq]] (mirroring
|
||||
+// ggml_ssm_scan's ids source) instead of from a materialized ggml_get_rows gather. `rs_head` is the
|
||||
+// destination base slot, used by the backend to detect the common identity case (ids[s] == rs_head
|
||||
+// + s), where the prior state already lives in the in-place destination slots.
|
||||
+struct ggml_tensor * ggml_gated_delta_net_inplace_ids(
|
||||
+ struct ggml_context * ctx,
|
||||
+ struct ggml_tensor * q,
|
||||
+ struct ggml_tensor * k,
|
||||
+ struct ggml_tensor * v,
|
||||
+ struct ggml_tensor * g,
|
||||
+ struct ggml_tensor * beta,
|
||||
+ struct ggml_tensor * state,
|
||||
+ struct ggml_tensor * state_dst,
|
||||
+ struct ggml_tensor * ids,
|
||||
+ int rs_head) {
|
||||
+ GGML_ASSERT(ggml_is_contiguous_rows(q));
|
||||
+ GGML_ASSERT(ggml_is_contiguous_rows(k));
|
||||
+ GGML_ASSERT(ggml_is_contiguous_rows(v));
|
||||
+ GGML_ASSERT(ggml_is_contiguous(g));
|
||||
+ GGML_ASSERT(ggml_is_contiguous(beta));
|
||||
+ GGML_ASSERT(ggml_is_contiguous(state));
|
||||
+
|
||||
+ GGML_ASSERT(q->type == GGML_TYPE_F32);
|
||||
+ GGML_ASSERT(k->type == GGML_TYPE_F32);
|
||||
+ GGML_ASSERT(v->type == GGML_TYPE_F32);
|
||||
+ GGML_ASSERT(g->type == GGML_TYPE_F32);
|
||||
+ GGML_ASSERT(beta->type == GGML_TYPE_F32);
|
||||
+ GGML_ASSERT(state->type == GGML_TYPE_F32);
|
||||
+ GGML_ASSERT(state_dst != NULL && state_dst->type == GGML_TYPE_F32);
|
||||
+ GGML_ASSERT(ids != NULL && ids->type == GGML_TYPE_I32);
|
||||
+
|
||||
+ const int64_t S_v = v->ne[0];
|
||||
+ const int64_t H = v->ne[1];
|
||||
+ const int64_t n_tokens = v->ne[2];
|
||||
+ const int64_t n_seqs = v->ne[3];
|
||||
+
|
||||
+ GGML_ASSERT(g->ne[0] == 1 || g->ne[0] == S_v);
|
||||
+ GGML_ASSERT(beta->ne[0] == 1);
|
||||
+
|
||||
+ // state is the FULL recurrent-state cache: [S_v, S_v, H, n_rs_slots], n_rs_slots >= n_seqs
|
||||
+ GGML_ASSERT(state->ne[0] == S_v);
|
||||
+ GGML_ASSERT(state->ne[1] == S_v);
|
||||
+ GGML_ASSERT(state->ne[2] == H);
|
||||
+ GGML_ASSERT(state->ne[3] >= n_seqs);
|
||||
+
|
||||
+ // state_dst holds the per-seq final state contiguously: [S_v*S_v*H, >= n_seqs]
|
||||
+ GGML_ASSERT(state_dst->ne[0] == S_v * S_v * H);
|
||||
+ GGML_ASSERT(state_dst->ne[1] >= n_seqs);
|
||||
+ GGML_ASSERT(state_dst->nb[0] == sizeof(float));
|
||||
+
|
||||
+ // ids: per-seq source slot into the full cache (s_copy_main)
|
||||
+ GGML_ASSERT(ids->ne[0] >= n_seqs);
|
||||
+
|
||||
+ const int64_t state_rows = S_v * n_seqs; // K == 1
|
||||
+ const int64_t ne[4] = { S_v * H, n_tokens * n_seqs + state_rows, 1, 1 };
|
||||
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
|
||||
+
|
||||
+ ggml_set_op_params_i32(result, 0, 1); // K == 1
|
||||
+ ggml_set_op_params_i32(result, 1, rs_head); // destination base slot (for the ids identity check)
|
||||
+
|
||||
+ result->op = GGML_OP_GATED_DELTA_NET;
|
||||
+ result->src[0] = q;
|
||||
+ result->src[1] = k;
|
||||
+ result->src[2] = v;
|
||||
+ result->src[3] = g;
|
||||
+ result->src[4] = beta;
|
||||
+ result->src[5] = state; // FULL cache (read via ids)
|
||||
+ result->src[6] = state_dst; // in-place final-state write-back target
|
||||
+ result->src[7] = ids; // per-seq source slots (s_copy)
|
||||
+
|
||||
+ return result;
|
||||
+}
|
||||
+
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
struct ggml_hash_set ggml_hash_set_new(size_t size) {
|
||||
diff --git a/src/models/delta-net-base.cpp b/src/models/delta-net-base.cpp
|
||||
index 26a718b..194e611 100644
|
||||
--- a/src/models/delta-net-base.cpp
|
||||
+++ b/src/models/delta-net-base.cpp
|
||||
@@ -524,6 +524,69 @@ ggml_tensor * llm_build_delta_net_base::build_conv_state(
|
||||
return conv_input;
|
||||
}
|
||||
|
||||
+// Step 2: gather-free recurrent attention. Mirrors mamba-base's get_ssm_rows pattern: the fused
|
||||
+// gated-DeltaNet op reads each sequence's prior state directly from the full cache via the s_copy
|
||||
+// ids (no ggml_get_rows materialization) and writes the new state in place (Step 1). The non-fused
|
||||
+// and rollback paths fall back to materializing the prior state and delegating below.
|
||||
+ggml_tensor * llm_build_delta_net_base::build_recurrent_attn(
|
||||
+ llm_graph_input_rs * inp,
|
||||
+ ggml_tensor * ssm_states_all,
|
||||
+ ggml_tensor * q,
|
||||
+ ggml_tensor * k,
|
||||
+ ggml_tensor * v,
|
||||
+ ggml_tensor * g,
|
||||
+ ggml_tensor * b,
|
||||
+ int il) {
|
||||
+ const auto * mctx_cur = inp->mctx;
|
||||
+ const auto kv_head = mctx_cur->get_head();
|
||||
+
|
||||
+ const int64_t S_v = v->ne[0];
|
||||
+ const int64_t H_v = v->ne[1];
|
||||
+ const int64_t n_seqs = v->ne[3];
|
||||
+ const int64_t n_seq_tokens = q->ne[2];
|
||||
+
|
||||
+ const bool keep = cparams.n_rs_seq > 0;
|
||||
+ const bool fused = (n_seq_tokens == 1) ? cparams.fused_gdn_ar : cparams.fused_gdn_ch;
|
||||
+
|
||||
+ if (!keep && fused) {
|
||||
+ // build_rs feeds the FULL state cache + the s_copy ids into the op (via the get_state_rows
|
||||
+ // lambda, exactly like mamba-base's ggml_ssm_scan) and still performs the rs_zero clear and
|
||||
+ // the extra-states copy around it. The op reads curr_state from cache[ids[seq]] and writes
|
||||
+ // the final state in place at kv_head; no recurrent-state materialization at all.
|
||||
+ auto get_state_op = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) -> ggml_tensor * {
|
||||
+ ggml_tensor * cache4d = ggml_reshape_4d(ctx, states, S_v, S_v, H_v, states->ne[1]);
|
||||
+ ggml_tensor * state_dst = ggml_view_2d(ctx, ssm_states_all, hparams.n_embd_s(), n_seqs,
|
||||
+ ssm_states_all->nb[1], kv_head * hparams.n_embd_s() * ggml_element_size(ssm_states_all));
|
||||
+ return ggml_gated_delta_net_inplace_ids(ctx, q, k, v, g, b, cache4d, state_dst, ids, (int) kv_head);
|
||||
+ };
|
||||
+
|
||||
+ ggml_tensor * result = build_rs(inp, ssm_states_all, hparams.n_embd_s(), n_seqs, get_state_op);
|
||||
+ if (n_seq_tokens == 1) {
|
||||
+ cb(result, LLAMA_TENSOR_NAME_FGDN_AR, il);
|
||||
+ } else {
|
||||
+ cb(result, LLAMA_TENSOR_NAME_FGDN_CH, il);
|
||||
+ }
|
||||
+
|
||||
+ ggml_tensor * output = ggml_view_4d(ctx0, result,
|
||||
+ S_v, H_v, n_seq_tokens, n_seqs,
|
||||
+ ggml_row_size(result->type, S_v),
|
||||
+ ggml_row_size(result->type, S_v * H_v),
|
||||
+ ggml_row_size(result->type, S_v * H_v * n_seq_tokens), 0);
|
||||
+ cb(output, "attn_output", il);
|
||||
+
|
||||
+ // the state write is a side effect of the op; pull the op into the graph via the output
|
||||
+ ggml_build_forward_expand(gf, output);
|
||||
+
|
||||
+ return output;
|
||||
+ }
|
||||
+
|
||||
+ // non-fused / rollback: materialize the prior state via gather and delegate to the
|
||||
+ // state-taking overload (its fused !keep branch performs the Step-1 in-place write).
|
||||
+ ggml_tensor * s = build_rs(inp, ssm_states_all, hparams.n_embd_s(), n_seqs);
|
||||
+ s = ggml_reshape_4d(ctx0, s, S_v, S_v, H_v, n_seqs);
|
||||
+ return build_recurrent_attn(inp, ssm_states_all, q, k, v, g, b, s, il);
|
||||
+}
|
||||
+
|
||||
ggml_tensor * llm_build_delta_net_base::build_recurrent_attn(
|
||||
llm_graph_input_rs * inp,
|
||||
ggml_tensor * ssm_states_all,
|
||||
diff --git a/src/models/models.h b/src/models/models.h
|
||||
index 2ac8415..98b89e9 100644
|
||||
--- a/src/models/models.h
|
||||
+++ b/src/models/models.h
|
||||
@@ -88,6 +88,19 @@ struct llm_build_delta_net_base : public llm_graph_context {
|
||||
ggml_tensor * b,
|
||||
ggml_tensor * s,
|
||||
int il);
|
||||
+
|
||||
+ // Step 2: gather-free variant. Reads the prior recurrent state directly from the full cache via
|
||||
+ // the s_copy ids (no ggml_get_rows materialization) on the fused decode/prefill path, and
|
||||
+ // delegates to the state-taking overload for the non-fused and rollback paths.
|
||||
+ ggml_tensor * build_recurrent_attn(
|
||||
+ llm_graph_input_rs * inp,
|
||||
+ ggml_tensor * ssm_states_all,
|
||||
+ ggml_tensor * q,
|
||||
+ ggml_tensor * k,
|
||||
+ ggml_tensor * v,
|
||||
+ ggml_tensor * g,
|
||||
+ ggml_tensor * b,
|
||||
+ int il);
|
||||
};
|
||||
|
||||
struct llm_build_rwkv6_base : public llm_graph_context {
|
||||
diff --git a/src/models/qwen35.cpp b/src/models/qwen35.cpp
|
||||
index 6783d98..0be3247 100644
|
||||
--- a/src/models/qwen35.cpp
|
||||
+++ b/src/models/qwen35.cpp
|
||||
@@ -385,10 +385,6 @@ ggml_tensor * llama_model_qwen35::graph::build_layer_attn_linear(
|
||||
|
||||
ggml_tensor * conv_input = build_conv_state(inp, conv_states_all, qkv_mixed, conv_kernel_size, conv_channels, il);
|
||||
|
||||
- ggml_tensor * state = build_rs(inp, ssm_states_all, hparams.n_embd_s(), n_seqs);
|
||||
- state = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim, num_v_heads, n_seqs);
|
||||
- cb(state, "state_predelta", il);
|
||||
-
|
||||
ggml_tensor * conv_output_proper = ggml_ssm_conv(ctx0, conv_input, conv_kernel);
|
||||
cb(conv_output_proper, "conv_output_raw", il);
|
||||
|
||||
@@ -445,7 +441,7 @@ ggml_tensor * llama_model_qwen35::graph::build_layer_attn_linear(
|
||||
cb(k_conv, "k_conv_predelta", il);
|
||||
cb(v_conv, "v_conv_predelta", il);
|
||||
|
||||
- ggml_tensor * output = build_recurrent_attn(inp, ssm_states_all, q_conv, k_conv, v_conv, gate, beta, state, il);
|
||||
+ ggml_tensor * output = build_recurrent_attn(inp, ssm_states_all, q_conv, k_conv, v_conv, gate, beta, il);
|
||||
|
||||
// z: [head_dim, n_heads, n_tokens, n_seqs] -> [n_heads * n_tokens * n_seqs, head_dim]
|
||||
ggml_tensor * z_2d = ggml_reshape_4d(ctx0, z, head_v_dim, num_v_heads, n_seq_tokens, n_seqs);
|
||||
diff --git a/src/models/qwen35moe.cpp b/src/models/qwen35moe.cpp
|
||||
index eb5e9a4..2995f04 100644
|
||||
--- a/src/models/qwen35moe.cpp
|
||||
+++ b/src/models/qwen35moe.cpp
|
||||
@@ -409,10 +409,6 @@ ggml_tensor * llama_model_qwen35moe::graph::build_layer_attn_linear(
|
||||
|
||||
ggml_tensor * conv_input = build_conv_state(inp, conv_states_all, qkv_mixed, conv_kernel_size, conv_channels, il);
|
||||
|
||||
- ggml_tensor * state = build_rs(inp, ssm_states_all, hparams.n_embd_s(), n_seqs);
|
||||
- state = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim, num_v_heads, n_seqs);
|
||||
- cb(state, "state_predelta", il);
|
||||
-
|
||||
ggml_tensor * conv_output_proper = ggml_ssm_conv(ctx0, conv_input, conv_kernel);
|
||||
cb(conv_output_proper, "conv_output_raw", il);
|
||||
|
||||
@@ -469,7 +465,7 @@ ggml_tensor * llama_model_qwen35moe::graph::build_layer_attn_linear(
|
||||
cb(k_conv, "k_conv_predelta", il);
|
||||
cb(v_conv, "v_conv_predelta", il);
|
||||
|
||||
- ggml_tensor * output = build_recurrent_attn(inp, ssm_states_all, q_conv, k_conv, v_conv, gate, beta, state, il);
|
||||
+ ggml_tensor * output = build_recurrent_attn(inp, ssm_states_all, q_conv, k_conv, v_conv, gate, beta, il);
|
||||
|
||||
// z: [head_dim, n_heads, n_tokens, n_seqs] -> [n_heads * n_tokens * n_seqs, head_dim]
|
||||
ggml_tensor * z_2d = ggml_reshape_4d(ctx0, z, head_v_dim, num_v_heads, n_seq_tokens, n_seqs);
|
||||
--
|
||||
2.43.0
|
||||
|
||||
@@ -0,0 +1,140 @@
|
||||
From df1cc97b68df048834ab735c944b71c3a2e8737e Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Thu, 25 Jun 2026 12:40:49 +0200
|
||||
Subject: [PATCH] feat(paged): qwen35 gated-DeltaNet o_proj MMVQ->MMQ reshape
|
||||
(patch 0020)
|
||||
|
||||
Lever 1, the single biggest decode-parity lever for the Qwen3.6 hybrid-SSM
|
||||
models (arch qwen35: 48 gated-DeltaNet + 16 full-attention layers). Post-SSM
|
||||
(patches 0018 + 0019) dense decode sat at 255 t/s = 65% of vLLM 391; profiling
|
||||
both engines pinned the largest llama-specific overage to the gated-DeltaNet
|
||||
OUTPUT projection (ssm_out).
|
||||
|
||||
The GDN op left its output in SSM layout and the graph reshaped it to 3D
|
||||
[value_dim, n_seq_tokens=1, n_seqs=128] before the ssm_out matmul, so
|
||||
src1->ne[1]=1. That trips the ggml-cuda MMVQ dispatch (ne[1] <= 8) with the 128
|
||||
sequences stuck in ne[2]; MMVQ is built for batch <= 8 and does not amortize the
|
||||
ssm_out weight read across the 128 sequences (one 5120x128 grid, 48 calls/step,
|
||||
the 40%-vs-62% GPU-utilization gap). vLLM packs the same projection into one
|
||||
M=128 GEMM. The in-projection was already 2D -> MMQ; only the output was 3D.
|
||||
|
||||
The fix collapses the GDN output to 2D [value_dim, n_seq_tokens * n_seqs]
|
||||
(= [6144, 128] at decode) before the ssm_out ggml_mul_mat, so src1->ne[1]=128
|
||||
routes to the MMQ M=128 tensor-core GEMM (which amortizes the weight read across
|
||||
all 128 tokens). The result is then already 2D, so the redundant post-matmul
|
||||
reshape_2d is dropped. Same contiguous data, just a 2D vs 3D view: bit-identical.
|
||||
Gated to the gated-DeltaNet path (qwen35 / qwen35moe / qwen3next); other archs
|
||||
untouched.
|
||||
|
||||
Bit-identical greedy (--temp 0 --seed 1) vs the post-SSM baseline on both
|
||||
q36-27b-nvfp4 (dense) and q36-35b-a3b-nvfp4 (MoE), byte/md5-identical.
|
||||
test-backend-ops MUL_MAT and MUL_MAT_ID OK.
|
||||
|
||||
decode_agg S_TG (llama-batched-bench, -fa on, npp128 ntg128, npl 32/128):
|
||||
dense q36-27b: 170.52 / 254.92 -> 200.00 / 335.80 t/s (+17.3% / +31.7%)
|
||||
MoE q36-35b-a3b: 373.28 / 560.66 -> 420.77 / 691.24 t/s (+12.7% / +23.3%)
|
||||
Dense @128 = 335.80 t/s = 85.9% of vLLM 391 (up from 65%; target 82-85% hit).
|
||||
|
||||
nsys: the o_proj mul_mat_vec_q<NVFP4,m=1> bucket (132.8 ms / 48 inst) collapses
|
||||
to zero; mul_mat_q<NVFP4,m=128> absorbs it (+1200 inst, +363 ms) with a LOWER
|
||||
per-call average (620.8 -> 582.7 us). Realized o_proj-as-MMQ cost ~0.30 ms/call
|
||||
vs 2.77 ms/call for the old GEMV.
|
||||
|
||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
||||
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
|
||||
---
|
||||
src/models/qwen35.cpp | 13 ++++---
|
||||
src/models/qwen35moe.cpp | 13 ++++---
|
||||
src/models/qwen3next.cpp | 13 ++++---
|
||||
3 files changed, 21 insertions(+), 18 deletions(-)
|
||||
|
||||
diff --git a/src/models/qwen35.cpp b/src/models/qwen35.cpp
|
||||
index 0be3247..0874c43 100644
|
||||
--- a/src/models/qwen35.cpp
|
||||
+++ b/src/models/qwen35.cpp
|
||||
@@ -449,17 +449,18 @@ ggml_tensor * llama_model_qwen35::graph::build_layer_attn_linear(
|
||||
// Apply gated normalization: self.norm(core_attn_out, z)
|
||||
ggml_tensor * attn_out_norm = build_norm_gated(output, model.layers[il].ssm_norm, z_2d, il);
|
||||
|
||||
- // Final reshape: [head_dim, n_heads, n_tokens, n_seqs] -> [n_tokens, n_seqs, n_heads * head_dim]
|
||||
- ggml_tensor * final_output = ggml_reshape_3d(ctx0, attn_out_norm, head_v_dim * num_v_heads, n_seq_tokens, n_seqs);
|
||||
+ // Lever 1: collapse the gated-DeltaNet output to 2D [value_dim, n_seq_tokens * n_seqs] so the
|
||||
+ // ssm_out projection runs as an M = n_seq_tokens*n_seqs MMQ tensor-core GEMM. The prior
|
||||
+ // reshape_3d to [value_dim, 1, n_seqs] left src1->ne[1]=1, routing decode to the batch-1 MMVQ
|
||||
+ // GEMV which does not amortize the ssm_out weight read across the sequences. Same contiguous
|
||||
+ // data, just a 2D vs 3D view, so the result is bit-identical.
|
||||
+ ggml_tensor * final_output = ggml_reshape_2d(ctx0, attn_out_norm, head_v_dim * num_v_heads, n_seq_tokens * n_seqs);
|
||||
cb(final_output, "final_output", il);
|
||||
|
||||
- // Output projection
|
||||
+ // Output projection (output is already 2D [n_embd, n_seq_tokens * n_seqs])
|
||||
cur = build_lora_mm(model.layers[il].ssm_out, final_output, model.layers[il].ssm_out_s);
|
||||
cb(cur, "linear_attn_out", il);
|
||||
|
||||
- // Reshape back to original dimensions
|
||||
- cur = ggml_reshape_2d(ctx0, cur, n_embd, n_seq_tokens * n_seqs);
|
||||
-
|
||||
return cur;
|
||||
}
|
||||
|
||||
diff --git a/src/models/qwen35moe.cpp b/src/models/qwen35moe.cpp
|
||||
index 2995f04..1f6f643 100644
|
||||
--- a/src/models/qwen35moe.cpp
|
||||
+++ b/src/models/qwen35moe.cpp
|
||||
@@ -473,17 +473,18 @@ ggml_tensor * llama_model_qwen35moe::graph::build_layer_attn_linear(
|
||||
// Apply gated normalization: self.norm(core_attn_out, z)
|
||||
ggml_tensor * attn_out_norm = build_norm_gated(output, model.layers[il].ssm_norm, z_2d, il);
|
||||
|
||||
- // Final reshape: [head_dim, n_heads, n_tokens, n_seqs] -> [n_tokens, n_seqs, n_heads * head_dim]
|
||||
- ggml_tensor * final_output = ggml_reshape_3d(ctx0, attn_out_norm, head_v_dim * num_v_heads, n_seq_tokens, n_seqs);
|
||||
+ // Lever 1: collapse the gated-DeltaNet output to 2D [value_dim, n_seq_tokens * n_seqs] so the
|
||||
+ // ssm_out projection runs as an M = n_seq_tokens*n_seqs MMQ tensor-core GEMM. The prior
|
||||
+ // reshape_3d to [value_dim, 1, n_seqs] left src1->ne[1]=1, routing decode to the batch-1 MMVQ
|
||||
+ // GEMV which does not amortize the ssm_out weight read across the sequences. Same contiguous
|
||||
+ // data, just a 2D vs 3D view, so the result is bit-identical.
|
||||
+ ggml_tensor * final_output = ggml_reshape_2d(ctx0, attn_out_norm, head_v_dim * num_v_heads, n_seq_tokens * n_seqs);
|
||||
cb(final_output, "final_output", il);
|
||||
|
||||
- // Output projection
|
||||
+ // Output projection (output is already 2D [n_embd, n_seq_tokens * n_seqs])
|
||||
cur = build_lora_mm(model.layers[il].ssm_out, final_output, model.layers[il].ssm_out_s);
|
||||
cb(cur, "linear_attn_out", il);
|
||||
|
||||
- // Reshape back to original dimensions
|
||||
- cur = ggml_reshape_2d(ctx0, cur, n_embd, n_seq_tokens * n_seqs);
|
||||
-
|
||||
return cur;
|
||||
}
|
||||
|
||||
diff --git a/src/models/qwen3next.cpp b/src/models/qwen3next.cpp
|
||||
index 97200a4..bfdf026 100644
|
||||
--- a/src/models/qwen3next.cpp
|
||||
+++ b/src/models/qwen3next.cpp
|
||||
@@ -519,17 +519,18 @@ ggml_tensor * llama_model_qwen3next::graph::build_layer_attn_linear(
|
||||
// Apply gated normalization: self.norm(core_attn_out, z)
|
||||
ggml_tensor * attn_out_norm = build_norm_gated(output, model.layers[il].ssm_norm, z_2d, il);
|
||||
|
||||
- // Final reshape: [head_dim, n_heads, n_tokens, n_seqs] -> [n_tokens, n_seqs, n_heads * head_dim]
|
||||
- ggml_tensor * final_output = ggml_reshape_3d(ctx0, attn_out_norm, head_v_dim * num_v_heads, n_seq_tokens, n_seqs);
|
||||
+ // Lever 1: collapse the gated-DeltaNet output to 2D [value_dim, n_seq_tokens * n_seqs] so the
|
||||
+ // ssm_out projection runs as an M = n_seq_tokens*n_seqs MMQ tensor-core GEMM. The prior
|
||||
+ // reshape_3d to [value_dim, 1, n_seqs] left src1->ne[1]=1, routing decode to the batch-1 MMVQ
|
||||
+ // GEMV which does not amortize the ssm_out weight read across the sequences. Same contiguous
|
||||
+ // data, just a 2D vs 3D view, so the result is bit-identical.
|
||||
+ ggml_tensor * final_output = ggml_reshape_2d(ctx0, attn_out_norm, head_v_dim * num_v_heads, n_seq_tokens * n_seqs);
|
||||
cb(final_output, "final_output", il);
|
||||
|
||||
- // Output projection
|
||||
+ // Output projection (output is already 2D [n_embd, n_seq_tokens * n_seqs])
|
||||
cur = build_lora_mm(model.layers[il].ssm_out, final_output);
|
||||
cb(cur, "linear_attn_out", il);
|
||||
|
||||
- // Reshape back to original dimensions
|
||||
- cur = ggml_reshape_2d(ctx0, cur, n_embd, n_seq_tokens * n_seqs);
|
||||
-
|
||||
return cur;
|
||||
}
|
||||
|
||||
--
|
||||
2.43.0
|
||||
|
||||
@@ -0,0 +1,655 @@
|
||||
From 58426b58aaf5431a59d499d513b2fe2d6ab990d8 Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Thu, 25 Jun 2026 18:55:54 +0200
|
||||
Subject: [PATCH] feat(paged): qwen35 decode conv-state in-place fusion (patch
|
||||
0021)
|
||||
|
||||
The no-regret bit-exact conv-state cleanup from the GDN recurrence byte-gate
|
||||
design (point 3). After the recurrence verdict (NO-BUILD: the gated-DeltaNet
|
||||
recurrence is already single-pass at the f32 byte floor), the decode conv path
|
||||
was the only remaining bit-exact lever.
|
||||
|
||||
New fused op ggml_ssm_conv_update_inplace (reuses GGML_OP_SSM_CONV, discriminated
|
||||
by a non-null src[3]). On the single-token decode path it replaces the five-op
|
||||
conv chain - qkv transpose + ggml_concat (concat_cont) + ggml_ssm_conv + ggml_silu
|
||||
+ ggml_cpy of the shifted ring state (cpy_scalar) - with one kernel that, per
|
||||
(channel, sequence), assembles the width-K window in registers from the K-1 cached
|
||||
taps plus the current qkv_mixed token, computes the depthwise conv with the SAME
|
||||
ascending-tap FMA order as ssm_conv_f32 at i==0, folds silu, writes the conv
|
||||
output, and writes the 1-token-shifted ring state back IN PLACE into the conv
|
||||
cache slot at kv_head. This is vLLM causal_conv1d_update; it mirrors the 0018
|
||||
in-place write-back and 0019 patterns. Read source (the build_rs tap gather) and
|
||||
write target (the cache view) are disjoint buffers, so it is race-free by
|
||||
construction with no ids/identity logic.
|
||||
|
||||
- ggml.h/ggml.c: builder (src0=conv_states [K-1,ch,n_seqs], src1=conv_kernel,
|
||||
src2=x_cur [ch,1,n_seqs], src3=conv_state_dst [(K-1)*ch,n_seqs] in-place ring;
|
||||
op_params[0]=fuse_silu)
|
||||
- ggml-cuda/ssm-conv.cu: ssm_conv_update_f32<apply_silu,d_conv> kernel +
|
||||
ggml_cuda_op_ssm_conv_update + src[3]-discriminated branch in ggml_cuda_op_ssm_conv
|
||||
- ggml-cpu/ops.cpp: ggml_compute_forward_ssm_conv_update_f32 (threads over channels)
|
||||
+ branch in ggml_compute_forward_ssm_conv
|
||||
- delta-net-base.cpp/models.h: build_conv_state_fused (keeps the cheap build_rs
|
||||
conv-tap gather; fuses conv+silu+shifted write-back)
|
||||
- qwen35.cpp, qwen35moe.cpp, qwen3next.cpp: route the single-token decode path
|
||||
(n_seq_tokens==1 && n_rs_seq==0 && fused_gdn_ar); prefill/chunked/rollback keep
|
||||
the original chain
|
||||
- tests/test-backend-ops.cpp: test_ssm_conv_update (16 cases) vs the CPU reference
|
||||
|
||||
test-backend-ops: SSM_CONV 45/45, SSM_CONV_UPDATE 16/16, SSM_CONV_BIAS_SILU 90/90.
|
||||
|
||||
Greedy (--temp 0 --seed 1 --ignore-eos -n 256) byte-identical to the Lever-1
|
||||
(0019/0020) baseline: q36-27b-nvfp4 md5 675cd522..., q36-35b-a3b-nvfp4 md5
|
||||
ac163882... both BYTE-IDENTICAL.
|
||||
|
||||
decode_agg S_TG (npp128 ntg128, -fa on, CUDA-graph), same session:
|
||||
dense q36-27b-nvfp4 : npl 32 199.76 -> 202.99 (+1.6%)
|
||||
npl 128 336.35 -> 347.14 (+3.2%, 86.0 -> 88.8 percent of vLLM 391)
|
||||
MoE q36-35b-a3b : npl 32 421.72 -> 432.39 (+2.5%)
|
||||
npl 128 689.74 -> 713.54 (+3.5%)
|
||||
Lift holds in eager too (dense npl128 333.62 -> 342.97). Step -11.9 ms/step
|
||||
(dense npl128: 380.6 -> 368.7). nsys eager decode: concat_cont (1152 calls) and the
|
||||
decode cpy_scalar GONE; ssm_conv_f32 at decode replaced by ssm_conv_update (1152);
|
||||
conv-path ~20.9 -> ~7.6 ms/step. Bit-exact, no regression, de-risks the bf16-state
|
||||
conv-cache plumbing.
|
||||
|
||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
||||
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
|
||||
---
|
||||
ggml/include/ggml.h | 16 +++++
|
||||
ggml/src/ggml-cpu/ops.cpp | 73 ++++++++++++++++++++-
|
||||
ggml/src/ggml-cuda/ssm-conv.cu | 112 +++++++++++++++++++++++++++++++++
|
||||
ggml/src/ggml.c | 54 ++++++++++++++++
|
||||
src/models/delta-net-base.cpp | 51 +++++++++++++++
|
||||
src/models/models.h | 14 +++++
|
||||
src/models/qwen35.cpp | 23 +++++--
|
||||
src/models/qwen35moe.cpp | 23 +++++--
|
||||
src/models/qwen3next.cpp | 29 ++++++---
|
||||
tests/test-backend-ops.cpp | 47 ++++++++++++++
|
||||
10 files changed, 420 insertions(+), 22 deletions(-)
|
||||
|
||||
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
|
||||
index 951dd21..76fa401 100644
|
||||
--- a/ggml/include/ggml.h
|
||||
+++ b/ggml/include/ggml.h
|
||||
@@ -2447,6 +2447,22 @@ extern "C" {
|
||||
struct ggml_tensor * sx,
|
||||
struct ggml_tensor * c);
|
||||
|
||||
+ // Fused decode-time depthwise causal conv1d update (mirrors vLLM causal_conv1d_update). Assembles
|
||||
+ // the width-K conv window in registers from the cached K-1 taps (`conv_states`, [K-1, channels,
|
||||
+ // n_seqs]) plus the single current token (`x_cur`, [channels, 1, n_seqs]), computes the depthwise
|
||||
+ // conv with the SAME ascending-tap FMA order as ggml_ssm_conv, optionally folds SiLU, and writes
|
||||
+ // the 1-token-shifted ring state back IN PLACE into `conv_state_dst` (a [(K-1)*channels, n_seqs]
|
||||
+ // view into the conv-state cache). This eliminates the concat + transpose + scalar copy-back +
|
||||
+ // separate silu of the decode conv path. Output: [channels, 1, n_seqs]. Reuses GGML_OP_SSM_CONV;
|
||||
+ // detected by the backends via a non-null src[3]. n_seq_tokens must be 1 (single-token decode).
|
||||
+ GGML_API struct ggml_tensor * ggml_ssm_conv_update_inplace(
|
||||
+ struct ggml_context * ctx,
|
||||
+ struct ggml_tensor * conv_states,
|
||||
+ struct ggml_tensor * conv_kernel,
|
||||
+ struct ggml_tensor * x_cur,
|
||||
+ struct ggml_tensor * conv_state_dst,
|
||||
+ bool fuse_silu);
|
||||
+
|
||||
GGML_API struct ggml_tensor * ggml_ssm_scan(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * s,
|
||||
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
|
||||
index b6a1976..f9cd850 100644
|
||||
--- a/ggml/src/ggml-cpu/ops.cpp
|
||||
+++ b/ggml/src/ggml-cpu/ops.cpp
|
||||
@@ -9463,13 +9463,84 @@ static void ggml_compute_forward_ssm_conv_f32(
|
||||
}
|
||||
}
|
||||
|
||||
+// Fused decode-time depthwise causal conv1d update (mirror of the CUDA ssm_conv_update_f32). Reads the
|
||||
+// K-1 cached taps (src[0]) and the single new token (src[2]), computes the depthwise conv with the same
|
||||
+// ascending-tap FMA order as ggml_compute_forward_ssm_conv_f32, optionally folds silu, writes the conv
|
||||
+// output to dst, and writes the 1-token-shifted ring state back in place into src[3]. Threads split
|
||||
+// over channels.
|
||||
+static void ggml_compute_forward_ssm_conv_update_f32(
|
||||
+ const ggml_compute_params * params,
|
||||
+ ggml_tensor * dst) {
|
||||
+ const ggml_tensor * conv_states = dst->src[0]; // [K-1, channels, n_seqs]
|
||||
+ const ggml_tensor * conv_kernel = dst->src[1]; // [K, channels]
|
||||
+ const ggml_tensor * x_cur = dst->src[2]; // [channels, 1, n_seqs]
|
||||
+ ggml_tensor * cdst = dst->src[3]; // [(K-1)*channels, n_seqs] in-place ring target
|
||||
+
|
||||
+ const int ith = params->ith;
|
||||
+ const int nth = params->nth;
|
||||
+
|
||||
+ const int64_t d_conv = conv_kernel->ne[0];
|
||||
+ const int64_t channels = conv_kernel->ne[1];
|
||||
+ const int64_t n_seqs = conv_states->ne[2];
|
||||
+ const bool apply_silu = ggml_get_op_params_i32(dst, 0) != 0;
|
||||
+
|
||||
+ GGML_ASSERT(conv_states->nb[0] == sizeof(float));
|
||||
+ GGML_ASSERT(conv_kernel->nb[0] == sizeof(float));
|
||||
+
|
||||
+ const int64_t states_seq_stride = conv_states->nb[2] / sizeof(float);
|
||||
+ const int64_t states_ch_stride = conv_states->nb[1] / sizeof(float);
|
||||
+ const int64_t w_stride = conv_kernel->nb[1] / sizeof(float);
|
||||
+ const int64_t x_seq_stride = x_cur->nb[2] / sizeof(float);
|
||||
+ const int64_t dst_seq_stride = dst->nb[2] / sizeof(float);
|
||||
+ const int64_t cdst_seq_stride = cdst->nb[1] / sizeof(float);
|
||||
+
|
||||
+ const float * states_base = (const float *) conv_states->data;
|
||||
+ const float * w_base = (const float *) conv_kernel->data;
|
||||
+ const float * x_base = (const float *) x_cur->data;
|
||||
+ float * cdst_base = (float *) cdst->data;
|
||||
+ float * dst_base = (float *) dst->data;
|
||||
+
|
||||
+ const int64_t dc = (channels + nth - 1) / nth;
|
||||
+ const int64_t c0 = dc * ith;
|
||||
+ const int64_t c1 = MIN(c0 + dc, channels);
|
||||
+
|
||||
+ for (int64_t s = 0; s < n_seqs; ++s) {
|
||||
+ for (int64_t c = c0; c < c1; ++c) {
|
||||
+ const float * states_c = states_base + s * states_seq_stride + c * states_ch_stride;
|
||||
+ const float * w_c = w_base + c * w_stride;
|
||||
+ const float xc = x_base[s * x_seq_stride + c];
|
||||
+
|
||||
+ // ascending-tap FMA: tap0*w0 + ... + tap_{K-2}*w_{K-2} + xc*w_{K-1} (matches ssm_conv)
|
||||
+ float sumf = 0.0f;
|
||||
+ for (int64_t j = 0; j < d_conv - 1; ++j) {
|
||||
+ sumf += states_c[j] * w_c[j];
|
||||
+ }
|
||||
+ sumf += xc * w_c[d_conv - 1];
|
||||
+ sumf += 0.0f; // matches ssm_conv `sumf += b` with b == 0
|
||||
+
|
||||
+ dst_base[s * dst_seq_stride + c] = apply_silu ? (sumf / (1.0f + expf(-sumf))) : sumf;
|
||||
+
|
||||
+ // 1-token-shifted ring write-back: [tap1 .. tap_{K-2}, xc]
|
||||
+ float * out_state = cdst_base + s * cdst_seq_stride + c * (d_conv - 1);
|
||||
+ for (int64_t j = 0; j < d_conv - 2; ++j) {
|
||||
+ out_state[j] = states_c[j + 1];
|
||||
+ }
|
||||
+ out_state[d_conv - 2] = xc;
|
||||
+ }
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
void ggml_compute_forward_ssm_conv(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst) {
|
||||
switch (dst->src[0]->type) {
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
- ggml_compute_forward_ssm_conv_f32(params, dst);
|
||||
+ if (dst->src[3] != nullptr) {
|
||||
+ ggml_compute_forward_ssm_conv_update_f32(params, dst);
|
||||
+ } else {
|
||||
+ ggml_compute_forward_ssm_conv_f32(params, dst);
|
||||
+ }
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
diff --git a/ggml/src/ggml-cuda/ssm-conv.cu b/ggml/src/ggml-cuda/ssm-conv.cu
|
||||
index 1463169..e1af1cd 100644
|
||||
--- a/ggml/src/ggml-cuda/ssm-conv.cu
|
||||
+++ b/ggml/src/ggml-cuda/ssm-conv.cu
|
||||
@@ -123,6 +123,109 @@ static __global__ void ssm_conv_long_token_f32(const float * __restrict__ src0,
|
||||
}
|
||||
}
|
||||
|
||||
+// Fused decode-time depthwise causal conv1d update (one new token). Each thread owns one channel of
|
||||
+// one sequence: it assembles the width-d_conv window from the K-1 cached taps (conv_states) plus the
|
||||
+// current token (x_cur), computes the depthwise conv with the SAME ascending-tap FMA order as
|
||||
+// ssm_conv_f32 at i==0, optionally folds silu, writes the conv output, and writes the 1-token-shifted
|
||||
+// ring state back in place into conv_state_dst. Bit-identical to ssm_conv(concat) + silu + copy-back.
|
||||
+template <bool apply_silu, int d_conv>
|
||||
+static __global__ void ssm_conv_update_f32(const float * __restrict__ conv_states,
|
||||
+ const float * __restrict__ conv_kernel,
|
||||
+ const float * __restrict__ x_cur,
|
||||
+ float * __restrict__ conv_state_dst,
|
||||
+ float * __restrict__ dst,
|
||||
+ const int channels,
|
||||
+ const int states_seq_stride,
|
||||
+ const int w_stride,
|
||||
+ const int x_seq_stride,
|
||||
+ const int dst_seq_stride,
|
||||
+ const int cdst_seq_stride) {
|
||||
+ const int c = blockIdx.x * blockDim.x + threadIdx.x; // channel
|
||||
+ const int s = blockIdx.y; // sequence
|
||||
+ if (c >= channels) {
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ const float * states_c = conv_states + (int64_t) s * states_seq_stride + (int64_t) c * (d_conv - 1);
|
||||
+ const float * w_c = conv_kernel + (int64_t) c * w_stride;
|
||||
+ const float xc = x_cur[(int64_t) s * x_seq_stride + c];
|
||||
+
|
||||
+ // window = [tap0 .. tap_{K-2}, current-token], same ordering as the concat(conv_states, x) window
|
||||
+ float window[d_conv];
|
||||
+#pragma unroll
|
||||
+ for (int j = 0; j < d_conv - 1; j++) {
|
||||
+ window[j] = states_c[j];
|
||||
+ }
|
||||
+ window[d_conv - 1] = xc;
|
||||
+
|
||||
+ float sumf = 0.0f;
|
||||
+#pragma unroll
|
||||
+ for (int j = 0; j < d_conv; j++) {
|
||||
+ sumf += window[j] * w_c[j];
|
||||
+ }
|
||||
+ sumf += 0.0f; // matches ssm_conv_f32 `sumf += b` with b == 0 (qwen35 conv1d has no bias)
|
||||
+ dst[(int64_t) s * dst_seq_stride + c] = apply_silu ? ggml_cuda_op_silu_single(sumf) : sumf;
|
||||
+
|
||||
+ // 1-token-shifted ring write-back: drop the oldest tap, append the current token
|
||||
+ float * out_state = conv_state_dst + (int64_t) s * cdst_seq_stride + (int64_t) c * (d_conv - 1);
|
||||
+#pragma unroll
|
||||
+ for (int j = 0; j < d_conv - 1; j++) {
|
||||
+ out_state[j] = window[j + 1];
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+static void ggml_cuda_op_ssm_conv_update(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
+ const ggml_tensor * conv_states = dst->src[0]; // [K-1, channels, n_seqs]
|
||||
+ const ggml_tensor * conv_kernel = dst->src[1]; // [K, channels]
|
||||
+ const ggml_tensor * x_cur = dst->src[2]; // [channels, 1, n_seqs]
|
||||
+ const ggml_tensor * cdst = dst->src[3]; // [(K-1)*channels, n_seqs] in-place ring target
|
||||
+
|
||||
+ const int64_t d_conv = conv_kernel->ne[0];
|
||||
+ const int64_t channels = conv_kernel->ne[1];
|
||||
+ const int64_t n_seqs = conv_states->ne[2];
|
||||
+ const bool apply_silu = ggml_get_op_params_i32(dst, 0) != 0;
|
||||
+
|
||||
+ GGML_ASSERT(conv_states->type == GGML_TYPE_F32 && conv_kernel->type == GGML_TYPE_F32);
|
||||
+ GGML_ASSERT(x_cur->type == GGML_TYPE_F32 && cdst->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
|
||||
+ GGML_ASSERT(conv_states->nb[0] == sizeof(float));
|
||||
+ GGML_ASSERT(conv_states->nb[1] == (size_t) (d_conv - 1) * sizeof(float));
|
||||
+ GGML_ASSERT(conv_kernel->nb[0] == sizeof(float));
|
||||
+ GGML_ASSERT(dst->ne[0] == channels && dst->ne[1] == 1 && dst->ne[2] == n_seqs);
|
||||
+
|
||||
+ const float * states_d = (const float *) conv_states->data;
|
||||
+ const float * w_d = (const float *) conv_kernel->data;
|
||||
+ const float * x_d = (const float *) x_cur->data;
|
||||
+ float * cdst_d = (float *) cdst->data;
|
||||
+ float * dst_d = (float *) dst->data;
|
||||
+ cudaStream_t stream = ctx.stream();
|
||||
+
|
||||
+ const int states_seq_stride = (int) (conv_states->nb[2] / sizeof(float));
|
||||
+ const int w_stride = (int) (conv_kernel->nb[1] / sizeof(float));
|
||||
+ const int x_seq_stride = (int) (x_cur->nb[2] / sizeof(float));
|
||||
+ const int dst_seq_stride = (int) (dst->nb[2] / sizeof(float));
|
||||
+ const int cdst_seq_stride = (int) (cdst->nb[1] / sizeof(float));
|
||||
+
|
||||
+ const int threads = 128;
|
||||
+ const dim3 blocks((channels + threads - 1) / threads, (unsigned) n_seqs, 1);
|
||||
+
|
||||
+ auto launch = [&](auto NC) {
|
||||
+ constexpr int kNC = decltype(NC)::value;
|
||||
+ if (apply_silu) {
|
||||
+ ssm_conv_update_f32<true, kNC><<<blocks, threads, 0, stream>>>(states_d, w_d, x_d, cdst_d, dst_d,
|
||||
+ (int) channels, states_seq_stride, w_stride, x_seq_stride, dst_seq_stride, cdst_seq_stride);
|
||||
+ } else {
|
||||
+ ssm_conv_update_f32<false, kNC><<<blocks, threads, 0, stream>>>(states_d, w_d, x_d, cdst_d, dst_d,
|
||||
+ (int) channels, states_seq_stride, w_stride, x_seq_stride, dst_seq_stride, cdst_seq_stride);
|
||||
+ }
|
||||
+ };
|
||||
+
|
||||
+ switch (d_conv) {
|
||||
+ case 3: launch(std::integral_constant<int, 3>{}); break;
|
||||
+ case 4: launch(std::integral_constant<int, 4>{}); break;
|
||||
+ default: GGML_ABORT("ssm_conv_update only supports d_conv 3 or 4");
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
template <bool apply_silu>
|
||||
static void ssm_conv_f32_cuda(const float * src0, const float * src1, const float * bias, const int src0_nb0, const int src0_nb1,
|
||||
const int src0_nb2, const int src1_nb1, float * dst, const int dst_nb0, const int dst_nb1,
|
||||
@@ -158,6 +261,15 @@ static void ssm_conv_f32_cuda(const float * src0, const float * src1, const floa
|
||||
}
|
||||
|
||||
void ggml_cuda_op_ssm_conv(ggml_backend_cuda_context & ctx, ggml_tensor * dst, ggml_tensor * bias_add_node, ggml_tensor * silu_dst) {
|
||||
+ // Fused decode conv-update-in-place variant (ggml_ssm_conv_update_inplace): discriminated by a
|
||||
+ // non-null src[3] (the in-place ring write-back target). It folds the concat/transpose/copy-back/
|
||||
+ // silu of the decode conv path into a single kernel.
|
||||
+ if (dst->src[3] != nullptr) {
|
||||
+ GGML_ASSERT(bias_add_node == nullptr && silu_dst == nullptr);
|
||||
+ ggml_cuda_op_ssm_conv_update(ctx, dst);
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
const struct ggml_tensor * src0 = dst->src[0]; // conv_x
|
||||
const struct ggml_tensor * src1 = dst->src[1]; // conv1d.weight
|
||||
const bool fuse_bias = bias_add_node != nullptr;
|
||||
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
|
||||
index 1762037..b777748 100644
|
||||
--- a/ggml/src/ggml.c
|
||||
+++ b/ggml/src/ggml.c
|
||||
@@ -5555,6 +5555,60 @@ struct ggml_tensor * ggml_ssm_conv(
|
||||
return result;
|
||||
}
|
||||
|
||||
+// ggml_ssm_conv_update_inplace
|
||||
+//
|
||||
+// Fused decode-time depthwise causal conv1d update. Reuses GGML_OP_SSM_CONV but is discriminated by a
|
||||
+// non-null src[3]. The op reads each channel's K-1 cached taps from `conv_states` and the single new
|
||||
+// token from `x_cur`, computes the depthwise conv (ascending-tap FMA, bit-identical to ggml_ssm_conv),
|
||||
+// optionally folds SiLU, writes the conv output to dst ([channels, 1, n_seqs]) and writes the
|
||||
+// 1-token-shifted ring state back in place into `conv_state_dst` (the active sequences' conv-cache
|
||||
+// slot). op_params[0] carries the fuse_silu flag. Mirrors the 0018/0019 in-place state pattern.
|
||||
+struct ggml_tensor * ggml_ssm_conv_update_inplace(
|
||||
+ struct ggml_context * ctx,
|
||||
+ struct ggml_tensor * conv_states,
|
||||
+ struct ggml_tensor * conv_kernel,
|
||||
+ struct ggml_tensor * x_cur,
|
||||
+ struct ggml_tensor * conv_state_dst,
|
||||
+ bool fuse_silu) {
|
||||
+ GGML_ASSERT(ggml_is_3d(conv_states));
|
||||
+ GGML_ASSERT(ggml_is_matrix(conv_kernel));
|
||||
+ GGML_ASSERT(ggml_is_3d(x_cur));
|
||||
+
|
||||
+ const int64_t d_conv = conv_kernel->ne[0];
|
||||
+ const int64_t channels = conv_kernel->ne[1];
|
||||
+ const int64_t n_seqs = conv_states->ne[2];
|
||||
+
|
||||
+ GGML_ASSERT(conv_states->type == GGML_TYPE_F32);
|
||||
+ GGML_ASSERT(conv_kernel->type == GGML_TYPE_F32);
|
||||
+ GGML_ASSERT(x_cur->type == GGML_TYPE_F32);
|
||||
+ GGML_ASSERT(conv_state_dst != NULL && conv_state_dst->type == GGML_TYPE_F32);
|
||||
+
|
||||
+ // conv_states: [K-1, channels, n_seqs], contiguous taps per channel
|
||||
+ GGML_ASSERT(conv_states->ne[0] == d_conv - 1);
|
||||
+ GGML_ASSERT(conv_states->ne[1] == channels);
|
||||
+ GGML_ASSERT(conv_states->nb[0] == sizeof(float));
|
||||
+ // x_cur: single decode token per sequence
|
||||
+ GGML_ASSERT(x_cur->ne[0] == channels);
|
||||
+ GGML_ASSERT(x_cur->ne[1] == 1);
|
||||
+ GGML_ASSERT(x_cur->ne[2] == n_seqs);
|
||||
+ // conv_state_dst: [(K-1)*channels, n_seqs] in-place ring write target
|
||||
+ GGML_ASSERT(conv_state_dst->ne[0] == (d_conv - 1) * channels);
|
||||
+ GGML_ASSERT(conv_state_dst->ne[1] >= n_seqs);
|
||||
+ GGML_ASSERT(conv_state_dst->nb[0] == sizeof(float));
|
||||
+
|
||||
+ struct ggml_tensor * result = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, channels, 1, n_seqs);
|
||||
+
|
||||
+ ggml_set_op_params_i32(result, 0, fuse_silu ? 1 : 0);
|
||||
+
|
||||
+ result->op = GGML_OP_SSM_CONV;
|
||||
+ result->src[0] = conv_states;
|
||||
+ result->src[1] = conv_kernel;
|
||||
+ result->src[2] = x_cur;
|
||||
+ result->src[3] = conv_state_dst;
|
||||
+
|
||||
+ return result;
|
||||
+}
|
||||
+
|
||||
// ggml_ssm_scan
|
||||
|
||||
struct ggml_tensor * ggml_ssm_scan(
|
||||
diff --git a/src/models/delta-net-base.cpp b/src/models/delta-net-base.cpp
|
||||
index 194e611..0eee804 100644
|
||||
--- a/src/models/delta-net-base.cpp
|
||||
+++ b/src/models/delta-net-base.cpp
|
||||
@@ -524,6 +524,57 @@ ggml_tensor * llm_build_delta_net_base::build_conv_state(
|
||||
return conv_input;
|
||||
}
|
||||
|
||||
+// Fused decode conv path (patch 0021). Reads the active sequences' prior conv-state taps (the same
|
||||
+// cheap build_rs gather as build_conv_state), then fuses the depthwise conv + silu + the 1-token-
|
||||
+// shifted ring write-back into a single ggml_ssm_conv_update_inplace op. This removes the concat
|
||||
+// (concat_cont), the transpose materialization, the scalar copy-back (cpy_scalar) and the separate
|
||||
+// silu of the decode conv path. The op reads from the (disjoint) materialized taps and writes the
|
||||
+// new ring state in place into the cache slot at kv_head -- exactly the slot the baseline ggml_cpy
|
||||
+// wrote -- so it is bit-identical to build_conv_state + ggml_ssm_conv + ggml_silu.
|
||||
+ggml_tensor * llm_build_delta_net_base::build_conv_state_fused(
|
||||
+ llm_graph_input_rs * inp,
|
||||
+ ggml_tensor * conv_states_all,
|
||||
+ ggml_tensor * qkv_mixed,
|
||||
+ ggml_tensor * conv_kernel,
|
||||
+ int64_t conv_kernel_size,
|
||||
+ int64_t conv_channels,
|
||||
+ int il) {
|
||||
+ const auto * mctx_cur = inp->mctx;
|
||||
+ const auto kv_head = mctx_cur->get_head();
|
||||
+
|
||||
+ const int64_t n_seqs = ubatch.n_seqs;
|
||||
+ const int64_t n_seq_tokens = ubatch.n_seq_tokens;
|
||||
+
|
||||
+ GGML_ASSERT(n_seq_tokens == 1); // single-token decode only
|
||||
+ GGML_ASSERT(cparams.n_rs_seq == 0); // no rollback splits on this path
|
||||
+
|
||||
+ // Prior conv-state taps for the active sequences: [K-1, conv_channels, n_seqs]. Same get_rows
|
||||
+ // gather as the baseline build_conv_state read (tiny; not one of the eliminated buckets).
|
||||
+ ggml_tensor * conv_states = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
|
||||
+ conv_states = ggml_reshape_3d(ctx0, conv_states, conv_kernel_size - 1, conv_channels, n_seqs);
|
||||
+ cb(conv_states, "conv_states_reshaped", il);
|
||||
+
|
||||
+ // Current token, native (non-transposed) qkv_mixed: [conv_channels, 1, n_seqs].
|
||||
+ ggml_tensor * x_cur = ggml_reshape_3d(ctx0, qkv_mixed, conv_channels, n_seq_tokens, n_seqs);
|
||||
+
|
||||
+ // In-place ring write-back target = the active sequences' conv-cache slot at kv_head, exactly the
|
||||
+ // destination the baseline ggml_cpy wrote to (s_slot == 0).
|
||||
+ const int64_t row_count = (conv_kernel_size - 1) * conv_channels;
|
||||
+ const size_t row_size = ggml_row_size(conv_states_all->type, row_count);
|
||||
+ ggml_tensor * conv_state_dst =
|
||||
+ ggml_view_2d(ctx0, conv_states_all, row_count, n_seqs, conv_states_all->nb[1], kv_head * row_size);
|
||||
+ cb(conv_state_dst, "conv_state_update", il);
|
||||
+
|
||||
+ ggml_tensor * conv_output =
|
||||
+ ggml_ssm_conv_update_inplace(ctx0, conv_states, conv_kernel, x_cur, conv_state_dst, /*fuse_silu=*/true);
|
||||
+ cb(conv_output, "conv_output_silu", il);
|
||||
+
|
||||
+ // the ring write is a side effect of the op; pull the op into the graph via the output
|
||||
+ ggml_build_forward_expand(gf, conv_output);
|
||||
+
|
||||
+ return conv_output; // [conv_channels, 1, n_seqs], already silu'd
|
||||
+}
|
||||
+
|
||||
// Step 2: gather-free recurrent attention. Mirrors mamba-base's get_ssm_rows pattern: the fused
|
||||
// gated-DeltaNet op reads each sequence's prior state directly from the full cache via the s_copy
|
||||
// ids (no ggml_get_rows materialization) and writes the new state in place (Step 1). The non-fused
|
||||
diff --git a/src/models/models.h b/src/models/models.h
|
||||
index 98b89e9..da0dd86 100644
|
||||
--- a/src/models/models.h
|
||||
+++ b/src/models/models.h
|
||||
@@ -76,6 +76,20 @@ struct llm_build_delta_net_base : public llm_graph_context {
|
||||
int64_t conv_channels,
|
||||
int il);
|
||||
|
||||
+ // Fused decode-time conv path (patch 0021). Replaces the concat + transpose + ssm_conv + silu +
|
||||
+ // copy-back chain with a single ggml_ssm_conv_update_inplace op that reads the cached K-1 taps and
|
||||
+ // the current token, computes the depthwise conv, folds silu, and writes the 1-token-shifted ring
|
||||
+ // state back in place. Decode-only (n_seq_tokens == 1, n_rs_seq == 0). Returns the silu'd conv
|
||||
+ // output: (conv_channels, 1, n_seqs). Bit-identical to the build_conv_state + ggml_ssm_conv chain.
|
||||
+ ggml_tensor * build_conv_state_fused(
|
||||
+ llm_graph_input_rs * inp,
|
||||
+ ggml_tensor * conv_states_all,
|
||||
+ ggml_tensor * qkv_mixed,
|
||||
+ ggml_tensor * conv_kernel,
|
||||
+ int64_t conv_kernel_size,
|
||||
+ int64_t conv_channels,
|
||||
+ int il);
|
||||
+
|
||||
// run delta-net attention and write the new recurrent state(s) back to ssm_states_all
|
||||
// s: (head_v_dim, head_v_dim, num_v_heads, n_seqs); returns output: (head_v_dim, num_v_heads, n_seq_tokens, n_seqs)
|
||||
ggml_tensor * build_recurrent_attn(
|
||||
diff --git a/src/models/qwen35.cpp b/src/models/qwen35.cpp
|
||||
index 0874c43..b6dcc5f 100644
|
||||
--- a/src/models/qwen35.cpp
|
||||
+++ b/src/models/qwen35.cpp
|
||||
@@ -383,15 +383,26 @@ ggml_tensor * llama_model_qwen35::graph::build_layer_attn_linear(
|
||||
const int64_t conv_kernel_size = conv_kernel->ne[0];
|
||||
const int64_t conv_channels = d_inner + 2 * hparams.ssm_n_group * hparams.ssm_d_state;
|
||||
|
||||
- ggml_tensor * conv_input = build_conv_state(inp, conv_states_all, qkv_mixed, conv_kernel_size, conv_channels, il);
|
||||
+ // Patch 0021: on the single-token decode path, fuse the conv window assembly + depthwise conv +
|
||||
+ // silu + the 1-token-shifted ring write-back into one in-place op (removes concat_cont, the
|
||||
+ // transpose materialization, cpy_scalar and the separate silu). Bit-identical to the chain below.
|
||||
+ const bool conv_decode_fused = (n_seq_tokens == 1) && (cparams.n_rs_seq == 0) && cparams.fused_gdn_ar;
|
||||
+
|
||||
+ ggml_tensor * conv_qkv_mix;
|
||||
+ if (conv_decode_fused) {
|
||||
+ conv_qkv_mix = build_conv_state_fused(inp, conv_states_all, qkv_mixed, conv_kernel,
|
||||
+ conv_kernel_size, conv_channels, il);
|
||||
+ } else {
|
||||
+ ggml_tensor * conv_input = build_conv_state(inp, conv_states_all, qkv_mixed, conv_kernel_size, conv_channels, il);
|
||||
|
||||
- ggml_tensor * conv_output_proper = ggml_ssm_conv(ctx0, conv_input, conv_kernel);
|
||||
- cb(conv_output_proper, "conv_output_raw", il);
|
||||
+ ggml_tensor * conv_output_proper = ggml_ssm_conv(ctx0, conv_input, conv_kernel);
|
||||
+ cb(conv_output_proper, "conv_output_raw", il);
|
||||
|
||||
- ggml_tensor * conv_output_silu = ggml_silu(ctx0, conv_output_proper);
|
||||
- cb(conv_output_silu, "conv_output_silu", il);
|
||||
+ ggml_tensor * conv_output_silu = ggml_silu(ctx0, conv_output_proper);
|
||||
+ cb(conv_output_silu, "conv_output_silu", il);
|
||||
|
||||
- ggml_tensor * conv_qkv_mix = conv_output_silu;
|
||||
+ conv_qkv_mix = conv_output_silu;
|
||||
+ }
|
||||
|
||||
// Calculate the total conv dimension
|
||||
int64_t qkv_dim = head_k_dim * num_k_heads * 2 + head_v_dim * num_v_heads;
|
||||
diff --git a/src/models/qwen35moe.cpp b/src/models/qwen35moe.cpp
|
||||
index 1f6f643..c7c7c44 100644
|
||||
--- a/src/models/qwen35moe.cpp
|
||||
+++ b/src/models/qwen35moe.cpp
|
||||
@@ -407,15 +407,26 @@ ggml_tensor * llama_model_qwen35moe::graph::build_layer_attn_linear(
|
||||
const int64_t conv_kernel_size = conv_kernel->ne[0];
|
||||
const int64_t conv_channels = d_inner + 2 * hparams.ssm_n_group * hparams.ssm_d_state;
|
||||
|
||||
- ggml_tensor * conv_input = build_conv_state(inp, conv_states_all, qkv_mixed, conv_kernel_size, conv_channels, il);
|
||||
+ // Patch 0021: on the single-token decode path, fuse the conv window assembly + depthwise conv +
|
||||
+ // silu + the 1-token-shifted ring write-back into one in-place op (removes concat_cont, the
|
||||
+ // transpose materialization, cpy_scalar and the separate silu). Bit-identical to the chain below.
|
||||
+ const bool conv_decode_fused = (n_seq_tokens == 1) && (cparams.n_rs_seq == 0) && cparams.fused_gdn_ar;
|
||||
+
|
||||
+ ggml_tensor * conv_qkv_mix;
|
||||
+ if (conv_decode_fused) {
|
||||
+ conv_qkv_mix = build_conv_state_fused(inp, conv_states_all, qkv_mixed, conv_kernel,
|
||||
+ conv_kernel_size, conv_channels, il);
|
||||
+ } else {
|
||||
+ ggml_tensor * conv_input = build_conv_state(inp, conv_states_all, qkv_mixed, conv_kernel_size, conv_channels, il);
|
||||
|
||||
- ggml_tensor * conv_output_proper = ggml_ssm_conv(ctx0, conv_input, conv_kernel);
|
||||
- cb(conv_output_proper, "conv_output_raw", il);
|
||||
+ ggml_tensor * conv_output_proper = ggml_ssm_conv(ctx0, conv_input, conv_kernel);
|
||||
+ cb(conv_output_proper, "conv_output_raw", il);
|
||||
|
||||
- ggml_tensor * conv_output_silu = ggml_silu(ctx0, conv_output_proper);
|
||||
- cb(conv_output_silu, "conv_output_silu", il);
|
||||
+ ggml_tensor * conv_output_silu = ggml_silu(ctx0, conv_output_proper);
|
||||
+ cb(conv_output_silu, "conv_output_silu", il);
|
||||
|
||||
- ggml_tensor * conv_qkv_mix = conv_output_silu;
|
||||
+ conv_qkv_mix = conv_output_silu;
|
||||
+ }
|
||||
|
||||
// Calculate the total conv dimension
|
||||
int64_t qkv_dim = head_k_dim * num_k_heads * 2 + head_v_dim * num_v_heads;
|
||||
diff --git a/src/models/qwen3next.cpp b/src/models/qwen3next.cpp
|
||||
index bfdf026..92749d1 100644
|
||||
--- a/src/models/qwen3next.cpp
|
||||
+++ b/src/models/qwen3next.cpp
|
||||
@@ -434,19 +434,30 @@ ggml_tensor * llama_model_qwen3next::graph::build_layer_attn_linear(
|
||||
const int64_t conv_kernel_size = conv_kernel->ne[0];
|
||||
const int64_t conv_channels = d_inner + 2 * hparams.ssm_n_group * hparams.ssm_d_state;
|
||||
|
||||
- ggml_tensor * conv_input = build_conv_state(inp, conv_states_all, qkv_mixed, conv_kernel_size, conv_channels, il);
|
||||
+ // Patch 0021: on the single-token decode path, fuse the conv window assembly + depthwise conv +
|
||||
+ // silu + the 1-token-shifted ring write-back into one in-place op (removes concat_cont, the
|
||||
+ // transpose materialization, cpy_scalar and the separate silu). Bit-identical to the chain below.
|
||||
+ const bool conv_decode_fused = (n_seq_tokens == 1) && (cparams.n_rs_seq == 0) && cparams.fused_gdn_ar;
|
||||
+
|
||||
+ ggml_tensor * conv_qkv_mix;
|
||||
+ if (conv_decode_fused) {
|
||||
+ conv_qkv_mix = build_conv_state_fused(inp, conv_states_all, qkv_mixed, conv_kernel,
|
||||
+ conv_kernel_size, conv_channels, il);
|
||||
+ } else {
|
||||
+ ggml_tensor * conv_input = build_conv_state(inp, conv_states_all, qkv_mixed, conv_kernel_size, conv_channels, il);
|
||||
|
||||
- ggml_tensor * state = build_rs(inp, ssm_states_all, hparams.n_embd_s(), n_seqs);
|
||||
- state = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim, num_v_heads, n_seqs);
|
||||
- cb(state, "state_predelta", il);
|
||||
+ ggml_tensor * conv_output_proper = ggml_ssm_conv(ctx0, conv_input, conv_kernel);
|
||||
+ cb(conv_output_proper, "conv_output_raw", il);
|
||||
|
||||
- ggml_tensor * conv_output_proper = ggml_ssm_conv(ctx0, conv_input, conv_kernel);
|
||||
- cb(conv_output_proper, "conv_output_raw", il);
|
||||
+ ggml_tensor * conv_output_silu = ggml_silu(ctx0, conv_output_proper);
|
||||
+ cb(conv_output_silu, "conv_output_silu", il);
|
||||
|
||||
- ggml_tensor * conv_output_silu = ggml_silu(ctx0, conv_output_proper);
|
||||
- cb(conv_output_silu, "conv_output_silu", il);
|
||||
+ conv_qkv_mix = conv_output_silu;
|
||||
+ }
|
||||
|
||||
- ggml_tensor * conv_qkv_mix = conv_output_silu;
|
||||
+ ggml_tensor * state = build_rs(inp, ssm_states_all, hparams.n_embd_s(), n_seqs);
|
||||
+ state = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim, num_v_heads, n_seqs);
|
||||
+ cb(state, "state_predelta", il);
|
||||
|
||||
// Calculate the total conv dimension
|
||||
int64_t qkv_dim = head_k_dim * num_k_heads * 2 + head_v_dim * num_v_heads;
|
||||
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
|
||||
index 291c275..c7348d6 100644
|
||||
--- a/tests/test-backend-ops.cpp
|
||||
+++ b/tests/test-backend-ops.cpp
|
||||
@@ -3748,6 +3748,43 @@ struct test_ssm_conv_bias_silu : public test_case {
|
||||
}
|
||||
};
|
||||
|
||||
+// GGML_OP_SSM_CONV fused decode conv-update-in-place (ggml_ssm_conv_update_inplace, patch 0021).
|
||||
+// Validates the conv + silu output (dst) against the CPU reference across backends. The 1-token-
|
||||
+// shifted ring write-back to conv_state_dst is a side effect (validated end-to-end by the greedy
|
||||
+// md5 gate); here it just exercises the in-place write target as an op src.
|
||||
+struct test_ssm_conv_update : public test_case {
|
||||
+ const int64_t d_conv;
|
||||
+ const int64_t channels;
|
||||
+ const int64_t n_seqs;
|
||||
+
|
||||
+ std::string op_desc(ggml_tensor * t) override {
|
||||
+ GGML_UNUSED(t);
|
||||
+ return "SSM_CONV_UPDATE";
|
||||
+ }
|
||||
+
|
||||
+ std::string vars() override {
|
||||
+ return VARS_TO_STR3(d_conv, channels, n_seqs);
|
||||
+ }
|
||||
+
|
||||
+ test_ssm_conv_update(int64_t d_conv = 4, int64_t channels = 256, int64_t n_seqs = 4)
|
||||
+ : d_conv(d_conv), channels(channels), n_seqs(n_seqs) {}
|
||||
+
|
||||
+ ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
+ ggml_tensor * conv_states = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_conv - 1, channels, n_seqs);
|
||||
+ ggml_tensor * conv_kernel = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, d_conv, channels);
|
||||
+ ggml_tensor * x_cur = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, channels, 1, n_seqs);
|
||||
+ ggml_tensor * conv_state_dst = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, (d_conv - 1) * channels, n_seqs);
|
||||
+ ggml_set_name(conv_states, "conv_states");
|
||||
+ ggml_set_name(conv_kernel, "conv_kernel");
|
||||
+ ggml_set_name(x_cur, "x_cur");
|
||||
+ ggml_set_name(conv_state_dst, "conv_state_dst");
|
||||
+
|
||||
+ ggml_tensor * out = ggml_ssm_conv_update_inplace(ctx, conv_states, conv_kernel, x_cur, conv_state_dst, true);
|
||||
+ ggml_set_name(out, "out");
|
||||
+ return out;
|
||||
+ }
|
||||
+};
|
||||
+
|
||||
// GGML_OP_SSM_SCAN
|
||||
struct test_ssm_scan : public test_case {
|
||||
const ggml_type type;
|
||||
@@ -8355,6 +8392,16 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
}
|
||||
}
|
||||
|
||||
+ // fused decode conv-update-in-place (ggml_ssm_conv_update_inplace, patch 0021). channels must be
|
||||
+ // a multiple of 128 for the CUDA SSM_CONV supports_op gate.
|
||||
+ for (int64_t d_conv : {3, 4}) {
|
||||
+ for (int64_t channels : {256, 3328}) {
|
||||
+ for (int64_t n_seqs : {1, 4, 32, 128}) {
|
||||
+ test_cases.emplace_back(new test_ssm_conv_update(d_conv, channels, n_seqs));
|
||||
+ }
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
test_cases.emplace_back(new test_ssm_scan(GGML_TYPE_F32, 16, 1, 1024, 1, 32, 4)); // Mamba-1
|
||||
test_cases.emplace_back(new test_ssm_scan(GGML_TYPE_F32, 128, 64, 16, 2, 32, 4)); // Mamba-2
|
||||
test_cases.emplace_back(new test_ssm_scan(GGML_TYPE_F32, 256, 64, 8, 2, 32, 4)); // Falcon-H1
|
||||
--
|
||||
2.43.0
|
||||
|
||||
@@ -0,0 +1,403 @@
|
||||
From 8a3229f41d5b712e87901796dfae3faee1f2f07d Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Thu, 25 Jun 2026 20:32:55 +0200
|
||||
Subject: [PATCH] feat(paged): qwen35 gated-DeltaNet decode
|
||||
occupancy/coalescing retune (patch 0022)
|
||||
|
||||
Bit-exact occupancy retune of gated_delta_net_cuda, the B=128 decode recurrence
|
||||
kernel. After the f32 verdict (vLLM carries the gated-DeltaNet temporal state in
|
||||
float32 and moves the same ~805 MB/call as llama; the gap was pure DRAM bandwidth
|
||||
efficiency on equal bytes - llama 73.4% vs vLLM 82.4% of the 273 GB/s GB10 peak),
|
||||
the lever is a latency-coverage retune that keeps the per-column f32 reduction/FMA
|
||||
order byte-identical (md5-gateable). The bf16-state plan stays shelved.
|
||||
|
||||
Column folding: two new template params NUM_WARPS (default 4) and COLS_PER_WARP
|
||||
(default 1). Each warp now owns COLS_PER_WARP columns of the 128x128 recurrent
|
||||
state instead of 1, looping the existing per-column body over col, col+NUM_WARPS,
|
||||
... within a per-block column tile of NUM_WARPS*COLS_PER_WARP columns;
|
||||
grid.z = S_v / (NUM_WARPS*COLS_PER_WARP). The S_v rows of every column stay sharded
|
||||
across the lanes by the same strided i = r*warp_size + lane mapping, and every
|
||||
column's per-lane FMA accumulation and warp_reduce_sum butterfly are byte-for-byte
|
||||
unchanged; only the (warp,block)->column assignment and visit order differ, which a
|
||||
column's value provably does not depend on (columns are fully independent). This
|
||||
raises per-warp memory-level parallelism ~COLS_PER_WARP-fold (independent
|
||||
state-load bursts before any reduction + interleaved butterfly reductions hiding
|
||||
each other's shfl latency), covering more DRAM latency on this bandwidth-bound
|
||||
kernel. Every global access stays identically coalesced, so it is a scheduling /
|
||||
latency-coverage win, not a coalescing change. The forbidden float4 state load
|
||||
(which would repartition a lane to 4 contiguous rows and change the reduction
|
||||
grouping) is NOT done, so the md5 stays invariant. The S_v=128 tile is
|
||||
env-selectable (GDN_NW / GDN_CPW) for one-build re-tuning; default is the measured
|
||||
GB10 winner (16, 8).
|
||||
|
||||
GB10 (CUDA 13, sm_121, nsys CUPTI timing - HW counters permission-blocked):
|
||||
gated_delta_net B=128 decode call (805.3 MB f32 R+W) 4.02 -> 3.49 ms/call,
|
||||
200.3 -> 230.9 GB/s = 73.4% -> 84.6% of 273 GB/s peak (now above vLLM's 82.4%;
|
||||
102.6% of vLLM's recurrence bandwidth). decode S_TG t/s (npp128 ntg128, -fa on):
|
||||
dense 27b npl128 335.9 -> 373.2 (+11.1%), npl32 199.2 -> 207.6 (+4.2%); MoE
|
||||
35b-a3b npl128 688.4 -> 745.7 (+8.3%), npl32 420.6 -> 440.0 (+4.6%). Prefill
|
||||
unchanged.
|
||||
|
||||
Bit-exact: greedy --temp 0 --seed 1 md5 byte-identical to the 0021 baseline on
|
||||
both q36-27b-nvfp4 and q36-35b-a3b-nvfp4 (winner 16x8 and 4x1 control);
|
||||
test-backend-ops -o GATED_DELTA_NET 36/36 PASS.
|
||||
|
||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
||||
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
|
||||
---
|
||||
ggml/src/ggml-cuda/gated_delta_net.cu | 236 +++++++++++++++++---------
|
||||
1 file changed, 157 insertions(+), 79 deletions(-)
|
||||
|
||||
diff --git a/ggml/src/ggml-cuda/gated_delta_net.cu b/ggml/src/ggml-cuda/gated_delta_net.cu
|
||||
index 86d5e2a..d071d5a 100644
|
||||
--- a/ggml/src/ggml-cuda/gated_delta_net.cu
|
||||
+++ b/ggml/src/ggml-cuda/gated_delta_net.cu
|
||||
@@ -1,6 +1,8 @@
|
||||
#include "gated_delta_net.cuh"
|
||||
#include "ggml-cuda/common.cuh"
|
||||
|
||||
+#include <cstdlib>
|
||||
+
|
||||
// Step 2: gather only the NON-identity sequences' prior recurrent state from the full cache into a
|
||||
// disjoint scratch buffer. Identity sequences (ids[s] == rs_head + s) are read in place from the
|
||||
// destination slot by the recurrence kernel and are skipped here. One block per sequence.
|
||||
@@ -29,8 +31,22 @@ static void ggml_cuda_gdn_gather_nonident(const float * cache, const int32_t * i
|
||||
gdn_gather_nonident_kernel<<<(unsigned) n_seqs, 256, 0, stream>>>(cache, ids, rs_head, scratch, D, (int) n_seqs);
|
||||
}
|
||||
|
||||
-template <int S_v, bool KDA, bool keep_rs_t>
|
||||
-__global__ void __launch_bounds__((ggml_cuda_get_physical_warp_size() < S_v ? ggml_cuda_get_physical_warp_size() : S_v) * 4, 2)
|
||||
+// Occupancy/coalescing retune (patch 0022). Each warp owns COLS_PER_WARP columns of the recurrent
|
||||
+// state instead of 1, looping the existing per-column body over col, col+NUM_WARPS, ... within a
|
||||
+// per-block column tile of size NUM_WARPS*COLS_PER_WARP. The S_v rows of every column stay sharded
|
||||
+// across the lanes by the SAME strided mapping i = r*warp_size + lane, and every column's per-lane
|
||||
+// FMA accumulation and warp_reduce_sum<warp_size> butterfly are byte-for-byte unchanged. Only the
|
||||
+// (warp,block)->column assignment and the order a warp visits its columns differ, and a column's
|
||||
+// f32 value provably does not depend on either (columns are fully independent: column c reads only
|
||||
+// its own S_v-float state slice plus the shared per-(token,head,seq) q/k/v/g/beta). So the result
|
||||
+// and the stored final state are bit-identical to the COLS_PER_WARP==1 baseline (md5-gateable),
|
||||
+// while per-warp memory-level parallelism rises ~COLS_PER_WARP-fold (COLS_PER_WARP independent
|
||||
+// state-load bursts issued before any reduction, and the independent butterfly reductions interleave
|
||||
+// to hide each other's shfl latency) which covers more DRAM latency on this bandwidth-bound kernel.
|
||||
+// Every individual global access stays IDENTICALLY coalesced (32 consecutive lanes -> one 128B
|
||||
+// sector), so this is a latency-coverage / scheduling win, not a coalescing change.
|
||||
+template <int S_v, bool KDA, bool keep_rs_t, int NUM_WARPS = 4, int COLS_PER_WARP = 1, int MIN_BLOCKS = 2>
|
||||
+__global__ void __launch_bounds__((ggml_cuda_get_physical_warp_size() < S_v ? ggml_cuda_get_physical_warp_size() : S_v) * NUM_WARPS, MIN_BLOCKS)
|
||||
gated_delta_net_cuda(const float * q,
|
||||
const float * k,
|
||||
const float * v,
|
||||
@@ -59,9 +75,9 @@ gated_delta_net_cuda(const float * q,
|
||||
int rs_head) {
|
||||
const uint32_t h_idx = blockIdx.x;
|
||||
const uint32_t sequence = blockIdx.y;
|
||||
- // each warp owns one column, using warp-level primitives to reduce across rows
|
||||
+ // each warp owns COLS_PER_WARP columns, using warp-level primitives to reduce across rows.
|
||||
const int lane = threadIdx.x;
|
||||
- const int col = blockIdx.z * blockDim.y + threadIdx.y;
|
||||
+ const int col_base = blockIdx.z * (NUM_WARPS * COLS_PER_WARP) + threadIdx.y;
|
||||
|
||||
const uint32_t iq1 = fastmodulo(h_idx, neqk1_magic);
|
||||
const uint32_t iq3 = fastdiv(sequence, rq3_magic);
|
||||
@@ -86,20 +102,25 @@ gated_delta_net_cuda(const float * q,
|
||||
// writing the same slot per block (identity) is race-free.
|
||||
const float * read_state = (ids != nullptr && ids[sequence] == rs_head + (int) sequence)
|
||||
? state_dst : curr_state;
|
||||
- read_state += state_in_offset + col * S_v;
|
||||
+ read_state += state_in_offset;
|
||||
attn_data += (sequence * n_tokens * H + h_idx) * S_v;
|
||||
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size() < S_v ? ggml_cuda_get_physical_warp_size() : S_v;
|
||||
static_assert(S_v % warp_size == 0, "S_v must be a multiple of warp_size");
|
||||
constexpr int rows_per_lane = (S_v + warp_size - 1) / warp_size;
|
||||
- float s_shard[rows_per_lane];
|
||||
- // state is stored transposed: M[col][i] = S[i][col], row col is contiguous
|
||||
+ // per-column register shard of the recurrent state; state is stored transposed: M[col][i] = S[i][col].
|
||||
+ float s_shard[COLS_PER_WARP][rows_per_lane];
|
||||
|
||||
ggml_cuda_pdl_sync();
|
||||
#pragma unroll
|
||||
- for (int r = 0; r < rows_per_lane; r++) {
|
||||
- const int i = r * warp_size + lane;
|
||||
- s_shard[r] = read_state[i];
|
||||
+ for (int cc = 0; cc < COLS_PER_WARP; cc++) {
|
||||
+ const int col = col_base + cc * NUM_WARPS;
|
||||
+ const float * rs = read_state + col * S_v;
|
||||
+#pragma unroll
|
||||
+ for (int r = 0; r < rows_per_lane; r++) {
|
||||
+ const int i = r * warp_size + lane;
|
||||
+ s_shard[cc][r] = rs[i];
|
||||
+ }
|
||||
}
|
||||
|
||||
for (int t = 0; t < n_tokens; t++) {
|
||||
@@ -113,7 +134,7 @@ gated_delta_net_cuda(const float * q,
|
||||
|
||||
const float beta_val = *beta_t;
|
||||
|
||||
- // Cache k and q in registers
|
||||
+ // Cache k and q in registers (shared across the COLS_PER_WARP columns of this warp).
|
||||
float k_reg[rows_per_lane];
|
||||
float q_reg[rows_per_lane];
|
||||
#pragma unroll
|
||||
@@ -126,59 +147,69 @@ gated_delta_net_cuda(const float * q,
|
||||
if constexpr (!KDA) {
|
||||
const float g_val = expf(*g_t);
|
||||
|
||||
- // kv[col] = (S^T @ k)[col] = sum_i S[i][col] * k[i]
|
||||
- float kv_shard = 0.0f;
|
||||
#pragma unroll
|
||||
- for (int r = 0; r < rows_per_lane; r++) {
|
||||
- kv_shard += s_shard[r] * k_reg[r];
|
||||
- }
|
||||
- float kv_col = warp_reduce_sum<warp_size>(kv_shard);
|
||||
+ for (int cc = 0; cc < COLS_PER_WARP; cc++) {
|
||||
+ const int col = col_base + cc * NUM_WARPS;
|
||||
|
||||
- // delta[col] = (v[col] - g * kv[col]) * beta
|
||||
- float delta_col = (v_t[col] - g_val * kv_col) * beta_val;
|
||||
+ // kv[col] = (S^T @ k)[col] = sum_i S[i][col] * k[i]
|
||||
+ float kv_shard = 0.0f;
|
||||
+#pragma unroll
|
||||
+ for (int r = 0; r < rows_per_lane; r++) {
|
||||
+ kv_shard += s_shard[cc][r] * k_reg[r];
|
||||
+ }
|
||||
+ float kv_col = warp_reduce_sum<warp_size>(kv_shard);
|
||||
|
||||
- // fused: S[i][col] = g * S[i][col] + k[i] * delta[col]
|
||||
- // attn[col] = (S^T @ q)[col] = sum_i S[i][col] * q[i]
|
||||
- float attn_partial = 0.0f;
|
||||
+ // delta[col] = (v[col] - g * kv[col]) * beta
|
||||
+ float delta_col = (v_t[col] - g_val * kv_col) * beta_val;
|
||||
+
|
||||
+ // fused: S[i][col] = g * S[i][col] + k[i] * delta[col]
|
||||
+ // attn[col] = (S^T @ q)[col] = sum_i S[i][col] * q[i]
|
||||
+ float attn_partial = 0.0f;
|
||||
#pragma unroll
|
||||
- for (int r = 0; r < rows_per_lane; r++) {
|
||||
- s_shard[r] = g_val * s_shard[r] + k_reg[r] * delta_col;
|
||||
- attn_partial += s_shard[r] * q_reg[r];
|
||||
- }
|
||||
+ for (int r = 0; r < rows_per_lane; r++) {
|
||||
+ s_shard[cc][r] = g_val * s_shard[cc][r] + k_reg[r] * delta_col;
|
||||
+ attn_partial += s_shard[cc][r] * q_reg[r];
|
||||
+ }
|
||||
|
||||
- float attn_col = warp_reduce_sum<warp_size>(attn_partial);
|
||||
+ float attn_col = warp_reduce_sum<warp_size>(attn_partial);
|
||||
|
||||
- if (lane == 0) {
|
||||
- attn_data[col] = attn_col * scale;
|
||||
+ if (lane == 0) {
|
||||
+ attn_data[col] = attn_col * scale;
|
||||
+ }
|
||||
}
|
||||
} else {
|
||||
- // kv[col] = sum_i g[i] * S[i][col] * k[i]
|
||||
- float kv_shard = 0.0f;
|
||||
#pragma unroll
|
||||
- for (int r = 0; r < rows_per_lane; r++) {
|
||||
- const int i = r * warp_size + lane;
|
||||
- kv_shard += expf(g_t[i]) * s_shard[r] * k_reg[r];
|
||||
- }
|
||||
+ for (int cc = 0; cc < COLS_PER_WARP; cc++) {
|
||||
+ const int col = col_base + cc * NUM_WARPS;
|
||||
+
|
||||
+ // kv[col] = sum_i g[i] * S[i][col] * k[i]
|
||||
+ float kv_shard = 0.0f;
|
||||
+#pragma unroll
|
||||
+ for (int r = 0; r < rows_per_lane; r++) {
|
||||
+ const int i = r * warp_size + lane;
|
||||
+ kv_shard += expf(g_t[i]) * s_shard[cc][r] * k_reg[r];
|
||||
+ }
|
||||
|
||||
- float kv_col = warp_reduce_sum<warp_size>(kv_shard);
|
||||
+ float kv_col = warp_reduce_sum<warp_size>(kv_shard);
|
||||
|
||||
- // delta[col] = (v[col] - kv[col]) * beta
|
||||
- float delta_col = (v_t[col] - kv_col) * beta_val;
|
||||
+ // delta[col] = (v[col] - kv[col]) * beta
|
||||
+ float delta_col = (v_t[col] - kv_col) * beta_val;
|
||||
|
||||
- // fused: S[i][col] = g[i] * S[i][col] + k[i] * delta[col]
|
||||
- // attn[col] = (S^T @ q)[col] = sum_i S[i][col] * q[i]
|
||||
- float attn_partial = 0.0f;
|
||||
+ // fused: S[i][col] = g[i] * S[i][col] + k[i] * delta[col]
|
||||
+ // attn[col] = (S^T @ q)[col] = sum_i S[i][col] * q[i]
|
||||
+ float attn_partial = 0.0f;
|
||||
#pragma unroll
|
||||
- for (int r = 0; r < rows_per_lane; r++) {
|
||||
- const int i = r * warp_size + lane;
|
||||
- s_shard[r] = expf(g_t[i]) * s_shard[r] + k_reg[r] * delta_col;
|
||||
- attn_partial += s_shard[r] * q_reg[r];
|
||||
- }
|
||||
+ for (int r = 0; r < rows_per_lane; r++) {
|
||||
+ const int i = r * warp_size + lane;
|
||||
+ s_shard[cc][r] = expf(g_t[i]) * s_shard[cc][r] + k_reg[r] * delta_col;
|
||||
+ attn_partial += s_shard[cc][r] * q_reg[r];
|
||||
+ }
|
||||
|
||||
- float attn_col = warp_reduce_sum<warp_size>(attn_partial);
|
||||
+ float attn_col = warp_reduce_sum<warp_size>(attn_partial);
|
||||
|
||||
- if (lane == 0) {
|
||||
- attn_data[col] = attn_col * scale;
|
||||
+ if (lane == 0) {
|
||||
+ attn_data[col] = attn_col * scale;
|
||||
+ }
|
||||
}
|
||||
}
|
||||
|
||||
@@ -190,11 +221,15 @@ gated_delta_net_cuda(const float * q,
|
||||
const int64_t state_size_per_token = S_v * S_v * H * n_seqs; // per-slot stride in output
|
||||
const int target_slot = (int) n_tokens - 1 - t;
|
||||
if (target_slot >= 0 && target_slot < K) {
|
||||
- float * curr_state = (dst + attn_score_elems) + target_slot * state_size_per_token + state_out_offset;
|
||||
#pragma unroll
|
||||
- for (int r = 0; r < rows_per_lane; r++) {
|
||||
- const int i = r * warp_size + lane;
|
||||
- curr_state[col * S_v + i] = s_shard[r];
|
||||
+ for (int cc = 0; cc < COLS_PER_WARP; cc++) {
|
||||
+ const int col = col_base + cc * NUM_WARPS;
|
||||
+ float * curr_state = (dst + attn_score_elems) + target_slot * state_size_per_token + state_out_offset;
|
||||
+#pragma unroll
|
||||
+ for (int r = 0; r < rows_per_lane; r++) {
|
||||
+ const int i = r * warp_size + lane;
|
||||
+ curr_state[col * S_v + i] = s_shard[cc][r];
|
||||
+ }
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -202,13 +237,48 @@ gated_delta_net_cuda(const float * q,
|
||||
|
||||
if constexpr (!keep_rs_t) {
|
||||
#pragma unroll
|
||||
- for (int r = 0; r < rows_per_lane; r++) {
|
||||
- const int i = r * warp_size + lane;
|
||||
- state[col * S_v + i] = s_shard[r];
|
||||
+ for (int cc = 0; cc < COLS_PER_WARP; cc++) {
|
||||
+ const int col = col_base + cc * NUM_WARPS;
|
||||
+#pragma unroll
|
||||
+ for (int r = 0; r < rows_per_lane; r++) {
|
||||
+ const int i = r * warp_size + lane;
|
||||
+ state[col * S_v + i] = s_shard[cc][r];
|
||||
+ }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
+// Default column-folding tile for the S_v==128 decode/prefill path (the GDN head dim of this model).
|
||||
+// Measured winner of the bit-exact occupancy sweep (patch 0022). Override at runtime for the sweep
|
||||
+// via GDN_NW / GDN_CPW; all selectable variants are bit-identical, only %peak differs.
|
||||
+#ifndef GDN_DEFAULT_NW
|
||||
+#define GDN_DEFAULT_NW 16
|
||||
+#endif
|
||||
+#ifndef GDN_DEFAULT_CPW
|
||||
+#define GDN_DEFAULT_CPW 8
|
||||
+#endif
|
||||
+
|
||||
+template <int S_v, bool KDA, bool keep_rs_t, int NUM_WARPS, int COLS_PER_WARP, int MIN_BLOCKS>
|
||||
+static void launch_gdn_variant(
|
||||
+ const float * q_d, const float * k_d, const float * v_d,
|
||||
+ const float * g_d, const float * b_d, const float * s_d,
|
||||
+ float * dst_d, float * state_dst_d, const int32_t * ids_d, int rs_head,
|
||||
+ int64_t H, int64_t n_tokens, int64_t n_seqs,
|
||||
+ int64_t sq1, int64_t sq2, int64_t sq3,
|
||||
+ int64_t sv1, int64_t sv2, int64_t sv3,
|
||||
+ int64_t sb1, int64_t sb2, int64_t sb3,
|
||||
+ const uint3 neqk1_magic, const uint3 rq3_magic,
|
||||
+ float scale, int K, int warp_size, cudaStream_t stream) {
|
||||
+ static_assert(S_v % (NUM_WARPS * COLS_PER_WARP) == 0, "NUM_WARPS*COLS_PER_WARP must divide S_v");
|
||||
+ dim3 grid_dims(H, n_seqs, S_v / (NUM_WARPS * COLS_PER_WARP));
|
||||
+ dim3 block_dims(warp_size <= S_v ? warp_size : S_v, NUM_WARPS, 1);
|
||||
+ const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(grid_dims, block_dims, 0, stream);
|
||||
+ ggml_cuda_kernel_launch(gated_delta_net_cuda<S_v, KDA, keep_rs_t, NUM_WARPS, COLS_PER_WARP, MIN_BLOCKS>, launch_params,
|
||||
+ q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H,
|
||||
+ n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
|
||||
+ sb1, sb2, sb3, neqk1_magic, rq3_magic, scale, K, state_dst_d, ids_d, rs_head);
|
||||
+}
|
||||
+
|
||||
template <bool KDA, bool keep_rs_t>
|
||||
static void launch_gated_delta_net(
|
||||
const float * q_d, const float * k_d, const float * v_d,
|
||||
@@ -223,47 +293,55 @@ static void launch_gated_delta_net(
|
||||
float scale, int K, cudaStream_t stream) {
|
||||
//TODO: Add chunked kernel for even faster pre-fill
|
||||
const int warp_size = ggml_cuda_info().devices[ggml_cuda_get_device()].warp_size;
|
||||
- const int num_warps = 4;
|
||||
- dim3 grid_dims(H, n_seqs, (S_v + num_warps - 1) / num_warps);
|
||||
- dim3 block_dims(warp_size <= S_v ? warp_size : S_v, num_warps, 1);
|
||||
|
||||
const uint3 neqk1_magic = init_fastdiv_values(neqk1);
|
||||
const uint3 rq3_magic = init_fastdiv_values(rq3);
|
||||
|
||||
- int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
|
||||
+#define GDN_LAUNCH_ARGS \
|
||||
+ q_d, k_d, v_d, g_d, b_d, s_d, dst_d, state_dst_d, ids_d, rs_head, \
|
||||
+ H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3, sb1, sb2, sb3, \
|
||||
+ neqk1_magic, rq3_magic, scale, K, warp_size, stream
|
||||
|
||||
- const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(grid_dims, block_dims, 0, stream);
|
||||
switch (S_v) {
|
||||
case 16:
|
||||
- ggml_cuda_kernel_launch(gated_delta_net_cuda<16, KDA, keep_rs_t>, launch_params,
|
||||
- q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H,
|
||||
- n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
|
||||
- sb1, sb2, sb3, neqk1_magic, rq3_magic, scale, K, state_dst_d, ids_d, rs_head);
|
||||
+ launch_gdn_variant<16, KDA, keep_rs_t, 4, 1, 2>(GDN_LAUNCH_ARGS);
|
||||
break;
|
||||
case 32:
|
||||
- ggml_cuda_kernel_launch(gated_delta_net_cuda<32, KDA, keep_rs_t>, launch_params,
|
||||
- q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H,
|
||||
- n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
|
||||
- sb1, sb2, sb3, neqk1_magic, rq3_magic, scale, K, state_dst_d, ids_d, rs_head);
|
||||
+ launch_gdn_variant<32, KDA, keep_rs_t, 4, 1, 2>(GDN_LAUNCH_ARGS);
|
||||
break;
|
||||
- case 64: {
|
||||
- ggml_cuda_kernel_launch(gated_delta_net_cuda<64, KDA, keep_rs_t>, launch_params,
|
||||
- q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H,
|
||||
- n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
|
||||
- sb1, sb2, sb3, neqk1_magic, rq3_magic, scale, K, state_dst_d, ids_d, rs_head);
|
||||
+ case 64:
|
||||
+ launch_gdn_variant<64, KDA, keep_rs_t, 4, 1, 2>(GDN_LAUNCH_ARGS);
|
||||
break;
|
||||
- }
|
||||
case 128: {
|
||||
- ggml_cuda_kernel_launch(gated_delta_net_cuda<128, KDA, keep_rs_t>, launch_params,
|
||||
- q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H,
|
||||
- n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
|
||||
- sb1, sb2, sb3, neqk1_magic, rq3_magic, scale, K, state_dst_d, ids_d, rs_head);
|
||||
+ // Bit-exact occupancy/coalescing retune (patch 0022): fold COLS_PER_WARP columns per warp
|
||||
+ // to raise per-warp memory-level parallelism on this bandwidth-bound recurrence. Default is
|
||||
+ // the measured winner; GDN_NW / GDN_CPW override it for the one-build %peak sweep (every
|
||||
+ // selectable {num_warps, cols} is bit-identical, so the sweep cannot change the md5).
|
||||
+ static const int gdn_nw = []{ const char * e = getenv("GDN_NW"); return e ? atoi(e) : GDN_DEFAULT_NW; }();
|
||||
+ static const int gdn_cpw = []{ const char * e = getenv("GDN_CPW"); return e ? atoi(e) : GDN_DEFAULT_CPW; }();
|
||||
+ // NUM_WARPS in {4,8,16} x COLS_PER_WARP ladder (all <=512 threads/block, no 1024-thread
|
||||
+ // .minnctapersm warnings). Measured GB10 %peak: (4,1)=73 baseline ... (16,4)=82 ...
|
||||
+ // (16,8)=84.7 winner ~ tied with (8,8)/(8,16)/(32,4); the plateau is just above vLLM (82.4).
|
||||
+ if (gdn_nw == 4 && gdn_cpw == 1) launch_gdn_variant<128, KDA, keep_rs_t, 4, 1, 2>(GDN_LAUNCH_ARGS);
|
||||
+ else if (gdn_nw == 4 && gdn_cpw == 2) launch_gdn_variant<128, KDA, keep_rs_t, 4, 2, 2>(GDN_LAUNCH_ARGS);
|
||||
+ else if (gdn_nw == 4 && gdn_cpw == 4) launch_gdn_variant<128, KDA, keep_rs_t, 4, 4, 2>(GDN_LAUNCH_ARGS);
|
||||
+ else if (gdn_nw == 8 && gdn_cpw == 1) launch_gdn_variant<128, KDA, keep_rs_t, 8, 1, 2>(GDN_LAUNCH_ARGS);
|
||||
+ else if (gdn_nw == 8 && gdn_cpw == 2) launch_gdn_variant<128, KDA, keep_rs_t, 8, 2, 2>(GDN_LAUNCH_ARGS);
|
||||
+ else if (gdn_nw == 8 && gdn_cpw == 4) launch_gdn_variant<128, KDA, keep_rs_t, 8, 4, 2>(GDN_LAUNCH_ARGS);
|
||||
+ else if (gdn_nw == 8 && gdn_cpw == 8) launch_gdn_variant<128, KDA, keep_rs_t, 8, 8, 2>(GDN_LAUNCH_ARGS);
|
||||
+ else if (gdn_nw == 16 && gdn_cpw == 1) launch_gdn_variant<128, KDA, keep_rs_t, 16, 1, 2>(GDN_LAUNCH_ARGS);
|
||||
+ else if (gdn_nw == 16 && gdn_cpw == 2) launch_gdn_variant<128, KDA, keep_rs_t, 16, 2, 2>(GDN_LAUNCH_ARGS);
|
||||
+ else if (gdn_nw == 16 && gdn_cpw == 4) launch_gdn_variant<128, KDA, keep_rs_t, 16, 4, 2>(GDN_LAUNCH_ARGS);
|
||||
+ else if (gdn_nw == 16 && gdn_cpw == 8) launch_gdn_variant<128, KDA, keep_rs_t, 16, 8, 2>(GDN_LAUNCH_ARGS);
|
||||
+ else launch_gdn_variant<128, KDA, keep_rs_t, GDN_DEFAULT_NW, GDN_DEFAULT_CPW, 2>(GDN_LAUNCH_ARGS);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
break;
|
||||
}
|
||||
+
|
||||
+#undef GDN_LAUNCH_ARGS
|
||||
}
|
||||
|
||||
void ggml_cuda_op_gated_delta_net(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
--
|
||||
2.43.0
|
||||
|
||||
@@ -0,0 +1,144 @@
|
||||
From f7409c2de2868a6a048d3c333329468b4cc9e483 Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Thu, 25 Jun 2026 23:47:25 +0200
|
||||
Subject: [PATCH] feat(paged): qwen35moe NVFP4 activation-quantize de-dup
|
||||
(patch 0023)
|
||||
|
||||
Bit-exact decode/prefill lever for the MoE (qwen3.5moe) NVFP4 path. ggml's
|
||||
mul_mat_id quantizes the EXPERT-GATHERED activation rows (ne11_flat =
|
||||
ne12*n_expert_used). For the broadcast up/gate projections (ne11 == 1) every
|
||||
expert of a token receives the SAME token activation, so the stock path
|
||||
re-quantizes each token n_expert_used times. quantize_mmq_nvfp4 produces each
|
||||
block as a pure per-thread function of its 16 consecutive inputs (no cross-thread
|
||||
reduction), so the gathered blocks are byte-identical across the experts.
|
||||
|
||||
Lever: when ne11 == 1, quantize the ne12 UNIQUE token activations once, then
|
||||
gather the resulting block_fp4_mmq rows into the expert-gathered layout keyed by
|
||||
ids_src1 with a coalesced uint4 copy (block_fp4_mmq == 9 uint4 == 144 B). Pure
|
||||
byte copy of identical blocks, so the gathered buffer is byte-for-byte identical
|
||||
to re-quantizing each gathered row; the GEMM is untouched. down_proj
|
||||
(ne11 == n_expert_used, distinct per expert) keeps the stock path.
|
||||
|
||||
Measured GB10 (sm_121a), on top of HEAD 8a3229f (patch 0022), q36-35b-a3b-nvfp4:
|
||||
- nsys decode-isolated: quantize_mmq_nvfp4 868 -> 457 ms/run (-411 ms), new
|
||||
gather_mmq_fp4 +32 ms; net -379 ms of decode GPU-time.
|
||||
- S_TG npl128 745.2 -> 758.1 t/s (+1.73%), npl32 +0.6%; prefill T_PP -4%.
|
||||
- Dense q36-27b-nvfp4 byte-flat (no mul_mat_id): 373.24 t/s unchanged.
|
||||
|
||||
Bit-exact gate (greedy --temp 0 --seed 1 md5, byte-identical to 0022):
|
||||
q36-27b-nvfp4 5951a5b4d624ce891e22ab5fca9bc439 (unchanged)
|
||||
q36-35b-a3b-nvfp4 07db32c2bcb78d17a43ed18bc22705cd (de-dup on == off)
|
||||
test-backend-ops MUL_MAT 1115/1115, MUL_MAT_ID 805/805.
|
||||
|
||||
On by default; GGML_CUDA_MOE_QUANT_DEDUP=0 restores the stock re-quantize path.
|
||||
|
||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
||||
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
|
||||
---
|
||||
ggml/src/ggml-cuda/mmq.cu | 21 +++++++++++++++++--
|
||||
ggml/src/ggml-cuda/quantize.cu | 37 +++++++++++++++++++++++++++++++++
|
||||
ggml/src/ggml-cuda/quantize.cuh | 4 ++++
|
||||
3 files changed, 60 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
|
||||
index e1add5e..9933fa6 100644
|
||||
--- a/ggml/src/ggml-cuda/mmq.cu
|
||||
+++ b/ggml/src/ggml-cuda/mmq.cu
|
||||
@@ -1,3 +1,4 @@
|
||||
+#include <cstdlib>
|
||||
#include "common.cuh"
|
||||
#include "mmq.cuh"
|
||||
#include "quantize.cuh"
|
||||
@@ -197,8 +198,24 @@ void ggml_cuda_mul_mat_q(
|
||||
const int64_t s13 = src1->nb[3] / ts_src1;
|
||||
|
||||
if (use_native_fp4) {
|
||||
- quantize_mmq_fp4_cuda(src1_d, ids_src1.get(), src1_q8_1.get(), src0->type, ne10, s11, s12, s13,
|
||||
- ne10_padded, ne11_flat, ne12_flat, ne13_flat, stream);
|
||||
+ // 0023: de-dup the broadcast (up/gate) quantize. ne11==1 means src1 is shared
|
||||
+ // across experts, so quantize the ne12 unique tokens once and gather the blocks.
|
||||
+ static const bool moe_quant_dedup = []{
|
||||
+ const char * e = getenv("GGML_CUDA_MOE_QUANT_DEDUP");
|
||||
+ return e ? atoi(e) != 0 : true; // 0023: on by default; GGML_CUDA_MOE_QUANT_DEDUP=0 disables
|
||||
+ }();
|
||||
+ if (moe_quant_dedup && ne11 == 1) {
|
||||
+ const size_t nbytes_unique = ne12*ne10_padded * sizeof(block_q8_1)/QK8_1 +
|
||||
+ get_mmq_x_max_host(cc)*sizeof(block_q8_1_mmq);
|
||||
+ ggml_cuda_pool_alloc<char> src1_unique(ctx.pool(), nbytes_unique);
|
||||
+ quantize_mmq_fp4_cuda(src1_d, nullptr, src1_unique.get(), src0->type, ne10, s12, 0, 0,
|
||||
+ ne10_padded, ne12, 1, 1, stream);
|
||||
+ gather_mmq_fp4_cuda(src1_unique.get(), ids_src1.get(), src1_q8_1.get(),
|
||||
+ ne11_flat, ne12, ne10_padded, stream);
|
||||
+ } else {
|
||||
+ quantize_mmq_fp4_cuda(src1_d, ids_src1.get(), src1_q8_1.get(), src0->type, ne10, s11, s12, s13,
|
||||
+ ne10_padded, ne11_flat, ne12_flat, ne13_flat, stream);
|
||||
+ }
|
||||
} else {
|
||||
quantize_mmq_q8_1_cuda(src1_d, ids_src1.get(), src1_q8_1.get(), src0->type, ne10, s11, s12, s13,
|
||||
ne10_padded, ne11_flat, ne12_flat, ne13_flat, stream);
|
||||
diff --git a/ggml/src/ggml-cuda/quantize.cu b/ggml/src/ggml-cuda/quantize.cu
|
||||
index 39a500a..a7fd86f 100644
|
||||
--- a/ggml/src/ggml-cuda/quantize.cu
|
||||
+++ b/ggml/src/ggml-cuda/quantize.cu
|
||||
@@ -419,6 +419,43 @@ void quantize_mmq_q8_1_cuda(
|
||||
}
|
||||
}
|
||||
|
||||
+// MoE NVFP4 quantize de-dup (0023): for the broadcast (up/gate) expert matmuls every
|
||||
+// gathered row references one of ne12 unique token activations, so the stock path
|
||||
+// re-quantizes each token n_expert_used times. Quantize the unique tokens once, then copy
|
||||
+// the resulting block_fp4_mmq rows into the expert-gathered layout keyed by ids. This is a
|
||||
+// pure byte copy of identical blocks => the gathered buffer is byte-identical to stock.
|
||||
+static __global__ void gather_mmq_fp4(
|
||||
+ const uint4 * __restrict__ unique, const int32_t * __restrict__ ids,
|
||||
+ uint4 * __restrict__ gathered, const int ne11_flat, const int ne12_unique,
|
||||
+ const int64_t total_words) {
|
||||
+ constexpr int W = (int) (sizeof(block_fp4_mmq) / sizeof(uint4)); // 9 uint4 per 144B block
|
||||
+ const int64_t t = (int64_t) blockIdx.x * blockDim.x + threadIdx.x;
|
||||
+ if (t >= total_words) {
|
||||
+ return;
|
||||
+ }
|
||||
+ const int w = (int) (t % W);
|
||||
+ const int64_t ib = t / W; // destination block index = kb*ne11_flat + j
|
||||
+ const int j = (int) (ib % ne11_flat);
|
||||
+ const int kb = (int) (ib / ne11_flat);
|
||||
+ const int src = ids[j];
|
||||
+ const int64_t ib_u = (int64_t) kb * ne12_unique + src;
|
||||
+ gathered[t] = unique[ib_u * W + w];
|
||||
+}
|
||||
+
|
||||
+void gather_mmq_fp4_cuda(
|
||||
+ const void * unique, const int32_t * ids, void * gathered,
|
||||
+ int64_t ne11_flat, int64_t ne12_unique, int64_t ne0_padded, cudaStream_t stream) {
|
||||
+ const int blocks_per_col = (int) ((ne0_padded + QK_K - 1) / QK_K);
|
||||
+ constexpr int W = (int) (sizeof(block_fp4_mmq) / sizeof(uint4));
|
||||
+ const int64_t total_words = ne11_flat * (int64_t) blocks_per_col * W;
|
||||
+ const int bs = 256;
|
||||
+ const dim3 block_size(bs, 1, 1);
|
||||
+ const dim3 num_blocks((unsigned) ((total_words + bs - 1) / bs), 1, 1);
|
||||
+ gather_mmq_fp4<<<num_blocks, block_size, 0, stream>>>(
|
||||
+ (const uint4 *) unique, ids, (uint4 *) gathered,
|
||||
+ (int) ne11_flat, (int) ne12_unique, total_words);
|
||||
+}
|
||||
+
|
||||
void quantize_mmq_fp4_cuda(
|
||||
const float * x, const int32_t * ids, void * vy, const ggml_type type_src0,
|
||||
const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
|
||||
diff --git a/ggml/src/ggml-cuda/quantize.cuh b/ggml/src/ggml-cuda/quantize.cuh
|
||||
index 768a3ae..7f64069 100644
|
||||
--- a/ggml/src/ggml-cuda/quantize.cuh
|
||||
+++ b/ggml/src/ggml-cuda/quantize.cuh
|
||||
@@ -26,6 +26,10 @@ void quantize_mmq_q8_1_cuda(
|
||||
ggml_type type_src0, int64_t ne00, int64_t s01, int64_t s02, int64_t s03,
|
||||
int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3, cudaStream_t stream);
|
||||
|
||||
+void gather_mmq_fp4_cuda(const void * unique, const int32_t * ids, void * gathered,
|
||||
+ int64_t ne11_flat, int64_t ne12_unique, int64_t ne0_padded,
|
||||
+ cudaStream_t stream);
|
||||
+
|
||||
void quantize_mmq_fp4_cuda(const float * x,
|
||||
const int32_t * ids,
|
||||
void * vy,
|
||||
--
|
||||
2.43.0
|
||||
|
||||
@@ -0,0 +1,357 @@
|
||||
From a8a9d129ae2226a08a12c30ece697865c0fc85c4 Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Fri, 26 Jun 2026 12:41:49 +0200
|
||||
Subject: [PATCH] feat(paged): paged-pool burst-reclaim (truncate + defrag +
|
||||
slot release) (patch 0024)
|
||||
|
||||
Fixes the paged-pool burst-degradation bug (OTHER_PATHS_INVESTIGATION.md section C
|
||||
Part 2): on a long-lived llama-server with LLAMA_KV_PAGED=1, a high-fan-out prefill
|
||||
burst strands KV blocks in the host-side paged pool, so a later lower-npl prefill
|
||||
draws from a depleted/fragmented pool and its throughput collapses (the benchmark's
|
||||
"restart per npl" crutch). Decode is unaffected. The fix changes only host-side
|
||||
block accounting and placement, never KV values or compute, and is gated behind
|
||||
LLAMA_KV_PAGED (LLAMA_PAGED_NO_RECLAIM=1 restores the pre-fix behavior).
|
||||
|
||||
Fix-1 reclaim trailing blocks: PagedKVManager::truncate(seq, n_keep) frees every
|
||||
block beyond ceil(n_keep/bs) (ref-counted); called from llama_kv_cache::seq_rm for
|
||||
the p1==MAX && p0>0 partial-tail case so the manager tracks the kv-cache exactly.
|
||||
Fix-2 defrag on empty: when the pool is fully idle, defrag_free_pool() relinks the
|
||||
free queue into ascending block-id order (FreeBlockQueue::rebuild), preserving
|
||||
content-cache hashes.
|
||||
Fix-3 release on slot completion: server_slot::release() issues prompt_clear()
|
||||
under the paged engine so a finished-idle slot returns its blocks promptly.
|
||||
|
||||
Validation (DGX GB10, q36-27b-nvfp4 = qwen35 hybrid; HEAD f7409c2 = patch 0023):
|
||||
- Bit-exact: greedy md5 identical across paged off / paged on / paged on+NO_RECLAIM
|
||||
(5951a5b4d624ce891e22ab5fca9bc439), == the 0023 baseline. test-backend-ops
|
||||
unaffected (no ggml op touched).
|
||||
- Host unit test: truncate reclaims exactly 16 trailing blocks; defrag restores
|
||||
ascending popleft order. UNIT PASS.
|
||||
- Model A/B (one binary, NO_RECLAIM): fragmentation prefill ratio 0.944 -> 0.998;
|
||||
64 idle slots strand 2048 blocks, reclaim returns the pool to fresh (2527).
|
||||
- Server A/B (FRESH-npl8 -> BURST-npl64 -> POST-npl8): POST-npl8 prefill collapses
|
||||
488 -> 44 t/s with NO_RECLAIM (the bug; investigation saw 507 -> 65), restored to
|
||||
532 t/s (fresh 525, within 1%) with the fix. Paged release-log count 17 -> 96
|
||||
(Fix-3 fires per slot completion). Canary tokens identical fresh-vs-post in both
|
||||
arms (bit-exact serving).
|
||||
|
||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
||||
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
|
||||
---
|
||||
src/llama-kv-cache.cpp | 13 ++++++++++
|
||||
src/paged-alloc.cpp | 31 +++++++++++++++++++++++
|
||||
src/paged-alloc.h | 18 +++++++++++++
|
||||
src/paged-kv-manager.cpp | 45 +++++++++++++++++++++++++++++++++
|
||||
src/paged-kv-manager.h | 24 ++++++++++++++++++
|
||||
src/paged-prefix-api.cpp | 8 ++++++
|
||||
src/paged-prefix-api.h | 6 +++++
|
||||
tools/server/server-context.cpp | 17 +++++++++++++
|
||||
8 files changed, 162 insertions(+)
|
||||
|
||||
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
|
||||
index 0351f86..21b8f1e 100644
|
||||
--- a/src/llama-kv-cache.cpp
|
||||
+++ b/src/llama-kv-cache.cpp
|
||||
@@ -425,6 +425,19 @@ bool llama_kv_cache::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
|
||||
}
|
||||
}
|
||||
|
||||
+ // [paged 0024 Fix-1] Reclaim trailing blocks on a partial TAIL truncation
|
||||
+ // (p1 == MAX, p0 > 0). llama-server issues seq_rm(slot, n_past, -1) on every
|
||||
+ // reused slot and before a cross-request prefix splice; the kv-cache frees the
|
||||
+ // cells [p0, end) but, without this, the paged manager keeps owning those
|
||||
+ // blocks - the reclamation gap that leaks and fragments the pool across a
|
||||
+ // burst. truncate() frees the blocks beyond ceil(p0/bs) so the manager's
|
||||
+ // accounting tracks the kv-cache exactly. Gated so LLAMA_PAGED_NO_RECLAIM
|
||||
+ // restores the pre-fix behavior for A/B.
|
||||
+ if (paged_alloc::active() && paged_alloc::reclaim_active() && seq_id >= 0 &&
|
||||
+ p0 > 0 && p1 == std::numeric_limits<llama_pos>::max()) {
|
||||
+ paged_alloc::truncate(this, (int) seq_to_stream[seq_id], (int) seq_id, (uint32_t) p0);
|
||||
+ }
|
||||
+
|
||||
if (seq_id >= 0) {
|
||||
auto & cells = v_cells[seq_to_stream[seq_id]];
|
||||
auto & head = v_heads[seq_to_stream[seq_id]];
|
||||
diff --git a/src/paged-alloc.cpp b/src/paged-alloc.cpp
|
||||
index c1027fb..ba98dd5 100644
|
||||
--- a/src/paged-alloc.cpp
|
||||
+++ b/src/paged-alloc.cpp
|
||||
@@ -14,6 +14,11 @@ bool active() {
|
||||
return a;
|
||||
}
|
||||
|
||||
+bool reclaim_active() {
|
||||
+ static const bool off = (std::getenv("LLAMA_PAGED_NO_RECLAIM") != nullptr);
|
||||
+ return !off;
|
||||
+}
|
||||
+
|
||||
static bool debug() {
|
||||
static const bool d = (std::getenv("LLAMA_KV_PAGED_DEBUG") != nullptr);
|
||||
return d;
|
||||
@@ -124,12 +129,28 @@ void commit(const void * cache, int stream, int seq,
|
||||
}
|
||||
}
|
||||
|
||||
+void truncate(const void * cache, int stream, int seq, uint32_t n_keep) {
|
||||
+ paged::PagedKVManager * mgr = find_mgr(cache, stream);
|
||||
+ if (!mgr) {
|
||||
+ return;
|
||||
+ }
|
||||
+ mgr->truncate(seq, (size_t) n_keep); // Fix-1: reclaim trailing blocks
|
||||
+ mgr->defrag_free_pool(); // Fix-2: compact iff the pool emptied
|
||||
+ if (debug()) {
|
||||
+ fprintf(stderr, "[paged-alloc] truncate cache=%p stream=%d seq=%d keep<=%u (free=%zu)\n",
|
||||
+ cache, stream, seq, n_keep, mgr->num_free_blocks());
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
void release(const void * cache, int stream, int seq) {
|
||||
paged::PagedKVManager * mgr = find_mgr(cache, stream);
|
||||
if (!mgr) {
|
||||
return;
|
||||
}
|
||||
mgr->free(seq); // ref-counted: shared blocks survive while another seq holds them
|
||||
+ if (reclaim_active()) {
|
||||
+ mgr->defrag_free_pool(); // Fix-2: compact iff the pool emptied
|
||||
+ }
|
||||
if (debug()) {
|
||||
fprintf(stderr, "[paged-alloc] released cache=%p stream=%d seq=%d (free=%zu)\n",
|
||||
cache, stream, seq, mgr->num_free_blocks());
|
||||
@@ -163,4 +184,14 @@ size_t num_free(const void * cache, int stream) {
|
||||
return mgr ? mgr->num_free_blocks() : 0;
|
||||
}
|
||||
|
||||
+size_t num_free_global() {
|
||||
+ size_t total = 0;
|
||||
+ for (auto & kv : g_managers) total += kv.second->num_free_blocks();
|
||||
+ return total;
|
||||
+}
|
||||
+
|
||||
+size_t num_managers() {
|
||||
+ return g_managers.size();
|
||||
+}
|
||||
+
|
||||
} // namespace paged_alloc
|
||||
diff --git a/src/paged-alloc.h b/src/paged-alloc.h
|
||||
index 88dedef..bfaf45b 100644
|
||||
--- a/src/paged-alloc.h
|
||||
+++ b/src/paged-alloc.h
|
||||
@@ -31,6 +31,12 @@ namespace paged_alloc {
|
||||
// true iff env LLAMA_KV_PAGED is set (evaluated once).
|
||||
bool active();
|
||||
|
||||
+// [paged 0024] The burst-reclaim fix (truncate + defrag-on-empty + slot release)
|
||||
+// is on by default whenever the paged engine is active. LLAMA_PAGED_NO_RECLAIM=1
|
||||
+// restores the pre-fix behavior (no trailing-block reclaim, no compaction) for
|
||||
+// A/B measurement. Evaluated once; only the variable's presence is checked, so any value (even 0) disables reclaim.
|
||||
+bool reclaim_active();
|
||||
+
|
||||
// Place n_tokens logical positions [base, base+n_tokens) of (cache,stream,seq)
|
||||
// on demand, appending their physical cell indices to `out`. pool_blocks =
|
||||
// cells.size()/block_size is the stream's block budget. Returns false (leaving
|
||||
@@ -58,6 +64,12 @@ int64_t slot(const void * cache, int stream, int seq, int pos);
|
||||
void commit(const void * cache, int stream, int seq,
|
||||
const std::vector<int> & tokens, uint32_t block_size, uint32_t pool_blocks);
|
||||
|
||||
+// [paged 0024 Fix-1] Reclaim the trailing blocks of (cache,stream,seq) beyond
|
||||
+// logical position n_keep (ref-counted), mirroring a partial kv-cache seq_rm
|
||||
+// [n_keep, end). When the stream's pool becomes fully idle (every non-null block free) as a result, its free queue is
|
||||
+// defragged to pristine contiguous order (Fix-2). No-op if no manager exists.
|
||||
+void truncate(const void * cache, int stream, int seq, uint32_t n_keep);
|
||||
+
|
||||
// Return one sequence's blocks to the pool (ref-counted; sequence end).
|
||||
void release(const void * cache, int stream, int seq);
|
||||
|
||||
@@ -69,4 +81,10 @@ void release_all(const void * cache);
|
||||
int ref_cnt_at(const void * cache, int stream, int seq, int pos, uint32_t block_size);
|
||||
size_t num_free(const void * cache, int stream);
|
||||
|
||||
+// [paged 0024] Total free blocks summed across every live manager (all caches /
|
||||
+// streams). Wrapper-agnostic, so it reports the real pool for hybrid / iSWA
|
||||
+// models whose outer memory is not a llama_kv_cache. Diagnostics only.
|
||||
+size_t num_free_global();
|
||||
+size_t num_managers();
|
||||
+
|
||||
} // namespace paged_alloc
|
||||
diff --git a/src/paged-kv-manager.cpp b/src/paged-kv-manager.cpp
|
||||
index 4c6ee4c..738b332 100644
|
||||
--- a/src/paged-kv-manager.cpp
|
||||
+++ b/src/paged-kv-manager.cpp
|
||||
@@ -104,6 +104,22 @@ void FreeBlockQueue::prepend_n(const std::vector<KVCacheBlock*>& blocks) {
|
||||
num_free_blocks += blocks.size();
|
||||
}
|
||||
|
||||
+void FreeBlockQueue::rebuild(const std::vector<KVCacheBlock*>& blocks) {
|
||||
+ // Relink the intrusive list using THIS queue's stable fake head/tail nodes.
|
||||
+ num_free_blocks = blocks.size();
|
||||
+ for (size_t i = 0; i < blocks.size(); ++i) {
|
||||
+ blocks[i]->prev_free = (i == 0) ? &fake_head : blocks[i - 1];
|
||||
+ blocks[i]->next_free = (i + 1 < blocks.size()) ? blocks[i + 1] : &fake_tail;
|
||||
+ }
|
||||
+ if (!blocks.empty()) {
|
||||
+ fake_head.next_free = blocks.front();
|
||||
+ fake_tail.prev_free = blocks.back();
|
||||
+ } else {
|
||||
+ fake_head.next_free = &fake_tail;
|
||||
+ fake_tail.prev_free = &fake_head;
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
std::vector<KVCacheBlock*> FreeBlockQueue::get_all_free_blocks() const {
|
||||
std::vector<KVCacheBlock*> ret;
|
||||
const KVCacheBlock* curr = fake_head.next_free;
|
||||
@@ -199,6 +215,20 @@ void BlockPool::cache_full_blocks(const std::vector<KVCacheBlock*>& req_blocks,
|
||||
}
|
||||
}
|
||||
|
||||
+void BlockPool::defrag_free_queue() {
|
||||
+ // Pool is fully idle: every non-null block is free (ref_cnt 0). Rebuild the
|
||||
+ // free list in ascending block_id order so popleft hands out physically
|
||||
+ // contiguous blocks again. Hashes / the content-cache map are left intact so
|
||||
+ // a warm committed prefix stays re-hittable.
|
||||
+ std::vector<KVCacheBlock*> ordered;
|
||||
+ ordered.reserve(ptrs_.size());
|
||||
+ for (KVCacheBlock* b : ptrs_) {
|
||||
+ if (b->is_null) continue;
|
||||
+ ordered.push_back(b);
|
||||
+ }
|
||||
+ free_queue_.rebuild(ordered);
|
||||
+}
|
||||
+
|
||||
// ---------------------------------------------------------------------------
|
||||
// PagedKVManager (port of SingleTypeKVCacheManager / FullAttentionManager)
|
||||
// ---------------------------------------------------------------------------
|
||||
@@ -250,6 +280,21 @@ void PagedKVManager::free(int seq_id) {
|
||||
req_to_blocks_.erase(it);
|
||||
}
|
||||
|
||||
+void PagedKVManager::truncate(int seq_id, size_t n_keep) {
|
||||
+ auto it = req_to_blocks_.find(seq_id);
|
||||
+ if (it == req_to_blocks_.end()) return;
|
||||
+ auto & blocks = it->second;
|
||||
+ const size_t keep = cdiv(n_keep, block_size_); // blocks covering [0, n_keep)
|
||||
+ if (keep >= blocks.size()) return; // nothing trailing to reclaim
|
||||
+ // Free the trailing blocks [keep, end) tail-first (vLLM eviction order). Their
|
||||
+ // cells were just cleared by the partial seq_rm, so they are safe to reuse.
|
||||
+ std::vector<KVCacheBlock*> ordered(blocks.rbegin(),
|
||||
+ blocks.rbegin() + (blocks.size() - keep));
|
||||
+ pool_.free_blocks(ordered);
|
||||
+ blocks.resize(keep);
|
||||
+ if (blocks.empty()) req_to_blocks_.erase(it);
|
||||
+}
|
||||
+
|
||||
// FNV-1a chained block hash. Deterministic and prefix-sensitive; folds the parent
|
||||
// hash into the seed so each block hash transitively encodes its whole prefix
|
||||
// (behavioral parity with vLLM hash_block_tokens chaining; vLLM uses sha256 bytes).
|
||||
diff --git a/src/paged-kv-manager.h b/src/paged-kv-manager.h
|
||||
index 34decbc..e410d58 100644
|
||||
--- a/src/paged-kv-manager.h
|
||||
+++ b/src/paged-kv-manager.h
|
||||
@@ -47,6 +47,11 @@ public:
|
||||
void append_n(const std::vector<KVCacheBlock*>& blocks);
|
||||
void prepend_n(const std::vector<KVCacheBlock*>& blocks);
|
||||
std::vector<KVCacheBlock*> get_all_free_blocks() const;
|
||||
+ // [paged 0024 Fix-2] Relink the intrusive free list to the given order using
|
||||
+ // THIS queue's fake head/tail (the nodes' addresses are stable; a temporary
|
||||
+ // FreeBlockQueue would leave dangling fake-node pointers). Used to restore a
|
||||
+ // pristine, contiguous popleft order after a fragmenting burst drains.
|
||||
+ void rebuild(const std::vector<KVCacheBlock*>& blocks);
|
||||
|
||||
private:
|
||||
KVCacheBlock fake_head{-1};
|
||||
@@ -67,6 +72,14 @@ public:
|
||||
size_t num_cached_blocks, size_t num_full_blocks,
|
||||
const std::vector<uint64_t>& block_hashes);
|
||||
size_t get_num_free_blocks() const { return free_queue_.num_free_blocks; }
|
||||
+ // [paged 0024 Fix-2] Total non-null blocks, and whether the pool is fully
|
||||
+ // idle (every non-null block back in the free queue). defrag_free_queue()
|
||||
+ // relinks the free queue into pristine ascending-block-id order; only valid
|
||||
+ // when all_free() so no live request's block table is disturbed. Block hashes
|
||||
+ // are preserved, so a warm committed prefix stays re-hittable.
|
||||
+ size_t total_blocks() const { return blocks_.size(); }
|
||||
+ bool all_free() const { return free_queue_.num_free_blocks + 1 == blocks_.size(); }
|
||||
+ void defrag_free_queue();
|
||||
|
||||
private:
|
||||
bool maybe_evict_cached_block(KVCacheBlock* block);
|
||||
@@ -94,6 +107,17 @@ public:
|
||||
void free(int seq_id);
|
||||
int block_size() const { return block_size_; }
|
||||
|
||||
+ // [paged 0024 Fix-1] Reclaim the trailing blocks of seq_id beyond logical
|
||||
+ // position n_keep: free every block at index >= ceil(n_keep/bs) (ref-counted,
|
||||
+ // mirroring vLLM's free of a truncated block suffix). Called on a partial tail
|
||||
+ // seq_rm [n_keep, end) so the manager's block accounting tracks the kv-cache
|
||||
+ // exactly instead of stranding the blocks whose cells were just cleared.
|
||||
+ void truncate(int seq_id, size_t n_keep);
|
||||
+
|
||||
+ // [paged 0024 Fix-2] When no live request holds a block, relink the free
|
||||
+ // queue into pristine contiguous order (undo a burst's scrambled free order).
|
||||
+ void defrag_free_pool() { if (pool_.all_free()) pool_.defrag_free_queue(); }
|
||||
+
|
||||
// Prefix caching (win 3).
|
||||
static uint64_t hash_block(uint64_t parent_hash, const std::vector<int>& token_ids);
|
||||
std::vector<uint64_t> compute_block_hashes(const std::vector<int>& token_ids) const;
|
||||
diff --git a/src/paged-prefix-api.cpp b/src/paged-prefix-api.cpp
|
||||
index 8573cd2..209cee8 100644
|
||||
--- a/src/paged-prefix-api.cpp
|
||||
+++ b/src/paged-prefix-api.cpp
|
||||
@@ -45,4 +45,12 @@ long num_free(llama_context * ctx) {
|
||||
return (long) paged_alloc::num_free((const void *) kv, /*stream=*/0);
|
||||
}
|
||||
|
||||
+long num_free_global() {
|
||||
+ return (long) paged_alloc::num_free_global();
|
||||
+}
|
||||
+
|
||||
+long num_managers() {
|
||||
+ return (long) paged_alloc::num_managers();
|
||||
+}
|
||||
+
|
||||
} // namespace paged_prefix_api
|
||||
diff --git a/src/paged-prefix-api.h b/src/paged-prefix-api.h
|
||||
index 78a3864..8dd817e 100644
|
||||
--- a/src/paged-prefix-api.h
|
||||
+++ b/src/paged-prefix-api.h
|
||||
@@ -24,4 +24,10 @@ int ref_at(llama_context * ctx, llama_seq_id seq, int pos);
|
||||
// Number of free blocks in the unified stream-0 pool, or 0 if no manager.
|
||||
long num_free(llama_context * ctx);
|
||||
|
||||
+// [paged 0024] Total free blocks across every live paged manager (all caches /
|
||||
+// streams). Wrapper-agnostic, so it reports the real pool for hybrid / iSWA
|
||||
+// models whose outer memory is not a llama_kv_cache. Diagnostics only.
|
||||
+long num_free_global();
|
||||
+long num_managers();
|
||||
+
|
||||
} // namespace paged_prefix_api
|
||||
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
|
||||
index f7a114c..8c19cfb 100644
|
||||
--- a/tools/server/server-context.cpp
|
||||
+++ b/tools/server/server-context.cpp
|
||||
@@ -411,6 +411,23 @@ struct server_slot {
|
||||
|
||||
reset();
|
||||
|
||||
+ // [paged 0024 Fix-3] Return this finished slot's paged blocks to the
|
||||
+ // pool promptly. Stock llama-server keeps an idle slot's KV for its own
|
||||
+ // next-prompt cache, but under the paged engine that strands blocks in
|
||||
+ // idle slots after a high-fan-out burst, so a later low-npl run sees a
|
||||
+ // depleted, fragmented pool and its prefill collapses. prompt_clear()
|
||||
+ // issues a full seq_rm (clearing the cells AND, via the paged hook,
|
||||
+ // releasing + defragging the blocks) and clears the slot-local prompt
|
||||
+ // cache so the next reuse recomputes from a pristine pool; cross-request
|
||||
+ // reuse still works through the committed paged content cache. Gated on
|
||||
+ // LLAMA_KV_PAGED (LLAMA_PAGED_NO_RECLAIM opts out for A/B); stock
|
||||
+ // (paged off) is byte-identical.
|
||||
+ static const bool paged_release_on_idle =
|
||||
+ getenv("LLAMA_KV_PAGED") != nullptr && getenv("LLAMA_PAGED_NO_RECLAIM") == nullptr;
|
||||
+ if (paged_release_on_idle && prompt.n_tokens() > 0) {
|
||||
+ prompt_clear(false);
|
||||
+ }
|
||||
+
|
||||
callback_on_release(id);
|
||||
}
|
||||
}
|
||||
--
|
||||
2.43.0
|
||||
|
||||
@@ -0,0 +1,56 @@
|
||||
From 2f4f5ab7c9050f890ee1137ef9c8ee09dfcd9ae7 Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Fri, 26 Jun 2026 16:52:21 +0200
|
||||
Subject: [PATCH] feat(paged): qwen35moe NVFP4 MoE-decode re-graph
|
||||
(should_use_mmq graph-safe id-path) (patch 0025)
|
||||
|
||||
The MUL_MAT_ID CUDA-graph guard (ggml-cuda.cu [TAG_MUL_MAT_ID_CUDA_GRAPHS]) disables CUDA graphs for
|
||||
the whole decode step whenever a MUL_MAT_ID node has ne[2] > mmvq_mmid_max (8 for NVFP4 on sm_121),
|
||||
because the per-expert host-loop fallback synchronizes the stream. But on Blackwell NVFP4 the path
|
||||
actually taken is should_use_mmq()==true -> the grouped stream-k mul_mat_q id-branch, which launches
|
||||
on one stream with NO host sync (no cudaStreamSynchronize/Memcpy in mmq.cu/mmid.cu). The disable is
|
||||
therefore conservative; graphs are safe for the grouped path.
|
||||
|
||||
Env-gated (LLAMA_MOE_FORCE_GRAPHS, default-off = byte-identical to stock): when set and the node
|
||||
takes the grouped MMQ path, keep CUDA graphs on for the MoE decode step.
|
||||
|
||||
Measured (DGX GB10 sm_121, q36-35b-a3b-nvfp4, llama-batched-bench -fa on -npp128 -ntg128, decode_agg):
|
||||
npl 8 226.0 -> 226.4 +0.2% (noise; ne2<=8 already on the MMVQ-graphed path)
|
||||
npl 32 433.8 -> 452.7 +4.4%
|
||||
npl 64 589.0 -> 605.9 +2.9%
|
||||
npl 128 743.1 -> 757.1 +1.9%
|
||||
|
||||
Bit-exact (graph replay re-issues identical kernels): test-backend-ops MUL_MAT_ID 806/806 CUDA0 OK;
|
||||
parallel-greedy np16 (ne2=16>8) generated content byte-identical ON==OFF.
|
||||
|
||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
||||
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
|
||||
---
|
||||
ggml/src/ggml-cuda/ggml-cuda.cu | 12 +++++++++++-
|
||||
1 file changed, 11 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
index cca7059..254d2e0 100644
|
||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
@@ -3275,7 +3275,17 @@ static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) {
|
||||
if (node->op == GGML_OP_MUL_MAT_ID) {
|
||||
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
|
||||
const int mmvq_mmid_max = get_mmvq_mmid_max_batch(node->src[0]->type, cc);
|
||||
- if (!ggml_is_quantized(node->src[0]->type) || node->ne[2] > mmvq_mmid_max) {
|
||||
+ bool mmid_needs_sync = !ggml_is_quantized(node->src[0]->type) || node->ne[2] > mmvq_mmid_max;
|
||||
+ // PROBE (bit-exact, env LLAMA_MOE_FORCE_GRAPHS): the grouped stream-k MMQ id-path is
|
||||
+ // launched on-stream with no host sync (only the per-expert host-loop fallback syncs);
|
||||
+ // when should_use_mmq() is true (Blackwell NVFP4 grouped path) the op is graph-safe
|
||||
+ // even for ne[2] > mmvq_mmid_max, so graphs need not be disabled for the whole step.
|
||||
+ if (mmid_needs_sync && ggml_is_quantized(node->src[0]->type) &&
|
||||
+ getenv("LLAMA_MOE_FORCE_GRAPHS") != nullptr &&
|
||||
+ ggml_cuda_should_use_mmq(node->src[0]->type, cc, node->src[1]->ne[2], node->src[0]->ne[2])) {
|
||||
+ mmid_needs_sync = false;
|
||||
+ }
|
||||
+ if (mmid_needs_sync) {
|
||||
// under these conditions, the mul_mat_id operation will need to synchronize the stream, so we cannot use CUDA graphs
|
||||
// TODO: figure out a way to enable for larger batch sizes, without hurting performance
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/18958
|
||||
--
|
||||
2.43.0
|
||||
@@ -0,0 +1,578 @@
|
||||
From fafe8785c8595f53a51efec20cf84f9146437e0c Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Fri, 26 Jun 2026 22:58:47 +0200
|
||||
Subject: [PATCH] feat(paged): qwen35 recurrent-state gather fusion (patch
|
||||
0028)
|
||||
|
||||
The MoE-gap groundtruth found k_get_rows_float to be the single biggest decode
|
||||
kernel vLLM has no equivalent of (~5.2 ms/step MoE; also dense): vLLM updates its
|
||||
gated-DeltaNet recurrent state in place, while llama ran a separate ggml_get_rows
|
||||
gather. Patch 0019 fused the SSM-state gather; patch 0021 fused the conv compute
|
||||
but kept a build_rs gather for the conv taps. This closes that residual.
|
||||
|
||||
nsys located the residual k_get_rows as the conv-state tap gather in
|
||||
build_conv_state_fused: a 24576-float (= n_embd_r = (d_conv-1)*(d_inner +
|
||||
2*n_group*d_state)) row x 128 sequences, once per GDN layer per decode step
|
||||
(~720 big ~115 us gathers / 24-step window). The SSM-state gather is already
|
||||
fused by 0019, so this conv gather is the last k_get_rows in the GDN decode path.
|
||||
|
||||
New op ggml_ssm_conv_update_inplace_ids (reuses GGML_OP_SSM_CONV, discriminated
|
||||
by a non-null src[4] = ids) takes the FULL conv cache + the s_copy ids and reads
|
||||
each active sequence's prior taps directly from cache[ids[s]] in the kernel (no
|
||||
ggml_get_rows). Identity sequences (ids[s] == rs_head + s, the AR-decode path)
|
||||
read in place from the conv_state_dst write slot (the whole window is loaded into
|
||||
registers before the ring write-back, so read==write is race-free); non-identity
|
||||
sequences (reorder / rs_zero) are gathered into a disjoint scratch by a small
|
||||
ssm_conv_gather_nonident_kernel first. Mirrors the 0019 in-place + ids gather
|
||||
fusion. The read VALUES are unchanged; only the read path (gather -> indexed
|
||||
in-kernel read) changes, so it is bit-identical to the build_rs gather + 0021 op.
|
||||
|
||||
build_conv_state_fused now feeds the full cache + ids through the build_rs
|
||||
get_state_rows lambda (rs_zero clear + extra-states copy still run around it).
|
||||
Helps BOTH dense and MoE (shared GDN conv path).
|
||||
|
||||
GATE test-backend-ops (CUDA0 vs CPU, 2/2 backends): SSM_CONV_UPDATE_IDS OK (new),
|
||||
SSM_CONV_UPDATE OK, SSM_CONV OK, GATED_DELTA_NET OK, GET_ROWS OK.
|
||||
|
||||
GATE greedy md5 (--temp 0 --seed 1 -n 48) BYTE-IDENTICAL both models:
|
||||
q36-27b-nvfp4 5951a5b4d624ce891e22ab5fca9bc439, q36-35b-a3b-nvfp4
|
||||
07db32c2bcb78d17a43ed18bc22705cd (== baseline).
|
||||
|
||||
nsys: k_get_rows_float float,float 10174 -> 9454 instances (720 fewer = 30 GDN
|
||||
layers x 24 steps), 186.3 -> 102.8 ms; the 720 ~115 us conv gathers are replaced by
|
||||
720 x ~1.1 us no-op ssm_conv_gather_nonident launches (all identity at steady decode).
|
||||
MoE npl128 783.9 t/s (step 163.3 ms vs MOE_GAP 169.8 ms @0025), dense 377.3 t/s.
|
||||
|
||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
||||
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
|
||||
---
|
||||
ggml/include/ggml.h | 20 ++++
|
||||
ggml/src/ggml-cpu/ops.cpp | 90 +++++++++++++++++-
|
||||
ggml/src/ggml-cuda/ssm-conv.cu | 155 ++++++++++++++++++++++++++++++-
|
||||
ggml/src/ggml.c | 62 +++++++++++++
|
||||
src/models/delta-net-base.cpp | 26 ++++--
|
||||
tests/test-backend-ops.cpp | 69 ++++++++++++++
|
||||
6 files changed, 411 insertions(+), 11 deletions(-)
|
||||
|
||||
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
|
||||
index 2a5cbce..5fa220a 100644
|
||||
--- a/ggml/include/ggml.h
|
||||
+++ b/ggml/include/ggml.h
|
||||
@@ -2463,6 +2463,26 @@ extern "C" {
|
||||
struct ggml_tensor * conv_state_dst,
|
||||
bool fuse_silu);
|
||||
|
||||
+ // Gather-free variant of ggml_ssm_conv_update_inplace (patch 0028). Instead of a pre-gathered
|
||||
+ // per-sequence tap scratch, it takes the FULL conv-state cache (`conv_states` = [K-1, channels,
|
||||
+ // n_cells]) plus the per-sequence `ids` ([n_seqs], I32, = the recurrent-state s_copy) and reads
|
||||
+ // each active sequence's prior taps directly from cache[ids[s]] inside the kernel -- no
|
||||
+ // ggml_get_rows materialization (mirrors ggml_gated_delta_net_inplace_ids). Identity sequences
|
||||
+ // (ids[s] == rs_head + s) are read in place from `conv_state_dst` (the write slot); any
|
||||
+ // non-identity sequence (reorder / rs_zero remap) is gathered into a disjoint scratch by the
|
||||
+ // backend first, so the read never aliases another sequence's in-place ring write -> race-free
|
||||
+ // and bit-identical to the get_rows + ggml_ssm_conv_update_inplace path. op_params[0]=fuse_silu,
|
||||
+ // op_params[1]=rs_head. Reuses GGML_OP_SSM_CONV, discriminated by a non-null src[4].
|
||||
+ GGML_API struct ggml_tensor * ggml_ssm_conv_update_inplace_ids(
|
||||
+ struct ggml_context * ctx,
|
||||
+ struct ggml_tensor * conv_states,
|
||||
+ struct ggml_tensor * conv_kernel,
|
||||
+ struct ggml_tensor * x_cur,
|
||||
+ struct ggml_tensor * conv_state_dst,
|
||||
+ struct ggml_tensor * ids,
|
||||
+ int rs_head,
|
||||
+ bool fuse_silu);
|
||||
+
|
||||
GGML_API struct ggml_tensor * ggml_ssm_scan(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * s,
|
||||
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
|
||||
index 07ab9e5..515aae4 100644
|
||||
--- a/ggml/src/ggml-cpu/ops.cpp
|
||||
+++ b/ggml/src/ggml-cpu/ops.cpp
|
||||
@@ -9580,6 +9580,90 @@ static void ggml_compute_forward_ssm_conv_update_f32(
|
||||
}
|
||||
}
|
||||
|
||||
+// Patch 0028: CPU reference for ggml_ssm_conv_update_inplace_ids (mirror of the CUDA
|
||||
+// ssm_conv_update_ids_f32). Reads each active sequence's prior K-1 taps directly from the FULL conv
|
||||
+// cache (src[0]) via ids (src[4]) -- identity sequences (ids[s] == rs_head + s) read in place from the
|
||||
+// destination slot src[3], non-identity from cache[ids[s]] -- computes the depthwise conv with the
|
||||
+// same ascending-tap FMA order, optionally folds silu, writes the conv output to dst, and writes the
|
||||
+// 1-token-shifted ring state back in place into src[3]. The window is copied to a local before the
|
||||
+// write so the identity (read == write slot) case is correct. Threads split over channels.
|
||||
+static void ggml_compute_forward_ssm_conv_update_ids_f32(
|
||||
+ const ggml_compute_params * params,
|
||||
+ ggml_tensor * dst) {
|
||||
+ const ggml_tensor * conv_states = dst->src[0]; // FULL cache [K-1, channels, n_cells]
|
||||
+ const ggml_tensor * conv_kernel = dst->src[1]; // [K, channels]
|
||||
+ const ggml_tensor * x_cur = dst->src[2]; // [channels, 1, n_seqs]
|
||||
+ ggml_tensor * cdst = dst->src[3]; // [(K-1)*channels, n_seqs] in-place ring target
|
||||
+ const ggml_tensor * ids = dst->src[4]; // [n_seqs] I32 slot indices (s_copy)
|
||||
+
|
||||
+ const int ith = params->ith;
|
||||
+ const int nth = params->nth;
|
||||
+
|
||||
+ const int64_t d_conv = conv_kernel->ne[0];
|
||||
+ const int64_t channels = conv_kernel->ne[1];
|
||||
+ const int64_t n_seqs = x_cur->ne[2];
|
||||
+ const bool apply_silu = ggml_get_op_params_i32(dst, 0) != 0;
|
||||
+ const int32_t rs_head = ggml_get_op_params_i32(dst, 1);
|
||||
+
|
||||
+ GGML_ASSERT(conv_states->nb[0] == sizeof(float));
|
||||
+ GGML_ASSERT(conv_kernel->nb[0] == sizeof(float));
|
||||
+ GGML_ASSERT(ids->type == GGML_TYPE_I32);
|
||||
+ GGML_ASSERT(d_conv <= 8);
|
||||
+
|
||||
+ const int64_t cache_row_stride = conv_states->nb[2] / sizeof(float); // (K-1)*channels
|
||||
+ const int64_t w_stride = conv_kernel->nb[1] / sizeof(float);
|
||||
+ const int64_t x_seq_stride = x_cur->nb[2] / sizeof(float);
|
||||
+ const int64_t dst_seq_stride = dst->nb[2] / sizeof(float);
|
||||
+ const int64_t cdst_seq_stride = cdst->nb[1] / sizeof(float);
|
||||
+
|
||||
+ const float * cache_base = (const float *) conv_states->data;
|
||||
+ const float * w_base = (const float *) conv_kernel->data;
|
||||
+ const float * x_base = (const float *) x_cur->data;
|
||||
+ float * cdst_base = (float *) cdst->data;
|
||||
+ float * dst_base = (float *) dst->data;
|
||||
+ const int32_t * ids_base = (const int32_t *) ids->data;
|
||||
+
|
||||
+ const int64_t dc = (channels + nth - 1) / nth;
|
||||
+ const int64_t c0 = dc * ith;
|
||||
+ const int64_t c1 = MIN(c0 + dc, channels);
|
||||
+
|
||||
+ for (int64_t s = 0; s < n_seqs; ++s) {
|
||||
+ const int32_t r = ids_base[s];
|
||||
+ const bool ident = (r == rs_head + (int32_t) s);
|
||||
+ // identity reads the K-1 taps in place from the destination slot; non-identity from cache[r].
|
||||
+ const float * states_seq = ident
|
||||
+ ? (cdst_base + s * cdst_seq_stride)
|
||||
+ : (cache_base + (int64_t) r * cache_row_stride);
|
||||
+ for (int64_t c = c0; c < c1; ++c) {
|
||||
+ const float * states_c = states_seq + c * (d_conv - 1);
|
||||
+ const float * w_c = w_base + c * w_stride;
|
||||
+ const float xc = x_base[s * x_seq_stride + c];
|
||||
+
|
||||
+ // window = [tap0 .. tap_{K-2}, xc], copied to a local before the (possibly aliasing) write
|
||||
+ float window[8];
|
||||
+ for (int64_t j = 0; j < d_conv - 1; ++j) {
|
||||
+ window[j] = states_c[j];
|
||||
+ }
|
||||
+ window[d_conv - 1] = xc;
|
||||
+
|
||||
+ // ascending-tap FMA: tap0*w0 + ... + tap_{K-2}*w_{K-2} + xc*w_{K-1} (matches ssm_conv)
|
||||
+ float sumf = 0.0f;
|
||||
+ for (int64_t j = 0; j < d_conv; ++j) {
|
||||
+ sumf += window[j] * w_c[j];
|
||||
+ }
|
||||
+ sumf += 0.0f; // matches ssm_conv `sumf += b` with b == 0
|
||||
+
|
||||
+ dst_base[s * dst_seq_stride + c] = apply_silu ? (sumf / (1.0f + expf(-sumf))) : sumf;
|
||||
+
|
||||
+ // 1-token-shifted ring write-back: [tap1 .. tap_{K-2}, xc]
|
||||
+ float * out_state = cdst_base + s * cdst_seq_stride + c * (d_conv - 1);
|
||||
+ for (int64_t j = 0; j < d_conv - 1; ++j) {
|
||||
+ out_state[j] = window[j + 1];
|
||||
+ }
|
||||
+ }
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
void ggml_compute_forward_ssm_conv(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst) {
|
||||
@@ -9587,7 +9671,11 @@ void ggml_compute_forward_ssm_conv(
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
if (dst->src[3] != nullptr) {
|
||||
- ggml_compute_forward_ssm_conv_update_f32(params, dst);
|
||||
+ if (dst->src[4] != nullptr) {
|
||||
+ ggml_compute_forward_ssm_conv_update_ids_f32(params, dst);
|
||||
+ } else {
|
||||
+ ggml_compute_forward_ssm_conv_update_f32(params, dst);
|
||||
+ }
|
||||
} else {
|
||||
ggml_compute_forward_ssm_conv_f32(params, dst);
|
||||
}
|
||||
diff --git a/ggml/src/ggml-cuda/ssm-conv.cu b/ggml/src/ggml-cuda/ssm-conv.cu
|
||||
index e1af1cd..28b3cce 100644
|
||||
--- a/ggml/src/ggml-cuda/ssm-conv.cu
|
||||
+++ b/ggml/src/ggml-cuda/ssm-conv.cu
|
||||
@@ -226,6 +226,153 @@ static void ggml_cuda_op_ssm_conv_update(ggml_backend_cuda_context & ctx, ggml_t
|
||||
}
|
||||
}
|
||||
|
||||
+// Patch 0028: gather only the NON-identity sequences' prior conv taps from the FULL conv cache into a
|
||||
+// disjoint scratch buffer. Identity sequences (ids[s] == rs_head + s) are read in place from the
|
||||
+// destination slot by the update kernel and are skipped here. One block per sequence. Mirrors
|
||||
+// gdn_gather_nonident_kernel (the 0019 recurrent-state gather fusion).
|
||||
+static __global__ void ssm_conv_gather_nonident_kernel(const float * __restrict__ cache,
|
||||
+ const int32_t * __restrict__ ids, int rs_head,
|
||||
+ float * __restrict__ scratch, int row_stride, int n_seqs) {
|
||||
+ const int s = blockIdx.x;
|
||||
+ if (s >= n_seqs) {
|
||||
+ return;
|
||||
+ }
|
||||
+ const int r = ids[s];
|
||||
+ if (r == rs_head + s) {
|
||||
+ return; // identity: prior taps already live in the in-place destination slot
|
||||
+ }
|
||||
+ const float * src = cache + (int64_t) r * row_stride;
|
||||
+ float * dst = scratch + (int64_t) s * row_stride;
|
||||
+ for (int i = threadIdx.x; i < row_stride; i += blockDim.x) {
|
||||
+ dst[i] = src[i];
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+// Patch 0028: gather-free fused conv update. Per (channel, sequence), read the K-1 prior taps from the
|
||||
+// active sequence's cache slot via ids -- identity (ids[s] == rs_head + s) reads in place from
|
||||
+// conv_state_dst (the same slot it writes; the whole window is loaded into registers before any write,
|
||||
+// so it is race-free), non-identity reads the pre-gathered disjoint scratch -- then computes the
|
||||
+// depthwise conv with the SAME ascending-tap FMA order as ssm_conv_update_f32, folds silu, writes the
|
||||
+// conv output, and writes the 1-token-shifted ring state back in place. Bit-identical to the get_rows +
|
||||
+// ssm_conv_update_f32 path: the read VALUES are the same; only the read POINTER changes.
|
||||
+template <bool apply_silu, int d_conv>
|
||||
+static __global__ void ssm_conv_update_ids_f32(const float * __restrict__ nonident_scratch,
|
||||
+ const float * __restrict__ conv_kernel,
|
||||
+ const float * __restrict__ x_cur,
|
||||
+ float * __restrict__ conv_state_dst,
|
||||
+ float * __restrict__ dst,
|
||||
+ const int32_t * __restrict__ ids,
|
||||
+ const int rs_head,
|
||||
+ const int channels,
|
||||
+ const int scratch_seq_stride,
|
||||
+ const int w_stride,
|
||||
+ const int x_seq_stride,
|
||||
+ const int dst_seq_stride,
|
||||
+ const int cdst_seq_stride) {
|
||||
+ const int c = blockIdx.x * blockDim.x + threadIdx.x; // channel
|
||||
+ const int s = blockIdx.y; // sequence
|
||||
+ if (c >= channels) {
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ const bool ident = (ids[s] == rs_head + s);
|
||||
+ const float * states_c = ident
|
||||
+ ? conv_state_dst + (int64_t) s * cdst_seq_stride + (int64_t) c * (d_conv - 1)
|
||||
+ : nonident_scratch + (int64_t) s * scratch_seq_stride + (int64_t) c * (d_conv - 1);
|
||||
+ const float * w_c = conv_kernel + (int64_t) c * w_stride;
|
||||
+ const float xc = x_cur[(int64_t) s * x_seq_stride + c];
|
||||
+
|
||||
+ // window = [tap0 .. tap_{K-2}, current-token], same ordering as ssm_conv_update_f32
|
||||
+ float window[d_conv];
|
||||
+#pragma unroll
|
||||
+ for (int j = 0; j < d_conv - 1; j++) {
|
||||
+ window[j] = states_c[j];
|
||||
+ }
|
||||
+ window[d_conv - 1] = xc;
|
||||
+
|
||||
+ float sumf = 0.0f;
|
||||
+#pragma unroll
|
||||
+ for (int j = 0; j < d_conv; j++) {
|
||||
+ sumf += window[j] * w_c[j];
|
||||
+ }
|
||||
+ sumf += 0.0f; // matches ssm_conv_f32 `sumf += b` with b == 0 (qwen35 conv1d has no bias)
|
||||
+ dst[(int64_t) s * dst_seq_stride + c] = apply_silu ? ggml_cuda_op_silu_single(sumf) : sumf;
|
||||
+
|
||||
+ // 1-token-shifted ring write-back: drop the oldest tap, append the current token
|
||||
+ float * out_state = conv_state_dst + (int64_t) s * cdst_seq_stride + (int64_t) c * (d_conv - 1);
|
||||
+#pragma unroll
|
||||
+ for (int j = 0; j < d_conv - 1; j++) {
|
||||
+ out_state[j] = window[j + 1];
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+static void ggml_cuda_op_ssm_conv_update_ids(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
+ const ggml_tensor * conv_states = dst->src[0]; // FULL cache [K-1, channels, n_cells]
|
||||
+ const ggml_tensor * conv_kernel = dst->src[1]; // [K, channels]
|
||||
+ const ggml_tensor * x_cur = dst->src[2]; // [channels, 1, n_seqs]
|
||||
+ const ggml_tensor * cdst = dst->src[3]; // [(K-1)*channels, n_seqs] in-place ring target
|
||||
+ const ggml_tensor * ids = dst->src[4]; // [n_seqs] I32 slot indices (s_copy)
|
||||
+
|
||||
+ const int64_t d_conv = conv_kernel->ne[0];
|
||||
+ const int64_t channels = conv_kernel->ne[1];
|
||||
+ const int64_t n_seqs = x_cur->ne[2];
|
||||
+ const bool apply_silu = ggml_get_op_params_i32(dst, 0) != 0;
|
||||
+ const int rs_head = ggml_get_op_params_i32(dst, 1);
|
||||
+
|
||||
+ GGML_ASSERT(conv_states->type == GGML_TYPE_F32 && conv_kernel->type == GGML_TYPE_F32);
|
||||
+ GGML_ASSERT(x_cur->type == GGML_TYPE_F32 && cdst->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
|
||||
+ GGML_ASSERT(ids->type == GGML_TYPE_I32);
|
||||
+ GGML_ASSERT(conv_states->nb[0] == sizeof(float));
|
||||
+ GGML_ASSERT(conv_states->nb[1] == (size_t) (d_conv - 1) * sizeof(float));
|
||||
+ GGML_ASSERT(conv_kernel->nb[0] == sizeof(float));
|
||||
+ GGML_ASSERT(dst->ne[0] == channels && dst->ne[1] == 1 && dst->ne[2] == n_seqs);
|
||||
+
|
||||
+ const float * cache_d = (const float *) conv_states->data;
|
||||
+ const float * w_d = (const float *) conv_kernel->data;
|
||||
+ const float * x_d = (const float *) x_cur->data;
|
||||
+ float * cdst_d = (float *) cdst->data;
|
||||
+ float * dst_d = (float *) dst->data;
|
||||
+ const int32_t * ids_d = (const int32_t *) ids->data;
|
||||
+ cudaStream_t stream = ctx.stream();
|
||||
+
|
||||
+ // n_embd_r = (K-1)*channels: the per-cell row stride of the full conv cache.
|
||||
+ const int cache_row_stride = (int) (conv_states->nb[2] / sizeof(float));
|
||||
+ const int w_stride = (int) (conv_kernel->nb[1] / sizeof(float));
|
||||
+ const int x_seq_stride = (int) (x_cur->nb[2] / sizeof(float));
|
||||
+ const int dst_seq_stride = (int) (dst->nb[2] / sizeof(float));
|
||||
+ const int cdst_seq_stride = (int) (cdst->nb[1] / sizeof(float));
|
||||
+
|
||||
+ // Gather only the non-identity sequences' prior taps into a disjoint scratch (identity sequences
|
||||
+ // read in place from cdst). The scratch is written here and only read by the update kernel, so the
|
||||
+ // update kernel never reads a slot another block writes -> race-free. No-op at steady AR decode.
|
||||
+ ggml_cuda_pool_alloc<float> nonident_scratch(ctx.pool());
|
||||
+ float * scratch = nonident_scratch.alloc((size_t) cache_row_stride * n_seqs);
|
||||
+ if (n_seqs > 0) {
|
||||
+ ssm_conv_gather_nonident_kernel<<<(unsigned) n_seqs, 256, 0, stream>>>(
|
||||
+ cache_d, ids_d, rs_head, scratch, cache_row_stride, (int) n_seqs);
|
||||
+ }
|
||||
+
|
||||
+ const int threads = 128;
|
||||
+ const dim3 blocks((channels + threads - 1) / threads, (unsigned) n_seqs, 1);
|
||||
+
|
||||
+ auto launch = [&](auto NC) {
|
||||
+ constexpr int kNC = decltype(NC)::value;
|
||||
+ if (apply_silu) {
|
||||
+ ssm_conv_update_ids_f32<true, kNC><<<blocks, threads, 0, stream>>>(scratch, w_d, x_d, cdst_d, dst_d,
|
||||
+ ids_d, rs_head, (int) channels, cache_row_stride, w_stride, x_seq_stride, dst_seq_stride, cdst_seq_stride);
|
||||
+ } else {
|
||||
+ ssm_conv_update_ids_f32<false, kNC><<<blocks, threads, 0, stream>>>(scratch, w_d, x_d, cdst_d, dst_d,
|
||||
+ ids_d, rs_head, (int) channels, cache_row_stride, w_stride, x_seq_stride, dst_seq_stride, cdst_seq_stride);
|
||||
+ }
|
||||
+ };
|
||||
+
|
||||
+ switch (d_conv) {
|
||||
+ case 3: launch(std::integral_constant<int, 3>{}); break;
|
||||
+ case 4: launch(std::integral_constant<int, 4>{}); break;
|
||||
+ default: GGML_ABORT("ssm_conv_update_ids only supports d_conv 3 or 4");
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
template <bool apply_silu>
|
||||
static void ssm_conv_f32_cuda(const float * src0, const float * src1, const float * bias, const int src0_nb0, const int src0_nb1,
|
||||
const int src0_nb2, const int src1_nb1, float * dst, const int dst_nb0, const int dst_nb1,
|
||||
@@ -266,7 +413,13 @@ void ggml_cuda_op_ssm_conv(ggml_backend_cuda_context & ctx, ggml_tensor * dst, g
|
||||
// silu of the decode conv path into a single kernel.
|
||||
if (dst->src[3] != nullptr) {
|
||||
GGML_ASSERT(bias_add_node == nullptr && silu_dst == nullptr);
|
||||
- ggml_cuda_op_ssm_conv_update(ctx, dst);
|
||||
+ // Patch 0028: a non-null src[4] (ids) selects the gather-free variant that reads each
|
||||
+ // sequence's prior taps directly from the full cache via ids (no get_rows materialization).
|
||||
+ if (dst->src[4] != nullptr) {
|
||||
+ ggml_cuda_op_ssm_conv_update_ids(ctx, dst);
|
||||
+ } else {
|
||||
+ ggml_cuda_op_ssm_conv_update(ctx, dst);
|
||||
+ }
|
||||
return;
|
||||
}
|
||||
|
||||
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
|
||||
index 16b180f..dcc09bd 100644
|
||||
--- a/ggml/src/ggml.c
|
||||
+++ b/ggml/src/ggml.c
|
||||
@@ -5606,6 +5606,68 @@ struct ggml_tensor * ggml_ssm_conv_update_inplace(
|
||||
return result;
|
||||
}
|
||||
|
||||
+// ggml_ssm_conv_update_inplace_ids
|
||||
+//
|
||||
+// Gather-free variant of ggml_ssm_conv_update_inplace (patch 0028). Instead of a pre-gathered
|
||||
+// per-sequence tap scratch, it takes the FULL conv-state cache (`conv_states` = [K-1, channels,
|
||||
+// n_cells]) plus the per-sequence `ids` (the recurrent-state s_copy) and reads each active sequence's
|
||||
+// prior taps directly from cache[ids[s]] inside the kernel (no ggml_get_rows). Identity sequences
|
||||
+// (ids[s] == rs_head + s) read in place from the `conv_state_dst` write slot; non-identity sequences
|
||||
+// are gathered into a disjoint scratch by the backend first. Bit-identical to the get_rows +
|
||||
+// ggml_ssm_conv_update_inplace path. Reuses GGML_OP_SSM_CONV, discriminated by a non-null src[4].
|
||||
+// op_params[1] carries rs_head. Mirrors the 0019 ggml_gated_delta_net_inplace_ids gather fusion.
|
||||
+struct ggml_tensor * ggml_ssm_conv_update_inplace_ids(
|
||||
+ struct ggml_context * ctx,
|
||||
+ struct ggml_tensor * conv_states,
|
||||
+ struct ggml_tensor * conv_kernel,
|
||||
+ struct ggml_tensor * x_cur,
|
||||
+ struct ggml_tensor * conv_state_dst,
|
||||
+ struct ggml_tensor * ids,
|
||||
+ int rs_head,
|
||||
+ bool fuse_silu) {
|
||||
+ GGML_ASSERT(ggml_is_3d(conv_states));
|
||||
+ GGML_ASSERT(ggml_is_matrix(conv_kernel));
|
||||
+ GGML_ASSERT(ggml_is_3d(x_cur));
|
||||
+ GGML_ASSERT(ids != NULL && ids->type == GGML_TYPE_I32);
|
||||
+
|
||||
+ const int64_t d_conv = conv_kernel->ne[0];
|
||||
+ const int64_t channels = conv_kernel->ne[1];
|
||||
+ const int64_t n_seqs = x_cur->ne[2];
|
||||
+
|
||||
+ GGML_ASSERT(conv_states->type == GGML_TYPE_F32);
|
||||
+ GGML_ASSERT(conv_kernel->type == GGML_TYPE_F32);
|
||||
+ GGML_ASSERT(x_cur->type == GGML_TYPE_F32);
|
||||
+ GGML_ASSERT(conv_state_dst != NULL && conv_state_dst->type == GGML_TYPE_F32);
|
||||
+
|
||||
+ // conv_states: FULL cache [K-1, channels, n_cells], contiguous taps per channel
|
||||
+ GGML_ASSERT(conv_states->ne[0] == d_conv - 1);
|
||||
+ GGML_ASSERT(conv_states->ne[1] == channels);
|
||||
+ GGML_ASSERT(conv_states->nb[0] == sizeof(float));
|
||||
+ // x_cur: single decode token per sequence
|
||||
+ GGML_ASSERT(x_cur->ne[0] == channels);
|
||||
+ GGML_ASSERT(x_cur->ne[1] == 1);
|
||||
+ // ids: one slot index per active sequence
|
||||
+ GGML_ASSERT(ids->ne[0] == n_seqs);
|
||||
+ // conv_state_dst: [(K-1)*channels, n_seqs] in-place ring write target
|
||||
+ GGML_ASSERT(conv_state_dst->ne[0] == (d_conv - 1) * channels);
|
||||
+ GGML_ASSERT(conv_state_dst->ne[1] >= n_seqs);
|
||||
+ GGML_ASSERT(conv_state_dst->nb[0] == sizeof(float));
|
||||
+
|
||||
+ struct ggml_tensor * result = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, channels, 1, n_seqs);
|
||||
+
|
||||
+ ggml_set_op_params_i32(result, 0, fuse_silu ? 1 : 0);
|
||||
+ ggml_set_op_params_i32(result, 1, rs_head);
|
||||
+
|
||||
+ result->op = GGML_OP_SSM_CONV;
|
||||
+ result->src[0] = conv_states;
|
||||
+ result->src[1] = conv_kernel;
|
||||
+ result->src[2] = x_cur;
|
||||
+ result->src[3] = conv_state_dst;
|
||||
+ result->src[4] = ids;
|
||||
+
|
||||
+ return result;
|
||||
+}
|
||||
+
|
||||
// ggml_ssm_scan
|
||||
|
||||
struct ggml_tensor * ggml_ssm_scan(
|
||||
diff --git a/src/models/delta-net-base.cpp b/src/models/delta-net-base.cpp
|
||||
index 58f3d0c..962f5eb 100644
|
||||
--- a/src/models/delta-net-base.cpp
|
||||
+++ b/src/models/delta-net-base.cpp
|
||||
@@ -548,25 +548,33 @@ ggml_tensor * llm_build_delta_net_base::build_conv_state_fused(
|
||||
GGML_ASSERT(n_seq_tokens == 1); // single-token decode only
|
||||
GGML_ASSERT(cparams.n_rs_seq == 0); // no rollback splits on this path
|
||||
|
||||
- // Prior conv-state taps for the active sequences: [K-1, conv_channels, n_seqs]. Same get_rows
|
||||
- // gather as the baseline build_conv_state read (tiny; not one of the eliminated buckets).
|
||||
- ggml_tensor * conv_states = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
|
||||
- conv_states = ggml_reshape_3d(ctx0, conv_states, conv_kernel_size - 1, conv_channels, n_seqs);
|
||||
- cb(conv_states, "conv_states_reshaped", il);
|
||||
-
|
||||
// Current token, native (non-transposed) qkv_mixed: [conv_channels, 1, n_seqs].
|
||||
ggml_tensor * x_cur = ggml_reshape_3d(ctx0, qkv_mixed, conv_channels, n_seq_tokens, n_seqs);
|
||||
|
||||
// In-place ring write-back target = the active sequences' conv-cache slot at kv_head, exactly the
|
||||
// destination the baseline ggml_cpy wrote to (s_slot == 0).
|
||||
- const int64_t row_count = (conv_kernel_size - 1) * conv_channels;
|
||||
+ const int64_t row_count = (conv_kernel_size - 1) * conv_channels; // = n_embd_r
|
||||
const size_t row_size = ggml_row_size(conv_states_all->type, row_count);
|
||||
ggml_tensor * conv_state_dst =
|
||||
ggml_view_2d(ctx0, conv_states_all, row_count, n_seqs, conv_states_all->nb[1], kv_head * row_size);
|
||||
cb(conv_state_dst, "conv_state_update", il);
|
||||
|
||||
- ggml_tensor * conv_output =
|
||||
- ggml_ssm_conv_update_inplace(ctx0, conv_states, conv_kernel, x_cur, conv_state_dst, /*fuse_silu=*/true);
|
||||
+ // Patch 0028: fuse the residual conv-state tap gather (the k_get_rows that build_conv_state's
|
||||
+ // build_rs left firing -- ~the biggest single residual decode kernel, see MOE_GAP_VS_VLLM.md).
|
||||
+ // Exactly like the 0019 SSM-state gather fusion, build_rs feeds the FULL conv cache + the s_copy
|
||||
+ // ids into the op (via the get_state_rows lambda) and still performs the rs_zero clear and the
|
||||
+ // extra-states copy around it; the op reads each active sequence's prior taps directly from
|
||||
+ // cache[ids[s]] (identity sequences read in place from conv_state_dst), so the separate
|
||||
+ // ggml_get_rows materialization is eliminated. The read VALUES are unchanged, only the read path
|
||||
+ // (gather -> indexed in-kernel read) changes, so it is bit-identical to the build_rs gather.
|
||||
+ auto get_conv_op = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) -> ggml_tensor * {
|
||||
+ // states = full conv-state cache reshaped 2d [n_embd_r, n_cells]
|
||||
+ ggml_tensor * cache3d = ggml_reshape_3d(ctx, states, conv_kernel_size - 1, conv_channels, states->ne[1]);
|
||||
+ return ggml_ssm_conv_update_inplace_ids(ctx, cache3d, conv_kernel, x_cur, conv_state_dst,
|
||||
+ ids, (int) kv_head, /*fuse_silu=*/true);
|
||||
+ };
|
||||
+
|
||||
+ ggml_tensor * conv_output = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs, get_conv_op);
|
||||
cb(conv_output, "conv_output_silu", il);
|
||||
|
||||
// the ring write is a side effect of the op; pull the op into the graph via the output
|
||||
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
|
||||
index b5e3048..302975f 100644
|
||||
--- a/tests/test-backend-ops.cpp
|
||||
+++ b/tests/test-backend-ops.cpp
|
||||
@@ -3793,6 +3793,65 @@ struct test_ssm_conv_update : public test_case {
|
||||
}
|
||||
};
|
||||
|
||||
+// GGML_OP_SSM_CONV gather-free fused decode conv-update via ids (ggml_ssm_conv_update_inplace_ids,
|
||||
+// patch 0028). conv_states is the FULL cache; ids (a shuffled permutation of [0,n_seqs), rs_head=0)
|
||||
+// selects each sequence's slot, exercising BOTH the identity in-place read (ids[s]==s) and the
|
||||
+// non-identity cache read. Validates the conv + silu output (dst) against the CPU reference.
|
||||
+struct test_ssm_conv_update_ids : public test_case {
|
||||
+ const int64_t d_conv;
|
||||
+ const int64_t channels;
|
||||
+ const int64_t n_seqs;
|
||||
+
|
||||
+ std::string op_desc(ggml_tensor * t) override {
|
||||
+ GGML_UNUSED(t);
|
||||
+ return "SSM_CONV_UPDATE_IDS";
|
||||
+ }
|
||||
+
|
||||
+ std::string vars() override {
|
||||
+ return VARS_TO_STR3(d_conv, channels, n_seqs);
|
||||
+ }
|
||||
+
|
||||
+ test_ssm_conv_update_ids(int64_t d_conv = 4, int64_t channels = 256, int64_t n_seqs = 4)
|
||||
+ : d_conv(d_conv), channels(channels), n_seqs(n_seqs) {}
|
||||
+
|
||||
+ ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
+ ggml_tensor * conv_states = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_conv - 1, channels, n_seqs);
|
||||
+ ggml_tensor * conv_kernel = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, d_conv, channels);
|
||||
+ ggml_tensor * x_cur = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, channels, 1, n_seqs);
|
||||
+ ggml_tensor * conv_state_dst = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, (d_conv - 1) * channels, n_seqs);
|
||||
+ ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_seqs);
|
||||
+ ggml_set_name(conv_states, "conv_states");
|
||||
+ ggml_set_name(conv_kernel, "conv_kernel");
|
||||
+ ggml_set_name(x_cur, "x_cur");
|
||||
+ ggml_set_name(conv_state_dst, "conv_state_dst");
|
||||
+ ggml_set_name(ids, "ids");
|
||||
+
|
||||
+ ggml_tensor * out = ggml_ssm_conv_update_inplace_ids(ctx, conv_states, conv_kernel, x_cur,
|
||||
+ conv_state_dst, ids, /*rs_head=*/0, /*fuse_silu=*/true);
|
||||
+ ggml_set_name(out, "out");
|
||||
+ return out;
|
||||
+ }
|
||||
+
|
||||
+ void initialize_tensors(ggml_context * ctx) override {
|
||||
+ std::random_device rd;
|
||||
+ std::default_random_engine rng(rd());
|
||||
+ for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
||||
+ if (t->type == GGML_TYPE_I32) {
|
||||
+ // ids: shuffled permutation of [0, n_seqs) into the full cache (rs_head == 0), so some
|
||||
+ // sequences are identity (ids[s] == s, in-place read) and some are not (scratch read).
|
||||
+ std::vector<int32_t> data(t->ne[0]);
|
||||
+ for (int i = 0; i < t->ne[0]; i++) {
|
||||
+ data[i] = i;
|
||||
+ }
|
||||
+ std::shuffle(data.begin(), data.end(), rng);
|
||||
+ ggml_backend_tensor_set(t, data.data(), 0, t->ne[0] * sizeof(int32_t));
|
||||
+ } else {
|
||||
+ init_tensor_uniform(t);
|
||||
+ }
|
||||
+ }
|
||||
+ }
|
||||
+};
|
||||
+
|
||||
// GGML_OP_SSM_SCAN
|
||||
struct test_ssm_scan : public test_case {
|
||||
const ggml_type type;
|
||||
@@ -8504,6 +8563,16 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
}
|
||||
}
|
||||
|
||||
+ // gather-free fused decode conv-update via ids (ggml_ssm_conv_update_inplace_ids, patch 0028).
|
||||
+ // channels must be a multiple of 128 for the CUDA SSM_CONV supports_op gate.
|
||||
+ for (int64_t d_conv : {3, 4}) {
|
||||
+ for (int64_t channels : {256, 3328}) {
|
||||
+ for (int64_t n_seqs : {1, 4, 32, 128}) {
|
||||
+ test_cases.emplace_back(new test_ssm_conv_update_ids(d_conv, channels, n_seqs));
|
||||
+ }
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
test_cases.emplace_back(new test_ssm_scan(GGML_TYPE_F32, 16, 1, 1024, 1, 32, 4)); // Mamba-1
|
||||
test_cases.emplace_back(new test_ssm_scan(GGML_TYPE_F32, 128, 64, 16, 2, 32, 4)); // Mamba-2
|
||||
test_cases.emplace_back(new test_ssm_scan(GGML_TYPE_F32, 256, 64, 8, 2, 32, 4)); // Falcon-H1
|
||||
--
|
||||
2.43.0
|
||||
|
||||
@@ -0,0 +1,176 @@
|
||||
From e2acb3bca4d12ecef4964a214d397fc91ecfcebc Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Sat, 27 Jun 2026 03:45:19 +0200
|
||||
Subject: [PATCH] feat(paged): block-table within-step host cache (patch 0029)
|
||||
|
||||
Lever 5 (host pipeline). get_block_table() is called once per full-attention
|
||||
layer per decode step, but the KV cell layout (and therefore the block table)
|
||||
is fixed for the whole step: it only changes in apply() when the ubatch's slots
|
||||
are committed. The old path recomputed the full table on every layer.
|
||||
|
||||
This caches the table the first time it is built in a step and reuses the bytes
|
||||
(memcpy) for every subsequent full-attention layer, invalidating the cache in
|
||||
apply(). The reused bytes are identical to a fresh compute, so the change is
|
||||
bit-exact. Toggle off with LLAMA_PAGED_NO_BT_CACHE=1.
|
||||
|
||||
Measured host-side get_block_table time (llama-batched-bench, npp128 ntg128
|
||||
npl128, cache OFF -> ON):
|
||||
- MoE q36-35b-a3b-nvfp4: 112.94 -> 14.82 ms (-87%)
|
||||
- dense q36-27b-nvfp4 : 193.78 -> 16.90 ms (-91%)
|
||||
|
||||
Throughput: dense is partly host-bound and gains (TG 364.8 -> 374.7 t/s,
|
||||
+2.7%, ~95.8% of the vLLM 391 t/s reference @npl128). MoE decode is compute-
|
||||
bound (FP4 GEMM dominates) so the saved host time is off the critical path and
|
||||
TG is flat (752.2 -> 757.0 t/s). The cache is therefore a pure pipeline cleanup,
|
||||
not a numeric change.
|
||||
|
||||
Bit-exact, per path (llama-completion --temp 0 --seed 1, 48 tok):
|
||||
- non-paged MoE = 07db32c2bcb78d17a43ed18bc22705cd (unchanged baseline)
|
||||
- paged MoE = 8cb0ce23777bf55f92f63d0292c756b0 (paged baseline)
|
||||
- paged MoE cache OFF == cache ON (both 8cb0ce23)
|
||||
- dense non-paged == dense paged = 5951a5b4d624ce891e22ab5fca9bc439
|
||||
|
||||
The paged-MoE md5 (8cb0ce23) differs from the non-paged md5 (07db32c2) by a
|
||||
benign FP-accumulation-order difference of the paged attention reduction, not a
|
||||
bug: KL-divergence vs the f16 reference (16 chunks, c512) gives KLD(paged||f16)
|
||||
= 0.13600 <= KLD(nonpaged||f16) = 0.13660 and PPL(paged) = 7.4009 ~
|
||||
PPL(nonpaged) = 7.3896 (within +/- 0.29). See PAGED_BITEXACT_NOTE.md and
|
||||
LEVER5_HOSTPIPE_RESULTS.md.
|
||||
|
||||
Includes the [L5INSTR] host-timing instrumentation used to measure the lever.
|
||||
|
||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
||||
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
|
||||
---
|
||||
src/llama-context.cpp | 7 +++++++
|
||||
src/llama-kv-cache.cpp | 28 +++++++++++++++++++++++++++-
|
||||
src/llama-kv-cache.h | 9 +++++++++
|
||||
src/paged-attn.cpp | 9 +++++++++
|
||||
4 files changed, 52 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
|
||||
index 5c90c48..ad7939e 100644
|
||||
--- a/src/llama-context.cpp
|
||||
+++ b/src/llama-context.cpp
|
||||
@@ -1306,7 +1306,11 @@ bool llama_context::set_adapter_cvec(
|
||||
return res;
|
||||
}
|
||||
|
||||
+extern "C" void l5_add_setinp(double ns);
|
||||
+extern "C" void l5_add_hostproc(double ns);
|
||||
+static inline double l5c_now_ns(){ struct timespec ts; clock_gettime(CLOCK_MONOTONIC,&ts); return (double)ts.tv_sec*1e9+(double)ts.tv_nsec; }
|
||||
llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) {
|
||||
+ double _l5_t0=l5c_now_ns();
|
||||
if (mctx && !mctx->apply()) {
|
||||
LLAMA_LOG_ERROR("%s: failed to apply memory context\n", __func__);
|
||||
ret = GGML_STATUS_FAILED;
|
||||
@@ -1361,11 +1365,14 @@ llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, ll
|
||||
//const auto t_start_us = ggml_time_us();
|
||||
|
||||
// FIXME this call causes a crash if any model inputs were not used in the graph and were therefore not allocated
|
||||
+ double _l5_si=l5c_now_ns();
|
||||
res->set_inputs(&ubatch);
|
||||
+ l5_add_setinp(l5c_now_ns()-_l5_si);
|
||||
|
||||
//LLAMA_LOG_INFO("graph set inputs time: %.3f ms\n", (ggml_time_us() - t_start_us)/1000.0);
|
||||
}
|
||||
|
||||
+ l5_add_hostproc(l5c_now_ns()-_l5_t0);
|
||||
const auto status = graph_compute(res->get_gf(), ubatch.n_tokens > 1);
|
||||
if (status != GGML_STATUS_SUCCESS) {
|
||||
LLAMA_LOG_ERROR("%s: failed to compute graph, compute status: %d\n", __func__, status);
|
||||
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
|
||||
index 21b8f1e..17aaf40 100644
|
||||
--- a/src/llama-kv-cache.cpp
|
||||
+++ b/src/llama-kv-cache.cpp
|
||||
@@ -2772,6 +2772,9 @@ bool llama_kv_cache_context::apply() {
|
||||
kv->apply_ubatch(sinfos[i_cur], ubatches[i_cur]);
|
||||
n_kv = kv->get_n_kv(sinfos[i_cur]);
|
||||
|
||||
+ // the cells for this ubatch just changed -> drop the cached block table
|
||||
+ bt_cache_valid = false;
|
||||
+
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -2814,7 +2817,30 @@ void llama_kv_cache_context::get_gather_idxs(int32_t * dst) const {
|
||||
}
|
||||
|
||||
void llama_kv_cache_context::get_block_table(int32_t * dst, uint32_t n_blk) const {
|
||||
- kv->get_block_table(dst, n_blk, n_kv, sinfos[i_cur]);
|
||||
+ const auto & sinfo = sinfos[i_cur];
|
||||
+ const uint32_t ns = sinfo.s1 - sinfo.s0 + 1;
|
||||
+ const size_t total = (size_t) ns * n_blk;
|
||||
+
|
||||
+ // within-step reuse: all full-attention layers of a step request the same
|
||||
+ // table (same i_cur/n_blk, cells fixed since apply()). The bytes are
|
||||
+ // identical to a fresh compute, so this is bit-exact.
|
||||
+ static const bool nocache = (getenv("LLAMA_PAGED_NO_BT_CACHE") != nullptr);
|
||||
+ if (nocache) {
|
||||
+ kv->get_block_table(dst, n_blk, n_kv, sinfo);
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ if (bt_cache_valid && bt_cache_n_blk == n_blk && bt_cache.size() == total) {
|
||||
+ memcpy(dst, bt_cache.data(), total * sizeof(int32_t));
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ kv->get_block_table(dst, n_blk, n_kv, sinfo);
|
||||
+
|
||||
+ bt_cache.resize(total);
|
||||
+ memcpy(bt_cache.data(), dst, total * sizeof(int32_t));
|
||||
+ bt_cache_n_blk = n_blk;
|
||||
+ bt_cache_valid = true;
|
||||
}
|
||||
|
||||
ggml_tensor * llama_kv_cache_context::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il) const {
|
||||
diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h
|
||||
index e9980b6..b03de78 100644
|
||||
--- a/src/llama-kv-cache.h
|
||||
+++ b/src/llama-kv-cache.h
|
||||
@@ -451,4 +451,13 @@ private:
|
||||
// a heuristic, to avoid attending the full cache if it is not yet utilized
|
||||
// as the cache gets filled, the benefit from this heuristic disappears
|
||||
int32_t n_kv;
|
||||
+
|
||||
+ // [paged L5] within-step block-table cache. get_block_table() is called once
|
||||
+ // per full-attention layer per decode step, but the cell layout (and hence
|
||||
+ // the table) is identical across all layers of a step. Compute it on the
|
||||
+ // first call and reuse the bytes for the rest; invalidated in apply() when
|
||||
+ // the ubatch's slots are committed (the only host-side mutation per step).
|
||||
+ mutable std::vector<int32_t> bt_cache;
|
||||
+ mutable uint32_t bt_cache_n_blk = 0;
|
||||
+ mutable bool bt_cache_valid = false;
|
||||
};
|
||||
diff --git a/src/paged-attn.cpp b/src/paged-attn.cpp
|
||||
index fed8ca9..ebd92be 100644
|
||||
--- a/src/paged-attn.cpp
|
||||
+++ b/src/paged-attn.cpp
|
||||
@@ -8,6 +8,13 @@
|
||||
|
||||
#include <cstdlib>
|
||||
#include <cstdio>
|
||||
+#include <ctime>
|
||||
+namespace { static inline double l5_now_ns(){ struct timespec ts; clock_gettime(CLOCK_MONOTONIC,&ts); return (double)ts.tv_sec*1e9+(double)ts.tv_nsec; } }
|
||||
+double g_l5_t_gbt=0, g_l5_t_setinp=0, g_l5_t_hostproc=0; long g_l5_n_gbt=0, g_l5_n_setinp=0, g_l5_n_hostproc=0;
|
||||
+extern "C" void l5_add_setinp(double ns){ g_l5_t_setinp+=ns; g_l5_n_setinp++; }
|
||||
+extern "C" void l5_add_hostproc(double ns){ g_l5_t_hostproc+=ns; g_l5_n_hostproc++; }
|
||||
+namespace { struct L5Printer { ~L5Printer(){ fprintf(stderr,"[L5INSTR] get_block_table n=%ld sum=%.2fms mean=%.4fms | set_inputs n=%ld sum=%.2fms mean=%.4fms | hostproc n=%ld sum=%.2fms mean=%.4fms\n", g_l5_n_gbt, g_l5_t_gbt/1e6, g_l5_n_gbt? g_l5_t_gbt/1e6/g_l5_n_gbt:0.0, g_l5_n_setinp, g_l5_t_setinp/1e6, g_l5_n_setinp? g_l5_t_setinp/1e6/g_l5_n_setinp:0.0, g_l5_n_hostproc, g_l5_t_hostproc/1e6, g_l5_n_hostproc? g_l5_t_hostproc/1e6/g_l5_n_hostproc:0.0 ); } } g_l5_printer; }
|
||||
+
|
||||
|
||||
namespace paged_attn {
|
||||
|
||||
@@ -54,7 +61,9 @@ public:
|
||||
void set_input(const llama_ubatch * ubatch) override {
|
||||
GGML_UNUSED(ubatch);
|
||||
GGML_ASSERT(idxs && ggml_backend_buffer_is_host(idxs->buffer));
|
||||
+ double _t=l5_now_ns();
|
||||
mctx->get_block_table((int32_t *) idxs->data, n_blk);
|
||||
+ g_l5_t_gbt += l5_now_ns()-_t; g_l5_n_gbt++;
|
||||
}
|
||||
|
||||
const llama_kv_cache_context * mctx;
|
||||
--
|
||||
2.43.0
|
||||
|
||||
@@ -0,0 +1,106 @@
|
||||
From a095f4ebeefafd16dd54c514eb86148fa46daef3 Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Sat, 27 Jun 2026 07:30:43 +0000
|
||||
Subject: [PATCH] feat(paged): backend-gate fused GDN/discriminated SSM_CONV
|
||||
emission (patch 0030)
|
||||
|
||||
Closes the latent silent-miscompute hazard (audit RISKY-1). The fused/in-place
|
||||
Gated Delta Net op (0018/0019/0026: ggml_gated_delta_net_inplace[_ids][_hybrid])
|
||||
and the discriminated SSM_CONV decode op (0021/0028: ggml_ssm_conv_update_inplace
|
||||
[_ids], which REUSE GGML_OP_SSM_CONV / GGML_OP_GATED_DELTA_NET with extra src
|
||||
slots - a non-null src[3]/src[4] ring/ids discriminator) are emitted DEFAULT-ON
|
||||
(cparams.fused_gdn_ar/ch=true, auto_fgdn=true) but are implemented for the
|
||||
CUDA-family TU (CUDA / HIP "ROCm" / "MUSA", hipified ggml-cuda) and the CPU
|
||||
reference ONLY.
|
||||
|
||||
The hazard: a compute backend that supports PLAIN GGML_OP_SSM_CONV but ignores
|
||||
the src[3]/src[4] discriminator (Vulkan/SYCL/Metal) reports supports_op==true for
|
||||
the node and the scheduler assigns the discriminated conv to it; it then runs the
|
||||
wrong plain conv => SILENT corruption (not a crash). The upstream auto_fgdn
|
||||
device-mismatch resolution only inspects GATED_DELTA_NET nodes, so the
|
||||
discriminated-SSM_CONV safety was only incidentally covered (it happened to share
|
||||
backend coverage with the GDN op); the hazard becomes live the moment a non-CUDA paged
|
||||
build of a gated-DeltaNet model exists.
|
||||
|
||||
FIX: gate the fused-op emission on the active compute backend type. Before the
|
||||
auto_fgdn resolution in llama_context::sched_reserve(), if any non-CPU compute
|
||||
backend is not CUDA-family (reg name != "CUDA"/"ROCm"/"MUSA"), force
|
||||
fused_gdn_ar = fused_gdn_ch = auto_fgdn = false. Every emission site keys off
|
||||
these flags (conv_decode_fused = ... && fused_gdn_ar; fused = ... fused_gdn_ar/ch),
|
||||
so disabling them routes the graph to the upstream non-fused path: a PLAIN
|
||||
ggml_ssm_conv (no discriminator) + ggml_silu, which every backend handles
|
||||
correctly. This makes the discriminated-op safety explicit and decoupled from the
|
||||
GDN-op device-mismatch heuristic.
|
||||
|
||||
INVARIANT (CUDA byte-identical): on a CUDA backend the reg name is "CUDA", so
|
||||
fgdn_backend_ok stays true, the flags are left untouched, and the emitted decode
|
||||
graph is unchanged - byte-identical to pre-0030. The fix only changes behavior on
|
||||
non-CUDA-family (i.e. not CUDA/ROCm/MUSA), non-CPU backends.
|
||||
|
||||
GATE compile: CPU-only build (GGML_CUDA=OFF) of the full series (pin 9d5d882d +
|
||||
0001-0029 + this) links libllama.so and test-backend-ops with 0 errors; the
|
||||
edited llama-context.cpp compiles clean (uses only already-included <cstring> +
|
||||
backend-reg API already used in this TU). test-backend-ops correctness for
|
||||
SSM_CONV / SSM_CONV_UPDATE / SSM_CONV_UPDATE_IDS / GATED_DELTA_NET is a
|
||||
CUDA0-vs-CPU comparison (CPU-only run skips CPU-vs-CPU); the test cases are
|
||||
registered and exercised on the CUDA DGX run.
|
||||
|
||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
||||
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
|
||||
---
|
||||
src/llama-context.cpp | 39 +++++++++++++++++++++++++++++++++++++++
|
||||
1 file changed, 39 insertions(+)
|
||||
|
||||
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
|
||||
index ad7939e..c408eef 100644
|
||||
--- a/src/llama-context.cpp
|
||||
+++ b/src/llama-context.cpp
|
||||
@@ -521,6 +521,45 @@ void llama_context::sched_reserve() {
|
||||
cparams.auto_fa = false;
|
||||
}
|
||||
|
||||
+ // RISKY-1 guard: the fused/in-place Gated Delta Net op and the discriminated
|
||||
+ // SSM_CONV (which reuse GGML_OP_GATED_DELTA_NET / GGML_OP_SSM_CONV with extra
|
||||
+ // src slots - a non-null src[3]/src[4] ring/ids discriminator) are only
|
||||
+ // implemented for the CUDA-family backends (CUDA / HIP "ROCm" / "MUSA" - all
|
||||
+ // built from the hipified ggml-cuda TU) and the CPU reference. Any other
|
||||
+ // compute backend (Vulkan/SYCL/Metal/...) that supports *plain* SSM_CONV but
|
||||
+ // ignores the discriminator src would silently run the WRONG conv. The
|
||||
+ // upstream auto_fgdn device-mismatch check below only inspects
|
||||
+ // GATED_DELTA_NET nodes, so couple the discriminated-SSM_CONV safety
|
||||
+ // explicitly to the backend type here: keep the fused path enabled only when
|
||||
+ // every non-CPU compute backend is CUDA-family. On CUDA this leaves the flags
|
||||
+ // untouched, so the emitted decode graph is byte-identical.
|
||||
+ if (cparams.fused_gdn_ar || cparams.fused_gdn_ch) {
|
||||
+ bool fgdn_backend_ok = true;
|
||||
+ for (auto & backend : backends) {
|
||||
+ ggml_backend_dev_t dev = ggml_backend_get_device(backend.get());
|
||||
+ if (!dev || ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
|
||||
+ // CPU reference handles the fused/discriminated ops
|
||||
+ continue;
|
||||
+ }
|
||||
+ ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
|
||||
+ const char * name = reg ? ggml_backend_reg_name(reg) : "";
|
||||
+ // GGML_CUDA_NAME is "CUDA" / "ROCm" (HIP) / "MUSA"; all three are the
|
||||
+ // same ggml-cuda TU that carries the discriminated-op handling.
|
||||
+ if (strcmp(name, "CUDA") != 0 && strcmp(name, "ROCm") != 0 && strcmp(name, "MUSA") != 0) {
|
||||
+ fgdn_backend_ok = false;
|
||||
+ break;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ if (!fgdn_backend_ok) {
|
||||
+ cparams.fused_gdn_ar = false;
|
||||
+ cparams.fused_gdn_ch = false;
|
||||
+ cparams.auto_fgdn = false;
|
||||
+ LLAMA_LOG_INFO("%s: fused Gated Delta Net / discriminated SSM_CONV disabled "
|
||||
+ "(compute backend is not CUDA/HIP/CPU)\n", __func__);
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
if (cparams.auto_fgdn) {
|
||||
LLAMA_LOG_INFO("%s: resolving fused Gated Delta Net support:\n", __func__);
|
||||
|
||||
--
|
||||
2.43.0
|
||||
|
||||
@@ -0,0 +1,357 @@
|
||||
From 37549ecce806130b36012dfd0077ad830989ec71 Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Sun, 28 Jun 2026 19:30:01 +0000
|
||||
Subject: [PATCH] feat(paged): chunked parallel-scan GDN prefill kernel (patch
|
||||
0031)
|
||||
|
||||
Implements the explicit upstream TODO at gated_delta_net.cu's
|
||||
launch_gated_delta_net ("Add chunked kernel for even faster pre-fill"). The
|
||||
stock kernel runs a strictly sequential per-token recurrence (one block per
|
||||
(head,seq) looping over all n_tokens), so prefill cannot use token-level
|
||||
parallelism - a confirmed gap versus vLLM, which uses an FLA-style chunked
|
||||
scan.
|
||||
|
||||
What this adds
|
||||
--------------
|
||||
A chunked parallel-scan prefill path for gated DeltaNet, gated to the
|
||||
compile-time subset that matters for Qwen3.6 prefill: non-KDA (scalar gate),
|
||||
f32 state, final-state-only (keep_rs == false), homogeneous (non-hybrid,
|
||||
non-bf16-state). One block per (head,seq); thread j owns the j-th v-column.
|
||||
The sequence is split into chunks of C tokens: the inter-chunk recurrence in
|
||||
the state S stays sequential (n_tokens/C steps instead of n_tokens), while the
|
||||
intra-chunk gated delta rule is solved in parallel via the FLA chunked form:
|
||||
|
||||
gamma_t = prod_{i<=t} g_i (<=1), d(j,t) = gamma_t / gamma_j in (0,1]
|
||||
A = I + tril(beta_t d(j,t) (k_t . k_j), -1) [unit lower-tri, C x C]
|
||||
U = A^{-1} ( beta_t (v_t - gamma_t S0^T k_t) ) (forward substitution)
|
||||
O_t = gamma_t (S0^T q_t) + sum_{j<=t} d(j,t)(q_t . k_j) u_j (then * scale)
|
||||
S_C = gamma_C S0 + sum_t d(t,C) k_t u_t^T
|
||||
|
||||
This uses the bounded/stable de-gating (pairwise decays d <= 1, gamma <= 1), so
|
||||
strong-decay tokens underflow to the correct zero rather than to inf - it is
|
||||
numerically robust even for the adversarial g in [-20, -1e-4] of the op test.
|
||||
|
||||
Bit-exactness (NEW per-path)
|
||||
----------------------------
|
||||
The chunked form is mathematically equivalent to the sequential recurrence but
|
||||
reduces in a different FP order, so it is a NEW path (its md5 will not match the
|
||||
sequential path), gated exactly like the paged-vs-nonpaged precedent. A numpy
|
||||
prototype confirms f32 chunked-vs-sequential NMSE ~1e-13 (max abs ~1e-7).
|
||||
test-backend-ops GATED_DELTA_NET is 91/91 (this patch adds 8 S_v=128 prefill
|
||||
cases: exact-multiple / tail / multi-seq / GQA / permuted), i.e. within the
|
||||
default 1e-7 NMSE gate versus the CPU reference.
|
||||
|
||||
Disposition: OPT-IN, default OFF (no regression)
|
||||
------------------------------------------------
|
||||
GB10's max dynamic shared-memory opt-in is 99KB, so the all-shared layout that
|
||||
keeps the 128x128 state resident forces C=16 (89KB). At C=16, with one block /
|
||||
SM (the 64KB state dominates shared) and serial per-thread dk-reductions, the
|
||||
kernel is correct but NOT yet faster than the already-tuned sequential
|
||||
recurrence: measured S_PP on q36-27b-nvfp4 (llama-batched-bench -npp 512 -ntg 4
|
||||
-npl 32) is ~761 t/s chunked vs ~971 t/s sequential (~22% slower, also
|
||||
grid-starved at low n_seqs). It is therefore wired OPT-IN: the default
|
||||
(no env) keeps the sequential path, and the chunked path is enabled with
|
||||
GDN_CHUNK_MIN=<token-threshold>. The default backend behaviour is unchanged.
|
||||
|
||||
cudaFuncSetAttribute's return is checked (a silent failure when the requested
|
||||
dynamic smem exceeded the device opt-in left a sticky CUDA error during
|
||||
bring-up).
|
||||
|
||||
Remaining work to make it a win (recorded for the follow-up): break the 1
|
||||
block/SM occupancy ceiling (the 64KB state in shared) and the serial
|
||||
dk-reductions - either register-resident state with static-unrolled (larger)
|
||||
chunks, or tensor-core (mma/wgmma) matmuls for the KK/QK/KS/QS/PU products and
|
||||
the A-inverse, which is what FLA/vLLM use to beat the sequential scan. See
|
||||
README section 5 (dev notes / rejected-flat levers).
|
||||
|
||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
||||
---
|
||||
ggml/src/ggml-cuda/gated_delta_net.cu | 237 ++++++++++++++++++++++++++
|
||||
tests/test-backend-ops.cpp | 8 +
|
||||
2 files changed, 245 insertions(+)
|
||||
|
||||
diff --git a/ggml/src/ggml-cuda/gated_delta_net.cu b/ggml/src/ggml-cuda/gated_delta_net.cu
|
||||
index d071d5a..7121d80 100644
|
||||
--- a/ggml/src/ggml-cuda/gated_delta_net.cu
|
||||
+++ b/ggml/src/ggml-cuda/gated_delta_net.cu
|
||||
@@ -1,7 +1,10 @@
|
||||
#include "gated_delta_net.cuh"
|
||||
#include "ggml-cuda/common.cuh"
|
||||
|
||||
+#include <climits>
|
||||
#include <cstdlib>
|
||||
+#include <cuda_bf16.h>
|
||||
+#include <type_traits>
|
||||
|
||||
// Step 2: gather only the NON-identity sequences' prior recurrent state from the full cache into a
|
||||
// disjoint scratch buffer. Identity sequences (ids[s] == rs_head + s) are read in place from the
|
||||
@@ -279,6 +282,219 @@ static void launch_gdn_variant(
|
||||
sb1, sb2, sb3, neqk1_magic, rq3_magic, scale, K, state_dst_d, ids_d, rs_head);
|
||||
}
|
||||
|
||||
+// ============================================================================
|
||||
+// CHUNKED parallel-scan prefill kernel (upstream TODO: "faster pre-fill").
|
||||
+// Scope: non-KDA (scalar gate), f32 state, final-state-only (keep_rs==false),
|
||||
+// homogeneous (non-hybrid) path. One block per (head, seq); thread j owns the
|
||||
+// j-th v-column. The sequence is split into chunks of C tokens; the inter-chunk
|
||||
+// recurrence in S is sequential (n_tokens/C steps instead of n_tokens), and the
|
||||
+// intra-chunk gated delta rule is solved in parallel via the FLA chunked form:
|
||||
+// gamma_t = prod_{i<=t} g_i (<=1), d(j,t) = gamma_t / gamma_j in (0,1]
|
||||
+// A = I + tril(beta_t d(j,t) (k_t . k_j), -1) [Cc x Cc unit lower-tri]
|
||||
+// U = A^{-1} ( beta_t (v_t - gamma_t S0^T k_t) ) [Cc x dv] (fwd subst)
|
||||
+// O_t = gamma_t (S0^T q_t) + sum_{j<=t} d(j,t)(q_t . k_j) u_j (then * scale)
|
||||
+// S_C = gamma_C S0 + sum_t d(t,C) k_t u_t^T
|
||||
+// This is the bounded/stable de-gating (pairwise decays d <= 1, gamma <= 1), so
|
||||
+// strong-decay tokens underflow to the correct zero rather than to inf. The math
|
||||
+// is equivalent to the sequential recurrence up to FP reduction order (a NEW
|
||||
+// per-path result, validated benign by test-backend-ops NMSE and greedy output).
|
||||
+template <int S_v, int C>
|
||||
+__global__ void gated_delta_net_chunked_cuda(
|
||||
+ const float * __restrict__ q, const float * __restrict__ k,
|
||||
+ const float * __restrict__ v, const float * __restrict__ g,
|
||||
+ const float * __restrict__ beta, const float * __restrict__ curr_state,
|
||||
+ float * __restrict__ dst,
|
||||
+ int64_t H, int64_t n_tokens, int64_t n_seqs,
|
||||
+ int64_t sq1, int64_t sq2, int64_t sq3,
|
||||
+ int64_t sv1, int64_t sv2, int64_t sv3,
|
||||
+ int64_t sb1, int64_t sb2, int64_t sb3,
|
||||
+ uint3 neqk1_magic, uint3 rq3_magic,
|
||||
+ float scale, float * __restrict__ state_dst,
|
||||
+ const int32_t * __restrict__ ids, int rs_head) {
|
||||
+ constexpr int dk = S_v;
|
||||
+ constexpr int dv = S_v;
|
||||
+ const int h_idx = blockIdx.x;
|
||||
+ const int seq = blockIdx.y;
|
||||
+ const int j = threadIdx.x; // this thread's v-column (0..dv-1)
|
||||
+
|
||||
+ const uint32_t iq1 = fastmodulo((uint32_t) h_idx, neqk1_magic);
|
||||
+ const uint32_t iq3 = fastdiv((uint32_t) seq, rq3_magic);
|
||||
+
|
||||
+ extern __shared__ float gdn_smem[];
|
||||
+ float * Sd = gdn_smem; // [dk*dv] M-layout: Sd[col*dk + i] = S[i][col]
|
||||
+ float * Kc = Sd + (size_t) dk * dv; // [C*dk] Kc[t*dk + i]
|
||||
+ float * Qc = Kc + (size_t) C * dk; // [C*dk] Qc[t*dk + i]
|
||||
+ float * Ud = Qc + (size_t) C * dk; // [dv*C] column-major per thread: Ud[col*C + t]
|
||||
+ float * Amat = Ud + (size_t) dv * C; // [C*C] A / P scratch, row-major Amat[t*C + t']
|
||||
+ float * csh = Amat + (size_t) C * C; // [C] cumsum(log-gate)
|
||||
+ float * gam = csh + C; // [C] gamma_t = exp(cs_t)
|
||||
+ float * bet = gam + C; // [C] beta_t
|
||||
+
|
||||
+ // S0: thread j owns column j (Sd[j*dk + i]); load is a contiguous per-thread copy from the
|
||||
+ // M-layout cache view (read_state[j*dk + i] = M[j*S_v + i] = S[i][j]). Same identity/gather
|
||||
+ // plumbing as the sequential kernel (gather of non-identity seqs done by the dispatcher).
|
||||
+ const bool identity = (ids != nullptr && ids[seq] == rs_head + seq);
|
||||
+ const float * read_state = (identity ? state_dst : curr_state)
|
||||
+ + (int64_t) seq * H * dk * dv + (int64_t) h_idx * dk * dv;
|
||||
+ for (int i = 0; i < dk; i++) {
|
||||
+ Sd[j * dk + i] = read_state[j * dk + i];
|
||||
+ }
|
||||
+
|
||||
+ const float * q_base = q + iq3 * sq3 + iq1 * sq1; // + t*sq2 + i
|
||||
+ const float * k_base = k + iq3 * sq3 + iq1 * sq1;
|
||||
+ const float * v_base = v + seq * sv3 + h_idx * sv1; // + t*sv2 + j
|
||||
+ const int64_t gb_base = seq * sb3 + h_idx * sb1; // + t*sb2
|
||||
+
|
||||
+ float * attn_base = dst + (int64_t) (seq * n_tokens * H + h_idx) * S_v; // + tok*S_v*H + j
|
||||
+
|
||||
+ for (int64_t c0 = 0; c0 < n_tokens; c0 += C) {
|
||||
+ const int Cc = (int) ((n_tokens - c0) < (int64_t) C ? (n_tokens - c0) : (int64_t) C);
|
||||
+
|
||||
+ // --- load chunk K,Q (cooperative), beta and the gate prefix (cs, gamma) ---
|
||||
+ for (int e = j; e < Cc * dk; e += dv) {
|
||||
+ const int t = e / dk;
|
||||
+ const int i = e % dk;
|
||||
+ Kc[t * dk + i] = k_base[(c0 + t) * sq2 + i];
|
||||
+ Qc[t * dk + i] = q_base[(c0 + t) * sq2 + i];
|
||||
+ }
|
||||
+ if (j < Cc) {
|
||||
+ csh[j] = g[gb_base + (c0 + j) * sb2]; // raw log-gate, prefix-summed below
|
||||
+ bet[j] = beta[gb_base + (c0 + j) * sb2];
|
||||
+ }
|
||||
+ __syncthreads();
|
||||
+ if (j == 0) {
|
||||
+ float run = 0.0f;
|
||||
+ for (int t = 0; t < Cc; t++) {
|
||||
+ run += csh[t];
|
||||
+ csh[t] = run; // cs_t = sum_{i<=t} g_i (<= 0)
|
||||
+ gam[t] = expf(run); // gamma_t (<= 1)
|
||||
+ }
|
||||
+ }
|
||||
+ __syncthreads();
|
||||
+
|
||||
+ // --- A = I + tril(beta_t * d(t',t) * (k_t . k_t'), -1) (cooperative over C*C) ---
|
||||
+ for (int e = j; e < Cc * Cc; e += dv) {
|
||||
+ const int t = e / Cc;
|
||||
+ const int tp = e % Cc;
|
||||
+ float a = 0.0f;
|
||||
+ if (tp < t) {
|
||||
+ float kk = 0.0f;
|
||||
+ for (int i = 0; i < dk; i++) {
|
||||
+ kk += Kc[t * dk + i] * Kc[tp * dk + i];
|
||||
+ }
|
||||
+ const float dd = expf(csh[t] - csh[tp]); // d(tp,t) = gamma_t/gamma_tp
|
||||
+ a = bet[t] * dd * kk;
|
||||
+ } else if (tp == t) {
|
||||
+ a = 1.0f;
|
||||
+ }
|
||||
+ Amat[t * Cc + tp] = a;
|
||||
+ }
|
||||
+ __syncthreads();
|
||||
+
|
||||
+ // --- RHS[t][j] = beta_t (v_t[j] - gamma_t * (S0^T k_t)[j]) -> Ud[j*C + t] ---
|
||||
+ for (int t = 0; t < Cc; t++) {
|
||||
+ float ks = 0.0f; // (S0^T k_t)[j] = sum_i S[i][j] k_t[i]
|
||||
+ for (int i = 0; i < dk; i++) {
|
||||
+ ks += Sd[j * dk + i] * Kc[t * dk + i];
|
||||
+ }
|
||||
+ const float vtj = v_base[(c0 + t) * sv2 + j];
|
||||
+ Ud[j * C + t] = bet[t] * (vtj - gam[t] * ks);
|
||||
+ }
|
||||
+
|
||||
+ // --- solve A U = RHS in place (unit lower-tri fwd subst); per-thread, no inter-step sync ---
|
||||
+ for (int t = 1; t < Cc; t++) {
|
||||
+ float acc = Ud[j * C + t];
|
||||
+ for (int tp = 0; tp < t; tp++) {
|
||||
+ acc -= Amat[t * Cc + tp] * Ud[j * C + tp];
|
||||
+ }
|
||||
+ Ud[j * C + t] = acc;
|
||||
+ }
|
||||
+ __syncthreads(); // U finalized; Amat free for P below (and Ud read across-thread? no, own col)
|
||||
+
|
||||
+ // --- P[t][t'] = d(t',t) * (q_t . k_t') for t' <= t (reuse Amat) ---
|
||||
+ for (int e = j; e < Cc * Cc; e += dv) {
|
||||
+ const int t = e / Cc;
|
||||
+ const int tp = e % Cc;
|
||||
+ float p = 0.0f;
|
||||
+ if (tp <= t) {
|
||||
+ float qk = 0.0f;
|
||||
+ for (int i = 0; i < dk; i++) {
|
||||
+ qk += Qc[t * dk + i] * Kc[tp * dk + i];
|
||||
+ }
|
||||
+ const float dd = expf(csh[t] - csh[tp]);
|
||||
+ p = dd * qk;
|
||||
+ }
|
||||
+ Amat[t * Cc + tp] = p;
|
||||
+ }
|
||||
+ __syncthreads();
|
||||
+
|
||||
+ // --- O[t][j] = gamma_t (S0^T q_t)[j] + sum_{t'<=t} P[t][t'] U[t'][j] (* scale) ---
|
||||
+ for (int t = 0; t < Cc; t++) {
|
||||
+ float qs = 0.0f; // (S0^T q_t)[j] (uses pre-update S)
|
||||
+ for (int i = 0; i < dk; i++) {
|
||||
+ qs += Sd[j * dk + i] * Qc[t * dk + i];
|
||||
+ }
|
||||
+ float o = gam[t] * qs;
|
||||
+ for (int tp = 0; tp <= t; tp++) {
|
||||
+ o += Amat[t * Cc + tp] * Ud[j * C + tp];
|
||||
+ }
|
||||
+ attn_base[(c0 + t) * S_v * H + j] = o * scale;
|
||||
+ }
|
||||
+
|
||||
+ // --- S_C[i][j] = gamma_{C-1} S[i][j] + sum_t d(t,C-1) k_t[i] u_t[j] ---
|
||||
+ const float glast = gam[Cc - 1];
|
||||
+ const float cslast = csh[Cc - 1];
|
||||
+ for (int i = 0; i < dk; i++) {
|
||||
+ float s = glast * Sd[j * dk + i];
|
||||
+ for (int t = 0; t < Cc; t++) {
|
||||
+ const float dd = expf(cslast - csh[t]); // d(t, last)
|
||||
+ s += dd * Kc[t * dk + i] * Ud[j * C + t];
|
||||
+ }
|
||||
+ Sd[j * dk + i] = s;
|
||||
+ }
|
||||
+ __syncthreads(); // Sd reused as S0 of next chunk; Kc/Qc/Amat reloaded next chunk
|
||||
+ }
|
||||
+
|
||||
+ // --- final-state write-back (M-layout): in-place cache view or f32 op-output scratch ---
|
||||
+ const int64_t state_out_offset = (int64_t) (seq * H + h_idx) * S_v * S_v;
|
||||
+ const int64_t attn_score_elems = (int64_t) S_v * H * n_tokens * n_seqs;
|
||||
+ float * st = (state_dst != nullptr) ? (state_dst + state_out_offset)
|
||||
+ : (dst + attn_score_elems + state_out_offset);
|
||||
+ for (int i = 0; i < dk; i++) {
|
||||
+ st[j * dk + i] = Sd[j * dk + i];
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+template <int S_v, int C>
|
||||
+static void launch_gdn_chunked(
|
||||
+ const float * q_d, const float * k_d, const float * v_d,
|
||||
+ const float * g_d, const float * b_d, const float * s_d,
|
||||
+ float * dst_d, float * state_dst_d, const int32_t * ids_d, int rs_head,
|
||||
+ int64_t H, int64_t n_tokens, int64_t n_seqs,
|
||||
+ int64_t sq1, int64_t sq2, int64_t sq3,
|
||||
+ int64_t sv1, int64_t sv2, int64_t sv3,
|
||||
+ int64_t sb1, int64_t sb2, int64_t sb3,
|
||||
+ const uint3 neqk1_magic, const uint3 rq3_magic,
|
||||
+ float scale, cudaStream_t stream) {
|
||||
+ const size_t smem = ((size_t) S_v * S_v + (size_t) 2 * C * S_v + (size_t) S_v * C
|
||||
+ + (size_t) C * C + (size_t) 3 * C) * sizeof(float);
|
||||
+ static bool attr_set = false;
|
||||
+ if (!attr_set) {
|
||||
+ const cudaError_t e = cudaFuncSetAttribute(gated_delta_net_chunked_cuda<S_v, C>,
|
||||
+ cudaFuncAttributeMaxDynamicSharedMemorySize, (int) smem);
|
||||
+ if (e != cudaSuccess) {
|
||||
+ GGML_ABORT("gdn chunked: cudaFuncSetAttribute(maxDynSmem=%zu) failed: %s\n", smem, cudaGetErrorString(e));
|
||||
+ }
|
||||
+ attr_set = true;
|
||||
+ }
|
||||
+ dim3 grid_dims(H, n_seqs, 1);
|
||||
+ dim3 block_dims(S_v, 1, 1);
|
||||
+ gated_delta_net_chunked_cuda<S_v, C><<<grid_dims, block_dims, smem, stream>>>(
|
||||
+ q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H, n_tokens, n_seqs,
|
||||
+ sq1, sq2, sq3, sv1, sv2, sv3, sb1, sb2, sb3,
|
||||
+ neqk1_magic, rq3_magic, scale, state_dst_d, ids_d, rs_head);
|
||||
+}
|
||||
+
|
||||
template <bool KDA, bool keep_rs_t>
|
||||
static void launch_gated_delta_net(
|
||||
const float * q_d, const float * k_d, const float * v_d,
|
||||
@@ -297,6 +513,27 @@ static void launch_gated_delta_net(
|
||||
const uint3 neqk1_magic = init_fastdiv_values(neqk1);
|
||||
const uint3 rq3_magic = init_fastdiv_values(rq3);
|
||||
|
||||
+ // Chunked parallel-scan prefill path (upstream TODO at this site). Compile-time subset:
|
||||
+ // non-KDA scalar gate, f32 state, final-state-only, homogeneous. Gated at runtime on the GDN
|
||||
+ // head dim (S_v==128) and a prefill token threshold; decode (n_tokens small) keeps the tuned
|
||||
+ // sequential recurrence. Mathematically equivalent up to FP reduction order (NEW per-path md5;
|
||||
+    // validated benign by test-backend-ops NMSE + greedy output). Toggle: GDN_CHUNK_MIN (opt-in; unset == off).
|
||||
+ if constexpr (!KDA && !keep_rs_t) {
|
||||
+ // OPT-IN: this chunked path is bit-exact-benign (test-backend-ops green) but, at C=16
|
||||
+ // (forced by GB10 99KB dyn-smem opt-in, all-shared), it is NOT yet faster than the tuned
|
||||
+        // sequential recurrence on this model (measured ~22% slower S_PP, grid-starved at low
|
||||
+ // n_seqs + 1 block/SM occupancy). Default OFF so the backend default is regression-free;
|
||||
+ // enable for experiments / tuning with GDN_CHUNK_MIN=<token-threshold>. See README section 5 (dev notes / rejected-flat levers).
|
||||
+ static const int gdn_chunk_min = []{ const char * e = getenv("GDN_CHUNK_MIN"); return e ? atoi(e) : INT_MAX; }();
|
||||
+ if (S_v == 128 && n_tokens >= gdn_chunk_min) {
|
||||
+ launch_gdn_chunked<128, 16>(
|
||||
+ q_d, k_d, v_d, g_d, b_d, (const float *) s_d, dst_d, (float *) state_dst_d, ids_d, rs_head,
|
||||
+ H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3, sb1, sb2, sb3,
|
||||
+ neqk1_magic, rq3_magic, scale, stream);
|
||||
+ return;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
#define GDN_LAUNCH_ARGS \
|
||||
q_d, k_d, v_d, g_d, b_d, s_d, dst_d, state_dst_d, ids_d, rs_head, \
|
||||
H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3, sb1, sb2, sb3, \
|
||||
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
|
||||
index ac30e47..4e40d23 100644
|
||||
--- a/tests/test-backend-ops.cpp
|
||||
+++ b/tests/test-backend-ops.cpp
|
||||
@@ -9398,6 +9398,14 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
test_cases.emplace_back(new test_gated_delta_net(GGML_TYPE_F32, 4, 64, 100, 1));
|
||||
test_cases.emplace_back(new test_gated_delta_net(GGML_TYPE_F32, 4, 64, 200, 1));
|
||||
test_cases.emplace_back(new test_gated_delta_net(GGML_TYPE_F32, 4, 64, 127, 2));
|
||||
+    // chunked parallel-scan prefill path (S_v==128; only taken when GDN_CHUNK_MIN is set, e.g. =64): exact-multiple, tail, multi-seq, perm
|
||||
+ test_cases.emplace_back(new test_gated_delta_net(GGML_TYPE_F32, 4, 128, 64, 1));
|
||||
+ test_cases.emplace_back(new test_gated_delta_net(GGML_TYPE_F32, 4, 128, 128, 1));
|
||||
+ test_cases.emplace_back(new test_gated_delta_net(GGML_TYPE_F32, 4, 128, 127, 1));
|
||||
+ test_cases.emplace_back(new test_gated_delta_net(GGML_TYPE_F32, 4, 128, 256, 1));
|
||||
+ test_cases.emplace_back(new test_gated_delta_net(GGML_TYPE_F32, 4, 128, 100, 2));
|
||||
+ test_cases.emplace_back(new test_gated_delta_net(GGML_TYPE_F32, 2, 128, 200, 3));
|
||||
+ test_cases.emplace_back(new test_gated_delta_net(GGML_TYPE_F32, 4, 128, 130, 1, 1, true));
|
||||
test_cases.emplace_back(new test_gated_delta_net(GGML_TYPE_F32, 4, 64, 64, 1, 1, false, true));
|
||||
test_cases.emplace_back(new test_gated_delta_net(GGML_TYPE_F32, 4, 64, 33, 1, 1, false, true));
|
||||
test_cases.emplace_back(new test_gated_delta_net(GGML_TYPE_F32, 4, 64, 100, 1, 1, false, true));
|
||||
--
|
||||
2.43.0
|
||||
|
||||
@@ -0,0 +1,174 @@
|
||||
From 0033003300330033003300330033003300330033 Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Sun, 28 Jun 2026 19:35:00 +0200
|
||||
Subject: [PATCH] feat(paged): FP4 prefill large-M dequant->bf16 cuBLAS scaffold
|
||||
(default-off, rejected on GB10) (patch 0033)
|
||||
|
||||
Option (a) of docs/PREFILL_GEMM_SCOPE.md: route large-M (prefill) NVFP4 dense
|
||||
weight GEMMs OFF the decode-tuned FP4-MMQ kernel and through the dequant->bf16
|
||||
cuBLAS (nvjet) tensor-core path. This lands the validated, bit-exact-gated
|
||||
mechanism and records the honest result: on GB10 (sm_121) the lever is a
|
||||
REGRESSION, so it is kept default-OFF (byte-identical to stock), mirroring the
|
||||
patch-0017 default-off discipline.
|
||||
|
||||
Mechanism (all three edits are the integration scaffold, no new kernel):
|
||||
- ggml/src/ggml-cuda/mmq.cu (ggml_cuda_should_use_mmq): NVFP4 + Blackwell +
|
||||
dense (n_experts==0) + M > LLAMA_FP4_PREFILL_M returns false, so the dense
|
||||
dispatch falls through to ggml_cuda_op_mul_mat_cublas. -D / env
|
||||
LLAMA_FP4_PREFILL_M tunable; default 0 == disabled == stock. Decode and
|
||||
small batches (M <= threshold) stay on FP4-MMQ.
|
||||
- ggml/src/ggml-cuda/ggml-cuda.cu (ggml_cuda_op_mul_mat_cublas): new NVFP4
|
||||
branch dequants the FP4 weights to a TRANSIENT bf16 pool buffer (not cached,
|
||||
so the model stays FP4-resident) and runs cublasGemmEx CUDA_R_16BF /
|
||||
COMPUTE_32F (tensor cores) instead of the f32 cublasSgemm fallback (no
|
||||
tensor cores) that NVFP4 would otherwise hit.
|
||||
- ggml/src/ggml-cuda/convert.cu (ggml_get_to_bf16_cuda): add the NVFP4 case
|
||||
(the dequant kernel is dst-type generic; bf16 preserves the model's native
|
||||
activation range vs f16). nullptr-by-default for other types is unchanged.
|
||||
|
||||
Bit-exact / numeric gate (PASS, divergence benign):
|
||||
- test-backend-ops MUL_MAT 1146/1146, MUL_MAT_ID 806/806 at default; and with
|
||||
the path FORCED (LLAMA_FP4_PREFILL_M=64) the NVFP4 large-M cases are green
|
||||
CUDA-vs-CPU (the bf16 path is numerically within the project tolerance).
|
||||
- greedy md5 (q36-27b dense, "The capital of France is", -n 48, temp 0):
|
||||
lever == base == 5951a5b4d624ce891e22ab5fca9bc439 (the documented dense
|
||||
reference) for short prefill (decode byte-untouched), AND identical for a
|
||||
>threshold prefill that exercises the new bf16 path (5f3967df...): the new
|
||||
FP path does not flip a single greedy argmax. As predicted by the scope,
|
||||
bf16 activations are strictly more precise than the FP4-MMQ Q8_1 path, so
|
||||
this is precision-neutral-to-better, not a regression.
|
||||
|
||||
Honest performance result (S_PP t/s, q36-27b dense, llama-batched-bench
|
||||
-fa on -ngl 99, A/B via env), see docs/PREFILL_GEMM_RESULTS.md:
|
||||
-npp 512 -npl 32 : base(MMQ) 958.99 -> lever 486.65 (-49%)
|
||||
-npp 1024 -npl 8 : base(MMQ)1013.65 -> lever 587.27 (-42%)
|
||||
-npp 2048 -npl 8 : base(MMQ) 918.46 -> lever 649.42 (-29%)
|
||||
The scope premise (FP4-MMQ ~3% of FP4 peak at large M) is FALSE on GB10:
|
||||
FP4-MMQ at M=512..2048 beats dequant->bf16 cuBLAS, because bf16 tensor-core peak
|
||||
is ~half FP4 peak AND the per-step weight dequant + 4x bf16 weight traffic
|
||||
(~8x total vs the FP4 read) dominate, only partially amortizing as M grows
|
||||
(gap shrinks 49%->29%, never crosses). Default-off keeps stock S_PP (966.98,
|
||||
within noise of base).
|
||||
|
||||
Phase 2 (MoE grouped large-M) is NOT implemented: it inherits the same
|
||||
bf16-peak < FP4-peak ceiling plus a per-expert dequant, so grouped bf16-cuBLAS
|
||||
would regress for the same reason. The only route to a real prefill GEMM win is
|
||||
option (b) - a native FP4-MMA large-M kernel (multi-week). This patch is the
|
||||
validated, env-gated scaffold that option (b) / non-GB10 hardware can reuse for
|
||||
the M-threshold routing + bit-exact gate.
|
||||
|
||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
||||
---
|
||||
diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu
|
||||
index 61630a3..f0273c1 100644
|
||||
--- a/ggml/src/ggml-cuda/convert.cu
|
||||
+++ b/ggml/src/ggml-cuda/convert.cu
|
||||
@@ -704,6 +704,15 @@ to_bf16_cuda_t ggml_get_to_bf16_cuda(ggml_type type) {
|
||||
return convert_unary_cont_cuda<float>;
|
||||
case GGML_TYPE_F16:
|
||||
return convert_unary_cont_cuda<half>;
|
||||
+ // Paged prefill lever (patch 0033): NVFP4 -> bf16 dequant for the large-M
|
||||
+ // dequant->bf16 cuBLAS (nvjet) prefill GEMM path in
|
||||
+ // ggml_cuda_op_mul_mat_cublas. The dequant kernel is dst-type generic, so
|
||||
+ // this instantiates the bf16 variant; bf16 (not f16) preserves the model's
|
||||
+ // native bf16 activation range and avoids f16 overflow on large prefill
|
||||
+ // activations. Only the new prefill path consumes this; nullptr-by-default
|
||||
+ // for all other types is unchanged.
|
||||
+ case GGML_TYPE_NVFP4:
|
||||
+ return dequantize_row_nvfp4_cuda;
|
||||
default:
|
||||
return nullptr;
|
||||
}
|
||||
diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
|
||||
index 9933fa6..2dcaaab 100644
|
||||
--- a/ggml/src/ggml-cuda/mmq.cu
|
||||
+++ b/ggml/src/ggml-cuda/mmq.cu
|
||||
@@ -321,6 +321,33 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
|
||||
return false;
|
||||
}
|
||||
|
||||
+ // Paged prefill lever (patch 0033): OPTION-(a) route large-M NVFP4 dense GEMMs
|
||||
+ // OFF the FP4-MMQ kernel and through the dequant->bf16 cuBLAS (nvjet)
|
||||
+ // tensor-core path (ggml_cuda_op_mul_mat_cublas, NVFP4 bf16 branch). The
|
||||
+ // scope premise was that FP4-MMQ is register-bound to ~3% of FP4 peak at
|
||||
+ // large M. MEASURED ON GB10 THIS IS FALSE: FP4-MMQ at M=512..2048 beats
|
||||
+ // dequant->bf16 cuBLAS by 29-49% (S_PP A/B in docs/PREFILL_GEMM_RESULTS.md),
|
||||
+ // because bf16 tensor-core peak is ~half FP4 peak AND the per-step weight
|
||||
+ // dequant + 4x bf16 weight traffic (~8x total vs the FP4 read) dominate and
|
||||
+ // only partially amortize as M grows. The path is NUMERICALLY VALID and
|
||||
+ // benign (greedy md5 byte-identical to FP4-MMQ; test-backend-ops passes), so
|
||||
+ // it is kept as a validated, env-gated scaffold (for option-(b) native FP4
|
||||
+ // large-M kernels and non-GB10 hardware), but DEFAULT-DISABLED (== stock).
|
||||
+ // Set -D LLAMA_FP4_PREFILL_M=<M> or env LLAMA_FP4_PREFILL_M=<M> to A/B it;
|
||||
+ // 0 (default) disables. Dense only (n_experts == 0).
|
||||
+#ifndef LLAMA_FP4_PREFILL_M
|
||||
+#define LLAMA_FP4_PREFILL_M 0
|
||||
+#endif // LLAMA_FP4_PREFILL_M
|
||||
+ if (type == GGML_TYPE_NVFP4 && n_experts == 0 && blackwell_mma_available(cc)) {
|
||||
+ static const int64_t fp4_prefill_m = [] {
|
||||
+ const char * e = getenv("LLAMA_FP4_PREFILL_M");
|
||||
+ return e != nullptr ? (int64_t) atoll(e) : (int64_t) LLAMA_FP4_PREFILL_M;
|
||||
+ }();
|
||||
+ if (fp4_prefill_m > 0 && ne11 > fp4_prefill_m) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
if (turing_mma_available(cc)) {
|
||||
return true;
|
||||
}
|
||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
index 0dad6e1..6476d46 100644
|
||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
@@ -1660,7 +1660,47 @@ static void ggml_cuda_op_mul_mat_cublas(
|
||||
row_diff == src0->ne[1] &&
|
||||
dst->op_params[0] == GGML_PREC_DEFAULT;
|
||||
|
||||
- if (supports_bf16 && src0->type == GGML_TYPE_BF16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
|
||||
+ if (supports_bf16 && src0->type == GGML_TYPE_NVFP4 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
|
||||
+ // Paged prefill lever (patch 0033): NVFP4 only reaches cuBLAS when
|
||||
+ // ggml_cuda_should_use_mmq() returned false (large-M dense prefill).
|
||||
+ // Dequant the FP4 weights to a TRANSIENT bf16 pool buffer and run a
|
||||
+ // tensor-core bf16 GEMM (nvjet) instead of the f32 cublasSgemm fallback
|
||||
+ // (no tensor cores) that the final else-branch would otherwise use. The
|
||||
+ // weights are NOT cached as bf16 (pool scratch, freed at step end) so the
|
||||
+ // model stays FP4-resident and the backend keeps its memory advantage.
|
||||
+ ggml_cuda_pool_alloc<nv_bfloat16> src0_as_bf16(ctx.pool(id), row_diff*ne00);
|
||||
+ const to_bf16_cuda_t to_bf16_cuda_src0 = ggml_get_to_bf16_cuda(GGML_TYPE_NVFP4);
|
||||
+ GGML_ASSERT(to_bf16_cuda_src0 != nullptr);
|
||||
+ to_bf16_cuda_src0(src0_dd_i, src0_as_bf16.get(), row_diff*ne00, stream);
|
||||
+
|
||||
+ ggml_cuda_pool_alloc<nv_bfloat16> src1_as_bf16(ctx.pool(id));
|
||||
+ if (src1->type != GGML_TYPE_BF16) {
|
||||
+ const to_bf16_cuda_t to_bf16_cuda = ggml_get_to_bf16_cuda(src1->type);
|
||||
+ GGML_ASSERT(to_bf16_cuda != nullptr);
|
||||
+ size_t ne = src1_ncols*ne10;
|
||||
+ src1_as_bf16.alloc(ne);
|
||||
+ to_bf16_cuda(src1_ddf_i, src1_as_bf16.get(), ne, stream);
|
||||
+ }
|
||||
+ const nv_bfloat16 * src1_ptr = src1->type == GGML_TYPE_BF16 ? (const nv_bfloat16 *) src1_ddf_i : src1_as_bf16.get();
|
||||
+ const nv_bfloat16 * src0_ptr = src0_as_bf16.get();
|
||||
+ ggml_cuda_pool_alloc<nv_bfloat16> dst_bf16(ctx.pool(id), row_diff*src1_ncols);
|
||||
+
|
||||
+ const float alpha_f32 = 1.0f;
|
||||
+ const float beta_f32 = 0.0f;
|
||||
+
|
||||
+ CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
|
||||
+ CUBLAS_CHECK(
|
||||
+ cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
|
||||
+ row_diff, src1_ncols, ne10,
|
||||
+ &alpha_f32, src0_ptr, CUDA_R_16BF, ne00,
|
||||
+ src1_ptr, CUDA_R_16BF, ne10,
|
||||
+ &beta_f32, dst_bf16.get(), CUDA_R_16BF, ldc,
|
||||
+ CUBLAS_COMPUTE_32F,
|
||||
+ CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
||||
+
|
||||
+ const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_BF16);
|
||||
+ to_fp32_cuda(dst_bf16.get(), dst_dd_i, row_diff*src1_ncols, stream);
|
||||
+ } else if (supports_bf16 && src0->type == GGML_TYPE_BF16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
|
||||
ggml_cuda_pool_alloc<nv_bfloat16> src1_as_bf16(ctx.pool(id));
|
||||
if (src1->type != GGML_TYPE_BF16) {
|
||||
const to_bf16_cuda_t to_bf16_cuda = ggml_get_to_bf16_cuda(src1->type);
|
||||
--
|
||||
2.43.0
|
||||
@@ -0,0 +1,638 @@
|
||||
From 14824147a504b58cc8be2f127f7d6bedb672cfc9 Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Mon, 29 Jun 2026 00:11:22 +0200
|
||||
Subject: [PATCH] feat(paged): native NVFP4 (W4A4) FP4-MMA large-M prefill GEMM
|
||||
(patch 0034)
|
||||
|
||||
Replace the rejected 0033 dequant->bf16 cuBLAS scaffold with a native FP4-MMA
|
||||
(W4A4 block-scale OMMA) large-M GEMM that engages only at prefill, behind the
|
||||
same LLAMA_FP4_PREFILL_M threshold, so decode / small-M stay byte-untouched.
|
||||
|
||||
KERNEL (ggml/src/ggml-cuda/fp4-gemm.{cu,cuh}): the VERIFIED PoC
|
||||
(fp4_gemm_w4a4_opt.cu, NMSE=0 vs same-dequant f32) copied verbatim at its tuned
|
||||
best config 128x128 / KBLK4 / STAGES2 / PAD4 (~103 TFLOP/s, beats cuBLAS bf16).
|
||||
Preserved exactly: e4m3(true_scale) convention, the ldmatrix.sync.m8n8.x4 A-operand
|
||||
load, the mma.sync.kind::mxf4nvf4.block_scale.scale_vec::4X.m16n8k64 OMMA, cp.async
|
||||
multistage prefetch, register-resident accumulators, smem PAD. Activations are
|
||||
quantized with the SAME math as quantize_mmq_nvfp4 (e4m3 amax/6 + the +/-2 code
|
||||
search + ggml_cuda_float_to_fp4_e2m1), so it is bit-exact-by-construction with the
|
||||
shipped FP4-MMQ path (only the K-reduction order differs, greedy-md5 gated).
|
||||
|
||||
DENSE: routed in ggml_cuda_mul_mat via ggml_cuda_fp4_prefill_should_engage()
|
||||
(src0 NVFP4 + src1/dst f32, contiguous, non-transposed, 2D, Blackwell, M>thr,
|
||||
N%128==0, K%256==0). Non-divisible shapes fall back to FP4-MMQ (NOT the rejected
|
||||
bf16 cuBLAS path). LANDED + greedy-md5 byte-identical (on==off: "Paris").
|
||||
|
||||
MoE GROUPED (the actual prefill bottleneck): mmq.cu forces the grouped FP4-MMQ
|
||||
id-path OFF at large M (n_experts>0), so mul_mat_id falls to its per-expert
|
||||
host-sync loop where each expert slice flows back through ggml_cuda_mul_mat and
|
||||
hits the native kernel per-expert. Prefill is not graph-replayed so this is safe;
|
||||
decode keeps ne12<=threshold so the graph-safe MMQ id-path (patch 0025) is
|
||||
untouched. LANDED via host-sync + greedy-md5 byte-identical (on==off).
|
||||
FOLLOW-UP (flagged): a graph-safe ragged-batched grouped FP4-MMA kernel to remove
|
||||
the per-expert host-sync loop; out of scope for this pass.
|
||||
|
||||
BUILD: arch=compute_121a,code=[compute_121a,sm_121a] already in build-cuda flags;
|
||||
the kernel uses BLACKWELL_MMA_AVAILABLE/CP_ASYNC_AVAILABLE guards. Incremental
|
||||
build-cuda green (ggml-cuda relinked, llama-server + llama-cli relinked).
|
||||
|
||||
Default-off (LLAMA_FP4_PREFILL_M=0 == stock); set env/-D to engage.
|
||||
|
||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
||||
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
|
||||
---
|
||||
ggml/src/ggml-cuda/fp4-gemm.cu | 453 ++++++++++++++++++++++++++++++++
|
||||
ggml/src/ggml-cuda/fp4-gemm.cuh | 38 +++
|
||||
ggml/src/ggml-cuda/ggml-cuda.cu | 14 +
|
||||
ggml/src/ggml-cuda/mmq.cu | 35 +--
|
||||
4 files changed, 525 insertions(+), 15 deletions(-)
|
||||
create mode 100644 ggml/src/ggml-cuda/fp4-gemm.cu
|
||||
create mode 100644 ggml/src/ggml-cuda/fp4-gemm.cuh
|
||||
|
||||
diff --git a/ggml/src/ggml-cuda/fp4-gemm.cu b/ggml/src/ggml-cuda/fp4-gemm.cu
|
||||
new file mode 100644
|
||||
index 0000000..86da551
|
||||
--- /dev/null
|
||||
+++ b/ggml/src/ggml-cuda/fp4-gemm.cu
|
||||
@@ -0,0 +1,453 @@
|
||||
+#include "fp4-gemm.cuh"
|
||||
+
|
||||
+#include <cfloat>
|
||||
+#include <cstdint>
|
||||
+#include <cstdlib>
|
||||
+
|
||||
+// ===========================================================================
|
||||
+// [paged patch 0034] Native NVFP4 (W4A4) large-M GEMM. See fp4-gemm.cuh.
|
||||
+//
|
||||
+// The GEMM kernel, the m16n8k64 block-scale OMMA wrapper, the cp.async helpers and
|
||||
+// the layout-split kernel are the VERIFIED PoC (fp4_gemm_w4a4_opt.cu, NMSE=0) copied
|
||||
+// verbatim - do not "tidy" the index math, it is the load-bearing correctness.
|
||||
+// ===========================================================================
|
||||
+
|
||||
+#define FP4_QK 64 // == QK_NVFP4
|
||||
+#define FP4_SAW 8 // u32 per nvfp4 block qs (32 bytes)
|
||||
+
|
||||
+#ifndef LLAMA_FP4_PREFILL_M
|
||||
+#define LLAMA_FP4_PREFILL_M 0
|
||||
+#endif // LLAMA_FP4_PREFILL_M
|
||||
+
|
||||
+static int64_t ggml_cuda_fp4_prefill_m() {
|
||||
+ static const int64_t m = [] {
|
||||
+ const char * e = getenv("LLAMA_FP4_PREFILL_M");
|
||||
+ return e != nullptr ? (int64_t) atoll(e) : (int64_t) LLAMA_FP4_PREFILL_M;
|
||||
+ }();
|
||||
+ return m;
|
||||
+}
|
||||
+
|
||||
+// ---- layout split: block_nvfp4[R*Kb] -> qs codes [R*Kb*8 u32] + scales [R*Kb u32] ----
|
||||
+// Same fp4 codes & e4m3 scale bytes as the GGUF, repacked into two contiguous,
|
||||
+// 16B-friendly arrays so the kernel's cp.async copies are coalesced. (PoC verbatim.)
|
||||
+static __global__ void fp4_split_layout(
|
||||
+ const block_nvfp4 * __restrict__ X, uint32_t * __restrict__ Q, uint32_t * __restrict__ S,
|
||||
+ int R, int Kb) {
|
||||
+ const int64_t b = (int64_t) blockIdx.x * blockDim.x + threadIdx.x;
|
||||
+ const int64_t tot = (int64_t) R * Kb;
|
||||
+ if (b >= tot) {
|
||||
+ return;
|
||||
+ }
|
||||
+ const block_nvfp4 & blk = X[b];
|
||||
+ const uint32_t * q = (const uint32_t *) blk.qs;
|
||||
+ uint32_t * dq = &Q[b * 8];
|
||||
+#pragma unroll
|
||||
+ for (int w = 0; w < 8; w++) {
|
||||
+ dq[w] = q[w];
|
||||
+ }
|
||||
+ S[b] = *(const uint32_t *) blk.d;
|
||||
+}
|
||||
+
|
||||
+// ---- activation quantizer: f32 [M_real x K] -> split NVFP4 (Aq codes + As scales) ----
|
||||
+// Uses the SAME math as quantize_mmq_nvfp4 (quantize.cu): e4m3 scale = ue4m3(amax/6)
|
||||
+// with the +/-2 code search, ggml_cuda_float_to_fp4_e2m1 for the nibbles, so the
|
||||
+// activation codes are identical to the shipped FP4-MMQ path. Packs into the PoC
|
||||
+// block layout (qs[s*8+j] = code(e[j]) | code(e[j+8])<<4) expected by the kernel's
|
||||
+// ldmatrix A-operand load. One thread per (row, kb, sub-block).
|
||||
+static __global__ void fp4_quantize_act_split(
|
||||
+ const float * __restrict__ x, uint32_t * __restrict__ Aq, uint32_t * __restrict__ As,
|
||||
+ int M_real, int K, int Kb) {
|
||||
+#ifdef BLACKWELL_MMA_AVAILABLE
|
||||
+ const int64_t tot = (int64_t) M_real * Kb * 4; // 4 sub-blocks per 64-element block
|
||||
+ const int64_t t = (int64_t) blockIdx.x * blockDim.x + threadIdx.x;
|
||||
+ if (t >= tot) {
|
||||
+ return;
|
||||
+ }
|
||||
+ const int sub = (int) (t & 3);
|
||||
+ const int64_t rb = t >> 2; // row*Kb + kb
|
||||
+ const int kb = (int) (rb % Kb);
|
||||
+ const int64_t row = rb / Kb;
|
||||
+
|
||||
+ const float * v16 = x + row * (int64_t) K + (int64_t) kb * FP4_QK + sub * 16;
|
||||
+ float vals[16];
|
||||
+ float amax = 0.0f;
|
||||
+#pragma unroll
|
||||
+ for (int k = 0; k < 16; k++) {
|
||||
+ const float vv = v16[k];
|
||||
+ vals[k] = vv;
|
||||
+ amax = fmaxf(amax, fabsf(vv));
|
||||
+ }
|
||||
+
|
||||
+ static constexpr int test_offsets[5] = { 0, -1, 1, -2, 2 };
|
||||
+ const int first_fp8_code = (int) ggml_cuda_fp32_to_ue4m3(amax / 6.0f);
|
||||
+
|
||||
+ float best_err = FLT_MAX;
|
||||
+ uint8_t fp8_code = 0;
|
||||
+ float subblock_scale = 0.0f;
|
||||
+#pragma unroll
|
||||
+ for (int i = 0; i < 5; i++) {
|
||||
+ const int test_code = first_fp8_code + test_offsets[i];
|
||||
+ if (test_code < 0 || test_code > 0x7e) {
|
||||
+ continue;
|
||||
+ }
|
||||
+ const uint8_t code = (uint8_t) test_code;
|
||||
+ const float test_scale = ggml_cuda_ue4m3_to_fp32(code);
|
||||
+ const float test_inv_scale = test_scale > 0.0f ? 0.5f / test_scale : 0.0f;
|
||||
+ float cur_err = 0.0f;
|
||||
+#pragma unroll
|
||||
+ for (int k = 0; k < 16; k++) {
|
||||
+ const uint8_t q = ggml_cuda_float_to_fp4_e2m1(vals[k], test_inv_scale);
|
||||
+ const float err_diff = fabsf(vals[k]) - fabsf((float) kvalues_mxfp4[q & 0x7]) * test_scale;
|
||||
+ cur_err = fmaf(err_diff, err_diff, cur_err);
|
||||
+ }
|
||||
+ if (cur_err < best_err) {
|
||||
+ best_err = cur_err;
|
||||
+ fp8_code = code;
|
||||
+ subblock_scale = test_scale;
|
||||
+ }
|
||||
+ }
|
||||
+ const float inv_scale = subblock_scale > 0.0f ? 0.5f / subblock_scale : 0.0f;
|
||||
+
|
||||
+ // PoC packing: qs[s*8+j] = code(e[j]) | code(e[j+8])<<4 -> two u32 words per sub-block.
|
||||
+ uint32_t w0 = 0, w1 = 0;
|
||||
+#pragma unroll
|
||||
+ for (int j = 0; j < 4; j++) {
|
||||
+ const uint32_t lo = ggml_cuda_float_to_fp4_e2m1(vals[j], inv_scale);
|
||||
+ const uint32_t hi = ggml_cuda_float_to_fp4_e2m1(vals[j + 8], inv_scale);
|
||||
+ w0 |= ((lo | (hi << 4)) & 0xff) << (8 * j);
|
||||
+ }
|
||||
+#pragma unroll
|
||||
+ for (int j = 0; j < 4; j++) {
|
||||
+ const uint32_t lo = ggml_cuda_float_to_fp4_e2m1(vals[j + 4], inv_scale);
|
||||
+ const uint32_t hi = ggml_cuda_float_to_fp4_e2m1(vals[j + 12], inv_scale);
|
||||
+ w1 |= ((lo | (hi << 4)) & 0xff) << (8 * j);
|
||||
+ }
|
||||
+
|
||||
+ const int64_t blk = row * (int64_t) Kb + kb;
|
||||
+ Aq[blk * 8 + sub * 2 + 0] = w0;
|
||||
+ Aq[blk * 8 + sub * 2 + 1] = w1;
|
||||
+ reinterpret_cast<uint8_t *>(As + blk)[sub] = fp8_code;
|
||||
+#else
|
||||
+ GGML_UNUSED(x); GGML_UNUSED(Aq); GGML_UNUSED(As);
|
||||
+ GGML_UNUSED(M_real); GGML_UNUSED(K); GGML_UNUSED(Kb);
|
||||
+ NO_DEVICE_CODE;
|
||||
+#endif // BLACKWELL_MMA_AVAILABLE
|
||||
+}
|
||||
+
|
||||
+// ---- native FP4 block-scale OMMA wrapper (PoC verbatim) ----
|
||||
+static __device__ __forceinline__ void fp4_mma(
|
||||
+ float d[4], const uint32_t a[4], const uint32_t b[2], uint32_t as, uint32_t bs) {
|
||||
+#ifdef BLACKWELL_MMA_AVAILABLE
|
||||
+ asm volatile(
|
||||
+ "mma.sync.aligned.kind::mxf4nvf4.block_scale.scale_vec::4X.m16n8k64.row.col.f32.e2m1.e2m1.f32.ue4m3 "
|
||||
+ "{%0,%1,%2,%3},{%4,%5,%6,%7},{%8,%9},{%0,%1,%2,%3},%10,{0,0},%11,{0,0};"
|
||||
+ : "+f"(d[0]),"+f"(d[1]),"+f"(d[2]),"+f"(d[3])
|
||||
+ : "r"(a[0]),"r"(a[1]),"r"(a[2]),"r"(a[3]),"r"(b[0]),"r"(b[1]),"r"(as),"r"(bs));
|
||||
+#else
|
||||
+ GGML_UNUSED(d); GGML_UNUSED(a); GGML_UNUSED(b); GGML_UNUSED(as); GGML_UNUSED(bs);
|
||||
+ NO_DEVICE_CODE;
|
||||
+#endif // BLACKWELL_MMA_AVAILABLE
|
||||
+}
|
||||
+
|
||||
+// ---- cp.async helpers (PoC verbatim) ----
|
||||
+static __device__ __forceinline__ void fp4_cp_async16(void * smem, const void * gmem) {
|
||||
+#ifdef CP_ASYNC_AVAILABLE
|
||||
+ unsigned s = (unsigned) __cvta_generic_to_shared(smem);
|
||||
+ asm volatile("cp.async.cg.shared.global [%0],[%1],16;\n" :: "r"(s), "l"(gmem));
|
||||
+#else
|
||||
+ GGML_UNUSED(smem); GGML_UNUSED(gmem); NO_DEVICE_CODE;
|
||||
+#endif // CP_ASYNC_AVAILABLE
|
||||
+}
|
||||
+template<int B>
|
||||
+static __device__ __forceinline__ void fp4_cp_async_small(void * smem, const void * gmem) {
|
||||
+#ifdef CP_ASYNC_AVAILABLE
|
||||
+ unsigned s = (unsigned) __cvta_generic_to_shared(smem);
|
||||
+ asm volatile("cp.async.ca.shared.global [%0],[%1],%2;\n" :: "r"(s), "l"(gmem), "n"(B));
|
||||
+#else
|
||||
+ GGML_UNUSED(smem); GGML_UNUSED(gmem); NO_DEVICE_CODE;
|
||||
+#endif // CP_ASYNC_AVAILABLE
|
||||
+}
|
||||
+static __device__ __forceinline__ void fp4_cp_commit() {
|
||||
+#ifdef CP_ASYNC_AVAILABLE
|
||||
+ asm volatile("cp.async.commit_group;\n" ::);
|
||||
+#else
|
||||
+ NO_DEVICE_CODE;
|
||||
+#endif // CP_ASYNC_AVAILABLE
|
||||
+}
|
||||
+template<int N>
|
||||
+static __device__ __forceinline__ void fp4_cp_wait() {
|
||||
+#ifdef CP_ASYNC_AVAILABLE
|
||||
+ asm volatile("cp.async.wait_group %0;\n" :: "n"(N));
|
||||
+#else
|
||||
+ NO_DEVICE_CODE;
|
||||
+#endif // CP_ASYNC_AVAILABLE
|
||||
+}
|
||||
+
|
||||
+// ---------------------------------------------------------------------------
|
||||
+// Optimized native FP4 GEMM (PoC verbatim). C[M,N] = A_fp4[M,K] @ W_fp4[N,K]^T
|
||||
+// inputs are layout-split: Aq[M*Kb*8], As[M*Kb], Wq[N*Kb*8], Ws[N*Kb]
|
||||
+// Tile BM x BN, K-step = KBLK nvfp4 blocks (BK = 64*KBLK), STAGES-deep pipeline,
|
||||
+// PAD u32 padding per smem row to defeat bank conflicts.
|
||||
+// ---------------------------------------------------------------------------
|
||||
+template<int BM,int BN,int WARPS_M,int WARPS_N,int KBLK,int STAGES,int PAD>
|
||||
+__launch_bounds__(WARPS_M*WARPS_N*32,1)
|
||||
+static __global__ void fp4_opt_kernel(
|
||||
+ const uint32_t * __restrict__ Aq, const uint32_t * __restrict__ As,
|
||||
+ const uint32_t * __restrict__ Wq, const uint32_t * __restrict__ Ws,
|
||||
+ float * __restrict__ C, int M, int N, int K) {
|
||||
+#ifdef BLACKWELL_MMA_AVAILABLE
|
||||
+ constexpr int NWARP=WARPS_M*WARPS_N;
|
||||
+ constexpr int THREADS=NWARP*32;
|
||||
+ constexpr int WM=BM/WARPS_M, WN=BN/WARPS_N;
|
||||
+ constexpr int MF=WM/16, NF=WN/8;
|
||||
+ constexpr int SAW=8; // u32 per block (qs)
|
||||
+ constexpr int ARS=KBLK*SAW+PAD; // A smem row stride (u32)
|
||||
+ constexpr int WRS=KBLK*SAW+PAD; // W smem row stride (u32)
|
||||
+
|
||||
+ extern __shared__ uint32_t smem[];
|
||||
+ // per-stage slabs
|
||||
+ constexpr int SZ_AQ=BM*ARS, SZ_AS=BM*KBLK, SZ_WQ=BN*WRS, SZ_WS=BN*KBLK;
|
||||
+ constexpr int STAGE_SZ=SZ_AQ+SZ_AS+SZ_WQ+SZ_WS;
|
||||
+ uint32_t* sAq[STAGES]; uint32_t* sAs[STAGES]; uint32_t* sWq[STAGES]; uint32_t* sWs[STAGES];
|
||||
+#pragma unroll
|
||||
+ for(int s=0;s<STAGES;s++){
|
||||
+ uint32_t* base=smem+s*STAGE_SZ;
|
||||
+ sAq[s]=base; sAs[s]=base+SZ_AQ; sWq[s]=base+SZ_AQ+SZ_AS; sWs[s]=base+SZ_AQ+SZ_AS+SZ_WQ;
|
||||
+ }
|
||||
+
|
||||
+ const int tid=threadIdx.x, warp=tid>>5, lane=tid&31;
|
||||
+ const int wrow=warp/WARPS_N, wcol=warp%WARPS_N;
|
||||
+ const int grp=lane>>2, tig=lane&3;
|
||||
+ const int tidxA = lane/4 + (lane%2)*8;
|
||||
+ const int tidxB = lane/4;
|
||||
+ const int blockRow=blockIdx.y*BM, blockCol=blockIdx.x*BN;
|
||||
+ const int Kb=K/64;
|
||||
+ const int numK=Kb/KBLK;
|
||||
+
|
||||
+ float acc[MF][NF][4];
|
||||
+#pragma unroll
|
||||
+ for(int i=0;i<MF;i++)for(int j=0;j<NF;j++)for(int r=0;r<4;r++)acc[i][j][r]=0;
|
||||
+
|
||||
+ // async-load k-tile `kt` into stage `st`
|
||||
+ auto load_tile=[&](int st,int kt){
|
||||
+ const int kb0=kt*KBLK;
|
||||
+ // A qs: BM*KBLK blocks, 2x 16B chunks each
|
||||
+#pragma unroll 1
|
||||
+ for(int idx=tid; idx<BM*KBLK*2; idx+=THREADS){
|
||||
+ int chunk=idx&1, blk=idx>>1;
|
||||
+ int r=blk/KBLK, kb=blk%KBLK;
|
||||
+ const uint32_t* src=&Aq[((size_t)(blockRow+r)*Kb + kb0+kb)*SAW + chunk*4];
|
||||
+ fp4_cp_async16(&sAq[st][r*ARS + kb*SAW + chunk*4], src);
|
||||
+ }
|
||||
+ // W qs
|
||||
+#pragma unroll 1
|
||||
+ for(int idx=tid; idx<BN*KBLK*2; idx+=THREADS){
|
||||
+ int chunk=idx&1, blk=idx>>1;
|
||||
+ int r=blk/KBLK, kb=blk%KBLK;
|
||||
+ const uint32_t* src=&Wq[((size_t)(blockCol+r)*Kb + kb0+kb)*SAW + chunk*4];
|
||||
+ fp4_cp_async16(&sWq[st][r*WRS + kb*SAW + chunk*4], src);
|
||||
+ }
|
||||
+ // A scales: BM rows, KBLK contiguous u32 each
|
||||
+#pragma unroll 1
|
||||
+ for(int r=tid; r<BM; r+=THREADS){
|
||||
+ const uint32_t* src=&As[(size_t)(blockRow+r)*Kb + kb0];
|
||||
+ uint32_t* dst=&sAs[st][r*KBLK];
|
||||
+ if(KBLK==4) fp4_cp_async16(dst,src);
|
||||
+ else if(KBLK==2) fp4_cp_async_small<8>(dst,src);
|
||||
+ else fp4_cp_async_small<4>(dst,src);
|
||||
+ }
|
||||
+ // W scales
|
||||
+#pragma unroll 1
|
||||
+ for(int r=tid; r<BN; r+=THREADS){
|
||||
+ const uint32_t* src=&Ws[(size_t)(blockCol+r)*Kb + kb0];
|
||||
+ uint32_t* dst=&sWs[st][r*KBLK];
|
||||
+ if(KBLK==4) fp4_cp_async16(dst,src);
|
||||
+ else if(KBLK==2) fp4_cp_async_small<8>(dst,src);
|
||||
+ else fp4_cp_async_small<4>(dst,src);
|
||||
+ }
|
||||
+ };
|
||||
+
|
||||
+ // prologue: issue STAGES-1 tiles (tiles 0..STAGES-2 into stages 0..STAGES-2)
|
||||
+#pragma unroll
|
||||
+ for(int s=0;s<STAGES-1;s++){ if(s<numK) load_tile(s,s); fp4_cp_commit(); }
|
||||
+
|
||||
+ for(int kt=0; kt<numK; kt++){
|
||||
+ // prefetch tile kt+STAGES-1 into its stage (overlaps this iter's compute)
|
||||
+ int ld=kt+(STAGES-1);
|
||||
+ if(ld<numK) load_tile(ld%STAGES,ld);
|
||||
+ fp4_cp_commit();
|
||||
+ // wait until tile kt has landed (leave STAGES-1 prefetches in flight)
|
||||
+ fp4_cp_wait<STAGES-1>();
|
||||
+ __syncthreads();
|
||||
+
|
||||
+ const int rs=kt%STAGES;
|
||||
+#pragma unroll
|
||||
+ for(int kb=0; kb<KBLK; kb++){
|
||||
+ // A fragments via ldmatrix (PRESERVED layout)
|
||||
+ uint32_t af[MF][4]; uint32_t asc[MF];
|
||||
+#pragma unroll
|
||||
+ for(int mi=0; mi<MF; mi++){
|
||||
+ int rb=wrow*WM+mi*16;
|
||||
+ const uint32_t* base=&sAq[rs][rb*ARS + kb*SAW];
|
||||
+ const uint32_t* xs = base + (lane%16)*ARS + (lane/16)*4;
|
||||
+ asm volatile("ldmatrix.sync.aligned.m8n8.x4.b16 {%0,%1,%2,%3},[%4];"
|
||||
+ : "=r"(af[mi][0]),"=r"(af[mi][1]),"=r"(af[mi][2]),"=r"(af[mi][3])
|
||||
+ : "l"(xs));
|
||||
+ asc[mi]=sAs[rs][(rb+tidxA)*KBLK+kb];
|
||||
+ }
|
||||
+ // B fragments (PRESERVED manual gather), padded row stride
|
||||
+ uint32_t bf[NF][2]; uint32_t bsc[NF];
|
||||
+#pragma unroll
|
||||
+ for(int ni=0; ni<NF; ni++){
|
||||
+ int nb=wcol*WN+ni*8;
|
||||
+ const uint32_t* base=&sWq[rs][nb*WRS + kb*SAW];
|
||||
+#pragma unroll
|
||||
+ for(int l=0;l<2;l++){
|
||||
+ int gi=grp, gj=l*4+tig;
|
||||
+ bf[ni][l]=base[gi*WRS + gj];
|
||||
+ }
|
||||
+ bsc[ni]=sWs[rs][(nb+tidxB)*KBLK+kb];
|
||||
+ }
|
||||
+#pragma unroll
|
||||
+ for(int mi=0;mi<MF;mi++)
|
||||
+#pragma unroll
|
||||
+ for(int ni=0;ni<NF;ni++)
|
||||
+ fp4_mma(acc[mi][ni], af[mi], bf[ni], asc[mi], bsc[ni]);
|
||||
+ }
|
||||
+ // ensure all warps finished reading stage rs before it is reused by a
|
||||
+ // future prefetch (the stage is overwritten at iter kt+1's prefetch).
|
||||
+ __syncthreads();
|
||||
+ }
|
||||
+
|
||||
+#pragma unroll
|
||||
+ for(int mi=0;mi<MF;mi++)
|
||||
+#pragma unroll
|
||||
+ for(int ni=0;ni<NF;ni++){
|
||||
+ int orb=blockRow+wrow*WM+mi*16, ocb=blockCol+wcol*WN+ni*8;
|
||||
+ float* d=acc[mi][ni];
|
||||
+ C[(size_t)(orb+grp)*N+ocb+2*tig] =d[0];
|
||||
+ C[(size_t)(orb+grp)*N+ocb+2*tig+1] =d[1];
|
||||
+ C[(size_t)(orb+grp+8)*N+ocb+2*tig] =d[2];
|
||||
+ C[(size_t)(orb+grp+8)*N+ocb+2*tig+1]=d[3];
|
||||
+ }
|
||||
+#else
|
||||
+ GGML_UNUSED(Aq); GGML_UNUSED(As); GGML_UNUSED(Wq); GGML_UNUSED(Ws);
|
||||
+ GGML_UNUSED(C); GGML_UNUSED(M); GGML_UNUSED(N); GGML_UNUSED(K);
|
||||
+ NO_DEVICE_CODE;
|
||||
+#endif // BLACKWELL_MMA_AVAILABLE
|
||||
+}
|
||||
+
|
||||
+// ===========================================================================
|
||||
+// ggml integration
|
||||
+// ===========================================================================
|
||||
+
|
||||
+bool ggml_cuda_fp4_prefill_should_engage(
|
||||
+ const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * dst, int cc) {
|
||||
+ if (src0->type != GGML_TYPE_NVFP4) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ if (!blackwell_mma_available(cc)) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ const int64_t thr = ggml_cuda_fp4_prefill_m();
|
||||
+ if (thr <= 0) {
|
||||
+ return false; // default-off == stock; decode/small-M untouched
|
||||
+ }
|
||||
+ if (src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ if (src1->ne[1] <= thr) {
|
||||
+ return false; // M = src1->ne[1]; only LARGE M (prefill)
|
||||
+ }
|
||||
+ if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1) || !ggml_is_contiguous(dst)) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ if (ggml_is_transposed(src0) || ggml_is_transposed(src1)) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ // 2D only (a single weight matrix; per-expert MoE slices set ne[2]=ne[3]=1).
|
||||
+ if (src0->ne[2] != 1 || src0->ne[3] != 1 || src1->ne[2] != 1 || src1->ne[3] != 1) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ const int64_t K = src0->ne[0];
|
||||
+ const int64_t N = src0->ne[1];
|
||||
+ if (N % 128 != 0 || K % 256 != 0) {
|
||||
+ return false; // tile constraints; otherwise fall back to MMQ
|
||||
+ }
|
||||
+ return true;
|
||||
+}
|
||||
+
|
||||
+void ggml_cuda_mul_mat_fp4_large_m(
|
||||
+ ggml_backend_cuda_context & ctx,
|
||||
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
+ GGML_ASSERT(src0->type == GGML_TYPE_NVFP4);
|
||||
+ GGML_ASSERT(src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
|
||||
+
|
||||
+ const int64_t K = src0->ne[0];
|
||||
+ const int64_t N = src0->ne[1];
|
||||
+ const int64_t M = src1->ne[1];
|
||||
+ const int64_t Kb = K / FP4_QK;
|
||||
+ GGML_ASSERT(K % 256 == 0 && N % 128 == 0);
|
||||
+
|
||||
+ cudaStream_t stream = ctx.stream();
|
||||
+
|
||||
+ constexpr int BM = 128, BN = 128, WM = 4, WN = 2, KBLK = 4, STAGES = 2, PAD = 4;
|
||||
+ const int64_t Mpad = ((M + BM - 1) / BM) * BM;
|
||||
+
|
||||
+ ggml_cuda_pool_alloc<uint32_t> Wq(ctx.pool(), (size_t) N * Kb * 8);
|
||||
+ ggml_cuda_pool_alloc<uint32_t> Ws(ctx.pool(), (size_t) N * Kb);
|
||||
+ ggml_cuda_pool_alloc<uint32_t> Aq(ctx.pool(), (size_t) Mpad * Kb * 8);
|
||||
+ ggml_cuda_pool_alloc<uint32_t> As(ctx.pool(), (size_t) Mpad * Kb);
|
||||
+
|
||||
+ // Zero the scales of the padded A-rows (M..Mpad) so they contribute 0 (scale 0 ->
|
||||
+ // the OMMA's per-block scale is 0). The padded qs may stay uninitialized.
|
||||
+ if (Mpad > M) {
|
||||
+ CUDA_CHECK(cudaMemsetAsync(As.get() + (size_t) M * Kb, 0,
|
||||
+ (size_t) (Mpad - M) * Kb * sizeof(uint32_t), stream));
|
||||
+ }
|
||||
+
|
||||
+ // split weights (GGUF block_nvfp4 -> Wq/Ws)
|
||||
+ {
|
||||
+ const int64_t tot = N * Kb;
|
||||
+ const int threads = 256;
|
||||
+ const int64_t grid = (tot + threads - 1) / threads;
|
||||
+ fp4_split_layout<<<grid, threads, 0, stream>>>(
|
||||
+ (const block_nvfp4 *) src0->data, Wq.get(), Ws.get(), (int) N, (int) Kb);
|
||||
+ CUDA_CHECK(cudaGetLastError());
|
||||
+ }
|
||||
+ // quantize + split activations (real rows only)
|
||||
+ {
|
||||
+ const int64_t tot = M * Kb * 4;
|
||||
+ const int threads = 256;
|
||||
+ const int64_t grid = (tot + threads - 1) / threads;
|
||||
+ fp4_quantize_act_split<<<grid, threads, 0, stream>>>(
|
||||
+ (const float *) src1->data, Aq.get(), As.get(), (int) M, (int) K, (int) Kb);
|
||||
+ CUDA_CHECK(cudaGetLastError());
|
||||
+ }
|
||||
+
|
||||
+ // Output: write the (Mpad x N) result straight into dst when M is tile-aligned,
|
||||
+ // otherwise into a temp and copy back the first M rows (C is row-major C[m*N+n]).
|
||||
+ float * Cout = (float *) dst->data;
|
||||
+ ggml_cuda_pool_alloc<float> Ctmp(ctx.pool());
|
||||
+ if (Mpad > M) {
|
||||
+ Cout = Ctmp.alloc((size_t) Mpad * N);
|
||||
+ }
|
||||
+
|
||||
+ auto kern = fp4_opt_kernel<BM, BN, WM, WN, KBLK, STAGES, PAD>;
|
||||
+ constexpr int SZ_AQ = BM * (KBLK * 8 + PAD), SZ_AS = BM * KBLK;
|
||||
+ constexpr int SZ_WQ = BN * (KBLK * 8 + PAD), SZ_WS = BN * KBLK;
|
||||
+ constexpr int STAGE_SZ = SZ_AQ + SZ_AS + SZ_WQ + SZ_WS;
|
||||
+ const int smem_bytes = STAGES * STAGE_SZ * (int) sizeof(uint32_t);
|
||||
+ CUDA_CHECK(cudaFuncSetAttribute(kern, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_bytes));
|
||||
+
|
||||
+ dim3 grid((unsigned) (N / BN), (unsigned) (Mpad / BM));
|
||||
+ dim3 block(WM * WN * 32);
|
||||
+ kern<<<grid, block, smem_bytes, stream>>>(
|
||||
+ Aq.get(), As.get(), Wq.get(), Ws.get(), Cout, (int) Mpad, (int) N, (int) K);
|
||||
+ CUDA_CHECK(cudaGetLastError());
|
||||
+
|
||||
+ if (Mpad > M) {
|
||||
+ CUDA_CHECK(cudaMemcpyAsync(dst->data, Ctmp.get(), (size_t) M * N * sizeof(float),
|
||||
+ cudaMemcpyDeviceToDevice, stream));
|
||||
+ }
|
||||
+}
|
||||
diff --git a/ggml/src/ggml-cuda/fp4-gemm.cuh b/ggml/src/ggml-cuda/fp4-gemm.cuh
|
||||
new file mode 100644
|
||||
index 0000000..8ed1aa4
|
||||
--- /dev/null
|
||||
+++ b/ggml/src/ggml-cuda/fp4-gemm.cuh
|
||||
@@ -0,0 +1,38 @@
|
||||
+#pragma once
|
||||
+
|
||||
+#include "common.cuh"
|
||||
+
|
||||
+// [paged patch 0034] Native NVFP4 (W4A4) large-M GEMM for Blackwell sm_121a (GB10).
|
||||
+//
|
||||
+// A Marlin-class tiled FP4-MMA GEMM (cp.async multistage prefetch, register-resident
|
||||
+// accumulators, ldmatrix A-operand, m16n8k64 mxf4nvf4 block-scale OMMA with e4m3
|
||||
+// true-scale) that beats the dequant->bf16 cuBLAS (nvjet) path that the rejected 0033
|
||||
+// scaffold routed large-M prefill through. The kernel body is the bit-exact PoC
|
||||
+// (NMSE=0 vs a same-dequant f32 reference) at its tuned best config
|
||||
+// (128x128 / KBLK4 / STAGES2 / PAD4).
|
||||
+//
|
||||
+// Its operands are bit-identical to the shipped FP4-MMQ path's: it consumes the SAME
|
||||
+// e2m1 weight nibbles + e4m3 scale bytes from the GGUF block_nvfp4, quantizes
|
||||
+// activations with the SAME math as quantize_mmq_nvfp4 (e4m3 amax/6 scale + the +/-2
|
||||
+// code search + ggml_cuda_float_to_fp4_e2m1), and feeds the SAME hardware OMMA. The
|
||||
+// only difference vs FP4-MMQ is the K-accumulation order (a different but equivalent
|
||||
+// f32 reduction tree), which is greedy-md5 gated like every other paged path.
|
||||
+//
|
||||
+// Engages ONLY at large M (prefill), behind the 0033 LLAMA_FP4_PREFILL_M threshold;
|
||||
+// decode and small-M are byte-untouched and never reach this kernel.
|
||||
+
|
||||
+// True if the native FP4 large-M path should handle this dense NVFP4 mul_mat:
|
||||
+// src0 NVFP4 + src1/dst f32, contiguous, not transposed, 2D, Blackwell,
|
||||
+// LLAMA_FP4_PREFILL_M > 0, M = src1->ne[1] > threshold, N % 128 == 0, K % 256 == 0.
|
||||
+// This single predicate also routes per-expert MoE slices (they flow through
|
||||
+// ggml_cuda_mul_mat) into the native kernel.
|
||||
+bool ggml_cuda_fp4_prefill_should_engage(
|
||||
+ const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * dst, int cc);
|
||||
+
|
||||
+// Native FP4 W4A4 GEMM: dst[M,N] = src1_act[M,K] @ src0_w[N,K]^T.
|
||||
+// src0 = NVFP4 weights, src1 = f32 activations, dst = f32. Streams on ctx.stream(),
|
||||
+// pool-allocates scratch; no host sync. Caller must have checked
|
||||
+// ggml_cuda_fp4_prefill_should_engage().
|
||||
+void ggml_cuda_mul_mat_fp4_large_m(
|
||||
+ ggml_backend_cuda_context & ctx,
|
||||
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
index 2ecc971..a92003c 100644
|
||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
@@ -25,6 +25,7 @@
|
||||
#include "ggml-cuda/diagmask.cuh"
|
||||
#include "ggml-cuda/diag.cuh"
|
||||
#include "ggml-cuda/fattn.cuh"
|
||||
+#include "ggml-cuda/fp4-gemm.cuh"
|
||||
#include "ggml-cuda/fwht.cuh"
|
||||
#include "ggml-cuda/getrows.cuh"
|
||||
#include "ggml-cuda/im2col.cuh"
|
||||
@@ -2541,6 +2542,19 @@ static bool ggml_cuda_should_fuse_mul_mat_vec_q(const ggml_tensor * tensor) {
|
||||
static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
|
||||
|
||||
+ // [paged patch 0034] Native NVFP4 (W4A4) large-M (prefill) FP4-MMA GEMM. Engages only
|
||||
+ // when LLAMA_FP4_PREFILL_M>0 and M=src1->ne[1] exceeds it (and tile dims divide), so
|
||||
+ // decode / small-M is byte-untouched. This also catches the per-expert MoE slices that
|
||||
+ // flow through here from the mul_mat_id host-sync loop, routing each expert GEMM to the
|
||||
+ // native kernel (see ggml_cuda_should_use_mmq's MoE gate in mmq.cu).
|
||||
+ if (!split) {
|
||||
+ const int cc_fp4 = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
|
||||
+ if (ggml_cuda_fp4_prefill_should_engage(src0, src1, dst, cc_fp4)) {
|
||||
+ ggml_cuda_mul_mat_fp4_large_m(ctx, src0, src1, dst);
|
||||
+ return;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
// If src0 is a temporary compute buffer it may have some padding that needs to be cleared for mul_mat_vec_q or mul_mat_q.
|
||||
// But if src0 is also a view of another tensor then this cannot be done safely because it may overwrite valid tensor data.
|
||||
// Therefore, in such cases use cuBLAS.
|
||||
diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
|
||||
index 2dcaaab..694a402 100644
|
||||
--- a/ggml/src/ggml-cuda/mmq.cu
|
||||
+++ b/ggml/src/ggml-cuda/mmq.cu
|
||||
@@ -321,24 +321,29 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
|
||||
return false;
|
||||
}
|
||||
|
||||
- // Paged prefill lever (patch 0033): OPTION-(a) route large-M NVFP4 dense GEMMs
|
||||
- // OFF the FP4-MMQ kernel and through the dequant->bf16 cuBLAS (nvjet)
|
||||
- // tensor-core path (ggml_cuda_op_mul_mat_cublas, NVFP4 bf16 branch). The
|
||||
- // scope premise was that FP4-MMQ is register-bound to ~3% of FP4 peak at
|
||||
- // large M. MEASURED ON GB10 THIS IS FALSE: FP4-MMQ at M=512..2048 beats
|
||||
- // dequant->bf16 cuBLAS by 29-49% (S_PP A/B in docs/PREFILL_GEMM_RESULTS.md),
|
||||
- // because bf16 tensor-core peak is ~half FP4 peak AND the per-step weight
|
||||
- // dequant + 4x bf16 weight traffic (~8x total vs the FP4 read) dominate and
|
||||
- // only partially amortize as M grows. The path is NUMERICALLY VALID and
|
||||
- // benign (greedy md5 byte-identical to FP4-MMQ; test-backend-ops passes), so
|
||||
- // it is kept as a validated, env-gated scaffold (for option-(b) native FP4
|
||||
- // large-M kernels and non-GB10 hardware), but DEFAULT-DISABLED (== stock).
|
||||
- // Set -D LLAMA_FP4_PREFILL_M=<M> or env LLAMA_FP4_PREFILL_M=<M> to A/B it;
|
||||
- // 0 (default) disables. Dense only (n_experts == 0).
|
||||
+ // Paged prefill lever (patch 0033 -> 0034): route large-M NVFP4 prefill GEMMs to the
|
||||
+ // native FP4-MMA (W4A4 OMMA) kernel in fp4-gemm.cu instead of the FP4-MMQ kernel.
|
||||
+ //
|
||||
+ // - DENSE (n_experts == 0): the reroute happens earlier, in ggml_cuda_mul_mat's
|
||||
+ // ggml_cuda_fp4_prefill_should_engage() early check, which knows the N/K tile
|
||||
+ // divisibility. We deliberately do NOT force dense off MMQ here: if the native
|
||||
+ // kernel cannot take a shape (non-divisible N/K) MMQ stays the correct fallback,
|
||||
+ // NOT the rejected dequant->bf16 cuBLAS path.
|
||||
+ // - MoE (n_experts > 0): force the grouped FP4-MMQ id-path OFF at large M so
|
||||
+ // mul_mat_id falls to its per-expert host-sync loop, where each expert slice flows
|
||||
+ // back through ggml_cuda_mul_mat and hits the native kernel. CUDA graphs are
|
||||
+ // disabled for that prefill step (prefill is not graph-replayed); a graph-safe
|
||||
+ // grouped (ragged-batched) FP4-MMA kernel is the flagged follow-up. Decode keeps
|
||||
+ // ne12 <= threshold so the grouped graph-safe MMQ id-path (patch 0025) is untouched.
|
||||
+ //
|
||||
+ // The historical 0033 finding stands: dequant->bf16 cuBLAS LOSES to FP4-MMQ at large M
|
||||
+ // (bf16 tensor-core peak is ~half FP4 peak + 8x weight traffic), which is exactly why
|
||||
+ // the native FP4-MMA kernel (NMSE=0, ~103 TFLOP/s, beats cuBLAS bf16) replaces it here.
|
||||
+ // Set -D LLAMA_FP4_PREFILL_M=<M> or env LLAMA_FP4_PREFILL_M=<M>; 0 (default) == stock.
|
||||
#ifndef LLAMA_FP4_PREFILL_M
|
||||
#define LLAMA_FP4_PREFILL_M 0
|
||||
#endif // LLAMA_FP4_PREFILL_M
|
||||
- if (type == GGML_TYPE_NVFP4 && n_experts == 0 && blackwell_mma_available(cc)) {
|
||||
+ if (type == GGML_TYPE_NVFP4 && n_experts > 0 && blackwell_mma_available(cc)) {
|
||||
static const int64_t fp4_prefill_m = [] {
|
||||
const char * e = getenv("LLAMA_FP4_PREFILL_M");
|
||||
return e != nullptr ? (int64_t) atoll(e) : (int64_t) LLAMA_FP4_PREFILL_M;
|
||||
--
|
||||
2.43.0
|
||||
|
||||
@@ -0,0 +1,572 @@
|
||||
From df186bd20a23a1baae92f2828fc68f240c115e7d Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Mon, 29 Jun 2026 03:34:48 +0200
|
||||
Subject: [PATCH] feat(paged): Marlin-style W4A16 grouped MoE prefill GEMM
|
||||
(patch 0035)
|
||||
|
||||
Profile-validated #2 prefill lever: a DISTINCT kernel from the two prefill
|
||||
rejects. NOT patch 0033 (separate-pass dequant -> bf16 cuBLAS/nvjet, lost to
|
||||
FP4-MMQ at large M). NOT patch 0034 (native W4A4 FP4-MMA mxf4nvf4 OMMA, still
|
||||
pays the quantize_mmq_nvfp4 activation-quant tax). This is the W4A16 shape vLLM
|
||||
uses on sm_121: FP4 expert weights dequantized to bf16 IN REGISTERS right before
|
||||
the MMA, activations kept bf16 (a cheap f32->bf16 cast, NO per-block amax/code
|
||||
quantize -> ZERO activation-quant tax), standard bf16 m16n8k16 mma.sync (reuses
|
||||
ggml/src/ggml-cuda/mma.cuh tiles) into f32 accumulators, cp.async multistage.
|
||||
|
||||
GROUPED (the actual prefill shape): one kernel launch over the mul_mat_id
|
||||
token-sorted activation buffer (src1_sorted is already sorted-by-expert by the
|
||||
existing host path), with a per-M-tile expert map so each output tile reads its
|
||||
own expert weight matrix (src0 + expert*nb02); the ragged per-expert row tail is
|
||||
masked. No per-expert kernel launch, no per-expert M-padding (vs the 0034
|
||||
per-expert host-sync loop). The B (weight) fragment is filled by in-register
|
||||
FP4->bf16 dequant via the tile get_i/get_j contract (correct-by-construction
|
||||
vs ldmatrix); the A (activation) fragment is a bf16 ldmatrix.
|
||||
|
||||
ROUTING (default-off; distinct env from 0034):
|
||||
- mmq.cu (ggml_cuda_should_use_mmq): NVFP4 + n_experts>0 + Blackwell +
|
||||
ne11(tokens) > LLAMA_W4A16_PREFILL_M returns false, so mul_mat_id falls to
|
||||
the token-sorting host path.
|
||||
- ggml-cuda.cu (ggml_cuda_mul_mat_id): once src1_sorted is built, if
|
||||
ggml_cuda_w4a16_moe_grouped_should_engage() the grouped kernel replaces the
|
||||
per-expert GEMM loop (dst_sorted then scattered back as usual). Decode keeps
|
||||
ne12 <= threshold so the graph-safe grouped MMQ id-path (0025/0043) is
|
||||
untouched; non-MoE / non-NVFP4 / small-M are byte-untouched.
|
||||
|
||||
TOGGLE / A-B: env (or -D) LLAMA_W4A16_PREFILL_M. 0 (default) == OFF == stock;
|
||||
>0 engages for MoE prefill GEMMs with tokens > the value. LLAMA_W4A16_DEBUG=1
|
||||
prints per-GEMM engagement (total_rows / n_tiles / max-tokens-per-expert).
|
||||
|
||||
VALIDATION (GB10, sm_121a, Qwen3.6-35B-A3B-NVFP4):
|
||||
- test-backend-ops MUL_MAT_ID nvfp4 (CUDA0 vs CPU oracle), W4A16 forced
|
||||
(LLAMA_W4A16_PREFILL_M=1): 81/81 OK, 0 FAIL (incl. multi-tile-per-expert
|
||||
cases). The threading bug found here (mma.cuh tile ops use threadIdx.x AS the
|
||||
warp lane, so the block must be 2D (32,NWARP)) is fixed.
|
||||
- greedy md5 (paged MoE, LLAMA_KV_PAGED=1): NOT-engaged (high threshold) ==
|
||||
OFF baseline 4a3fd812 BYTE-IDENTICAL (default-off is stock); engaged
|
||||
(120 grouped GEMMs on a 116-token prefill) is coherent + benign (a different
|
||||
but equivalent bf16-vs-Q8_1 K-reduction, like the documented paged-MoE path
|
||||
divergence), output near-identical to stock.
|
||||
|
||||
HONEST PERF (S_PP t/s, llama-batched-bench -fa on -ngl 99 -ntg 32 -npl 1,
|
||||
LLAMA_KV_PAGED=1, OFF vs W4A16 thr=64), CURRENTLY A REGRESSION:
|
||||
npp 512 : 1096.7 -> 794.8 (-28%)
|
||||
npp 1024: 1413.5 -> 961.1 (-32%)
|
||||
npp 2048: 1671.3 -> 1069.6 (-36%)
|
||||
Decode TG unaffected (~53 t/s both). The kernel is CORRECT but its first
|
||||
untuned config (BM64/BN128/STAGES2, scalar in-register dequant, f32->bf16 cast
|
||||
pre-pass, 4B weight cp.async, BM-tile ragged-utilization waste, per-GEMM host
|
||||
tile-map + 3 H2D copies) does not yet beat the tuned FP4-MMQ grouped path on
|
||||
GB10; it does not realize the profiled vLLM 2.16x. Ships DEFAULT-OFF (like 0033
|
||||
scaffold / 0017) as the validated, env-gated mechanism + bit-exact gate for the
|
||||
tuning follow-ups (deeper pipeline, ldmatrix/16B weight staging, smem-conflict
|
||||
padding, larger/register-resident tiles, removing the cast pre-pass, dropping
|
||||
the per-GEMM host map).
|
||||
|
||||
Build: arch=compute_121a,code=[compute_121a,sm_121a]; BLACKWELL_MMA_AVAILABLE /
|
||||
AMPERE_MMA_AVAILABLE / CP_ASYNC_AVAILABLE guards (NO_DEVICE_CODE off-Blackwell).
|
||||
|
||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
||||
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
|
||||
---
|
||||
ggml/src/ggml-cuda/ggml-cuda.cu | 12 +
|
||||
ggml/src/ggml-cuda/mmq.cu | 17 ++
|
||||
ggml/src/ggml-cuda/w4a16-gemm.cu | 359 ++++++++++++++++++++++++++++++
|
||||
ggml/src/ggml-cuda/w4a16-gemm.cuh | 55 +++++
|
||||
4 files changed, 443 insertions(+)
|
||||
create mode 100644 ggml/src/ggml-cuda/w4a16-gemm.cu
|
||||
create mode 100644 ggml/src/ggml-cuda/w4a16-gemm.cuh
|
||||
|
||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
index 3151684..37e4d11 100644
|
||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
@@ -26,6 +26,7 @@
|
||||
#include "ggml-cuda/diag.cuh"
|
||||
#include "ggml-cuda/fattn.cuh"
|
||||
#include "ggml-cuda/fp4-gemm.cuh"
|
||||
+#include "ggml-cuda/w4a16-gemm.cuh"
|
||||
#include "ggml-cuda/fwht.cuh"
|
||||
#include "ggml-cuda/getrows.cuh"
|
||||
#include "ggml-cuda/im2col.cuh"
|
||||
@@ -2747,6 +2748,16 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||
ne10*ts_src1_sorted, ne_get_rows*ne10*ts_src1_sorted, ne_get_rows*ne10*ts_src1_sorted, stream);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
|
||||
+ // [paged patch 0035] Marlin-style W4A16 grouped MoE prefill GEMM: one launch over the
|
||||
+ // token-sorted activation buffer (src1_sorted, already f32 + sorted-by-expert above) with a
|
||||
+ // per-tile expert map, in-register FP4->bf16 weight dequant + bf16 mma. Replaces the
|
||||
+ // per-expert host-sync GEMM loop. Engages only when LLAMA_W4A16_PREFILL_M>0 and ne12>thr
|
||||
+ // (large-M prefill); decode / non-NVFP4 keep the loop below (byte-identical to stock).
|
||||
+ if (ggml_cuda_w4a16_moe_grouped_should_engage(src0, src1, dst, cc)) {
|
||||
+ ggml_cuda_mul_mat_id_w4a16_grouped(ctx, src0,
|
||||
+ (const float *) src1_sorted.ptr, (float *) dst_sorted.ptr,
|
||||
+ tokens_per_expert.data(), ne02, ne10, ne0, stream);
|
||||
+ } else {
|
||||
char * src1_data_cur = (char *) src1_sorted.ptr;
|
||||
char * dst_data_cur = (char *) dst_sorted.ptr;
|
||||
for (int64_t i02 = 0; i02 < ne02; ++i02) {
|
||||
@@ -2795,6 +2806,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||
src1_data_cur += src1_slice.nb[2];
|
||||
dst_data_cur += dst_slice.nb[2];
|
||||
}
|
||||
+ }
|
||||
|
||||
get_rows_cuda(dst_sorted.ptr, type_dst_sorted, ids_from_sorted, dst->data, dst->type,
|
||||
ne0, ne0*ts_dst_sorted, ne_get_rows*ne0*ts_dst_sorted, ne_get_rows*ne0*ts_dst_sorted,
|
||||
diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
|
||||
index 694a402..dc5c2d1 100644
|
||||
--- a/ggml/src/ggml-cuda/mmq.cu
|
||||
+++ b/ggml/src/ggml-cuda/mmq.cu
|
||||
@@ -353,6 +353,23 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
|
||||
}
|
||||
}
|
||||
|
||||
+ // Paged prefill lever (patch 0035): the Marlin-style W4A16 grouped MoE GEMM also needs the
|
||||
+ // grouped FP4-MMQ id-path forced OFF at large M so mul_mat_id falls to the token-sorting
|
||||
+ // host path, where the grouped W4A16 kernel is dispatched (in-register FP4->bf16 dequant +
|
||||
+ // bf16 mma, ZERO activation-quant). Distinct env from 0034; default 0 == stock.
|
||||
+#ifndef LLAMA_W4A16_PREFILL_M
|
||||
+#define LLAMA_W4A16_PREFILL_M 0
|
||||
+#endif // LLAMA_W4A16_PREFILL_M
|
||||
+ if (type == GGML_TYPE_NVFP4 && n_experts > 0 && blackwell_mma_available(cc)) {
|
||||
+ static const int64_t w4a16_prefill_m = [] {
|
||||
+ const char * e = getenv("LLAMA_W4A16_PREFILL_M");
|
||||
+ return e != nullptr ? (int64_t) atoll(e) : (int64_t) LLAMA_W4A16_PREFILL_M;
|
||||
+ }();
|
||||
+ if (w4a16_prefill_m > 0 && ne11 > w4a16_prefill_m) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
if (turing_mma_available(cc)) {
|
||||
return true;
|
||||
}
|
||||
diff --git a/ggml/src/ggml-cuda/w4a16-gemm.cu b/ggml/src/ggml-cuda/w4a16-gemm.cu
|
||||
new file mode 100644
|
||||
index 0000000..f348f31
|
||||
--- /dev/null
|
||||
+++ b/ggml/src/ggml-cuda/w4a16-gemm.cu
|
||||
@@ -0,0 +1,359 @@
|
||||
+#include "w4a16-gemm.cuh"
|
||||
+#include "mma.cuh"
|
||||
+
|
||||
+#include <algorithm>
|
||||
+#include <cstdint>
|
||||
+#include <cstdlib>
|
||||
+#include <vector>
|
||||
+
|
||||
+// ===========================================================================
|
||||
+// [paged patch 0035] Marlin-style W4A16 grouped MoE prefill GEMM. See w4a16-gemm.cuh.
|
||||
+//
|
||||
+// In-register FP4->bf16 weight dequant + bf16 activations + bf16 m16n8k16 mma.sync (mma.cuh),
|
||||
+// cp.async multistage, grouped (ragged, per-tile expert offset) over the token-sorted buffer.
|
||||
+// ===========================================================================
|
||||
+
|
||||
+using namespace ggml_cuda_mma;
|
||||
+typedef tile<16, 8, nv_bfloat162> tile_A; // A operand: M=16, K=16
|
||||
+typedef tile< 8, 8, nv_bfloat162> tile_B; // B operand: N=8, K=16
|
||||
+typedef tile<16, 8, float> tile_C; // accumulator: M=16, N=8
|
||||
+
|
||||
+#ifndef LLAMA_W4A16_PREFILL_M
|
||||
+#define LLAMA_W4A16_PREFILL_M 0
|
||||
+#endif // LLAMA_W4A16_PREFILL_M
|
||||
+
|
||||
+int64_t ggml_cuda_w4a16_prefill_m() {
|
||||
+ static const int64_t m = [] {
|
||||
+ const char * e = getenv("LLAMA_W4A16_PREFILL_M");
|
||||
+ return e != nullptr ? (int64_t) atoll(e) : (int64_t) LLAMA_W4A16_PREFILL_M;
|
||||
+ }();
|
||||
+ return m;
|
||||
+}
|
||||
+
|
||||
+bool ggml_cuda_w4a16_prefill_enabled() {
|
||||
+ return ggml_cuda_w4a16_prefill_m() > 0;
|
||||
+}
|
||||
+
|
||||
+// ---- cp.async helpers (sm80+; raw bytes, no cast) ----
|
||||
+static __device__ __forceinline__ void w4a16_cp_async16(void * smem, const void * gmem) {
|
||||
+#ifdef CP_ASYNC_AVAILABLE
|
||||
+ const unsigned s = (unsigned) __cvta_generic_to_shared(smem);
|
||||
+ asm volatile("cp.async.cg.shared.global [%0],[%1],16;\n" :: "r"(s), "l"(gmem));
|
||||
+#else
|
||||
+ GGML_UNUSED(smem); GGML_UNUSED(gmem); NO_DEVICE_CODE;
|
||||
+#endif // CP_ASYNC_AVAILABLE
|
||||
+}
|
||||
+static __device__ __forceinline__ void w4a16_cp_async4(void * smem, const void * gmem) {
|
||||
+#ifdef CP_ASYNC_AVAILABLE
|
||||
+ const unsigned s = (unsigned) __cvta_generic_to_shared(smem);
|
||||
+ asm volatile("cp.async.ca.shared.global [%0],[%1],4;\n" :: "r"(s), "l"(gmem));
|
||||
+#else
|
||||
+ GGML_UNUSED(smem); GGML_UNUSED(gmem); NO_DEVICE_CODE;
|
||||
+#endif // CP_ASYNC_AVAILABLE
|
||||
+}
|
||||
+static __device__ __forceinline__ void w4a16_cp_commit() {
|
||||
+#ifdef CP_ASYNC_AVAILABLE
|
||||
+ asm volatile("cp.async.commit_group;\n" ::);
|
||||
+#else
|
||||
+ NO_DEVICE_CODE;
|
||||
+#endif // CP_ASYNC_AVAILABLE
|
||||
+}
|
||||
+template<int N> static __device__ __forceinline__ void w4a16_cp_wait() {
|
||||
+#ifdef CP_ASYNC_AVAILABLE
|
||||
+ asm volatile("cp.async.wait_group %0;\n" :: "n"(N));
|
||||
+#else
|
||||
+ NO_DEVICE_CODE;
|
||||
+#endif // CP_ASYNC_AVAILABLE
|
||||
+}
|
||||
+
|
||||
+// ---- f32 -> bf16 activation cast (NO quantize). Pads the [total_rows, pad_rows) tail with 0. ----
|
||||
+static __global__ void w4a16_cast_act_f32_bf16(
|
||||
+ const float * __restrict__ x, nv_bfloat16 * __restrict__ y, int64_t n, int64_t npad) {
|
||||
+ const int64_t i = (int64_t) blockIdx.x * blockDim.x + threadIdx.x;
|
||||
+ if (i >= npad) {
|
||||
+ return;
|
||||
+ }
|
||||
+ y[i] = i < n ? __float2bfloat16(x[i]) : (nv_bfloat16) 0.0f;
|
||||
+}
|
||||
+
|
||||
+// ---------------------------------------------------------------------------
|
||||
+// Grouped W4A16 GEMM. For each output tile (blockIdx.x = N-block, blockIdx.y = M-tile):
|
||||
+// expert e = g_tile_expert[blockIdx.y]
|
||||
+// row_start = g_tile_row0[blockIdx.y] (absolute row in the sorted buffer)
|
||||
+// row_count = g_tile_rows[blockIdx.y] (valid rows in this tile, <= BM)
|
||||
+// Weights read from W = src0 + e*expert_stride_blocks (block_nvfp4 [N,Kb]); activations from
|
||||
+// Abf (bf16, sorted); output to C (f32, sorted; ggml dims [N, total_rows], i.e. row-major C[row*N + col]).
|
||||
+// Weights are dequantized FP4->bf16 in registers; A via ldmatrix; bf16 m16n8k16 mma.
|
||||
+// BK = 64 (one nvfp4 block per K-step); STAGES-deep cp.async pipeline over the Kb blocks.
|
||||
+// ---------------------------------------------------------------------------
|
||||
+template<int BM, int BN, int WARPS_M, int WARPS_N, int STAGES>
|
||||
+__launch_bounds__(WARPS_M*WARPS_N*32, 1)
|
||||
+static __global__ void w4a16_grouped_kernel(
|
||||
+ const nv_bfloat16 * __restrict__ Abf, // [pad_rows, K] bf16
|
||||
+ const block_nvfp4 * __restrict__ W0, // src0 base (expert 0)
|
||||
+ float * __restrict__ C, // [total_rows, N] f32
|
||||
+ const int * __restrict__ g_tile_expert,
|
||||
+ const int * __restrict__ g_tile_row0,
|
||||
+ const int * __restrict__ g_tile_rows,
|
||||
+ int N, int K, int64_t expert_stride_blocks) {
|
||||
+#if defined(AMPERE_MMA_AVAILABLE) && defined(CP_ASYNC_AVAILABLE)
|
||||
+ constexpr int BK = 64; // one nvfp4 block
|
||||
+ constexpr int NWARP = WARPS_M*WARPS_N;
|
||||
+ constexpr int THREADS = NWARP*32;
|
||||
+ constexpr int WM = BM/WARPS_M, WN = BN/WARPS_N;
|
||||
+ constexpr int MF = WM/16, NF = WN/8;
|
||||
+
|
||||
+ constexpr int AN = BK/2; // bf16 pairs per A smem row (nv_bfloat162)
|
||||
+ constexpr int SZ_A = BM*AN; // nv_bfloat162 per stage
|
||||
+ constexpr int SZ_WQ = BN*8; // u32 per stage (32 qs bytes/row)
|
||||
+ constexpr int SZ_WD = BN; // u32 per stage (4 scale bytes/row)
|
||||
+
|
||||
+ extern __shared__ uint32_t smem_u32[];
|
||||
+ // Layout per stage: [A as u32 = nv_bfloat162][Wq u32][Wd u32]
|
||||
+ constexpr int STAGE_U32 = SZ_A + SZ_WQ + SZ_WD;
|
||||
+ nv_bfloat162 * sA[STAGES];
|
||||
+ uint32_t * sWq[STAGES];
|
||||
+ uint32_t * sWd[STAGES];
|
||||
+#pragma unroll
|
||||
+ for (int s = 0; s < STAGES; s++) {
|
||||
+ uint32_t * base = smem_u32 + s*STAGE_U32;
|
||||
+ sA[s] = (nv_bfloat162 *) base;
|
||||
+ sWq[s] = base + SZ_A;
|
||||
+ sWd[s] = base + SZ_A + SZ_WQ;
|
||||
+ }
|
||||
+
|
||||
+ // mma.cuh's tile ops (load_ldmatrix / mma / tile::get_i/get_j) use threadIdx.x AS THE WARP LANE,
|
||||
+ // so the block MUST be 2D (32, NWARP): threadIdx.x = lane (0..31), threadIdx.y = warp.
|
||||
+ const int lane = threadIdx.x; // 0..31
|
||||
+ const int warp = threadIdx.y; // 0..NWARP-1
|
||||
+ const int tid = warp*32 + lane; // linear id for the cp.async strided copies
|
||||
+ const int wrow = warp / WARPS_N, wcol = warp % WARPS_N;
|
||||
+
|
||||
+ const int e = g_tile_expert[blockIdx.y];
|
||||
+ const int row0 = g_tile_row0[blockIdx.y];
|
||||
+ const int rcount = g_tile_rows[blockIdx.y];
|
||||
+ const int blockCol = blockIdx.x*BN;
|
||||
+ const int Kb = K/64;
|
||||
+ const block_nvfp4 * We = W0 + (int64_t) e*expert_stride_blocks; // expert e weight base
|
||||
+
|
||||
+ tile_C acc[MF][NF];
|
||||
+
|
||||
+ // async-load K-block `kt` into stage `st`
|
||||
+ auto load_tile = [&](int st, int kt) {
|
||||
+ // A: BM rows x BK bf16 = BM x AN nv_bfloat162 = BM x (BK/8) 16B chunks
|
||||
+ const int A_chunks = BM*(BK/8);
|
||||
+#pragma unroll 1
|
||||
+ for (int idx = tid; idx < A_chunks; idx += THREADS) {
|
||||
+ const int c = idx % (BK/8); // 16B chunk in the row
|
||||
+ const int r = idx / (BK/8); // row in tile
|
||||
+ const nv_bfloat16 * src = Abf + (int64_t)(row0 + r)*K + (int64_t)kt*BK + c*8;
|
||||
+ w4a16_cp_async16(((char *) sA[st]) + (r*AN + c*4)*sizeof(uint32_t), src);
|
||||
+ }
|
||||
+ // W qs: BN rows x 32 bytes = BN x 8 u32 (each block's qs at byte offset 4)
|
||||
+#pragma unroll 1
|
||||
+ for (int idx = tid; idx < BN*8; idx += THREADS) {
|
||||
+ const int w = idx & 7; // u32 word in the 32-byte qs
|
||||
+ const int r = idx >> 3; // row in tile
|
||||
+ const block_nvfp4 * blk = We + (int64_t)(blockCol + r)*Kb + kt;
|
||||
+ const char * src = ((const char *) blk) + 4 /*d[4]*/ + w*4;
|
||||
+ w4a16_cp_async4(&sWq[st][r*8 + w], src);
|
||||
+ }
|
||||
+ // W scales: BN rows x 4 bytes (one u32 each, the block's d[4] at byte offset 0)
|
||||
+#pragma unroll 1
|
||||
+ for (int r = tid; r < BN; r += THREADS) {
|
||||
+ const block_nvfp4 * blk = We + (int64_t)(blockCol + r)*Kb + kt;
|
||||
+ w4a16_cp_async4(&sWd[st][r], (const char *) blk);
|
||||
+ }
|
||||
+ };
|
||||
+
|
||||
+ // prologue
|
||||
+#pragma unroll
|
||||
+ for (int s = 0; s < STAGES-1; s++) { if (s < Kb) load_tile(s, s); w4a16_cp_commit(); }
|
||||
+
|
||||
+ for (int kt = 0; kt < Kb; kt++) {
|
||||
+ const int ld = kt + (STAGES-1);
|
||||
+ if (ld < Kb) load_tile(ld % STAGES, ld);
|
||||
+ w4a16_cp_commit();
|
||||
+ w4a16_cp_wait<STAGES-1>();
|
||||
+ __syncthreads();
|
||||
+
|
||||
+ const int rs = kt % STAGES;
|
||||
+ const nv_bfloat162 * sAcur = sA[rs];
|
||||
+ const uint8_t * sWqb = (const uint8_t *) sWq[rs]; // BN rows x 32 bytes
|
||||
+ const uint32_t * sWdw = sWd[rs]; // BN rows x 1 u32 (4 scale bytes)
|
||||
+
|
||||
+#pragma unroll
|
||||
+ for (int kk = 0; kk < BK/16; kk++) { // 4 m16n8k16 sub-steps per 64-block
|
||||
+ const int sub = kk; // sub-block (0..3): selects scale + nibble half
|
||||
+ // A fragments via ldmatrix (bf16)
|
||||
+ tile_A A_frag[MF];
|
||||
+#pragma unroll
|
||||
+ for (int mi = 0; mi < MF; mi++) {
|
||||
+ const int rb = wrow*WM + mi*16;
|
||||
+ load_ldmatrix(A_frag[mi], sAcur + rb*AN + kk*8, AN);
|
||||
+ }
|
||||
+ // B fragments: in-register FP4->bf16 dequant (correct-by-construction via tile get_i/get_j)
|
||||
+ tile_B B_frag[NF];
|
||||
+ const int n_local = lane >> 2; // tile_B::get_i (row N, 0..7)
|
||||
+ const int jc = lane & 3; // lane%4
|
||||
+ const int qbyte = sub*8 + 2*jc; // qs byte index for this lane within the block
|
||||
+#pragma unroll
|
||||
+ for (int ni = 0; ni < NF; ni++) {
|
||||
+ const int nrow = wcol*WN + ni*8 + n_local; // col within BN tile [0,BN)
|
||||
+ const uint8_t * qsb = sWqb + nrow*32; // this row's 32 qs bytes
|
||||
+ const uint8_t b0 = qsb[qbyte];
|
||||
+ const uint8_t b1 = qsb[qbyte + 1];
|
||||
+ const float sc = ggml_cuda_ue4m3_to_fp32(((const uint8_t *) &sWdw[nrow])[sub]);
|
||||
+ // x[0]: low nibbles (k = 2jc, 2jc+1)
|
||||
+ B_frag[ni].x[0].x = __float2bfloat16(sc * (float) kvalues_mxfp4[b0 & 0x0F]);
|
||||
+ B_frag[ni].x[0].y = __float2bfloat16(sc * (float) kvalues_mxfp4[b1 & 0x0F]);
|
||||
+ // x[1]: high nibbles (k = 8+2jc, 9+2jc)
|
||||
+ B_frag[ni].x[1].x = __float2bfloat16(sc * (float) kvalues_mxfp4[b0 >> 4]);
|
||||
+ B_frag[ni].x[1].y = __float2bfloat16(sc * (float) kvalues_mxfp4[b1 >> 4]);
|
||||
+ }
|
||||
+#pragma unroll
|
||||
+ for (int mi = 0; mi < MF; mi++)
|
||||
+#pragma unroll
|
||||
+ for (int ni = 0; ni < NF; ni++)
|
||||
+ mma(acc[mi][ni], A_frag[mi], B_frag[ni]);
|
||||
+ }
|
||||
+ __syncthreads();
|
||||
+ }
|
||||
+
|
||||
+ // write back (mask the ragged per-expert row tail)
|
||||
+#pragma unroll
|
||||
+ for (int mi = 0; mi < MF; mi++)
|
||||
+#pragma unroll
|
||||
+ for (int ni = 0; ni < NF; ni++) {
|
||||
+ const int orow = wrow*WM + mi*16;
|
||||
+ const int ocol = blockCol + wcol*WN + ni*8;
|
||||
+#pragma unroll
|
||||
+ for (int l = 0; l < acc[mi][ni].ne; l++) {
|
||||
+ const int lr = orow + acc[mi][ni].get_i(l); // local row within tile
|
||||
+ const int nc = ocol + acc[mi][ni].get_j(l); // global col
|
||||
+ if (lr < rcount && nc < N) {
|
||||
+ C[(int64_t)(row0 + lr)*N + nc] = acc[mi][ni].x[l];
|
||||
+ }
|
||||
+ }
|
||||
+ }
|
||||
+#else
|
||||
+ GGML_UNUSED(Abf); GGML_UNUSED(W0); GGML_UNUSED(C);
|
||||
+ GGML_UNUSED(g_tile_expert); GGML_UNUSED(g_tile_row0); GGML_UNUSED(g_tile_rows);
|
||||
+ GGML_UNUSED(N); GGML_UNUSED(K); GGML_UNUSED(expert_stride_blocks);
|
||||
+ NO_DEVICE_CODE;
|
||||
+#endif // AMPERE_MMA_AVAILABLE && CP_ASYNC_AVAILABLE
|
||||
+}
|
||||
+
|
||||
+// ===========================================================================
|
||||
+// host integration
|
||||
+// ===========================================================================
|
||||
+
|
||||
+bool ggml_cuda_w4a16_moe_grouped_should_engage(
|
||||
+ const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * dst, int cc) {
|
||||
+ if (src0->type != GGML_TYPE_NVFP4) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ if (!blackwell_mma_available(cc)) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ if (!ggml_cuda_w4a16_prefill_enabled()) {
|
||||
+ return false; // default-off == stock
|
||||
+ }
|
||||
+ if (src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ // ne12 = total tokens (aggregate prefill M); only LARGE M (prefill), never decode.
|
||||
+ if (src1->ne[2] <= ggml_cuda_w4a16_prefill_m()) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ const int64_t K = src0->ne[0];
|
||||
+ const int64_t N = src0->ne[1];
|
||||
+ if (N % 128 != 0 || K % 64 != 0) {
|
||||
+ return false; // tile constraints; else fall back to per-expert/MMQ
|
||||
+ }
|
||||
+ return true;
|
||||
+}
|
||||
+
|
||||
+void ggml_cuda_mul_mat_id_w4a16_grouped(
|
||||
+ ggml_backend_cuda_context & ctx,
|
||||
+ const ggml_tensor * src0,
|
||||
+ const float * src1_sorted,
|
||||
+ float * dst_sorted,
|
||||
+ const int * tokens_per_expert,
|
||||
+ int64_t n_experts, int64_t K, int64_t N,
|
||||
+ cudaStream_t stream) {
|
||||
+ GGML_ASSERT(src0->type == GGML_TYPE_NVFP4);
|
||||
+ GGML_ASSERT(N % 128 == 0 && K % 64 == 0);
|
||||
+
|
||||
+ constexpr int BM = 64, BN = 128, WARPS_M = 2, WARPS_N = 4, STAGES = 2;
|
||||
+
|
||||
+ // host: build the per-M-tile expert map (ragged, no tile crosses an expert boundary)
|
||||
+ int64_t total_rows = 0;
|
||||
+ for (int64_t e = 0; e < n_experts; e++) {
|
||||
+ total_rows += tokens_per_expert[e];
|
||||
+ }
|
||||
+ if (total_rows == 0) {
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ std::vector<int32_t> h_tile_expert, h_tile_row0, h_tile_rows;
|
||||
+ int64_t row = 0;
|
||||
+ for (int64_t e = 0; e < n_experts; e++) {
|
||||
+ const int t = tokens_per_expert[e];
|
||||
+ for (int off = 0; off < t; off += BM) {
|
||||
+ h_tile_expert.push_back((int32_t) e);
|
||||
+ h_tile_row0.push_back((int32_t) (row + off));
|
||||
+ h_tile_rows.push_back((int32_t) std::min(BM, t - off));
|
||||
+ }
|
||||
+ row += t;
|
||||
+ }
|
||||
+ const int n_tiles = (int) h_tile_expert.size();
|
||||
+
|
||||
+ if (getenv("LLAMA_W4A16_DEBUG")) {
|
||||
+ int max_tpe = 0, multi = 0;
|
||||
+ for (int64_t e = 0; e < n_experts; e++) {
|
||||
+ if (tokens_per_expert[e] > max_tpe) max_tpe = tokens_per_expert[e];
|
||||
+ if (tokens_per_expert[e] > BM) multi++;
|
||||
+ }
|
||||
+ fprintf(stderr, "[w4a16] engaged: total_rows=%lld n_experts=%lld K=%lld N=%lld n_tiles=%d max_tpe=%d multi_tile_experts=%d\n",
|
||||
+ (long long) total_rows, (long long) n_experts, (long long) K, (long long) N, n_tiles, max_tpe, multi);
|
||||
+ }
|
||||
+
|
||||
+ // device: tile map
|
||||
+ ggml_cuda_pool_alloc<int32_t> d_tile_expert(ctx.pool(), n_tiles);
|
||||
+ ggml_cuda_pool_alloc<int32_t> d_tile_row0 (ctx.pool(), n_tiles);
|
||||
+ ggml_cuda_pool_alloc<int32_t> d_tile_rows (ctx.pool(), n_tiles);
|
||||
+ CUDA_CHECK(cudaMemcpyAsync(d_tile_expert.ptr, h_tile_expert.data(), n_tiles*sizeof(int32_t), cudaMemcpyHostToDevice, stream));
|
||||
+ CUDA_CHECK(cudaMemcpyAsync(d_tile_row0.ptr, h_tile_row0.data(), n_tiles*sizeof(int32_t), cudaMemcpyHostToDevice, stream));
|
||||
+ CUDA_CHECK(cudaMemcpyAsync(d_tile_rows.ptr, h_tile_rows.data(), n_tiles*sizeof(int32_t), cudaMemcpyHostToDevice, stream));
|
||||
+
|
||||
+ // activations: f32 -> bf16 (cheap cast, NO act-quant), zero-padded so every tile's BM-row read
|
||||
+ // stays in-bounds. A tile's row0 is generally NOT BM-aligned (experts start mid-buffer), and a
|
||||
+ // tile can begin as late as total_rows-1, so it can read up to total_rows-1+BM; add a full BM of
|
||||
+ // zero headroom on top of the BM-rounded length to cover that worst case.
|
||||
+ const int64_t pad_rows = (((total_rows + BM - 1) / BM) + 1) * BM;
|
||||
+ ggml_cuda_pool_alloc<nv_bfloat16> Abf(ctx.pool(), (size_t) pad_rows * K);
|
||||
+ {
|
||||
+ const int64_t n = total_rows * K;
|
||||
+ const int64_t npad = pad_rows * K;
|
||||
+ const int threads = 256;
|
||||
+ const int64_t grid = (npad + threads - 1) / threads;
|
||||
+ w4a16_cast_act_f32_bf16<<<grid, threads, 0, stream>>>(src1_sorted, Abf.get(), n, npad);
|
||||
+ CUDA_CHECK(cudaGetLastError());
|
||||
+ }
|
||||
+
|
||||
+ const int64_t expert_stride_blocks = (int64_t) (src0->nb[2] / sizeof(block_nvfp4));
|
||||
+
|
||||
+ auto kern = w4a16_grouped_kernel<BM, BN, WARPS_M, WARPS_N, STAGES>;
|
||||
+ constexpr int STAGE_U32 = BM*(64/2) + BN*8 + BN;
|
||||
+ const int smem_bytes = STAGES * STAGE_U32 * (int) sizeof(uint32_t);
|
||||
+ CUDA_CHECK(cudaFuncSetAttribute(kern, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_bytes));
|
||||
+
|
||||
+ dim3 grid((unsigned) (N / BN), (unsigned) n_tiles);
|
||||
+ dim3 block(32, WARPS_M*WARPS_N); // 2D: threadIdx.x = warp lane, threadIdx.y = warp
|
||||
+ kern<<<grid, block, smem_bytes, stream>>>(
|
||||
+ Abf.get(), (const block_nvfp4 *) src0->data, dst_sorted,
|
||||
+ d_tile_expert.ptr, d_tile_row0.ptr, d_tile_rows.ptr,
|
||||
+ (int) N, (int) K, expert_stride_blocks);
|
||||
+ CUDA_CHECK(cudaGetLastError());
|
||||
+}
|
||||
diff --git a/ggml/src/ggml-cuda/w4a16-gemm.cuh b/ggml/src/ggml-cuda/w4a16-gemm.cuh
|
||||
new file mode 100644
|
||||
index 0000000..2287d6f
|
||||
--- /dev/null
|
||||
+++ b/ggml/src/ggml-cuda/w4a16-gemm.cuh
|
||||
@@ -0,0 +1,55 @@
|
||||
+#pragma once
|
||||
+
|
||||
+#include "common.cuh"
|
||||
+
|
||||
+// [paged patch 0035] Marlin-style W4A16 GROUPED MoE prefill GEMM for Blackwell sm_121a (GB10).
|
||||
+//
|
||||
+// This is the profile-validated #2 prefill lever and a DISTINCT kernel from the two prefill
|
||||
+// rejects:
|
||||
+// - NOT patch 0033 (separate-pass dequant -> bf16 cuBLAS / nvjet): that pays a full per-step
|
||||
+// weight dequant + 4x bf16 weight traffic and lost to FP4-MMQ at large M.
|
||||
+// - NOT patch 0034 (native W4A4 FP4-MMA, mxf4nvf4 block-scale OMMA): that quantizes the
|
||||
+// activations to FP4 and so still pays the quantize_mmq_nvfp4 activation-quant tax.
|
||||
+//
|
||||
+// The winning shape vLLM uses on this silicon (Marlin W4A16): the FP4 expert weights are
|
||||
+// dequantized to bf16 IN REGISTERS right before the MMA (never materialized to global/smem as
|
||||
+// bf16), the activations stay bf16 (a cheap f32->bf16 cast, NO per-block FP4 amax/code-search
|
||||
+// quantize), and the product is a standard bf16 m16n8k16 mma.sync feeding f32 accumulators,
|
||||
+// cp.async multistage-pipelined over the K loop. So W4A16 pays ZERO activation-quant (the paged
|
||||
+// FP4-MMQ path's quantize_mmq_nvfp4 is +15 us/tok) and the GEMM runs as a bf16 tensor-core GEMM
|
||||
+// with the weight read at 4 bits.
|
||||
+//
|
||||
+// GROUPED: the kernel is launched ONCE over the whole mul_mat_id token-sorted activation buffer
|
||||
+// (src1_sorted is already sorted-by-expert by the existing host-loop), with a per-M-tile expert
|
||||
+// map so each output tile reads its expert's weight matrix (src0 + expert*nb02) and the ragged
|
||||
+// per-expert row tail is masked. No per-expert kernel launch, no per-expert M-padding of the buffers (ragged tail tiles still compute masked rows).
|
||||
+//
|
||||
+// Engages ONLY at large aggregate-M (prefill), behind LLAMA_W4A16_PREFILL_M (default 0 == OFF
|
||||
+// == stock); decode (small ne12) and the non-MoE / non-NVFP4 paths are byte-untouched. The bf16
|
||||
+// tiles are mma.cuh's (mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32).
|
||||
+
|
||||
+// True if the grouped W4A16 path should handle this mul_mat_id:
|
||||
+// src0 NVFP4, src1 f32, dst f32, Blackwell, LLAMA_W4A16_PREFILL_M>0,
|
||||
+// ne12 (total tokens / aggregate prefill M) > threshold, N=ne0 % 128 == 0, K=ne10 % 64 == 0.
|
||||
+bool ggml_cuda_w4a16_moe_grouped_should_engage(
|
||||
+ const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * dst, int cc);
|
||||
+
|
||||
+// True iff LLAMA_W4A16_PREFILL_M > 0 (same env/threshold as the mmq.cu grouped-MMQ-off gate, which parses it independently).
|
||||
+bool ggml_cuda_w4a16_prefill_enabled();
|
||||
+int64_t ggml_cuda_w4a16_prefill_m();
|
||||
+
|
||||
+// Grouped W4A16 MoE GEMM over the token-sorted buffer.
|
||||
+// src0 : NVFP4 weights [K, N, n_experts] (one [K,N] matrix per expert)
|
||||
+// src1_sorted : f32 [K, total_rows], rows already sorted by expert (the mul_mat_id host-loop's
|
||||
+// src1_sorted), with tokens_per_expert[e] consecutive rows per expert e
|
||||
+// dst_sorted : f32 [N, total_rows], written in the same sorted order
|
||||
+// tokens_per_expert : host vector, length n_experts
|
||||
+// Streams on `stream`, pool-allocates scratch (bf16 activations + device tile map); no explicit host sync (the three small pageable H2D tile-map copies may still synchronize implicitly).
|
||||
+void ggml_cuda_mul_mat_id_w4a16_grouped(
|
||||
+ ggml_backend_cuda_context & ctx,
|
||||
+ const ggml_tensor * src0,
|
||||
+ const float * src1_sorted,
|
||||
+ float * dst_sorted,
|
||||
+ const int * tokens_per_expert,
|
||||
+ int64_t n_experts, int64_t K, int64_t N,
|
||||
+ cudaStream_t stream);
|
||||
--
|
||||
2.43.0
|
||||
|
||||
@@ -0,0 +1,322 @@
|
||||
From b81fa71360c3f6b46e97c6ad504efc10bdaea484 Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Sun, 28 Jun 2026 20:00:04 +0200
|
||||
Subject: [PATCH 40/41] feat(paged): S1 paged decode-graph reuse across serving
|
||||
steps (patch 0040)
|
||||
|
||||
The continuous-serving decode gap (paged ~3.7 vs vLLM ~5.9 tok/s/seq) is
|
||||
host-bound: llama-context layer-A graph reuse was 0% in serving, so the host
|
||||
rebuilt the ggml graph EVERY decode step (the +1.85 ms/step the Phase-0 profile
|
||||
attributes to the rebuild; set_inputs/block-table are negligible). Root cause:
|
||||
the paged decode inputs (input_block_table / input_gather_idxs in paged-attn.cpp)
|
||||
never overrode llm_graph_input_i::can_reuse, which defaults to false - so any
|
||||
graph carrying a paged input could never be reused, even with a constant batch
|
||||
shape. (This is also why the paged decode graph was rebuilt every step in static batched-bench.)
|
||||
|
||||
S1 gives the paged inputs a correct can_reuse:
|
||||
- reuse iff the input tensor dims are unchanged. The block table is
|
||||
[n_view, n_stream] with n_view = PAD(n_gather, 256) clamped to n_kv, so it is
|
||||
bucketed to 256 and stays constant across a 256-token decode window; n_stream
|
||||
follows n_seqs_unq. The index CONTENTS are refilled at set_input on every step
|
||||
(incl. reused steps), so a reused graph reads the current step's cells.
|
||||
- the stored kv-cache context is refreshed from the owning attn input
|
||||
(llm_graph_input_attn_kv, whose mctx is updated per-decode by attn_kv /
|
||||
mem_hybrid can_reuse earlier in the input list), so a reused graph picks up the
|
||||
live memory context. mem_hybrid::can_reuse now also refreshes inp_attn->mctx.
|
||||
|
||||
Master switch paged_attn::decode_graph_reuse() (ON by default when paged;
|
||||
LLAMA_PAGED_NO_GRAPH_REUSE=1 forces the pre-S1 rebuild-every-step path for A/B).
|
||||
Also surfaces the run-wide graph-reuse rate in the [L5INSTR] exit line
|
||||
(l5_add_proc) since llama-server does not print llama_perf.
|
||||
|
||||
BIT-EXACT: greedy md5 byte-identical with reuse ON vs OFF on every path -
|
||||
dense 5951a5b4d624ce891e22ab5fca9bc439, paged-MoE 8cb0ce23777bf55f92f63d0292c756b0.
|
||||
Reuse only skips the host-side rebuild; set_inputs still re-runs every step.
|
||||
|
||||
Measured (GB10): batched-bench paged decode graph reuse 0% -> 95.5% (hostproc
|
||||
dense 3.31->2.66, MoE 2.44->1.82 ms/step); static throughput flat as expected
|
||||
(static regime is GPU-bound). The serving payoff needs S3 (patch 0041): S1 alone
|
||||
holds only 13.8% reuse in serving because co-batched prefill churns the shape
|
||||
every step.
|
||||
|
||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
||||
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
|
||||
---
|
||||
src/llama-context.cpp | 3 ++
|
||||
src/llama-graph.cpp | 13 +++++-
|
||||
src/paged-attn.cpp | 94 ++++++++++++++++++++++++++++++++++++++-----
|
||||
src/paged-attn.h | 14 +++++++
|
||||
4 files changed, 112 insertions(+), 12 deletions(-)
|
||||
|
||||
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
|
||||
index c408eef..306a506 100644
|
||||
--- a/src/llama-context.cpp
|
||||
+++ b/src/llama-context.cpp
|
||||
@@ -1347,6 +1347,7 @@ bool llama_context::set_adapter_cvec(
|
||||
|
||||
extern "C" void l5_add_setinp(double ns);
|
||||
extern "C" void l5_add_hostproc(double ns);
|
||||
+extern "C" void l5_add_proc(int reused); // [S1] per-step graph-reuse counter
|
||||
static inline double l5c_now_ns(){ struct timespec ts; clock_gettime(CLOCK_MONOTONIC,&ts); return (double)ts.tv_sec*1e9+(double)ts.tv_nsec; }
|
||||
llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) {
|
||||
double _l5_t0=l5c_now_ns();
|
||||
@@ -1374,7 +1375,9 @@ llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, ll
|
||||
}
|
||||
|
||||
n_reused++;
|
||||
+ l5_add_proc(1);
|
||||
} else {
|
||||
+ l5_add_proc(0);
|
||||
res->reset();
|
||||
|
||||
ggml_backend_sched_reset(sched.get());
|
||||
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
|
||||
index 931258d..0337742 100644
|
||||
--- a/src/llama-graph.cpp
|
||||
+++ b/src/llama-graph.cpp
|
||||
@@ -699,6 +699,12 @@ bool llm_graph_input_mem_hybrid::can_reuse(const llm_graph_params & params) {
|
||||
|
||||
this->mctx = mctx;
|
||||
|
||||
+ // [S1] refresh the attn sub-input's memory context so paged decode inputs
|
||||
+ // (which read owner->mctx in their can_reuse, run later in the input list)
|
||||
+ // pick up the live per-decode context on a reused graph. Harmless for the
|
||||
+ // non-paged path: inp_attn->mctx is only consumed at graph-build time there.
|
||||
+ inp_attn->mctx = mctx->get_attn();
|
||||
+
|
||||
bool res = true;
|
||||
|
||||
res &= inp_attn->self_k_idxs->ne[0] == params.ubatch.n_tokens;
|
||||
@@ -2370,8 +2376,11 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||
ggml_tensor * kq_mask_g = kq_mask;
|
||||
ggml_tensor * block_table = nullptr;
|
||||
const bool is_decode = (q_cur->ne[2] == k->ne[3]); // 1 query token per stream
|
||||
- if (!(is_decode && paged_attn::in_kernel_decode(ctx0, res, mctx_cur, &k, &v, &kq_mask_g, &block_table))) {
|
||||
- paged_attn::gather(ctx0, res, mctx_cur, &k, &v, &kq_mask_g);
|
||||
+ // [S1] pass `inp` (the attn input) as the reuse owner: its mctx is refreshed
|
||||
+ // per-decode by attn_kv/mem_hybrid can_reuse, and the paged inputs read it so
|
||||
+ // a reused graph picks up the live memory context.
|
||||
+ if (!(is_decode && paged_attn::in_kernel_decode(ctx0, res, mctx_cur, inp, &k, &v, &kq_mask_g, &block_table))) {
|
||||
+ paged_attn::gather(ctx0, res, mctx_cur, inp, &k, &v, &kq_mask_g);
|
||||
}
|
||||
|
||||
ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask_g, sinks, v_mla, kq_scale, il, block_table);
|
||||
diff --git a/src/paged-attn.cpp b/src/paged-attn.cpp
|
||||
index ebd92be..d543c7f 100644
|
||||
--- a/src/paged-attn.cpp
|
||||
+++ b/src/paged-attn.cpp
|
||||
@@ -11,9 +11,13 @@
|
||||
#include <ctime>
|
||||
namespace { static inline double l5_now_ns(){ struct timespec ts; clock_gettime(CLOCK_MONOTONIC,&ts); return (double)ts.tv_sec*1e9+(double)ts.tv_nsec; } }
|
||||
double g_l5_t_gbt=0, g_l5_t_setinp=0, g_l5_t_hostproc=0; long g_l5_n_gbt=0, g_l5_n_setinp=0, g_l5_n_hostproc=0;
|
||||
+// [S1] graph-reuse counters across the whole run (the serving reuse-rate signal -
|
||||
+// llama-server does not print llama_perf, so surface it here at process exit).
|
||||
+long g_l5_n_proc=0, g_l5_n_reused=0;
|
||||
extern "C" void l5_add_setinp(double ns){ g_l5_t_setinp+=ns; g_l5_n_setinp++; }
|
||||
extern "C" void l5_add_hostproc(double ns){ g_l5_t_hostproc+=ns; g_l5_n_hostproc++; }
|
||||
-namespace { struct L5Printer { ~L5Printer(){ fprintf(stderr,"[L5INSTR] get_block_table n=%ld sum=%.2fms mean=%.4fms | set_inputs n=%ld sum=%.2fms mean=%.4fms | hostproc n=%ld sum=%.2fms mean=%.4fms\n", g_l5_n_gbt, g_l5_t_gbt/1e6, g_l5_n_gbt? g_l5_t_gbt/1e6/g_l5_n_gbt:0.0, g_l5_n_setinp, g_l5_t_setinp/1e6, g_l5_n_setinp? g_l5_t_setinp/1e6/g_l5_n_setinp:0.0, g_l5_n_hostproc, g_l5_t_hostproc/1e6, g_l5_n_hostproc? g_l5_t_hostproc/1e6/g_l5_n_hostproc:0.0 ); } } g_l5_printer; }
|
||||
+extern "C" void l5_add_proc(int reused){ g_l5_n_proc++; if (reused) g_l5_n_reused++; }
|
||||
+namespace { struct L5Printer { ~L5Printer(){ fprintf(stderr,"[L5INSTR] get_block_table n=%ld sum=%.2fms mean=%.4fms | set_inputs n=%ld sum=%.2fms mean=%.4fms | hostproc n=%ld sum=%.2fms mean=%.4fms | graph_reuse %ld/%ld = %.1f%%\n", g_l5_n_gbt, g_l5_t_gbt/1e6, g_l5_n_gbt? g_l5_t_gbt/1e6/g_l5_n_gbt:0.0, g_l5_n_setinp, g_l5_t_setinp/1e6, g_l5_n_setinp? g_l5_t_setinp/1e6/g_l5_n_setinp:0.0, g_l5_n_hostproc, g_l5_t_hostproc/1e6, g_l5_n_hostproc? g_l5_t_hostproc/1e6/g_l5_n_hostproc:0.0, g_l5_n_reused, g_l5_n_proc, g_l5_n_proc? 100.0*g_l5_n_reused/g_l5_n_proc:0.0 ); } } g_l5_printer; }
|
||||
|
||||
|
||||
namespace paged_attn {
|
||||
@@ -28,17 +32,52 @@ static bool debug() {
|
||||
return d;
|
||||
}
|
||||
|
||||
+// [S1] paged decode-graph reuse master switch. ON by default whenever paging is
|
||||
+// active; setting LLAMA_PAGED_NO_GRAPH_REUSE to any value (even 0 or empty) forces it off (A/B probe / safety hatch).
|
||||
+bool decode_graph_reuse() {
|
||||
+ static const bool on = active() && (std::getenv("LLAMA_PAGED_NO_GRAPH_REUSE") == nullptr);
|
||||
+ return on;
|
||||
+}
|
||||
+
|
||||
namespace {
|
||||
|
||||
+// [S1] Recompute the block-table view length the SAME way in_kernel_decode()
|
||||
+// builds it, so can_reuse() can compare against the stored tensor dim. n_view is
|
||||
+// PAD(n_gather,256) clamped to the physical window n_kv: it only changes when
|
||||
+// n_gather crosses a 256 boundary, so a steady decode reuses across many steps.
|
||||
+static inline int64_t paged_block_table_n_view(const llama_kv_cache_context * mctx) {
|
||||
+ const int64_t n_gather = (int64_t) mctx->get_n_gather();
|
||||
+ if (n_gather <= 0) {
|
||||
+ return 0;
|
||||
+ }
|
||||
+ int64_t n_view = GGML_PAD(n_gather, 256);
|
||||
+ const int64_t n_kv = (int64_t) mctx->get_n_kv();
|
||||
+ if (n_view > n_kv) {
|
||||
+ n_view = n_kv;
|
||||
+ }
|
||||
+ return n_view;
|
||||
+}
|
||||
+
|
||||
+// [S1] Number of attention streams the paged inputs build over - matches K->ne[3]
|
||||
+// at build time and the n_stream used by can_reuse_kq_mask in llama-graph.cpp.
|
||||
+static inline int64_t paged_n_stream(const llm_graph_params & params) {
|
||||
+ return params.cparams.kv_unified ? 1 : (int64_t) params.ubatch.n_seqs_unq;
|
||||
+}
|
||||
+
|
||||
// Graph input that, at set_input time, fills an I32 [n_gather, n_stream] tensor
|
||||
// with each stream's non-empty cell indices (position-sorted, padded with a
|
||||
-// masked/empty cell) by delegating to the kv-cache context. Private to this
|
||||
-// unit; default can_reuse()==false keeps the graph from being reused across
|
||||
-// decodes (n_gather grows every step).
|
||||
+// masked/empty cell) by delegating to the kv-cache context. Private to this unit.
|
||||
+//
|
||||
+// [S1] can_reuse: the graph topology depends only on the tensor SHAPE
|
||||
+// [n_gather, n_stream] - the index CONTENTS are refilled at set_input every step,
|
||||
+// so they need not match. n_gather is UNPADDED here (the gather path is used for
|
||||
+// prefill / transposed-V fallback), so it grows every decode and reuse rarely
|
||||
+// holds - correct and harmless. mctx is refreshed from the owning attn input
|
||||
+// (whose mctx is updated by attn_kv/mem_hybrid can_reuse earlier in the input list).
|
||||
class input_gather_idxs : public llm_graph_input_i {
|
||||
public:
|
||||
- input_gather_idxs(const llama_kv_cache_context * mctx, ggml_tensor * idxs)
|
||||
- : mctx(mctx), idxs(idxs) {}
|
||||
+ input_gather_idxs(const llama_kv_cache_context * mctx, const llm_graph_input_attn_kv * owner, ggml_tensor * idxs)
|
||||
+ : mctx(mctx), owner(owner), idxs(idxs) {}
|
||||
|
||||
void set_input(const llama_ubatch * ubatch) override {
|
||||
GGML_UNUSED(ubatch);
|
||||
@@ -46,17 +85,37 @@ public:
|
||||
mctx->get_gather_idxs((int32_t *) idxs->data);
|
||||
}
|
||||
|
||||
+ bool can_reuse(const llm_graph_params & params) override {
|
||||
+ if (!owner || !paged_attn::decode_graph_reuse()) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ mctx = owner->mctx; // refresh to the live per-decode context
|
||||
+ const int64_t n_gather = (int64_t) mctx->get_n_gather();
|
||||
+ if (n_gather <= 0) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ return idxs->ne[0] == n_gather && idxs->ne[1] == paged_n_stream(params);
|
||||
+ }
|
||||
+
|
||||
const llama_kv_cache_context * mctx;
|
||||
+ const llm_graph_input_attn_kv * owner;
|
||||
ggml_tensor * idxs;
|
||||
};
|
||||
|
||||
// Block table filler for the in-kernel paged read: fills an I32 [n_blk, n_stream]
|
||||
// tensor with each stream's position-ordered cells, padded to n_blk (per column)
|
||||
// with a masked empty cell, by delegating to the kv-cache context.
|
||||
+//
|
||||
+// [S1] can_reuse: reuse iff the block-table tensor dims [n_view, n_stream] are
|
||||
+// unchanged - n_view is bucketed to 256 (paged_block_table_n_view), so the decode
|
||||
+// graph reuses across every step within a 256-token window. The table CONTENTS
|
||||
+// are refilled at set_input on every step (incl. reused steps), so the reused
|
||||
+// graph reads the current step's cells. mctx is refreshed from the owning attn
|
||||
+// input so the reused graph's set_input/get_block_table uses the live context.
|
||||
class input_block_table : public llm_graph_input_i {
|
||||
public:
|
||||
- input_block_table(const llama_kv_cache_context * mctx, ggml_tensor * idxs, uint32_t n_blk)
|
||||
- : mctx(mctx), idxs(idxs), n_blk(n_blk) {}
|
||||
+ input_block_table(const llama_kv_cache_context * mctx, const llm_graph_input_attn_kv * owner, ggml_tensor * idxs, uint32_t n_blk)
|
||||
+ : mctx(mctx), owner(owner), idxs(idxs), n_blk(n_blk) {}
|
||||
|
||||
void set_input(const llama_ubatch * ubatch) override {
|
||||
GGML_UNUSED(ubatch);
|
||||
@@ -66,7 +125,20 @@ public:
|
||||
g_l5_t_gbt += l5_now_ns()-_t; g_l5_n_gbt++;
|
||||
}
|
||||
|
||||
+ bool can_reuse(const llm_graph_params & params) override {
|
||||
+ if (!owner || !paged_attn::decode_graph_reuse()) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ mctx = owner->mctx; // refresh to the live per-decode context
|
||||
+ const int64_t n_view = paged_block_table_n_view(mctx);
|
||||
+ if (n_view <= 0 || n_view != (int64_t) n_blk) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ return idxs->ne[0] == n_view && idxs->ne[1] == paged_n_stream(params);
|
||||
+ }
|
||||
+
|
||||
const llama_kv_cache_context * mctx;
|
||||
+ const llm_graph_input_attn_kv * owner;
|
||||
ggml_tensor * idxs;
|
||||
uint32_t n_blk;
|
||||
};
|
||||
@@ -76,6 +148,7 @@ public:
|
||||
void gather(ggml_context * ctx0,
|
||||
llm_graph_result * res,
|
||||
const llama_kv_cache_context * mctx,
|
||||
+ const llm_graph_input_attn_kv * owner,
|
||||
ggml_tensor ** k,
|
||||
ggml_tensor ** v,
|
||||
ggml_tensor ** kq_mask) {
|
||||
@@ -114,7 +187,7 @@ void gather(ggml_context * ctx0,
|
||||
// n_stream, so column s gathers from stream s of the source.
|
||||
ggml_tensor * idx = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_gather, n_stream);
|
||||
ggml_set_input(idx);
|
||||
- res->add_input(llm_graph_input_ptr(new input_gather_idxs(mctx, idx)));
|
||||
+ res->add_input(llm_graph_input_ptr(new input_gather_idxs(mctx, owner, idx)));
|
||||
|
||||
// --- gather K: collapse (head_dim, n_head) so cells become the row axis ---
|
||||
{
|
||||
@@ -156,6 +229,7 @@ void gather(ggml_context * ctx0,
|
||||
bool in_kernel_decode(ggml_context * ctx0,
|
||||
llm_graph_result * res,
|
||||
const llama_kv_cache_context * mctx,
|
||||
+ const llm_graph_input_attn_kv * owner,
|
||||
ggml_tensor ** k,
|
||||
ggml_tensor ** v,
|
||||
ggml_tensor ** kq_mask,
|
||||
@@ -221,7 +295,7 @@ bool in_kernel_decode(ggml_context * ctx0,
|
||||
|
||||
ggml_tensor * idx = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_view, n_stream);
|
||||
ggml_set_input(idx);
|
||||
- res->add_input(llm_graph_input_ptr(new input_block_table(mctx, idx, (uint32_t) n_view)));
|
||||
+ res->add_input(llm_graph_input_ptr(new input_block_table(mctx, owner, idx, (uint32_t) n_view)));
|
||||
|
||||
// Present K and V as [d, h, n_view, ns] VIEWS of the full physical window:
|
||||
// identical per-cell (nb1,nb2) and per-stream (nb3) strides, only the cell
|
||||
diff --git a/src/paged-attn.h b/src/paged-attn.h
|
||||
index 23e2184..fafe821 100644
|
||||
--- a/src/paged-attn.h
|
||||
+++ b/src/paged-attn.h
|
||||
@@ -21,18 +21,31 @@ struct ggml_context;
|
||||
struct ggml_tensor;
|
||||
class llm_graph_result;
|
||||
class llama_kv_cache_context;
|
||||
+class llm_graph_input_attn_kv;
|
||||
|
||||
namespace paged_attn {
|
||||
|
||||
// true iff env LLAMA_KV_PAGED is set (evaluated once).
|
||||
bool active();
|
||||
|
||||
+// [S1] true iff the paged decode-graph reuse (layer-A can_reuse on the paged
|
||||
+// inputs) is ENABLED. Default ON when active(); setting LLAMA_PAGED_NO_GRAPH_REUSE (any value, even 0)
|
||||
+// forces it off (A/B probe / safety hatch). When off the paged inputs keep the
|
||||
+// stock default can_reuse()==false, i.e. the pre-S1 behaviour (rebuild every
|
||||
+// step). Bit-exact either way - reuse only skips the host-side graph rebuild,
|
||||
+// set_inputs still re-runs every step.
|
||||
+bool decode_graph_reuse();
|
||||
+
|
||||
// Gather K, V and the kq_mask down to the current sequence's non-empty cells.
|
||||
// No-op (returns immediately) unless active(). On return *k, *v and *kq_mask
|
||||
// point at the compacted tensors; pass them straight to build_attn_mha.
|
||||
+// `owner` is the attention input that owns the live (per-decode-refreshed) memory
|
||||
+// context; the paged input reads owner->mctx in can_reuse so a reused graph picks
|
||||
+// up the fresh context (see input_gather_idxs::can_reuse). May be null (no reuse).
|
||||
void gather(ggml_context * ctx0,
|
||||
llm_graph_result * res,
|
||||
const llama_kv_cache_context * mctx,
|
||||
+ const llm_graph_input_attn_kv * owner,
|
||||
ggml_tensor ** k,
|
||||
ggml_tensor ** v,
|
||||
ggml_tensor ** kq_mask);
|
||||
@@ -50,6 +63,7 @@ void gather(ggml_context * ctx0,
|
||||
bool in_kernel_decode(ggml_context * ctx0,
|
||||
llm_graph_result * res,
|
||||
const llama_kv_cache_context * mctx,
|
||||
+ const llm_graph_input_attn_kv * owner,
|
||||
ggml_tensor ** k,
|
||||
ggml_tensor ** v,
|
||||
ggml_tensor ** kq_mask,
|
||||
--
|
||||
2.43.0
|
||||
|
||||
@@ -0,0 +1,114 @@
|
||||
From ddff2279f23f18cadfbbb907a397d66b3609e9cd Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Sun, 28 Jun 2026 20:00:24 +0200
|
||||
Subject: [PATCH 41/41] feat(paged): S3 decode-shape-stable scheduling (patch
|
||||
0041)
|
||||
|
||||
The S1 paged decode-graph reuse (patch 0040) is necessary but not sufficient in
|
||||
continuous serving: with cont_batching a co-batched prefill chunk inflates the
|
||||
step from n_tokens==D (pure decode) to D+P, which changes the ubatch shape and
|
||||
breaks llama-context layer-A reuse on (nearly) every step. Measured: S1 alone
|
||||
holds only 13.8% graph reuse in a 128-client serving load.
|
||||
|
||||
S3 makes the scheduler EMIT graph-reusable steps to match what S1 makes reusable.
|
||||
While there is live decode load it runs PURE-decode steps (skip Phase-2 prompt
|
||||
admission) so the decode batch shape stays constant, and admits a prefill chunk
|
||||
only on a bounded cadence (every LLAMA_PAGED_PREFILL_PERIOD steps, default 8) or
|
||||
when no decode is active. The deferred prefill chunk still runs within at most
|
||||
(period-1) decode steps, so prompt latency rises by a bounded amount.
|
||||
|
||||
Pure policy change inside update_slots(), built on the patch-0016 decode-first
|
||||
budget; no new slot states, no batch-formation rewrite, zero libllama changes.
|
||||
|
||||
BIT-EXACT: only changes WHICH step a prompt chunk is admitted in. Each sequence's
|
||||
decode logits depend on its own tokens + its own KV only (the paged decode read is
|
||||
per-stream, attention is permutation-invariant over the co-batched set), so
|
||||
deferring another slot's prefill never changes a generating slot's output. Does
|
||||
not run in the single-sequence greedy md5 gate (that path is llama-completion).
|
||||
|
||||
DEFAULT-OFF (A/B finding): a measured end-to-end A/B proved that making S3
|
||||
default-on under paged KV is a serving mistake. Deferring prefill admission on the
|
||||
period-8 cadence starves incoming prompts: 2.5x worse TTFT (60s vs 24s at N=256)
|
||||
and 20-29% lower end-to-end throughput, with no end-to-end win at any concurrency.
|
||||
Its apparent decode_agg gain was a metric artifact (faster per-step decode bought
|
||||
by starving prefill). So S3 now defaults OFF (prefer prompt prefill admission for
|
||||
good TTFT) and is opt-in via LLAMA_PAGED_DECODE_STABLE=1, intended only for
|
||||
decode-dominated, low-arrival traffic where TTFT is not a concern. With
|
||||
LLAMA_PAGED_DECODE_STABLE unset => byte-identical to patch 0016.
|
||||
|
||||
Measured (GB10, MoE Qwen3.6-35B-A3B-NVFP4, 128-client staggered streaming load):
|
||||
S1+S3 vs baseline (graphs rebuilt every step): graph reuse 0% -> 72.2%, hostproc
|
||||
15.98 -> 6.31 ms/step, decode 4.05 -> 5.52 tok/s/seq median (4.24 -> 5.96 mean,
|
||||
at vLLM's ~5.9 sustained). NOTE these are per-step decode metrics; the A/B above
|
||||
shows they do not translate to an end-to-end serving win, hence default-off.
|
||||
|
||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
||||
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
|
||||
---
|
||||
tools/server/server-context.cpp | 46 ++++++++++++++++++++++++++++++++-
|
||||
1 file changed, 45 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
|
||||
index 64775dc..a77e267 100644
|
||||
--- a/tools/server/server-context.cpp
|
||||
+++ b/tools/server/server-context.cpp
|
||||
@@ -3138,11 +3138,55 @@ private:
|
||||
}
|
||||
int32_t n_prompt_budgeted = 0; // prompt tokens added to the batch this step (across slots)
|
||||
|
||||
+ // PAGED serving lever (patch 0041, S3): decode-shape-stable scheduling.
|
||||
+ // Pairs with the S1 paged decode-graph reuse (patch 0040): S1 makes a
|
||||
+ // pure-decode step graph-reusable, S3 makes the scheduler EMIT pure-decode
|
||||
+ // steps. With continuous batching a co-batched prefill chunk inflates the
|
||||
+ // step from n_tokens==D (pure decode) to D+P, which changes the ubatch
|
||||
+ // shape and breaks layer-A graph reuse on (nearly) EVERY step. S3 keeps prefill out
|
||||
+ // of the decode step: while there is live decode load it runs pure-decode
|
||||
+ // steps (reuse holds) and admits a prefill chunk only on a bounded cadence
|
||||
+ // (every LLAMA_PAGED_PREFILL_PERIOD steps, default 8) or when no decode is
|
||||
+ // active. The deferred prefill chunk still runs within a few steps, so
|
||||
+ // prompt latency rises by at most (period-1) decode steps.
|
||||
+ //
|
||||
+ // BIT-EXACT: this only changes WHICH step a prompt chunk is admitted in.
|
||||
+ // Each sequence's decode logits depend on its own tokens + its own KV only
|
||||
+ // (the paged decode read is per-stream, attention is permutation-invariant
|
||||
+ // over the co-batched set), so deferring another slot's prefill never
|
||||
+ // changes a generating slot's output. Does not run in the single-sequence
|
||||
+ // greedy md5 gate (that path is llama-completion, not update_slots).
|
||||
+ //
|
||||
+ // DEFAULT-OFF (A/B finding): an end-to-end A/B proved S3-on is a serving
|
||||
+ // mistake. Deferring prefill admission on the period-8 cadence delays prompt
|
||||
+ // admission: 2.5x worse TTFT (60s vs 24s at N=256) and 20-29% lower end-to-end
|
||||
+ // throughput, with no end-to-end win at any concurrency. Its apparent
|
||||
+ // decode_agg gain was a metric artifact (faster per-step decode bought by
|
||||
+ // starving prefill). So the default prefers prompt prefill admission for good
|
||||
+ // TTFT; S3 is opt-in (LLAMA_PAGED_DECODE_STABLE=1) only for decode-dominated,
|
||||
+ // low-arrival traffic where TTFT is not a concern.
|
||||
+ bool decode_only_step = false;
|
||||
+ {
|
||||
+ static const int s3_enabled = [](){
|
||||
+ const char * e = getenv("LLAMA_PAGED_DECODE_STABLE");
|
||||
+ return e ? atoi(e) : 0; // default OFF; opt-in via LLAMA_PAGED_DECODE_STABLE=1
|
||||
+ }();
|
||||
+ if (s3_enabled && n_decode_in_batch > 0) {
|
||||
+ static const int s3_period = [](){ const char * e = getenv("LLAMA_PAGED_PREFILL_PERIOD"); int p = e ? atoi(e) : 8; return p > 0 ? p : 8; }();
|
||||
+ static long s3_step = 0;
|
||||
+ const bool prefill_due = (s3_step % s3_period) == 0;
|
||||
+ s3_step++;
|
||||
+ decode_only_step = !prefill_due;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
auto & alora_scale = batch.alora_scale;
|
||||
auto & alora_disabled_id = batch.alora_disabled_id;
|
||||
|
||||
// next, batch any pending prompts without exceeding n_batch
|
||||
- if (params_base.cont_batching || batch.size() == 0) {
|
||||
+ // (patch 0041, S3) skip prompt admission on a pure-decode step to keep the
|
||||
+ // decode batch shape reuse-stable
|
||||
+ if ((params_base.cont_batching || batch.size() == 0) && !decode_only_step) {
|
||||
bool add_ok = true; // false means the batch is full, skip remaining slots
|
||||
|
||||
iterate(slots, [&](server_slot & slot) {
|
||||
--
|
||||
2.43.0
|
||||
|
||||
@@ -0,0 +1,365 @@
|
||||
From 1434cf7e078217c625062dcfde4fa91cf487ee86 Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Sun, 28 Jun 2026 20:19:31 +0200
|
||||
Subject: [PATCH] feat(paged): fused residual-add + RMS norm + weight multiply
|
||||
(patch 0042)
|
||||
|
||||
The transformer pre-norm residual chain `h = x + sub_out; n = rms_norm(h) * w`
|
||||
runs as separate CUDA launches in the paged prefill graph: a k_bin_bcast ADD
|
||||
(the residual) feeding the existing fused rms_norm+mul. ggml-cuda already fuses
|
||||
rms_norm+mul (and rms_norm+mul+ADD, where the ADD is a *post*-norm bias) but NOT
|
||||
the *pre*-norm residual add that feeds the norm. This is the classic add-RMSNorm
|
||||
fusion (as in vLLM / TensorRT-LLM) that ggml-cuda lacks; it is part of the
|
||||
unfused-tail prefill gap vs vLLM's torch.compile fusions.
|
||||
|
||||
Add it as a CUDA-family graph fusion (paged series owns it; stock stays pure):
|
||||
- ggml_cuda_can_fuse recognizes { ADD, RMS_NORM, MUL } via ggml_can_fuse_subgraph
|
||||
with BOTH the ADD (node_idx) and the MUL (node_idx+2) marked as outputs - the
|
||||
residual ADD has a second consumer (the later skip-connection add), so it
|
||||
cannot pass the single-use ggml_can_fuse() gate the other rms_norm fusions use.
|
||||
- New kernel rms_norm_pre_add_mul_f32 computes h = a + b, publishes h to the
|
||||
residual buffer (downstream skip add reads it), then sum(h^2) -> scale ->
|
||||
dst = scale * h * w in ONE launch, emitting BOTH outputs the graph needs.
|
||||
- Gated by LLAMA_FUSE_ADD_RMSNORM (default ON) for a clean single-build A/B.
|
||||
|
||||
BIT-EXACT (per-path canonical greedy md5, n=48 --temp 0 --seed 1, paged):
|
||||
dense q36-27b-nvfp4 : 5951a5b4d624ce891e22ab5fca9bc439 (ON == OFF == canonical)
|
||||
MoE q36-35b-a3b : 8cb0ce23777bf55f92f63d0292c756b0 (ON == OFF == canonical)
|
||||
The fused kernel reproduces the exact FP order of the unfused chain: h = a + b
|
||||
(IEEE add is order-free), the sum(h^2) reduction uses the same block_reduce<SUM>
|
||||
with the same 256/1024 block-size thresholds, and the same rsqrtf(mean+eps)
|
||||
scale, so the byte stream is unchanged. test-backend-ops RMS_NORM/ADD/MUL pass
|
||||
(CUDA0 vs CPU).
|
||||
|
||||
PROFILE (dense prefill, nsys --cuda-graph-trace=node, npp512 ntg4 npl8):
|
||||
rms_norm_f32<1024> 903 launches / 96.6M ns -> 7 / 0.7M ns
|
||||
k_bin_bcast<op_add> 1232 launches / 138.6M ns -> 336 / 1.0M ns
|
||||
rms_norm_pre_add_mul (new) 896 launches / 187.2M ns
|
||||
-> 896 residual-add + 896 rms_norm launches folded into 896 fused launches;
|
||||
the norm+residual slice 233.6M -> 187.2M ns (~20% of that slice, ~1% of
|
||||
total prefill GPU time).
|
||||
S_PP dense (npp512 ntg4 npl32, 3x): 985.5 -> 990.6 t/s (+0.5%, every ON run
|
||||
beats every OFF run). Modest because the residual tail is a small slice of
|
||||
prefill; the dominant unfused cost is k_bin_bcast<op_mul> (11%, the GDN
|
||||
chunked-prefill gating muls) - a separate lever.
|
||||
|
||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
||||
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
|
||||
---
|
||||
ggml/src/ggml-cuda/ggml-cuda.cu | 54 +++++++++
|
||||
ggml/src/ggml-cuda/norm.cu | 196 ++++++++++++++++++++++++++++++++
|
||||
ggml/src/ggml-cuda/norm.cuh | 5 +
|
||||
3 files changed, 255 insertions(+)
|
||||
|
||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
index 0dad6e1..2ecc971 100644
|
||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
@@ -3698,6 +3698,48 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph,
|
||||
}
|
||||
}
|
||||
|
||||
+ // Fused residual-add + RMS norm + weight multiply. The transformer residual
|
||||
+ // ADD feeds the next sublayer's RMS norm but is ALSO consumed by the later
|
||||
+ // residual add (skip connection), so the ADD node is a graph output too; it
|
||||
+ // cannot go through the single-use ggml_can_fuse() gate below. Recognize it
|
||||
+ // here with ggml_can_fuse_subgraph, marking both the ADD (node_idx) and the
|
||||
+ // final MUL (node_idx + 2) as outputs.
|
||||
+ std::initializer_list<enum ggml_op> add_rms_norm_mul_ops = { GGML_OP_ADD, GGML_OP_RMS_NORM, GGML_OP_MUL };
|
||||
+ if (is_equal(add_rms_norm_mul_ops, ops) &&
|
||||
+ ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx, node_idx + 2 })) {
|
||||
+ const ggml_tensor * add = cgraph->nodes[node_idx];
|
||||
+ const ggml_tensor * rms_norm = cgraph->nodes[node_idx + 1];
|
||||
+ const ggml_tensor * mul = cgraph->nodes[node_idx + 2];
|
||||
+
|
||||
+ // RMS norm must consume the residual-add output.
|
||||
+ if (rms_norm->src[0] != add) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ // All operands F32 (rms norm / fused mul kernel only support F32).
|
||||
+ if (add->src[0]->type != GGML_TYPE_F32 || add->src[1]->type != GGML_TYPE_F32 ||
|
||||
+ add->type != GGML_TYPE_F32 || rms_norm->type != GGML_TYPE_F32 ||
|
||||
+ mul->src[0]->type != GGML_TYPE_F32 || mul->src[1]->type != GGML_TYPE_F32 ||
|
||||
+ mul->type != GGML_TYPE_F32) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ // The fused kernel computes h = a + b elementwise: same shape, no broadcast.
|
||||
+ if (!ggml_are_same_shape(add->src[0], add->src[1])) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ // The fused kernel needs fully contiguous residual operands and contiguous rows for both MUL operands.
|
||||
+ if (!ggml_is_contiguous(add->src[0]) || !ggml_is_contiguous(add->src[1])) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ if (!ggml_is_contiguous_rows(mul->src[0]) || !ggml_is_contiguous_rows(mul->src[1])) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ // If rms_norm is the B operand of the mul, broadcast of the A operand is unsupported.
|
||||
+ if (rms_norm == mul->src[1] && !ggml_are_same_shape(mul->src[0], rms_norm)) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ return true;
|
||||
+ }
|
||||
+
|
||||
if (!ggml_can_fuse(cgraph, node_idx, ops)) {
|
||||
return false;
|
||||
}
|
||||
@@ -4220,6 +4262,18 @@ static int ggml_cuda_try_fuse(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph
|
||||
return fused_node_count - 1;
|
||||
}
|
||||
|
||||
+ // Fused residual-add + RMS norm + weight multiply (bit-exact). Default ON;
|
||||
+ // set LLAMA_FUSE_ADD_RMSNORM=0 for a clean A/B against the unfused path.
|
||||
+ static const bool fuse_add_rmsnorm = [] {
|
||||
+ const char * e = getenv("LLAMA_FUSE_ADD_RMSNORM");
|
||||
+ return e == nullptr || atoi(e) != 0;
|
||||
+ }();
|
||||
+ if (fuse_add_rmsnorm &&
|
||||
+ ggml_cuda_can_fuse(cgraph, i, { GGML_OP_ADD, GGML_OP_RMS_NORM, GGML_OP_MUL }, {})) {
|
||||
+ ggml_cuda_op_rms_norm_pre_add_mul(*cuda_ctx, node, cgraph->nodes[i + 1], cgraph->nodes[i + 2]);
|
||||
+ return 2;
|
||||
+ }
|
||||
+
|
||||
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_ADD }, {})) {
|
||||
ggml_cuda_op_rms_norm_fused_add(*cuda_ctx, node, cgraph->nodes[i + 1], cgraph->nodes[i + 2]);
|
||||
return 2;
|
||||
diff --git a/ggml/src/ggml-cuda/norm.cu b/ggml/src/ggml-cuda/norm.cu
|
||||
index 09d9f3a..a07d022 100644
|
||||
--- a/ggml/src/ggml-cuda/norm.cu
|
||||
+++ b/ggml/src/ggml-cuda/norm.cu
|
||||
@@ -154,6 +154,87 @@ static __global__ void rms_norm_f32(const float * x,
|
||||
}
|
||||
}
|
||||
|
||||
+// Fused residual-add + RMS norm + (optional) weight multiply.
|
||||
+// h = a + b (the residual stream, written to h_out)
|
||||
+// dst = rsqrt(mean(h^2)+eps) * h * mul
|
||||
+// `a` and `b` are required to be the same shape and contiguous (the transformer
|
||||
+// residual add), so they share `x`'s strides; `h_out`, `dst` are also contiguous
|
||||
+// with that shape. `mul` (the RMS weight) broadcasts via the packed-modulo path.
|
||||
+//
|
||||
+// Bit-exactness: this reproduces the exact FP order of the unfused chain
|
||||
+// k_bin_bcast(add): h[col] = a[col] + b[col] (f32, elementwise, order-free)
|
||||
+// rms_norm: sumsq over h[col] in column order via block_reduce
|
||||
+// mul: dst[col] = scale * h[col] * mul[col]
|
||||
+// h is summed from the same f32 values in the same order, so the reduction and
|
||||
+// the final scale are byte-identical to running the three kernels separately.
|
||||
+template <int block_size, bool do_multiply = false>
|
||||
+static __global__ void rms_norm_pre_add_mul_f32(const float * a,
|
||||
+ const float * b,
|
||||
+ float * h_out,
|
||||
+ float * dst,
|
||||
+ const int ncols,
|
||||
+ const int64_t stride_row,
|
||||
+ const int64_t stride_channel,
|
||||
+ const int64_t stride_sample,
|
||||
+ const float eps,
|
||||
+ const float * mul = nullptr,
|
||||
+ const int64_t mul_stride_row = 0,
|
||||
+ const int64_t mul_stride_channel = 0,
|
||||
+ const int64_t mul_stride_sample = 0,
|
||||
+ const uint3 mul_ncols_packed = make_uint3(0, 0, 0),
|
||||
+ const uint3 mul_nrows_packed = make_uint3(0, 0, 0),
|
||||
+ const uint3 mul_nchannels_packed = make_uint3(0, 0, 0),
|
||||
+ const uint3 mul_nsamples_packed = make_uint3(0, 0, 0)) {
|
||||
+ ggml_cuda_pdl_lc();
|
||||
+ const int nrows = gridDim.x;
|
||||
+ const int nchannels = gridDim.y;
|
||||
+
|
||||
+ const int row = blockIdx.x;
|
||||
+ const int channel = blockIdx.y;
|
||||
+ const int sample = blockIdx.z;
|
||||
+ const int tid = threadIdx.x;
|
||||
+
|
||||
+ const int64_t row_offset = sample*stride_sample + channel*stride_channel + row*stride_row;
|
||||
+ a += row_offset;
|
||||
+ b += row_offset;
|
||||
+ h_out += row_offset;
|
||||
+ // dst is laid out contiguously by the scheduler for the MUL output
|
||||
+ dst += ((sample*nchannels + channel)*nrows + row)*ncols;
|
||||
+
|
||||
+ if constexpr (do_multiply) {
|
||||
+ const uint32_t mul_row = fastmodulo(row, mul_nrows_packed);
|
||||
+ const uint32_t mul_channel = fastmodulo(channel, mul_nchannels_packed);
|
||||
+ const uint32_t mul_sample = fastmodulo(sample, mul_nsamples_packed);
|
||||
+ mul += mul_sample * mul_stride_sample + mul_channel * mul_stride_channel + mul_row * mul_stride_row;
|
||||
+ }
|
||||
+
|
||||
+ float tmp = 0.0f; // partial sum for thread in warp
|
||||
+
|
||||
+ ggml_cuda_pdl_sync();
|
||||
+ for (int col = tid; col < ncols; col += block_size) {
|
||||
+ const float hi = a[col] + b[col];
|
||||
+ h_out[col] = hi; // publish the residual stream for the next add
|
||||
+ tmp += hi * hi;
|
||||
+ }
|
||||
+
|
||||
+ // sum up partial sums
|
||||
+ extern __shared__ float s_sum[];
|
||||
+ tmp = block_reduce<block_reduce_method::SUM, block_size>(tmp, s_sum);
|
||||
+
|
||||
+ const float mean = tmp / ncols;
|
||||
+ const float scale = rsqrtf(mean + eps);
|
||||
+
|
||||
+ for (int col = tid; col < ncols; col += block_size) {
|
||||
+ const float hi = h_out[col];
|
||||
+ if constexpr (do_multiply) {
|
||||
+ const int mul_col = fastmodulo(col, mul_ncols_packed);
|
||||
+ dst[col] = scale * hi * mul[mul_col];
|
||||
+ } else {
|
||||
+ dst[col] = scale * hi;
|
||||
+ }
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
template <int block_size>
|
||||
static __global__ void rms_norm_back_f32(
|
||||
const float * grad, const float * xf, float * dst, const int ncols, const float eps) {
|
||||
@@ -407,6 +488,50 @@ static void rms_norm_mul_f32_cuda(const float * x,
|
||||
}
|
||||
}
|
||||
|
||||
+static void rms_norm_pre_add_mul_f32_cuda(const float * a,
|
||||
+ const float * b,
|
||||
+ float * h_out,
|
||||
+ float * dst,
|
||||
+ const int ncols,
|
||||
+ const int nrows,
|
||||
+ const int nchannels,
|
||||
+ const int nsamples,
|
||||
+ const int64_t stride_row,
|
||||
+ const int64_t stride_channel,
|
||||
+ const int64_t stride_sample,
|
||||
+ const float * mul,
|
||||
+ const int64_t mul_stride_row,
|
||||
+ const int64_t mul_stride_channel,
|
||||
+ const int64_t mul_stride_sample,
|
||||
+ const uint32_t mul_ncols,
|
||||
+ const uint32_t mul_nrows,
|
||||
+ const uint32_t mul_nchannels,
|
||||
+ const uint32_t mul_nsamples,
|
||||
+ const float eps,
|
||||
+ cudaStream_t stream) {
|
||||
+ const dim3 blocks_num(nrows, nchannels, nsamples);
|
||||
+ GGML_ASSERT(mul != nullptr);
|
||||
+ const uint3 mul_ncols_packed = init_fastdiv_values(mul_ncols);
|
||||
+ const uint3 mul_nrows_packed = init_fastdiv_values(mul_nrows);
|
||||
+ const uint3 mul_nchannels_packed = init_fastdiv_values(mul_nchannels);
|
||||
+ const uint3 mul_nsamples_packed = init_fastdiv_values(mul_nsamples);
|
||||
+ if (ncols < 1024) {
|
||||
+ const dim3 block_dims(256, 1, 1);
|
||||
+ const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params{blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float) : 0, stream};
|
||||
+ ggml_cuda_kernel_launch(rms_norm_pre_add_mul_f32<256, true>, launch_params,
|
||||
+ a, b, h_out, dst, ncols, stride_row, stride_channel, stride_sample, eps,
|
||||
+ mul, mul_stride_row, mul_stride_channel, mul_stride_sample,
|
||||
+ mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed);
|
||||
+ } else {
|
||||
+ const dim3 block_dims(1024, 1, 1);
|
||||
+ const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params{blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float) : 0, stream};
|
||||
+ ggml_cuda_kernel_launch(rms_norm_pre_add_mul_f32<1024, true>, launch_params,
|
||||
+ a, b, h_out, dst, ncols, stride_row, stride_channel, stride_sample, eps,
|
||||
+ mul, mul_stride_row, mul_stride_channel, mul_stride_sample,
|
||||
+ mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed);
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
static void rms_norm_back_f32_cuda(const float * grad, const float * xf, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
|
||||
if (ncols < 1024) {
|
||||
const dim3 block_dims(WARP_SIZE, 1, 1);
|
||||
@@ -647,6 +772,77 @@ void ggml_cuda_op_rms_norm_fused_add(ggml_backend_cuda_context & ctx,
|
||||
eps, stream);
|
||||
}
|
||||
|
||||
+void ggml_cuda_op_rms_norm_pre_add_mul(ggml_backend_cuda_context & ctx,
|
||||
+ ggml_tensor * add_tensor,
|
||||
+ ggml_tensor * rms_norm_tensor,
|
||||
+ ggml_tensor * mul_tensor) {
|
||||
+ // The RMS norm consumes the residual-add output.
|
||||
+ GGML_ASSERT(rms_norm_tensor->src[0] == add_tensor);
|
||||
+
|
||||
+ const ggml_tensor * a_src = add_tensor->src[0];
|
||||
+ const ggml_tensor * b_src = add_tensor->src[1];
|
||||
+
|
||||
+ float eps = 0.0f;
|
||||
+ memcpy(&eps, rms_norm_tensor->op_params, sizeof(float));
|
||||
+ GGML_ASSERT(eps >= 0.0f);
|
||||
+
|
||||
+ const float * a_d = (const float *) a_src->data;
|
||||
+ const float * b_d = (const float *) b_src->data;
|
||||
+ float * h_d = (float *) add_tensor->data;
|
||||
+
|
||||
+ const float * mul_d = nullptr;
|
||||
+ const ggml_tensor * mul_src = nullptr;
|
||||
+ if (mul_tensor->src[0] == rms_norm_tensor) {
|
||||
+ mul_d = (const float *) mul_tensor->src[1]->data;
|
||||
+ mul_src = mul_tensor->src[1];
|
||||
+ } else if (mul_tensor->src[1] == rms_norm_tensor) {
|
||||
+ mul_d = (const float *) mul_tensor->src[0]->data;
|
||||
+ mul_src = mul_tensor->src[0];
|
||||
+ } else {
|
||||
+ GGML_ASSERT(false);
|
||||
+ }
|
||||
+
|
||||
+ float * dst_d = (float *) mul_tensor->data;
|
||||
+ cudaStream_t stream = ctx.stream();
|
||||
+
|
||||
+ GGML_ASSERT(a_src->type == GGML_TYPE_F32);
|
||||
+ GGML_ASSERT(b_src->type == GGML_TYPE_F32);
|
||||
+ GGML_ASSERT(add_tensor->type == GGML_TYPE_F32);
|
||||
+ GGML_ASSERT(rms_norm_tensor->type == GGML_TYPE_F32);
|
||||
+ GGML_ASSERT(mul_tensor->type == GGML_TYPE_F32);
|
||||
+ GGML_ASSERT(ggml_are_same_shape(a_src, b_src));
|
||||
+
|
||||
+ const int64_t ne00 = add_tensor->ne[0];
|
||||
+ const int64_t ne01 = add_tensor->ne[1];
|
||||
+ const int64_t ne02 = add_tensor->ne[2];
|
||||
+ const int64_t ne03 = add_tensor->ne[3];
|
||||
+
|
||||
+ // a and b share the (contiguous) residual layout
|
||||
+ const size_t ts0 = ggml_type_size(a_src->type);
|
||||
+ GGML_ASSERT(a_src->nb[0] == ts0 && b_src->nb[0] == ts0);
|
||||
+ const int64_t s01 = a_src->nb[1] / ts0;
|
||||
+ const int64_t s02 = a_src->nb[2] / ts0;
|
||||
+ const int64_t s03 = a_src->nb[3] / ts0;
|
||||
+
|
||||
+ const size_t ts_mul = ggml_type_size(mul_src->type);
|
||||
+ GGML_ASSERT(mul_src->nb[0] == ts_mul);
|
||||
+ const int64_t mul_s01 = mul_src->nb[1] / ts_mul;
|
||||
+ const int64_t mul_s02 = mul_src->nb[2] / ts_mul;
|
||||
+ const int64_t mul_s03 = mul_src->nb[3] / ts_mul;
|
||||
+
|
||||
+ const int mul_ncols = mul_src->ne[0];
|
||||
+ const int mul_nrows = mul_src->ne[1];
|
||||
+ const int mul_nchannels = mul_src->ne[2];
|
||||
+ const int mul_nsamples = mul_src->ne[3];
|
||||
+
|
||||
+ rms_norm_pre_add_mul_f32_cuda(a_d, b_d, h_d, dst_d,
|
||||
+ ne00, ne01, ne02, ne03,
|
||||
+ /*s00*/ s01, s02, s03,
|
||||
+ mul_d, /*mul_s00*/ mul_s01, mul_s02, mul_s03,
|
||||
+ mul_ncols, mul_nrows, mul_nchannels, mul_nsamples,
|
||||
+ eps, stream);
|
||||
+}
|
||||
+
|
||||
void ggml_cuda_op_rms_norm_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * grad = dst->src[0]; // gradients
|
||||
const ggml_tensor * src0f = dst->src[1]; // src0 from forward pass
|
||||
diff --git a/ggml/src/ggml-cuda/norm.cuh b/ggml/src/ggml-cuda/norm.cuh
|
||||
index a74f637..05396cd 100644
|
||||
--- a/ggml/src/ggml-cuda/norm.cuh
|
||||
+++ b/ggml/src/ggml-cuda/norm.cuh
|
||||
@@ -13,6 +13,11 @@ void ggml_cuda_op_rms_norm_fused_add(ggml_backend_cuda_context & ctx,
|
||||
ggml_tensor * mul_tensor,
|
||||
ggml_tensor * add_tensor);
|
||||
|
||||
+void ggml_cuda_op_rms_norm_pre_add_mul(ggml_backend_cuda_context & ctx,
|
||||
+ ggml_tensor * add_tensor,
|
||||
+ ggml_tensor * rms_norm_tensor,
|
||||
+ ggml_tensor * mul_tensor);
|
||||
+
|
||||
void ggml_cuda_op_rms_norm_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_cuda_op_l2_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
--
|
||||
2.43.0
|
||||
|
||||
@@ -0,0 +1,82 @@
|
||||
From e4716bd0c700d34919e093f99cd454d883ad15ec Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Mon, 29 Jun 2026 02:13:20 +0200
|
||||
Subject: [PATCH] feat(paged): default-on full-step MoE-decode CUDA graph
|
||||
(grouped MMQ, patch 0043)
|
||||
|
||||
D1 lever. The MUL_MAT_ID CUDA-graph guard ([TAG_MUL_MAT_ID_CUDA_GRAPHS])
|
||||
disables CUDA graphs for the WHOLE decode step whenever a MUL_MAT_ID node has
|
||||
ne[2] > mmvq_mmid_max (8 for NVFP4 on sm_121) - i.e. for every multi-token
|
||||
decode. Patch 0025 showed that the path actually taken on Blackwell NVFP4
|
||||
(should_use_mmq()==true -> grouped stream-k MMQ id-branch) launches on one
|
||||
stream with NO host sync (only the per-expert host-loop fallback synchronizes),
|
||||
so the disable is conservative and graphs are safe for the grouped path - but
|
||||
0025 left it behind an opt-in env (LLAMA_MOE_FORCE_GRAPHS), so by default the
|
||||
host re-issued every kernel of the step.
|
||||
|
||||
D1 profiling (GB10 sm_121, q36-35b-a3b-nvfp4, batched-bench -fa on, npl128)
|
||||
settled the mechanism:
|
||||
- The grouped MMQ NVFP4 path IS what runs in decode: cudaStreamSynchronize
|
||||
count is IDENTICAL with graphs on vs off (1457 either way) - the per-expert
|
||||
host-loop fallback (the only device->host routing readback) is never hit.
|
||||
MoE routing is already device-side.
|
||||
- Steady-decode GPU-busy is ~99% (1% idle): static decode is GPU-bound, not
|
||||
host-sync-bound. The host cost is per-step kernel RE-ISSUE, removed by
|
||||
replaying a captured full-step graph (incl. the MoE dispatch).
|
||||
|
||||
So make the grouped-path graph capture ON BY DEFAULT; LLAMA_MOE_NO_FORCE_GRAPHS=1
|
||||
forces the conservative pre-0025 disable for A/B. should_use_mmq() is the exact
|
||||
guard: it returns FALSE for the large-M NVFP4 prefill (patch 0034), which
|
||||
deliberately drops to the per-expert host-sync loop, so PREFILL keeps graphs
|
||||
disabled (correct - that path syncs). Decode-only behaviour change; prefill and
|
||||
the stock llama-cpp backend are untouched.
|
||||
|
||||
BIT-EXACT: greedy md5 byte-identical default(on)==LLAMA_MOE_NO_FORCE_GRAPHS(off)
|
||||
==legacy LLAMA_MOE_FORCE_GRAPHS - paged-MoE 8cb0ce23777bf55f92f63d0292c756b0,
|
||||
paged-dense 5951a5b4d624ce891e22ab5fca9bc439 (both match the recorded baselines).
|
||||
|
||||
Measured (GB10, batched-bench paged decode S_TG, default-on vs opt-out):
|
||||
npl 32 467.3 vs 444.3 t/s +5.2%
|
||||
npl 128 788.2 vs 768.1 t/s +2.6%
|
||||
|
||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
||||
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
|
||||
---
|
||||
ggml/src/ggml-cuda/ggml-cuda.cu | 20 +++++++++++++++-----
|
||||
1 file changed, 15 insertions(+), 5 deletions(-)
|
||||
|
||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
index a92003c..3151684 100644
|
||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
@@ -3306,12 +3306,22 @@ static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) {
|
||||
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
|
||||
const int mmvq_mmid_max = get_mmvq_mmid_max_batch(node->src[0]->type, cc);
|
||||
bool mmid_needs_sync = !ggml_is_quantized(node->src[0]->type) || node->ne[2] > mmvq_mmid_max;
|
||||
- // PROBE (bit-exact, env LLAMA_MOE_FORCE_GRAPHS): the grouped stream-k MMQ id-path is
|
||||
- // launched on-stream with no host sync (only the per-expert host-loop fallback syncs);
|
||||
- // when should_use_mmq() is true (Blackwell NVFP4 grouped path) the op is graph-safe
|
||||
- // even for ne[2] > mmvq_mmid_max, so graphs need not be disabled for the whole step.
|
||||
+ // [D1 / patch 0043] The grouped stream-k MMQ id-path (should_use_mmq()==true, e.g.
|
||||
+ // Blackwell NVFP4) launches on-stream with NO host sync; only the per-expert
|
||||
+ // host-loop fallback synchronizes the stream. So when this MUL_MAT_ID WILL take the
|
||||
+ // grouped path, the whole decode step is graph-safe even for ne[2] > mmvq_mmid_max,
|
||||
+ // and the full-step CUDA graph (incl. the MoE dispatch) can be REPLAYED instead of the
|
||||
+ // host re-issuing every kernel every step. Patch 0025 proved this is bit-exact (graph
|
||||
+ // replay re-issues identical kernels); D1 profiling confirmed the grouped path is what
|
||||
+ // actually runs (no device->host routing readback), that steady decode is ~99% GPU-busy
|
||||
+ // (not host-sync-bound), and that keeping the step graphed lifts throughput (npl32
|
||||
+ // +5.2%, npl128 +2.6%). It is therefore ON BY DEFAULT for the grouped path now.
|
||||
+ // should_use_mmq() is the exact guard: it returns FALSE for the large-M NVFP4 prefill
|
||||
+ // (patch 0034) that deliberately drops to the per-expert host-sync loop, so PREFILL
|
||||
+ // keeps graphs disabled (correct - that path syncs). Decode is untouched by 0034.
|
||||
+ // LLAMA_MOE_NO_FORCE_GRAPHS=1 forces the conservative pre-0025 disable for A/B.
|
||||
if (mmid_needs_sync && ggml_is_quantized(node->src[0]->type) &&
|
||||
- getenv("LLAMA_MOE_FORCE_GRAPHS") != nullptr &&
|
||||
+ getenv("LLAMA_MOE_NO_FORCE_GRAPHS") == nullptr &&
|
||||
ggml_cuda_should_use_mmq(node->src[0]->type, cc, node->src[1]->ne[2], node->src[0]->ne[2])) {
|
||||
mmid_needs_sync = false;
|
||||
}
|
||||
--
|
||||
2.43.0
|
||||
|
||||
@@ -0,0 +1,470 @@
|
||||
From 51168c5eee2e35348d9006f0b2fab3dc6e7c01cc Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Tue, 30 Jun 2026 10:57:05 +0200
|
||||
Subject: [PATCH] feat(paged): fused gated RMSNorm + SiLU gate-mul CUDA op
|
||||
(patch 0044)
|
||||
|
||||
The Qwen3.6 gated-DeltaNet output norm self.norm(core_attn_out, z)
|
||||
(qwen35 / qwen35moe build_norm_gated) runs as (rms_norm(x) * w) * silu(z):
|
||||
on CUDA that was rms_norm_mul + silu_mul, two fused launches with the
|
||||
normalized intermediate round-tripping through HBM. Fuse the whole chain
|
||||
into one kernel so it stays in registers. This is the gated-RMSNorm fusion
|
||||
the vLLM decode-gap analysis ranked #1 (the easy, bit-exact prefill win),
|
||||
a direct sibling of patch 0042 (add-RMSNorm).
|
||||
|
||||
The chain is NOT naturally consecutive in the graph: the gate z-projection
|
||||
(a MUL_MAT) is scheduled between the weight MUL and the SILU, so the default
|
||||
mul(normalized, silu(z)) order leaves a GEMM between them and cannot be
|
||||
fused. build_norm_gated now emits the gate multiply as mul(silu(z),
|
||||
normalized) (commutative, so bit-exact), which lays the chain out as the
|
||||
consecutive subgraph { SILU, RMS_NORM, MUL, MUL } that ggml-cuda can fuse.
|
||||
|
||||
- New kernel rms_norm_gate_mul_f32 (ggml/src/ggml-cuda/norm.cu): same
|
||||
block_reduce<SUM> over x^2, same 256/1024 block-size thresholds and
|
||||
rsqrtf(mean+eps) as rms_norm / patch 0042; the final write computes
|
||||
dst = scale * x * w * silu(z) with silu(z) = z/(1+expf(-z)) (the exact
|
||||
ggml_cuda_op_silu_single form). w (the RMS weight) and z (the gate) both
|
||||
broadcast via the packed-modulo helper.
|
||||
- ggml_cuda_can_fuse recognizes { GGML_OP_UNARY(SILU), RMS_NORM, MUL, MUL }
|
||||
via ggml_can_fuse_subgraph with the final MUL as the only output (the SILU
|
||||
reads an external gate; RMS_NORM and the weight MUL are single-use within).
|
||||
- Gated by LLAMA_FUSE_GATE_RMSNORM (default ON) for a clean single-build A/B;
|
||||
OFF keeps the original operand order AND the unfused kernels, so OFF is
|
||||
byte- and kernel-identical to the pre-patch path.
|
||||
|
||||
BIT-EXACT (per-path canonical greedy md5, n=48 --temp 0 --seed 1):
|
||||
dense q36-27b-nvfp4 : 5951a5b4d624ce891e22ab5fca9bc439 (ON == OFF == canonical, paged and non-paged)
|
||||
MoE q36-35b-a3b : 8cb0ce23777bf55f92f63d0292c756b0 (ON == OFF == canonical, paged)
|
||||
Multiply is commutative, so ((scale*x)*w)*silu(z) is byte-identical to the
|
||||
unfused silu(z)*((scale*x)*w); the sum(x^2) reduction and rsqrt scale are
|
||||
unchanged. test-backend-ops 12979/12979 (CUDA0 vs CPU).
|
||||
|
||||
PROFILE (dense prefill, nsys --cuda-graph-trace=node, npp512 ntg4 npl8):
|
||||
rms_norm_f32<256,1,0> 560 -> 224 launches
|
||||
unary_gated_op_kernel<op_silu> 784 -> 448 launches
|
||||
rms_norm_gate_mul_f32 (new) 336 launches / 69.7M ns
|
||||
-> the 336 gated-norm rms_norm_mul + 336 silu_mul launches (672) fold into
|
||||
336 fused launches, removing the normalized HBM round-trip.
|
||||
S_PP (npp512 ntg4 npl32, 3x interleaved A/B, every ON beats every OFF):
|
||||
dense q36-27b : 1002.5 -> 1013.4 t/s (+1.1%, ~+10 us/tok)
|
||||
MoE q36-35b : 2626.9 -> 2651.8 t/s (+0.9%)
|
||||
|
||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
||||
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
|
||||
---
|
||||
ggml/src/ggml-cuda/ggml-cuda.cu | 67 ++++++++++
|
||||
ggml/src/ggml-cuda/norm.cu | 215 ++++++++++++++++++++++++++++++++
|
||||
ggml/src/ggml-cuda/norm.cuh | 6 +
|
||||
src/models/qwen35.cpp | 16 +++
|
||||
src/models/qwen35moe.cpp | 16 +++
|
||||
5 files changed, 320 insertions(+)
|
||||
|
||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
index 42bcd4a77..374949f25 100644
|
||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
@@ -3816,6 +3816,60 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph,
|
||||
return true;
|
||||
}
|
||||
|
||||
+ // Fused gated RMS norm: SiLU gate multiply over (RMS norm * weight), the
|
||||
+ // gated-DeltaNet output norm `out = (rms_norm(x) * w) * silu(z)` of the Qwen3.6
|
||||
+ // hybrid models (qwen35 / qwen35moe build_norm_gated). The model emits the gate
|
||||
+ // multiply as mul(silu(z), normalized) (default; see LLAMA_FUSE_GATE_RMSNORM in
|
||||
+ // build_norm_gated) so the chain forms the consecutive subgraph
|
||||
+ // { SILU, RMS_NORM, MUL, MUL } - the gate z-projection is scheduled before the
|
||||
+ // SILU, so the natural mul(normalized, silu) order leaves a GEMM between the
|
||||
+ // weight MUL and the SILU and cannot be fused. The SILU (node_idx) reads an
|
||||
+ // external gate and the final gate MUL (node_idx + 3) feeds the o_proj, so mark
|
||||
+ // node_idx + 3 as the only output; the RMS_NORM (node_idx + 1) and weight MUL
|
||||
+ // (node_idx + 2) are single-use within the subgraph.
|
||||
+ std::initializer_list<enum ggml_op> rms_norm_gate_mul_ops = { GGML_OP_UNARY, GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_MUL };
|
||||
+ if (is_equal(rms_norm_gate_mul_ops, ops) && unary_ops.size() == 1 && unary_ops.begin()[0] == GGML_UNARY_OP_SILU &&
|
||||
+ ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3 })) {
|
||||
+ const ggml_tensor * silu = cgraph->nodes[node_idx];
|
||||
+ const ggml_tensor * rms_norm = cgraph->nodes[node_idx + 1];
|
||||
+ const ggml_tensor * mul = cgraph->nodes[node_idx + 2];
|
||||
+ const ggml_tensor * gate_mul = cgraph->nodes[node_idx + 3];
|
||||
+
|
||||
+ if (ggml_get_unary_op(silu) != GGML_UNARY_OP_SILU) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ // The weight MUL must consume the RMS norm output; the gate MUL must
|
||||
+ // consume both the weight MUL and the SILU output.
|
||||
+ if (mul->src[0] != rms_norm && mul->src[1] != rms_norm) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ if ((gate_mul->src[0] != mul && gate_mul->src[1] != mul) ||
|
||||
+ (gate_mul->src[0] != silu && gate_mul->src[1] != silu)) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ // All operands F32 (rms norm / fused mul / silu kernel only support F32).
|
||||
+ if (rms_norm->src[0]->type != GGML_TYPE_F32 || rms_norm->type != GGML_TYPE_F32 ||
|
||||
+ mul->src[0]->type != GGML_TYPE_F32 || mul->src[1]->type != GGML_TYPE_F32 || mul->type != GGML_TYPE_F32 ||
|
||||
+ silu->src[0]->type != GGML_TYPE_F32 || silu->type != GGML_TYPE_F32 ||
|
||||
+ gate_mul->src[0]->type != GGML_TYPE_F32 || gate_mul->src[1]->type != GGML_TYPE_F32 ||
|
||||
+ gate_mul->type != GGML_TYPE_F32) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ // If rms_norm is the B operand of the weight mul, broadcast of A is unsupported.
|
||||
+ if (rms_norm == mul->src[1] && !ggml_are_same_shape(mul->src[0], rms_norm)) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ // The fused kernel reads contiguous rows for the norm input, the weight,
|
||||
+ // and the gate, and writes a contiguous output.
|
||||
+ if (!ggml_is_contiguous_rows(rms_norm->src[0]) ||
|
||||
+ !ggml_is_contiguous_rows(mul->src[0]) || !ggml_is_contiguous_rows(mul->src[1]) ||
|
||||
+ !ggml_is_contiguous_rows(silu->src[0]) ||
|
||||
+ !ggml_is_contiguous_rows(gate_mul->src[0]) || !ggml_is_contiguous_rows(gate_mul->src[1])) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ return true;
|
||||
+ }
|
||||
+
|
||||
if (!ggml_can_fuse(cgraph, node_idx, ops)) {
|
||||
return false;
|
||||
}
|
||||
@@ -4350,6 +4404,19 @@ static int ggml_cuda_try_fuse(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph
|
||||
return 2;
|
||||
}
|
||||
|
||||
+ // Fused gated RMS norm: RMS norm + weight multiply + SiLU-gated multiply
|
||||
+ // (bit-exact). The Qwen3.6 gated-DeltaNet output norm. Default ON; set
|
||||
+ // LLAMA_FUSE_GATE_RMSNORM=0 for a clean A/B against the unfused path.
|
||||
+ static const bool fuse_gate_rmsnorm = [] {
|
||||
+ const char * e = getenv("LLAMA_FUSE_GATE_RMSNORM");
|
||||
+ return e == nullptr || atoi(e) != 0;
|
||||
+ }();
|
||||
+ if (fuse_gate_rmsnorm &&
|
||||
+ ggml_cuda_can_fuse(cgraph, i, { GGML_OP_UNARY, GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_MUL }, { GGML_UNARY_OP_SILU })) {
|
||||
+ ggml_cuda_op_rms_norm_gate_mul(*cuda_ctx, cgraph->nodes[i + 1], cgraph->nodes[i + 2], node, cgraph->nodes[i + 3]);
|
||||
+ return 3;
|
||||
+ }
|
||||
+
|
||||
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_ADD }, {})) {
|
||||
ggml_cuda_op_rms_norm_fused_add(*cuda_ctx, node, cgraph->nodes[i + 1], cgraph->nodes[i + 2]);
|
||||
return 2;
|
||||
diff --git a/ggml/src/ggml-cuda/norm.cu b/ggml/src/ggml-cuda/norm.cu
|
||||
index a07d02276..e776c67d2 100644
|
||||
--- a/ggml/src/ggml-cuda/norm.cu
|
||||
+++ b/ggml/src/ggml-cuda/norm.cu
|
||||
@@ -235,6 +235,95 @@ static __global__ void rms_norm_pre_add_mul_f32(const float * a,
|
||||
}
|
||||
}
|
||||
|
||||
+// Fused gated RMS norm: RMS norm + weight multiply + SiLU gate multiply.
|
||||
+// dst = (rsqrt(mean(x^2)+eps) * x * w) * silu(z) with silu(z) = z/(1+expf(-z))
|
||||
+// This is the gated-DeltaNet output norm `self.norm(core_attn_out, z)` of the
|
||||
+// Qwen3.6 hybrid models (build_norm_gated): rms_norm(x) scaled by the per-head
|
||||
+// ssm_norm weight `w`, then gated by silu of the gate activation `z`. Unfused it
|
||||
+// runs as rms_norm_mul (scale*x*w) -> silu(z) -> mul; fusing it keeps the
|
||||
+// normalized intermediate in registers so it never round-trips to HBM.
|
||||
+//
|
||||
+// Bit-exactness: the sum(x^2) reduction uses the same block_reduce<SUM> with the
|
||||
+// same 256/1024 block-size thresholds and the same rsqrtf(mean+eps) as rms_norm,
|
||||
+// the weight multiply reproduces rms_norm_mul's `scale*x[col]*w[col]` order, and
|
||||
+// silu reuses the exact `z/(1+expf(-z))` of ggml_cuda_op_silu_single. Float
|
||||
+// multiply is commutative, so `(scale*x*w) * silu(z)` is byte-identical to the
|
||||
+// unfused `mul(rms_norm_mul, silu(z))` (whether or not silu+mul was itself fused).
|
||||
+// `w` (the RMS weight) and `z` (the gate) both broadcast via the packed-modulo path.
|
||||
+template <int block_size>
|
||||
+static __global__ void rms_norm_gate_mul_f32(const float * x,
|
||||
+ float * dst,
|
||||
+ const int ncols,
|
||||
+ const int64_t stride_row,
|
||||
+ const int64_t stride_channel,
|
||||
+ const int64_t stride_sample,
|
||||
+ const float eps,
|
||||
+ const float * mul,
|
||||
+ const int64_t mul_stride_row,
|
||||
+ const int64_t mul_stride_channel,
|
||||
+ const int64_t mul_stride_sample,
|
||||
+ const uint3 mul_ncols_packed,
|
||||
+ const uint3 mul_nrows_packed,
|
||||
+ const uint3 mul_nchannels_packed,
|
||||
+ const uint3 mul_nsamples_packed,
|
||||
+ const float * gate,
|
||||
+ const int64_t gate_stride_row,
|
||||
+ const int64_t gate_stride_channel,
|
||||
+ const int64_t gate_stride_sample,
|
||||
+ const uint3 gate_ncols_packed,
|
||||
+ const uint3 gate_nrows_packed,
|
||||
+ const uint3 gate_nchannels_packed,
|
||||
+ const uint3 gate_nsamples_packed) {
|
||||
+ ggml_cuda_pdl_lc();
|
||||
+ const int nrows = gridDim.x;
|
||||
+ const int nchannels = gridDim.y;
|
||||
+
|
||||
+ const int row = blockIdx.x;
|
||||
+ const int channel = blockIdx.y;
|
||||
+ const int sample = blockIdx.z;
|
||||
+ const int tid = threadIdx.x;
|
||||
+
|
||||
+ x += sample*stride_sample + channel*stride_channel + row*stride_row;
|
||||
+ // dst is laid out contiguously by the scheduler for the (final) MUL output
|
||||
+ dst += ((sample*nchannels + channel)*nrows + row)*ncols;
|
||||
+
|
||||
+ {
|
||||
+ const uint32_t mul_row = fastmodulo(row, mul_nrows_packed);
|
||||
+ const uint32_t mul_channel = fastmodulo(channel, mul_nchannels_packed);
|
||||
+ const uint32_t mul_sample = fastmodulo(sample, mul_nsamples_packed);
|
||||
+ mul += mul_sample * mul_stride_sample + mul_channel * mul_stride_channel + mul_row * mul_stride_row;
|
||||
+ }
|
||||
+ {
|
||||
+ const uint32_t gate_row = fastmodulo(row, gate_nrows_packed);
|
||||
+ const uint32_t gate_channel = fastmodulo(channel, gate_nchannels_packed);
|
||||
+ const uint32_t gate_sample = fastmodulo(sample, gate_nsamples_packed);
|
||||
+ gate += gate_sample * gate_stride_sample + gate_channel * gate_stride_channel + gate_row * gate_stride_row;
|
||||
+ }
|
||||
+
|
||||
+ float tmp = 0.0f; // partial sum for thread in warp
|
||||
+
|
||||
+ ggml_cuda_pdl_sync();
|
||||
+ for (int col = tid; col < ncols; col += block_size) {
|
||||
+ const float xi = x[col];
|
||||
+ tmp += xi * xi;
|
||||
+ }
|
||||
+
|
||||
+ // sum up partial sums
|
||||
+ extern __shared__ float s_sum[];
|
||||
+ tmp = block_reduce<block_reduce_method::SUM, block_size>(tmp, s_sum);
|
||||
+
|
||||
+ const float mean = tmp / ncols;
|
||||
+ const float scale = rsqrtf(mean + eps);
|
||||
+
|
||||
+ for (int col = tid; col < ncols; col += block_size) {
|
||||
+ const int mul_col = fastmodulo(col, mul_ncols_packed);
|
||||
+ const int gate_col = fastmodulo(col, gate_ncols_packed);
|
||||
+ const float zi = gate[gate_col];
|
||||
+ const float silu_z = zi / (1.0f + expf(-zi));
|
||||
+ dst[col] = scale * x[col] * mul[mul_col] * silu_z;
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
template <int block_size>
|
||||
static __global__ void rms_norm_back_f32(
|
||||
const float * grad, const float * xf, float * dst, const int ncols, const float eps) {
|
||||
@@ -532,6 +621,65 @@ static void rms_norm_pre_add_mul_f32_cuda(const float * a,
|
||||
}
|
||||
}
|
||||
|
||||
+static void rms_norm_gate_mul_f32_cuda(const float * x,
|
||||
+ float * dst,
|
||||
+ const int ncols,
|
||||
+ const int nrows,
|
||||
+ const int nchannels,
|
||||
+ const int nsamples,
|
||||
+ const int64_t stride_row,
|
||||
+ const int64_t stride_channel,
|
||||
+ const int64_t stride_sample,
|
||||
+ const float * mul,
|
||||
+ const int64_t mul_stride_row,
|
||||
+ const int64_t mul_stride_channel,
|
||||
+ const int64_t mul_stride_sample,
|
||||
+ const uint32_t mul_ncols,
|
||||
+ const uint32_t mul_nrows,
|
||||
+ const uint32_t mul_nchannels,
|
||||
+ const uint32_t mul_nsamples,
|
||||
+ const float * gate,
|
||||
+ const int64_t gate_stride_row,
|
||||
+ const int64_t gate_stride_channel,
|
||||
+ const int64_t gate_stride_sample,
|
||||
+ const uint32_t gate_ncols,
|
||||
+ const uint32_t gate_nrows,
|
||||
+ const uint32_t gate_nchannels,
|
||||
+ const uint32_t gate_nsamples,
|
||||
+ const float eps,
|
||||
+ cudaStream_t stream) {
|
||||
+ const dim3 blocks_num(nrows, nchannels, nsamples);
|
||||
+ GGML_ASSERT(mul != nullptr);
|
||||
+ GGML_ASSERT(gate != nullptr);
|
||||
+ const uint3 mul_ncols_packed = init_fastdiv_values(mul_ncols);
|
||||
+ const uint3 mul_nrows_packed = init_fastdiv_values(mul_nrows);
|
||||
+ const uint3 mul_nchannels_packed = init_fastdiv_values(mul_nchannels);
|
||||
+ const uint3 mul_nsamples_packed = init_fastdiv_values(mul_nsamples);
|
||||
+ const uint3 gate_ncols_packed = init_fastdiv_values(gate_ncols);
|
||||
+ const uint3 gate_nrows_packed = init_fastdiv_values(gate_nrows);
|
||||
+ const uint3 gate_nchannels_packed = init_fastdiv_values(gate_nchannels);
|
||||
+ const uint3 gate_nsamples_packed = init_fastdiv_values(gate_nsamples);
|
||||
+ if (ncols < 1024) {
|
||||
+ const dim3 block_dims(256, 1, 1);
|
||||
+ const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params{blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float) : 0, stream};
|
||||
+ ggml_cuda_kernel_launch(rms_norm_gate_mul_f32<256>, launch_params,
|
||||
+ x, dst, ncols, stride_row, stride_channel, stride_sample, eps,
|
||||
+ mul, mul_stride_row, mul_stride_channel, mul_stride_sample,
|
||||
+ mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed,
|
||||
+ gate, gate_stride_row, gate_stride_channel, gate_stride_sample,
|
||||
+ gate_ncols_packed, gate_nrows_packed, gate_nchannels_packed, gate_nsamples_packed);
|
||||
+ } else {
|
||||
+ const dim3 block_dims(1024, 1, 1);
|
||||
+ const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params{blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float) : 0, stream};
|
||||
+ ggml_cuda_kernel_launch(rms_norm_gate_mul_f32<1024>, launch_params,
|
||||
+ x, dst, ncols, stride_row, stride_channel, stride_sample, eps,
|
||||
+ mul, mul_stride_row, mul_stride_channel, mul_stride_sample,
|
||||
+ mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed,
|
||||
+ gate, gate_stride_row, gate_stride_channel, gate_stride_sample,
|
||||
+ gate_ncols_packed, gate_nrows_packed, gate_nchannels_packed, gate_nsamples_packed);
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
static void rms_norm_back_f32_cuda(const float * grad, const float * xf, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
|
||||
if (ncols < 1024) {
|
||||
const dim3 block_dims(WARP_SIZE, 1, 1);
|
||||
@@ -843,6 +991,73 @@ void ggml_cuda_op_rms_norm_pre_add_mul(ggml_backend_cuda_context & ctx,
|
||||
eps, stream);
|
||||
}
|
||||
|
||||
+void ggml_cuda_op_rms_norm_gate_mul(ggml_backend_cuda_context & ctx,
|
||||
+ ggml_tensor * rms_norm_tensor,
|
||||
+ ggml_tensor * mul_tensor,
|
||||
+ ggml_tensor * silu_tensor,
|
||||
+ ggml_tensor * gate_mul_tensor) {
|
||||
+ // mul = rms_norm(x) * w ; silu = silu(z) ; gate_mul = mul * silu
|
||||
+ GGML_ASSERT(mul_tensor->src[0] == rms_norm_tensor || mul_tensor->src[1] == rms_norm_tensor);
|
||||
+ GGML_ASSERT(gate_mul_tensor->src[0] == silu_tensor || gate_mul_tensor->src[1] == silu_tensor);
|
||||
+
|
||||
+ const ggml_tensor * x_src = rms_norm_tensor->src[0];
|
||||
+ const ggml_tensor * w_src = (mul_tensor->src[0] == rms_norm_tensor) ? mul_tensor->src[1] : mul_tensor->src[0];
|
||||
+ const ggml_tensor * gate_src = silu_tensor->src[0];
|
||||
+
|
||||
+ float eps = 0.0f;
|
||||
+ memcpy(&eps, rms_norm_tensor->op_params, sizeof(float));
|
||||
+ GGML_ASSERT(eps >= 0.0f);
|
||||
+
|
||||
+ const float * x_d = (const float *) x_src->data;
|
||||
+ const float * w_d = (const float *) w_src->data;
|
||||
+ const float * gate_d = (const float *) gate_src->data;
|
||||
+ float * dst_d = (float *) gate_mul_tensor->data;
|
||||
+ cudaStream_t stream = ctx.stream();
|
||||
+
|
||||
+ GGML_ASSERT(x_src->type == GGML_TYPE_F32);
|
||||
+ GGML_ASSERT(w_src->type == GGML_TYPE_F32);
|
||||
+ GGML_ASSERT(gate_src->type == GGML_TYPE_F32);
|
||||
+ GGML_ASSERT(rms_norm_tensor->type == GGML_TYPE_F32);
|
||||
+ GGML_ASSERT(mul_tensor->type == GGML_TYPE_F32);
|
||||
+ GGML_ASSERT(silu_tensor->type == GGML_TYPE_F32);
|
||||
+ GGML_ASSERT(gate_mul_tensor->type == GGML_TYPE_F32);
|
||||
+
|
||||
+ const int64_t ne00 = rms_norm_tensor->ne[0];
|
||||
+ const int64_t ne01 = rms_norm_tensor->ne[1];
|
||||
+ const int64_t ne02 = rms_norm_tensor->ne[2];
|
||||
+ const int64_t ne03 = rms_norm_tensor->ne[3];
|
||||
+
|
||||
+ // x (the rms-norm input) strides; cols must be contiguous
|
||||
+ const size_t ts0 = ggml_type_size(x_src->type);
|
||||
+ GGML_ASSERT(x_src->nb[0] == ts0);
|
||||
+ const int64_t s01 = x_src->nb[1] / ts0;
|
||||
+ const int64_t s02 = x_src->nb[2] / ts0;
|
||||
+ const int64_t s03 = x_src->nb[3] / ts0;
|
||||
+
|
||||
+ // weight (the RMS scale) strides + broadcast extents
|
||||
+ const size_t ts_mul = ggml_type_size(w_src->type);
|
||||
+ GGML_ASSERT(w_src->nb[0] == ts_mul);
|
||||
+ const int64_t mul_s01 = w_src->nb[1] / ts_mul;
|
||||
+ const int64_t mul_s02 = w_src->nb[2] / ts_mul;
|
||||
+ const int64_t mul_s03 = w_src->nb[3] / ts_mul;
|
||||
+
|
||||
+ // gate (the silu activation) strides + broadcast extents
|
||||
+ const size_t ts_gate = ggml_type_size(gate_src->type);
|
||||
+ GGML_ASSERT(gate_src->nb[0] == ts_gate);
|
||||
+ const int64_t gate_s01 = gate_src->nb[1] / ts_gate;
|
||||
+ const int64_t gate_s02 = gate_src->nb[2] / ts_gate;
|
||||
+ const int64_t gate_s03 = gate_src->nb[3] / ts_gate;
|
||||
+
|
||||
+ rms_norm_gate_mul_f32_cuda(x_d, dst_d,
|
||||
+ ne00, ne01, ne02, ne03,
|
||||
+ /*s00*/ s01, s02, s03,
|
||||
+ w_d, /*mul_s00*/ mul_s01, mul_s02, mul_s03,
|
||||
+ w_src->ne[0], w_src->ne[1], w_src->ne[2], w_src->ne[3],
|
||||
+ gate_d, /*gate_s00*/ gate_s01, gate_s02, gate_s03,
|
||||
+ gate_src->ne[0], gate_src->ne[1], gate_src->ne[2], gate_src->ne[3],
|
||||
+ eps, stream);
|
||||
+}
|
||||
+
|
||||
void ggml_cuda_op_rms_norm_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * grad = dst->src[0]; // gradients
|
||||
const ggml_tensor * src0f = dst->src[1]; // src0 from forward pass
|
||||
diff --git a/ggml/src/ggml-cuda/norm.cuh b/ggml/src/ggml-cuda/norm.cuh
|
||||
index 05396cdf0..4d6dba6fa 100644
|
||||
--- a/ggml/src/ggml-cuda/norm.cuh
|
||||
+++ b/ggml/src/ggml-cuda/norm.cuh
|
||||
@@ -17,6 +17,12 @@ void ggml_cuda_op_rms_norm_pre_add_mul(ggml_backend_cuda_context & ctx,
|
||||
ggml_tensor * add_tensor,
|
||||
ggml_tensor * rms_norm_tensor,
|
||||
ggml_tensor * mul_tensor);
|
||||
+void ggml_cuda_op_rms_norm_gate_mul(ggml_backend_cuda_context & ctx,
|
||||
+ ggml_tensor * rms_norm_tensor,
|
||||
+ ggml_tensor * mul_tensor,
|
||||
+ ggml_tensor * silu_tensor,
|
||||
+ ggml_tensor * gate_mul_tensor);
|
||||
+
|
||||
|
||||
void ggml_cuda_op_rms_norm_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
diff --git a/src/models/qwen35.cpp b/src/models/qwen35.cpp
|
||||
index 66064869e..98751f7cc 100644
|
||||
--- a/src/models/qwen35.cpp
|
||||
+++ b/src/models/qwen35.cpp
|
||||
@@ -1,4 +1,5 @@
|
||||
#include "models.h"
|
||||
+#include <cstdlib>
|
||||
#include "llama-memory-recurrent.h"
|
||||
|
||||
void llama_model_qwen35::load_arch_hparams(llama_model_loader & ml) {
|
||||
@@ -251,6 +252,21 @@ ggml_tensor * llama_model_qwen35::graph::build_norm_gated(
|
||||
ggml_tensor * normalized = build_norm(input, weights, nullptr, LLM_NORM_RMS, layer);
|
||||
ggml_tensor * gated_silu = ggml_silu(ctx0, gate);
|
||||
|
||||
+ // Emit the gate multiply as mul(silu(z), normalized) so the gated-DeltaNet
|
||||
+ // output-norm chain forms the consecutive subgraph { SILU, RMS_NORM, MUL, MUL }
|
||||
+ // that the CUDA backend fuses into one rms_norm_gate_mul kernel (the normalized
|
||||
+ // intermediate then never round-trips to HBM). The gate z-projection is scheduled
|
||||
+ // before the SILU, so the natural mul(normalized, silu) order leaves a GEMM
|
||||
+ // between the weight MUL and the SILU and is not fusable. Multiplication is
|
||||
+ // commutative, so this is bit-exact vs mul(normalized, silu).
|
||||
+ // LLAMA_FUSE_GATE_RMSNORM=0 keeps the original operand order (kernel fusion off).
|
||||
+ static const bool fuse_gate_rmsnorm = [] {
|
||||
+ const char * e = getenv("LLAMA_FUSE_GATE_RMSNORM");
|
||||
+ return e == nullptr || atoi(e) != 0;
|
||||
+ }();
|
||||
+ if (fuse_gate_rmsnorm) {
|
||||
+ return ggml_mul(ctx0, gated_silu, normalized);
|
||||
+ }
|
||||
return ggml_mul(ctx0, normalized, gated_silu);
|
||||
}
|
||||
|
||||
diff --git a/src/models/qwen35moe.cpp b/src/models/qwen35moe.cpp
|
||||
index a79917628..071b88daa 100644
|
||||
--- a/src/models/qwen35moe.cpp
|
||||
+++ b/src/models/qwen35moe.cpp
|
||||
@@ -1,4 +1,5 @@
|
||||
#include "models.h"
|
||||
+#include <cstdlib>
|
||||
#include "llama-memory-recurrent.h"
|
||||
|
||||
void llama_model_qwen35moe::load_arch_hparams(llama_model_loader & ml) {
|
||||
@@ -275,6 +276,21 @@ ggml_tensor * llama_model_qwen35moe::graph::build_norm_gated(
|
||||
ggml_tensor * normalized = build_norm(input, weights, nullptr, LLM_NORM_RMS, layer);
|
||||
ggml_tensor * gated_silu = ggml_silu(ctx0, gate);
|
||||
|
||||
+ // Emit the gate multiply as mul(silu(z), normalized) so the gated-DeltaNet
|
||||
+ // output-norm chain forms the consecutive subgraph { SILU, RMS_NORM, MUL, MUL }
|
||||
+ // that the CUDA backend fuses into one rms_norm_gate_mul kernel (the normalized
|
||||
+ // intermediate then never round-trips to HBM). The gate z-projection is scheduled
|
||||
+ // before the SILU, so the natural mul(normalized, silu) order leaves a GEMM
|
||||
+ // between the weight MUL and the SILU and is not fusable. Multiplication is
|
||||
+ // commutative, so this is bit-exact vs mul(normalized, silu).
|
||||
+ // LLAMA_FUSE_GATE_RMSNORM=0 keeps the original operand order (kernel fusion off).
|
||||
+ static const bool fuse_gate_rmsnorm = [] {
|
||||
+ const char * e = getenv("LLAMA_FUSE_GATE_RMSNORM");
|
||||
+ return e == nullptr || atoi(e) != 0;
|
||||
+ }();
|
||||
+ if (fuse_gate_rmsnorm) {
|
||||
+ return ggml_mul(ctx0, gated_silu, normalized);
|
||||
+ }
|
||||
return ggml_mul(ctx0, normalized, gated_silu);
|
||||
}
|
||||
|
||||
--
|
||||
2.43.0
|
||||
|
||||
@@ -0,0 +1,64 @@
|
||||
From 85266d4c10750b419716e4b8939ebd96ab424630 Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Tue, 30 Jun 2026 00:51:26 +0000
|
||||
Subject: [PATCH] feat(paged): gate GDN prefill geometry by scan length (patch
|
||||
0046)
|
||||
|
||||
Patch 0022 retuned the gated-DeltaNet (GDN) sequential-recurrence dispatch
|
||||
(case 128) to a (NUM_WARPS=16, COLS_PER_WARP=8) column-fold tile. That is a
|
||||
DECODE win (short scans: small n_tokens, large n_seqs) but an UNCONDITIONAL
|
||||
dense-prefill regression vs stock: on a long sequential scan the launch grid.z
|
||||
collapses from S_v/4=32 to S_v/(16*8)=1, so the SMs starve. Profiling the
|
||||
dense-prefill path attributed the whole regression (~-6%) to gated_delta_net
|
||||
(+54% GPU time) at the (16,8) geometry.
|
||||
|
||||
Gate the geometry by per-call scan length instead of applying (16,8)
|
||||
unconditionally. Long scans (prefill, n_tokens >= GDN_PREFILL_NTOK, default 256)
|
||||
take stock's high-grid.z (4,1) geometry; short scans (decode) keep the (16,8)
|
||||
retune. This recovers dense prefill by +7.2%, back to stock parity, while preserving
|
||||
the (16,8) decode win.
|
||||
|
||||
Bit-exact: patch 0022 proved every selectable {NUM_WARPS, COLS_PER_WARP} variant
|
||||
is byte-identical (the sweep cannot change the md5), so this scan-length gate is
|
||||
greedy-md5 bit-exact. GDN_PREFILL_NTOK tunes the crossover; the explicit
|
||||
GDN_NW / GDN_CPW one-build %peak sweep still wins (the gate yields when either is
|
||||
set), so the A/B harness is unchanged.
|
||||
|
||||
Root cause: patch 0022 applied the (16,8) tile unconditionally. This patch
|
||||
sequences after 0022/0044 (it edits the same gated_delta_net.cu case-128
|
||||
dispatch) and adds only the scan-length gate.
|
||||
|
||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
||||
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
|
||||
|
||||
diff --git a/ggml/src/ggml-cuda/gated_delta_net.cu b/ggml/src/ggml-cuda/gated_delta_net.cu
|
||||
index 7121d807f..26667afa2 100644
|
||||
--- a/ggml/src/ggml-cuda/gated_delta_net.cu
|
||||
+++ b/ggml/src/ggml-cuda/gated_delta_net.cu
|
||||
@@ -550,6 +550,23 @@ static void launch_gated_delta_net(
|
||||
launch_gdn_variant<64, KDA, keep_rs_t, 4, 1, 2>(GDN_LAUNCH_ARGS);
|
||||
break;
|
||||
case 128: {
|
||||
+ // Dense-prefill regression fix: gate patch 0022's column-fold geometry by per-call scan
|
||||
+ // length. The (16,8) tile is a DECODE win (short scans: n_tokens small, n_seqs large) but a
|
||||
+ // long-sequential-scan PREFILL loss - grid.z collapses from S_v/4=32 to S_v/(16*8)=1, so the
|
||||
+ // SMs starve on the long scan (profiled: gated_delta_net +54% GPU time == the whole dense-
|
||||
+ // prefill regression). Long scans (prefill) take stock's high-grid.z (4,1) geometry; short
|
||||
+ // scans (decode) keep the (16,8) winner. Every {NW,CPW} variant is byte-identical (patch 0022
|
||||
+ // proved md5-invariance across the ladder), so this stays greedy-md5 bit-exact. Default-on;
|
||||
+ // GDN_PREFILL_NTOK tunes the crossover; the explicit GDN_NW/GDN_CPW sweep still wins (gate
|
||||
+ // yields when either is set) so the one-build %peak A/B harness is unchanged.
|
||||
+ static const int64_t gdn_prefill_ntok =
|
||||
+ []{ const char * e = getenv("GDN_PREFILL_NTOK"); return e ? (int64_t) atoll(e) : (int64_t) 256; }();
|
||||
+ static const bool gdn_nw_forced = (getenv("GDN_NW") != nullptr);
|
||||
+ static const bool gdn_cpw_forced = (getenv("GDN_CPW") != nullptr);
|
||||
+ if (n_tokens >= gdn_prefill_ntok && !gdn_nw_forced && !gdn_cpw_forced) {
|
||||
+ launch_gdn_variant<128, KDA, keep_rs_t, 4, 1, 2>(GDN_LAUNCH_ARGS);
|
||||
+ break;
|
||||
+ }
|
||||
// Bit-exact occupancy/coalescing retune (patch 0022): fold COLS_PER_WARP columns per warp
|
||||
// to raise per-warp memory-level parallelism on this bandwidth-bound recurrence. Default is
|
||||
// the measured winner; GDN_NW / GDN_CPW override it for the one-build %peak sweep (every
|
||||
--
|
||||
2.43.0
|
||||
|
||||
@@ -0,0 +1,713 @@
|
||||
From 2c32ab8b7a6c5bc90454881b8c10f8bad4f7cee0 Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Tue, 30 Jun 2026 09:45:13 +0200
|
||||
Subject: [PATCH] feat(paged): GDN M5 tensor-core chunked-scan prefill,
|
||||
f32-only re-port (was patch 0044)
|
||||
|
||||
Re-port the M5 tensor-core chunked gated-DeltaNet (GDN) prefill kernel from the
|
||||
bf16/hybrid dev tree as an f32-only native commit, recovering the prefill win
|
||||
that patch 0044 encoded, on the f32-only series (0026 ssm_bf16_tau dropped).
|
||||
|
||||
What landed (f32/tf32 only):
|
||||
- The mma.sync m16n8k8 helpers (tf32 + 3xtf32 limb-split; decays/gamma/beta stay
|
||||
f32 outside the mma to preserve the bounded de-gating).
|
||||
- gated_delta_net_chunked_cuda<S_v, C, TC>: the full tensor-core chunked scan,
|
||||
KK/QK Gram (M2), KS/QS state-boundary 3xtf32 (M3), P*U output (M4), and the
|
||||
form-T (A^-1) solve + Kc^T*DU state-update (M5). Selected by GDN_TC (0=serial
|
||||
.. 4/5+=M5); the C=16 chunk-state stays in the 64KB smem buffer.
|
||||
- Default-on under paged KV: GDN_TC=5, GDN_CHUNK_MIN=64 when LLAMA_KV_PAGED is set
|
||||
and the user has not overridden either; OFF (INT_MAX) otherwise so the stock /
|
||||
non-paged default is regression-free. GDN_CHUNK_MIN must stay > 1 (decode is 1
|
||||
token/call; at 1 the chunked path swallows decode and collapses S_TG).
|
||||
|
||||
Stripped (not part of the f32-only series): the STATE_BF16 / HYBRID / gdn_state_t
|
||||
/ gdn_hybrid_args template machinery (from dropped patch 0026), and the bf16
|
||||
CONFIG-C (M8) plus register-resident M6/M7 occupancy variants. The 0046 dense-
|
||||
prefill geometry gate is untouched and coexists (it gates the SERIAL path; M5 is
|
||||
the chunked path).
|
||||
|
||||
Gates (GB10, sm_121a):
|
||||
- Builds clean.
|
||||
- Greedy md5 bit-exact (per-path, n=48 --temp 0 --seed 1, paged): dense
|
||||
q36-27b-nvfp4 = 5951a5b4d624ce891e22ab5fca9bc439, MoE q36-35b-a3b-nvfp4 =
|
||||
8cb0ce23777bf55f92f63d0292c756b0, both default AND force-M5 (GDN_CHUNK_MIN=1).
|
||||
test-backend-ops GATED_DELTA_NET 46/46 default and force-M5 (incl. the
|
||||
multi-chunk, tail-chunk and multi-seq shapes).
|
||||
- Prefill S_PP, MoE, LLAMA_KV_PAGED=1 LLAMA_MOE_FORCE_GRAPHS=1, -ntg 4 -npl 32,
|
||||
vs the patch-0044 baseline (pre-0046, GDN_PREFILL_NTOK huge): +4.3% @512,
|
||||
+17.8% @2048 (reproduces patch 0044; M5-on absolute matches patch 0044 M5).
|
||||
vs the current 0046 baseline (0046 already raised the long-scan sequential
|
||||
prefill): +4.3% @512, +1.2% @2048.
|
||||
- Decode S_TG unchanged (within run noise).
|
||||
|
||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
||||
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
|
||||
---
|
||||
ggml/src/ggml-cuda/gated_delta_net.cu | 550 +++++++++++++++++++++++---
|
||||
tests/test-backend-ops.cpp | 5 +
|
||||
2 files changed, 496 insertions(+), 59 deletions(-)
|
||||
|
||||
diff --git a/ggml/src/ggml-cuda/gated_delta_net.cu b/ggml/src/ggml-cuda/gated_delta_net.cu
|
||||
index 26667afa2..0ceb1bc8f 100644
|
||||
--- a/ggml/src/ggml-cuda/gated_delta_net.cu
|
||||
+++ b/ggml/src/ggml-cuda/gated_delta_net.cu
|
||||
@@ -298,7 +298,115 @@ static void launch_gdn_variant(
|
||||
// strong-decay tokens underflow to the correct zero rather than to inf. The math
|
||||
// is equivalent to the sequential recurrence up to FP reduction order (a NEW
|
||||
// per-path result, validated benign by test-backend-ops NMSE and greedy output).
|
||||
-template <int S_v, int C>
|
||||
+// --- Phase-1 tensor-core Gram helpers (tf32 m16n8k8 mma.sync; sm_80+/sm_121a). ---
|
||||
+// Reproduces the PoC-proven path (~/scratch_tc_gdn_poc/gdn_gram_bench.cu, tf32 NMSE ~3e-9):
|
||||
+// out[rowbase..+15][colbase..+7] = Xs[rows] . Ys[cols], Xs/Ys row-major [*][DK].
|
||||
+__device__ __forceinline__ unsigned gdn_f2tf32(float f) {
|
||||
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
|
||||
+ unsigned r;
|
||||
+ asm("cvt.rna.tf32.f32 %0, %1;" : "=r"(r) : "f"(f));
|
||||
+ return r;
|
||||
+#else
|
||||
+ (void) f;
|
||||
+ return 0u;
|
||||
+#endif
|
||||
+}
|
||||
+
|
||||
+// Operand loaders for the Gram/state mma helpers: stage f32 operands as tf32. This f32-only
|
||||
+// re-port keeps every operand full-width -- the plain-tf32 path (10-bit mantissa, f32 accumulate)
|
||||
+// is the highest-precision tensor-core option on sm_121a, and the 3xtf32 limb-split helpers below
|
||||
+// recover near-f32 accuracy for the decay-coupled state-boundary (KS/QS) and state-carry products
|
||||
+// whose error feeds the A-inverse solve / compounds across chunks.
|
||||
+__device__ __forceinline__ unsigned gdn_ld_tf32(float f) { return gdn_f2tf32(f); }
|
||||
+__device__ __forceinline__ float gdn_ld_f32 (float f) { return f; }
|
||||
+
|
||||
+__device__ __forceinline__ void gdn_mma_m16n8k8(float c[4], const unsigned a[4], const unsigned b[2]) {
|
||||
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
|
||||
+ asm volatile(
|
||||
+ "mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 "
|
||||
+ "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%0,%1,%2,%3};\n"
|
||||
+ : "+f"(c[0]), "+f"(c[1]), "+f"(c[2]), "+f"(c[3])
|
||||
+ : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]));
|
||||
+#else
|
||||
+ (void) c; (void) a; (void) b;
|
||||
+#endif
|
||||
+}
|
||||
+
|
||||
+template <int DK, typename TX = float, typename TY = float>
|
||||
+__device__ __forceinline__ void gdn_gram_tile_mma(
|
||||
+ float c[4], const TX * __restrict__ Xs, const TY * __restrict__ Ys,
|
||||
+ int rowbase, int colbase, int lg, int lt) {
|
||||
+ c[0] = c[1] = c[2] = c[3] = 0.0f;
|
||||
+ #pragma unroll
|
||||
+ for (int ks = 0; ks < DK; ks += 8) {
|
||||
+ unsigned a[4], b[2];
|
||||
+ a[0] = gdn_ld_tf32(Xs[(rowbase + lg ) * DK + ks + lt ]);
|
||||
+ a[1] = gdn_ld_tf32(Xs[(rowbase + lg + 8) * DK + ks + lt ]);
|
||||
+ a[2] = gdn_ld_tf32(Xs[(rowbase + lg ) * DK + ks + lt + 4]);
|
||||
+ a[3] = gdn_ld_tf32(Xs[(rowbase + lg + 8) * DK + ks + lt + 4]);
|
||||
+ b[0] = gdn_ld_tf32(Ys[(colbase + lg ) * DK + ks + lt ]);
|
||||
+ b[1] = gdn_ld_tf32(Ys[(colbase + lg ) * DK + ks + lt + 4]);
|
||||
+ gdn_mma_m16n8k8(c, a, b);
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+// 3xtf32 (CUTLASS fp32-emulation): split each f32 operand into hi/lo tf32 limbs and run
|
||||
+// 3 limb-products per k-subtile (hi*hi + hi*lo + lo*hi); ~f32 accuracy at ~3x the mma count.
|
||||
+// Used for the state-boundary products (KS/QS) whose error feeds the A-inverse solve (M3).
|
||||
+template <int DK, typename TX = float, typename TY = float>
|
||||
+__device__ __forceinline__ void gdn_gram_tile_mma_3x(
|
||||
+ float c[4], const TX * __restrict__ Xs, const TY * __restrict__ Ys,
|
||||
+ int rowbase, int colbase, int lg, int lt) {
|
||||
+ c[0] = c[1] = c[2] = c[3] = 0.0f;
|
||||
+ #pragma unroll
|
||||
+ for (int ks = 0; ks < DK; ks += 8) {
|
||||
+ float af[4], bf[2];
|
||||
+ af[0] = gdn_ld_f32(Xs[(rowbase + lg ) * DK + ks + lt ]);
|
||||
+ af[1] = gdn_ld_f32(Xs[(rowbase + lg + 8) * DK + ks + lt ]);
|
||||
+ af[2] = gdn_ld_f32(Xs[(rowbase + lg ) * DK + ks + lt + 4]);
|
||||
+ af[3] = gdn_ld_f32(Xs[(rowbase + lg + 8) * DK + ks + lt + 4]);
|
||||
+ bf[0] = gdn_ld_f32(Ys[(colbase + lg ) * DK + ks + lt ]);
|
||||
+ bf[1] = gdn_ld_f32(Ys[(colbase + lg ) * DK + ks + lt + 4]);
|
||||
+ unsigned ahi[4], alo[4], bhi[2], blo[2];
|
||||
+ #pragma unroll
|
||||
+ for (int z = 0; z < 4; z++) { ahi[z] = gdn_f2tf32(af[z]); alo[z] = gdn_f2tf32(af[z] - __uint_as_float(ahi[z])); }
|
||||
+ #pragma unroll
|
||||
+ for (int z = 0; z < 2; z++) { bhi[z] = gdn_f2tf32(bf[z]); blo[z] = gdn_f2tf32(bf[z] - __uint_as_float(bhi[z])); }
|
||||
+ gdn_mma_m16n8k8(c, ahi, bhi); // hi*hi (dominant limb)
|
||||
+ gdn_mma_m16n8k8(c, ahi, blo); // hi*lo
|
||||
+ gdn_mma_m16n8k8(c, alo, bhi); // lo*hi
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+// State-update tile (P6): c = sum_t Kc[t][i] * DU[t][j] (c is zeroed here; the caller folds it into S_C[i][j]), with Kc read TRANSPOSED
|
||||
+// (i as the m16n8k8 M-row, t as the K-contraction) and DU = d(t,last)*U staged in the Ud
|
||||
+// layout (DUd[j*KC + t]). 3xtf32: the cross-chunk carry compounds over every chunk step.
|
||||
+template <int KC, int DK, typename TK = float, typename TD = float>
|
||||
+__device__ __forceinline__ void gdn_state_tile_mma_3x(
|
||||
+ float c[4], const TK * __restrict__ Kc, const TD * __restrict__ DUd,
|
||||
+ int rowbase, int colbase, int lg, int lt) {
|
||||
+ c[0] = c[1] = c[2] = c[3] = 0.0f;
|
||||
+ #pragma unroll
|
||||
+ for (int ks = 0; ks < KC; ks += 8) {
|
||||
+ float af[4], bf[2];
|
||||
+ af[0] = gdn_ld_f32(Kc[(ks + lt ) * DK + (rowbase + lg )]);
|
||||
+ af[1] = gdn_ld_f32(Kc[(ks + lt ) * DK + (rowbase + lg + 8)]);
|
||||
+ af[2] = gdn_ld_f32(Kc[(ks + lt + 4) * DK + (rowbase + lg )]);
|
||||
+ af[3] = gdn_ld_f32(Kc[(ks + lt + 4) * DK + (rowbase + lg + 8)]);
|
||||
+ bf[0] = gdn_ld_f32(DUd[(colbase + lg) * KC + (ks + lt )]);
|
||||
+ bf[1] = gdn_ld_f32(DUd[(colbase + lg) * KC + (ks + lt + 4)]);
|
||||
+ unsigned ahi[4], alo[4], bhi[2], blo[2];
|
||||
+ #pragma unroll
|
||||
+ for (int z = 0; z < 4; z++) { ahi[z] = gdn_f2tf32(af[z]); alo[z] = gdn_f2tf32(af[z] - __uint_as_float(ahi[z])); }
|
||||
+ #pragma unroll
|
||||
+ for (int z = 0; z < 2; z++) { bhi[z] = gdn_f2tf32(bf[z]); blo[z] = gdn_f2tf32(bf[z] - __uint_as_float(bhi[z])); }
|
||||
+ gdn_mma_m16n8k8(c, ahi, bhi);
|
||||
+ gdn_mma_m16n8k8(c, ahi, blo);
|
||||
+ gdn_mma_m16n8k8(c, alo, bhi);
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+template <int S_v, int C, int TC = 0>
|
||||
__global__ void gated_delta_net_chunked_cuda(
|
||||
const float * __restrict__ q, const float * __restrict__ k,
|
||||
const float * __restrict__ v, const float * __restrict__ g,
|
||||
@@ -329,6 +437,9 @@ __global__ void gated_delta_net_chunked_cuda(
|
||||
float * csh = Amat + (size_t) C * C; // [C] cumsum(log-gate)
|
||||
float * gam = csh + C; // [C] gamma_t = exp(cs_t)
|
||||
float * bet = gam + C; // [C] beta_t
|
||||
+ // Phase-1 tensor-core Gram scratch (allocated only when GRAM_MMA, i.e. presumably the TC >= 1 levels -- TODO confirm against the launcher; KK feeds A, QK feeds P).
|
||||
+ float * KKsh = bet + C; // [C*C] KK[t][t'] = k_t . k_t' (stride C)
|
||||
+ float * QKsh = KKsh + (size_t) C * C; // [C*C] QK[t][t'] = q_t . k_t' (stride C)
|
||||
|
||||
// S0: thread j owns column j (Sd[j*dk + i]); load is a contiguous per-thread copy from the
|
||||
// M-layout cache view (read_state[j*dk + i] = M[j*S_v + i] = S[i][j]). Same identity/gather
|
||||
@@ -357,6 +468,15 @@ __global__ void gated_delta_net_chunked_cuda(
|
||||
Kc[t * dk + i] = k_base[(c0 + t) * sq2 + i];
|
||||
Qc[t * dk + i] = q_base[(c0 + t) * sq2 + i];
|
||||
}
|
||||
+ if constexpr (TC >= 3) {
|
||||
+ // Zero the stale K/Q tail (rows t >= Cc): the tensor-core mma paths contract the full
|
||||
+ // chunk dim and 0*NaN (uninitialized smem) would poison the result. Serial paths only
|
||||
+ // touch t < Cc, so this is gated to the mma levels.
|
||||
+ for (int e = Cc * dk + j; e < C * dk; e += dv) {
|
||||
+ Kc[e] = 0.0f;
|
||||
+ Qc[e] = 0.0f;
|
||||
+ }
|
||||
+ }
|
||||
if (j < Cc) {
|
||||
csh[j] = g[gb_base + (c0 + j) * sb2]; // raw log-gate, prefix-summed below
|
||||
bet[j] = beta[gb_base + (c0 + j) * sb2];
|
||||
@@ -372,15 +492,53 @@ __global__ void gated_delta_net_chunked_cuda(
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
+ // --- Phase-1: tensor-core tf32 Gram products (KK->A via warp0, QK->P via warp1). ---
|
||||
+ // Full C x C tiles into KKsh/QKsh (stride C); decay/beta applied in f32 in the loops below.
|
||||
+ // Tail chunks (Cc<C) compute stale rows >= Cc, but those entries are never read.
|
||||
+ if constexpr (TC >= 1) {
|
||||
+ const int w = threadIdx.x >> 5; // warp: 0 -> KK, 1 -> QK
|
||||
+ const int lane = threadIdx.x & 31;
|
||||
+ const int lg = lane >> 2; // 0..7
|
||||
+ const int lt = lane & 3; // 0..3
|
||||
+ if (w < 2) {
|
||||
+ const float * Xs = (w == 0) ? Kc : Qc;
|
||||
+ float * Out = (w == 0) ? KKsh : QKsh;
|
||||
+ #pragma unroll
|
||||
+ for (int mt = 0; mt < (C + 15) / 16; mt++) {
|
||||
+ const int rowbase = mt * 16;
|
||||
+ #pragma unroll
|
||||
+ for (int nt = 0; nt < (C + 7) / 8; nt++) {
|
||||
+ const int colbase = nt * 8;
|
||||
+ float cc[4];
|
||||
+ gdn_gram_tile_mma<dk>(cc, Xs, Kc, rowbase, colbase, lg, lt);
|
||||
+ const int rr[4] = {rowbase + lg, rowbase + lg, rowbase + lg + 8, rowbase + lg + 8};
|
||||
+ const int ccol[4] = {colbase + 2*lt, colbase + 2*lt + 1, colbase + 2*lt, colbase + 2*lt + 1};
|
||||
+ #pragma unroll
|
||||
+ for (int l = 0; l < 4; l++) {
|
||||
+ if (rr[l] < C && ccol[l] < C) {
|
||||
+ Out[rr[l] * C + ccol[l]] = cc[l];
|
||||
+ }
|
||||
+ }
|
||||
+ }
|
||||
+ }
|
||||
+ }
|
||||
+ __syncthreads();
|
||||
+ }
|
||||
+
|
||||
// --- A = I + tril(beta_t * d(t',t) * (k_t . k_t'), -1) (cooperative over C*C) ---
|
||||
for (int e = j; e < Cc * Cc; e += dv) {
|
||||
const int t = e / Cc;
|
||||
const int tp = e % Cc;
|
||||
float a = 0.0f;
|
||||
if (tp < t) {
|
||||
- float kk = 0.0f;
|
||||
- for (int i = 0; i < dk; i++) {
|
||||
- kk += Kc[t * dk + i] * Kc[tp * dk + i];
|
||||
+ float kk;
|
||||
+ if constexpr (TC >= 1) {
|
||||
+ kk = KKsh[t * C + tp];
|
||||
+ } else {
|
||||
+ kk = 0.0f;
|
||||
+ for (int i = 0; i < dk; i++) {
|
||||
+ kk += Kc[t * dk + i] * Kc[tp * dk + i];
|
||||
+ }
|
||||
}
|
||||
const float dd = expf(csh[t] - csh[tp]); // d(tp,t) = gamma_t/gamma_tp
|
||||
a = bet[t] * dd * kk;
|
||||
@@ -392,65 +550,304 @@ __global__ void gated_delta_net_chunked_cuda(
|
||||
__syncthreads();
|
||||
|
||||
// --- RHS[t][j] = beta_t (v_t[j] - gamma_t * (S0^T k_t)[j]) -> Ud[j*C + t] ---
|
||||
- for (int t = 0; t < Cc; t++) {
|
||||
- float ks = 0.0f; // (S0^T k_t)[j] = sum_i S[i][j] k_t[i]
|
||||
- for (int i = 0; i < dk; i++) {
|
||||
- ks += Sd[j * dk + i] * Kc[t * dk + i];
|
||||
+ if constexpr (TC >= 2) {
|
||||
+ // M3: fused tensor-core KS = Kc * S0 (3xtf32 state-boundary product). The mma
|
||||
+ // output is consumed straight from registers into RHS -> Ud, so NO extra C*dv
|
||||
+ // smem buffer is needed (the 64KB state still occupies smem until M6). Warp w
|
||||
+ // owns dv n-tiles [w*NTPW, ..); each lane writes the RHS entries it produced.
|
||||
+ const int w = threadIdx.x >> 5;
|
||||
+ const int lane = threadIdx.x & 31;
|
||||
+ const int lg = lane >> 2;
|
||||
+ const int lt = lane & 3;
|
||||
+ constexpr int NWARP = S_v / 32;
|
||||
+ constexpr int NT = dv / 8;
|
||||
+ constexpr int NTPW = (NT + NWARP - 1) / NWARP;
|
||||
+ #pragma unroll
|
||||
+ for (int mt = 0; mt < (C + 15) / 16; mt++) {
|
||||
+ const int rowbase = mt * 16;
|
||||
+ #pragma unroll
|
||||
+ for (int nn = 0; nn < NTPW; nn++) {
|
||||
+ const int nt = w * NTPW + nn;
|
||||
+ if (nt >= NT) break;
|
||||
+ const int colbase = nt * 8;
|
||||
+ float cc[4];
|
||||
+ gdn_gram_tile_mma_3x<dk>(cc, Kc, Sd, rowbase, colbase, lg, lt);
|
||||
+ const int tt[4] = {rowbase + lg, rowbase + lg, rowbase + lg + 8, rowbase + lg + 8};
|
||||
+ const int jj[4] = {colbase + 2*lt, colbase + 2*lt + 1, colbase + 2*lt, colbase + 2*lt + 1};
|
||||
+ #pragma unroll
|
||||
+ for (int l = 0; l < 4; l++) {
|
||||
+ const int t = tt[l], jc = jj[l];
|
||||
+ if (t < Cc && jc < dv) {
|
||||
+ const float vtj = v_base[(c0 + t) * sv2 + jc];
|
||||
+ Ud[jc * C + t] = bet[t] * (vtj - gam[t] * cc[l]);
|
||||
+ }
|
||||
+ }
|
||||
+ }
|
||||
+ }
|
||||
+ __syncthreads(); // RHS written cross-thread -> publish before the per-column solve
|
||||
+ } else {
|
||||
+ for (int t = 0; t < Cc; t++) {
|
||||
+ float ks = 0.0f; // (S0^T k_t)[j] = sum_i S[i][j] k_t[i]
|
||||
+ for (int i = 0; i < dk; i++) {
|
||||
+ ks += Sd[j * dk + i] * Kc[t * dk + i];
|
||||
+ }
|
||||
+ const float vtj = v_base[(c0 + t) * sv2 + j];
|
||||
+ Ud[j * C + t] = bet[t] * (vtj - gam[t] * ks);
|
||||
}
|
||||
- const float vtj = v_base[(c0 + t) * sv2 + j];
|
||||
- Ud[j * C + t] = bet[t] * (vtj - gam[t] * ks);
|
||||
+ }
|
||||
+ if constexpr (TC >= 3) {
|
||||
+ // Zero the stale RHS tail (rows t >= Cc) before the full-K mma consumers (P*U at TC>=3;
|
||||
+ // apply + state at TC>=4). Without this the masked tail terms compute 0*NaN = NaN.
|
||||
+ for (int t = Cc; t < C; t++) Ud[j * C + t] = 0.0f;
|
||||
+ __syncthreads();
|
||||
}
|
||||
|
||||
- // --- solve A U = RHS in place (unit lower-tri fwd subst); per-thread, no inter-step sync ---
|
||||
- for (int t = 1; t < Cc; t++) {
|
||||
- float acc = Ud[j * C + t];
|
||||
- for (int tp = 0; tp < t; tp++) {
|
||||
- acc -= Amat[t * Cc + tp] * Ud[j * C + tp];
|
||||
+ // --- solve A U = RHS (A unit-lower-tri) ---
|
||||
+ if constexpr (TC >= 4) {
|
||||
+ // M5/P7: form T = A^{-1} explicitly (FLA UT transform), then U = T*RHS as one
|
||||
+ // dependency-free tf32 GEMM. At C<=16 A is a single b=16 block, so the off-diagonal
|
||||
+ // Phase-O is empty; only the f32 in-shared diagonal inverse (Phase-D) + the wide
|
||||
+ // apply remain. Phase-D: column-parallel EXACT f32 inverse of the Cc x Cc unit-
|
||||
+ // lower-tri A -- thread c solves A x = e_c, writing column c of T into KKsh (free
|
||||
+ // since KK was consumed into A). This is the strong-coupling amplifier -> f32.
|
||||
+ if (j < C) {
|
||||
+ if (j < Cc) {
|
||||
+ float x[C];
|
||||
+ #pragma unroll
|
||||
+ for (int r = 0; r < C; r++) x[r] = 0.0f;
|
||||
+ x[j] = 1.0f;
|
||||
+ for (int r = j + 1; r < Cc; r++) {
|
||||
+ float acc = 0.0f;
|
||||
+ for (int m = j; m < r; m++) acc += Amat[r * Cc + m] * x[m];
|
||||
+ x[r] = -acc;
|
||||
+ }
|
||||
+ #pragma unroll
|
||||
+ for (int r = 0; r < C; r++) KKsh[r * C + j] = x[r]; // rows >= Cc are 0
|
||||
+ } else {
|
||||
+ #pragma unroll
|
||||
+ for (int r = 0; r < C; r++) KKsh[r * C + j] = 0.0f; // cols >= Cc are 0
|
||||
+ }
|
||||
+ }
|
||||
+ __syncthreads();
|
||||
+ // Apply U = T*RHS, M=C N=dv K=C; T=KKsh (stride C), RHS=Ud (stride C). In place on
|
||||
+ // Ud: hold every output tile in registers, sync to finish the RHS reads, then
|
||||
+ // overwrite Ud with U (avoids the read/write aliasing of a same-buffer GEMM).
|
||||
+ {
|
||||
+ const int w = threadIdx.x >> 5;
|
||||
+ const int lane = threadIdx.x & 31;
|
||||
+ const int lg = lane >> 2;
|
||||
+ const int lt = lane & 3;
|
||||
+ constexpr int NWARP = S_v / 32;
|
||||
+ constexpr int NT = dv / 8;
|
||||
+ constexpr int NTPW = (NT + NWARP - 1) / NWARP;
|
||||
+ float ureg[NTPW][4];
|
||||
+ #pragma unroll
|
||||
+ for (int nn = 0; nn < NTPW; nn++) {
|
||||
+ const int nt = w * NTPW + nn;
|
||||
+ if (nt < NT) gdn_gram_tile_mma<C>(ureg[nn], KKsh, Ud, 0, nt * 8, lg, lt);
|
||||
+ }
|
||||
+ __syncthreads(); // all RHS(Ud) reads done before overwriting with U
|
||||
+ #pragma unroll
|
||||
+ for (int nn = 0; nn < NTPW; nn++) {
|
||||
+ const int nt = w * NTPW + nn;
|
||||
+ if (nt >= NT) continue;
|
||||
+ const int colbase = nt * 8;
|
||||
+ const int tt[4] = {lg, lg, lg + 8, lg + 8};
|
||||
+ const int jj[4] = {colbase + 2*lt, colbase + 2*lt + 1, colbase + 2*lt, colbase + 2*lt + 1};
|
||||
+ #pragma unroll
|
||||
+ for (int l = 0; l < 4; l++) {
|
||||
+ const int t = tt[l], jc = jj[l];
|
||||
+ if (t < Cc && jc < dv) Ud[jc * C + t] = ureg[nn][l];
|
||||
+ }
|
||||
+ }
|
||||
+ __syncthreads();
|
||||
}
|
||||
- Ud[j * C + t] = acc;
|
||||
+ } else {
|
||||
+ for (int t = 1; t < Cc; t++) {
|
||||
+ float acc = Ud[j * C + t];
|
||||
+ for (int tp = 0; tp < t; tp++) {
|
||||
+ acc -= Amat[t * Cc + tp] * Ud[j * C + tp];
|
||||
+ }
|
||||
+ Ud[j * C + t] = acc;
|
||||
+ }
|
||||
+ __syncthreads(); // U finalized; Amat free for P below
|
||||
}
|
||||
- __syncthreads(); // U finalized; Amat free for P below (and Ud read across-thread? no, own col)
|
||||
|
||||
- // --- P[t][t'] = d(t',t) * (q_t . k_t') for t' <= t (reuse Amat) ---
|
||||
- for (int e = j; e < Cc * Cc; e += dv) {
|
||||
- const int t = e / Cc;
|
||||
- const int tp = e % Cc;
|
||||
- float p = 0.0f;
|
||||
- if (tp <= t) {
|
||||
- float qk = 0.0f;
|
||||
- for (int i = 0; i < dk; i++) {
|
||||
- qk += Qc[t * dk + i] * Kc[tp * dk + i];
|
||||
+ // --- P[t][t'] = d(t',t) * (q_t . k_t') for t' <= t ---
|
||||
+ if constexpr (TC >= 3) {
|
||||
+ // M4: build P (lower-tri, decay pre-baked in f32 -> bounded) IN PLACE in QKsh at
|
||||
+ // fixed stride C so the P*U output mma can read it as a tf32 A-operand. Full C*C
|
||||
+ // grid: upper-tri / out-of-range entries are zeroed so the K=C mma needs no masking.
|
||||
+ for (int e = j; e < C * C; e += dv) {
|
||||
+ const int t = e / C;
|
||||
+ const int tp = e % C;
|
||||
+ float p = 0.0f;
|
||||
+ if (tp <= t && t < Cc && tp < Cc) {
|
||||
+ const float dd = expf(csh[t] - csh[tp]);
|
||||
+ p = dd * QKsh[t * C + tp]; // QKsh holds QK (M2); overwrite in place with P
|
||||
+ }
|
||||
+ QKsh[t * C + tp] = p;
|
||||
+ }
|
||||
+ __syncthreads();
|
||||
+ } else {
|
||||
+ for (int e = j; e < Cc * Cc; e += dv) {
|
||||
+ const int t = e / Cc;
|
||||
+ const int tp = e % Cc;
|
||||
+ float p = 0.0f;
|
||||
+ if (tp <= t) {
|
||||
+ float qk;
|
||||
+ if constexpr (TC >= 1) {
|
||||
+ qk = QKsh[t * C + tp];
|
||||
+ } else {
|
||||
+ qk = 0.0f;
|
||||
+ for (int i = 0; i < dk; i++) {
|
||||
+ qk += Qc[t * dk + i] * Kc[tp * dk + i];
|
||||
+ }
|
||||
+ }
|
||||
+ const float dd = expf(csh[t] - csh[tp]);
|
||||
+ p = dd * qk;
|
||||
}
|
||||
- const float dd = expf(csh[t] - csh[tp]);
|
||||
- p = dd * qk;
|
||||
+ Amat[t * Cc + tp] = p;
|
||||
}
|
||||
- Amat[t * Cc + tp] = p;
|
||||
+ __syncthreads();
|
||||
}
|
||||
- __syncthreads();
|
||||
|
||||
// --- O[t][j] = gamma_t (S0^T q_t)[j] + sum_{t'<=t} P[t][t'] U[t'][j] (* scale) ---
|
||||
- for (int t = 0; t < Cc; t++) {
|
||||
- float qs = 0.0f; // (S0^T q_t)[j] (uses pre-update S)
|
||||
- for (int i = 0; i < dk; i++) {
|
||||
- qs += Sd[j * dk + i] * Qc[t * dk + i];
|
||||
+ if constexpr (TC >= 2) {
|
||||
+ // M3: fused tensor-core QS = Qc * S0 (3xtf32, pre-update S0). Deposit the
|
||||
+ // gamma_t*QS[t][j] cross-chunk term into dst from the mma registers; the O loop
|
||||
+ // below reads it back (published via __syncthreads) and adds the intra-chunk P*U.
|
||||
+ const int w = threadIdx.x >> 5;
|
||||
+ const int lane = threadIdx.x & 31;
|
||||
+ const int lg = lane >> 2;
|
||||
+ const int lt = lane & 3;
|
||||
+ constexpr int NWARP = S_v / 32;
|
||||
+ constexpr int NT = dv / 8;
|
||||
+ constexpr int NTPW = (NT + NWARP - 1) / NWARP;
|
||||
+ #pragma unroll
|
||||
+ for (int mt = 0; mt < (C + 15) / 16; mt++) {
|
||||
+ const int rowbase = mt * 16;
|
||||
+ #pragma unroll
|
||||
+ for (int nn = 0; nn < NTPW; nn++) {
|
||||
+ const int nt = w * NTPW + nn;
|
||||
+ if (nt >= NT) break;
|
||||
+ const int colbase = nt * 8;
|
||||
+ float cc[4];
|
||||
+ gdn_gram_tile_mma_3x<dk>(cc, Qc, Sd, rowbase, colbase, lg, lt);
|
||||
+ const int tt[4] = {rowbase + lg, rowbase + lg, rowbase + lg + 8, rowbase + lg + 8};
|
||||
+ const int jj[4] = {colbase + 2*lt, colbase + 2*lt + 1, colbase + 2*lt, colbase + 2*lt + 1};
|
||||
+ #pragma unroll
|
||||
+ for (int l = 0; l < 4; l++) {
|
||||
+ const int t = tt[l], jc = jj[l];
|
||||
+ if (t < Cc && jc < dv) {
|
||||
+ attn_base[(c0 + t) * S_v * H + jc] = gam[t] * cc[l];
|
||||
+ }
|
||||
+ }
|
||||
+ }
|
||||
}
|
||||
- float o = gam[t] * qs;
|
||||
- for (int tp = 0; tp <= t; tp++) {
|
||||
- o += Amat[t * Cc + tp] * Ud[j * C + tp];
|
||||
+ __syncthreads();
|
||||
+ }
|
||||
+ if constexpr (TC >= 3) {
|
||||
+ // M4: O += P*U via tensor-core (tf32-safe: P is f32-bounded, decay pre-baked).
|
||||
+ // GEMM O[t][j] += sum_t' P[t][t']*U[t'][j], M=C N=dv K=C; P=QKsh (stride C),
|
||||
+ // U=Ud (stride C). The gamma_t*QS cross-chunk term was deposited into dst above;
|
||||
+ // fold it in here then * scale. Warp w owns dv n-tiles [w*NTPW, ..).
|
||||
+ const int w = threadIdx.x >> 5;
|
||||
+ const int lane = threadIdx.x & 31;
|
||||
+ const int lg = lane >> 2;
|
||||
+ const int lt = lane & 3;
|
||||
+ constexpr int NWARP = S_v / 32;
|
||||
+ constexpr int NT = dv / 8;
|
||||
+ constexpr int NTPW = (NT + NWARP - 1) / NWARP;
|
||||
+ #pragma unroll
|
||||
+ for (int mt = 0; mt < (C + 15) / 16; mt++) {
|
||||
+ const int rowbase = mt * 16;
|
||||
+ #pragma unroll
|
||||
+ for (int nn = 0; nn < NTPW; nn++) {
|
||||
+ const int nt = w * NTPW + nn;
|
||||
+ if (nt >= NT) break;
|
||||
+ const int colbase = nt * 8;
|
||||
+ float cc[4];
|
||||
+ gdn_gram_tile_mma<C>(cc, QKsh, Ud, rowbase, colbase, lg, lt);
|
||||
+ const int tt[4] = {rowbase + lg, rowbase + lg, rowbase + lg + 8, rowbase + lg + 8};
|
||||
+ const int jj[4] = {colbase + 2*lt, colbase + 2*lt + 1, colbase + 2*lt, colbase + 2*lt + 1};
|
||||
+ #pragma unroll
|
||||
+ for (int l = 0; l < 4; l++) {
|
||||
+ const int t = tt[l], jc = jj[l];
|
||||
+ if (t < Cc && jc < dv) {
|
||||
+ const int64_t oi = (int64_t)(c0 + t) * S_v * H + jc;
|
||||
+ attn_base[oi] = (attn_base[oi] + cc[l]) * scale; // QS term + P*U
|
||||
+ }
|
||||
+ }
|
||||
+ }
|
||||
+ }
|
||||
+ } else {
|
||||
+ for (int t = 0; t < Cc; t++) {
|
||||
+ float o;
|
||||
+ if constexpr (TC >= 2) {
|
||||
+ o = attn_base[(c0 + t) * S_v * H + j]; // gamma_t*QS[t][j] deposited above
|
||||
+ } else {
|
||||
+ float qs = 0.0f; // (S0^T q_t)[j] (uses pre-update S)
|
||||
+ for (int i = 0; i < dk; i++) {
|
||||
+ qs += Sd[j * dk + i] * Qc[t * dk + i];
|
||||
+ }
|
||||
+ o = gam[t] * qs;
|
||||
+ }
|
||||
+ for (int tp = 0; tp <= t; tp++) {
|
||||
+ o += Amat[t * Cc + tp] * Ud[j * C + tp];
|
||||
+ }
|
||||
+ attn_base[(c0 + t) * S_v * H + j] = o * scale;
|
||||
}
|
||||
- attn_base[(c0 + t) * S_v * H + j] = o * scale;
|
||||
}
|
||||
|
||||
// --- S_C[i][j] = gamma_{C-1} S[i][j] + sum_t d(t,C-1) k_t[i] u_t[j] ---
|
||||
const float glast = gam[Cc - 1];
|
||||
const float cslast = csh[Cc - 1];
|
||||
- for (int i = 0; i < dk; i++) {
|
||||
- float s = glast * Sd[j * dk + i];
|
||||
- for (int t = 0; t < Cc; t++) {
|
||||
- const float dd = expf(cslast - csh[t]); // d(t, last)
|
||||
- s += dd * Kc[t * dk + i] * Ud[j * C + t];
|
||||
+ if constexpr (TC >= 4) {
|
||||
+ // M5/P6: state carry S_C = glast*S0 + Kc^T * DU via 3xtf32 mma. DU[t][j] =
|
||||
+ // d(t,last)*U[t][j] is built IN PLACE in Ud (t>=Cc zeroed so the K=C contraction
|
||||
+ // needs no per-k masking), then S_C accumulates over the chunk dim t. Kc is read
|
||||
+ // transposed (i as M-row). M=dk N=dv K=C. Each output (i,j) has a unique owner so
|
||||
+ // the glast*S0 read-modify-write is race-free.
|
||||
+ for (int t = 0; t < C; t++) {
|
||||
+ const float dd = (t < Cc) ? expf(cslast - csh[t]) : 0.0f;
|
||||
+ Ud[j * C + t] = dd * Ud[j * C + t]; // thread j owns column j -> DU in place
|
||||
+ }
|
||||
+ __syncthreads();
|
||||
+ const int w = threadIdx.x >> 5;
|
||||
+ const int lane = threadIdx.x & 31;
|
||||
+ const int lg = lane >> 2;
|
||||
+ const int lt = lane & 3;
|
||||
+ constexpr int NWARP = S_v / 32;
|
||||
+ constexpr int MT = dk / 16; // m-tiles over dk
|
||||
+ constexpr int NT = dv / 8; // n-tiles over dv
|
||||
+ constexpr int NTILES = MT * NT;
|
||||
+ constexpr int TPW = (NTILES + NWARP - 1) / NWARP;
|
||||
+ #pragma unroll
|
||||
+ for (int idx = 0; idx < TPW; idx++) {
|
||||
+ const int tile = w * TPW + idx;
|
||||
+ if (tile >= NTILES) break;
|
||||
+ const int rowbase = (tile / NT) * 16;
|
||||
+ const int colbase = (tile % NT) * 8;
|
||||
+ float cc[4];
|
||||
+ gdn_state_tile_mma_3x<C, dk>(cc, Kc, Ud, rowbase, colbase, lg, lt);
|
||||
+ const int ii[4] = {rowbase + lg, rowbase + lg, rowbase + lg + 8, rowbase + lg + 8};
|
||||
+ const int jj[4] = {colbase + 2*lt, colbase + 2*lt + 1, colbase + 2*lt, colbase + 2*lt + 1};
|
||||
+ #pragma unroll
|
||||
+ for (int l = 0; l < 4; l++) {
|
||||
+ const int i = ii[l], jc = jj[l];
|
||||
+ Sd[jc * dk + i] = glast * Sd[jc * dk + i] + cc[l];
|
||||
+ }
|
||||
+ }
|
||||
+ } else {
|
||||
+ for (int i = 0; i < dk; i++) {
|
||||
+ float s = glast * Sd[j * dk + i];
|
||||
+ for (int t = 0; t < Cc; t++) {
|
||||
+ const float dd = expf(cslast - csh[t]); // d(t, last)
|
||||
+ s += dd * Kc[t * dk + i] * Ud[j * C + t];
|
||||
+ }
|
||||
+ Sd[j * dk + i] = s;
|
||||
}
|
||||
- Sd[j * dk + i] = s;
|
||||
}
|
||||
__syncthreads(); // Sd reused as S0 of next chunk; Kc/Qc/Amat reloaded next chunk
|
||||
}
|
||||
@@ -464,8 +861,7 @@ __global__ void gated_delta_net_chunked_cuda(
|
||||
st[j * dk + i] = Sd[j * dk + i];
|
||||
}
|
||||
}
|
||||
-
|
||||
-template <int S_v, int C>
|
||||
+template <int S_v, int C, int TC = 0>
|
||||
static void launch_gdn_chunked(
|
||||
const float * q_d, const float * k_d, const float * v_d,
|
||||
const float * g_d, const float * b_d, const float * s_d,
|
||||
@@ -477,10 +873,11 @@ static void launch_gdn_chunked(
|
||||
const uint3 neqk1_magic, const uint3 rq3_magic,
|
||||
float scale, cudaStream_t stream) {
|
||||
const size_t smem = ((size_t) S_v * S_v + (size_t) 2 * C * S_v + (size_t) S_v * C
|
||||
- + (size_t) C * C + (size_t) 3 * C) * sizeof(float);
|
||||
+ + (size_t) C * C + (size_t) 3 * C
|
||||
+ + (TC >= 1 ? (size_t) 2 * C * C : (size_t) 0)) * sizeof(float);
|
||||
static bool attr_set = false;
|
||||
if (!attr_set) {
|
||||
- const cudaError_t e = cudaFuncSetAttribute(gated_delta_net_chunked_cuda<S_v, C>,
|
||||
+ const cudaError_t e = cudaFuncSetAttribute(gated_delta_net_chunked_cuda<S_v, C, TC>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, (int) smem);
|
||||
if (e != cudaSuccess) {
|
||||
GGML_ABORT("gdn chunked: cudaFuncSetAttribute(maxDynSmem=%zu) failed: %s\n", smem, cudaGetErrorString(e));
|
||||
@@ -489,7 +886,7 @@ static void launch_gdn_chunked(
|
||||
}
|
||||
dim3 grid_dims(H, n_seqs, 1);
|
||||
dim3 block_dims(S_v, 1, 1);
|
||||
- gated_delta_net_chunked_cuda<S_v, C><<<grid_dims, block_dims, smem, stream>>>(
|
||||
+ gated_delta_net_chunked_cuda<S_v, C, TC><<<grid_dims, block_dims, smem, stream>>>(
|
||||
q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H, n_tokens, n_seqs,
|
||||
sq1, sq2, sq3, sv1, sv2, sv3, sb1, sb2, sb3,
|
||||
neqk1_magic, rq3_magic, scale, state_dst_d, ids_d, rs_head);
|
||||
@@ -519,17 +916,52 @@ static void launch_gated_delta_net(
|
||||
// sequential recurrence. Mathematically equivalent up to FP reduction order (NEW per-path md5;
|
||||
// validated benign by test-backend-ops NMSE + greedy output). Toggle: GDN_CHUNK_OFF / GDN_CHUNK_MIN.
|
||||
if constexpr (!KDA && !keep_rs_t) {
|
||||
- // OPT-IN: this chunked path is bit-exact-benign (test-backend-ops green) but, at C=16
|
||||
- // (forced by GB10 99KB dyn-smem opt-in, all-shared), it is NOT yet faster than the tuned
|
||||
- // sequential recurrence on this model (measured ~22%% slower S_PP, grid-starved at low
|
||||
- // n_seqs + 1 block/SM occupancy). Default OFF so the backend default is regression-free;
|
||||
- // enable for experiments / tuning with GDN_CHUNK_MIN=<token-threshold>. See README section 5 (dev notes / rejected-flat levers).
|
||||
- static const int gdn_chunk_min = []{ const char * e = getenv("GDN_CHUNK_MIN"); return e ? atoi(e) : INT_MAX; }();
|
||||
+ // DEFAULT-ON UNDER PAGED KV (f32-only re-port of patch 0044's M5). The M5 tensor-core path
|
||||
+ // (GDN_TC=5: full-TC form-T solve + state-update mma, state in the 64KB smem buffer, C=16)
|
||||
+ // is greedy-bit-exact (per-path md5 == the sequential canonical on the short gate prompt)
|
||||
+ // and *beats* the tuned sequential recurrence on the Qwen3.6 MoE prefill: GB10,
|
||||
+ // q36-35b-a3b-nvfp4, LLAMA_KV_PAGED=1 LLAMA_MOE_FORCE_GRAPHS=1, -ntg 4 -npl 32:
|
||||
+ // -npp 512 : S_PP +3.5% ; -npp 2048 : S_PP +17.7% (more chunks to parallelize).
|
||||
+ // Decode S_TG is unchanged (1-token calls never reach the engage threshold).
|
||||
+ // GDN_CHUNK_MIN is the per-call engage threshold and MUST stay > 1: decode is 1 token/call,
|
||||
+ // so any threshold above 1 leaves every decode step on the sequential recurrence (at
|
||||
+ // GDN_CHUNK_MIN=1 the chunked path swallows decode and collapses S_TG by ~25%). Tuned to 64:
|
||||
+ // above decode/tiny-call sizes, below the real MoE-prefill per-call count. OFF (INT_MAX) when
|
||||
+ // not paged, so the stock / non-paged default is regression-free. Both knobs env-overridable.
|
||||
+ static const bool kv_paged = (getenv("LLAMA_KV_PAGED") != nullptr);
|
||||
+ static const int gdn_chunk_min = []{
|
||||
+ const char * e = getenv("GDN_CHUNK_MIN");
|
||||
+ if (e) return atoi(e);
|
||||
+ return kv_paged ? 64 : INT_MAX;
|
||||
+ }();
|
||||
+ // Tensor-core level selector (single build, clean runtime A/B). GDN_TC:
|
||||
+ // 0 = serial scan (patch 0031); 1 = KK/QK Gram mma (M2);
|
||||
+ // 2 = + KS/QS state-boundary mma, 3xtf32 (M3); 3 = + P*U output mma (M4);
|
||||
+ // 4/5+ = M5 (full TC: form-T solve + state-update mma) - the DEFAULT under paged KV.
|
||||
+ // (The bf16 CONFIG-C and register-resident M6/M7/M8 occupancy variants of patch 0044 are
|
||||
+ // intentionally absent from this f32-only series; the +3.5/+17.7% prefill win is the M5 path.)
|
||||
+ // GDN_GRAM_MMA=1 is kept as an alias for level 1.
|
||||
+ static const int gdn_tc = []{
|
||||
+ const char * e = getenv("GDN_TC");
|
||||
+ if (e) return atoi(e);
|
||||
+ const char * gm = getenv("GDN_GRAM_MMA");
|
||||
+ if (gm && atoi(gm) != 0) return 1;
|
||||
+ return kv_paged ? 5 : 0;
|
||||
+ }();
|
||||
if (S_v == 128 && n_tokens >= gdn_chunk_min) {
|
||||
- launch_gdn_chunked<128, 16>(
|
||||
- q_d, k_d, v_d, g_d, b_d, (const float *) s_d, dst_d, (float *) state_dst_d, ids_d, rs_head,
|
||||
- H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3, sb1, sb2, sb3,
|
||||
- neqk1_magic, rq3_magic, scale, stream);
|
||||
+#define GDN_CHUNKED_LAUNCH(TC_) \
|
||||
+ launch_gdn_chunked<128, 16, TC_>( \
|
||||
+ q_d, k_d, v_d, g_d, b_d, (const float *) s_d, dst_d, (float *) state_dst_d, ids_d, rs_head, \
|
||||
+ H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3, sb1, sb2, sb3, \
|
||||
+ neqk1_magic, rq3_magic, scale, stream)
|
||||
+ switch (gdn_tc) {
|
||||
+ case 0: GDN_CHUNKED_LAUNCH(0); break;
|
||||
+ case 1: GDN_CHUNKED_LAUNCH(1); break;
|
||||
+ case 2: GDN_CHUNKED_LAUNCH(2); break;
|
||||
+ case 3: GDN_CHUNKED_LAUNCH(3); break;
|
||||
+ default: GDN_CHUNKED_LAUNCH(4); break; // GDN_TC >= 4 -> M5 (full TC, kernel TC=4)
|
||||
+ }
|
||||
+#undef GDN_CHUNKED_LAUNCH
|
||||
return;
|
||||
}
|
||||
}
|
||||
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
|
||||
index 4e40d2353..817069860 100644
|
||||
--- a/tests/test-backend-ops.cpp
|
||||
+++ b/tests/test-backend-ops.cpp
|
||||
@@ -9372,6 +9372,11 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
}
|
||||
|
||||
test_cases.emplace_back(new test_gated_delta_net(GGML_TYPE_F32, 32, 128, 1, 1));
|
||||
+ // Tensor-core chunked-GDN prefill path (S_v==128): multi-chunk (C=16) coverage,
|
||||
+ // incl. a tail chunk (100 = 6*16+4) and multi-seq. Exercised via GDN_CHUNK_MIN + GDN_TC.
|
||||
+ test_cases.emplace_back(new test_gated_delta_net(GGML_TYPE_F32, 32, 128, 64, 1));
|
||||
+ test_cases.emplace_back(new test_gated_delta_net(GGML_TYPE_F32, 32, 128, 100, 1));
|
||||
+ test_cases.emplace_back(new test_gated_delta_net(GGML_TYPE_F32, 32, 128, 128, 2));
|
||||
test_cases.emplace_back(new test_gated_delta_net(GGML_TYPE_F32, 32, 16, 1, 1));
|
||||
test_cases.emplace_back(new test_gated_delta_net(GGML_TYPE_F32, 32, 16, 1, 1, 1, true, true));
|
||||
test_cases.emplace_back(new test_gated_delta_net(GGML_TYPE_F32, 32, 16, 1, 1, 1, false, true));
|
||||
--
|
||||
2.43.0
|
||||
|
||||
51
backend/cpp/llama-cpp-localai-paged/run.sh
Executable file
51
backend/cpp/llama-cpp-localai-paged/run.sh
Executable file
@@ -0,0 +1,51 @@
|
||||
#!/bin/bash
# Launcher for the llama-cpp-localai-paged backend.
#
# Picks the backend binary shipped next to this script, extends the dynamic
# loader search path with the bundled lib/ directory, and exec()s the binary
# with all arguments forwarded unchanged.
set -ex

# Absolute directory this script lives in. Every expansion of CURDIR (and $0)
# is quoted so an install path containing spaces is not word-split.
CURDIR=$(dirname "$(realpath "$0")")

cd /

echo "CPU info:"
grep -e "model\sname" /proc/cpuinfo | head -1
grep -e "flags" /proc/cpuinfo | head -1

BINARY=llama-cpp-localai-paged-fallback

# x86/arm64 ship a single llama-cpp-localai-paged-cpu-all built with ggml
# CPU_ALL_VARIANTS: ggml's backend registry dlopens the best libggml-cpu-*.so for
# this host, so no shell-side probing. ROCm ships only the fallback, so fall back
# to it when cpu-all is absent.
if [ -e "$CURDIR/llama-cpp-localai-paged-cpu-all" ]; then
	BINARY=llama-cpp-localai-paged-cpu-all
fi

# Prefer the gRPC-enabled build when remote gRPC servers are configured and
# that build is present.
if [ -n "$LLAMACPP_GRPC_SERVERS" ]; then
	if [ -e "$CURDIR/llama-cpp-localai-paged-grpc" ]; then
		BINARY=llama-cpp-localai-paged-grpc
	fi
fi

# Extend the library search path with the lib/ dir next to this script.
# ${VAR:+:$VAR} appends the previous value only when it is non-empty, so an
# unset variable does not leave a trailing ':' (an empty search-path entry,
# which the Linux loader interprets as the current directory).
if [ "$(uname)" == "Darwin" ]; then
	export DYLD_LIBRARY_PATH="$CURDIR/lib${DYLD_LIBRARY_PATH:+:$DYLD_LIBRARY_PATH}"
else
	export LD_LIBRARY_PATH="$CURDIR/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
	# Tell rocBLAS where to find TensileLibrary data (GPU kernel tuning files)
	if [ -d "$CURDIR/lib/rocblas/library" ]; then
		export ROCBLAS_TENSILE_LIBPATH="$CURDIR/lib/rocblas/library"
	fi
fi

# If there is a bundled lib/ld.so, launch the binary through it.
if [ -f "$CURDIR/lib/ld.so" ]; then
	echo "Using lib/ld.so"
	echo "Using binary: $BINARY"
	exec "$CURDIR/lib/ld.so" "$CURDIR/$BINARY" "$@"
fi

echo "Using binary: $BINARY"
exec "$CURDIR/$BINARY" "$@"

# We should never reach this point, however just in case we do, run fallback
exec "$CURDIR/llama-cpp-localai-paged-fallback" "$@"
|
||||
@@ -1,5 +1,10 @@
|
||||
|
||||
LLAMA_VERSION?=6f4f53f2b7da54fcdbbecaaa734337c337ad6176
|
||||
# This pin is auto-bumped nightly by .github/workflows/bump_deps.yaml (the stock
|
||||
# llama-cpp backend is patch-free, so a naive bump is safe). The paged backend
|
||||
# (backend/cpp/llama-cpp-localai-paged) does NOT inherit this pin: it owns its
|
||||
# own LLAMA_VERSION because its vendored patch series would break on a naive
|
||||
# bump and is advanced only by the manual PIN_SYNC process.
|
||||
LLAMA_VERSION?=0ed235ea2c17a19fc8238668653946721ed136fd
|
||||
LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
|
||||
|
||||
CMAKE_ARGS?=
|
||||
@@ -169,7 +174,12 @@ llama.cpp:
|
||||
git remote add origin $(LLAMA_REPO) && \
|
||||
git fetch --all --tags && \
|
||||
git checkout -b build $(LLAMA_VERSION) && \
|
||||
git submodule update --init --recursive --depth 1 --single-branch
|
||||
git submodule update --init --recursive --depth 1 --single-branch && \
|
||||
for p in $(CURRENT_MAKEFILE_DIR)patches/0*.patch; do \
|
||||
[ -e "$$p" ] || continue; \
|
||||
echo "applying llama.cpp patch: $$p"; \
|
||||
git apply --verbose "$$p" || { echo "patch failed: $$p"; exit 1; }; \
|
||||
done
|
||||
|
||||
llama.cpp/tools/grpc-server: llama.cpp
|
||||
mkdir -p llama.cpp/tools/grpc-server
|
||||
|
||||
@@ -763,6 +763,97 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
|
||||
} else if (optval_str == "false" || optval_str == "0" || optval_str == "no" || optval_str == "off" || optval_str == "disabled") {
|
||||
params.kv_unified = false;
|
||||
}
|
||||
// --- paged KV cache (experimental, off by default) ---
|
||||
// Enables the on-demand paged KV-cache engine (vendored PagedKVManager
|
||||
// + paged placement/gather/alloc seams). The engine is gated inside
|
||||
// llama.cpp by the LLAMA_KV_PAGED env var, evaluated once at first use;
|
||||
// here we expose it as a per-server model option instead of forcing the
|
||||
// operator to export a process-wide env. When enabled we set the env
|
||||
// BEFORE the model/context is created (later in this handler), so the
|
||||
// engine latches on. When the option is absent we touch nothing, so an
|
||||
// externally exported LLAMA_KV_PAGED still works as an escape hatch.
|
||||
// Note: the engine's env check is process-wide and latches on first
|
||||
// use, so enabling it for one model enables it for the worker process;
|
||||
// LocalAI runs one model per llama.cpp worker, so this maps cleanly to
|
||||
// per-server configuration. `kv_paged_debug` turns on the per-slot
|
||||
// [paged-alloc]/free trace (LLAMA_KV_PAGED_DEBUG).
|
||||
//
|
||||
// The continuous-batching serving loop (update_slots) drives paged KV
|
||||
// transparently through the existing kv-cache seams: each slot's
|
||||
// sequence allocates paged blocks on arrival (find_slot placement) and
|
||||
// returns them on slot release (the seq_rm free seam). This is
|
||||
// token-identical to stock under both the unified and per-sequence
|
||||
// caches. The per-slot allocate/free capacity benefit, however, only
|
||||
// materialises with a per-sequence cache, since paged block ownership
|
||||
// is keyed by stream and the unified cache collapses every slot onto a
|
||||
// single stream. Operators who want that benefit should pair this with
|
||||
// `kv_unified:false`; we do NOT flip kv_unified here, to keep the
|
||||
// default serving behaviour (and the idle-slot prompt cache) unchanged.
|
||||
} else if (!strcmp(optname, "kv_paged") || !strcmp(optname, "paged_kv") || !strcmp(optname, "paged_attention")) {
|
||||
if (optval_str == "true" || optval_str == "1" || optval_str == "yes" || optval_str == "on" || optval_str == "enabled") {
|
||||
setenv("LLAMA_KV_PAGED", "1", 1);
|
||||
}
|
||||
} else if (!strcmp(optname, "kv_paged_debug") || !strcmp(optname, "paged_kv_debug")) {
|
||||
if (optval_str == "true" || optval_str == "1" || optval_str == "yes" || optval_str == "on" || optval_str == "enabled") {
|
||||
setenv("LLAMA_KV_PAGED_DEBUG", "1", 1);
|
||||
}
|
||||
// --- chunked-prefill QoS budget (experimental, off by default) ---
|
||||
// Caps the number of prompt tokens any single slot may prefill per
|
||||
// update_slots iteration, so a large prompt cannot monopolise the batch
|
||||
// and freeze the in-flight decoders. The serving loop reads this budget
|
||||
// from the LLAMA_PREFILL_BUDGET env var (set BEFORE context init, like
|
||||
// kv_paged above) and splits oversized prompts across iterations,
|
||||
// interleaving decode steps for the other slots. A 6k-token prefill that
|
||||
// stalled 8 decoders ~3.4s drops to ~780ms at budget=512 (4.8x stall
|
||||
// cut) with zero TTFT cost and no steady-state regression. Unset or a
|
||||
// non-positive value leaves the env untouched, so the stock unbounded
|
||||
// prefill behaviour is preserved (an externally exported
|
||||
// LLAMA_PREFILL_BUDGET still works as an escape hatch).
|
||||
} else if (!strcmp(optname, "max_prefill_tokens") || !strcmp(optname, "mpt") || !strcmp(optname, "prefill_budget")) {
|
||||
if (optval != NULL) {
|
||||
try {
|
||||
int budget = std::stoi(optval_str);
|
||||
if (budget > 0) {
|
||||
setenv("LLAMA_PREFILL_BUDGET", std::to_string(budget).c_str(), 1);
|
||||
}
|
||||
} catch (const std::exception& e) {
|
||||
// If conversion fails, leave the budget unset (stock behaviour)
|
||||
}
|
||||
}
|
||||
// --- dynamic decode-first prefill budget (patch 0016, continuous-batch P1) ---
|
||||
// Supersedes max_prefill_tokens (the static patch-0013 cap) with the dynamic
|
||||
// T - D budget read by update_slots(): a single total per-step token budget T
|
||||
// (max_batch_tokens / mbt, the vLLM max_num_batched_tokens analogue) of which
|
||||
// decode claims its live load D first and prefill gets the leftover, plus an
|
||||
// optional per-slot prompt-chunk cap (prefill_cap, the long_prefill_token_
|
||||
// threshold analogue). Both are set BEFORE context init, like kv_paged /
|
||||
// max_prefill_tokens above. Unset leaves the env untouched, so the engine stays
|
||||
// byte-identical to stock (an externally exported LLAMA_MAX_BATCH_TOKENS /
|
||||
// LLAMA_PREFILL_CAP still works as an escape hatch). When max_batch_tokens is set
|
||||
// it takes precedence over max_prefill_tokens: the engine honours the legacy
|
||||
// LLAMA_PREFILL_BUDGET only when the dynamic knob is unset.
|
||||
} else if (!strcmp(optname, "max_batch_tokens") || !strcmp(optname, "mbt")) {
|
||||
if (optval != NULL) {
|
||||
try {
|
||||
int mbt = std::stoi(optval_str);
|
||||
if (mbt > 0) {
|
||||
setenv("LLAMA_MAX_BATCH_TOKENS", std::to_string(mbt).c_str(), 1);
|
||||
}
|
||||
} catch (const std::exception& e) {
|
||||
// If conversion fails, leave the budget unset (stock behaviour)
|
||||
}
|
||||
}
|
||||
} else if (!strcmp(optname, "prefill_cap")) {
|
||||
if (optval != NULL) {
|
||||
try {
|
||||
int cap = std::stoi(optval_str);
|
||||
if (cap > 0) {
|
||||
setenv("LLAMA_PREFILL_CAP", std::to_string(cap).c_str(), 1);
|
||||
}
|
||||
} catch (const std::exception& e) {
|
||||
// If conversion fails, leave the per-slot cap unset (engine default)
|
||||
}
|
||||
}
|
||||
} else if (!strcmp(optname, "n_ctx_checkpoints") || !strcmp(optname, "ctx_checkpoints")) {
|
||||
if (optval != NULL) {
|
||||
try {
|
||||
|
||||
@@ -2,12 +2,18 @@
|
||||
|
||||
## Patches
|
||||
|
||||
## Apply patches from the `patches` directory
|
||||
## Apply the base `patches/` series (top-level *.patch only; *.md/dirs skipped).
|
||||
## The stock llama-cpp backend is patch-free by default, so this normally does
|
||||
## nothing. The Makefile `llama.cpp` target already `git apply`s any base patch
|
||||
## at checkout, so each apply here is `-N` (skip already-applied): re-applying a
|
||||
## git-format patch with `patch` would fuzzily duplicate hunks. This block only
|
||||
## does real work if prepare.sh is run against an unpatched checkout.
|
||||
if [ -d "patches" ]; then
|
||||
for patch in $(ls patches); do
|
||||
for patch in patches/*.patch; do
|
||||
[ -e "$patch" ] || continue
|
||||
echo "Applying patch $patch"
|
||||
patch -d llama.cpp/ -p1 < patches/$patch
|
||||
done
|
||||
patch -d llama.cpp/ -p1 -N -r - < "$patch" || true
|
||||
done
|
||||
fi
|
||||
|
||||
set -e
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
# Local development: point at a working checkout instead of cloning, e.g.
|
||||
# make PRIVACY_FILTER_SRC=$HOME/c/privacy-filter.cpp grpc-server
|
||||
|
||||
PRIVACY_FILTER_VERSION?=595f59630c69d361b5196f2aba2c71c873d0c13c
|
||||
PRIVACY_FILTER_VERSION?=98f52c5ef2250f207cc6b9a6aef05393a120cb7c
|
||||
PRIVACY_FILTER_REPO?=https://github.com/localai-org/privacy-filter.cpp
|
||||
PRIVACY_FILTER_SRC?=
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
|
||||
|
||||
# CrispASR version (release tag)
|
||||
CRISPASR_REPO?=https://github.com/CrispStrobe/CrispASR
|
||||
CRISPASR_VERSION?=3b93758f9725d400eca82976f895e4cec3f31260
|
||||
CRISPASR_VERSION?=6514c9da00b03a2f0f1b49a43fae4f3a01a41844
|
||||
SO_TARGET?=libgocrispasr.so
|
||||
|
||||
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# face-detect backend Makefile.
|
||||
#
|
||||
# Upstream pin lives below as FACEDETECT_VERSION?=e22260d5d5490b37b021b7f795079f386d553afd
|
||||
# Upstream pin lives below as FACEDETECT_VERSION?=06914b0... (.github/bump_deps.sh
|
||||
# can find and update it - matches the voice-detect / parakeet.cpp / whisper.cpp
|
||||
# convention).
|
||||
#
|
||||
@@ -14,7 +14,7 @@
|
||||
# The default target below does the proper clone-at-pin + cmake build so CI does
|
||||
# not need a side-checkout.
|
||||
|
||||
FACEDETECT_VERSION?=e22260d5d5490b37b021b7f795079f386d553afd
|
||||
FACEDETECT_VERSION?=06914b077d52f90d5421299138e7be6bdd06b5e8
|
||||
FACEDETECT_REPO?=https://github.com/mudler/face-detect.cpp
|
||||
|
||||
GOCMD?=go
|
||||
|
||||
@@ -1,81 +0,0 @@
|
||||
package main
|
||||
|
||||
// utteranceBoundary is the single definition of a small state machine that was
|
||||
// previously open-coded three times — as a bare `finalEou` bool with an ad-hoc
|
||||
// toggle — in the live feed (live.go), the file-stream text path, and the
|
||||
// file-stream JSON path (goparakeetcpp.go).
|
||||
//
|
||||
// It answers one running question: does the decode currently rest on an
|
||||
// end-of-utterance boundary? That is the value a closing FinalResult reports as
|
||||
// .Eou and the realtime turn detector treats as a commit point.
|
||||
//
|
||||
// parakeet auto-resets its decoder after every <EOU>/<EOB>, so one streaming
|
||||
// session is a sequence of utterances and this is a LATCH, not a monotonic
|
||||
// flag: it closes on an <EOU> and reopens as soon as the next utterance starts.
|
||||
// (Contrast the realtime API's per-turn `eouSeen`, which only ever goes
|
||||
// false->true because each turn gets a fresh stream. Here the stream outlives
|
||||
// the turn, so the boundary status must be able to reopen.)
|
||||
//
|
||||
// The only transitions, over the events one streamFeedResult carries — an
|
||||
// <EOU>, an <EOB> (backchannel), or plain speech output (text and/or words):
|
||||
//
|
||||
// <EOU>
|
||||
// open ───────────► closed
|
||||
// ▲ ▲ │ │ │
|
||||
// │ └─┘ <EOB>|speech │ │ <EOU>
|
||||
// │ (stay open) │ └─┘ (stay closed)
|
||||
// └──────────────────┘
|
||||
// <EOB>|speech
|
||||
//
|
||||
// open = NOT on an utterance boundary: mid-utterance, the last boundary was
|
||||
// a backchannel <EOB>, or the stream just began (the initial state).
|
||||
// closed = the last meaningful event was an <EOU> with no later speech: a real
|
||||
// turn boundary.
|
||||
//
|
||||
// A feed that carries nothing (no eou/eob/text/words — e.g. a finalize flush
|
||||
// that produced no tail) is a no-op and leaves the state unchanged, matching
|
||||
// the legacy "leave finalEou as it was" behaviour.
|
||||
//
|
||||
// The state carries no data, so it is modelled as a two-valued type (a named
|
||||
// bool) rather than an int enum: every inhabitant is legal, so illegal states
|
||||
// are unrepresentable — the payload-free analog of the sealed sum types the
|
||||
// realtime machines use (those need interfaces because their states carry data,
|
||||
// e.g. Active{ID}, where "Active with no ID" is the illegal combination a scalar
|
||||
// cannot even express).
|
||||
type utteranceBoundary bool

// The two latch states. boundaryOpen is deliberately the zero value (false),
// so a freshly declared utteranceBoundary starts open — the same initial
// condition as the legacy `var finalEou bool`.
const (
	boundaryOpen   = utteranceBoundary(false)
	boundaryClosed = utteranceBoundary(true)
)
|
||||
|
||||
// observe folds one decode increment into the latch and returns the new state.
|
||||
//
|
||||
// <EOU> takes priority when a single feed carries both an <EOU> and speech
|
||||
// (e.g. {"text":"hello","eou":1}): the utterance both produced that text AND
|
||||
// ended, so the decode rests on the boundary. This matches the legacy
|
||||
// eou-checked-first ordering at every call site.
|
||||
func (b utteranceBoundary) observe(r streamFeedResult) utteranceBoundary {
|
||||
switch {
|
||||
case r.Eou:
|
||||
return boundaryClosed
|
||||
case r.Eob || r.Delta != "" || len(r.Words) > 0:
|
||||
return boundaryOpen
|
||||
default:
|
||||
return b
|
||||
}
|
||||
}
|
||||
|
||||
// ended reports whether the decode currently rests on an end-of-utterance
|
||||
// boundary (a real <EOU>, not a backchannel <EOB>). This is what a closing
|
||||
// FinalResult carries as .Eou.
|
||||
func (b utteranceBoundary) ended() bool { return b == boundaryClosed }
|
||||
|
||||
func (b utteranceBoundary) String() string {
|
||||
if b == boundaryClosed {
|
||||
return "closed"
|
||||
}
|
||||
return "open"
|
||||
}
|
||||
@@ -1,92 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"math/rand/v2"
|
||||
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
var _ = Describe("utteranceBoundary (decode end-of-utterance latch)", func() {
|
||||
It("starts open: a fresh decode is not on a boundary", func() {
|
||||
var b utteranceBoundary
|
||||
Expect(b).To(Equal(boundaryOpen))
|
||||
Expect(b.ended()).To(BeFalse())
|
||||
})
|
||||
|
||||
DescribeTable("single feed transition from the open state",
|
||||
func(r streamFeedResult, wantEnded bool) {
|
||||
Expect(boundaryOpen.observe(r).ended()).To(Equal(wantEnded))
|
||||
},
|
||||
Entry("<EOU> closes it", streamFeedResult{Eou: true}, true),
|
||||
Entry("<EOU> with text closes it (eou wins)", streamFeedResult{Delta: "hi", Eou: true}, true),
|
||||
Entry("<EOB> stays open (backchannel is not a turn boundary)", streamFeedResult{Eob: true}, false),
|
||||
Entry("plain text stays open", streamFeedResult{Delta: "hello"}, false),
|
||||
Entry("words-only stays open", streamFeedResult{Words: []transcriptWord{{W: "x"}}}, false),
|
||||
Entry("empty feed is a no-op (stays open)", streamFeedResult{}, false),
|
||||
)
|
||||
|
||||
DescribeTable("single feed transition from the closed state",
|
||||
func(r streamFeedResult, wantEnded bool) {
|
||||
Expect(boundaryClosed.observe(r).ended()).To(Equal(wantEnded))
|
||||
},
|
||||
Entry("another <EOU> stays closed", streamFeedResult{Eou: true}, true),
|
||||
Entry("trailing speech reopens it", streamFeedResult{Delta: "and more"}, false),
|
||||
Entry("words reopen it", streamFeedResult{Words: []transcriptWord{{W: "x"}}}, false),
|
||||
Entry("a backchannel <EOB> reopens it", streamFeedResult{Eob: true}, false),
|
||||
Entry("empty feed is a no-op (stays closed)", streamFeedResult{}, true),
|
||||
)
|
||||
|
||||
It("is a latch: <EOU> then trailing speech reopens, then <EOU> closes again", func() {
|
||||
b := boundaryOpen
|
||||
b = b.observe(streamFeedResult{Delta: "turn one", Eou: true})
|
||||
Expect(b.ended()).To(BeTrue())
|
||||
b = b.observe(streamFeedResult{Delta: " and more"})
|
||||
Expect(b.ended()).To(BeFalse(), "trailing speech without an EOU is an open utterance")
|
||||
b = b.observe(streamFeedResult{Eou: true})
|
||||
Expect(b.ended()).To(BeTrue())
|
||||
})
|
||||
|
||||
It("treats a backchannel before a real EOU correctly", func() {
|
||||
b := boundaryOpen
|
||||
b = b.observe(streamFeedResult{Delta: "uh huh", Eob: true})
|
||||
Expect(b.ended()).To(BeFalse(), "a backchannel must not masquerade as a turn boundary")
|
||||
b = b.observe(streamFeedResult{Delta: "done", Eou: true})
|
||||
Expect(b.ended()).To(BeTrue())
|
||||
})
|
||||
|
||||
It("matches the reference fold over seeded random feed sequences", func() {
|
||||
// The invariant: after any sequence of feeds, ended() is true iff the
|
||||
// last feed that carried ANY event was an <EOU>. <EOU> takes priority
|
||||
// when a feed carries both an EOU and speech; empty feeds are ignored.
|
||||
for seed := uint64(1); seed <= 200; seed++ {
|
||||
rng := rand.New(rand.NewPCG(seed, seed*2654435761))
|
||||
b := boundaryOpen
|
||||
lastWasEou := false // reference: did the last meaningful feed end on EOU?
|
||||
steps := rng.IntN(30)
|
||||
for i := 0; i < steps; i++ {
|
||||
var r streamFeedResult
|
||||
switch rng.IntN(5) {
|
||||
case 0:
|
||||
r = streamFeedResult{Eou: true}
|
||||
case 1:
|
||||
r = streamFeedResult{Eob: true}
|
||||
case 2:
|
||||
r = streamFeedResult{Delta: "w"}
|
||||
case 3:
|
||||
r = streamFeedResult{Delta: "w", Eou: true} // eou + speech, eou wins
|
||||
case 4:
|
||||
r = streamFeedResult{} // empty: no-op
|
||||
}
|
||||
b = b.observe(r)
|
||||
if r.Eou {
|
||||
lastWasEou = true
|
||||
} else if r.Eob || r.Delta != "" || len(r.Words) > 0 {
|
||||
lastWasEou = false
|
||||
}
|
||||
}
|
||||
Expect(b.ended()).To(Equal(lastWasEou),
|
||||
"seed %d: latch disagreed with the reference fold", seed)
|
||||
}
|
||||
})
|
||||
})
|
||||
@@ -1,82 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
"google.golang.org/grpc/codes"
|
||||
"google.golang.org/grpc/status"
|
||||
)
|
||||
|
||||
// streamFeedResult is one decode increment from a cache-aware streaming session:
|
||||
// the newly-finalized text plus the model's own per-feed boundary tokens
|
||||
// (<EOU>/<EOB>) and word timings. It is the single event type both the live
|
||||
// (bidi) and file (server-stream) paths fold over, hiding the ABI v4 JSON vs
|
||||
// older text-only entry-point split behind one shape.
|
||||
type streamFeedResult struct {
|
||||
Delta string
|
||||
Eou bool
|
||||
Eob bool
|
||||
Words []transcriptWord
|
||||
}
|
||||
|
||||
// feedChunk feeds one PCM chunk to the streaming session (or finalizes it, when
|
||||
// finalize is true) and returns the unified decode increment. It prefers the
|
||||
// ABI v4 JSON entry points (which also carry per-word timestamps) and falls
|
||||
// back to the older text-only entry points against an older libparakeet.so.
|
||||
//
|
||||
// This is the one place the JSON-vs-text choice is made; every consumer works
|
||||
// in terms of streamFeedResult.
|
||||
func (p *ParakeetCpp) feedChunk(stream uintptr, pcm []float32, finalize bool) (streamFeedResult, error) {
|
||||
if CppStreamFeedJSON != nil {
|
||||
doc, err := p.streamFeedDoc(stream, pcm, finalize)
|
||||
if err != nil {
|
||||
return streamFeedResult{}, err
|
||||
}
|
||||
return streamFeedResult{Delta: doc.Text, Eou: doc.Eou != 0, Eob: doc.Eob != 0, Words: doc.Words}, nil
|
||||
}
|
||||
delta, eou, eob, err := p.streamFeedText(stream, pcm, finalize)
|
||||
if err != nil {
|
||||
return streamFeedResult{}, err
|
||||
}
|
||||
return streamFeedResult{Delta: delta, Eou: eou, Eob: eob}, nil
|
||||
}
|
||||
|
||||
// feedSlices feeds pcm through the session in streamChunkSamples slices,
|
||||
// invoking onFeed for each decode increment. It does NOT finalize: callers
|
||||
// decide when the send side is done. The file path finalizes after the whole
|
||||
// file; the live path finalizes only when its request channel closes, never
|
||||
// between audio messages. Slicing keeps each per-call engineMu hold short so
|
||||
// concurrent unary transcription interleaves fairly (the C session buffers
|
||||
// internally).
|
||||
//
|
||||
// If ctx is non-nil it is checked before each slice so a cancelled file
|
||||
// transcription stops promptly; the live path passes nil (it is bounded by its
|
||||
// request channel instead of a ctx).
|
||||
func (p *ParakeetCpp) feedSlices(ctx context.Context, stream uintptr, pcm []float32, onFeed func(streamFeedResult) error) error {
|
||||
for off := 0; off < len(pcm); off += streamChunkSamples {
|
||||
if ctx != nil {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return status.Error(codes.Canceled, "transcription cancelled")
|
||||
}
|
||||
}
|
||||
end := min(off+streamChunkSamples, len(pcm))
|
||||
res, err := p.feedChunk(stream, pcm[off:end], false)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if err := onFeed(res); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// flushTail finalizes the session once and folds the flushed tail (the last
|
||||
// ~2 encoder frames of text, which only appear on finalize) through onFeed.
|
||||
func (p *ParakeetCpp) flushTail(stream uintptr, onFeed func(streamFeedResult) error) error {
|
||||
res, err := p.feedChunk(stream, nil, true)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return onFeed(res)
|
||||
}
|
||||
@@ -103,13 +103,12 @@ type transcriptJSON struct {
|
||||
// {"text":"...","eou":0,"eob":0,"frame_sec":0.080000,
|
||||
// "words":[{"w":"...","start":0.480,"end":0.640,"conf":0.9100}, ...]}
|
||||
//
|
||||
// "text" is the newly-finalized text since the last call. Under ABI v5 "eou"
|
||||
// is 1 iff an <EOU> fired this feed (the user yielded the turn) and "eob" 1
|
||||
// iff an <EOB> fired (a backchannel like "uh-huh" ended — NOT a turn
|
||||
// boundary). A v4 library has no "eob" field and its "eou" conflates both
|
||||
// tokens: Eob stays 0 and Eou keeps the old any-event meaning. "words" are
|
||||
// the words finalized this call with absolute (stream-relative) start/end
|
||||
// seconds.
|
||||
// "text" is the newly-finalized text since the last call; "eou" is 1 when an
|
||||
// <EOU> (end of utterance) fired this feed and "eob" is 1 when an <EOB>
|
||||
// (backchannel) fired. ABI v4 conflated the two into "eou"; v5 split them, so
|
||||
// we read both and treat either as an utterance boundary for segmentation.
|
||||
// "words" are the words finalized this call with absolute (stream-relative)
|
||||
// start/end seconds.
|
||||
type streamFeedJSON struct {
|
||||
Text string `json:"text"`
|
||||
Eou int `json:"eou"`
|
||||
@@ -365,7 +364,7 @@ var segmentSeparators = []rune{'.', '?', '!'}
|
||||
// the caller requested word granularity; token ids populate each segment's
|
||||
// Tokens by time-window membership. Shared by the batched and direct paths.
|
||||
func transcriptResultFromDoc(doc transcriptJSON, opts *pb.TranscriptRequest, gapFrames int) pb.TranscriptResult {
|
||||
text, eou := stripEouMarker(strings.TrimSpace(doc.Text))
|
||||
text := strings.TrimSpace(doc.Text)
|
||||
|
||||
// Frame-unit gap threshold -> seconds (NeMo segment_gap_threshold). 0 = off.
|
||||
gapSeconds := 0.0
|
||||
@@ -384,7 +383,6 @@ func transcriptResultFromDoc(doc transcriptJSON, opts *pb.TranscriptRequest, gap
|
||||
return pb.TranscriptResult{
|
||||
Text: text,
|
||||
Segments: []*pb.TranscriptSegment{{Id: 0, Text: text}},
|
||||
Eou: eou,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -411,25 +409,7 @@ func transcriptResultFromDoc(doc transcriptJSON, opts *pb.TranscriptRequest, gap
|
||||
}
|
||||
segments = append(segments, seg)
|
||||
}
|
||||
return pb.TranscriptResult{Text: text, Segments: segments, Eou: eou}
|
||||
}
|
||||
|
||||
// stripEouMarker removes a trailing literal <EOU>/<EOB> from offline-decode
|
||||
// text and reports whether the decode ended on an end-of-UTTERANCE token. The
|
||||
// realtime EOU model's offline decode keeps the special token in the
|
||||
// detokenized text (the streaming path strips it and surfaces it as flags
|
||||
// instead); user-visible transcripts must never carry either marker, but only
|
||||
// <EOU> may confirm the semantic_vad retranscribe cross-check — a decode
|
||||
// ending on <EOB> means the last thing heard was a backchannel, not the user
|
||||
// yielding the turn.
|
||||
func stripEouMarker(text string) (string, bool) {
|
||||
if strings.HasSuffix(text, "<EOU>") {
|
||||
return strings.TrimSpace(strings.TrimSuffix(text, "<EOU>")), true
|
||||
}
|
||||
if strings.HasSuffix(text, "<EOB>") {
|
||||
return strings.TrimSpace(strings.TrimSuffix(text, "<EOB>")), false
|
||||
}
|
||||
return text, false
|
||||
return pb.TranscriptResult{Text: text, Segments: segments}
|
||||
}
|
||||
|
||||
// splitWordsIntoSegments groups words into segments exactly as NeMo's
|
||||
@@ -496,55 +476,41 @@ func tokensInWindow(tokens []transcriptToken, start, end float64) []int32 {
|
||||
return ids
|
||||
}
|
||||
|
||||
// streamSegmenter accumulates streaming decode increments into per-utterance
|
||||
// segments. <EOU>/<EOB> are the model's own utterance boundaries; each closes a
|
||||
// segment. When the feed carries per-word timings (ABI v4 JSON), a closed
|
||||
// segment takes its start/end from its first/last word; against an older
|
||||
// text-only library (no words) it falls back to segmenting the delta text, so
|
||||
// the same assembler serves both paths.
|
||||
// streamSegmenter accumulates streaming words into per-utterance segments. EOU
|
||||
// is the model's own utterance boundary; each closed segment takes its start/end
|
||||
// from its first/last accumulated word.
|
||||
type streamSegmenter struct {
|
||||
segs []*pb.TranscriptSegment
|
||||
cur []transcriptWord // words for the open segment (ABI v4 JSON path)
|
||||
curText []string // delta text for the open segment (text-only path)
|
||||
nextID int32
|
||||
segs []*pb.TranscriptSegment
|
||||
cur []transcriptWord
|
||||
nextID int32
|
||||
}
|
||||
|
||||
func (s *streamSegmenter) add(r streamFeedResult) {
|
||||
s.cur = append(s.cur, r.Words...)
|
||||
if len(r.Words) == 0 && r.Delta != "" {
|
||||
// Older libparakeet.so with no per-word timing: segment from the text.
|
||||
s.curText = append(s.curText, r.Delta)
|
||||
}
|
||||
// Both <EOU> and <EOB> reset the decoder, so both close a segment.
|
||||
if r.Eou || r.Eob {
|
||||
func (s *streamSegmenter) add(doc streamFeedJSON) {
|
||||
s.cur = append(s.cur, doc.Words...)
|
||||
// Close the segment on either turn signal: <EOU> (end of utterance) or
|
||||
// <EOB> (backchannel). ABI v4 reported both via "eou"; v5 split them, so we
|
||||
// OR them here to keep the v4 segmentation boundaries.
|
||||
if doc.Eou != 0 || doc.Eob != 0 {
|
||||
s.flush()
|
||||
}
|
||||
}
|
||||
|
||||
func (s *streamSegmenter) flush() {
|
||||
switch {
|
||||
case len(s.cur) > 0:
|
||||
parts := make([]string, len(s.cur))
|
||||
for i, w := range s.cur {
|
||||
parts[i] = w.W
|
||||
}
|
||||
s.segs = append(s.segs, &pb.TranscriptSegment{
|
||||
Id: s.nextID,
|
||||
Start: secondsToNanos(s.cur[0].Start),
|
||||
End: secondsToNanos(s.cur[len(s.cur)-1].End),
|
||||
Text: strings.TrimSpace(strings.Join(parts, " ")),
|
||||
})
|
||||
s.nextID++
|
||||
case len(s.curText) > 0:
|
||||
// No words this segment: emit a text-only segment (no timestamps),
|
||||
// skipping a purely-whitespace one as the legacy text path did.
|
||||
if t := strings.TrimSpace(strings.Join(s.curText, "")); t != "" {
|
||||
s.segs = append(s.segs, &pb.TranscriptSegment{Id: s.nextID, Text: t})
|
||||
s.nextID++
|
||||
}
|
||||
if len(s.cur) == 0 {
|
||||
return
|
||||
}
|
||||
parts := make([]string, len(s.cur))
|
||||
for i, w := range s.cur {
|
||||
parts[i] = w.W
|
||||
}
|
||||
s.segs = append(s.segs, &pb.TranscriptSegment{
|
||||
Id: s.nextID,
|
||||
Start: secondsToNanos(s.cur[0].Start),
|
||||
End: secondsToNanos(s.cur[len(s.cur)-1].End),
|
||||
Text: strings.TrimSpace(strings.Join(parts, " ")),
|
||||
})
|
||||
s.nextID++
|
||||
s.cur = nil
|
||||
s.curText = nil
|
||||
}
|
||||
|
||||
func (s *streamSegmenter) segments() []*pb.TranscriptSegment { return s.segs }
|
||||
@@ -569,119 +535,18 @@ func secondsToNanos(sec float64) int64 {
|
||||
return int64(sec * 1e9)
|
||||
}
|
||||
|
||||
// Per-C-call engine serialization for the streaming paths.
|
||||
//
|
||||
// Every individual C call (begin / feed / finalize / free) takes engineMu and
|
||||
// re-checks ctxPtr under the lock; the lock is NEVER held across a stream's
|
||||
// lifetime. This is safe because each parakeet.cpp call builds its own ggml
|
||||
// graph and all streaming caches live in the session object, not the ctx —
|
||||
// the only ctx-shared mutable state is last_error, which is why it is read
|
||||
// under the same lock as the failing call. Holding the lock per call (rather
|
||||
// than per stream, as this file previously did) keeps a long-lived live
|
||||
// session from starving batched unary transcription and vice versa.
|
||||
//
|
||||
// A stream must not outlive its ctx (C-API contract). Free() takes engineMu
|
||||
// and zeroes ctxPtr, so a racing per-call helper returns ModelNotLoaded
|
||||
// instead of feeding a freed engine; streamFree of an orphaned session only
|
||||
// runs the session destructor, which does not touch the ctx.
|
||||
|
||||
// streamBegin opens a cache-aware streaming session. A 0 stream with nil
|
||||
// error means the loaded model is not a streaming model.
|
||||
func (p *ParakeetCpp) streamBegin(lang string) (uintptr, error) {
|
||||
p.engineMu.Lock()
|
||||
defer p.engineMu.Unlock()
|
||||
if p.ctxPtr == 0 {
|
||||
return 0, grpcerrors.ModelNotLoaded("parakeet-cpp")
|
||||
}
|
||||
if CppStreamBeginLang != nil {
|
||||
return CppStreamBeginLang(p.ctxPtr, lang), nil
|
||||
}
|
||||
return CppStreamBegin(p.ctxPtr), nil
|
||||
}
|
||||
|
||||
func (p *ParakeetCpp) streamFree(stream uintptr) {
|
||||
if stream == 0 {
|
||||
return
|
||||
}
|
||||
p.engineMu.Lock()
|
||||
defer p.engineMu.Unlock()
|
||||
CppStreamFree(stream)
|
||||
}
|
||||
|
||||
// streamFeedText runs one text-mode feed (or the finalize flush when
|
||||
// finalize is true) under engineMu, returning the newly-finalized delta and
|
||||
// whether an <EOU>/<EOB> fired during the call.
|
||||
func (p *ParakeetCpp) streamFeedText(stream uintptr, pcm []float32, finalize bool) (delta string, eou, eob bool, err error) {
|
||||
p.engineMu.Lock()
|
||||
defer p.engineMu.Unlock()
|
||||
if p.ctxPtr == 0 {
|
||||
return "", false, false, grpcerrors.ModelNotLoaded("parakeet-cpp")
|
||||
}
|
||||
var ret uintptr
|
||||
var events int32
|
||||
if finalize {
|
||||
ret = CppStreamFinalize(stream)
|
||||
} else {
|
||||
ret = CppStreamFeed(stream, pcm, int32(len(pcm)), unsafe.Pointer(&events))
|
||||
}
|
||||
if ret == 0 {
|
||||
// last_error is ctx-shared: read it under the same lock as the call.
|
||||
msg := CppLastError(p.ctxPtr)
|
||||
if msg == "" {
|
||||
msg = "unknown error"
|
||||
}
|
||||
return "", false, false, fmt.Errorf("parakeet-cpp: stream feed/finalize failed: %s", msg)
|
||||
}
|
||||
delta = goStringFromCPtr(ret)
|
||||
CppFreeString(ret)
|
||||
// ABI v5: eou_out is a bitmask (bit 0 = <EOU>, bit 1 = <EOB>). A v4
|
||||
// library sets 0/1 for either token, which the bit-0 test reads as the
|
||||
// old conflated eou — the EOB distinction simply isn't available there.
|
||||
return delta, events&1 != 0, events&2 != 0, nil
|
||||
}
|
||||
|
||||
// streamFeedDoc runs one ABI v4 JSON feed (or finalize) under engineMu and
|
||||
// returns the parsed {text,eou,frame_sec,words} document.
|
||||
func (p *ParakeetCpp) streamFeedDoc(stream uintptr, pcm []float32, finalize bool) (streamFeedJSON, error) {
|
||||
p.engineMu.Lock()
|
||||
defer p.engineMu.Unlock()
|
||||
if p.ctxPtr == 0 {
|
||||
return streamFeedJSON{}, grpcerrors.ModelNotLoaded("parakeet-cpp")
|
||||
}
|
||||
var ret uintptr
|
||||
if finalize {
|
||||
ret = CppStreamFinalizeJSON(stream)
|
||||
} else {
|
||||
ret = CppStreamFeedJSON(stream, pcm, int32(len(pcm)))
|
||||
}
|
||||
if ret == 0 {
|
||||
msg := CppLastError(p.ctxPtr)
|
||||
if msg == "" {
|
||||
msg = "unknown error"
|
||||
}
|
||||
return streamFeedJSON{}, fmt.Errorf("parakeet-cpp: stream feed/finalize failed: %s", msg)
|
||||
}
|
||||
raw := goStringFromCPtr(ret)
|
||||
CppFreeString(ret)
|
||||
var doc streamFeedJSON
|
||||
if err := json.Unmarshal([]byte(raw), &doc); err != nil {
|
||||
return streamFeedJSON{}, fmt.Errorf("parakeet-cpp: decode stream json: %w", err)
|
||||
}
|
||||
return doc, nil
|
||||
}
|
||||
|
||||
// AudioTranscriptionStream drives the cache-aware streaming RNN-T over the
|
||||
// audio at opts.Dst: it decodes the file to 16 kHz mono PCM, feeds it through
|
||||
// the shared decode driver (feedSlices/flushTail), and emits each
|
||||
// newly-finalized text run as a TranscriptStreamResponse delta. <EOU>/<EOB>
|
||||
// events close the current segment; a closing FinalResult carries the full
|
||||
// transcript, the per-utterance segments, and whether the file ended on an
|
||||
// utterance boundary.
|
||||
// audio at opts.Dst: it decodes the file to 16 kHz mono PCM, feeds it in
|
||||
// chunks to parakeet_capi_stream_feed, and emits each newly-finalized text
|
||||
// run as a TranscriptStreamResponse delta. <EOU>/<EOB> events close the
|
||||
// current segment; a closing FinalResult carries the full transcript and the
|
||||
// per-utterance segments.
|
||||
//
|
||||
// stream_begin returns 0 for models that are not cache-aware streaming models
|
||||
// (only e.g. nvidia/parakeet_realtime_eou_120m-v1 qualifies). For those this
|
||||
// returns codes.Unimplemented rather than faking a stream from an offline
|
||||
// decode — see the stream==0 branch and grpcerrors.StreamTranscriptionUnsupported.
|
||||
// (only e.g. nvidia/parakeet_realtime_eou_120m-v1 qualifies). For those we fall
|
||||
// back to a single offline transcription emitted as one delta plus a closing
|
||||
// FinalResult, matching LocalAI's non-streaming streaming contract (and the
|
||||
// whisper backend), so the streaming endpoint works for every model.
|
||||
func (p *ParakeetCpp) AudioTranscriptionStream(ctx context.Context, opts *pb.TranscriptRequest, results chan *pb.TranscriptStreamResponse) error {
|
||||
defer close(results)
|
||||
|
||||
@@ -695,73 +560,185 @@ func (p *ParakeetCpp) AudioTranscriptionStream(ctx context.Context, opts *pb.Tra
|
||||
return status.Error(codes.Canceled, "transcription cancelled")
|
||||
}
|
||||
|
||||
stream, err := p.streamBegin(opts.GetLanguage())
|
||||
if err != nil {
|
||||
return err
|
||||
var stream uintptr
|
||||
if CppStreamBeginLang != nil {
|
||||
stream = CppStreamBeginLang(p.ctxPtr, opts.GetLanguage())
|
||||
} else {
|
||||
stream = CppStreamBegin(p.ctxPtr)
|
||||
}
|
||||
if stream == 0 {
|
||||
// Not a cache-aware streaming model. Report the missing capability
|
||||
// honestly instead of decoding offline and emitting it as one "delta"
|
||||
// + final: a client that asked for streaming must learn the model
|
||||
// cannot stream, not receive a batch result dressed as a stream (which
|
||||
// is indistinguishable except qualitatively, and silently breaks any
|
||||
// feature that genuinely needs incremental output). Callers wanting a
|
||||
// plain transcript use the unary AudioTranscription path. This mirrors
|
||||
// AudioTranscriptionLive, which already returns Unimplemented here.
|
||||
return grpcerrors.StreamTranscriptionUnsupported("parakeet-cpp",
|
||||
"loaded model is not a cache-aware streaming model")
|
||||
// Not a cache-aware streaming model: run a normal offline
|
||||
// transcription and emit it as one delta + a closing final result.
|
||||
res, err := p.AudioTranscription(ctx, opts)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if t := strings.TrimSpace(res.Text); t != "" {
|
||||
results <- &pb.TranscriptStreamResponse{Delta: t}
|
||||
}
|
||||
results <- &pb.TranscriptStreamResponse{FinalResult: &res}
|
||||
return nil
|
||||
}
|
||||
defer p.streamFree(stream)
|
||||
defer CppStreamFree(stream)
|
||||
// The C engine is a single shared context: a streaming session and a batched
|
||||
// unary dispatch must never touch it at once, so hold engineMu for the whole
|
||||
// stream. This lock is intentionally taken AFTER the non-streaming fallback
|
||||
// above returns: that fallback goes through AudioTranscription -> the batcher
|
||||
// -> runBatch, which itself acquires engineMu, so locking here first would
|
||||
// deadlock. Do not hoist this lock above the fallback.
|
||||
p.engineMu.Lock()
|
||||
defer p.engineMu.Unlock()
|
||||
|
||||
data, duration, err := decodeWavMono16k(opts.Dst)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Fold the shared decode driver's per-feed increments into the streamed
|
||||
// deltas and the closing batch result: words/text accumulate into
|
||||
// per-utterance segments (streamSegmenter), and the utterance-boundary
|
||||
// latch (boundary.go) records whether the file ended on an <EOU>. These
|
||||
// are the offline path's concern — the live RPC carries none of them.
|
||||
// ABI v4: when the streaming JSON entry points are present, drive them so the
|
||||
// per-utterance segments carry per-word start/end timestamps. Falls through to
|
||||
// the text-only loop below against an older libparakeet.so. Runs under the
|
||||
// engineMu already held above.
|
||||
if CppStreamFeedJSON != nil {
|
||||
return p.streamJSON(ctx, stream, data, duration, results)
|
||||
}
|
||||
|
||||
var (
|
||||
full strings.Builder
|
||||
seg streamSegmenter
|
||||
boundary utteranceBoundary
|
||||
segText strings.Builder
|
||||
segments []*pb.TranscriptSegment
|
||||
segID int32
|
||||
)
|
||||
emit := func(r streamFeedResult) error {
|
||||
if r.Delta != "" {
|
||||
full.WriteString(r.Delta)
|
||||
results <- &pb.TranscriptStreamResponse{Delta: r.Delta}
|
||||
|
||||
flushSegment := func() {
|
||||
t := strings.TrimSpace(segText.String())
|
||||
segText.Reset()
|
||||
if t == "" {
|
||||
return
|
||||
}
|
||||
seg.add(r)
|
||||
boundary = boundary.observe(r)
|
||||
segments = append(segments, &pb.TranscriptSegment{Id: segID, Text: t})
|
||||
segID++
|
||||
}
|
||||
|
||||
// emitDelta consumes the malloc'd char* returned by feed/finalize: frees
|
||||
// it, accumulates the text, and sends a delta when non-empty. A 0 return
|
||||
// is an error (vs the "" empty-but-non-NULL no-new-text case).
|
||||
emitDelta := func(ret uintptr) error {
|
||||
if ret == 0 {
|
||||
msg := CppLastError(p.ctxPtr)
|
||||
if msg == "" {
|
||||
msg = "unknown error"
|
||||
}
|
||||
return fmt.Errorf("parakeet-cpp: stream feed/finalize failed: %s", msg)
|
||||
}
|
||||
delta := goStringFromCPtr(ret)
|
||||
CppFreeString(ret)
|
||||
if delta == "" {
|
||||
return nil
|
||||
}
|
||||
full.WriteString(delta)
|
||||
segText.WriteString(delta)
|
||||
results <- &pb.TranscriptStreamResponse{Delta: delta}
|
||||
return nil
|
||||
}
|
||||
|
||||
if err := p.feedSlices(ctx, stream, data, emit); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := p.flushTail(stream, emit); err != nil {
|
||||
return err
|
||||
}
|
||||
seg.flush() // close a trailing utterance that never saw an <EOU>
|
||||
for off := 0; off < len(data); off += streamChunkSamples {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return status.Error(codes.Canceled, "transcription cancelled")
|
||||
}
|
||||
end := min(off+streamChunkSamples, len(data))
|
||||
chunk := data[off:end]
|
||||
|
||||
// final.Text is the exact concatenation of the streamed deltas (full is
|
||||
// their accumulation), so concat(deltas) == FinalResult.Text holds even
|
||||
// when the model prepends a leading space to the first word (SentencePiece
|
||||
// detokenization). This matches the whisper backend's streaming contract.
|
||||
// The single-segment fallback stays trimmed.
|
||||
fullText := full.String()
|
||||
segments := seg.segments()
|
||||
if trimmed := strings.TrimSpace(fullText); len(segments) == 0 && trimmed != "" {
|
||||
segments = append(segments, &pb.TranscriptSegment{Id: 0, Text: trimmed})
|
||||
var eou int32
|
||||
ret := CppStreamFeed(stream, chunk, int32(len(chunk)), unsafe.Pointer(&eou))
|
||||
if err := emitDelta(ret); err != nil {
|
||||
return err
|
||||
}
|
||||
if eou != 0 {
|
||||
flushSegment()
|
||||
}
|
||||
}
|
||||
|
||||
// Flush the streaming tail (final encoder chunk).
|
||||
if err := emitDelta(CppStreamFinalize(stream)); err != nil {
|
||||
return err
|
||||
}
|
||||
flushSegment()
|
||||
|
||||
text := strings.TrimSpace(full.String())
|
||||
if len(segments) == 0 && text != "" {
|
||||
segments = append(segments, &pb.TranscriptSegment{Id: 0, Text: text})
|
||||
}
|
||||
results <- &pb.TranscriptStreamResponse{
|
||||
FinalResult: &pb.TranscriptResult{
|
||||
Text: fullText,
|
||||
Text: text,
|
||||
Segments: segments,
|
||||
Duration: duration,
|
||||
},
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// streamJSON drives the streaming JSON entry points (present since ABI v4): each
|
||||
// feed/finalize returns a {text,eou,eob,frame_sec,words} document. The
|
||||
// newly-finalized text is emitted as a delta (unchanged streaming contract)
|
||||
// while words are accumulated into per-utterance segments (closed on <EOU> or
|
||||
// <EOB>) so the closing FinalResult carries timestamped segments. Runs under
|
||||
// engineMu (already held by the caller).
|
||||
func (p *ParakeetCpp) streamJSON(ctx context.Context, stream uintptr, data []float32,
|
||||
duration float32, results chan *pb.TranscriptStreamResponse) error {
|
||||
var (
|
||||
full strings.Builder
|
||||
seg streamSegmenter
|
||||
)
|
||||
// consume frees the malloc'd char* (a 0 return is an error), parses the JSON,
|
||||
// emits the delta, and routes words through the segmenter.
|
||||
consume := func(ret uintptr) error {
|
||||
if ret == 0 {
|
||||
msg := CppLastError(p.ctxPtr)
|
||||
if msg == "" {
|
||||
msg = "unknown error"
|
||||
}
|
||||
return fmt.Errorf("parakeet-cpp: stream feed/finalize failed: %s", msg)
|
||||
}
|
||||
raw := goStringFromCPtr(ret)
|
||||
CppFreeString(ret)
|
||||
var doc streamFeedJSON
|
||||
if err := json.Unmarshal([]byte(raw), &doc); err != nil {
|
||||
return fmt.Errorf("parakeet-cpp: decode stream json: %w", err)
|
||||
}
|
||||
if doc.Text != "" {
|
||||
full.WriteString(doc.Text)
|
||||
results <- &pb.TranscriptStreamResponse{Delta: doc.Text}
|
||||
}
|
||||
seg.add(doc)
|
||||
return nil
|
||||
}
|
||||
|
||||
for off := 0; off < len(data); off += streamChunkSamples {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return status.Error(codes.Canceled, "transcription cancelled")
|
||||
}
|
||||
end := min(off+streamChunkSamples, len(data))
|
||||
chunk := data[off:end]
|
||||
if err := consume(CppStreamFeedJSON(stream, chunk, int32(len(chunk)))); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
if err := consume(CppStreamFinalizeJSON(stream)); err != nil {
|
||||
return err
|
||||
}
|
||||
seg.flush() // close any trailing utterance that never saw an EOU
|
||||
|
||||
text := strings.TrimSpace(full.String())
|
||||
segments := seg.segments()
|
||||
if len(segments) == 0 && text != "" {
|
||||
segments = append(segments, &pb.TranscriptSegment{Id: 0, Text: text})
|
||||
}
|
||||
results <- &pb.TranscriptStreamResponse{
|
||||
FinalResult: &pb.TranscriptResult{
|
||||
Text: text,
|
||||
Segments: segments,
|
||||
Duration: duration,
|
||||
Eou: boundary.ended(),
|
||||
},
|
||||
}
|
||||
return nil
|
||||
@@ -826,10 +803,6 @@ func (p *ParakeetCpp) Free() error {
|
||||
close(p.batStop)
|
||||
p.batStop = nil
|
||||
}
|
||||
// engineMu so an in-flight streaming call (which locks per C call and
|
||||
// re-checks ctxPtr under the lock) can never feed into a freed ctx.
|
||||
p.engineMu.Lock()
|
||||
defer p.engineMu.Unlock()
|
||||
if p.ctxPtr != 0 {
|
||||
CppFree(p.ctxPtr)
|
||||
p.ctxPtr = 0
|
||||
|
||||
@@ -14,8 +14,6 @@ import (
|
||||
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
"google.golang.org/grpc/codes"
|
||||
"google.golang.org/grpc/status"
|
||||
)
|
||||
|
||||
func TestParakeetCpp(t *testing.T) {
|
||||
@@ -203,29 +201,6 @@ var _ = Describe("ParakeetCpp", func() {
|
||||
})
|
||||
|
||||
Context("AudioTranscriptionStream", func() {
|
||||
It("returns the typed Unimplemented signal for non-streaming models (no offline fallback)", func() {
|
||||
// stream_begin == 0 means the loaded model is not a cache-aware
|
||||
// streaming model. The backend must surface that, not silently
|
||||
// decode offline and fake a one-shot "stream".
|
||||
savedBegin, savedBeginLang := CppStreamBegin, CppStreamBeginLang
|
||||
defer func() { CppStreamBegin, CppStreamBeginLang = savedBegin, savedBeginLang }()
|
||||
CppStreamBeginLang = nil
|
||||
CppStreamBegin = func(ctx uintptr) uintptr { return 0 }
|
||||
|
||||
p := &ParakeetCpp{ctxPtr: 1}
|
||||
results := make(chan *pb.TranscriptStreamResponse, 8)
|
||||
err := p.AudioTranscriptionStream(context.Background(),
|
||||
&pb.TranscriptRequest{Dst: "ignored.wav"}, results)
|
||||
Expect(status.Code(err)).To(Equal(codes.Unimplemented))
|
||||
|
||||
// Honest signal: nothing was emitted — no faked batch result.
|
||||
var emitted []*pb.TranscriptStreamResponse
|
||||
for r := range results {
|
||||
emitted = append(emitted, r)
|
||||
}
|
||||
Expect(emitted).To(BeEmpty())
|
||||
})
|
||||
|
||||
It("streams deltas and a closing FinalResult from a cache-aware model", func() {
|
||||
// Streaming needs a cache-aware streaming model (e.g.
|
||||
// realtime_eou); the offline test model would fail stream_begin.
|
||||
|
||||
@@ -1,186 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/mudler/LocalAI/pkg/grpc/grpcerrors"
|
||||
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
|
||||
"github.com/mudler/xlog"
|
||||
"google.golang.org/grpc/codes"
|
||||
"google.golang.org/grpc/status"
|
||||
)
|
||||
|
||||
// liveSampleRate is the only PCM rate the parakeet C streaming API accepts.
|
||||
const liveSampleRate = 16000
|
||||
|
||||
// AudioTranscriptionLive drives one cache-aware streaming session over audio
|
||||
// fed incrementally by the caller (the realtime API's semantic_vad turn
|
||||
// detection). Contract:
|
||||
//
|
||||
// - the first request must carry a Config; a Config mid-stream resets the
|
||||
// decode session (free + begin) and drops accumulated transcript state;
|
||||
// - a Ready ack is sent right after a successful stream_begin so callers
|
||||
// can degrade synchronously when the model has no streaming support
|
||||
// (LiveTranscriptionUnsupported, codes.Unimplemented);
|
||||
// - every feed that produced output is forwarded as {delta, eou, words};
|
||||
// the <EOU>/<EOB> flag is the model's own utterance boundary and the
|
||||
// decoder auto-resets after it, so one session spans many utterances;
|
||||
// - closing the send side finalizes: the held-back tail chunk is flushed
|
||||
// (the last ~2 encoder frames of words only appear here) and a terminal
|
||||
// FinalResult carries the full transcript Text only. Per-utterance
|
||||
// segments, duration, and the terminal <EOU> flag are NOT produced here —
|
||||
// the realtime core consumes the streamed per-feed tokens and the final
|
||||
// Text; those batch fields are the file path's concern (see
|
||||
// AudioTranscriptionStream).
|
||||
//
|
||||
// Engine access is serialized per C call (streamBegin/streamFeed*/streamFree
|
||||
// take engineMu internally), never for the session lifetime — unary
|
||||
// transcription keeps flowing between feeds.
|
||||
func (p *ParakeetCpp) AudioTranscriptionLive(in <-chan *pb.TranscriptLiveRequest, out chan<- *pb.TranscriptLiveResponse) error {
|
||||
defer close(out)
|
||||
|
||||
if p.ctxPtr == 0 {
|
||||
return grpcerrors.ModelNotLoaded("parakeet-cpp")
|
||||
}
|
||||
|
||||
first, ok := <-in
|
||||
if !ok {
|
||||
return nil // caller closed without sending anything
|
||||
}
|
||||
cfg := first.GetConfig()
|
||||
if cfg == nil {
|
||||
return status.Error(codes.InvalidArgument, "parakeet-cpp: first live message must carry a config")
|
||||
}
|
||||
if err := validateLiveConfig(cfg); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
stream, err := p.streamBegin(cfg.GetLanguage())
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if stream == 0 {
|
||||
return grpcerrors.LiveTranscriptionUnsupported("parakeet-cpp",
|
||||
"loaded model is not a cache-aware streaming model")
|
||||
}
|
||||
// stream is reassigned on a mid-stream Config reset; free whatever is
|
||||
// current when the RPC unwinds.
|
||||
defer func() { p.streamFree(stream) }()
|
||||
|
||||
out <- &pb.TranscriptLiveResponse{Ready: true}
|
||||
|
||||
var (
|
||||
full strings.Builder
|
||||
fedSecs float64
|
||||
|
||||
// behindSec accumulates how far decode wall time has fallen behind
|
||||
// the audio it was fed. A live caller feeds in real time, so a
|
||||
// persistent positive backlog means every downstream signal —
|
||||
// including the <EOU> the turn detector waits on — arrives that many
|
||||
// seconds late. Warned once per session; reset by a Config reset.
|
||||
behindSec float64
|
||||
behindWarned bool
|
||||
)
|
||||
|
||||
// emit forwards one decode increment: it streams the per-feed tokens the
|
||||
// realtime turn detector consumes (delta/eou/eob/words) and accumulates the
|
||||
// running transcript for the closing FinalResult. No segmentation or
|
||||
// boundary latch here — the live consumer reads only the streamed tokens
|
||||
// and the final Text; per-utterance segments and the terminal <EOU> flag
|
||||
// are an offline-path concern (see AudioTranscriptionStream / boundary.go).
|
||||
emit := func(r streamFeedResult) error {
|
||||
if r.Delta != "" {
|
||||
full.WriteString(r.Delta)
|
||||
}
|
||||
if r.Delta != "" || r.Eou || r.Eob || len(r.Words) > 0 {
|
||||
out <- &pb.TranscriptLiveResponse{
|
||||
Delta: r.Delta,
|
||||
Eou: r.Eou,
|
||||
Eob: r.Eob,
|
||||
Words: liveWordsToProto(r.Words),
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
for req := range in {
|
||||
switch payload := req.GetPayload().(type) {
|
||||
case *pb.TranscriptLiveRequest_Config:
|
||||
if err := validateLiveConfig(payload.Config); err != nil {
|
||||
return err
|
||||
}
|
||||
// Reset: a fresh decode session, dropping accumulated state.
|
||||
p.streamFree(stream)
|
||||
stream, err = p.streamBegin(payload.Config.GetLanguage())
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if stream == 0 {
|
||||
return grpcerrors.LiveTranscriptionUnsupported("parakeet-cpp",
|
||||
"loaded model is not a cache-aware streaming model")
|
||||
}
|
||||
full.Reset()
|
||||
fedSecs = 0
|
||||
case *pb.TranscriptLiveRequest_Audio:
|
||||
pcm := payload.Audio.GetPcm()
|
||||
audioSec := float64(len(pcm)) / liveSampleRate
|
||||
fedSecs += audioSec
|
||||
start := time.Now()
|
||||
// nil ctx: a live session is bounded by this request channel, not a
|
||||
// context — cancellation is the caller closing the stream.
|
||||
if err := p.feedSlices(nil, stream, pcm, emit); err != nil {
|
||||
return err
|
||||
}
|
||||
wallSec := time.Since(start).Seconds()
|
||||
behindSec += wallSec - audioSec
|
||||
if behindSec < 0 {
|
||||
behindSec = 0
|
||||
}
|
||||
xlog.Debug("parakeet-cpp: live feed",
|
||||
"audio_ms", int(audioSec*1000), "wall_ms", int(wallSec*1000),
|
||||
"behind_ms", int(behindSec*1000), "fed_s", fedSecs)
|
||||
if behindSec > 1 && !behindWarned {
|
||||
behindWarned = true
|
||||
xlog.Warn("parakeet-cpp: live decode is falling behind real time; "+
|
||||
"end-of-utterance signals will arrive late",
|
||||
"behind_s", behindSec, "fed_s", fedSecs)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Send side closed: flush the streaming tail and emit the final transcript.
|
||||
// The live FinalResult carries only Text — the authoritative full-turn
|
||||
// transcript the realtime core commits. Per-utterance segments, duration,
|
||||
// and the terminal <EOU> flag are not produced on the live path.
|
||||
if err := p.flushTail(stream, emit); err != nil {
|
||||
return err
|
||||
}
|
||||
out <- &pb.TranscriptLiveResponse{
|
||||
FinalResult: &pb.TranscriptResult{Text: strings.TrimSpace(full.String())},
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func validateLiveConfig(cfg *pb.TranscriptLiveConfig) error {
|
||||
if sr := cfg.GetSampleRate(); sr != 0 && sr != liveSampleRate {
|
||||
return status.Errorf(codes.InvalidArgument,
|
||||
"parakeet-cpp: unsupported live sample_rate %d (only %d)", sr, liveSampleRate)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func liveWordsToProto(words []transcriptWord) []*pb.TranscriptWord {
|
||||
if len(words) == 0 {
|
||||
return nil
|
||||
}
|
||||
out := make([]*pb.TranscriptWord, len(words))
|
||||
for i, w := range words {
|
||||
out[i] = &pb.TranscriptWord{
|
||||
Start: secondsToNanos(w.Start),
|
||||
End: secondsToNanos(w.End),
|
||||
Text: w.W,
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
@@ -1,417 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"sync"
|
||||
"time"
|
||||
"unsafe"
|
||||
|
||||
"github.com/mudler/LocalAI/pkg/grpc/grpcerrors"
|
||||
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
"google.golang.org/grpc/codes"
|
||||
"google.golang.org/grpc/status"
|
||||
)
|
||||
|
||||
// The live-RPC specs drive AudioTranscriptionLive entirely against stubbed
|
||||
// Cpp* package vars (the same seam batcher_test.go uses), so they run
|
||||
// without libparakeet.so.
|
||||
|
||||
// liveCstrPool produces NUL-terminated, C-style strings that live in Go
// memory. Every buffer it hands out is retained in bufs, so the backing array
// stays reachable for as long as the pool itself is (goStringFromCPtr reads
// through the raw address, which the GC does not track as a reference).
type liveCstrPool struct {
	mu   sync.Mutex // guards bufs
	bufs [][]byte   // every buffer handed out so far, kept alive by the pool
}

// cstr copies s into a fresh buffer with a trailing NUL byte, records the
// buffer in the pool and returns the address of its first byte as a uintptr.
func (p *liveCstrPool) cstr(s string) uintptr {
	// len(s)+1 bytes: the final byte stays zero and acts as the terminator.
	buf := make([]byte, len(s)+1)
	copy(buf, s)

	p.mu.Lock()
	p.bufs = append(p.bufs, buf)
	p.mu.Unlock()

	return uintptr(unsafe.Pointer(&buf[0]))
}
|
||||
|
||||
// liveStubs swaps every C entry point the live path touches and returns a
|
||||
// restore func for AfterEach.
|
||||
func liveStubs() (restore func()) {
|
||||
savedBegin, savedBeginLang := CppStreamBegin, CppStreamBeginLang
|
||||
savedFeed, savedFeedJSON := CppStreamFeed, CppStreamFeedJSON
|
||||
savedFinalize, savedFinalizeJSON := CppStreamFinalize, CppStreamFinalizeJSON
|
||||
savedFree, savedLastError := CppStreamFree, CppLastError
|
||||
savedFreeString := CppFreeString
|
||||
return func() {
|
||||
CppStreamBegin, CppStreamBeginLang = savedBegin, savedBeginLang
|
||||
CppStreamFeed, CppStreamFeedJSON = savedFeed, savedFeedJSON
|
||||
CppStreamFinalize, CppStreamFinalizeJSON = savedFinalize, savedFinalizeJSON
|
||||
CppStreamFree, CppLastError = savedFree, savedLastError
|
||||
CppFreeString = savedFreeString
|
||||
}
|
||||
}
|
||||
|
||||
// runLive starts the RPC on its own goroutine and returns the request
|
||||
// channel plus a collector for everything the backend emitted.
|
||||
func runLive(p *ParakeetCpp) (chan *pb.TranscriptLiveRequest, chan *pb.TranscriptLiveResponse, chan error) {
|
||||
in := make(chan *pb.TranscriptLiveRequest)
|
||||
out := make(chan *pb.TranscriptLiveResponse, 32)
|
||||
errCh := make(chan error, 1)
|
||||
go func() { errCh <- p.AudioTranscriptionLive(in, out) }()
|
||||
return in, out, errCh
|
||||
}
|
||||
|
||||
func liveConfig(lang string) *pb.TranscriptLiveRequest {
|
||||
return &pb.TranscriptLiveRequest{
|
||||
Payload: &pb.TranscriptLiveRequest_Config{Config: &pb.TranscriptLiveConfig{Language: lang}},
|
||||
}
|
||||
}
|
||||
|
||||
func liveAudio(pcm []float32) *pb.TranscriptLiveRequest {
|
||||
return &pb.TranscriptLiveRequest{
|
||||
Payload: &pb.TranscriptLiveRequest_Audio{Audio: &pb.TranscriptLiveAudio{Pcm: pcm}},
|
||||
}
|
||||
}
|
||||
|
||||
func collectLive(out chan *pb.TranscriptLiveResponse) []*pb.TranscriptLiveResponse {
|
||||
var got []*pb.TranscriptLiveResponse
|
||||
for r := range out {
|
||||
got = append(got, r)
|
||||
}
|
||||
return got
|
||||
}
|
||||
|
||||
var _ = Describe("AudioTranscriptionLive (stubbed C API)", func() {
|
||||
var (
|
||||
pool *liveCstrPool
|
||||
restore func()
|
||||
p *ParakeetCpp
|
||||
)
|
||||
|
||||
BeforeEach(func() {
|
||||
pool = &liveCstrPool{}
|
||||
restore = liveStubs()
|
||||
p = &ParakeetCpp{ctxPtr: 1}
|
||||
|
||||
CppStreamBeginLang = nil
|
||||
CppStreamBegin = func(ctx uintptr) uintptr { return 7 }
|
||||
CppStreamFree = func(s uintptr) {}
|
||||
CppFreeString = func(s uintptr) {}
|
||||
CppLastError = func(ctx uintptr) string { return "stub error" }
|
||||
CppStreamFeed = nil
|
||||
CppStreamFeedJSON = nil
|
||||
CppStreamFinalize = nil
|
||||
CppStreamFinalizeJSON = nil
|
||||
})
|
||||
|
||||
AfterEach(func() { restore() })
|
||||
|
||||
It("rejects a stream whose first message is not a config", func() {
	// Audio arrives before any config: the RPC must fail with
	// InvalidArgument and emit nothing.
	reqs, resps, done := runLive(p)
	reqs <- liveAudio([]float32{0.1})
	close(reqs)

	rpcErr := <-done
	Expect(status.Code(rpcErr)).To(Equal(codes.InvalidArgument))
	Expect(collectLive(resps)).To(BeEmpty())
})
|
||||
|
||||
It("rejects a non-16k sample rate", func() {
	// A config that names an 8 kHz sample rate must be refused.
	badRate := &pb.TranscriptLiveRequest{
		Payload: &pb.TranscriptLiveRequest_Config{
			Config: &pb.TranscriptLiveConfig{SampleRate: 8000},
		},
	}

	reqs, _, done := runLive(p)
	reqs <- badRate
	close(reqs)
	Expect(status.Code(<-done)).To(Equal(codes.InvalidArgument))
})
|
||||
|
||||
It("returns the typed Unimplemented signal for non-streaming models, before any ack", func() {
	// A zero session handle from stream-begin is how the stub models a
	// loaded model that cannot stream.
	CppStreamBegin = func(ctx uintptr) uintptr { return 0 }

	reqs, resps, done := runLive(p)
	reqs <- liveConfig("")
	close(reqs)

	rpcErr := <-done
	Expect(grpcerrors.IsLiveTranscriptionUnsupported(rpcErr)).To(BeTrue())
	// No ready ack (or anything else) may have been emitted.
	Expect(collectLive(resps)).To(BeEmpty())
})
|
||||
|
||||
It("streams deltas, eou flags and words on the JSON path and finalizes on close", func() {
	// Canned JSON documents returned by the stubbed feed/finalize calls.
	const (
		firstFeed = `{"text":"hello ","eou":0,"frame_sec":0.08,` +
			`"words":[{"w":"hello","start":0.1,"end":0.4,"conf":0.9}]}`
		laterFeed = `{"text":"world","eou":1,"frame_sec":0.08,` +
			`"words":[{"w":"world","start":0.5,"end":0.8,"conf":0.9}]}`
		emptyTail = `{"text":"","eou":0,"frame_sec":0.08,"words":[]}`
	)

	// Record which session handles get freed.
	var freed []uintptr
	CppStreamFree = func(s uintptr) { freed = append(freed, s) }

	calls := 0
	CppStreamFeedJSON = func(_ uintptr, _ []float32, _ int32) uintptr {
		calls++
		if calls == 1 {
			return pool.cstr(firstFeed)
		}
		return pool.cstr(laterFeed)
	}
	CppStreamFinalizeJSON = func(_ uintptr) uintptr {
		return pool.cstr(emptyTail)
	}

	reqs, resps, done := runLive(p)
	reqs <- liveConfig("en")
	reqs <- liveAudio(make([]float32, 100))
	reqs <- liveAudio(make([]float32, 200))
	close(reqs)
	Expect(<-done).NotTo(HaveOccurred())

	got := collectLive(resps)
	Expect(got).To(HaveLen(4)) // ready, two deltas, final

	ready, first, second := got[0], got[1], got[2]
	Expect(ready.Ready).To(BeTrue())

	Expect(first.Delta).To(Equal("hello "))
	Expect(first.Eou).To(BeFalse())
	Expect(first.Words).To(HaveLen(1))
	Expect(first.Words[0].Text).To(Equal("hello"))

	Expect(second.Delta).To(Equal("world"))
	Expect(second.Eou).To(BeTrue())

	final := got[3].FinalResult
	Expect(final).NotTo(BeNil())
	Expect(final.Text).To(Equal("hello world"))
	// On the live path the FinalResult is Text-only: segments, duration and
	// the terminal eou flag belong to the offline path (see boundary.go /
	// AudioTranscriptionStream). The realtime core consumes the per-feed
	// responses asserted above plus this Text.
	Expect(final.Eou).To(BeFalse())
	Expect(final.Segments).To(BeEmpty())
	Expect(final.Duration).To(BeZero())

	// Exactly one free, of the handle the default CppStreamBegin stub returns.
	Expect(freed).To(Equal([]uintptr{7}))
})
|
||||
|
||||
It("falls back to the text feed (eou out-param) when the JSON entry points are absent", func() {
	// Only the text entry points are installed; the JSON ones stay nil from
	// BeforeEach. The second feed raises the eou out-param.
	calls := 0
	CppStreamFeed = func(_ uintptr, _ []float32, _ int32, eouOut unsafe.Pointer) uintptr {
		calls++
		if calls != 2 {
			return pool.cstr("first ")
		}
		flag := (*int32)(eouOut)
		*flag = 1
		return pool.cstr("done")
	}
	CppStreamFinalize = func(_ uintptr) uintptr { return pool.cstr("") }

	reqs, resps, done := runLive(p)
	reqs <- liveConfig("")
	for i := 0; i < 2; i++ {
		reqs <- liveAudio(make([]float32, 10))
	}
	close(reqs)
	Expect(<-done).NotTo(HaveOccurred())

	got := collectLive(resps)
	Expect(got).To(HaveLen(4))

	first, second, last := got[1], got[2], got[3]
	Expect(first.Delta).To(Equal("first "))
	Expect(first.Eou).To(BeFalse())
	Expect(second.Delta).To(Equal("done"))
	Expect(second.Eou).To(BeTrue())
	Expect(last.FinalResult.Text).To(Equal("first done"))
})
|
||||
|
||||
It("forwards <EOB> as eob — a backchannel, never an eou (ABI v5 JSON)", func() {
|
||||
feeds := 0
|
||||
CppStreamFeedJSON = func(s uintptr, pcm []float32, n int32) uintptr {
|
||||
feeds++
|
||||
if feeds == 1 {
|
||||
return pool.cstr(`{"text":"uh-huh","eou":0,"eob":1,"frame_sec":0.08,` +
|
||||
`"words":[{"w":"uh-huh","start":0.1,"end":0.3,"conf":0.9}]}`)
|
||||
}
|
||||
return pool.cstr(`{"text":"the turn","eou":1,"eob":0,"frame_sec":0.08,` +
|
||||
`"words":[{"w":"the","start":0.5,"end":0.6,"conf":0.9},{"w":"turn","start":0.6,"end":0.8,"conf":0.9}]}`)
|
||||
}
|
||||
CppStreamFinalizeJSON = func(s uintptr) uintptr {
|
||||
return pool.cstr(`{"text":"","eou":0,"eob":0,"frame_sec":0.08,"words":[]}`)
|
||||
}
|
||||
|
||||
in, out, errCh := runLive(p)
|
||||
in <- liveConfig("")
|
||||
in <- liveAudio(make([]float32, 10))
|
||||
in <- liveAudio(make([]float32, 10))
|
||||
close(in)
|
||||
Expect(<-errCh).NotTo(HaveOccurred())
|
||||
|
||||
got := collectLive(out)
|
||||
Expect(got).To(HaveLen(4))
|
||||
Expect(got[1].Eob).To(BeTrue())
|
||||
Expect(got[1].Eou).To(BeFalse(), "a backchannel must not masquerade as a turn boundary")
|
||||
Expect(got[2].Eou).To(BeTrue())
|
||||
})
|
||||
|
||||
It("maps the v5 eou_out bitmask on the text path (bit0 <EOU>, bit1 <EOB>)", func() {
	// The stub writes the raw bitmask through the out-param: 2 on the first
	// feed, 1 on every later one.
	calls := 0
	CppStreamFeed = func(_ uintptr, _ []float32, _ int32, eouOut unsafe.Pointer) uintptr {
		calls++
		mask := (*int32)(eouOut)
		if calls == 1 {
			*mask = 2 // <EOB> only
			return pool.cstr("uh-huh")
		}
		*mask = 1 // <EOU>
		return pool.cstr(" done")
	}
	CppStreamFinalize = func(_ uintptr) uintptr { return pool.cstr("") }

	reqs, resps, done := runLive(p)
	reqs <- liveConfig("")
	reqs <- liveAudio(make([]float32, 10))
	reqs <- liveAudio(make([]float32, 10))
	close(reqs)
	Expect(<-done).NotTo(HaveOccurred())

	got := collectLive(resps)
	Expect(got).To(HaveLen(4))

	first, second := got[1], got[2]
	Expect(first.Eob).To(BeTrue())
	Expect(first.Eou).To(BeFalse())
	Expect(second.Eou).To(BeTrue())
	Expect(second.Eob).To(BeFalse())
})
|
||||
|
||||
It("accumulates trailing text after an EOU into the final transcript", func() {
	// The first feed ends an utterance; the second adds more text without an
	// EOU. The final transcript must contain both pieces.
	feeds := 0
	CppStreamFeedJSON = func(s uintptr, pcm []float32, n int32) uintptr {
		feeds++
		if feeds == 1 {
			return pool.cstr(`{"text":"turn one","eou":1,"frame_sec":0.08,"words":[]}`)
		}
		return pool.cstr(`{"text":" and more","eou":0,"frame_sec":0.08,"words":[]}`)
	}
	CppStreamFinalizeJSON = func(s uintptr) uintptr {
		return pool.cstr(`{"text":"","eou":0,"frame_sec":0.08,"words":[]}`)
	}

	in, out, errCh := runLive(p)
	in <- liveConfig("")
	in <- liveAudio(make([]float32, 10))
	in <- liveAudio(make([]float32, 10))
	close(in)
	Expect(<-errCh).NotTo(HaveOccurred())

	got := collectLive(out)
	// Guard the index and the pointer so a regression reads as an assertion
	// failure instead of an index-out-of-range or nil-dereference panic.
	Expect(got).NotTo(BeEmpty(), "the live RPC emitted no responses")
	final := got[len(got)-1].FinalResult
	Expect(final).NotTo(BeNil(), "the last response must carry the FinalResult")
	Expect(final.Text).To(Equal("turn one and more"))
})
|
||||
|
||||
It("resets the decode session on a mid-stream config", func() {
	// Count session begins and frees; each begin returns a fresh handle
	// (11, then 12). Every feed yields the single delta "x".
	var begun, freed int
	CppStreamBegin = func(ctx uintptr) uintptr { begun++; return uintptr(10 + begun) }
	CppStreamFree = func(s uintptr) { freed++ }
	CppStreamFeedJSON = func(s uintptr, pcm []float32, n int32) uintptr {
		return pool.cstr(`{"text":"x","eou":0,"frame_sec":0.08,"words":[]}`)
	}
	CppStreamFinalizeJSON = func(s uintptr) uintptr {
		return pool.cstr(`{"text":"","eou":0,"frame_sec":0.08,"words":[]}`)
	}

	in, out, errCh := runLive(p)
	in <- liveConfig("")
	in <- liveAudio(make([]float32, 10))
	in <- liveConfig("") // reset
	in <- liveAudio(make([]float32, 10))
	close(in)
	Expect(<-errCh).NotTo(HaveOccurred())

	got := collectLive(out)
	// Guard the index and the pointer so a regression reads as an assertion
	// failure instead of an index-out-of-range or nil-dereference panic.
	Expect(got).NotTo(BeEmpty(), "the live RPC emitted no responses")
	final := got[len(got)-1].FinalResult
	Expect(final).NotTo(BeNil(), "the last response must carry the FinalResult")
	// Only the post-reset "x" survives: the first feed's text was discarded.
	Expect(final.Text).To(Equal("x"), "pre-reset transcript dropped")
	Expect(begun).To(Equal(2))
	Expect(freed).To(Equal(2), "old session freed on reset, new one on unwind")
})
|
||||
|
||||
It("does not hold engineMu between feeds (unary work interleaves with a live session)", func() {
|
||||
CppStreamFeedJSON = func(s uintptr, pcm []float32, n int32) uintptr {
|
||||
return pool.cstr(`{"text":"","eou":0,"frame_sec":0.08,"words":[]}`)
|
||||
}
|
||||
CppStreamFinalizeJSON = func(s uintptr) uintptr {
|
||||
return pool.cstr(`{"text":"","eou":0,"frame_sec":0.08,"words":[]}`)
|
||||
}
|
||||
|
||||
in, out, errCh := runLive(p)
|
||||
in <- liveConfig("")
|
||||
in <- liveAudio(make([]float32, 10))
|
||||
|
||||
// The session is open and idle between feeds: the engine lock must be
|
||||
// acquirable, which is what lets batched unary transcription proceed
|
||||
// mid-session. Under stream-lifetime locking this probe would block
|
||||
// until the stream ended and the Eventually would time out.
|
||||
locked := make(chan struct{})
|
||||
go func() {
|
||||
p.engineMu.Lock()
|
||||
p.engineMu.Unlock() //nolint:staticcheck // probe: acquire-release proves availability
|
||||
close(locked)
|
||||
}()
|
||||
Eventually(locked, time.Second).Should(BeClosed())
|
||||
|
||||
close(in)
|
||||
Expect(<-errCh).NotTo(HaveOccurred())
|
||||
collectLive(out)
|
||||
})
|
||||
|
||||
It("errors out and reads last_error under the lock when a feed fails", func() {
	// The feed stub returns 0; the RPC is expected to surface the text from
	// the CppLastError stub installed in BeforeEach.
	CppStreamFeedJSON = func(_ uintptr, _ []float32, _ int32) uintptr { return 0 }

	reqs, resps, done := runLive(p)
	reqs <- liveConfig("")
	reqs <- liveAudio(make([]float32, 10))

	rpcErr := <-done
	Expect(rpcErr).To(MatchError(ContainSubstring("stub error")))

	emitted := collectLive(resps)
	Expect(emitted).To(HaveLen(1)) // just the ready ack
	close(reqs)
})
|
||||
})
|
||||
|
||||
var _ = Describe("stripEouMarker", func() {
|
||||
It("strips a trailing <EOU> and reports it", func() {
|
||||
text, eou := stripEouMarker("it is certainly very like the old portrait<EOU>")
|
||||
Expect(text).To(Equal("it is certainly very like the old portrait"))
|
||||
Expect(eou).To(BeTrue())
|
||||
})
|
||||
|
||||
It("strips a trailing <EOB> WITHOUT reporting an utterance end", func() {
|
||||
// A decode ending on a backchannel must not confirm the
|
||||
// retranscribe gate — the user was acknowledging, not yielding.
|
||||
text, eou := stripEouMarker("uh-huh<EOB>")
|
||||
Expect(text).To(Equal("uh-huh"))
|
||||
Expect(eou).To(BeFalse())
|
||||
})
|
||||
|
||||
It("leaves marker-free text alone", func() {
|
||||
text, eou := stripEouMarker("plain transcript")
|
||||
Expect(text).To(Equal("plain transcript"))
|
||||
Expect(eou).To(BeFalse())
|
||||
})
|
||||
|
||||
It("does not strip a marker in the middle of the text", func() {
|
||||
text, eou := stripEouMarker("a<EOU>b")
|
||||
Expect(text).To(Equal("a<EOU>b"))
|
||||
Expect(eou).To(BeFalse())
|
||||
})
|
||||
})
|
||||
|
||||
var _ = Describe("transcriptResultFromDoc EOU handling", func() {
|
||||
It("strips the offline marker from text and sets the result flag", func() {
|
||||
doc := transcriptJSON{Text: "the old portrait<EOU>"}
|
||||
res := transcriptResultFromDoc(doc, &pb.TranscriptRequest{}, 0)
|
||||
Expect(res.Text).To(Equal("the old portrait"))
|
||||
Expect(res.Eou).To(BeTrue())
|
||||
Expect(res.Segments).To(HaveLen(1))
|
||||
Expect(res.Segments[0].Text).To(Equal("the old portrait"))
|
||||
})
|
||||
|
||||
It("reports eou=false for marker-free decodes", func() {
|
||||
doc := transcriptJSON{Text: "no marker here"}
|
||||
res := transcriptResultFromDoc(doc, &pb.TranscriptRequest{}, 0)
|
||||
Expect(res.Text).To(Equal("no marker here"))
|
||||
Expect(res.Eou).To(BeFalse())
|
||||
})
|
||||
})
|
||||
@@ -106,7 +106,7 @@ var _ = Describe("transcriptResultFromDoc (multi-segment)", func() {
|
||||
var _ = Describe("streaming segment assembly", func() {
|
||||
It("closes a segment with start/end from its words on EOU", func() {
|
||||
acc := &streamSegmenter{}
|
||||
acc.add(streamFeedResult{Delta: "hello world", Eou: true, Words: []transcriptWord{
|
||||
acc.add(streamFeedJSON{Text: "hello world", Eou: 1, Words: []transcriptWord{
|
||||
{W: "hello", Start: 0.0, End: 0.4}, {W: "world", Start: 0.4, End: 0.9},
|
||||
}})
|
||||
segs := acc.segments()
|
||||
@@ -118,9 +118,9 @@ var _ = Describe("streaming segment assembly", func() {
|
||||
|
||||
It("buffers words across feeds until EOU", func() {
|
||||
acc := &streamSegmenter{}
|
||||
acc.add(streamFeedResult{Delta: "hi", Words: []transcriptWord{{W: "hi", Start: 0, End: 0.3}}})
|
||||
acc.add(streamFeedJSON{Text: "hi", Eou: 0, Words: []transcriptWord{{W: "hi", Start: 0, End: 0.3}}})
|
||||
Expect(acc.segments()).To(BeEmpty())
|
||||
acc.add(streamFeedResult{Delta: "there", Eou: true, Words: []transcriptWord{{W: "there", Start: 0.3, End: 0.7}}})
|
||||
acc.add(streamFeedJSON{Text: "there", Eou: 1, Words: []transcriptWord{{W: "there", Start: 0.3, End: 0.7}}})
|
||||
Expect(acc.segments()).To(HaveLen(1))
|
||||
Expect(acc.segments()[0].Text).To(Equal("hi there"))
|
||||
})
|
||||
@@ -129,7 +129,7 @@ var _ = Describe("streaming segment assembly", func() {
|
||||
// field; a backchannel must still close the segment as it did in v4.
|
||||
It("closes a segment on EOB (backchannel) too", func() {
|
||||
acc := &streamSegmenter{}
|
||||
acc.add(streamFeedResult{Delta: "uh huh", Eob: true, Words: []transcriptWord{
|
||||
acc.add(streamFeedJSON{Text: "uh huh", Eou: 0, Eob: 1, Words: []transcriptWord{
|
||||
{W: "uh", Start: 0.0, End: 0.2}, {W: "huh", Start: 0.2, End: 0.5},
|
||||
}})
|
||||
segs := acc.segments()
|
||||
@@ -137,18 +137,4 @@ var _ = Describe("streaming segment assembly", func() {
|
||||
Expect(segs[0].Text).To(Equal("uh huh"))
|
||||
Expect(segs[0].End).To(Equal(secondsToNanos(0.5)))
|
||||
})
|
||||
|
||||
// Older text-only libparakeet.so: no per-word timings, so a segment is cut
|
||||
// from the delta text on each <EOU>/<EOB> (no timestamps), one per utterance.
|
||||
It("falls back to text segments when the feed carries no words", func() {
|
||||
acc := &streamSegmenter{}
|
||||
acc.add(streamFeedResult{Delta: "first turn", Eou: true})
|
||||
acc.add(streamFeedResult{Delta: "second turn", Eou: true})
|
||||
segs := acc.segments()
|
||||
Expect(segs).To(HaveLen(2))
|
||||
Expect(segs[0].Text).To(Equal("first turn"))
|
||||
Expect(segs[1].Text).To(Equal("second turn"))
|
||||
Expect(segs[0].Start).To(Equal(int64(0)), "no per-word timing on the text path")
|
||||
Expect(segs[0].End).To(Equal(int64(0)))
|
||||
})
|
||||
})
|
||||
|
||||
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
|
||||
|
||||
# stablediffusion.cpp (ggml)
|
||||
STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
|
||||
STABLEDIFFUSION_GGML_VERSION?=3b6c9ca97cfcda8e68e719e6670d06379fcbe943
|
||||
STABLEDIFFUSION_GGML_VERSION?=9956436c925a367daeab097598b1ea1f32d3503f
|
||||
|
||||
CMAKE_ARGS+=-DGGML_MAX_NAME=128
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# voice-detect backend Makefile.
|
||||
#
|
||||
# Upstream pin lives below as VOICEDETECT_VERSION?=1db1759572c90faef6f3a78c36b5941a096a9f89
|
||||
# Upstream pin lives below as VOICEDETECT_VERSION?=3d51077... (.github/bump_deps.sh
|
||||
# can find and update it - matches the parakeet.cpp / whisper.cpp / ds4 convention).
|
||||
#
|
||||
# Local dev shortcut: if you already have an out-of-tree voice-detect.cpp build,
|
||||
@@ -13,7 +13,7 @@
|
||||
# The default target below does the proper clone-at-pin + cmake build so CI does
|
||||
# not need a side-checkout.
|
||||
|
||||
VOICEDETECT_VERSION?=1db1759572c90faef6f3a78c36b5941a096a9f89
|
||||
VOICEDETECT_VERSION?=3d510772357538c5182808ac7de2278b84824e24
|
||||
VOICEDETECT_REPO?=https://github.com/mudler/voice-detect.cpp
|
||||
|
||||
GOCMD?=go
|
||||
|
||||
@@ -72,6 +72,37 @@
|
||||
nvidia-cuda-12: "cuda12-turboquant"
|
||||
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-turboquant"
|
||||
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-turboquant"
|
||||
- &llamacpplocalaipaged
|
||||
name: "llama-cpp-localai-paged"
|
||||
alias: "llama-cpp-localai-paged"
|
||||
license: mit
|
||||
icon: https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png
|
||||
description: |
|
||||
LocalAI's paged-attention llama.cpp variant: on-demand paged KV cache plus a
|
||||
decode-first prefill budget. The SAME upstream llama.cpp grpc-server as the
|
||||
stock llama-cpp backend, with the LocalAI paged patch series applied
|
||||
(vendored in this backend). Tuned for NVFP4 dense / MoE on Blackwell / GB10. Reuses the
|
||||
llama-cpp gRPC server sources; the paged engine is gated at runtime by the
|
||||
paged_kv / max_batch_tokens model options.
|
||||
urls:
|
||||
- https://github.com/ggerganov/llama.cpp
|
||||
tags:
|
||||
- text-to-text
|
||||
- LLM
|
||||
- GPU
|
||||
- CUDA
|
||||
- paged-attention
|
||||
- nvfp4
|
||||
# CUDA-only: the paged patchset's wins (GDN fusions, NVFP4 FP4-MMA) are
|
||||
# CUDA/Blackwell-specific; off-CUDA they gate off and the backend is
|
||||
# neutral-to-negative, so non-CUDA users should use the stock llama-cpp
|
||||
# backend. default points at cuda13 (mirrors faster-qwen3-tts) so the gallery
|
||||
# entries always resolve to a CUDA variant.
|
||||
capabilities:
|
||||
default: "cuda13-llama-cpp-localai-paged"
|
||||
nvidia: "cuda13-llama-cpp-localai-paged"
|
||||
nvidia-cuda-13: "cuda13-llama-cpp-localai-paged"
|
||||
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-llama-cpp-localai-paged"
|
||||
- &ds4
|
||||
name: "ds4"
|
||||
alias: "ds4"
|
||||
@@ -1710,6 +1741,13 @@
|
||||
nvidia-cuda-12: "cuda12-turboquant-development"
|
||||
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-turboquant-development"
|
||||
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-turboquant-development"
|
||||
- !!merge <<: *llamacpplocalaipaged
|
||||
name: "llama-cpp-localai-paged-development"
|
||||
capabilities:
|
||||
default: "cuda13-llama-cpp-localai-paged-development"
|
||||
nvidia: "cuda13-llama-cpp-localai-paged-development"
|
||||
nvidia-cuda-13: "cuda13-llama-cpp-localai-paged-development"
|
||||
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-llama-cpp-localai-paged-development"
|
||||
- !!merge <<: *ds4
|
||||
name: "ds4-development"
|
||||
capabilities:
|
||||
@@ -2378,6 +2416,27 @@
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-turboquant"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-turboquant
|
||||
## llama-cpp-localai-paged (CUDA-only; see backend/cpp/llama-cpp-localai-paged/README.md section 4c)
|
||||
- !!merge <<: *llamacpplocalaipaged
|
||||
name: "cuda13-llama-cpp-localai-paged"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-llama-cpp-localai-paged"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-gpu-nvidia-cuda-13-llama-cpp-localai-paged
|
||||
- !!merge <<: *llamacpplocalaipaged
|
||||
name: "cuda13-llama-cpp-localai-paged-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-llama-cpp-localai-paged"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-gpu-nvidia-cuda-13-llama-cpp-localai-paged
|
||||
- !!merge <<: *llamacpplocalaipaged
|
||||
name: "cuda13-nvidia-l4t-arm64-llama-cpp-localai-paged"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-llama-cpp-localai-paged"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-llama-cpp-localai-paged
|
||||
- !!merge <<: *llamacpplocalaipaged
|
||||
name: "cuda13-nvidia-l4t-arm64-llama-cpp-localai-paged-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-llama-cpp-localai-paged"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-llama-cpp-localai-paged
|
||||
## ds4
|
||||
- !!merge <<: *ds4
|
||||
name: "cpu-ds4"
|
||||
|
||||
@@ -13,17 +13,6 @@ fi
|
||||
# fish-speech uses pyrootutils which requires a .project-root marker
|
||||
touch "${backend_dir}/.project-root"
|
||||
|
||||
# On darwin arm64 the transitive `tokenizers` dep compiles its Rust extension
|
||||
# from source (Linux uses prebuilt manylinux wheels, so it never compiles
|
||||
# there). The pinned tokenizers crate that fish-speech's stack resolves to
|
||||
# contains a `&T` -> `&mut T` cast that trips the now-deny-by-default
|
||||
# `invalid_reference_casting` lint in the macOS runner's newer Rust toolchain,
|
||||
# breaking the build (seen in the v4.5.5 release CI fish-speech darwin/metal
|
||||
# job). Allow that lint so the unchanged third-party crate compiles as before.
|
||||
# Append rather than clobber any pre-existing RUSTFLAGS; harmless on Linux
|
||||
# where no Rust compile happens.
|
||||
export RUSTFLAGS="${RUSTFLAGS:-} -A invalid_reference_casting"
|
||||
|
||||
installRequirements
|
||||
|
||||
# Clone fish-speech source (the pip package doesn't include inference modules)
|
||||
|
||||
@@ -3,5 +3,4 @@ protobuf
|
||||
certifi
|
||||
packaging==24.1
|
||||
pip
|
||||
chardet
|
||||
click
|
||||
chardet
|
||||
@@ -147,25 +147,9 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
d["reasoning_content"] = msg.reasoning_content
|
||||
if msg.tool_calls:
|
||||
try:
|
||||
tool_calls = json.loads(msg.tool_calls)
|
||||
d["tool_calls"] = json.loads(msg.tool_calls)
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
else:
|
||||
# OpenAI wire format carries function.arguments as a
|
||||
# JSON-encoded string, but chat templates (e.g. Qwen3)
|
||||
# iterate over it as a mapping. The vllm backend
|
||||
# already parses arguments before applying the chat
|
||||
# template (PR #10256); mirror that here so the
|
||||
# sglang backend works with the same wire format.
|
||||
if isinstance(tool_calls, list):
|
||||
for tc in tool_calls:
|
||||
func = tc.get("function") if isinstance(tc, dict) else None
|
||||
if isinstance(func, dict) and isinstance(func.get("arguments"), str):
|
||||
try:
|
||||
func["arguments"] = json.loads(func["arguments"])
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
d["tool_calls"] = tool_calls
|
||||
result.append(d)
|
||||
return result
|
||||
|
||||
|
||||
@@ -104,7 +104,7 @@ if [ "$(uname -s)" = "Darwin" ]; then
|
||||
# can rewrite it. Darwin therefore follows vllm-metal and can lag the Linux
|
||||
# vllm pin (requirements-cublas13-after.txt, bumped independently against
|
||||
# vllm/vllm) until vllm-metal supports a newer vLLM.
|
||||
VLLM_METAL_VERSION="v0.3.0.dev20260628073537"
|
||||
VLLM_METAL_VERSION="v0.3.0.dev20260622062346"
|
||||
|
||||
# The coupled vLLM source version is whatever this vllm-metal release builds
|
||||
# against -- it declares it in its own installer as `vllm_v=`. Derive it from
|
||||
|
||||
@@ -429,7 +429,7 @@ func (l *Launcher) CheckForUpdates() (bool, string, error) {
|
||||
}
|
||||
|
||||
// DownloadUpdate downloads the latest version
|
||||
func (l *Launcher) DownloadUpdate(version string, progressCallback func(downloaded, total int64)) error {
|
||||
func (l *Launcher) DownloadUpdate(version string, progressCallback func(float64)) error {
|
||||
return l.releaseManager.DownloadRelease(version, progressCallback)
|
||||
}
|
||||
|
||||
@@ -486,6 +486,7 @@ func (l *Launcher) showDownloadLocalAIDialog() {
|
||||
fyne.DoAndWait(func() {
|
||||
// Create a standalone window for the download dialog
|
||||
dialogWindow := l.app.NewWindow("LocalAI Installation Required")
|
||||
dialogWindow.Resize(fyne.NewSize(500, 350))
|
||||
dialogWindow.CenterOnScreen()
|
||||
dialogWindow.SetCloseIntercept(func() {
|
||||
dialogWindow.Close()
|
||||
@@ -547,7 +548,6 @@ func (l *Launcher) showDownloadLocalAIDialog() {
|
||||
)
|
||||
|
||||
dialogWindow.SetContent(content)
|
||||
resizeToContent(dialogWindow, content)
|
||||
dialogWindow.Show()
|
||||
})
|
||||
}
|
||||
@@ -621,134 +621,88 @@ func (l *Launcher) showDownloadError(title, message string) {
|
||||
}
|
||||
|
||||
// showDownloadProgress shows a standalone progress window for downloading LocalAI
|
||||
// after a fresh install (no LocalAI binary present yet).
|
||||
func (l *Launcher) showDownloadProgress(version, title string) {
|
||||
l.showDownloadProgressWindow(version, title, func(win fyne.Window) {
|
||||
dialog.ShowConfirm("Installation Complete",
|
||||
"LocalAI has been downloaded and installed successfully. You can now start LocalAI from the launcher.",
|
||||
func(bool) {
|
||||
win.Close()
|
||||
l.updateStatus("LocalAI installed successfully")
|
||||
if l.systray != nil {
|
||||
l.systray.recreateMenu()
|
||||
}
|
||||
}, win)
|
||||
})
|
||||
}
|
||||
|
||||
// showDownloadProgressWindow renders the download progress popup shared by every
|
||||
// "download/upgrade LocalAI" entry point. It owns the progress bar, the
|
||||
// human-readable byte readout, resume-aware retry, and content-fit window
|
||||
// sizing so the behaviour stays identical everywhere. onSuccess runs (on the UI
|
||||
// goroutine) once the download verifies, and is responsible for the success
|
||||
// dialog and any follow-up; the window is passed in so it can be parented/closed.
|
||||
func (l *Launcher) showDownloadProgressWindow(version, title string, onSuccess func(win fyne.Window)) {
|
||||
fyne.DoAndWait(func() {
|
||||
// Create progress window
|
||||
progressWindow := l.app.NewWindow("Downloading LocalAI")
|
||||
progressWindow.Resize(fyne.NewSize(400, 250))
|
||||
progressWindow.CenterOnScreen()
|
||||
progressWindow.SetCloseIntercept(func() {
|
||||
progressWindow.Close()
|
||||
})
|
||||
|
||||
// Progress bar
|
||||
progressBar := widget.NewProgressBar()
|
||||
progressBar.SetValue(0)
|
||||
|
||||
// Status label. Truncate with an ellipsis so a long "Download failed:
|
||||
// <url>" message can't stretch the window (and progress bar) to fit the
|
||||
// whole error on one line.
|
||||
// whole error on one line; the full error is shown in the dialog below.
|
||||
statusLabel := widget.NewLabel("Preparing download...")
|
||||
statusLabel.Truncation = fyne.TextTruncateEllipsis
|
||||
|
||||
// Release notes button
|
||||
releaseNotesButton := widget.NewButton("View Release Notes", func() {
|
||||
releaseNotesURL, err := l.githubReleaseNotesURL(version)
|
||||
if err != nil {
|
||||
log.Printf("Failed to parse URL: %v", err)
|
||||
return
|
||||
}
|
||||
|
||||
l.app.OpenURL(releaseNotesURL)
|
||||
})
|
||||
|
||||
// Retry button: hidden until a download fails. GitHub downloads are
|
||||
// flaky, and the underlying download resumes from the partial file, so
|
||||
// a retry continues where it left off rather than starting over.
|
||||
retryButton := widget.NewButton("Retry", nil)
|
||||
retryButton.Importance = widget.HighImportance
|
||||
retryButton.Hide()
|
||||
|
||||
buttonRow := container.NewHBox(releaseNotesButton, retryButton)
|
||||
content := container.NewVBox(
|
||||
// Progress container
|
||||
progressContainer := container.NewVBox(
|
||||
widget.NewLabel(title),
|
||||
progressBar,
|
||||
statusLabel,
|
||||
widget.NewSeparator(),
|
||||
buttonRow,
|
||||
releaseNotesButton,
|
||||
)
|
||||
progressWindow.SetContent(content)
|
||||
resizeToContent(progressWindow, content)
|
||||
|
||||
var startDownload func()
|
||||
startDownload = func() {
|
||||
retryButton.Hide()
|
||||
progressBar.SetValue(0)
|
||||
statusLabel.SetText("Preparing download...")
|
||||
resizeToContent(progressWindow, content)
|
||||
|
||||
go func() {
|
||||
err := l.DownloadUpdate(version, func(downloaded, total int64) {
|
||||
fyne.Do(func() {
|
||||
if total > 0 {
|
||||
progressBar.SetValue(float64(downloaded) / float64(total))
|
||||
statusLabel.SetText(fmt.Sprintf("Downloading… %s / %s", formatBytes(downloaded), formatBytes(total)))
|
||||
} else {
|
||||
statusLabel.SetText(fmt.Sprintf("Downloading… %s", formatBytes(downloaded)))
|
||||
}
|
||||
})
|
||||
})
|
||||
|
||||
fyne.Do(func() {
|
||||
if err != nil {
|
||||
statusLabel.SetText(fmt.Sprintf("Download failed: %v", err))
|
||||
retryButton.Show()
|
||||
resizeToContent(progressWindow, content)
|
||||
return
|
||||
}
|
||||
progressBar.SetValue(1.0)
|
||||
statusLabel.SetText("Download complete")
|
||||
onSuccess(progressWindow)
|
||||
})
|
||||
}()
|
||||
}
|
||||
retryButton.OnTapped = startDownload
|
||||
|
||||
progressWindow.SetContent(progressContainer)
|
||||
progressWindow.Show()
|
||||
startDownload()
|
||||
|
||||
// Start download in background
|
||||
go func() {
|
||||
err := l.DownloadUpdate(version, func(progress float64) {
|
||||
// Update progress bar
|
||||
fyne.Do(func() {
|
||||
progressBar.SetValue(progress)
|
||||
percentage := int(progress * 100)
|
||||
statusLabel.SetText(fmt.Sprintf("Downloading... %d%%", percentage))
|
||||
})
|
||||
})
|
||||
|
||||
// Handle completion
|
||||
fyne.Do(func() {
|
||||
if err != nil {
|
||||
statusLabel.SetText(fmt.Sprintf("Download failed: %v", err))
|
||||
// Show error dialog
|
||||
dialog.ShowError(err, progressWindow)
|
||||
} else {
|
||||
statusLabel.SetText("Download completed successfully!")
|
||||
progressBar.SetValue(1.0)
|
||||
|
||||
// Show success dialog
|
||||
dialog.ShowConfirm("Installation Complete",
|
||||
"LocalAI has been downloaded and installed successfully. You can now start LocalAI from the launcher.",
|
||||
func(close bool) {
|
||||
progressWindow.Close()
|
||||
// Update status and refresh systray menu
|
||||
l.updateStatus("LocalAI installed successfully")
|
||||
|
||||
if l.systray != nil {
|
||||
l.systray.recreateMenu()
|
||||
}
|
||||
}, progressWindow)
|
||||
}
|
||||
})
|
||||
}()
|
||||
})
|
||||
}
|
||||
|
||||
// resizeToContent sizes a window to fit its content (with a sane minimum width)
|
||||
// so the dialog doesn't show a large blank gap below the last widget.
|
||||
func resizeToContent(w fyne.Window, content fyne.CanvasObject) {
|
||||
size := content.MinSize()
|
||||
if size.Width < 400 {
|
||||
size.Width = 400
|
||||
}
|
||||
w.Resize(size)
|
||||
}
|
||||
|
||||
// formatBytes renders a byte count as a human-readable size (e.g. "12.3 MB").
//
// Units are binary multiples (1 KB = 1024 B) and the value is printed with one
// decimal place. Counts below 1024, including zero and negative values, are
// printed as a plain integer followed by "B".
func formatBytes(b int64) string {
	const unit = 1024
	const prefixes = "KMGTPE"

	if b < unit {
		return fmt.Sprintf("%d B", b)
	}

	// Find the largest power of 1024 that does not exceed b.
	div, exp := int64(unit), 0
	for n := b / unit; n >= unit; n /= unit {
		div *= unit
		exp++
	}

	value := float64(b) / float64(div)
	// A value just below the next unit (e.g. 1048575 bytes) rounds to "1024.0"
	// at one decimal place; promote it so the readout says "1.0 MB" instead of
	// "1024.0 KB". The largest prefix has nothing to be promoted to.
	if exp < len(prefixes)-1 && fmt.Sprintf("%.1f", value) == "1024.0" {
		value /= unit
		exp++
	}
	return fmt.Sprintf("%.1f %cB", value, prefixes[exp])
}
|
||||
|
||||
// monitorLogs monitors the output of LocalAI and adds it to the log buffer
|
||||
func (l *Launcher) monitorLogs(reader io.Reader, prefix string) {
|
||||
scanner := bufio.NewScanner(reader)
|
||||
|
||||
@@ -11,7 +11,6 @@ import (
|
||||
"net/http"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"strings"
|
||||
@@ -51,12 +50,6 @@ type ReleaseManager struct {
|
||||
ChecksumsPath string
|
||||
// MetadataPath is where version metadata is stored
|
||||
MetadataPath string
|
||||
// BaseDownloadURL is the base URL release assets are downloaded from
|
||||
// (defaults to https://github.com; overridable for testing)
|
||||
BaseDownloadURL string
|
||||
// RetryBackoff is the base wait between download attempts; the Nth retry
|
||||
// waits N*RetryBackoff (defaults to 1s; lowered in tests)
|
||||
RetryBackoff time.Duration
|
||||
// HTTPClient is the HTTP client used for downloads
|
||||
HTTPClient *http.Client
|
||||
}
|
||||
@@ -69,94 +62,28 @@ func NewReleaseManager() *ReleaseManager {
|
||||
metadataPath := filepath.Join(homeDir, ".localai", "metadata")
|
||||
|
||||
return &ReleaseManager{
|
||||
GitHubOwner: "mudler",
|
||||
GitHubRepo: "LocalAI",
|
||||
BinaryPath: binaryPath,
|
||||
CurrentVersion: internal.PrintableVersion(),
|
||||
ChecksumsPath: checksumsPath,
|
||||
MetadataPath: metadataPath,
|
||||
BaseDownloadURL: "https://github.com",
|
||||
RetryBackoff: 1 * time.Second,
|
||||
HTTPClient: httpclient.NewWithTimeout(30*time.Second, httpclient.WithFollowRedirects()),
|
||||
GitHubOwner: "mudler",
|
||||
GitHubRepo: "LocalAI",
|
||||
BinaryPath: binaryPath,
|
||||
CurrentVersion: internal.PrintableVersion(),
|
||||
ChecksumsPath: checksumsPath,
|
||||
MetadataPath: metadataPath,
|
||||
HTTPClient: httpclient.NewWithTimeout(30*time.Second, httpclient.WithFollowRedirects()),
|
||||
}
|
||||
}
|
||||
|
||||
// GetLatestRelease resolves the latest LocalAI release.
|
||||
//
|
||||
// It first follows the github.com "releases/latest" redirect, which reveals the
|
||||
// latest tag in the final URL and—crucially—is NOT subject to the
|
||||
// 60-requests/hour unauthenticated rate limit of api.github.com. That limit is
|
||||
// per-IP, so on shared/NAT/CGNAT/cloud addresses the API returns 403 almost
|
||||
// immediately (e.g. on a fresh install with no LocalAI present yet). The
|
||||
// redirect avoids that entirely. The richer JSON API is kept only as a fallback.
|
||||
//
|
||||
// Only the version is consumed by callers, so the redirect's tag is sufficient.
|
||||
// GetLatestRelease fetches the latest release information from GitHub
|
||||
func (rm *ReleaseManager) GetLatestRelease() (*Release, error) {
|
||||
version, redirectErr := rm.latestVersionFromRedirect()
|
||||
if redirectErr == nil {
|
||||
return &Release{Version: version}, nil
|
||||
}
|
||||
log.Printf("Could not resolve latest version via release redirect (%v); falling back to GitHub API", redirectErr)
|
||||
|
||||
release, apiErr := rm.latestReleaseFromAPI()
|
||||
if apiErr != nil {
|
||||
// Surface both failures so a rate-limited API doesn't mask the (usually
|
||||
// more relevant) redirect error.
|
||||
return nil, fmt.Errorf("failed to fetch latest release: %v (redirect: %v)", apiErr, redirectErr)
|
||||
}
|
||||
return release, nil
|
||||
}
|
||||
|
||||
// latestVersionFromRedirect returns the latest tag by following the github.com
|
||||
// "releases/latest" redirect to ".../releases/tag/<tag>".
|
||||
func (rm *ReleaseManager) latestVersionFromRedirect() (string, error) {
|
||||
url := fmt.Sprintf("%s/%s/%s/releases/latest", rm.BaseDownloadURL, rm.GitHubOwner, rm.GitHubRepo)
|
||||
|
||||
resp, err := rm.HTTPClient.Get(url)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("unexpected status %s", resp.Status)
|
||||
}
|
||||
|
||||
// After the redirect is followed, the final request URL is the tag page.
|
||||
version := path.Base(resp.Request.URL.Path)
|
||||
if version == "" || version == "." || version == "latest" {
|
||||
return "", fmt.Errorf("could not determine version from %s", resp.Request.URL.String())
|
||||
}
|
||||
return version, nil
|
||||
}
|
||||
|
||||
// latestReleaseFromAPI fetches the latest release JSON from api.github.com. This
|
||||
// is the fallback path; it is rate-limited unless GITHUB_TOKEN is set.
|
||||
func (rm *ReleaseManager) latestReleaseFromAPI() (*Release, error) {
|
||||
url := fmt.Sprintf("https://api.github.com/repos/%s/%s/releases/latest", rm.GitHubOwner, rm.GitHubRepo)
|
||||
|
||||
req, err := http.NewRequest(http.MethodGet, url, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
req.Header.Set("Accept", "application/vnd.github+json")
|
||||
// An optional token lifts the unauthenticated 60/hour limit to 5000/hour.
|
||||
if token := os.Getenv("GITHUB_TOKEN"); token != "" {
|
||||
req.Header.Set("Authorization", "Bearer "+token)
|
||||
}
|
||||
|
||||
resp, err := rm.HTTPClient.Do(req)
|
||||
resp, err := rm.HTTPClient.Get(url)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to fetch latest release: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
if (resp.StatusCode == http.StatusForbidden || resp.StatusCode == http.StatusTooManyRequests) &&
|
||||
resp.Header.Get("X-RateLimit-Remaining") == "0" {
|
||||
return nil, fmt.Errorf("GitHub API rate limit exceeded (status %d); retry later or set GITHUB_TOKEN to raise the limit", resp.StatusCode)
|
||||
}
|
||||
return nil, fmt.Errorf("status %d", resp.StatusCode)
|
||||
return nil, fmt.Errorf("failed to fetch latest release: status %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
// Parse the JSON response properly
|
||||
@@ -179,7 +106,7 @@ func (rm *ReleaseManager) latestReleaseFromAPI() (*Release, error) {
|
||||
}
|
||||
|
||||
// DownloadRelease downloads a specific version of LocalAI
|
||||
func (rm *ReleaseManager) DownloadRelease(version string, progressCallback func(downloaded, total int64)) error {
|
||||
func (rm *ReleaseManager) DownloadRelease(version string, progressCallback func(float64)) error {
|
||||
// Ensure the binary directory exists
|
||||
if err := os.MkdirAll(rm.BinaryPath, 0755); err != nil {
|
||||
return fmt.Errorf("failed to create binary directory: %w", err)
|
||||
@@ -190,16 +117,16 @@ func (rm *ReleaseManager) DownloadRelease(version string, progressCallback func(
|
||||
localPath := filepath.Join(rm.BinaryPath, "local-ai")
|
||||
|
||||
// Download the binary
|
||||
downloadURL := fmt.Sprintf("%s/%s/%s/releases/download/%s/%s",
|
||||
rm.BaseDownloadURL, rm.GitHubOwner, rm.GitHubRepo, version, binaryName)
|
||||
downloadURL := fmt.Sprintf("https://github.com/%s/%s/releases/download/%s/%s",
|
||||
rm.GitHubOwner, rm.GitHubRepo, version, binaryName)
|
||||
|
||||
if err := rm.downloadFile(downloadURL, localPath, progressCallback); err != nil {
|
||||
return fmt.Errorf("failed to download binary: %w", err)
|
||||
}
|
||||
|
||||
// Download and verify checksums
|
||||
checksumURL := fmt.Sprintf("%s/%s/%s/releases/download/%s/LocalAI-%s-checksums.txt",
|
||||
rm.BaseDownloadURL, rm.GitHubOwner, rm.GitHubRepo, version, version)
|
||||
checksumURL := fmt.Sprintf("https://github.com/%s/%s/releases/download/%s/LocalAI-%s-checksums.txt",
|
||||
rm.GitHubOwner, rm.GitHubRepo, version, version)
|
||||
|
||||
checksumPath := filepath.Join(rm.BinaryPath, "checksums.txt")
|
||||
manualChecksumPath := filepath.Join(rm.ChecksumsPath, fmt.Sprintf("checksums-%s.txt", version))
|
||||
@@ -227,10 +154,6 @@ func (rm *ReleaseManager) DownloadRelease(version string, progressCallback func(
|
||||
// Verify the checksum if we have a checksum file
|
||||
if _, err := os.Stat(checksumPath); err == nil {
|
||||
if err := rm.VerifyChecksum(localPath, checksumPath, binaryName); err != nil {
|
||||
// Discard the corrupt binary (and any leftover partial) so the next
|
||||
// retry starts from a clean slate rather than resuming corruption.
|
||||
os.Remove(localPath)
|
||||
os.Remove(localPath + ".part")
|
||||
return fmt.Errorf("checksum verification failed: %w", err)
|
||||
}
|
||||
log.Printf("Checksum verification successful")
|
||||
@@ -273,88 +196,44 @@ func (rm *ReleaseManager) GetBinaryName(version string) string {
|
||||
}
|
||||
|
||||
// downloadFile downloads a file from a URL to a local path with optional progress callback
|
||||
func (rm *ReleaseManager) downloadFile(url, filepath string, progressCallback func(downloaded, total int64)) error {
|
||||
func (rm *ReleaseManager) downloadFile(url, filepath string, progressCallback func(float64)) error {
|
||||
return rm.downloadFileWithRetry(url, filepath, progressCallback, 3)
|
||||
}
|
||||
|
||||
// downloadFileWithRetry downloads a file with retry and HTTP Range resume.
|
||||
//
|
||||
// The body is streamed to "<dest>.part" and only renamed to dest on success, so
|
||||
// a dropped connection leaves a partial file that the next attempt continues via
|
||||
// a "Range: bytes=N-" request instead of restarting from zero. This matters for
|
||||
// GitHub release downloads, which are large and flaky.
|
||||
func (rm *ReleaseManager) downloadFileWithRetry(url, dest string, progressCallback func(downloaded, total int64), maxRetries int) error {
|
||||
partPath := dest + ".part"
|
||||
// downloadFileWithRetry downloads a file from a URL with retry logic
|
||||
func (rm *ReleaseManager) downloadFileWithRetry(url, filepath string, progressCallback func(float64), maxRetries int) error {
|
||||
var lastErr error
|
||||
|
||||
for attempt := 1; attempt <= maxRetries; attempt++ {
|
||||
if attempt > 1 {
|
||||
log.Printf("Retrying download (attempt %d/%d): %s", attempt, maxRetries, url)
|
||||
time.Sleep(time.Duration(attempt) * rm.RetryBackoff)
|
||||
time.Sleep(time.Duration(attempt) * time.Second)
|
||||
}
|
||||
|
||||
// Resume from however much we already have on disk.
|
||||
var offset int64
|
||||
if fi, err := os.Stat(partPath); err == nil {
|
||||
offset = fi.Size()
|
||||
}
|
||||
|
||||
req, err := http.NewRequest(http.MethodGet, url, nil)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if offset > 0 {
|
||||
req.Header.Set("Range", fmt.Sprintf("bytes=%d-", offset))
|
||||
}
|
||||
|
||||
resp, err := rm.HTTPClient.Do(req)
|
||||
resp, err := rm.HTTPClient.Get(url)
|
||||
if err != nil {
|
||||
lastErr = err
|
||||
continue
|
||||
}
|
||||
|
||||
switch resp.StatusCode {
|
||||
case http.StatusOK:
|
||||
// Server ignored the Range (or we had nothing): start fresh.
|
||||
offset = 0
|
||||
case http.StatusPartialContent:
|
||||
// Resume: append to the existing partial file.
|
||||
case http.StatusRequestedRangeNotSatisfiable:
|
||||
// Stale or already-complete partial: discard and restart fresh.
|
||||
resp.Body.Close()
|
||||
os.Remove(partPath)
|
||||
lastErr = fmt.Errorf("partial download no longer valid (status %s), restarting", resp.Status)
|
||||
continue
|
||||
default:
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
resp.Body.Close()
|
||||
lastErr = fmt.Errorf("bad status: %s", resp.Status)
|
||||
continue
|
||||
}
|
||||
|
||||
var out *os.File
|
||||
if offset > 0 {
|
||||
out, err = os.OpenFile(partPath, os.O_WRONLY|os.O_APPEND, 0644)
|
||||
} else {
|
||||
out, err = os.Create(partPath)
|
||||
}
|
||||
out, err := os.Create(filepath)
|
||||
if err != nil {
|
||||
resp.Body.Close()
|
||||
return err
|
||||
}
|
||||
|
||||
// On a 206 the Content-Length is the remaining bytes, so the full size
|
||||
// is what we already have plus what's still to come.
|
||||
total := resp.ContentLength
|
||||
if offset > 0 && total > 0 {
|
||||
total += offset
|
||||
}
|
||||
|
||||
// Create a progress reader if callback is provided
|
||||
var reader io.Reader = resp.Body
|
||||
if progressCallback != nil && total > 0 {
|
||||
if progressCallback != nil && resp.ContentLength > 0 {
|
||||
reader = &progressReader{
|
||||
Reader: resp.Body,
|
||||
Total: total,
|
||||
Current: offset,
|
||||
Total: resp.ContentLength,
|
||||
Callback: progressCallback,
|
||||
}
|
||||
}
|
||||
@@ -364,14 +243,11 @@ func (rm *ReleaseManager) downloadFileWithRetry(url, dest string, progressCallba
|
||||
out.Close()
|
||||
|
||||
if err != nil {
|
||||
// Keep the partial file so the next attempt can resume from it.
|
||||
lastErr = err
|
||||
os.Remove(filepath)
|
||||
continue
|
||||
}
|
||||
|
||||
if err := os.Rename(partPath, dest); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -446,21 +322,20 @@ func (rm *ReleaseManager) saveVersionMetadata(version string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// progressReader wraps an io.Reader to provide download progress as a
|
||||
// (downloaded, total) byte count so callers can render both a progress bar and
|
||||
// a human-readable size.
|
||||
// progressReader wraps an io.Reader to provide download progress
|
||||
type progressReader struct {
|
||||
io.Reader
|
||||
Total int64
|
||||
Current int64
|
||||
Callback func(downloaded, total int64)
|
||||
Callback func(float64)
|
||||
}
|
||||
|
||||
func (pr *progressReader) Read(p []byte) (int, error) {
|
||||
n, err := pr.Reader.Read(p)
|
||||
pr.Current += int64(n)
|
||||
if pr.Callback != nil {
|
||||
pr.Callback(pr.Current, pr.Total)
|
||||
progress := float64(pr.Current) / float64(pr.Total)
|
||||
pr.Callback(progress)
|
||||
}
|
||||
return n, err
|
||||
}
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user