From 718b31d063a5083fcc5d57c6245818e2b6d83242 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sat, 20 Jun 2026 21:46:38 +0000 Subject: [PATCH] kernel(P1): W4A16 dispatch seam (gated, byte-identical fallback to MMQ) marlin-w4a16.{cuh,cu} + a gated hook in ggml_cuda_mul_mat (dense path), behind GGML_CUDA_W4A16 + sm_120/121 + Q4_0/Q4_K + f32. Returns false -> MMQ, so the default build is byte-identical. Verified on GB10: clean build, test-backend-ops MUL_MAT 1103/1103, llama-bench pp512 unchanged (717.77 default / 718.26 flagged), and GGML_CUDA_W4A16=1 reaches the seam ([w4a16] P1 warning) before falling back. Source + apply steps under kernel/w4a16/ (DGX checkout is volatile). The frame the P2 correctness kernel + P3 Marlin pipeline fill. Assisted-by: Claude:opus-4.8 [Claude Code] Signed-off-by: Ettore Di Giacinto --- .../paged/W4A16_MARLIN_KERNEL_PLAN.md | 11 +++-- .../cpp/llama-cpp/paged/kernel/w4a16/HOOK.md | 31 +++++++++++++ .../paged/kernel/w4a16/marlin-w4a16.cu | 45 +++++++++++++++++++ .../paged/kernel/w4a16/marlin-w4a16.cuh | 14 ++++++ 4 files changed, 97 insertions(+), 4 deletions(-) create mode 100644 backend/cpp/llama-cpp/paged/kernel/w4a16/HOOK.md create mode 100644 backend/cpp/llama-cpp/paged/kernel/w4a16/marlin-w4a16.cu create mode 100644 backend/cpp/llama-cpp/paged/kernel/w4a16/marlin-w4a16.cuh diff --git a/backend/cpp/llama-cpp/paged/W4A16_MARLIN_KERNEL_PLAN.md b/backend/cpp/llama-cpp/paged/W4A16_MARLIN_KERNEL_PLAN.md index 213e7b94b..89f583dd6 100644 --- a/backend/cpp/llama-cpp/paged/W4A16_MARLIN_KERNEL_PLAN.md +++ b/backend/cpp/llama-cpp/paged/W4A16_MARLIN_KERNEL_PLAN.md @@ -42,10 +42,13 @@ and **Stream-K** partitioning. Sources: IST-DASLab/marlin, arXiv 2408.11743, vLL phase: `test-backend-ops test -o MUL_MAT -b CUDA0` must stay 1103/1103; the q4_K n=512 perf must climb from 47. - test-backend-ops needed `-DLLAMA_BUILD_TESTS=ON`; now built in `~/llama.cpp-pr24423/build`. -### P1 — Dispatch seam (no behavior change) -- New `ggml/src/ggml-cuda/marlin-w4a16.cu` + a gated hook in `ggml_cuda_mul_mat` (dense, non-ids path), - behind `GGML_CUDA_W4A16` + sm_120/121 + type∈{Q4_0,Q4_K}. Initially returns false → falls back to MMQ. - (Mirror of the `fp4-grouped-moe.cu` scaffold seam.) Builds byte-identical by default. +### P1 — Dispatch seam (no behavior change) — DONE +- `marlin-w4a16.{cuh,cu}` + a gated hook in `ggml_cuda_mul_mat` (dense, non-ids path), behind + `GGML_CUDA_W4A16` + sm_120/121 (`cc >= GGML_CUDA_CC_BLACKWELL`) + type∈{Q4_0,Q4_K} + f32 activations. + Returns false → falls back to MMQ. Source + apply instructions: `kernel/w4a16/` (`HOOK.md`). +- **Verified on GB10:** clean build; `test-backend-ops MUL_MAT` = **1103/1103** (byte-identical default); + `llama-bench` dense Q4 pp512 unchanged (717.77 default / 718.26 with flag); `GGML_CUDA_W4A16=1` reaches the + seam (stderr `[w4a16] ... P1 seam - using MMQ`) and falls back. The empty frame P2/P3 fills. ### P2 — Correctness-first kernel (slow OK) - Dequant Q4→BF16 (reuse ggml's `dequantize_block_q4_K`) into shared mem, naive `mma.sync m16n8k16` BF16 diff --git a/backend/cpp/llama-cpp/paged/kernel/w4a16/HOOK.md b/backend/cpp/llama-cpp/paged/kernel/w4a16/HOOK.md new file mode 100644 index 000000000..a701f1496 --- /dev/null +++ b/backend/cpp/llama-cpp/paged/kernel/w4a16/HOOK.md @@ -0,0 +1,31 @@ +# W4A16 seam — how to apply to a llama.cpp / ggml-cuda checkout + +Two source files + two one-line edits to `ggml/src/ggml-cuda/ggml-cuda.cu`. The build picks up the +new `.cu` via the existing `file(GLOB)` after a `cmake -S . -B build` reconfigure (no CMakeLists edit). + +## Files (copy into `ggml/src/ggml-cuda/`) +- `marlin-w4a16.cuh` +- `marlin-w4a16.cu` + +## Edit `ggml/src/ggml-cuda/ggml-cuda.cu` + +1. **Include** — after the existing `#include "ggml-cuda/fp4-grouped-moe.cuh"` (sibling-header style): + ```cpp + #include "ggml-cuda/marlin-w4a16.cuh" + ``` + +2. **Dispatch hook** — immediately before the dense dispatch chain, i.e. before + `if (!split && use_mul_mat_vec_f) {` in `ggml_cuda_mul_mat(...)` (after `const int cc = ...`): + ```cpp + if (!split && ggml_cuda_w4a16_mul_mat(ctx, src0, src1, dst)) { return; } + ``` + +## Verify (P1 acceptance — met) +- `cmake --build build --target test-backend-ops llama-bench` → builds clean. +- `test-backend-ops test -o MUL_MAT -b CUDA0` → **1103/1103** (byte-identical default). +- `llama-bench` dense Q4 pp512 → unchanged (~718, MMQ). +- `GGML_CUDA_W4A16=1 llama-bench` → unchanged + stderr `[w4a16] ... P1 seam - using MMQ` (seam reached, + gating passes on sm_121, falls back). + +The kernel body (P2 correctness → P3 Marlin pipeline) replaces the `TODO(P2/P3)` block in `marlin-w4a16.cu` +and returns `true` once parity holds. diff --git a/backend/cpp/llama-cpp/paged/kernel/w4a16/marlin-w4a16.cu b/backend/cpp/llama-cpp/paged/kernel/w4a16/marlin-w4a16.cu new file mode 100644 index 000000000..9105e0653 --- /dev/null +++ b/backend/cpp/llama-cpp/paged/kernel/w4a16/marlin-w4a16.cu @@ -0,0 +1,45 @@ +#include "marlin-w4a16.cuh" + +#include +#include + +// P1: dispatch seam only. The BF16 Marlin kernel (dequant Q4->BF16 in shared mem, +// mma.sync m16n8k16, cp.async double-buffered pipeline, offline weight reshuffle) +// lands in P2/P3. For now this always falls back to MMQ, so the default build is +// byte-identical and the test-backend-ops MUL_MAT gate stays 1103/1103. + +static bool w4a16_enabled() { + static const bool en = (std::getenv("GGML_CUDA_W4A16") != nullptr); + return en; +} + +bool ggml_cuda_w4a16_mul_mat( + ggml_backend_cuda_context & ctx, + const ggml_tensor * src0, + const ggml_tensor * src1, + ggml_tensor * dst) { + GGML_UNUSED(ctx); + + if (!w4a16_enabled()) { + return false; + } + if (src0->type != GGML_TYPE_Q4_0 && src0->type != GGML_TYPE_Q4_K) { + return false; + } + if (src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) { + return false; + } + const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; + if (!GGML_CUDA_CC_IS_NVIDIA(cc) || cc < GGML_CUDA_CC_BLACKWELL) { + return false; // consumer Blackwell (sm_120/121) only + } + + // TODO(P2/P3): launch the W4A16 BF16 Marlin kernel here; verify parity vs MMQ + // (test-backend-ops) before returning true. + static bool warned = false; + if (!warned) { + warned = true; + fprintf(stderr, "[w4a16] GGML_CUDA_W4A16 set, kernel not yet implemented (P1 seam) - using MMQ\n"); + } + return false; +} diff --git a/backend/cpp/llama-cpp/paged/kernel/w4a16/marlin-w4a16.cuh b/backend/cpp/llama-cpp/paged/kernel/w4a16/marlin-w4a16.cuh new file mode 100644 index 000000000..253149d67 --- /dev/null +++ b/backend/cpp/llama-cpp/paged/kernel/w4a16/marlin-w4a16.cuh @@ -0,0 +1,14 @@ +#pragma once + +#include "common.cuh" + +// W4A16 Marlin-style BF16 GEMM for NVIDIA Blackwell consumer GPUs (sm_120/121). +// Dense (non-MoE) 4-bit-weight matmul run on BF16 tensor cores, the path that +// reaches the GB10 BF16 ceiling where MMQ (int8, Ampere-tuned) and cuBLAS (sm_80 +// fallback) both plateau at ~22% of it. Returns true if it handled the op; false +// to fall back to MMQ. Gated behind GGML_CUDA_W4A16 until correct + faster. +bool ggml_cuda_w4a16_mul_mat( + ggml_backend_cuda_context & ctx, + const ggml_tensor * src0, // 4-bit weights (Q4_0/Q4_K) + const ggml_tensor * src1, // F32 activations + ggml_tensor * dst); // F32 output