diff --git a/backend/cpp/llama-cpp/paged/FP4_GROUPED_MOE_KERNEL.md b/backend/cpp/llama-cpp/paged/FP4_GROUPED_MOE_KERNEL.md new file mode 100644 index 000000000..80e198e08 --- /dev/null +++ b/backend/cpp/llama-cpp/paged/FP4_GROUPED_MOE_KERNEL.md @@ -0,0 +1,52 @@ +# FP4 grouped-GEMM MoE kernel (Lever 3) — scaffold + implementation plan + +The one piece of work that actually closes the vLLM gap on Blackwell (GB10/sm_121). Both phases are +bottlenecked by the same kernel: `mul_mat_q` (warp-level `mma.sync` grouped MMQ, ~22 TFLOP/s) is +**37%** of prefill and **54.6%** of decode-at-B=64 GPU time (`BENCHMARKS.md`). Paged attention can't touch +it (proven). The fix is a CUTLASS-3.x collective-mainloop grouped GEMM with block-scaled `e2m1` operands via +tcgen05 tensor-memory MMA — what vLLM/FlashInfer/TRT-LLM use. + +## Scaffold (DONE — builds clean, default byte-identical) + +Lives in the DGX checkout `~/llama.cpp-pr24423/ggml/src/ggml-cuda/` (to be rebased onto the pin as a patch / +upstreamed). Captured diff: `patches/kernel/0001-fp4-grouped-moe-scaffold.patch`. + +- `fp4-grouped-moe.{cuh,cu}` — entry `ggml_cuda_fp4_grouped_moe(ctx, src0, src1, ids, dst) -> bool` + (true = handled, false = fall back to MMQ). Gated behind env `GGML_CUDA_FP4_GROUPED`. Currently always + returns false → **default build unchanged**. +- Hook in `ggml_cuda_mul_mat_id` (the MoE dispatch), before the `ggml_cuda_mul_mat_q(...ids...)` call: + `if (ggml_cuda_fp4_grouped_moe(...)) return;`. Builds via the `file(GLOB "*.cu")` (re-run cmake configure + after adding the file — GLOB is configure-time). + +This is the integration seam. The kernel fills the stub. + +## Implementation phases (each: build on GB10 → numerical parity vs `mul_mat_q` → bench) + +1. **Reference grouped GEMM (correctness first, slow OK).** Per-expert problem sizes + offsets from `ids`; + dequant `e2m1`+scales → BF16; loop CUTLASS (or cuBLAS) per group. Gate: output matches MMQ within fp tol + on a 2-expert toy + the real model (token-identical greedy). Establishes the harness + the data plumbing. +2. **CUTLASS GemmGrouped, sm_120a, BF16 operands.** Replace the loop with one `cutlass::gemm::device:: + GemmGrouped` launch over all experts (per-group offsets). Measures the grouping win alone. +3. **Block-scaled FP4 operands (the real lever).** `e2m1` A/B with `e8m0`(MX)/`e4m3`(NV) block scales via the + Blackwell scaled-MMA collective (tcgen05 tensor-memory). This is where the TFLOP/s jumps. Needs CUTLASS + 3.x + sm_120a; verify the block-scale layout matches ggml's MXFP4/NVFP4 packing. +4. **Fuse activation quant** (the F32→FP4 of src1) into the gather/permute prologue. +5. **Enable by default** on sm_120/121 when parity holds + faster; keep the env as an escape hatch. + +## Dependencies / decisions + +- **CUTLASS is not currently a ggml dependency** (the profile's `cutlass_80_tensorop` is cuBLAS-internal). + Adding it = submodule/fetch + include dir, gated to CUDA sm_120+. Float the approach with ggml maintainers + early (Discussion #18369 is the home; JohannesGaessler asked to discuss arch before big kernel work). +- Target sm_120a/121a (consumer Blackwell). Datacenter Blackwell (sm_100) is a separate tile config. +- Risk: needs ncu-driven iteration on the GB10; this is multi-week, expert-CUDA. No upstream base to fork + (exhaustive search confirmed). Net-new value upstream. + +## DENSE follow-up (TODO #28 — important, do before committing to MoE-only) + +This kernel is **grouped** (MoE). **Dense** models (e.g. Qwen3 ~27B) use the non-grouped FP4 GEMM path — a +different kernel. Before assuming the kernel work is MoE-only, benchmark **Qwen3-27B dense: vLLM NVFP4 vs +llama.cpp Q4_K_M** (prefill+decode, GB10). If dense shows the same large gap → the kernel track must also +deliver a non-grouped block-scaled FP4 GEMM (a CUTLASS dense GEMM, simpler than grouped). If dense is already +competitive (single-stream dense was only ~10% of MoE-model time) → MoE-grouped is the priority and dense can +ride the existing MMQ/cuBLAS path. This decides the kernel scope. diff --git a/backend/cpp/llama-cpp/patches/kernel/0001-fp4-grouped-moe-scaffold.patch b/backend/cpp/llama-cpp/patches/kernel/0001-fp4-grouped-moe-scaffold.patch new file mode 100644 index 000000000..d1920560a --- /dev/null +++ b/backend/cpp/llama-cpp/patches/kernel/0001-fp4-grouped-moe-scaffold.patch @@ -0,0 +1,91 @@ +diff --git a/ggml/src/ggml-cuda/fp4-grouped-moe.cu b/ggml/src/ggml-cuda/fp4-grouped-moe.cu +new file mode 100644 +index 0000000..5f5a782 +--- /dev/null ++++ b/ggml/src/ggml-cuda/fp4-grouped-moe.cu +@@ -0,0 +1,46 @@ ++#include "fp4-grouped-moe.cuh" ++ ++#include ++#include ++ ++// SCAFFOLD for the FP4 grouped-GEMM MoE kernel (Lever 3). ++// ++// Why: on GB10 (sm_121) the MoE matmul runs mul_mat_q - a warp-level mma.sync grouped MMQ - ++// at ~22 effective TFLOP/s, ~27x behind vLLM prefill, and it also dominates decode at concurrency ++// (54.6% of GPU time at B=64). It is the single bottleneck to vLLM parity in BOTH phases; paged ++// attention cannot touch it (proven by profiling). The fix is a CUTLASS-3.x collective-mainloop ++// grouped GEMM over all experts, block-scaled e2m1 operands via tcgen05 tensor-memory MMA. ++// ++// This file is the integration seam. It is currently a no-op that always falls back to MMQ, so the ++// default build is byte-identical. The kernel is filled in over the phases in the design doc. ++ ++static bool fp4_grouped_enabled() { ++ static const bool en = (std::getenv("GGML_CUDA_FP4_GROUPED") != nullptr); ++ return en; ++} ++ ++bool ggml_cuda_fp4_grouped_moe( ++ ggml_backend_cuda_context & ctx, ++ const ggml_tensor * src0, ++ const ggml_tensor * src1, ++ const ggml_tensor * ids, ++ ggml_tensor * dst) { ++ GGML_UNUSED(ctx); GGML_UNUSED(src1); GGML_UNUSED(ids); GGML_UNUSED(dst); ++ ++ if (!fp4_grouped_enabled()) { ++ return false; // default: existing MMQ path ++ } ++ if (src0->type != GGML_TYPE_MXFP4 && src0->type != GGML_TYPE_NVFP4) { ++ return false; ++ } ++ ++ // TODO(kernel - see kernel design doc): CUTLASS 3.x GemmGrouped, sm_120a, block-scaled e2m1, ++ // tcgen05 MMA; per-expert problem offsets from `ids`; fused activation quant; numerical parity ++ // vs mul_mat_q before enabling by default. ++ static bool warned = false; ++ if (!warned) { ++ warned = true; ++ fprintf(stderr, "[fp4-grouped] GGML_CUDA_FP4_GROUPED set, kernel not yet implemented - using MMQ\n"); ++ } ++ return false; // scaffold: fall back until the kernel lands ++} +diff --git a/ggml/src/ggml-cuda/fp4-grouped-moe.cuh b/ggml/src/ggml-cuda/fp4-grouped-moe.cuh +new file mode 100644 +index 0000000..29e1b5a +--- /dev/null ++++ b/ggml/src/ggml-cuda/fp4-grouped-moe.cuh +@@ -0,0 +1,13 @@ ++#pragma once ++ ++#include "common.cuh" ++ ++// Entry point for the tcgen05/CUTLASS block-scaled FP4 (MXFP4/NVFP4) grouped-GEMM MoE kernel for ++// Blackwell consumer GPUs (sm_120/121). Returns true if it handled the op; false to fall back to ++// the existing warp-mma MMQ path. Gated behind GGML_CUDA_FP4_GROUPED until correct + faster. ++bool ggml_cuda_fp4_grouped_moe( ++ ggml_backend_cuda_context & ctx, ++ const ggml_tensor * src0, // expert weights, MXFP4/NVFP4 [n_embd, n_ff, n_expert] ++ const ggml_tensor * src1, // activations, F32 [n_embd, n_tokens, ...] ++ const ggml_tensor * ids, // expert routing, I32 ++ ggml_tensor * dst); // F32 output +diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu +index 8ea462a..104d131 100644 +--- a/ggml/src/ggml-cuda/ggml-cuda.cu ++++ b/ggml/src/ggml-cuda/ggml-cuda.cu +@@ -30,6 +30,7 @@ + #include "ggml-cuda/im2col.cuh" + #include "ggml-cuda/mmf.cuh" + #include "ggml-cuda/mmq.cuh" ++#include "ggml-cuda/fp4-grouped-moe.cuh" + #include "ggml-cuda/mmvf.cuh" + #include "ggml-cuda/mmvq.cuh" + #include "ggml-cuda/norm.cuh" +@@ -2701,6 +2702,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * + } + + if (ggml_cuda_should_use_mmq(src0->type, cc, ne12, /*n_experts=*/ne02)) { ++ if (ggml_cuda_fp4_grouped_moe(ctx, src0, src1, ids, dst)) { return; } + ggml_cuda_mul_mat_q(ctx, src0, src1, ids, dst); + return; + }